diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index a318ef0b6bd68..6d0ba466347c1 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -1894,6 +1894,19 @@ class MCPlusBuilder {
     llvm_unreachable("not implemented");
   }

+  /// Checks whether the indirect call / jump is accepted by the landing pad
+  /// at the start of the target BasicBlock.
+  virtual bool isCallCoveredByBTI(MCInst &Call, MCInst &Pad) const {
+    llvm_unreachable("not implemented");
+    return false;
+  }
+
+  /// Inserts a BTI landing pad at the start of the BB that matches the
+  /// indirect call or jump instruction used to reach the BB.
+  virtual void insertBTI(BinaryBasicBlock &BB, MCInst &Call) const {
+    llvm_unreachable("not implemented");
+  }
+
   /// Store \p Target absolute address to \p RegName
   virtual InstructionListType materializeAddress(const MCSymbol *Target,
                                                  MCContext *Ctx,
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 5881d3fba70f6..729f854736f6c 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2806,6 +2806,81 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     Inst.addOperand(MCOperand::createImm(HintNum));
   }

+  bool isCallCoveredByBTI(MCInst &Call, MCInst &Pad) const override {
+    assert((isIndirectCall(Call) || isIndirectBranch(Call)) &&
+           "Not an indirect call or branch.");
+
+    // A BLR can be accepted by a BTI c.
+    if (isIndirectCall(Call))
+      return isBTILandingPad(Pad, true, false) ||
+             isBTILandingPad(Pad, true, true);
+
+    // A BR can be accepted by a BTI j or BTI c (and BTI jc) if the operand is
+    // x16 or x17. If the operand is not x16 or x17, it can be accepted by a
+    // BTI j or BTI jc (and not BTI c).
+    if (isIndirectBranch(Call)) {
+      assert(Call.getNumOperands() == 1 &&
+             "Indirect branch needs to have 1 operand.");
+      assert(Call.getOperand(0).isReg() &&
+             "Indirect branch does not have a register operand.");
+      MCPhysReg Reg = Call.getOperand(0).getReg();
+      if (Reg == AArch64::X16 || Reg == AArch64::X17)
+        return isBTILandingPad(Pad, true, false) ||
+               isBTILandingPad(Pad, false, true) ||
+               isBTILandingPad(Pad, true, true);
+      return isBTILandingPad(Pad, false, true) ||
+             isBTILandingPad(Pad, true, true);
+    }
+    return false;
+  }
+
+  void insertBTI(BinaryBasicBlock &BB, MCInst &Call) const override {
+    auto II = BB.getFirstNonPseudo();
+    // Only check the first instruction for non-empty BasicBlocks.
+    bool Empty = (II == BB.end());
+    if (!Empty && isCallCoveredByBTI(Call, *II))
+      return;
+    // A BLR can be accepted by a BTI c.
+    if (isIndirectCall(Call)) {
+      // If we have a BTI j at the start, extend it to a BTI jc;
+      // otherwise insert a new BTI c.
+      if (!Empty && isBTILandingPad(*II, false, true)) {
+        updateBTIVariant(*II, true, true);
+      } else {
+        MCInst BTIInst;
+        createBTI(BTIInst, true, false);
+        BB.insertInstruction(II, BTIInst);
+      }
+    }
+
+    // A BR can be accepted by a BTI j or BTI c (and BTI jc) if the operand is
+    // x16 or x17. If the operand is not x16 or x17, it can be accepted by a
+    // BTI j or BTI jc (and not BTI c).
+    if (isIndirectBranch(Call)) {
+      assert(Call.getNumOperands() == 1 &&
+             "Indirect branch needs to have 1 operand.");
+      assert(Call.getOperand(0).isReg() &&
+             "Indirect branch does not have a register operand.");
+      MCPhysReg Reg = Call.getOperand(0).getReg();
+      if (Reg == AArch64::X16 || Reg == AArch64::X17) {
+        // Add a new BTI c.
+        MCInst BTIInst;
+        createBTI(BTIInst, true, false);
+        BB.insertInstruction(II, BTIInst);
+      } else {
+        // If BB starts with a BTI c, extend it to BTI jc;
+        // otherwise insert a new BTI j.
+        if (!Empty && isBTILandingPad(*II, true, false)) {
+          updateBTIVariant(*II, true, true);
+        } else {
+          MCInst BTIInst;
+          createBTI(BTIInst, false, true);
+          BB.insertInstruction(II, BTIInst);
+        }
+      }
+    }
+  }
+
   InstructionListType materializeAddress(const MCSymbol *Target, MCContext *Ctx,
                                          MCPhysReg RegName,
                                          int64_t Addend = 0) const override {
diff --git a/bolt/unittests/Core/MCPlusBuilder.cpp b/bolt/unittests/Core/MCPlusBuilder.cpp
index 7b6f1620a3f2c..e8323e87fe148 100644
--- a/bolt/unittests/Core/MCPlusBuilder.cpp
+++ b/bolt/unittests/Core/MCPlusBuilder.cpp
@@ -198,6 +198,122 @@ TEST_P(MCPlusBuilderTester, AArch64_BTI) {
   ASSERT_TRUE(BC->MIB->isImplicitBTIC(*II));
 }

+TEST_P(MCPlusBuilderTester, AArch64_insertBTI_empty) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+  BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true);
+  std::unique_ptr<BinaryBasicBlock> BB = BF->createBasicBlock();
+  MCInst CallInst = MCInstBuilder(AArch64::BR).addReg(AArch64::X16);
+  BC->MIB->insertBTI(*BB, CallInst);
+  // Check that BTI c is added to the empty block.
+  auto II = BB->begin();
+  ASSERT_TRUE(BC->MIB->isBTILandingPad(*II, true, false));
+}
+
+TEST_P(MCPlusBuilderTester, AArch64_insertBTI_0) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+  BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true);
+  std::unique_ptr<BinaryBasicBlock> BB = BF->createBasicBlock();
+  MCInst Inst = MCInstBuilder(AArch64::RET).addReg(AArch64::LR);
+  BB->addInstruction(Inst);
+  // BR x16 needs BTI c or BTI j. We prefer adding a BTI c.
+  MCInst CallInst = MCInstBuilder(AArch64::BR).addReg(AArch64::X16);
+  BC->MIB->insertBTI(*BB, CallInst);
+  auto II = BB->begin();
+  ASSERT_TRUE(BC->MIB->isBTILandingPad(*II, true, false));
+}
+
+TEST_P(MCPlusBuilderTester, AArch64_insertBTI_1) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+  BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true);
+  std::unique_ptr<BinaryBasicBlock> BB = BF->createBasicBlock();
+  MCInst BTIc;
+  BC->MIB->createBTI(BTIc, true, false);
+  BB->addInstruction(BTIc);
+  // BR x16 needs BTI c or BTI j. We have a BTI c, no change is needed.
+  MCInst CallInst = MCInstBuilder(AArch64::BR).addReg(AArch64::X16);
+  BC->MIB->insertBTI(*BB, CallInst);
+  auto II = BB->begin();
+  ASSERT_TRUE(BC->MIB->isBTILandingPad(*II, true, false));
+}
+
+TEST_P(MCPlusBuilderTester, AArch64_insertBTI_2) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+  BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true);
+  std::unique_ptr<BinaryBasicBlock> BB = BF->createBasicBlock();
+  MCInst BTIc;
+  BC->MIB->createBTI(BTIc, true, false);
+  BB->addInstruction(BTIc);
+  // BR x5 needs BTI j.
+  // We have BTI c -> extend it to BTI jc.
+  MCInst CallInst = MCInstBuilder(AArch64::BR).addReg(AArch64::X5);
+  BC->MIB->insertBTI(*BB, CallInst);
+  auto II = BB->begin();
+  ASSERT_TRUE(BC->MIB->isBTILandingPad(*II, true, true));
+}
+
+TEST_P(MCPlusBuilderTester, AArch64_insertBTI_3) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+  BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true);
+  std::unique_ptr<BinaryBasicBlock> BB = BF->createBasicBlock();
+  MCInst Inst = MCInstBuilder(AArch64::RET).addReg(AArch64::LR);
+  BB->addInstruction(Inst);
+  // BR x5 needs BTI j.
+  MCInst CallInst = MCInstBuilder(AArch64::BR).addReg(AArch64::X5);
+  BC->MIB->insertBTI(*BB, CallInst);
+  auto II = BB->begin();
+  ASSERT_TRUE(BC->MIB->isBTILandingPad(*II, false, true));
+}
+
+TEST_P(MCPlusBuilderTester, AArch64_insertBTI_4) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+  BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true);
+  std::unique_ptr<BinaryBasicBlock> BB = BF->createBasicBlock();
+  MCInst Inst = MCInstBuilder(AArch64::RET).addReg(AArch64::LR);
+  BB->addInstruction(Inst);
+  // BLR needs BTI c, regardless of the register used.
+  MCInst CallInst = MCInstBuilder(AArch64::BLR).addReg(AArch64::X5);
+  BC->MIB->insertBTI(*BB, CallInst);
+  auto II = BB->begin();
+  ASSERT_TRUE(BC->MIB->isBTILandingPad(*II, true, false));
+}
+
+TEST_P(MCPlusBuilderTester, AArch64_insertBTI_5) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+  BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true);
+  std::unique_ptr<BinaryBasicBlock> BB = BF->createBasicBlock();
+  MCInst BTIj;
+  BC->MIB->createBTI(BTIj, false, true);
+  BB->addInstruction(BTIj);
+  // BLR needs BTI c, regardless of the register used.
+  // We have a BTI j -> extend it to BTI jc.
+  MCInst CallInst = MCInstBuilder(AArch64::BLR).addReg(AArch64::X5);
+  BC->MIB->insertBTI(*BB, CallInst);
+  auto II = BB->begin();
+  ASSERT_TRUE(BC->MIB->isBTILandingPad(*II, true, true));
+}
+
+TEST_P(MCPlusBuilderTester, AArch64_insertBTI_6) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+  BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true);
+  std::unique_ptr<BinaryBasicBlock> BB = BF->createBasicBlock();
+  MCInst Paciasp =
+      MCInstBuilder(AArch64::PACIASP).addReg(AArch64::LR).addReg(AArch64::SP);
+  BB->addInstruction(Paciasp);
+  // PACI(AB)SP are implicit BTI c, no change needed.
+  MCInst CallInst = MCInstBuilder(AArch64::BR).addReg(AArch64::X17);
+  BC->MIB->insertBTI(*BB, CallInst);
+  auto II = BB->begin();
+  ASSERT_TRUE(BC->MIB->isBTILandingPad(*II, true, false));
+  ASSERT_TRUE(BC->MIB->isPSignOnLR(*II));
+}
+
 TEST_P(MCPlusBuilderTester, AArch64_CmpJNE) {
   if (GetParam() != Triple::aarch64)
     GTEST_SKIP();
diff --git a/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp
index 4f33670a8500a..6618341296aaf 100644
--- a/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp
@@ -237,6 +237,12 @@ static bool isIdenticalStmt(const ASTContext &Ctx, const Stmt *Stmt1,
         return false;
     return true;
   }
+  case Stmt::DeferStmtClass: {
+    const auto *DefStmt1 = cast<DeferStmt>(Stmt1);
+    const auto *DefStmt2 = cast<DeferStmt>(Stmt2);
+    return isIdenticalStmt(Ctx, DefStmt1->getBody(), DefStmt2->getBody(),
+                           IgnoreSideEffects);
+  }
   case Stmt::CompoundStmtClass: {
     const auto *CompStmt1 = cast<CompoundStmt>(Stmt1);
     const auto *CompStmt2 = cast<CompoundStmt>(Stmt2);
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index f5bc3293427b0..661bb0f4d52df 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -208,6 +208,11 @@ Resolutions to C++ Defect Reports
 C Language Changes
 ------------------

+- Implemented the ``defer`` draft Technical Specification
+  (`WG14 N3734 <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3734.pdf>`_);
+  it is enabled in C mode by passing ``-fdefer-ts``. Note that the details of
+  this feature are subject to change given that the Technical Specification is
+  not yet ratified.
+
 C2y Feature Support
 ^^^^^^^^^^^^^^^^^^^
 - No longer triggering ``-Wstatic-in-inline`` in C2y mode; use of a static
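As background for the frontend changes that follow: a minimal sketch of the semantics the TS prescribes, assuming the scope-exit, reverse-order behavior that the CodeGen changes further down implement via the cleanup stack (the function and strings here are illustrative, not taken from this patch):

    /* compiled with: clang -std=c23 -fdefer-ts */
    #include <stdio.h>

    void example(void) {
      puts("enter");
      _Defer puts("deferred 1"); /* runs when the enclosing block exits */
      _Defer puts("deferred 2"); /* deferred later, so it runs earlier  */
      puts("leave");
    } /* prints: enter, leave, deferred 2, deferred 1 */
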
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index 8f427427d71ed..c3ac310bf5402 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -2561,6 +2561,7 @@ DEF_TRAVERSE_STMT(DefaultStmt, {})
 DEF_TRAVERSE_STMT(DoStmt, {})
 DEF_TRAVERSE_STMT(ForStmt, {})
 DEF_TRAVERSE_STMT(GotoStmt, {})
+DEF_TRAVERSE_STMT(DeferStmt, {})
 DEF_TRAVERSE_STMT(IfStmt, {})
 DEF_TRAVERSE_STMT(IndirectGotoStmt, {})
 DEF_TRAVERSE_STMT(LabelStmt, {})
diff --git a/clang/include/clang/AST/Stmt.h b/clang/include/clang/AST/Stmt.h
index e1cca34d2212c..d56de08eaf279 100644
--- a/clang/include/clang/AST/Stmt.h
+++ b/clang/include/clang/AST/Stmt.h
@@ -317,6 +317,16 @@ class alignas(void *) Stmt {
     SourceLocation KeywordLoc;
   };

+  class DeferStmtBitfields {
+    friend class DeferStmt;
+
+    LLVM_PREFERRED_TYPE(StmtBitfields)
+    unsigned : NumStmtBits;
+
+    /// The location of the "defer".
+    SourceLocation DeferLoc;
+  };
+
   //===--- Expression bitfields classes ---===//

   class ExprBitfields {
@@ -1318,6 +1328,7 @@ class alignas(void *) Stmt {
     LoopControlStmtBitfields LoopControlStmtBits;
     ReturnStmtBitfields ReturnStmtBits;
     SwitchCaseBitfields SwitchCaseBits;
+    DeferStmtBitfields DeferStmtBits;

     // Expressions
     ExprBitfields ExprBits;
@@ -3211,6 +3222,47 @@ class ReturnStmt final
   }
 };

+/// DeferStmt - This represents a deferred statement.
+class DeferStmt : public Stmt {
+  friend class ASTStmtReader;
+
+  /// The deferred statement.
+  Stmt *Body;
+
+  DeferStmt(EmptyShell Empty);
+  DeferStmt(SourceLocation DeferLoc, Stmt *Body);
+
+public:
+  static DeferStmt *CreateEmpty(ASTContext &Context, EmptyShell Empty);
+  static DeferStmt *Create(ASTContext &Context, SourceLocation DeferLoc,
+                           Stmt *Body);
+
+  SourceLocation getDeferLoc() const { return DeferStmtBits.DeferLoc; }
+  void setDeferLoc(SourceLocation DeferLoc) {
+    DeferStmtBits.DeferLoc = DeferLoc;
+  }
+
+  Stmt *getBody() { return Body; }
+  const Stmt *getBody() const { return Body; }
+  void setBody(Stmt *S) {
+    assert(S && "defer body must not be null");
+    Body = S;
+  }
+
+  SourceLocation getBeginLoc() const { return getDeferLoc(); }
+  SourceLocation getEndLoc() const { return Body->getEndLoc(); }
+
+  child_range children() { return child_range(&Body, &Body + 1); }
+
+  const_child_range children() const {
+    return const_child_range(&Body, &Body + 1);
+  }
+
+  static bool classof(const Stmt *S) {
+    return S->getStmtClass() == DeferStmtClass;
+  }
+};
+
 /// AsmStmt is the base class for GCCAsmStmt and MSAsmStmt.
 class AsmStmt : public Stmt {
 protected:
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index 9401377002223..442a90ec2472d 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -350,6 +350,8 @@ def err_address_of_label_outside_fn : Error<
   "use of address-of-label extension outside of a function body">;
 def err_asm_operand_wide_string_literal : Error<
   "cannot use %select{unicode|wide}0 string literal in 'asm'">;
+def err_defer_ts_labeled_stmt : Error<
+  "substatement of defer must not be a label">;
 def err_asm_expected_string : Error<
   "expected string literal %select{or parenthesized constant expression |}0in 'asm'">;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 127818ec5767b..5444071fa8261 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -6844,6 +6844,7 @@ def note_protected_by_objc_weak_init : Note<
   "jump bypasses initialization of __weak variable">;
 def note_protected_by_non_trivial_c_struct_init : Note<
   "jump bypasses initialization of variable of non-trivial C struct type">;
+def note_protected_by_defer_stmt : Note<"jump bypasses defer statement">;
 def note_enters_block_captures_cxx_obj : Note<
   "jump enters lifetime of block which captures a destructible C++ object">;
 def note_enters_block_captures_strong : Note<
@@ -6857,6 +6858,7 @@ def note_enters_compound_literal_scope : Note<
   "jump enters lifetime of a compound literal that is non-trivial to destruct">;
 def note_enters_statement_expression : Note<
   "jump enters a statement expression">;
+def note_enters_defer_stmt : Note<"jump enters a defer statement">;

 def note_exits_cleanup : Note<
   "jump exits scope of variable with __attribute__((cleanup))">;
@@ -6902,6 +6904,16 @@ def note_exits_block_captures_non_trivial_c_struct : Note<
   "to destroy">;
 def note_exits_compound_literal_scope : Note<
   "jump exits lifetime of a compound literal that is non-trivial to destruct">;
+def note_exits_defer_stmt : Note<"jump exits a defer statement">;
+def err_jump_out_of_defer_stmt : Error<
+  "cannot %enum_select<DeferJumpKind>{"
+  "%Break{break out of a}|"
+  "%Continue{continue loop outside of enclosing}|"
+  "%Return{return from a}|"
+  "%SEHLeave{__leave a}"
+  "}0 defer statement">;
+def err_defer_invalid_sjlj : Error<
+  "cannot use %0 inside a defer statement">;

 def err_func_returning_qualified_void : ExtWarn<
   "function cannot return qualified void type %0">,
@@ -11016,6 +11028,8 @@ def err_switch_explicit_conversion : Error<
 def err_switch_incomplete_class_type : Error<
   "switch condition has incomplete class type %0">;

+// TODO: It ought to be possible to refactor these to be a single warning that
+// uses %enum_select.
 def warn_empty_if_body : Warning<
   "if statement has empty body">, InGroup<EmptyBody>;
 def warn_empty_for_body : Warning<
@@ -11026,6 +11040,8 @@ def warn_empty_while_body : Warning<
   "while loop has empty body">, InGroup<EmptyBody>;
 def warn_empty_switch_body : Warning<
   "switch statement has empty body">, InGroup<EmptyBody>;
+def warn_empty_defer_body : Warning<
+  "defer statement has empty body">, InGroup<EmptyBody>;
 def note_empty_body_on_separate_line : Note<
   "put the semicolon on a separate line to silence this warning">;
diff --git a/clang/include/clang/Basic/IdentifierTable.h b/clang/include/clang/Basic/IdentifierTable.h
index b27492d19a65b..043c184323876 100644
--- a/clang/include/clang/Basic/IdentifierTable.h
+++ b/clang/include/clang/Basic/IdentifierTable.h
@@ -77,7 +77,8 @@ enum TokenKey : unsigned {
   KEYNOZOS = 0x4000000,
   KEYHLSL = 0x8000000,
   KEYFIXEDPOINT = 0x10000000,
-  KEYMAX = KEYFIXEDPOINT, // The maximum key
+  KEYDEFERTS = 0x20000000,
+  KEYMAX = KEYDEFERTS, // The maximum key
   KEYALLCXX = KEYCXX | KEYCXX11 | KEYCXX20,
   KEYALL = (KEYMAX | (KEYMAX - 1)) & ~KEYNOMS18 & ~KEYNOOPENCL & ~KEYNOZOS
   // KEYNOMS18, KEYNOOPENCL, KEYNOZOS are excluded.
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index bd3fb665b4a8d..2f0359c597613 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -194,6 +194,7 @@ LANGOPT(NoSignedZero      , 1, 0, Benign, "Permit Floating Point optimization wi
 LANGOPT(AllowRecip        , 1, 0, Benign, "Permit Floating Point reciprocal")
 LANGOPT(ApproxFunc        , 1, 0, Benign, "Permit Floating Point approximation")
 LANGOPT(NamedLoops        , 1, 0, Benign, "Permit named break/continue")
+LANGOPT(DeferTS           , 1, 0, Benign, "C '_Defer' Technical Specification")

 ENUM_LANGOPT(ComplexRange, ComplexRangeKind, 3, CX_None, NotCompatible,
              "Enable use of range reduction for complex arithmetics.")
@@ -515,6 +516,8 @@ LANGOPT(BoundsSafety, 1, 0, NotCompatible, "Bounds safety extension for C")

 LANGOPT(EnableLifetimeSafety, 1, 0, NotCompatible, "Experimental lifetime safety analysis for C++")

+LANGOPT(EnableLifetimeSafetyInference, 1, 0, NotCompatible, "Experimental lifetime safety inference analysis for C++")
+
 LANGOPT(PreserveVec3Type, 1, 0, NotCompatible, "Preserve 3-component vector type")

 #undef LANGOPT
diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td
index bf3686bb372d5..2d740425a3cb0 100644
--- a/clang/include/clang/Basic/StmtNodes.td
+++ b/clang/include/clang/Basic/StmtNodes.td
@@ -17,6 +17,7 @@ def ForStmt : StmtNode<Stmt>;
 def GotoStmt : StmtNode<Stmt>;
 def IndirectGotoStmt : StmtNode<Stmt>;
 def ReturnStmt : StmtNode<Stmt>;
+def DeferStmt : StmtNode<Stmt>;
 def DeclStmt : StmtNode<Stmt>;
 def SwitchCase : StmtNode<Stmt, 1>;
 def CaseStmt : StmtNode<SwitchCase>;
diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def
index 564d6010181cc..8240d395d3e8f 100644
--- a/clang/include/clang/Basic/TokenKinds.def
+++ b/clang/include/clang/Basic/TokenKinds.def
@@ -293,6 +293,7 @@ PUNCTUATOR(greatergreatergreater, ">>>")
 //   CHAR8SUPPORT - This is a keyword if 'char8_t' is a built-in type
 //   KEYFIXEDPOINT - This is a keyword according to the N1169 fixed point
 //                   extension.
+//   KEYDEFERTS - This is a keyword if the C '_Defer' TS is enabled
 //   KEYZOS - This is a keyword in C/C++ on z/OS
 //
 KEYWORD(auto                        , KEYALL)
@@ -441,6 +442,9 @@ KEYWORD(_Float16                    , KEYALL)
 C23_KEYWORD(typeof                  , KEYGNU)
 C23_KEYWORD(typeof_unqual           , 0)

+// '_Defer' TS
+KEYWORD(_Defer                      , KEYDEFERTS)
+
 // ISO/IEC JTC1 SC22 WG14 N1169 Extension
 KEYWORD(_Accum                      , KEYFIXEDPOINT)
 KEYWORD(_Fract                      , KEYFIXEDPOINT)
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index c4ad6a0c0732c..74e0860762ec6 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -1080,6 +1080,12 @@ def CIR_SwitchOp : CIR_Op<"switch", [
     conditionally executing multiple regions of code. The operand to a switch
     is an integral condition value.

+    Besides taking an integer condition and CIR regions, it also accepts an
+    `all_enum_cases_covered` attribute indicating whether all enum cases are
+    handled by the operation. Note that the presence of a default CaseOp does
+    not imply `all_enum_cases_covered`. The original AST switch must
+    explicitly list every enum case.
+
     The set of `cir.case` operations and their enclosing `cir.switch`
     represent the semantics of a C/C++ switch statement. Users can use
     `collectCases(llvm::SmallVector<CaseOp> &cases)` to collect the `cir.case`
@@ -1206,7 +1212,10 @@ def CIR_SwitchOp : CIR_Op<"switch", [
     ```
   }];

-  let arguments = (ins CIR_IntType:$condition);
+  let arguments = (ins
+    CIR_IntType:$condition,
+    UnitAttr:$allEnumCasesCovered
+  );

   let regions = (region AnyRegion:$body);

  let ...
  ];

   let assemblyFormat = [{
-    custom<SwitchOp>(
-      $body, $condition, type($condition)
-    )
+    `(` $condition `:` qualified(type($condition)) `)`
+    (`allEnumCasesCovered` $allEnumCasesCovered^)?
+    $body attr-dict
   }];
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 84ae211bed0a4..a6b2cee072413 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -1677,6 +1677,14 @@ defm named_loops
     PosFlag,
     NegFlag>;

+// C '_Defer' TS
+defm defer_ts : BoolFOption<"defer-ts",
+  LangOpts<"DeferTS">, DefaultFalse,
+  PosFlag,
+  NegFlag>,
+  ShouldParseIf;
+
 // C++ Coroutines
 defm coroutines : BoolFOption<"coroutines",
   LangOpts<"Coroutines">, Default,
@@ -1962,6 +1970,14 @@ defm lifetime_safety : BoolFOption<
     BothFlags<[], [CC1Option],
               " experimental lifetime safety for C++">>;

+defm lifetime_safety_inference
+    : BoolFOption<"experimental-lifetime-safety-inference",
+                  LangOpts<"EnableLifetimeSafetyInference">, DefaultFalse,
+                  PosFlag,
+                  NegFlag,
+                  BothFlags<[], [CC1Option],
+                            " experimental lifetime safety inference for C++">>;
+
 defm addrsig : BoolFOption<"addrsig",
   CodeGenOpts<"Addrsig">, DefaultFalse,
   PosFlag,
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 58eb1c0a7c114..47eedf216a44b 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -7500,6 +7500,16 @@ class Parser : public CodeCompletionHandler {

   StmtResult ParseBreakOrContinueStatement(bool IsContinue);

+  /// ParseDeferStatement
+  /// \verbatim
+  ///       defer-statement:
+  ///         '_Defer' deferred-block
+  ///
+  ///       deferred-block:
+  ///         unlabeled-statement
+  /// \endverbatim
+  StmtResult ParseDeferStatement(SourceLocation *TrailingElseLoc);
+
   StmtResult ParsePragmaLoopHint(StmtVector &Stmts, ParsedStmtContext StmtCtx,
                                  SourceLocation *TrailingElseLoc,
                                  ParsedAttributes &Attrs,
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index d14b5dc5ffaa4..97b6bb3d1b3a8 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -10935,6 +10935,10 @@ class Sema final : public SemaBase {
   /// Stack of active SEH __finally scopes. Can be empty.
   SmallVector<Scope *, 2> CurrentSEHFinally;

+  /// Stack of '_Defer' statements that are currently being parsed, as well
+  /// as the locations of their '_Defer' keywords. Can be empty.
+  SmallVector<std::pair<Scope *, SourceLocation>, 2> CurrentDefer;
+
   StmtResult ActOnExprStmt(ExprResult Arg, bool DiscardedValue = true);
   StmtResult ActOnExprStmtError();

@@ -11081,6 +11085,10 @@ class Sema final : public SemaBase {
   StmtResult ActOnBreakStmt(SourceLocation BreakLoc, Scope *CurScope,
                             LabelDecl *Label, SourceLocation LabelLoc);

+  void ActOnStartOfDeferStmt(SourceLocation DeferLoc, Scope *CurScope);
+  void ActOnDeferStmtError(Scope *CurScope);
+  StmtResult ActOnEndOfDeferStmt(Stmt *Body, Scope *CurScope);
+
   struct NamedReturnInfo {
     const VarDecl *Candidate;
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index 0f868c1c7c5da..f6e08d7a8a995 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -2061,6 +2061,7 @@ enum StmtCode {

   // HLSL Constructs
   EXPR_HLSL_OUT_ARG,
+  STMT_DEFER,
 };

 /// The kinds of designators that can occur in a
diff --git a/clang/lib/AST/Stmt.cpp b/clang/lib/AST/Stmt.cpp
index 11ece494490de..10aacd75a650a 100644
--- a/clang/lib/AST/Stmt.cpp
+++ b/clang/lib/AST/Stmt.cpp
@@ -1499,3 +1499,19 @@ const Stmt *LoopControlStmt::getNamedLoopOrSwitch() const {
     return nullptr;
   return getLabelDecl()->getStmt()->getInnermostLabeledStmt();
 }
+
+DeferStmt::DeferStmt(EmptyShell Empty) : Stmt(DeferStmtClass, Empty) {}
+DeferStmt::DeferStmt(SourceLocation DeferLoc, Stmt *Body)
+    : Stmt(DeferStmtClass) {
+  setDeferLoc(DeferLoc);
+  setBody(Body);
+}
+
+DeferStmt *DeferStmt::CreateEmpty(ASTContext &Context, EmptyShell Empty) {
+  return new (Context) DeferStmt(Empty);
+}
+
+DeferStmt *DeferStmt::Create(ASTContext &Context, SourceLocation DeferLoc,
+                             Stmt *Body) {
+  return new (Context) DeferStmt(DeferLoc, Body);
+}
diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp
index ff8ca01ec5477..9bc5ee0c7f40e 100644
--- a/clang/lib/AST/StmtPrinter.cpp
+++ b/clang/lib/AST/StmtPrinter.cpp
@@ -491,6 +491,11 @@ void StmtPrinter::VisitBreakStmt(BreakStmt *Node) {
   if (Policy.IncludeNewlines) OS << NL;
 }

+void StmtPrinter::VisitDeferStmt(DeferStmt *Node) {
+  Indent() << "_Defer";
+  PrintControlledStmt(Node->getBody());
+}
+
 void StmtPrinter::VisitReturnStmt(ReturnStmt *Node) {
   Indent() << "return";
   if (Node->getRetValue()) {
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index 4a8c638c85331..b6395a17547f7 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -323,6 +323,8 @@ void StmtProfiler::VisitReturnStmt(const ReturnStmt *S) {
   VisitStmt(S);
 }

+void StmtProfiler::VisitDeferStmt(const DeferStmt *S) { VisitStmt(S); }
+
 void StmtProfiler::VisitGCCAsmStmt(const GCCAsmStmt *S) {
   VisitStmt(S);
   ID.AddBoolean(S->isVolatile());
diff --git a/clang/lib/Analysis/LifetimeSafety/Checker.cpp b/clang/lib/Analysis/LifetimeSafety/Checker.cpp
index 74792768e2c57..99071d6b46c1e 100644
--- a/clang/lib/Analysis/LifetimeSafety/Checker.cpp
+++ b/clang/lib/Analysis/LifetimeSafety/Checker.cpp
@@ -55,13 +55,14 @@ class LifetimeChecker {
   const LiveOriginsAnalysis &LiveOrigins;
   const FactManager &FactMgr;
   LifetimeSafetyReporter *Reporter;
+  ASTContext &AST;

 public:
   LifetimeChecker(const LoanPropagationAnalysis &LoanPropagation,
                   const LiveOriginsAnalysis &LiveOrigins, const FactManager &FM,
                   AnalysisDeclContext &ADC, LifetimeSafetyReporter *Reporter)
       : LoanPropagation(LoanPropagation), LiveOrigins(LiveOrigins), FactMgr(FM),
-        Reporter(Reporter) {
+        Reporter(Reporter), AST(ADC.getASTContext()) {
     for (const CFGBlock *B : *ADC.getAnalysis<PostOrderCFGView>())
       for (const Fact *F : FactMgr.getFacts(B))
         if (const auto *EF = F->getAs<ExpireFact>())
@@ -70,6 +71,11 @@ class LifetimeChecker {
     checkAnnotations(OEF);
     issuePendingWarnings();
     suggestAnnotations();
+    // Annotation inference is currently guarded by a frontend flag. In the
+    // future, this might be replaced by a design that differentiates between
+    // explicit and inferred findings with separate warning groups.
+    if (AST.getLangOpts().EnableLifetimeSafetyInference)
+      inferAnnotations();
   }

   /// Checks if an escaping origin holds a placeholder loan, indicating a
@@ -160,6 +166,20 @@ class LifetimeChecker {
     for (const auto &[PVD, EscapeExpr] : AnnotationWarningsMap)
       Reporter->suggestAnnotation(PVD, EscapeExpr);
   }
+
+  void inferAnnotations() {
+    // FIXME: To maximise inference propagation, functions should be analyzed
+    // in post-order of the call graph, allowing inferred annotations to
+    // propagate through the call chain.
+    // FIXME: Add the inferred attribute to all redeclarations of the function,
+    // not just the definition being analyzed.
+    for (const auto &[ConstPVD, EscapeExpr] : AnnotationWarningsMap) {
+      ParmVarDecl *PVD = const_cast<ParmVarDecl *>(ConstPVD);
+      if (!PVD->hasAttr<LifetimeBoundAttr>())
+        PVD->addAttr(
+            LifetimeBoundAttr::CreateImplicit(AST, PVD->getLocation()));
+    }
+  }
 };

 } // namespace
diff --git a/clang/lib/Basic/IdentifierTable.cpp b/clang/lib/Basic/IdentifierTable.cpp
index d1c959b9687c4..9b4019834c4be 100644
--- a/clang/lib/Basic/IdentifierTable.cpp
+++ b/clang/lib/Basic/IdentifierTable.cpp
@@ -164,6 +164,8 @@ static KeywordStatus getKeywordStatusHelper(const LangOptions &LangOpts,
     return KS_Unknown;
   case KEYFIXEDPOINT:
     return LangOpts.FixedPoint ? KS_Enabled : KS_Disabled;
+  case KEYDEFERTS:
+    return LangOpts.DeferTS ? KS_Enabled : KS_Disabled;
   default:
     llvm_unreachable("Unknown KeywordStatus flag");
   }
diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h
index 8313826d88500..664c9e15d8d18 100644
--- a/clang/lib/Basic/Targets/PPC.h
+++ b/clang/lib/Basic/Targets/PPC.h
@@ -368,7 +368,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
   bool supportsCpuSupports() const override {
     llvm::Triple Triple = getTriple();
     // AIX 7.2 is the minimum requirement to support __builtin_cpu_supports().
-    return Triple.isOSGlibc() ||
+    return Triple.isOSGlibc() || Triple.isMusl() ||
            (Triple.isOSAIX() && !Triple.isOSVersionLT(MINIMUM_AIX_OS_MAJOR,
                                                       MINIMUM_AIX_OS_MINOR));
   }
@@ -376,7 +376,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
   bool supportsCpuIs() const override {
     llvm::Triple Triple = getTriple();
     // AIX 7.2 is the minimum requirement to support __builtin_cpu_is().
-    return Triple.isOSGlibc() ||
+    return Triple.isOSGlibc() || Triple.isMusl() ||
            (Triple.isOSAIX() && !Triple.isOSVersionLT(MINIMUM_AIX_OS_MAJOR,
                                                       MINIMUM_AIX_OS_MINOR));
   }
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
index f13e7cb32c71e..b7bd405bf4df4 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
@@ -1105,6 +1105,8 @@ mlir::LogicalResult CIRGenFunction::emitSwitchStmt(const clang::SwitchStmt &s) {
     terminateBody(builder, caseOp.getCaseRegion(), caseOp.getLoc());
   terminateBody(builder, swop.getBody(), swop.getLoc());

+  swop.setAllEnumCasesCovered(s.isAllEnumCasesCovered());
+
   return res;
 }
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 8077dc6597047..d888fdcf081e7 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -1359,44 +1359,6 @@ void cir::CaseOp::build(OpBuilder &builder, OperationState &result,
 // SwitchOp
 //===----------------------------------------------------------------------===//

-static ParseResult parseSwitchOp(OpAsmParser &parser, mlir::Region &regions,
-                                 mlir::OpAsmParser::UnresolvedOperand &cond,
-                                 mlir::Type &condType) {
-  cir::IntType intCondType;
-
-  if (parser.parseLParen())
-    return mlir::failure();
-
-  if (parser.parseOperand(cond))
-    return mlir::failure();
-  if (parser.parseColon())
-    return mlir::failure();
-  if (parser.parseCustomTypeWithFallback(intCondType))
-    return mlir::failure();
-  condType = intCondType;
-
-  if (parser.parseRParen())
-    return mlir::failure();
-  if (parser.parseRegion(regions, /*arguments=*/{}, /*argTypes=*/{}))
-    return failure();
-
-  return mlir::success();
-}
-
-static void printSwitchOp(OpAsmPrinter &p, cir::SwitchOp op,
-                          mlir::Region &bodyRegion, mlir::Value condition,
-                          mlir::Type condType) {
-  p << "(";
-  p << condition;
-  p << " : ";
-  p.printStrippedAttrOrType(condType);
-  p << ")";
-
-  p << ' ';
-  p.printRegion(bodyRegion, /*printEntryBlockArgs=*/false,
-                /*printBlockTerminators=*/true);
-}
-
 void cir::SwitchOp::getSuccessorRegions(
     mlir::RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &region) {
   if (!point.isParent()) {
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 5643c58ca52f4..6e30ace8c791c 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -1190,6 +1190,7 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef<const Attr *> Attrs) {
   case Stmt::ContinueStmtClass:
   case Stmt::DefaultStmtClass:
   case Stmt::CaseStmtClass:
+  case Stmt::DeferStmtClass:
   case Stmt::SEHLeaveStmtClass:
   case Stmt::SYCLKernelCallStmtClass:
     llvm_unreachable("should have emitted these statements as simple");
@@ -1616,6 +1617,9 @@ bool CodeGenFunction::EmitSimpleStmt(const Stmt *S,
   case Stmt::CaseStmtClass:
     EmitCaseStmt(cast<CaseStmt>(*S), Attrs);
     break;
+  case Stmt::DeferStmtClass:
+    EmitDeferStmt(cast<DeferStmt>(*S));
+    break;
   case Stmt::SEHLeaveStmtClass:
     EmitSEHLeaveStmt(cast<SEHLeaveStmt>(*S));
     break;
@@ -3264,6 +3268,87 @@ void CodeGenFunction::EmitDefaultStmt(const DefaultStmt &S,
   EmitStmt(S.getSubStmt());
 }

+namespace {
+struct EmitDeferredStatement final : EHScopeStack::Cleanup {
+  const DeferStmt &Stmt;
+  EmitDeferredStatement(const DeferStmt *Stmt) : Stmt(*Stmt) {}
+
+  void Emit(CodeGenFunction &CGF, Flags) override {
+    // Take care that any cleanups pushed by the body of a '_Defer' statement
+    // don't clobber the current cleanup slot value.
+    //
+    // Assume we have a scope that pushes a cleanup; when that scope is exited,
+    // we need to run that cleanup; this is accomplished by emitting the
+    // cleanup into a separate block and then branching to that block at scope
+    // exit.
+    //
+    // Where this gets complicated is if we exit the scope in multiple
+    // different ways; e.g. in a 'for' loop, we may exit the scope of its body
+    // by falling off the end (in which case we need to run the cleanup and
+    // then branch to the increment), or by 'break'ing out of the loop (in
+    // which case we need to run the cleanup and then branch to the loop exit
+    // block); in both cases we first branch to the cleanup block to run the
+    // cleanup, but the block we need to jump to *after* running the cleanup
+    // is different.
+    //
+    // This is accomplished using a local integer variable called the 'cleanup
+    // slot': before branching to the cleanup block, we store a value into
+    // that slot. Then, in the cleanup block, after running the cleanup, we
+    // load the value of that variable and 'switch' on it to branch to the
+    // appropriate continuation block.
+    //
+    // The problem that arises once '_Defer' statements are involved is that
+    // the body of a '_Defer' is an arbitrary statement which itself can
+    // create more cleanups. This means we may end up overwriting the cleanup
+    // slot before we ever have a chance to 'switch' on it, which means that
+    // once we *do* get to the 'switch', we end up in whatever block the
+    // cleanup code happened to pick as the default 'switch' exit label!
+    //
+    // That is, what is normally supposed to happen is something like:
+    //
+    //   1. Store 'X' to cleanup slot.
+    //   2. Branch to cleanup block.
+    //   3. Execute cleanup.
+    //   4. Read value from cleanup slot.
+    //   5. Branch to the block associated with 'X'.
+    //
+    // But if we encounter a '_Defer' statement that contains a cleanup, then
+    // what might instead happen is:
+    //
+    //   1. Store 'X' to cleanup slot.
+    //   2. Branch to cleanup block.
+    //   3. Execute cleanup; this ends up pushing another cleanup, so:
+    //      3a. Store 'Y' to cleanup slot.
+    //      3b. Run steps 2–5 recursively.
+    //   4. Read value from cleanup slot, which is now 'Y' instead of 'X'.
+    //   5. Branch to the block associated with 'Y'... which doesn't even
+    //      exist because the value 'Y' is only meaningful for the inner
+    //      cleanup. The result is we just branch 'somewhere random'.
+    //
+    // The rest of the cleanup code simply isn't prepared to handle this case
+    // because most other cleanups can't push more cleanups, and thus,
+    // emitting other cleanups generally cannot clobber the cleanup slot.
+    //
+    // To prevent this from happening, save the current cleanup slot value and
+    // restore it after emitting the '_Defer' statement.
+    llvm::Value *SavedCleanupDest = nullptr;
+    if (CGF.NormalCleanupDest.isValid())
+      SavedCleanupDest =
+          CGF.Builder.CreateLoad(CGF.NormalCleanupDest, "cleanup.dest.saved");
+
+    CGF.EmitStmt(Stmt.getBody());
+
+    if (SavedCleanupDest && CGF.HaveInsertPoint())
+      CGF.Builder.CreateStore(SavedCleanupDest, CGF.NormalCleanupDest);
+
+    // Cleanups must end with an insert point.
+    CGF.EnsureInsertPoint();
+  }
+};
+} // namespace
+
+void CodeGenFunction::EmitDeferStmt(const DeferStmt &S) {
+  EHStack.pushCleanup<EmitDeferredStatement>(NormalAndEHCleanup, &S);
+}
+
 /// CollectStatementsForCase - Given the body of a 'switch' statement and a
 /// constant value that is being switched on, see if we can dead code eliminate
 /// the body of the switch to a simple series of statements to emit. Basically,
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 5e028371eee6d..a8b9b0d109357 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3689,6 +3689,7 @@ class CodeGenFunction : public CodeGenTypeCache {
   void EmitDefaultStmt(const DefaultStmt &S, ArrayRef<const Attr *> Attrs);
   void EmitCaseStmt(const CaseStmt &S, ArrayRef<const Attr *> Attrs);
   void EmitCaseStmtRange(const CaseStmt &S, ArrayRef<const Attr *> Attrs);
+  void EmitDeferStmt(const DeferStmt &S);
   void EmitAsmStmt(const AsmStmt &S);
   const BreakContinue *GetDestForLoopControlStmt(const LoopControlStmt &S);
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index e55df74244234..22cf31b7d30c3 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -382,15 +382,11 @@ static void checkDataLayoutConsistency(const TargetInfo &Target,
   Check("bfloat", llvm::Type::getBFloatTy(Context), Target.BFloat16Align);
   Check("float", llvm::Type::getFloatingPointTy(Context, *Target.FloatFormat),
         Target.FloatAlign);
-  // FIXME: AIX specifies wrong double alignment in DataLayout
-  if (!Triple.isOSAIX()) {
-    Check("double",
-          llvm::Type::getFloatingPointTy(Context, *Target.DoubleFormat),
-          Target.DoubleAlign);
-    Check("long double",
-          llvm::Type::getFloatingPointTy(Context, *Target.LongDoubleFormat),
-          Target.LongDoubleAlign);
-  }
+  Check("double", llvm::Type::getFloatingPointTy(Context, *Target.DoubleFormat),
+        Target.DoubleAlign);
+  Check("long double",
+        llvm::Type::getFloatingPointTy(Context, *Target.LongDoubleFormat),
+        Target.LongDoubleAlign);
   if (Target.hasFloat128Type())
     Check("__float128", llvm::Type::getFP128Ty(Context), Target.Float128Align);
   if (Target.hasIbm128Type())
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 613438dcacee5..6b5f536f59d34 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7178,6 +7178,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
       types::isCXX(InputType))
     CmdArgs.push_back("-fcoro-aligned-allocation");

+  if (Args.hasFlag(options::OPT_fdefer_ts, options::OPT_fno_defer_ts,
+                   /*Default=*/false))
+    CmdArgs.push_back("-fdefer-ts");
+
   Args.AddLastArg(CmdArgs, options::OPT_fdouble_square_bracket_attributes,
                   options::OPT_fno_double_square_bracket_attributes);
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index a54f863754046..3b9475a646452 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -498,6 +498,11 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
     Builder.defineMacro("__STDC_EMBED_EMPTY__",
                         llvm::itostr(static_cast<int>(EmbedResult::Empty)));

+  // We define this to '1' here to indicate that we only support '_Defer'
+  // as a keyword.
+  if (LangOpts.DeferTS)
+    Builder.defineMacro("__STDC_DEFER_TS25755__", "1");
+
   if (LangOpts.ObjC)
     Builder.defineMacro("__OBJC__");
diff --git a/clang/lib/Headers/stddefer.h b/clang/lib/Headers/stddefer.h
new file mode 100644
index 0000000000000..162876ddfa395
--- /dev/null
+++ b/clang/lib/Headers/stddefer.h
@@ -0,0 +1,19 @@
+/*===---- stddefer.h - Standard header for 'defer' -------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_STDDEFER_H
+#define __CLANG_STDDEFER_H
+
+/* Provide 'defer' if '_Defer' is supported. */
+#ifdef __STDC_DEFER_TS25755__
+#define __STDC_VERSION_STDDEFER_H__ 202602L
+#define defer _Defer
+#endif
+
+#endif /* __CLANG_STDDEFER_H */
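A usage sketch for the new header (illustrative; the file name and messages are made up), relying on the `__STDC_DEFER_TS25755__` macro defined in InitPreprocessor above:

    #include <stddefer.h>
    #include <stdio.h>

    void write_log(void) {
    #ifdef __STDC_DEFER_TS25755__
      FILE *fp = fopen("log.txt", "w");
      if (!fp)
        return;
      defer fclose(fp); /* 'defer' expands to '_Defer' */
      fprintf(fp, "hello\n");
    #endif
    } /* fclose(fp) runs here, after fprintf */
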
diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp
index 7e73d89c2a18c..78ce4b76d29ae 100644
--- a/clang/lib/Parse/ParseStmt.cpp
+++ b/clang/lib/Parse/ParseStmt.cpp
@@ -28,6 +28,7 @@
 #include "clang/Sema/SemaOpenMP.h"
 #include "clang/Sema/TypoCorrection.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
 #include <optional>

 using namespace clang;
@@ -312,6 +313,8 @@ StmtResult Parser::ParseStatementOrDeclarationAfterAttributes(
     Res = ParseReturnStatement();
     SemiError = "co_return";
     break;
+  case tok::kw__Defer: // C defer TS: defer-statement
+    return ParseDeferStatement(TrailingElseLoc);

   case tok::kw_asm: {
     for (const ParsedAttr &AL : CXX11Attrs)
@@ -2370,6 +2373,29 @@ StmtResult Parser::ParseReturnStatement() {
   return Actions.ActOnReturnStmt(ReturnLoc, R.get(), getCurScope());
 }

+StmtResult Parser::ParseDeferStatement(SourceLocation *TrailingElseLoc) {
+  assert(Tok.is(tok::kw__Defer));
+  SourceLocation DeferLoc = ConsumeToken();
+
+  Actions.ActOnStartOfDeferStmt(DeferLoc, getCurScope());
+
+  auto OnError = llvm::make_scope_exit(
+      [&] { Actions.ActOnDeferStmtError(getCurScope()); });
+
+  StmtResult Res = ParseStatement(TrailingElseLoc);
+  if (!Res.isUsable())
+    return StmtError();
+
+  // The grammar specifically calls for an unlabeled-statement here.
+  if (auto *L = dyn_cast<LabelStmt>(Res.get())) {
+    Diag(L->getIdentLoc(), diag::err_defer_ts_labeled_stmt);
+    return StmtError();
+  }
+
+  OnError.release();
+  return Actions.ActOnEndOfDeferStmt(Res.get(), getCurScope());
+}
+
 StmtResult Parser::ParsePragmaLoopHint(StmtVector &Stmts,
                                        ParsedStmtContext StmtCtx,
                                        SourceLocation *TrailingElseLoc,
diff --git a/clang/lib/Sema/JumpDiagnostics.cpp b/clang/lib/Sema/JumpDiagnostics.cpp
index 2c6ae89513241..1dcbf7dd1eb90 100644
--- a/clang/lib/Sema/JumpDiagnostics.cpp
+++ b/clang/lib/Sema/JumpDiagnostics.cpp
@@ -595,6 +595,27 @@ void JumpScopeChecker::BuildScopeInformation(Stmt *S,
     break;
   }

+  case Stmt::DeferStmtClass: {
+    auto *D = cast<DeferStmt>(S);
+
+    {
+      // Disallow jumps over defer statements.
+      unsigned NewParentScope = Scopes.size();
+      Scopes.emplace_back(ParentScope, diag::note_protected_by_defer_stmt, 0,
+                          D->getDeferLoc());
+      origParentScope = NewParentScope;
+    }
+
+    // Disallow jumps into or out of defer statements.
+    {
+      unsigned NewParentScope = Scopes.size();
+      Scopes.emplace_back(ParentScope, diag::note_enters_defer_stmt,
+                          diag::note_exits_defer_stmt, D->getDeferLoc());
+      BuildScopeInformation(D->getBody(), NewParentScope);
+    }
+    return;
+  }
+
   case Stmt::CaseStmtClass:
   case Stmt::DefaultStmtClass:
   case Stmt::LabelStmtClass:
@@ -977,7 +998,7 @@ void JumpScopeChecker::CheckJump(Stmt *From, Stmt *To, SourceLocation DiagLoc,
   // Common case: exactly the same scope, which is fine.
   if (FromScope == ToScope) return;

-  // Warn on gotos out of __finally blocks.
+  // Warn on gotos out of __finally blocks and defer statements.
   if (isa<GotoStmt>(From) || isa<GCCAsmStmt>(From)) {
     // If FromScope > ToScope, FromScope is more nested and the jump goes to a
     // less nested scope. Check if it crosses a __finally along the way.
@@ -995,6 +1016,10 @@ void JumpScopeChecker::CheckJump(Stmt *From, Stmt *To, SourceLocation DiagLoc,
         S.Diag(From->getBeginLoc(), diag::err_goto_into_protected_scope);
         S.Diag(Scopes[I].Loc, diag::note_acc_branch_out_of_compute_construct);
         return;
+      } else if (Scopes[I].OutDiag == diag::note_exits_defer_stmt) {
+        S.Diag(From->getBeginLoc(), diag::err_goto_into_protected_scope);
+        S.Diag(Scopes[I].Loc, diag::note_exits_defer_stmt);
+        return;
       }
     }
   }
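A sketch of the jumps these new scopes reject (hypothetical code; the notes named in the comments are the diagnostics added above):

    #include <stdio.h>

    void f(int c) {
      if (c)
        goto out;       /* error: jump bypasses defer statement */
      _Defer puts("d"); /* note_protected_by_defer_stmt         */
      goto in;          /* error: jump enters a defer statement */
      _Defer { in:; }   /* note_enters_defer_stmt               */
    out:;
    }
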
diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp
index a0483c3027199..b5ff1dbd26d68 100644
--- a/clang/lib/Sema/SemaExceptionSpec.cpp
+++ b/clang/lib/Sema/SemaExceptionSpec.cpp
@@ -1538,6 +1538,7 @@ CanThrowResult Sema::canThrow(const Stmt *S) {
   case Stmt::SEHTryStmtClass:
   case Stmt::SwitchStmtClass:
   case Stmt::WhileStmtClass:
+  case Stmt::DeferStmtClass:
     return canSubStmtsThrow(*this, S);

   case Stmt::DeclStmtClass: {
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 39ffa69a4535a..70d4f4191267d 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -6877,6 +6877,34 @@ ExprResult Sema::BuildResolvedCallExpr(Expr *Fn, NamedDecl *NDecl,
   FunctionDecl *FDecl = dyn_cast_or_null<FunctionDecl>(NDecl);
   unsigned BuiltinID = (FDecl ? FDecl->getBuiltinID() : 0);

+  auto IsSJLJ = [&] {
+    switch (BuiltinID) {
+    case Builtin::BI__builtin_longjmp:
+    case Builtin::BI__builtin_setjmp:
+    case Builtin::BI__sigsetjmp:
+    case Builtin::BI_longjmp:
+    case Builtin::BI_setjmp:
+    case Builtin::BIlongjmp:
+    case Builtin::BIsetjmp:
+    case Builtin::BIsiglongjmp:
+    case Builtin::BIsigsetjmp:
+      return true;
+    default:
+      return false;
+    }
+  };
+
+  // Forbid any call to setjmp/longjmp and friends inside a '_Defer' statement.
+  if (!CurrentDefer.empty() && IsSJLJ()) {
+    // Note: If we ever start supporting '_Defer' in C++ we'll have to check
+    // for more than just blocks (e.g. lambdas, nested classes...).
+    Scope *DeferParent = CurrentDefer.back().first;
+    Scope *Block = CurScope->getBlockParent();
+    if (DeferParent->Contains(*CurScope) &&
+        (!Block || !DeferParent->Contains(*Block)))
+      Diag(Fn->getExprLoc(), diag::err_defer_invalid_sjlj) << FDecl;
+  }
+
   // Functions with 'interrupt' attribute cannot be called directly.
   if (FDecl) {
     if (FDecl->hasAttr<AnyX86InterruptAttr>()) {
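The check above rejects code along these lines (a sketch, assuming <setjmp.h>'s setjmp maps to one of the builtin IDs listed in IsSJLJ):

    #include <setjmp.h>

    static jmp_buf env;

    void f(void) {
      _Defer {
        if (setjmp(env)) /* error: cannot use 'setjmp' inside a defer statement */
          ;
      }
    }
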
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index 6bb1a27d1800c..1b1643250d05e 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -3267,12 +3267,23 @@ Sema::ActOnIndirectGotoStmt(SourceLocation GotoLoc, SourceLocation StarLoc,
   return new (Context) IndirectGotoStmt(GotoLoc, StarLoc, E);
 }

-static void CheckJumpOutOfSEHFinally(Sema &S, SourceLocation Loc,
-                                     const Scope &DestScope) {
+static void CheckJumpOutOfSEHFinallyOrDefer(Sema &S, SourceLocation Loc,
+                                            const Scope &DestScope,
+                                            unsigned DeferJumpKind) {
   if (!S.CurrentSEHFinally.empty() &&
       DestScope.Contains(*S.CurrentSEHFinally.back())) {
     S.Diag(Loc, diag::warn_jump_out_of_seh_finally);
   }
+
+  if (!S.CurrentDefer.empty()) {
+    Scope *Parent = S.CurrentDefer.back().first;
+    assert(Parent);
+
+    // Note: We don't create a new scope for defer statements, so 'Parent'
+    // is actually the scope that contains the '_Defer'.
+    if (DestScope.Contains(*Parent) || &DestScope == Parent)
+      S.Diag(Loc, diag::err_jump_out_of_defer_stmt) << DeferJumpKind;
+  }
 }

 static Scope *FindLabeledBreakContinueScope(Sema &S, Scope *CurScope,
@@ -3346,7 +3357,8 @@ StmtResult Sema::ActOnContinueStmt(SourceLocation ContinueLoc, Scope *CurScope,
         Diag(ContinueLoc, diag::err_acc_branch_in_out_compute_construct)
             << /*branch*/ 0 << /*out of */ 0);

-  CheckJumpOutOfSEHFinally(*this, ContinueLoc, *S);
+  CheckJumpOutOfSEHFinallyOrDefer(*this, ContinueLoc, *S,
+                                  diag::DeferJumpKind::Continue);

   return new (Context) ContinueStmt(ContinueLoc, LabelLoc, Target);
 }
@@ -3387,7 +3399,8 @@ StmtResult Sema::ActOnBreakStmt(SourceLocation BreakLoc, Scope *CurScope,
         Diag(BreakLoc, diag::err_acc_branch_in_out_compute_construct)
             << /*branch*/ 0 << /*out of */ 0);

-  CheckJumpOutOfSEHFinally(*this, BreakLoc, *S);
+  CheckJumpOutOfSEHFinallyOrDefer(*this, BreakLoc, *S,
+                                  diag::DeferJumpKind::Break);

   return new (Context) BreakStmt(BreakLoc, LabelLoc, Target);
 }
@@ -3932,11 +3945,30 @@ Sema::ActOnReturnStmt(SourceLocation ReturnLoc, Expr *RetValExp,

     CurScope->updateNRVOCandidate(VD);

-  CheckJumpOutOfSEHFinally(*this, ReturnLoc, *CurScope->getFnParent());
+  CheckJumpOutOfSEHFinallyOrDefer(*this, ReturnLoc, *CurScope->getFnParent(),
+                                  diag::DeferJumpKind::Return);

   return R;
 }

+void Sema::ActOnStartOfDeferStmt(SourceLocation DeferLoc, Scope *CurScope) {
+  CurrentDefer.emplace_back(CurScope, DeferLoc);
+}
+
+void Sema::ActOnDeferStmtError([[maybe_unused]] Scope *CurScope) {
+  assert(!CurrentDefer.empty() && CurrentDefer.back().first == CurScope);
+  CurrentDefer.pop_back();
+}
+
+StmtResult Sema::ActOnEndOfDeferStmt(Stmt *Body,
+                                     [[maybe_unused]] Scope *CurScope) {
+  assert(!CurrentDefer.empty() && CurrentDefer.back().first == CurScope);
+  SourceLocation DeferLoc = CurrentDefer.pop_back_val().second;
+
+  DiagnoseEmptyStmtBody(DeferLoc, Body, diag::warn_empty_defer_body);
+
+  setFunctionHasBranchProtectedScope();
+  return DeferStmt::Create(Context, DeferLoc, Body);
+}
+
 static bool CheckSimplerImplicitMovesMSVCWorkaround(const Sema &S,
                                                     const Expr *E) {
   if (!E || !S.getLangOpts().CPlusPlus23 || !S.getLangOpts().MSVCCompat)
@@ -4554,7 +4586,8 @@ Sema::ActOnSEHLeaveStmt(SourceLocation Loc, Scope *CurScope) {
     SEHTryParent = SEHTryParent->getParent();
   if (!SEHTryParent)
     return StmtError(Diag(Loc, diag::err_ms___leave_not_in___try));
-  CheckJumpOutOfSEHFinally(*this, Loc, *SEHTryParent);
+  CheckJumpOutOfSEHFinallyOrDefer(*this, Loc, *SEHTryParent,
+                                  diag::DeferJumpKind::SEHLeave);

   return new (Context) SEHLeaveStmt(Loc);
 }
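The control-flow restrictions enforced here reject, for example (a sketch; the messages come from the %enum_select cases of err_jump_out_of_defer_stmt):

    void f(void) {
      for (int i = 0; i < 10; i++) {
        _Defer {
          if (i == 0)
            continue; /* error: cannot continue loop outside of enclosing
                         defer statement */
          break;      /* error: cannot break out of a defer statement */
        }
        _Defer return; /* error: cannot return from a defer statement */
      }
    }
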
{ void ASTStmtReader::VisitBreakStmt(BreakStmt *S) { VisitLoopControlStmt(S); } +void ASTStmtReader::VisitDeferStmt(DeferStmt *S) { + VisitStmt(S); + S->setDeferLoc(readSourceLocation()); + S->setBody(Record.readSubStmt()); +} + void ASTStmtReader::VisitReturnStmt(ReturnStmt *S) { VisitStmt(S); @@ -3146,6 +3152,10 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { S = new (Context) BreakStmt(Empty); break; + case STMT_DEFER: + S = DeferStmt::CreateEmpty(Context, Empty); + break; + case STMT_RETURN: S = ReturnStmt::CreateEmpty( Context, /* HasNRVOCandidate=*/Record[ASTStmtReader::NumStmtFields]); diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index acf345392aa1a..a457e627799c9 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -330,6 +330,13 @@ void ASTStmtWriter::VisitBreakStmt(BreakStmt *S) { Code = serialization::STMT_BREAK; } +void ASTStmtWriter::VisitDeferStmt(DeferStmt *S) { + VisitStmt(S); + Record.AddSourceLocation(S->getDeferLoc()); + Record.AddStmt(S->getBody()); + Code = serialization::STMT_DEFER; +} + void ASTStmtWriter::VisitReturnStmt(ReturnStmt *S) { VisitStmt(S); diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index a759aee47b8ea..d3de632179e1d 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -1874,6 +1874,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, case Stmt::NullStmtClass: case Stmt::SwitchStmtClass: case Stmt::WhileStmtClass: + case Stmt::DeferStmtClass: case Expr::MSDependentExistsStmtClass: llvm_unreachable("Stmt should not be in analyzer evaluation loop"); case Stmt::ImplicitValueInitExprClass: diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp index 1d55f615de8a9..46b2cc1ac99c1 100644 --- a/clang/lib/Tooling/Tooling.cpp +++ b/clang/lib/Tooling/Tooling.cpp @@ -97,7 +97,7 @@ static bool ignoreExtraCC1Commands(const driver::Compilation *Compilation) { OffloadCompilation = true; if (Jobs.size() > 1) { - for (auto *A : Actions){ + for (auto *A : Actions) { // On MacOSX real actions may end up being wrapped in BindArchAction if (isa(A)) A = *A->input_begin(); @@ -414,8 +414,8 @@ bool ToolInvocation::run() { Driver->BuildCompilation(llvm::ArrayRef(Argv))); if (!Compilation) return false; - const llvm::opt::ArgStringList *const CC1Args = getCC1Arguments( - &*Diagnostics, Compilation.get()); + const llvm::opt::ArgStringList *const CC1Args = + getCC1Arguments(&*Diagnostics, Compilation.get()); if (!CC1Args) return false; std::unique_ptr Invocation( @@ -498,9 +498,7 @@ void ClangTool::appendArgumentsAdjuster(ArgumentsAdjuster Adjuster) { ArgsAdjuster = combineAdjusters(std::move(ArgsAdjuster), std::move(Adjuster)); } -void ClangTool::clearArgumentsAdjusters() { - ArgsAdjuster = nullptr; -} +void ClangTool::clearArgumentsAdjusters() { ArgsAdjuster = nullptr; } static void injectResourceDir(CommandLineArguments &Args, const char *Argv0, void *MainAddr) { @@ -555,8 +553,9 @@ int ClangTool::run(ToolAction *Action) { } size_t NumOfTotalFiles = AbsolutePaths.size(); - unsigned ProcessedFileCounter = 0; + unsigned CurrentFileIndex = 0; for (llvm::StringRef File : AbsolutePaths) { + ++CurrentFileIndex; // Currently implementations of CompilationDatabase::getCompileCommands can // change the state of the file system (e.g. 
prepare generated headers), so // this method needs to run right before we invoke the tool, as the next @@ -571,6 +570,7 @@ int ClangTool::run(ToolAction *Action) { FileSkipped = true; continue; } + unsigned CurrentCommandIndexForFile = 0; for (CompileCommand &CompileCommand : CompileCommandsForFile) { // If the 'directory' field of the compilation database is empty, display // an error and use the working directory instead. @@ -617,13 +617,20 @@ int ClangTool::run(ToolAction *Action) { // pass in made-up names here. Make sure this works on other platforms. injectResourceDir(CommandLine, "clang_tool", &StaticSymbol); + ++CurrentCommandIndexForFile; + // FIXME: We need a callback mechanism for the tool writer to output a // customized message for each file. - if (NumOfTotalFiles > 1) - llvm::errs() << "[" + std::to_string(++ProcessedFileCounter) + "/" + - std::to_string(NumOfTotalFiles) + - "] Processing file " + File - << ".\n"; + if (NumOfTotalFiles > 1 || CompileCommandsForFile.size() > 1) { + llvm::errs() << "[" << std::to_string(CurrentFileIndex) << "/" + << std::to_string(NumOfTotalFiles) << "]"; + if (CompileCommandsForFile.size() > 1) { + llvm::errs() << " (" << std::to_string(CurrentCommandIndexForFile) + << "/" << std::to_string(CompileCommandsForFile.size()) + << ")"; + } + llvm::errs() << " Processing file " << File << ".\n"; + } ToolInvocation Invocation(std::move(CommandLine), Action, Files.get(), PCHContainerOps); Invocation.setDiagnosticConsumer(DiagConsumer); diff --git a/clang/test/AST/ast-dump-defer-ts.c b/clang/test/AST/ast-dump-defer-ts.c new file mode 100644 index 0000000000000..eba057f93c9c2 --- /dev/null +++ b/clang/test/AST/ast-dump-defer-ts.c @@ -0,0 +1,27 @@ +// Test without serialization: +// RUN: %clang_cc1 -std=c23 -fdefer-ts -ast-dump %s -triple x86_64-linux-gnu \ +// RUN: | FileCheck %s +// +// Test with serialization: +// RUN: %clang_cc1 -std=c23 -fdefer-ts -triple x86_64-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -std=c23 -fdefer-ts -triple x86_64-linux-gnu -include-pch %t -ast-dump-all /dev/null \ +// RUN: | FileCheck %s + +static inline void f() { + _Defer 3; + _Defer { 4; } + _Defer _Defer if (true) {} +} + +// CHECK-LABEL: f 'void (void)' static inline +// CHECK-NEXT: `-CompoundStmt {{.*}} +// CHECK-NEXT: |-DeferStmt {{.*}} +// CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 3 +// CHECK-NEXT: |-DeferStmt {{.*}} +// CHECK-NEXT: | `-CompoundStmt {{.*}} +// CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 4 +// CHECK-NEXT: `-DeferStmt {{.*}} +// CHECK-NEXT: `-DeferStmt {{.*}} +// CHECK-NEXT: `-IfStmt {{.*}} +// CHECK-NEXT: |-CXXBoolLiteralExpr {{.*}} 'bool' true +// CHECK-NEXT: `-CompoundStmt {{.*}} diff --git a/clang/test/AST/ast-print-defer-ts.c b/clang/test/AST/ast-print-defer-ts.c new file mode 100644 index 0000000000000..bcc217a597778 --- /dev/null +++ b/clang/test/AST/ast-print-defer-ts.c @@ -0,0 +1,33 @@ +// RUN: %clang_cc1 -std=c23 -fdefer-ts -ast-print %s | FileCheck %s + +void g(); + +// CHECK: void f +void f() { + // CHECK-NEXT: _Defer + // CHECK-NEXT: g(); + // CHECK-NEXT: _Defer + // CHECK-NEXT: _Defer + // CHECK-NEXT: g(); + // CHECK-NEXT: _Defer { + // CHECK-NEXT: } + // CHECK-NEXT: _Defer { + // CHECK-NEXT: int x; + // CHECK-NEXT: } + // CHECK-NEXT: _Defer + // CHECK-NEXT: if (1) { + // CHECK-NEXT: } + _Defer + g(); + _Defer + _Defer + g(); + _Defer { + } + _Defer { + int x; + } + _Defer + if (1) { + } +} diff --git a/clang/test/CIR/CodeGen/atomic.c b/clang/test/CIR/CodeGen/atomic.c index 64e0961fe20d9..5fbbee0e88a15 100644 --- 
a/clang/test/CIR/CodeGen/atomic.c +++ b/clang/test/CIR/CodeGen/atomic.c @@ -1143,7 +1143,7 @@ int atomic_load_dynamic_order(int *ptr, int order) { // CIR: %[[PTR:.+]] = cir.load align(8) %{{.+}} : !cir.ptr>, !cir.ptr // CIR-NEXT: %[[ORDER:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !s32i - // CIR-NEXT: cir.switch (%[[ORDER]] : !s32i) { + // CIR-NEXT: cir.switch(%[[ORDER]] : !s32i) { // CIR-NEXT: cir.case(default, []) { // CIR-NEXT: %[[RES:.+]] = cir.load align(4) syncscope(system) atomic(relaxed) %[[PTR]] : !cir.ptr, !s32i // CIR-NEXT: cir.store align(4) %[[RES]], %[[RES_SLOT:.+]] : !s32i, !cir.ptr @@ -1219,7 +1219,7 @@ void atomic_store_dynamic_order(int *ptr, int order) { // CIR: %[[PTR:.+]] = cir.load align(8) %{{.+}} : !cir.ptr>, !cir.ptr // CIR-NEXT: %[[ORDER:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !s32i - // CIR: cir.switch (%[[ORDER]] : !s32i) { + // CIR: cir.switch(%[[ORDER]] : !s32i) { // CIR-NEXT: cir.case(default, []) { // CIR-NEXT: %[[VALUE:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !s32i // CIR-NEXT: cir.store align(4) atomic(relaxed) %[[VALUE]], %[[PTR]] : !s32i, !cir.ptr @@ -1288,7 +1288,7 @@ int atomic_load_and_store_dynamic_order(int *ptr, int order) { // CIR: %[[PTR:.+]] = cir.load align(8) %{{.+}} : !cir.ptr>, !cir.ptr // CIR-NEXT: %[[ORDER:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !s32i - // CIR: cir.switch (%[[ORDER]] : !s32i) { + // CIR: cir.switch(%[[ORDER]] : !s32i) { // CIR-NEXT: cir.case(default, []) { // CIR-NEXT: %[[LIT:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !s32i // CIR-NEXT: %[[RES:.+]] = cir.atomic.xchg relaxed %[[PTR]], %[[LIT]] : (!cir.ptr, !s32i) -> !s32i diff --git a/clang/test/CIR/CodeGen/switch.cpp b/clang/test/CIR/CodeGen/switch.cpp index 3824be0d08c2f..b7bd2da5e39b8 100644 --- a/clang/test/CIR/CodeGen/switch.cpp +++ b/clang/test/CIR/CodeGen/switch.cpp @@ -20,7 +20,7 @@ void sw1(int a) { } // CIR: cir.func{{.*}} @_Z3sw1i -// CIR: cir.switch (%[[COND:.*]] : !s32i) { +// CIR: cir.switch(%[[COND:.*]] : !s32i) { // CIR-NEXT: cir.case(equal, [#cir.int<0> : !s32i]) { // CIR: cir.break // CIR: cir.case(equal, [#cir.int<1> : !s32i]) { @@ -101,7 +101,7 @@ void sw2(int a) { // CIR: cir.scope { // CIR-NEXT: %[[YOLO:.*]] = cir.alloca !s32i, !cir.ptr, ["yolo", init] // CIR-NEXT: %[[FOMO:.*]] = cir.alloca !s32i, !cir.ptr, ["fomo", init] -// CIR: cir.switch (%[[COND:.*]] : !s32i) { +// CIR: cir.switch(%[[COND:.*]] : !s32i) { // CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) { // CIR-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i // CIR-NEXT: cir.store{{.*}} %[[ZERO]], %[[FOMO]] : !s32i, !cir.ptr @@ -154,7 +154,7 @@ void sw3(int a) { // CIR: cir.func{{.*}} @_Z3sw3i // CIR: cir.scope { // CIR-NEXT: %[[COND:.*]] = cir.load{{.*}} %[[A:.*]] : !cir.ptr, !s32i -// CIR-NEXT: cir.switch (%[[COND]] : !s32i) { +// CIR-NEXT: cir.switch(%[[COND]] : !s32i) { // CIR-NEXT: cir.case(default, []) { // CIR-NEXT: cir.break // CIR-NEXT: } @@ -196,7 +196,7 @@ int sw4(int a) { } // CIR: cir.func{{.*}} @_Z3sw4i -// CIR: cir.switch (%[[COND:.*]] : !s32i) { +// CIR: cir.switch(%[[COND:.*]] : !s32i) { // CIR-NEXT: cir.case(equal, [#cir.int<42> : !s32i]) { // CIR-NEXT: cir.scope { // CIR-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i @@ -264,7 +264,7 @@ void sw5(int a) { } // CIR: cir.func{{.*}} @_Z3sw5i -// CIR: cir.switch (%[[A:.*]] : !s32i) { +// CIR: cir.switch(%[[A:.*]] : !s32i) { // CIR-NEXT: cir.case(equal, [#cir.int<1> : !s32i]) { // CIR-NEXT: cir.yield // CIR-NEXT: } @@ -314,7 +314,7 @@ void sw6(int a) { } // CIR: cir.func{{.*}} @_Z3sw6i -// CIR: cir.switch 
(%[[A:.*]] : !s32i) { +// CIR: cir.switch(%[[A:.*]] : !s32i) { // CIR-NEXT: cir.case(equal, [#cir.int<0> : !s32i]) { // CIR-NEXT: cir.yield // CIR-NEXT: } @@ -406,7 +406,7 @@ void sw7(int a) { // CIR: cir.func{{.*}} @_Z3sw7i // CIR: %[[X:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["x"] -// CIR: cir.switch (%[[A:.*]] : !s32i) +// CIR: cir.switch(%[[A:.*]] : !s32i) // CIR-NEXT: cir.case(equal, [#cir.int<0> : !s32i]) { // CIR-NEXT: cir.yield // CIR-NEXT: } @@ -499,7 +499,7 @@ void sw8(int a) { } // CIR: cir.func{{.*}} @_Z3sw8i -// CIR: cir.switch (%[[A:.*]] : !s32i) +// CIR: cir.switch(%[[A:.*]] : !s32i) // CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) { // CIR-NEXT: cir.break // CIR-NEXT: } @@ -557,7 +557,7 @@ void sw9(int a) { } // CIR: cir.func{{.*}} @_Z3sw9i -// CIR: cir.switch (%[[A:.*]] : !s32i) +// CIR: cir.switch(%[[A:.*]] : !s32i) // CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) { // CIR-NEXT: cir.break // CIR-NEXT: } @@ -616,7 +616,7 @@ void sw10(int a) { } // CIR: cir.func{{.*}} @_Z4sw10i -// CIR: cir.switch (%[[A:.*]] : !s32i) +// CIR: cir.switch(%[[A:.*]] : !s32i) // CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) { // CIR-NEXT: cir.break // CIR-NEXT: } @@ -688,7 +688,7 @@ void sw11(int a) { } // CIR: cir.func{{.*}} @_Z4sw11i -// CIR: cir.switch (%[[A:.*]] : !s32i) +// CIR: cir.switch(%[[A:.*]] : !s32i) // CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) { // CIR-NEXT: cir.break // CIR-NEXT: } @@ -1063,7 +1063,7 @@ int nested_switch(int a) { return 0; } -// CIR: cir.switch (%[[COND:.*]] : !s32i) { +// CIR: cir.switch(%[[COND:.*]] : !s32i) { // CIR: cir.case(equal, [#cir.int<0> : !s32i]) { // CIR: cir.yield // CIR: } @@ -1198,7 +1198,7 @@ int sw_return_multi_cases(int x) { } // CIR-LABEL: cir.func{{.*}} @_Z21sw_return_multi_casesi -// CIR: cir.switch (%{{.*}} : !s32i) { +// CIR: cir.switch(%{{.*}} : !s32i) { // CIR-NEXT: cir.case(equal, [#cir.int<0> : !s32i]) { // CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i // CIR: cir.store{{.*}} %[[ZERO]], %{{.*}} : !s32i, !cir.ptr<!s32i> @@ -1270,3 +1270,25 @@ int sw_return_multi_cases(int x) { // OGCG: [[RETURN]]: // OGCG: %[[RETVAL_LOAD:.*]] = load i32, ptr %[[RETVAL]], align 4 // OGCG: ret i32 %[[RETVAL_LOAD]] + +enum M { + Six, + Seven +}; + +void testSwitchCoverAllCase(M m) { + switch (m) { + case Six:case Seven: + break; + } +} +// CIR: cir.switch(%[[ARG:.*]] : !s32i) allEnumCasesCovered { + +void testSwitchNotCoverAllCase(M m) { + switch (m) { + case Six: + default: + break; + } +} +// CIR: cir.switch(%[[ARG:.*]] : !s32i) { diff --git a/clang/test/CIR/CodeGen/switch_flat_op.cpp b/clang/test/CIR/CodeGen/switch_flat_op.cpp index a3ea7e7a15547..ba0a82da52c70 100644 --- a/clang/test/CIR/CodeGen/switch_flat_op.cpp +++ b/clang/test/CIR/CodeGen/switch_flat_op.cpp @@ -21,7 +21,7 @@ void swf(int a) { // BEFORE: cir.func{{.*}} @_Z3swfi // BEFORE: %[[VAR_B:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["b", init] {alignment = 4 : i64} // BEFORE: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i -// BEFORE: cir.switch (%[[COND:.*]] : !s32i) { +// BEFORE: cir.switch(%[[COND:.*]] : !s32i) { // BEFORE: cir.case(equal, [#cir.int<3> : !s32i]) { // BEFORE: %[[LOAD_B_EQ:.*]] = cir.load{{.*}} %[[VAR_B]] : !cir.ptr<!s32i>, !s32i // BEFORE: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i diff --git a/clang/test/CIR/IR/switch.cir b/clang/test/CIR/IR/switch.cir index 87d45bf1f5219..89614480e43cd 100644 --- a/clang/test/CIR/IR/switch.cir +++ b/clang/test/CIR/IR/switch.cir @@ -21,7 +21,7 @@ cir.func @s0() { cir.return } -// CHECK: cir.switch (%0 : !s32i) { +// CHECK: cir.switch(%0 : 
!s32i) { // CHECK-NEXT: cir.case(default, []) { // CHECK-NEXT: cir.return // CHECK-NEXT: } @@ -36,3 +36,33 @@ cir.func @s0() { // CHECK-NEXT: } // CHECK-NEXT: cir.yield // CHECK-NEXT: } + + +// Pretend that this was lowered from a C file and tagged with allEnumCasesCovered = true +cir.func @s1(%1 : !s32i) { + cir.switch (%1 : !s32i) allEnumCasesCovered { + cir.case (default, []) { + cir.return + } + cir.case (equal, [#cir.int<1> : !s32i]) { + cir.yield + } + cir.case (equal, [#cir.int<2> : !s32i]) { + cir.yield + } + cir.yield + } { } + cir.return +} +// CHECK: cir.switch(%[[ARG:.*]] : !s32i) allEnumCasesCovered { +// CHECK-NEXT: cir.case(default, []) { +// CHECK-NEXT: cir.return +// CHECK-NEXT: } +// CHECK-NEXT: cir.case(equal, [#cir.int<1> : !s32i]) { +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: cir.case(equal, [#cir.int<2> : !s32i]) { +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } diff --git a/clang/test/CIR/Transforms/switch-fold.cir b/clang/test/CIR/Transforms/switch-fold.cir index 62a94f4fde2c3..c348a05128671 100644 --- a/clang/test/CIR/Transforms/switch-fold.cir +++ b/clang/test/CIR/Transforms/switch-fold.cir @@ -27,7 +27,7 @@ module { cir.return } //CHECK: cir.func @foldCascade - //CHECK: cir.switch (%[[COND:.*]] : !s32i) { + //CHECK: cir.switch(%[[COND:.*]] : !s32i) { //CHECK-NEXT: cir.case(anyof, [#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i]) { //CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2> : !s32i //CHECK-NEXT: cir.store %[[TWO]], %[[ARG0:.*]] : !s32i, !cir.ptr<!s32i> @@ -66,7 +66,7 @@ module { cir.return } //CHECK: @foldCascade2 - //CHECK: cir.switch (%[[COND2:.*]] : !s32i) { + //CHECK: cir.switch(%[[COND2:.*]] : !s32i) { //CHECK: cir.case(anyof, [#cir.int<0> : !s32i, #cir.int<2> : !s32i, #cir.int<4> : !s32i]) { //CHECK: cir.break //CHECK: } @@ -106,7 +106,7 @@ module { cir.return } //CHECK: cir.func @foldCascade3 - //CHECK: cir.switch (%[[COND3:.*]] : !s32i) { + //CHECK: cir.switch(%[[COND3:.*]] : !s32i) { //CHECK: cir.case(anyof, [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i]) { //CHECK: cir.break //CHECK: } @@ -142,7 +142,7 @@ module { cir.return } //CHECK: cir.func @foldCascadeWithDefault - //CHECK: cir.switch (%[[COND:.*]] : !s32i) { + //CHECK: cir.switch(%[[COND:.*]] : !s32i) { //CHECK: cir.case(equal, [#cir.int<3> : !s32i]) { //CHECK: cir.break //CHECK: } @@ -187,7 +187,7 @@ module { cir.return } //CHECK: cir.func @foldAllCascade - //CHECK: cir.switch (%[[COND:.*]] : !s32i) { + //CHECK: cir.switch(%[[COND:.*]] : !s32i) { //CHECK: cir.case(anyof, [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i]) { //CHECK: cir.yield //CHECK: } diff --git a/clang/test/CodeGen/2007-05-07-PaddingElements.c b/clang/test/CodeGen/2007-05-07-PaddingElements.c index f8ec2483a8d61..28d24800abbe6 100644 --- a/clang/test/CodeGen/2007-05-07-PaddingElements.c +++ b/clang/test/CodeGen/2007-05-07-PaddingElements.c @@ -1,6 +1,9 @@ // PR 1278 -// RUN: %clang_cc1 %s -emit-llvm -o - | grep struct.s | not grep "4 x i8] zeroinitializer" -// RUN: %clang_cc1 %s -emit-llvm -o - | not grep "i32 0, i32 2" +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -triple powerpc64-ibm-aix %s -emit-llvm -o - | FileCheck %s --check-prefix=AIX + +// CHECK: %struct.s = type { double, i32 } +// AIX: %struct.s = type { double, i32, [4 x i8] }
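+// On AIX, the four bytes of tail padding after the i32 member are materialized as an explicit [4 x i8] array, hence the separate AIX check prefix.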
struct s { double d1; int s1; diff --git a/clang/test/CodeGen/defer-ts-musttail.c b/clang/test/CodeGen/defer-ts-musttail.c new file mode 100644 index 0000000000000..5622fecbb4fed --- /dev/null +++ b/clang/test/CodeGen/defer-ts-musttail.c @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux -std=c23 -fdefer-ts -emit-llvm %s -o /dev/null -verify + +int bar() { return 12; } +int foo() { + _Defer {}; + [[clang::musttail]] return bar(); // expected-error {{cannot compile this tail call skipping over cleanups yet}} +} diff --git a/clang/test/CodeGen/defer-ts-nested-cleanups.c b/clang/test/CodeGen/defer-ts-nested-cleanups.c new file mode 100644 index 0000000000000..d831b4380b929 --- /dev/null +++ b/clang/test/CodeGen/defer-ts-nested-cleanups.c @@ -0,0 +1,179 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux -std=c23 -fdefer-ts -emit-llvm %s -o - -O1 -disable-llvm-passes | FileCheck %s + +// Test that cleanups emitted in a '_Defer' don't clobber the cleanup slot; we +// test this using lifetime intrinsics, which are emitted starting at -O1. + +void g(); + +// CHECK-LABEL: define {{.*}} void @f1() +// CHECK: entry: +// CHECK-NEXT: %i = alloca i32, align 4 +// CHECK-NEXT: %cleanup.dest.slot = alloca i32, align 4 +// CHECK-NEXT: %j = alloca i32, align 4 +// CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr %i) +// CHECK-NEXT: store i32 0, ptr %i, align 4 +// CHECK-NEXT: br label %for.cond +// CHECK: for.cond: +// CHECK-NEXT: %0 = load i32, ptr %i, align 4 +// CHECK-NEXT: %cmp = icmp eq i32 %0, 1 +// CHECK-NEXT: br i1 %cmp, label %if.then, label %if.end +// CHECK: if.then: +// CHECK-NEXT: store i32 2, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: if.end: +// CHECK-NEXT: store i32 0, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: cleanup: +// CHECK-NEXT: %cleanup.dest.saved = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr %j) +// CHECK-NEXT: store i32 0, ptr %j, align 4 +// CHECK-NEXT: br label %for.cond1 +// CHECK: for.cond1: +// CHECK-NEXT: %1 = load i32, ptr %j, align 4 +// CHECK-NEXT: %cmp2 = icmp ne i32 %1, 1 +// CHECK-NEXT: br i1 %cmp2, label %for.body, label %for.cond.cleanup +// CHECK: for.cond.cleanup: +// CHECK-NEXT: store i32 5, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr %j) +// CHECK-NEXT: br label %for.end +// CHECK: for.body: +// CHECK-NEXT: call void @g() +// CHECK-NEXT: br label %for.inc +// CHECK: for.inc: +// CHECK-NEXT: %2 = load i32, ptr %j, align 4 +// CHECK-NEXT: %inc = add nsw i32 %2, 1 +// CHECK-NEXT: store i32 %inc, ptr %j, align 4 +// CHECK-NEXT: br label %for.cond1 +// CHECK: for.end: +// CHECK-NEXT: store i32 %cleanup.dest.saved, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: %cleanup.dest = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: switch i32 %cleanup.dest, label %cleanup6 [ +// CHECK-NEXT: i32 0, label %cleanup.cont +// CHECK-NEXT: ] +// CHECK: cleanup.cont: +// CHECK-NEXT: br label %for.inc4 +// CHECK: for.inc4: +// CHECK-NEXT: %3 = load i32, ptr %i, align 4 +// CHECK-NEXT: %inc5 = add nsw i32 %3, 1 +// CHECK-NEXT: store i32 %inc5, ptr %i, align 4 +// CHECK-NEXT: br label %for.cond +// CHECK: cleanup6: +// CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr %i) +// CHECK-NEXT: br label %for.end7 +// CHECK: for.end7: +// CHECK-NEXT: ret void +void f1() { + for (int i = 0;; i++) { + _Defer { + for (int j = 0; j != 1; j++) { + g(); + } + } + if (i == 1) break; + } +} + +// CHECK-LABEL: define {{.*}} 
void @f2() +// CHECK: entry: +// CHECK-NEXT: %i = alloca i32, align 4 +// CHECK-NEXT: %cleanup.dest.slot = alloca i32, align 4 +// CHECK-NEXT: %j = alloca i32, align 4 +// CHECK-NEXT: %k = alloca i32, align 4 +// CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr %i) +// CHECK-NEXT: store i32 0, ptr %i, align 4 +// CHECK-NEXT: br label %for.cond +// CHECK: for.cond: +// CHECK-NEXT: %0 = load i32, ptr %i, align 4 +// CHECK-NEXT: %cmp = icmp eq i32 %0, 1 +// CHECK-NEXT: br i1 %cmp, label %if.then, label %if.end +// CHECK: if.then: +// CHECK-NEXT: store i32 2, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: if.end: +// CHECK-NEXT: store i32 0, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: cleanup: +// CHECK-NEXT: %cleanup.dest.saved = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr %j) +// CHECK-NEXT: store i32 0, ptr %j, align 4 +// CHECK-NEXT: br label %for.cond1 +// CHECK: for.cond1: +// CHECK-NEXT: %1 = load i32, ptr %j, align 4 +// CHECK-NEXT: %cmp2 = icmp eq i32 %1, 1 +// CHECK-NEXT: br i1 %cmp2, label %if.then3, label %if.end4 +// CHECK: if.then3: +// CHECK-NEXT: store i32 5, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup5 +// CHECK: if.end4: +// CHECK-NEXT: store i32 0, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup5 +// CHECK: cleanup5: +// CHECK-NEXT: %cleanup.dest.saved6 = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr %k) +// CHECK-NEXT: store i32 0, ptr %k, align 4 +// CHECK-NEXT: br label %for.cond7 +// CHECK: for.cond7: +// CHECK-NEXT: %2 = load i32, ptr %k, align 4 +// CHECK-NEXT: %cmp8 = icmp ne i32 %2, 1 +// CHECK-NEXT: br i1 %cmp8, label %for.body, label %for.cond.cleanup +// CHECK: for.cond.cleanup: +// CHECK-NEXT: store i32 8, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr %k) +// CHECK-NEXT: br label %for.end +// CHECK: for.body: +// CHECK-NEXT: call void @g() +// CHECK-NEXT: br label %for.inc +// CHECK: for.inc: +// CHECK-NEXT: %3 = load i32, ptr %k, align 4 +// CHECK-NEXT: %inc = add nsw i32 %3, 1 +// CHECK-NEXT: store i32 %inc, ptr %k, align 4 +// CHECK-NEXT: br label %for.cond7 +// CHECK: for.end: +// CHECK-NEXT: store i32 %cleanup.dest.saved6, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: %cleanup.dest = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: switch i32 %cleanup.dest, label %cleanup12 [ +// CHECK-NEXT: i32 0, label %cleanup.cont +// CHECK-NEXT: ] +// CHECK: cleanup.cont: +// CHECK-NEXT: br label %for.inc10 +// CHECK: for.inc10: +// CHECK-NEXT: %4 = load i32, ptr %j, align 4 +// CHECK-NEXT: %inc11 = add nsw i32 %4, 1 +// CHECK-NEXT: store i32 %inc11, ptr %j, align 4 +// CHECK-NEXT: br label %for.cond1 +// CHECK: cleanup12: +// CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr %j) +// CHECK-NEXT: br label %for.end13 +// CHECK: for.end13: +// CHECK-NEXT: store i32 %cleanup.dest.saved, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: %cleanup.dest14 = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: switch i32 %cleanup.dest14, label %cleanup18 [ +// CHECK-NEXT: i32 0, label %cleanup.cont15 +// CHECK-NEXT: ] +// CHECK: cleanup.cont15: +// CHECK-NEXT: br label %for.inc16 +// CHECK: for.inc16: +// CHECK-NEXT: %5 = load i32, ptr %i, align 4 +// CHECK-NEXT: %inc17 = add nsw i32 %5, 1 +// CHECK-NEXT: store i32 %inc17, ptr %i, align 4 +// CHECK-NEXT: br label %for.cond +// CHECK: cleanup18: +// CHECK-NEXT: call void 
@llvm.lifetime.end.p0(ptr %i) +// CHECK-NEXT: br label %for.end19 +// CHECK: for.end19: +// CHECK-NEXT: ret void +void f2() { + for (int i = 0;; i++) { + _Defer { + for (int j = 0;; j++) { + _Defer { + for (int k = 0; k != 1; k++) { + g(); + } + } + if (j == 1) break; + } + } + if (i == 1) break; + } +} diff --git a/clang/test/CodeGen/defer-ts-seh.c b/clang/test/CodeGen/defer-ts-seh.c new file mode 100644 index 0000000000000..a91816f50d8d5 --- /dev/null +++ b/clang/test/CodeGen/defer-ts-seh.c @@ -0,0 +1,44 @@ +// RUN: %clang_cc1 -triple x86_64-windows-msvc -std=c23 -fdefer-ts -fms-compatibility -emit-llvm %s -o - | FileCheck %s + +void g(); +void h(); + +void f() { + __try { + _Defer h(); + g(); + } __finally { + + } +} + +// CHECK-LABEL: define {{.*}} void @f() {{.*}} personality ptr @__C_specific_handler +// CHECK: entry: +// CHECK: invoke void @g() #4 +// CHECK: to label %invoke.cont unwind label %ehcleanup +// CHECK: invoke.cont: +// CHECK: invoke void @h() #4 +// CHECK: to label %invoke.cont1 unwind label %ehcleanup3 +// CHECK: invoke.cont1: +// CHECK: %0 = call ptr @llvm.localaddress() +// CHECK: call void @"?fin$0@0@f@@"(i8 {{.*}} 0, ptr {{.*}} %0) +// CHECK: ret void +// CHECK: ehcleanup: +// CHECK: %1 = cleanuppad within none [] +// CHECK: invoke void @h() #4 [ "funclet"(token %1) ] +// CHECK: to label %invoke.cont2 unwind label %ehcleanup3 +// CHECK: invoke.cont2: +// CHECK: cleanupret from %1 unwind label %ehcleanup3 +// CHECK: ehcleanup3: +// CHECK: %2 = cleanuppad within none [] +// CHECK: %3 = call ptr @llvm.localaddress() +// CHECK: call void @"?fin$0@0@f@@"(i8 {{.*}} 1, ptr {{.*}} %3) [ "funclet"(token %2) ] +// CHECK: cleanupret from %2 unwind to caller + +// CHECK-LABEL: define {{.*}} void @"?fin$0@0@f@@"(i8 {{.*}} %abnormal_termination, ptr {{.*}} %frame_pointer) +// CHECK: entry: +// CHECK: %frame_pointer.addr = alloca ptr, align 8 +// CHECK: %abnormal_termination.addr = alloca i8, align 1 +// CHECK: store ptr %frame_pointer, ptr %frame_pointer.addr, align 8 +// CHECK: store i8 %abnormal_termination, ptr %abnormal_termination.addr, align 1 +// CHECK: ret void diff --git a/clang/test/CodeGen/defer-ts.c b/clang/test/CodeGen/defer-ts.c new file mode 100644 index 0000000000000..79b09064d330c --- /dev/null +++ b/clang/test/CodeGen/defer-ts.c @@ -0,0 +1,652 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux -std=c23 -fdefer-ts -emit-llvm %s -o - | FileCheck %s + +#define defer _Defer + +void a(); +void b(); +void c(); +void x(int q); +bool q(int q); +[[noreturn]] void noreturn(); + +// CHECK-LABEL: define {{.*}} void @f1() +void f1() { + // CHECK: call void @c() + // CHECK: call void @b() + // CHECK: call void @a() + defer a(); + defer b(); + defer c(); +} + +// CHECK-LABEL: define {{.*}} void @f2() +void f2() { + // CHECK: call void @x(i32 {{.*}} 1) + // CHECK: call void @x(i32 {{.*}} 2) + // CHECK: call void @x(i32 {{.*}} 3) + // CHECK: call void @x(i32 {{.*}} 4) + // CHECK: call void @x(i32 {{.*}} 5) + defer x(5); + { + defer x(4); + { + defer x(2); + defer x(1); + } + x(3); + } +} + +// CHECK-LABEL: define {{.*}} void @f3(i1 {{.*}} %ret) +void f3(bool ret) { + // CHECK: entry: + // CHECK: %ret.addr = alloca i8, align 1 + // CHECK: %cleanup.dest.slot = alloca i32, align 4 + // CHECK: %storedv = zext i1 %ret to i8 + // CHECK: store i8 %storedv, ptr %ret.addr, align 1 + // CHECK: %0 = load i8, ptr %ret.addr, align 1 + // CHECK: %loadedv = trunc i8 %0 to i1 + // CHECK: br i1 %loadedv, label %if.then, label %if.end + // CHECK: if.then: + // CHECK: store i32 1, ptr 
%cleanup.dest.slot, align 4 + // CHECK: br label %cleanup + // CHECK: if.end: + // CHECK: %cleanup.dest.saved = load i32, ptr %cleanup.dest.slot, align 4 + // CHECK: call void @x(i32 {{.*}} 1) + // CHECK: store i32 %cleanup.dest.saved, ptr %cleanup.dest.slot, align 4 + // CHECK: store i32 0, ptr %cleanup.dest.slot, align 4 + // CHECK: br label %cleanup + // CHECK: cleanup: + // CHECK: %cleanup.dest.saved1 = load i32, ptr %cleanup.dest.slot, align 4 + // CHECK: call void @x(i32 {{.*}} 2) + // CHECK: store i32 %cleanup.dest.saved1, ptr %cleanup.dest.slot, align 4 + // CHECK: %cleanup.dest = load i32, ptr %cleanup.dest.slot, align 4 + // CHECK: switch i32 %cleanup.dest, label %unreachable [ + // CHECK: i32 0, label %cleanup.cont + // CHECK: i32 1, label %cleanup.cont + // CHECK: ] + // CHECK: cleanup.cont: + // CHECK: ret void + // CHECK: unreachable: + // CHECK: unreachable + defer x(2); + if (ret) return; + defer x(1); +} + +// CHECK-LABEL: define {{.*}} void @ts_g() +void ts_g() { + // CHECK-NEXT: entry: + // CHECK-NEXT: ret void + // CHECK-NEXT: } + return; + defer x(42); +} + +// CHECK-LABEL: define {{.*}} void @ts_h() +void ts_h() { + // CHECK-NEXT: entry: + // CHECK-NEXT: br label %b + // CHECK-EMPTY: + goto b; + { + defer x(42); + } + + // CHECK-NEXT: b: + // CHECK-NEXT: ret void + // CHECK-NEXT: } + b: +} + +// CHECK-LABEL: define {{.*}} void @ts_i() +void ts_i() { + // CHECK: entry: + // CHECK: %cleanup.dest.slot = alloca i32, align 4 + // CHECK: store i32 2, ptr %cleanup.dest.slot, align 4 + // CHECK: %cleanup.dest.saved = load i32, ptr %cleanup.dest.slot, align 4 + // CHECK: call void @x(i32 {{.*}} 42) + // CHECK: store i32 %cleanup.dest.saved, ptr %cleanup.dest.slot, align 4 + // CHECK: %cleanup.dest = load i32, ptr %cleanup.dest.slot, align 4 + // CHECK: switch i32 %cleanup.dest, label %unreachable [ + // CHECK: i32 2, label %b + // CHECK: ] + // CHECK: b: + // CHECK: ret void + // CHECK: unreachable: + // CHECK: unreachable + { + defer { x(42); } + goto b; + } + b: +} + + +// CHECK-LABEL: define {{.*}} void @ts_m() +void ts_m() { + // CHECK: entry: + // CHECK: br label %b + // CHECK: b: + // CHECK: call void @x(i32 {{.*}} 1) + // CHECK: ret void + goto b; + { + b: + defer x(1); + } +} + +// CHECK-LABEL: define {{.*}} void @ts_p() +void ts_p() { + // CHECK: entry: + // CHECK: br label %b + // CHECK: b: + // CHECK: ret void + { + goto b; + defer x(42); + } + b: +} + +// CHECK-LABEL: define {{.*}} void @ts_r() +void ts_r() { + // CHECK: entry: + // CHECK: br label %b + // CHECK: b: + // CHECK: call void @x(i32 {{.*}} 42) + // CHECK: br label %b + { + b: + defer x(42); + } + goto b; +} + +// CHECK-LABEL: define {{.*}} i32 @return_value() +int return_value() { + // CHECK: entry: + // CHECK: %r = alloca i32, align 4 + // CHECK: %p = alloca ptr, align 8 + // CHECK: store i32 4, ptr %r, align 4 + // CHECK: store ptr %r, ptr %p, align 8 + // CHECK: %0 = load ptr, ptr %p, align 8 + // CHECK: %1 = load i32, ptr %0, align 4 + // CHECK: %2 = load ptr, ptr %p, align 8 + // CHECK: store i32 5, ptr %2, align 4 + // CHECK: ret i32 %1 + int r = 4; + int* p = &r; + defer { *p = 5; } + return *p; +} + +void* malloc(__SIZE_TYPE__ size); +void free(void* ptr); +int use_buffer(__SIZE_TYPE__ size, void* ptr); + +// CHECK-LABEL: define {{.*}} i32 @malloc_free_example() +int malloc_free_example() { + // CHECK: entry: + // CHECK: %size = alloca i32, align 4 + // CHECK: %buf = alloca ptr, align 8 + // CHECK: store i32 20, ptr %size, align 4 + // CHECK: %call = call ptr @malloc(i64 {{.*}} 20) + // CHECK: 
store ptr %call, ptr %buf, align 8 + // CHECK: %0 = load ptr, ptr %buf, align 8 + // CHECK: %call1 = call i32 @use_buffer(i64 {{.*}} 20, ptr {{.*}} %0) + // CHECK: %1 = load ptr, ptr %buf, align 8 + // CHECK: call void @free(ptr {{.*}} %1) + // CHECK: ret i32 %call1 + const int size = 20; + void* buf = malloc(size); + defer { free(buf); } + return use_buffer(size, buf); +} + +// CHECK-LABEL: define {{.*}} void @sequencing_1() +void sequencing_1() { + // CHECK: entry: + // CHECK: call void @x(i32 {{.*}} 1) + // CHECK: call void @x(i32 {{.*}} 2) + // CHECK: call void @x(i32 {{.*}} 3) + // CHECK: ret void + { + defer { + x(3); + } + if (true) + defer x(1); + x(2); + } +} + +// CHECK-LABEL: define {{.*}} void @sequencing_2() +void sequencing_2() { + // CHECK: entry: + // CHECK: %arr = alloca [3 x i32], align 4 + // CHECK: %i = alloca i32, align 4 + // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %arr, ptr align 4 @__const.sequencing_2.arr, i64 12, i1 false) + // CHECK: store i32 0, ptr %i, align 4 + // CHECK: br label %for.cond + // CHECK: for.cond: + // CHECK: %0 = load i32, ptr %i, align 4 + // CHECK: %cmp = icmp ult i32 %0, 3 + // CHECK: br i1 %cmp, label %for.body, label %for.end + // CHECK: for.body: + // CHECK: %1 = load i32, ptr %i, align 4 + // CHECK: %idxprom = zext i32 %1 to i64 + // CHECK: %arrayidx = getelementptr inbounds nuw [3 x i32], ptr %arr, i64 0, i64 %idxprom + // CHECK: %2 = load i32, ptr %arrayidx, align 4 + // CHECK: call void @x(i32 {{.*}} %2) + // CHECK: br label %for.inc + // CHECK: for.inc: + // CHECK: %3 = load i32, ptr %i, align 4 + // CHECK: %inc = add i32 %3, 1 + // CHECK: store i32 %inc, ptr %i, align 4 + // CHECK: br label %for.cond + // CHECK: for.end: + // CHECK: call void @x(i32 {{.*}} 4) + // CHECK: call void @x(i32 {{.*}} 5) + // CHECK: ret void + { + int arr[] = {1, 2, 3}; + defer { + x(5); + } + for (unsigned i = 0; i < 3; ++i) + defer x(arr[i]); + x(4); + } +} + +// CHECK-LABEL: define {{.*}} void @sequencing_3() +void sequencing_3() { + // CHECK: entry: + // CHECK: %r = alloca i32, align 4 + // CHECK: store i32 0, ptr %r, align 4 + // CHECK: %0 = load i32, ptr %r, align 4 + // CHECK: %add = add nsw i32 %0, 1 + // CHECK: store i32 %add, ptr %r, align 4 + // CHECK: %1 = load i32, ptr %r, align 4 + // CHECK: %mul = mul nsw i32 %1, 2 + // CHECK: store i32 %mul, ptr %r, align 4 + // CHECK: %2 = load i32, ptr %r, align 4 + // CHECK: %add1 = add nsw i32 %2, 3 + // CHECK: store i32 %add1, ptr %r, align 4 + // CHECK: %3 = load i32, ptr %r, align 4 + // CHECK: %mul2 = mul nsw i32 %3, 4 + // CHECK: store i32 %mul2, ptr %r, align 4 + // CHECK: ret void + int r = 0; + { + defer { + defer r *= 4; + r *= 2; + defer { + r += 3; + } + } + defer r += 1; + } +} + +// CHECK-LABEL: define {{.*}} void @defer_stmt(i32 {{.*}} %q) +void defer_stmt(int q) { + // CHECK: entry: + // CHECK: %q.addr = alloca i32, align 4 + // CHECK: store i32 %q, ptr %q.addr, align 4 + // CHECK: %0 = load i32, ptr %q.addr, align 4 + // CHECK: %cmp = icmp eq i32 %0, 3 + // CHECK: br i1 %cmp, label %if.then, label %if.end + // CHECK: if.then: + // CHECK: call void @x(i32 {{.*}} 42) + // CHECK: br label %if.end + // CHECK: if.end: + // CHECK: ret void + defer if (q == 3) x(42); +} + +// CHECK-LABEL: define {{.*}} void @defer_defer() +void defer_defer() { + // CHECK: entry: + // CHECK: call void @x(i32 {{.*}} 0) + // CHECK: call void @x(i32 {{.*}} 1) + // CHECK: call void @x(i32 {{.*}} 2) + // CHECK: call void @x(i32 {{.*}} 3) + // CHECK: call void @x(i32 {{.*}} 4) + // CHECK: ret void + defer 
x(4); + defer defer x(3); + defer defer defer x(2); + defer defer defer defer x(1); + x(0); +} + +// CHECK-LABEL: define {{.*}} i32 @vla(ptr {{.*}} %p, i32 {{.*}} %x) +int vla(int* p, int x) { + // CHECK: entry: + // CHECK: %retval = alloca i32, align 4 + // CHECK: %p.addr = alloca ptr, align 8 + // CHECK: %x.addr = alloca i32, align 4 + // CHECK: %cleanup.dest.slot = alloca i32, align 4 + // CHECK: %saved_stack = alloca ptr, align 8 + // CHECK: %__vla_expr0 = alloca i64, align 8 + // CHECK: %saved_stack3 = alloca ptr, align 8 + // CHECK: %__vla_expr1 = alloca i64, align 8 + // CHECK: store ptr %p, ptr %p.addr, align 8 + // CHECK: store i32 %x, ptr %x.addr, align 4 + // CHECK: %0 = load i32, ptr %x.addr, align 4 + // CHECK: %cmp = icmp slt i32 %0, 5 + // CHECK: br i1 %cmp, label %if.then, label %if.end + // CHECK: if.then: + // CHECK: store i32 10, ptr %retval, align 4 + // CHECK: store i32 1, ptr %cleanup.dest.slot, align 4 + // CHECK: br label %cleanup + // CHECK: if.end: + // CHECK: store i32 7, ptr %retval, align 4 + // CHECK: store i32 1, ptr %cleanup.dest.slot, align 4 + // CHECK: %cleanup.dest.saved = load i32, ptr %cleanup.dest.slot, align 4 + // CHECK: %1 = load i32, ptr %x.addr, align 4 + // CHECK: %2 = zext i32 %1 to i64 + // CHECK: %3 = call ptr @llvm.stacksave.p0() + // CHECK: store ptr %3, ptr %saved_stack, align 8 + // CHECK: %vla = alloca i32, i64 %2, align 16 + // CHECK: store i64 %2, ptr %__vla_expr0, align 8 + // CHECK: %arrayidx = getelementptr inbounds i32, ptr %vla, i64 2 + // CHECK: store i32 4, ptr %arrayidx, align 8 + // CHECK: %arrayidx1 = getelementptr inbounds i32, ptr %vla, i64 2 + // CHECK: %4 = load i32, ptr %arrayidx1, align 8 + // CHECK: %5 = load ptr, ptr %p.addr, align 8 + // CHECK: store i32 %4, ptr %5, align 4 + // CHECK: %6 = load ptr, ptr %saved_stack, align 8 + // CHECK: call void @llvm.stackrestore.p0(ptr %6) + // CHECK: store i32 %cleanup.dest.saved, ptr %cleanup.dest.slot, align 4 + // CHECK: br label %cleanup + // CHECK: cleanup: + // CHECK: %cleanup.dest.saved2 = load i32, ptr %cleanup.dest.slot, align 4 + // CHECK: %7 = load i32, ptr %x.addr, align 4 + // CHECK: %8 = zext i32 %7 to i64 + // CHECK: %9 = call ptr @llvm.stacksave.p0() + // CHECK: store ptr %9, ptr %saved_stack3, align 8 + // CHECK: %vla4 = alloca i32, i64 %8, align 16 + // CHECK: store i64 %8, ptr %__vla_expr1, align 8 + // CHECK: %arrayidx5 = getelementptr inbounds i32, ptr %vla4, i64 2 + // CHECK: store i32 3, ptr %arrayidx5, align 8 + // CHECK: %arrayidx6 = getelementptr inbounds i32, ptr %vla4, i64 2 + // CHECK: %10 = load i32, ptr %arrayidx6, align 8 + // CHECK: %11 = load ptr, ptr %p.addr, align 8 + // CHECK: store i32 %10, ptr %11, align 4 + // CHECK: %12 = load ptr, ptr %saved_stack3, align 8 + // CHECK: call void @llvm.stackrestore.p0(ptr %12) + // CHECK: store i32 %cleanup.dest.saved2, ptr %cleanup.dest.slot, align 4 + // CHECK: %13 = load i32, ptr %retval, align 4 + // CHECK: ret i32 %13 + defer { + int a[x]; + a[2] = 3; + *p = a[2]; + } + if (x < 5) { return 10; } + defer { + int b[x]; + b[2] = 4; + *p = b[2]; + } + return 7; +} + +[[noreturn]] void exit(); +[[noreturn]] void _Exit(); +[[noreturn]] void foobar(); + +// CHECK-LABEL: define {{.*}} i32 @call_exit() +int call_exit() { + // CHECK: entry: + // CHECK: call void @exit() + // CHECK: unreachable + defer x(1); + exit(); +} + +// CHECK-LABEL: define {{.*}} i32 @call__Exit() +int call__Exit() { + // CHECK: entry: + // CHECK: call void @_Exit() + // CHECK: unreachable + defer x(1); + _Exit(); +} + +// CHECK-LABEL: 
define {{.*}} i32 @call_foobar() +int call_foobar() { + // CHECK: entry: + // CHECK: call void @foobar() + // CHECK: unreachable + defer x(1); + foobar(); +} + +// CHECK-LABEL: define {{.*}} i32 @main() +int main() { + // CHECK: entry: + // CHECK: %retval = alloca i32, align 4 + // CHECK: store i32 0, ptr %retval, align 4 + // CHECK: store i32 5, ptr %retval, align 4 + // CHECK: call void @x(i32 {{.*}} 42) + // CHECK: %0 = load i32, ptr %retval, align 4 + // CHECK: ret i32 %0 + defer x(42); + return 5; +} + +// CHECK-LABEL: define {{.*}} void @t() +// CHECK: entry: +// CHECK-NEXT: %count = alloca i32, align 4 +// CHECK-NEXT: %cleanup.dest.slot = alloca i32, align 4 +// CHECK-NEXT: store i32 0, ptr %count, align 4 +// CHECK-NEXT: br label %target +// CHECK: target: +// CHECK-NEXT: %0 = load i32, ptr %count, align 4 +// CHECK-NEXT: %inc = add nsw i32 %0, 1 +// CHECK-NEXT: store i32 %inc, ptr %count, align 4 +// CHECK-NEXT: %1 = load i32, ptr %count, align 4 +// CHECK-NEXT: %cmp = icmp sle i32 %1, 2 +// CHECK-NEXT: br i1 %cmp, label %if.then, label %if.end +// CHECK: if.then: +// CHECK-NEXT: store i32 2, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: if.end: +// CHECK-NEXT: store i32 0, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: cleanup: +// CHECK-NEXT: %cleanup.dest.saved = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: call void @x(i32 {{.*}} 1) +// CHECK-NEXT: store i32 %cleanup.dest.saved, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: %cleanup.dest = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: switch i32 %cleanup.dest, label %unreachable [ +// CHECK-NEXT: i32 0, label %cleanup.cont +// CHECK-NEXT: i32 2, label %target +// CHECK-NEXT: ] +// CHECK: cleanup.cont: +// CHECK-NEXT: call void @x(i32 {{.*}} 2) +// CHECK-NEXT: ret void +// CHECK: unreachable: +// CHECK-NEXT: unreachable +void t() { + int count = 0; + + { + target: + _Defer { x(1); } + ++count; + if (count <= 2) { + goto target; + } + } + + x(2); +} + +// CHECK-LABEL: define {{.*}} void @stmt_expr() +// CHECK: entry: +// CHECK-NEXT: %tmp = alloca i32, align 4 +// CHECK-NEXT: call void @x(i32 {{.*}} 1) +// CHECK-NEXT: call void @x(i32 {{.*}} 2) +// CHECK-NEXT: call void @x(i32 {{.*}} 3) +// CHECK-NEXT: call void @x(i32 {{.*}} 4) +// CHECK-NEXT: store i32 6, ptr %tmp, align 4 +// CHECK-NEXT: call void @x(i32 {{.*}} 5) +// CHECK-NEXT: %0 = load i32, ptr %tmp, align 4 +// CHECK-NEXT: call void @x(i32 {{.*}} %0) +// CHECK-NEXT: ret void +void stmt_expr() { + ({ + _Defer x(4); + _Defer ({ + _Defer x(3); + x(2); + }); + x(1); + }); + + x(({ + _Defer x(5); + 6; + })); +} + +// CHECK-LABEL: define {{.*}} void @cleanup_no_insert_point() +// CHECK: entry: +// CHECK-NEXT: %cleanup.dest.slot = alloca i32, align 4 +// CHECK-NEXT: br label %while.cond +// CHECK: while.cond: +// CHECK-NEXT: %call = call {{.*}} i1 @q(i32 {{.*}} 1) +// CHECK-NEXT: br i1 %call, label %while.body, label %while.end +// CHECK: while.body: +// CHECK-NEXT: %call1 = call {{.*}} i1 @q(i32 {{.*}} 2) +// CHECK-NEXT: br i1 %call1, label %if.then, label %if.end +// CHECK: if.then: +// CHECK-NEXT: store i32 2, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: if.end: +// CHECK-NEXT: %call2 = call {{.*}} i1 @q(i32 {{.*}} 3) +// CHECK-NEXT: br i1 %call2, label %if.then3, label %if.end4 +// CHECK: if.then3: +// CHECK-NEXT: store i32 3, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: if.end4: +// CHECK-NEXT: store i32 0, ptr 
%cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: cleanup: +// CHECK-NEXT: %cleanup.dest.saved = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: call void @noreturn() +// CHECK-NEXT: unreachable +// CHECK: 0: +// CHECK-NEXT: %cleanup.dest = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: switch i32 %cleanup.dest, label %unreachable [ +// CHECK-NEXT: i32 0, label %cleanup.cont +// CHECK-NEXT: i32 2, label %while.cond +// CHECK-NEXT: i32 3, label %while.end +// CHECK-NEXT: ] +// CHECK: cleanup.cont: +// CHECK-NEXT: br label %while.cond +// CHECK: while.end: +// CHECK-NEXT: ret void +// CHECK: unreachable: +// CHECK-NEXT: unreachable +void cleanup_no_insert_point() { + while (q(1)) { + _Defer { + noreturn(); + }; + if (q(2)) continue; + if (q(3)) break; + } +} + +// CHECK-LABEL: define {{.*}} void @cleanup_nested() +// CHECK: entry: +// CHECK-NEXT: %cleanup.dest.slot = alloca i32, align 4 +// CHECK-NEXT: br label %while.cond +// CHECK: while.cond: +// CHECK-NEXT: %call = call {{.*}} i1 @q(i32 {{.*}} 1) +// CHECK-NEXT: br i1 %call, label %while.body, label %while.end19 +// CHECK: while.body: +// CHECK-NEXT: %call1 = call {{.*}} i1 @q(i32 {{.*}} 6) +// CHECK-NEXT: br i1 %call1, label %if.then, label %if.end +// CHECK: if.then: +// CHECK-NEXT: store i32 2, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: if.end: +// CHECK-NEXT: %call2 = call {{.*}} i1 @q(i32 {{.*}} 7) +// CHECK-NEXT: br i1 %call2, label %if.then3, label %if.end4 +// CHECK: if.then3: +// CHECK-NEXT: store i32 3, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: if.end4: +// CHECK-NEXT: store i32 0, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: cleanup: +// CHECK-NEXT: %cleanup.dest.saved = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %while.cond5 +// CHECK: while.cond5: +// CHECK-NEXT: %call6 = call {{.*}} i1 @q(i32 {{.*}} 2) +// CHECK-NEXT: br i1 %call6, label %while.body7, label %while.end +// CHECK: while.body7: +// CHECK-NEXT: %call8 = call {{.*}} i1 @q(i32 {{.*}} 4) +// CHECK-NEXT: br i1 %call8, label %if.then9, label %if.end10 +// CHECK: if.then9: +// CHECK-NEXT: store i32 4, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup14 +// CHECK: if.end10: +// CHECK-NEXT: %call11 = call {{.*}} i1 @q(i32 {{.*}} 5) +// CHECK-NEXT: br i1 %call11, label %if.then12, label %if.end13 +// CHECK: if.then12: +// CHECK-NEXT: store i32 5, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup14 +// CHECK: if.end13: +// CHECK-NEXT: store i32 0, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup14 +// CHECK: cleanup14: +// CHECK-NEXT: %cleanup.dest.saved15 = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: %call16 = call {{.*}} i1 @q(i32 {{.*}} 3) +// CHECK-NEXT: store i32 %cleanup.dest.saved15, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: %cleanup.dest = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: switch i32 %cleanup.dest, label %unreachable [ +// CHECK-NEXT: i32 0, label %cleanup.cont +// CHECK-NEXT: i32 4, label %while.cond5 +// CHECK-NEXT: i32 5, label %while.end +// CHECK-NEXT: ] +// CHECK: cleanup.cont: +// CHECK-NEXT: br label %while.cond5 +// CHECK: while.end: +// CHECK-NEXT: store i32 %cleanup.dest.saved, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: %cleanup.dest17 = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: switch i32 %cleanup.dest17, label %unreachable [ +// CHECK-NEXT: i32 0, label %cleanup.cont18 +// 
CHECK-NEXT: i32 2, label %while.cond +// CHECK-NEXT: i32 3, label %while.end19 +// CHECK-NEXT: ] +// CHECK: cleanup.cont18: +// CHECK-NEXT: br label %while.cond +// CHECK: while.end19: +// CHECK-NEXT: ret void +// CHECK: unreachable: +// CHECK-NEXT: unreachable +void cleanup_nested() { + while (q(1)) { + _Defer { + while (q(2)) { + _Defer { + q(3); + } + if (q(4)) continue; + if (q(5)) break; + } + }; + if (q(6)) continue; + if (q(7)) break; + } +} diff --git a/clang/test/Lexer/defer-keyword.cpp b/clang/test/Lexer/defer-keyword.cpp new file mode 100644 index 0000000000000..929f2c58f974a --- /dev/null +++ b/clang/test/Lexer/defer-keyword.cpp @@ -0,0 +1,5 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify -fdefer-ts %s + +// expected-no-diagnostics +int _Defer; diff --git a/clang/test/Parser/defer-ts.c b/clang/test/Parser/defer-ts.c new file mode 100644 index 0000000000000..118fe9ee3cc8f --- /dev/null +++ b/clang/test/Parser/defer-ts.c @@ -0,0 +1,58 @@ +// RUN: %clang_cc1 -std=c11 -fsyntax-only -fdefer-ts -verify %s +// RUN: %clang_cc1 -std=c23 -fsyntax-only -fdefer-ts -verify %s + +#define defer _Defer + +int g(void); +int h(int x); + +void f1(void) { + defer 1; // expected-warning {{expression result unused}} + defer 1 + 1; // expected-warning {{expression result unused}} + defer "a"; // expected-warning {{expression result unused}} + defer "a" "b" "c"; // expected-warning {{expression result unused}} + defer defer 1; // expected-warning {{expression result unused}} + defer defer defer defer 1; // expected-warning {{expression result unused}} + defer (int) 4; // expected-warning {{expression result unused}} + defer g(); + + defer {} + defer { defer {} } + defer { defer {} defer {} } + + defer if (g()) g(); + defer while (g()) g(); + defer for (int i = 0; i < 10; i++) h(i); + defer switch (g()) { case 1: g(); } + + defer; // expected-warning {{defer statement has empty body}} expected-note {{put the semicolon on a separate line}} + defer + ; + + defer a: g(); // expected-error {{substatement of defer must not be a label}} + defer b: {} // expected-error {{substatement of defer must not be a label}} + defer { c: g(); } + + if (g()) defer g(); + while (g()) defer g(); + defer ({}); + ({ defer g(); }); + + defer int x; // expected-error {{expected expression}} + defer void q() {} // expected-error {{expected expression}} +} + +void f2(void) { + [[some, attributes]] defer g(); // expected-warning 2 {{unknown attribute}} + __attribute__((some_attribute)) defer g(); // expected-warning {{unknown attribute}} + [[some, attributes]] defer { g(); } // expected-warning 2 {{unknown attribute}} + __attribute__((some_attribute)) defer { g(); } // expected-warning {{unknown attribute}} +} + +void f3(void) { + _Defer 1; // expected-warning {{expression result unused}} + _Defer {} + _Defer _Defer {} + _Defer { defer {} _Defer {} } + _Defer if (g()) g(); +} diff --git a/clang/test/Parser/defer-ts.cpp b/clang/test/Parser/defer-ts.cpp new file mode 100644 index 0000000000000..fa25cac8575f6 --- /dev/null +++ b/clang/test/Parser/defer-ts.cpp @@ -0,0 +1,5 @@ +// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fdefer-ts -verify %s + +void f() { + _Defer {} // expected-error {{use of undeclared identifier '_Defer'}} +} diff --git a/clang/test/Preprocessor/defer-ts.c b/clang/test/Preprocessor/defer-ts.c new file mode 100644 index 0000000000000..e4995ac9b23ea --- /dev/null +++ b/clang/test/Preprocessor/defer-ts.c @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -fsyntax-only -fdefer-ts 
-verify=enabled %s +// RUN: %clang_cc1 -fsyntax-only -verify=disabled %s +// RUN: %clang_cc1 -x c++ -fsyntax-only -fdefer-ts -verify=disabled %s +// RUN: %clang_cc1 -x c++ -fsyntax-only -verify=disabled %s +// enabled-no-diagnostics +#if __STDC_DEFER_TS25755__ != 1 +// disabled-error@+1 {{Should have defined __STDC_DEFER_TS25755__ to 1}} +# error Should have defined __STDC_DEFER_TS25755__ to 1 +#endif diff --git a/clang/test/Sema/defer-ts-seh.c b/clang/test/Sema/defer-ts-seh.c new file mode 100644 index 0000000000000..4b773ed3f09a0 --- /dev/null +++ b/clang/test/Sema/defer-ts-seh.c @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 -std=c23 -fdefer-ts -fms-compatibility -triple x86_64-windows-msvc -fsyntax-only -verify %s + +void f() { + __try { + _Defer { + __leave; // expected-error {{cannot __leave a defer statement}} + } + } __finally {} + + __try { + _Defer { + __try { + __leave; + } __finally {} + } + } __finally {} +} diff --git a/clang/test/Sema/defer-ts-sjlj.c b/clang/test/Sema/defer-ts-sjlj.c new file mode 100644 index 0000000000000..49230fa721e0f --- /dev/null +++ b/clang/test/Sema/defer-ts-sjlj.c @@ -0,0 +1,52 @@ +// RUN: %clang_cc1 -triple x86_64-windows-msvc -std=gnu23 -fdefer-ts -fsyntax-only -fblocks -verify %s + +typedef void** jmp_buf; +typedef void** sigjmp_buf; + +int setjmp(jmp_buf env); +int _setjmp(jmp_buf env); +int sigsetjmp(sigjmp_buf env, int savesigs); +int __sigsetjmp(sigjmp_buf env, int savesigs); +void longjmp(jmp_buf env, int val); +void _longjmp(jmp_buf env, int val); +void siglongjmp(sigjmp_buf env, int val); + +jmp_buf x; +sigjmp_buf y; +void f() { + _Defer { + __builtin_setjmp(x); // expected-error {{cannot use '__builtin_setjmp' inside a defer statement}} + __builtin_longjmp(x, 1); // expected-error {{cannot use '__builtin_longjmp' inside a defer statement}} + setjmp(x); // expected-error {{cannot use 'setjmp' inside a defer statement}} + _setjmp(x); // expected-error {{cannot use '_setjmp' inside a defer statement}} + sigsetjmp(y, 0); // expected-error {{cannot use 'sigsetjmp' inside a defer statement}} + __sigsetjmp(y, 0); // expected-error {{cannot use '__sigsetjmp' inside a defer statement}} + longjmp(x, 0); // expected-error {{cannot use 'longjmp' inside a defer statement}} + _longjmp(x, 0); // expected-error {{cannot use '_longjmp' inside a defer statement}} + siglongjmp(y, 0); // expected-error {{cannot use 'siglongjmp' inside a defer statement}} + + (void) ^{ + __builtin_setjmp(x); + __builtin_longjmp(x, 1); + setjmp(x); + _setjmp(x); + sigsetjmp(y, 0); + __sigsetjmp(y, 0); + longjmp(x, 0); + _longjmp(x, 0); + siglongjmp(y, 0); + + _Defer { + __builtin_setjmp(x); // expected-error {{cannot use '__builtin_setjmp' inside a defer statement}} + __builtin_longjmp(x, 1); // expected-error {{cannot use '__builtin_longjmp' inside a defer statement}} + setjmp(x); // expected-error {{cannot use 'setjmp' inside a defer statement}} + _setjmp(x); // expected-error {{cannot use '_setjmp' inside a defer statement}} + sigsetjmp(y, 0); // expected-error {{cannot use 'sigsetjmp' inside a defer statement}} + __sigsetjmp(y, 0); // expected-error {{cannot use '__sigsetjmp' inside a defer statement}} + longjmp(x, 0); // expected-error {{cannot use 'longjmp' inside a defer statement}} + _longjmp(x, 0); // expected-error {{cannot use '_longjmp' inside a defer statement}} + siglongjmp(y, 0); // expected-error {{cannot use 'siglongjmp' inside a defer statement}} + } + }; + } +} diff --git a/clang/test/Sema/defer-ts.c b/clang/test/Sema/defer-ts.c new file mode 100644 index 
0000000000000..95c68fa213eaa --- /dev/null +++ b/clang/test/Sema/defer-ts.c @@ -0,0 +1,172 @@ +// RUN: %clang_cc1 -std=c23 -fdefer-ts -fsyntax-only -verify %s + +#define defer _Defer + +void a(); + +void f1() { + defer { + goto l1; + l1: + } + + defer { + l2: + goto l2; + } +} + +void f2() { + goto l1; // expected-error {{cannot jump from this goto statement to its label}} + defer { // expected-note {{jump enters a defer statement}} + l1: + } + + goto l2; // expected-error {{cannot jump from this goto statement to its label}} + defer {} // expected-note {{jump bypasses defer statement}} + l2: +} + +void f3() { + x: + defer { // expected-note {{jump exits a defer statement}} + goto x; // expected-error {{cannot jump from this goto statement to its label}} + } +} + +void f4() { + defer { // expected-note {{jump exits a defer statement}} + goto y; // expected-error {{cannot jump from this goto statement to its label}} + } + y: +} + +void f5() { + defer { // expected-note {{jump enters a defer statement}} + l2: + } + goto l2; // expected-error {{cannot jump from this goto statement to its label}} +} + +void f6() { + goto b; // expected-error {{cannot jump from this goto statement to its label}} + { + defer {} // expected-note {{jump bypasses defer statement}} + b: + } + + { + defer {} // expected-note {{jump bypasses defer statement}} + b2: + } + goto b2; // expected-error {{cannot jump from this goto statement to its label}} +} + +void f7() { + defer { // expected-note {{jump bypasses defer statement}} + goto cross1; // expected-error {{cannot jump from this goto statement to its label}} + cross2: + } + defer { // expected-note {{jump exits a defer statement}} expected-note {{jump enters a defer statement}} + goto cross2; // expected-error {{cannot jump from this goto statement to its label}} + cross1: + } +} + +void f8() { + defer { + return; // expected-error {{cannot return from a defer statement}} + } + + { + defer { + return; // expected-error {{cannot return from a defer statement}} + } + } + + switch (1) { + case 1: defer { + break; // expected-error {{cannot break out of a defer statement}} + } + } + + for (;;) { + defer { + break; // expected-error {{cannot break out of a defer statement}} + } + } + + for (;;) { + defer { + continue; // expected-error {{cannot continue loop outside of enclosing defer statement}} + } + } + + switch (1) { + defer {} // expected-note {{jump bypasses defer statement}} + default: // expected-error {{cannot jump from switch statement to this case label}} + defer {} + break; + } + + switch (1) { + case 1: { + defer { // expected-note {{jump enters a defer statement}} + case 2: {} // expected-error {{cannot jump from switch statement to this case label}} + } + } + } + + switch (1) { + case 1: defer { + switch (2) { case 2: break; } + } + } + + for (;;) { + defer { for (;;) break; } + } + + for (;;) { + defer { for (;;) continue; } + } +} + +void f9() { + { + defer {} + goto l1; + } + l1: + + { + goto l2; + defer {} + } + l2: + + { + { defer {} } + goto l3; + } + l3: + + { + defer {} + { goto l4; } + } + l4: +} + +void f10(int i) { + switch (i) { + defer case 12: break; // expected-error {{cannot break out of a defer statement}} \ + expected-error {{cannot jump from switch statement to this case label}} \ + expected-note {{jump enters a defer statement}} \ + expected-note {{jump bypasses defer statement}} + + defer default: break; // expected-error {{cannot break out of a defer statement}} \ + expected-error {{cannot jump from switch statement to this case 
label}} \ + expected-note {{jump enters a defer statement}} + } +} diff --git a/clang/test/Sema/warn-lifetime-safety-suggestions.cpp b/clang/test/Sema/warn-lifetime-safety-suggestions.cpp index c0f675a301d14..9f3ccb7fca770 100644 --- a/clang/test/Sema/warn-lifetime-safety-suggestions.cpp +++ b/clang/test/Sema/warn-lifetime-safety-suggestions.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -fexperimental-lifetime-safety -Wexperimental-lifetime-safety-suggestions -verify %s +// RUN: %clang_cc1 -fsyntax-only -fexperimental-lifetime-safety -fexperimental-lifetime-safety-inference -Wexperimental-lifetime-safety-suggestions -Wexperimental-lifetime-safety -verify %s struct MyObj { int id; @@ -89,6 +89,98 @@ void test_getView_on_temporary() { (void)sv; } +//===----------------------------------------------------------------------===// +// Annotation Inference Test Cases +//===----------------------------------------------------------------------===// + +namespace correct_order_inference { +View return_view_by_func (View a) { // expected-warning {{param should be marked [[clang::lifetimebound]]}}. + return return_view_directly(a); // expected-note {{param returned here}} +} + +MyObj* return_pointer_by_func (MyObj* a) { // expected-warning {{param should be marked [[clang::lifetimebound]]}}. + return return_pointer_object(a); // expected-note {{param returned here}} +} +} // namespace correct_order_inference + +namespace incorrect_order_inference_view { +View return_view_callee(View a); + +// FIXME: No lifetime annotation suggestion when functions are not present in the callee-before-caller pattern +View return_view_caller(View a) { + return return_view_callee(a); +} + +View return_view_callee(View a) { // expected-warning {{param should be marked [[clang::lifetimebound]]}}. + return a; // expected-note {{param returned here}} +} +} // namespace incorrect_order_inference_view + +namespace incorrect_order_inference_object { +MyObj* return_object_callee(MyObj* a); + +// FIXME: No lifetime annotation suggestion warning when functions are not present in the callee-before-caller pattern +MyObj* return_object_caller(MyObj* a) { + return return_object_callee(a); +} + +MyObj* return_object_callee(MyObj* a) { // expected-warning {{param should be marked [[clang::lifetimebound]]}}. + return a; // expected-note {{param returned here}} +} +} // namespace incorrect_order_inference_object + +namespace simple_annotation_inference { +View inference_callee_return_identity(View a) { // expected-warning {{param should be marked [[clang::lifetimebound]]}}. + return a; // expected-note {{param returned here}} +} + +View inference_caller_forwards_callee(View a) { // expected-warning {{param should be marked [[clang::lifetimebound]]}}. + return inference_callee_return_identity(a); // expected-note {{param returned here}} +} + +View inference_top_level_return_stack_view() { + MyObj local_stack; + return inference_caller_forwards_callee(local_stack); // expected-warning {{address of stack memory is returned later}} + // expected-note@-1 {{returned here}} +} +} // namespace simple_annotation_inference + +namespace inference_in_order_with_redecls { +View inference_callee_return_identity(View a); +View inference_callee_return_identity(View a) { // expected-warning {{param should be marked [[clang::lifetimebound]]}}. 
+ return a; // expected-note {{param returned here}} +} + +View inference_caller_forwards_callee(View a); +View inference_caller_forwards_callee(View a) { // expected-warning {{param should be marked [[clang::lifetimebound]]}}. + return inference_callee_return_identity(a); // expected-note {{param returned here}} +} + +View inference_top_level_return_stack_view() { + MyObj local_stack; + return inference_caller_forwards_callee(local_stack); // expected-warning {{address of stack memory is returned later}} + // expected-note@-1 {{returned here}} +} +} // namespace inference_in_order_with_redecls + +namespace inference_with_templates { +template <typename T> +T* template_identity(T* a) { // expected-warning {{param should be marked [[clang::lifetimebound]]}}. + return a; // expected-note {{param returned here}} +} + +template <typename T> +T* template_caller(T* a) { + return template_identity(a); // expected-note {{in instantiation of function template specialization 'inference_with_templates::template_identity<MyObj>' requested here}} +} + +// FIXME: Fails to detect UAR as template instantiations are deferred to the end of the Translation Unit. +MyObj* test_template_inference_with_stack() { + MyObj local_stack; + return template_caller(&local_stack); // expected-note {{in instantiation of function template specialization 'inference_with_templates::template_caller<MyObj>' requested here}} +} +} // namespace inference_with_templates + //===----------------------------------------------------------------------===// // Negative Test Cases //===----------------------------------------------------------------------===// diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp index 0a43d73063c1f..c49ca567049c7 100644 --- a/clang/tools/libclang/CXCursor.cpp +++ b/clang/tools/libclang/CXCursor.cpp @@ -224,6 +224,11 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent, K = CXCursor_ReturnStmt; break; + // Not exposed for now because '_Defer' is currently just a TS. 
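+ // Until then, clients observe defer statements as generic CXCursor_UnexposedStmt cursors.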
+ case Stmt::DeferStmtClass: + K = CXCursor_UnexposedStmt; + break; + case Stmt::GCCAsmStmtClass: K = CXCursor_GCCAsmStmt; break; diff --git a/clang/unittests/Tooling/ToolingTest.cpp b/clang/unittests/Tooling/ToolingTest.cpp index 25e1d67eb2294..9a7559405c43c 100644 --- a/clang/unittests/Tooling/ToolingTest.cpp +++ b/clang/unittests/Tooling/ToolingTest.cpp @@ -20,8 +20,10 @@ #include "clang/Testing/CommandLineArgs.h" #include "clang/Tooling/ArgumentsAdjusters.h" #include "clang/Tooling/CompilationDatabase.h" +#include "clang/Tooling/JSONCompilationDatabase.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/JSON.h" #include "llvm/Support/Path.h" #include "llvm/Support/TargetSelect.h" #include "llvm/TargetParser/Host.h" @@ -1034,5 +1036,136 @@ TEST(runToolOnCode, TestResetDiagnostics) { "void func() { long x; Foo f(x); }")); } +namespace { +struct TestCommand { + llvm::StringRef File; + llvm::StringRef Command; +}; + +std::string runToolWithProgress(llvm::ArrayRef<TestCommand> Commands, + llvm::StringRef BaseDir) { + std::string ErrorMessage; + + llvm::json::Array Entries; + for (const auto &Cmd : Commands) { + Entries.push_back(llvm::json::Object{ + {"directory", BaseDir}, {"command", Cmd.Command}, {"file", Cmd.File}}); + } + std::string DatabaseContent; + llvm::raw_string_ostream OS(DatabaseContent); + OS << llvm::json::Value(std::move(Entries)); + + std::unique_ptr<CompilationDatabase> Database( + JSONCompilationDatabase::loadFromBuffer(DatabaseContent, ErrorMessage, + JSONCommandLineSyntax::Gnu)); + if (!Database) { + ADD_FAILURE() << "Failed to load compilation database: " << ErrorMessage; + return ""; + } + + std::vector<std::string> AbsoluteFiles; + for (const auto &Cmd : Commands) { + SmallString<32> NativeFile(BaseDir); + llvm::sys::path::append(NativeFile, Cmd.File); + llvm::sys::path::native(NativeFile); + std::string AbsPath = std::string(NativeFile); + if (AbsoluteFiles.empty() || AbsoluteFiles.back() != AbsPath) { + AbsoluteFiles.push_back(AbsPath); + } + } + + ClangTool Tool(*Database, AbsoluteFiles); + for (const auto &F : AbsoluteFiles) { + Tool.mapVirtualFile(F, "int x;"); + } + + testing::internal::CaptureStderr(); + Tool.run(newFrontendActionFactory<SyntaxOnlyAction>().get()); + return testing::internal::GetCapturedStderr(); +} +} // namespace + +TEST(ClangToolTest, ProgressReportSingleFile) { + SmallString<32> BaseDir; + llvm::sys::path::system_temp_directory(false, BaseDir); + llvm::sys::path::native(BaseDir, llvm::sys::path::Style::posix); + + EXPECT_TRUE( + runToolWithProgress({{"test.cpp", "clang++ -c test.cpp"}}, BaseDir) + .empty()); +} + +TEST(ClangToolTest, ProgressReportMultipleFiles) { + SmallString<32> BaseDir; + llvm::sys::path::system_temp_directory(false, BaseDir); + llvm::sys::path::native(BaseDir, llvm::sys::path::Style::posix); + + std::string Output = + runToolWithProgress({{"test1.cpp", "clang++ -c test1.cpp"}, + {"test2.cpp", "clang++ -c test2.cpp"}}, + BaseDir); + + SmallString<32> NativeFile1(BaseDir); + llvm::sys::path::append(NativeFile1, "test1.cpp"); + llvm::sys::path::native(NativeFile1); + SmallString<32> NativeFile2(BaseDir); + llvm::sys::path::append(NativeFile2, "test2.cpp"); + llvm::sys::path::native(NativeFile2); + + std::string Expected = "[1/2] Processing file " + std::string(NativeFile1) + + ".\n" + "[2/2] Processing file " + + std::string(NativeFile2) + ".\n"; + EXPECT_EQ(Output, Expected); +} + +TEST(ClangToolTest, ProgressReportMultipleCommands) { + SmallString<32> BaseDir; + llvm::sys::path::system_temp_directory(false, BaseDir); + 
llvm::sys::path::native(BaseDir, llvm::sys::path::Style::posix); + + std::string Output = + runToolWithProgress({{"test.cpp", "clang++ -c test.cpp -DCMD1"}, + {"test.cpp", "clang++ -c test.cpp -DCMD2"}}, + BaseDir); + + SmallString<32> NativeFile(BaseDir); + llvm::sys::path::append(NativeFile, "test.cpp"); + llvm::sys::path::native(NativeFile); + std::string Expected = + "[1/1] (1/2) Processing file " + std::string(NativeFile) + ".\n" + + "[1/1] (2/2) Processing file " + std::string(NativeFile) + ".\n"; + EXPECT_EQ(Output, Expected); +} + +TEST(ClangToolTest, ProgressReportMixed) { + SmallString<32> BaseDir; + llvm::sys::path::system_temp_directory(false, BaseDir); + llvm::sys::path::native(BaseDir, llvm::sys::path::Style::posix); + + std::string Output = + runToolWithProgress({{"test1.cpp", "clang++ -c test1.cpp"}, + {"test2.cpp", "clang++ -c test2.cpp -DCMD1"}, + {"test2.cpp", "clang++ -c test2.cpp -DCMD2"}, + {"test3.cpp", "clang++ -c test3.cpp"}}, + BaseDir); + + SmallString<32> NativeFile1(BaseDir); + llvm::sys::path::append(NativeFile1, "test1.cpp"); + llvm::sys::path::native(NativeFile1); + SmallString<32> NativeFile2(BaseDir); + llvm::sys::path::append(NativeFile2, "test2.cpp"); + llvm::sys::path::native(NativeFile2); + SmallString<32> NativeFile3(BaseDir); + llvm::sys::path::append(NativeFile3, "test3.cpp"); + llvm::sys::path::native(NativeFile3); + + std::string Expected = + "[1/3] Processing file " + std::string(NativeFile1) + ".\n" + + "[2/3] (1/2) Processing file " + std::string(NativeFile2) + ".\n" + + "[2/3] (2/2) Processing file " + std::string(NativeFile2) + ".\n" + + "[3/3] Processing file " + std::string(NativeFile3) + ".\n"; + EXPECT_EQ(Output, Expected); +} + } // end namespace tooling } // end namespace clang diff --git a/compiler-rt/lib/orc/CMakeLists.txt b/compiler-rt/lib/orc/CMakeLists.txt index b8d1b03b788c9..649d988d9d608 100644 --- a/compiler-rt/lib/orc/CMakeLists.txt +++ b/compiler-rt/lib/orc/CMakeLists.txt @@ -119,6 +119,7 @@ else() # not Apple elfnix_tls.x86-64.S elfnix_tls.aarch64.S elfnix_tls.ppc64.S + elfnix_tls.systemz.S sysv_reenter.arm64.S sysv_reenter.x86-64.S ) diff --git a/compiler-rt/lib/orc/elfnix_tls.systemz.S b/compiler-rt/lib/orc/elfnix_tls.systemz.S new file mode 100644 index 0000000000000..4e116c92a5a88 --- /dev/null +++ b/compiler-rt/lib/orc/elfnix_tls.systemz.S @@ -0,0 +1,42 @@ +//===-- elfnix_tls.systemz.S --------------------------------------*- ASM -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of the ORC runtime support library. +// +//===----------------------------------------------------------------------===// + +// The special thing about the s390 TLS ABI is that we do not have the +// standard __tls_get_addr function but rather the __tls_get_offset +// function, which differs in two important aspects: +// 1) __tls_get_offset gets a GOT offset instead of a pointer to the +// tls_index structure +// 2) __tls_get_offset returns the offset of the requested variable +// relative to the thread descriptor instead of a pointer to the variable. 

// The content of this file is systemz-only

#if defined(__s390x__) + + .text + // returns offset of TLV from TP in %r2. 
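+ // Note: on s390x the 64-bit thread pointer lives in access registers %a0 (high half) and %a1 (low half); the ear/sllg/ear sequence below reassembles it so the address returned by the helper can be converted into a TP-relative offset.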
+  .globl ___orc_rt_elfnix_tls_get_offset
+___orc_rt_elfnix_tls_get_offset:
+  stmg %r14, %r15, 112(%r15)
+  aghi %r15, -160
+  // Pass pointer to tls_index.
+  la %r2, 0(%r2, %r12)
+  brasl %r14, __orc_rt_elfnix_tls_get_addr_impl
+  // Return offset from TP.
+  ear %r0, %a0
+  sllg %r0, %r0, 32
+  ear %r0, %a1
+  sgr %r2, %r0
+  lmg %r14, %r15, 272(%r15)
+  br %r14
+
+#endif // defined(__s390x__)
diff --git a/compiler-rt/test/builtins/Unit/ppc/fixtfti_test.c b/compiler-rt/test/builtins/Unit/ppc/fixtfti_test.c
index 8b86d7879f7e7..f0c9fcd30c07e 100644
--- a/compiler-rt/test/builtins/Unit/ppc/fixtfti_test.c
+++ b/compiler-rt/test/builtins/Unit/ppc/fixtfti_test.c
@@ -1,3 +1,4 @@
+// XFAIL: *
 // REQUIRES: target-is-powerpc64le
 // RUN: %clang_builtins %s %librt -o %t && %run %t
diff --git a/compiler-rt/test/builtins/Unit/ppc/fixunstfti_test.c b/compiler-rt/test/builtins/Unit/ppc/fixunstfti_test.c
index 0eee31db1b5dd..fe706986a345d 100644
--- a/compiler-rt/test/builtins/Unit/ppc/fixunstfti_test.c
+++ b/compiler-rt/test/builtins/Unit/ppc/fixunstfti_test.c
@@ -1,3 +1,4 @@
+// XFAIL: *
 // REQUIRES: target-is-powerpc64le
 // RUN: %clang_builtins %s %librt -o %t && %run %t
diff --git a/compiler-rt/test/orc/TestCases/Linux/systemz/trivial-tls.S b/compiler-rt/test/orc/TestCases/Linux/systemz/trivial-tls.S
new file mode 100644
index 0000000000000..4bf1c578bd1d7
--- /dev/null
+++ b/compiler-rt/test/orc/TestCases/Linux/systemz/trivial-tls.S
@@ -0,0 +1,67 @@
+// RUN: %clang -c -o %t %s
+// RUN: %llvm_jitlink %t
+//
+// Test that basic ELF TLS works by adding together TLS variables with values
+// 0, 1, and -1, and returning the result (0 for success). This setup
+// tests both zero-initialized (.tbss) and non-zero-initialized
+// (.tdata) sections.
+
+  .section .data.rel.ro,"aw",@progbits
+  .p2align 3, 0x0
+.LCPI0_0:
+  .quad x@TLSGD
+.LCPI0_1:
+  .quad y@TLSGD
+.LCPI0_2:
+  .quad z@TLSGD
+
+  .text
+  .globl main
+  .p2align 4
+  .type main,@function
+main:
+  stmg %r10, %r15, 80(%r15)
+  aghi %r15, -160
+  lgrl %r2, .LCPI0_0
+  larl %r12, _GLOBAL_OFFSET_TABLE_
+  brasl %r14, __tls_get_offset@PLT:tls_gdcall:x
+  lgr %r13, %r2
+  lgrl %r2, .LCPI0_1
+  brasl %r14, __tls_get_offset@PLT:tls_gdcall:y
+  ear %r0, %a0
+  sllg %r11, %r0, 32
+  ear %r11, %a1
+  l %r10, 0(%r2,%r11)
+  lgrl %r2, .LCPI0_2
+  a %r10, 0(%r13,%r11)
+  brasl %r14, __tls_get_offset@PLT:tls_gdcall:z
+  a %r10, 0(%r2,%r11)
+  lgfr %r2, %r10
+  lmg %r10, %r15, 240(%r15)
+  br %r14
+.Lfunc_end0:
+  .size main, .Lfunc_end0-main
+
+
+  .type x,@object                 # @x
+  .section .tbss,"awT",@nobits
+  .globl x
+  .p2align 2, 0x0
+x:
+  .long 0                         # 0x0
+  .size x, 4
+
+  .type y,@object                 # @y
+  .section .tdata,"awT",@progbits
+  .globl y
+  .p2align 2, 0x0
+y:
+  .long 1                         # 0x1
+  .size y, 4
+
+  .type z,@object                 # @z
+  .globl z
+  .p2align 2, 0x0
+z:
+  .long 4294967295                # 0xffffffff
+  .size z, 4
diff --git a/flang-rt/lib/cuda/allocatable.cpp b/flang-rt/lib/cuda/allocatable.cpp
index ff1a225d66ce9..662703dfb6321 100644
--- a/flang-rt/lib/cuda/allocatable.cpp
+++ b/flang-rt/lib/cuda/allocatable.cpp
@@ -57,26 +57,34 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream,
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
     const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
-    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+    const Descriptor *errMsg, const char *sourceFile, int sourceLine,
+    bool sourceIsDevice) {
   int stat{RTNAME(CUFAllocatableAllocate)(
       alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
    Terminator
terminator{sourceFile, sourceLine}; - Fortran::runtime::DoFromSourceAssign( - alloc, source, terminator, &MemmoveHostToDevice); + Fortran::runtime::DoFromSourceAssign(alloc, source, terminator, + sourceIsDevice ? &MemmoveDeviceToHost : &MemmoveHostToDevice); } return stat; } int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc, const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat, - const Descriptor *errMsg, const char *sourceFile, int sourceLine) { - int stat{RTNAME(CUFAllocatableAllocateSync)( - alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)}; + const Descriptor *errMsg, const char *sourceFile, int sourceLine, + bool sourceIsDevice) { + int stat; + if (sourceIsDevice) { + stat = RTNAME(CUFAllocatableAllocate)( + alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine); + } else { + stat = RTNAME(CUFAllocatableAllocateSync)( + alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine); + } if (stat == StatOk) { Terminator terminator{sourceFile, sourceLine}; - Fortran::runtime::DoFromSourceAssign( - alloc, source, terminator, &MemmoveHostToDevice); + Fortran::runtime::DoFromSourceAssign(alloc, source, terminator, + sourceIsDevice ? &MemmoveDeviceToHost : &MemmoveHostToDevice); } return stat; } diff --git a/flang-rt/lib/cuda/pointer.cpp b/flang-rt/lib/cuda/pointer.cpp index d3f5cfe8e96a1..f07b1a9b60924 100644 --- a/flang-rt/lib/cuda/pointer.cpp +++ b/flang-rt/lib/cuda/pointer.cpp @@ -56,26 +56,28 @@ int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t *stream, int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer, const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat, - const Descriptor *errMsg, const char *sourceFile, int sourceLine) { + const Descriptor *errMsg, const char *sourceFile, int sourceLine, + bool sourceIsDevice) { int stat{RTNAME(CUFPointerAllocate)( pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)}; if (stat == StatOk) { Terminator terminator{sourceFile, sourceLine}; - Fortran::runtime::DoFromSourceAssign( - pointer, source, terminator, &MemmoveHostToDevice); + Fortran::runtime::DoFromSourceAssign(pointer, source, terminator, + sourceIsDevice ? &MemmoveDeviceToHost : &MemmoveHostToDevice); } return stat; } int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer, const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat, - const Descriptor *errMsg, const char *sourceFile, int sourceLine) { + const Descriptor *errMsg, const char *sourceFile, int sourceLine, + bool sourceIsDevice) { int stat{RTNAME(CUFPointerAllocateSync)( pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)}; if (stat == StatOk) { Terminator terminator{sourceFile, sourceLine}; - Fortran::runtime::DoFromSourceAssign( - pointer, source, terminator, &MemmoveHostToDevice); + Fortran::runtime::DoFromSourceAssign(pointer, source, terminator, + sourceIsDevice ? 
&MemmoveDeviceToHost : &MemmoveHostToDevice); } return stat; } diff --git a/flang/include/flang/Optimizer/Analysis/TBAAForest.h b/flang/include/flang/Optimizer/Analysis/TBAAForest.h index b4932594114a1..0b70778eba3af 100644 --- a/flang/include/flang/Optimizer/Analysis/TBAAForest.h +++ b/flang/include/flang/Optimizer/Analysis/TBAAForest.h @@ -99,11 +99,25 @@ struct TBAATree { // |- "any data access" // | // |- "dummy arg data" - // |- "target data" - // | - // |- "allocated data" - // |- "direct data" - // |- "global data" + // | + // |- + // |- + // |- "target data" <-- Any POINTER variable or TARGET dummy arg + // | + // |- <--- any TARGET variable which isn't a dummy arg + // |- + // |- "allocated data" + // | + // |- + // |- + // |- "direct data" + // | + // |- + // |- + // |- "global data" + // | + // |- + // |- static TBAATree buildTree(mlir::StringAttr functionName); private: diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td index 636879f28a2fb..34ac21c51b933 100644 --- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td +++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td @@ -100,8 +100,9 @@ def cuf_AllocateOp : cuf_Op<"allocate", [AttrSizedOperandSegments, Optional:$stream, Arg, "", [MemWrite]>:$pinned, Arg, "", [MemRead]>:$source, - cuf_DataAttributeAttr:$data_attr, UnitAttr:$hasStat, - UnitAttr:$hasDoubleDescriptor, UnitAttr:$pointer); + OptionalAttr:$data_attr, UnitAttr:$hasStat, + UnitAttr:$hasDoubleDescriptor, UnitAttr:$pointer, + UnitAttr:$device_source); let results = (outs AnyIntegerType:$stat); diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h index 6c97afa9e10e8..97f24bc34bfb8 100644 --- a/flang/include/flang/Runtime/CUDA/allocatable.h +++ b/flang/include/flang/Runtime/CUDA/allocatable.h @@ -34,14 +34,16 @@ int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t *stream = nullptr, int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc, const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false, const Descriptor *errMsg = nullptr, - const char *sourceFile = nullptr, int sourceLine = 0); + const char *sourceFile = nullptr, int sourceLine = 0, + bool sourceIsDevice = false); /// Perform allocation of the descriptor with synchronization of it when /// necessary. Assign data from source. int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc, const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false, const Descriptor *errMsg = nullptr, - const char *sourceFile = nullptr, int sourceLine = 0); + const char *sourceFile = nullptr, int sourceLine = 0, + bool sourceIsDevice = false); /// Perform deallocation of the descriptor with synchronization of it when /// necessary. 
diff --git a/flang/include/flang/Runtime/CUDA/pointer.h b/flang/include/flang/Runtime/CUDA/pointer.h
index bdfc3268e0814..b845fd59114d4 100644
--- a/flang/include/flang/Runtime/CUDA/pointer.h
+++ b/flang/include/flang/Runtime/CUDA/pointer.h
@@ -34,14 +34,16 @@ int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t *stream = nullptr,
 int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
     const Descriptor &source, int64_t *stream = nullptr,
     bool *pinned = nullptr, bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    const char *sourceFile = nullptr, int sourceLine = 0,
+    bool sourceIsDevice = false);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary. Assign data from source.
 int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
     const Descriptor &source, int64_t *stream = nullptr,
     bool *pinned = nullptr, bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    const char *sourceFile = nullptr, int sourceLine = 0,
+    bool sourceIsDevice = false);
 
 } // extern "C"
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 5fc47107f0e88..bdf651f49f76e 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -682,9 +682,10 @@ class AllocateStmtHelper {
     bool isAMDMemoryAllocatorEnabled = langFeatures.IsEnabled(
         Fortran::common::LanguageFeature::AmdMemoryAllocator);
 
+    bool sourceIsDevice = false;
     if (const Fortran::semantics::Symbol *sym{GetLastSymbol(sourceExpr)})
       if (Fortran::semantics::IsCUDADevice(*sym))
-        TODO(loc, "CUDA Fortran: allocate with device source");
+        sourceIsDevice = true;
 
     // Generate a sequence of runtime calls.
     errorManager.genStatCheck(builder, loc);
@@ -704,7 +705,7 @@ class AllocateStmtHelper {
     genSetDeferredLengthParameters(alloc, box);
     genAllocateObjectBounds(alloc, box);
     mlir::Value stat;
-    if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) {
+    if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol()) || sourceIsDevice) {
       stat =
           genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
     } else {
@@ -853,13 +854,19 @@ class AllocateStmtHelper {
     // Keep return type the same as a standard AllocatableAllocate call.
     mlir::Type retTy = fir::runtime::getModel<int>()(builder.getContext());
+
+    bool isSourceDevice = false;
+    if (const Fortran::semantics::Symbol *sym{GetLastSymbol(sourceExpr)})
+      if (Fortran::semantics::IsCUDADevice(*sym))
+        isSourceDevice = true;
+
     bool doubleDescriptors = Fortran::lower::hasDoubleDescriptor(box.getAddr());
     return cuf::AllocateOp::create(
                builder, loc, retTy, box.getAddr(), errmsg, stream, pinned,
                source, cudaAttr,
                errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr,
                doubleDescriptors ? builder.getUnitAttr() : nullptr,
-               box.isPointer() ? builder.getUnitAttr() : nullptr)
+               box.isPointer() ? builder.getUnitAttr() : nullptr,
+               isSourceDevice ? builder.getUnitAttr() : nullptr)
         .getResult();
   }
diff --git a/flang/lib/Optimizer/Analysis/TBAAForest.cpp b/flang/lib/Optimizer/Analysis/TBAAForest.cpp
index 44a0348da3a6f..7154785c62c75 100644
--- a/flang/lib/Optimizer/Analysis/TBAAForest.cpp
+++ b/flang/lib/Optimizer/Analysis/TBAAForest.cpp
@@ -66,12 +66,9 @@ fir::TBAATree::TBAATree(mlir::LLVM::TBAATypeDescriptorAttr anyAccess,
                         mlir::LLVM::TBAATypeDescriptorAttr dataRoot,
                         mlir::LLVM::TBAATypeDescriptorAttr boxMemberTypeDesc)
     : targetDataTree(dataRoot.getContext(), "target data", dataRoot),
-      globalDataTree(dataRoot.getContext(), "global data",
-                     targetDataTree.getRoot()),
-      allocatedDataTree(dataRoot.getContext(), "allocated data",
-                        targetDataTree.getRoot()),
+      globalDataTree(dataRoot.getContext(), "global data", dataRoot),
+      allocatedDataTree(dataRoot.getContext(), "allocated data", dataRoot),
       dummyArgDataTree(dataRoot.getContext(), "dummy arg data", dataRoot),
-      directDataTree(dataRoot.getContext(), "direct data",
-                     targetDataTree.getRoot()),
+      directDataTree(dataRoot.getContext(), "direct data", dataRoot),
       anyAccessDesc(anyAccess), boxMemberTypeDesc(boxMemberTypeDesc),
       anyDataTypeDesc(dataRoot) {}
diff --git a/flang/lib/Optimizer/Transforms/AddAliasTags.cpp b/flang/lib/Optimizer/Transforms/AddAliasTags.cpp
index 0221c7a8184d7..558ffa1a80bcf 100644
--- a/flang/lib/Optimizer/Transforms/AddAliasTags.cpp
+++ b/flang/lib/Optimizer/Transforms/AddAliasTags.cpp
@@ -60,6 +60,9 @@ static llvm::cl::opt<unsigned> localAllocsThreshold(
     llvm::cl::desc("If present, stops generating TBAA tags for accesses of "
                    "local allocations after N accesses in a module"));
 
+// Defined in AliasAnalysis.cpp
+extern llvm::cl::opt<bool> supportCrayPointers;
+
 namespace {
 
 // Return the size and alignment (in bytes) for the given type.
@@ -668,6 +671,7 @@ void AddAliasTagsPass::runOnAliasInterface(fir::FirAliasTagOpInterface op,
   LLVM_DEBUG(llvm::dbgs() << "Analysing " << op << "\n");
 
   const fir::AliasAnalysis::Source &source = state.getSource(memref);
+  LLVM_DEBUG(llvm::dbgs() << "Got source " << source << "\n");
 
   // Process the scopes, if not processed yet.
   state.processFunctionScopes(func);
@@ -686,14 +690,22 @@
   }
 
   mlir::LLVM::TBAATagAttr tag;
-  // TBAA for dummy arguments
-  if (enableDummyArgs &&
-      source.kind == fir::AliasAnalysis::SourceKind::Argument) {
+  // Cray pointer/pointee is a special case. These might alias with any data.
+  if (supportCrayPointers && source.isCrayPointerOrPointee()) {
+    LLVM_DEBUG(llvm::dbgs().indent(2)
+               << "Found reference to Cray pointer/pointee at " << *op << "\n");
+    mlir::LLVM::TBAATypeDescriptorAttr anyDataDesc =
+        state.getFuncTreeWithScope(func, scopeOp).anyDataTypeDesc;
+    tag = mlir::LLVM::TBAATagAttr::get(anyDataDesc, anyDataDesc, /*offset=*/0);
+    // TBAA for dummy arguments
+  } else if (enableDummyArgs &&
+             source.kind == fir::AliasAnalysis::SourceKind::Argument) {
     LLVM_DEBUG(llvm::dbgs().indent(2)
                << "Found reference to dummy argument at " << *op << "\n");
     std::string name =
         getFuncArgName(llvm::cast<mlir::Value>(source.origin.u));
-    // If it is a TARGET or POINTER, then we do not care about the name,
-    // because the tag points to the root of the subtree currently.
+    // POINTERS can alias with any POINTER or TARGET. Assume that TARGET dummy
+    // arguments might alias with each other (because of the "TARGET" hole for
+    // dummy arguments). See flang/docs/Aliasing.md.
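+    // In both cases the tag below is the shared root tag of the
+    // "target data" subtree, so such dummy arguments are never
+    // disambiguated from one another.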
     if (source.isTargetOrPointer()) {
       tag = state.getFuncTreeWithScope(func, scopeOp).targetDataTree.getTag();
     } else if (!name.empty()) {
@@ -715,13 +727,10 @@
     LLVM_DEBUG(llvm::dbgs().indent(2)
                << "Found reference to global " << globalName.str() << " at "
                << *op << "\n");
-    if (source.isPointer()) {
-      tag = state.getFuncTreeWithScope(func, scopeOp).targetDataTree.getTag();
-    } else {
-      // In general, place the tags under the "global data" root.
-      fir::TBAATree::SubtreeState *subTree =
-          &state.getMutableFuncTreeWithScope(func, scopeOp).globalDataTree;
+    // Add a named tag inside the given subtree, disambiguating members of a
+    // common block.
+    auto addTagUsingStorageDesc = [&](fir::TBAATree::SubtreeState *subTree) {
       mlir::Operation *instantiationPoint = source.origin.instantiationPoint;
       auto storageIface =
          mlir::dyn_cast_or_null(
@@ -766,6 +775,19 @@
         LLVM_DEBUG(llvm::dbgs() << "Tagged under '" << globalName << "' root\n");
       }
+    };
+
+    if (source.isPointer()) {
+      // Pointers can alias with any pointer or target.
+      tag = state.getFuncTreeWithScope(func, scopeOp).targetDataTree.getTag();
+    } else if (source.isTarget()) {
+      // Targets could alias with any pointer but not with each other.
+      addTagUsingStorageDesc(
+          &state.getMutableFuncTreeWithScope(func, scopeOp).targetDataTree);
+    } else {
+      // In general, place the tags under the "global data" root.
+      addTagUsingStorageDesc(
+          &state.getMutableFuncTreeWithScope(func, scopeOp).globalDataTree);
     }
 
   // TBAA for global variables with descriptors
@@ -776,9 +798,17 @@
     const char *name = glbl.getRootReference().data();
     LLVM_DEBUG(llvm::dbgs().indent(2)
                << "Found reference to direct " << name << " at " << *op << "\n");
+    // A pointer can alias with any pointer or target, so it gets the root
+    // tag.
     if (source.isPointer())
       tag = state.getFuncTreeWithScope(func, scopeOp).targetDataTree.getTag();
+    // Targets could alias with any pointer but not with each other, so each
+    // gets its own node inside the target data tree.
+    else if (source.isTarget())
+      tag = state.getFuncTreeWithScope(func, scopeOp)
+                .targetDataTree.getTag(name);
     else
+      // Boxes that are not pointers or targets cannot alias with those that
+      // are. Put them under the "direct data" root.
tag = state.getFuncTreeWithScope(func, scopeOp) .directDataTree.getTag(name); } else { @@ -815,8 +845,13 @@ void AddAliasTagsPass::runOnAliasInterface(fir::FirAliasTagOpInterface op, << "\n"); } else if (source.isPointer() && state.attachLocalAllocTag()) { LLVM_DEBUG(llvm::dbgs().indent(2) - << "Found reference to allocation at " << *op << "\n"); + << "Found reference to POINTER allocation at " << *op << "\n"); tag = state.getFuncTreeWithScope(func, scopeOp).targetDataTree.getTag(); + } else if (source.isTarget() && state.attachLocalAllocTag()) { + LLVM_DEBUG(llvm::dbgs().indent(2) + << "Found reference to TARGET allocation at " << *op << "\n"); + tag = state.getFuncTreeWithScope(func, scopeOp) + .targetDataTree.getTag(*name); } else if (name && state.attachLocalAllocTag()) { LLVM_DEBUG(llvm::dbgs().indent(2) << "Found reference to allocation " << name << " at " << *op << "\n"); diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp index 6579c2362cd87..4444fc61239ea 100644 --- a/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp @@ -99,7 +99,6 @@ static mlir::LogicalResult convertOpToCall(OpTy op, mlir::Value hasStat = op.getHasStat() ? builder.createBool(loc, true) : builder.createBool(loc, false); - mlir::Value errmsg; if (op.getErrmsg()) { errmsg = op.getErrmsg(); @@ -116,12 +115,15 @@ static mlir::LogicalResult convertOpToCall(OpTy op, loc, fir::ReferenceType::get( mlir::IntegerType::get(op.getContext(), 1))); if (op.getSource()) { + mlir::Value isDeviceSource = op.getDeviceSource() + ? builder.createBool(loc, true) + : builder.createBool(loc, false); mlir::Value stream = op.getStream() ? op.getStream() : builder.createNullConstant(loc, fTy.getInput(2)); args = fir::runtime::createArguments( builder, loc, fTy, op.getBox(), op.getSource(), stream, pinned, - hasStat, errmsg, sourceFile, sourceLine); + hasStat, errmsg, sourceFile, sourceLine, isDeviceSource); } else { mlir::Value stream = op.getStream() ? 
op.getStream() diff --git a/flang/test/Driver/tco-test-gen.fir b/flang/test/Driver/tco-test-gen.fir index b39295d72918f..438804ce42b76 100644 --- a/flang/test/Driver/tco-test-gen.fir +++ b/flang/test/Driver/tco-test-gen.fir @@ -77,13 +77,13 @@ func.func @_QPtest(%arg0: !fir.ref {fir.bindc_name = "num"}, %arg1: !fir.re // CHECK: llvm.cond_br %[[VAL_17]], ^bb2, ^bb3 // CHECK: ^bb2: -// AA: llvm.store %[[VAL_15]], %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : i32, !llvm.ptr +// AA: llvm.store %[[VAL_15]], %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : i32, !llvm.ptr // NOAA: llvm.store %[[VAL_15]], %{{.*}} : i32, !llvm.ptr // AA: %[[VAL_18:.*]] = llvm.load %[[ARG0]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 // NOAA: %[[VAL_18:.*]] = llvm.load %[[ARG0]] : !llvm.ptr -> i32 -// AA: %[[VAL_19:.*]] = llvm.load %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 +// AA: %[[VAL_19:.*]] = llvm.load %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 // NOAA: %[[VAL_19:.*]] = llvm.load %{{.*}} : !llvm.ptr -> i32 // CHECK: %[[VAL_20:.*]] = llvm.add %[[VAL_18]], %[[VAL_19]] : i32 @@ -92,7 +92,7 @@ func.func @_QPtest(%arg0: !fir.ref {fir.bindc_name = "num"}, %arg1: !fir.re // CHECK: %[[VAL_21:.*]] = llvm.trunc %[[VAL_10]] : i64 to i32 -// AA: %[[VAL_22:.*]] = llvm.load %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 +// AA: %[[VAL_22:.*]] = llvm.load %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 // NOAA: %[[VAL_22:.*]] = llvm.load %{{.*}} : !llvm.ptr -> i32 // CHECK: %[[VAL_23:.*]] = llvm.add %[[VAL_22]], %[[VAL_21]] overflow : i32 @@ -100,7 +100,7 @@ func.func @_QPtest(%arg0: !fir.ref {fir.bindc_name = "num"}, %arg1: !fir.re // CHECK: llvm.br ^bb1(%[[VAL_23]], %[[VAL_24]] : i32, i64) // CHECK: ^bb3: -// AA: llvm.store %[[VAL_15]], %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : i32, !llvm.ptr +// AA: llvm.store %[[VAL_15]], %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : i32, !llvm.ptr // NOAA: llvm.store %[[VAL_15]], %{{.*}} : i32, !llvm.ptr // CHECK: llvm.return diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir index 9d0d181609ada..5184561a03e67 100644 --- a/flang/test/Fir/CUDA/cuda-allocate.fir +++ b/flang/test/Fir/CUDA/cuda-allocate.fir @@ -128,11 +128,14 @@ func.func @_QPallocate_source() { %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFallocate_sourceEa"} + %devsource = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFallocate_sourceEa"} %4 = fir.declare %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFallocate_sourceEa"} : (!fir.ref>>>) -> !fir.ref>>> %5 = cuf.alloc !fir.box>> {bindc_name = "a_d", data_attr = #cuf.cuda, uniq_name = "_QFallocate_sourceEa_d"} -> !fir.ref>>> %7 = fir.declare %5 {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, 
uniq_name = "_QFallocate_sourceEa_d"} : (!fir.ref>>>) -> !fir.ref>>> %8 = fir.load %4 : !fir.ref>>> %22 = cuf.allocate %7 : !fir.ref>>> source(%8 : !fir.box>>) {data_attr = #cuf.cuda} -> i32 + %9 = fir.load %devsource : !fir.ref>>> + %23 = cuf.allocate %7 : !fir.ref>>> source(%9 : !fir.box>>) {device_source} -> i32 return } @@ -142,8 +145,8 @@ func.func @_QPallocate_source() { // CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref>>> // CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box>>) -> !fir.box -// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 - +// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, !fir.ref, i1, !fir.box, !fir.ref, i32, i1) -> i32 +// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %true{{.*}}) fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda} : !fir.box>> { %c0 = arith.constant 0 : index diff --git a/flang/test/Fir/tbaa-codegen2.fir b/flang/test/Fir/tbaa-codegen2.fir index 4907aa03ec5a5..071d3ec89394c 100644 --- a/flang/test/Fir/tbaa-codegen2.fir +++ b/flang/test/Fir/tbaa-codegen2.fir @@ -114,4 +114,3 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.targ // CHECK: ![[TMP_DATA_ACCESS_TAG]] = !{![[TMP_DATA_ACCESS_TYPE:.*]], ![[TMP_DATA_ACCESS_TYPE]], i64 0} // CHECK: ![[TMP_DATA_ACCESS_TYPE]] = !{!"allocated data/", ![[TMP_ACCESS_TYPE:.*]], i64 0} // CHECK: ![[TMP_ACCESS_TYPE]] = !{!"allocated data", ![[TARGET_ACCESS_TAG:.*]], i64 0} -// CHECK: ![[TARGET_ACCESS_TAG]] = !{!"target data", ![[DATA_ACCESS_TYPE]], i64 0} diff --git a/flang/test/Lower/CUDA/TODO/cuda-allocate-source-device.cuf b/flang/test/Lower/CUDA/TODO/cuda-allocate-source-device.cuf deleted file mode 100644 index 3e59e2f01119e..0000000000000 --- a/flang/test/Lower/CUDA/TODO/cuda-allocate-source-device.cuf +++ /dev/null @@ -1,9 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fcuda -o - %s 2>&1 | FileCheck %s - -program main - implicit none - integer, device, allocatable :: a_d(:) - integer, allocatable :: a(:) -! CHECK: not yet implemented: CUDA Fortran: allocate with device source - allocate(a, source=a_d) -end program diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf index 43e716532ecca..52303d126b8dc 100644 --- a/flang/test/Lower/CUDA/cuda-allocatable.cuf +++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf @@ -261,3 +261,12 @@ end subroutine ! CHECK: cuf.deallocate %{{.*}} : !fir.ref>>> {data_attr = #cuf.cuda, hasDoubleDescriptor} -> i32 ! CHECK: cuf.deallocate %{{.*}} : !fir.ref>>> {data_attr = #cuf.cuda, hasDoubleDescriptor} -> i32 ! CHECK: cuf.deallocate %{{.*}} : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 + +attributes(global) subroutine from_device_source() + real, device, allocatable :: a(:) + real, allocatable :: b(:) + allocate(b, source=a) +end subroutine + +! CHECK-LABEL: func.func @_QPfrom_device_source() +! 
CHECK: cuf.allocate{{.*}}device_source diff --git a/flang/test/Transforms/tbaa-cray-pointer.fir b/flang/test/Transforms/tbaa-cray-pointer.fir new file mode 100644 index 0000000000000..54406271aaa58 --- /dev/null +++ b/flang/test/Transforms/tbaa-cray-pointer.fir @@ -0,0 +1,43 @@ +// RUN: fir-opt -funsafe-cray-pointers --fir-add-alias-tags %s | FileCheck %s + +// Fortran source: +// subroutine test() +// real :: a, b +// pointer(p, a) +// p = loc(b) +// b = 2 +// end subroutine + +// CHECK: #[[TBAA_ROOT:.*]] = #llvm.tbaa_root +// CHECK-NEXT: #[[ANY_ACCESS:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[ANY_DATA:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[ANY_DATA_TAG:.*]] = #llvm.tbaa_tag +// CHECK-NEXT: #[[ALLOCATED_DATA:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[B:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[B_TAG:.*]] = #llvm.tbaa_tag + +module attributes {dlti.dl_spec = #dlti.dl_spec = dense<32> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, i8 = dense<[8, 32]> : vector<2xi64>, i16 = dense<[16, 32]> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.mangling_mode" = "e", "dlti.legal_int_widths" = array, "dlti.stack_alignment" = 128 : i64, "dlti.function_pointer_alignment" = #dlti.function_pointer_alignment<32, function_dependent = true>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"} { +// CHECK-LABEL: func.func @_QPtest() + func.func @_QPtest() { + %cst = arith.constant 2.000000e+00 : f32 + %0 = fir.alloca !fir.box> + %1 = fir.dummy_scope : !fir.dscope + %2 = fir.alloca i64 {bindc_name = "p", uniq_name = "_QFtestEp"} + %3 = fir.declare %2 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEp"} : (!fir.ref) -> !fir.ref + %4 = fir.alloca f32 {bindc_name = "b", uniq_name = "_QFtestEb"} + %5 = fir.declare %4 {uniq_name = "_QFtestEb"} : (!fir.ref) -> !fir.ref + %6 = fir.declare %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEa"} : (!fir.ref>>) -> !fir.ref>> + %7 = fir.zero_bits !fir.ptr + %8 = fir.embox %7 : (!fir.ptr) -> !fir.box> + fir.store %8 to %6 : !fir.ref>> +// Descriptor tagged in codegen +// CHECK: fir.store %{{.*}} to %{{.*}} : !fir.ref> + %9 = fir.convert %5 : (!fir.ref) -> i64 + fir.store %9 to %3 : !fir.ref +// CHECK: fir.store {{.*}} to {{.*}} {tbaa = [#[[ANY_DATA_TAG]]]} : !fir.ref + fir.store %cst to %5 : !fir.ref +// CHECK: fir.store {{.*}} to {{.*}} {tbaa = [#[[B_TAG]]]} : !fir.ref + return + } +} + diff --git a/flang/test/Transforms/tbaa-for-common-vars.fir b/flang/test/Transforms/tbaa-for-common-vars.fir index a8dd86bff72ed..992658ee2387f 100644 --- a/flang/test/Transforms/tbaa-for-common-vars.fir +++ b/flang/test/Transforms/tbaa-for-common-vars.fir @@ -28,8 +28,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_6:.+]] = 
#llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> @@ -66,8 +65,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_11:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_12:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_13:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_14:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_14:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_15:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_16:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_18:.+]] = #llvm.tbaa_tag @@ -118,14 +116,13 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[ANYACC3INNER:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANYDATA3:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANYDATA3INNER:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TARGETDATA3:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[GLOBALDATA3:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DUMMYARG3INNER:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[GLOBALDATA3:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[GLOBALDATA3COMMON3:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DUMMYD:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DUMMYC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DUMMYDTAG:.+]] = #llvm.tbaa_tag // CHECK: #[[DUMMYCTAG:.+]] = #llvm.tbaa_tag -// CHECK: #[[GLOBALDATA3COMMON3:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOBALB:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOBALA:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOBALBTAG:.+]] = #llvm.tbaa_tag @@ -180,10 +177,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[INNER4ANYACC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST4ANYDATA:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[INNER4ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TEST4TARGET:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[INNER4TARGET:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TEST4GLOBAL:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[INNER4GLOBAL:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[TEST4GLOBAL:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[INNER4GLOBAL:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST4COMMON:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[INNER4COMMON:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST4B:.+]] = #llvm.tbaa_type_desc}> @@ -229,8 +224,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[TEST5ROOT:.+]] = #llvm.tbaa_root // CHECK: #[[TEST5ANYACC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST5ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TEST5TARGET:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TEST5GLOBAL:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[TEST5GLOBAL:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST5COMMON5:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST5COMMON5TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[TEST5A:.+]] = #llvm.tbaa_type_desc}> @@ -288,8 +282,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_tag @@ -354,8 +347,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_74:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_75:.+]] = 
#llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_76:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_78:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_77:.+]] = #llvm.tbaa_tag -// CHECK: #[[$ATTR_78:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_79:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_80:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_81:.+]] = #llvm.tbaa_tag @@ -425,12 +418,61 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_82:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_83:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_84:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_87:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_85:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_86:.+]] = #llvm.tbaa_tag -// CHECK: #[[$ATTR_87:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_88:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ATTR_86:.+]] = #llvm.tbaa_tag // CHECK-LABEL: func.func @_QPtest8() { // CHECK: fir.load %{{[0-9]+}} : !fir.ref>> // CHECK: fir.load %{{[0-9]+}} {tbaa = [#[[$ATTR_86]]]} : !fir.ptr // CHECK: fir.load %{{[0-9]+}} : !fir.ref // CHECK: fir.store %{{[0-9]+}} to %{{[0-9]+}} : !fir.ref + +// ----- + +// Fortran source: +// subroutine target_comon_tbaa() +// real :: a +// real, target :: b, c +// common /common1/ a,b,c +// a = b +// end subroutine +// +// Test generation of tbaa tags where some members of a common block are TARGET +module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i64 = dense<[32, 64]> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little">, llvm.data_layout = ""} { fir.global common @block_(dense<0> : vector<44xi8>) {alignment = 4 : i64} : !fir.array<44xi8> + fir.global common @common1_(dense<0> : vector<12xi8>) {alignment = 4 : i64} : !fir.array<12xi8> + func.func @_QPtarget_common_tbaa() { + %c8 = arith.constant 8 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.address_of(@common1_) : !fir.ref> + %2 = fir.coordinate_of %1, %c0 : (!fir.ref>, index) -> !fir.ref + %3 = fir.convert %2 : (!fir.ref) -> !fir.ref + %4 = fir.declare %3 storage(%1[0]) {uniq_name = "_QFtarget_comon_tbaaEa"} : (!fir.ref, !fir.ref>) -> !fir.ref + %5 = fir.coordinate_of %1, %c4 : (!fir.ref>, index) -> !fir.ref + %6 = fir.convert %5 : (!fir.ref) -> !fir.ref + %7 = fir.declare %6 storage(%1[4]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtarget_comon_tbaaEb"} : (!fir.ref, !fir.ref>) -> !fir.ref + %8 = fir.coordinate_of %1, %c8 : (!fir.ref>, index) -> !fir.ref + %9 = fir.convert %8 : (!fir.ref) -> !fir.ref + %10 = fir.declare %9 storage(%1[8]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtarget_comon_tbaaEc"} : (!fir.ref, !fir.ref>) -> !fir.ref + %11 = fir.load %7 : !fir.ref + fir.store %11 to %4 : !fir.ref + return + } +} +// CHECK: #[[TBAA_FUNC_ROOT:.*]] = #llvm.tbaa_root +// CHECK-NEXT: #[[ANY_ACCESS:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[ANY_DATA:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[TARGET_DATA:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[GLOBAL_DATA:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[TARGET_COMMON:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[GLOBAL_COMMON:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[B:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[A:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: 
#[[B_TAG:.*]] = #llvm.tbaa_tag +// CHECK-NEXT: #[[A_TAG:.*]] = #llvm.tbaa_tag + +// CHECK-LABEL: func.func @_QPtarget_common_tbaa() +// CHECK: %[[LOAD:.*]] = fir.load %{{.*}} {tbaa = [#[[B_TAG]]]} +// CHECK: fir.store %[[LOAD]] to %{{.*}} {tbaa = [#[[A_TAG]]]} diff --git a/flang/test/Transforms/tbaa-for-global-equiv-vars.fir b/flang/test/Transforms/tbaa-for-global-equiv-vars.fir index dbefa3f8e3f5f..0d082c7504024 100644 --- a/flang/test/Transforms/tbaa-for-global-equiv-vars.fir +++ b/flang/test/Transforms/tbaa-for-global-equiv-vars.fir @@ -30,8 +30,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[ROOT1:.+]] = #llvm.tbaa_root // CHECK: #[[ANYACC1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANYDATA1:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TARGETDATA1:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[GLOBALDATA1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[GLOBALDATA1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOB1COMMON:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOB1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TAG:.+]] = #llvm.tbaa_tag @@ -74,8 +73,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[ROOT2:.+]] = #llvm.tbaa_root // CHECK: #[[ANYACC2:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANYDATA2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TARGETDATA2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[GLOBALDATA2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[GLOBALDATA2:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOB1COMMON:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOB1GLOB2:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOB3:.+]] = #llvm.tbaa_type_desc}> diff --git a/flang/test/Transforms/tbaa-for-local-vars.fir b/flang/test/Transforms/tbaa-for-local-vars.fir index 4eb6b2ecf31c4..fde5c400c75ed 100644 --- a/flang/test/Transforms/tbaa-for-local-vars.fir +++ b/flang/test/Transforms/tbaa-for-local-vars.fir @@ -35,18 +35,22 @@ // scope's TBAA tree. 
// RUN: fir-opt --fir-add-alias-tags %s | FileCheck %s -// CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root -// CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_root -// CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_9:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_12:.+]] = #llvm.tbaa_tag -// CHECK: #[[$ATTR_13:.+]] = #llvm.tbaa_tag +// CHECK: #[[$SCOPE_2:.+]] = #llvm.tbaa_root +// CHECK: #[[$SCOPE_1:.+]] = #llvm.tbaa_root +// CHECK: #[[$ANY_ACCESS2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ANY_ACCESS1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ANY_DATA2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ANY_DATA1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$DUMMY_ARG2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ALLOCATED_DATA1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$DUMMY_ARG1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ALLOCATED_DATA1_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[$BAR_THIS2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$TEST_VAR1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$TEST_ARG1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$BAR_THIS2_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[$TEST_VAR1_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[$TEST_ARG2_TAG:.+]] = #llvm.tbaa_tag // CHECK-LABEL: func.func @_QMmPtest( // CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "arg"}) { @@ -61,10 +65,10 @@ // CHECK: %[[VAL_10:.*]] = fir.dummy_scope : !fir.dscope // CHECK: %[[VAL_11:.*]] = fir.declare %[[VAL_9]] dummy_scope %[[VAL_10]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QMmFbarEthis"} : (!fir.class>, !fir.dscope) -> !fir.class> // CHECK: %[[VAL_12:.*]] = fir.coordinate_of %[[VAL_11]], x : (!fir.class>) -> !fir.ref -// CHECK: fir.store %[[VAL_0]] to %[[VAL_12]] {tbaa = [#[[$ATTR_12]]]} : !fir.ref +// CHECK: fir.store %[[VAL_0]] to %[[VAL_12]] {tbaa = [#[[$BAR_THIS2_TAG]]]} : !fir.ref // CHECK: %[[VAL_13:.*]] = fir.declare %[[VAL_1]] {uniq_name = ".tmp.func_result"} : (!fir.ref>) -> !fir.ref> // CHECK: %[[VAL_14:.*]] = fir.coordinate_of %[[VAL_13]], x : (!fir.ref>) -> !fir.ref -// CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_14]] {tbaa = [#[[$ATTR_13]]]} : !fir.ref +// CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_14]] {tbaa = [#[[$ALLOCATED_DATA1_TAG]]]} : !fir.ref module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i64 = dense<[32, 64]> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little">, llvm.data_layout = ""} { func.func @_QMmPtest(%arg0: !fir.ref {fir.bindc_name = "arg"}) { %cst = arith.constant 1.000000e+00 : f32 diff --git a/flang/test/Transforms/tbaa-with-dummy-scope.fir b/flang/test/Transforms/tbaa-with-dummy-scope.fir index 4ae2b8efe2581..d7f33776150ae 100644 --- a/flang/test/Transforms/tbaa-with-dummy-scope.fir +++ b/flang/test/Transforms/tbaa-with-dummy-scope.fir @@ -24,7 +24,7 @@ // CHECK: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TARGETTAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[TARGETDATA_TAG:.+]] = #llvm.tbaa_tag // CHECK: 
#[[$ATTR_8:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_9:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_type_desc}> @@ -34,8 +34,8 @@ // CHECK: #[[$ATTR_14:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_15:.+]] = #llvm.tbaa_tag // CHECK: func.func @test1( -// CHECK: %[[VAL_5:.*]] = fir.load %{{.*}} {tbaa = [#[[TARGETTAG]]]} : !fir.ref -// CHECK: fir.store %{{.*}} {tbaa = [#[[TARGETTAG]]]} : !fir.ref +// CHECK: %[[VAL_5:.*]] = fir.load %{{.*}} {tbaa = [#[[TARGETDATA_TAG]]]} : !fir.ref +// CHECK: fir.store %{{.*}} {tbaa = [#[[TARGETDATA_TAG]]]} : !fir.ref // CHECK: %[[VAL_6:.*]] = fir.dummy_scope : !fir.dscope // CHECK: %[[VAL_9:.*]] = fir.load %{{.*}} {tbaa = [#[[$ATTR_12]]]} : !fir.ref // CHECK: fir.store %{{.*}} {tbaa = [#[[$ATTR_13]]]} : !fir.ref @@ -83,23 +83,21 @@ func.func @test1(%arg0: !fir.ref {fir.bindc_name = "x", fir.target}, %arg1: // CHECK: #[[$ATTR_33:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_34:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_35:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_36:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_37:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[CALLERTARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[CALLEETARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_40:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_38:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_39:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_45:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_50:.+]] = #llvm.tbaa_tag +// CHECK: #[[$CALLERANYDATA:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$CALLEEANYDATA:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_38:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_39:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_40:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_41:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_42:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_43:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_44:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_45:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_46:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_47:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_48:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_49:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ATTR_50:.+]] = #llvm.tbaa_tag // CHECK: func.func @_QMtestPcaller( // CHECK-SAME: %[[VAL_0:.*]]: !fir.ref {fir.bindc_name = "z"}) { // CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope diff --git a/flang/test/Transforms/tbaa-with-dummy-scope2.fir b/flang/test/Transforms/tbaa-with-dummy-scope2.fir index 54902ca7d41e1..6f5ed69fbc9c6 100644 --- a/flang/test/Transforms/tbaa-with-dummy-scope2.fir +++ b/flang/test/Transforms/tbaa-with-dummy-scope2.fir @@ -44,16 +44,15 @@ func.func @_QPtest1() attributes {noinline} { } // CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$TARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$LOCAL_ATTR_0:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ANYDATA:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$LOCAL_ATTR_0:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$LOCAL_ATTR_1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> // 
CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$LOCAL_ATTR_2:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_8:.+]] = #llvm.tbaa_tag // CHECK-LABEL: func.func @_QPtest1() attributes {noinline} { // CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFtest1FinnerEy"} @@ -90,19 +89,18 @@ func.func @_QPtest2() attributes {noinline} { } // CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_root -// CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$TARGETDATA_0:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$LOCAL_ATTR_0:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_8:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ANY_ACCESS_0:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ANY_ACCESS_1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ANY_DATA_0:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ANY_DATA_1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$LOCAL_ATTR_0:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$LOCAL_ATTR_1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_8:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_9:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$LOCAL_ATTR_2:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_11:.+]] = #llvm.tbaa_tag // CHECK-LABEL: func.func @_QPtest2() attributes {noinline} { // CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFtest2FinnerEy"} diff --git a/flang/test/Transforms/tbaa2.fir b/flang/test/Transforms/tbaa2.fir index a594e6b32fdac..9b5307ba69d17 100644 --- a/flang/test/Transforms/tbaa2.fir +++ b/flang/test/Transforms/tbaa2.fir @@ -48,18 +48,10 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[ROOT:.+]] = #llvm.tbaa_root // CHECK: #[[ANY_ACCESS:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANY_DATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[ANY_GLBL:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[ANY_LOCAL:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANY_ARG:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ANY_GLBL:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ANY_LOCAL:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ARG_LOW:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ANY_DIRECT:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ARG_Z:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ARG_Y:.+]] = #llvm.tbaa_type_desc}> - -// CHECK: #[[ARG_LOW_TAG:.+]] = #llvm.tbaa_tag -// CHECK: #[[ARG_Z_TAG:.+]] = #llvm.tbaa_tag -// CHECK: #[[ARG_Y_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[ANY_DIRECT:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLBL_ZSTART:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLBL_ZSTOP:.+]] = #llvm.tbaa_type_desc}> @@ -69,10 +61,13 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[LOCAL2_ALLOC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLBL_XSTART:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[LOCAL3_ALLOC:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[ARG_LOW:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[LOCAL4_ALLOC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DIRECT_A:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DIRECT_B:.+]] = 
#llvm.tbaa_type_desc}> +// CHECK: #[[ARG_Z:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLBL_DYINV:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[ARG_Y:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[LOCAL5_ALLOC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLBL_ZSTART_TAG:.+]] = #llvm.tbaa_tag @@ -83,10 +78,13 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[LOCAL2_ALLOC_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[GLBL_XSTART_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[LOCAL3_ALLOC_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[ARG_LOW_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[LOCAL4_ALLOC_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[DIRECT_A_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[DIRECT_B_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[ARG_Z_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[GLBL_DYINV_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[ARG_Y_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[LOCAL5_ALLOC_TAG:.+]] = #llvm.tbaa_tag func.func @_QMmodPcallee(%arg0: !fir.box> {fir.bindc_name = "z"}, %arg1: !fir.box> {fir.bindc_name = "y"}, %arg2: !fir.ref>>> {fir.bindc_name = "low"}) { diff --git a/flang/test/Transforms/tbaa3.fir b/flang/test/Transforms/tbaa3.fir index abcb7e000bac1..79f79cb6ca26b 100644 --- a/flang/test/Transforms/tbaa3.fir +++ b/flang/test/Transforms/tbaa3.fir @@ -1,5 +1,4 @@ -// RUN: fir-opt --fir-add-alias-tags %s | FileCheck --check-prefixes=ALL,DEFAULT %s -// RUN: fir-opt --fir-add-alias-tags --local-alloc-tbaa %s | FileCheck --check-prefixes=ALL,LOCAL %s +// RUN: fir-opt --fir-add-alias-tags %s | FileCheck --check-prefixes=ALL %s // Test AddAliasTagsPass creating sub-tree for TARGET/POINTER variables. @@ -56,56 +55,57 @@ // | |- "dummy arg data/_QFtest1Edummyas" // | |- "dummy arg data/_QFtest1Edummya" // | -// |- "target data" <- all pointers and taget dummys -// | -// |- "global data" -// | | -// | |- "global data/_QMdataEglob" -// | |- "global data/_QMdataEglobt" -// | -// |- "direct data" -// | | -// | |- "direct data/_QMdataEgloba" -// | |- "direct data/_QMdataEglobat" +// |- "target data" <--- all pointers and target dummy arguments go here +// | |- "target data/_QMdataEglobt" +// | |- "target data/_QMdataEglobat" +// | |- "target data/_QFtest1Elocalt" +// | |- "target data/_QFtest1Elocalat" +// | +// |- "global data" +// | | +// | |- "global data/_QMdataEglob" +// | +// |- "direct data" +// | | +// | |- "direct data/_QMdataEgloba" +// | +// |- "allocated data" // | -// |- "allocated data" -// | -// |- "allocated data/_QFtest1Elocal" -// |- "allocated data/_QFtest1Elocalt" -// |- "allocated data/_QFtest1Elocala" -// |- "allocated data/_QFtest1Elocalat" +// |- "allocated data/_QFtest1Elocal" +// |- "allocated data/_QFtest1Elocala" // ALL: #[[FUNCROOT:.+]] = #llvm.tbaa_root // ALL: #[[ANYACCESS:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[DIRECTDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[DUMMYDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[LOCALDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[TARGETTAG:.+]] = #llvm.tbaa_tag -// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[DIRECTDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBTVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBAVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBATVAR:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[DUMMYFVAR:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[DUMMYASVAR:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[DUMMYAVAR:.+]] = 
#llvm.tbaa_type_desc}> -// LOCAL: #[[LOCALDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[DUMMYFTAG:.+]] = #llvm.tbaa_tag -// ALL: #[[DUMMYASTAG:.+]] = #llvm.tbaa_tag -// ALL: #[[DUMMYATAG:.+]] = #llvm.tbaa_tag -// ALL: #[[GLOBVAR:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBTVAR:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBAVAR:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBATVAR:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[LOCALVAR:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[LOCALTVAR:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[LOCALAVAR:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[LOCALATVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[LOCALVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[LOCALTVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[LOCALAVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[LOCALATVAR:.+]] = #llvm.tbaa_type_desc}> + // ALL: #[[GLOBTAG:.+]] = #llvm.tbaa_tag // ALL: #[[GLOBTTAG:.+]] = #llvm.tbaa_tag // ALL: #[[GLOBATAG:.+]] = #llvm.tbaa_tag // ALL: #[[GLOBATTAG:.+]] = #llvm.tbaa_tag -// LOCAL: #[[LOCALTAG:.+]] = #llvm.tbaa_tag -// LOCAL: #[[LOCALTTAG:.+]] = #llvm.tbaa_tag -// LOCAL: #[[LOCALATAG:.+]] = #llvm.tbaa_tag -// LOCAL: #[[LOCALATTAG:.+]] = #llvm.tbaa_tag +// ALL: #[[DUMMYFTAG:.+]] = #llvm.tbaa_tag +// ALL: #[[DUMMYASTAG:.+]] = #llvm.tbaa_tag +// ALL: #[[DUMMYATAG:.+]] = #llvm.tbaa_tag +// ALL: #[[LOCALTAG:.+]] = #llvm.tbaa_tag +// ALL: #[[LOCALTTAG:.+]] = #llvm.tbaa_tag +// ALL: #[[LOCALATAG:.+]] = #llvm.tbaa_tag +// ALL: #[[LOCALATTAG:.+]] = #llvm.tbaa_tag module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i64 = dense<[32, 64]> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little">, llvm.data_layout = ""} { fir.global @_QMdataEglob : !fir.array<10xf32> { @@ -263,13 +263,11 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 fir.store %cst to %67 : !fir.ref %68 = fir.array_coor %20(%5) %c1 : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref // real :: local(10) -// DEFAULT: fir.store{{.*}}tbaa -// LOCAL: fir.store{{.*}}{tbaa = [#[[LOCALTAG]]]} : !fir.ref +// ALL: fir.store{{.*}}{tbaa = [#[[LOCALTAG]]]} : !fir.ref fir.store %cst to %68 : !fir.ref %69 = fir.array_coor %33(%5) %c1 : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref // real, target :: localt(10) -// DEFAULT: fir.store{{.*}}tbaa -// LOCAL: fir.store{{.*}}{tbaa = [#[[LOCALTTAG]]]} : !fir.ref +// ALL: fir.store{{.*}}{tbaa = [#[[LOCALTTAG]]]} : !fir.ref fir.store %cst to %69 : !fir.ref // ALL-NOT: fir.load{{.*}}tbaa %70 = fir.load %25 : !fir.ref>>> @@ -278,8 +276,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 %73 = fir.shape_shift %72#0, %72#1 : (index, index) -> !fir.shapeshift<1> %74 = fir.array_coor %71(%73) %c1 : (!fir.heap>, !fir.shapeshift<1>, index) -> !fir.ref // real, allocatable :: locala(:) -// DEFAULT: fir.store{{.*}}tbaa -// LOCAL: fir.store{{.*}}{tbaa = [#[[LOCALATAG]]]} : !fir.ref +// ALL: fir.store{{.*}}{tbaa = [#[[LOCALATAG]]]} : !fir.ref fir.store %cst to %74 : !fir.ref // ALL-NOT: fir.load{{.*}}tbaa %75 = fir.load %27 : !fir.ref>>> @@ -288,8 +285,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 %78 = fir.shape_shift %77#0, %77#1 : (index, index) -> !fir.shapeshift<1> %79 = fir.array_coor %76(%78) %c1 : (!fir.heap>, !fir.shapeshift<1>, index) -> !fir.ref // real, allocatable, target :: localat(:) -// DEFAULT: fir.store{{.*}}tbaa 
-// LOCAL: fir.store{{.*}}{tbaa = [#[[LOCALATTAG]]]} : !fir.ref +// ALL: fir.store{{.*}}{tbaa = [#[[LOCALATTAG]]]} : !fir.ref fir.store %cst to %79 : !fir.ref // ALL-NOT: fir.load{{.*}}tbaa %80 = fir.load %31 : !fir.ref>>> @@ -297,8 +293,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 %82 = fir.shift %81#0 : (index) -> !fir.shift<1> %83 = fir.array_coor %80(%82) %c1 : (!fir.box>>, !fir.shift<1>, index) -> !fir.ref // real, pointer :: localp(:) -// DEFAULT: fir.store{{.*}}tbaa -// LOCAL: fir.store{{.*}}{tbaa = [#[[TARGETTAG]]]} : !fir.ref +// ALL: fir.store{{.*}}{tbaa = [#[[TARGETTAG]]]} : !fir.ref fir.store %cst to %83 : !fir.ref // ALL-NOT: fir.load{{.*}}tbaa %84 = fir.load %27 : !fir.ref>>> diff --git a/flang/test/Transforms/tbaa4.fir b/flang/test/Transforms/tbaa4.fir index c368a3d06c2ba..5e29014af8935 100644 --- a/flang/test/Transforms/tbaa4.fir +++ b/flang/test/Transforms/tbaa4.fir @@ -1,12 +1,10 @@ // Test TBAA tags for common and equivalence. -// RUN: fir-opt --fir-add-alias-tags --split-input-file %s | FileCheck --check-prefixes=ALL,DEFAULT %s -// RUN: fir-opt --fir-add-alias-tags --local-alloc-tbaa --split-input-file %s | FileCheck --check-prefixes=ALL,LOCAL %s +// RUN: fir-opt --fir-add-alias-tags --split-input-file %s | FileCheck --check-prefixes=ALL %s // ALL: #[[ROOT:.+]] = #llvm.tbaa_root // ALL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[BLK:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[BLK_A:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[BLK_C:.+]] = #llvm.tbaa_type_desc}> @@ -54,19 +52,17 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // ----- -// LOCAL: #[[ROOT:.+]] = #llvm.tbaa_root -// LOCAL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[ALLOCATEDDATA:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[EQUIV:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[TAG:.+]] = #llvm.tbaa_tag +// ALL: #[[ROOT:.+]] = #llvm.tbaa_root +// ALL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[ALLOCATEDDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[EQUIV:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[TAG:.+]] = #llvm.tbaa_tag // ALL-LABEL: func.func @_QPtest_local_equiv() { -// LOCAL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ptr -// LOCAL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ref -// LOCAL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ptr -// DEFAULT-NOT: fir.store{{.}}tbaa +// ALL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ptr +// ALL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ref +// ALL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ptr module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i64 = dense<[32, 64]> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little">, llvm.data_layout = ""} { func.func @_QPtest_local_equiv() { %c1 = arith.constant 1 : index @@ -98,8 +94,7 @@ func.func @_QPtest_local_equiv() { // ALL: #[[ROOT:.+]] = #llvm.tbaa_root // ALL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[TARGETDATA:.+]] = 
#llvm.tbaa_type_desc}> -// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[EQUIV:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[TAG:.+]] = #llvm.tbaa_tag module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i64 = dense<[32, 64]> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little">, llvm.data_layout = ""} { @@ -143,8 +138,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // ALL: #[[ROOT:.+]] = #llvm.tbaa_root // ALL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[EQUIV:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[TAG:.+]] = #llvm.tbaa_tag diff --git a/libclc/opencl/lib/clspv/shared/vstore_half.cl b/libclc/opencl/lib/clspv/shared/vstore_half.cl index 341ec3e251719..cfcbf55caeae7 100644 --- a/libclc/opencl/lib/clspv/shared/vstore_half.cl +++ b/libclc/opencl/lib/clspv/shared/vstore_half.cl @@ -8,7 +8,13 @@ #include #include +#include +#include +#include #include +#include +#include +#include #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable @@ -48,32 +54,32 @@ _CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x) { /* Handle nan corner case */ - if (isnan(x)) + if (__clc_isnan(x)) return x; /* RTZ does not produce Inf for large numbers */ - if (fabs(x) > 65504.0f && !isinf(x)) - return copysign(65504.0f, x); + if (__clc_fabs(x) > 65504.0f && !__clc_isinf(x)) + return __clc_copysign(65504.0f, x); const int exp = (__clc_as_uint(x) >> 23 & 0xff) - 127; /* Manage range rounded to +- zero explicitely */ if (exp < -24) - return copysign(0.0f, x); + return __clc_copysign(0.0f, x); /* Remove lower 13 bits to make sure the number is rounded down */ int mask = 0xffffe000; /* Denormals cannot be flushed, and they use different bit for rounding */ if (exp < -14) - mask <<= min(-(exp + 14), 10); + mask <<= __clc_min(-(exp + 14), 10); return __clc_as_float(__clc_as_uint(x) & mask); } _CLC_DEF _CLC_OVERLOAD float __clc_rti(float x) { /* Handle nan corner case */ - if (isnan(x)) + if (__clc_isnan(x)) return x; - const float inf = copysign(INFINITY, x); + const float inf = __clc_copysign(INFINITY, x); uint ux = __clc_as_uint(x); /* Manage +- infinity explicitely */ @@ -82,23 +88,23 @@ _CLC_DEF _CLC_OVERLOAD float __clc_rti(float x) { } /* Manage +- zero explicitely */ if ((ux & 0x7fffffff) == 0) { - return copysign(0.0f, x); + return __clc_copysign(0.0f, x); } const int exp = (__clc_as_uint(x) >> 23 & 0xff) - 127; /* Manage range rounded to smallest half denormal explicitely */ if (exp < -24) { - return copysign(0x1.0p-24f, x); + return __clc_copysign(0x1.0p-24f, x); } /* Set lower 13 bits */ int mask = (1 << 13) - 1; /* Denormals cannot be flushed, and they use different bit for rounding */ if (exp < -14) { - mask = (1 << (13 + min(-(exp + 14), 10))) - 1; + mask = (1 << (13 + __clc_min(-(exp + 14), 10))) - 1; } - const float next = nextafter(__clc_as_float(ux | mask), inf); + const float next = __clc_nextafter(__clc_as_float(ux | mask), inf); return ((ux & mask) == 0) ? 
__clc_as_float(ux) : next; } _CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x) { @@ -116,7 +122,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_rte(float x) { /* The default assumes lower 13 bits are rounded, * but it might be more for denormals. * Shifting beyond last == 0b, and qr == 00b is not necessary */ - shift += min(-(exp + 14), 15); + shift += __clc_min(-(exp + 14), 15); } int mask = (1 << shift) - 1; const uint grs = mantissa & mask; diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst index 56eb0e588d81d..f1912668e4013 100644 --- a/libcxx/docs/ReleaseNotes/22.rst +++ b/libcxx/docs/ReleaseNotes/22.rst @@ -122,5 +122,9 @@ ABI Affecting Changes - ``ranges::iota_view`` is now aware of ``__int128``. This causes ``iota_view::difference_type`` to change from ``long long`` to ``__int128`` in some cases. +- ``std::allocator`` is now trivially default constructible. The behaviour can be reverted by defining + ``_LIBCPP_DEPRECATED_ABI_NON_TRIVIAL_ALLOCATOR``. Please inform the libc++ team if you need this flag, since it will + be removed in LLVM 24 if there is no evidence that it's required. + Build System Changes -------------------- diff --git a/libcxx/include/__algorithm/copy_backward.h b/libcxx/include/__algorithm/copy_backward.h index 6c9eba672e154..8758d2c9e7b5d 100644 --- a/libcxx/include/__algorithm/copy_backward.h +++ b/libcxx/include/__algorithm/copy_backward.h @@ -11,6 +11,7 @@ #include <__algorithm/copy_move_common.h> #include <__algorithm/copy_n.h> +#include <__algorithm/for_each_segment.h> #include <__algorithm/iterator_operations.h> #include <__algorithm/min.h> #include <__config> @@ -173,27 +174,10 @@ struct __copy_backward_impl { template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> operator()(_InIter __first, _InIter __last, _OutIter __result) const { - using _Traits = __segmented_iterator_traits<_InIter>; - auto __sfirst = _Traits::__segment(__first); - auto __slast = _Traits::__segment(__last); - if (__sfirst == __slast) { - auto __iters = - std::__copy_backward<_AlgPolicy>(_Traits::__local(__first), _Traits::__local(__last), std::move(__result)); - return std::make_pair(__last, __iters.second); - } - - __result = - std::__copy_backward<_AlgPolicy>(_Traits::__begin(__slast), _Traits::__local(__last), std::move(__result)) - .second; - --__slast; - while (__sfirst != __slast) { - __result = - std::__copy_backward<_AlgPolicy>(_Traits::__begin(__slast), _Traits::__end(__slast), std::move(__result)) - .second; - --__slast; - } - __result = std::__copy_backward<_AlgPolicy>(_Traits::__local(__first), _Traits::__end(__slast), std::move(__result)) - .second; + using __local_iterator = typename __segmented_iterator_traits<_InIter>::__local_iterator; + std::__for_each_segment_backward(__first, __last, [&__result](__local_iterator __lfirst, __local_iterator __llast) { + __result = std::__copy_backward<_AlgPolicy>(std::move(__lfirst), std::move(__llast), std::move(__result)).second; + }); return std::make_pair(__last, std::move(__result)); } diff --git a/libcxx/include/__algorithm/find_end.h b/libcxx/include/__algorithm/find_end.h index 86b4a3e2e3689..84b43e31a3a59 100644 --- a/libcxx/include/__algorithm/find_end.h +++ b/libcxx/include/__algorithm/find_end.h @@ -76,6 +76,111 @@ _LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_Iter1, _Iter1> } } +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter1, _Iter1> __find_end_impl( + _Iter1 __first1, + _Sent1 __sent1, + _Iter2 __first2, + _Sent2 
__sent2, + _Pred& __pred, + _Proj1& __proj1, + _Proj2& __proj2, + bidirectional_iterator_tag, + bidirectional_iterator_tag) { + auto __last1 = _IterOps<_AlgPolicy>::next(__first1, __sent1); + auto __last2 = _IterOps<_AlgPolicy>::next(__first2, __sent2); + // modeled after search algorithm (in reverse) + if (__first2 == __last2) + return std::make_pair(__last1, __last1); // Everything matches an empty sequence + _Iter1 __l1 = __last1; + _Iter2 __l2 = __last2; + --__l2; + while (true) { + // Find last element in sequence 1 that matches *(__last2-1), with a minimum of loop checks + while (true) { + if (__first1 == __l1) // return __last1 if no element matches *__first2 + return std::make_pair(__last1, __last1); + if (std::__invoke(__pred, std::__invoke(__proj1, *--__l1), std::__invoke(__proj2, *__l2))) + break; + } + // *__l1 matches *__l2, now match elements before here + _Iter1 __match_last = __l1; + _Iter1 __m1 = __l1; + _Iter2 __m2 = __l2; + while (true) { + if (__m2 == __first2) // If pattern exhausted, __m1 is the answer (works for 1 element pattern) + return std::make_pair(__m1, ++__match_last); + if (__m1 == __first1) // Otherwise if source exhausted, pattern not found + return std::make_pair(__last1, __last1); + + // if there is a mismatch, restart with a new __l1 + if (!std::__invoke(__pred, std::__invoke(__proj1, *--__m1), std::__invoke(__proj2, *--__m2))) { + break; + } // else there is a match, check next elements + } + } +} + +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_Iter1, _Iter1> __find_end_impl( + _Iter1 __first1, + _Sent1 __sent1, + _Iter2 __first2, + _Sent2 __sent2, + _Pred& __pred, + _Proj1& __proj1, + _Proj2& __proj2, + random_access_iterator_tag, + random_access_iterator_tag) { + typedef typename iterator_traits<_Iter1>::difference_type _D1; + auto __last1 = _IterOps<_AlgPolicy>::next(__first1, __sent1); + auto __last2 = _IterOps<_AlgPolicy>::next(__first2, __sent2); + // Take advantage of knowing source and pattern lengths. 
Stop short when source is smaller than pattern + auto __len2 = __last2 - __first2; + if (__len2 == 0) + return std::make_pair(__last1, __last1); + auto __len1 = __last1 - __first1; + if (__len1 < __len2) + return std::make_pair(__last1, __last1); + const _Iter1 __s = __first1 + _D1(__len2 - 1); // End of pattern match can't go before here + _Iter1 __l1 = __last1; + _Iter2 __l2 = __last2; + --__l2; + while (true) { + while (true) { + if (__s == __l1) + return std::make_pair(__last1, __last1); + if (std::__invoke(__pred, std::__invoke(__proj1, *--__l1), std::__invoke(__proj2, *__l2))) + break; + } + _Iter1 __last_match = __l1; + _Iter1 __m1 = __l1; + _Iter2 __m2 = __l2; + while (true) { + if (__m2 == __first2) + return std::make_pair(__m1, ++__last_match); + // no need to check range on __m1 because __s guarantees we have enough source + if (!std::__invoke(__pred, std::__invoke(__proj1, *--__m1), std::__invoke(__proj2, *--__m2))) { + break; + } + } + } +} + template [[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _ForwardIterator1 __find_end_classic( _ForwardIterator1 __first1, diff --git a/libcxx/include/__algorithm/for_each_segment.h b/libcxx/include/__algorithm/for_each_segment.h index 93aa8259b2f7f..c02436c9aa33c 100644 --- a/libcxx/include/__algorithm/for_each_segment.h +++ b/libcxx/include/__algorithm/for_each_segment.h @@ -48,6 +48,32 @@ __for_each_segment(_SegmentedIterator __first, _SegmentedIterator __last, _Funct __func(_Traits::__begin(__sfirst), _Traits::__local(__last)); } +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void +__for_each_segment_backward(_SegmentedIterator __first, _SegmentedIterator __last, _Functor __func) { + using _Traits = __segmented_iterator_traits<_SegmentedIterator>; + + auto __sfirst = _Traits::__segment(__first); + auto __slast = _Traits::__segment(__last); + + // We are in a single segment, so we might not be at the beginning or end + if (__sfirst == __slast) { + __func(_Traits::__local(__first), _Traits::__local(__last)); + return; + } + + // We have more than one segment. 
Iterate over the last segment, since we might not start at the end + __func(_Traits::__begin(__slast), _Traits::__local(__last)); + --__slast; + // iterate over the segments which are guaranteed to be completely in the range + while (__sfirst != __slast) { + __func(_Traits::__begin(__slast), _Traits::__end(__slast)); + --__slast; + } + // iterate over the first segment + __func(_Traits::__local(__first), _Traits::__end(__slast)); +} + _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___ALGORITHM_FOR_EACH_SEGMENT_H diff --git a/libcxx/include/__algorithm/move_backward.h b/libcxx/include/__algorithm/move_backward.h index a4698327b474d..43b72057a5eca 100644 --- a/libcxx/include/__algorithm/move_backward.h +++ b/libcxx/include/__algorithm/move_backward.h @@ -11,6 +11,7 @@ #include <__algorithm/copy_backward.h> #include <__algorithm/copy_move_common.h> +#include <__algorithm/for_each_segment.h> #include <__algorithm/iterator_operations.h> #include <__algorithm/min.h> #include <__config> @@ -54,27 +55,10 @@ struct __move_backward_impl { template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> operator()(_InIter __first, _InIter __last, _OutIter __result) const { - using _Traits = __segmented_iterator_traits<_InIter>; - auto __sfirst = _Traits::__segment(__first); - auto __slast = _Traits::__segment(__last); - if (__sfirst == __slast) { - auto __iters = - std::__move_backward<_AlgPolicy>(_Traits::__local(__first), _Traits::__local(__last), std::move(__result)); - return std::make_pair(__last, __iters.second); - } - - __result = - std::__move_backward<_AlgPolicy>(_Traits::__begin(__slast), _Traits::__local(__last), std::move(__result)) - .second; - --__slast; - while (__sfirst != __slast) { - __result = - std::__move_backward<_AlgPolicy>(_Traits::__begin(__slast), _Traits::__end(__slast), std::move(__result)) - .second; - --__slast; - } - __result = std::__move_backward<_AlgPolicy>(_Traits::__local(__first), _Traits::__end(__slast), std::move(__result)) - .second; + using __local_iterator = typename __segmented_iterator_traits<_InIter>::__local_iterator; + std::__for_each_segment_backward(__first, __last, [&__result](__local_iterator __lfirst, __local_iterator __llast) { + __result = std::__move_backward<_AlgPolicy>(std::move(__lfirst), std::move(__llast), std::move(__result)).second; + }); return std::make_pair(__last, std::move(__result)); } diff --git a/libcxx/include/__memory/allocator.h b/libcxx/include/__memory/allocator.h index 52f4122a9bf5f..1c96a2ab64578 100644 --- a/libcxx/include/__memory/allocator.h +++ b/libcxx/include/__memory/allocator.h @@ -14,7 +14,6 @@ #include <__cstddef/ptrdiff_t.h> #include <__cstddef/size_t.h> #include <__memory/addressof.h> -#include <__memory/allocate_at_least.h> #include <__memory/allocator_traits.h> #include <__new/allocate.h> #include <__new/exceptions.h> @@ -51,33 +50,21 @@ class allocator { }; #endif // _LIBCPP_STD_VER <= 17 -// This class provides a non-trivial default constructor to the class that derives from it -// if the condition is satisfied. -// -// The second template parameter exists to allow giving a unique type to __non_trivial_if, -// which makes it possible to avoid breaking the ABI when making this a base class of an -// existing class. Without that, imagine we have classes D1 and D2, both of which used to -// have no base classes, but which now derive from __non_trivial_if. 
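The conditional-base trick this removed comment describes (renamed by the patch to __non_trivially_default_constructible_if) is easy to demonstrate in isolation. Below is a minimal standalone sketch; the names are made up for illustration and are not libc++'s internals:

#include <type_traits>

// Stand-in for the conditional base; hypothetical names, not libc++'s.
template <bool _NonTrivial, class _UniqueTag>
struct non_trivial_if {};

template <class _UniqueTag>
struct non_trivial_if<true, _UniqueTag> {
  constexpr non_trivial_if() noexcept {}
};

// A user-provided constructor in the base is enough to make the derived
// class non-trivially default constructible.
struct Trivial : non_trivial_if<false, Trivial> {};
struct NonTrivial : non_trivial_if<true, NonTrivial> {};
static_assert(std::is_trivially_default_constructible<Trivial>::value, "");
static_assert(!std::is_trivially_default_constructible<NonTrivial>::value, "");

// With a shared (non-unique) tag, two classes pick up base subobjects of
// the *same* type; same-type subobjects may not share an address, so a
// class deriving from both can no longer be empty.
struct E1 : non_trivial_if<true, void> {};
struct E2 : non_trivial_if<true, void> {};
struct BothShared : E1, E2 {};
static_assert(sizeof(BothShared) > 1, "");

int main() {}

The last assertion is guaranteed by the language, which is exactly why the real class takes a unique tag as its second template parameter, as the rest of this comment explains.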
The layout of a class -// that inherits from both D1 and D2 will change because the two __non_trivial_if base -// classes are not allowed to share the same address. -// -// By making those __non_trivial_if base classes unique, we work around this problem and -// it is safe to start deriving from __non_trivial_if in existing classes. -template -struct __non_trivial_if {}; +template +struct __non_trivially_default_constructible_if {}; template -struct __non_trivial_if { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __non_trivial_if() _NOEXCEPT {} +struct __non_trivially_default_constructible_if { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __non_trivially_default_constructible_if() {} }; -// allocator -// -// Note: For ABI compatibility between C++20 and previous standards, we make -// allocator trivial in C++20. - template -class allocator : private __non_trivial_if::value, allocator<_Tp> > { +class allocator +// TODO(LLVM 24): Remove the opt-out +#ifdef _LIBCPP_DEPRECATED_ABI_NON_TRIVIAL_ALLOCATOR + : __non_trivially_default_constructible_if::value, allocator<_Tp> > +#endif +{ static_assert(!is_const<_Tp>::value, "std::allocator does not support const types"); static_assert(!is_volatile<_Tp>::value, "std::allocator does not support volatile types"); diff --git a/libcxx/include/__split_buffer b/libcxx/include/__split_buffer index 1e05e4df8ba0f..d6176f8ca2749 100644 --- a/libcxx/include/__split_buffer +++ b/libcxx/include/__split_buffer @@ -33,7 +33,6 @@ #include <__type_traits/is_swappable.h> #include <__type_traits/is_trivially_destructible.h> #include <__type_traits/is_trivially_relocatable.h> -#include <__type_traits/remove_reference.h> #include <__utility/forward.h> #include <__utility/move.h> @@ -54,8 +53,7 @@ class __split_buffer_pointer_layout { protected: using value_type = _Tp; using allocator_type = _Allocator; - using __alloc_rr _LIBCPP_NODEBUG = __libcpp_remove_reference_t; - using __alloc_traits _LIBCPP_NODEBUG = allocator_traits<__alloc_rr>; + using __alloc_traits _LIBCPP_NODEBUG = allocator_traits; using reference = value_type&; using const_reference = const value_type&; using size_type = typename __alloc_traits::size_type; @@ -159,9 +157,9 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_reference back() const _NOEXCEPT { return *(__end_ - 1); } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __swap_without_allocator( - __split_buffer_pointer_layout<__split_buffer, + __split_buffer_pointer_layout<__split_buffer, value_type, - __alloc_rr&>& __other) _NOEXCEPT { + allocator_type>& __other) _NOEXCEPT { std::swap(__front_cap_, __other.__front_cap_); std::swap(__begin_, __other.__begin_); std::swap(__back_cap_, __other.__back_cap_); @@ -207,8 +205,7 @@ class __split_buffer_size_layout { protected: using value_type = _Tp; using allocator_type = _Allocator; - using __alloc_rr _LIBCPP_NODEBUG = __libcpp_remove_reference_t; - using __alloc_traits _LIBCPP_NODEBUG = allocator_traits<__alloc_rr>; + using __alloc_traits _LIBCPP_NODEBUG = allocator_traits; using reference = value_type&; using const_reference = const value_type&; using size_type = typename __alloc_traits::size_type; @@ -316,9 +313,9 @@ public: } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __swap_without_allocator( - __split_buffer_pointer_layout<__split_buffer, + __split_buffer_pointer_layout<__split_buffer, value_type, - __alloc_rr&>& __other) _NOEXCEPT { + allocator_type>& __other) _NOEXCEPT { std::swap(__front_cap_, __other.__front_cap_); std::swap(__begin_, __other.__begin_); 
std::swap(__cap_, __other.__cap_); @@ -386,8 +383,7 @@ private: // protected: // using value_type = _Tp; // using allocator_type = _Allocator; -// using __alloc_rr = __libcpp_remove_reference_t; -// using __alloc_traits = allocator_traits<__alloc_rr>; +// using __alloc_traits = allocator_traits; // using reference = value_type&; // using const_reference = const value_type&; // using size_type = typename __alloc_traits::size_type; @@ -462,7 +458,6 @@ public: using __base_type::__set_sentinel; using __base_type::__set_valid_range; - using typename __base_type::__alloc_rr; using typename __base_type::__alloc_traits; using typename __base_type::allocator_type; using typename __base_type::const_iterator; @@ -489,18 +484,18 @@ public: _LIBCPP_HIDE_FROM_ABI __split_buffer() = default; - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit __split_buffer(__alloc_rr& __a) : __base_type(__a) {} + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit __split_buffer(allocator_type& __a) : __base_type(__a) {} - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit __split_buffer(const __alloc_rr& __a) + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit __split_buffer(const allocator_type& __a) : __base_type(__a) {} _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI - __split_buffer(size_type __cap, size_type __start, __alloc_rr& __a); + __split_buffer(size_type __cap, size_type __start, allocator_type& __a); _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __split_buffer(__split_buffer&& __c) _NOEXCEPT_(is_nothrow_move_constructible::value); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __split_buffer(__split_buffer&& __c, const __alloc_rr& __a); + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __split_buffer(__split_buffer&& __c, const allocator_type& __a); _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __split_buffer& operator=(__split_buffer&& __c) _NOEXCEPT_((__alloc_traits::propagate_on_container_move_assignment::value && @@ -560,7 +555,7 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __destruct_at_end(pointer __new_last, true_type) _NOEXCEPT; _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void swap(__split_buffer& __x) - _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v<__alloc_rr>); + _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v); _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool __invariants() const { if (__front_cap() == nullptr) { @@ -589,7 +584,7 @@ public: } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void - __swap_without_allocator(__split_buffer& __other) _NOEXCEPT { + __swap_without_allocator(__split_buffer& __other) _NOEXCEPT { __base_type::__swap_without_allocator(__other); } @@ -653,7 +648,7 @@ template class _Lay template _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator, _Layout>::__construct_at_end_with_sentinel(_Iterator __first, _Sentinel __last) { - __alloc_rr& __a = __get_allocator(); + allocator_type& __a = __get_allocator(); for (; __first != __last; ++__first) { if (__back_spare() == 0) { size_type __old_cap = capacity(); @@ -718,7 +713,7 @@ __split_buffer<_Tp, _Allocator, _Layout>::__destruct_at_end(pointer __new_last, template class _Layout> _LIBCPP_CONSTEXPR_SINCE_CXX20 -__split_buffer<_Tp, _Allocator, _Layout>::__split_buffer(size_type __cap, size_type __start, __alloc_rr& __a) +__split_buffer<_Tp, _Allocator, _Layout>::__split_buffer(size_type __cap, size_type __start, 
allocator_type& __a) : __base_type(__a) { _LIBCPP_ASSERT_INTERNAL(__cap >= __start, "can't have a start point outside the capacity"); if (__cap > 0) { @@ -748,7 +743,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 __split_buffer<_Tp, _Allocator, _Layout>::__split_ template class _Layout> _LIBCPP_CONSTEXPR_SINCE_CXX20 -__split_buffer<_Tp, _Allocator, _Layout>::__split_buffer(__split_buffer&& __c, const __alloc_rr& __a) +__split_buffer<_Tp, _Allocator, _Layout>::__split_buffer(__split_buffer&& __c, const allocator_type& __a) : __base_type(__a) { if (__a == __c.__get_allocator()) { __set_data(__c.__front_cap()); @@ -781,7 +776,7 @@ __split_buffer<_Tp, _Allocator, _Layout>::operator=(__split_buffer&& __c) template class _Layout> _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator, _Layout>::swap(__split_buffer& __x) - _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v<__alloc_rr>) { + _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v) { __base_type::swap(__x); } @@ -791,7 +786,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator, _Layout>::shr #if _LIBCPP_HAS_EXCEPTIONS try { #endif // _LIBCPP_HAS_EXCEPTIONS - __split_buffer __t(size(), 0, __get_allocator()); + __split_buffer __t(size(), 0, __get_allocator()); if (__t.capacity() < capacity()) { __t.__construct_at_end(move_iterator(begin()), move_iterator(end())); __t.__set_sentinel(size()); @@ -818,7 +813,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator, _Layout>::emp __set_valid_range(std::move_backward(begin(), __end, __new_end), __new_end); } else { size_type __c = std::max(2 * capacity(), 1); - __split_buffer __t(__c, (__c + 3) / 4, __get_allocator()); + __split_buffer __t(__c, (__c + 3) / 4, __get_allocator()); __t.__construct_at_end(move_iterator(begin()), move_iterator(__end)); __base_type::__swap_without_allocator(__t); } @@ -840,7 +835,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator, _Layout>::emp __set_valid_range(begin() - __d, __end); } else { size_type __c = std::max(2 * capacity(), 1); - __split_buffer __t(__c, __c / 4, __get_allocator()); + __split_buffer __t(__c, __c / 4, __get_allocator()); __t.__construct_at_end(move_iterator(begin()), move_iterator(__end)); __base_type::__swap_without_allocator(__t); } diff --git a/libcxx/include/__tree b/libcxx/include/__tree index ceae22bb48702..f8064106de075 100644 --- a/libcxx/include/__tree +++ b/libcxx/include/__tree @@ -902,8 +902,6 @@ public: _LIBCPP_HIDE_FROM_ABI __tree& operator=(const __tree& __t); template _LIBCPP_HIDE_FROM_ABI void __assign_unique(_ForwardIterator __first, _ForwardIterator __last); - template - _LIBCPP_HIDE_FROM_ABI void __assign_multi(_InputIterator __first, _InputIterator __last); _LIBCPP_HIDE_FROM_ABI __tree(__tree&& __t) _NOEXCEPT_( is_nothrow_move_constructible<__node_allocator>::value&& is_nothrow_move_constructible::value); _LIBCPP_HIDE_FROM_ABI __tree(__tree&& __t, const allocator_type& __a); @@ -1036,11 +1034,6 @@ public: } } - _LIBCPP_HIDE_FROM_ABI pair __node_assign_unique(const value_type& __v, __node_pointer __dest); - - _LIBCPP_HIDE_FROM_ABI iterator __node_insert_multi(__node_pointer __nd); - _LIBCPP_HIDE_FROM_ABI iterator __node_insert_multi(const_iterator __p, __node_pointer __nd); - template _LIBCPP_HIDE_FROM_ABI void __insert_range_unique(_InIter __first, _Sent __last) { if (__first == __last) @@ -1311,43 +1304,6 @@ private: __lhs = std::forward<_From>(__rhs); } - struct _DetachedTreeCache { - 
_LIBCPP_HIDE_FROM_ABI explicit _DetachedTreeCache(__tree* __t) _NOEXCEPT - : __t_(__t), - __cache_root_(__detach_from_tree(__t)) { - __advance(); - } - - _LIBCPP_HIDE_FROM_ABI __node_pointer __get() const _NOEXCEPT { return __cache_elem_; } - - _LIBCPP_HIDE_FROM_ABI void __advance() _NOEXCEPT { - __cache_elem_ = __cache_root_; - if (__cache_root_) { - __cache_root_ = __detach_next(__cache_root_); - } - } - - _LIBCPP_HIDE_FROM_ABI ~_DetachedTreeCache() { - __t_->destroy(__cache_elem_); - if (__cache_root_) { - while (__cache_root_->__parent_ != nullptr) - __cache_root_ = static_cast<__node_pointer>(__cache_root_->__parent_); - __t_->destroy(__cache_root_); - } - } - - _DetachedTreeCache(_DetachedTreeCache const&) = delete; - _DetachedTreeCache& operator=(_DetachedTreeCache const&) = delete; - - private: - _LIBCPP_HIDE_FROM_ABI static __node_pointer __detach_from_tree(__tree* __t) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI static __node_pointer __detach_next(__node_pointer) _NOEXCEPT; - - __tree* __t_; - __node_pointer __cache_root_; - __node_pointer __cache_elem_; - }; - class __tree_deleter { __node_allocator& __alloc_; @@ -1486,47 +1442,6 @@ private: } }; -// Precondition: __size_ != 0 -template -typename __tree<_Tp, _Compare, _Allocator>::__node_pointer -__tree<_Tp, _Compare, _Allocator>::_DetachedTreeCache::__detach_from_tree(__tree* __t) _NOEXCEPT { - __node_pointer __cache = static_cast<__node_pointer>(__t->__begin_node_); - __t->__begin_node_ = __t->__end_node(); - __t->__end_node()->__left_->__parent_ = nullptr; - __t->__end_node()->__left_ = nullptr; - __t->__size_ = 0; - // __cache->__left_ == nullptr - if (__cache->__right_ != nullptr) - __cache = static_cast<__node_pointer>(__cache->__right_); - // __cache->__left_ == nullptr - // __cache->__right_ == nullptr - return __cache; -} - -// Precondition: __cache != nullptr -// __cache->left_ == nullptr -// __cache->right_ == nullptr -// This is no longer a red-black tree -template -typename __tree<_Tp, _Compare, _Allocator>::__node_pointer -__tree<_Tp, _Compare, _Allocator>::_DetachedTreeCache::__detach_next(__node_pointer __cache) _NOEXCEPT { - if (__cache->__parent_ == nullptr) - return nullptr; - if (std::__tree_is_left_child(static_cast<__node_base_pointer>(__cache))) { - __cache->__parent_->__left_ = nullptr; - __cache = static_cast<__node_pointer>(__cache->__parent_); - if (__cache->__right_ == nullptr) - return __cache; - return static_cast<__node_pointer>(std::__tree_leaf(__cache->__right_)); - } - // __cache is right child - __cache->__parent_unsafe()->__right_ = nullptr; - __cache = static_cast<__node_pointer>(__cache->__parent_); - if (__cache->__left_ == nullptr) - return __cache; - return static_cast<__node_pointer>(std::__tree_leaf(__cache->__left_)); -} - template __tree<_Tp, _Compare, _Allocator>& __tree<_Tp, _Compare, _Allocator>::operator=(const __tree& __t) { if (this == std::addressof(__t)) @@ -1549,46 +1464,6 @@ __tree<_Tp, _Compare, _Allocator>& __tree<_Tp, _Compare, _Allocator>::operator=( return *this; } -template -template -void __tree<_Tp, _Compare, _Allocator>::__assign_unique(_ForwardIterator __first, _ForwardIterator __last) { - using _ITraits = iterator_traits<_ForwardIterator>; - using _ItValueType = typename _ITraits::value_type; - static_assert( - is_same<_ItValueType, value_type>::value, "__assign_unique may only be called with the containers value type"); - static_assert( - __has_forward_iterator_category<_ForwardIterator>::value, "__assign_unique requires a forward iterator"); - if (__size_ != 0) { - 
_DetachedTreeCache __cache(this); - for (; __cache.__get() != nullptr && __first != __last; ++__first) { - if (__node_assign_unique(*__first, __cache.__get()).second) - __cache.__advance(); - } - } - for (; __first != __last; ++__first) - __emplace_unique(*__first); -} - -template -template -void __tree<_Tp, _Compare, _Allocator>::__assign_multi(_InputIterator __first, _InputIterator __last) { - using _ITraits = iterator_traits<_InputIterator>; - using _ItValueType = typename _ITraits::value_type; - static_assert( - is_same<_ItValueType, value_type>::value, "__assign_multi may only be called with the containers value_type"); - if (__size_ != 0) { - _DetachedTreeCache __cache(this); - for (; __cache.__get() && __first != __last; ++__first) { - __assign_value(__cache.__get()->__get_value(), *__first); - __node_insert_multi(__cache.__get()); - __cache.__advance(); - } - } - const_iterator __e = end(); - for (; __first != __last; ++__first) - __emplace_hint_multi(__e, *__first); -} - template __tree<_Tp, _Compare, _Allocator>::__tree(const __tree& __t) : __begin_node_(__end_node()), @@ -1942,39 +1817,6 @@ __tree<_Tp, _Compare, _Allocator>::__emplace_hint_multi(const_iterator __p, _Arg return iterator(static_cast<__node_pointer>(__h.release())); } -template -pair::iterator, bool> -__tree<_Tp, _Compare, _Allocator>::__node_assign_unique(const value_type& __v, __node_pointer __nd) { - auto [__parent, __child] = __find_equal(__v); - __node_pointer __r = static_cast<__node_pointer>(__child); - bool __inserted = false; - if (__child == nullptr) { - __assign_value(__nd->__get_value(), __v); - __insert_node_at(__parent, __child, static_cast<__node_base_pointer>(__nd)); - __r = __nd; - __inserted = true; - } - return pair(iterator(__r), __inserted); -} - -template -typename __tree<_Tp, _Compare, _Allocator>::iterator -__tree<_Tp, _Compare, _Allocator>::__node_insert_multi(__node_pointer __nd) { - __end_node_pointer __parent; - __node_base_pointer& __child = __find_leaf_high(__parent, __nd->__get_value()); - __insert_node_at(__parent, __child, static_cast<__node_base_pointer>(__nd)); - return iterator(__nd); -} - -template -typename __tree<_Tp, _Compare, _Allocator>::iterator -__tree<_Tp, _Compare, _Allocator>::__node_insert_multi(const_iterator __p, __node_pointer __nd) { - __end_node_pointer __parent; - __node_base_pointer& __child = __find_leaf(__p, __parent, __nd->__get_value()); - __insert_node_at(__parent, __child, static_cast<__node_base_pointer>(__nd)); - return iterator(__nd); -} - template typename __tree<_Tp, _Compare, _Allocator>::iterator __tree<_Tp, _Compare, _Allocator>::__remove_node_pointer(__node_pointer __ptr) _NOEXCEPT { diff --git a/libcxx/include/__vector/vector.h b/libcxx/include/__vector/vector.h index 4961a5fcb2067..93358d863492e 100644 --- a/libcxx/include/__vector/vector.h +++ b/libcxx/include/__vector/vector.h @@ -687,9 +687,9 @@ class vector { } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void - __swap_out_circular_buffer(__split_buffer& __v); + __swap_out_circular_buffer(__split_buffer& __v); _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pointer - __swap_out_circular_buffer(__split_buffer& __v, pointer __p); + __swap_out_circular_buffer(__split_buffer& __v, pointer __p); _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __move_range(pointer __from_s, pointer __from_e, pointer __to); _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __move_assign(vector& __c, true_type) @@ -810,7 +810,7 @@ class vector { return __p; } - 
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __swap_layouts(__split_buffer<_Tp, allocator_type&>& __sb) { + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __swap_layouts(__split_buffer<_Tp, allocator_type>& __sb) { auto __vector_begin = __begin_; auto __vector_sentinel = __end_; auto __vector_cap = __cap_; @@ -855,7 +855,7 @@ vector(from_range_t, _Range&&, _Alloc = _Alloc()) -> vector _LIBCPP_CONSTEXPR_SINCE_CXX20 void -vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer& __v) { +vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer& __v) { __annotate_delete(); auto __new_begin = __v.begin() - size(); std::__uninitialized_allocator_relocate( @@ -874,7 +874,7 @@ vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer _LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::pointer -vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer& __v, pointer __p) { +vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer& __v, pointer __p) { __annotate_delete(); pointer __ret = __v.begin(); @@ -1074,7 +1074,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::reserve(size_type __ if (__n > capacity()) { if (__n > max_size()) this->__throw_length_error(); - __split_buffer __v(__n, size(), this->__alloc_); + __split_buffer __v(__n, size(), this->__alloc_); __swap_out_circular_buffer(__v); } } @@ -1085,7 +1085,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::shrink_to_fit() _NOE #if _LIBCPP_HAS_EXCEPTIONS try { #endif // _LIBCPP_HAS_EXCEPTIONS - __split_buffer __v(size(), size(), this->__alloc_); + __split_buffer __v(size(), size(), this->__alloc_); // The Standard mandates shrink_to_fit() does not increase the capacity. // With equal capacity keep the existing buffer. This avoids extra work // due to swapping the elements. @@ -1102,7 +1102,7 @@ template template _LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::pointer vector<_Tp, _Allocator>::__emplace_back_slow_path(_Args&&... __args) { - __split_buffer __v(__recommend(size() + 1), size(), this->__alloc_); + __split_buffer __v(__recommend(size() + 1), size(), this->__alloc_); // __v.emplace_back(std::forward<_Args>(__args)...); pointer __end = __v.end(); __alloc_traits::construct(this->__alloc_, std::__to_address(__end), std::forward<_Args>(__args)...); @@ -1205,7 +1205,7 @@ vector<_Tp, _Allocator>::insert(const_iterator __position, const_reference __x) *__p = *__xr; } } else { - __split_buffer __v(__recommend(size() + 1), __p - this->__begin_, this->__alloc_); + __split_buffer __v(__recommend(size() + 1), __p - this->__begin_, this->__alloc_); __v.emplace_back(__x); __p = __swap_out_circular_buffer(__v, __p); } @@ -1224,7 +1224,7 @@ vector<_Tp, _Allocator>::insert(const_iterator __position, value_type&& __x) { *__p = std::move(__x); } } else { - __split_buffer __v(__recommend(size() + 1), __p - this->__begin_, this->__alloc_); + __split_buffer __v(__recommend(size() + 1), __p - this->__begin_, this->__alloc_); __v.emplace_back(std::move(__x)); __p = __swap_out_circular_buffer(__v, __p); } @@ -1245,7 +1245,7 @@ vector<_Tp, _Allocator>::emplace(const_iterator __position, _Args&&... 
__args) { *__p = std::move(__tmp.get()); } } else { - __split_buffer __v(__recommend(size() + 1), __p - this->__begin_, this->__alloc_); + __split_buffer __v(__recommend(size() + 1), __p - this->__begin_, this->__alloc_); __v.emplace_back(std::forward<_Args>(__args)...); __p = __swap_out_circular_buffer(__v, __p); } @@ -1273,7 +1273,7 @@ vector<_Tp, _Allocator>::insert(const_iterator __position, size_type __n, const_ std::fill_n(__p, __n, *__xr); } } else { - __split_buffer __v(__recommend(size() + __n), __p - this->__begin_, this->__alloc_); + __split_buffer __v(__recommend(size() + __n), __p - this->__begin_, this->__alloc_); __v.__construct_at_end(__n, __x); __p = __swap_out_circular_buffer(__v, __p); } @@ -1294,11 +1294,11 @@ vector<_Tp, _Allocator>::__insert_with_sentinel(const_iterator __position, _Inpu if (__first == __last) (void)std::rotate(__p, __old_last, this->__end_); else { - __split_buffer __v(__alloc_); + __split_buffer __v(__alloc_); auto __guard = std::__make_exception_guard( _AllocatorDestroyRangeReverse(__alloc_, __old_last, this->__end_)); __v.__construct_at_end_with_sentinel(std::move(__first), std::move(__last)); - __split_buffer __merged( + __split_buffer __merged( __recommend(size() + __v.size()), __off, __alloc_); // has `__off` positions available at the front std::__uninitialized_allocator_relocate( __alloc_, std::__to_address(__old_last), std::__to_address(this->__end_), std::__to_address(__merged.end())); @@ -1344,7 +1344,7 @@ vector<_Tp, _Allocator>::__insert_with_size( __insert_assign_n_unchecked<_AlgPolicy>(std::move(__first), __n, __p); } } else { - __split_buffer __v(__recommend(size() + __n), __p - this->__begin_, this->__alloc_); + __split_buffer __v(__recommend(size() + __n), __p - this->__begin_, this->__alloc_); __v.__construct_at_end_with_size(std::move(__first), __n); __p = __swap_out_circular_buffer(__v, __p); } @@ -1359,7 +1359,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::resize(size_type __n if (__new_size <= capacity()) { __construct_at_end(__new_size - __current_size); } else { - __split_buffer __v(__recommend(__new_size), __current_size, __alloc_); + __split_buffer __v(__recommend(__new_size), __current_size, __alloc_); __v.__construct_at_end(__new_size - __current_size); __swap_out_circular_buffer(__v); } @@ -1375,7 +1375,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::resize(size_type __n if (__new_size <= capacity()) __construct_at_end(__new_size - __current_size, __x); else { - __split_buffer __v(__recommend(__new_size), __current_size, __alloc_); + __split_buffer __v(__recommend(__new_size), __current_size, __alloc_); __v.__construct_at_end(__new_size - __current_size, __x); __swap_out_circular_buffer(__v); } diff --git a/libcxx/include/deque b/libcxx/include/deque index ad2d759e1fcac..befe27bb4282d 100644 --- a/libcxx/include/deque +++ b/libcxx/include/deque @@ -1785,9 +1785,9 @@ template template _LIBCPP_HIDE_FROM_ABI typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::__insert_with_sentinel(const_iterator __p, _Iterator __f, _Sentinel __l) { - __split_buffer __buf(__alloc()); + __split_buffer __buf(__alloc()); __buf.__construct_at_end_with_sentinel(std::move(__f), std::move(__l)); - typedef typename __split_buffer::iterator __bi; + typedef typename __split_buffer::iterator __bi; return insert(__p, move_iterator<__bi>(__buf.begin()), move_iterator<__bi>(__buf.end())); } @@ -1802,9 +1802,9 @@ template template _LIBCPP_HIDE_FROM_ABI typename deque<_Tp, _Allocator>::iterator deque<_Tp, 
_Allocator>::__insert_with_size(const_iterator __p, _Iterator __f, size_type __n) { - __split_buffer __buf(__n, 0, __alloc()); + __split_buffer __buf(__n, 0, __alloc()); __buf.__construct_at_end_with_size(__f, __n); - typedef typename __split_buffer::iterator __fwd; + typedef typename __split_buffer::iterator __fwd; return insert(__p, move_iterator<__fwd>(__buf.begin()), move_iterator<__fwd>(__buf.end())); } @@ -1982,7 +1982,7 @@ void deque<_Tp, _Allocator>::__add_front_capacity() { } // Else need to allocate 1 buffer, *and* we need to reallocate __map_. else { - __split_buffer __buf( + __split_buffer __buf( std::max(2 * __map_.capacity(), 1), 0, __map_.__get_allocator()); typedef __allocator_destructor<_Allocator> _Dp; @@ -2042,7 +2042,7 @@ void deque<_Tp, _Allocator>::__add_front_capacity(size_type __n) { // Else need to allocate __nb buffers, *and* we need to reallocate __map_. else { size_type __ds = (__nb + __back_capacity) * __block_size - __map_.empty(); - __split_buffer __buf( + __split_buffer __buf( std::max(2 * __map_.capacity(), __nb + __map_.size()), 0, __map_.__get_allocator()); auto __guard = std::__make_exception_guard([&] { __annotate_delete(); @@ -2094,7 +2094,7 @@ void deque<_Tp, _Allocator>::__add_back_capacity() { } // Else need to allocate 1 buffer, *and* we need to reallocate __map_. else { - __split_buffer __buf( + __split_buffer __buf( std::max(2 * __map_.capacity(), 1), __map_.size(), __map_.__get_allocator()); typedef __allocator_destructor<_Allocator> _Dp; @@ -2154,7 +2154,7 @@ void deque<_Tp, _Allocator>::__add_back_capacity(size_type __n) { // Else need to allocate __nb buffers, *and* we need to reallocate __map_. else { size_type __ds = __front_capacity * __block_size; - __split_buffer __buf( + __split_buffer __buf( std::max(2 * __map_.capacity(), __nb + __map_.size()), __map_.size() - __front_capacity, __map_.__get_allocator()); diff --git a/libcxx/include/map b/libcxx/include/map index 0dca11cabd12e..e67f7cef5861d 100644 --- a/libcxx/include/map +++ b/libcxx/include/map @@ -1015,7 +1015,8 @@ public: # endif _LIBCPP_HIDE_FROM_ABI map& operator=(initializer_list __il) { - __tree_.__assign_unique(__il.begin(), __il.end()); + clear(); + insert(__il.begin(), __il.end()); return *this; } @@ -1689,7 +1690,8 @@ public: # endif _LIBCPP_HIDE_FROM_ABI multimap& operator=(initializer_list __il) { - __tree_.__assign_multi(__il.begin(), __il.end()); + clear(); + insert(__il.begin(), __il.end()); return *this; } diff --git a/libcxx/include/set b/libcxx/include/set index 3d6f571a42a1a..f333d97defac1 100644 --- a/libcxx/include/set +++ b/libcxx/include/set @@ -692,7 +692,8 @@ public: # endif _LIBCPP_HIDE_FROM_ABI set& operator=(initializer_list __il) { - __tree_.__assign_unique(__il.begin(), __il.end()); + clear(); + insert(__il.begin(), __il.end()); return *this; } @@ -1136,7 +1137,8 @@ public: # endif _LIBCPP_HIDE_FROM_ABI multiset& operator=(initializer_list __il) { - __tree_.__assign_multi(__il.begin(), __il.end()); + clear(); + insert(__il.begin(), __il.end()); return *this; } diff --git a/libcxx/test/libcxx/memory/allocator_triviality.compile.pass.cpp b/libcxx/test/libcxx/memory/allocator_triviality.compile.pass.cpp new file mode 100644 index 0000000000000..ff298963e074a --- /dev/null +++ b/libcxx/test/libcxx/memory/allocator_triviality.compile.pass.cpp @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// + +// Make sure that std::allocator is trivial. + +// + +#include +#include +#include + +static_assert(std::is_trivially_default_constructible >::value, ""); +static_assert(std::is_trivially_default_constructible >::value, ""); +static_assert(std::is_trivially_default_constructible >::value, ""); + +static_assert(std::is_trivially_copyable >::value, ""); +static_assert(std::is_trivially_copyable >::value, ""); +static_assert(std::is_trivially_copyable >::value, ""); diff --git a/libcxx/test/libcxx/memory/allocator_triviality.deprecated_abi.compile.pass.cpp b/libcxx/test/libcxx/memory/allocator_triviality.deprecated_abi.compile.pass.cpp new file mode 100644 index 0000000000000..be2a1840ec903 --- /dev/null +++ b/libcxx/test/libcxx/memory/allocator_triviality.deprecated_abi.compile.pass.cpp @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// + +// Make sure that std::allocator is not trivial if _LIBCPP_DEPRECATED_ABI_NON_TRIVIAL_ALLOCATOR is defined. +// std::allocator _should_ still be trivial, since it has always been trivial. + +// + +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DEPRECATED_ABI_NON_TRIVIAL_ALLOCATOR + +#include +#include +#include + +static_assert(!std::is_trivially_default_constructible >::value, ""); +static_assert(!std::is_trivially_default_constructible >::value, ""); +static_assert(std::is_trivially_default_constructible >::value, ""); + +static_assert(std::is_trivially_copyable >::value, ""); +static_assert(std::is_trivially_copyable >::value, ""); +static_assert(std::is_trivially_copyable >::value, ""); diff --git a/libcxx/test/libcxx/memory/allocator_void.trivial.compile.pass.cpp b/libcxx/test/libcxx/memory/allocator_void.trivial.compile.pass.cpp deleted file mode 100644 index b7dfc190e8e91..0000000000000 --- a/libcxx/test/libcxx/memory/allocator_void.trivial.compile.pass.cpp +++ /dev/null @@ -1,26 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// Make sure that std::allocator is trivial. This was the case before C++20 -// with the std::allocator explicit specialization, and this test makes sure -// that we maintain that property across all standards. -// -// This is important since triviality has implications on how the type is passed -// as a function argument in the ABI. 
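The same guarantees can be checked outside the test suite. A minimal sketch, assuming a libc++ built with this patch; the opt-out macro is the one introduced in the release note above, and must be defined before any include (e.g. on the command line):

// Build normally, or with -D_LIBCPP_DEPRECATED_ABI_NON_TRIVIAL_ALLOCATOR
// to restore the old, non-trivial default constructor.
#include <memory>
#include <type_traits>

#ifdef _LIBCPP_DEPRECATED_ABI_NON_TRIVIAL_ALLOCATOR
static_assert(!std::is_trivially_default_constructible<std::allocator<int> >::value, "");
#else
static_assert(std::is_trivially_default_constructible<std::allocator<int> >::value, "");
#endif

// Trivial copyability is unaffected by the macro.
static_assert(std::is_trivially_copyable<std::allocator<int> >::value, "");

int main() {}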
- -#include -#include - -typedef std::allocator A1; -struct A2 : std::allocator { }; - -static_assert(std::is_trivially_default_constructible::value, ""); -static_assert(std::is_trivially_copyable::value, ""); - -static_assert(std::is_trivially_default_constructible::value, ""); -static_assert(std::is_trivially_copyable::value, ""); diff --git a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp index 5dd1d6f0b3380..b08fce2b701e2 100644 --- a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp @@ -12,9 +12,6 @@ // template // constexpr optional make_optional(Args&&... args); -// GCC crashes on this file, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120577 -// XFAIL: gcc-15 - #include #include #include diff --git a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp index 5ddb229ad9268..80371d6333712 100644 --- a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp @@ -12,9 +12,6 @@ // template // constexpr optional make_optional(initializer_list il, Args&&... args); -// GCC crashes on this file, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120577 -// XFAIL: gcc-15 - #include #include #include diff --git a/libunwind/src/UnwindCursor.hpp b/libunwind/src/UnwindCursor.hpp index 33fcd841b2ab0..afa0cae790377 100644 --- a/libunwind/src/UnwindCursor.hpp +++ b/libunwind/src/UnwindCursor.hpp @@ -41,7 +41,8 @@ #define _LIBUNWIND_CHECK_LINUX_SIGRETURN 1 #endif -#if defined(_LIBUNWIND_TARGET_HAIKU) && defined(_LIBUNWIND_TARGET_X86_64) +#if defined(_LIBUNWIND_TARGET_HAIKU) && \ + (defined(_LIBUNWIND_TARGET_I386) || defined(_LIBUNWIND_TARGET_X86_64)) #include #include #define _LIBUNWIND_CHECK_HAIKU_SIGRETURN 1 @@ -1366,7 +1367,7 @@ class UnwindCursor : public AbstractUnwindCursor{ bool _unwindInfoMissing; bool _isSignalFrame; #if defined(_LIBUNWIND_CHECK_LINUX_SIGRETURN) || \ - defined(_LIBUNWIND_TARGET_HAIKU) + defined(_LIBUNWIND_CHECK_HAIKU_SIGRETURN) bool _isSigReturn = false; #endif #ifdef _LIBUNWIND_TRACE_RET_INJECT diff --git a/lldb/include/lldb/Host/Terminal.h b/lldb/include/lldb/Host/Terminal.h index 3d66515c18812..da0d05e8bd265 100644 --- a/lldb/include/lldb/Host/Terminal.h +++ b/lldb/include/lldb/Host/Terminal.h @@ -68,18 +68,6 @@ class Terminal { llvm::Error SetHardwareFlowControl(bool enabled); - /// Returns whether or not the current terminal supports Unicode rendering. - /// - /// The value is cached after the first computation. - /// - /// On POSIX systems, we check if the LANG environment variable contains the - /// substring "UTF-8", case insensitive. - /// - /// On Windows, we always return true since we use the `WriteConsoleW` API - /// internally. Note that the default Windows codepage (437) does not support - /// all Unicode characters. This function does not check the codepage. 
- static bool SupportsUnicode(); - protected: struct Data; diff --git a/lldb/include/lldb/Host/common/DiagnosticsRendering.h b/lldb/include/lldb/Host/common/DiagnosticsRendering.h index 3eea0647da37e..dd33d671c24a5 100644 --- a/lldb/include/lldb/Host/common/DiagnosticsRendering.h +++ b/lldb/include/lldb/Host/common/DiagnosticsRendering.h @@ -59,27 +59,10 @@ struct DiagnosticDetail { StructuredData::ObjectSP Serialize(llvm::ArrayRef details); -/// Renders an array of DiagnosticDetail instances. -/// -/// \param[in] stream -/// The stream to render the diagnostics to. -/// \param offset_in_command -/// An optional offset to the column position of the diagnostic in the -/// source. -/// \param show_inline -/// Whether to show the diagnostics inline. -/// \param details -/// The array of DiagnosticsDetail to render. -/// \param force_ascii -/// Whether to force ascii rendering. If false, Unicode characters will be -/// used if the output file supports them. -/// -/// \see lldb_private::Terminal::SupportsUnicode void RenderDiagnosticDetails(Stream &stream, std::optional offset_in_command, bool show_inline, - llvm::ArrayRef details, - bool force_ascii = false); + llvm::ArrayRef details); class DiagnosticError : public llvm::ErrorInfo { diff --git a/lldb/source/Host/common/DiagnosticsRendering.cpp b/lldb/source/Host/common/DiagnosticsRendering.cpp index 2c9d33a6c325c..f2cd3968967fb 100644 --- a/lldb/source/Host/common/DiagnosticsRendering.cpp +++ b/lldb/source/Host/common/DiagnosticsRendering.cpp @@ -7,8 +7,6 @@ //===----------------------------------------------------------------------===// #include "lldb/Host/common/DiagnosticsRendering.h" -#include "lldb/Host/Terminal.h" - #include using namespace lldb_private; @@ -87,8 +85,7 @@ static llvm::raw_ostream &PrintSeverity(Stream &stream, void RenderDiagnosticDetails(Stream &stream, std::optional offset_in_command, bool show_inline, - llvm::ArrayRef details, - bool force_ascii) { + llvm::ArrayRef details) { if (details.empty()) return; @@ -100,8 +97,12 @@ void RenderDiagnosticDetails(Stream &stream, return; } + // Since there is no other way to find this out, use the color + // attribute as a proxy for whether the terminal supports Unicode + // characters. In the future it might make sense to move this into + // Host so it can be customized for a specific platform. 
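A rough standalone sketch of the heuristic this new comment describes, using the same raw_ostream API the patched code calls (assumes an LLVM build environment):

#include "llvm/Support/raw_ostream.h"

// If the stream reports color support, assume it can also render the
// Unicode box-drawing glyphs; otherwise fall back to plain ASCII.
static void drawMarker(llvm::raw_ostream &OS) {
  bool Unicode = OS.colors_enabled();
  const char *Cursor = Unicode ? "˄" : "^";
  const char *Underline = Unicode ? "˜" : "~";
  OS << Cursor << Underline << '\n';
}

int main() { drawMarker(llvm::outs()); }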
llvm::StringRef cursor, underline, vbar, joint, hbar, spacer; - if (Terminal::SupportsUnicode() && !force_ascii) { + if (stream.AsRawOstream().colors_enabled()) { cursor = "˄"; underline = "˜"; vbar = "│"; diff --git a/lldb/source/Host/common/Terminal.cpp b/lldb/source/Host/common/Terminal.cpp index d3647835e3937..436dfd8130d9b 100644 --- a/lldb/source/Host/common/Terminal.cpp +++ b/lldb/source/Host/common/Terminal.cpp @@ -400,22 +400,6 @@ llvm::Error Terminal::SetHardwareFlowControl(bool enabled) { #endif // LLDB_ENABLE_TERMIOS } -bool Terminal::SupportsUnicode() { - static std::optional g_result; - if (g_result) - return g_result.value(); -#ifdef _WIN32 - return true; -#else - const char *lang_var = std::getenv("LANG"); - if (!lang_var) - return false; - g_result = - llvm::StringRef(lang_var).lower().find("utf-8") != std::string::npos; -#endif - return g_result.value(); -} - TerminalState::TerminalState(Terminal term, bool save_process_group) : m_tty(term) { Save(term, save_process_group); diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index afc1753e21c46..0198ddcfa31e0 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -316,6 +316,11 @@ void CommandInterpreter::Initialize() { AddAlias("continue", cmd_obj_sp); } + // At this point, I'm leaving the "b" command aliased to "_regexp-break". There's + // a catch-all regexp in the command that takes any unrecognized input and + // runs it as `break set `, and switching the command to `break add` + // would change that behavior. People who want to use `break add` for the + // "b" alias can do so in their .lldbinit. cmd_obj_sp = GetCommandSPExact("_regexp-break"); if (cmd_obj_sp) AddAlias("b", cmd_obj_sp)->SetSyntax(cmd_obj_sp->GetSyntax()); @@ -668,6 +673,89 @@ void CommandInterpreter::LoadCommandDictionary() { } } + // clang-format off + // FIXME: It would be simpler to just use the linespecs directly here, but + // the `b` alias allows "foo.c : 12 : 45" while the linespec parser + // is more rigorous and doesn't strip spaces, so the two are not equivalent. 
+  const char *break_add_regexes[][2] = {
+      {"^(.*[^[:space:]])[[:space:]]*:[[:space:]]*([[:digit:]]+)[[:space:]]*:[[:space:]]*([[:digit:]]+)[[:space:]]*$",
+       "breakpoint add file --file '%1' --line %2 --column %3"},
+      {"^(.*[^[:space:]])[[:space:]]*:[[:space:]]*([[:digit:]]+)[[:space:]]*$",
+       "breakpoint add file --file '%1' --line %2"},
+      {"^/([^/]+)/$", "breakpoint add pattern -- %1"},
+      {"^([[:digit:]]+)[[:space:]]*$",
+       "breakpoint add file --line %1"},
+      {"^\\*?(0x[[:xdigit:]]+)[[:space:]]*$",
+       "breakpoint add address %1"},
+      {"^[\"']?([-+]?\\[.*\\])[\"']?[[:space:]]*$",
+       "breakpoint add name '%1'"},
+      {"^(-.*)$",
+       "breakpoint add name '%1'"},
+      {"^(.*[^[:space:]])`(.*[^[:space:]])[[:space:]]*$",
+       "breakpoint add name '%2' --shlib '%1'"},
+      {"^\\&(.*[^[:space:]])[[:space:]]*$",
+       "breakpoint add name '%1' --skip-prologue=0"},
+      {"^[\"']?(.*[^[:space:]\"'])[\"']?[[:space:]]*$",
+       "breakpoint add name '%1'"}};
+  // clang-format on
+
+  size_t num_add_regexes = std::size(break_add_regexes);
+
+  std::unique_ptr<CommandObjectRegexCommand> break_add_regex_cmd_up(
+      new CommandObjectRegexCommand(
+          *this, "_regexp-break-add",
+          "Set a breakpoint using one of several shorthand formats, or list "
+          "the existing breakpoints if no arguments are provided.",
+          "\n"
+          "_regexp-break-add <filename>:<linenum>:<colnum>\n"
+          "              main.c:12:21          // Break at line 12 and column "
+          "21 of main.c\n\n"
+          "_regexp-break-add <filename>:<linenum>\n"
+          "              main.c:12             // Break at line 12 of "
+          "main.c\n\n"
+          "_regexp-break-add <linenum>\n"
+          "              12                    // Break at line 12 of current "
+          "file\n\n"
+          "_regexp-break-add 0x<address>\n"
+          "              0x1234000             // Break at address "
+          "0x1234000\n\n"
+          "_regexp-break-add <name>\n"
+          "              main                  // Break in 'main' after the "
+          "prologue\n\n"
+          "_regexp-break-add &<name>\n"
+          "              &main                 // Break at first instruction "
+          "in 'main'\n\n"
+          "_regexp-break-add <module>`<name>\n"
+          "              libc.so`malloc        // Break in 'malloc' from "
+          "'libc.so'\n\n"
+          "_regexp-break-add /<source-regex>/\n"
+          "              /break here/          // Break on source lines in "
+          "current file\n"
+          "                                    // containing text 'break "
+          "here'.\n"
+          "_regexp-break-add\n"
+          "                                    // List the existing "
+          "breakpoints\n",
+          lldb::eSymbolCompletion | lldb::eSourceFileCompletion, false));
+
+  if (break_add_regex_cmd_up) {
+    bool success = true;
+    for (size_t i = 0; i < num_add_regexes; i++) {
+      success = break_add_regex_cmd_up->AddRegexCommand(
+          break_add_regexes[i][0], break_add_regexes[i][1]);
+      if (!success)
+        break;
+    }
+    success =
+        break_add_regex_cmd_up->AddRegexCommand("^$", "breakpoint list --full");
+
+    if (success) {
+      CommandObjectSP break_add_regex_cmd_sp(break_add_regex_cmd_up.release());
+      m_command_dict[std::string(break_add_regex_cmd_sp->GetCommandName())] =
+          break_add_regex_cmd_sp;
+    }
+  }
+
   std::unique_ptr<CommandObjectRegexCommand> tbreak_regex_cmd_up(
       new CommandObjectRegexCommand(
           *this, "_regexp-tbreak",
diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestRegexpBreakCommand.py b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestRegexpBreakCommand.py
index 235a41d1adef3..930d497032171 100644
--- a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestRegexpBreakCommand.py
+++ b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestRegexpBreakCommand.py
@@ -10,10 +10,15 @@
 class RegexpBreakCommandTestCase(TestBase):
-    def test(self):
+    def test_set_version(self):
         """Test _regexp-break command."""
         self.build()
-        self.regexp_break_command()
+        self.regexp_break_command("_regexp-break")
+
+    def test_add_version(self):
+        """Test _regexp-break-add command."""
+        self.build()
+        self.regexp_break_command("_regexp-break-add")
 
     def setUp(self):
         # Call super's setUp().
@@ -22,12 +27,12 @@ def setUp(self):
         self.source = "main.c"
         self.line = line_number(self.source, "// Set break point at this line.")
 
-    def regexp_break_command(self):
+    def regexp_break_command(self, cmd_name):
         """Test the super concise "b" command, which is an alias for _regexp-break."""
         exe = self.getBuildArtifact("a.out")
         self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET)
 
-        break_results = lldbutil.run_break_set_command(self, "b %d" % self.line)
+        break_results = lldbutil.run_break_set_command(self, f"{cmd_name} {self.line}")
         lldbutil.check_breakpoint_result(
             self,
             break_results,
@@ -37,7 +42,7 @@ def regexp_break_command(self):
         )
 
         break_results = lldbutil.run_break_set_command(
-            self, "b %s:%d" % (self.source, self.line)
+            self, f"{cmd_name} {self.source}:{self.line}"
         )
         lldbutil.check_breakpoint_result(
             self,
@@ -50,7 +55,7 @@ def regexp_break_command(self):
         # Check breakpoint with full file path.
         full_path = os.path.join(self.getSourceDir(), self.source)
         break_results = lldbutil.run_break_set_command(
-            self, "b %s:%d" % (full_path, self.line)
+            self, f"{cmd_name} {full_path}:{self.line}"
         )
         lldbutil.check_breakpoint_result(
             self,
@@ -60,6 +65,17 @@ def regexp_break_command(self):
             num_locations=1,
         )
 
+        # Check breakpoint with symbol name. I'm also passing in
+        # the module so I can check the number of locations.
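        # (Illustrative, based on the regex table above: with exe_filename
        # equal to "a.out", the command below becomes
        # "_regexp-break-add a.out`main", which the <module>`<name> pattern
        # rewrites to "breakpoint add name 'main' --shlib 'a.out'".)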
+ exe_spec = lldb.SBFileSpec(exe) + exe_filename = exe_spec.basename + cmd = f"{cmd_name} {exe_filename}`main" + print(f"About to run: '{cmd}'") + break_results = lldbutil.run_break_set_command(self, cmd) + lldbutil.check_breakpoint_result( + self, break_results, symbol_name="main", num_locations=1 + ) + self.runCmd("run", RUN_SUCCEEDED) # The stop reason of the thread should be breakpoint. diff --git a/lldb/test/API/terminal/TestEditlineCompletions.py b/lldb/test/API/terminal/TestEditlineCompletions.py index b4ea0f39ec10c..ac1d3f90e2970 100644 --- a/lldb/test/API/terminal/TestEditlineCompletions.py +++ b/lldb/test/API/terminal/TestEditlineCompletions.py @@ -72,11 +72,11 @@ def test_completion_pagination(self): self.child.expect("Available completions:") self.child.expect(" _regexp-attach") self.child.expect(" _regexp-break") + self.child.expect(" _regexp-break-add") self.child.expect(" _regexp-bt") self.child.expect(" _regexp-display") self.child.expect(" _regexp-down") self.child.expect(" _regexp-env") - self.child.expect(" _regexp-jump") self.child.expect("More") @skipIfAsan diff --git a/lldb/test/Shell/BuildScript/toolchain-msvc.test b/lldb/test/Shell/BuildScript/toolchain-msvc.test index dce87d5aee2af..bde895fa6dbaf 100644 --- a/lldb/test/Shell/BuildScript/toolchain-msvc.test +++ b/lldb/test/Shell/BuildScript/toolchain-msvc.test @@ -23,15 +23,15 @@ RUN: | FileCheck --check-prefix=64BIT %s 32BIT: Cleaning {{.*}}toolchain-msvc.test.tmp\foo.pdb 32BIT: Cleaning {{.*}}toolchain-msvc.test.tmp\foo.exe 32BIT: compiling foobar.c -> foo.exe-foobar.obj -32BIT: Command Line: {{.*}}\{{[Hh]ost[Xx](64|86)}}\{{(x86|arm)}}\cl.{{EXE|exe}} +32BIT: Command Line: {{.*}}\{{[Hh]ost([Xx](64|86)|(arm64|ARM64))}}\{{(x86|arm)}}\cl.{{EXE|exe}} 32BIT: linking foo.exe-foobar.obj -> foo.exe -32BIT: Command Line: {{.*}}\{{[Hh]ost[Xx](64|86)}}\{{(x86|arm)}}\link.{{EXE|exe}} +32BIT: Command Line: {{.*}}\{{[Hh]ost([Xx](64|86)|(arm64|ARM64))}}\{{(x86|arm)}}\link.{{EXE|exe}} 32BIT: Env 32BIT: LIB = {{.*}}\ATLMFC\lib\{{(x86|arm)}} 32BIT: {{.*}}\lib\{{(x86|arm)}} 32BIT: {{.*}}\ucrt\{{(x86|arm)}} 32BIT: {{.*}}\um\{{(x86|arm)}} -32BIT: PATH = {{.*}}\bin\{{[Hh]ost[Xx](64|86)}}\{{(x86|x64)}} +32BIT: PATH = {{.*}}\bin\{{[Hh]ost([Xx](64|86)|(arm64|ARM64))}}\{{(x86|x64|arm64)}} 64BIT: Script Arguments: @@ -51,12 +51,12 @@ RUN: | FileCheck --check-prefix=64BIT %s 64BIT: Cleaning {{.*}}toolchain-msvc.test.tmp\foo.pdb 64BIT: Cleaning {{.*}}toolchain-msvc.test.tmp\foo.exe 64BIT: compiling foobar.c -> foo.exe-foobar.obj -64BIT: Command Line: {{.*}}\{{[Hh]ost[Xx](64|86)}}\{{(x64|arm64)}}\cl.{{EXE|exe}} +64BIT: Command Line: {{.*}}\{{[Hh]ost([Xx](64|86)|(arm64|ARM64))}}\{{(x64|arm64)}}\cl.{{EXE|exe}} 64BIT: linking foo.exe-foobar.obj -> foo.exe -64BIT: Command Line: {{.*}}\{{[Hh]ost[Xx](64|86)}}\{{(x64|arm64)}}\link.{{EXE|exe}} +64BIT: Command Line: {{.*}}\{{[Hh]ost([Xx](64|86)|(arm64|ARM64))}}\{{(x64|arm64)}}\link.{{EXE|exe}} 64BIT: Env 64BIT: LIB = {{.*}}\ATLMFC\lib\{{(x64|arm64)}} 64BIT: {{.*}}\lib\{{(x64|arm64)}} 64BIT: {{.*}}\ucrt\{{(x64|arm64)}} 64BIT: {{.*}}\um\{{(x64|arm64)}} -64BIT: PATH = {{.*}}\bin\{{[Hh]ost[Xx](64|86)}}\{{(x86|x64)}} +64BIT: PATH = {{.*}}\bin\{{[Hh]ost([Xx](64|86)|(arm64|ARM64))}}\{{(x86|x64|arm64)}} diff --git a/lldb/test/Shell/Commands/command-dwim-print.test b/lldb/test/Shell/Commands/command-dwim-print.test index 88e7314976ad8..9153edbd21791 100644 --- a/lldb/test/Shell/Commands/command-dwim-print.test +++ b/lldb/test/Shell/Commands/command-dwim-print.test @@ -1,16 +1,16 @@ # RUN: echo quit | %lldb -o 
"dwim-print a" \ # RUN: | FileCheck %s --strict-whitespace --check-prefix=CHECK1 # (lldb) dwim-print a -# CHECK1:{{^ (\^|˄)}} +# CHECK1:{{^ \^}} # CHECK1: {{^ error: use of undeclared identifier 'a'}} # RUN: echo quit | %lldb -o "p a" \ # RUN: | FileCheck %s --strict-whitespace --check-prefix=CHECK2 # (lldb) p a -# CHECK2:{{^ (\^|˄)}} +# CHECK2:{{^ \^}} # RUN: echo quit | %lldb -o "dwim-print -- a" \ # RUN: | FileCheck %s --strict-whitespace --check-prefix=CHECK3 # (lldb) dwim-print -- a -# CHECK3:{{^ (\^|˄)}} +# CHECK3:{{^ \^}} # RUN: echo quit | %lldb -o "settings set show-inline-diagnostics false" \ # RUN: -o "dwim-print a" 2>&1 | FileCheck %s --check-prefix=CHECK4 # CHECK4: error: :1:1: use of undeclared identifier diff --git a/lldb/test/Shell/Commands/command-expr-diagnostics.test b/lldb/test/Shell/Commands/command-expr-diagnostics.test index cde0e6c6768f7..3c827fb4516ec 100644 --- a/lldb/test/Shell/Commands/command-expr-diagnostics.test +++ b/lldb/test/Shell/Commands/command-expr-diagnostics.test @@ -2,19 +2,19 @@ # RUN: echo quit | %lldb -o "expression a+b" \ # RUN: | FileCheck %s --strict-whitespace --check-prefix=CHECK1 # (lldb) expression a+b -# CHECK1:{{^ (\^|˄) (\^|˄)}} -# CHECK1: {{^ (\||│) error: use of undeclared identifier 'b'}} +# CHECK1:{{^ \^ \^}} +# CHECK1: {{^ | error: use of undeclared identifier 'b'}} # CHECK1: {{^ error: use of undeclared identifier 'a'}} # RUN: echo quit | %lldb -o "expr a" \ # RUN: | FileCheck %s --strict-whitespace --check-prefix=CHECK2 # (lldb) expr a -# CHECK2:{{^ (\^|˄)}} +# CHECK2:{{^ \^}} # RUN: echo quit | %lldb -o "expr -i 0 -o 0 -- a" \ # RUN: | FileCheck %s --strict-whitespace --check-prefix=CHECK3 # (lldb) expr -i 0 -o 0 -- a -# CHECK3:{{^ (\^|˄)}} +# CHECK3:{{^ \^}} # CHECK3: {{^ error: use of undeclared identifier 'a'}} # RUN: echo "int main(){return 0;}">%t.c @@ -23,7 +23,7 @@ # RUN: "expr --top-level -- template T FOO(T x) { return x/2;}" -o \ # RUN: "expression -- FOO(\"\")" 2>&1 | FileCheck %s --check-prefix=CHECK4 # (lldb) expression -- FOO("") -# CHECK4:{{^ (\^|˄)}} +# CHECK4:{{^ \^}} # CHECK4: {{^ note: in instantiation of function template}} # CHECK4: error: details) { StreamString stream; - RenderDiagnosticDetails(stream, 0, true, details, /*force_ascii=*/true); + RenderDiagnosticDetails(stream, 0, true, details); return stream.GetData(); } } // namespace diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/systemz.h b/llvm/include/llvm/ExecutionEngine/JITLink/systemz.h index 09ec56db6826f..bfd22ec753074 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/systemz.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/systemz.h @@ -507,6 +507,21 @@ enum EdgeKind_systemz : Edge::Kind { /// RequestGOTAndTransformToDelta32dbl, + /// A TLSInfo entry getter/constructor, transformed to Delta64FromGOT. + /// + /// Indicates that this edge should be transformed into a Delta64FromGOT + /// targeting the TLSInfo entry for the edge's current target. A TLSInfo + /// entry for the target should be created if one does not already exist. + /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// phase will result in an assert/unreachable during the fixup phase. + /// + RequestTLSDescInGOTAndTransformToDelta64FromGOT, + /// A 32-bit Delta to GOT base. 
/// /// Fixup expression: diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 11b76cd183108..9480e7b36dc2c 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -769,7 +769,7 @@ class Triple { bool isOSGlibc() const { return (getOS() == Triple::Linux || getOS() == Triple::KFreeBSD || getOS() == Triple::Hurd) && - !isAndroid(); + !isAndroid() && !isMusl(); } /// Tests whether the OS is AIX. diff --git a/llvm/lib/Analysis/CmpInstAnalysis.cpp b/llvm/lib/Analysis/CmpInstAnalysis.cpp index a1a79e5685f80..a6d0d3ff4fcd4 100644 --- a/llvm/lib/Analysis/CmpInstAnalysis.cpp +++ b/llvm/lib/Analysis/CmpInstAnalysis.cpp @@ -75,7 +75,7 @@ Constant *llvm::getPredForFCmpCode(unsigned Code, Type *OpTy, std::optional llvm::decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate Pred, - bool LookThruTrunc, bool AllowNonZeroC, + bool LookThroughTrunc, bool AllowNonZeroC, bool DecomposeAnd) { using namespace PatternMatch; @@ -173,7 +173,7 @@ llvm::decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate Pred, Result.Pred = ICmpInst::getInversePredicate(Result.Pred); Value *X; - if (LookThruTrunc && match(LHS, m_Trunc(m_Value(X)))) { + if (LookThroughTrunc && match(LHS, m_Trunc(m_Value(X)))) { Result.X = X; Result.Mask = Result.Mask.zext(X->getType()->getScalarSizeInBits()); Result.C = Result.C.zext(X->getType()->getScalarSizeInBits()); @@ -185,7 +185,7 @@ llvm::decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate Pred, } std::optional llvm::decomposeBitTest(Value *Cond, - bool LookThruTrunc, + bool LookThroughTrunc, bool AllowNonZeroC, bool DecomposeAnd) { using namespace PatternMatch; @@ -194,7 +194,7 @@ std::optional llvm::decomposeBitTest(Value *Cond, if (!ICmp->getOperand(0)->getType()->isIntOrIntVectorTy()) return std::nullopt; return decomposeBitTestICmp(ICmp->getOperand(0), ICmp->getOperand(1), - ICmp->getPredicate(), LookThruTrunc, + ICmp->getPredicate(), LookThroughTrunc, AllowNonZeroC, DecomposeAnd); } Value *X; diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index b39b32042dd2f..a9b51065a1d99 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1223,11 +1223,12 @@ Constant *llvm::ConstantFoldCompareInstOperands( } } - // Only do this transformation if the int is intptrty in size, otherwise - // there is a truncation or extension that we aren't modeling. - if (CE0->getOpcode() == Instruction::PtrToInt) { - Type *IntPtrTy = DL.getIntPtrType(CE0->getOperand(0)->getType()); - if (CE0->getType() == IntPtrTy) { + // icmp only compares the address part of the pointer, so only do this + // transform if the integer size matches the address size. + if (CE0->getOpcode() == Instruction::PtrToInt || + CE0->getOpcode() == Instruction::PtrToAddr) { + Type *AddrTy = DL.getAddressType(CE0->getOperand(0)->getType()); + if (CE0->getType() == AddrTy) { Constant *C = CE0->getOperand(0); Constant *Null = Constant::getNullValue(C->getType()); return ConstantFoldCompareInstOperands(Predicate, C, Null, DL, TLI); @@ -1250,11 +1251,12 @@ Constant *llvm::ConstantFoldCompareInstOperands( return ConstantFoldCompareInstOperands(Predicate, C0, C1, DL, TLI); } - // Only do this transformation if the int is intptrty in size, otherwise - // there is a truncation or extension that we aren't modeling. 
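A hedged illustration of the fold guarded here (IR notation, not taken from the patch): with 64-bit pointers and 64-bit addresses,

    ; icmp eq (ptrtoint (ptr @g to i64)), 0   -->   icmp eq ptr @g, null
    ; An i32 result type would imply a truncation of the address that the
    ; fold does not model, so the transform is skipped in that case.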
- if (CE0->getOpcode() == Instruction::PtrToInt) { - Type *IntPtrTy = DL.getIntPtrType(CE0->getOperand(0)->getType()); - if (CE0->getType() == IntPtrTy && + // icmp only compares the address part of the pointer, so only do this + // transform if the integer size matches the address size. + if (CE0->getOpcode() == Instruction::PtrToInt || + CE0->getOpcode() == Instruction::PtrToAddr) { + Type *AddrTy = DL.getAddressType(CE0->getOperand(0)->getType()); + if (CE0->getType() == AddrTy && CE0->getOperand(0)->getType() == CE1->getOperand(0)->getType()) { return ConstantFoldCompareInstOperands( Predicate, CE0->getOperand(0), CE1->getOperand(0), DL, TLI); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 9cb6f19b9340c..92577cd7517e6 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7286,15 +7286,15 @@ OverflowResult llvm::computeOverflowForUnsignedMul(const Value *LHS, const Value *RHS, const SimplifyQuery &SQ, bool IsNSW) { - KnownBits LHSKnown = computeKnownBits(LHS, SQ); - KnownBits RHSKnown = computeKnownBits(RHS, SQ); + ConstantRange LHSRange = + computeConstantRangeIncludingKnownBits(LHS, /*ForSigned=*/false, SQ); + ConstantRange RHSRange = + computeConstantRangeIncludingKnownBits(RHS, /*ForSigned=*/false, SQ); // mul nsw of two non-negative numbers is also nuw. - if (IsNSW && LHSKnown.isNonNegative() && RHSKnown.isNonNegative()) + if (IsNSW && LHSRange.isAllNonNegative() && RHSRange.isAllNonNegative()) return OverflowResult::NeverOverflows; - ConstantRange LHSRange = ConstantRange::fromKnownBits(LHSKnown, false); - ConstantRange RHSRange = ConstantRange::fromKnownBits(RHSKnown, false); return mapOverflowResult(LHSRange.unsignedMulMayOverflow(RHSRange)); } diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_systemz.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_systemz.cpp index 29eeecceea766..50acd6ea2e542 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_systemz.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_systemz.cpp @@ -27,12 +27,67 @@ using namespace llvm::jitlink; namespace { constexpr StringRef ELFGOTSymbolName = "_GLOBAL_OFFSET_TABLE_"; +constexpr StringRef ELFTLSInfoSectionName = "$__TLSINFO"; + +// TLS Info Builder. +class TLSInfoTableManager_ELF_systemz + : public TableManager { +public: + static StringRef getSectionName() { return ELFTLSInfoSectionName; } + + static const uint8_t TLSInfoEntryContent[16]; + + bool visitEdge(LinkGraph &G, Block *B, Edge &E) { + if (E.getKind() == + systemz::RequestTLSDescInGOTAndTransformToDelta64FromGOT) { + LLVM_DEBUG({ + dbgs() << " Fixing " << G.getEdgeKindName(E.getKind()) << " edge at " + << formatv("{0:x}", B->getFixupAddress(E)) << " (" + << formatv("{0:x}", B->getAddress()) << " + " + << formatv("{0:x}", E.getOffset()) << ")\n"; + }); + E.setKind(systemz::Delta64FromGOT); + E.setTarget(getEntryForTarget(G, E.getTarget())); + return true; + } + return false; + } + + Symbol &createEntry(LinkGraph &G, Symbol &Target) { + // the TLS Info entry's key value will be written by the fixTLVSectionByName + // pass, so create mutable content. 
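    // Entry layout (16 zero-initialized bytes, per TLSInfoEntryContent
    // below): bytes 0..7 hold the TLS key, filled in later by the
    // fixTLVSectionByName pass; bytes 8..15 receive a Pointer64 edge to the
    // target symbol, which is what the offset-8 addEdge below sets up.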
+ auto &TLSInfoEntry = G.createMutableContentBlock( + getTLSInfoSection(G), G.allocateContent(getTLSInfoEntryContent()), + orc::ExecutorAddr(), 8, 0); + TLSInfoEntry.addEdge(systemz::Pointer64, 8, Target, 0); + return G.addAnonymousSymbol(TLSInfoEntry, 0, 16, false, false); + } + +private: + Section &getTLSInfoSection(LinkGraph &G) { + if (!TLSInfoTable) + TLSInfoTable = &G.createSection(getSectionName(), orc::MemProt::Read); + return *TLSInfoTable; + } + + ArrayRef getTLSInfoEntryContent() const { + return {reinterpret_cast(TLSInfoEntryContent), + sizeof(TLSInfoEntryContent)}; + } + + Section *TLSInfoTable = nullptr; +}; + +const uint8_t TLSInfoTableManager_ELF_systemz::TLSInfoEntryContent[16] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; Error buildTables_ELF_systemz(LinkGraph &G) { LLVM_DEBUG(dbgs() << "Visiting edges in graph:\n"); systemz::GOTTableManager GOT; systemz::PLTTableManager PLT(GOT); - visitExistingEdges(G, GOT, PLT); + TLSInfoTableManager_ELF_systemz TLSInfo; + visitExistingEdges(G, GOT, PLT, TLSInfo); return Error::success(); } @@ -329,6 +384,15 @@ class ELFLinkGraphBuilder_systemz Kind = systemz::Delta32dblGOTBase; break; } + // Tag for function call in general dynamic TLS code. + case ELF::R_390_TLS_GDCALL: { + break; + } + // Direct 64 bit for general dynamic thread local data. + case ELF::R_390_TLS_GD64: { + Kind = systemz::RequestTLSDescInGOTAndTransformToDelta64FromGOT; + break; + } default: return make_error( "In " + G->getName() + ": Unsupported systemz relocation type " + diff --git a/llvm/lib/ExecutionEngine/JITLink/systemz.cpp b/llvm/lib/ExecutionEngine/JITLink/systemz.cpp index f6cc29fa6e6a1..dbb924c3f9291 100644 --- a/llvm/lib/ExecutionEngine/JITLink/systemz.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/systemz.cpp @@ -104,6 +104,8 @@ const char *getEdgeKindName(Edge::Kind R) { return "RequestGOTAndTransformToDelta12FromGOT"; case RequestGOTAndTransformToDelta32dbl: return "RequestGOTAndTransformToDelta32dbl"; + case RequestTLSDescInGOTAndTransformToDelta64FromGOT: + return "RequestTLSDescInGOTAndTransformToDelta64FromGOT"; default: return getGenericEdgeKindName(static_cast(R)); } diff --git a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp index 7dc1ae520f132..0a761290373aa 100644 --- a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp @@ -988,6 +988,7 @@ Error ELFNixPlatform::ELFNixPlatformPlugin::fixTLVSectionsAndEdges( jitlink::LinkGraph &G, JITDylib &JD) { auto TLSGetAddrSymbolName = G.intern("__tls_get_addr"); auto TLSDescResolveSymbolName = G.intern("__tlsdesc_resolver"); + auto TLSGetOffsetSymbolName = G.intern("__tls_get_offset"); for (auto *Sym : G.external_symbols()) { if (Sym->getName() == TLSGetAddrSymbolName) { auto TLSGetAddr = @@ -997,6 +998,10 @@ Error ELFNixPlatform::ELFNixPlatformPlugin::fixTLVSectionsAndEdges( auto TLSGetAddr = MP.getExecutionSession().intern("___orc_rt_elfnix_tlsdesc_resolver"); Sym->setName(std::move(TLSGetAddr)); + } else if (Sym->getName() == TLSGetOffsetSymbolName) { + auto TLSGetAddr = + MP.getExecutionSession().intern("___orc_rt_elfnix_tls_get_offset"); + Sym->setName(std::move(TLSGetAddr)); } } diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index d47aafb31ebdf..c54c428926bf8 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -6469,7 +6469,13 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef 
TT) { if (Pos != size_t(-1)) Res.insert(Pos + I64.size(), I128); } - return Res; + } + + if (T.isPPC() && T.isOSAIX() && !DL.contains("f64:32:64") && !DL.empty()) { + size_t Pos = Res.find("-S128"); + if (Pos == StringRef::npos) + Pos = Res.size(); + Res.insert(Pos, "-f64:32:64"); } if (!T.isX86()) diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index 33ca46ca1c2c6..b95c1466871bc 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -1271,6 +1271,7 @@ bool Instruction::isAssociative() const { switch (Opcode) { case FMul: + return cast(this)->hasAllowReassoc(); case FAdd: return cast(this)->hasAllowReassoc() && cast(this)->hasNoSignedZeros(); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 054c85d115970..7e2dfbbb4772b 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2692,6 +2692,9 @@ void Verifier::verifyFunctionMetadata( } void Verifier::visitConstantExprsRecursively(const Constant *EntryC) { + if (EntryC->getNumOperands() == 0) + return; + if (!ConstantExprVisited.insert(EntryC).second) return; @@ -5626,14 +5629,8 @@ void Verifier::visitInstruction(Instruction &I) { } else if (isa(I.getOperand(i))) { Check(CBI && &CBI->getCalledOperandUse() == &I.getOperandUse(i), "Cannot take the address of an inline asm!", &I); - } else if (auto *CPA = dyn_cast(I.getOperand(i))) { - visitConstantExprsRecursively(CPA); - } else if (ConstantExpr *CE = dyn_cast(I.getOperand(i))) { - if (CE->getType()->isPtrOrPtrVectorTy()) { - // If we have a ConstantExpr pointer, we need to see if it came from an - // illegal bitcast. - visitConstantExprsRecursively(CE); - } + } else if (auto *C = dyn_cast(I.getOperand(i))) { + visitConstantExprsRecursively(C); } } diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 28a52ab8b1ae6..87256352faccd 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -2423,7 +2423,7 @@ static bool targetSupportsPAuthRelocation(const Triple &TT, const MCExpr *Target, const MCExpr *DSExpr) { // No released version of glibc supports PAuth relocations. - if (TT.isOSGlibc()) + if (TT.isOSGlibc() || TT.isMusl()) return false; // We emit PAuth constants as IRELATIVE relocations in cases where the diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 30eb19036ddda..41caa817c11a4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -24759,7 +24759,12 @@ static SDValue performPostLD1Combine(SDNode *N, static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { - APInt DemandedMask = APInt::getLowBitsSet(64, 56); + const auto &Subtarget = DAG.getSubtarget(); + // If MTE is enabled, TBI only applies to the top 4 bits. + // Both arm64 and arm64e processes on Darwin may run with MTE enabled. + unsigned NumIgnoreBits = + Subtarget.hasMTE() || Subtarget.isTargetDarwin() ? 
4 : 8; + APInt DemandedMask = APInt::getLowBitsSet(64, 64 - NumIgnoreBits); KnownBits Known; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); @@ -31792,12 +31797,12 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( unsigned OperandOrder; if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult, OperandOrder) && - WhichResult == 0) - return convertFromScalableVector( - DAG, VT, - DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, - OperandOrder == 0 ? Op1 : Op2, - OperandOrder == 0 ? Op2 : Op1)); + WhichResult == 0) { + SDValue ZIP = DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, + OperandOrder == 0 ? Op1 : Op2, + OperandOrder == 0 ? Op2 : Op1); + return convertFromScalableVector(DAG, VT, ZIP); + } if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult, OperandOrder)) { @@ -31847,12 +31852,12 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult, OperandOrder) && - WhichResult != 0) - return convertFromScalableVector( - DAG, VT, - DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, - OperandOrder == 0 ? Op1 : Op2, - OperandOrder == 0 ? Op2 : Op1)); + WhichResult != 0) { + SDValue ZIP = DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, + OperandOrder == 0 ? Op1 : Op2, + OperandOrder == 0 ? Op2 : Op1); + return convertFromScalableVector(DAG, VT, ZIP); + } if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 4d2e740779961..892b8ee1ed3cb 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -4386,7 +4386,7 @@ multiclass BaseLoadUnscaleV84 sz, bits<2> opc, DAGOperand regtype > { def i : BaseLoadStoreUnscale, - Sched<[WriteST]> { + Sched<[WriteLD]> { let Inst{29} = 0; let Inst{24} = 1; } diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h index c7d6b31291197..12a53aad08aa8 100644 --- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h +++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -6631,43 +6631,44 @@ inline bool isZIPMask(ArrayRef M, unsigned NumElts, if (NumElts % 2 != 0) return false; - // "Variant" refers to the distinction bwetween zip1 and zip2, while - // "Order" refers to sequence of input registers (matching vs flipped). - bool Variant0Order0 = true; // WhichResultOut = 0, OperandOrderOut = 0 - bool Variant1Order0 = true; // WhichResultOut = 1, OperandOrderOut = 0 - bool Variant0Order1 = true; // WhichResultOut = 0, OperandOrderOut = 1 - bool Variant1Order1 = true; // WhichResultOut = 1, OperandOrderOut = 1 + // "Result" corresponds to "WhichResultOut", selecting between zip1 and zip2. + // "Order" corresponds to "OperandOrderOut", selecting the order of operands + // for the instruction (flipped or not). + bool Result0Order0 = true; // WhichResultOut = 0, OperandOrderOut = 0 + bool Result1Order0 = true; // WhichResultOut = 1, OperandOrderOut = 0 + bool Result0Order1 = true; // WhichResultOut = 0, OperandOrderOut = 1 + bool Result1Order1 = true; // WhichResultOut = 1, OperandOrderOut = 1 // Check all elements match. 
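  // Worked example for NumElts = 4 (illustrative): the four candidate masks
  // checked below are
  //   Result0Order0: <0, 4, 1, 5>   -> zip1 op1, op2
  //   Result1Order0: <2, 6, 3, 7>   -> zip2 op1, op2
  //   Result0Order1: <4, 0, 5, 1>   -> zip1 op2, op1
  //   Result1Order1: <6, 2, 7, 3>   -> zip2 op2, op1
  // Exactly one flag may survive the scan for the mask to count as a ZIP.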
for (unsigned i = 0; i != NumElts; i += 2) { if (M[i] >= 0) { unsigned EvenElt = (unsigned)M[i]; if (EvenElt != i / 2) - Variant0Order0 = false; + Result0Order0 = false; if (EvenElt != NumElts / 2 + i / 2) - Variant1Order0 = false; + Result1Order0 = false; if (EvenElt != NumElts + i / 2) - Variant0Order1 = false; + Result0Order1 = false; if (EvenElt != NumElts + NumElts / 2 + i / 2) - Variant1Order1 = false; + Result1Order1 = false; } if (M[i + 1] >= 0) { unsigned OddElt = (unsigned)M[i + 1]; if (OddElt != NumElts + i / 2) - Variant0Order0 = false; + Result0Order0 = false; if (OddElt != NumElts + NumElts / 2 + i / 2) - Variant1Order0 = false; + Result1Order0 = false; if (OddElt != i / 2) - Variant0Order1 = false; + Result0Order1 = false; if (OddElt != NumElts / 2 + i / 2) - Variant1Order1 = false; + Result1Order1 = false; } } - if (Variant0Order0 + Variant1Order0 + Variant0Order1 + Variant1Order1 != 1) + if (Result0Order0 + Result1Order0 + Result0Order1 + Result1Order1 != 1) return false; - WhichResultOut = (Variant0Order0 || Variant0Order1) ? 0 : 1; - OperandOrderOut = (Variant0Order0 || Variant1Order0) ? 0 : 1; + WhichResultOut = (Result0Order0 || Result0Order1) ? 0 : 1; + OperandOrderOut = (Result0Order0 || Result1Order0) ? 0 : 1; return true; } diff --git a/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp b/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp index d4058fac4304a..584b45b4111cd 100644 --- a/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp +++ b/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp @@ -157,6 +157,29 @@ static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm, return MCDisassembler::Success; } +// Decode AMSWAP.W and UD, which share the same base encoding. +// If rk == 1 and rd == rj, interpret the instruction as UD; +// otherwise decode as AMSWAP.W. 
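// Worked example (derived from the FmtUD field layout, not spelled out in
// the patch): 0x38600400 has Rd=0, Rj=0, Rk=1, so Rd == Rj and it decodes
// as `ud 0`; 0x38600401 has Rd=1, Rj=0, Rk=1, so Rd != Rj and it decodes
// as `amswap.w $r1, $r1, $r0`.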
+static DecodeStatus DecodeAMOrUDInstruction(MCInst &Inst, unsigned Insn,
+                                            uint64_t Address,
+                                            const MCDisassembler *Decoder) {
+  unsigned Rd = fieldFromInstruction(Insn, 0, 5);
+  unsigned Rj = fieldFromInstruction(Insn, 5, 5);
+  unsigned Rk = fieldFromInstruction(Insn, 10, 5);
+
+  if (Rk == 1 && Rd == Rj) {
+    Inst.setOpcode(LoongArch::UD);
+    Inst.addOperand(MCOperand::createImm(Rd));
+  } else {
+    Inst.setOpcode(LoongArch::AMSWAP_W);
+    Inst.addOperand(MCOperand::createReg(LoongArch::R0 + Rd));
+    Inst.addOperand(MCOperand::createReg(LoongArch::R0 + Rk));
+    Inst.addOperand(MCOperand::createReg(LoongArch::R0 + Rj));
+  }
+
+  return MCDisassembler::Success;
+}
+
 #include "LoongArchGenDisassemblerTables.inc"
 
 DecodeStatus LoongArchDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrFormats.td b/llvm/lib/Target/LoongArch/LoongArchInstrFormats.td
index 419e20431c59f..fa049fcbc2d21 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrFormats.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrFormats.td
@@ -401,3 +401,16 @@ class FmtLDPTE pattern = []>
   let Inst{9-5} = rj;
   let Inst{4-0} = 0b00000;
 }
+
+// FmtUD
+// <0b0011100001100000000001 | I5 | I5>
+class FmtUD<dag outs, dag ins, string opnstr, list<dag> pattern = []>
+    : LAInst<outs, ins, deriveInsnMnemonic<NAME>.ret, opnstr, pattern> {
+  bits<5> imm5;
+
+  let Inst{31-10} = 0b0011100001100000000001;
+  let Inst{9-5} = imm5;
+  let Inst{4-0} = imm5;
+
+  let DecoderMethod = "DecodeAMOrUDInstruction";
+}
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index 2e6653e1a09ac..d971f8bc1986b 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -800,6 +800,10 @@ class AM_3R op>
   let Constraints = "@earlyclobber $rd";
 }
 
+class AU_3R<bits<32> op> : AM_3R<op> {
+  let DecoderMethod = "DecodeAMOrUDInstruction";
+}
+
 class AMCAS_3R op> : Fmt3R {
@@ -923,6 +927,9 @@ def BREAK : MISC_I15<0x002a0000>;
 def RDTIMEL_W : RDTIME_2R<0x00006000>;
 def RDTIMEH_W : RDTIME_2R<0x00006400>;
 
+let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
+def UD : FmtUD<(outs), (ins uimm5:$imm5), "$imm5">;
+
 // The CPUCFG instruction offers a reliable way to probe CPU features.
 // Although support is not guaranteed on LA32R, having compiler support
 // nevertheless enables applications to rely on its presence, potentially
@@ -1087,7 +1094,7 @@ def STLE_D : STORE_3R<0x387f8000>;
 // Atomic Memory Access Instructions for 64-bits
 def AMSWAP_B : AM_3R<0x385c0000>;
 def AMSWAP_H : AM_3R<0x385c8000>;
-def AMSWAP_W : AM_3R<0x38600000>;
+def AMSWAP_W : AU_3R<0x38600000>;
 def AMSWAP_D : AM_3R<0x38608000>;
 def AMADD_B : AM_3R<0x385d0000>;
 def AMADD_H : AM_3R<0x385d8000>;
@@ -1410,12 +1417,8 @@ def : Pat<(and GPR:$rj, BstrinsImm:$imm),
 
 /// Traps
 
-// We lower `trap` to `amswap.w rd:$r0, rk:$r1, rj:$r0`, as this is guaranteed
-// to trap with an INE (non-existent on LA32, explicitly documented to INE on
-// LA64). And the resulting signal is different from `debugtrap` like on some
-// other existing ports so programs/porters might have an easier time.
-def PseudoUNIMP : Pseudo<(outs), (ins), [(trap)]>,
-                  PseudoInstExpansion<(AMSWAP_W R0, R1, R0)>;
+// We lower `trap` to `ud 0`, which is an alias for `amswap.w $r0, $r1, $r0`.
+def PseudoUNIMP : Pseudo<(outs), (ins), [(trap)]>, PseudoInstExpansion<(UD 0)>;
 
 // We lower `debugtrap` to `break 0`, as this is guaranteed to exist and work,
 // even for LA32 Primary.
Also, because so far the ISA does not provide a diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 366a7b6d0135a..99bef417eaa89 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -3180,7 +3180,8 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } case PPC::PPCLdFixedAddr: { - assert(Subtarget.getTargetTriple().isOSGlibc() && + assert((Subtarget.getTargetTriple().isOSGlibc() || + Subtarget.getTargetTriple().isMusl()) && "Only targets with Glibc expected to contain PPCLdFixedAddr"); int64_t Offset = 0; const unsigned Reg = Subtarget.isPPC64() ? PPC::X13 : PPC::R2; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 40b46f503ca53..74066c86d6ebe 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -433,6 +433,8 @@ enum OperandType : unsigned { OPERAND_RTZARG, // Condition code used by select and short forward branch pseudos. OPERAND_COND_CODE, + // Ordering for atomic pseudos. + OPERAND_ATOMIC_ORDERING, // Vector policy operand. OPERAND_VEC_POLICY, // Vector SEW operand. Stores in log2(SEW). diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 1a5bb837a4318..39228a11e1309 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -908,6 +908,7 @@ def HasVInstructionsF16Minimal : Predicate<"Subtarget->hasVInstructionsF16Minima def HasVInstructionsBF16Minimal : Predicate<"Subtarget->hasVInstructionsBF16Minimal()">; def HasVInstructionsF16 : Predicate<"Subtarget->hasVInstructionsF16()">; +def HasVInstructionsBF16 : Predicate<"Subtarget->hasVInstructionsBF16()">; def HasVInstructionsF64 : Predicate<"Subtarget->hasVInstructionsF64()">; def HasVInstructionsFullMultiply : Predicate<"Subtarget->hasVInstructionsFullMultiply()">; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 8bfdbef39708a..b6b716be35c3e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2608,8 +2608,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { MachineSDNode *TileLoad = CurDAG->getMachineNode(PseudoInst, DL, Node->getVTList(), Operands); - if (auto *MemOp = dyn_cast(Node)) - CurDAG->setNodeMemRefs(TileLoad, {MemOp->getMemOperand()}); + CurDAG->setNodeMemRefs(TileLoad, + {cast(Node)->getMemOperand()}); ReplaceNode(Node, TileLoad); return; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 29fc2ddb818b5..2c0a02ae396c7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -526,7 +526,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::AVGFLOORS, ISD::AVGFLOORU}, VTs, Legal); setOperationAction({ISD::ABDS, ISD::ABDU}, VTs, Legal); setOperationAction(ISD::SPLAT_VECTOR, VTs, Legal); - setOperationAction(ISD::SHL, VTs, Custom); + setOperationAction({ISD::SHL, ISD::SRL, ISD::SRA}, VTs, Custom); setOperationAction(ISD::BITCAST, VTs, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VTs, Custom); } @@ -2196,6 +2196,60 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 5, /*IsStore*/ true, /*IsUnitStrided*/ false); + case 
Intrinsic::riscv_sf_vlte8: + case Intrinsic::riscv_sf_vlte16: + case Intrinsic::riscv_sf_vlte32: + case Intrinsic::riscv_sf_vlte64: + Info.opc = ISD::INTRINSIC_VOID; + Info.ptrVal = I.getArgOperand(1); + switch (Intrinsic) { + case Intrinsic::riscv_sf_vlte8: + Info.memVT = MVT::i8; + Info.align = Align(1); + break; + case Intrinsic::riscv_sf_vlte16: + Info.memVT = MVT::i16; + Info.align = Align(2); + break; + case Intrinsic::riscv_sf_vlte32: + Info.memVT = MVT::i32; + Info.align = Align(4); + break; + case Intrinsic::riscv_sf_vlte64: + Info.memVT = MVT::i64; + Info.align = Align(8); + break; + } + Info.size = MemoryLocation::UnknownSize; + Info.flags |= MachineMemOperand::MOLoad; + return true; + case Intrinsic::riscv_sf_vste8: + case Intrinsic::riscv_sf_vste16: + case Intrinsic::riscv_sf_vste32: + case Intrinsic::riscv_sf_vste64: + Info.opc = ISD::INTRINSIC_VOID; + Info.ptrVal = I.getArgOperand(1); + switch (Intrinsic) { + case Intrinsic::riscv_sf_vste8: + Info.memVT = MVT::i8; + Info.align = Align(1); + break; + case Intrinsic::riscv_sf_vste16: + Info.memVT = MVT::i16; + Info.align = Align(2); + break; + case Intrinsic::riscv_sf_vste32: + Info.memVT = MVT::i32; + Info.align = Align(4); + break; + case Intrinsic::riscv_sf_vste64: + Info.memVT = MVT::i64; + Info.align = Align(8); + break; + } + Info.size = MemoryLocation::UnknownSize; + Info.flags |= MachineMemOperand::MOStore; + return true; } } @@ -8608,22 +8662,21 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::VSELECT: return lowerToScalableOp(Op, DAG); case ISD::SHL: - if (Subtarget.enablePExtCodeGen() && - Op.getSimpleValueType().isFixedLengthVector()) { - // We have patterns for scalar/immediate shift amount, so no lowering - // needed. - if (Op.getOperand(1)->getOpcode() == ISD::SPLAT_VECTOR) - return Op; - - // There's no vector-vector version of shift instruction in P extension so - // we need to unroll to scalar computation and pack them back. - return DAG.UnrollVectorOp(Op.getNode()); - } - [[fallthrough]]; - case ISD::SRA: case ISD::SRL: - if (Op.getSimpleValueType().isFixedLengthVector()) + case ISD::SRA: + if (Op.getSimpleValueType().isFixedLengthVector()) { + if (Subtarget.enablePExtCodeGen()) { + // We have patterns for scalar/immediate shift amount, so no lowering + // needed. + if (Op.getOperand(1)->getOpcode() == ISD::SPLAT_VECTOR) + return Op; + + // There's no vector-vector version of shift instruction in P extension + // so we need to unroll to scalar computation and pack them back. + return DAG.UnrollVectorOp(Op.getNode()); + } return lowerToScalableOp(Op, DAG); + } // This can be called for an i32 shift amount that needs to be promoted. 
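      // For example (illustrative): a splat shift such as
      //   (shl v4i8:$v, (splat_vector 3))
      // is returned unchanged for the PSLLI/PSRLI/PSRAI patterns to match,
      // while a vector-vector shift such as (srl v4i8:$a, v4i8:$b) is
      // unrolled into scalar shifts and repacked.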
assert(Op.getOperand(1).getValueType() == MVT::i32 && Subtarget.is64Bit() && "Unexpected custom legalisation"); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index ce5a67bd23a9a..76dc57c45fb0b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -3025,6 +3025,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, case RISCVOp::OPERAND_COND_CODE: Ok = Imm >= 0 && Imm < RISCVCC::COND_INVALID; break; + case RISCVOp::OPERAND_ATOMIC_ORDERING: + Ok = isValidAtomicOrdering(Imm); + break; case RISCVOp::OPERAND_VEC_POLICY: Ok = (Imm & (RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC)) == Imm; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td index 5c81a0990a64f..f5fd9acd8b303 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td @@ -11,6 +11,14 @@ // //===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// Operand and SDNode transformation definitions. +//===----------------------------------------------------------------------===// + +def ordering : RISCVOp { + let OperandType = "OPERAND_ATOMIC_ORDERING"; +} + //===----------------------------------------------------------------------===// // Instruction class templates //===----------------------------------------------------------------------===// @@ -244,7 +252,7 @@ defm : AMOPat<"atomic_load_umin_i64", "AMOMINU_D", i64, [IsRV64]>; /// Pseudo AMOs class PseudoAMO : Pseudo<(outs GPR:$res, GPR:$scratch), - (ins GPR:$addr, GPR:$incr, ixlenimm:$ordering), []> { + (ins GPR:$addr, GPR:$incr, ordering:$ordering), []> { let Constraints = "@earlyclobber $res,@earlyclobber $scratch"; let mayLoad = 1; let mayStore = 1; @@ -253,7 +261,7 @@ class PseudoAMO : Pseudo<(outs GPR:$res, GPR:$scratch), class PseudoMaskedAMO : Pseudo<(outs GPR:$res, GPR:$scratch), - (ins GPR:$addr, GPR:$incr, GPR:$mask, ixlenimm:$ordering), []> { + (ins GPR:$addr, GPR:$incr, GPR:$mask, ordering:$ordering), []> { let Constraints = "@earlyclobber $res,@earlyclobber $scratch"; let mayLoad = 1; let mayStore = 1; @@ -262,8 +270,8 @@ class PseudoMaskedAMO class PseudoMaskedAMOMinMax : Pseudo<(outs GPR:$res, GPR:$scratch1, GPR:$scratch2), - (ins GPR:$addr, GPR:$incr, GPR:$mask, ixlenimm:$sextshamt, - ixlenimm:$ordering), []> { + (ins GPR:$addr, GPR:$incr, GPR:$mask, GPR:$sextshamt, + ordering:$ordering), []> { let Constraints = "@earlyclobber $res,@earlyclobber $scratch1," "@earlyclobber $scratch2"; let mayLoad = 1; @@ -273,7 +281,7 @@ class PseudoMaskedAMOMinMax class PseudoMaskedAMOUMinUMax : Pseudo<(outs GPR:$res, GPR:$scratch1, GPR:$scratch2), - (ins GPR:$addr, GPR:$incr, GPR:$mask, ixlenimm:$ordering), []> { + (ins GPR:$addr, GPR:$incr, GPR:$mask, ordering:$ordering), []> { let Constraints = "@earlyclobber $res,@earlyclobber $scratch1," "@earlyclobber $scratch2"; let mayLoad = 1; @@ -419,7 +427,7 @@ defm : PseudoAMOPat<"atomic_load_nand_i64", PseudoAtomicLoadNand64, i64>; class PseudoCmpXchg : Pseudo<(outs GPR:$res, GPR:$scratch), - (ins GPR:$addr, GPR:$cmpval, GPR:$newval, ixlenimm:$ordering), []> { + (ins GPR:$addr, GPR:$cmpval, GPR:$newval, ordering:$ordering), []> { let Constraints = "@earlyclobber $res,@earlyclobber $scratch"; let mayLoad = 1; let mayStore = 1; @@ -457,7 +465,7 @@ let Predicates = [HasStdExtZalrsc] in { def PseudoMaskedCmpXchg32 : Pseudo<(outs 
GPR:$res, GPR:$scratch), (ins GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, - ixlenimm:$ordering), []> { + ordering:$ordering), []> { let Constraints = "@earlyclobber $res,@earlyclobber $scratch"; let mayLoad = 1; let mayStore = 1; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index bba9f961b9639..da4a3a6022337 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -437,7 +437,7 @@ class RVPTernary_rrr f, bits<2> w, bits<3> funct3, string opcodestr> let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in class RVPWideningTernary_rrr f, bits<2> w, string opcodestr> : RVPWideningBase { + (ins GPRPairRV32:$rd, GPR:$rs1, GPR:$rs2), opcodestr> { let Inst{30-27} = f; let Constraints = "$rd = $rd_wb"; @@ -1513,26 +1513,55 @@ let Predicates = [HasStdExtP] in { def: Pat<(XLenVecI16VT (abds GPR:$rs1, GPR:$rs2)), (PABD_H GPR:$rs1, GPR:$rs2)>; def: Pat<(XLenVecI16VT (abdu GPR:$rs1, GPR:$rs2)), (PABDU_H GPR:$rs1, GPR:$rs2)>; - // 8-bit logical shift left patterns + // 8-bit logical shift left/right patterns def: Pat<(XLenVecI8VT (shl GPR:$rs1, (XLenVecI8VT (splat_vector uimm3:$shamt)))), (PSLLI_B GPR:$rs1, uimm3:$shamt)>; + def: Pat<(XLenVecI8VT (srl GPR:$rs1, (XLenVecI8VT (splat_vector uimm3:$shamt)))), + (PSRLI_B GPR:$rs1, uimm3:$shamt)>; - // 16-bit logical shift left patterns + // 16-bit logical shift left/right patterns def: Pat<(XLenVecI16VT (shl GPR:$rs1, (XLenVecI16VT (splat_vector uimm4:$shamt)))), (PSLLI_H GPR:$rs1, uimm4:$shamt)>; + def: Pat<(XLenVecI16VT (srl GPR:$rs1, (XLenVecI16VT (splat_vector uimm4:$shamt)))), + (PSRLI_H GPR:$rs1, uimm4:$shamt)>; + + // 8-bit arithmetic shift right patterns + def: Pat<(XLenVecI8VT (sra GPR:$rs1, (XLenVecI8VT (splat_vector uimm3:$shamt)))), + (PSRAI_B GPR:$rs1, uimm3:$shamt)>; + + // 16-bit arithmetic shift right patterns + def: Pat<(XLenVecI16VT (sra GPR:$rs1, (XLenVecI16VT (splat_vector uimm4:$shamt)))), + (PSRAI_H GPR:$rs1, uimm4:$shamt)>; // 16-bit signed saturation shift left patterns def: Pat<(XLenVecI16VT (sshlsat GPR:$rs1, (XLenVecI16VT (splat_vector uimm4:$shamt)))), (PSSLAI_H GPR:$rs1, uimm4:$shamt)>; - // 8-bit logical shift left + // 8-bit logical shift left/right def: Pat<(XLenVecI8VT (shl GPR:$rs1, (XLenVecI8VT (splat_vector (XLenVT GPR:$rs2))))), (PSLL_BS GPR:$rs1, GPR:$rs2)>; - // 16-bit logical shift left + def: Pat<(XLenVecI8VT (srl GPR:$rs1, + (XLenVecI8VT (splat_vector (XLenVT GPR:$rs2))))), + (PSRL_BS GPR:$rs1, GPR:$rs2)>; + + // 8-bit arithmetic shift left/right + def: Pat<(XLenVecI8VT (sra GPR:$rs1, + (XLenVecI8VT (splat_vector (XLenVT GPR:$rs2))))), + (PSRA_BS GPR:$rs1, GPR:$rs2)>; + + // 16-bit logical shift left/right def: Pat<(XLenVecI16VT (shl GPR:$rs1, (XLenVecI16VT (splat_vector (XLenVT GPR:$rs2))))), (PSLL_HS GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (srl GPR:$rs1, + (XLenVecI16VT (splat_vector (XLenVT GPR:$rs2))))), + (PSRL_HS GPR:$rs1, GPR:$rs2)>; + + // 16-bit arithmetic shift left/right + def: Pat<(XLenVecI16VT (sra GPR:$rs1, + (XLenVecI16VT (splat_vector (XLenVT GPR:$rs2))))), + (PSRA_HS GPR:$rs1, GPR:$rs2)>; // 8-bit PLI SD node pattern def: Pat<(XLenVecI8VT (splat_vector simm8_unsigned:$imm8)), (PLI_B simm8_unsigned:$imm8)>; @@ -1580,16 +1609,28 @@ let Predicates = [HasStdExtP, IsRV64] in { def: Pat<(v2i32 (riscv_pasub GPR:$rs1, GPR:$rs2)), (PASUB_W GPR:$rs1, GPR:$rs2)>; def: Pat<(v2i32 (riscv_pasubu GPR:$rs1, GPR:$rs2)), (PASUBU_W GPR:$rs1, GPR:$rs2)>; - // 32-bit logical shift left + // 32-bit logical shift 
left/right def: Pat<(v2i32 (shl GPR:$rs1, (v2i32 (splat_vector (XLenVT GPR:$rs2))))), (PSLL_WS GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i32 (srl GPR:$rs1, (v2i32 (splat_vector (XLenVT GPR:$rs2))))), + (PSRL_WS GPR:$rs1, GPR:$rs2)>; + + // 32-bit arithmetic shift left/right + def: Pat<(v2i32 (sra GPR:$rs1, (v2i32 (splat_vector (XLenVT GPR:$rs2))))), + (PSRA_WS GPR:$rs1, GPR:$rs2)>; // splat pattern def: Pat<(v2i32 (splat_vector (XLenVT GPR:$rs2))), (PADD_WS (XLenVT X0), GPR:$rs2)>; - // 32-bit logical shift left patterns + // 32-bit logical shift left/right patterns def: Pat<(v2i32 (shl GPR:$rs1, (v2i32 (splat_vector uimm5:$shamt)))), (PSLLI_W GPR:$rs1, uimm5:$shamt)>; + def: Pat<(v2i32 (srl GPR:$rs1, (v2i32 (splat_vector uimm5:$shamt)))), + (PSRLI_W GPR:$rs1, uimm5:$shamt)>; + + // 32-bit arithmetic shift left/right patterns + def: Pat<(v2i32 (sra GPR:$rs1, (v2i32 (splat_vector uimm5:$shamt)))), + (PSRAI_W GPR:$rs1, uimm5:$shamt)>; // 32-bit signed saturation shift left patterns def: Pat<(v2i32 (sshlsat GPR:$rs1, (v2i32 (splat_vector uimm5:$shamt)))), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td index 594a75a4746d4..9354b63bced53 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td @@ -1840,3 +1840,6 @@ let Predicates = [HasVInstructionsI64, IsRV64] in { include "RISCVInstrInfoVPseudos.td" include "RISCVInstrInfoZvfbf.td" +// Include the non-intrinsic ISel patterns +include "RISCVInstrInfoVVLPatterns.td" +include "RISCVInstrInfoVSDPatterns.td" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index e36204c536c0d..cdbeb0c1046d2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -473,17 +473,27 @@ defset list AllWidenableIntVectors = { def : VTypeInfoToWide; } -defset list AllWidenableFloatVectors = { - def : VTypeInfoToWide; - def : VTypeInfoToWide; - def : VTypeInfoToWide; - def : VTypeInfoToWide; - def : VTypeInfoToWide; +defset list AllWidenableFloatAndBF16Vectors = { + defset list AllWidenableFloatVectors = { + def : VTypeInfoToWide; + def : VTypeInfoToWide; + def : VTypeInfoToWide; + def : VTypeInfoToWide; + def : VTypeInfoToWide; - def : VTypeInfoToWide; - def : VTypeInfoToWide; - def : VTypeInfoToWide; - def : VTypeInfoToWide; + def : VTypeInfoToWide; + def : VTypeInfoToWide; + def : VTypeInfoToWide; + def : VTypeInfoToWide; + } + + defset list AllWidenableBF16ToFloatVectors = { + def : VTypeInfoToWide; + def : VTypeInfoToWide; + def : VTypeInfoToWide; + def : VTypeInfoToWide; + def : VTypeInfoToWide; + } } defset list AllFractionableVF2IntVectors = { @@ -543,14 +553,6 @@ defset list AllWidenableIntToFloatVectors = { def : VTypeInfoToWide; } -defset list AllWidenableBF16ToFloatVectors = { - def : VTypeInfoToWide; - def : VTypeInfoToWide; - def : VTypeInfoToWide; - def : VTypeInfoToWide; - def : VTypeInfoToWide; -} - // This class holds the record of the RISCVVPseudoTable below. // This represents the information we need in codegen for each pseudo. 
// The definition should be consistent with `struct PseudoInfo` in @@ -780,7 +782,7 @@ class GetVRegNoV0 { class GetVTypePredicates { list Predicates = !cond(!eq(vti.Scalar, f16) : [HasVInstructionsF16], - !eq(vti.Scalar, bf16) : [HasVInstructionsBF16Minimal], + !eq(vti.Scalar, bf16) : [HasVInstructionsBF16], !eq(vti.Scalar, f32) : [HasVInstructionsAnyF], !eq(vti.Scalar, f64) : [HasVInstructionsF64], !eq(vti.SEW, 64) : [HasVInstructionsI64], @@ -7326,7 +7328,3 @@ defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16", // 16.5. Vector Compress Instruction //===----------------------------------------------------------------------===// defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllVectors>; - -// Include the non-intrinsic ISel patterns -include "RISCVInstrInfoVVLPatterns.td" -include "RISCVInstrInfoVSDPatterns.td" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index a67112b9981b8..14ad7ca0eb35a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -215,13 +215,17 @@ multiclass VPatBinaryFPSDNode_VV_VF { - foreach vti = !if(isBF16, AllBF16Vectors, AllFloatVectors) in { + list vtilist = AllFloatVectors, + bit isSEWAware = 0> { + foreach vti = vtilist in { let Predicates = GetVTypePredicates.Predicates in { - def : VPatBinarySDNode_VV_RM; - def : VPatBinarySDNode_VF_RM; @@ -246,14 +250,17 @@ multiclass VPatBinaryFPSDNode_R_VF { - foreach fvti = !if(isBF16, AllBF16Vectors, AllFloatVectors) in + list vtilist = AllFloatVectors, + bit isSEWAware = 0> { + foreach fvti = vtilist in let Predicates = GetVTypePredicates.Predicates in def : Pat<(fvti.Vector (vop (fvti.Vector (SplatFPOp fvti.Scalar:$rs2)), (fvti.Vector fvti.RegClass:$rs1))), (!cast( !if(isSEWAware, - instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_E"#fvti.SEW, + instruction_name# + !if(!eq(fvti.Scalar, bf16), "_ALT", "")# + "_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_E"#fvti.SEW, instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX)) (fvti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, @@ -664,11 +671,10 @@ multiclass VPatWidenFPMulAccSDNode_VV_VF_RM.Predicates, - GetVTypePredicates.Predicates, + let Predicates = !listconcat(GetVTypePredicates.Predicates, !if(!eq(vti.Scalar, bf16), [HasStdExtZvfbfwma], - [])) in { + GetVTypePredicates.Predicates)) in { def : Pat<(fma (wti.Vector (riscv_fpextend_vl_sameuser (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), (XLenVT srcvalue))), @@ -676,7 +682,9 @@ multiclass VPatWidenFPMulAccSDNode_VV_VF_RM(instruction_name#"_VV_"#suffix) + (!cast(instruction_name# + !if(!eq(vti.Scalar, bf16), "BF16", "")# + "_VV_"#suffix) wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, // Value to indicate no rounding mode change in // RISCVInsertReadWriteCSR @@ -688,7 +696,9 @@ multiclass VPatWidenFPMulAccSDNode_VV_VF_RM(instruction_name#"_V"#vti.ScalarSuffix#"_"#suffix) + (!cast(instruction_name# + !if(!eq(vti.Scalar, bf16), "BF16", "")# + "_V"#vti.ScalarSuffix#"_"#suffix) wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, // Value to indicate no rounding mode change in // RISCVInsertReadWriteCSR @@ -1201,16 +1211,20 @@ foreach mti = AllMasks in { // 13. Vector Floating-Point Instructions // 13.2. 
Vector Single-Width Floating-Point Add/Subtract Instructions -defm : VPatBinaryFPSDNode_VV_VF_RM; -defm : VPatBinaryFPSDNode_VV_VF_RM; -defm : VPatBinaryFPSDNode_R_VF_RM; +defm : VPatBinaryFPSDNode_VV_VF_RM; +defm : VPatBinaryFPSDNode_VV_VF_RM; +defm : VPatBinaryFPSDNode_R_VF_RM; // 13.3. Vector Widening Floating-Point Add/Subtract Instructions defm : VPatWidenBinaryFPSDNode_VV_VF_WV_WF_RM; defm : VPatWidenBinaryFPSDNode_VV_VF_WV_WF_RM; // 13.4. Vector Single-Width Floating-Point Multiply/Divide Instructions -defm : VPatBinaryFPSDNode_VV_VF_RM; +defm : VPatBinaryFPSDNode_VV_VF_RM; defm : VPatBinaryFPSDNode_VV_VF_RM; defm : VPatBinaryFPSDNode_R_VF_RM; @@ -1314,14 +1328,15 @@ foreach fvti = AllFloatVectors in { // 13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions defm : VPatWidenFPMulAccSDNode_VV_VF_RM<"PseudoVFWMACC", - AllWidenableFloatVectors>; + AllWidenableFloatAndBF16Vectors>; defm : VPatWidenFPNegMulAccSDNode_VV_VF_RM<"PseudoVFWNMACC">; defm : VPatWidenFPMulSacSDNode_VV_VF_RM<"PseudoVFWMSAC">; defm : VPatWidenFPNegMulSacSDNode_VV_VF_RM<"PseudoVFWNMSAC">; -foreach vti = AllFloatVectors in { +foreach vti = AllFloatAndBF16Vectors in { let Predicates = GetVTypePredicates.Predicates in { // 13.8. Vector Floating-Point Square-Root Instruction + if !ne(vti.Scalar, bf16) then def : Pat<(any_fsqrt (vti.Vector vti.RegClass:$rs2)), (!cast("PseudoVFSQRT_V_"# vti.LMul.MX#"_E"#vti.SEW) (vti.Vector (IMPLICIT_DEF)), @@ -1333,34 +1348,46 @@ foreach vti = AllFloatVectors in { // 13.12. Vector Floating-Point Sign-Injection Instructions def : Pat<(fabs (vti.Vector vti.RegClass:$rs)), - (!cast("PseudoVFSGNJX_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (!cast("PseudoVFSGNJX"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_VV_"# vti.LMul.MX#"_E"#vti.SEW) (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs, vti.RegClass:$rs, vti.AVL, vti.Log2SEW, TA_MA)>; // Handle fneg with VFSGNJN using the same input for both operands. 
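// For example (illustrative): vfsgnjn.vv vd, vs2, vs1 produces the
// magnitude of vs2 with the negated sign of vs1, so passing the same
// register for both operands yields vd = -vs, i.e. a plain fneg.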
def : Pat<(fneg (vti.Vector vti.RegClass:$rs)), - (!cast("PseudoVFSGNJN_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (!cast("PseudoVFSGNJN"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_VV_"# vti.LMul.MX#"_E"#vti.SEW) (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs, vti.RegClass:$rs, vti.AVL, vti.Log2SEW, TA_MA)>; def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), (vti.Vector vti.RegClass:$rs2))), - (!cast("PseudoVFSGNJ_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (!cast("PseudoVFSGNJ"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_VV_"# vti.LMul.MX#"_E"#vti.SEW) (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, vti.RegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs2)))), - (!cast("PseudoVFSGNJ_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_E"#vti.SEW) + (!cast("PseudoVFSGNJ"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_E"#vti.SEW) (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, vti.ScalarRegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), (vti.Vector (fneg vti.RegClass:$rs2)))), - (!cast("PseudoVFSGNJN_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (!cast("PseudoVFSGNJN"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_VV_"# vti.LMul.MX#"_E"#vti.SEW) (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, vti.RegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), (vti.Vector (fneg (SplatFPOp vti.ScalarRegClass:$rs2))))), - (!cast("PseudoVFSGNJN_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_E"#vti.SEW) + (!cast("PseudoVFSGNJN"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_E"#vti.SEW) (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, vti.ScalarRegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; } @@ -1446,13 +1473,28 @@ defm : VPatNConvertFP2ISDNode_W; defm : VPatNConvertFP2ISDNode_W; defm : VPatNConvertI2FPSDNode_W_RM; defm : VPatNConvertI2FPSDNode_W_RM; -foreach fvtiToFWti = AllWidenableFloatVectors in { +foreach fvtiToFWti = AllWidenableFloatAndBF16Vectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; - let Predicates = !listconcat(GetVTypeMinimalPredicates.Predicates, - GetVTypeMinimalPredicates.Predicates) in + let Predicates = !listconcat(GetVTypeMinimalPredicates.Predicates, + !if(!eq(fvti.Scalar, bf16), + [HasStdExtZvfbfmin], + GetVTypeMinimalPredicates.Predicates)) in + def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))), + (!cast("PseudoVFNCVT"# + !if(!eq(fvti.Scalar, bf16), "BF16", "")# + "_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW) + (fvti.Vector (IMPLICIT_DEF)), + fwti.RegClass:$rs1, + // Value to indicate no rounding mode change in + // RISCVInsertReadWriteCSR + FRM_DYN, + fvti.AVL, fvti.Log2SEW, TA_MA)>; + // Define vfncvt.f.f.w for bf16 when Zvfbfa is enabled. 
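// (Illustrative: for a bf16 fvti with LMUL=1 and SEW=16, the pattern above
// selects PseudoVFNCVTBF16_F_F_W_M1_E16, while the Zvfbfa variant defined
// next, under the HasVInstructionsBF16 predicate, selects
// PseudoVFNCVT_F_F_ALT_W_M1_E16.)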
+ if !eq(fvti.Scalar, bf16) then + let Predicates = [HasVInstructionsBF16] in def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))), - (!cast("PseudoVFNCVT_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW) + (!cast("PseudoVFNCVT_F_F_ALT_W_"#fvti.LMul.MX#"_E"#fvti.SEW) (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, // Value to indicate no rounding mode change in @@ -1464,10 +1506,10 @@ foreach fvtiToFWti = AllWidenableFloatVectors in { //===----------------------------------------------------------------------===// // Vector Element Extracts //===----------------------------------------------------------------------===// -foreach vti = NoGroupFloatVectors in { - defvar vfmv_f_s_inst = !cast(!strconcat("PseudoVFMV_", - vti.ScalarSuffix, - "_S")); +foreach vti = !listconcat(NoGroupFloatVectors, NoGroupBF16Vectors) in { + defvar vfmv_f_s_inst = + !cast(!strconcat("PseudoVFMV_", vti.ScalarSuffix, + "_S", !if(!eq(vti.Scalar, bf16), "_ALT", ""))); // Only pattern-match extract-element operations where the index is 0. Any // other index will have been custom-lowered to slide the vector correctly // into place. diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 38edab5400291..9273ce094eb0a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -1058,14 +1058,18 @@ multiclass VPatBinaryFPVL_VV_VF { - foreach vti = !if(isBF16, AllBF16Vectors, AllFloatVectors) in { + list vtilist = AllFloatVectors, + bit isSEWAware = 0> { + foreach vti = vtilist in { let Predicates = GetVTypePredicates.Predicates in { - def : VPatBinaryVL_V_RM; - def : VPatBinaryVL_VF_RM; @@ -1093,8 +1097,9 @@ multiclass VPatBinaryFPVL_R_VF { - foreach fvti = !if(isBF16, AllBF16Vectors, AllFloatVectors) in { + list vtilist = AllFloatVectors, + bit isSEWAware = 0> { + foreach fvti = vtilist in { let Predicates = GetVTypePredicates.Predicates in def : Pat<(fvti.Vector (vop (SplatFPOp fvti.ScalarRegClass:$rs2), fvti.RegClass:$rs1, @@ -1103,7 +1108,9 @@ multiclass VPatBinaryFPVL_R_VF_RM( !if(isSEWAware, - instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK", + instruction_name# + !if(!eq(fvti.Scalar, bf16), "_ALT", "")# + "_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK", instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK")) fvti.RegClass:$passthru, fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2, @@ -1832,16 +1839,17 @@ multiclass VPatWidenFPMulAccVL_VV_VF_RM.Predicates, - GetVTypePredicates.Predicates, + let Predicates = !listconcat(GetVTypePredicates.Predicates, !if(!eq(vti.Scalar, bf16), [HasStdExtZvfbfwma], - [])) in { + GetVTypePredicates.Predicates)) in { def : Pat<(vop (vti.Vector vti.RegClass:$rs1), (vti.Vector vti.RegClass:$rs2), (wti.Vector wti.RegClass:$rd), (vti.Mask VMV0:$vm), VLOpFrag), - (!cast(instruction_name#"_VV_"#suffix#"_MASK") + (!cast(instruction_name# + !if(!eq(vti.Scalar, bf16), "BF16", "")# + "_VV_"#suffix#"_MASK") wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, (vti.Mask VMV0:$vm), // Value to indicate no rounding mode change in @@ -1852,7 +1860,9 @@ multiclass VPatWidenFPMulAccVL_VV_VF_RM(instruction_name#"_V"#vti.ScalarSuffix#"_"#suffix#"_MASK") + (!cast(instruction_name# + !if(!eq(vti.Scalar, bf16), "BF16", "")# + "_V"#vti.ScalarSuffix#"_"#suffix#"_MASK") wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, (vti.Mask VMV0:$vm), // Value to indicate no rounding mode change in @@ -2296,9 +2306,12 @@ 
foreach vtiTowti = AllWidenableIntVectors in { // 13. Vector Floating-Point Instructions // 13.2. Vector Single-Width Floating-Point Add/Subtract Instructions -defm : VPatBinaryFPVL_VV_VF_RM; -defm : VPatBinaryFPVL_VV_VF_RM; -defm : VPatBinaryFPVL_R_VF_RM; +defm : VPatBinaryFPVL_VV_VF_RM; +defm : VPatBinaryFPVL_VV_VF_RM; +defm : VPatBinaryFPVL_R_VF_RM; // 13.3. Vector Widening Floating-Point Add/Subtract Instructions defm : VPatBinaryFPWVL_VV_VF_WV_WF_RM; // 13.4. Vector Single-Width Floating-Point Multiply/Divide Instructions -defm : VPatBinaryFPVL_VV_VF_RM; +defm : VPatBinaryFPVL_VV_VF_RM; defm : VPatBinaryFPVL_VV_VF_RM; defm : VPatBinaryFPVL_R_VF_RM; @@ -2321,7 +2335,8 @@ defm : VPatFPMulAddVL_VV_VF_RM; defm : VPatFPMulAddVL_VV_VF_RM; // 13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions -defm : VPatWidenFPMulAccVL_VV_VF_RM; +defm : VPatWidenFPMulAccVL_VV_VF_RM; defm : VPatWidenFPMulAccVL_VV_VF_RM; defm : VPatWidenFPMulAccVL_VV_VF_RM; defm : VPatWidenFPMulAccVL_VV_VF_RM; @@ -2423,6 +2438,66 @@ foreach vti = AllFloatVectors in { } } +foreach vti = AllBF16Vectors in { + let Predicates = GetVTypePredicates.Predicates in { + // 13.12. Vector Floating-Point Sign-Injection Instructions + def : Pat<(riscv_fabs_vl (vti.Vector vti.RegClass:$rs), (vti.Mask VMV0:$vm), + VLOpFrag), + (!cast("PseudoVFSGNJX"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_VV_"# vti.LMul.MX #"_E"#vti.SEW#"_MASK") + (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs, + vti.RegClass:$rs, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, + TA_MA)>; + // Handle fneg with VFSGNJN using the same input for both operands. + def : Pat<(riscv_fneg_vl (vti.Vector vti.RegClass:$rs), (vti.Mask VMV0:$vm), + VLOpFrag), + (!cast("PseudoVFSGNJN"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_VV_"# vti.LMul.MX#"_E"#vti.SEW #"_MASK") + (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs, + vti.RegClass:$rs, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, + TA_MA)>; + + def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), + (vti.Vector vti.RegClass:$rs2), + vti.RegClass:$passthru, + (vti.Mask VMV0:$vm), + VLOpFrag), + (!cast("PseudoVFSGNJ"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_VV_"# vti.LMul.MX#"_E"#vti.SEW#"_MASK") + vti.RegClass:$passthru, vti.RegClass:$rs1, + vti.RegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, + TAIL_AGNOSTIC)>; + + def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), + (riscv_fneg_vl vti.RegClass:$rs2, + (vti.Mask true_mask), + VLOpFrag), + srcvalue, + (vti.Mask true_mask), + VLOpFrag), + (!cast("PseudoVFSGNJN"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (vti.Vector (IMPLICIT_DEF)), + vti.RegClass:$rs1, vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW, TA_MA)>; + + def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), + (SplatFPOp vti.ScalarRegClass:$rs2), + vti.RegClass:$passthru, + (vti.Mask VMV0:$vm), + VLOpFrag), + (!cast("PseudoVFSGNJ"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_V"#vti.ScalarSuffix#"_"# vti.LMul.MX#"_E"#vti.SEW#"_MASK") + vti.RegClass:$passthru, vti.RegClass:$rs1, + vti.ScalarRegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, + TAIL_AGNOSTIC)>; + } +} + // Floating-point vselects: // 11.15. Vector Integer Merge Instructions // 13.15. Vector Floating-Point Merge Instruction @@ -2476,7 +2551,7 @@ foreach fvti = AllFloatVectors in { } } -foreach fvti = AllFloatVectors in { +foreach fvti = AllFloatAndBF16Vectors in { defvar ivti = GetIntVTypeInfo.Vti; let Predicates = GetVTypePredicates.Predicates in { // 13.16. 
Vector Floating-Point Move Instruction @@ -2492,11 +2567,13 @@ foreach fvti = AllFloatVectors in { } } -foreach fvti = AllFloatVectors in { +foreach fvti = AllFloatAndBF16Vectors in { let Predicates = GetVTypePredicates.Predicates in { def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl fvti.Vector:$passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)), - (!cast("PseudoVFMV_V_" # fvti.ScalarSuffix # "_" # + (!cast("PseudoVFMV_V" # + !if(!eq(fvti.Scalar, bf16), "_ALT_", "_") # + fvti.ScalarSuffix # "_" # fvti.LMul.MX) $passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), GPR:$vl, fvti.Log2SEW, TU_MU)>; @@ -2526,20 +2603,37 @@ defm : VPatWConvertFP2IVL_V; defm : VPatWConvertI2FPVL_V; -foreach fvtiToFWti = AllWidenableFloatVectors in { +foreach fvtiToFWti = AllWidenableFloatAndBF16Vectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; - // Define vfwcvt.f.f.v for f16 when Zvfhmin is enable. - let Predicates = !listconcat(GetVTypeMinimalPredicates.Predicates, - GetVTypeMinimalPredicates.Predicates) in + // Define vfwcvt.f.f.v for f16 when Zvfhmin is enabled. + // Define vfwcvtbf16.f.f.v for bf16 when Zvfbfmin is enabled. + let Predicates = !listconcat(GetVTypeMinimalPredicates.Predicates, + !if(!eq(fvti.Scalar, bf16), + [HasStdExtZvfbfmin], + GetVTypeMinimalPredicates.Predicates)) in { def : Pat<(fwti.Vector (any_riscv_fpextend_vl (fvti.Vector fvti.RegClass:$rs1), (fvti.Mask VMV0:$vm), VLOpFrag)), - (!cast("PseudoVFWCVT_F_F_V_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") + (!cast("PseudoVFWCVT"# + !if(!eq(fvti.Scalar, bf16), "BF16", "")# + "_F_F_V_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") (fwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TA_MA)>; + + // Define vfwcvt.f.f.v for bf16 when Zvfbfa is enabled. + if !eq(fvti.Scalar, bf16) then + let Predicates = [HasVInstructionsBF16] in + def : Pat<(fwti.Vector (any_riscv_fpextend_vl + (fvti.Vector fvti.RegClass:$rs1), + (fvti.Mask VMV0:$vm), + VLOpFrag)), + (!cast("PseudoVFWCVT_F_F_ALT_V_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") + (fwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, + (fvti.Mask VMV0:$vm), + GPR:$vl, fvti.Log2SEW, TA_MA)>; } // 13.19 Narrowing Floating-Point/Integer Type-Convert Instructions @@ -2555,16 +2649,21 @@ defm : VPatNConvertI2FPVL_W_RM; defm : VPatNConvertI2FP_RM_VL_W; defm : VPatNConvertI2FP_RM_VL_W; -foreach fvtiToFWti = AllWidenableFloatVectors in { +foreach fvtiToFWti = AllWidenableFloatAndBF16Vectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; - // Define vfncvt.f.f.w for f16 when Zvfhmin is enable. - let Predicates = !listconcat(GetVTypeMinimalPredicates.Predicates, - GetVTypeMinimalPredicates.Predicates) in { + // Define vfncvt.f.f.w for f16 when Zvfhmin is enabled. + // Define vfncvtbf16.f.f.w for bf16 when Zvfbfmin is enabled. 
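// (Editorial sketch, not part of the patch: the !cast name below is assembled
// as "PseudoVFNCVT" # !if(bf16, "BF16", "") # "_F_F_W_" # LMUL # "_E" # SEW #
// "_MASK". For a bf16 narrow type at LMUL=1 (SEW=16) it resolves to
// PseudoVFNCVTBF16_F_F_W_M1_E16_MASK; in the f16/Zvfhmin case the BF16 infix
// is dropped, giving PseudoVFNCVT_F_F_W_M1_E16_MASK.)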
+ let Predicates = !listconcat(GetVTypeMinimalPredicates.Predicates, + !if(!eq(fvti.Scalar, bf16), + [HasStdExtZvfbfmin], + GetVTypeMinimalPredicates.Predicates)) in def : Pat<(fvti.Vector (any_riscv_fpround_vl (fwti.Vector fwti.RegClass:$rs1), (fwti.Mask VMV0:$vm), VLOpFrag)), - (!cast("PseudoVFNCVT_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") + (!cast("PseudoVFNCVT"# + !if(!eq(fvti.Scalar, bf16), "BF16", "")# + "_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, (fwti.Mask VMV0:$vm), // Value to indicate no rounding mode change in @@ -2581,6 +2680,20 @@ foreach fvtiToFWti = AllWidenableFloatVectors in { (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, (fwti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TA_MA)>; } + + // Define vfncvt.f.f.w for bf16 when Zvfbfa is enabled. + if !eq(fvti.Scalar, bf16) then + let Predicates = [HasVInstructionsBF16] in + def : Pat<(fvti.Vector (any_riscv_fpround_vl + (fwti.Vector fwti.RegClass:$rs1), + (fwti.Mask VMV0:$vm), VLOpFrag)), + (!cast("PseudoVFNCVT_F_F_ALT_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") + (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, + (fwti.Mask VMV0:$vm), + // Value to indicate no rounding mode change in + // RISCVInsertReadWriteCSR + FRM_DYN, + GPR:$vl, fvti.Log2SEW, TA_MA)>; } // 14. Vector Reduction Operations @@ -2751,7 +2864,7 @@ foreach vti = AllIntegerVectors in { } // 16.2. Floating-Point Scalar Move Instructions -foreach vti = NoGroupFloatVectors in { +foreach vti = !listconcat(NoGroupFloatVectors, NoGroupBF16Vectors) in { let Predicates = GetVTypePredicates.Predicates in { def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru), (vti.Scalar (fpimm0)), @@ -2764,7 +2877,8 @@ foreach vti = NoGroupFloatVectors in { def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru), vti.ScalarRegClass:$rs1, VLOpFrag)), - (!cast("PseudoVFMV_S_"#vti.ScalarSuffix) + (!cast("PseudoVFMV_S_"#vti.ScalarSuffix# + !if(!eq(vti.Scalar, bf16), "_ALT", "")) vti.RegClass:$passthru, (vti.Scalar vti.ScalarRegClass:$rs1), GPR:$vl, vti.Log2SEW)>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td index 3a6ce3ce1d469..39a7aeda94707 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td @@ -156,7 +156,7 @@ foreach m = MxList in { let BaseInstr = RI_VEXTRACT in def PseudoRI_VEXTRACT_ # mx : RISCVVPseudo<(outs GPR:$rd), - (ins m.vrclass:$rs2, uimm5:$idx, ixlenimm:$sew), + (ins m.vrclass:$rs2, uimm5:$idx, sew:$sew), []>; let HasVLOp = 1, BaseInstr = RI_VINSERT, HasVecPolicyOp = 1, @@ -164,7 +164,7 @@ foreach m = MxList in { def PseudoRI_VINSERT_ # mx : RISCVVPseudo<(outs m.vrclass:$rd), (ins m.vrclass:$rs1, GPR:$rs2, uimm5:$idx, AVL:$vl, - ixlenimm:$sew, ixlenimm:$policy), + sew:$sew, vec_policy:$policy), []>; } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td index e24e4a33288f7..866e831fdcd94 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td @@ -406,47 +406,11 @@ let Predicates = [HasStdExtZvfbfmin] in { "PseudoVFWCVTBF16_F_F", isSEWAware=1>; defm : VPatConversionVF_WF_BF_RM<"int_riscv_vfncvtbf16_f_f_w", "PseudoVFNCVTBF16_F_F", isSEWAware=1>; - - foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { - defvar fvti = fvtiToFWti.Vti; - defvar fwti = fvtiToFWti.Wti; - def : Pat<(fwti.Vector (any_riscv_fpextend_vl - (fvti.Vector fvti.RegClass:$rs1), - 
(fvti.Mask VMV0:$vm), - VLOpFrag)), - (!cast("PseudoVFWCVTBF16_F_F_V_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") - (fwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, - (fvti.Mask VMV0:$vm), - GPR:$vl, fvti.Log2SEW, TA_MA)>; - - def : Pat<(fvti.Vector (any_riscv_fpround_vl - (fwti.Vector fwti.RegClass:$rs1), - (fwti.Mask VMV0:$vm), VLOpFrag)), - (!cast("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") - (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, - (fwti.Mask VMV0:$vm), - // Value to indicate no rounding mode change in - // RISCVInsertReadWriteCSR - FRM_DYN, - GPR:$vl, fvti.Log2SEW, TA_MA)>; - def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))), - (!cast("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW) - (fvti.Vector (IMPLICIT_DEF)), - fwti.RegClass:$rs1, - // Value to indicate no rounding mode change in - // RISCVInsertReadWriteCSR - FRM_DYN, - fvti.AVL, fvti.Log2SEW, TA_MA)>; - } } let Predicates = [HasStdExtZvfbfwma] in { defm : VPatTernaryW_VV_VX_RM<"int_riscv_vfwmaccbf16", "PseudoVFWMACCBF16", AllWidenableBF16ToFloatVectors, isSEWAware=1>; - defm : VPatWidenFPMulAccVL_VV_VF_RM; - defm : VPatWidenFPMulAccSDNode_VV_VF_RM<"PseudoVFWMACCBF16", - AllWidenableBF16ToFloatVectors>; } multiclass VPatConversionVI_VF_BF16 { @@ -614,191 +578,4 @@ defm : VPatConversionVF_WF_BF16<"int_riscv_vfncvt_rod_f_f_w", "PseudoVFNCVT_ROD_ isSEWAware=1>; defm : VPatBinaryV_VX<"int_riscv_vfslide1up", "PseudoVFSLIDE1UP_ALT", AllBF16Vectors>; defm : VPatBinaryV_VX<"int_riscv_vfslide1down", "PseudoVFSLIDE1DOWN_ALT", AllBF16Vectors>; - -foreach fvti = AllBF16Vectors in { - defvar ivti = GetIntVTypeInfo.Vti; - let Predicates = GetVTypePredicates.Predicates in { - // 13.16. Vector Floating-Point Move Instruction - // If we're splatting fpimm0, use vmv.v.x vd, x0. - def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl - fvti.Vector:$passthru, (fvti.Scalar (fpimm0)), VLOpFrag)), - (!cast("PseudoVMV_V_I_"#fvti.LMul.MX) - $passthru, 0, GPR:$vl, fvti.Log2SEW, TU_MU)>; - def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl - fvti.Vector:$passthru, (fvti.Scalar (SelectScalarFPAsInt (XLenVT GPR:$imm))), VLOpFrag)), - (!cast("PseudoVMV_V_X_"#fvti.LMul.MX) - $passthru, GPR:$imm, GPR:$vl, fvti.Log2SEW, TU_MU)>; - } - - let Predicates = GetVTypePredicates.Predicates in { - def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl - fvti.Vector:$passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)), - (!cast("PseudoVFMV_V_ALT_" # fvti.ScalarSuffix # "_" # - fvti.LMul.MX) - $passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), - GPR:$vl, fvti.Log2SEW, TU_MU)>; - } -} - -foreach vti = NoGroupBF16Vectors in { - let Predicates = GetVTypePredicates.Predicates in { - def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru), - (vti.Scalar (fpimm0)), - VLOpFrag)), - (PseudoVMV_S_X $passthru, (XLenVT X0), GPR:$vl, vti.Log2SEW)>; - def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru), - (vti.Scalar (SelectScalarFPAsInt (XLenVT GPR:$imm))), - VLOpFrag)), - (PseudoVMV_S_X $passthru, GPR:$imm, GPR:$vl, vti.Log2SEW)>; - def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru), - vti.ScalarRegClass:$rs1, - VLOpFrag)), - (!cast("PseudoVFMV_S_"#vti.ScalarSuffix#"_ALT") - vti.RegClass:$passthru, - (vti.Scalar vti.ScalarRegClass:$rs1), GPR:$vl, vti.Log2SEW)>; - } - - defvar vfmv_f_s_inst = !cast(!strconcat("PseudoVFMV_", - vti.ScalarSuffix, - "_S_ALT")); - // Only pattern-match extract-element operations where the index is 0. 
Any - // other index will have been custom-lowered to slide the vector correctly - // into place. - let Predicates = GetVTypePredicates.Predicates in - def : Pat<(vti.Scalar (extractelt (vti.Vector vti.RegClass:$rs2), 0)), - (vfmv_f_s_inst vti.RegClass:$rs2, vti.Log2SEW)>; -} - -let Predicates = [HasStdExtZvfbfa] in { - foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { - defvar fvti = fvtiToFWti.Vti; - defvar fwti = fvtiToFWti.Wti; - def : Pat<(fwti.Vector (any_riscv_fpextend_vl - (fvti.Vector fvti.RegClass:$rs1), - (fvti.Mask VMV0:$vm), - VLOpFrag)), - (!cast("PseudoVFWCVT_F_F_ALT_V_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") - (fwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, - (fvti.Mask VMV0:$vm), - GPR:$vl, fvti.Log2SEW, TA_MA)>; - - def : Pat<(fvti.Vector (any_riscv_fpround_vl - (fwti.Vector fwti.RegClass:$rs1), - (fwti.Mask VMV0:$vm), VLOpFrag)), - (!cast("PseudoVFNCVT_F_F_ALT_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") - (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, - (fwti.Mask VMV0:$vm), - // Value to indicate no rounding mode change in - // RISCVInsertReadWriteCSR - FRM_DYN, - GPR:$vl, fvti.Log2SEW, TA_MA)>; - def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))), - (!cast("PseudoVFNCVT_F_F_ALT_W_"#fvti.LMul.MX#"_E"#fvti.SEW) - (fvti.Vector (IMPLICIT_DEF)), - fwti.RegClass:$rs1, - // Value to indicate no rounding mode change in - // RISCVInsertReadWriteCSR - FRM_DYN, - fvti.AVL, fvti.Log2SEW, TA_MA)>; - } - - foreach vti = AllBF16Vectors in { - // 13.12. Vector Floating-Point Sign-Injection Instructions - def : Pat<(fabs (vti.Vector vti.RegClass:$rs)), - (!cast("PseudoVFSGNJX_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) - (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs, vti.RegClass:$rs, vti.AVL, vti.Log2SEW, TA_MA)>; - // Handle fneg with VFSGNJN using the same input for both operands. - def : Pat<(fneg (vti.Vector vti.RegClass:$rs)), - (!cast("PseudoVFSGNJN_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) - (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs, vti.RegClass:$rs, vti.AVL, vti.Log2SEW, TA_MA)>; - - def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), - (vti.Vector vti.RegClass:$rs2))), - (!cast("PseudoVFSGNJ_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) - (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs1, vti.RegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; - def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), - (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs2)))), - (!cast("PseudoVFSGNJ_ALT_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_E"#vti.SEW) - (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs1, vti.ScalarRegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; - - def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), - (vti.Vector (fneg vti.RegClass:$rs2)))), - (!cast("PseudoVFSGNJN_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) - (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs1, vti.RegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; - def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), - (vti.Vector (fneg (SplatFPOp vti.ScalarRegClass:$rs2))))), - (!cast("PseudoVFSGNJN_ALT_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_E"#vti.SEW) - (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs1, vti.ScalarRegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; - - // 13.12. 
Vector Floating-Point Sign-Injection Instructions - def : Pat<(riscv_fabs_vl (vti.Vector vti.RegClass:$rs), (vti.Mask VMV0:$vm), - VLOpFrag), - (!cast("PseudoVFSGNJX_ALT_VV_"# vti.LMul.MX #"_E"#vti.SEW#"_MASK") - (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs, - vti.RegClass:$rs, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, - TA_MA)>; - // Handle fneg with VFSGNJN using the same input for both operands. - def : Pat<(riscv_fneg_vl (vti.Vector vti.RegClass:$rs), (vti.Mask VMV0:$vm), - VLOpFrag), - (!cast("PseudoVFSGNJN_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW #"_MASK") - (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs, - vti.RegClass:$rs, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, - TA_MA)>; - - def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), - (vti.Vector vti.RegClass:$rs2), - vti.RegClass:$passthru, - (vti.Mask VMV0:$vm), - VLOpFrag), - (!cast("PseudoVFSGNJ_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW#"_MASK") - vti.RegClass:$passthru, vti.RegClass:$rs1, - vti.RegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, - TAIL_AGNOSTIC)>; - - def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), - (riscv_fneg_vl vti.RegClass:$rs2, - (vti.Mask true_mask), - VLOpFrag), - srcvalue, - (vti.Mask true_mask), - VLOpFrag), - (!cast("PseudoVFSGNJN_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) - (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, - vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW, TA_MA)>; - - def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), - (SplatFPOp vti.ScalarRegClass:$rs2), - vti.RegClass:$passthru, - (vti.Mask VMV0:$vm), - VLOpFrag), - (!cast("PseudoVFSGNJ_ALT_V"#vti.ScalarSuffix#"_"# vti.LMul.MX#"_E"#vti.SEW#"_MASK") - vti.RegClass:$passthru, vti.RegClass:$rs1, - vti.ScalarRegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, - TAIL_AGNOSTIC)>; - } - } - - defm : VPatBinaryFPSDNode_VV_VF_RM; - defm : VPatBinaryFPSDNode_VV_VF_RM; - defm : VPatBinaryFPSDNode_VV_VF_RM; - defm : VPatBinaryFPSDNode_R_VF_RM; - - defm : VPatBinaryFPVL_VV_VF_RM; - defm : VPatBinaryFPVL_VV_VF_RM; - defm : VPatBinaryFPVL_VV_VF_RM; - defm : VPatBinaryFPVL_R_VF_RM; } // Predicates = [HasStdExtZvfbfa] diff --git a/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp index a22ab6bfc04b8..f1827dcf174f3 100644 --- a/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp @@ -70,6 +70,12 @@ struct RISCVLoadStoreOpt : public MachineFunctionPass { // Convert load/store pairs to single instructions. bool tryConvertToLdStPair(MachineBasicBlock::iterator First, MachineBasicBlock::iterator Second); + bool tryConvertToXqcilsmLdStPair(MachineFunction *MF, + MachineBasicBlock::iterator First, + MachineBasicBlock::iterator Second); + bool tryConvertToMIPSLdStPair(MachineFunction *MF, + MachineBasicBlock::iterator First, + MachineBasicBlock::iterator Second); // Scan the instructions looking for a load/store that can be combined // with the current instruction into a load/store pair. 
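A minimal standalone sketch of the pairing legality test that the new tryConvertToXqcilsmLdStPair hunk below encodes, included here for orientation only; the helper name and free-function shape are illustrative and not part of the patch. Two adjacent word loads or stores can fold into QC_LWMI/QC_SWMI only when they share a base register, are at least word aligned, cover consecutive offsets whose lower value encodes as a 5-bit unsigned immediate scaled by 4, and use consecutive data registers.

#include "llvm/Support/MathExtras.h" // for llvm::isShiftedUInt

// Illustrative mirror of the checks added in tryConvertToXqcilsmLdStPair;
// plain unsigned register numbers stand in for MCRegister.
static bool canPairWordsForXqcilsm(bool IsLoad, unsigned Base1, unsigned Base2,
                                   int64_t Off1, int64_t Off2,
                                   unsigned StartReg, unsigned NextReg,
                                   unsigned AlignBytes) {
  if (Base1 != Base2 || AlignBytes < 4) // same base and word alignment
    return false;
  if (Off2 - Off1 != 4)                 // the two words must be adjacent
    return false;
  if (!llvm::isShiftedUInt<5, 2>(Off1)) // low offset must fit uimm5 << 2
    return false;
  if (NextReg != StartReg + 1)          // data regs form a consecutive pair
    return false;
  if (IsLoad && (StartReg == Base1 || NextReg == Base1))
    return false;                       // a load must not clobber its base
  return true;
}

The patch additionally canonicalizes the pair so Off1 is the lower offset and rejects X0 data registers before applying these checks.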
@@ -114,7 +120,7 @@ bool RISCVLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { ModifiedRegUnits.init(*TRI); UsedRegUnits.init(*TRI); - if (Subtarget.useMIPSLoadStorePairs()) { + if (Subtarget.useMIPSLoadStorePairs() || Subtarget.hasVendorXqcilsm()) { for (MachineBasicBlock &MBB : Fn) { LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n"); @@ -168,14 +174,93 @@ bool RISCVLoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) { return false; } -// Merge two adjacent load/store instructions into a paired instruction -// (LDP/SDP/SWP/LWP) if the effective address is 8-byte aligned in case of -// SWP/LWP 16-byte aligned in case of LDP/SDP. This function selects the -// appropriate paired opcode, verifies that the memory operand is properly -// aligned, and checks that the offset is valid. If all conditions are met, it -// builds and inserts the paired instruction. -bool RISCVLoadStoreOpt::tryConvertToLdStPair( - MachineBasicBlock::iterator First, MachineBasicBlock::iterator Second) { +bool RISCVLoadStoreOpt::tryConvertToXqcilsmLdStPair( + MachineFunction *MF, MachineBasicBlock::iterator First, + MachineBasicBlock::iterator Second) { + unsigned Opc = First->getOpcode(); + if ((Opc != RISCV::LW && Opc != RISCV::SW) || Second->getOpcode() != Opc) + return false; + + const auto &FirstOp1 = First->getOperand(1); + const auto &SecondOp1 = Second->getOperand(1); + const auto &FirstOp2 = First->getOperand(2); + const auto &SecondOp2 = Second->getOperand(2); + + // Require simple reg+imm addressing for both. + if (!FirstOp1.isReg() || !SecondOp1.isReg() || !FirstOp2.isImm() || + !SecondOp2.isImm()) + return false; + + Register Base1 = FirstOp1.getReg(); + Register Base2 = SecondOp1.getReg(); + + if (Base1 != Base2) + return false; + + const MachineMemOperand *MMO = *First->memoperands_begin(); + Align MMOAlign = MMO->getAlign(); + + if (MMOAlign < Align(4)) + return false; + + auto &FirstOp0 = First->getOperand(0); + auto &SecondOp0 = Second->getOperand(0); + + int64_t Off1 = FirstOp2.getImm(); + int64_t Off2 = SecondOp2.getImm(); + + if (Off2 < Off1) { + std::swap(FirstOp0, SecondOp0); + std::swap(Off1, Off2); + } + + Register StartReg = FirstOp0.getReg(); + Register NextReg = SecondOp0.getReg(); + + if (StartReg == RISCV::X0 || NextReg == RISCV::X0) + return false; + + // If the base reg gets overwritten by one of the loads then bail out. + if (Opc == RISCV::LW && (StartReg == Base1 || NextReg == Base1)) + return false; + + if (!isShiftedUInt<5, 2>(Off1) || (Off2 - Off1 != 4)) + return false; + + if (NextReg != StartReg + 1) + return false; + + unsigned XqciOpc = (Opc == RISCV::LW) ? RISCV::QC_LWMI : RISCV::QC_SWMI; + + auto StartRegState = (Opc == RISCV::LW) ? RegState::Define + : getKillRegState(FirstOp0.isKill()); + auto NextRegState = + (Opc == RISCV::LW) + ? RegState::ImplicitDefine + : (RegState::Implicit | getKillRegState(SecondOp0.isKill())); + + DebugLoc DL = + First->getDebugLoc() ? 
First->getDebugLoc() : Second->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(*MF, DL, TII->get(XqciOpc)); + MIB.addReg(StartReg, StartRegState) + .addReg(Base1, getKillRegState(FirstOp1.isKill() || SecondOp1.isKill())) + .addImm(2) + .addImm(Off1) + .cloneMergedMemRefs({&*First, &*Second}) + .addReg(NextReg, NextRegState); + + First->getParent()->insert(First, MIB); + First->removeFromParent(); + Second->removeFromParent(); + + return true; +} + +bool RISCVLoadStoreOpt::tryConvertToMIPSLdStPair( + MachineFunction *MF, MachineBasicBlock::iterator First, + MachineBasicBlock::iterator Second) { + // Try converting to SWP/LWP/LDP/SDP. + // SWP/LWP requires 8-byte alignment whereas LDP/SDP needs 16-byte alignment. unsigned PairOpc; Align RequiredAlignment; switch (First->getOpcode()) { @@ -199,7 +284,6 @@ bool RISCVLoadStoreOpt::tryConvertToLdStPair( break; } - MachineFunction *MF = First->getMF(); const MachineMemOperand *MMO = *First->memoperands_begin(); Align MMOAlign = MMO->getAlign(); @@ -227,6 +311,24 @@ return true; } +// Merge two adjacent load/store instructions into a paired instruction. +// This function calls the vendor-specific implementation that selects the +// appropriate paired opcode, verifies that the memory operand is properly +// aligned, and checks that the offset is valid. If all conditions are met, it +// builds and inserts the paired instruction. +bool RISCVLoadStoreOpt::tryConvertToLdStPair( + MachineBasicBlock::iterator First, MachineBasicBlock::iterator Second) { + MachineFunction *MF = First->getMF(); + const RISCVSubtarget &STI = MF->getSubtarget(); + + // Try converting to QC_LWMI/QC_SWMI if the XQCILSM extension is enabled. + if (!STI.is64Bit() && STI.hasVendorXqcilsm()) + return tryConvertToXqcilsmLdStPair(MF, First, Second); + + // Else try to convert them into MIPS Paired Loads/Stores. + return tryConvertToMIPSLdStPair(MF, First, Second); +} + static bool mayAlias(MachineInstr &MIa, SmallVectorImpl &MemInsns, AliasAnalysis *AA) { diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index e5ba0201c0cc1..b111909fc25cc 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -1154,10 +1154,63 @@ static unsigned getNumSizeComponents(SPIRVType *imgType) { return arrayed ?
numComps + 1 : numComps; } +static bool builtinMayNeedPromotionToVec(uint32_t BuiltinNumber) { + switch (BuiltinNumber) { + case SPIRV::OpenCLExtInst::s_min: + case SPIRV::OpenCLExtInst::u_min: + case SPIRV::OpenCLExtInst::s_max: + case SPIRV::OpenCLExtInst::u_max: + case SPIRV::OpenCLExtInst::fmax: + case SPIRV::OpenCLExtInst::fmin: + case SPIRV::OpenCLExtInst::fmax_common: + case SPIRV::OpenCLExtInst::fmin_common: + case SPIRV::OpenCLExtInst::s_clamp: + case SPIRV::OpenCLExtInst::fclamp: + case SPIRV::OpenCLExtInst::u_clamp: + case SPIRV::OpenCLExtInst::mix: + case SPIRV::OpenCLExtInst::step: + case SPIRV::OpenCLExtInst::smoothstep: + return true; + default: + break; + } + return false; +} + //===----------------------------------------------------------------------===// // Implementation functions for each builtin group //===----------------------------------------------------------------------===// +static SmallVector +getBuiltinCallArguments(const SPIRV::IncomingCall *Call, uint32_t BuiltinNumber, + MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR) { + + Register ReturnTypeId = GR->getSPIRVTypeID(Call->ReturnType); + unsigned ResultElementCount = + GR->getScalarOrVectorComponentCount(ReturnTypeId); + bool MayNeedPromotionToVec = + builtinMayNeedPromotionToVec(BuiltinNumber) && ResultElementCount > 1; + + if (!MayNeedPromotionToVec) + return {Call->Arguments.begin(), Call->Arguments.end()}; + + SmallVector Arguments; + for (Register Argument : Call->Arguments) { + Register VecArg = Argument; + SPIRVType *ArgumentType = GR->getSPIRVTypeForVReg(Argument); + if (ArgumentType != Call->ReturnType) { + VecArg = createVirtualRegister(Call->ReturnType, GR, MIRBuilder); + auto VecSplat = MIRBuilder.buildInstr(SPIRV::OpCompositeConstruct) + .addDef(VecArg) + .addUse(ReturnTypeId); + for (unsigned I = 0; I != ResultElementCount; ++I) + VecSplat.addUse(Argument); + } + Arguments.push_back(VecArg); + } + return Arguments; +} + static bool generateExtInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR, const CallBase &CB) { @@ -1179,16 +1232,21 @@ static bool generateExtInst(const SPIRV::IncomingCall *Call, : SPIRV::OpenCLExtInst::fmax; } + Register ReturnTypeId = GR->getSPIRVTypeID(Call->ReturnType); + SmallVector Arguments = + getBuiltinCallArguments(Call, Number, MIRBuilder, GR); + // Build extended instruction. 
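// (Editorial illustration, not part of the patch: by this point Arguments has
// been vector-promoted by getBuiltinCallArguments, so a mixed call such as
//   %r = call <2 x float> @fmax(<2 x float> %v, float %s)
// reaches OpExtInst with the scalar splatted first:
//   %splat = OpCompositeConstruct %v2float %s %s
//   %r = OpExtInst %v2float %ext_set fmax %v %splat)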
auto MIB = MIRBuilder.buildInstr(SPIRV::OpExtInst) .addDef(Call->ReturnRegister) - .addUse(GR->getSPIRVTypeID(Call->ReturnType)) + .addUse(ReturnTypeId) .addImm(static_cast(SPIRV::InstructionSet::OpenCL_std)) .addImm(Number); - for (auto Argument : Call->Arguments) + for (Register Argument : Arguments) MIB.addUse(Argument); + MIB.getInstr()->copyIRFlags(CB); if (OrigNumber == SPIRV::OpenCLExtInst::fmin_common || OrigNumber == SPIRV::OpenCLExtInst::fmax_common) { diff --git a/llvm/lib/Target/SPIRV/SPIRVRegularizer.cpp b/llvm/lib/Target/SPIRV/SPIRVRegularizer.cpp index 1b95f09974c61..653c9ad53e888 100644 --- a/llvm/lib/Target/SPIRV/SPIRVRegularizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVRegularizer.cpp @@ -12,11 +12,10 @@ //===----------------------------------------------------------------------===// #include "SPIRV.h" -#include "llvm/Demangle/Demangle.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/InstIterator.h" -#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" -#include "llvm/Transforms/Utils/Cloning.h" #include @@ -25,9 +24,7 @@ using namespace llvm; namespace { -struct SPIRVRegularizer : public FunctionPass, InstVisitor { - DenseMap Old2NewFuncs; - +struct SPIRVRegularizer : public FunctionPass { public: static char ID; SPIRVRegularizer() : FunctionPass(ID) {} @@ -37,11 +34,8 @@ struct SPIRVRegularizer : public FunctionPass, InstVisitor { void getAnalysisUsage(AnalysisUsage &AU) const override { FunctionPass::getAnalysisUsage(AU); } - void visitCallInst(CallInst &CI); private: - void visitCallScalToVec(CallInst *CI, StringRef MangledName, - StringRef DemangledName); void runLowerConstExpr(Function &F); }; } // namespace @@ -157,98 +151,8 @@ void SPIRVRegularizer::runLowerConstExpr(Function &F) { } } -// It fixes calls to OCL builtins that accept vector arguments and one of them -// is actually a scalar splat. -void SPIRVRegularizer::visitCallInst(CallInst &CI) { - auto F = CI.getCalledFunction(); - if (!F) - return; - - auto MangledName = F->getName(); - char *NameStr = itaniumDemangle(F->getName().data()); - if (!NameStr) - return; - StringRef DemangledName(NameStr); - - // TODO: add support for other builtins. - if (DemangledName.starts_with("fmin") || DemangledName.starts_with("fmax") || - DemangledName.starts_with("min") || DemangledName.starts_with("max")) - visitCallScalToVec(&CI, MangledName, DemangledName); - free(NameStr); -} - -void SPIRVRegularizer::visitCallScalToVec(CallInst *CI, StringRef MangledName, - StringRef DemangledName) { - // Check if all arguments have the same type - it's simple case. 
- auto Uniform = true; - Type *Arg0Ty = CI->getOperand(0)->getType(); - auto IsArg0Vector = isa(Arg0Ty); - for (unsigned I = 1, E = CI->arg_size(); Uniform && (I != E); ++I) - Uniform = isa(CI->getOperand(I)->getType()) == IsArg0Vector; - if (Uniform) - return; - - auto *OldF = CI->getCalledFunction(); - Function *NewF = nullptr; - auto [It, Inserted] = Old2NewFuncs.try_emplace(OldF); - if (Inserted) { - AttributeList Attrs = CI->getCalledFunction()->getAttributes(); - SmallVector ArgTypes = {OldF->getArg(0)->getType(), Arg0Ty}; - auto *NewFTy = - FunctionType::get(OldF->getReturnType(), ArgTypes, OldF->isVarArg()); - NewF = Function::Create(NewFTy, OldF->getLinkage(), OldF->getName(), - *OldF->getParent()); - ValueToValueMapTy VMap; - auto NewFArgIt = NewF->arg_begin(); - for (auto &Arg : OldF->args()) { - auto ArgName = Arg.getName(); - NewFArgIt->setName(ArgName); - VMap[&Arg] = &(*NewFArgIt++); - } - SmallVector Returns; - CloneFunctionInto(NewF, OldF, VMap, - CloneFunctionChangeType::LocalChangesOnly, Returns); - NewF->setAttributes(Attrs); - It->second = NewF; - } else { - NewF = It->second; - } - assert(NewF); - - // This produces an instruction sequence that implements a splat of - // CI->getOperand(1) to a vector Arg0Ty. However, we use InsertElementInst - // and ShuffleVectorInst to generate the same code as the SPIR-V translator. - // For instance (transcoding/OpMin.ll), this call - // call spir_func <2 x i32> @_Z3minDv2_ii(<2 x i32> , i32 5) - // is translated to - // %8 = OpUndef %v2uint - // %14 = OpConstantComposite %v2uint %uint_1 %uint_10 - // ... - // %10 = OpCompositeInsert %v2uint %uint_5 %8 0 - // %11 = OpVectorShuffle %v2uint %10 %8 0 0 - // %call = OpExtInst %v2uint %1 s_min %14 %11 - auto ConstInt = ConstantInt::get(IntegerType::get(CI->getContext(), 32), 0); - PoisonValue *PVal = PoisonValue::get(Arg0Ty); - Instruction *Inst = InsertElementInst::Create( - PVal, CI->getOperand(1), ConstInt, "", CI->getIterator()); - ElementCount VecElemCount = cast(Arg0Ty)->getElementCount(); - Constant *ConstVec = ConstantVector::getSplat(VecElemCount, ConstInt); - Value *NewVec = - new ShuffleVectorInst(Inst, PVal, ConstVec, "", CI->getIterator()); - CI->setOperand(1, NewVec); - CI->replaceUsesOfWith(OldF, NewF); - CI->mutateFunctionType(NewF->getFunctionType()); -} - bool SPIRVRegularizer::runOnFunction(Function &F) { runLowerConstExpr(F); - visit(F); - for (auto &OldNew : Old2NewFuncs) { - Function *OldF = OldNew.first; - Function *NewF = OldNew.second; - NewF->takeName(OldF); - OldF->eraseFromParent(); - } return true; } diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index e7903a72d85bb..9791c1999086b 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1875,8 +1875,8 @@ bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, // For more information see http://people.redhat.com/drepper/tls.pdf if (isNullConstant(Address) && AM.Segment.getNode() == nullptr && !IndirectTlsSegRefs && - (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() || - Subtarget->isTargetFuchsia())) { + (Subtarget->isTargetGlibc() || Subtarget->isTargetMusl() || + Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())) { if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32) return true; switch (N->getPointerInfo().getAddrSpace()) { diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 3b3b20edbbe84..ec746843f8ea8 
100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -33171,7 +33171,14 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE // is enabled. if (VT == MVT::i64) { - if (Subtarget.hasSSE1()) { + SDValue BCValue = peekThroughBitcasts(Node->getVal()); + if (BCValue.getValueType() == MVT::f64 && + (Subtarget.hasX87() || Subtarget.hasSSE2())) { + // If the i64 was bitcast from an f64 then we can do the f64 atomic store + // directly with FSTPL/MOVSD. + Chain = DAG.getStore(Node->getChain(), dl, BCValue, Node->getBasePtr(), + Node->getMemOperand()); + } else if (Subtarget.hasSSE1()) { SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal()); MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32; diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 8db3e501f9b7e..ae9d0a162011f 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -546,8 +546,8 @@ unsigned X86TargetLowering::getAddressSpace() const { } static bool hasStackGuardSlotTLS(const Triple &TargetTriple) { - return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() || - TargetTriple.isAndroid(); + return TargetTriple.isOSGlibc() || TargetTriple.isMusl() || + TargetTriple.isOSFuchsia() || TargetTriple.isAndroid(); } static Constant* SegmentOffset(IRBuilderBase &IRB, diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 868f41375b96b..3b920bc4ef7c1 100644 @@ -293,6 +293,7 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool isTargetKFreeBSD() const { return TargetTriple.isOSKFreeBSD(); } bool isTargetHurd() const { return TargetTriple.isOSHurd(); } bool isTargetGlibc() const { return TargetTriple.isOSGlibc(); } + bool isTargetMusl() const { return TargetTriple.isMusl(); } bool isTargetAndroid() const { return TargetTriple.isAndroid(); } bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); } bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); } diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp index cbcbb5e40fdfa..981c5561211db 100644 --- a/llvm/lib/TargetParser/TargetDataLayout.cpp +++ b/llvm/lib/TargetParser/TargetDataLayout.cpp @@ -247,6 +247,10 @@ static std::string computePowerDataLayout(const Triple &T, StringRef ABIName) { else Ret += "-n32"; + // The ABI alignment for doubles on AIX is 4 bytes. + if (T.isOSAIX()) + Ret += "-f64:32:64"; + // Specify the vector alignment explicitly. For v256i1 and v512i1, the // calculated alignment would be 256*alignment(i1) and 512*alignment(i1), // which is 256 and 512 bytes - way over aligned. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index ba5568b00441b..9cf382f8020fa 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -186,9 +186,9 @@ static unsigned conjugateICmpMask(unsigned Mask) { return NewMask; } -// Adapts the external decomposeBitTestICmp for local use. -static bool decomposeBitTestICmp(Value *Cond, CmpInst::Predicate &Pred, - Value *&X, Value *&Y, Value *&Z) { +// Adapts the external decomposeBitTest for local use.
+static bool decomposeBitTest(Value *Cond, CmpInst::Predicate &Pred, Value *&X, + Value *&Y, Value *&Z) { auto Res = llvm::decomposeBitTest(Cond, /*LookThroughTrunc=*/true, /*AllowNonZeroC=*/true); if (!Res) @@ -220,7 +220,7 @@ getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C, Value *&D, Value *&E, // Check whether the icmp can be decomposed into a bit test. Value *L1, *L11, *L12, *L2, *L21, *L22; - if (decomposeBitTestICmp(LHS, PredL, L11, L12, L2)) { + if (decomposeBitTest(LHS, PredL, L11, L12, L2)) { L21 = L22 = L1 = nullptr; } else { auto *LHSCMP = dyn_cast(LHS); @@ -253,7 +253,7 @@ getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C, Value *&D, Value *&E, return std::nullopt; Value *R11, *R12, *R2; - if (decomposeBitTestICmp(RHS, PredR, R11, R12, R2)) { + if (decomposeBitTest(RHS, PredR, R11, R12, R2)) { if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) { A = R11; D = R12; @@ -3890,7 +3890,7 @@ static std::optional matchBitmaskMul(Value *V) { // Decompose ((A & N) ? 0 : N * C) into BitMaskMul if (match(Op, m_Select(m_Value(Cond), m_APInt(EqZero), m_APInt(NeZero)))) { auto ICmpDecompose = - decomposeBitTest(Cond, /*LookThruTrunc=*/true, + decomposeBitTest(Cond, /*LookThroughTrunc=*/true, /*AllowNonZeroC=*/false, /*DecomposeBitMask=*/true); if (!ICmpDecompose.has_value()) return std::nullopt; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index abf4381ebd794..1859dad4ec00b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -6290,7 +6290,7 @@ Instruction *InstCombinerImpl::foldICmpWithTrunc(ICmpInst &ICmp) { // This matches patterns corresponding to tests of the signbit as well as: // (trunc X) pred C2 --> (X & Mask) == C - if (auto Res = decomposeBitTestICmp(Op0, Op1, Pred, /*WithTrunc=*/true, + if (auto Res = decomposeBitTestICmp(Op0, Op1, Pred, /*LookThroughTrunc=*/true, /*AllowNonZeroC=*/true)) { Value *And = Builder.CreateAnd(Res->X, Res->Mask); Constant *C = ConstantInt::get(Res->X->getType(), Res->C); diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt index 4521352e8bcb0..02bf3ef1f765e 100644 --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -265,7 +265,7 @@ function(runtime_default_target) list(APPEND test_targets runtimes-test-depends check-runtimes check-builtins) # The default runtimes target can run tests the default builtins target - list(APPEND ARG_CMAKE_ARGS "-DCOMPILER_RT_FORCE_TEST_BUILTINS_DIR=${LLVM_BINARY_DIR}/runtimes/builtins-bins/") + list(APPEND ARG_CMAKE_ARGS "-DCOMPILER_RT_TEST_BUILTINS_DIR=${LLVM_BINARY_DIR}/runtimes/builtins-bins/") endif() set_enable_per_target_runtime_dir() @@ -376,7 +376,7 @@ function(runtime_register_target name) # If a builtins-${name} target exists, we'll test those builtins # with this runtimes build if(TARGET builtins-${name}) - list(APPEND ARG_CMAKE_ARGS "-DCOMPILER_RT_FORCE_TEST_BUILTINS_DIR=${LLVM_BINARY_DIR}/runtimes/builtins-${name}-bins/") + list(APPEND ARG_CMAKE_ARGS "-DCOMPILER_RT_TEST_BUILTINS_DIR=${LLVM_BINARY_DIR}/runtimes/builtins-${name}-bins/") set(check-builtins-${name} check-builtins) list(APPEND ${name}_test_targets check-builtins-${name}) list(APPEND test_targets check-builtins-${name}) diff --git a/llvm/test/Assembler/ptrtoaddr-invalid-constexpr.ll b/llvm/test/Assembler/ptrtoaddr-invalid-constexpr.ll index 665deff4cd04b..2857f77ff695b 100644 --- 
a/llvm/test/Assembler/ptrtoaddr-invalid-constexpr.ll +++ b/llvm/test/Assembler/ptrtoaddr-invalid-constexpr.ll @@ -51,6 +51,20 @@ @g = global i32 ptrtoaddr (ptr @g to i32) ; DST_NOT_ADDR_SIZE-NEXT: PtrToAddr result must be address width ; DST_NOT_ADDR_SIZE-NEXT: i32 ptrtoaddr (ptr @g to i32) -@g_vec = global <4 x i32> ptrtoaddr (<4 x ptr> to <4 x i32>) -; TODO: Verifier.cpp does not visit ConstantVector/ConstantStruct values -; TODO-DST_NOT_ADDR_SIZE: PtrToAddr result must be address width +@g_vec = global <4 x i32> ptrtoaddr (<4 x ptr> to <4 x i32>) +; DST_NOT_ADDR_SIZE-NEXT: PtrToAddr result must be address width +; DST_NOT_ADDR_SIZE-NEXT: i32 ptrtoaddr (ptr @g_vec to i32) + +;--- dst_not_addr_size_in_inst.ll +; RUN: not llvm-as %t/dst_not_addr_size_in_inst.ll -o /dev/null 2>&1 | FileCheck -check-prefix=DST_NOT_ADDR_SIZE_IN_INST %s --implicit-check-not="error:" +; DST_NOT_ADDR_SIZE_IN_INST: PtrToAddr result must be address width +; DST_NOT_ADDR_SIZE_IN_INST-NEXT: i32 ptrtoaddr (ptr @fn to i32) +define i32 @fn() { + ret i32 ptrtoaddr (ptr @fn to i32) +} + +; DST_NOT_ADDR_SIZE_IN_INST: PtrToAddr result must be address width +; DST_NOT_ADDR_SIZE_IN_INST-NEXT: i32 ptrtoaddr (ptr @fn2 to i32) +define <2 x i32> @fn2() { + ret <2 x i32> +} diff --git a/llvm/test/CodeGen/AArch64/tbi.ll b/llvm/test/CodeGen/AArch64/tbi.ll index 285726a485b87..a7c9b4cddf808 100644 --- a/llvm/test/CodeGen/AArch64/tbi.ll +++ b/llvm/test/CodeGen/AArch64/tbi.ll @@ -7,7 +7,7 @@ ; TBI-NOT: and x ; NO_TBI: and x define i32 @ld_and32(i64 %p) { - %and = and i64 %p, 72057594037927935 + %and = and i64 %p, 1152921504606846975 %cast = inttoptr i64 %and to ptr %load = load i32, ptr %cast ret i32 %load @@ -18,7 +18,7 @@ define i32 @ld_and32(i64 %p) { ; TBI-NOT: and x ; NO_TBI: and x define i32 @ld_and_plus_offset(i64 %p) { - %and = and i64 %p, 72057594037927935 + %and = and i64 %p, 1152921504606846975 %cast = inttoptr i64 %and to ptr %gep = getelementptr i32, ptr %cast, i64 4 %load = load i32, ptr %gep @@ -40,7 +40,7 @@ define i32 @ld_and32_wider(i64 %p) { ; TBI-NOT: and x ; NO_TBI: and x define i64 @ld_and64(i64 %p) { - %and = and i64 %p, 72057594037927935 + %and = and i64 %p, 1152921504606846975 %cast = inttoptr i64 %and to ptr %load = load i64, ptr %cast ret i64 %load @@ -50,7 +50,7 @@ define i64 @ld_and64(i64 %p) { ; TBI-NOT: and x ; NO_TBI: and x define void @st_and32(i64 %p, i32 %v) { - %and = and i64 %p, 72057594037927935 + %and = and i64 %p, 1152921504606846975 %cast = inttoptr i64 %and to ptr store i32 %v, ptr %cast ret void @@ -62,7 +62,7 @@ define void @st_and32(i64 %p, i32 %v) { ; NO_TBI: and x define i32 @ld_ro(i64 %a, i64 %b) { %p = add i64 %a, %b - %and = and i64 %p, 72057594037927935 + %and = and i64 %p, 1152921504606846975 %cast = inttoptr i64 %and to ptr %load = load i32, ptr %cast ret i32 %load @@ -73,7 +73,7 @@ define i32 @ld_ro(i64 %a, i64 %b) { ; TBI-NOT: and x ; NO_TBI: and x define i32 @ld_ro2(i64 %a, i64 %b) { - %and = and i64 %a, 72057594037927935 + %and = and i64 %a, 1152921504606846975 %p = add i64 %and, %b %cast = inttoptr i64 %p to ptr %load = load i32, ptr %cast @@ -85,7 +85,7 @@ define i32 @ld_ro2(i64 %a, i64 %b) { ; TBI-NOT: and x ; NO_TBI: and x define i32 @ld_indirect_and(i64 %r1, i64 %r2) { - %and = and i64 %r1, 72057594037927935 + %and = and i64 %r1, 1152921504606846975 %p = or i64 %and, %r2 %cast = inttoptr i64 %p to ptr %load = load i32, ptr %cast diff --git a/llvm/test/CodeGen/LoongArch/trap.ll b/llvm/test/CodeGen/LoongArch/trap.ll index 15a7ad82bd7a8..d433266b47e47 100644 --- 
a/llvm/test/CodeGen/LoongArch/trap.ll +++ b/llvm/test/CodeGen/LoongArch/trap.ll @@ -10,7 +10,7 @@ declare void @llvm.debugtrap() define void @test_trap() nounwind { ; CHECK-LABEL: test_trap: ; CHECK: # %bb.0: -; CHECK-NEXT: amswap.w $zero, $ra, $zero +; CHECK-NEXT: ud 0 ; CHECK-NEXT: ret tail call void @llvm.trap() ret void diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll index 258ddf60088c1..02994811dc8af 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll @@ -636,7 +636,7 @@ define i32 @test_mix(float %f, i32 signext %i, double %d, i8 signext %c) { ; 32BIT-NEXT: renamable $f0 = nofpexcept FADDS killed renamable $f0, killed renamable $f1, implicit $rm ; 32BIT-NEXT: renamable $f0 = nofpexcept FCTIWZ killed renamable $f0, implicit $rm ; 32BIT-NEXT: STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0) - ; 32BIT-NEXT: renamable $r3 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4, basealign 8) + ; 32BIT-NEXT: renamable $r3 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4) ; 32BIT-NEXT: BLR implicit $lr, implicit $rm, implicit $r3 ; ; 64BIT-LABEL: name: test_mix @@ -655,7 +655,7 @@ define i32 @test_mix(float %f, i32 signext %i, double %d, i8 signext %c) { ; 64BIT-NEXT: renamable $f0 = nofpexcept FADDS killed renamable $f0, killed renamable $f1, implicit $rm ; 64BIT-NEXT: renamable $f0 = nofpexcept FCTIWZ killed renamable $f0, implicit $rm ; 64BIT-NEXT: STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0) - ; 64BIT-NEXT: renamable $x3 = LWZ8 4, %stack.0 :: (load (s32) from %stack.0 + 4, basealign 8) + ; 64BIT-NEXT: renamable $x3 = LWZ8 4, %stack.0 :: (load (s32) from %stack.0 + 4) ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3 entry: %conv = fpext float %f to double @@ -956,11 +956,7 @@ define void @call_test_stackarg_float() { ; 32BIT-NEXT: renamable $f1 = LFS 0, killed renamable $r3 :: (dereferenceable load (s32) from @f) ; 32BIT-NEXT: renamable $f2 = LFD 0, killed renamable $r4 :: (dereferenceable load (s64) from @d) ; 32BIT-NEXT: ADJCALLSTACKDOWN 68, 0, implicit-def dead $r1, implicit $r1 - ; 32BIT-NEXT: STFD renamable $f2, 0, %stack.0 :: (store (s64) into %stack.0) - ; 32BIT-NEXT: STFS renamable $f1, 56, $r1 :: (store (s32) into stack + 56, align 8, basealign 16) - ; 32BIT-NEXT: renamable $r3 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4) - ; 32BIT-NEXT: STW killed renamable $r3, 64, $r1 :: (store (s32) into stack + 64, align 16) - ; 32BIT-NEXT: renamable $r11 = LWZ 0, %stack.0 :: (load (s32) from %stack.0, align 8) + ; 32BIT-NEXT: STFD renamable $f2, 60, $r1 :: (store (s64) into stack + 60, align 4, basealign 16) ; 32BIT-NEXT: $r3 = LI 1 ; 32BIT-NEXT: $r4 = LI 2 ; 32BIT-NEXT: $r5 = LI 3 @@ -969,8 +965,8 @@ define void @call_test_stackarg_float() { ; 32BIT-NEXT: $r8 = LI 6 ; 32BIT-NEXT: $r9 = LI 7 ; 32BIT-NEXT: $r10 = LI 8 - ; 32BIT-NEXT: STW killed renamable $r11, 60, $r1 :: (store (s32) into stack + 60, basealign 16) - ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $f1, implicit $f2, implicit $r2, implicit-def $r1 + ; 32BIT-NEXT: STFS renamable $f1, 56, $r1 :: (store (s32) into stack + 56, align 8, basealign 16) + ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit 
killed $r8, implicit killed $r9, implicit killed $r10, implicit $f1, implicit $f2, implicit $r2, implicit-def $r1 ; 32BIT-NEXT: ADJCALLSTACKUP 68, 0, implicit-def dead $r1, implicit $r1 ; 32BIT-NEXT: BLR implicit $lr, implicit $rm ; @@ -1057,11 +1053,7 @@ define void @call_test_stackarg_float3() { ; 32BIT-NEXT: renamable $r10 = LWZ 0, %stack.0 :: (load (s32) from %stack.0, align 8) ; 32BIT-NEXT: renamable $f2 = LFS 0, killed renamable $r3 :: (dereferenceable load (s32) from @f) ; 32BIT-NEXT: ADJCALLSTACKDOWN 64, 0, implicit-def dead $r1, implicit $r1 - ; 32BIT-NEXT: STFD renamable $f1, 0, %stack.1 :: (store (s64) into %stack.1) ; 32BIT-NEXT: STFS renamable $f2, 60, $r1 :: (store (s32) into stack + 60, basealign 16) - ; 32BIT-NEXT: renamable $r3 = LWZ 4, %stack.1 :: (load (s32) from %stack.1 + 4) - ; 32BIT-NEXT: STW killed renamable $r3, 56, $r1 :: (store (s32) into stack + 56, align 8, basealign 16) - ; 32BIT-NEXT: renamable $r11 = LWZ 0, %stack.1 :: (load (s32) from %stack.1, align 8) ; 32BIT-NEXT: $r3 = LI 1 ; 32BIT-NEXT: $r4 = LI 2 ; 32BIT-NEXT: $r5 = LI 3 @@ -1069,8 +1061,8 @@ define void @call_test_stackarg_float3() { ; 32BIT-NEXT: $r7 = LI 5 ; 32BIT-NEXT: $r8 = LI 6 ; 32BIT-NEXT: $r9 = LI 7 - ; 32BIT-NEXT: STW killed renamable $r11, 52, $r1 :: (store (s32) into stack + 52, basealign 16) - ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $f1, implicit $r10, implicit $f2, implicit $r2, implicit-def $r1 + ; 32BIT-NEXT: STFD renamable $f1, 52, $r1 :: (store (s64) into stack + 52, align 4, basealign 16) + ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit $f1, implicit $r10, implicit $f2, implicit $r2, implicit-def $r1 ; 32BIT-NEXT: ADJCALLSTACKUP 64, 0, implicit-def dead $r1, implicit $r1 ; 32BIT-NEXT: BLR implicit $lr, implicit $rm ; @@ -1372,7 +1364,7 @@ define double @test_fpr_stack(double %d1, double %d2, double %d3, double %d4, do ; 32BIT: bb.0.entry: ; 32BIT-NEXT: liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13 ; 32BIT-NEXT: {{ $}} - ; 32BIT-NEXT: renamable $f0 = LFD 0, %fixed-stack.1 :: (load (s64) from %fixed-stack.1) + ; 32BIT-NEXT: renamable $f0 = LFD 0, %fixed-stack.1 :: (load (s64) from %fixed-stack.1, align 4) ; 32BIT-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm ; 32BIT-NEXT: renamable $f2 = LFS 0, %fixed-stack.2 :: (load (s32) from %fixed-stack.2, align 16) ; 32BIT-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f3, implicit $rm @@ -1449,92 +1441,88 @@ define void @caller_fpr_stack() { ; 32BIT-NEXT: renamable $r3 = LWZtoc @d15, $r2 :: (load (s32) from got) ; 32BIT-NEXT: renamable $r4 = LWZtoc @f14, $r2 :: (load (s32) from got) ; 32BIT-NEXT: renamable $f0 = LFD 0, killed renamable $r3 :: (dereferenceable load (s64) from @d15) - ; 32BIT-NEXT: renamable $r3 = LWZtoc @f16, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $r4 = LWZ 0, killed renamable $r4 :: (dereferenceable load (s32) from @f14) - ; 32BIT-NEXT: renamable $r3 = LWZ 0, killed renamable $r3 :: (dereferenceable load (s32) from @f16) + ; 32BIT-NEXT: renamable $r5 = LWZtoc @f16, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $r3 = LWZ 0, killed renamable $r4 :: (dereferenceable load (s32) from @f14) + ; 
32BIT-NEXT: renamable $r4 = LWZ 0, killed renamable $r5 :: (dereferenceable load (s32) from @f16) ; 32BIT-NEXT: ADJCALLSTACKDOWN 144, 0, implicit-def dead $r1, implicit $r1 - ; 32BIT-NEXT: STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0) ; 32BIT-NEXT: renamable $r5 = LI 0 ; 32BIT-NEXT: renamable $r6 = LIS 16352 - ; 32BIT-NEXT: STW killed renamable $r3, 140, $r1 :: (store (s32) into stack + 140, basealign 16) - ; 32BIT-NEXT: renamable $r3 = LIS 13107 - ; 32BIT-NEXT: STW killed renamable $r4, 128, $r1 :: (store (s32) into stack + 128, align 16) - ; 32BIT-NEXT: renamable $r4 = LIS 16355 ; 32BIT-NEXT: STW killed renamable $r5, 60, $r1 :: (store (s32) into stack + 60, basealign 16) - ; 32BIT-NEXT: renamable $r5 = LIS 26214 + ; 32BIT-NEXT: renamable $r5 = LIS 13107 ; 32BIT-NEXT: STW killed renamable $r6, 56, $r1 :: (store (s32) into stack + 56, align 8, basealign 16) + ; 32BIT-NEXT: renamable $r6 = LIS 16355 + ; 32BIT-NEXT: renamable $r5 = ORI killed renamable $r5, 13107 + ; 32BIT-NEXT: STW killed renamable $r5, 68, $r1 :: (store (s32) into stack + 68, basealign 16) + ; 32BIT-NEXT: renamable $r5 = LIS 26214 + ; 32BIT-NEXT: renamable $r6 = ORI killed renamable $r6, 13107 + ; 32BIT-NEXT: STW killed renamable $r6, 64, $r1 :: (store (s32) into stack + 64, align 16) ; 32BIT-NEXT: renamable $r6 = LIS 16358 - ; 32BIT-NEXT: renamable $r3 = ORI killed renamable $r3, 13107 - ; 32BIT-NEXT: STW killed renamable $r3, 68, $r1 :: (store (s32) into stack + 68, basealign 16) - ; 32BIT-NEXT: renamable $r3 = LIS 39321 - ; 32BIT-NEXT: renamable $r4 = ORI killed renamable $r4, 13107 - ; 32BIT-NEXT: STW killed renamable $r4, 64, $r1 :: (store (s32) into stack + 64, align 16) - ; 32BIT-NEXT: renamable $r4 = LIS 16361 ; 32BIT-NEXT: renamable $r5 = ORI killed renamable $r5, 26214 ; 32BIT-NEXT: STW killed renamable $r5, 76, $r1 :: (store (s32) into stack + 76, basealign 16) - ; 32BIT-NEXT: renamable $r5 = LIS 52428 + ; 32BIT-NEXT: renamable $r5 = LIS 39321 ; 32BIT-NEXT: renamable $r6 = ORI killed renamable $r6, 26214 ; 32BIT-NEXT: STW killed renamable $r6, 72, $r1 :: (store (s32) into stack + 72, align 8, basealign 16) + ; 32BIT-NEXT: renamable $r6 = LIS 16361 + ; 32BIT-NEXT: renamable $r6 = ORI killed renamable $r6, 39321 + ; 32BIT-NEXT: STW killed renamable $r6, 80, $r1 :: (store (s32) into stack + 80, align 16) + ; 32BIT-NEXT: renamable $r6 = LIS 52428 + ; 32BIT-NEXT: renamable $r6 = ORI killed renamable $r6, 52429 + ; 32BIT-NEXT: STW killed renamable $r6, 92, $r1 :: (store (s32) into stack + 92, basealign 16) ; 32BIT-NEXT: renamable $r6 = LIS 16364 - ; 32BIT-NEXT: renamable $r4 = ORI killed renamable $r4, 39321 - ; 32BIT-NEXT: STW killed renamable $r4, 80, $r1 :: (store (s32) into stack + 80, align 16) - ; 32BIT-NEXT: renamable $r4 = LIS 16313 - ; 32BIT-NEXT: renamable $r5 = ORI killed renamable $r5, 52429 - ; 32BIT-NEXT: STW killed renamable $r5, 92, $r1 :: (store (s32) into stack + 92, basealign 16) - ; 32BIT-NEXT: renamable $r5 = LIS 49807 - ; 32BIT-NEXT: renamable $r3 = ORI killed renamable $r3, 39322 - ; 32BIT-NEXT: STW renamable $r3, 84, $r1 :: (store (s32) into stack + 84, basealign 16) + ; 32BIT-NEXT: renamable $r5 = ORI killed renamable $r5, 39322 + ; 32BIT-NEXT: STW renamable $r5, 84, $r1 :: (store (s32) into stack + 84, basealign 16) ; 32BIT-NEXT: renamable $r6 = ORI killed renamable $r6, 52428 ; 32BIT-NEXT: STW killed renamable $r6, 88, $r1 :: (store (s32) into stack + 88, align 8, basealign 16) + ; 32BIT-NEXT: renamable $r6 = LIS 16313 + ; 32BIT-NEXT: STW killed renamable $r5, 100, 
$r1 :: (store (s32) into stack + 100, basealign 16) + ; 32BIT-NEXT: renamable $r5 = LIS 49807 + ; 32BIT-NEXT: renamable $r6 = ORI killed renamable $r6, 39321 + ; 32BIT-NEXT: STW killed renamable $r6, 96, $r1 :: (store (s32) into stack + 96, align 16) ; 32BIT-NEXT: renamable $r6 = LIS 16316 - ; 32BIT-NEXT: STW killed renamable $r3, 100, $r1 :: (store (s32) into stack + 100, basealign 16) - ; 32BIT-NEXT: renamable $r3 = LIS 60293 - ; 32BIT-NEXT: renamable $r4 = ORI killed renamable $r4, 39321 - ; 32BIT-NEXT: STW killed renamable $r4, 96, $r1 :: (store (s32) into stack + 96, align 16) - ; 32BIT-NEXT: renamable $r4 = LIS 16318 ; 32BIT-NEXT: renamable $r5 = ORI killed renamable $r5, 23593 ; 32BIT-NEXT: STW killed renamable $r5, 108, $r1 :: (store (s32) into stack + 108, basealign 16) - ; 32BIT-NEXT: renamable $r5 = LIS 2621 + ; 32BIT-NEXT: renamable $r5 = LIS 60293 ; 32BIT-NEXT: renamable $r6 = ORI killed renamable $r6, 10485 ; 32BIT-NEXT: STW killed renamable $r6, 104, $r1 :: (store (s32) into stack + 104, align 8, basealign 16) + ; 32BIT-NEXT: renamable $r6 = LIS 16318 + ; 32BIT-NEXT: renamable $r5 = ORI killed renamable $r5, 7864 + ; 32BIT-NEXT: STW killed renamable $r5, 116, $r1 :: (store (s32) into stack + 116, basealign 16) + ; 32BIT-NEXT: renamable $r5 = LIS 2621 + ; 32BIT-NEXT: renamable $r6 = ORI killed renamable $r6, 47185 + ; 32BIT-NEXT: STW killed renamable $r6, 112, $r1 :: (store (s32) into stack + 112, align 16) ; 32BIT-NEXT: renamable $r6 = LIS 16320 - ; 32BIT-NEXT: renamable $r3 = ORI killed renamable $r3, 7864 - ; 32BIT-NEXT: STW killed renamable $r3, 116, $r1 :: (store (s32) into stack + 116, basealign 16) - ; 32BIT-NEXT: renamable $r3 = LWZtoc %const.0, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $r4 = ORI killed renamable $r4, 47185 - ; 32BIT-NEXT: STW killed renamable $r4, 112, $r1 :: (store (s32) into stack + 112, align 16) - ; 32BIT-NEXT: renamable $r4 = ORI killed renamable $r5, 28836 - ; 32BIT-NEXT: STW killed renamable $r4, 124, $r1 :: (store (s32) into stack + 124, basealign 16) - ; 32BIT-NEXT: renamable $r4 = ORI killed renamable $r6, 41943 - ; 32BIT-NEXT: STW killed renamable $r4, 120, $r1 :: (store (s32) into stack + 120, align 8, basealign 16) - ; 32BIT-NEXT: renamable $r4 = LWZtoc %const.1, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $r5 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4) - ; 32BIT-NEXT: renamable $r6 = LWZtoc %const.2, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $f2 = LFD 0, killed renamable $r3 :: (load (s64) from constant-pool) - ; 32BIT-NEXT: renamable $r3 = LWZtoc %const.3, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $f3 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool) - ; 32BIT-NEXT: renamable $r4 = LWZtoc %const.4, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $f4 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $r5 = ORI killed renamable $r5, 28836 + ; 32BIT-NEXT: STW killed renamable $r5, 124, $r1 :: (store (s32) into stack + 124, basealign 16) + ; 32BIT-NEXT: renamable $r5 = LWZtoc %const.0, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $r6 = ORI killed renamable $r6, 41943 + ; 32BIT-NEXT: STW killed renamable $r6, 120, $r1 :: (store (s32) into stack + 120, align 8, basealign 16) + ; 32BIT-NEXT: renamable $r6 = LWZtoc %const.1, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $f2 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $r5 = LWZtoc %const.2, $r2 :: (load (s32) 
from got) + ; 32BIT-NEXT: renamable $f3 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $r6 = LWZtoc %const.3, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $f4 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $r5 = LWZtoc %const.4, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $f6 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool) ; 32BIT-NEXT: renamable $r6 = LWZtoc %const.5, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $f6 = LFD 0, killed renamable $r3 :: (load (s64) from constant-pool) - ; 32BIT-NEXT: renamable $r3 = LWZtoc %const.6, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $f7 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool) - ; 32BIT-NEXT: renamable $r4 = LWZtoc %const.7, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $f7 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $r5 = LWZtoc %const.6, $r2 :: (load (s32) from got) ; 32BIT-NEXT: renamable $f8 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool) - ; 32BIT-NEXT: renamable $r6 = LWZtoc %const.8, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $f9 = LFD 0, killed renamable $r3 :: (load (s64) from constant-pool) - ; 32BIT-NEXT: renamable $r3 = LWZtoc %const.9, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $f1 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool) - ; 32BIT-NEXT: renamable $r4 = LWZtoc %const.10, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $f11 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $r6 = LWZtoc %const.7, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $f9 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $r5 = LWZtoc %const.8, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $f1 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $r6 = LWZtoc %const.9, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $f11 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $r5 = LWZtoc %const.10, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $f12 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool) ; 32BIT-NEXT: renamable $r6 = LWZtoc %const.11, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $f12 = LFD 0, killed renamable $r3 :: (load (s64) from constant-pool) - ; 32BIT-NEXT: renamable $f13 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $f13 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool) ; 32BIT-NEXT: renamable $f5 = LFS 0, killed renamable $r6 :: (load (s32) from constant-pool) - ; 32BIT-NEXT: STW killed renamable $r5, 136, $r1 :: (store (s32) into stack + 136, align 8, basealign 16) - ; 32BIT-NEXT: renamable $r3 = LWZ 0, %stack.0 :: (load (s32) from %stack.0, align 8) + ; 32BIT-NEXT: STW killed renamable $r4, 140, $r1 :: (store (s32) into stack + 140, basealign 16) + ; 32BIT-NEXT: STFD killed renamable $f0, 132, $r1 :: (store (s64) into stack + 132, align 4, basealign 16) ; 32BIT-NEXT: $f10 = COPY renamable $f1 - ; 32BIT-NEXT: STW killed renamable $r3, 132, $r1 :: (store (s32) into stack + 132, basealign 16) + ; 32BIT-NEXT: STW killed renamable $r3, 128, $r1 :: (store (s32) into stack + 128, align 16) ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit $f2, 
implicit $f3, implicit $f4, implicit $f5, implicit $f6, implicit $f7, implicit $f8, implicit $f9, implicit killed $f10, implicit $f11, implicit $f12, implicit $f13, implicit $r2, implicit-def $r1, implicit-def dead $f1 ; 32BIT-NEXT: ADJCALLSTACKUP 144, 0, implicit-def dead $r1, implicit $r1 ; 32BIT-NEXT: BLR implicit $lr, implicit $rm @@ -1647,7 +1635,7 @@ define i32 @mix_callee(double %d1, double %d2, double %d3, double %d4, i8 zeroex ; 32BIT-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm ; 32BIT-NEXT: renamable $f0 = nofpexcept FCTIWZ killed renamable $f0, implicit $rm ; 32BIT-NEXT: STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0) - ; 32BIT-NEXT: renamable $r3 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4, basealign 8) + ; 32BIT-NEXT: renamable $r3 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4) ; 32BIT-NEXT: BLR implicit $lr, implicit $rm, implicit $r3 ; ; 64BIT-LABEL: name: mix_callee @@ -1671,7 +1659,7 @@ define i32 @mix_callee(double %d1, double %d2, double %d3, double %d4, i8 zeroex ; 64BIT-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm ; 64BIT-NEXT: renamable $f0 = nofpexcept FCTIWZ killed renamable $f0, implicit $rm ; 64BIT-NEXT: STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0) - ; 64BIT-NEXT: renamable $x3 = LWZ8 4, %stack.0 :: (load (s32) from %stack.0 + 4, basealign 8) + ; 64BIT-NEXT: renamable $x3 = LWZ8 4, %stack.0 :: (load (s32) from %stack.0 + 4) ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3 entry: %add = fadd double %d1, %d2 @@ -1791,7 +1779,7 @@ define void @caller_mix() { ; 32BIT-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f30, implicit $rm ; 32BIT-NEXT: renamable $f0 = nofpexcept FCTIWZ killed renamable $f0, implicit $rm ; 32BIT-NEXT: STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0) - ; 32BIT-NEXT: renamable $r3 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4, basealign 8) + ; 32BIT-NEXT: renamable $r3 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4) ; 32BIT-NEXT: BLR implicit $lr, implicit $rm, implicit $r3 ; ; 64BIT-LABEL: name: mix_floats @@ -1826,7 +1814,7 @@ define void @caller_mix() { ; 64BIT-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f1, killed renamable $f0, implicit $rm ; 64BIT-NEXT: renamable $f0 = nofpexcept FCTIWZ killed renamable $f0, implicit $rm ; 64BIT-NEXT: STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0) - ; 64BIT-NEXT: renamable $x3 = LWZ8 4, %stack.0 :: (load (s32) from %stack.0 + 4, basealign 8) + ; 64BIT-NEXT: renamable $x3 = LWZ8 4, %stack.0 :: (load (s32) from %stack.0 + 4) ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3 entry: %add = add nsw i32 %i1, %i2 diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll index 03770d22d9f4f..5ed0dfb258f73 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll @@ -1012,22 +1012,18 @@ define void @call_test_stackarg_float() { ; ASM32PWR4-NEXT: lwz 3, L..C8(2) # @f ; ASM32PWR4-NEXT: stw 0, 88(1) ; ASM32PWR4-NEXT: li 4, 2 +; ASM32PWR4-NEXT: li 5, 3 ; ASM32PWR4-NEXT: li 6, 4 ; ASM32PWR4-NEXT: li 7, 5 -; ASM32PWR4-NEXT: li 8, 6 ; ASM32PWR4-NEXT: lfs 1, 0(3) ; ASM32PWR4-NEXT: lwz 3, L..C9(2) # @d +; ASM32PWR4-NEXT: li 8, 6 ; ASM32PWR4-NEXT: li 9, 7 -; ASM32PWR4-NEXT: li 10, 8 ; ASM32PWR4-NEXT: lfd 2, 0(3) ; ASM32PWR4-NEXT: li 3, 1 -; ASM32PWR4-NEXT: stfd 2, 72(1) -; 
ASM32PWR4-NEXT: lwz 5, 76(1) -; ASM32PWR4-NEXT: lwz 11, 72(1) -; ASM32PWR4-NEXT: stw 5, 64(1) -; ASM32PWR4-NEXT: li 5, 3 +; ASM32PWR4-NEXT: li 10, 8 +; ASM32PWR4-NEXT: stfd 2, 60(1) ; ASM32PWR4-NEXT: stfs 1, 56(1) -; ASM32PWR4-NEXT: stw 11, 60(1) ; ASM32PWR4-NEXT: bl .test_stackarg_float[PR] ; ASM32PWR4-NEXT: nop ; ASM32PWR4-NEXT: addi 1, 1, 80 @@ -1130,24 +1126,20 @@ define void @call_test_stackarg_float3() { ; ASM32PWR4-NEXT: stwu 1, -80(1) ; ASM32PWR4-NEXT: lwz 3, L..C9(2) # @d ; ASM32PWR4-NEXT: stw 0, 88(1) +; ASM32PWR4-NEXT: li 4, 2 ; ASM32PWR4-NEXT: li 5, 3 ; ASM32PWR4-NEXT: li 6, 4 ; ASM32PWR4-NEXT: li 7, 5 -; ASM32PWR4-NEXT: li 8, 6 ; ASM32PWR4-NEXT: lfd 1, 0(3) ; ASM32PWR4-NEXT: lwz 3, L..C8(2) # @f +; ASM32PWR4-NEXT: li 8, 6 ; ASM32PWR4-NEXT: li 9, 7 ; ASM32PWR4-NEXT: stfd 1, 72(1) +; ASM32PWR4-NEXT: lwz 10, 72(1) ; ASM32PWR4-NEXT: lfs 2, 0(3) ; ASM32PWR4-NEXT: li 3, 1 -; ASM32PWR4-NEXT: stfd 1, 64(1) -; ASM32PWR4-NEXT: lwz 4, 68(1) -; ASM32PWR4-NEXT: lwz 10, 72(1) -; ASM32PWR4-NEXT: lwz 11, 64(1) -; ASM32PWR4-NEXT: stw 4, 56(1) -; ASM32PWR4-NEXT: li 4, 2 ; ASM32PWR4-NEXT: stfs 2, 60(1) -; ASM32PWR4-NEXT: stw 11, 52(1) +; ASM32PWR4-NEXT: stfd 1, 52(1) ; ASM32PWR4-NEXT: bl .test_stackarg_float3[PR] ; ASM32PWR4-NEXT: nop ; ASM32PWR4-NEXT: addi 1, 1, 80 @@ -1570,99 +1562,95 @@ define void @caller_fpr_stack() { ; ASM32PWR4-LABEL: caller_fpr_stack: ; ASM32PWR4: # %bb.0: # %entry ; ASM32PWR4-NEXT: mflr 0 -; ASM32PWR4-NEXT: stwu 1, -160(1) +; ASM32PWR4-NEXT: stwu 1, -144(1) ; ASM32PWR4-NEXT: lwz 3, L..C19(2) # @d15 -; ASM32PWR4-NEXT: stw 0, 168(1) -; ASM32PWR4-NEXT: lwz 5, L..C20(2) # %const.1 -; ASM32PWR4-NEXT: lwz 4, L..C21(2) # @f14 +; ASM32PWR4-NEXT: lwz 4, L..C20(2) # @f14 +; ASM32PWR4-NEXT: lwz 5, L..C21(2) # @f16 +; ASM32PWR4-NEXT: stw 0, 152(1) +; ASM32PWR4-NEXT: lis 6, 16361 +; ASM32PWR4-NEXT: ori 6, 6, 39321 ; ASM32PWR4-NEXT: lfd 0, 0(3) -; ASM32PWR4-NEXT: lwz 3, L..C22(2) # @f16 -; ASM32PWR4-NEXT: lwz 3, 0(3) -; ASM32PWR4-NEXT: stw 3, 140(1) -; ASM32PWR4-NEXT: li 3, 0 -; ASM32PWR4-NEXT: stw 3, 60(1) -; ASM32PWR4-NEXT: lis 3, 16352 -; ASM32PWR4-NEXT: stw 3, 56(1) -; ASM32PWR4-NEXT: lis 3, 13107 -; ASM32PWR4-NEXT: ori 3, 3, 13107 -; ASM32PWR4-NEXT: stw 3, 68(1) -; ASM32PWR4-NEXT: lis 3, 16355 -; ASM32PWR4-NEXT: ori 3, 3, 13107 -; ASM32PWR4-NEXT: stw 3, 64(1) -; ASM32PWR4-NEXT: lis 3, 26214 -; ASM32PWR4-NEXT: ori 3, 3, 26214 -; ASM32PWR4-NEXT: stw 3, 76(1) -; ASM32PWR4-NEXT: lis 3, 16358 -; ASM32PWR4-NEXT: ori 3, 3, 26214 -; ASM32PWR4-NEXT: stw 3, 72(1) -; ASM32PWR4-NEXT: lis 3, -26215 -; ASM32PWR4-NEXT: ori 3, 3, 39322 -; ASM32PWR4-NEXT: stw 3, 84(1) -; ASM32PWR4-NEXT: stw 3, 100(1) -; ASM32PWR4-NEXT: lis 3, 16313 -; ASM32PWR4-NEXT: ori 3, 3, 39321 -; ASM32PWR4-NEXT: stw 3, 96(1) -; ASM32PWR4-NEXT: lis 3, -15729 -; ASM32PWR4-NEXT: ori 3, 3, 23593 -; ASM32PWR4-NEXT: stw 3, 108(1) -; ASM32PWR4-NEXT: lis 3, 16316 -; ASM32PWR4-NEXT: ori 3, 3, 10485 -; ASM32PWR4-NEXT: stw 3, 104(1) -; ASM32PWR4-NEXT: lis 3, -5243 -; ASM32PWR4-NEXT: ori 3, 3, 7864 -; ASM32PWR4-NEXT: stw 3, 116(1) -; ASM32PWR4-NEXT: lis 3, 16318 -; ASM32PWR4-NEXT: ori 3, 3, 47185 -; ASM32PWR4-NEXT: stw 3, 112(1) -; ASM32PWR4-NEXT: lis 3, 2621 -; ASM32PWR4-NEXT: ori 3, 3, 28836 -; ASM32PWR4-NEXT: stw 3, 124(1) -; ASM32PWR4-NEXT: lis 3, 16320 -; ASM32PWR4-NEXT: ori 3, 3, 41943 -; ASM32PWR4-NEXT: stw 3, 120(1) -; ASM32PWR4-NEXT: lwz 3, L..C23(2) # %const.0 -; ASM32PWR4-NEXT: lfd 2, 0(3) -; ASM32PWR4-NEXT: lwz 3, L..C24(2) # %const.2 +; ASM32PWR4-NEXT: lwz 3, 0(4) +; ASM32PWR4-NEXT: lwz 4, 0(5) +; ASM32PWR4-NEXT: li 5, 0 
+; ASM32PWR4-NEXT: stw 5, 60(1) +; ASM32PWR4-NEXT: lis 5, 16352 +; ASM32PWR4-NEXT: stw 5, 56(1) +; ASM32PWR4-NEXT: lis 5, 13107 +; ASM32PWR4-NEXT: ori 5, 5, 13107 +; ASM32PWR4-NEXT: stw 5, 68(1) +; ASM32PWR4-NEXT: lis 5, 16355 +; ASM32PWR4-NEXT: ori 5, 5, 13107 +; ASM32PWR4-NEXT: stw 5, 64(1) +; ASM32PWR4-NEXT: lis 5, 26214 +; ASM32PWR4-NEXT: ori 5, 5, 26214 +; ASM32PWR4-NEXT: stw 5, 76(1) +; ASM32PWR4-NEXT: lis 5, 16358 +; ASM32PWR4-NEXT: ori 5, 5, 26214 +; ASM32PWR4-NEXT: stw 5, 72(1) +; ASM32PWR4-NEXT: lis 5, -26215 +; ASM32PWR4-NEXT: ori 5, 5, 39322 +; ASM32PWR4-NEXT: stw 5, 84(1) +; ASM32PWR4-NEXT: stw 5, 100(1) +; ASM32PWR4-NEXT: lis 5, 16313 +; ASM32PWR4-NEXT: ori 5, 5, 39321 +; ASM32PWR4-NEXT: stw 5, 96(1) +; ASM32PWR4-NEXT: lis 5, -15729 +; ASM32PWR4-NEXT: ori 5, 5, 23593 +; ASM32PWR4-NEXT: stw 5, 108(1) +; ASM32PWR4-NEXT: lis 5, 16316 +; ASM32PWR4-NEXT: ori 5, 5, 10485 +; ASM32PWR4-NEXT: stw 5, 104(1) +; ASM32PWR4-NEXT: lis 5, -5243 +; ASM32PWR4-NEXT: ori 5, 5, 7864 +; ASM32PWR4-NEXT: stw 5, 116(1) +; ASM32PWR4-NEXT: lis 5, 16318 +; ASM32PWR4-NEXT: ori 5, 5, 47185 +; ASM32PWR4-NEXT: stw 6, 80(1) +; ASM32PWR4-NEXT: lis 6, -13108 +; ASM32PWR4-NEXT: ori 6, 6, 52429 +; ASM32PWR4-NEXT: stw 5, 112(1) +; ASM32PWR4-NEXT: lis 5, 2621 +; ASM32PWR4-NEXT: ori 5, 5, 28836 +; ASM32PWR4-NEXT: stw 6, 92(1) +; ASM32PWR4-NEXT: lis 6, 16364 +; ASM32PWR4-NEXT: ori 6, 6, 52428 +; ASM32PWR4-NEXT: stw 5, 124(1) +; ASM32PWR4-NEXT: lis 5, 16320 +; ASM32PWR4-NEXT: ori 5, 5, 41943 +; ASM32PWR4-NEXT: stw 6, 88(1) +; ASM32PWR4-NEXT: lwz 6, L..C22(2) # %const.0 +; ASM32PWR4-NEXT: stw 5, 120(1) +; ASM32PWR4-NEXT: lwz 5, L..C23(2) # %const.1 +; ASM32PWR4-NEXT: lfd 2, 0(6) +; ASM32PWR4-NEXT: lwz 6, L..C24(2) # %const.2 ; ASM32PWR4-NEXT: lfd 3, 0(5) ; ASM32PWR4-NEXT: lwz 5, L..C25(2) # %const.3 -; ASM32PWR4-NEXT: lfd 4, 0(3) -; ASM32PWR4-NEXT: lwz 3, L..C26(2) # %const.4 +; ASM32PWR4-NEXT: lfd 4, 0(6) +; ASM32PWR4-NEXT: lwz 6, L..C26(2) # %const.4 ; ASM32PWR4-NEXT: lfd 6, 0(5) ; ASM32PWR4-NEXT: lwz 5, L..C27(2) # %const.5 -; ASM32PWR4-NEXT: lwz 4, 0(4) -; ASM32PWR4-NEXT: lfd 7, 0(3) -; ASM32PWR4-NEXT: lwz 3, L..C28(2) # %const.6 +; ASM32PWR4-NEXT: lfd 7, 0(6) +; ASM32PWR4-NEXT: lwz 6, L..C28(2) # %const.6 ; ASM32PWR4-NEXT: lfd 8, 0(5) ; ASM32PWR4-NEXT: lwz 5, L..C29(2) # %const.7 -; ASM32PWR4-NEXT: stw 4, 128(1) -; ASM32PWR4-NEXT: lis 4, 16361 -; ASM32PWR4-NEXT: ori 4, 4, 39321 -; ASM32PWR4-NEXT: lfd 9, 0(3) -; ASM32PWR4-NEXT: lwz 3, L..C30(2) # %const.8 +; ASM32PWR4-NEXT: lfd 9, 0(6) +; ASM32PWR4-NEXT: lwz 6, L..C30(2) # %const.8 ; ASM32PWR4-NEXT: lfd 1, 0(5) ; ASM32PWR4-NEXT: lwz 5, L..C31(2) # %const.9 -; ASM32PWR4-NEXT: stw 4, 80(1) -; ASM32PWR4-NEXT: lis 4, -13108 +; ASM32PWR4-NEXT: lfd 11, 0(6) +; ASM32PWR4-NEXT: lwz 6, L..C32(2) # %const.10 ; ASM32PWR4-NEXT: fmr 10, 1 -; ASM32PWR4-NEXT: ori 4, 4, 52429 -; ASM32PWR4-NEXT: lfd 11, 0(3) -; ASM32PWR4-NEXT: lwz 3, L..C32(2) # %const.10 ; ASM32PWR4-NEXT: lfd 12, 0(5) ; ASM32PWR4-NEXT: lwz 5, L..C33(2) # %const.11 -; ASM32PWR4-NEXT: stw 4, 92(1) -; ASM32PWR4-NEXT: lis 4, 16364 -; ASM32PWR4-NEXT: ori 4, 4, 52428 -; ASM32PWR4-NEXT: stfd 0, 152(1) -; ASM32PWR4-NEXT: stw 4, 88(1) -; ASM32PWR4-NEXT: lwz 4, 156(1) -; ASM32PWR4-NEXT: lfd 13, 0(3) +; ASM32PWR4-NEXT: lfd 13, 0(6) ; ASM32PWR4-NEXT: lfs 5, 0(5) -; ASM32PWR4-NEXT: lwz 3, 152(1) -; ASM32PWR4-NEXT: stw 4, 136(1) -; ASM32PWR4-NEXT: stw 3, 132(1) +; ASM32PWR4-NEXT: stfd 0, 132(1) +; ASM32PWR4-NEXT: stw 4, 140(1) +; ASM32PWR4-NEXT: stw 3, 128(1) ; ASM32PWR4-NEXT: bl .test_fpr_stack ; ASM32PWR4-NEXT: nop -; 
ASM32PWR4-NEXT: addi 1, 1, 160 +; ASM32PWR4-NEXT: addi 1, 1, 144 ; ASM32PWR4-NEXT: lwz 0, 8(1) ; ASM32PWR4-NEXT: mtlr 0 ; ASM32PWR4-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable.ll b/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable.ll index 2827155dc1845..8c4f81b65144e 100644 --- a/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable.ll +++ b/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable.ll @@ -160,7 +160,7 @@ entry: ; CHECK-ASM-LABEL: .main:{{[[:space:]] *}}# %bb.0: ; CHECK-FUNC-LABEL: .csect .main[PR],5{{[[:space:]] *}}# %bb.0 ; COMMON-NEXT: mflr 0 -; COMMON: stw 0, 168(1) +; COMMON: stw 0, 152(1) ; COMMON: mtlr 0 ; COMMON-NEXT: blr ; COMMON-NEXT: L..main0: diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-data.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-data.ll index 468303d8d9cbc..88db5cd1c8af0 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-data.ll +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-data.ll @@ -29,8 +29,8 @@ %struct.anon = type <{ i32, double }> @astruct = global [1 x %struct.anon] [%struct.anon <{ i32 1, double 7.000000e+00 }>], align 1 -%struct.anon2 = type { double, i32 } -@bstruct = global [1 x %struct.anon2] [%struct.anon2 { double 7.000000e+00 , i32 1}], align 8 +%struct.anon2 = type { double, i32, [4 x i8] } +@bstruct = global [1 x %struct.anon2] [%struct.anon2 { double 7.000000e+00 , i32 1, [4 x i8] undef }], align 8 @a = common global i32 0, align 4 @b = common global i64 0, align 8 diff --git a/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg-mir.ll b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg-mir.ll index 682c2b7afe34d..7218c814b30b8 100644 --- a/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg-mir.ll +++ b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg-mir.ll @@ -114,24 +114,18 @@ define double @double_va_arg(double %a, ...) 
local_unnamed_addr { ; CHECK: bb.0.entry: ; CHECK-NEXT: liveins: $f1, $r5, $r6, $r7, $r8, $r9, $r10 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $r3 = ADDI %fixed-stack.0, 0 + ; CHECK-NEXT: STW killed renamable $r5, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 16) + ; CHECK-NEXT: STW killed renamable $r6, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4) ; CHECK-NEXT: STW killed renamable $r7, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8, align 8) - ; CHECK-NEXT: STW renamable $r5, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 16) - ; CHECK-NEXT: STW renamable $r6, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4) ; CHECK-NEXT: STW killed renamable $r8, 12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12) + ; CHECK-NEXT: renamable $f0 = LFD 0, %fixed-stack.0 :: (load (s64) from %ir.argp.cur2, align 16) ; CHECK-NEXT: STW killed renamable $r9, 16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16, align 16) ; CHECK-NEXT: STW killed renamable $r10, 20, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 20) - ; CHECK-NEXT: STW renamable $r3, 0, %stack.0.arg1 :: (store (s32) into %ir.arg1) - ; CHECK-NEXT: STW killed renamable $r3, 0, %stack.1.arg2 :: (store (s32) into %ir.arg2) - ; CHECK-NEXT: STW renamable $r5, 0, %stack.2 :: (store (s32) into %stack.2, align 8) - ; CHECK-NEXT: STW renamable $r6, 4, %stack.2 :: (store (s32) into %stack.2 + 4) - ; CHECK-NEXT: renamable $f0 = LFD 0, %stack.2 :: (load (s64) from %stack.2) - ; CHECK-NEXT: STW killed renamable $r5, 0, %stack.3 :: (store (s32) into %stack.3, align 8) - ; CHECK-NEXT: STW killed renamable $r6, 4, %stack.3 :: (store (s32) into %stack.3 + 4) - ; CHECK-NEXT: renamable $f2 = LFD 0, %stack.3 :: (load (s64) from %stack.3) - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm - ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f2, renamable $f2, implicit $rm - ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD renamable $f0, killed renamable $f1, implicit $rm + ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, renamable $f0, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f0, implicit $rm + ; CHECK-NEXT: renamable $r3 = ADDI %fixed-stack.0, 0 + ; CHECK-NEXT: STW killed renamable $r3, 0, %stack.0.arg1 :: (store (s32) into %ir.arg1) ; CHECK-NEXT: BLR implicit $lr, implicit $rm, implicit $f1 entry: %arg1 = alloca ptr, align 4 @@ -163,31 +157,24 @@ define double @double_stack_va_arg(double %one, double %two, double %three, doub ; CHECK: bb.0.entry: ; CHECK-NEXT: liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13 ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $f0 = LFD 0, %fixed-stack.0 :: (load (s64) from %ir.argp.cur142, align 16) + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f3, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f4, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f5, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f6, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable 
$f1, killed renamable $f7, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f8, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f9, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f10, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f11, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f12, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f13, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, renamable $f0, implicit $rm + ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, renamable $f0, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f0, implicit $rm ; CHECK-NEXT: renamable $r3 = ADDI %fixed-stack.0, 0 ; CHECK-NEXT: STW killed renamable $r3, 0, %stack.0.arg1 :: (store (s32) into %ir.arg1) - ; CHECK-NEXT: renamable $r3 = LWZ 0, %fixed-stack.0 :: (load (s32) from %ir.argp.cur142, align 16) - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f3, implicit $rm - ; CHECK-NEXT: STW renamable $r3, 0, %stack.2 :: (store (s32) into %stack.2, align 8) - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f4, implicit $rm - ; CHECK-NEXT: renamable $r4 = LWZ 4, %fixed-stack.0 :: (load (s32) from %ir.argp.cur142 + 4) - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f5, implicit $rm - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f6, implicit $rm - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f7, implicit $rm - ; CHECK-NEXT: STW renamable $r4, 4, %stack.2 :: (store (s32) into %stack.2 + 4) - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f8, implicit $rm - ; CHECK-NEXT: renamable $f1 = LFD 0, %stack.2 :: (load (s64) from %stack.2) - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f9, implicit $rm - ; CHECK-NEXT: STW killed renamable $r3, 0, %stack.3 :: (store (s32) into %stack.3, align 8) - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f10, implicit $rm - ; CHECK-NEXT: STW killed renamable $r4, 4, %stack.3 :: (store (s32) into %stack.3 + 4) - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f11, implicit $rm - ; CHECK-NEXT: renamable $f2 = LFD 0, %stack.3 :: (load (s64) from %stack.3) - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f12, implicit $rm - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f13, implicit $rm - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm - ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f2, renamable $f2, implicit $rm - ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm ; CHECK-NEXT: BLR implicit $lr, implicit $rm, implicit $f1 entry: %arg1 = alloca ptr, align 4 diff --git a/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll 
b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll index 9cf1e45607042..30727b8d4fe94 100644 --- a/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll +++ b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll @@ -108,24 +108,18 @@ entry: define double @double_va_arg(double %a, ...) local_unnamed_addr { ; CHECK-LABEL: double_va_arg: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: stw 5, -16(1) -; CHECK-NEXT: addi 3, 1, 32 -; CHECK-NEXT: stw 6, -12(1) -; CHECK-NEXT: lfd 0, -16(1) -; CHECK-NEXT: stw 5, -24(1) -; CHECK-NEXT: fadd 0, 0, 1 -; CHECK-NEXT: stw 6, -20(1) -; CHECK-NEXT: lfd 1, -24(1) -; CHECK-NEXT: fadd 1, 1, 1 -; CHECK-NEXT: stw 7, 40(1) -; CHECK-NEXT: fadd 1, 0, 1 ; CHECK-NEXT: stw 5, 32(1) +; CHECK-NEXT: addi 3, 1, 32 ; CHECK-NEXT: stw 6, 36(1) +; CHECK-NEXT: lfd 0, 32(1) +; CHECK-NEXT: fadd 1, 0, 1 +; CHECK-NEXT: fadd 0, 0, 0 +; CHECK-NEXT: stw 7, 40(1) ; CHECK-NEXT: stw 8, 44(1) +; CHECK-NEXT: fadd 1, 1, 0 ; CHECK-NEXT: stw 9, 48(1) ; CHECK-NEXT: stw 10, 52(1) ; CHECK-NEXT: stw 3, -4(1) -; CHECK-NEXT: stw 3, -8(1) ; CHECK-NEXT: blr entry: %arg1 = alloca ptr, align 4 @@ -155,31 +149,24 @@ entry: define double @double_stack_va_arg(double %one, double %two, double %three, double %four, double %five, double %six, double %seven, double %eight, double %nine, double %ten, double %eleven, double %twelve, double %thirteen, ...) local_unnamed_addr { ; CHECK-LABEL: double_stack_va_arg: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fadd 0, 1, 2 +; CHECK-NEXT: fadd 1, 1, 2 +; CHECK-NEXT: lfd 0, 128(1) ; CHECK-NEXT: addi 3, 1, 128 -; CHECK-NEXT: lwz 4, 132(1) -; CHECK-NEXT: fadd 0, 0, 3 +; CHECK-NEXT: fadd 1, 1, 3 ; CHECK-NEXT: stw 3, -4(1) -; CHECK-NEXT: fadd 0, 0, 4 -; CHECK-NEXT: lwz 3, 128(1) -; CHECK-NEXT: fadd 0, 0, 5 -; CHECK-NEXT: stw 3, -16(1) -; CHECK-NEXT: fadd 0, 0, 6 -; CHECK-NEXT: stw 4, -12(1) -; CHECK-NEXT: fadd 0, 0, 7 -; CHECK-NEXT: lfd 1, -16(1) -; CHECK-NEXT: fadd 0, 0, 8 -; CHECK-NEXT: stw 3, -24(1) -; CHECK-NEXT: fadd 0, 0, 9 -; CHECK-NEXT: stw 4, -20(1) -; CHECK-NEXT: fadd 0, 0, 10 -; CHECK-NEXT: fadd 0, 0, 11 -; CHECK-NEXT: fadd 0, 0, 12 -; CHECK-NEXT: fadd 0, 0, 13 -; CHECK-NEXT: fadd 0, 0, 1 -; CHECK-NEXT: lfd 1, -24(1) -; CHECK-NEXT: fadd 1, 1, 1 -; CHECK-NEXT: fadd 1, 0, 1 +; CHECK-NEXT: fadd 1, 1, 4 +; CHECK-NEXT: fadd 1, 1, 5 +; CHECK-NEXT: fadd 1, 1, 6 +; CHECK-NEXT: fadd 1, 1, 7 +; CHECK-NEXT: fadd 1, 1, 8 +; CHECK-NEXT: fadd 1, 1, 9 +; CHECK-NEXT: fadd 1, 1, 10 +; CHECK-NEXT: fadd 1, 1, 11 +; CHECK-NEXT: fadd 1, 1, 12 +; CHECK-NEXT: fadd 1, 1, 13 +; CHECK-NEXT: fadd 1, 1, 0 +; CHECK-NEXT: fadd 0, 0, 0 +; CHECK-NEXT: fadd 1, 1, 0 ; CHECK-NEXT: blr entry: %arg1 = alloca ptr, align 4 diff --git a/llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg-mir.ll b/llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg-mir.ll index dc62e18378e72..af13552ed5949 100644 --- a/llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg-mir.ll +++ b/llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg-mir.ll @@ -113,10 +113,10 @@ define double @double_va_arg(double %a, ...) 
local_unnamed_addr { ; CHECK-NEXT: renamable $x5 = ADDI8 %fixed-stack.0, 8 ; CHECK-NEXT: STD killed renamable $x3, 0, %stack.0.arg1 :: (store (s64) into %ir.arg1) ; CHECK-NEXT: STD killed renamable $x5, 0, %stack.0.arg1 :: (store (s64) into %ir.arg1) - ; CHECK-NEXT: renamable $f0 = LFD 0, %fixed-stack.0 :: (load (s64)) + ; CHECK-NEXT: renamable $f0 = LFD 0, %fixed-stack.0 :: (load (s64) from %fixed-stack.0) ; CHECK-NEXT: renamable $x3 = ADDI8 renamable $x4, 8 ; CHECK-NEXT: STD killed renamable $x3, 0, %stack.1.arg2 :: (store (s64) into %ir.arg2) - ; CHECK-NEXT: renamable $f2 = LFD 0, killed renamable $x4 :: (load (s64)) + ; CHECK-NEXT: renamable $f2 = LFD 0, killed renamable $x4 :: (load (s64), align 4) ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f2, renamable $f2, implicit $rm ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm @@ -145,7 +145,7 @@ define double @double_stack_va_arg(double %one, double %two, double %three, doub ; CHECK: bb.0.entry: ; CHECK-NEXT: liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $f0 = LFD 0, %fixed-stack.0 :: (load (s64)) + ; CHECK-NEXT: renamable $f0 = LFD 0, %fixed-stack.0 :: (load (s64) from %fixed-stack.0) ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f3, implicit $rm ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f4, implicit $rm diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll index cd59aa03597e2..1e1110f0a30b8 100644 --- a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll @@ -638,6 +638,60 @@ define void @test_psslai_h(ptr %ret_ptr, ptr %a_ptr) { ret void } +; Test logical shift right immediate +define void @test_psrli_h(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrli_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: psrli.h a1, a1, 2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %res = lshr <2 x i16> %a, splat(i16 2) + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psrli_b(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrli_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: psrli.b a1, a1, 2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %res = lshr <4 x i8> %a, splat(i8 2) + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test arithmetic shift right immediate +define void @test_psrai_h(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrai_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: psrai.h a1, a1, 2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %res = ashr <2 x i16> %a, splat(i16 2) + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psrai_b(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrai_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: psrai.b a1, a1, 2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %res = ashr <4 x i8> %a, splat(i8 2) + store <4 x i8> %res, ptr %ret_ptr + ret void +} + ; Test logical shift left(scalar shamt) define void @test_psll_hs(ptr %ret_ptr, ptr 
%a_ptr, i16 %shamt) { ; CHECK-LABEL: test_psll_hs: @@ -746,3 +800,243 @@ define void @test_psll_bs_vec_shamt(ptr %ret_ptr, ptr %a_ptr, ptr %shamt_ptr) { store <4 x i8> %res, ptr %ret_ptr ret void } + +; Test logical shift right(scalar shamt) +define void @test_psrl_hs(ptr %ret_ptr, ptr %a_ptr, i16 %shamt) { +; CHECK-LABEL: test_psrl_hs: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: psrl.hs a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %insert = insertelement <2 x i16> poison, i16 %shamt, i32 0 + %b = shufflevector <2 x i16> %insert, <2 x i16> poison, <2 x i32> zeroinitializer + %res = lshr <2 x i16> %a, %b + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psrl_bs(ptr %ret_ptr, ptr %a_ptr, i8 %shamt) { +; CHECK-LABEL: test_psrl_bs: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: psrl.bs a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %insert = insertelement <4 x i8> poison, i8 %shamt, i32 0 + %b = shufflevector <4 x i8> %insert, <4 x i8> poison, <4 x i32> zeroinitializer + %res = lshr <4 x i8> %a, %b + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test arithmetic shift right(scalar shamt) +define void @test_psra_hs(ptr %ret_ptr, ptr %a_ptr, i16 %shamt) { +; CHECK-LABEL: test_psra_hs: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: psra.hs a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %insert = insertelement <2 x i16> poison, i16 %shamt, i32 0 + %b = shufflevector <2 x i16> %insert, <2 x i16> poison, <2 x i32> zeroinitializer + %res = ashr <2 x i16> %a, %b + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psra_bs(ptr %ret_ptr, ptr %a_ptr, i8 %shamt) { +; CHECK-LABEL: test_psra_bs: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: psra.bs a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %insert = insertelement <4 x i8> poison, i8 %shamt, i32 0 + %b = shufflevector <4 x i8> %insert, <4 x i8> poison, <4 x i32> zeroinitializer + %res = ashr <4 x i8> %a, %b + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test logical shift right(vector shamt) +define void @test_psrl_hs_vec_shamt(ptr %ret_ptr, ptr %a_ptr, ptr %shamt_ptr) { +; CHECK-RV32-LABEL: test_psrl_hs_vec_shamt: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lw a2, 0(a2) +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: srli a3, a2, 16 +; CHECK-RV32-NEXT: srli a4, a1, 16 +; CHECK-RV32-NEXT: slli a1, a1, 16 +; CHECK-RV32-NEXT: srl a3, a4, a3 +; CHECK-RV32-NEXT: srli a1, a1, 16 +; CHECK-RV32-NEXT: srl a1, a1, a2 +; CHECK-RV32-NEXT: pack a1, a1, a3 +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_psrl_hs_vec_shamt: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: lw a2, 0(a2) +; CHECK-RV64-NEXT: lw a1, 0(a1) +; CHECK-RV64-NEXT: srli a3, a2, 16 +; CHECK-RV64-NEXT: srliw a4, a1, 16 +; CHECK-RV64-NEXT: slli a1, a1, 48 +; CHECK-RV64-NEXT: srl a3, a4, a3 +; CHECK-RV64-NEXT: srli a1, a1, 48 +; CHECK-RV64-NEXT: srl a1, a1, a2 +; CHECK-RV64-NEXT: ppaire.h a1, a1, a3 +; CHECK-RV64-NEXT: sw a1, 0(a0) +; CHECK-RV64-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %shamt_ptr + %res = lshr <2 x i16> %a, %b + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psrl_bs_vec_shamt(ptr %ret_ptr, ptr %a_ptr, ptr %shamt_ptr) { +; CHECK-RV32-LABEL: test_psrl_bs_vec_shamt: +; CHECK-RV32: # %bb.0: +; 
CHECK-RV32-NEXT: lw a2, 0(a2) +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: srli a3, a2, 24 +; CHECK-RV32-NEXT: srli a4, a1, 24 +; CHECK-RV32-NEXT: srli a5, a2, 8 +; CHECK-RV32-NEXT: slli a6, a1, 16 +; CHECK-RV32-NEXT: srl a7, a4, a3 +; CHECK-RV32-NEXT: srli a3, a6, 24 +; CHECK-RV32-NEXT: srl a6, a3, a5 +; CHECK-RV32-NEXT: zext.b a3, a1 +; CHECK-RV32-NEXT: srli a4, a2, 16 +; CHECK-RV32-NEXT: slli a1, a1, 8 +; CHECK-RV32-NEXT: srl a2, a3, a2 +; CHECK-RV32-NEXT: srli a1, a1, 24 +; CHECK-RV32-NEXT: srl a3, a1, a4 +; CHECK-RV32-NEXT: ppaire.db a2, a2, a6 +; CHECK-RV32-NEXT: pack a1, a2, a3 +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_psrl_bs_vec_shamt: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: lw a2, 0(a2) +; CHECK-RV64-NEXT: lw a1, 0(a1) +; CHECK-RV64-NEXT: srli a3, a2, 24 +; CHECK-RV64-NEXT: srliw a4, a1, 24 +; CHECK-RV64-NEXT: srli a5, a2, 16 +; CHECK-RV64-NEXT: srl a3, a4, a3 +; CHECK-RV64-NEXT: slli a4, a1, 40 +; CHECK-RV64-NEXT: srli a4, a4, 56 +; CHECK-RV64-NEXT: srl a4, a4, a5 +; CHECK-RV64-NEXT: zext.b a5, a1 +; CHECK-RV64-NEXT: srl a5, a5, a2 +; CHECK-RV64-NEXT: srli a2, a2, 8 +; CHECK-RV64-NEXT: slli a1, a1, 48 +; CHECK-RV64-NEXT: srli a1, a1, 56 +; CHECK-RV64-NEXT: srl a1, a1, a2 +; CHECK-RV64-NEXT: ppaire.b a2, a4, a3 +; CHECK-RV64-NEXT: ppaire.b a1, a5, a1 +; CHECK-RV64-NEXT: ppaire.h a1, a1, a2 +; CHECK-RV64-NEXT: sw a1, 0(a0) +; CHECK-RV64-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %shamt_ptr + %res = lshr <4 x i8> %a, %b + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test arithmetic shift right(vector shamt) +define void @test_psra_hs_vec_shamt(ptr %ret_ptr, ptr %a_ptr, ptr %shamt_ptr) { +; CHECK-RV32-LABEL: test_psra_hs_vec_shamt: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lw a2, 0(a2) +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: srli a3, a2, 16 +; CHECK-RV32-NEXT: srai a4, a1, 16 +; CHECK-RV32-NEXT: slli a1, a1, 16 +; CHECK-RV32-NEXT: sra a3, a4, a3 +; CHECK-RV32-NEXT: srai a1, a1, 16 +; CHECK-RV32-NEXT: sra a1, a1, a2 +; CHECK-RV32-NEXT: pack a1, a1, a3 +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_psra_hs_vec_shamt: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: lw a2, 0(a2) +; CHECK-RV64-NEXT: lw a1, 0(a1) +; CHECK-RV64-NEXT: srli a3, a2, 16 +; CHECK-RV64-NEXT: sraiw a4, a1, 16 +; CHECK-RV64-NEXT: slli a1, a1, 48 +; CHECK-RV64-NEXT: sra a3, a4, a3 +; CHECK-RV64-NEXT: srai a1, a1, 48 +; CHECK-RV64-NEXT: sra a1, a1, a2 +; CHECK-RV64-NEXT: ppaire.h a1, a1, a3 +; CHECK-RV64-NEXT: sw a1, 0(a0) +; CHECK-RV64-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %shamt_ptr + %res = ashr <2 x i16> %a, %b + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psra_bs_vec_shamt(ptr %ret_ptr, ptr %a_ptr, ptr %shamt_ptr) { +; CHECK-RV32-LABEL: test_psra_bs_vec_shamt: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lw a2, 0(a2) +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: srli a3, a2, 24 +; CHECK-RV32-NEXT: srai a4, a1, 24 +; CHECK-RV32-NEXT: srli a5, a2, 8 +; CHECK-RV32-NEXT: slli a6, a1, 16 +; CHECK-RV32-NEXT: sra a7, a4, a3 +; CHECK-RV32-NEXT: srai a3, a6, 24 +; CHECK-RV32-NEXT: sra a6, a3, a5 +; CHECK-RV32-NEXT: srli a3, a2, 16 +; CHECK-RV32-NEXT: slli a4, a1, 8 +; CHECK-RV32-NEXT: slli a1, a1, 24 +; CHECK-RV32-NEXT: srai a4, a4, 24 +; CHECK-RV32-NEXT: sra a3, a4, a3 +; CHECK-RV32-NEXT: srai a1, a1, 24 +; CHECK-RV32-NEXT: sra a2, a1, a2 +; CHECK-RV32-NEXT: ppaire.db a2, a2, a6 +; CHECK-RV32-NEXT: pack a1, a2, a3 +; 
CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_psra_bs_vec_shamt: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: lw a2, 0(a2) +; CHECK-RV64-NEXT: lw a1, 0(a1) +; CHECK-RV64-NEXT: srli a3, a2, 24 +; CHECK-RV64-NEXT: sraiw a4, a1, 24 +; CHECK-RV64-NEXT: srli a5, a2, 16 +; CHECK-RV64-NEXT: slli a6, a1, 40 +; CHECK-RV64-NEXT: sra a3, a4, a3 +; CHECK-RV64-NEXT: srli a4, a2, 8 +; CHECK-RV64-NEXT: srai a6, a6, 56 +; CHECK-RV64-NEXT: sra a5, a6, a5 +; CHECK-RV64-NEXT: slli a6, a1, 48 +; CHECK-RV64-NEXT: srai a6, a6, 56 +; CHECK-RV64-NEXT: sra a4, a6, a4 +; CHECK-RV64-NEXT: slli a1, a1, 56 +; CHECK-RV64-NEXT: srai a1, a1, 56 +; CHECK-RV64-NEXT: sra a1, a1, a2 +; CHECK-RV64-NEXT: ppaire.b a2, a5, a3 +; CHECK-RV64-NEXT: ppaire.b a1, a1, a4 +; CHECK-RV64-NEXT: ppaire.h a1, a1, a2 +; CHECK-RV64-NEXT: sw a1, 0(a0) +; CHECK-RV64-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %shamt_ptr + %res = ashr <4 x i8> %a, %b + store <4 x i8> %res, ptr %ret_ptr + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll index c7fb891cdd996..3e0f431d67f41 100644 --- a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll @@ -791,6 +791,86 @@ define void @test_pslli_w(ptr %ret_ptr, ptr %a_ptr) { store <2 x i32> %res, ptr %ret_ptr ret void } +; Test logical shift right immediate +define void @test_psrli_w(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrli_w: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: psrli.w a1, a1, 2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i32>, ptr %a_ptr + %res = lshr <2 x i32> %a, splat(i32 2) + store <2 x i32> %res, ptr %ret_ptr + ret void +} + +define void @test_psrli_h(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrli_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: psrli.h a1, a1, 2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %res = lshr <4 x i16> %a, splat(i16 2) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psrli_b(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrli_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: psrli.b a1, a1, 2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %res = lshr <8 x i8> %a, splat(i8 2) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test arithmetic shift right immediate +define void @test_psrai_w(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrai_w: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: psrai.w a1, a1, 2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i32>, ptr %a_ptr + %res = ashr <2 x i32> %a, splat(i32 2) + store <2 x i32> %res, ptr %ret_ptr + ret void +} + +define void @test_psrai_h(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrai_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: psrai.h a1, a1, 2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %res = ashr <4 x i16> %a, splat(i16 2) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psrai_b(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrai_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: psrai.b a1, a1, 2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %res = ashr <8 x i8> %a, splat(i8 2) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + ; Test arithmetic saturation shift left immediate for v2i32 define void 
@test_psslai_w(ptr %ret_ptr, ptr %a_ptr) { @@ -841,3 +921,75 @@ define void @test_psll_ws_vec_shamt(ptr %ret_ptr, ptr %a_ptr, ptr %shamt_ptr) { store <2 x i32> %res, ptr %ret_ptr ret void } + +; Test logical shift right(scalar shamt) +define void @test_psrl_ws(ptr %ret_ptr, ptr %a_ptr, i32 %shamt) { +; CHECK-LABEL: test_psrl_ws: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: psrl.ws a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i32>, ptr %a_ptr + %insert = insertelement <2 x i32> poison, i32 %shamt, i32 0 + %b = shufflevector <2 x i32> %insert, <2 x i32> poison, <2 x i32> zeroinitializer + %res = lshr <2 x i32> %a, %b + store <2 x i32> %res, ptr %ret_ptr + ret void +} + +; Test arithmetic shift right(scalar shamt) +define void @test_psra_ws(ptr %ret_ptr, ptr %a_ptr, i32 %shamt) { +; CHECK-LABEL: test_psra_ws: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: psra.ws a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i32>, ptr %a_ptr + %insert = insertelement <2 x i32> poison, i32 %shamt, i32 0 + %b = shufflevector <2 x i32> %insert, <2 x i32> poison, <2 x i32> zeroinitializer + %res = ashr <2 x i32> %a, %b + store <2 x i32> %res, ptr %ret_ptr + ret void +} + +; Test logical shift right(vector shamt) +define void @test_psrl_ws_vec_shamt(ptr %ret_ptr, ptr %a_ptr, ptr %shamt_ptr) { +; CHECK-LABEL: test_psrl_ws_vec_shamt: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: srlw a3, a1, a2 +; CHECK-NEXT: srli a2, a2, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: srlw a1, a1, a2 +; CHECK-NEXT: pack a1, a3, a1 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i32>, ptr %a_ptr + %b = load <2 x i32>, ptr %shamt_ptr + %res = lshr <2 x i32> %a, %b + store <2 x i32> %res, ptr %ret_ptr + ret void +} + +; Test arithmetic shift right(vector shamt) +define void @test_psra_ws_vec_shamt(ptr %ret_ptr, ptr %a_ptr, ptr %shamt_ptr) { +; CHECK-LABEL: test_psra_ws_vec_shamt: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: sraw a3, a1, a2 +; CHECK-NEXT: srli a2, a2, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: sraw a1, a1, a2 +; CHECK-NEXT: pack a1, a3, a1 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i32>, ptr %a_ptr + %b = load <2 x i32>, ptr %shamt_ptr + %res = ashr <2 x i32> %a, %b + store <2 x i32> %res, ptr %ret_ptr + ret void +} diff --git a/llvm/test/CodeGen/RISCV/xqcilsm-lwmi-swmi.mir b/llvm/test/CodeGen/RISCV/xqcilsm-lwmi-swmi.mir new file mode 100644 index 0000000000000..396f67326a7ca --- /dev/null +++ b/llvm/test/CodeGen/RISCV/xqcilsm-lwmi-swmi.mir @@ -0,0 +1,315 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcilsm -run-pass=riscv-load-store-opt %s -o - | FileCheck %s + +--- | + + define void @pair_two_lw_into_qc_lwmi() nounwind { ret void } + define void @pair_two_lw_into_qc_lwmi_reversed() nounwind { ret void } + define void @pair_two_sw_into_qc_swmi_reversed() nounwind { ret void } + define void @no_pair_if_different_base_regs() nounwind { ret void } + define void @no_pair_if_alignment_lt_4() nounwind { ret void } + define void @pair_two_sw_into_qc_swmi() nounwind { ret void } + define void @no_pair_if_misaligned() nounwind { ret void } + define void @pair_at_upper_boundary_lw() nounwind { ret void } + define void @pair_at_upper_boundary_sw() nounwind { ret void } + define void 
@no_pair_if_offset_out_of_range_lw() nounwind { ret void } + define void @no_pair_if_offset_out_of_range_sw() nounwind { ret void } + define void @no_pair_if_non_consecutive_regs() nounwind { ret void } + define void @no_pair_if_rd_is_x0() nounwind { ret void } + define void @no_pair_if_lw_rd_equals_base() nounwind { ret void } + define void @pair_if_not_adjacent() nounwind { ret void } + define void @pair_if_not_adjacent_use() nounwind { ret void } + define void @no_pair_if_not_adjacent_use() nounwind { ret void } +--- +name: pair_two_lw_into_qc_lwmi +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: pair_two_lw_into_qc_lwmi + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x12 = QC_LWMI $x10, 2, 0, implicit-def $x13 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x12 = LW $x10, 0 :: (load (s32), align 4) + $x13 = LW $x10, 4 :: (load (s32), align 4) + PseudoRET + +... +--- +# FIXME: Kill flags are not propagated correctly for the base register +name: pair_two_lw_into_qc_lwmi_reversed +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: pair_two_lw_into_qc_lwmi_reversed + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x12 = QC_LWMI $x10, 2, 0, implicit-def $x13 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x13 = LW $x10, 4 :: (load (s32)) + $x12 = LW killed $x10, 0 :: (load (s32)) + PseudoRET + +... +--- +name: pair_two_sw_into_qc_swmi_reversed +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10, $x12, $x13 + ; CHECK-LABEL: name: pair_two_sw_into_qc_swmi_reversed + ; CHECK: liveins: $x10, $x12, $x13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: QC_SWMI killed $x12, $x10, 2, 0, implicit killed $x13 :: (store (s32)) + ; CHECK-NEXT: PseudoRET + SW killed $x13, $x10, 4 :: (store (s32)) + SW killed $x12, $x10, 0 :: (store (s32)) + PseudoRET + +... +--- +name: no_pair_if_different_base_regs +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10, $x11 + ; CHECK-LABEL: name: no_pair_if_different_base_regs + ; CHECK: liveins: $x10, $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x12 = LW $x10, 0 :: (load (s32)) + ; CHECK-NEXT: $x13 = LW $x11, 4 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x12 = LW $x10, 0 :: (load (s32)) + $x13 = LW $x11, 4 :: (load (s32)) + PseudoRET + +... +--- +name: no_pair_if_alignment_lt_4 +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: no_pair_if_alignment_lt_4 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x12 = LW $x10, 0 :: (load (s32)) + ; CHECK-NEXT: $x13 = LW $x10, 3 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x12 = LW $x10, 0 :: (load (s32)) + $x13 = LW $x10, 3 :: (load (s32)) + PseudoRET + +... +--- +name: pair_two_sw_into_qc_swmi +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10, $x12, $x13 + ; CHECK-LABEL: name: pair_two_sw_into_qc_swmi + ; CHECK: liveins: $x10, $x12, $x13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: QC_SWMI killed $x12, $x10, 2, 0, implicit killed $x13 :: (store (s32)) + ; CHECK-NEXT: PseudoRET + SW killed $x12, $x10, 0 :: (store (s32), align 4) + SW killed $x13, $x10, 4 :: (store (s32), align 4) + PseudoRET + +... 
+--- +name: no_pair_if_misaligned +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: no_pair_if_misaligned + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x12 = LW $x10, 2 :: (load (s32)) + ; CHECK-NEXT: $x13 = LW $x10, 6 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x12 = LW $x10, 2 :: (load (s32), align 4) + $x13 = LW $x10, 6 :: (load (s32), align 4) + PseudoRET + +... +--- +# FIXME: Kill flags are not propagated correctly for the base register +name: pair_at_upper_boundary_lw +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: pair_at_upper_boundary_lw + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x12 = QC_LWMI $x10, 2, 124, implicit-def $x13 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x12 = LW $x10, 124 :: (load (s32), align 4) + $x13 = LW killed $x10, 128 :: (load (s32), align 4) + PseudoRET + +... +--- +# FIXME: Kill flags are not propagated correctly for the base register +name: pair_at_upper_boundary_sw +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10, $x12, $x13 + ; CHECK-LABEL: name: pair_at_upper_boundary_sw + ; CHECK: liveins: $x10, $x12, $x13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: QC_SWMI $x12, $x10, 2, 124, implicit $x13 :: (store (s32)) + ; CHECK-NEXT: PseudoRET + SW $x12, $x10, 124 :: (store (s32), align 4) + SW $x13, killed $x10, 128 :: (store (s32), align 4) + PseudoRET + +... +--- +name: no_pair_if_offset_out_of_range_lw +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: no_pair_if_offset_out_of_range_lw + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x12 = LW $x10, 128 :: (load (s32)) + ; CHECK-NEXT: $x13 = LW $x10, 132 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x12 = LW $x10, 128 :: (load (s32), align 4) + $x13 = LW $x10, 132 :: (load (s32), align 4) + PseudoRET + +... +--- +name: no_pair_if_offset_out_of_range_sw +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10, $x12, $x13 + ; CHECK-LABEL: name: no_pair_if_offset_out_of_range_sw + ; CHECK: liveins: $x10, $x12, $x13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SW $x12, $x10, 128 :: (store (s32)) + ; CHECK-NEXT: SW $x13, $x10, 132 :: (store (s32)) + ; CHECK-NEXT: PseudoRET + SW $x12, $x10, 128 :: (store (s32), align 4) + SW $x13, $x10, 132 :: (store (s32), align 4) + PseudoRET + +... +--- +name: no_pair_if_non_consecutive_regs +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: no_pair_if_non_consecutive_regs + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x11 = LW $x10, 0 :: (load (s32)) + ; CHECK-NEXT: $x13 = LW $x10, 4 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x11 = LW $x10, 0 :: (load (s32), align 4) + $x13 = LW $x10, 4 :: (load (s32), align 4) + PseudoRET + +... +--- +name: no_pair_if_rd_is_x0 +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: no_pair_if_rd_is_x0 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x0 = LW $x10, 0 :: (load (s32)) + ; CHECK-NEXT: $x1 = LW $x10, 4 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x0 = LW $x10, 0 :: (load (s32), align 4) + $x1 = LW $x10, 4 :: (load (s32), align 4) + PseudoRET + +... 
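The pairing legality rules these MIR cases probe are easiest to read side by side, so here is a minimal, self-contained C++ restatement inferred purely from the test names and CHECK lines: same base register, word-aligned accesses at adjacent word offsets, the low offset inside the 0-124 immediate range, consecutive destination registers other than x0, and no load whose destination is its own base. The MemOp record and canPairIntoLwmiSwmi are invented for this sketch; this is not the riscv-load-store-opt implementation, just an executable summary of the conditions the cases exercise.

#include <cstddef>
#include <cstdint>
#include <iostream>

struct MemOp {
  unsigned Rd;     // destination (LW) or source (SW) register number
  unsigned Base;   // base address register number
  int64_t Offset;  // immediate byte offset
  unsigned Align;  // alignment of the memory access
};

// Lo is the access at the smaller offset, Hi the one 4 bytes above it.
bool canPairIntoLwmiSwmi(const MemOp &Lo, const MemOp &Hi, bool IsLoad) {
  if (Lo.Base != Hi.Base) return false;               // same base register
  if (Lo.Align < 4 || Hi.Align < 4) return false;     // word-aligned accesses
  if (Lo.Offset % 4 != 0) return false;               // no misaligned offsets
  if (Hi.Offset != Lo.Offset + 4) return false;       // adjacent words
  if (Lo.Offset < 0 || Lo.Offset > 124) return false; // immediate range seen in the boundary tests
  if (Hi.Rd != Lo.Rd + 1) return false;               // consecutive register pair
  if (Lo.Rd == 0) return false;                       // x0 cannot anchor the pair
  if (IsLoad && (Lo.Rd == Lo.Base || Hi.Rd == Lo.Base))
    return false;                                     // load must not clobber its base
  return true;
}

int main() {
  // pair_two_lw_into_qc_lwmi: LW x12, 0(x10) + LW x13, 4(x10) -> pairs (prints 1).
  std::cout << canPairIntoLwmiSwmi({12, 10, 0, 4}, {13, 10, 4, 4}, true) << '\n';
  // no_pair_if_offset_out_of_range_lw: offsets 128/132 -> does not pair (prints 0).
  std::cout << canPairIntoLwmiSwmi({12, 10, 128, 4}, {13, 10, 132, 4}, true) << '\n';
}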
+--- +name: no_pair_if_lw_rd_equals_base +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: no_pair_if_lw_rd_equals_base + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x10 = LW $x10, 20 :: (load (s32)) + ; CHECK-NEXT: $x11 = LW $x10, 24 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x10 = LW $x10, 20 :: (load (s32), align 4) + $x11 = LW $x10, 24 :: (load (s32), align 4) + PseudoRET + +... +--- +# FIXME: Kill flags are not propagated correctly for the base register +name: pair_if_not_adjacent +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: pair_if_not_adjacent + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x1 = QC_LWMI $x10, 2, 20, implicit-def $x2 :: (load (s32)) + ; CHECK-NEXT: $x3 = ADDI $x1, 10 + ; CHECK-NEXT: PseudoRET + $x1 = LW $x10, 20 :: (load (s32), align 4) + $x3 = ADDI $x1, 10 + $x2 = LW killed $x10, 24 :: (load (s32), align 4) + PseudoRET + +... +--- +name: pair_if_not_adjacent_use +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10, $x1, $x2 + ; CHECK-LABEL: name: pair_if_not_adjacent_use + ; CHECK: liveins: $x10, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x2 = ADDI $x2, 10 + ; CHECK-NEXT: QC_SWMI $x1, $x10, 2, 20, implicit $x2 :: (store (s32)) + ; CHECK-NEXT: PseudoRET + SW $x1, $x10, 20 :: (store (s32), align 4) + $x2 = ADDI $x2, 10 + SW $x2, $x10, 24 :: (store (s32), align 4) + PseudoRET + +... +--- +name: no_pair_if_not_adjacent_use +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10, $x2 + ; CHECK-LABEL: name: no_pair_if_not_adjacent_use + ; CHECK: liveins: $x10, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x1 = LW $x10, 20 :: (load (s32)) + ; CHECK-NEXT: $x1 = ADDI $x1, 10 + ; CHECK-NEXT: SW $x2, $x10, 40 :: (store (s32)) + ; CHECK-NEXT: $x2 = LW $x10, 24 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x1 = LW $x10, 20 :: (load (s32), align 4) + $x1 = ADDI $x1, 10 + SW $x2, $x10, 40 :: (store (s32), align 4) + $x2 = LW $x10, 24 :: (load (s32), align 4) + PseudoRET + +... diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpExtInst_vector_promotion.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpExtInst_vector_promotion.ll new file mode 100644 index 0000000000000..b406f8b71f7e6 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpExtInst_vector_promotion.ll @@ -0,0 +1,179 @@ +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown < %s | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown < %s -filetype=obj | spirv-val %} +; +; Some OpenCL builtins have mixed vector-scalar variants, but OpExtInst only supports +; versions where all the arguments have the same type. +; +; We generate code, but it is invalid. +; We should generate vector versions for these cases.
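To make the intended promotion concrete, the following self-contained C++ sketch models what the CHECK lines below expect: each scalar operand of a mixed vector/scalar builtin is splatted via OpCompositeConstruct so that every OpExtInst operand shares the result's vector type. Builder, emitMixedBuiltin, and the numeric ids are invented stand-ins that only print the instructions they would emit; this is not the SPIR-V backend's actual API.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Minimal stand-in for a SPIR-V instruction stream: each emit prints the
// instruction it would add and returns a fresh result id.
struct Builder {
  uint32_t NextId = 100;
  // Emits OpCompositeConstruct %VecTy %Scalar %Scalar ... (NumElts copies).
  uint32_t emitSplat(uint32_t VecTy, uint32_t Scalar, unsigned NumElts) {
    uint32_t R = NextId++;
    std::cout << "%" << R << " = OpCompositeConstruct %" << VecTy;
    for (unsigned I = 0; I < NumElts; ++I)
      std::cout << " %" << Scalar;
    std::cout << "\n";
    return R;
  }
  // Emits the extended instruction once all operands are uniformly typed.
  uint32_t emitExtInst(uint32_t VecTy, const char *Op,
                       const std::vector<uint32_t> &Ops) {
    uint32_t R = NextId++;
    std::cout << "%" << R << " = OpExtInst %" << VecTy << " %ClSet " << Op;
    for (uint32_t O : Ops)
      std::cout << " %" << O;
    std::cout << "\n";
    return R;
  }
};

// Splat every scalar operand to the result's vector type first, so the
// final OpExtInst has same-typed operands, as the CHECK lines verify.
uint32_t emitMixedBuiltin(Builder &B, uint32_t VecTy, unsigned NumElts,
                          const char *Op, std::vector<uint32_t> Ops,
                          const std::vector<bool> &IsScalar) {
  for (std::size_t I = 0; I < Ops.size(); ++I)
    if (IsScalar[I])
      Ops[I] = B.emitSplat(VecTy, Ops[I], NumElts);
  return B.emitExtInst(VecTy, Op, Ops);
}

int main() {
  Builder B;
  // min(<2 x i32> %v, i32 %s): the i32 operand is splatted before s_min.
  emitMixedBuiltin(B, /*VecTy=*/7, /*NumElts=*/2, "s_min", {20, 21},
                   {false, true});
}

Running it prints an OpCompositeConstruct splat followed by an s_min whose operands are uniformly of the vector type, which is the shape the S_MIN kernel below checks for.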
+ +define spir_kernel void @S_MIN() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function S_MIN +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR:[0-9]+]] %[[SCALAR]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} s_min %{{[0-9]+}} %[[VEC]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x i32> @_Z3minDv2_ii(<2 x i32> , i32 5) + ret void +} + +define spir_kernel void @U_MIN() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function U_MIN +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR:[0-9]+]] %[[SCALAR]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} u_min %{{[0-9]+}} %[[VEC]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x i32> @_Z3minDv2_jj(<2 x i32> , i32 5) + ret void +} + +define spir_kernel void @S_MAX() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function S_MAX +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR:[0-9]+]] %[[SCALAR]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} s_max %{{[0-9]+}} %[[VEC]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x i32> @_Z3maxDv2_ii(<2 x i32> , i32 5) + ret void +} + +define spir_kernel void @F_MIN() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function F_MIN +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR:[0-9]+]] %[[SCALAR]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} fmin %{{[0-9]+}} %[[VEC]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x float> @_Z3minDv2_ff(<2 x float> , float 5.0) + ret void +} + +define spir_kernel void @F_MAX() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function F_MAX +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR:[0-9]+]] %[[SCALAR]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} fmax %{{[0-9]+}} %[[VEC]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x float> @_Z3maxDv2_ff(<2 x float> , float 5.0) + ret void +} + +define spir_kernel void @F_FMIN() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function F_FMIN +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR:[0-9]+]] %[[SCALAR]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} fmin %{{[0-9]+}} %[[VEC]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x float> @_Z4fminDv2_ff(<2 x float> , float 5.0) + ret void +} + +define spir_kernel void @F_FMAX() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function F_FMAX +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR:[0-9]+]] %[[SCALAR]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} fmax %{{[0-9]+}} %[[VEC]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; 
-- End function +entry: + %call = tail call spir_func <2 x float> @_Z4fmaxDv2_ff(<2 x float> , float 5.0) + ret void +} + +define spir_kernel void @S_CLAMP() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function S_CLAMP +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC_0:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR_0:[0-9]+]] %[[SCALAR_0]] +; CHECK-NEXT: %[[VEC_1:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR_1:[0-9]+]] %[[SCALAR_1]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} s_clamp %{{[0-9]+}} %[[VEC_0]] %[[VEC_1]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x i32> @_Z5clampDv2_iii(<2 x i32> , i32 5, i32 6) + ret void +} + +define spir_kernel void @F_CLAMP() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function F_CLAMP +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC_0:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR_0:[0-9]+]] %[[SCALAR_0]] +; CHECK-NEXT: %[[VEC_1:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR_1:[0-9]+]] %[[SCALAR_1]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} fclamp %{{[0-9]+}} %[[VEC_0]] %[[VEC_1]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x float> @_Z5clampDv2_fff(<2 x float> , float 5.0, float 6.0) + ret void +} + +define spir_kernel void @MIX() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function MIX +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR:[0-9]+]] %[[SCALAR]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} mix %{{[0-9]+}} %{{[0-9]+}} %[[VEC]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x float> @_Z3mixDv2_fS_f(<2 x float> , <2 x float> , float 0.5) + ret void +} + +define spir_kernel void @SMOOTHSTEP() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function SMOOTHSTEP +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC_0:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR_0:[0-9]+]] %[[SCALAR_0]] +; CHECK-NEXT: %[[VEC_1:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR_1:[0-9]+]] %[[SCALAR_1]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} smoothstep %[[VEC_0]] %[[VEC_1]] %{{[0-9]+}} +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x float> @_Z10smoothstepffDv2_f(float 1.0, float 0.5, <2 x float> ) + ret void +} + +define spir_kernel void @ill_0() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function ill_0 +; CHECK-NEXT: OpLabel +; CHECK-NEXT: OpFunctionCall %{{[0-9]+}} %{{[0-9]+}} +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + tail call spir_func void @_Z3minv() + ret void +} + +declare spir_func <2 x i32> @_Z3minDv2_ii(<2 x i32>, i32) +declare spir_func <2 x i32> @_Z3minDv2_jj(<2 x i32>, i32) +declare spir_func <2 x i32> @_Z3maxDv2_ii(<2 x i32>, i32) +declare spir_func <2 x float> @_Z3minDv2_ff(<2 x float>, float) +declare spir_func <2 x float> @_Z3maxDv2_ff(<2 x float>, float) +declare spir_func <2 x float> @_Z4fminDv2_ff(<2 x float>, float) +declare spir_func <2 x float> @_Z4fmaxDv2_ff(<2 x float>, float) +declare spir_func <2 x i32> @_Z5clampDv2_iii(<2 x i32>, i32, i32) 
+declare spir_func <2 x float> @_Z5clampDv2_fff(<2 x float>, float, float) +declare spir_func <2 x float> @_Z3mixDv2_fS_f(<2 x float>, <2 x float>, float) +declare spir_func <2 x float> @_Z10smoothstepffDv2_f(float, float, <2 x float>) +declare spir_func void @_Z3minv() diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpExtInst_vector_promotion_bug.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpExtInst_vector_promotion_bug.ll new file mode 100644 index 0000000000000..b81f373be33c3 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpExtInst_vector_promotion_bug.ll @@ -0,0 +1,21 @@ +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown < %s | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown < %s -filetype=obj | not spirv-val 2>&1 | FileCheck %s --check-prefix=VALIDATOR %} +; +; _Z3miniii is not a valid OpenCL intrinsic; do not treat it like one. +; +; VALIDATOR: Invalid instruction OpExtInst starting at word {{[0-9]+}}: expected no more operands after 7 words, but stated word count is 8 + +define spir_kernel void @ill_1() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function ill_1 +; CHECK-NEXT: OpLabel +; This is wrong; we should generate a regular call +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %{{[0-9]+}} %{{[0-9]+}} s_min %{{[0-9]+}} %{{[0-9]+}} %{{[0-9]+}} +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + tail call spir_func void @_Z3miniii(i32 1, i32 2, i32 3) + ret void +} + +declare spir_func i32 @_Z3miniii(i32, i32, i32) diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpMin.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpMin.ll deleted file mode 100644 index 5cc3ea01e5191..0000000000000 --- a/llvm/test/CodeGen/SPIRV/transcoding/OpMin.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV - -; CHECK-SPIRV: %[[#SetInstID:]] = OpExtInstImport "OpenCL.std" -; CHECK-SPIRV: %[[#IntTypeID:]] = OpTypeInt 32 [[#]] -; CHECK-SPIRV: %[[#Int2TypeID:]] = OpTypeVector %[[#IntTypeID]] 2 -; CHECK-SPIRV: %[[#CompositeID:]] = OpCompositeInsert %[[#Int2TypeID]] %[[#]] %[[#]] [[#]] -; CHECK-SPIRV: %[[#ShuffleID:]] = OpVectorShuffle %[[#Int2TypeID]] %[[#CompositeID]] %[[#]] [[#]] [[#]] -; CHECK-SPIRV: %[[#]] = OpExtInst %[[#Int2TypeID]] %[[#SetInstID]] s_min %[[#]] %[[#ShuffleID]] - -define spir_kernel void @test() { -entry: - %call = tail call spir_func <2 x i32> @_Z3minDv2_ii(<2 x i32> , i32 5) #2 - ret void -} - -declare spir_func <2 x i32> @_Z3minDv2_ii(<2 x i32>, i32) diff --git a/llvm/test/CodeGen/X86/atomic-fp.ll b/llvm/test/CodeGen/X86/atomic-fp.ll index fe79dfe39f645..2dee1d12e7255 100644 --- a/llvm/test/CodeGen/X86/atomic-fp.ll +++ b/llvm/test/CodeGen/X86/atomic-fp.ll @@ -80,23 +80,17 @@ define dso_local void @fadd_64r(ptr %loc, double %val) nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: movl 8(%ebp), %eax ; X86-NOSSE-NEXT: fildll (%eax) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: faddl 12(%ebp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl 
%ecx, (%esp) -; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll (%eax) +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: faddl 12(%ebp) +; X86-NOSSE-NEXT: fstpl (%eax) ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -109,16 +103,13 @@ define dso_local void @fadd_64r(ptr %loc, double %val) nounwind { ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: movl 8(%ebp), %eax ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: faddl 12(%ebp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, (%eax) +; X86-SSE1-NEXT: fstpl (%eax) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -132,9 +123,7 @@ define dso_local void @fadd_64r(ptr %loc, double %val) nounwind { ; X86-SSE2-NEXT: movl 8(%ebp), %eax ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: addsd 12(%ebp), %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, (%eax) +; X86-SSE2-NEXT: movsd %xmm0, (%eax) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -148,9 +137,7 @@ define dso_local void @fadd_64r(ptr %loc, double %val) nounwind { ; X86-AVX-NEXT: movl 8(%ebp), %eax ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vaddsd 12(%ebp), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%eax) +; X86-AVX-NEXT: vmovsd %xmm0, (%eax) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -246,22 +233,16 @@ define dso_local void @fadd_64g() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: fildll glob64 ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fld1 -; X86-NOSSE-NEXT: faddl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll glob64 +; X86-NOSSE-NEXT: fld1 +; X86-NOSSE-NEXT: faddl (%esp) +; X86-NOSSE-NEXT: fstpl glob64 ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -273,16 +254,13 @@ define dso_local void @fadd_64g() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; 
X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: faddl (%esp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, glob64 +; X86-SSE1-NEXT: fstpl glob64 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -295,9 +273,7 @@ define dso_local void @fadd_64g() nounwind { ; X86-SSE2-NEXT: subl $8, %esp ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: addsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, glob64 +; X86-SSE2-NEXT: movsd %xmm0, glob64 ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -310,9 +286,7 @@ define dso_local void @fadd_64g() nounwind { ; X86-AVX-NEXT: subl $8, %esp ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, glob64 +; X86-AVX-NEXT: vmovsd %xmm0, glob64 ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -409,22 +383,16 @@ define dso_local void @fadd_64imm() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: fildll -559038737 ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fld1 -; X86-NOSSE-NEXT: faddl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll -559038737 +; X86-NOSSE-NEXT: fld1 +; X86-NOSSE-NEXT: faddl (%esp) +; X86-NOSSE-NEXT: fstpl -559038737 ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -436,16 +404,13 @@ define dso_local void @fadd_64imm() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: faddl (%esp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, -559038737 +; X86-SSE1-NEXT: fstpl -559038737 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -458,9 +423,7 @@ define dso_local void @fadd_64imm() nounwind { ; X86-SSE2-NEXT: subl $8, %esp ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = 
mem[0],zero ; X86-SSE2-NEXT: addsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, -559038737 +; X86-SSE2-NEXT: movsd %xmm0, -559038737 ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -473,9 +436,7 @@ define dso_local void @fadd_64imm() nounwind { ; X86-AVX-NEXT: subl $8, %esp ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, -559038737 +; X86-AVX-NEXT: vmovsd %xmm0, -559038737 ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -577,22 +538,16 @@ define dso_local void @fadd_64stack() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $40, %esp +; X86-NOSSE-NEXT: subl $32, %esp ; X86-NOSSE-NEXT: fildll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: fld1 -; X86-NOSSE-NEXT: faddl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: faddl (%esp) ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -604,16 +559,13 @@ define dso_local void @fadd_64stack() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $24, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: faddl (%esp) ; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -627,8 +579,6 @@ define dso_local void @fadd_64stack() nounwind { ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: addsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -642,8 +592,6 @@ define dso_local void @fadd_64stack() nounwind { ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -677,7 +625,7 @@ define dso_local void @fadd_array(ptr 
%arg, double %arg1, i64 %arg2) nounwind { ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $40, %esp +; X86-NOSSE-NEXT: subl $32, %esp ; X86-NOSSE-NEXT: movl 20(%ebp), %eax ; X86-NOSSE-NEXT: movl 8(%ebp), %ecx ; X86-NOSSE-NEXT: fildll (%ecx,%eax,8) @@ -685,16 +633,10 @@ define dso_local void @fadd_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: faddl 12(%ebp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOSSE-NEXT: movl %edx, (%esp) -; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll (%ecx,%eax,8) +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: faddl 12(%ebp) +; X86-NOSSE-NEXT: fstpl (%ecx,%eax,8) ; X86-NOSSE-NEXT: leal -4(%ebp), %esp ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %ebp @@ -709,16 +651,13 @@ define dso_local void @fadd_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-SSE1-NEXT: movl 20(%ebp), %eax ; X86-SSE1-NEXT: movl 8(%ebp), %ecx ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: faddl 12(%ebp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, (%ecx,%eax,8) +; X86-SSE1-NEXT: fstpl (%ecx,%eax,8) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -733,9 +672,7 @@ define dso_local void @fadd_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-SSE2-NEXT: movl 8(%ebp), %ecx ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: addsd 12(%ebp), %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, (%ecx,%eax,8) +; X86-SSE2-NEXT: movsd %xmm0, (%ecx,%eax,8) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -750,9 +687,7 @@ define dso_local void @fadd_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-AVX-NEXT: movl 8(%ebp), %ecx ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vaddsd 12(%ebp), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%ecx,%eax,8) +; X86-AVX-NEXT: vmovsd %xmm0, (%ecx,%eax,8) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -852,23 +787,17 @@ define dso_local void @fsub_64r(ptr %loc, double %val) nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: movl 8(%ebp), %eax ; X86-NOSSE-NEXT: fildll (%eax) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; 
X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fsubl 12(%ebp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl %ecx, (%esp) -; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll (%eax) +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fsubl 12(%ebp) +; X86-NOSSE-NEXT: fstpl (%eax) ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -881,16 +810,13 @@ define dso_local void @fsub_64r(ptr %loc, double %val) nounwind { ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: movl 8(%ebp), %eax ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fsubl 12(%ebp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, (%eax) +; X86-SSE1-NEXT: fstpl (%eax) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -904,9 +830,7 @@ define dso_local void @fsub_64r(ptr %loc, double %val) nounwind { ; X86-SSE2-NEXT: movl 8(%ebp), %eax ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: subsd 12(%ebp), %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, (%eax) +; X86-SSE2-NEXT: movsd %xmm0, (%eax) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -920,9 +844,7 @@ define dso_local void @fsub_64r(ptr %loc, double %val) nounwind { ; X86-AVX-NEXT: movl 8(%ebp), %eax ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vsubsd 12(%ebp), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%eax) +; X86-AVX-NEXT: vmovsd %xmm0, (%eax) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -1018,23 +940,17 @@ define dso_local void @fsub_64g() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: fildll glob64 ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: fld1 ; X86-NOSSE-NEXT: fchs -; X86-NOSSE-NEXT: faddl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll glob64 +; X86-NOSSE-NEXT: faddl (%esp) +; X86-NOSSE-NEXT: fstpl glob64 ; X86-NOSSE-NEXT: movl %ebp, %esp ; 
X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -1046,17 +962,14 @@ define dso_local void @fsub_64g() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: fchs ; X86-SSE1-NEXT: faddl (%esp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, glob64 +; X86-SSE1-NEXT: fstpl glob64 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -1069,9 +982,7 @@ define dso_local void @fsub_64g() nounwind { ; X86-SSE2-NEXT: subl $8, %esp ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: addsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, glob64 +; X86-SSE2-NEXT: movsd %xmm0, glob64 ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -1084,9 +995,7 @@ define dso_local void @fsub_64g() nounwind { ; X86-AVX-NEXT: subl $8, %esp ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, glob64 +; X86-AVX-NEXT: vmovsd %xmm0, glob64 ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -1184,23 +1093,17 @@ define dso_local void @fsub_64imm() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: fildll -559038737 ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: fld1 ; X86-NOSSE-NEXT: fchs -; X86-NOSSE-NEXT: faddl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll -559038737 +; X86-NOSSE-NEXT: faddl (%esp) +; X86-NOSSE-NEXT: fstpl -559038737 ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -1212,17 +1115,14 @@ define dso_local void @fsub_64imm() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, 
{{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: fchs ; X86-SSE1-NEXT: faddl (%esp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, -559038737 +; X86-SSE1-NEXT: fstpl -559038737 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -1235,9 +1135,7 @@ define dso_local void @fsub_64imm() nounwind { ; X86-SSE2-NEXT: subl $8, %esp ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: addsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, -559038737 +; X86-SSE2-NEXT: movsd %xmm0, -559038737 ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -1250,9 +1148,7 @@ define dso_local void @fsub_64imm() nounwind { ; X86-AVX-NEXT: subl $8, %esp ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, -559038737 +; X86-AVX-NEXT: vmovsd %xmm0, -559038737 ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -1354,22 +1250,16 @@ define dso_local void @fsub_64stack() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $40, %esp +; X86-NOSSE-NEXT: subl $32, %esp ; X86-NOSSE-NEXT: fildll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: fld1 -; X86-NOSSE-NEXT: fsubl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: fsubl (%esp) ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -1381,16 +1271,13 @@ define dso_local void @fsub_64stack() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $24, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: fsubl (%esp) ; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -1405,8 +1292,6 @@ define dso_local void @fsub_64stack() nounwind { ; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0] ; X86-SSE2-NEXT: subsd %xmm0, %xmm1 ; X86-SSE2-NEXT: movsd %xmm1, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl 
%ebp ; X86-SSE2-NEXT: retl @@ -1421,8 +1306,6 @@ define dso_local void @fsub_64stack() nounwind { ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0] ; X86-AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 ; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -1456,7 +1339,7 @@ define dso_local void @fsub_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $40, %esp +; X86-NOSSE-NEXT: subl $32, %esp ; X86-NOSSE-NEXT: movl 20(%ebp), %eax ; X86-NOSSE-NEXT: movl 8(%ebp), %ecx ; X86-NOSSE-NEXT: fildll (%ecx,%eax,8) @@ -1464,16 +1347,10 @@ define dso_local void @fsub_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fsubl 12(%ebp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOSSE-NEXT: movl %edx, (%esp) -; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll (%ecx,%eax,8) +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fsubl 12(%ebp) +; X86-NOSSE-NEXT: fstpl (%ecx,%eax,8) ; X86-NOSSE-NEXT: leal -4(%ebp), %esp ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %ebp @@ -1488,16 +1365,13 @@ define dso_local void @fsub_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-SSE1-NEXT: movl 20(%ebp), %eax ; X86-SSE1-NEXT: movl 8(%ebp), %ecx ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fsubl 12(%ebp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, (%ecx,%eax,8) +; X86-SSE1-NEXT: fstpl (%ecx,%eax,8) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -1512,9 +1386,7 @@ define dso_local void @fsub_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-SSE2-NEXT: movl 8(%ebp), %ecx ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: subsd 12(%ebp), %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, (%ecx,%eax,8) +; X86-SSE2-NEXT: movsd %xmm0, (%ecx,%eax,8) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -1529,9 +1401,7 @@ define dso_local void @fsub_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-AVX-NEXT: movl 8(%ebp), %ecx ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vsubsd 12(%ebp), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%ecx,%eax,8) +; X86-AVX-NEXT: vmovsd %xmm0, (%ecx,%eax,8) ; X86-AVX-NEXT: movl 
%ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -1631,23 +1501,17 @@ define dso_local void @fmul_64r(ptr %loc, double %val) nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: movl 8(%ebp), %eax ; X86-NOSSE-NEXT: fildll (%eax) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fmull 12(%ebp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl %ecx, (%esp) -; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll (%eax) +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fmull 12(%ebp) +; X86-NOSSE-NEXT: fstpl (%eax) ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -1660,16 +1524,13 @@ define dso_local void @fmul_64r(ptr %loc, double %val) nounwind { ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: movl 8(%ebp), %eax ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fmull 12(%ebp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, (%eax) +; X86-SSE1-NEXT: fstpl (%eax) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -1683,9 +1544,7 @@ define dso_local void @fmul_64r(ptr %loc, double %val) nounwind { ; X86-SSE2-NEXT: movl 8(%ebp), %eax ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: mulsd 12(%ebp), %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, (%eax) +; X86-SSE2-NEXT: movsd %xmm0, (%eax) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -1699,9 +1558,7 @@ define dso_local void @fmul_64r(ptr %loc, double %val) nounwind { ; X86-AVX-NEXT: movl 8(%ebp), %eax ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vmulsd 12(%ebp), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%eax) +; X86-AVX-NEXT: vmovsd %xmm0, (%eax) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -1794,22 +1651,16 @@ define dso_local void @fmul_64g() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: fildll glob64 ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl 
{{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll glob64 +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} +; X86-NOSSE-NEXT: fstpl glob64 ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -1821,16 +1672,13 @@ define dso_local void @fmul_64g() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, glob64 +; X86-SSE1-NEXT: fstpl glob64 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -1843,9 +1691,7 @@ define dso_local void @fmul_64g() nounwind { ; X86-SSE2-NEXT: subl $8, %esp ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, glob64 +; X86-SSE2-NEXT: movsd %xmm0, glob64 ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -1858,9 +1704,7 @@ define dso_local void @fmul_64g() nounwind { ; X86-AVX-NEXT: subl $8, %esp ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, glob64 +; X86-AVX-NEXT: vmovsd %xmm0, glob64 ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -1957,22 +1801,16 @@ define dso_local void @fmul_64imm() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: fildll -559038737 ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll -559038737 +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} +; X86-NOSSE-NEXT: fstpl -559038737 ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -1984,16 +1822,13 @@ define dso_local void @fmul_64imm() nounwind { ; 
X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, -559038737 +; X86-SSE1-NEXT: fstpl -559038737 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -2006,9 +1841,7 @@ define dso_local void @fmul_64imm() nounwind { ; X86-SSE2-NEXT: subl $8, %esp ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, -559038737 +; X86-SSE2-NEXT: movsd %xmm0, -559038737 ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -2021,9 +1854,7 @@ define dso_local void @fmul_64imm() nounwind { ; X86-AVX-NEXT: subl $8, %esp ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, -559038737 +; X86-AVX-NEXT: vmovsd %xmm0, -559038737 ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -2125,22 +1956,16 @@ define dso_local void @fmul_64stack() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $40, %esp +; X86-NOSSE-NEXT: subl $32, %esp ; X86-NOSSE-NEXT: fildll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) +; X86-NOSSE-NEXT: fldl (%esp) ; X86-NOSSE-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -2152,16 +1977,13 @@ define dso_local void @fmul_64stack() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $24, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} ; 
X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -2175,8 +1997,6 @@ define dso_local void @fmul_64stack() nounwind { ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -2190,8 +2010,6 @@ define dso_local void @fmul_64stack() nounwind { ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -2225,7 +2043,7 @@ define dso_local void @fmul_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $40, %esp +; X86-NOSSE-NEXT: subl $32, %esp ; X86-NOSSE-NEXT: movl 20(%ebp), %eax ; X86-NOSSE-NEXT: movl 8(%ebp), %ecx ; X86-NOSSE-NEXT: fildll (%ecx,%eax,8) @@ -2233,16 +2051,10 @@ define dso_local void @fmul_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fmull 12(%ebp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOSSE-NEXT: movl %edx, (%esp) -; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll (%ecx,%eax,8) +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fmull 12(%ebp) +; X86-NOSSE-NEXT: fstpl (%ecx,%eax,8) ; X86-NOSSE-NEXT: leal -4(%ebp), %esp ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %ebp @@ -2257,16 +2069,13 @@ define dso_local void @fmul_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-SSE1-NEXT: movl 20(%ebp), %eax ; X86-SSE1-NEXT: movl 8(%ebp), %ecx ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fmull 12(%ebp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, (%ecx,%eax,8) +; X86-SSE1-NEXT: fstpl (%ecx,%eax,8) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -2281,9 +2090,7 @@ define dso_local void @fmul_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-SSE2-NEXT: movl 8(%ebp), %ecx ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: mulsd 12(%ebp), %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = 
mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, (%ecx,%eax,8) +; X86-SSE2-NEXT: movsd %xmm0, (%ecx,%eax,8) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -2298,9 +2105,7 @@ define dso_local void @fmul_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-AVX-NEXT: movl 8(%ebp), %ecx ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vmulsd 12(%ebp), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%ecx,%eax,8) +; X86-AVX-NEXT: vmovsd %xmm0, (%ecx,%eax,8) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -2400,23 +2205,17 @@ define dso_local void @fdiv_64r(ptr %loc, double %val) nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: movl 8(%ebp), %eax ; X86-NOSSE-NEXT: fildll (%eax) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fdivl 12(%ebp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl %ecx, (%esp) -; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll (%eax) +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fdivl 12(%ebp) +; X86-NOSSE-NEXT: fstpl (%eax) ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -2429,16 +2228,13 @@ define dso_local void @fdiv_64r(ptr %loc, double %val) nounwind { ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: movl 8(%ebp), %eax ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fdivl 12(%ebp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, (%eax) +; X86-SSE1-NEXT: fstpl (%eax) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -2452,9 +2248,7 @@ define dso_local void @fdiv_64r(ptr %loc, double %val) nounwind { ; X86-SSE2-NEXT: movl 8(%ebp), %eax ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: divsd 12(%ebp), %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, (%eax) +; X86-SSE2-NEXT: movsd %xmm0, (%eax) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -2468,9 +2262,7 @@ define dso_local void @fdiv_64r(ptr %loc, double %val) nounwind { ; X86-AVX-NEXT: movl 8(%ebp), %eax ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vdivsd 12(%ebp), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%eax) +; X86-AVX-NEXT: 
vmovsd %xmm0, (%eax) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -2565,22 +2357,16 @@ define dso_local void @fdiv_64g() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: fildll glob64 ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fdivs {{\.?LCPI[0-9]+_[0-9]+}} -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll glob64 +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fdivs {{\.?LCPI[0-9]+_[0-9]+}} +; X86-NOSSE-NEXT: fstpl glob64 ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -2592,16 +2378,13 @@ define dso_local void @fdiv_64g() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fdivs {{\.?LCPI[0-9]+_[0-9]+}} -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, glob64 +; X86-SSE1-NEXT: fstpl glob64 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -2614,9 +2397,7 @@ define dso_local void @fdiv_64g() nounwind { ; X86-SSE2-NEXT: subl $8, %esp ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: divsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, glob64 +; X86-SSE2-NEXT: movsd %xmm0, glob64 ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -2629,9 +2410,7 @@ define dso_local void @fdiv_64g() nounwind { ; X86-AVX-NEXT: subl $8, %esp ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vdivsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, glob64 +; X86-AVX-NEXT: vmovsd %xmm0, glob64 ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -2728,22 +2507,16 @@ define dso_local void @fdiv_64imm() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: fildll -559038737 ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) 
-; X86-NOSSE-NEXT: fdivs {{\.?LCPI[0-9]+_[0-9]+}} -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll -559038737 +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fdivs {{\.?LCPI[0-9]+_[0-9]+}} +; X86-NOSSE-NEXT: fstpl -559038737 ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -2755,16 +2528,13 @@ define dso_local void @fdiv_64imm() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fdivs {{\.?LCPI[0-9]+_[0-9]+}} -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, -559038737 +; X86-SSE1-NEXT: fstpl -559038737 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -2777,9 +2547,7 @@ define dso_local void @fdiv_64imm() nounwind { ; X86-SSE2-NEXT: subl $8, %esp ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: divsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, -559038737 +; X86-SSE2-NEXT: movsd %xmm0, -559038737 ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -2792,9 +2560,7 @@ define dso_local void @fdiv_64imm() nounwind { ; X86-AVX-NEXT: subl $8, %esp ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vdivsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, -559038737 +; X86-AVX-NEXT: vmovsd %xmm0, -559038737 ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -2896,22 +2662,16 @@ define dso_local void @fdiv_64stack() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $40, %esp +; X86-NOSSE-NEXT: subl $32, %esp ; X86-NOSSE-NEXT: fildll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: fld1 -; X86-NOSSE-NEXT: fdivl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: fdivl (%esp) ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -2923,16 +2683,13 @@ define dso_local void @fdiv_64stack() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; 
X86-SSE1-NEXT: subl $24, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: fdivl (%esp) ; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -2947,8 +2704,6 @@ define dso_local void @fdiv_64stack() nounwind { ; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0] ; X86-SSE2-NEXT: divsd %xmm0, %xmm1 ; X86-SSE2-NEXT: movsd %xmm1, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -2963,8 +2718,6 @@ define dso_local void @fdiv_64stack() nounwind { ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0] ; X86-AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0 ; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -2998,7 +2751,7 @@ define dso_local void @fdiv_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $40, %esp +; X86-NOSSE-NEXT: subl $32, %esp ; X86-NOSSE-NEXT: movl 20(%ebp), %eax ; X86-NOSSE-NEXT: movl 8(%ebp), %ecx ; X86-NOSSE-NEXT: fildll (%ecx,%eax,8) @@ -3006,16 +2759,10 @@ define dso_local void @fdiv_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fdivl 12(%ebp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOSSE-NEXT: movl %edx, (%esp) -; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll (%ecx,%eax,8) +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fdivl 12(%ebp) +; X86-NOSSE-NEXT: fstpl (%ecx,%eax,8) ; X86-NOSSE-NEXT: leal -4(%ebp), %esp ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %ebp @@ -3030,16 +2777,13 @@ define dso_local void @fdiv_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-SSE1-NEXT: movl 20(%ebp), %eax ; X86-SSE1-NEXT: movl 8(%ebp), %ecx ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fdivl 12(%ebp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; 
X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, (%ecx,%eax,8) +; X86-SSE1-NEXT: fstpl (%ecx,%eax,8) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -3054,9 +2798,7 @@ define dso_local void @fdiv_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-SSE2-NEXT: movl 8(%ebp), %ecx ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: divsd 12(%ebp), %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, (%ecx,%eax,8) +; X86-SSE2-NEXT: movsd %xmm0, (%ecx,%eax,8) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -3071,9 +2813,7 @@ define dso_local void @fdiv_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-AVX-NEXT: movl 8(%ebp), %ecx ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vdivsd 12(%ebp), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%ecx,%eax,8) +; X86-AVX-NEXT: vmovsd %xmm0, (%ecx,%eax,8) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl diff --git a/llvm/test/MC/LoongArch/Basic/Integer/misc.s b/llvm/test/MC/LoongArch/Basic/Integer/misc.s index 182d1da9b237e..26a9205d8e17d 100644 --- a/llvm/test/MC/LoongArch/Basic/Integer/misc.s +++ b/llvm/test/MC/LoongArch/Basic/Integer/misc.s @@ -7,7 +7,7 @@ # RUN: llvm-mc %s --triple=loongarch32 --filetype=obj | llvm-objdump -d - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # RUN: llvm-mc %s --triple=loongarch64 --filetype=obj --defsym=LA64=1 | llvm-objdump -d - \ -# RUN: | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ,CHECK64-ASM-AND-OBJ %s +# RUN: | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ,CHECK64-OBJ,CHECK64-ASM-AND-OBJ %s ############################################################# ## Instructions for both loongarch32 and loongarch64 @@ -33,6 +33,13 @@ rdtimeh.w $a7, $a1 # CHECK-ASM: encoding: [0x03,0x6d,0x00,0x00] cpucfg $sp, $a4 +# CHECK-ASM-AND-OBJ: ud 0 +# CHECK-ASM: encoding: [0x00,0x04,0x60,0x38] +ud 0 + +# CHECK-ASM-AND-OBJ: ud 31 +# CHECK-ASM: encoding: [0xff,0x07,0x60,0x38] +ud 31 ############################################################# ## Instructions only for loongarch64 @@ -40,6 +47,11 @@ cpucfg $sp, $a4 .ifdef LA64 +# CHECK64-OBJ: ud 0 +# CHECK64-ASM: amswap.w $zero, $ra, $zero +# CHECK64-ASM: encoding: [0x00,0x04,0x60,0x38] +amswap.w $zero, $ra, $zero + # CHECK64-ASM-AND-OBJ: asrtle.d $t0, $t5 # CHECK64-ASM: encoding: [0x80,0x45,0x01,0x00] asrtle.d $t0, $t5 diff --git a/llvm/test/TableGen/RegClassByHwModeErrors.td b/llvm/test/TableGen/RegClassByHwModeErrors.td index 0ee6370ccd0ce..c7731312e28a6 100644 --- a/llvm/test/TableGen/RegClassByHwModeErrors.td +++ b/llvm/test/TableGen/RegClassByHwModeErrors.td @@ -9,6 +9,8 @@ // RUN: %t/compress-regclass-by-hwmode-2.td -o /dev/null 2>&1 | FileCheck %t/compress-regclass-by-hwmode-2.td --implicit-check-not="error:" // RUN: not llvm-tblgen --gen-dag-isel -I %p/../../include -I %t -I %S \ // RUN: %t/vt-by-hwmode-missing.td -o /dev/null 2>&1 | FileCheck %t/vt-by-hwmode-missing.td --implicit-check-not="error:" +// RUN: not llvm-tblgen --gen-dag-isel -I %p/../../include -I %t -I %S \ +// RUN: %t/multiple-entries-for-same-mode.td -o /dev/null 2>&1 | FileCheck %t/multiple-entries-for-same-mode.td --implicit-check-not="error:" //--- Common.td include "Common/RegClassByHwModeCommon.td" @@ -119,3 +121,22 @@ def TEST : TestInstruction 
{ def MyTargetISA : InstrInfo; def MyTarget : Target { let InstructionSet = MyTargetISA; } + + +//--- multiple-entries-for-same-mode.td +include "Common.td" +/// We should get an error if the same mode is listed more than once. +defvar Ptr64Alias = Ptr64; +def BadRegClass : RegClassByHwMode<[Ptr32, Ptr64, Ptr64Alias], [XRegs, YRegs, YRegs]>; +// CHECK: [[#@LINE-1]]:5: error: duplicate RegisterClass entry for HwMode Ptr64: YRegs +// Need at least one instruction pattern using the bad reg class to trigger the error: +def USE_BAD_REG_CLASS : TestInstruction { + let OutOperandList = (outs BadRegClass:$dst); + let InOperandList = (ins BadRegClass:$src1, BadRegClass:$src2); + let AsmString = "bad $dst"; + let Pattern = [(set BadRegClass:$dst, (add BadRegClass:$src1, BadRegClass:$src2))]; +} +def MyTargetISA : InstrInfo; +def MyTarget : Target { + let InstructionSet = MyTargetISA; +} diff --git a/llvm/test/Transforms/InstCombine/2006-10-26-VectorReassoc.ll b/llvm/test/Transforms/InstCombine/2006-10-26-VectorReassoc.ll index fb860a5e7bdf3..6509797e0d3dc 100644 --- a/llvm/test/Transforms/InstCombine/2006-10-26-VectorReassoc.ll +++ b/llvm/test/Transforms/InstCombine/2006-10-26-VectorReassoc.ll @@ -35,12 +35,10 @@ define <4 x float> @test_fmul_reassoc_nsz(<4 x float> %V) { } ; (V * C1) * C2 => V * (C1 * C2) -; TODO: This doesn't require 'nsz'. It should fold to V * { 1.0, 4.0e+05, -9.0, 16.0 } define <4 x float> @test_fmul_reassoc(<4 x float> %V) { ; CHECK-LABEL: @test_fmul_reassoc( -; CHECK-NEXT: [[TMP1:%.*]] = fmul reassoc <4 x float> [[V:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc <4 x float> [[TMP1]], -; CHECK-NEXT: ret <4 x float> [[TMP2]] +; CHECK: [[TMP1:%.*]] = fmul reassoc <4 x float> %V, +; CHECK-NEXT: ret <4 x float> [[TMP1]] %Y = fmul reassoc <4 x float> %V, < float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00 > %Z = fmul reassoc <4 x float> %Y, < float 1.000000e+00, float 2.000000e+05, float -3.000000e+00, float 4.000000e+00 > ret <4 x float> %Z diff --git a/llvm/test/Transforms/InstCombine/fdiv.ll b/llvm/test/Transforms/InstCombine/fdiv.ll index 54b0bf8c50ac7..3465781e3af9d 100644 --- a/llvm/test/Transforms/InstCombine/fdiv.ll +++ b/llvm/test/Transforms/InstCombine/fdiv.ll @@ -525,8 +525,7 @@ define <2 x float> @div_constant_dividend2_reassoc_only(<2 x float> %x) { define <2 x float> @div_constant_dividend3(<2 x float> %x) { ; CHECK-LABEL: @div_constant_dividend3( -; CHECK-NEXT: [[TMP1:%.*]] = fmul reassoc arcp <2 x float> [[X:%.*]], -; CHECK-NEXT: [[T2:%.*]] = fmul reassoc arcp <2 x float> [[TMP1]], +; CHECK-NEXT: [[T2:%.*]] = fmul reassoc arcp <2 x float> [[X:%.*]], ; CHECK-NEXT: ret <2 x float> [[T2]] ; %t1 = fdiv <2 x float> , %x diff --git a/llvm/test/Transforms/InstCombine/issue64967-reassoc-fmul.ll b/llvm/test/Transforms/InstCombine/issue64967-reassoc-fmul.ll new file mode 100644 index 0000000000000..5d064234bf609 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/issue64967-reassoc-fmul.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +; Show that unlike fadd, fmul does not require nsz to be reassociated.
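The point this new test file makes rests on a simple IEEE-754 fact: the sign of a (non-NaN) product is the XOR of its operands' signs, so folding (x * C1) * C2 into x * (C1 * C2) preserves signed zeros and, unlike the analogous fadd fold, needs no nsz. A minimal standalone C++ check of that claim (illustrative only, not part of the patch):

#include <cmath>
#include <cstdio>

int main() {
  // Whatever the sign of the zero input, the two-multiply chain and the
  // folded single multiply yield a zero of the same sign.
  for (float x : {+0.0f, -0.0f}) {
    float separate = (x * 2.0f) * 4.0f;
    float folded = x * 8.0f;
    std::printf("x=%+.1f separate_neg=%d folded_neg=%d\n", x,
                (int)std::signbit(separate), (int)std::signbit(folded));
  }
  return 0;
}

The negative-constant tests that follow exercise the same identity with sign flips; the product's sign still depends only on x's sign XOR'd with the constants' signs, so the fold stays exact.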
+ +; Can't reassociate anyway +define float @fmul(float %x) { +; CHECK-LABEL: define float @fmul( +; CHECK-SAME: float [[X:%.*]]) { +; CHECK-NEXT: [[FMUL0:%.*]] = fmul float [[X]], 2.000000e+00 +; CHECK-NEXT: [[FMUL1:%.*]] = fmul float [[FMUL0]], 4.000000e+00 +; CHECK-NEXT: ret float [[FMUL1]] +; + %fmul0 = fmul float %x, 2.0 + %fmul1 = fmul float %fmul0, 4.0 + ret float %fmul1 +} + +; Should be able to reassociate without nsz +; (+0 * 2) * 4 = +0 +; (-0 * 2) * 4 = -0 + +; (+0 * 8) = +0 +; (-0 * 8) = -0 +define float @fmul_reassoc(float %x) { +; CHECK-LABEL: define float @fmul_reassoc( +; CHECK-SAME: float [[X:%.*]]) { +; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc float [[X]], 8.000000e+00 +; CHECK-NEXT: ret float [[FMUL1]] +; + %fmul0 = fmul reassoc float %x, 2.0 + %fmul1 = fmul reassoc float %fmul0, 4.0 + ret float %fmul1 +} + +define <2 x float> @fmul_reassoc_v2(<2 x float> %x) { +; CHECK-LABEL: define <2 x float> @fmul_reassoc_v2( +; CHECK-SAME: <2 x float> [[X:%.*]]) { +; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc <2 x float> [[X]], splat (float 8.000000e+00) +; CHECK-NEXT: ret <2 x float> [[FMUL1]] +; + %fmul0 = fmul reassoc <2 x float> %x, splat (float 2.0) + %fmul1 = fmul reassoc <2 x float> %fmul0, splat (float 4.0) + ret <2 x float> %fmul1 +} + +; (+0 * 2) * -4 = -0 +; (-0 * 2) * -4 = +0 + +; (+0 * -8) = -0 +; (-0 * -8) = +0 +define float @fmul_reassoc_negative_0(float %x) { +; CHECK-LABEL: define float @fmul_reassoc_negative_0( +; CHECK-SAME: float [[X:%.*]]) { +; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc float [[X]], -8.000000e+00 +; CHECK-NEXT: ret float [[FMUL1]] +; + %fmul0 = fmul reassoc float %x, 2.0 + %fmul1 = fmul reassoc float %fmul0, -4.0 + ret float %fmul1 +} + +; (+0 * -2) * 4 = -0 +; (-0 * -2) * 4 = +0 + +; (+0 * -8) = -0 +; (-0 * -8) = +0 +define float @fmul_reassoc_negative_1(float %x) { +; CHECK-LABEL: define float @fmul_reassoc_negative_1( +; CHECK-SAME: float [[X:%.*]]) { +; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc float [[X]], -8.000000e+00 +; CHECK-NEXT: ret float [[FMUL1]] +; + %fmul0 = fmul reassoc float %x, -2.0 + %fmul1 = fmul reassoc float %fmul0, 4.0 + ret float %fmul1 +} + +; Does reassociate already, but unnecessarily requires nsz on both multiplies.
+define float @fmul_reassoc_nsz(float %x) { +; CHECK-LABEL: define float @fmul_reassoc_nsz( +; CHECK-SAME: float [[X:%.*]]) { +; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc nsz float [[X]], 8.000000e+00 +; CHECK-NEXT: ret float [[FMUL1]] +; + %fmul0 = fmul nsz reassoc float %x, 2.0 + %fmul1 = fmul nsz reassoc float %fmul0, 4.0 + ret float %fmul1 +} + +define float @fmul_reassoc_posk_neg0(float %x) { +; CHECK-LABEL: define float @fmul_reassoc_posk_neg0( +; CHECK-SAME: float [[X:%.*]]) { +; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc float [[X]], -0.000000e+00 +; CHECK-NEXT: ret float [[FMUL1]] +; + %fmul0 = fmul reassoc float %x, 4.0 + %fmul1 = fmul reassoc float %fmul0, -0.0 + ret float %fmul1 +} + +define float @fmul_reassoc_neg0_posk(float %x) { +; CHECK-LABEL: define float @fmul_reassoc_neg0_posk( +; CHECK-SAME: float [[X:%.*]]) { +; CHECK-NEXT: [[FMUL0:%.*]] = fmul reassoc float [[X]], -0.000000e+00 +; CHECK-NEXT: ret float [[FMUL0]] +; + %fmul0 = fmul reassoc float %x, -0.0 + %fmul1 = fmul reassoc float %fmul0, 4.0 + ret float %fmul1 +} diff --git a/llvm/test/Transforms/InstCombine/mul.ll b/llvm/test/Transforms/InstCombine/mul.ll index 0f3137cdd0be3..615f905b7e58a 100644 --- a/llvm/test/Transforms/InstCombine/mul.ll +++ b/llvm/test/Transforms/InstCombine/mul.ll @@ -2202,3 +2202,31 @@ define i8 @mul_not_nsw_nonneg(i8 %x, i8 %y) { %mul = mul i8 %x, %y ret i8 %mul } + +define i16 @mul_udiv_zext(i8 %x) { +; CHECK-LABEL: @mul_udiv_zext( +; CHECK-NEXT: [[X_FR:%.*]] = freeze i8 [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = urem i8 [[X_FR]], 15 +; CHECK-NEXT: [[NARROW:%.*]] = sub nuw i8 [[X_FR]], [[TMP1]] +; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[NARROW]] to i16 +; CHECK-NEXT: ret i16 [[ZEXT]] +; + %div = udiv i8 %x, 15 + %zext = zext i8 %div to i16 + %mul = mul i16 %zext, 15 + ret i16 %mul +} + +define i16 @mul_udiv_zext_uneq(i8 %x) { +; CHECK-LABEL: @mul_udiv_zext_uneq( +; CHECK-NEXT: [[DIV:%.*]] = udiv i8 [[X:%.*]], 20 +; CHECK-NEXT: [[NARROW:%.*]] = mul nuw i8 [[DIV]], 15 +; CHECK-NEXT: [[MUL:%.*]] = zext i8 [[NARROW]] to i16 +; CHECK-NEXT: ret i16 [[MUL]] +; + %div = udiv i8 %x, 20 + %zext = zext i8 %div to i16 + %mul = mul i16 %zext, 15 + ret i16 %mul +} + diff --git a/llvm/test/Transforms/InstSimplify/ptrtoaddr.ll b/llvm/test/Transforms/InstSimplify/ptrtoaddr.ll index d06b520931b92..eaccf15cd80f6 100644 --- a/llvm/test/Transforms/InstSimplify/ptrtoaddr.ll +++ b/llvm/test/Transforms/InstSimplify/ptrtoaddr.ll @@ -316,3 +316,85 @@ define ptr @gep_gep_inv_ptrtoaddr(ptr %p) { %gep2 = getelementptr i8, ptr %gep1, i64 %p.addr.inv ret ptr %gep2 } + +define i1 @icmp_ptrtoaddr_0() { +; CHECK-LABEL: define i1 @icmp_ptrtoaddr_0() { +; CHECK-NEXT: ret i1 true +; + %cmp = icmp ne i64 ptrtoaddr (ptr @g to i64), 0 + ret i1 %cmp +} + +; This fails to fold because we currently don't assume that globals are located +; at a non-null address for non-default address spaces. 
+define i1 @icmp_ptrtoaddr_0_addrsize() { +; CHECK-LABEL: define i1 @icmp_ptrtoaddr_0_addrsize() { +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32), 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %cmp = icmp ne i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32), 0 + ret i1 %cmp +} + +define i1 @icmp_ptrtoint_0_addrsize() { +; CHECK-LABEL: define i1 @icmp_ptrtoint_0_addrsize() { +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 ptrtoint (ptr addrspace(1) @g.as1 to i64), 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %cmp = icmp ne i64 ptrtoint (ptr addrspace(1) @g.as1 to i64), 0 + ret i1 %cmp +} + +define i1 @icmp_ptrtoaddr_ptrtoaddr() { +; CHECK-LABEL: define i1 @icmp_ptrtoaddr_ptrtoaddr() { +; CHECK-NEXT: ret i1 true +; + %cmp = icmp ne i64 ptrtoaddr (ptr @g to i64), ptrtoaddr (ptr @g2 to i64) + ret i1 %cmp +} + +define i1 @icmp_ptrtoaddr_ptrtoaddr_addrsize() { +; CHECK-LABEL: define i1 @icmp_ptrtoaddr_ptrtoaddr_addrsize() { +; CHECK-NEXT: ret i1 true +; + %cmp = icmp ne i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32), ptrtoaddr (ptr addrspace(1) @g2.as1 to i32) + ret i1 %cmp +} + +; This could still be folded because the address being non-equal also implies +; that all pointer bits together are non-equal. +define i1 @icmp_ptrtoint_ptrtoint_addrsize() { +; CHECK-LABEL: define i1 @icmp_ptrtoint_ptrtoint_addrsize() { +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 ptrtoint (ptr addrspace(1) @g.as1 to i64), ptrtoint (ptr addrspace(1) @g2.as1 to i64) +; CHECK-NEXT: ret i1 [[CMP]] +; + %cmp = icmp ne i64 ptrtoint (ptr addrspace(1) @g.as1 to i64), ptrtoint (ptr addrspace(1) @g2.as1 to i64) + ret i1 %cmp +} + +define i1 @icmp_relational_ptrtoaddr_ptrtoaddr() { +; CHECK-LABEL: define i1 @icmp_relational_ptrtoaddr_ptrtoaddr() { +; CHECK-NEXT: ret i1 true +; + %cmp = icmp ult i64 ptrtoaddr (ptr @g to i64), ptrtoaddr (ptr getelementptr inbounds (i8, ptr @g, i64 1) to i64) + ret i1 %cmp +} + +define i1 @icmp_relational_ptrtoaddr_ptrtoaddr_addrsize() { +; CHECK-LABEL: define i1 @icmp_relational_ptrtoaddr_ptrtoaddr_addrsize() { +; CHECK-NEXT: ret i1 true +; + %cmp = icmp ult i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32), ptrtoaddr (ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @g.as1, i32 1) to i32) + ret i1 %cmp +} + +; This could still be folded because we know that the non-address bits must be +; the same, as GEP does not modify them. 
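The folding arguments in these ptrtoaddr/ptrtoint comments can be pictured with a toy fat-pointer model in which only the low bits are the address: distinct addresses force distinct full bit patterns, and a GEP moves only the address bits. A standalone C++ sketch of that argument (the 64/32 bit split and all names are invented for illustration; the relational ptrtoint test it mirrors follows):

#include <cassert>
#include <cstdint>

// Toy pointer for an address space with 64-bit pointers but a 32-bit
// address: low 32 bits hold the address, high 32 bits hold metadata.
struct FatPtr {
  uint64_t bits;
  uint32_t addr() const { return static_cast<uint32_t>(bits); }
  FatPtr gep(uint32_t off) const {
    // A GEP advances only the address bits; metadata is untouched.
    return {(bits & 0xFFFFFFFF00000000ull) |
            (uint64_t)(uint32_t)(addr() + off)};
  }
};

int main() {
  FatPtr g{0xAABBCCDD00001000ull}, g2{0xAABBCCDD00002000ull};
  // Distinct addresses imply distinct full bit patterns, so `ne` on the
  // full ptrtoint values could fold just like `ne` on the address alone.
  assert(g.addr() != g2.addr());
  assert(g.bits != g2.bits);
  // An inbounds GEP of +1 leaves the metadata bits equal, so `ult` on
  // the full value agrees with `ult` on the address alone.
  FatPtr g1 = g.gep(1);
  assert((g.bits >> 32) == (g1.bits >> 32));
  assert(g.addr() < g1.addr() && g.bits < g1.bits);
  return 0;
}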
+define i1 @icmp_relational_ptrtoint_ptrtoint_addrsize() { +; CHECK-LABEL: define i1 @icmp_relational_ptrtoint_ptrtoint_addrsize() { +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 ptrtoint (ptr addrspace(1) @g.as1 to i64), ptrtoint (ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @g.as1, i64 1) to i64) +; CHECK-NEXT: ret i1 [[CMP]] +; + %cmp = icmp ult i64 ptrtoint (ptr addrspace(1) @g.as1 to i64), ptrtoint (ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @g.as1, i64 1) to i64) + ret i1 %cmp +} diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-rcpc-immo-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-rcpc-immo-instructions.s index cd3d7e0bf1b57..d9943f342b827 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-rcpc-immo-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-rcpc-immo-instructions.s @@ -10,15 +10,15 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 2 1 0.50 * ldapur w7, [x24] -# CHECK-NEXT: 2 1 0.50 * ldapur x20, [x13] -# CHECK-NEXT: 2 1 0.50 * ldapurb w13, [x17] -# CHECK-NEXT: 2 1 0.50 * ldapurh w3, [x22] -# CHECK-NEXT: 2 1 0.50 U ldapursb w7, [x8] -# CHECK-NEXT: 2 1 0.50 U ldapursb x29, [x7] -# CHECK-NEXT: 2 1 0.50 U ldapursh w17, [x19] -# CHECK-NEXT: 2 1 0.50 U ldapursh x3, [x3] -# CHECK-NEXT: 2 1 0.50 U ldapursw x3, [x18] +# CHECK-NEXT: 1 4 0.33 * ldapur w7, [x24] +# CHECK-NEXT: 1 4 0.33 * ldapur x20, [x13] +# CHECK-NEXT: 1 4 0.33 * ldapurb w13, [x17] +# CHECK-NEXT: 1 4 0.33 * ldapurh w3, [x22] +# CHECK-NEXT: 1 4 0.33 U ldapursb w7, [x8] +# CHECK-NEXT: 1 4 0.33 U ldapursb x29, [x7] +# CHECK-NEXT: 1 4 0.33 U ldapursh w17, [x19] +# CHECK-NEXT: 1 4 0.33 U ldapursh x3, [x3] +# CHECK-NEXT: 1 4 0.33 U ldapursw x3, [x18] # CHECK-NEXT: 2 1 0.50 * stlur w3, [x27] # CHECK-NEXT: 2 1 0.50 * stlur x23, [x25] # CHECK-NEXT: 2 1 0.50 * stlurb w30, [x17] @@ -41,19 +41,19 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] -# CHECK-NEXT: - - 6.50 6.50 - 6.50 6.50 - - - - - - +# CHECK-NEXT: - - 2.00 2.00 3.00 5.00 5.00 - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] Instructions: -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapur w7, [x24] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapur x20, [x13] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapurb w13, [x17] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapurh w3, [x22] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapursb w7, [x8] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapursb x29, [x7] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapursh w17, [x19] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapursh x3, [x3] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapursw x3, [x18] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapur w7, [x24] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapur x20, [x13] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapurb w13, [x17] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapurh w3, [x22] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapursb w7, [x8] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapursb x29, [x7] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapursh w17, [x19] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapursh x3, [x3] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapursw x3, [x18] 
# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - stlur w3, [x27] # CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - stlur x23, [x25] # CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - stlurb w30, [x17] diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-rcpc-immo-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-rcpc-immo-instructions.s index 6faa5e1f4db1b..d5302e96edf4e 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-rcpc-immo-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-rcpc-immo-instructions.s @@ -10,15 +10,15 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 2 1 0.50 * ldapur w7, [x24] -# CHECK-NEXT: 2 1 0.50 * ldapur x20, [x13] -# CHECK-NEXT: 2 1 0.50 * ldapurb w13, [x17] -# CHECK-NEXT: 2 1 0.50 * ldapurh w3, [x22] -# CHECK-NEXT: 2 1 0.50 U ldapursb w7, [x8] -# CHECK-NEXT: 2 1 0.50 U ldapursb x29, [x7] -# CHECK-NEXT: 2 1 0.50 U ldapursh w17, [x19] -# CHECK-NEXT: 2 1 0.50 U ldapursh x3, [x3] -# CHECK-NEXT: 2 1 0.50 U ldapursw x3, [x18] +# CHECK-NEXT: 1 4 0.33 * ldapur w7, [x24] +# CHECK-NEXT: 1 4 0.33 * ldapur x20, [x13] +# CHECK-NEXT: 1 4 0.33 * ldapurb w13, [x17] +# CHECK-NEXT: 1 4 0.33 * ldapurh w3, [x22] +# CHECK-NEXT: 1 4 0.33 U ldapursb w7, [x8] +# CHECK-NEXT: 1 4 0.33 U ldapursb x29, [x7] +# CHECK-NEXT: 1 4 0.33 U ldapursh w17, [x19] +# CHECK-NEXT: 1 4 0.33 U ldapursh x3, [x3] +# CHECK-NEXT: 1 4 0.33 U ldapursw x3, [x18] # CHECK-NEXT: 2 1 0.50 * stlur w3, [x27] # CHECK-NEXT: 2 1 0.50 * stlur x23, [x25] # CHECK-NEXT: 2 1 0.50 * stlurb w30, [x17] @@ -41,19 +41,19 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] -# CHECK-NEXT: - - 6.50 6.50 - 6.50 6.50 - - - - - - +# CHECK-NEXT: - - 2.00 2.00 3.00 5.00 5.00 - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] Instructions: -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapur w7, [x24] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapur x20, [x13] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapurb w13, [x17] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapurh w3, [x22] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapursb w7, [x8] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapursb x29, [x7] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapursh w17, [x19] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapursh x3, [x3] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapursw x3, [x18] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapur w7, [x24] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapur x20, [x13] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapurb w13, [x17] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapurh w3, [x22] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapursb w7, [x8] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapursb x29, [x7] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapursh w17, [x19] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapursh x3, [x3] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapursw x3, [x18] # CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - stlur w3, [x27] # CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - stlur x23, [x25] # CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - stlurb w30, [x17] diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-rcpc-immo-instructions.s 
b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-rcpc-immo-instructions.s index 5c9b43a0e5121..dcea382de5fa9 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-rcpc-immo-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-rcpc-immo-instructions.s @@ -10,15 +10,15 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 2 1 0.50 * ldapur w7, [x24] -# CHECK-NEXT: 2 1 0.50 * ldapur x20, [x13] -# CHECK-NEXT: 2 1 0.50 * ldapurb w13, [x17] -# CHECK-NEXT: 2 1 0.50 * ldapurh w3, [x22] -# CHECK-NEXT: 2 1 0.50 U ldapursb w7, [x8] -# CHECK-NEXT: 2 1 0.50 U ldapursb x29, [x7] -# CHECK-NEXT: 2 1 0.50 U ldapursh w17, [x19] -# CHECK-NEXT: 2 1 0.50 U ldapursh x3, [x3] -# CHECK-NEXT: 2 1 0.50 U ldapursw x3, [x18] +# CHECK-NEXT: 1 4 0.33 * ldapur w7, [x24] +# CHECK-NEXT: 1 4 0.33 * ldapur x20, [x13] +# CHECK-NEXT: 1 4 0.33 * ldapurb w13, [x17] +# CHECK-NEXT: 1 4 0.33 * ldapurh w3, [x22] +# CHECK-NEXT: 1 4 0.33 U ldapursb w7, [x8] +# CHECK-NEXT: 1 4 0.33 U ldapursb x29, [x7] +# CHECK-NEXT: 1 4 0.33 U ldapursh w17, [x19] +# CHECK-NEXT: 1 4 0.33 U ldapursh x3, [x3] +# CHECK-NEXT: 1 4 0.33 U ldapursw x3, [x18] # CHECK-NEXT: 2 1 0.50 * stlur w3, [x27] # CHECK-NEXT: 2 1 0.50 * stlur x23, [x25] # CHECK-NEXT: 2 1 0.50 * stlurb w30, [x17] @@ -46,19 +46,19 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7.0] [7.1] [8] [9] [10] [11] -# CHECK-NEXT: - - 6.50 6.50 - - - - 6.50 6.50 - - - - - - - - +# CHECK-NEXT: - - 2.00 2.00 - - - 3.00 5.00 5.00 - - - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7.0] [7.1] [8] [9] [10] [11] Instructions: -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - ldapur w7, [x24] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - ldapur x20, [x13] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - ldapurb w13, [x17] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - ldapurh w3, [x22] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - ldapursb w7, [x8] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - ldapursb x29, [x7] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - ldapursh w17, [x19] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - ldapursh x3, [x3] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - ldapursw x3, [x18] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - ldapur w7, [x24] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - ldapur x20, [x13] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - ldapurb w13, [x17] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - ldapurh w3, [x22] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - ldapursb w7, [x8] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - ldapursb x29, [x7] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - ldapursh w17, [x19] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - ldapursh x3, [x3] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - ldapursw x3, [x18] # CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - stlur w3, [x27] # CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - stlur x23, [x25] # CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - stlurb w30, [x17] diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-rcpc-immo-instructions.s 
b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-rcpc-immo-instructions.s index 71fd689522215..dfcc202192392 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-rcpc-immo-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-rcpc-immo-instructions.s @@ -10,15 +10,15 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 2 1 0.50 * ldapur w7, [x24] -# CHECK-NEXT: 2 1 0.50 * ldapur x20, [x13] -# CHECK-NEXT: 2 1 0.50 * ldapurb w13, [x17] -# CHECK-NEXT: 2 1 0.50 * ldapurh w3, [x22] -# CHECK-NEXT: 2 1 0.50 U ldapursb w7, [x8] -# CHECK-NEXT: 2 1 0.50 U ldapursb x29, [x7] -# CHECK-NEXT: 2 1 0.50 U ldapursh w17, [x19] -# CHECK-NEXT: 2 1 0.50 U ldapursh x3, [x3] -# CHECK-NEXT: 2 1 0.50 U ldapursw x3, [x18] +# CHECK-NEXT: 1 4 0.33 * ldapur w7, [x24] +# CHECK-NEXT: 1 4 0.33 * ldapur x20, [x13] +# CHECK-NEXT: 1 4 0.33 * ldapurb w13, [x17] +# CHECK-NEXT: 1 4 0.33 * ldapurh w3, [x22] +# CHECK-NEXT: 1 4 0.33 U ldapursb w7, [x8] +# CHECK-NEXT: 1 4 0.33 U ldapursb x29, [x7] +# CHECK-NEXT: 1 4 0.33 U ldapursh w17, [x19] +# CHECK-NEXT: 1 4 0.33 U ldapursh x3, [x3] +# CHECK-NEXT: 1 4 0.33 U ldapursw x3, [x18] # CHECK-NEXT: 2 1 0.50 * stlur w3, [x27] # CHECK-NEXT: 2 1 0.50 * stlur x23, [x25] # CHECK-NEXT: 2 1 0.50 * stlurb w30, [x17] @@ -48,19 +48,19 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] -# CHECK-NEXT: - - 6.50 6.50 - - - - 6.50 6.50 - - - - - - - - - - +# CHECK-NEXT: - - 2.00 2.00 - - - 3.00 5.00 5.00 - - - - - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] Instructions: -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - ldapur w7, [x24] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - ldapur x20, [x13] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - ldapurb w13, [x17] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - ldapurh w3, [x22] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - ldapursb w7, [x8] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - ldapursb x29, [x7] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - ldapursh w17, [x19] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - ldapursh x3, [x3] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - ldapursw x3, [x18] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - ldapur w7, [x24] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - ldapur x20, [x13] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - ldapurb w13, [x17] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - ldapurh w3, [x22] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - ldapursb w7, [x8] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - ldapursb x29, [x7] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - ldapursh w17, [x19] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - ldapursh x3, [x3] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - ldapursw x3, [x18] # CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - stlur w3, [x27] # CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - stlur x23, [x25] # CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - stlurb w30, [x17] diff 
--git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V3-rcpc-immo-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V3-rcpc-immo-instructions.s index a48978ce8b94d..4fff7670058bb 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V3-rcpc-immo-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V3-rcpc-immo-instructions.s @@ -10,15 +10,15 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 2 1 0.50 * ldapur w7, [x24] -# CHECK-NEXT: 2 1 0.50 * ldapur x20, [x13] -# CHECK-NEXT: 2 1 0.50 * ldapurb w13, [x17] -# CHECK-NEXT: 2 1 0.50 * ldapurh w3, [x22] -# CHECK-NEXT: 2 1 0.50 U ldapursb w7, [x8] -# CHECK-NEXT: 2 1 0.50 U ldapursb x29, [x7] -# CHECK-NEXT: 2 1 0.50 U ldapursh w17, [x19] -# CHECK-NEXT: 2 1 0.50 U ldapursh x3, [x3] -# CHECK-NEXT: 2 1 0.50 U ldapursw x3, [x18] +# CHECK-NEXT: 1 4 0.33 * ldapur w7, [x24] +# CHECK-NEXT: 1 4 0.33 * ldapur x20, [x13] +# CHECK-NEXT: 1 4 0.33 * ldapurb w13, [x17] +# CHECK-NEXT: 1 4 0.33 * ldapurh w3, [x22] +# CHECK-NEXT: 1 4 0.33 U ldapursb w7, [x8] +# CHECK-NEXT: 1 4 0.33 U ldapursb x29, [x7] +# CHECK-NEXT: 1 4 0.33 U ldapursh w17, [x19] +# CHECK-NEXT: 1 4 0.33 U ldapursh x3, [x3] +# CHECK-NEXT: 1 4 0.33 U ldapursw x3, [x18] # CHECK-NEXT: 2 1 0.50 * stlur w3, [x27] # CHECK-NEXT: 2 1 0.50 * stlur x23, [x25] # CHECK-NEXT: 2 1 0.50 * stlurb w30, [x17] @@ -53,19 +53,19 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [0.2] [1.0] [1.1] [2.0] [2.1] [2.2] [2.3] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15] [16] [17] -# CHECK-NEXT: - - - 6.50 6.50 - - - - - - 6.50 - - - - - - - - 6.50 - - - - +# CHECK-NEXT: - - - 2.00 2.00 - - - - 3.00 3.00 5.00 - - - - - - - - 2.00 - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [0.2] [1.0] [1.1] [2.0] [2.1] [2.2] [2.3] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15] [16] [17] Instructions: -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - ldapur w7, [x24] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - ldapur x20, [x13] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - ldapurb w13, [x17] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - ldapurh w3, [x22] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - ldapursb w7, [x8] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - ldapursb x29, [x7] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - ldapursh w17, [x19] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - ldapursh x3, [x3] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - ldapursw x3, [x18] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - - - ldapur w7, [x24] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - - - ldapur x20, [x13] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - - - ldapurb w13, [x17] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - - - ldapurh w3, [x22] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - - - ldapursb w7, [x8] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - - - ldapursb x29, [x7] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - - - ldapursh w17, [x19] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - - - ldapursh x3, [x3] +# CHECK-NEXT: 
- - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - - - ldapursw x3, [x18] # CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - stlur w3, [x27] # CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - stlur x23, [x25] # CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - stlurb w30, [x17] diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V3AE-rcpc-immo-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V3AE-rcpc-immo-instructions.s index f801a18bc7a06..dc064d6ea3f3f 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V3AE-rcpc-immo-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V3AE-rcpc-immo-instructions.s @@ -10,15 +10,15 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 2 1 0.50 * ldapur w7, [x24] -# CHECK-NEXT: 2 1 0.50 * ldapur x20, [x13] -# CHECK-NEXT: 2 1 0.50 * ldapurb w13, [x17] -# CHECK-NEXT: 2 1 0.50 * ldapurh w3, [x22] -# CHECK-NEXT: 2 1 0.50 U ldapursb w7, [x8] -# CHECK-NEXT: 2 1 0.50 U ldapursb x29, [x7] -# CHECK-NEXT: 2 1 0.50 U ldapursh w17, [x19] -# CHECK-NEXT: 2 1 0.50 U ldapursh x3, [x3] -# CHECK-NEXT: 2 1 0.50 U ldapursw x3, [x18] +# CHECK-NEXT: 1 4 0.33 * ldapur w7, [x24] +# CHECK-NEXT: 1 4 0.33 * ldapur x20, [x13] +# CHECK-NEXT: 1 4 0.33 * ldapurb w13, [x17] +# CHECK-NEXT: 1 4 0.33 * ldapurh w3, [x22] +# CHECK-NEXT: 1 4 0.33 U ldapursb w7, [x8] +# CHECK-NEXT: 1 4 0.33 U ldapursb x29, [x7] +# CHECK-NEXT: 1 4 0.33 U ldapursh w17, [x19] +# CHECK-NEXT: 1 4 0.33 U ldapursh x3, [x3] +# CHECK-NEXT: 1 4 0.33 U ldapursw x3, [x18] # CHECK-NEXT: 2 1 0.50 * stlur w3, [x27] # CHECK-NEXT: 2 1 0.50 * stlur x23, [x25] # CHECK-NEXT: 2 1 0.50 * stlurb w30, [x17] @@ -51,19 +51,19 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [0.2] [1.0] [1.1] [2.0] [2.1] [2.2] [2.3] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15] -# CHECK-NEXT: - - - 6.50 6.50 - - - - - - 6.50 - - - - - - - - 6.50 - - +# CHECK-NEXT: - - - 2.00 2.00 - - - - 3.00 3.00 5.00 - - - - - - - - 2.00 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [0.2] [1.0] [1.1] [2.0] [2.1] [2.2] [2.3] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15] Instructions: -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - ldapur w7, [x24] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - ldapur x20, [x13] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - ldapurb w13, [x17] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - ldapurh w3, [x22] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - ldapursb w7, [x8] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - ldapursb x29, [x7] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - ldapursh w17, [x19] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - ldapursh x3, [x3] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - ldapursw x3, [x18] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - ldapur w7, [x24] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - ldapur x20, [x13] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - ldapurb w13, [x17] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - ldapurh w3, [x22] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - ldapursb w7, [x8] +# 
CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - ldapursb x29, [x7] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - ldapursh w17, [x19] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - ldapursh x3, [x3] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - ldapursw x3, [x18] # CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - stlur w3, [x27] # CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - stlur x23, [x25] # CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - stlurb w30, [x17] diff --git a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp index 57e15a48c0bff..898141cbcf978 100644 --- a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp +++ b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp @@ -105,7 +105,7 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) { "E-m:e-Fn32-i64:64-i128:128-n32:64"); EXPECT_EQ( UpgradeDataLayoutString("E-m:a-Fi64-i64:64-n32:64", "powerpc64-ibm-aix"), - "E-m:a-Fi64-i64:64-i128:128-n32:64"); + "E-m:a-Fi64-i64:64-i128:128-n32:64-f64:32:64"); // Check that WebAssembly targets add -i128:128. EXPECT_EQ( @@ -189,6 +189,16 @@ TEST(DataLayoutUpgradeTest, NoDataLayoutUpgrade) { "E-m:e-Fn32-i64:64-n32"); EXPECT_EQ(UpgradeDataLayoutString("E-m:a-Fi64-i64:64-n32", "powerpc-aix"), "E-m:a-Fi64-i64:64-n32"); + + EXPECT_EQ(UpgradeDataLayoutString("E-m:a-p:32:32-Fi32-i64:64-n32", + "powerpc-unknown-aix"), + "E-m:a-p:32:32-Fi32-i64:64-n32-f64:32:64"); + EXPECT_EQ( + UpgradeDataLayoutString( + "E-m:a-Fi64-i64:64-i128:128-n32:64-S128-v256:256:256-v512:512:512", + "powerpc64-unknown-aix"), + "E-m:a-Fi64-i64:64-i128:128-n32:64-f64:32:64-S128-v256:256:256-v512:512:" + "512"); } TEST(DataLayoutUpgradeTest, EmptyDataLayout) { diff --git a/llvm/utils/TableGen/Common/InfoByHwMode.cpp b/llvm/utils/TableGen/Common/InfoByHwMode.cpp index a3f8909c36090..4ab27a610249d 100644 --- a/llvm/utils/TableGen/Common/InfoByHwMode.cpp +++ b/llvm/utils/TableGen/Common/InfoByHwMode.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" #include @@ -32,10 +33,12 @@ std::string llvm::getModeName(unsigned Mode) { ValueTypeByHwMode::ValueTypeByHwMode(const Record *R, const CodeGenHwModes &CGH) : InfoByHwMode(R) { const HwModeSelect &MS = CGH.getHwModeSelect(R); - for (const HwModeSelect::PairType &P : MS.Items) { - auto I = Map.try_emplace(P.first, MVT(llvm::getValueType(P.second))); - assert(I.second && "Duplicate entry?"); - (void)I; + for (auto [ModeID, VT] : MS.Items) { + assert(VT && VT->isSubClassOf("ValueType")); + if (!Map.try_emplace(ModeID, MVT(llvm::getValueType(VT))).second) + PrintFatalError(R->getLoc(), "duplicate ValueType entry for HwMode " + + CGH.getModeName(ModeID, true) + ": " + + VT->getName()); } if (R->isSubClassOf("PtrValueType")) PtrAddrSpace = R->getValueAsInt("AddrSpace"); @@ -143,10 +146,12 @@ RegSizeInfoByHwMode::RegSizeInfoByHwMode(const Record *R, const CodeGenHwModes &CGH) : InfoByHwMode(R) { const HwModeSelect &MS = CGH.getHwModeSelect(R); - for (const HwModeSelect::PairType &P : MS.Items) { - auto I = Map.try_emplace(P.first, RegSizeInfo(P.second)); - assert(I.second && "Duplicate entry?"); - (void)I; + for (auto [ModeID, RegInfo] : MS.Items) { + assert(RegInfo && RegInfo->isSubClassOf("RegInfo")); + if (!Map.try_emplace(ModeID, RegSizeInfo(RegInfo)).second) + 
PrintFatalError(R->getLoc(), "duplicate RegInfo entry for HwMode " + + CGH.getModeName(ModeID, true) + ": " + + RegInfo->getName()); } } @@ -198,7 +203,9 @@ RegClassByHwMode::RegClassByHwMode(const Record *R, const CodeGenHwModes &CGH, "Register class must subclass RegisterClass"); const CodeGenRegisterClass *RegClass = RegBank.getRegClass(RegClassRec); if (!Map.try_emplace(ModeID, RegClass).second) - llvm_unreachable("duplicate entry"); + PrintFatalError(R->getLoc(), "duplicate RegisterClass entry for HwMode " + + CGH.getModeName(ModeID, true) + ": " + + RegClass->getName()); } } @@ -211,10 +218,12 @@ SubRegRangeByHwMode::SubRegRangeByHwMode(const Record *R, const CodeGenHwModes &CGH) : InfoByHwMode(R) { const HwModeSelect &MS = CGH.getHwModeSelect(R); - for (const HwModeSelect::PairType &P : MS.Items) { - auto I = Map.try_emplace(P.first, SubRegRange(P.second)); - assert(I.second && "Duplicate entry?"); - (void)I; + for (auto [ModeID, Range] : MS.Items) { + assert(Range && Range->isSubClassOf("SubRegRange")); + if (!Map.try_emplace(ModeID, SubRegRange(Range)).second) + PrintFatalError(R->getLoc(), "duplicate SubRegRange entry for HwMode " + + CGH.getModeName(ModeID, true) + ": " + + Range->getName()); } } @@ -222,12 +231,14 @@ EncodingInfoByHwMode::EncodingInfoByHwMode(const Record *R, const CodeGenHwModes &CGH) : InfoByHwMode(R) { const HwModeSelect &MS = CGH.getHwModeSelect(R); - for (const HwModeSelect::PairType &P : MS.Items) { - assert(P.second && P.second->isSubClassOf("InstructionEncoding") && + for (auto [ModeID, Encoding] : MS.Items) { + assert(Encoding && Encoding->isSubClassOf("InstructionEncoding") && "Encoding must subclass InstructionEncoding"); - auto I = Map.try_emplace(P.first, P.second); - assert(I.second && "Duplicate entry?"); - (void)I; + if (!Map.try_emplace(ModeID, Encoding).second) + PrintFatalError(R->getLoc(), + "duplicate InstructionEncoding entry for HwMode " + + CGH.getModeName(ModeID, true) + ": " + + Encoding->getName()); } } diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index a0a00513d7da5..51d310970fda9 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -543,7 +543,7 @@ def NVVM_NanosleepOp : NVVM_Op<"nanosleep">, // NVVM Performance Monitor events //===----------------------------------------------------------------------===// -def NVVM_PMEventOp : NVVM_PTXBuilder_Op<"pmevent">, +def NVVM_PMEventOp : NVVM_Op<"pmevent">, Arguments<(ins OptionalAttr:$maskedEventId, OptionalAttr:$eventId)> { let summary = "Trigger one or more Performance Monitor events."; @@ -561,20 +561,20 @@ def NVVM_PMEventOp : NVVM_PTXBuilder_Op<"pmevent">, [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent) }]; - string llvmBuilder = [{ - llvm::Value *mId = builder.getInt16(* $maskedEventId); - createIntrinsicCall(builder, llvm::Intrinsic::nvvm_pm_event_mask, {mId}); - }]; - let assemblyFormat = "attr-dict (`id` `=` $eventId^)? 
(`mask` `=` $maskedEventId^)?"; + let hasVerifier = 1; let extraClassDeclaration = [{ - bool hasIntrinsic() { return !getEventId(); } + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); }]; - let extraClassDefinition = [{ - std::string $cppClass::getPtx() { return std::string("pmevent %0;"); } + + string llvmBuilder = [{ + auto [id, args] = NVVM::PMEventOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, id, args); }]; - let hasVerifier = 1; } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 89197ec2f50b6..fd84ed6399d5d 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -2476,6 +2476,25 @@ mlir::NVVM::IDArgPair NVVM::BarrierOp::getIntrinsicIDAndArgs( return {id, std::move(args)}; } +mlir::NVVM::IDArgPair +PMEventOp::getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder) { + auto thisOp = cast(op); + llvm::Type *i16Ty = llvm::Type::getInt16Ty(mt.getLLVMContext()); + + // With event-id, mask is generated as (1 << event-id) + llvm::Value *maskVal; + if (auto eventAttr = thisOp.getEventIdAttr()) { + uint16_t mask = static_cast(1u << eventAttr.getInt()); + maskVal = llvm::ConstantInt::get(i16Ty, mask); + } else { + maskVal = + llvm::ConstantInt::get(i16Ty, thisOp.getMaskedEventIdAttr().getValue()); + } + + return {llvm::Intrinsic::nvvm_pm_event_mask, {maskVal}}; +} + mlir::NVVM::IDArgPair MBarrierInitOp::getIntrinsicIDAndArgs( Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { auto thisOp = cast(op); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp index c2485a08932dd..bbfbd2e9736a1 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp @@ -279,6 +279,17 @@ static FailureOr specializeLinalgConvolutions(RewriterBase &rewriter, CONV_OP_SPECIALIZER(linalg::Conv1DNwcWcfOp); CONV_OP_SPECIALIZER(linalg::Conv1DNcwFcwOp); CONV_OP_SPECIALIZER(linalg::Conv2DOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNhwcHwcfOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNhwcHwcfQOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNhwcFhwcOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNhwcFhwcQOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNchwFchwOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNchwFchwQOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNgchwFgchwOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNgchwGfchwOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNgchwGfchwQOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNhwgcGfhwcOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNhwgcGfhwcQOp); CONV_OP_SPECIALIZER(linalg::Conv3DOp); // ----------------------------- // Depthwise Convolution ops. @@ -287,6 +298,10 @@ static FailureOr specializeLinalgConvolutions(RewriterBase &rewriter, CONV_OP_SPECIALIZER(linalg::DepthwiseConv1DNwcWcOp); CONV_OP_SPECIALIZER(linalg::DepthwiseConv1DNwcWcmOp); CONV_OP_SPECIALIZER(linalg::DepthwiseConv2DNchwChwOp); + CONV_OP_SPECIALIZER(linalg::DepthwiseConv2DNhwcHwcOp); + CONV_OP_SPECIALIZER(linalg::DepthwiseConv2DNhwcHwcQOp); + CONV_OP_SPECIALIZER(linalg::DepthwiseConv2DNhwcHwcmOp); + CONV_OP_SPECIALIZER(linalg::DepthwiseConv2DNhwcHwcmQOp); CONV_OP_SPECIALIZER(linalg::DepthwiseConv3DNdhwcDhwcmOp); // ----------------------------- // Pooling ops. 
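Looping back to the PMEventOp lowering above: getIntrinsicIDAndArgs now always selects the nvvm_pm_event_mask intrinsic, synthesizing a one-hot mask from a plain event id and passing a pre-built mask through untouched. A compact standalone C++ sketch of just that mask selection (the exactly-one-attribute precondition is an assumption mirroring what the op's verifier is expected to enforce):

#include <cstdint>
#include <cstdio>
#include <optional>

// Mirrors the mask logic in PMEventOp::getIntrinsicIDAndArgs: a plain
// event id becomes a one-hot mask, a pre-built mask is used as-is.
static uint16_t pmEventMask(std::optional<uint16_t> maskedEventId,
                            std::optional<unsigned> eventId) {
  if (eventId)
    return static_cast<uint16_t>(1u << *eventId); // e.g. id 3 -> 0x0008
  return *maskedEventId;
}

int main() {
  std::printf("0x%04x\n", (unsigned)pmEventMask(std::nullopt, 3u));
  std::printf("0x%04x\n", (unsigned)pmEventMask(0x00a1, std::nullopt));
  return 0;
}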
diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp index 01e6e1e248658..1244be90390e2 100644 --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -240,8 +240,8 @@ bool isReductionIterator(utils::IteratorType iteratorType) { //===----------------------------------------------------------------------===// /// Returns the BlockArgument that leads to `val`, if any. Traverses optional -/// ext* ops. -static BlockArgument getBlockArgumentWithOptionalExtOps(Value val) { +/// ext*/sitofp ops. +static BlockArgument getBlockArgumentWithOptionalCastOps(Value val) { BlockArgument blockArg = dyn_cast(val); if ((blockArg)) return blockArg; @@ -249,18 +249,82 @@ static BlockArgument getBlockArgumentWithOptionalExtOps(Value val) { Operation *defOp = val.getDefiningOp(); if (!dyn_cast_if_present(defOp) && !dyn_cast_if_present(defOp) && - !dyn_cast_if_present(defOp)) { + !dyn_cast_if_present(defOp) && + !dyn_cast_if_present(defOp)) { return nullptr; } return dyn_cast(defOp->getOperand(0)); } +/// Utility function to match the zero point offset body of quantized +/// convolution ops. +/// +/// Quantized convolutions have a body of the form: +/// %out + ((%input - %inputZp) * (%filter - %filterZp)) +/// where: +/// - %input is the input tensor element (block arg 0) +/// - %filter is the filter tensor element (block arg 1) +/// - %inputZp is the input zero-point scalar (block arg 2) +/// - %filterZp is the filter zero-point scalar (block arg 3) +/// - %out is the output accumulator (block arg 4) +/// +/// This function verifies that the multiplication operands are subtraction +/// operations matching this pattern. +static bool bodyMatcherForZeroPointOffsets(Operation *addOp, Operation *mulOp, + Block *body) { + // The multiplication should have two subtraction operands: + // one for (input - inputZp) and one for (filter - filterZp). + Operation *inputSubOp = mulOp->getOperand(0).getDefiningOp(); + if (!isa_and_present(inputSubOp)) + return false; + + Operation *filterSubOp = mulOp->getOperand(1).getDefiningOp(); + if (!isa_and_present(filterSubOp)) + return false; + + // Extract block arguments from subtraction operands. + BlockArgument inputBlockArg = + getBlockArgumentWithOptionalCastOps(inputSubOp->getOperand(0)); + BlockArgument inputZpBlockArg = + getBlockArgumentWithOptionalCastOps(inputSubOp->getOperand(1)); + BlockArgument filterBlockArg = + getBlockArgumentWithOptionalCastOps(filterSubOp->getOperand(0)); + BlockArgument filterZpBlockArg = + getBlockArgumentWithOptionalCastOps(filterSubOp->getOperand(1)); + BlockArgument outBlockArg = + getBlockArgumentWithOptionalCastOps(addOp->getOperand(0)); + + // Verify all block arguments are valid. + if (!inputBlockArg || !inputZpBlockArg || !filterBlockArg || + !filterZpBlockArg || !outBlockArg) + return false; + + // Verify all block arguments belong to the convolution body. 
+ if (inputBlockArg.getOwner() != body || inputZpBlockArg.getOwner() != body || + filterBlockArg.getOwner() != body || + filterZpBlockArg.getOwner() != body || outBlockArg.getOwner() != body) + return false; + + // Verify block arguments have expected indices: + // arg0: input, arg1: filter, arg2: inputZp, arg3: filterZp, arg4: output + if (inputBlockArg.getArgNumber() != 0 || filterBlockArg.getArgNumber() != 1 || + inputZpBlockArg.getArgNumber() != 2 || + filterZpBlockArg.getArgNumber() != 3 || outBlockArg.getArgNumber() != 4) + return false; + + return true; +} + /// Utility to match block body for convolution ops. /// The body is thus expected to yield :- /// %out + (%lhs * %rhs) /// where: %lhs, %rhs and %out are block arguments and /// %lhs and %rhs can have optional upcast operation. -static bool bodyMatcherForConvolutionOps(Value yieldVal, Block *body) { +/// NOTE: In case of zero point offset convolution ops, %lhs and %rhs would be :- +/// %input - %input_scalar +/// where %input_scalar can have an optional upcast operation. +static bool bodyMatcherForConvolutionOps(Value yieldVal, Block *body, + bool containsZeroPointOffset = false) { Operation *addOp = yieldVal.getDefiningOp(); if (!isa_and_present(addOp)) return false; @@ -269,12 +333,15 @@ static bool bodyMatcherForConvolutionOps(Value yieldVal, Block *body) { if (!isa_and_present(mulOp)) return false; + if (containsZeroPointOffset) { + return bodyMatcherForZeroPointOffsets(addOp, mulOp, body); + } BlockArgument lhsBlockArg = - getBlockArgumentWithOptionalExtOps(mulOp->getOperand(0)); + getBlockArgumentWithOptionalCastOps(mulOp->getOperand(0)); BlockArgument rhsBlockArg = - getBlockArgumentWithOptionalExtOps(mulOp->getOperand(1)); + getBlockArgumentWithOptionalCastOps(mulOp->getOperand(1)); BlockArgument outBlockArg = - getBlockArgumentWithOptionalExtOps(addOp->getOperand(0)); + getBlockArgumentWithOptionalCastOps(addOp->getOperand(0)); if (!lhsBlockArg || !rhsBlockArg || !outBlockArg || lhsBlockArg.getOwner() != body || rhsBlockArg.getOwner() != body || outBlockArg.getOwner() != body || lhsBlockArg.getArgNumber() != 0 || @@ -291,9 +358,9 @@ static bool bodyMatcherForPoolOps(Value yieldVal, Block *body) { return false; BlockArgument lhsArg = - getBlockArgumentWithOptionalExtOps(defOp->getOperand(0)); + getBlockArgumentWithOptionalCastOps(defOp->getOperand(0)); BlockArgument rhsArg = - getBlockArgumentWithOptionalExtOps(defOp->getOperand(1)); + getBlockArgumentWithOptionalCastOps(defOp->getOperand(1)); if (!lhsArg || !rhsArg || lhsArg.getOwner() != body || rhsArg.getOwner() != body || lhsArg.getArgNumber() != 2 || rhsArg.getArgNumber() != 0) @@ -502,14 +569,15 @@ class ConvMatcherBuilder { } /// Match body pattern. This should be called last.
- bool matchBody() { + bool matchBody(bool zeroPointOffset = false) { if (!matched) return false; Block *body = op.getBlock(); auto yieldOp = cast(body->getTerminator()); switch (poolingType) { case PoolingType::None: - return bodyMatcherForConvolutionOps(yieldOp.getOperand(0), body); + return bodyMatcherForConvolutionOps(yieldOp.getOperand(0), body, + zeroPointOffset); case PoolingType::MaxSigned: return bodyMatcherForMaxSignedPoolOps(yieldOp.getOperand(0), body); case PoolingType::MaxUnsigned: @@ -634,6 +702,361 @@ bool isaConvolutionOpOfType(LinalgOp op, .matchBody(); } +// #inputMap = affine_map<(N, H, W, F, h, w, c) -> (N, H + h, W + w, c)> +// #filterMap = affine_map<(N, H, W, F, h, w, c) -> (h, w, c, F)> +// #outputMap = affine_map<(N, H, W, F, h, w, c) -> (N, H, W, F)> +template <> +bool isaConvolutionOpOfType( + LinalgOp op, SmallVector *dilations, + SmallVector *strides) { + if (isa(op)) + return true; + + assert(isaConvolutionOpInterface(op) && + "expected op to implement ConvolutionOpInterface"); + + ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides); + AffineExpr N = m.dim(0); + AffineExpr H = m.dim(1); + AffineExpr W = m.dim(2); + AffineExpr F = m.dim(3); + AffineExpr h = m.dim(4); + AffineExpr w = m.dim(5); + AffineExpr c = m.dim(6); + + return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) + .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), c}, + /*filterMap=*/{h, w, c, F}, + /*outputMap=*/{N, H, W, F}}) + .matchBody(); +} + +// #inputMap = affine_map<(N, H, W, F, h, w, c) -> (N, H + h, W + w, c)> +// #filterMap = affine_map<(N, H, W, F, h, w, c) -> (h, w, c, F)> +// #scalarMap = affine_map<(N, H, W, F, h, w, c) -> ()> +// #outputMap = affine_map<(N, H, W, F, h, w, c) -> (N, H, W, F)> +template <> +bool isaConvolutionOpOfType( + LinalgOp op, SmallVector *dilations, + SmallVector *strides) { + if (isa(op)) + return true; + + assert(isaConvolutionOpInterface(op) && + "expected op to implement ConvolutionOpInterface"); + + ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides); + AffineExpr N = m.dim(0); + AffineExpr H = m.dim(1); + AffineExpr W = m.dim(2); + AffineExpr F = m.dim(3); + AffineExpr h = m.dim(4); + AffineExpr w = m.dim(5); + AffineExpr c = m.dim(6); + + return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) + .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), c}, + /*filterMap=*/{h, w, c, F}, + /*scalarMap=*/{}, + /*scalarMap=*/{}, + /*outputMap=*/{N, H, W, F}}) + .matchBody(/*zeroPointOffset=*/true); +} + +// #inputMap = affine_map<(N, H, W, F, h, w, c) -> (N, H + h, W + w, c)> +// #filterMap = affine_map<(N, H, W, F, h, w, c) -> (F, h, w, c)> +// #outputMap = affine_map<(N, H, W, F, h, w, c) -> (N, H, W, F)> +template <> +bool isaConvolutionOpOfType( + LinalgOp op, SmallVector *dilations, + SmallVector *strides) { + if (isa(op)) + return true; + + assert(isaConvolutionOpInterface(op) && + "expected op to implement ConvolutionOpInterface"); + + ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides); + AffineExpr N = m.dim(0); + AffineExpr H = m.dim(1); + AffineExpr W = m.dim(2); + AffineExpr F = m.dim(3); + AffineExpr h = m.dim(4); + AffineExpr w = m.dim(5); + AffineExpr c = m.dim(6); + + return m.matchStride(/*iDim=*/1, /*fDim=*/1, /*oDim=*/1, /*idx=*/0) + .matchStride(/*iDim=*/2, /*fDim=*/2, /*oDim=*/2, /*idx=*/1) + .matchMaps({/*inputMap=*/{N, 
+// #inputMap = affine_map<(N, H, W, F, h, w, c) -> (N, H + h, W + w, c)>
+// #filterMap = affine_map<(N, H, W, F, h, w, c) -> (F, h, w, c)>
+// #scalarMap = affine_map<(N, H, W, F, h, w, c) -> ()>
+// #outputMap = affine_map<(N, H, W, F, h, w, c) -> (N, H, W, F)>
+template <>
+bool isaConvolutionOpOfType<linalg::Conv2DNhwcFhwcQOp>(
+    LinalgOp op, SmallVector<int64_t> *dilations,
+    SmallVector<int64_t> *strides) {
+  if (isa<linalg::Conv2DNhwcFhwcQOp>(op))
+    return true;
+
+  assert(isaConvolutionOpInterface(op) &&
+         "expected op to implement ConvolutionOpInterface");
+
+  ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides);
+  AffineExpr N = m.dim(0);
+  AffineExpr H = m.dim(1);
+  AffineExpr W = m.dim(2);
+  AffineExpr F = m.dim(3);
+  AffineExpr h = m.dim(4);
+  AffineExpr w = m.dim(5);
+  AffineExpr c = m.dim(6);
+
+  return m.matchStride(/*iDim=*/1, /*fDim=*/1, /*oDim=*/1, /*idx=*/0)
+      .matchStride(/*iDim=*/2, /*fDim=*/2, /*oDim=*/2, /*idx=*/1)
+      .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), c},
+                  /*filterMap=*/{F, h, w, c},
+                  /*scalarMap=*/{},
+                  /*scalarMap=*/{},
+                  /*outputMap=*/{N, H, W, F}})
+      .matchBody(/*zeroPointOffset=*/true);
+}
+
+// #inputMap = affine_map<(N, F, H, W, c, h, w) -> (N, c, H + h, W + w)>
+// #filterMap = affine_map<(N, F, H, W, c, h, w) -> (F, c, h, w)>
+// #outputMap = affine_map<(N, F, H, W, c, h, w) -> (N, F, H, W)>
+template <>
+bool isaConvolutionOpOfType<linalg::Conv2DNchwFchwOp>(
+    LinalgOp op, SmallVector<int64_t> *dilations,
+    SmallVector<int64_t> *strides) {
+  if (isa<linalg::Conv2DNchwFchwOp>(op))
+    return true;
+
+  assert(isaConvolutionOpInterface(op) &&
+         "expected op to implement ConvolutionOpInterface");
+
+  ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides);
+  AffineExpr N = m.dim(0);
+  AffineExpr F = m.dim(1);
+  AffineExpr H = m.dim(2);
+  AffineExpr W = m.dim(3);
+  AffineExpr c = m.dim(4);
+  AffineExpr h = m.dim(5);
+  AffineExpr w = m.dim(6);
+
+  return m.matchStride(/*iDim=*/2, /*fDim=*/2, /*oDim=*/2, /*idx=*/0)
+      .matchStride(/*iDim=*/3, /*fDim=*/3, /*oDim=*/3, /*idx=*/1)
+      .matchMaps({/*inputMap=*/{N, c, m.strided(H, h, 0), m.strided(W, w, 1)},
+                  /*filterMap=*/{F, c, h, w},
+                  /*outputMap=*/{N, F, H, W}})
+      .matchBody();
+}
+
+// #inputMap = affine_map<(N, F, H, W, c, h, w) -> (N, c, H + h, W + w)>
+// #filterMap = affine_map<(N, F, H, W, c, h, w) -> (F, c, h, w)>
+// #scalarMap = affine_map<(N, F, H, W, c, h, w) -> ()>
+// #outputMap = affine_map<(N, F, H, W, c, h, w) -> (N, F, H, W)>
+template <>
+bool isaConvolutionOpOfType<linalg::Conv2DNchwFchwQOp>(
+    LinalgOp op, SmallVector<int64_t> *dilations,
+    SmallVector<int64_t> *strides) {
+  if (isa<linalg::Conv2DNchwFchwQOp>(op))
+    return true;
+
+  assert(isaConvolutionOpInterface(op) &&
+         "expected op to implement ConvolutionOpInterface");
+
+  ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides);
+  AffineExpr N = m.dim(0);
+  AffineExpr F = m.dim(1);
+  AffineExpr H = m.dim(2);
+  AffineExpr W = m.dim(3);
+  AffineExpr c = m.dim(4);
+  AffineExpr h = m.dim(5);
+  AffineExpr w = m.dim(6);
+
+  return m.matchStride(/*iDim=*/2, /*fDim=*/2, /*oDim=*/2, /*idx=*/0)
+      .matchStride(/*iDim=*/3, /*fDim=*/3, /*oDim=*/3, /*idx=*/1)
+      .matchMaps({/*inputMap=*/{N, c, m.strided(H, h, 0), m.strided(W, w, 1)},
+                  /*filterMap=*/{F, c, h, w},
+                  /*scalarMap=*/{},
+                  /*scalarMap=*/{},
+                  /*outputMap=*/{N, F, H, W}})
+      .matchBody(/*zeroPointOffset=*/true);
+}
+
+// #inputMap = affine_map<(N, G, F, H, W, c, h, w) -> (N, G, c, H + h, W + w)>
+// #filterMap = affine_map<(N, G, F, H, W, c, h, w) -> (F, G, c, h, w)>
+// #outputMap = affine_map<(N, G, F, H, W, c, h, w) -> (N, G, F, H, W)>
+template <>
+bool isaConvolutionOpOfType<linalg::Conv2DNgchwFgchwOp>(
+    LinalgOp op, SmallVector<int64_t> *dilations,
+    SmallVector<int64_t> *strides) {
+  if (isa<linalg::Conv2DNgchwFgchwOp>(op))
+    return true;
+
+  assert(isaConvolutionOpInterface(op) &&
+         "expected op to implement ConvolutionOpInterface");
+
+  ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides);
+  AffineExpr N = m.dim(0);
+  AffineExpr G = m.dim(1);
+  AffineExpr F = m.dim(2);
+  AffineExpr H = m.dim(3);
+  AffineExpr W = m.dim(4);
+  AffineExpr c = m.dim(5);
+  AffineExpr h = m.dim(6);
+  AffineExpr w = m.dim(7);
+
+  return m.matchStride(/*iDim=*/3, /*fDim=*/3, /*oDim=*/3, /*idx=*/0)
+      .matchStride(/*iDim=*/4, /*fDim=*/4, /*oDim=*/4, /*idx=*/1)
+      .matchMaps(
+          {/*inputMap=*/{N, G, c, m.strided(H, h, 0), m.strided(W, w, 1)},
+           /*filterMap=*/{F, G, c, h, w},
+           /*outputMap=*/{N, G, F, H, W}})
+      .matchBody();
+}
+
+// #inputMap = affine_map<(N, G, F, H, W, c, h, w) -> (N, G, c, H + h, W + w)>
+// #filterMap = affine_map<(N, G, F, H, W, c, h, w) -> (G, F, c, h, w)>
+// #outputMap = affine_map<(N, G, F, H, W, c, h, w) -> (N, G, F, H, W)>
+template <>
+bool isaConvolutionOpOfType<linalg::Conv2DNgchwGfchwOp>(
+    LinalgOp op, SmallVector<int64_t> *dilations,
+    SmallVector<int64_t> *strides) {
+  if (isa<linalg::Conv2DNgchwGfchwOp>(op))
+    return true;
+
+  assert(isaConvolutionOpInterface(op) &&
+         "expected op to implement ConvolutionOpInterface");
+
+  ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides);
+  AffineExpr N = m.dim(0);
+  AffineExpr G = m.dim(1);
+  AffineExpr F = m.dim(2);
+  AffineExpr H = m.dim(3);
+  AffineExpr W = m.dim(4);
+  AffineExpr c = m.dim(5);
+  AffineExpr h = m.dim(6);
+  AffineExpr w = m.dim(7);
+
+  return m.matchStride(/*iDim=*/3, /*fDim=*/3, /*oDim=*/3, /*idx=*/0)
+      .matchStride(/*iDim=*/4, /*fDim=*/4, /*oDim=*/4, /*idx=*/1)
+      .matchMaps(
+          {/*inputMap=*/{N, G, c, m.strided(H, h, 0), m.strided(W, w, 1)},
+           /*filterMap=*/{G, F, c, h, w},
+           /*outputMap=*/{N, G, F, H, W}})
+      .matchBody();
+}
+
+// #inputMap = affine_map<(N, G, F, H, W, c, h, w) -> (N, G, c, H + h, W + w)>
+// #filterMap = affine_map<(N, G, F, H, W, c, h, w) -> (G, F, c, h, w)>
+// #scalarMap = affine_map<(N, G, F, H, W, c, h, w) -> ()>
+// #outputMap = affine_map<(N, G, F, H, W, c, h, w) -> (N, G, F, H, W)>
+template <>
+bool isaConvolutionOpOfType<linalg::Conv2DNgchwGfchwQOp>(
+    LinalgOp op, SmallVector<int64_t> *dilations,
+    SmallVector<int64_t> *strides) {
+  if (isa<linalg::Conv2DNgchwGfchwQOp>(op))
+    return true;
+
+  assert(isaConvolutionOpInterface(op) &&
+         "expected op to implement ConvolutionOpInterface");
+
+  ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides);
+  AffineExpr N = m.dim(0);
+  AffineExpr G = m.dim(1);
+  AffineExpr F = m.dim(2);
+  AffineExpr H = m.dim(3);
+  AffineExpr W = m.dim(4);
+  AffineExpr c = m.dim(5);
+  AffineExpr h = m.dim(6);
+  AffineExpr w = m.dim(7);
+
+  return m.matchStride(/*iDim=*/3, /*fDim=*/3, /*oDim=*/3, /*idx=*/0)
+      .matchStride(/*iDim=*/4, /*fDim=*/4, /*oDim=*/4, /*idx=*/1)
+      .matchMaps(
+          {/*inputMap=*/{N, G, c, m.strided(H, h, 0), m.strided(W, w, 1)},
+           /*filterMap=*/{G, F, c, h, w},
+           /*scalarMap=*/{},
+           /*scalarMap=*/{},
+           /*outputMap=*/{N, G, F, H, W}})
+      .matchBody(/*zeroPointOffset=*/true);
+}
+
+// #inputMap = affine_map<(N, H, W, G, F, h, w, c) -> (N, H + h, W + w, G, c)>
+// #filterMap = affine_map<(N, H, W, G, F, h, w, c) -> (G, F, h, w, c)>
+// #outputMap = affine_map<(N, H, W, G, F, h, w, c) -> (N, H, W, G, F)>
+template <>
+bool isaConvolutionOpOfType<linalg::Conv2DNhwgcGfhwcOp>(
+    LinalgOp op, SmallVector<int64_t> *dilations,
+    SmallVector<int64_t> *strides) {
+  if (isa<linalg::Conv2DNhwgcGfhwcOp>(op))
+    return true;
+
+  assert(isaConvolutionOpInterface(op) &&
+         "expected op to implement ConvolutionOpInterface");
+
+  ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides);
+  AffineExpr N = m.dim(0);
+  AffineExpr H = m.dim(1);
+  AffineExpr W = m.dim(2);
+  AffineExpr G = m.dim(3);
+  AffineExpr F = m.dim(4);
+  AffineExpr h = m.dim(5);
+  AffineExpr w = m.dim(6);
+  AffineExpr c = m.dim(7);
+
+  return m.matchStride(/*iDim=*/1, /*fDim=*/2, /*oDim=*/1, /*idx=*/0)
+      .matchStride(/*iDim=*/2, /*fDim=*/3, /*oDim=*/2, /*idx=*/1)
+      .matchMaps(
+          {/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), G, c},
+           /*filterMap=*/{G, F, h, w, c},
+           /*outputMap=*/{N, H, W, G, F}})
+      .matchBody();
+}
+
+// #inputMap = affine_map<(N, H, W, G, F, h, w, c) -> (N, H + h, W + w, G, c)>
+// #filterMap = affine_map<(N, H, W, G, F, h, w, c) -> (G, F, h, w, c)>
+// #scalarMap = affine_map<(N, H, W, G, F, h, w, c) -> ()>
+// #outputMap = affine_map<(N, H, W, G, F, h, w, c) -> (N, H, W, G, F)>
+template <>
+bool isaConvolutionOpOfType<linalg::Conv2DNhwgcGfhwcQOp>(
+    LinalgOp op, SmallVector<int64_t> *dilations,
+    SmallVector<int64_t> *strides) {
+  if (isa<linalg::Conv2DNhwgcGfhwcQOp>(op))
+    return true;
+
+  assert(isaConvolutionOpInterface(op) &&
+         "expected op to implement ConvolutionOpInterface");
+
+  ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides);
+  AffineExpr N = m.dim(0);
+  AffineExpr H = m.dim(1);
+  AffineExpr W = m.dim(2);
+  AffineExpr G = m.dim(3);
+  AffineExpr F = m.dim(4);
+  AffineExpr h = m.dim(5);
+  AffineExpr w = m.dim(6);
+  AffineExpr c = m.dim(7);
+
+  return m.matchStride(/*iDim=*/1, /*fDim=*/2, /*oDim=*/1, /*idx=*/0)
+      .matchStride(/*iDim=*/2, /*fDim=*/3, /*oDim=*/2, /*idx=*/1)
+      .matchMaps(
+          {/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), G, c},
+           /*filterMap=*/{G, F, h, w, c},
+           /*scalarMap=*/{},
+           /*scalarMap=*/{},
+           /*outputMap=*/{N, H, W, G, F}})
+      .matchBody(/*zeroPointOffset=*/true);
+}
+
 // #inputMap = affine_map<(D, H, W, d, h, w) -> (D + d, H + h, W + w)>
 // #filterMap = affine_map<(D, H, W, d, h, w) -> (d, h, w)>
 // #outputMap = affine_map<(D, H, W, d, h, w) -> (D, H, W)>
@@ -773,6 +1196,130 @@ bool isaConvolutionOpOfType
       .matchBody();
 }
 
+// #inputMap = affine_map<(N, H, W, C, h, w) -> (N, H + h, W + w, C)>
+// #filterMap = affine_map<(N, H, W, C, h, w) -> (h, w, C)>
+// #outputMap = affine_map<(N, H, W, C, h, w) -> (N, H, W, C)>
+template <>
+bool isaConvolutionOpOfType<linalg::DepthwiseConv2DNhwcHwcOp>(
+    LinalgOp op, SmallVector<int64_t> *dilations,
+    SmallVector<int64_t> *strides) {
+  if (isa<linalg::DepthwiseConv2DNhwcHwcOp>(op))
+    return true;
+
+  assert(isaConvolutionOpInterface(op) &&
+         "expected op to implement ConvolutionOpInterface");
+
+  ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides);
+  AffineExpr N = m.dim(0);
+  AffineExpr H = m.dim(1);
+  AffineExpr W = m.dim(2);
+  AffineExpr C = m.dim(3);
+  AffineExpr h = m.dim(4);
+  AffineExpr w = m.dim(5);
+
+  return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0)
+      .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1)
+      .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C},
+                  /*filterMap=*/{h, w, C},
+                  /*outputMap=*/{N, H, W, C}})
+      .matchBody();
+}
+
+// #inputMap = affine_map<(N, H, W, C, h, w) -> (N, H + h, W + w, C)>
+// #filterMap = affine_map<(N, H, W, C, h, w) -> (h, w, C)>
+// #scalarMap = affine_map<(N, H, W, C, h, w) -> ()>
+// #outputMap = affine_map<(N, H, W, C, h, w) -> (N, H, W, C)>
+template <>
+bool isaConvolutionOpOfType<linalg::DepthwiseConv2DNhwcHwcQOp>(
+    LinalgOp op, SmallVector<int64_t> *dilations,
+    SmallVector<int64_t> *strides) {
+  if (isa<linalg::DepthwiseConv2DNhwcHwcQOp>(op))
+    return true;
+
+  assert(isaConvolutionOpInterface(op) &&
+         "expected op to implement ConvolutionOpInterface");
+
+  ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides);
+  AffineExpr N = m.dim(0);
+  AffineExpr H = m.dim(1);
+  AffineExpr W = m.dim(2);
+  AffineExpr C = m.dim(3);
+  AffineExpr h = m.dim(4);
+  AffineExpr w = m.dim(5);
+
+  return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0)
+      .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1)
+      .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C},
+                  /*filterMap=*/{h, w, C},
+                  /*scalarMap=*/{},
+                  /*scalarMap=*/{},
+                  /*outputMap=*/{N, H, W, C}})
+      .matchBody(/*zeroPointOffset=*/true);
+}
+
+// #inputMap = affine_map<(N, H, W, C, CM, h, w) -> (N, H + h, W + w, C)>
+// #filterMap = affine_map<(N, H, W, C, CM, h, w) -> (h, w, C, CM)>
+// #outputMap = affine_map<(N, H, W, C, CM, h, w) -> (N, H, W, C, CM)>
+template <>
+bool isaConvolutionOpOfType<linalg::DepthwiseConv2DNhwcHwcmOp>(
+    LinalgOp op, SmallVector<int64_t> *dilations,
+    SmallVector<int64_t> *strides) {
+  if (isa<linalg::DepthwiseConv2DNhwcHwcmOp>(op))
+    return true;
+
+  assert(isaConvolutionOpInterface(op) &&
+         "expected op to implement ConvolutionOpInterface");
+
+  ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides);
+  AffineExpr N = m.dim(0);
+  AffineExpr H = m.dim(1);
+  AffineExpr W = m.dim(2);
+  AffineExpr C = m.dim(3);
+  AffineExpr CM = m.dim(4);
+  AffineExpr h = m.dim(5);
+  AffineExpr w = m.dim(6);
+
+  return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0)
+      .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1)
+      .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C},
+                  /*filterMap=*/{h, w, C, CM},
+                  /*outputMap=*/{N, H, W, C, CM}})
+      .matchBody();
+}
+
+// #inputMap = affine_map<(N, H, W, C, CM, h, w) -> (N, H + h, W + w, C)>
+// #filterMap = affine_map<(N, H, W, C, CM, h, w) -> (h, w, C, CM)>
+// #scalarMap = affine_map<(N, H, W, C, CM, h, w) -> ()>
+// #outputMap = affine_map<(N, H, W, C, CM, h, w) -> (N, H, W, C, CM)>
+template <>
+bool isaConvolutionOpOfType<linalg::DepthwiseConv2DNhwcHwcmQOp>(
+    LinalgOp op, SmallVector<int64_t> *dilations,
+    SmallVector<int64_t> *strides) {
+  if (isa<linalg::DepthwiseConv2DNhwcHwcmQOp>(op))
+    return true;
+
+  assert(isaConvolutionOpInterface(op) &&
+         "expected op to implement ConvolutionOpInterface");
+
+  ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides);
+  AffineExpr N = m.dim(0);
+  AffineExpr H = m.dim(1);
+  AffineExpr W = m.dim(2);
+  AffineExpr C = m.dim(3);
+  AffineExpr CM = m.dim(4);
+  AffineExpr h = m.dim(5);
+  AffineExpr w = m.dim(6);
+
+  return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0)
+      .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1)
+      .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C},
+                  /*filterMap=*/{h, w, C, CM},
+                  /*scalarMap=*/{},
+                  /*scalarMap=*/{},
+                  /*outputMap=*/{N, H, W, C, CM}})
+      .matchBody(/*zeroPointOffset=*/true);
+}
+
 // #inputMap = affine_map<(N, D, H, W, CM, d, h, w, C)
 //               -> (N, D + d, H + h, W + w, C)>
 // #filterMap = affine_map<(N, D, H, W, CM, d, h, w, C)
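The `zeroPointOffset` flag passed to `matchBody` corresponds to the quantized `*_q` body, which subtracts the two scalar zero points before the multiply-accumulate. A scalar C++ model of the accumulation such a body computes (a sketch under the assumption of i8 data widened to i32; the actual ops express this with linalg body ops and signed casts):

#include <cstdint>

// acc += (in - zpInput) * (filter - zpFilter), with operands widened to i32.
static int32_t accumulateQuantized(int32_t acc, int8_t in, int8_t filt,
                                   int32_t zpInput, int32_t zpFilter) {
  return acc + (static_cast<int32_t>(in) - zpInput) *
                   (static_cast<int32_t>(filt) - zpFilter);
}
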
diff --git a/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp
index 410a6bffd345e..496a7b036e65d 100644
--- a/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp
@@ -17,19 +17,36 @@ namespace mlir {
 namespace scf {
 namespace {
 
+static AffineExpr getTripCountExpr(OpFoldResult lb, OpFoldResult ub,
+                                   OpFoldResult step,
+                                   ValueBoundsConstraintSet &cstr) {
+  AffineExpr lbExpr = cstr.getExpr(lb);
+  AffineExpr ubExpr = cstr.getExpr(ub);
+  AffineExpr stepExpr = cstr.getExpr(step);
+  AffineExpr tripCountExpr =
+      AffineExpr(ubExpr - lbExpr).ceilDiv(stepExpr); // (ub - lb) / step
+  return tripCountExpr;
+}
+
+static void populateIVBounds(OpFoldResult lb, OpFoldResult ub,
+                             OpFoldResult step, Value iv,
+                             ValueBoundsConstraintSet &cstr) {
+  cstr.bound(iv) >= cstr.getExpr(lb);
+  cstr.bound(iv) < cstr.getExpr(ub);
+  // iv <= lb + ((ub-lb)/step - 1) * step
+  // This bound does not replace the `iv < ub` constraint mentioned above,
+  // since constraints involving the multiplication of two constraint set
+  // dimensions are not supported.
+  AffineExpr tripCountMinusOne =
+      getTripCountExpr(lb, ub, step, cstr) - cstr.getExpr(1);
+  AffineExpr computedUpperBound =
+      cstr.getExpr(lb) + AffineExpr(tripCountMinusOne * cstr.getExpr(step));
+  cstr.bound(iv) <= computedUpperBound;
+}
+
 struct ForOpInterface
     : public ValueBoundsOpInterface::ExternalModel<ForOpInterface, ForOp> {
 
-  static AffineExpr getTripCountExpr(scf::ForOp forOp,
-                                     ValueBoundsConstraintSet &cstr) {
-    AffineExpr lbExpr = cstr.getExpr(forOp.getLowerBound());
-    AffineExpr ubExpr = cstr.getExpr(forOp.getUpperBound());
-    AffineExpr stepExpr = cstr.getExpr(forOp.getStep());
-    AffineExpr tripCountExpr =
-        AffineExpr(ubExpr - lbExpr).ceilDiv(stepExpr); // (ub - lb) / step
-    return tripCountExpr;
-  }
-
   /// Populate bounds of values/dimensions for iter_args/OpResults. If the
   /// value/dimension size does not change in an iteration, we can deduce that
   /// it is the same as the initial value/dimension.
@@ -87,7 +104,8 @@ struct ForOpInterface
     // `value` is result of `forOp`, we can prove that:
     // %result == %init_arg + trip_count * (%yielded_value - %iter_arg).
     // Where trip_count is (ub - lb) / step.
-    AffineExpr tripCountExpr = getTripCountExpr(forOp, cstr);
+    AffineExpr tripCountExpr = getTripCountExpr(
+        forOp.getLowerBound(), forOp.getUpperBound(), forOp.getStep(), cstr);
     AffineExpr oneIterAdvanceExpr =
         cstr.getExpr(yieldedValue) - cstr.getExpr(iterArg);
     cstr.bound(value) ==
@@ -99,19 +117,8 @@ struct ForOpInterface
     auto forOp = cast<ForOp>(op);
 
     if (value == forOp.getInductionVar()) {
-      cstr.bound(value) >= forOp.getLowerBound();
-      cstr.bound(value) < forOp.getUpperBound();
-      // iv <= lb + ((ub-lb)/step - 1) * step
-      // This bound does not replace the `iv < ub` constraint mentioned above,
-      // since constraints involving the multiplication of two constraint set
-      // dimensions are not supported.
-      AffineExpr tripCountMinusOne =
-          getTripCountExpr(forOp, cstr) - cstr.getExpr(1);
-      AffineExpr computedUpperBound =
-          cstr.getExpr(forOp.getLowerBound()) +
-          AffineExpr(tripCountMinusOne * cstr.getExpr(forOp.getStep()));
-      cstr.bound(value) <= computedUpperBound;
-      return;
+      return populateIVBounds(forOp.getLowerBound(), forOp.getUpperBound(),
+                              forOp.getStep(), value, cstr);
     }
 
     // Handle iter_args and OpResults.
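To see why the extra bound is useful, take the values from the new `@scf_forall_computed_upper_bound` test further below: lb = 0, ub = 8, step = 3. The trip count is ceildiv(8 - 0, 3) = 3, so the last induction variable value is 0 + (3 - 1) * 3 = 6, which is tighter than `iv < 8` alone. A standalone arithmetic check (assumes a positive step, as the ceildiv formula does):

#include <cassert>

// Last IV value: lb + (ceildiv(ub - lb, step) - 1) * step.
static long lastIVValue(long lb, long ub, long step) {
  long tripCount = (ub - lb + step - 1) / step; // ceildiv for positive step
  return lb + (tripCount - 1) * step;
}

int main() {
  assert(lastIVValue(0, 8, 3) == 6); // matches the `iv <= 6` expectation
  return 0;
}
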
@@ -141,11 +148,9 @@ struct ForallOpInterface
     assert(blockArg.getArgNumber() < forallOp.getInductionVars().size() &&
            "expected index value to be an induction var");
     int64_t idx = blockArg.getArgNumber();
-    // TODO: Take into account step size.
-    AffineExpr lb = cstr.getExpr(forallOp.getMixedLowerBound()[idx]);
-    AffineExpr ub = cstr.getExpr(forallOp.getMixedUpperBound()[idx]);
-    cstr.bound(value) >= lb;
-    cstr.bound(value) < ub;
+    return populateIVBounds(forallOp.getMixedLowerBound()[idx],
+                            forallOp.getMixedUpperBound()[idx],
+                            forallOp.getMixedStep()[idx], value, cstr);
   }
 
   void populateBoundsForShapedValueDim(Operation *op, Value value, int64_t dim,
diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
index 8fb36ace2c463..c4b8e93b6a9f9 100644
--- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
+++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -678,21 +678,6 @@ llvm.func @inline_ptx_multi_rw_r(%a : i32, %b : i32, %rw_c : f32, %rw_d : f32)
   llvm.return %r5 : f32
 }
 
-// -----
-
-// CHECK-LABEL: @nvvm_pmevent
-llvm.func @nvvm_pmevent() {
-  // CHECK: %[[S0:.+]] = llvm.mlir.constant(10 : i32) : i32
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "pmevent $0;", "n" %[[S0]] : (i32) -> ()
-
-  nvvm.pmevent id = 10
-  // CHECK: %[[S1:.+]] = llvm.mlir.constant(4 : i32) : i32
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "pmevent $0;", "n" %[[S1]] : (i32) -> ()
-  nvvm.pmevent id = 4
-  llvm.return
-}
-
 // -----
 
 llvm.func @inline_ptx_pack_4i8(%src : vector<4xi8>, %mask : i32, %zero: i32) {
diff --git a/mlir/test/Dialect/Linalg/convolution/roundtrip-convolution.mlir b/mlir/test/Dialect/Linalg/convolution/roundtrip-convolution.mlir
index 4b2d42a3ae4e0..432fdd12f540d 100644
--- a/mlir/test/Dialect/Linalg/convolution/roundtrip-convolution.mlir
+++ b/mlir/test/Dialect/Linalg/convolution/roundtrip-convolution.mlir
@@ -5,8 +5,9 @@
 // RUN: mlir-opt %s -linalg-generalize-named-ops | mlir-opt --linalg-specialize-generic-ops | FileCheck %s --implicit-check-not=linalg.generic
 
 // -----------------------------
-// Convolution ops.
+// Convolution ops - 1D.
 // -----------------------------
+
 func.func @conv_1d(%in : tensor<?xf32>, %filter : tensor<?xf32>, %out : tensor<?xf32>) -> tensor<?xf32> {
   %0 = linalg.conv_1d ins(%in, %filter : tensor<?xf32>, tensor<?xf32>)
@@ -44,6 +45,10 @@ func.func @conv_1d_ncw_fcw(%input: tensor<?x?x?xf32>, %filter: tensor<?x?x?xf32
 // -----
 
+// -----------------------------
+// Convolution ops - 2D.
+// -----------------------------
+
 func.func @conv_2d(%in : tensor<?x?xf32>, %filter : tensor<?x?xf32>, %out : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.conv_2d ins(%in, %filter : tensor<?x?xf32>, tensor<?x?xf32>)
                       outs(%out : tensor<?x?xf32>) -> tensor<?x?xf32>
   return %0 : tensor<?x?xf32>
 }
@@ -55,6 +60,153 @@ func.func @conv_2d(%in : tensor<?x?xf32>, %filter : tensor<?x?xf32>, %out : tens
 // -----
 
+func.func @conv_2d_nhwc_hwcf(%input: tensor<?x?x?x?xf32>, %filter: tensor<?x?x?x?xf32>, %output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
+  %0 = linalg.conv_2d_nhwc_hwcf
+    {dilations = dense<2> : tensor<2xi64>, strides = dense<3> : tensor<2xi64>}
+    ins (%input, %filter: tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>)
+    outs (%output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+  return %0 : tensor<?x?x?x?xf32>
+}
+// CHECK: @conv_2d_nhwc_hwcf
+// CHECK: linalg.conv_2d_nhwc_hwcf
+// CHECK-SAME: dilations = dense<2> : tensor<2xi64>, strides = dense<3> : tensor<2xi64>
+
+// -----
+
+func.func @conv_2d_nhwc_hwcf_q(%input: tensor<?x?x?x?xi8>, %filter: tensor<?x?x?x?xi8>, %output: tensor<?x?x?x?xi32>, %zp_input: i32, %zp_filter: i32) -> tensor<?x?x?x?xi32> {
+  %0 = linalg.conv_2d_nhwc_hwcf_q
+    {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
+    ins (%input, %filter, %zp_input, %zp_filter : tensor<?x?x?x?xi8>, tensor<?x?x?x?xi8>, i32, i32)
+    outs (%output: tensor<?x?x?x?xi32>) -> tensor<?x?x?x?xi32>
+  return %0 : tensor<?x?x?x?xi32>
+}
+// CHECK: @conv_2d_nhwc_hwcf_q
+// CHECK: linalg.conv_2d_nhwc_hwcf_q
+// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>
+
+// -----
+
+func.func @conv_2d_nhwc_fhwc(%input: tensor<?x?x?x?xf32>, %filter: tensor<?x?x?x?xf32>, %output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
+  %0 = linalg.conv_2d_nhwc_fhwc
+    {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
+    ins (%input, %filter: tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>)
+    outs (%output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+  return %0 : tensor<?x?x?x?xf32>
+}
+// CHECK: @conv_2d_nhwc_fhwc
+// CHECK: linalg.conv_2d_nhwc_fhwc
+// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>
+
+// -----
+
+func.func @conv_2d_nhwc_fhwc_q(%input: tensor<?x?x?x?xi8>, %filter: tensor<?x?x?x?xi8>, %output: tensor<?x?x?x?xi32>, %zp_input: i32, %zp_filter: i32) -> tensor<?x?x?x?xi32> {
+  %0 = linalg.conv_2d_nhwc_fhwc_q
+    {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
+    ins (%input, %filter, %zp_input, %zp_filter : tensor<?x?x?x?xi8>, tensor<?x?x?x?xi8>, i32, i32)
+    outs (%output: tensor<?x?x?x?xi32>) -> tensor<?x?x?x?xi32>
+  return %0 : tensor<?x?x?x?xi32>
+}
+// CHECK: @conv_2d_nhwc_fhwc_q
+// CHECK: linalg.conv_2d_nhwc_fhwc_q
+// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>
+
+// -----
+
+func.func @conv_2d_nchw_fchw(%input: tensor<?x?x?x?xf32>, %filter: tensor<?x?x?x?xf32>, %output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
+  %0 = linalg.conv_2d_nchw_fchw
+    {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[3, 4]> : tensor<2xi64>}
+    ins (%input, %filter: tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>)
+    outs (%output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+  return %0 : tensor<?x?x?x?xf32>
+}
+// CHECK: @conv_2d_nchw_fchw
+// CHECK: linalg.conv_2d_nchw_fchw
+// CHECK-SAME: dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[3, 4]> : tensor<2xi64>
+
+// -----
+
+func.func @conv_2d_nchw_fchw_q(%input: tensor<?x?x?x?xi8>, %filter: tensor<?x?x?x?xi8>, %output: tensor<?x?x?x?xi32>, %zp_input: i32, %zp_filter: i32) -> tensor<?x?x?x?xi32> {
+  %0 = linalg.conv_2d_nchw_fchw_q
+    {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
+    ins (%input, %filter, %zp_input, %zp_filter : tensor<?x?x?x?xi8>, tensor<?x?x?x?xi8>, i32, i32)
+    outs (%output: tensor<?x?x?x?xi32>) -> tensor<?x?x?x?xi32>
+  return %0 : tensor<?x?x?x?xi32>
+}
+// CHECK: @conv_2d_nchw_fchw_q
+// CHECK: linalg.conv_2d_nchw_fchw_q
+// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>
+
+// -----
+
+func.func @conv_2d_ngchw_fgchw(%input: tensor<?x?x?x?x?xf32>, %filter: tensor<?x?x?x?x?xf32>, %output: tensor<?x?x?x?x?xf32>) -> tensor<?x?x?x?x?xf32> {
+  %0 = linalg.conv_2d_ngchw_fgchw
+    {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
+    ins (%input, %filter: tensor<?x?x?x?x?xf32>, tensor<?x?x?x?x?xf32>)
+    outs (%output: tensor<?x?x?x?x?xf32>) -> tensor<?x?x?x?x?xf32>
+  return %0 : tensor<?x?x?x?x?xf32>
+}
+// CHECK: @conv_2d_ngchw_fgchw
+// CHECK: linalg.conv_2d_ngchw_fgchw
+// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>
+
+// -----
+
+func.func @conv_2d_ngchw_gfchw(%input: tensor<?x?x?x?x?xf32>, %filter: tensor<?x?x?x?x?xf32>, %output: tensor<?x?x?x?x?xf32>) -> tensor<?x?x?x?x?xf32> {
+  %0 = linalg.conv_2d_ngchw_gfchw
+    {dilations = dense<2> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
+    ins (%input, %filter: tensor<?x?x?x?x?xf32>, tensor<?x?x?x?x?xf32>)
+    outs (%output: tensor<?x?x?x?x?xf32>) -> tensor<?x?x?x?x?xf32>
+  return %0 : tensor<?x?x?x?x?xf32>
+}
+// CHECK: @conv_2d_ngchw_gfchw
+// CHECK: linalg.conv_2d_ngchw_gfchw
+// CHECK-SAME: dilations = dense<2> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>
+
+// -----
+
+func.func @conv_2d_ngchw_gfchw_q(%input: tensor<?x?x?x?x?xi8>, %filter: tensor<?x?x?x?x?xi8>, %output: tensor<?x?x?x?x?xi32>, %zp_input: i32, %zp_filter: i32) -> tensor<?x?x?x?x?xi32> {
+  %0 = linalg.conv_2d_ngchw_gfchw_q
+    {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
+    ins (%input, %filter, %zp_input, %zp_filter : tensor<?x?x?x?x?xi8>, tensor<?x?x?x?x?xi8>, i32, i32)
+    outs (%output: tensor<?x?x?x?x?xi32>) -> tensor<?x?x?x?x?xi32>
+  return %0 : tensor<?x?x?x?x?xi32>
+}
+// CHECK: @conv_2d_ngchw_gfchw_q
+// CHECK: linalg.conv_2d_ngchw_gfchw_q
+// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>
+
+// -----
+
+func.func @conv_2d_nhwgc_gfhwc(%input: tensor<?x?x?x?x?xf32>, %filter: tensor<?x?x?x?x?xf32>, %output: tensor<?x?x?x?x?xf32>) -> tensor<?x?x?x?x?xf32> {
+  %0 = linalg.conv_2d_nhwgc_gfhwc
+    {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
+    ins (%input, %filter: tensor<?x?x?x?x?xf32>, tensor<?x?x?x?x?xf32>)
+    outs (%output: tensor<?x?x?x?x?xf32>) -> tensor<?x?x?x?x?xf32>
+  return %0 : tensor<?x?x?x?x?xf32>
+}
+// CHECK: @conv_2d_nhwgc_gfhwc
+// CHECK: linalg.conv_2d_nhwgc_gfhwc
+// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>
+
+// -----
+
+func.func @conv_2d_nhwgc_gfhwc_q(%input: tensor<?x?x?x?x?xi8>, %filter: tensor<?x?x?x?x?xi8>, %output: tensor<?x?x?x?x?xi32>, %zp_input: i32, %zp_filter: i32) -> tensor<?x?x?x?x?xi32> {
+  %0 = linalg.conv_2d_nhwgc_gfhwc_q
+    {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
+    ins (%input, %filter, %zp_input, %zp_filter : tensor<?x?x?x?x?xi8>, tensor<?x?x?x?x?xi8>, i32, i32)
+    outs (%output: tensor<?x?x?x?x?xi32>) -> tensor<?x?x?x?x?xi32>
+  return %0 : tensor<?x?x?x?x?xi32>
+}
+// CHECK: @conv_2d_nhwgc_gfhwc_q
+// CHECK: linalg.conv_2d_nhwgc_gfhwc_q
+// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>
+
+// -----
+
+// -----------------------------
+// Convolution ops - 3D.
+// -----------------------------
+
 func.func @conv_3d(%in : tensor<?x?x?xf32>, %filter : tensor<?x?x?xf32>, %out : tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
   %0 = linalg.conv_3d ins(%in, %filter : tensor<?x?x?xf32>, tensor<?x?x?xf32>)
@@ -66,9 +218,10 @@ func.func @conv_3d(%in : tensor<?x?x?xf32>, %filter : tensor<?x?x?xf32>, %out :
 // -----
 
-// -----------------------------
-// Depthwise Convolution ops.
-// -----------------------------
+// -------------------------------
+// Depthwise Convolution ops - 1D.
+// -------------------------------
+
 func.func @depthwise_conv_1d_ncw_cw(%input: tensor<?x?x?xf32>, %filter: tensor<?x?xf32>, %output: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
   %0 = linalg.depthwise_conv_1d_ncw_cw
     {dilations = dense<3> : tensor<1xi64>, strides = dense<2> : tensor<1xi64>}
@@ -108,6 +261,10 @@ func.func @depthwise_conv_1d_nwc_wcm(%input: tensor<?x?x?xf32>, %filter: tensor<
 // -----
 
+// -------------------------------
+// Depthwise Convolution ops - 2D.
+// -------------------------------
+
 func.func @depthwise_conv_2d_nchw_chw(%input: tensor<?x?x?x?xf32>, %filter: tensor<?x?x?xf32>, %output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
   %0 = linalg.depthwise_conv_2d_nchw_chw
     {dilations = dense<[2,3]> : vector<2xi64>, strides = dense<[4,5]> : vector<2xi64>}
@@ -121,6 +278,62 @@ func.func @depthwise_conv_2d_nchw_chw(%input: tensor<?x?x?x?xf32>, %filter: tens
 // -----
 
+func.func @depthwise_conv_2d_nhwc_hwc(%input: tensor<?x?x?x?xf32>, %filter: tensor<?x?x?xf32>, %output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
+  %0 = linalg.depthwise_conv_2d_nhwc_hwc
+    {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
+    ins (%input, %filter: tensor<?x?x?x?xf32>, tensor<?x?x?xf32>)
+    outs (%output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+  return %0 : tensor<?x?x?x?xf32>
+}
+// CHECK: @depthwise_conv_2d_nhwc_hwc
+// CHECK: linalg.depthwise_conv_2d_nhwc_hwc
+// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>
+
+// -----
+
+func.func @depthwise_conv_2d_nhwc_hwc_q(%input: tensor<?x?x?x?xi8>, %filter: tensor<?x?x?xi8>, %output: tensor<?x?x?x?xi32>, %zp_input: i32, %zp_filter: i32) -> tensor<?x?x?x?xi32> {
+  %0 = linalg.depthwise_conv_2d_nhwc_hwc_q
+    {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
+    ins (%input, %filter, %zp_input, %zp_filter : tensor<?x?x?x?xi8>, tensor<?x?x?xi8>, i32, i32)
+    outs (%output: tensor<?x?x?x?xi32>) -> tensor<?x?x?x?xi32>
+  return %0 : tensor<?x?x?x?xi32>
+}
+// CHECK: @depthwise_conv_2d_nhwc_hwc_q
+// CHECK: linalg.depthwise_conv_2d_nhwc_hwc_q
+// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>
+
+// -----
+
+func.func @depthwise_conv_2d_nhwc_hwcm(%input: tensor<?x?x?x?xf32>, %filter: tensor<?x?x?x?xf32>, %output: tensor<?x?x?x?x?xf32>) -> tensor<?x?x?x?x?xf32> {
+  %0 = linalg.depthwise_conv_2d_nhwc_hwcm
+    {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[3, 1]> : tensor<2xi64>}
+    ins (%input, %filter: tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>)
+    outs (%output: tensor<?x?x?x?x?xf32>) -> tensor<?x?x?x?x?xf32>
+  return %0 : tensor<?x?x?x?x?xf32>
+}
+// CHECK: @depthwise_conv_2d_nhwc_hwcm
+// CHECK: linalg.depthwise_conv_2d_nhwc_hwcm
+// CHECK-SAME: dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[3, 1]> : tensor<2xi64>
+
+// -----
+
+func.func @depthwise_conv_2d_nhwc_hwcm_q(%input: tensor<?x?x?x?xi8>, %filter: tensor<?x?x?x?xi8>, %output: tensor<?x?x?x?x?xi32>, %zp_input: i32, %zp_filter: i32) -> tensor<?x?x?x?x?xi32> {
+  %0 = linalg.depthwise_conv_2d_nhwc_hwcm_q
+    {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
+    ins (%input, %filter, %zp_input, %zp_filter : tensor<?x?x?x?xi8>, tensor<?x?x?x?xi8>, i32, i32)
+    outs (%output: tensor<?x?x?x?x?xi32>) -> tensor<?x?x?x?x?xi32>
+  return %0 : tensor<?x?x?x?x?xi32>
+}
+// CHECK: @depthwise_conv_2d_nhwc_hwcm_q
+// CHECK: linalg.depthwise_conv_2d_nhwc_hwcm_q
+// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>
+
+// -----
+
+// -------------------------------
+// Depthwise Convolution ops - 3D.
+// -------------------------------
+
 func.func @depthwise_conv_3d_ndhwc_dhwcm(%input: tensor<?x?x?x?x?xf32>, %filter: tensor<?x?x?x?x?xf32>, %output: tensor<?x?x?x?x?x?xf32>) -> tensor<?x?x?x?x?x?xf32> {
   %0 = linalg.depthwise_conv_3d_ndhwc_dhwcm
     {dilations = dense<1> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>}
@@ -137,6 +350,7 @@ func.func @depthwise_conv_3d_ndhwc_dhwcm(%input: tensor<?x?x?x?x?xf32>, %filter:
 // -----------------------------
 // Pooling ops.
 // -----------------------------
+
 func.func @pooling_nhwc_max(%input: tensor<?x?x?x?xf32>, %filter: tensor<?x?xf32>, %output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
   %0 = linalg.pooling_nhwc_max
     {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
diff --git a/mlir/test/Dialect/SCF/value-bounds-op-interface-impl.mlir b/mlir/test/Dialect/SCF/value-bounds-op-interface-impl.mlir
index 339d97df001c5..60fe96d52d20b 100644
--- a/mlir/test/Dialect/SCF/value-bounds-op-interface-impl.mlir
+++ b/mlir/test/Dialect/SCF/value-bounds-op-interface-impl.mlir
@@ -379,3 +379,12 @@ func.func @scf_for_result_infer_dynamic_init_big_step(%i : index) {
   "test.compare"(%0, %7) {cmp = "LE"} : (index, index) -> ()
   return
 }
+
+func.func @scf_forall_computed_upper_bound(%x: index) {
+  %c6 = arith.constant 6 : index
+  scf.forall (%iv) = (0) to (8) step (3) {
+    // expected-remark @below{{true}}
+    "test.compare"(%iv, %c6) {cmp = "LE"} : (index, index) -> ()
+  }
+  return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/pm_event.mlir b/mlir/test/Target/LLVMIR/nvvm/pm_event.mlir
new file mode 100644
index 0000000000000..0092d32319a83
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/pm_event.mlir
@@ -0,0 +1,23 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+llvm.func @nvvm_pmevent_mask() {
+  // CHECK-LABEL: define void @nvvm_pmevent_mask() {
+  // CHECK-NEXT: call void @llvm.nvvm.pm.event.mask(i16 15000)
+  // CHECK-NEXT: call void @llvm.nvvm.pm.event.mask(i16 4)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  nvvm.pmevent mask = 15000
+  nvvm.pmevent mask = 4
+  llvm.return
+}
+
+llvm.func @nvvm_pmevent_id() {
+  // CHECK-LABEL: define void @nvvm_pmevent_id() {
+  // CHECK-NEXT: call void @llvm.nvvm.pm.event.mask(i16 1024)
+  // CHECK-NEXT: call void @llvm.nvvm.pm.event.mask(i16 16)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  nvvm.pmevent id = 10
+  nvvm.pmevent id = 4
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/pm_event_invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/pm_event_invalid.mlir
new file mode 100644
index 0000000000000..783988fb36368
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/pm_event_invalid.mlir
@@ -0,0 +1,21 @@
+// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s
+
+llvm.func @pmevent_no_id() {
+  // expected-error @below {{either `id` or `mask` must be set}}
+  nvvm.pmevent
+}
+
+// -----
+
+llvm.func @pmevent_bigger15() {
+  // expected-error @below {{`id` must be between 0 and 15}}
+  nvvm.pmevent id = 16
+}
+
+// -----
+
+llvm.func @pmevent_many_ids() {
+  // expected-error @below {{`id` and `mask` cannot be set at the same time}}
+  nvvm.pmevent id = 1 mask = 1
+}
+
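The id-form cases in `pm_event.mlir` rely on `nvvm.pmevent id = N` lowering to the mask intrinsic with only bit N set, i.e. mask = 1 << id: id = 10 yields `i16 1024` and id = 4 yields `i16 16` in the CHECK lines above. A compile-time sanity check of that arithmetic:

// 1 << 10 == 1024 and 1 << 4 == 16, matching the expected i16 immediates.
static_assert((1 << 10) == 1024 && (1 << 4) == 16,
              "pmevent id N lowers to the single-bit mask 1 << N");
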
diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
index d5868ee73cc50..c0fe0fa11f497 100644
--- a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
@@ -1,26 +1,5 @@
 // RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s
 
-llvm.func @pmevent_no_id() {
-  // expected-error @below {{either `id` or `mask` must be set}}
-  nvvm.pmevent
-}
-
-// -----
-
-llvm.func @pmevent_bigger15() {
-  // expected-error @below {{`id` must be between 0 and 15}}
-  nvvm.pmevent id = 141
-}
-
-// -----
-
-llvm.func @pmevent_many_ids() {
-  // expected-error @below {{`id` and `mask` cannot be set at the same time}}
-  nvvm.pmevent id = 1 mask = 1
-}
-
-// -----
-
 llvm.func @kernel_func(%numberOfThreads : i32) {
   // expected-error @below {{'nvvm.barrier' op barrier id is missing, it should be set between 0 to 15}}
   nvvm.barrier number_of_threads = %numberOfThreads
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index c4a69097692cb..9e4aadac69896 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -903,17 +903,6 @@ llvm.func @nvvm_dot_accumulate_2way(%a: vector<2xi16>, %b: vector<4xi8>, %c: i32
 
 // -----
 
-// CHECK-LABEL: @nvvm_pmevent
-llvm.func @nvvm_pmevent() {
-  // CHECK: call void @llvm.nvvm.pm.event.mask(i16 15000)
-  nvvm.pmevent mask = 15000
-  // CHECK: call void @llvm.nvvm.pm.event.mask(i16 4)
-  nvvm.pmevent mask = 4
-  llvm.return
-}
-
-// -----
-
 // CHECK-LABEL: @nanosleep
 llvm.func @nanosleep(%duration: i32) {
   // CHECK: call void @llvm.nvvm.nanosleep(i32 %{{.*}})