From 9058b0aec5822cc77fa4644451f86d2fc5401ecc Mon Sep 17 00:00:00 2001 From: mlund Date: Mon, 26 Jan 2026 19:08:07 +0100 Subject: [PATCH] Fix MPI deadlock in GibbsMatterMove The Gibbs ensemble simulation would deadlock when both moltransrot and gibbs_matter moves were enabled. Root cause was RNG desynchronization between MPI processes. Problem: - MoveCollection::sample() uses mpi.random to select moves (must stay in sync) - GibbsMatterMove::_move() was using mpi.random for molecule selection - The two cells have different molecule counts (e.g., 5 vs 15) - std::uniform_int_distribution uses rejection sampling, which makes variable numbers of engine calls for different ranges - This caused mpi.random states to diverge between processes - Subsequent sample() calls returned different moves on each process - When one selected gibbs_matter (with MPI calls in bias()) and the other selected moltransrot, the first blocked waiting for MPI Fix: 1. Use Faunus::random (local, non-MPI) for molecule selection instead of mpi.random. This keeps mpi.random synchronized for move selection. 2. Add MPI exchange to synchronize early returns. When one process fails (no molecules found or cell full) while the other succeeds, both must skip the move to avoid one blocking in bias() waiting for MPI calls. --- src/move.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/move.cpp b/src/move.cpp index 5207eab8..c8d790aa 100644 --- a/src/move.cpp +++ b/src/move.cpp @@ -768,22 +768,32 @@ void GibbsMatterMove::_from_json(const json& j) void GibbsMatterMove::_move(Change& change) { - auto& random_engine = mpi.random; + // Use mpi.random ONLY for the insert/delete decision (must stay in sync between processes). + // Use Faunus::random for molecule selection since the number of molecules differs between + // cells and uniform_int_distribution can make variable RNG calls due to rejection sampling. 
+ auto& mpi_random = mpi.random; + // insert or delete; do the opposite in the other cell - insert = static_cast<bool>(random_engine.range(0, 1)); // note internal random generator! + insert = static_cast<bool>(mpi_random.range(0, 1)); // synchronized across MPI ranks if (gibbs->mpi.isMaster()) { insert = !insert; // delete in one cell; remove from the other } - // find a molecule to insert or delete + // find a molecule to insert or delete - use local (non-MPI) random generator const auto selection = (insert) ? Space::Selection::INACTIVE : Space::Selection::ACTIVE; - const auto group = spc.randomMolecule(molid, random_engine, selection); + const auto group = spc.randomMolecule(molid, Faunus::random, selection); // boundary check const bool no_group = (group == spc.groups.end()); const bool cell_is_full = (spc.numMolecules(molid) == gibbs->total_num_particles); - if (no_group || (insert && cell_is_full)) { + + // Synchronize early return: if either process fails, both must fail. + // This is necessary because bias() contains MPI exchange() calls and is only + // called when change is non-empty. Without sync, one process could block in bias(). + const bool local_failed = no_group || (insert && cell_is_full); + const bool other_failed = static_cast<bool>(gibbs->exchange(static_cast<double>(local_failed))); + if (local_failed || other_failed) { change.clear(); return; }