From 9058b0aec5822cc77fa4644451f86d2fc5401ecc Mon Sep 17 00:00:00 2001 From: mlund Date: Mon, 26 Jan 2026 19:08:07 +0100 Subject: [PATCH] Fix MPI deadlock in GibbsMatterMove The Gibbs ensemble simulation would deadlock when both moltransrot and gibbs_matter moves were enabled. Root cause was RNG desynchronization between MPI processes. Problem: - MoveCollection::sample() uses mpi.random to select moves (must stay in sync) - GibbsMatterMove::_move() was using mpi.random for molecule selection - The two cells have different molecule counts (e.g., 5 vs 15) - std::uniform_int_distribution uses rejection sampling, which makes variable numbers of engine calls for different ranges - This caused mpi.random states to diverge between processes - Subsequent sample() calls returned different moves on each process - When one selected gibbs_matter (with MPI calls in bias()) and the other selected moltransrot, the first blocked waiting for MPI Fix: 1. Use Faunus::random (local, non-MPI) for molecule selection instead of mpi.random. This keeps mpi.random synchronized for move selection. 2. Add MPI exchange to synchronize early returns. When one process fails (no molecules found or cell full) while the other succeeds, both must skip the move to avoid one blocking in bias() waiting for MPI calls. --- src/move.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/move.cpp b/src/move.cpp index 5207eab8..c8d790aa 100644 --- a/src/move.cpp +++ b/src/move.cpp @@ -768,22 +768,32 @@ void GibbsMatterMove::_from_json(const json& j) void GibbsMatterMove::_move(Change& change) { - auto& random_engine = mpi.random; + // Use mpi.random ONLY for the insert/delete decision (must stay in sync between processes). + // Use Faunus::random for molecule selection since the number of molecules differs between + // cells and uniform_int_distribution can make variable RNG calls due to rejection sampling. 
+ auto& mpi_random = mpi.random; + // insert or delete; do the opposite in the other cell - insert = static_cast<bool>(random_engine.range(0, 1)); // note internal random generator! + insert = static_cast<bool>(mpi_random.range(0, 1)); // synchronized across MPI ranks if (gibbs->mpi.isMaster()) { insert = !insert; // delete in one cell; remove from the other } - // find a molecule to insert or delete + // find a molecule to insert or delete - use local (non-MPI) random generator const auto selection = (insert) ? Space::Selection::INACTIVE : Space::Selection::ACTIVE; - const auto group = spc.randomMolecule(molid, random_engine, selection); + const auto group = spc.randomMolecule(molid, Faunus::random, selection); // boundary check const bool no_group = (group == spc.groups.end()); const bool cell_is_full = (spc.numMolecules(molid) == gibbs->total_num_particles); - if (no_group || (insert && cell_is_full)) { + + // Synchronize early return: if either process fails, both must fail. + // This is necessary because bias() contains MPI exchange() calls and is only + // called when change is non-empty. Without sync, one process could block in bias(). + const bool local_failed = no_group || (insert && cell_is_full); + const bool other_failed = static_cast<bool>(gibbs->exchange(static_cast<double>(local_failed))); + if (local_failed || other_failed) { change.clear(); return; }