From 862a77641be800c61019b2f296f9a37bd8eb8997 Mon Sep 17 00:00:00 2001
From: jlnav
Date: Mon, 8 Jan 2024 16:54:05 -0600
Subject: [PATCH 01/76] first round of refactoring runners.py: a Runner base
 class handles normal in-place launches but, based on the contents of
 passed-in specs, instantiates the relevant subclass

---
 libensemble/utils/runners.py | 122 +++++++++++------------------------
 libensemble/worker.py        |  10 +--
 2 files changed, 44 insertions(+), 88 deletions(-)

diff --git a/libensemble/utils/runners.py b/libensemble/utils/runners.py
index 07897b942..8c35a9064 100644
--- a/libensemble/utils/runners.py
+++ b/libensemble/utils/runners.py
@@ -1,76 +1,53 @@
 import inspect
 import logging
 import logging.handlers
-from typing import Callable, Dict, Optional
+from typing import Callable, Optional

 import numpy.typing as npt

-from libensemble.message_numbers import EVAL_GEN_TAG, EVAL_SIM_TAG
-
 logger = logging.getLogger(__name__)


-class Runners:
-    """Determines and returns methods for workers to run user functions.
-
-    Currently supported: direct-call and Globus Compute
-    """
-
-    def __init__(self, sim_specs: dict, gen_specs: dict) -> None:
-        self.sim_specs = sim_specs
-        self.gen_specs = gen_specs
-        self.sim_f = sim_specs["sim_f"]
-        self.gen_f = gen_specs.get("gen_f")
-        self.has_globus_compute_sim = len(sim_specs.get("globus_compute_endpoint", "")) > 0
-        self.has_globus_compute_gen = len(gen_specs.get("globus_compute_endpoint", "")) > 0
-
-        if any([self.has_globus_compute_sim, self.has_globus_compute_gen]):
-            if self.has_globus_compute_sim:
-                self.sim_globus_compute_executor = self._get_globus_compute_executor()(
-                    endpoint_id=self.sim_specs["globus_compute_endpoint"]
-                )
-                self.globus_compute_simfid = self.sim_globus_compute_executor.register_function(self.sim_f)
-
-            if self.has_globus_compute_gen:
-                self.gen_globus_compute_executor = self._get_globus_compute_executor()(
-                    endpoint_id=self.gen_specs["globus_compute_endpoint"]
-                )
-                self.globus_compute_genfid = self.gen_globus_compute_executor.register_function(self.gen_f)
+class Runner:
+    def __new__(cls, specs):
+        if len(specs.get("globus_compute_endpoint", "")) > 0:
+            return super(Runner, GlobusComputeRunner).__new__(GlobusComputeRunner)
+        if specs.get("threaded"):  # TODO: undecided interface
+            return super(Runner, ThreadRunner).__new__(ThreadRunner)
+        else:
+            return Runner

-    def make_runners(self) -> Dict[int, Callable]:
-        """Creates functions to run a sim or gen. 
These functions are either - called directly by the worker or submitted to a Globus Compute endpoint.""" + def __init__(self, specs): + self.specs = specs + self.f = specs.get("sim_f") or specs.get("gen_f") - def run_sim(calc_in, Work): - """Determines how to run sim.""" - if self.has_globus_compute_sim: - result = self._globus_compute_result - else: - result = self._normal_result + def _truncate_args(self, calc_in, persis_info, specs, libE_info, user_f): + nparams = len(inspect.signature(user_f).parameters) + args = [calc_in, persis_info, specs, libE_info] + return args[:nparams] - return result(calc_in, Work["persis_info"], self.sim_specs, Work["libE_info"], self.sim_f, Work["tag"]) + def _result( + self, calc_in: npt.NDArray, persis_info: dict, specs: dict, libE_info: dict, user_f: Callable, tag: int + ) -> (npt.NDArray, dict, Optional[int]): + """User function called in-place""" + args = self._truncate_args(calc_in, persis_info, specs, libE_info, user_f) + return user_f(*args) - if self.gen_specs: + def shutdown(self) -> None: + pass - def run_gen(calc_in, Work): - """Determines how to run gen.""" - if self.has_globus_compute_gen: - result = self._globus_compute_result - else: - result = self._normal_result + def run(self, calc_in, Work): + return self._result(calc_in, Work["persis_info"], self.specs, Work["libE_info"], self.f, Work["tag"]) - return result(calc_in, Work["persis_info"], self.gen_specs, Work["libE_info"], self.gen_f, Work["tag"]) - else: - run_gen = [] - - return {EVAL_SIM_TAG: run_sim, EVAL_GEN_TAG: run_gen} +class GlobusComputeRunner(Runner): + def __init__(self, specs): + super().__init__(specs) + self.globus_compute_executor = self._get_globus_compute_executor()(endpoint_id=specs["globus_compute_endpoint"]) + self.globus_compute_fid = self.globus_compute_executor.register_function(self.f) def shutdown(self) -> None: - if self.has_globus_compute_sim: - self.sim_globus_compute_executor.shutdown() - if self.has_globus_compute_gen: - self.gen_globus_compute_executor.shutdown() + self.globus_compute_executor.shutdown() def _get_globus_compute_executor(self): try: @@ -82,42 +59,21 @@ def _get_globus_compute_executor(self): else: return Executor - def _truncate_args(self, calc_in, persis_info, specs, libE_info, user_f): - nparams = len(inspect.signature(user_f).parameters) - args = [calc_in, persis_info, specs, libE_info] - return args[:nparams] - - def _normal_result( - self, calc_in: npt.NDArray, persis_info: dict, specs: dict, libE_info: dict, user_f: Callable, tag: int - ) -> (npt.NDArray, dict, Optional[int]): - """User function called in-place""" - args = self._truncate_args(calc_in, persis_info, specs, libE_info, user_f) - return user_f(*args) - - def _get_func_uuid(self, tag): - if tag == EVAL_SIM_TAG: - return self.globus_compute_simfid - elif tag == EVAL_GEN_TAG: - return self.globus_compute_genfid - - def _get_globus_compute_exctr(self, tag): - if tag == EVAL_SIM_TAG: - return self.sim_globus_compute_executor - elif tag == EVAL_GEN_TAG: - return self.gen_globus_compute_executor - - def _globus_compute_result( + def _result( self, calc_in: npt.NDArray, persis_info: dict, specs: dict, libE_info: dict, user_f: Callable, tag: int ) -> (npt.NDArray, dict, Optional[int]): - """User function submitted to Globus Compute""" from libensemble.worker import Worker libE_info["comm"] = None # 'comm' object not pickle-able Worker._set_executor(0, None) # ditto for executor fargs = self._truncate_args(calc_in, persis_info, specs, libE_info, user_f) - exctr = 
self._get_globus_compute_exctr(tag) - func_id = self._get_func_uuid(tag) + exctr = self.globus_compute_executor + func_id = self.globus_compute_fid task_fut = exctr.submit_to_registered_function(func_id, fargs) return task_fut.result() + + +class ThreadRunner(Runner): + pass diff --git a/libensemble/worker.py b/libensemble/worker.py index 792c7886b..46ab84db6 100644 --- a/libensemble/worker.py +++ b/libensemble/worker.py @@ -33,7 +33,7 @@ from libensemble.utils.loc_stack import LocationStack from libensemble.utils.misc import extract_H_ranges from libensemble.utils.output_directory import EnsembleDirectory -from libensemble.utils.runners import Runners +from libensemble.utils.runners import Runner from libensemble.utils.timer import Timer logger = logging.getLogger(__name__) @@ -166,10 +166,10 @@ def __init__( self.workerID = workerID self.libE_specs = libE_specs self.stats_fmt = libE_specs.get("stats_fmt", {}) - + self.sim_runner = Runner(sim_specs) + self.gen_runner = Runner(gen_specs) + self.runners = {EVAL_SIM_TAG: self.sim_runner.run, EVAL_GEN_TAG: self.gen_runner.run} self.calc_iter = {EVAL_SIM_TAG: 0, EVAL_GEN_TAG: 0} - self.runners = Runners(sim_specs, gen_specs) - self._run_calc = self.runners.make_runners() Worker._set_executor(self.workerID, self.comm) Worker._set_resources(self.workerID, self.comm) self.EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) @@ -258,7 +258,7 @@ def _handle_calc(self, Work: dict, calc_in: npt.NDArray) -> (npt.NDArray, dict, try: logger.debug(f"Starting {enum_desc}: {calc_id}") - calc = self._run_calc[calc_type] + calc = self.runners[calc_type] with timer: if self.EnsembleDirectory.use_calc_dirs(calc_type): loc_stack, calc_dir = self.EnsembleDirectory.prep_calc_dir( From e6874a6657618059c10a1f1a75dc3ba83355c964 Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 9 Jan 2024 12:28:21 -0600 Subject: [PATCH 02/76] refactoring classes so class attributes aren't passed around internally. 
update unit test --- .../tests/unit_tests/test_ufunc_runners.py | 51 +++++++------------ libensemble/utils/runners.py | 42 +++++++-------- libensemble/worker.py | 3 +- 3 files changed, 37 insertions(+), 59 deletions(-) diff --git a/libensemble/tests/unit_tests/test_ufunc_runners.py b/libensemble/tests/unit_tests/test_ufunc_runners.py index 85b986d39..b63360e81 100644 --- a/libensemble/tests/unit_tests/test_ufunc_runners.py +++ b/libensemble/tests/unit_tests/test_ufunc_runners.py @@ -3,9 +3,8 @@ import pytest import libensemble.tests.unit_tests.setup as setup -from libensemble.message_numbers import EVAL_GEN_TAG, EVAL_SIM_TAG from libensemble.tools.fields_keys import libE_fields -from libensemble.utils.runners import Runners +from libensemble.utils.runners import Runner def get_ufunc_args(): @@ -19,7 +18,7 @@ def get_ufunc_args(): sim_ids = np.zeros(1, dtype=int) Work = { - "tag": EVAL_SIM_TAG, + "tag": 1, "persis_info": {}, "libE_info": {"H_rows": sim_ids}, "H_fields": sim_specs["in"], @@ -28,30 +27,15 @@ def get_ufunc_args(): return calc_in, sim_specs, gen_specs -@pytest.mark.extra def test_normal_runners(): calc_in, sim_specs, gen_specs = get_ufunc_args() - runners = Runners(sim_specs, gen_specs) - assert ( - not runners.has_globus_compute_sim and not runners.has_globus_compute_gen + simrunner = Runner(sim_specs) + genrunner = Runner(gen_specs) + assert not hasattr(simrunner, "globus_compute_executor") and not hasattr( + genrunner, "globus_compute_executor" ), "Globus Compute use should not be detected without setting endpoint fields" - ro = runners.make_runners() - assert all( - [i in ro for i in [EVAL_SIM_TAG, EVAL_GEN_TAG]] - ), "Both user function tags should be included in runners dictionary" - - -@pytest.mark.extra -def test_normal_no_gen(): - calc_in, sim_specs, gen_specs = get_ufunc_args() - - runners = Runners(sim_specs, {}) - ro = runners.make_runners() - - assert not ro[2], "generator function shouldn't be provided if not using gen_specs" - @pytest.mark.extra def test_globus_compute_runner_init(): @@ -60,10 +44,10 @@ def test_globus_compute_runner_init(): sim_specs["globus_compute_endpoint"] = "1234" with mock.patch("globus_compute_sdk.Executor"): - runners = Runners(sim_specs, gen_specs) + runner = Runner(sim_specs) - assert ( - runners.sim_globus_compute_executor is not None + assert hasattr( + runner, "globus_compute_executor" ), "Globus ComputeExecutor should have been instantiated when globus_compute_endpoint found in specs" @@ -74,7 +58,7 @@ def test_globus_compute_runner_pass(): sim_specs["globus_compute_endpoint"] = "1234" with mock.patch("globus_compute_sdk.Executor"): - runners = Runners(sim_specs, gen_specs) + runner = Runner(sim_specs) # Creating Mock Globus ComputeExecutor and Globus Compute future object - no exception globus_compute_mock = mock.Mock() @@ -83,12 +67,12 @@ def test_globus_compute_runner_pass(): globus_compute_future.exception.return_value = None globus_compute_future.result.return_value = (True, True) - runners.sim_globus_compute_executor = globus_compute_mock - ro = runners.make_runners() + runner.globus_compute_executor = globus_compute_mock + runners = {1: runner.run} libE_info = {"H_rows": np.array([2, 3, 4]), "workerID": 1, "comm": "fakecomm"} - out, persis_info = ro[1](calc_in, {"libE_info": libE_info, "persis_info": {}, "tag": 1}) + out, persis_info = runners[1](calc_in, {"libE_info": libE_info, "persis_info": {}, "tag": 1}) assert all([out, persis_info]), "Globus Compute runner correctly returned results" @@ -100,7 +84,7 @@ def 
test_globus_compute_runner_fail(): gen_specs["globus_compute_endpoint"] = "4321" with mock.patch("globus_compute_sdk.Executor"): - runners = Runners(sim_specs, gen_specs) + runner = Runner(gen_specs) # Creating Mock Globus ComputeExecutor and Globus Compute future object - yes exception globus_compute_mock = mock.Mock() @@ -108,19 +92,18 @@ def test_globus_compute_runner_fail(): globus_compute_mock.submit_to_registered_function.return_value = globus_compute_future globus_compute_future.exception.return_value = Exception - runners.gen_globus_compute_executor = globus_compute_mock - ro = runners.make_runners() + runner.globus_compute_executor = globus_compute_mock + runners = {2: runner.run} libE_info = {"H_rows": np.array([2, 3, 4]), "workerID": 1, "comm": "fakecomm"} with pytest.raises(Exception): - out, persis_info = ro[2](calc_in, {"libE_info": libE_info, "persis_info": {}, "tag": 2}) + out, persis_info = runners[2](calc_in, {"libE_info": libE_info, "persis_info": {}, "tag": 2}) pytest.fail("Expected exception") if __name__ == "__main__": test_normal_runners() - test_normal_no_gen() test_globus_compute_runner_init() test_globus_compute_runner_pass() test_globus_compute_runner_fail() diff --git a/libensemble/utils/runners.py b/libensemble/utils/runners.py index 8c35a9064..113fcf45b 100644 --- a/libensemble/utils/runners.py +++ b/libensemble/utils/runners.py @@ -1,7 +1,7 @@ import inspect import logging import logging.handlers -from typing import Callable, Optional +from typing import Optional import numpy.typing as npt @@ -15,29 +15,27 @@ def __new__(cls, specs): if specs.get("threaded"): # TODO: undecided interface return super(Runner, ThreadRunner).__new__(ThreadRunner) else: - return Runner + return super().__new__(Runner) def __init__(self, specs): self.specs = specs self.f = specs.get("sim_f") or specs.get("gen_f") - def _truncate_args(self, calc_in, persis_info, specs, libE_info, user_f): - nparams = len(inspect.signature(user_f).parameters) - args = [calc_in, persis_info, specs, libE_info] + def _truncate_args(self, calc_in: npt.NDArray, persis_info, libE_info): + nparams = len(inspect.signature(self.f).parameters) + args = [calc_in, persis_info, self.specs, libE_info] return args[:nparams] - def _result( - self, calc_in: npt.NDArray, persis_info: dict, specs: dict, libE_info: dict, user_f: Callable, tag: int - ) -> (npt.NDArray, dict, Optional[int]): + def _result(self, calc_in: npt.NDArray, persis_info: dict, libE_info: dict) -> (npt.NDArray, dict, Optional[int]): """User function called in-place""" - args = self._truncate_args(calc_in, persis_info, specs, libE_info, user_f) - return user_f(*args) + args = self._truncate_args(calc_in, persis_info, libE_info) + return self.f(*args) def shutdown(self) -> None: pass - def run(self, calc_in, Work): - return self._result(calc_in, Work["persis_info"], self.specs, Work["libE_info"], self.f, Work["tag"]) + def run(self, calc_in: npt.NDArray, Work: dict) -> (npt.NDArray, dict, Optional[int]): + return self._result(calc_in, Work["persis_info"], Work["libE_info"]) class GlobusComputeRunner(Runner): @@ -46,9 +44,6 @@ def __init__(self, specs): self.globus_compute_executor = self._get_globus_compute_executor()(endpoint_id=specs["globus_compute_endpoint"]) self.globus_compute_fid = self.globus_compute_executor.register_function(self.f) - def shutdown(self) -> None: - self.globus_compute_executor.shutdown() - def _get_globus_compute_executor(self): try: from globus_compute_sdk import Executor @@ -59,21 +54,20 @@ def 
_get_globus_compute_executor(self): else: return Executor - def _result( - self, calc_in: npt.NDArray, persis_info: dict, specs: dict, libE_info: dict, user_f: Callable, tag: int - ) -> (npt.NDArray, dict, Optional[int]): + def _result(self, calc_in: npt.NDArray, persis_info: dict, libE_info: dict) -> (npt.NDArray, dict, Optional[int]): from libensemble.worker import Worker libE_info["comm"] = None # 'comm' object not pickle-able Worker._set_executor(0, None) # ditto for executor - fargs = self._truncate_args(calc_in, persis_info, specs, libE_info, user_f) - exctr = self.globus_compute_executor - func_id = self.globus_compute_fid - - task_fut = exctr.submit_to_registered_function(func_id, fargs) + fargs = self._truncate_args(calc_in, persis_info, libE_info) + task_fut = self.globus_compute_executor.submit_to_registered_function(self.globus_compute_fid, fargs) return task_fut.result() + def shutdown(self) -> None: + self.globus_compute_executor.shutdown() + class ThreadRunner(Runner): - pass + def __init__(self, specs): + super().__init__(specs) diff --git a/libensemble/worker.py b/libensemble/worker.py index 46ab84db6..ad8bd4530 100644 --- a/libensemble/worker.py +++ b/libensemble/worker.py @@ -413,5 +413,6 @@ def run(self) -> None: else: self.comm.kill_pending() finally: - self.runners.shutdown() + self.gen_runner.shutdown() + self.sim_runner.shutdown() self.EnsembleDirectory.copy_back() From e17eabedf034f0d5005d19be7e96cdccee820d68 Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 9 Jan 2024 16:22:31 -0600 Subject: [PATCH 03/76] ThreadRunner uses comms.QCommThread, slightly modified, to launch its user function. corresponding unit test --- libensemble/comms/comms.py | 17 ++++++++++------- .../tests/unit_tests/test_ufunc_runners.py | 18 ++++++++++++++++++ libensemble/utils/runners.py | 11 +++++++++++ 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/libensemble/comms/comms.py b/libensemble/comms/comms.py index 9bf14e98a..30de28ad9 100644 --- a/libensemble/comms/comms.py +++ b/libensemble/comms/comms.py @@ -146,7 +146,7 @@ def mail_flag(self): class QCommLocal(Comm): - def __init__(self, main, nworkers, *args, **kwargs): + def __init__(self, main, *args, **kwargs): self._result = None self._exception = None self._done = False @@ -208,10 +208,13 @@ def result(self, timeout=None): return self._result @staticmethod - def _qcomm_main(comm, main, *args, **kwargs): + def _qcomm_main(comm, main, *fargs, **kwargs): """Main routine -- handles return values and exceptions.""" try: - _result = main(comm, *args, **kwargs) + if not kwargs.get("ufunc"): + _result = main(comm, *fargs, **kwargs) + else: + _result = main(*fargs) comm.send(CommResult(_result)) except Exception as e: comm.send(CommResultErr(str(e), format_exc())) @@ -233,12 +236,12 @@ def __exit__(self, etype, value, traceback): class QCommThread(QCommLocal): """Launch a user function in a thread with an attached QComm.""" - def __init__(self, main, nworkers, *args, **kwargs): + def __init__(self, main, nworkers, *fargs, **kwargs): self.inbox = thread_queue.Queue() self.outbox = thread_queue.Queue() - super().__init__(self, main, nworkers, *args, **kwargs) + super().__init__(self, main, *fargs, **kwargs) comm = QComm(self.inbox, self.outbox, nworkers) - self.handle = Thread(target=QCommThread._qcomm_main, args=(comm, main) + args, kwargs=kwargs) + self.handle = Thread(target=QCommThread._qcomm_main, args=(comm, main) + fargs, kwargs=kwargs) def terminate(self, timeout=None): """Terminate the thread. 
@@ -260,7 +263,7 @@ class QCommProcess(QCommLocal): def __init__(self, main, nworkers, *args, **kwargs): self.inbox = Queue() self.outbox = Queue() - super().__init__(self, main, nworkers, *args, **kwargs) + super().__init__(self, main, *args, **kwargs) comm = QComm(self.inbox, self.outbox, nworkers) self.handle = Process(target=QCommProcess._qcomm_main, args=(comm, main) + args, kwargs=kwargs) diff --git a/libensemble/tests/unit_tests/test_ufunc_runners.py b/libensemble/tests/unit_tests/test_ufunc_runners.py index b63360e81..1d3cbb4b2 100644 --- a/libensemble/tests/unit_tests/test_ufunc_runners.py +++ b/libensemble/tests/unit_tests/test_ufunc_runners.py @@ -37,6 +37,23 @@ def test_normal_runners(): ), "Globus Compute use should not be detected without setting endpoint fields" +def test_thread_runners(): + calc_in, sim_specs, gen_specs = get_ufunc_args() + + def tupilize(arg1, arg2): + return (arg1, arg2) + + sim_specs["threaded"] = True # TODO: undecided interface + sim_specs["sim_f"] = tupilize + persis_info = {"hello": "threads"} + + simrunner = Runner(sim_specs) + result = simrunner._result(calc_in, persis_info, {}) + assert result == (calc_in, persis_info) + assert hasattr(simrunner, "thread_handle") + simrunner.shutdown() + + @pytest.mark.extra def test_globus_compute_runner_init(): calc_in, sim_specs, gen_specs = get_ufunc_args() @@ -104,6 +121,7 @@ def test_globus_compute_runner_fail(): if __name__ == "__main__": test_normal_runners() + test_thread_runners() test_globus_compute_runner_init() test_globus_compute_runner_pass() test_globus_compute_runner_fail() diff --git a/libensemble/utils/runners.py b/libensemble/utils/runners.py index 113fcf45b..e21c87ba5 100644 --- a/libensemble/utils/runners.py +++ b/libensemble/utils/runners.py @@ -5,6 +5,8 @@ import numpy.typing as npt +from libensemble.comms.comms import QCommThread + logger = logging.getLogger(__name__) @@ -71,3 +73,12 @@ def shutdown(self) -> None: class ThreadRunner(Runner): def __init__(self, specs): super().__init__(specs) + + def _result(self, calc_in: npt.NDArray, persis_info: dict, libE_info: dict) -> (npt.NDArray, dict, Optional[int]): + fargs = self._truncate_args(calc_in, persis_info, libE_info) + self.thread_handle = QCommThread(self.f, None, *fargs, ufunc=True) + self.thread_handle.run() + return self.thread_handle.result() + + def shutdown(self) -> None: + self.thread_handle.terminate() From 83493d027d41049e5967bee9dd05250fe2b9dfc8 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 10 Jan 2024 10:37:08 -0600 Subject: [PATCH 04/76] handful of small changes from experimental/gen_on_manager_inplace --- libensemble/executors/executor.py | 2 +- libensemble/message_numbers.py | 2 ++ libensemble/resources/scheduler.py | 2 +- libensemble/resources/worker_resources.py | 13 ++++--------- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/libensemble/executors/executor.py b/libensemble/executors/executor.py index 35a321767..c04c0760a 100644 --- a/libensemble/executors/executor.py +++ b/libensemble/executors/executor.py @@ -658,7 +658,7 @@ def set_workerID(self, workerid) -> None: """Sets the worker ID for this executor""" self.workerID = workerid - def set_worker_info(self, comm, workerid=None) -> None: + def set_worker_info(self, comm=None, workerid=None) -> None: """Sets info for this executor""" self.workerID = workerid self.comm = comm diff --git a/libensemble/message_numbers.py b/libensemble/message_numbers.py index adfcbc244..6caef0a6e 100644 --- a/libensemble/message_numbers.py +++ 
b/libensemble/message_numbers.py @@ -41,6 +41,8 @@ # last_calc_status_rst_tag CALC_EXCEPTION = 35 # Reserved: Automatically used if user_f raised an exception +EVAL_FINAL_GEN_TAG = 36 + MAN_KILL_SIGNALS = [MAN_SIGNAL_FINISH, MAN_SIGNAL_KILL] calc_status_strings = { diff --git a/libensemble/resources/scheduler.py b/libensemble/resources/scheduler.py index 04de87e77..386a406bc 100644 --- a/libensemble/resources/scheduler.py +++ b/libensemble/resources/scheduler.py @@ -245,7 +245,7 @@ def get_avail_rsets_by_group(self): for g in groups: self.avail_rsets_by_group[g] = [] for ind, rset in enumerate(rsets): - if not rset["assigned"]: + if rset["assigned"] == -1: # now default is -1. g = rset["group"] self.avail_rsets_by_group[g].append(ind) return self.avail_rsets_by_group diff --git a/libensemble/resources/worker_resources.py b/libensemble/resources/worker_resources.py index 639f27da7..2becaa1df 100644 --- a/libensemble/resources/worker_resources.py +++ b/libensemble/resources/worker_resources.py @@ -50,11 +50,10 @@ def __init__(self, num_workers: int, resources: "GlobalResources") -> None: # n ) self.rsets = np.zeros(self.total_num_rsets, dtype=ResourceManager.man_rset_dtype) - self.rsets["assigned"] = 0 + self.rsets["assigned"] = -1 # Can assign to manager (=0) so make unset value -1 for field in self.all_rsets.dtype.names: self.rsets[field] = self.all_rsets[field] self.num_groups = self.rsets["group"][-1] - self.rsets_free = self.total_num_rsets self.gpu_rsets_free = self.total_num_gpu_rsets self.nongpu_rsets_free = self.total_num_nongpu_rsets @@ -70,7 +69,7 @@ def assign_rsets(self, rset_team, worker_id): if rset_team: rteam = self.rsets["assigned"][rset_team] for i, wid in enumerate(rteam): - if wid == 0: + if wid == -1: self.rsets["assigned"][rset_team[i]] = worker_id self.rsets_free -= 1 if self.rsets["gpus"][rset_team[i]]: @@ -85,13 +84,13 @@ def assign_rsets(self, rset_team, worker_id): def free_rsets(self, worker=None): """Free up assigned resource sets""" if worker is None: - self.rsets["assigned"] = 0 + self.rsets["assigned"] = -1 self.rsets_free = self.total_num_rsets self.gpu_rsets_free = self.total_num_gpu_rsets self.nongpu_rsets_free = self.total_num_nongpu_rsets else: rsets_to_free = np.where(self.rsets["assigned"] == worker)[0] - self.rsets["assigned"][rsets_to_free] = 0 + self.rsets["assigned"][rsets_to_free] = -1 self.rsets_free += len(rsets_to_free) self.gpu_rsets_free += np.count_nonzero(self.rsets["gpus"][rsets_to_free]) self.nongpu_rsets_free += np.count_nonzero(~self.rsets["gpus"][rsets_to_free]) @@ -200,7 +199,6 @@ def __init__(self, num_workers, resources, workerID): self.gen_nprocs = None self.gen_ngpus = None self.platform_info = resources.platform_info - self.tiles_per_gpu = resources.tiles_per_gpu # User convenience functions ---------------------------------------------- @@ -218,9 +216,6 @@ def get_slots_as_string(self, multiplier=1, delimiter=",", limit=None): slot_list = [j for i in self.slots_on_node for j in range(i * n, (i + 1) * n)] if limit is not None: slot_list = slot_list[:limit] - if self.tiles_per_gpu > 1: - ntiles = self.tiles_per_gpu - slot_list = [f"{i // ntiles}.{i % ntiles}" for i in slot_list] slots = delimiter.join(map(str, slot_list)) return slots From 6ad870c7591b6f639768fe6d4b85f0d542ef24c3 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 10 Jan 2024 15:44:51 -0600 Subject: [PATCH 05/76] first incredibly long and ugly concatenation of "pipeline" and "state" management routines from manager.py into pipelines.py --- 
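Note (illustrative, not part of the diff): the Worker wrapper added below gives attribute-style access to one row of the manager's structured worker array. A minimal standalone sketch of the pattern, with an abbreviated field set and the class renamed to avoid clashing with the patch's own Worker:

    import numpy as np

    worker_dtype = [("worker_id", int), ("active", int), ("persis_state", int)]

    class WorkerView:
        """Attribute-style view onto one row of a structured worker array W."""

        def __init__(self, W, wid):
            # Stash through __dict__ so our own __setattr__ is not triggered
            self.__dict__["_W"] = W
            self.__dict__["_wididx"] = wid - 1  # worker IDs are 1-based

        def __setattr__(self, field, value):
            self._W[self._wididx][field] = value  # writes back into W

        def __getattr__(self, field):  # only called for names not found normally
            return self._W[self._wididx][field]

    W = np.zeros(3, dtype=worker_dtype)
    W["worker_id"] = np.arange(3) + 1
    view = WorkerView(W, 2)
    view.active = 1  # updates W[1]["active"] in place
    assert W[1]["active"] == 1 and view.worker_id == 2
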
libensemble/utils/pipelines.py | 382 +++++++++++++++++++++++++++++++++ 1 file changed, 382 insertions(+) create mode 100644 libensemble/utils/pipelines.py diff --git a/libensemble/utils/pipelines.py b/libensemble/utils/pipelines.py new file mode 100644 index 000000000..558c9c962 --- /dev/null +++ b/libensemble/utils/pipelines.py @@ -0,0 +1,382 @@ +import logging +import time +from dataclasses import dataclass + +import numpy as np +import numpy.typing as npt +from numpy.lib.recfunctions import repack_fields + +from libensemble.comms.comms import CommFinishedException +from libensemble.message_numbers import ( + EVAL_GEN_TAG, + EVAL_SIM_TAG, + FINISHED_PERSISTENT_GEN_TAG, + FINISHED_PERSISTENT_SIM_TAG, + MAN_SIGNAL_FINISH, + MAN_SIGNAL_KILL, + PERSIS_STOP, + STOP_TAG, + calc_status_strings, + calc_type_strings, +) +from libensemble.resources.resources import Resources +from libensemble.tools.tools import _PERSIS_RETURN_WARNING +from libensemble.utils.misc import extract_H_ranges +from libensemble.worker import WorkerErrMsg + +logger = logging.getLogger(__name__) + +_WALLCLOCK_MSG_ALL_RETURNED = """ +Termination due to wallclock_max has occurred. +All completed work has been returned. +Posting kill messages for all workers. +""" + +_WALLCLOCK_MSG_ACTIVE = """ +Termination due to wallclock_max has occurred. +Some issued work has not been returned. +Posting kill messages for all workers. +""" + + +class WorkerException(Exception): + """Exception raised on abort signal from worker""" + + +class _WorkPipeline: + def __init__(self, libE_specs, sim_specs, gen_specs): + self.libE_specs = libE_specs + self.sim_specs = sim_specs + self.gen_specs = gen_specs + + +class WorkerToManager(_WorkPipeline): + def __init__(self, libE_specs, sim_specs, gen_specs): + super().__init__(libE_specs, sim_specs, gen_specs) + + +class Worker: + """Wrapper class for Worker array and worker comms""" + + def __init__(self, W: npt.NDArray, wid: int, wcomms: list = []): + self.__dict__["_W"] = W + self.__dict__["_wid"] = wid - 1 + self.__dict__["_wcomms"] = wcomms + + def __setattr__(self, field, value): + self._W[self._wid][field] = value + + def __getattr__(self, field): + return self._W[self._wid][field] + + def update_state_on_alloc(self, Work: dict): + self.active = Work["tag"] + if "libE_info" in Work: + if "persistent" in Work["libE_info"]: + self.persis_state = Work["tag"] + if Work["libE_info"].get("active_recv", False): + self.active_recv = Work["tag"] + else: + assert "active_recv" not in Work["libE_info"], "active_recv worker must also be persistent" + + def update_persistent_state(self): + self.persis_state = 0 + if self.active_recv: + self.active = 0 + self.active_recv = 0 + + def send(self, tag, data): + self._wcomms[self._wid].send(tag, data) + + def mail_flag(self): + return self._wcomms[self._wid].mail_flag() + + def recv(self): + return self._wcomms[self._wid].recv() + + +class _ManagerPipeline(_WorkPipeline): + def __init__(self, libE_specs, sim_specs, gen_specs, W, hist, wcomms): + super().__init__(libE_specs, sim_specs, gen_specs) + self.W = W + self.hist = hist + self.wcomms = wcomms + + def _update_state_on_alloc(self, Work: dict, w: int): + """Updates a workers' active/idle status following an allocation order""" + worker = Worker(self.W, w) + worker.update_state_on_alloc(Work) + + work_rows = Work["libE_info"]["H_rows"] + if Work["tag"] == EVAL_SIM_TAG: + self.hist.update_history_x_out(work_rows, w, self.kill_canceled_sims) + elif Work["tag"] == EVAL_GEN_TAG: + 
self.hist.update_history_to_gen(work_rows)
+
+    def _kill_workers(self) -> None:
+        """Kills the workers"""
+        for w in self.W["worker_id"]:
+            self.wcomms[w - 1].send(STOP_TAG, MAN_SIGNAL_FINISH)
+
+
+class ManagerFromWorker(_ManagerPipeline):
+    def __init__(self, libE_specs, sim_specs, gen_specs, W, hist, wcomms):
+        super().__init__(libE_specs, sim_specs, gen_specs, W, hist)
+        self.WorkerExc = False
+
+    def _handle_msg_from_worker(self, persis_info: dict, w: int) -> None:
+        """Handles a message from worker w"""
+        try:
+            msg = self.wcomms[w - 1].recv()
+            tag, D_recv = msg
+        except CommFinishedException:
+            logger.debug(f"Finalizing message from Worker {w}")
+            return
+        if isinstance(D_recv, WorkerErrMsg):
+            self.W[w - 1]["active"] = 0
+            logger.debug(f"Manager received exception from worker {w}")
+            if not self.WorkerExc:
+                self.WorkerExc = True
+                self._kill_workers()
+                raise WorkerException(f"Received error message from worker {w}", D_recv.msg, D_recv.exc)
+        elif isinstance(D_recv, logging.LogRecord):
+            logger.debug(f"Manager received a log message from worker {w}")
+            logging.getLogger(D_recv.name).handle(D_recv)
+        else:
+            logger.debug(f"Manager received data message from worker {w}")
+            self._update_state_on_worker_msg(persis_info, D_recv, w)
+
+    def _update_state_on_worker_msg(self, persis_info: dict, D_recv: dict, w: int) -> None:
+        """Updates history and worker info on worker message"""
+        calc_type = D_recv["calc_type"]
+        calc_status = D_recv["calc_status"]
+        ManagerFromWorker._check_received_calc(D_recv)
+
+        worker = Worker(self.W, w)
+
+        keep_state = D_recv["libE_info"].get("keep_state", False)
+        if w not in self.persis_pending and not worker.active_recv and not keep_state:
+            worker.active = 0
+
+        if calc_status in [FINISHED_PERSISTENT_SIM_TAG, FINISHED_PERSISTENT_GEN_TAG]:
+            final_data = D_recv.get("calc_out", None)
+            if isinstance(final_data, np.ndarray):
+                if calc_status is FINISHED_PERSISTENT_GEN_TAG and self.libE_specs.get("use_persis_return_gen", False):
+                    self.hist.update_history_x_in(w, final_data, self.W[w - 1]["gen_started_time"])
+                elif calc_status is FINISHED_PERSISTENT_SIM_TAG and self.libE_specs.get("use_persis_return_sim", False):
+                    self.hist.update_history_f(D_recv, self.kill_canceled_sims)
+                else:
+                    logger.info(_PERSIS_RETURN_WARNING)
+            worker.update_persistent_state()
+            if w in self.persis_pending:
+                self.persis_pending.remove(w)
+                worker.active = 0
+            self._freeup_resources(w)
+        else:
+            if calc_type == EVAL_SIM_TAG:
+                self.hist.update_history_f(D_recv, self.kill_canceled_sims)
+            if calc_type == EVAL_GEN_TAG:
+                self.hist.update_history_x_in(w, D_recv["calc_out"], worker.gen_started_time)
+                assert (
+                    len(D_recv["calc_out"]) or np.any(self.W["active"]) or worker.persis_state
+                ), "Gen must return work when it is the only thing active and not persistent."
+            if "libE_info" in D_recv and "persistent" in D_recv["libE_info"]:
+                # Now a waiting, persistent worker
+                worker.persis_state = calc_type
+            else:
+                self._freeup_resources(w)
+
+    def _receive_from_workers(self, persis_info: dict) -> dict:
+        """Receives calculation output from workers. Loops over all
+        active workers and probes to see if worker is ready to
+        communicate. If any output is received, all other workers are
+        looped back over. 
+ """ + time.sleep(0.0001) # Critical for multiprocessing performance + new_stuff = True + while new_stuff: + new_stuff = False + for w in self.W["worker_id"]: + if self.wcomms[w - 1].mail_flag(): + new_stuff = True + self._handle_msg_from_worker(persis_info, w) + + self._init_every_k_save() + return persis_info + + def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int): + """ + Tries to receive from any active workers. + + If time expires before all active workers have been received from, a + nonblocking receive is posted (though the manager will not receive this + data) and a kill signal is sent. + """ + + # Send a handshake signal to each persistent worker. + if any(self.W["persis_state"]): + for w in self.W["worker_id"][self.W["persis_state"] > 0]: + logger.debug(f"Manager sending PERSIS_STOP to worker {w}") + if self.libE_specs.get("final_gen_send", False): + rows_to_send = np.where(self.hist.H["sim_ended"] & ~self.hist.H["gen_informed"])[0] + work = { + "H_fields": self.gen_specs["persis_in"], + "persis_info": persis_info[w], + "tag": PERSIS_STOP, + "libE_info": {"persistent": True, "H_rows": rows_to_send}, + } + self._check_work_order(work, w, force=True) + self._send_work_order(work, w) + self.hist.update_history_to_gen(rows_to_send) + else: + self.wcomms[w - 1].send(PERSIS_STOP, MAN_SIGNAL_KILL) + if not self.W[w - 1]["active"]: + # Re-activate if necessary + self.W[w - 1]["active"] = self.W[w - 1]["persis_state"] + self.persis_pending.append(w) + + exit_flag = 0 + while (any(self.W["active"]) or any(self.W["persis_state"])) and exit_flag == 0: + persis_info = self._receive_from_workers(persis_info) + if self.term_test(logged=False) == 2: + # Elapsed Wallclock has expired + if not any(self.W["persis_state"]): + if any(self.W["active"]): + logger.manager_warning(_WALLCLOCK_MSG_ACTIVE) + else: + logger.manager_warning(_WALLCLOCK_MSG_ALL_RETURNED) + exit_flag = 2 + if self.WorkerExc: + exit_flag = 1 + + self._init_every_k_save(complete=self.libE_specs["save_H_on_completion"]) + self._kill_workers() + return persis_info, exit_flag, self.elapsed() + + @staticmethod + def _check_received_calc(D_recv: dict) -> None: + """Checks the type and status fields on a receive calculation""" + calc_type = D_recv["calc_type"] + calc_status = D_recv["calc_status"] + assert calc_type in [ + EVAL_SIM_TAG, + EVAL_GEN_TAG, + ], f"Aborting, Unknown calculation type received. Received type: {calc_type}" + + assert calc_status in list(calc_status_strings.keys()) + [PERSIS_STOP] or isinstance( + calc_status, str + ), f"Aborting: Unknown calculation status received. 
Received status: {calc_status}" + + +@dataclass +class Work: + wid: int + H_fields: list + persis_info: dict + tag: int + libE_info: dict + + +class ManagerToWorker(_ManagerPipeline): + def __init__(self, libE_specs, sim_specs, gen_specs, W, wcomms): + super().__init__(libE_specs, sim_specs, gen_specs, W) + self.wcomms = wcomms + + def _kill_cancelled_sims(self) -> None: + """Send kill signals to any sims marked as cancel_requested""" + + if self.kill_canceled_sims: + inds_to_check = np.arange(self.hist.last_ended + 1, self.hist.last_started + 1) + + kill_sim = ( + self.hist.H["sim_started"][inds_to_check] + & self.hist.H["cancel_requested"][inds_to_check] + & ~self.hist.H["sim_ended"][inds_to_check] + & ~self.hist.H["kill_sent"][inds_to_check] + ) + kill_sim_rows = inds_to_check[kill_sim] + + # Note that a return is still expected when running sims are killed + if np.any(kill_sim): + logger.debug(f"Manager sending kill signals to H indices {kill_sim_rows}") + kill_ids = self.hist.H["sim_id"][kill_sim_rows] + kill_on_workers = self.hist.H["sim_worker"][kill_sim_rows] + for w in kill_on_workers: + self.wcomms[w - 1].send(STOP_TAG, MAN_SIGNAL_KILL) + self.hist.H["kill_sent"][kill_ids] = True + + @staticmethod + def _set_resources(Work: dict, w: int) -> None: + """Check rsets given in Work match rsets assigned in resources. + + If rsets are not assigned, then assign using default mapping + """ + resource_manager = Resources.resources.resource_manager + rset_req = Work["libE_info"].get("rset_team") + + if rset_req is None: + rset_team = [] + default_rset = resource_manager.index_list[w - 1] + if default_rset is not None: + rset_team.append(default_rset) + Work["libE_info"]["rset_team"] = rset_team + + resource_manager.assign_rsets(Work["libE_info"]["rset_team"], w) + + def _send_work_order(self, Work: dict, w: int) -> None: + """Sends an allocation function order to a worker""" + logger.debug(f"Manager sending work unit to worker {w}") + + if Resources.resources: + self._set_resources(Work, w) + + self.wcomms[w - 1].send(Work["tag"], Work) + + if Work["tag"] == EVAL_GEN_TAG: + self.W[w - 1]["gen_started_time"] = time.time() + + work_rows = Work["libE_info"]["H_rows"] + work_name = calc_type_strings[Work["tag"]] + logger.debug(f"Manager sending {work_name} work to worker {w}. Rows {extract_H_ranges(Work) or None}") + if len(work_rows): + new_dtype = [(name, self.hist.H.dtype.fields[name][0]) for name in Work["H_fields"]] + H_to_be_sent = np.empty(len(work_rows), dtype=new_dtype) + for i, row in enumerate(work_rows): + H_to_be_sent[i] = repack_fields(self.hist.H[Work["H_fields"]][row]) + self.wcomms[w - 1].send(0, H_to_be_sent) + + def _check_work_order(self, Work: dict, w: int, force: bool = False) -> None: + """Checks validity of an allocation function order""" + assert w != 0, "Can't send to worker 0; this is the manager." + if self.W[w - 1]["active_recv"]: + assert "active_recv" in Work["libE_info"], ( + "Messages to a worker in active_recv mode should have active_recv" + f"set to True in libE_info. Work['libE_info'] is {Work['libE_info']}" + ) + else: + if not force: + assert self.W[w - 1]["active"] == 0, ( + "Allocation function requested work be sent to worker %d, an already active worker." % w + ) + work_rows = Work["libE_info"]["H_rows"] + if len(work_rows): + work_fields = set(Work["H_fields"]) + + assert len(work_fields), ( + f"Allocation function requested rows={work_rows} be sent to worker={w}, " + "but requested no fields to be sent." 
+ ) + hist_fields = self.hist.H.dtype.names + diff_fields = list(work_fields.difference(hist_fields)) + + assert not diff_fields, f"Allocation function requested invalid fields {diff_fields} be sent to worker={w}." + + def _freeup_resources(self, w: int) -> None: + """Free up resources assigned to the worker""" + if self.resources: + self.resources.resource_manager.free_rsets(w) + + +class ManagerInplace(_ManagerPipeline): + def __init__(self, libE_specs, sim_specs, gen_specs): + super().__init__(libE_specs, sim_specs, gen_specs) From d14b0aae4e55375c1dc9694bd6a9dfa530ee3828 Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 11 Jan 2024 13:57:53 -0600 Subject: [PATCH 06/76] progress --- libensemble/utils/pipelines.py | 36 ++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/libensemble/utils/pipelines.py b/libensemble/utils/pipelines.py index 558c9c962..694710527 100644 --- a/libensemble/utils/pipelines.py +++ b/libensemble/utils/pipelines.py @@ -85,6 +85,9 @@ def update_persistent_state(self): self.active = 0 self.active_recv = 0 + def set_work(self, Work): + self.__dict__["_Work"] = Work + def send(self, tag, data): self._wcomms[self._wid].send(tag, data) @@ -126,14 +129,15 @@ def __init__(self, libE_specs, sim_specs, gen_specs, W, hist, wcomms): def _handle_msg_from_worker(self, persis_info: dict, w: int) -> None: """Handles a message from worker w""" + worker = Worker(self.W, w) try: - msg = self.wcomms[w - 1].recv() + msg = worker.recv() tag, D_recv = msg except CommFinishedException: logger.debug(f"Finalizing message from Worker {w}") return if isinstance(D_recv, WorkerErrMsg): - self.W[w - 1]["active"] = 0 + worker.active = 0 logger.debug(f"Manager received exception from worker {w}") if not self.WorkerExc: self.WorkerExc = True @@ -162,7 +166,7 @@ def _update_state_on_worker_msg(self, persis_info: dict, D_recv: dict, w: int) - final_data = D_recv.get("calc_out", None) if isinstance(final_data, np.ndarray): if calc_status is FINISHED_PERSISTENT_GEN_TAG and self.libE_specs.get("use_persis_return_gen", False): - self.hist.update_history_x_in(w, final_data, self.W[w - 1]["gen_started_time"]) + self.hist.update_history_x_in(w, final_data, worker.gen_started_time) elif calc_status is FINISHED_PERSISTENT_SIM_TAG and self.libE_specs.get("use_persis_return_sim", False): self.hist.update_history_f(D_recv, self.kill_canceled_sims) else: @@ -216,6 +220,7 @@ def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int): # Send a handshake signal to each persistent worker. if any(self.W["persis_state"]): for w in self.W["worker_id"][self.W["persis_state"] > 0]: + worker = Worker(self.W, w) logger.debug(f"Manager sending PERSIS_STOP to worker {w}") if self.libE_specs.get("final_gen_send", False): rows_to_send = np.where(self.hist.H["sim_ended"] & ~self.hist.H["gen_informed"])[0] @@ -225,14 +230,14 @@ def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int): "tag": PERSIS_STOP, "libE_info": {"persistent": True, "H_rows": rows_to_send}, } - self._check_work_order(work, w, force=True) + # self._check_work_order(work, w, force=True) # this work is hardcoded, not from an alloc_f. trust! 
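                # Note: this Work unit is assembled by the manager itself for the
                # final PERSIS_STOP send (its H_rows are the sim_ended-but-not-yet-
                # gen_informed entries gathered above), so the alloc_f validity
                # check is safe to skip here.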
self._send_work_order(work, w) self.hist.update_history_to_gen(rows_to_send) else: - self.wcomms[w - 1].send(PERSIS_STOP, MAN_SIGNAL_KILL) - if not self.W[w - 1]["active"]: + worker.send(PERSIS_STOP, MAN_SIGNAL_KILL) + if not worker.active: # Re-activate if necessary - self.W[w - 1]["active"] = self.W[w - 1]["persis_state"] + worker.active = worker.persis_state self.persis_pending.append(w) exit_flag = 0 @@ -327,13 +332,15 @@ def _send_work_order(self, Work: dict, w: int) -> None: """Sends an allocation function order to a worker""" logger.debug(f"Manager sending work unit to worker {w}") + worker = Worker(self.W, w) + if Resources.resources: self._set_resources(Work, w) - self.wcomms[w - 1].send(Work["tag"], Work) + worker.send(Work["tag"], Work) if Work["tag"] == EVAL_GEN_TAG: - self.W[w - 1]["gen_started_time"] = time.time() + worker.gen_started_time = time.time() work_rows = Work["libE_info"]["H_rows"] work_name = calc_type_strings[Work["tag"]] @@ -343,19 +350,22 @@ def _send_work_order(self, Work: dict, w: int) -> None: H_to_be_sent = np.empty(len(work_rows), dtype=new_dtype) for i, row in enumerate(work_rows): H_to_be_sent[i] = repack_fields(self.hist.H[Work["H_fields"]][row]) - self.wcomms[w - 1].send(0, H_to_be_sent) + worker.send(0, H_to_be_sent) def _check_work_order(self, Work: dict, w: int, force: bool = False) -> None: """Checks validity of an allocation function order""" - assert w != 0, "Can't send to worker 0; this is the manager." - if self.W[w - 1]["active_recv"]: + # assert w != 0, "Can't send to worker 0; this is the manager." + + worker = Worker(self.W, w) + + if worker.active_recv: assert "active_recv" in Work["libE_info"], ( "Messages to a worker in active_recv mode should have active_recv" f"set to True in libE_info. Work['libE_info'] is {Work['libE_info']}" ) else: if not force: - assert self.W[w - 1]["active"] == 0, ( + assert worker.active == 0, ( "Allocation function requested work be sent to worker %d, an already active worker." 
% w ) work_rows = Work["libE_info"]["H_rows"] From ab32e3fa22b60c635637e5ce7d6d8438c6b8dbf2 Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 11 Jan 2024 17:46:07 -0600 Subject: [PATCH 07/76] bugfixes, first "working" refactor of manager can run 1d_sampling using utils.pipelines --- libensemble/manager.py | 25 ++++---- libensemble/utils/pipelines.py | 101 ++++++++++++--------------------- 2 files changed, 51 insertions(+), 75 deletions(-) diff --git a/libensemble/manager.py b/libensemble/manager.py index cce7682f8..25e82ada1 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -36,6 +36,7 @@ from libensemble.tools.tools import _PERSIS_RETURN_WARNING, _USER_CALC_DIR_WARNING from libensemble.utils.misc import extract_H_ranges from libensemble.utils.output_directory import EnsembleDirectory +from libensemble.utils.pipelines import ManagerFromWorker, ManagerToWorker from libensemble.utils.timer import Timer from libensemble.worker import WorkerErrMsg @@ -108,9 +109,6 @@ def manager_main( pr = cProfile.Profile() pr.enable() - if "in" not in gen_specs: - gen_specs["in"] = [] - # Send dtypes to workers dtypes = { EVAL_SIM_TAG: repack_fields(hist.H[sim_specs["in"]]).dtype, @@ -642,11 +640,15 @@ def run(self, persis_info: dict) -> (dict, int, int): logger.info(f"Manager initiated on node {socket.gethostname()}") logger.info(f"Manager exit_criteria: {self.exit_criteria}") + self.ToWorker = ManagerToWorker(self) + self.FromWorker = ManagerFromWorker(self) + # Continue receiving and giving until termination test is satisfied try: while not self.term_test(): - self._kill_cancelled_sims() - persis_info = self._receive_from_workers(persis_info) + self.ToWorker._kill_cancelled_sims() + persis_info = self.FromWorker._receive_from_workers(persis_info) + self._init_every_k_save() Work, persis_info, flag = self._alloc_work(self.hist.trim_H(), persis_info) if flag: break @@ -654,21 +656,22 @@ def run(self, persis_info: dict) -> (dict, int, int): for w in Work: if self._sim_max_given(): break - self._check_work_order(Work[w], w) - self._send_work_order(Work[w], w) - self._update_state_on_alloc(Work[w], w) + self.ToWorker._check_work_order(Work[w], w) + self.ToWorker._send_work_order(Work[w], w) + self.ToWorker._update_state_on_alloc(Work[w], w) assert self.term_test() or any( self.W["active"] != 0 ), "alloc_f did not return any work, although all workers are idle." 
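            # Loop above: drain worker messages, call the alloc_f, then hand each
            # Work unit to the ManagerToWorker pipeline (check, send, record state);
            # the handlers below separate worker-raised errors from manager-side bugs.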
- except WorkerException as e: + except WorkerException as e: # catches all error messages from worker report_worker_exc(e) raise LoggedException(e.args[0], e.args[1]) from None - except Exception as e: + except Exception as e: # should only catch bugs within manager, or AssertionErrors logger.error(traceback.format_exc()) raise LoggedException(e.args) from None finally: # Return persis_info, exit_flag, elapsed time - result = self._final_receive_and_kill(persis_info) + result = self.FromWorker._final_receive_and_kill(persis_info) + self._init_every_k_save(complete=self.libE_specs["save_H_on_completion"]) sys.stdout.flush() sys.stderr.flush() return result diff --git a/libensemble/utils/pipelines.py b/libensemble/utils/pipelines.py index 694710527..a50d85a82 100644 --- a/libensemble/utils/pipelines.py +++ b/libensemble/utils/pipelines.py @@ -1,6 +1,5 @@ import logging import time -from dataclasses import dataclass import numpy as np import numpy.typing as npt @@ -16,7 +15,6 @@ MAN_SIGNAL_KILL, PERSIS_STOP, STOP_TAG, - calc_status_strings, calc_type_strings, ) from libensemble.resources.resources import Resources @@ -60,24 +58,23 @@ class Worker: def __init__(self, W: npt.NDArray, wid: int, wcomms: list = []): self.__dict__["_W"] = W - self.__dict__["_wid"] = wid - 1 + self.__dict__["_wididx"] = wid - 1 self.__dict__["_wcomms"] = wcomms def __setattr__(self, field, value): - self._W[self._wid][field] = value + self._W[self._wididx][field] = value def __getattr__(self, field): - return self._W[self._wid][field] + return self._W[self._wididx][field] def update_state_on_alloc(self, Work: dict): self.active = Work["tag"] - if "libE_info" in Work: - if "persistent" in Work["libE_info"]: - self.persis_state = Work["tag"] - if Work["libE_info"].get("active_recv", False): - self.active_recv = Work["tag"] - else: - assert "active_recv" not in Work["libE_info"], "active_recv worker must also be persistent" + if "persistent" in Work["libE_info"]: + self.persis_state = Work["tag"] + if Work["libE_info"].get("active_recv", False): + self.active_recv = Work["tag"] + else: + assert "active_recv" not in Work["libE_info"], "active_recv worker must also be persistent" def update_persistent_state(self): self.persis_state = 0 @@ -89,25 +86,27 @@ def set_work(self, Work): self.__dict__["_Work"] = Work def send(self, tag, data): - self._wcomms[self._wid].send(tag, data) + self._wcomms[self._wididx].send(tag, data) def mail_flag(self): - return self._wcomms[self._wid].mail_flag() + return self._wcomms[self._wididx].mail_flag() def recv(self): - return self._wcomms[self._wid].recv() + return self._wcomms[self._wididx].recv() class _ManagerPipeline(_WorkPipeline): - def __init__(self, libE_specs, sim_specs, gen_specs, W, hist, wcomms): - super().__init__(libE_specs, sim_specs, gen_specs) - self.W = W - self.hist = hist - self.wcomms = wcomms + def __init__(self, Manager): + super().__init__(Manager.libE_specs, Manager.sim_specs, Manager.gen_specs) + self.W = Manager.W + self.hist = Manager.hist + self.wcomms = Manager.wcomms + self.kill_canceled_sims = Manager.kill_canceled_sims + self.persis_pending = Manager.persis_pending def _update_state_on_alloc(self, Work: dict, w: int): """Updates a workers' active/idle status following an allocation order""" - worker = Worker(self.W, w) + worker = Worker(self.W, w, self.wcomms) worker.update_state_on_alloc(Work) work_rows = Work["libE_info"]["H_rows"] @@ -123,16 +122,19 @@ def _kill_workers(self) -> None: class ManagerFromWorker(_ManagerPipeline): - def __init__(self, 
libE_specs, sim_specs, gen_specs, W, hist, wcomms): - super().__init__(libE_specs, sim_specs, gen_specs, W, hist) + def __init__(self, Manager): + super().__init__(Manager) self.WorkerExc = False + self.resources = Manager.resources + self.term_test = Manager.term_test + self.elapsed = Manager.elapsed def _handle_msg_from_worker(self, persis_info: dict, w: int) -> None: """Handles a message from worker w""" - worker = Worker(self.W, w) + worker = Worker(self.W, w, self.wcomms) try: msg = worker.recv() - tag, D_recv = msg + _, D_recv = msg except CommFinishedException: logger.debug(f"Finalizing message from Worker {w}") return @@ -154,9 +156,8 @@ def _update_state_on_worker_msg(self, persis_info: dict, D_recv: dict, w: int) - """Updates history and worker info on worker message""" calc_type = D_recv["calc_type"] calc_status = D_recv["calc_status"] - ManagerFromWorker._check_received_calc(D_recv) - worker = Worker(self.W, w) + worker = Worker(self.W, w, self.wcomms) keep_state = D_recv["libE_info"].get("keep_state", False) if w not in self.persis_pending and not worker.active_recv and not keep_state: @@ -205,7 +206,6 @@ def _receive_from_workers(self, persis_info: dict) -> dict: new_stuff = True self._handle_msg_from_worker(persis_info, w) - self._init_every_k_save() return persis_info def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int): @@ -220,7 +220,7 @@ def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int): # Send a handshake signal to each persistent worker. if any(self.W["persis_state"]): for w in self.W["worker_id"][self.W["persis_state"] > 0]: - worker = Worker(self.W, w) + worker = Worker(self.W, w, self.wcomms) logger.debug(f"Manager sending PERSIS_STOP to worker {w}") if self.libE_specs.get("final_gen_send", False): rows_to_send = np.where(self.hist.H["sim_ended"] & ~self.hist.H["gen_informed"])[0] @@ -230,7 +230,6 @@ def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int): "tag": PERSIS_STOP, "libE_info": {"persistent": True, "H_rows": rows_to_send}, } - # self._check_work_order(work, w, force=True) # this work is hardcoded, not from an alloc_f. trust! self._send_work_order(work, w) self.hist.update_history_to_gen(rows_to_send) else: @@ -254,38 +253,18 @@ def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int): if self.WorkerExc: exit_flag = 1 - self._init_every_k_save(complete=self.libE_specs["save_H_on_completion"]) self._kill_workers() return persis_info, exit_flag, self.elapsed() - @staticmethod - def _check_received_calc(D_recv: dict) -> None: - """Checks the type and status fields on a receive calculation""" - calc_type = D_recv["calc_type"] - calc_status = D_recv["calc_status"] - assert calc_type in [ - EVAL_SIM_TAG, - EVAL_GEN_TAG, - ], f"Aborting, Unknown calculation type received. Received type: {calc_type}" - - assert calc_status in list(calc_status_strings.keys()) + [PERSIS_STOP] or isinstance( - calc_status, str - ), f"Aborting: Unknown calculation status received. 
Received status: {calc_status}" - - -@dataclass -class Work: - wid: int - H_fields: list - persis_info: dict - tag: int - libE_info: dict + def _freeup_resources(self, w: int) -> None: + """Free up resources assigned to the worker""" + if self.resources: + self.resources.resource_manager.free_rsets(w) class ManagerToWorker(_ManagerPipeline): - def __init__(self, libE_specs, sim_specs, gen_specs, W, wcomms): - super().__init__(libE_specs, sim_specs, gen_specs, W) - self.wcomms = wcomms + def __init__(self, Manager): + super().__init__(Manager) def _kill_cancelled_sims(self) -> None: """Send kill signals to any sims marked as cancel_requested""" @@ -332,7 +311,7 @@ def _send_work_order(self, Work: dict, w: int) -> None: """Sends an allocation function order to a worker""" logger.debug(f"Manager sending work unit to worker {w}") - worker = Worker(self.W, w) + worker = Worker(self.W, w, self.wcomms) if Resources.resources: self._set_resources(Work, w) @@ -354,9 +333,8 @@ def _send_work_order(self, Work: dict, w: int) -> None: def _check_work_order(self, Work: dict, w: int, force: bool = False) -> None: """Checks validity of an allocation function order""" - # assert w != 0, "Can't send to worker 0; this is the manager." - worker = Worker(self.W, w) + worker = Worker(self.W, w, self.wcomms) if worker.active_recv: assert "active_recv" in Work["libE_info"], ( @@ -381,11 +359,6 @@ def _check_work_order(self, Work: dict, w: int, force: bool = False) -> None: assert not diff_fields, f"Allocation function requested invalid fields {diff_fields} be sent to worker={w}." - def _freeup_resources(self, w: int) -> None: - """Free up resources assigned to the worker""" - if self.resources: - self.resources.resource_manager.free_rsets(w) - class ManagerInplace(_ManagerPipeline): def __init__(self, libE_specs, sim_specs, gen_specs): From 68d8855b8fce28f60705b80246398e5c19dae055 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 12 Jan 2024 17:52:12 -0600 Subject: [PATCH 08/76] removing now-redundant content from manager, trying to see if we can start a temporary, local Worker for handling work --- libensemble/comms/comms.py | 1 + libensemble/manager.py | 284 ++------------------------------- libensemble/utils/pipelines.py | 41 +++-- libensemble/worker.py | 7 +- 4 files changed, 51 insertions(+), 282 deletions(-) diff --git a/libensemble/comms/comms.py b/libensemble/comms/comms.py index 30de28ad9..70458dd98 100644 --- a/libensemble/comms/comms.py +++ b/libensemble/comms/comms.py @@ -150,6 +150,7 @@ def __init__(self, main, *args, **kwargs): self._result = None self._exception = None self._done = False + self._ufunc = kwargs.get("ufunc", False) def _is_result_msg(self, msg): """Return true if message indicates final result (and set result/except).""" diff --git a/libensemble/manager.py b/libensemble/manager.py index 25e82ada1..a822de005 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -10,35 +10,22 @@ import platform import socket import sys -import time import traceback +from queue import SimpleQueue from typing import Any, Union import numpy as np import numpy.typing as npt from numpy.lib.recfunctions import repack_fields -from libensemble.comms.comms import CommFinishedException -from libensemble.message_numbers import ( - EVAL_GEN_TAG, - EVAL_SIM_TAG, - FINISHED_PERSISTENT_GEN_TAG, - FINISHED_PERSISTENT_SIM_TAG, - MAN_SIGNAL_FINISH, - MAN_SIGNAL_KILL, - PERSIS_STOP, - STOP_TAG, - calc_status_strings, - calc_type_strings, -) +from libensemble.comms.comms import QComm +from 
libensemble.message_numbers import EVAL_GEN_TAG, EVAL_SIM_TAG, PERSIS_STOP, calc_status_strings from libensemble.resources.resources import Resources from libensemble.tools.fields_keys import protected_libE_fields -from libensemble.tools.tools import _PERSIS_RETURN_WARNING, _USER_CALC_DIR_WARNING -from libensemble.utils.misc import extract_H_ranges +from libensemble.tools.tools import _USER_CALC_DIR_WARNING from libensemble.utils.output_directory import EnsembleDirectory from libensemble.utils.pipelines import ManagerFromWorker, ManagerToWorker from libensemble.utils.timer import Timer -from libensemble.worker import WorkerErrMsg logger = logging.getLogger(__name__) # For debug messages - uncomment @@ -122,6 +109,8 @@ def manager_main( for wcomm in wcomms: wcomm.send(0, libE_specs.get("workflow_dir_path")) + libE_specs["_dtypes"] = dtypes + # Set up and run manager mgr = Manager(hist, libE_specs, alloc_specs, sim_specs, gen_specs, exit_criteria, wcomms) result = mgr.run(persis_info) @@ -198,8 +187,8 @@ def __init__( self.gen_num_procs = libE_specs.get("gen_num_procs", 0) self.gen_num_gpus = libE_specs.get("gen_num_gpus", 0) - self.W = np.zeros(len(self.wcomms), dtype=Manager.worker_dtype) - self.W["worker_id"] = np.arange(len(self.wcomms)) + 1 + self.W = np.zeros(len(self.wcomms) + 1, dtype=Manager.worker_dtype) + self.W["worker_id"] = np.arange(len(self.wcomms) + 1) self.term_tests = [ (2, "wallclock_max", self.term_test_wallclock), (1, "sim_max", self.term_test_sim_max), @@ -207,6 +196,11 @@ def __init__( (1, "stop_val", self.term_test_stop_val), ] + self.self_inbox = SimpleQueue() + self.self_outbox = SimpleQueue() + + self.wcomms = [QComm(self.self_inbox, self.self_outbox, len(self.W))] + self.wcomms + temp_EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) self.resources = Resources.resources self.scheduler_opts = self.libE_specs.get("scheduler_opts", {}) @@ -259,13 +253,6 @@ def term_test(self, logged: bool = True) -> Union[bool, int]: return retval return 0 - # --- Low-level communication routines - - def _kill_workers(self) -> None: - """Kills the workers""" - for w in self.W["worker_id"]: - self.wcomms[w - 1].send(STOP_TAG, MAN_SIGNAL_FINISH) - # --- Checkpointing logic def _get_date_start_str(self) -> str: @@ -314,95 +301,6 @@ def _init_every_k_save(self, complete=False) -> None: if self.libE_specs.get("save_every_k_gens"): self._save_every_k_gens(complete) - # --- Handle outgoing messages to workers (work orders from alloc) - - def _check_work_order(self, Work: dict, w: int, force: bool = False) -> None: - """Checks validity of an allocation function order""" - assert w != 0, "Can't send to worker 0; this is the manager." - if self.W[w - 1]["active_recv"]: - assert "active_recv" in Work["libE_info"], ( - "Messages to a worker in active_recv mode should have active_recv" - f"set to True in libE_info. Work['libE_info'] is {Work['libE_info']}" - ) - else: - if not force: - assert self.W[w - 1]["active"] == 0, ( - "Allocation function requested work be sent to worker %d, an already active worker." % w - ) - work_rows = Work["libE_info"]["H_rows"] - if len(work_rows): - work_fields = set(Work["H_fields"]) - - assert len(work_fields), ( - f"Allocation function requested rows={work_rows} be sent to worker={w}, " - "but requested no fields to be sent." - ) - hist_fields = self.hist.H.dtype.names - diff_fields = list(work_fields.difference(hist_fields)) - - assert not diff_fields, f"Allocation function requested invalid fields {diff_fields} be sent to worker={w}." 
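Aside (illustrative stand-in with assumed semantics, not libEnsemble's comms.QComm): the SimpleQueue pair prepended to wcomms in __init__ above reserves slot 0 so the manager can address itself, roughly:

    from queue import SimpleQueue

    class SelfComm:  # hypothetical stand-in for libensemble.comms.comms.QComm
        def __init__(self, inbox, outbox, nworkers=None):
            self.inbox, self.outbox = inbox, outbox

        def send(self, tag, data):  # assumed: send pushes to the outbox
            self.outbox.put((tag, data))

        def recv(self):  # assumed: recv pulls from the inbox
            return self.inbox.get()

    self_inbox, self_outbox = SimpleQueue(), SimpleQueue()
    wcomms = [SelfComm(self_inbox, self_outbox)]  # + the real worker comms
    wcomms[0].send(0, "work for an in-place user function")
    print(self_outbox.get())  # -> (0, 'work for an in-place user function')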
- - def _set_resources(self, Work: dict, w: int) -> None: - """Check rsets given in Work match rsets assigned in resources. - - If rsets are not assigned, then assign using default mapping - """ - resource_manager = self.resources.resource_manager - rset_req = Work["libE_info"].get("rset_team") - - if rset_req is None: - rset_team = [] - default_rset = resource_manager.index_list[w - 1] - if default_rset is not None: - rset_team.append(default_rset) - Work["libE_info"]["rset_team"] = rset_team - - resource_manager.assign_rsets(Work["libE_info"]["rset_team"], w) - - def _freeup_resources(self, w: int) -> None: - """Free up resources assigned to the worker""" - if self.resources: - self.resources.resource_manager.free_rsets(w) - - def _send_work_order(self, Work: dict, w: int) -> None: - """Sends an allocation function order to a worker""" - logger.debug(f"Manager sending work unit to worker {w}") - - if self.resources: - self._set_resources(Work, w) - - self.wcomms[w - 1].send(Work["tag"], Work) - - if Work["tag"] == EVAL_GEN_TAG: - self.W[w - 1]["gen_started_time"] = time.time() - - work_rows = Work["libE_info"]["H_rows"] - work_name = calc_type_strings[Work["tag"]] - logger.debug(f"Manager sending {work_name} work to worker {w}. Rows {extract_H_ranges(Work) or None}") - if len(work_rows): - new_dtype = [(name, self.hist.H.dtype.fields[name][0]) for name in Work["H_fields"]] - H_to_be_sent = np.empty(len(work_rows), dtype=new_dtype) - for i, row in enumerate(work_rows): - H_to_be_sent[i] = repack_fields(self.hist.H[Work["H_fields"]][row]) - self.wcomms[w - 1].send(0, H_to_be_sent) - - def _update_state_on_alloc(self, Work: dict, w: int): - """Updates a workers' active/idle status following an allocation order""" - self.W[w - 1]["active"] = Work["tag"] - if "libE_info" in Work: - if "persistent" in Work["libE_info"]: - self.W[w - 1]["persis_state"] = Work["tag"] - if Work["libE_info"].get("active_recv", False): - self.W[w - 1]["active_recv"] = Work["tag"] - else: - assert "active_recv" not in Work["libE_info"], "active_recv worker must also be persistent" - - work_rows = Work["libE_info"]["H_rows"] - if Work["tag"] == EVAL_SIM_TAG: - self.hist.update_history_x_out(work_rows, w, self.kill_canceled_sims) - elif Work["tag"] == EVAL_GEN_TAG: - self.hist.update_history_to_gen(work_rows) - # --- Handle incoming messages from workers @staticmethod @@ -419,164 +317,8 @@ def _check_received_calc(D_recv: dict) -> None: calc_status, str ), f"Aborting: Unknown calculation status received. Received status: {calc_status}" - def _receive_from_workers(self, persis_info: dict) -> dict: - """Receives calculation output from workers. Loops over all - active workers and probes to see if worker is ready to - communticate. If any output is received, all other workers are - looped back over. 
- """ - time.sleep(0.0001) # Critical for multiprocessing performance - new_stuff = True - while new_stuff: - new_stuff = False - for w in self.W["worker_id"]: - if self.wcomms[w - 1].mail_flag(): - new_stuff = True - self._handle_msg_from_worker(persis_info, w) - - self._init_every_k_save() - return persis_info - - def _update_state_on_worker_msg(self, persis_info: dict, D_recv: dict, w: int) -> None: - """Updates history and worker info on worker message""" - calc_type = D_recv["calc_type"] - calc_status = D_recv["calc_status"] - Manager._check_received_calc(D_recv) - - keep_state = D_recv["libE_info"].get("keep_state", False) - if w not in self.persis_pending and not self.W[w - 1]["active_recv"] and not keep_state: - self.W[w - 1]["active"] = 0 - - if calc_status in [FINISHED_PERSISTENT_SIM_TAG, FINISHED_PERSISTENT_GEN_TAG]: - final_data = D_recv.get("calc_out", None) - if isinstance(final_data, np.ndarray): - if calc_status is FINISHED_PERSISTENT_GEN_TAG and self.libE_specs.get("use_persis_return_gen", False): - self.hist.update_history_x_in(w, final_data, self.W[w - 1]["gen_started_time"]) - elif calc_status is FINISHED_PERSISTENT_SIM_TAG and self.libE_specs.get("use_persis_return_sim", False): - self.hist.update_history_f(D_recv, self.kill_canceled_sims) - else: - logger.info(_PERSIS_RETURN_WARNING) - self.W[w - 1]["persis_state"] = 0 - if self.W[w - 1]["active_recv"]: - self.W[w - 1]["active"] = 0 - self.W[w - 1]["active_recv"] = 0 - if w in self.persis_pending: - self.persis_pending.remove(w) - self.W[w - 1]["active"] = 0 - self._freeup_resources(w) - else: - if calc_type == EVAL_SIM_TAG: - self.hist.update_history_f(D_recv, self.kill_canceled_sims) - if calc_type == EVAL_GEN_TAG: - self.hist.update_history_x_in(w, D_recv["calc_out"], self.W[w - 1]["gen_started_time"]) - assert ( - len(D_recv["calc_out"]) or np.any(self.W["active"]) or self.W[w - 1]["persis_state"] - ), "Gen must return work when is is the only thing active and not persistent." 
- if "libE_info" in D_recv and "persistent" in D_recv["libE_info"]: - # Now a waiting, persistent worker - self.W[w - 1]["persis_state"] = calc_type - else: - self._freeup_resources(w) - - if D_recv.get("persis_info"): - persis_info[w].update(D_recv["persis_info"]) - - def _handle_msg_from_worker(self, persis_info: dict, w: int) -> None: - """Handles a message from worker w""" - try: - msg = self.wcomms[w - 1].recv() - tag, D_recv = msg - except CommFinishedException: - logger.debug(f"Finalizing message from Worker {w}") - return - if isinstance(D_recv, WorkerErrMsg): - self.W[w - 1]["active"] = 0 - logger.debug(f"Manager received exception from worker {w}") - if not self.WorkerExc: - self.WorkerExc = True - self._kill_workers() - raise WorkerException(f"Received error message from worker {w}", D_recv.msg, D_recv.exc) - elif isinstance(D_recv, logging.LogRecord): - logger.debug(f"Manager received a log message from worker {w}") - logging.getLogger(D_recv.name).handle(D_recv) - else: - logger.debug(f"Manager received data message from worker {w}") - self._update_state_on_worker_msg(persis_info, D_recv, w) - - def _kill_cancelled_sims(self) -> None: - """Send kill signals to any sims marked as cancel_requested""" - - if self.kill_canceled_sims: - inds_to_check = np.arange(self.hist.last_ended + 1, self.hist.last_started + 1) - - kill_sim = ( - self.hist.H["sim_started"][inds_to_check] - & self.hist.H["cancel_requested"][inds_to_check] - & ~self.hist.H["sim_ended"][inds_to_check] - & ~self.hist.H["kill_sent"][inds_to_check] - ) - kill_sim_rows = inds_to_check[kill_sim] - - # Note that a return is still expected when running sims are killed - if np.any(kill_sim): - logger.debug(f"Manager sending kill signals to H indices {kill_sim_rows}") - kill_ids = self.hist.H["sim_id"][kill_sim_rows] - kill_on_workers = self.hist.H["sim_worker"][kill_sim_rows] - for w in kill_on_workers: - self.wcomms[w - 1].send(STOP_TAG, MAN_SIGNAL_KILL) - self.hist.H["kill_sent"][kill_ids] = True - # --- Handle termination - def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int): - """ - Tries to receive from any active workers. - - If time expires before all active workers have been received from, a - nonblocking receive is posted (though the manager will not receive this - data) and a kill signal is sent. - """ - - # Send a handshake signal to each persistent worker. 
- if any(self.W["persis_state"]): - for w in self.W["worker_id"][self.W["persis_state"] > 0]: - logger.debug(f"Manager sending PERSIS_STOP to worker {w}") - if self.libE_specs.get("final_gen_send", False): - rows_to_send = np.where(self.hist.H["sim_ended"] & ~self.hist.H["gen_informed"])[0] - work = { - "H_fields": self.gen_specs["persis_in"], - "persis_info": persis_info[w], - "tag": PERSIS_STOP, - "libE_info": {"persistent": True, "H_rows": rows_to_send}, - } - self._check_work_order(work, w, force=True) - self._send_work_order(work, w) - self.hist.update_history_to_gen(rows_to_send) - else: - self.wcomms[w - 1].send(PERSIS_STOP, MAN_SIGNAL_KILL) - if not self.W[w - 1]["active"]: - # Re-activate if necessary - self.W[w - 1]["active"] = self.W[w - 1]["persis_state"] - self.persis_pending.append(w) - - exit_flag = 0 - while (any(self.W["active"]) or any(self.W["persis_state"])) and exit_flag == 0: - persis_info = self._receive_from_workers(persis_info) - if self.term_test(logged=False) == 2: - # Elapsed Wallclock has expired - if not any(self.W["persis_state"]): - if any(self.W["active"]): - logger.manager_warning(_WALLCLOCK_MSG_ACTIVE) - else: - logger.manager_warning(_WALLCLOCK_MSG_ALL_RETURNED) - exit_flag = 2 - if self.WorkerExc: - exit_flag = 1 - - self._init_every_k_save(complete=self.libE_specs["save_H_on_completion"]) - self._kill_workers() - return persis_info, exit_flag, self.elapsed() - def _sim_max_given(self) -> bool: if "sim_max" in self.exit_criteria: return self.hist.sim_started_count >= self.exit_criteria["sim_max"] + self.hist.sim_started_offset diff --git a/libensemble/utils/pipelines.py b/libensemble/utils/pipelines.py index a50d85a82..0c81cbd03 100644 --- a/libensemble/utils/pipelines.py +++ b/libensemble/utils/pipelines.py @@ -20,6 +20,7 @@ from libensemble.resources.resources import Resources from libensemble.tools.tools import _PERSIS_RETURN_WARNING from libensemble.utils.misc import extract_H_ranges +from libensemble.worker import Worker as LocalWorker from libensemble.worker import WorkerErrMsg logger = logging.getLogger(__name__) @@ -53,12 +54,23 @@ def __init__(self, libE_specs, sim_specs, gen_specs): super().__init__(libE_specs, sim_specs, gen_specs) +class WorkerFromManager(_WorkPipeline): + def __init__(self, libE_specs, sim_specs, gen_specs): + super().__init__(libE_specs, sim_specs, gen_specs) + + class Worker: """Wrapper class for Worker array and worker comms""" + def __new__(cls, W: npt.NDArray, wid: int, wcomms: list = []): + if wid == 0: + return super(Worker, ManagerWorker).__new__(ManagerWorker) + else: + return super().__new__(Worker) + def __init__(self, W: npt.NDArray, wid: int, wcomms: list = []): self.__dict__["_W"] = W - self.__dict__["_wididx"] = wid - 1 + self.__dict__["_wididx"] = wid self.__dict__["_wcomms"] = wcomms def __setattr__(self, field, value): @@ -82,9 +94,6 @@ def update_persistent_state(self): self.active = 0 self.active_recv = 0 - def set_work(self, Work): - self.__dict__["_Work"] = Work - def send(self, tag, data): self._wcomms[self._wididx].send(tag, data) @@ -95,6 +104,20 @@ def recv(self): return self._wcomms[self._wididx].recv() +class ManagerWorker(Worker): + """Manager invisibly sends work to itself, then performs work""" + + def __init__(self, W: npt.NDArray, wid: int, wcomms: list = []): + super().__init__(W, wid, wcomms) + + def run_gen_work(self, pipeline): + comm = self.__dict__["_wcomms"][0] + local_worker = LocalWorker( + comm, pipeline.libE_specs["_dtypes"], 0, pipeline.sim_specs, pipeline.gen_specs, 
pipeline.libE_specs + ) + local_worker.run(iterations=1) + + class _ManagerPipeline(_WorkPipeline): def __init__(self, Manager): super().__init__(Manager.libE_specs, Manager.sim_specs, Manager.gen_specs) @@ -202,7 +225,7 @@ def _receive_from_workers(self, persis_info: dict) -> dict: while new_stuff: new_stuff = False for w in self.W["worker_id"]: - if self.wcomms[w - 1].mail_flag(): + if self.wcomms[w].mail_flag(): new_stuff = True self._handle_msg_from_worker(persis_info, w) @@ -331,6 +354,9 @@ def _send_work_order(self, Work: dict, w: int) -> None: H_to_be_sent[i] = repack_fields(self.hist.H[Work["H_fields"]][row]) worker.send(0, H_to_be_sent) + if Work["tag"] == EVAL_GEN_TAG and w == 0: + worker.run_gen_work(self) + def _check_work_order(self, Work: dict, w: int, force: bool = False) -> None: """Checks validity of an allocation function order""" @@ -358,8 +384,3 @@ def _check_work_order(self, Work: dict, w: int, force: bool = False) -> None: diff_fields = list(work_fields.difference(hist_fields)) assert not diff_fields, f"Allocation function requested invalid fields {diff_fields} be sent to worker={w}." - - -class ManagerInplace(_ManagerPipeline): - def __init__(self, libE_specs, sim_specs, gen_specs): - super().__init__(libE_specs, sim_specs, gen_specs) diff --git a/libensemble/worker.py b/libensemble/worker.py index ad8bd4530..c13567750 100644 --- a/libensemble/worker.py +++ b/libensemble/worker.py @@ -374,11 +374,13 @@ def _handle(self, Work: dict) -> dict: "calc_type": calc_type, } - def run(self) -> None: + def run(self, iterations=0) -> None: """Runs the main worker loop.""" try: logger.info(f"Worker {self.workerID} initiated on node {socket.gethostname()}") + current_iterations = 0 + for worker_iter in count(start=1): logger.debug(f"Iteration {worker_iter}") @@ -407,6 +409,9 @@ def run(self) -> None: if response is None: break self.comm.send(0, response) + current_iterations += 1 + if iterations > 0 and (current_iterations >= iterations): + break except Exception as e: self.comm.send(0, WorkerErrMsg(" ".join(format_exc_msg(type(e), e)).strip(), format_exc())) From 33ea282e4c07d017d29bf0dc5a573f2179bf48b2 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 17 Jan 2024 11:11:41 -0600 Subject: [PATCH 09/76] restore version of manager from develop. specify iterations for worker. 
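The iterations argument threaded through worker_main and Worker.run acts as a bounded
mode: zero keeps the historical loop-until-stop behavior, while a positive count hands
control back after that many work units. A toy, self-contained version of the loop
shape (the mailbox list and the doubling "calculation" are placeholders, not
libEnsemble calls):

    from itertools import count

    def run_worker(mailbox, iterations=0):
        """Process work until STOP, or until `iterations` units are done."""
        results, done = [], 0
        for _ in count(start=1):
            tag, payload = mailbox.pop(0)
            if tag == "STOP":
                break
            results.append(payload * 2)  # stand-in for the user function
            done += 1
            if iterations > 0 and done >= iterations:
                break  # bounded mode: return control to the caller early
        return results

    print(run_worker([("WORK", 1), ("WORK", 2), ("STOP", None)], iterations=1))
    # [2]: returns after one unit even though more work remains queued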
--- libensemble/manager.py | 309 +++++++++++++++++++++++++++++++++++++---- libensemble/worker.py | 5 +- 2 files changed, 285 insertions(+), 29 deletions(-) diff --git a/libensemble/manager.py b/libensemble/manager.py index a822de005..cce7682f8 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -10,22 +10,34 @@ import platform import socket import sys +import time import traceback -from queue import SimpleQueue from typing import Any, Union import numpy as np import numpy.typing as npt from numpy.lib.recfunctions import repack_fields -from libensemble.comms.comms import QComm -from libensemble.message_numbers import EVAL_GEN_TAG, EVAL_SIM_TAG, PERSIS_STOP, calc_status_strings +from libensemble.comms.comms import CommFinishedException +from libensemble.message_numbers import ( + EVAL_GEN_TAG, + EVAL_SIM_TAG, + FINISHED_PERSISTENT_GEN_TAG, + FINISHED_PERSISTENT_SIM_TAG, + MAN_SIGNAL_FINISH, + MAN_SIGNAL_KILL, + PERSIS_STOP, + STOP_TAG, + calc_status_strings, + calc_type_strings, +) from libensemble.resources.resources import Resources from libensemble.tools.fields_keys import protected_libE_fields -from libensemble.tools.tools import _USER_CALC_DIR_WARNING +from libensemble.tools.tools import _PERSIS_RETURN_WARNING, _USER_CALC_DIR_WARNING +from libensemble.utils.misc import extract_H_ranges from libensemble.utils.output_directory import EnsembleDirectory -from libensemble.utils.pipelines import ManagerFromWorker, ManagerToWorker from libensemble.utils.timer import Timer +from libensemble.worker import WorkerErrMsg logger = logging.getLogger(__name__) # For debug messages - uncomment @@ -96,6 +108,9 @@ def manager_main( pr = cProfile.Profile() pr.enable() + if "in" not in gen_specs: + gen_specs["in"] = [] + # Send dtypes to workers dtypes = { EVAL_SIM_TAG: repack_fields(hist.H[sim_specs["in"]]).dtype, @@ -109,8 +124,6 @@ def manager_main( for wcomm in wcomms: wcomm.send(0, libE_specs.get("workflow_dir_path")) - libE_specs["_dtypes"] = dtypes - # Set up and run manager mgr = Manager(hist, libE_specs, alloc_specs, sim_specs, gen_specs, exit_criteria, wcomms) result = mgr.run(persis_info) @@ -187,8 +200,8 @@ def __init__( self.gen_num_procs = libE_specs.get("gen_num_procs", 0) self.gen_num_gpus = libE_specs.get("gen_num_gpus", 0) - self.W = np.zeros(len(self.wcomms) + 1, dtype=Manager.worker_dtype) - self.W["worker_id"] = np.arange(len(self.wcomms) + 1) + self.W = np.zeros(len(self.wcomms), dtype=Manager.worker_dtype) + self.W["worker_id"] = np.arange(len(self.wcomms)) + 1 self.term_tests = [ (2, "wallclock_max", self.term_test_wallclock), (1, "sim_max", self.term_test_sim_max), @@ -196,11 +209,6 @@ def __init__( (1, "stop_val", self.term_test_stop_val), ] - self.self_inbox = SimpleQueue() - self.self_outbox = SimpleQueue() - - self.wcomms = [QComm(self.self_inbox, self.self_outbox, len(self.W))] + self.wcomms - temp_EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) self.resources = Resources.resources self.scheduler_opts = self.libE_specs.get("scheduler_opts", {}) @@ -253,6 +261,13 @@ def term_test(self, logged: bool = True) -> Union[bool, int]: return retval return 0 + # --- Low-level communication routines + + def _kill_workers(self) -> None: + """Kills the workers""" + for w in self.W["worker_id"]: + self.wcomms[w - 1].send(STOP_TAG, MAN_SIGNAL_FINISH) + # --- Checkpointing logic def _get_date_start_str(self) -> str: @@ -301,6 +316,95 @@ def _init_every_k_save(self, complete=False) -> None: if self.libE_specs.get("save_every_k_gens"): 
self._save_every_k_gens(complete) + # --- Handle outgoing messages to workers (work orders from alloc) + + def _check_work_order(self, Work: dict, w: int, force: bool = False) -> None: + """Checks validity of an allocation function order""" + assert w != 0, "Can't send to worker 0; this is the manager." + if self.W[w - 1]["active_recv"]: + assert "active_recv" in Work["libE_info"], ( + "Messages to a worker in active_recv mode should have active_recv" + f"set to True in libE_info. Work['libE_info'] is {Work['libE_info']}" + ) + else: + if not force: + assert self.W[w - 1]["active"] == 0, ( + "Allocation function requested work be sent to worker %d, an already active worker." % w + ) + work_rows = Work["libE_info"]["H_rows"] + if len(work_rows): + work_fields = set(Work["H_fields"]) + + assert len(work_fields), ( + f"Allocation function requested rows={work_rows} be sent to worker={w}, " + "but requested no fields to be sent." + ) + hist_fields = self.hist.H.dtype.names + diff_fields = list(work_fields.difference(hist_fields)) + + assert not diff_fields, f"Allocation function requested invalid fields {diff_fields} be sent to worker={w}." + + def _set_resources(self, Work: dict, w: int) -> None: + """Check rsets given in Work match rsets assigned in resources. + + If rsets are not assigned, then assign using default mapping + """ + resource_manager = self.resources.resource_manager + rset_req = Work["libE_info"].get("rset_team") + + if rset_req is None: + rset_team = [] + default_rset = resource_manager.index_list[w - 1] + if default_rset is not None: + rset_team.append(default_rset) + Work["libE_info"]["rset_team"] = rset_team + + resource_manager.assign_rsets(Work["libE_info"]["rset_team"], w) + + def _freeup_resources(self, w: int) -> None: + """Free up resources assigned to the worker""" + if self.resources: + self.resources.resource_manager.free_rsets(w) + + def _send_work_order(self, Work: dict, w: int) -> None: + """Sends an allocation function order to a worker""" + logger.debug(f"Manager sending work unit to worker {w}") + + if self.resources: + self._set_resources(Work, w) + + self.wcomms[w - 1].send(Work["tag"], Work) + + if Work["tag"] == EVAL_GEN_TAG: + self.W[w - 1]["gen_started_time"] = time.time() + + work_rows = Work["libE_info"]["H_rows"] + work_name = calc_type_strings[Work["tag"]] + logger.debug(f"Manager sending {work_name} work to worker {w}. 
Rows {extract_H_ranges(Work) or None}")
+        if len(work_rows):
+            new_dtype = [(name, self.hist.H.dtype.fields[name][0]) for name in Work["H_fields"]]
+            H_to_be_sent = np.empty(len(work_rows), dtype=new_dtype)
+            for i, row in enumerate(work_rows):
+                H_to_be_sent[i] = repack_fields(self.hist.H[Work["H_fields"]][row])
+            self.wcomms[w - 1].send(0, H_to_be_sent)
+
+    def _update_state_on_alloc(self, Work: dict, w: int):
+        """Updates a worker's active/idle status following an allocation order"""
+        self.W[w - 1]["active"] = Work["tag"]
+        if "libE_info" in Work:
+            if "persistent" in Work["libE_info"]:
+                self.W[w - 1]["persis_state"] = Work["tag"]
+            if Work["libE_info"].get("active_recv", False):
+                self.W[w - 1]["active_recv"] = Work["tag"]
+            else:
+                assert "active_recv" not in Work["libE_info"], "active_recv worker must also be persistent"
+
+        work_rows = Work["libE_info"]["H_rows"]
+        if Work["tag"] == EVAL_SIM_TAG:
+            self.hist.update_history_x_out(work_rows, w, self.kill_canceled_sims)
+        elif Work["tag"] == EVAL_GEN_TAG:
+            self.hist.update_history_to_gen(work_rows)
+
     # --- Handle incoming messages from workers
 
     @staticmethod
@@ -317,8 +421,164 @@ def _check_received_calc(D_recv: dict) -> None:
             calc_status, str
         ), f"Aborting: Unknown calculation status received. Received status: {calc_status}"
 
+    def _receive_from_workers(self, persis_info: dict) -> dict:
+        """Receives calculation output from workers. Loops over all
+        active workers and probes to see if worker is ready to
+        communicate. If any output is received, all other workers are
+        looped back over.
+        """
+        time.sleep(0.0001)  # Critical for multiprocessing performance
+        new_stuff = True
+        while new_stuff:
+            new_stuff = False
+            for w in self.W["worker_id"]:
+                if self.wcomms[w - 1].mail_flag():
+                    new_stuff = True
+                    self._handle_msg_from_worker(persis_info, w)
+
+        self._init_every_k_save()
+        return persis_info
+
+    def _update_state_on_worker_msg(self, persis_info: dict, D_recv: dict, w: int) -> None:
+        """Updates history and worker info on worker message"""
+        calc_type = D_recv["calc_type"]
+        calc_status = D_recv["calc_status"]
+        Manager._check_received_calc(D_recv)
+
+        keep_state = D_recv["libE_info"].get("keep_state", False)
+        if w not in self.persis_pending and not self.W[w - 1]["active_recv"] and not keep_state:
+            self.W[w - 1]["active"] = 0
+
+        if calc_status in [FINISHED_PERSISTENT_SIM_TAG, FINISHED_PERSISTENT_GEN_TAG]:
+            final_data = D_recv.get("calc_out", None)
+            if isinstance(final_data, np.ndarray):
+                if calc_status is FINISHED_PERSISTENT_GEN_TAG and self.libE_specs.get("use_persis_return_gen", False):
+                    self.hist.update_history_x_in(w, final_data, self.W[w - 1]["gen_started_time"])
+                elif calc_status is FINISHED_PERSISTENT_SIM_TAG and self.libE_specs.get("use_persis_return_sim", False):
+                    self.hist.update_history_f(D_recv, self.kill_canceled_sims)
+                else:
+                    logger.info(_PERSIS_RETURN_WARNING)
+            self.W[w - 1]["persis_state"] = 0
+            if self.W[w - 1]["active_recv"]:
+                self.W[w - 1]["active"] = 0
+                self.W[w - 1]["active_recv"] = 0
+            if w in self.persis_pending:
+                self.persis_pending.remove(w)
+                self.W[w - 1]["active"] = 0
+            self._freeup_resources(w)
+        else:
+            if calc_type == EVAL_SIM_TAG:
+                self.hist.update_history_f(D_recv, self.kill_canceled_sims)
+            if calc_type == EVAL_GEN_TAG:
+                self.hist.update_history_x_in(w, D_recv["calc_out"], self.W[w - 1]["gen_started_time"])
+                assert (
+                    len(D_recv["calc_out"]) or np.any(self.W["active"]) or self.W[w - 1]["persis_state"]
+                ), "Gen must return work when it is the only thing active and not
persistent." + if "libE_info" in D_recv and "persistent" in D_recv["libE_info"]: + # Now a waiting, persistent worker + self.W[w - 1]["persis_state"] = calc_type + else: + self._freeup_resources(w) + + if D_recv.get("persis_info"): + persis_info[w].update(D_recv["persis_info"]) + + def _handle_msg_from_worker(self, persis_info: dict, w: int) -> None: + """Handles a message from worker w""" + try: + msg = self.wcomms[w - 1].recv() + tag, D_recv = msg + except CommFinishedException: + logger.debug(f"Finalizing message from Worker {w}") + return + if isinstance(D_recv, WorkerErrMsg): + self.W[w - 1]["active"] = 0 + logger.debug(f"Manager received exception from worker {w}") + if not self.WorkerExc: + self.WorkerExc = True + self._kill_workers() + raise WorkerException(f"Received error message from worker {w}", D_recv.msg, D_recv.exc) + elif isinstance(D_recv, logging.LogRecord): + logger.debug(f"Manager received a log message from worker {w}") + logging.getLogger(D_recv.name).handle(D_recv) + else: + logger.debug(f"Manager received data message from worker {w}") + self._update_state_on_worker_msg(persis_info, D_recv, w) + + def _kill_cancelled_sims(self) -> None: + """Send kill signals to any sims marked as cancel_requested""" + + if self.kill_canceled_sims: + inds_to_check = np.arange(self.hist.last_ended + 1, self.hist.last_started + 1) + + kill_sim = ( + self.hist.H["sim_started"][inds_to_check] + & self.hist.H["cancel_requested"][inds_to_check] + & ~self.hist.H["sim_ended"][inds_to_check] + & ~self.hist.H["kill_sent"][inds_to_check] + ) + kill_sim_rows = inds_to_check[kill_sim] + + # Note that a return is still expected when running sims are killed + if np.any(kill_sim): + logger.debug(f"Manager sending kill signals to H indices {kill_sim_rows}") + kill_ids = self.hist.H["sim_id"][kill_sim_rows] + kill_on_workers = self.hist.H["sim_worker"][kill_sim_rows] + for w in kill_on_workers: + self.wcomms[w - 1].send(STOP_TAG, MAN_SIGNAL_KILL) + self.hist.H["kill_sent"][kill_ids] = True + # --- Handle termination + def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int): + """ + Tries to receive from any active workers. + + If time expires before all active workers have been received from, a + nonblocking receive is posted (though the manager will not receive this + data) and a kill signal is sent. + """ + + # Send a handshake signal to each persistent worker. 
+ if any(self.W["persis_state"]): + for w in self.W["worker_id"][self.W["persis_state"] > 0]: + logger.debug(f"Manager sending PERSIS_STOP to worker {w}") + if self.libE_specs.get("final_gen_send", False): + rows_to_send = np.where(self.hist.H["sim_ended"] & ~self.hist.H["gen_informed"])[0] + work = { + "H_fields": self.gen_specs["persis_in"], + "persis_info": persis_info[w], + "tag": PERSIS_STOP, + "libE_info": {"persistent": True, "H_rows": rows_to_send}, + } + self._check_work_order(work, w, force=True) + self._send_work_order(work, w) + self.hist.update_history_to_gen(rows_to_send) + else: + self.wcomms[w - 1].send(PERSIS_STOP, MAN_SIGNAL_KILL) + if not self.W[w - 1]["active"]: + # Re-activate if necessary + self.W[w - 1]["active"] = self.W[w - 1]["persis_state"] + self.persis_pending.append(w) + + exit_flag = 0 + while (any(self.W["active"]) or any(self.W["persis_state"])) and exit_flag == 0: + persis_info = self._receive_from_workers(persis_info) + if self.term_test(logged=False) == 2: + # Elapsed Wallclock has expired + if not any(self.W["persis_state"]): + if any(self.W["active"]): + logger.manager_warning(_WALLCLOCK_MSG_ACTIVE) + else: + logger.manager_warning(_WALLCLOCK_MSG_ALL_RETURNED) + exit_flag = 2 + if self.WorkerExc: + exit_flag = 1 + + self._init_every_k_save(complete=self.libE_specs["save_H_on_completion"]) + self._kill_workers() + return persis_info, exit_flag, self.elapsed() + def _sim_max_given(self) -> bool: if "sim_max" in self.exit_criteria: return self.hist.sim_started_count >= self.exit_criteria["sim_max"] + self.hist.sim_started_offset @@ -382,15 +642,11 @@ def run(self, persis_info: dict) -> (dict, int, int): logger.info(f"Manager initiated on node {socket.gethostname()}") logger.info(f"Manager exit_criteria: {self.exit_criteria}") - self.ToWorker = ManagerToWorker(self) - self.FromWorker = ManagerFromWorker(self) - # Continue receiving and giving until termination test is satisfied try: while not self.term_test(): - self.ToWorker._kill_cancelled_sims() - persis_info = self.FromWorker._receive_from_workers(persis_info) - self._init_every_k_save() + self._kill_cancelled_sims() + persis_info = self._receive_from_workers(persis_info) Work, persis_info, flag = self._alloc_work(self.hist.trim_H(), persis_info) if flag: break @@ -398,22 +654,21 @@ def run(self, persis_info: dict) -> (dict, int, int): for w in Work: if self._sim_max_given(): break - self.ToWorker._check_work_order(Work[w], w) - self.ToWorker._send_work_order(Work[w], w) - self.ToWorker._update_state_on_alloc(Work[w], w) + self._check_work_order(Work[w], w) + self._send_work_order(Work[w], w) + self._update_state_on_alloc(Work[w], w) assert self.term_test() or any( self.W["active"] != 0 ), "alloc_f did not return any work, although all workers are idle." 
- except WorkerException as e: # catches all error messages from worker + except WorkerException as e: report_worker_exc(e) raise LoggedException(e.args[0], e.args[1]) from None - except Exception as e: # should only catch bugs within manager, or AssertionErrors + except Exception as e: logger.error(traceback.format_exc()) raise LoggedException(e.args) from None finally: # Return persis_info, exit_flag, elapsed time - result = self.FromWorker._final_receive_and_kill(persis_info) - self._init_every_k_save(complete=self.libE_specs["save_H_on_completion"]) + result = self._final_receive_and_kill(persis_info) sys.stdout.flush() sys.stderr.flush() return result diff --git a/libensemble/worker.py b/libensemble/worker.py index c13567750..96d2de8bf 100644 --- a/libensemble/worker.py +++ b/libensemble/worker.py @@ -51,6 +51,7 @@ def worker_main( log_comm: bool = True, resources: Resources = None, executor: Executor = None, + iterations: int = 0, ) -> None: # noqa: F821 """Evaluates calculations given to it by the manager. @@ -96,7 +97,7 @@ def worker_main( if libE_specs.get("use_workflow_dir"): _, libE_specs["workflow_dir_path"] = comm.recv() - workerID = workerID or comm.rank + workerID = workerID or getattr(comm, "rank", 0) # Initialize logging on comms if log_comm: @@ -108,7 +109,7 @@ def worker_main( # Set up and run worker worker = Worker(comm, dtypes, workerID, sim_specs, gen_specs, libE_specs) with LS.loc("workflow"): - worker.run() + worker.run(iterations) if libE_specs.get("profile"): pr.disable() From 843df3972da97c5b9071ae75c1f9384771948e07 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 17 Jan 2024 11:12:14 -0600 Subject: [PATCH 10/76] remove pipelines.py. will start simpler --- libensemble/utils/pipelines.py | 386 --------------------------------- 1 file changed, 386 deletions(-) delete mode 100644 libensemble/utils/pipelines.py diff --git a/libensemble/utils/pipelines.py b/libensemble/utils/pipelines.py deleted file mode 100644 index 0c81cbd03..000000000 --- a/libensemble/utils/pipelines.py +++ /dev/null @@ -1,386 +0,0 @@ -import logging -import time - -import numpy as np -import numpy.typing as npt -from numpy.lib.recfunctions import repack_fields - -from libensemble.comms.comms import CommFinishedException -from libensemble.message_numbers import ( - EVAL_GEN_TAG, - EVAL_SIM_TAG, - FINISHED_PERSISTENT_GEN_TAG, - FINISHED_PERSISTENT_SIM_TAG, - MAN_SIGNAL_FINISH, - MAN_SIGNAL_KILL, - PERSIS_STOP, - STOP_TAG, - calc_type_strings, -) -from libensemble.resources.resources import Resources -from libensemble.tools.tools import _PERSIS_RETURN_WARNING -from libensemble.utils.misc import extract_H_ranges -from libensemble.worker import Worker as LocalWorker -from libensemble.worker import WorkerErrMsg - -logger = logging.getLogger(__name__) - -_WALLCLOCK_MSG_ALL_RETURNED = """ -Termination due to wallclock_max has occurred. -All completed work has been returned. -Posting kill messages for all workers. -""" - -_WALLCLOCK_MSG_ACTIVE = """ -Termination due to wallclock_max has occurred. -Some issued work has not been returned. -Posting kill messages for all workers. 
-""" - - -class WorkerException(Exception): - """Exception raised on abort signal from worker""" - - -class _WorkPipeline: - def __init__(self, libE_specs, sim_specs, gen_specs): - self.libE_specs = libE_specs - self.sim_specs = sim_specs - self.gen_specs = gen_specs - - -class WorkerToManager(_WorkPipeline): - def __init__(self, libE_specs, sim_specs, gen_specs): - super().__init__(libE_specs, sim_specs, gen_specs) - - -class WorkerFromManager(_WorkPipeline): - def __init__(self, libE_specs, sim_specs, gen_specs): - super().__init__(libE_specs, sim_specs, gen_specs) - - -class Worker: - """Wrapper class for Worker array and worker comms""" - - def __new__(cls, W: npt.NDArray, wid: int, wcomms: list = []): - if wid == 0: - return super(Worker, ManagerWorker).__new__(ManagerWorker) - else: - return super().__new__(Worker) - - def __init__(self, W: npt.NDArray, wid: int, wcomms: list = []): - self.__dict__["_W"] = W - self.__dict__["_wididx"] = wid - self.__dict__["_wcomms"] = wcomms - - def __setattr__(self, field, value): - self._W[self._wididx][field] = value - - def __getattr__(self, field): - return self._W[self._wididx][field] - - def update_state_on_alloc(self, Work: dict): - self.active = Work["tag"] - if "persistent" in Work["libE_info"]: - self.persis_state = Work["tag"] - if Work["libE_info"].get("active_recv", False): - self.active_recv = Work["tag"] - else: - assert "active_recv" not in Work["libE_info"], "active_recv worker must also be persistent" - - def update_persistent_state(self): - self.persis_state = 0 - if self.active_recv: - self.active = 0 - self.active_recv = 0 - - def send(self, tag, data): - self._wcomms[self._wididx].send(tag, data) - - def mail_flag(self): - return self._wcomms[self._wididx].mail_flag() - - def recv(self): - return self._wcomms[self._wididx].recv() - - -class ManagerWorker(Worker): - """Manager invisibly sends work to itself, then performs work""" - - def __init__(self, W: npt.NDArray, wid: int, wcomms: list = []): - super().__init__(W, wid, wcomms) - - def run_gen_work(self, pipeline): - comm = self.__dict__["_wcomms"][0] - local_worker = LocalWorker( - comm, pipeline.libE_specs["_dtypes"], 0, pipeline.sim_specs, pipeline.gen_specs, pipeline.libE_specs - ) - local_worker.run(iterations=1) - - -class _ManagerPipeline(_WorkPipeline): - def __init__(self, Manager): - super().__init__(Manager.libE_specs, Manager.sim_specs, Manager.gen_specs) - self.W = Manager.W - self.hist = Manager.hist - self.wcomms = Manager.wcomms - self.kill_canceled_sims = Manager.kill_canceled_sims - self.persis_pending = Manager.persis_pending - - def _update_state_on_alloc(self, Work: dict, w: int): - """Updates a workers' active/idle status following an allocation order""" - worker = Worker(self.W, w, self.wcomms) - worker.update_state_on_alloc(Work) - - work_rows = Work["libE_info"]["H_rows"] - if Work["tag"] == EVAL_SIM_TAG: - self.hist.update_history_x_out(work_rows, w, self.kill_canceled_sims) - elif Work["tag"] == EVAL_GEN_TAG: - self.hist.update_history_to_gen(work_rows) - - def _kill_workers(self) -> None: - """Kills the workers""" - for w in self.W["worker_id"]: - self.wcomms[w - 1].send(STOP_TAG, MAN_SIGNAL_FINISH) - - -class ManagerFromWorker(_ManagerPipeline): - def __init__(self, Manager): - super().__init__(Manager) - self.WorkerExc = False - self.resources = Manager.resources - self.term_test = Manager.term_test - self.elapsed = Manager.elapsed - - def _handle_msg_from_worker(self, persis_info: dict, w: int) -> None: - """Handles a message from worker w""" 
- worker = Worker(self.W, w, self.wcomms) - try: - msg = worker.recv() - _, D_recv = msg - except CommFinishedException: - logger.debug(f"Finalizing message from Worker {w}") - return - if isinstance(D_recv, WorkerErrMsg): - worker.active = 0 - logger.debug(f"Manager received exception from worker {w}") - if not self.WorkerExc: - self.WorkerExc = True - self._kill_workers() - raise WorkerException(f"Received error message from worker {w}", D_recv.msg, D_recv.exc) - elif isinstance(D_recv, logging.LogRecord): - logger.debug(f"Manager received a log message from worker {w}") - logging.getLogger(D_recv.name).handle(D_recv) - else: - logger.debug(f"Manager received data message from worker {w}") - self._update_state_on_worker_msg(persis_info, D_recv, w) - - def _update_state_on_worker_msg(self, persis_info: dict, D_recv: dict, w: int) -> None: - """Updates history and worker info on worker message""" - calc_type = D_recv["calc_type"] - calc_status = D_recv["calc_status"] - - worker = Worker(self.W, w, self.wcomms) - - keep_state = D_recv["libE_info"].get("keep_state", False) - if w not in self.persis_pending and not worker.active_recv and not keep_state: - worker.active = 0 - - if calc_status in [FINISHED_PERSISTENT_SIM_TAG, FINISHED_PERSISTENT_GEN_TAG]: - final_data = D_recv.get("calc_out", None) - if isinstance(final_data, np.ndarray): - if calc_status is FINISHED_PERSISTENT_GEN_TAG and self.libE_specs.get("use_persis_return_gen", False): - self.hist.update_history_x_in(w, final_data, worker.gen_started_time) - elif calc_status is FINISHED_PERSISTENT_SIM_TAG and self.libE_specs.get("use_persis_return_sim", False): - self.hist.update_history_f(D_recv, self.kill_canceled_sims) - else: - logger.info(_PERSIS_RETURN_WARNING) - worker.update_persistent_state() - if w in self.persis_pending: - self.persis_pending.remove(w) - worker.active = 0 - self._freeup_resources(w) - else: - if calc_type == EVAL_SIM_TAG: - self.hist.update_history_f(D_recv, self.kill_canceled_sims) - if calc_type == EVAL_GEN_TAG: - self.hist.update_history_x_in(w, D_recv["calc_out"], worker.gen_started_time) - assert ( - len(D_recv["calc_out"]) or np.any(self.W["active"]) or worker.persis_state - ), "Gen must return work when is is the only thing active and not persistent." - if "libE_info" in D_recv and "persistent" in D_recv["libE_info"]: - # Now a waiting, persistent worker - worker.persis_state = calc_type - else: - self._freeup_resources(w) - - def _receive_from_workers(self, persis_info: dict) -> dict: - """Receives calculation output from workers. Loops over all - active workers and probes to see if worker is ready to - communicate. If any output is received, all other workers are - looped back over. - """ - time.sleep(0.0001) # Critical for multiprocessing performance - new_stuff = True - while new_stuff: - new_stuff = False - for w in self.W["worker_id"]: - if self.wcomms[w].mail_flag(): - new_stuff = True - self._handle_msg_from_worker(persis_info, w) - - return persis_info - - def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int): - """ - Tries to receive from any active workers. - - If time expires before all active workers have been received from, a - nonblocking receive is posted (though the manager will not receive this - data) and a kill signal is sent. - """ - - # Send a handshake signal to each persistent worker. 
- if any(self.W["persis_state"]): - for w in self.W["worker_id"][self.W["persis_state"] > 0]: - worker = Worker(self.W, w, self.wcomms) - logger.debug(f"Manager sending PERSIS_STOP to worker {w}") - if self.libE_specs.get("final_gen_send", False): - rows_to_send = np.where(self.hist.H["sim_ended"] & ~self.hist.H["gen_informed"])[0] - work = { - "H_fields": self.gen_specs["persis_in"], - "persis_info": persis_info[w], - "tag": PERSIS_STOP, - "libE_info": {"persistent": True, "H_rows": rows_to_send}, - } - self._send_work_order(work, w) - self.hist.update_history_to_gen(rows_to_send) - else: - worker.send(PERSIS_STOP, MAN_SIGNAL_KILL) - if not worker.active: - # Re-activate if necessary - worker.active = worker.persis_state - self.persis_pending.append(w) - - exit_flag = 0 - while (any(self.W["active"]) or any(self.W["persis_state"])) and exit_flag == 0: - persis_info = self._receive_from_workers(persis_info) - if self.term_test(logged=False) == 2: - # Elapsed Wallclock has expired - if not any(self.W["persis_state"]): - if any(self.W["active"]): - logger.manager_warning(_WALLCLOCK_MSG_ACTIVE) - else: - logger.manager_warning(_WALLCLOCK_MSG_ALL_RETURNED) - exit_flag = 2 - if self.WorkerExc: - exit_flag = 1 - - self._kill_workers() - return persis_info, exit_flag, self.elapsed() - - def _freeup_resources(self, w: int) -> None: - """Free up resources assigned to the worker""" - if self.resources: - self.resources.resource_manager.free_rsets(w) - - -class ManagerToWorker(_ManagerPipeline): - def __init__(self, Manager): - super().__init__(Manager) - - def _kill_cancelled_sims(self) -> None: - """Send kill signals to any sims marked as cancel_requested""" - - if self.kill_canceled_sims: - inds_to_check = np.arange(self.hist.last_ended + 1, self.hist.last_started + 1) - - kill_sim = ( - self.hist.H["sim_started"][inds_to_check] - & self.hist.H["cancel_requested"][inds_to_check] - & ~self.hist.H["sim_ended"][inds_to_check] - & ~self.hist.H["kill_sent"][inds_to_check] - ) - kill_sim_rows = inds_to_check[kill_sim] - - # Note that a return is still expected when running sims are killed - if np.any(kill_sim): - logger.debug(f"Manager sending kill signals to H indices {kill_sim_rows}") - kill_ids = self.hist.H["sim_id"][kill_sim_rows] - kill_on_workers = self.hist.H["sim_worker"][kill_sim_rows] - for w in kill_on_workers: - self.wcomms[w - 1].send(STOP_TAG, MAN_SIGNAL_KILL) - self.hist.H["kill_sent"][kill_ids] = True - - @staticmethod - def _set_resources(Work: dict, w: int) -> None: - """Check rsets given in Work match rsets assigned in resources. 
- - If rsets are not assigned, then assign using default mapping - """ - resource_manager = Resources.resources.resource_manager - rset_req = Work["libE_info"].get("rset_team") - - if rset_req is None: - rset_team = [] - default_rset = resource_manager.index_list[w - 1] - if default_rset is not None: - rset_team.append(default_rset) - Work["libE_info"]["rset_team"] = rset_team - - resource_manager.assign_rsets(Work["libE_info"]["rset_team"], w) - - def _send_work_order(self, Work: dict, w: int) -> None: - """Sends an allocation function order to a worker""" - logger.debug(f"Manager sending work unit to worker {w}") - - worker = Worker(self.W, w, self.wcomms) - - if Resources.resources: - self._set_resources(Work, w) - - worker.send(Work["tag"], Work) - - if Work["tag"] == EVAL_GEN_TAG: - worker.gen_started_time = time.time() - - work_rows = Work["libE_info"]["H_rows"] - work_name = calc_type_strings[Work["tag"]] - logger.debug(f"Manager sending {work_name} work to worker {w}. Rows {extract_H_ranges(Work) or None}") - if len(work_rows): - new_dtype = [(name, self.hist.H.dtype.fields[name][0]) for name in Work["H_fields"]] - H_to_be_sent = np.empty(len(work_rows), dtype=new_dtype) - for i, row in enumerate(work_rows): - H_to_be_sent[i] = repack_fields(self.hist.H[Work["H_fields"]][row]) - worker.send(0, H_to_be_sent) - - if Work["tag"] == EVAL_GEN_TAG and w == 0: - worker.run_gen_work(self) - - def _check_work_order(self, Work: dict, w: int, force: bool = False) -> None: - """Checks validity of an allocation function order""" - - worker = Worker(self.W, w, self.wcomms) - - if worker.active_recv: - assert "active_recv" in Work["libE_info"], ( - "Messages to a worker in active_recv mode should have active_recv" - f"set to True in libE_info. Work['libE_info'] is {Work['libE_info']}" - ) - else: - if not force: - assert worker.active == 0, ( - "Allocation function requested work be sent to worker %d, an already active worker." % w - ) - work_rows = Work["libE_info"]["H_rows"] - if len(work_rows): - work_fields = set(Work["H_fields"]) - - assert len(work_fields), ( - f"Allocation function requested rows={work_rows} be sent to worker={w}, " - "but requested no fields to be sent." - ) - hist_fields = self.hist.H.dtype.names - diff_fields = list(work_fields.difference(hist_fields)) - - assert not diff_fields, f"Allocation function requested invalid fields {diff_fields} be sent to worker={w}." 
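The thread-based direction the next patch takes, running worker_main behind a queue
pair via QCommThread, boils down to the pattern below. A runnable sketch using only
the standard library (the tag protocol and the worker_loop body are illustrative,
not libEnsemble's):

    import threading
    from queue import SimpleQueue

    def worker_loop(inbox, outbox):
        """Echo worker: process WORK messages until STOP arrives."""
        while True:
            tag, payload = inbox.get()
            if tag == "STOP":
                break
            outbox.put(("RESULT", payload))

    inbox, outbox = SimpleQueue(), SimpleQueue()
    t = threading.Thread(target=worker_loop, args=(inbox, outbox), daemon=True)
    t.start()
    inbox.put(("WORK", 42))
    print(outbox.get())  # ('RESULT', 42)
    inbox.put(("STOP", None))
    t.join()

Because the worker thread shares the manager's address space, objects cross the
queues by reference rather than by pickling, which is one practical draw of a
threaded worker over a separate process.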
From 3aeab06b4a6054810cdaf5d39bb6ff7cc0f30895 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 17 Jan 2024 11:40:15 -0600 Subject: [PATCH 11/76] undoing "iterations" change in worker, seeing if we can simply submit gen work to local worker thread --- libensemble/manager.py | 45 ++++++++++++++++++++++++++++++++++++++---- libensemble/worker.py | 10 ++-------- 2 files changed, 43 insertions(+), 12 deletions(-) diff --git a/libensemble/manager.py b/libensemble/manager.py index cce7682f8..d1f7a2d83 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -18,7 +18,8 @@ import numpy.typing as npt from numpy.lib.recfunctions import repack_fields -from libensemble.comms.comms import CommFinishedException +from libensemble.comms.comms import CommFinishedException, QCommThread +from libensemble.executors.executor import Executor from libensemble.message_numbers import ( EVAL_GEN_TAG, EVAL_SIM_TAG, @@ -37,7 +38,7 @@ from libensemble.utils.misc import extract_H_ranges from libensemble.utils.output_directory import EnsembleDirectory from libensemble.utils.timer import Timer -from libensemble.worker import WorkerErrMsg +from libensemble.worker import WorkerErrMsg, worker_main logger = logging.getLogger(__name__) # For debug messages - uncomment @@ -209,6 +210,29 @@ def __init__( (1, "stop_val", self.term_test_stop_val), ] + self.local_worker_comm = None + self.libE_specs["gen_man"] = True + + dtypes = { + EVAL_SIM_TAG: repack_fields(hist.H[sim_specs["in"]]).dtype, + EVAL_GEN_TAG: repack_fields(hist.H[gen_specs["in"]]).dtype, + } + + if self.libE_specs.get("gen_man", False): + self.local_worker_comm = QCommThread( + worker_main, + len(self.wcomms), + sim_specs, + gen_specs, + libE_specs, + 0, + False, + Resources.resources, + Executor.executor, + ) + self.local_worker_comm.run() + self.local_worker_comm.send(0, dtypes) + temp_EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) self.resources = Resources.resources self.scheduler_opts = self.libE_specs.get("scheduler_opts", {}) @@ -265,6 +289,8 @@ def term_test(self, logged: bool = True) -> Union[bool, int]: def _kill_workers(self) -> None: """Kills the workers""" + if self.local_worker_comm: + self.local_worker_comm.send(STOP_TAG, MAN_SIGNAL_FINISH) for w in self.W["worker_id"]: self.wcomms[w - 1].send(STOP_TAG, MAN_SIGNAL_FINISH) @@ -373,7 +399,10 @@ def _send_work_order(self, Work: dict, w: int) -> None: if self.resources: self._set_resources(Work, w) - self.wcomms[w - 1].send(Work["tag"], Work) + if Work["tag"] == EVAL_GEN_TAG and self.libE_specs.get("gen_man", False): + self.local_worker_comm.send(Work["tag"], Work) + else: + self.wcomms[w - 1].send(Work["tag"], Work) if Work["tag"] == EVAL_GEN_TAG: self.W[w - 1]["gen_started_time"] = time.time() @@ -386,7 +415,11 @@ def _send_work_order(self, Work: dict, w: int) -> None: H_to_be_sent = np.empty(len(work_rows), dtype=new_dtype) for i, row in enumerate(work_rows): H_to_be_sent[i] = repack_fields(self.hist.H[Work["H_fields"]][row]) - self.wcomms[w - 1].send(0, H_to_be_sent) + + if Work["tag"] == EVAL_GEN_TAG and self.libE_specs.get("gen_man", False): + self.local_worker_comm.send(0, H_to_be_sent) + else: + self.wcomms[w - 1].send(0, H_to_be_sent) def _update_state_on_alloc(self, Work: dict, w: int): """Updates a workers' active/idle status following an allocation order""" @@ -525,6 +558,8 @@ def _kill_cancelled_sims(self) -> None: kill_ids = self.hist.H["sim_id"][kill_sim_rows] kill_on_workers = self.hist.H["sim_worker"][kill_sim_rows] for w in kill_on_workers: + if 
self.local_worker_comm: + self.local_worker_comm.send(STOP_TAG, MAN_SIGNAL_KILL) self.wcomms[w - 1].send(STOP_TAG, MAN_SIGNAL_KILL) self.hist.H["kill_sent"][kill_ids] = True @@ -555,6 +590,8 @@ def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int): self._send_work_order(work, w) self.hist.update_history_to_gen(rows_to_send) else: + if self.local_worker_comm: + self.local_worker_comm.send(PERSIS_STOP, MAN_SIGNAL_KILL) self.wcomms[w - 1].send(PERSIS_STOP, MAN_SIGNAL_KILL) if not self.W[w - 1]["active"]: # Re-activate if necessary diff --git a/libensemble/worker.py b/libensemble/worker.py index 96d2de8bf..9c18c18d6 100644 --- a/libensemble/worker.py +++ b/libensemble/worker.py @@ -51,7 +51,6 @@ def worker_main( log_comm: bool = True, resources: Resources = None, executor: Executor = None, - iterations: int = 0, ) -> None: # noqa: F821 """Evaluates calculations given to it by the manager. @@ -109,7 +108,7 @@ def worker_main( # Set up and run worker worker = Worker(comm, dtypes, workerID, sim_specs, gen_specs, libE_specs) with LS.loc("workflow"): - worker.run(iterations) + worker.run() if libE_specs.get("profile"): pr.disable() @@ -375,13 +374,11 @@ def _handle(self, Work: dict) -> dict: "calc_type": calc_type, } - def run(self, iterations=0) -> None: + def run(self) -> None: """Runs the main worker loop.""" try: logger.info(f"Worker {self.workerID} initiated on node {socket.gethostname()}") - current_iterations = 0 - for worker_iter in count(start=1): logger.debug(f"Iteration {worker_iter}") @@ -410,9 +407,6 @@ def run(self, iterations=0) -> None: if response is None: break self.comm.send(0, response) - current_iterations += 1 - if iterations > 0 and (current_iterations >= iterations): - break except Exception as e: self.comm.send(0, WorkerErrMsg(" ".join(format_exc_msg(type(e), e)).strip(), format_exc())) From b083a2158d4ffef1c30075294999b7e5aeacc679 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 17 Jan 2024 14:21:34 -0600 Subject: [PATCH 12/76] add attempted update_state_on_local_gen_msg and handle_msg_from_local_gen, add in Worker wrapper class to manager, but not used yet --- libensemble/manager.py | 83 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/libensemble/manager.py b/libensemble/manager.py index d1f7a2d83..18f818ff1 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -155,6 +155,51 @@ def filter_nans(array: npt.NDArray) -> npt.NDArray: """ +class _Worker: + """Wrapper class for Worker array and worker comms""" + + # def __new__(cls, W: npt.NDArray, wid: int, wcomms: list = []): + # if wid == 0: + # return super(Worker, ManagerWorker).__new__(ManagerWorker) + # else: + # return super().__new__(Worker) + + # def __init__(self, W: npt.NDArray, wid: int, wcomms: list = []): + # self.__dict__["_W"] = W + # self.__dict__["_wididx"] = wid + # self.__dict__["_wcomms"] = wcomms + + # def __setattr__(self, field, value): + # self._W[self._wididx][field] = value + + # def __getattr__(self, field): + # return self._W[self._wididx][field] + + # def update_state_on_alloc(self, Work: dict): + # self.active = Work["tag"] + # if "persistent" in Work["libE_info"]: + # self.persis_state = Work["tag"] + # if Work["libE_info"].get("active_recv", False): + # self.active_recv = Work["tag"] + # else: + # assert "active_recv" not in Work["libE_info"], "active_recv worker must also be persistent" + + # def update_persistent_state(self): + # self.persis_state = 0 + # if self.active_recv: + # self.active = 0 + # self.active_recv = 0 
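+    # (Why a wrapper at all: __setattr__/__getattr__ proxy attribute access into
+    # the structured W row, so call sites can read worker.active or assign
+    # worker.persis_state instead of indexing self.W[w - 1][...] by hand.)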
+ + # def send(self, tag, data): + # self._wcomms[self._wididx].send(tag, data) + + # def mail_flag(self): + # return self._wcomms[self._wididx].mail_flag() + + # def recv(self): + # return self._wcomms[self._wididx].recv() + + class Manager: """Manager class for libensemble.""" @@ -454,6 +499,40 @@ def _check_received_calc(D_recv: dict) -> None: calc_status, str ), f"Aborting: Unknown calculation status received. Received status: {calc_status}" + def _update_state_on_local_gen_msg(self, persis_info, D_recv): + calc_type = D_recv["calc_type"] + # calc_status = D_recv["calc_status"] + Manager._check_received_calc(D_recv) + + # keep_state = D_recv["libE_info"].get("keep_state", False) + + if calc_type == EVAL_GEN_TAG: + self.hist.update_history_x_in(0, D_recv["calc_out"], 999) + + if D_recv.get("persis_info"): + persis_info[0].update(D_recv["persis_info"]) + + def _handle_msg_from_local_gen(self, persis_info: dict) -> None: + """Handles a message from worker w""" + try: + msg = self.local_worker_comm.recv() + tag, D_recv = msg + except CommFinishedException: + logger.debug("Finalizing message from Worker 0") + return + if isinstance(D_recv, WorkerErrMsg): + logger.debug("Manager received exception from worker 0") + if not self.WorkerExc: + self.WorkerExc = True + self._kill_workers() + raise WorkerException("Received error message from worker 0", D_recv.msg, D_recv.exc) + elif isinstance(D_recv, logging.LogRecord): + logger.debug("Manager received a log message from worker 0") + logging.getLogger(D_recv.name).handle(D_recv) + else: + logger.debug("Manager received data message from worker 0") + self._update_state_on_local_gen_msg(persis_info, D_recv) + def _receive_from_workers(self, persis_info: dict) -> dict: """Receives calculation output from workers. Loops over all active workers and probes to see if worker is ready to @@ -464,6 +543,9 @@ def _receive_from_workers(self, persis_info: dict) -> dict: new_stuff = True while new_stuff: new_stuff = False + if self.local_worker_comm.mail_flag(): + new_stuff = True + self._handle_msg_from_local_gen(persis_info) for w in self.W["worker_id"]: if self.wcomms[w - 1].mail_flag(): new_stuff = True @@ -638,6 +720,7 @@ def _get_alloc_libE_info(self) -> dict: "use_resource_sets": self.use_resource_sets, "gen_num_procs": self.gen_num_procs, "gen_num_gpus": self.gen_num_gpus, + "gen_on_man": self.libE_specs.get("gen_man", False), } def _alloc_work(self, H: npt.NDArray, persis_info: dict) -> dict: From 231e2b725220948a3a2a0cd138f3f6b286bcd77a Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 17 Jan 2024 15:36:58 -0600 Subject: [PATCH 13/76] use _Worker class to correctly index into W and wcomms. add initial option to libE_specs --- docs/data_structures/libE_specs.rst | 5 +- libensemble/manager.py | 201 +++++++++++----------------- libensemble/specs.py | 5 +- 3 files changed, 84 insertions(+), 127 deletions(-) diff --git a/docs/data_structures/libE_specs.rst b/docs/data_structures/libE_specs.rst index d471cf968..15646b1c3 100644 --- a/docs/data_structures/libE_specs.rst +++ b/docs/data_structures/libE_specs.rst @@ -28,7 +28,10 @@ libEnsemble is primarily customized by setting options within a ``LibeSpecs`` cl Manager/Worker communications mode: ``'mpi'``, ``'local'``, or ``'tcp'``. **nworkers** [int]: - Number of worker processes in ``"local"`` or ``"tcp"``. + Number of worker processes in ``"local"``, ``"threads"``, or ``"tcp"``. 
+ + **manager_runs_additional_worker** [int] = False + Manager process can launch an additional threaded worker **mpi_comm** [MPI communicator] = ``MPI.COMM_WORLD``: libEnsemble MPI communicator. diff --git a/libensemble/manager.py b/libensemble/manager.py index 18f818ff1..2fedd5336 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -158,46 +158,43 @@ def filter_nans(array: npt.NDArray) -> npt.NDArray: class _Worker: """Wrapper class for Worker array and worker comms""" - # def __new__(cls, W: npt.NDArray, wid: int, wcomms: list = []): - # if wid == 0: - # return super(Worker, ManagerWorker).__new__(ManagerWorker) - # else: - # return super().__new__(Worker) - - # def __init__(self, W: npt.NDArray, wid: int, wcomms: list = []): - # self.__dict__["_W"] = W - # self.__dict__["_wididx"] = wid - # self.__dict__["_wcomms"] = wcomms + def __init__(self, W: npt.NDArray, wid: int, wcomms: list = []): + self.__dict__["_W"] = W + if 0 in W["worker_id"]: # Contains "0" for manager. Otherwise first entry is Worker 1 + self.__dict__["_wididx"] = wid + else: + self.__dict__["_wididx"] = wid - 1 + self.__dict__["_wcomms"] = wcomms - # def __setattr__(self, field, value): - # self._W[self._wididx][field] = value + def __setattr__(self, field, value): + self._W[self._wididx][field] = value - # def __getattr__(self, field): - # return self._W[self._wididx][field] + def __getattr__(self, field): + return self._W[self._wididx][field] - # def update_state_on_alloc(self, Work: dict): - # self.active = Work["tag"] - # if "persistent" in Work["libE_info"]: - # self.persis_state = Work["tag"] - # if Work["libE_info"].get("active_recv", False): - # self.active_recv = Work["tag"] - # else: - # assert "active_recv" not in Work["libE_info"], "active_recv worker must also be persistent" + def update_state_on_alloc(self, Work: dict): + self.active = Work["tag"] + if "persistent" in Work["libE_info"]: + self.persis_state = Work["tag"] + if Work["libE_info"].get("active_recv", False): + self.active_recv = Work["tag"] + else: + assert "active_recv" not in Work["libE_info"], "active_recv worker must also be persistent" - # def update_persistent_state(self): - # self.persis_state = 0 - # if self.active_recv: - # self.active = 0 - # self.active_recv = 0 + def update_persistent_state(self): + self.persis_state = 0 + if self.active_recv: + self.active = 0 + self.active_recv = 0 - # def send(self, tag, data): - # self._wcomms[self._wididx].send(tag, data) + def send(self, tag, data): + self._wcomms[self._wididx].send(tag, data) - # def mail_flag(self): - # return self._wcomms[self._wididx].mail_flag() + def mail_flag(self): + return self._wcomms[self._wididx].mail_flag() - # def recv(self): - # return self._wcomms[self._wididx].recv() + def recv(self): + return self._wcomms[self._wididx].recv() class Manager: @@ -255,16 +252,16 @@ def __init__( (1, "stop_val", self.term_test_stop_val), ] - self.local_worker_comm = None - self.libE_specs["gen_man"] = True + if self.libE_specs.get("manager_runs_additional_worker", False): - dtypes = { - EVAL_SIM_TAG: repack_fields(hist.H[sim_specs["in"]]).dtype, - EVAL_GEN_TAG: repack_fields(hist.H[gen_specs["in"]]).dtype, - } + dtypes = { + EVAL_SIM_TAG: repack_fields(hist.H[sim_specs["in"]]).dtype, + EVAL_GEN_TAG: repack_fields(hist.H[gen_specs["in"]]).dtype, + } - if self.libE_specs.get("gen_man", False): - self.local_worker_comm = QCommThread( + self.W = np.zeros(len(self.wcomms) + 1, dtype=Manager.worker_dtype) + self.W["worker_id"] = np.arange(len(self.wcomms) + 1) + 
local_worker_comm = QCommThread( worker_main, len(self.wcomms), sim_specs, @@ -275,8 +272,9 @@ def __init__( Resources.resources, Executor.executor, ) - self.local_worker_comm.run() - self.local_worker_comm.send(0, dtypes) + self.wcomms = [local_worker_comm] + self.wcomms + local_worker_comm.run() + local_worker_comm.send(0, dtypes) temp_EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) self.resources = Resources.resources @@ -334,10 +332,9 @@ def term_test(self, logged: bool = True) -> Union[bool, int]: def _kill_workers(self) -> None: """Kills the workers""" - if self.local_worker_comm: - self.local_worker_comm.send(STOP_TAG, MAN_SIGNAL_FINISH) for w in self.W["worker_id"]: - self.wcomms[w - 1].send(STOP_TAG, MAN_SIGNAL_FINISH) + worker = _Worker(self.W, w, self.wcomms) + worker.send(STOP_TAG, MAN_SIGNAL_FINISH) # --- Checkpointing logic @@ -391,15 +388,16 @@ def _init_every_k_save(self, complete=False) -> None: def _check_work_order(self, Work: dict, w: int, force: bool = False) -> None: """Checks validity of an allocation function order""" - assert w != 0, "Can't send to worker 0; this is the manager." - if self.W[w - 1]["active_recv"]: + # assert w != 0, "Can't send to worker 0; this is the manager." + worker = _Worker(self.W, w, self.wcomms) + if worker.active_recv: assert "active_recv" in Work["libE_info"], ( "Messages to a worker in active_recv mode should have active_recv" f"set to True in libE_info. Work['libE_info'] is {Work['libE_info']}" ) else: if not force: - assert self.W[w - 1]["active"] == 0, ( + assert worker.active == 0, ( "Allocation function requested work be sent to worker %d, an already active worker." % w ) work_rows = Work["libE_info"]["H_rows"] @@ -441,16 +439,15 @@ def _send_work_order(self, Work: dict, w: int) -> None: """Sends an allocation function order to a worker""" logger.debug(f"Manager sending work unit to worker {w}") + worker = _Worker(self.W, w, self.wcomms) + if self.resources: self._set_resources(Work, w) - if Work["tag"] == EVAL_GEN_TAG and self.libE_specs.get("gen_man", False): - self.local_worker_comm.send(Work["tag"], Work) - else: - self.wcomms[w - 1].send(Work["tag"], Work) + worker.send(Work["tag"], Work) if Work["tag"] == EVAL_GEN_TAG: - self.W[w - 1]["gen_started_time"] = time.time() + worker.gen_started_time = time.time() work_rows = Work["libE_info"]["H_rows"] work_name = calc_type_strings[Work["tag"]] @@ -461,21 +458,13 @@ def _send_work_order(self, Work: dict, w: int) -> None: for i, row in enumerate(work_rows): H_to_be_sent[i] = repack_fields(self.hist.H[Work["H_fields"]][row]) - if Work["tag"] == EVAL_GEN_TAG and self.libE_specs.get("gen_man", False): - self.local_worker_comm.send(0, H_to_be_sent) - else: - self.wcomms[w - 1].send(0, H_to_be_sent) + worker.send(0, H_to_be_sent) def _update_state_on_alloc(self, Work: dict, w: int): """Updates a workers' active/idle status following an allocation order""" - self.W[w - 1]["active"] = Work["tag"] - if "libE_info" in Work: - if "persistent" in Work["libE_info"]: - self.W[w - 1]["persis_state"] = Work["tag"] - if Work["libE_info"].get("active_recv", False): - self.W[w - 1]["active_recv"] = Work["tag"] - else: - assert "active_recv" not in Work["libE_info"], "active_recv worker must also be persistent" + + worker = _Worker(self.W, w, self.wcomms) + worker.update_state_on_alloc(Work) work_rows = Work["libE_info"]["H_rows"] if Work["tag"] == EVAL_SIM_TAG: @@ -499,40 +488,6 @@ def _check_received_calc(D_recv: dict) -> None: calc_status, str ), f"Aborting: Unknown calculation status 
received. Received status: {calc_status}" - def _update_state_on_local_gen_msg(self, persis_info, D_recv): - calc_type = D_recv["calc_type"] - # calc_status = D_recv["calc_status"] - Manager._check_received_calc(D_recv) - - # keep_state = D_recv["libE_info"].get("keep_state", False) - - if calc_type == EVAL_GEN_TAG: - self.hist.update_history_x_in(0, D_recv["calc_out"], 999) - - if D_recv.get("persis_info"): - persis_info[0].update(D_recv["persis_info"]) - - def _handle_msg_from_local_gen(self, persis_info: dict) -> None: - """Handles a message from worker w""" - try: - msg = self.local_worker_comm.recv() - tag, D_recv = msg - except CommFinishedException: - logger.debug("Finalizing message from Worker 0") - return - if isinstance(D_recv, WorkerErrMsg): - logger.debug("Manager received exception from worker 0") - if not self.WorkerExc: - self.WorkerExc = True - self._kill_workers() - raise WorkerException("Received error message from worker 0", D_recv.msg, D_recv.exc) - elif isinstance(D_recv, logging.LogRecord): - logger.debug("Manager received a log message from worker 0") - logging.getLogger(D_recv.name).handle(D_recv) - else: - logger.debug("Manager received data message from worker 0") - self._update_state_on_local_gen_msg(persis_info, D_recv) - def _receive_from_workers(self, persis_info: dict) -> dict: """Receives calculation output from workers. Loops over all active workers and probes to see if worker is ready to @@ -543,11 +498,9 @@ def _receive_from_workers(self, persis_info: dict) -> dict: new_stuff = True while new_stuff: new_stuff = False - if self.local_worker_comm.mail_flag(): - new_stuff = True - self._handle_msg_from_local_gen(persis_info) for w in self.W["worker_id"]: - if self.wcomms[w - 1].mail_flag(): + worker = _Worker(self.W, w, self.wcomms) + if worker.mail_flag(): new_stuff = True self._handle_msg_from_worker(persis_info, w) @@ -560,38 +513,37 @@ def _update_state_on_worker_msg(self, persis_info: dict, D_recv: dict, w: int) - calc_status = D_recv["calc_status"] Manager._check_received_calc(D_recv) + worker = _Worker(self.W, w, self.wcomms) + keep_state = D_recv["libE_info"].get("keep_state", False) - if w not in self.persis_pending and not self.W[w - 1]["active_recv"] and not keep_state: - self.W[w - 1]["active"] = 0 + if w not in self.persis_pending and not worker.active_recv and not keep_state: + worker.active = 0 if calc_status in [FINISHED_PERSISTENT_SIM_TAG, FINISHED_PERSISTENT_GEN_TAG]: final_data = D_recv.get("calc_out", None) if isinstance(final_data, np.ndarray): if calc_status is FINISHED_PERSISTENT_GEN_TAG and self.libE_specs.get("use_persis_return_gen", False): - self.hist.update_history_x_in(w, final_data, self.W[w - 1]["gen_started_time"]) + self.hist.update_history_x_in(w, final_data, worker.gen_started_time) elif calc_status is FINISHED_PERSISTENT_SIM_TAG and self.libE_specs.get("use_persis_return_sim", False): self.hist.update_history_f(D_recv, self.kill_canceled_sims) else: logger.info(_PERSIS_RETURN_WARNING) - self.W[w - 1]["persis_state"] = 0 - if self.W[w - 1]["active_recv"]: - self.W[w - 1]["active"] = 0 - self.W[w - 1]["active_recv"] = 0 + worker.update_persistent_state() if w in self.persis_pending: self.persis_pending.remove(w) - self.W[w - 1]["active"] = 0 + worker.active = 0 self._freeup_resources(w) else: if calc_type == EVAL_SIM_TAG: self.hist.update_history_f(D_recv, self.kill_canceled_sims) if calc_type == EVAL_GEN_TAG: - self.hist.update_history_x_in(w, D_recv["calc_out"], self.W[w - 1]["gen_started_time"]) + 
self.hist.update_history_x_in(w, D_recv["calc_out"], worker.gen_started_time) assert ( - len(D_recv["calc_out"]) or np.any(self.W["active"]) or self.W[w - 1]["persis_state"] + len(D_recv["calc_out"]) or np.any(self.W["active"]) or worker.persis_state ), "Gen must return work when is is the only thing active and not persistent." if "libE_info" in D_recv and "persistent" in D_recv["libE_info"]: # Now a waiting, persistent worker - self.W[w - 1]["persis_state"] = calc_type + worker.persis_state = calc_type else: self._freeup_resources(w) @@ -600,14 +552,15 @@ def _update_state_on_worker_msg(self, persis_info: dict, D_recv: dict, w: int) - def _handle_msg_from_worker(self, persis_info: dict, w: int) -> None: """Handles a message from worker w""" + worker = _Worker(self.W, w, self.wcomms) try: - msg = self.wcomms[w - 1].recv() + msg = worker.recv() tag, D_recv = msg except CommFinishedException: logger.debug(f"Finalizing message from Worker {w}") return if isinstance(D_recv, WorkerErrMsg): - self.W[w - 1]["active"] = 0 + worker.active = 0 logger.debug(f"Manager received exception from worker {w}") if not self.WorkerExc: self.WorkerExc = True @@ -640,9 +593,8 @@ def _kill_cancelled_sims(self) -> None: kill_ids = self.hist.H["sim_id"][kill_sim_rows] kill_on_workers = self.hist.H["sim_worker"][kill_sim_rows] for w in kill_on_workers: - if self.local_worker_comm: - self.local_worker_comm.send(STOP_TAG, MAN_SIGNAL_KILL) - self.wcomms[w - 1].send(STOP_TAG, MAN_SIGNAL_KILL) + worker = _Worker(self.W, w, self.wcomms) + worker.send(STOP_TAG, MAN_SIGNAL_KILL) self.hist.H["kill_sent"][kill_ids] = True # --- Handle termination @@ -659,6 +611,7 @@ def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int): # Send a handshake signal to each persistent worker. if any(self.W["persis_state"]): for w in self.W["worker_id"][self.W["persis_state"] > 0]: + worker = _Worker(self.W, w, self.wcomms) logger.debug(f"Manager sending PERSIS_STOP to worker {w}") if self.libE_specs.get("final_gen_send", False): rows_to_send = np.where(self.hist.H["sim_ended"] & ~self.hist.H["gen_informed"])[0] @@ -672,12 +625,10 @@ def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int): self._send_work_order(work, w) self.hist.update_history_to_gen(rows_to_send) else: - if self.local_worker_comm: - self.local_worker_comm.send(PERSIS_STOP, MAN_SIGNAL_KILL) - self.wcomms[w - 1].send(PERSIS_STOP, MAN_SIGNAL_KILL) - if not self.W[w - 1]["active"]: + worker.send(PERSIS_STOP, MAN_SIGNAL_KILL) + if not worker.active: # Re-activate if necessary - self.W[w - 1]["active"] = self.W[w - 1]["persis_state"] + worker.active = worker.persis_state self.persis_pending.append(w) exit_flag = 0 diff --git a/libensemble/specs.py b/libensemble/specs.py index f7b7b3ea5..4678b01d4 100644 --- a/libensemble/specs.py +++ b/libensemble/specs.py @@ -160,7 +160,10 @@ class LibeSpecs(BaseModel): """ Manager/Worker communications mode. ``'mpi'``, ``'local'``, ``'threads'``, or ``'tcp'`` """ nworkers: Optional[int] = 0 - """ Number of worker processes in ``"local"`` or ``"tcp"``.""" + """ Number of worker processes in ``"local"``, ``"threads"``, or ``"tcp"``.""" + + manager_runs_additional_worker: Optional[int] = False + """ Manager process can launch an additional threaded worker """ mpi_comm: Optional[Any] = None """ libEnsemble MPI communicator. 
Default: ``MPI.COMM_WORLD``"""

From d251363158114b97e306307bd322ec6bba1b16bd Mon Sep 17 00:00:00 2001
From: jlnav
Date: Wed, 17 Jan 2024 15:48:11 -0600
Subject: [PATCH 14/76] add "threaded" tentative option to sim/gen_specs

---
 libensemble/message_numbers.py |  2 --
 libensemble/specs.py           | 10 ++++++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/libensemble/message_numbers.py b/libensemble/message_numbers.py
index 6caef0a6e..adfcbc244 100644
--- a/libensemble/message_numbers.py
+++ b/libensemble/message_numbers.py
@@ -41,8 +41,6 @@
 # last_calc_status_rst_tag
 CALC_EXCEPTION = 35  # Reserved: Automatically used if user_f raised an exception

-EVAL_FINAL_GEN_TAG = 36
-
 MAN_KILL_SIGNALS = [MAN_SIGNAL_FINISH, MAN_SIGNAL_KILL]

 calc_status_strings = {
diff --git a/libensemble/specs.py b/libensemble/specs.py
index 4678b01d4..13824bbc1 100644
--- a/libensemble/specs.py
+++ b/libensemble/specs.py
@@ -55,6 +55,11 @@ class SimSpecs(BaseModel):
     calling them locally.
     """

+    threaded: Optional[bool] = False
+    """
+    Instruct the worker process to launch the user function in a thread.
+    """
+
     user: Optional[dict] = {}
     """
     A user-data dictionary to place bounds, constants, settings, or other parameters for customizing
@@ -100,6 +105,11 @@ class GenSpecs(BaseModel):
     calling them locally.
     """

+    threaded: Optional[bool] = False
+    """
+    Instruct the worker process to launch the user function in a thread.
+    """
+
     user: Optional[dict] = {}
     """
     A user-data dictionary to place bounds, constants, settings, or other parameters for

From 368bf937c4136a05d13dfdb5e36bf7fe3f3ebc96 Mon Sep 17 00:00:00 2001
From: jlnav
Date: Wed, 17 Jan 2024 15:56:40 -0600
Subject: [PATCH 15/76] fix ThreadRunner shutdown when that worker didn't launch a thread

---
 libensemble/utils/runners.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libensemble/utils/runners.py b/libensemble/utils/runners.py
index e21c87ba5..0ea9ce1e7 100644
--- a/libensemble/utils/runners.py
+++ b/libensemble/utils/runners.py
@@ -73,6 +73,7 @@ def shutdown(self) -> None:
 class ThreadRunner(Runner):
     def __init__(self, specs):
         super().__init__(specs)
+        self.thread_handle = None

     def _result(self, calc_in: npt.NDArray, persis_info: dict, libE_info: dict) -> (npt.NDArray, dict, Optional[int]):
         fargs = self._truncate_args(calc_in, persis_info, libE_info)
@@ -81,4 +82,5 @@ def _result(self, calc_in: npt.NDArray, persis_info: dict, libE_info: dict) -> (
         return self.thread_handle.result()

     def shutdown(self) -> None:
-        self.thread_handle.terminate()
+        if self.thread_handle is not None:
+            self.thread_handle.terminate()

From 744620d381e7b4881d8ac2fe83d28eb7e5f1717a Mon Sep 17 00:00:00 2001
From: jlnav
Date: Thu, 18 Jan 2024 10:08:48 -0600
Subject: [PATCH 16/76] adds test-case to functionality tests, fixes alloc_f
 libE_info usable entry

---
 libensemble/manager.py                                       | 2 +-
 .../functionality_tests/test_persistent_uniform_sampling.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/libensemble/manager.py b/libensemble/manager.py
index 2fedd5336..e9a42f74d 100644
--- a/libensemble/manager.py
+++ b/libensemble/manager.py
@@ -671,7 +671,7 @@ def _get_alloc_libE_info(self) -> dict:
             "use_resource_sets": self.use_resource_sets,
             "gen_num_procs": self.gen_num_procs,
             "gen_num_gpus": self.gen_num_gpus,
-            "gen_on_man": self.libE_specs.get("gen_man", False),
+            "manager_additional_worker": self.libE_specs.get("manager_runs_additional_worker", False),
         }

     def _alloc_work(self, H: npt.NDArray, persis_info: dict) -> dict:
diff --git 
a/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py b/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py index bd381f3ae..e343ff991 100644 --- a/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py +++ b/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py @@ -62,7 +62,7 @@ libE_specs["kill_canceled_sims"] = False - for run in range(3): + for run in range(4): persis_info = add_unique_random_streams({}, nworkers + 1) for i in persis_info: persis_info[i]["get_grad"] = True @@ -86,6 +86,8 @@ sim_specs["out"] = [("f_i", float), ("gradf_i", float, 2 * m)] sim_specs["in"] = ["x", "obj_component"] # sim_specs["out"] = [("f", float), ("grad", float, n)] + elif run == 3: + libE_specs["manager_runs_additional_worker"] = True # Perform the run H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) From cd6f0db09dc5b5e66f8e3d4e0bff383f9828e98f Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 18 Jan 2024 12:37:03 -0600 Subject: [PATCH 17/76] make resources reflect develop? --- libensemble/resources/scheduler.py | 2 +- libensemble/resources/worker_resources.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/libensemble/resources/scheduler.py b/libensemble/resources/scheduler.py index 386a406bc..04de87e77 100644 --- a/libensemble/resources/scheduler.py +++ b/libensemble/resources/scheduler.py @@ -245,7 +245,7 @@ def get_avail_rsets_by_group(self): for g in groups: self.avail_rsets_by_group[g] = [] for ind, rset in enumerate(rsets): - if rset["assigned"] == -1: # now default is -1. + if not rset["assigned"]: g = rset["group"] self.avail_rsets_by_group[g].append(ind) return self.avail_rsets_by_group diff --git a/libensemble/resources/worker_resources.py b/libensemble/resources/worker_resources.py index 2becaa1df..639f27da7 100644 --- a/libensemble/resources/worker_resources.py +++ b/libensemble/resources/worker_resources.py @@ -50,10 +50,11 @@ def __init__(self, num_workers: int, resources: "GlobalResources") -> None: # n ) self.rsets = np.zeros(self.total_num_rsets, dtype=ResourceManager.man_rset_dtype) - self.rsets["assigned"] = -1 # Can assign to manager (=0) so make unset value -1 + self.rsets["assigned"] = 0 for field in self.all_rsets.dtype.names: self.rsets[field] = self.all_rsets[field] self.num_groups = self.rsets["group"][-1] + self.rsets_free = self.total_num_rsets self.gpu_rsets_free = self.total_num_gpu_rsets self.nongpu_rsets_free = self.total_num_nongpu_rsets @@ -69,7 +70,7 @@ def assign_rsets(self, rset_team, worker_id): if rset_team: rteam = self.rsets["assigned"][rset_team] for i, wid in enumerate(rteam): - if wid == -1: + if wid == 0: self.rsets["assigned"][rset_team[i]] = worker_id self.rsets_free -= 1 if self.rsets["gpus"][rset_team[i]]: @@ -84,13 +85,13 @@ def assign_rsets(self, rset_team, worker_id): def free_rsets(self, worker=None): """Free up assigned resource sets""" if worker is None: - self.rsets["assigned"] = -1 + self.rsets["assigned"] = 0 self.rsets_free = self.total_num_rsets self.gpu_rsets_free = self.total_num_gpu_rsets self.nongpu_rsets_free = self.total_num_nongpu_rsets else: rsets_to_free = np.where(self.rsets["assigned"] == worker)[0] - self.rsets["assigned"][rsets_to_free] = -1 + self.rsets["assigned"][rsets_to_free] = 0 self.rsets_free += len(rsets_to_free) self.gpu_rsets_free += np.count_nonzero(self.rsets["gpus"][rsets_to_free]) self.nongpu_rsets_free += 
np.count_nonzero(~self.rsets["gpus"][rsets_to_free]) @@ -199,6 +200,7 @@ def __init__(self, num_workers, resources, workerID): self.gen_nprocs = None self.gen_ngpus = None self.platform_info = resources.platform_info + self.tiles_per_gpu = resources.tiles_per_gpu # User convenience functions ---------------------------------------------- @@ -216,6 +218,9 @@ def get_slots_as_string(self, multiplier=1, delimiter=",", limit=None): slot_list = [j for i in self.slots_on_node for j in range(i * n, (i + 1) * n)] if limit is not None: slot_list = slot_list[:limit] + if self.tiles_per_gpu > 1: + ntiles = self.tiles_per_gpu + slot_list = [f"{i // ntiles}.{i % ntiles}" for i in slot_list] slots = delimiter.join(map(str, slot_list)) return slots From 884d61b7174626ab91e05e0040c37371a61bcee5 Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 22 Jan 2024 13:19:40 -0600 Subject: [PATCH 18/76] remove old symlink --- examples/calling_scripts/tutorial_calling.py | 1 - 1 file changed, 1 deletion(-) delete mode 120000 examples/calling_scripts/tutorial_calling.py diff --git a/examples/calling_scripts/tutorial_calling.py b/examples/calling_scripts/tutorial_calling.py deleted file mode 120000 index f54fe1ad7..000000000 --- a/examples/calling_scripts/tutorial_calling.py +++ /dev/null @@ -1 +0,0 @@ -../tutorials/simple_sine/tutorial_calling.py \ No newline at end of file From dfb0fbbcf176e20182093fc0544232e9cb1cdcad Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 22 Jan 2024 14:07:48 -0600 Subject: [PATCH 19/76] print evaluated lines in check_libe_stats for now --- libensemble/tests/functionality_tests/check_libE_stats.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libensemble/tests/functionality_tests/check_libE_stats.py b/libensemble/tests/functionality_tests/check_libE_stats.py index 8e4e9c0cc..8260c25c0 100644 --- a/libensemble/tests/functionality_tests/check_libE_stats.py +++ b/libensemble/tests/functionality_tests/check_libE_stats.py @@ -39,6 +39,7 @@ def check_start_end_times(start="Start:", end="End:", everyline=True): with open(infile) as f: total_cnt = 0 for line in f: + print(line) s_cnt = 0 e_cnt = 0 lst = line.split() From ec236ed15d7e302c69edbdb96df970f2d26468bf Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 22 Jan 2024 14:49:26 -0600 Subject: [PATCH 20/76] only want to perform this specific datetime check on indexes 5 and 6 of a split stats line if the line is a Manager: starting or Manager: exiting line --- libensemble/tests/functionality_tests/check_libE_stats.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libensemble/tests/functionality_tests/check_libE_stats.py b/libensemble/tests/functionality_tests/check_libE_stats.py index 8260c25c0..424c07d8b 100644 --- a/libensemble/tests/functionality_tests/check_libE_stats.py +++ b/libensemble/tests/functionality_tests/check_libE_stats.py @@ -39,11 +39,10 @@ def check_start_end_times(start="Start:", end="End:", everyline=True): with open(infile) as f: total_cnt = 0 for line in f: - print(line) s_cnt = 0 e_cnt = 0 lst = line.split() - if lst[0] == "Manager": + if line.startswith("Manager : Starting") or line.startswith("Manager : Exiting"): check_datetime(lst[5], lst[6]) continue for i, val in enumerate(lst): From f06148a2d5dee26edf44ba1e1ac65e9b0f7753db Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 24 Jan 2024 11:54:42 -0600 Subject: [PATCH 21/76] a much simpler indexing solution from shuds --- libensemble/manager.py | 118 ++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 73 deletions(-) diff --git 
a/libensemble/manager.py b/libensemble/manager.py index e9a42f74d..3d0b926dc 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -155,46 +155,16 @@ def filter_nans(array: npt.NDArray) -> npt.NDArray: """ -class _Worker: - """Wrapper class for Worker array and worker comms""" - - def __init__(self, W: npt.NDArray, wid: int, wcomms: list = []): - self.__dict__["_W"] = W - if 0 in W["worker_id"]: # Contains "0" for manager. Otherwise first entry is Worker 1 - self.__dict__["_wididx"] = wid - else: - self.__dict__["_wididx"] = wid - 1 - self.__dict__["_wcomms"] = wcomms - - def __setattr__(self, field, value): - self._W[self._wididx][field] = value - - def __getattr__(self, field): - return self._W[self._wididx][field] - - def update_state_on_alloc(self, Work: dict): - self.active = Work["tag"] - if "persistent" in Work["libE_info"]: - self.persis_state = Work["tag"] - if Work["libE_info"].get("active_recv", False): - self.active_recv = Work["tag"] +class _WorkerIndexer: + def __init__(self, iterable: list, additional_worker=False): + self.iterable = iterable + self.additional_worker = additional_worker + + def __getitem__(self, key): + if self.additional_worker or isinstance(key, str): + return self.iterable[key] else: - assert "active_recv" not in Work["libE_info"], "active_recv worker must also be persistent" - - def update_persistent_state(self): - self.persis_state = 0 - if self.active_recv: - self.active = 0 - self.active_recv = 0 - - def send(self, tag, data): - self._wcomms[self._wididx].send(tag, data) - - def mail_flag(self): - return self._wcomms[self._wididx].mail_flag() - - def recv(self): - return self._wcomms[self._wididx].recv() + return self.iterable[key - 1] class Manager: @@ -253,6 +223,7 @@ def __init__( ] if self.libE_specs.get("manager_runs_additional_worker", False): + # We start an additional Worker 0 on a thread. dtypes = { EVAL_SIM_TAG: repack_fields(hist.H[sim_specs["in"]]).dtype, @@ -276,13 +247,16 @@ def __init__( local_worker_comm.run() local_worker_comm.send(0, dtypes) + self.W = _WorkerIndexer(self.W, self.libE_specs.get("manager_runs_additional_worker", False)) + self.wcomms = _WorkerIndexer(self.wcomms, self.libE_specs.get("manager_runs_additional_worker", False)) + temp_EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) self.resources = Resources.resources self.scheduler_opts = self.libE_specs.get("scheduler_opts", {}) if self.resources is not None: gresource = self.resources.glob_resources self.scheduler_opts = gresource.update_scheduler_opts(self.scheduler_opts) - for wrk in self.W: + for wrk in self.W.iterable: if wrk["worker_id"] in gresource.zero_resource_workers: wrk["zero_resource_worker"] = True @@ -333,8 +307,7 @@ def term_test(self, logged: bool = True) -> Union[bool, int]: def _kill_workers(self) -> None: """Kills the workers""" for w in self.W["worker_id"]: - worker = _Worker(self.W, w, self.wcomms) - worker.send(STOP_TAG, MAN_SIGNAL_FINISH) + self.wcomms[w].send(STOP_TAG, MAN_SIGNAL_FINISH) # --- Checkpointing logic @@ -389,15 +362,14 @@ def _init_every_k_save(self, complete=False) -> None: def _check_work_order(self, Work: dict, w: int, force: bool = False) -> None: """Checks validity of an allocation function order""" # assert w != 0, "Can't send to worker 0; this is the manager." - worker = _Worker(self.W, w, self.wcomms) - if worker.active_recv: + if self.W[w]["active_recv"]: assert "active_recv" in Work["libE_info"], ( "Messages to a worker in active_recv mode should have active_recv" f"set to True in libE_info. 
Work['libE_info'] is {Work['libE_info']}" ) else: if not force: - assert worker.active == 0, ( + assert self.W[w]["active"] == 0, ( "Allocation function requested work be sent to worker %d, an already active worker." % w ) work_rows = Work["libE_info"]["H_rows"] @@ -439,15 +411,13 @@ def _send_work_order(self, Work: dict, w: int) -> None: """Sends an allocation function order to a worker""" logger.debug(f"Manager sending work unit to worker {w}") - worker = _Worker(self.W, w, self.wcomms) - if self.resources: self._set_resources(Work, w) - worker.send(Work["tag"], Work) + self.wcomms[w].send(Work["tag"], Work) if Work["tag"] == EVAL_GEN_TAG: - worker.gen_started_time = time.time() + self.W[w]["gen_started_time"] = time.time() work_rows = Work["libE_info"]["H_rows"] work_name = calc_type_strings[Work["tag"]] @@ -458,13 +428,18 @@ def _send_work_order(self, Work: dict, w: int) -> None: for i, row in enumerate(work_rows): H_to_be_sent[i] = repack_fields(self.hist.H[Work["H_fields"]][row]) - worker.send(0, H_to_be_sent) + self.wcomms[w].send(0, H_to_be_sent) def _update_state_on_alloc(self, Work: dict, w: int): """Updates a workers' active/idle status following an allocation order""" - worker = _Worker(self.W, w, self.wcomms) - worker.update_state_on_alloc(Work) + self.W[w]["active"] = Work["tag"] + if "persistent" in Work["libE_info"]: + self.W[w]["persis_state"] = Work["tag"] + if Work["libE_info"].get("active_recv", False): + self.W[w]["active_recv"] = Work["tag"] + else: + assert "active_recv" not in Work["libE_info"], "active_recv worker must also be persistent" work_rows = Work["libE_info"]["H_rows"] if Work["tag"] == EVAL_SIM_TAG: @@ -499,8 +474,7 @@ def _receive_from_workers(self, persis_info: dict) -> dict: while new_stuff: new_stuff = False for w in self.W["worker_id"]: - worker = _Worker(self.W, w, self.wcomms) - if worker.mail_flag(): + if self.wcomms[w].mail_flag(): new_stuff = True self._handle_msg_from_worker(persis_info, w) @@ -513,37 +487,38 @@ def _update_state_on_worker_msg(self, persis_info: dict, D_recv: dict, w: int) - calc_status = D_recv["calc_status"] Manager._check_received_calc(D_recv) - worker = _Worker(self.W, w, self.wcomms) - keep_state = D_recv["libE_info"].get("keep_state", False) - if w not in self.persis_pending and not worker.active_recv and not keep_state: - worker.active = 0 + if w not in self.persis_pending and not self.W[w]["active_recv"] and not keep_state: + self.W[w]["active"] = 0 if calc_status in [FINISHED_PERSISTENT_SIM_TAG, FINISHED_PERSISTENT_GEN_TAG]: final_data = D_recv.get("calc_out", None) if isinstance(final_data, np.ndarray): if calc_status is FINISHED_PERSISTENT_GEN_TAG and self.libE_specs.get("use_persis_return_gen", False): - self.hist.update_history_x_in(w, final_data, worker.gen_started_time) + self.hist.update_history_x_in(w, final_data, self.W[w]["gen_started_time"]) elif calc_status is FINISHED_PERSISTENT_SIM_TAG and self.libE_specs.get("use_persis_return_sim", False): self.hist.update_history_f(D_recv, self.kill_canceled_sims) else: logger.info(_PERSIS_RETURN_WARNING) - worker.update_persistent_state() + self.W[w]["persis_state"] = 0 + if self.W[w]["active_recv"]: + self.W[w]["active"] = 0 + self.W[w]["active_recv"] = 0 if w in self.persis_pending: self.persis_pending.remove(w) - worker.active = 0 + self.W[w]["active"] = 0 self._freeup_resources(w) else: if calc_type == EVAL_SIM_TAG: self.hist.update_history_f(D_recv, self.kill_canceled_sims) if calc_type == EVAL_GEN_TAG: - self.hist.update_history_x_in(w, D_recv["calc_out"], 
worker.gen_started_time) + self.hist.update_history_x_in(w, D_recv["calc_out"], self.W[w]["gen_started_time"]) assert ( - len(D_recv["calc_out"]) or np.any(self.W["active"]) or worker.persis_state + len(D_recv["calc_out"]) or np.any(self.W["active"]) or self.W[w]["persis_state"] ), "Gen must return work when is is the only thing active and not persistent." if "libE_info" in D_recv and "persistent" in D_recv["libE_info"]: # Now a waiting, persistent worker - worker.persis_state = calc_type + self.W[w]["persis_state"] = calc_type else: self._freeup_resources(w) @@ -552,15 +527,14 @@ def _update_state_on_worker_msg(self, persis_info: dict, D_recv: dict, w: int) - def _handle_msg_from_worker(self, persis_info: dict, w: int) -> None: """Handles a message from worker w""" - worker = _Worker(self.W, w, self.wcomms) try: - msg = worker.recv() + msg = self.wcomms[w].recv() tag, D_recv = msg except CommFinishedException: logger.debug(f"Finalizing message from Worker {w}") return if isinstance(D_recv, WorkerErrMsg): - worker.active = 0 + self.W[w]["active"] = 0 logger.debug(f"Manager received exception from worker {w}") if not self.WorkerExc: self.WorkerExc = True @@ -593,8 +567,7 @@ def _kill_cancelled_sims(self) -> None: kill_ids = self.hist.H["sim_id"][kill_sim_rows] kill_on_workers = self.hist.H["sim_worker"][kill_sim_rows] for w in kill_on_workers: - worker = _Worker(self.W, w, self.wcomms) - worker.send(STOP_TAG, MAN_SIGNAL_KILL) + self.wcomms[w].send(STOP_TAG, MAN_SIGNAL_KILL) self.hist.H["kill_sent"][kill_ids] = True # --- Handle termination @@ -611,7 +584,6 @@ def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int): # Send a handshake signal to each persistent worker. if any(self.W["persis_state"]): for w in self.W["worker_id"][self.W["persis_state"] > 0]: - worker = _Worker(self.W, w, self.wcomms) logger.debug(f"Manager sending PERSIS_STOP to worker {w}") if self.libE_specs.get("final_gen_send", False): rows_to_send = np.where(self.hist.H["sim_ended"] & ~self.hist.H["gen_informed"])[0] @@ -625,10 +597,10 @@ def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int): self._send_work_order(work, w) self.hist.update_history_to_gen(rows_to_send) else: - worker.send(PERSIS_STOP, MAN_SIGNAL_KILL) - if not worker.active: + self.wcomms[w].send(PERSIS_STOP, MAN_SIGNAL_KILL) + if not self.W[w]["active"]: # Re-activate if necessary - worker.active = worker.persis_state + self.W[w]["active"] = self.W[w]["persis_state"] self.persis_pending.append(w) exit_flag = 0 From d584152e6ffb1b441d89f8b6b676d6e76d365ea9 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 24 Jan 2024 12:03:42 -0600 Subject: [PATCH 22/76] add comment for why using self.W.iterable in "for wrk in self.W.iterable" --- libensemble/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/manager.py b/libensemble/manager.py index 3d0b926dc..068d60d60 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -256,7 +256,7 @@ def __init__( if self.resources is not None: gresource = self.resources.glob_resources self.scheduler_opts = gresource.update_scheduler_opts(self.scheduler_opts) - for wrk in self.W.iterable: + for wrk in self.W.iterable: # "for wrk in self.W" produces a key of 0 when not applicable if wrk["worker_id"] in gresource.zero_resource_workers: wrk["zero_resource_worker"] = True From 592c8c4d5f819b66582da7d5c7ce49cccd06e42b Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 24 Jan 2024 12:23:11 -0600 Subject: [PATCH 23/76] add __len__ and __iter__ to indexer --- 
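Note: a minimal sketch of the indexing semantics these additions round out
(illustrative only; the worker records are placeholders, and the class lives
in libensemble/manager.py at this point in the series, before a later patch
moves it to libensemble/utils/misc.py):

    import numpy as np

    from libensemble.manager import _WorkerIndexer

    W = np.zeros(3, dtype=[("worker_id", int), ("active", int)])
    W["worker_id"] = np.arange(3) + 1            # IDs [1, 2, 3]: no extra Worker 0
    widx = _WorkerIndexer(W, additional_worker=False)

    assert widx[1]["worker_id"] == 1             # integer keys are 1-based (maps to W[0])
    assert list(widx["worker_id"]) == [1, 2, 3]  # string keys pass straight through
    assert len(widx) == 3                        # __len__, added in this commit
    assert [w["worker_id"] for w in widx] == [1, 2, 3]  # __iter__, added in this commit

    W0 = np.zeros(3, dtype=[("worker_id", int), ("active", int)])
    W0["worker_id"] = np.arange(3)               # IDs [0, 1, 2]: with the extra Worker 0
    widx0 = _WorkerIndexer(W0, additional_worker=True)
    assert widx0[0]["worker_id"] == 0            # integer keys map directly
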
libensemble/manager.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/libensemble/manager.py b/libensemble/manager.py index 068d60d60..ae543d38a 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -166,6 +166,12 @@ def __getitem__(self, key): else: return self.iterable[key - 1] + def __len__(self): + return len(self.iterable) + + def __iter__(self): + return iter(self.iterable) + class Manager: """Manager class for libensemble.""" @@ -256,7 +262,7 @@ def __init__( if self.resources is not None: gresource = self.resources.glob_resources self.scheduler_opts = gresource.update_scheduler_opts(self.scheduler_opts) - for wrk in self.W.iterable: # "for wrk in self.W" produces a key of 0 when not applicable + for wrk in self.W: if wrk["worker_id"] in gresource.zero_resource_workers: wrk["zero_resource_worker"] = True From 59ca40a94f769c04830423d13edc455afa822bda Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 24 Jan 2024 12:32:28 -0600 Subject: [PATCH 24/76] add __setitem__ --- libensemble/manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libensemble/manager.py b/libensemble/manager.py index ae543d38a..8a28ce235 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -166,6 +166,9 @@ def __getitem__(self, key): else: return self.iterable[key - 1] + def __setitem__(self, key, value): + self.iterable[key] = value + def __len__(self): return len(self.iterable) From d8a3a4208ef0609040a84c5a6c4b4f8eb95a2250 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 24 Jan 2024 13:32:47 -0600 Subject: [PATCH 25/76] adjust alloc_support to not use w - 1 indexing --- libensemble/tools/alloc_support.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libensemble/tools/alloc_support.py b/libensemble/tools/alloc_support.py index ed1148411..a544477e7 100644 --- a/libensemble/tools/alloc_support.py +++ b/libensemble/tools/alloc_support.py @@ -201,7 +201,7 @@ def _update_rset_team(self, libE_info, wid, H=None, H_rows=None): """Add rset_team to libE_info.""" if self.manage_resources and not libE_info.get("rset_team"): num_rsets_req = 0 - if self.W[wid - 1]["persis_state"]: + if self.W[wid]["persis_state"]: # Even if empty list, non-None rset_team stops manager giving default resources libE_info["rset_team"] = [] return @@ -272,7 +272,7 @@ def gen_work(self, wid, H_fields, H_rows, persis_info, **libE_info): """ self._update_rset_team(libE_info, wid) - if not self.W[wid - 1]["persis_state"]: + if not self.W[wid]["persis_state"]: AllocSupport.gen_counter += 1 # Count total gens libE_info["gen_count"] = AllocSupport.gen_counter From 1839ff2952d6734b46a16755a16fa17818cbe826 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 24 Jan 2024 15:16:59 -0600 Subject: [PATCH 26/76] just pass in the iterable for now. 
resource changes coming in another branch --- libensemble/manager.py | 2 +- libensemble/tools/alloc_support.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libensemble/manager.py b/libensemble/manager.py index 8a28ce235..7c77b0c27 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -665,7 +665,7 @@ def _alloc_work(self, H: npt.NDArray, persis_info: dict) -> dict: alloc_f = self.alloc_specs["alloc_f"] output = alloc_f( - self.W, + self.W.iterable, H, self.sim_specs, self.gen_specs, diff --git a/libensemble/tools/alloc_support.py b/libensemble/tools/alloc_support.py index a544477e7..ed1148411 100644 --- a/libensemble/tools/alloc_support.py +++ b/libensemble/tools/alloc_support.py @@ -201,7 +201,7 @@ def _update_rset_team(self, libE_info, wid, H=None, H_rows=None): """Add rset_team to libE_info.""" if self.manage_resources and not libE_info.get("rset_team"): num_rsets_req = 0 - if self.W[wid]["persis_state"]: + if self.W[wid - 1]["persis_state"]: # Even if empty list, non-None rset_team stops manager giving default resources libE_info["rset_team"] = [] return @@ -272,7 +272,7 @@ def gen_work(self, wid, H_fields, H_rows, persis_info, **libE_info): """ self._update_rset_team(libE_info, wid) - if not self.W[wid]["persis_state"]: + if not self.W[wid - 1]["persis_state"]: AllocSupport.gen_counter += 1 # Count total gens libE_info["gen_count"] = AllocSupport.gen_counter From ad525bb9e8a042b497b466a67d6a59b4163a8b17 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 23 Feb 2024 14:42:18 -0600 Subject: [PATCH 27/76] add tentative gen_on_manager option, separate additional_worker_launch into function --- docs/data_structures/libE_specs.rst | 8 +++-- libensemble/manager.py | 47 +++++++++++++++-------------- libensemble/specs.py | 9 ++++-- 3 files changed, 38 insertions(+), 26 deletions(-) diff --git a/docs/data_structures/libE_specs.rst b/docs/data_structures/libE_specs.rst index 15646b1c3..6d5dd879e 100644 --- a/docs/data_structures/libE_specs.rst +++ b/docs/data_structures/libE_specs.rst @@ -30,8 +30,12 @@ libEnsemble is primarily customized by setting options within a ``LibeSpecs`` cl **nworkers** [int]: Number of worker processes in ``"local"``, ``"threads"``, or ``"tcp"``. - **manager_runs_additional_worker** [int] = False - Manager process can launch an additional threaded worker + **manager_runs_additional_worker** [bool] = False + Manager process launches an additional threaded Worker 0. + This worker can access/modify user objects by reference. + + **gen_on_manager** Optional[bool] = False + Enable ``manager_runs_additional_worker`` and reserve that worker for a single generator. **mpi_comm** [MPI communicator] = ``MPI.COMM_WORLD``: libEnsemble MPI communicator. 
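For orientation, a minimal calling-script sketch using the options documented
above (illustrative: sim_specs, gen_specs, alloc_specs, and exit_criteria are
assumed to be defined as in the functionality test updated earlier in this
series, and nworkers is a placeholder value):

    from libensemble.libE import libE
    from libensemble.tools import add_unique_random_streams

    nworkers = 4
    libE_specs = {
        "comms": "local",
        "nworkers": nworkers,
        "manager_runs_additional_worker": True,  # threaded Worker 0 on the manager
        # "gen_on_manager": True,                # reserve Worker 0 for the generator
    }
    persis_info = add_unique_random_streams({}, nworkers + 1)

    H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs)
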
diff --git a/libensemble/manager.py b/libensemble/manager.py index 6faff43f5..f944ce54c 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -187,6 +187,29 @@ class Manager: ("zero_resource_worker", bool), ] + def _run_additional_worker(self, hist, sim_specs, gen_specs, libE_specs): + dtypes = { + EVAL_SIM_TAG: repack_fields(hist.H[sim_specs["in"]]).dtype, + EVAL_GEN_TAG: repack_fields(hist.H[gen_specs["in"]]).dtype, + } + + self.W = np.zeros(len(self.wcomms) + 1, dtype=Manager.worker_dtype) + self.W["worker_id"] = np.arange(len(self.wcomms) + 1) + local_worker_comm = QCommThread( + worker_main, + len(self.wcomms), + sim_specs, + gen_specs, + libE_specs, + 0, + False, + Resources.resources, + Executor.executor, + ) + self.wcomms = [local_worker_comm] + self.wcomms + local_worker_comm.run() + local_worker_comm.send(0, dtypes) + def __init__( self, hist, @@ -232,28 +255,7 @@ def __init__( if self.libE_specs.get("manager_runs_additional_worker", False): # We start an additional Worker 0 on a thread. - - dtypes = { - EVAL_SIM_TAG: repack_fields(hist.H[sim_specs["in"]]).dtype, - EVAL_GEN_TAG: repack_fields(hist.H[gen_specs["in"]]).dtype, - } - - self.W = np.zeros(len(self.wcomms) + 1, dtype=Manager.worker_dtype) - self.W["worker_id"] = np.arange(len(self.wcomms) + 1) - local_worker_comm = QCommThread( - worker_main, - len(self.wcomms), - sim_specs, - gen_specs, - libE_specs, - 0, - False, - Resources.resources, - Executor.executor, - ) - self.wcomms = [local_worker_comm] + self.wcomms - local_worker_comm.run() - local_worker_comm.send(0, dtypes) + self._run_additional_worker(hist, sim_specs, gen_specs, libE_specs) self.W = _WorkerIndexer(self.W, self.libE_specs.get("manager_runs_additional_worker", False)) self.wcomms = _WorkerIndexer(self.wcomms, self.libE_specs.get("manager_runs_additional_worker", False)) @@ -637,6 +639,7 @@ def _get_alloc_libE_info(self) -> dict: "gen_num_procs": self.gen_num_procs, "gen_num_gpus": self.gen_num_gpus, "manager_additional_worker": self.libE_specs.get("manager_runs_additional_worker", False), + "gen_on_manager": self.libE_specs.get("gen_on_manager", False), } def _alloc_work(self, H: npt.NDArray, persis_info: dict) -> dict: diff --git a/libensemble/specs.py b/libensemble/specs.py index e796aee46..5c7990867 100644 --- a/libensemble/specs.py +++ b/libensemble/specs.py @@ -172,8 +172,13 @@ class LibeSpecs(BaseModel): nworkers: Optional[int] = 0 """ Number of worker processes in ``"local"``, ``"threads"``, or ``"tcp"``.""" - manager_runs_additional_worker: Optional[int] = False - """ Manager process can launch an additional threaded worker """ + manager_runs_additional_worker: Optional[bool] = False + """ Manager process launches an additional threaded Worker 0. + This worker can access/modify user objects by reference. + """ + + gen_on_manager: Optional[bool] = False + """ Enable ``manager_runs_additional_worker`` and reserve that worker for a single generator. """ mpi_comm: Optional[Any] = None """ libEnsemble MPI communicator. 
Default: ``MPI.COMM_WORLD``""" From fe64869b659f0a844d07f3305517d2c698f21ddb Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 26 Feb 2024 11:07:24 -0600 Subject: [PATCH 28/76] various refactors based on PR suggestions, then manager-refactors based on tracking worker_type as EVAL_SIM/EVAL_GEN_TAG, and active/persistent/active_recv as bools --- .../alloc_funcs/start_only_persistent.py | 5 +- libensemble/comms/comms.py | 15 ++-- libensemble/manager.py | 69 ++++++++++--------- libensemble/tools/alloc_support.py | 48 +++++++++---- libensemble/utils/runners.py | 8 +-- 5 files changed, 83 insertions(+), 62 deletions(-) diff --git a/libensemble/alloc_funcs/start_only_persistent.py b/libensemble/alloc_funcs/start_only_persistent.py index ee9d4105f..17784be35 100644 --- a/libensemble/alloc_funcs/start_only_persistent.py +++ b/libensemble/alloc_funcs/start_only_persistent.py @@ -1,6 +1,6 @@ import numpy as np -from libensemble.message_numbers import EVAL_GEN_TAG, EVAL_SIM_TAG +from libensemble.message_numbers import EVAL_SIM_TAG from libensemble.tools.alloc_support import AllocSupport, InsufficientFreeResources @@ -51,7 +51,6 @@ def only_persistent_gens(W, H, sim_specs, gen_specs, alloc_specs, persis_info, l if libE_info["sim_max_given"] or not libE_info["any_idle_workers"]: return {}, persis_info - # Initialize alloc_specs["user"] as user. user = alloc_specs.get("user", {}) manage_resources = libE_info["use_resource_sets"] @@ -71,7 +70,7 @@ def only_persistent_gens(W, H, sim_specs, gen_specs, alloc_specs, persis_info, l return Work, persis_info, 1 # Give evaluated results back to a running persistent gen - for wid in support.avail_worker_ids(persistent=EVAL_GEN_TAG, active_recv=active_recv_gen): + for wid in support.avail_gen_worker_ids(persistent=True, active_recv=active_recv_gen): gen_inds = H["gen_worker"] == wid returned_but_not_given = np.logical_and.reduce((H["sim_ended"], ~H["gen_informed"], gen_inds)) if np.any(returned_but_not_given): diff --git a/libensemble/comms/comms.py b/libensemble/comms/comms.py index 70458dd98..bebca9344 100644 --- a/libensemble/comms/comms.py +++ b/libensemble/comms/comms.py @@ -150,7 +150,6 @@ def __init__(self, main, *args, **kwargs): self._result = None self._exception = None self._done = False - self._ufunc = kwargs.get("ufunc", False) def _is_result_msg(self, msg): """Return true if message indicates final result (and set result/except).""" @@ -209,13 +208,13 @@ def result(self, timeout=None): return self._result @staticmethod - def _qcomm_main(comm, main, *fargs, **kwargs): + def _qcomm_main(comm, main, *args, **kwargs): """Main routine -- handles return values and exceptions.""" try: - if not kwargs.get("ufunc"): - _result = main(comm, *fargs, **kwargs) + if not kwargs.get("user_function"): + _result = main(comm, *args, **kwargs) else: - _result = main(*fargs) + _result = main(*args) comm.send(CommResult(_result)) except Exception as e: comm.send(CommResultErr(str(e), format_exc())) @@ -237,12 +236,12 @@ def __exit__(self, etype, value, traceback): class QCommThread(QCommLocal): """Launch a user function in a thread with an attached QComm.""" - def __init__(self, main, nworkers, *fargs, **kwargs): + def __init__(self, main, nworkers, *args, **kwargs): self.inbox = thread_queue.Queue() self.outbox = thread_queue.Queue() - super().__init__(self, main, *fargs, **kwargs) + super().__init__(self, main, *args, **kwargs) comm = QComm(self.inbox, self.outbox, nworkers) - self.handle = Thread(target=QCommThread._qcomm_main, args=(comm, main) + fargs, kwargs=kwargs) + 
self.handle = Thread(target=QCommThread._qcomm_main, args=(comm, main) + args, kwargs=kwargs) def terminate(self, timeout=None): """Terminate the thread. diff --git a/libensemble/manager.py b/libensemble/manager.py index f944ce54c..bd7a6d4ea 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -180,9 +180,10 @@ class Manager: worker_dtype = [ ("worker_id", int), - ("active", int), - ("persis_state", int), - ("active_recv", int), + ("worker_type", int), + ("active", bool), + ("persistent", bool), + ("active_recv", bool), ("gen_started_time", float), ("zero_resource_worker", bool), ] @@ -192,9 +193,6 @@ def _run_additional_worker(self, hist, sim_specs, gen_specs, libE_specs): EVAL_SIM_TAG: repack_fields(hist.H[sim_specs["in"]]).dtype, EVAL_GEN_TAG: repack_fields(hist.H[gen_specs["in"]]).dtype, } - - self.W = np.zeros(len(self.wcomms) + 1, dtype=Manager.worker_dtype) - self.W["worker_id"] = np.arange(len(self.wcomms) + 1) local_worker_comm = QCommThread( worker_main, len(self.wcomms), @@ -206,9 +204,9 @@ def _run_additional_worker(self, hist, sim_specs, gen_specs, libE_specs): Resources.resources, Executor.executor, ) - self.wcomms = [local_worker_comm] + self.wcomms local_worker_comm.run() local_worker_comm.send(0, dtypes) + return local_worker_comm def __init__( self, @@ -244,8 +242,6 @@ def __init__( self.gen_num_procs = libE_specs.get("gen_num_procs", 0) self.gen_num_gpus = libE_specs.get("gen_num_gpus", 0) - self.W = np.zeros(len(self.wcomms), dtype=Manager.worker_dtype) - self.W["worker_id"] = np.arange(len(self.wcomms)) + 1 self.term_tests = [ (2, "wallclock_max", self.term_test_wallclock), (1, "sim_max", self.term_test_sim_max), @@ -253,12 +249,18 @@ def __init__( (1, "stop_val", self.term_test_stop_val), ] - if self.libE_specs.get("manager_runs_additional_worker", False): - # We start an additional Worker 0 on a thread. - self._run_additional_worker(hist, sim_specs, gen_specs, libE_specs) + additional_worker = self.libE_specs.get("manager_runs_additional_worker", False) + + self.W = np.zeros(len(self.wcomms) + additional_worker, dtype=Manager.worker_dtype) + if additional_worker: + self.W["worker_id"] = np.arange(len(self.wcomms) + 1) # [0, 1, 2, ...] + local_worker_comm = self._run_additional_worker(hist, sim_specs, gen_specs, libE_specs) + self.wcomms = [local_worker_comm] + self.wcomms + else: + self.W["worker_id"] = np.arange(len(self.wcomms)) + 1 # [1, 2, 3, ...] - self.W = _WorkerIndexer(self.W, self.libE_specs.get("manager_runs_additional_worker", False)) - self.wcomms = _WorkerIndexer(self.wcomms, self.libE_specs.get("manager_runs_additional_worker", False)) + self.W = _WorkerIndexer(self.W, additional_worker) + self.wcomms = _WorkerIndexer(self.wcomms, additional_worker) temp_EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) self.resources = Resources.resources @@ -379,7 +381,7 @@ def _check_work_order(self, Work: dict, w: int, force: bool = False) -> None: ) else: if not force: - assert self.W[w]["active"] == 0, ( + assert not self.W[w]["active"], ( "Allocation function requested work be sent to worker %d, an already active worker." 
% w
                 )
         work_rows = Work["libE_info"]["H_rows"]
@@ -443,11 +445,12 @@ def _send_work_order(self, Work: dict, w: int) -> None:

     def _update_state_on_alloc(self, Work: dict, w: int):
         """Updates a worker's active/idle status following an allocation order"""

-        self.W[w]["active"] = Work["tag"]
+        self.W[w]["active"] = True
+        self.W[w]["worker_type"] = Work["tag"]
         if "persistent" in Work["libE_info"]:
-            self.W[w]["persis_state"] = Work["tag"]
+            self.W[w]["persistent"] = True
             if Work["libE_info"].get("active_recv", False):
-                self.W[w]["active_recv"] = Work["tag"]
+                self.W[w]["active_recv"] = True
         else:
             assert "active_recv" not in Work["libE_info"], "active_recv worker must also be persistent"
@@ -484,7 +487,7 @@ def _update_state_on_worker_msg(self, persis_info: dict, D_recv: dict, w: int) -
         keep_state = D_recv["libE_info"].get("keep_state", False)

         if w not in self.persis_pending and not self.W[w]["active_recv"] and not keep_state:
-            self.W[w]["active"] = 0
+            self.W[w]["active"] = False

         if calc_status in [FINISHED_PERSISTENT_SIM_TAG, FINISHED_PERSISTENT_GEN_TAG]:
             final_data = D_recv.get("calc_out", None)
             if isinstance(final_data, np.ndarray):
@@ -495,13 +498,13 @@ def _update_state_on_worker_msg(self, persis_info: dict, D_recv: dict, w: int) -
                     self.hist.update_history_f(D_recv, self.kill_canceled_sims)
                 else:
                     logger.info(_PERSIS_RETURN_WARNING)
-            self.W[w]["persis_state"] = 0
+            self.W[w]["persistent"] = False
             if self.W[w]["active_recv"]:
-                self.W[w]["active"] = 0
-                self.W[w]["active_recv"] = 0
+                self.W[w]["active"] = False
+                self.W[w]["active_recv"] = False
             if w in self.persis_pending:
                 self.persis_pending.remove(w)
-                self.W[w]["active"] = 0
+                self.W[w]["active"] = False
             self._freeup_resources(w)
         else:
             if calc_type == EVAL_SIM_TAG:
                 self.hist.update_history_f(D_recv, self.kill_canceled_sims)
             if calc_type == EVAL_GEN_TAG:
                 self.hist.update_history_x_in(w, D_recv["calc_out"], self.W[w]["gen_started_time"])
                 assert (
-                    len(D_recv["calc_out"]) or np.any(self.W["active"]) or self.W[w]["persis_state"]
+                    len(D_recv["calc_out"]) or np.any(self.W["active"]) or self.W[w]["persistent"]
                 ), "Gen must return work when it is the only thing active and not persistent."
             if "libE_info" in D_recv and "persistent" in D_recv["libE_info"]:
                 # Now a waiting, persistent worker
-                self.W[w]["persis_state"] = calc_type
+                self.W[w]["persistent"] = True
             else:
                 self._freeup_resources(w)
@@ -529,7 +532,7 @@ def _handle_msg_from_worker(self, persis_info: dict, w: int) -> None:
         try:
             msg = self.wcomms[w].recv()
             tag, D_recv = msg
         except CommFinishedException:
             logger.debug(f"Finalizing message from Worker {w}")
             return
         if isinstance(D_recv, WorkerErrMsg):
-            self.W[w]["active"] = 0
+            self.W[w]["active"] = False
             logger.debug(f"Manager received exception from worker {w}")
             if not self.WorkerExc:
                 self.WorkerExc = True
@@ -577,8 +580,8 @@ def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int):
         """

         # Send a handshake signal to each persistent worker.
- if any(self.W["persis_state"]): - for w in self.W["worker_id"][self.W["persis_state"] > 0]: + if any(self.W["persistent"]): + for w in self.W["worker_id"][self.W["persistent"]]: logger.debug(f"Manager sending PERSIS_STOP to worker {w}") if self.libE_specs.get("final_gen_send", False): rows_to_send = np.where(self.hist.H["sim_ended"] & ~self.hist.H["gen_informed"])[0] @@ -595,15 +598,15 @@ def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int): self.wcomms[w].send(PERSIS_STOP, MAN_SIGNAL_KILL) if not self.W[w]["active"]: # Re-activate if necessary - self.W[w]["active"] = self.W[w]["persis_state"] + self.W[w]["active"] = self.W[w]["persistent"] self.persis_pending.append(w) exit_flag = 0 - while (any(self.W["active"]) or any(self.W["persis_state"])) and exit_flag == 0: + while (any(self.W["active"]) or any(self.W["persistent"])) and exit_flag == 0: persis_info = self._receive_from_workers(persis_info) if self.term_test(logged=False) == 2: # Elapsed Wallclock has expired - if not any(self.W["persis_state"]): + if not any(self.W["persistent"]): if any(self.W["active"]): logger.manager_warning(_WALLCLOCK_MSG_ACTIVE) else: @@ -626,7 +629,7 @@ def _get_alloc_libE_info(self) -> dict: """Selected statistics useful for alloc_f""" return { - "any_idle_workers": any(self.W["active"] == 0), + "any_idle_workers": any(~self.W["active"]), "exit_criteria": self.exit_criteria, "elapsed_time": self.elapsed(), "gen_informed_count": self.hist.gen_informed_count, @@ -697,7 +700,7 @@ def run(self, persis_info: dict) -> (dict, int, int): self._send_work_order(Work[w], w) self._update_state_on_alloc(Work[w], w) assert self.term_test() or any( - self.W["active"] != 0 + self.W["active"] ), "alloc_f did not return any work, although all workers are idle." except WorkerException as e: report_worker_exc(e) diff --git a/libensemble/tools/alloc_support.py b/libensemble/tools/alloc_support.py index d1d8ac802..21d46b1b0 100644 --- a/libensemble/tools/alloc_support.py +++ b/libensemble/tools/alloc_support.py @@ -87,29 +87,25 @@ def assign_resources(self, rsets_req, use_gpus=None, user_params=[]): rset_team = self.sched.assign_resources(rsets_req, use_gpus, user_params) return rset_team - def avail_worker_ids(self, persistent=None, active_recv=False, zero_resource_workers=None): + def avail_worker_ids(self, persistent=False, active_recv=False, zero_resource_workers=None, worker_type=None): """Returns available workers as a list of IDs, filtered by the given options. :param persistent: (Optional) Int. Only return workers with given ``persis_state`` (1=sim, 2=gen). :param active_recv: (Optional) Boolean. Only return workers with given active_recv state. :param zero_resource_workers: (Optional) Boolean. Only return workers that require no resources. + :param worker_type: (Optional) Int. Only return workers with given ``worker_type`` (1=sim, 2=gen). :returns: List of worker IDs. If there are no zero resource workers defined, then the ``zero_resource_workers`` argument will be ignored. """ - def fltr(wrk, field, option): - """Filter by condition if supplied""" - if option is None: - return True - return wrk[field] == option - # For abbrev. 
def fltr_persis():
-            if persistent is None:
+            if persistent:
+                return wrk["persistent"]
+            else:
                 return True
-            return wrk["persis_state"] == persistent

         def fltr_zrw():
             # If none exist or you did not ask for zrw then return True
@@ -123,6 +119,12 @@ def fltr_recving():
             else:
                 return not wrk["active"]

+        def fltr_worker_type():
+            if worker_type:
+                return wrk["worker_type"] == worker_type
+            else:
+                return True
+
         if active_recv and not persistent:
             raise AllocException("Cannot ask for non-persistent active receive workers")

@@ -130,13 +132,31 @@ def fltr_recving():
         no_zrw = not any(self.W["zero_resource_worker"])
         wrks = []
         for wrk in self.W:
-            if fltr_recving() and fltr_persis() and fltr_zrw():
+            if fltr_recving() and fltr_persis() and fltr_zrw() and fltr_worker_type():
                 wrks.append(wrk["worker_id"])
         return wrks

+    def avail_gen_worker_ids(self, persistent=False, active_recv=False, zero_resource_workers=None):
+        """Returns available generator workers as a list of IDs."""
+        return self.avail_worker_ids(
+            persistent=persistent,
+            active_recv=active_recv,
+            zero_resource_workers=zero_resource_workers,
+            worker_type=EVAL_GEN_TAG,
+        )
+
+    def avail_sim_worker_ids(self, persistent=False, active_recv=False, zero_resource_workers=None):
+        """Returns available simulation workers as a list of IDs."""
+        return self.avail_worker_ids(
+            persistent=persistent,
+            active_recv=active_recv,
+            zero_resource_workers=zero_resource_workers,
+            worker_type=EVAL_SIM_TAG,
+        )
+
     def count_gens(self):
         """Returns the number of active generators."""
-        return sum(self.W["active"] == EVAL_GEN_TAG)
+        return sum(self.W["active"] & (self.W["worker_type"] == EVAL_GEN_TAG))

     def test_any_gen(self):
         """Returns ``True`` if a generator worker is active."""
@@ -144,7 +164,7 @@ def test_any_gen(self):

     def count_persis_gens(self):
         """Return the number of active persistent generators."""
-        return sum(self.W["persis_state"] == EVAL_GEN_TAG)
+        return sum(self.W["persistent"] == EVAL_GEN_TAG)

     def _req_resources_sim(self, libE_info, user_params, H, H_rows):
         """Determine required resources for a sim work unit"""
@@ -201,7 +221,7 @@ def _update_rset_team(self, libE_info, wid, H=None, H_rows=None):
         """Add rset_team to libE_info."""
         if self.manage_resources and not libE_info.get("rset_team"):
             num_rsets_req = 0
-            if self.W[wid - 1]["persis_state"]:
+            if self.W[wid - 1]["persistent"]:
                 # Even if empty list, non-None rset_team stops manager giving default resources
                 libE_info["rset_team"] = []
                 return
@@ -272,7 +292,7 @@ def gen_work(self, wid, H_fields, H_rows, persis_info, **libE_info):
         """
         self._update_rset_team(libE_info, wid)

-        if not self.W[wid - 1]["persis_state"]:
+        if not self.W[wid - 1]["persistent"]:
             AllocSupport.gen_counter += 1  # Count total gens
             libE_info["gen_count"] = AllocSupport.gen_counter

diff --git a/libensemble/utils/runners.py b/libensemble/utils/runners.py
index 0ea9ce1e7..629c733b1 100644
--- a/libensemble/utils/runners.py
+++ b/libensemble/utils/runners.py
@@ -62,8 +62,8 @@ def _result(self, calc_in: npt.NDArray, persis_info: dict, libE_info: dict) -> (
         libE_info["comm"] = None  # 'comm' object not pickle-able
         Worker._set_executor(0, None)  # ditto for executor

-        fargs = self._truncate_args(calc_in, persis_info, libE_info)
-        task_fut = self.globus_compute_executor.submit_to_registered_function(self.globus_compute_fid, fargs)
+        args = self._truncate_args(calc_in, persis_info, libE_info)
+        task_fut = self.globus_compute_executor.submit_to_registered_function(self.globus_compute_fid, args)
         return task_fut.result()

     def shutdown(self) -> 
None: @@ -76,8 +76,8 @@ def __init__(self, specs): self.thread_handle = None def _result(self, calc_in: npt.NDArray, persis_info: dict, libE_info: dict) -> (npt.NDArray, dict, Optional[int]): - fargs = self._truncate_args(calc_in, persis_info, libE_info) - self.thread_handle = QCommThread(self.f, None, *fargs, ufunc=True) + args = self._truncate_args(calc_in, persis_info, libE_info) + self.thread_handle = QCommThread(self.f, None, *args, user_function=True) self.thread_handle.run() return self.thread_handle.result() From dcf6db76e728b45b602795259cfb536399552c23 Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 26 Feb 2024 12:24:17 -0600 Subject: [PATCH 29/76] fix persistent filter, update avail/running gens counters --- libensemble/tools/alloc_support.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/libensemble/tools/alloc_support.py b/libensemble/tools/alloc_support.py index 21d46b1b0..5f223df52 100644 --- a/libensemble/tools/alloc_support.py +++ b/libensemble/tools/alloc_support.py @@ -102,10 +102,7 @@ def avail_worker_ids(self, persistent=False, active_recv=False, zero_resource_wo # For abbrev. def fltr_persis(): - if persistent: - return wrk["persistent"] - else: - return True + return wrk["persistent"] == persistent def fltr_zrw(): # If none exist or you did not ask for zrw then return True @@ -160,11 +157,11 @@ def count_gens(self): def test_any_gen(self): """Returns ``True`` if a generator worker is active.""" - return any(self.W["active"] == EVAL_GEN_TAG) + return any(self.W["active"] & self.W["worker_type"] == EVAL_GEN_TAG) def count_persis_gens(self): """Return the number of active persistent generators.""" - return sum(self.W["persistent"] == EVAL_GEN_TAG) + return sum((self.W["persistent"]) & (self.W["worker_type"] == EVAL_GEN_TAG)) def _req_resources_sim(self, libE_info, user_params, H, H_rows): """Determine required resources for a sim work unit""" From ba059004ae27640640c7771f109aa808f66bbf0a Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 26 Feb 2024 13:52:02 -0600 Subject: [PATCH 30/76] update unit test, bugfix --- .../test_allocation_funcs_and_support.py | 40 ++++++++----------- libensemble/tools/alloc_support.py | 4 +- 2 files changed, 19 insertions(+), 25 deletions(-) diff --git a/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py b/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py index 631c0a60b..8f5959ce9 100644 --- a/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py +++ b/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py @@ -17,12 +17,13 @@ H0 = [] W = np.array( - [(1, 0, 0, 0, False), (2, 0, 0, 0, False), (3, 0, 0, 0, False), (4, 0, 0, 0, False)], + [(1, 0, 0, 0, 0, False), (2, 0, 0, 0, 0, False), (3, 0, 0, 0, 0, False), (4, 0, 0, 0, 0, False)], dtype=[ ("worker_id", " Date: Mon, 26 Feb 2024 13:58:33 -0600 Subject: [PATCH 31/76] update persistent allocs, but also add backwards-compatibility check in avail_worker_ids --- libensemble/alloc_funcs/inverse_bayes_allocf.py | 3 +-- libensemble/alloc_funcs/persistent_aposmm_alloc.py | 3 +-- libensemble/alloc_funcs/start_fd_persistent.py | 3 +-- libensemble/alloc_funcs/start_persistent_local_opt_gens.py | 2 +- libensemble/tools/alloc_support.py | 3 +++ 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/libensemble/alloc_funcs/inverse_bayes_allocf.py b/libensemble/alloc_funcs/inverse_bayes_allocf.py index 56a3f6e79..dcc1e13d7 100644 --- a/libensemble/alloc_funcs/inverse_bayes_allocf.py +++ 
b/libensemble/alloc_funcs/inverse_bayes_allocf.py
@@ -1,6 +1,5 @@
 import numpy as np

-from libensemble.message_numbers import EVAL_GEN_TAG
 from libensemble.tools.alloc_support import AllocSupport, InsufficientFreeResources
@@ -25,7 +24,7 @@ def only_persistent_gens_for_inverse_bayes(W, H, sim_specs, gen_specs, alloc_spe
     # If wid is idle, but in persistent mode, and generated work has all returned
     # give output back to wid. Otherwise, give nothing to wid
-    for wid in support.avail_worker_ids(persistent=EVAL_GEN_TAG):
+    for wid in support.avail_gen_worker_ids(persistent=True):
         # if > 1 persistent generator, assign the correct work to it
         inds_generated_by_wid = H["gen_worker"] == wid
         if support.all_sim_ended(H, inds_generated_by_wid):
diff --git a/libensemble/alloc_funcs/persistent_aposmm_alloc.py b/libensemble/alloc_funcs/persistent_aposmm_alloc.py
index 8327d3975..47b584309 100644
--- a/libensemble/alloc_funcs/persistent_aposmm_alloc.py
+++ b/libensemble/alloc_funcs/persistent_aposmm_alloc.py
@@ -1,6 +1,5 @@
 import numpy as np

-from libensemble.message_numbers import EVAL_GEN_TAG
 from libensemble.tools.alloc_support import AllocSupport, InsufficientFreeResources
@@ -40,7 +39,7 @@ def persistent_aposmm_alloc(W, H, sim_specs, gen_specs, alloc_specs, persis_info
         return Work, persis_info, 1

     # If any persistent worker's calculated values have returned, give them back.
-    for wid in support.avail_worker_ids(persistent=EVAL_GEN_TAG):
+    for wid in support.avail_gen_worker_ids(persistent=True):
         if persis_info.get("sample_done") or sum(H["sim_ended"]) >= init_sample_size + persis_info["samples_in_H0"]:
             # Don't return if the initial sample is not complete
             persis_info["sample_done"] = True
diff --git a/libensemble/alloc_funcs/start_fd_persistent.py b/libensemble/alloc_funcs/start_fd_persistent.py
index 0c2e939d3..33af61765 100644
--- a/libensemble/alloc_funcs/start_fd_persistent.py
+++ b/libensemble/alloc_funcs/start_fd_persistent.py
@@ -1,6 +1,5 @@
 import numpy as np

-from libensemble.message_numbers import EVAL_GEN_TAG
 from libensemble.tools.alloc_support import AllocSupport, InsufficientFreeResources
@@ -30,7 +29,7 @@ def finite_diff_alloc(W, H, sim_specs, gen_specs, alloc_specs, persis_info, libE
     # If wid is in persistent mode, and all of its calculated values have
     # returned, give them back to wid. Otherwise, give nothing to wid
-    for wid in support.avail_worker_ids(persistent=EVAL_GEN_TAG):
+    for wid in support.avail_gen_worker_ids(persistent=True):
         # What (x_ind, f_ind) pairs have all of the evaluation of all n_ind
         # values complete.
         inds_not_sent_back = ~H["gen_informed"]
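Note: the shim added to avail_worker_ids below keeps older allocation
functions working unchanged; a rough equivalence sketch, assuming an
AllocSupport instance `support` constructed as in the alloc functions
in this patch:

    from libensemble.message_numbers import EVAL_GEN_TAG

    # Old-style call, routed through the backwards-compatibility shim:
    gens_old = support.avail_worker_ids(persistent=EVAL_GEN_TAG)
    # New-style equivalent introduced by this series:
    gens_new = support.avail_gen_worker_ids(persistent=True)
    assert gens_old == gens_new
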
Otherwise, give nothing to wid - for wid in support.avail_worker_ids(persistent=EVAL_GEN_TAG): + for wid in support.avail_gen_worker_ids(persistent=True): gen_inds = H["gen_worker"] == wid if support.all_sim_ended(H, gen_inds): last_time_pos = np.argmax(H["sim_started_time"][gen_inds]) diff --git a/libensemble/tools/alloc_support.py b/libensemble/tools/alloc_support.py index d5e4a7125..7e1871fe9 100644 --- a/libensemble/tools/alloc_support.py +++ b/libensemble/tools/alloc_support.py @@ -100,6 +100,9 @@ def avail_worker_ids(self, persistent=False, active_recv=False, zero_resource_wo be ignored. """ + if persistent == EVAL_GEN_TAG: # backwards compatibility + return self.avail_gen_worker_ids(persistent, active_recv, zero_resource_workers) + # For abbrev. def fltr_persis(): return wrk["persistent"] == persistent From 3d06b1c3d896d5c4db5542d769c0d4e405f690c5 Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 26 Feb 2024 14:16:53 -0600 Subject: [PATCH 32/76] fix persistent sim test --- libensemble/alloc_funcs/start_only_persistent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/alloc_funcs/start_only_persistent.py b/libensemble/alloc_funcs/start_only_persistent.py index 17784be35..870973dc4 100644 --- a/libensemble/alloc_funcs/start_only_persistent.py +++ b/libensemble/alloc_funcs/start_only_persistent.py @@ -92,7 +92,7 @@ def only_persistent_gens(W, H, sim_specs, gen_specs, alloc_specs, persis_info, l if user.get("alt_type"): avail_workers = list( set(support.avail_worker_ids(persistent=False, zero_resource_workers=False)) - | set(support.avail_worker_ids(persistent=EVAL_SIM_TAG, zero_resource_workers=False)) + | set(support.avail_worker_ids(persistent=True, zero_resource_workers=False, worker_type=EVAL_SIM_TAG)) ) for wid in avail_workers: if not np.any(points_to_evaluate): From 9165d7df49c6a2a004dfd62ca079b96b91cb15da Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 26 Feb 2024 15:35:40 -0600 Subject: [PATCH 33/76] move _WorkerIndexer into libensemble.utils, also use within PersistentSupport --- libensemble/manager.py | 23 +---------------------- libensemble/tools/alloc_support.py | 8 ++++---- libensemble/utils/misc.py | 21 +++++++++++++++++++++ 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/libensemble/manager.py b/libensemble/manager.py index bd7a6d4ea..888958608 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -34,7 +34,7 @@ from libensemble.resources.resources import Resources from libensemble.tools.fields_keys import protected_libE_fields from libensemble.tools.tools import _PERSIS_RETURN_WARNING, _USER_CALC_DIR_WARNING -from libensemble.utils.misc import extract_H_ranges +from libensemble.utils.misc import _WorkerIndexer, extract_H_ranges from libensemble.utils.output_directory import EnsembleDirectory from libensemble.utils.timer import Timer from libensemble.worker import WorkerErrMsg, worker_main @@ -154,27 +154,6 @@ def filter_nans(array: npt.NDArray) -> npt.NDArray: """ -class _WorkerIndexer: - def __init__(self, iterable: list, additional_worker=False): - self.iterable = iterable - self.additional_worker = additional_worker - - def __getitem__(self, key): - if self.additional_worker or isinstance(key, str): - return self.iterable[key] - else: - return self.iterable[key - 1] - - def __setitem__(self, key, value): - self.iterable[key] = value - - def __len__(self): - return len(self.iterable) - - def __iter__(self): - return iter(self.iterable) - - class Manager: """Manager class for libensemble.""" diff --git 
a/libensemble/tools/alloc_support.py b/libensemble/tools/alloc_support.py index 7e1871fe9..b8d9e98ce 100644 --- a/libensemble/tools/alloc_support.py +++ b/libensemble/tools/alloc_support.py @@ -5,7 +5,7 @@ from libensemble.message_numbers import EVAL_GEN_TAG, EVAL_SIM_TAG from libensemble.resources.resources import Resources from libensemble.resources.scheduler import InsufficientFreeResources, InsufficientResourcesError, ResourceScheduler -from libensemble.utils.misc import extract_H_ranges +from libensemble.utils.misc import _WorkerIndexer, extract_H_ranges logger = logging.getLogger(__name__) # For debug messages - uncomment @@ -47,7 +47,7 @@ def __init__( :param user_resources: (Optional) A user supplied ``resources`` object. :param user_scheduler: (Optional) A user supplied ``user_scheduler`` object. """ - self.W = W + self.W = _WorkerIndexer(W, libE_info.get("manager_runs_additional_worker", False)) self.persis_info = persis_info self.manage_resources = manage_resources self.resources = user_resources or Resources.resources @@ -221,7 +221,7 @@ def _update_rset_team(self, libE_info, wid, H=None, H_rows=None): """Add rset_team to libE_info.""" if self.manage_resources and not libE_info.get("rset_team"): num_rsets_req = 0 - if self.W[wid - 1]["persistent"]: + if self.W[wid]["persistent"]: # Even if empty list, non-None rset_team stops manager giving default resources libE_info["rset_team"] = [] return @@ -292,7 +292,7 @@ def gen_work(self, wid, H_fields, H_rows, persis_info, **libE_info): """ self._update_rset_team(libE_info, wid) - if not self.W[wid - 1]["persistent"]: + if not self.W[wid]["persistent"]: AllocSupport.gen_counter += 1 # Count total gens libE_info["gen_count"] = AllocSupport.gen_counter diff --git a/libensemble/utils/misc.py b/libensemble/utils/misc.py index 76e4ccaf2..ca67095ac 100644 --- a/libensemble/utils/misc.py +++ b/libensemble/utils/misc.py @@ -33,6 +33,27 @@ def extract_H_ranges(Work: dict) -> str: return "_".join(ranges) +class _WorkerIndexer: + def __init__(self, iterable: list, additional_worker=False): + self.iterable = iterable + self.additional_worker = additional_worker + + def __getitem__(self, key): + if self.additional_worker or isinstance(key, str): + return self.iterable[key] + else: + return self.iterable[key - 1] + + def __setitem__(self, key, value): + self.iterable[key] = value + + def __len__(self): + return len(self.iterable) + + def __iter__(self): + return iter(self.iterable) + + def specs_dump(specs, **kwargs): if pydanticV1: return specs.dict(**kwargs) From f7ba2057f2ade7f09bf59a6abf7ced1814699e6a Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 26 Feb 2024 16:49:51 -0600 Subject: [PATCH 34/76] manager also needs to send workflow_dir location to worker 0 --- libensemble/manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libensemble/manager.py b/libensemble/manager.py index 888958608..ab430decb 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -185,6 +185,8 @@ def _run_additional_worker(self, hist, sim_specs, gen_specs, libE_specs): ) local_worker_comm.run() local_worker_comm.send(0, dtypes) + if libE_specs.get("use_workflow_dir"): + local_worker_comm.send(0, libE_specs.get("workflow_dir_path")) return local_worker_comm def __init__( From 376e4506755d9b4d266975feb45412f7f6a3959f Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 27 Feb 2024 08:56:01 -0600 Subject: [PATCH 35/76] missed an alloc --- libensemble/alloc_funcs/start_persistent_local_opt_gens.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff 
--git a/libensemble/alloc_funcs/start_persistent_local_opt_gens.py b/libensemble/alloc_funcs/start_persistent_local_opt_gens.py index ac01db407..1a16ea817 100644 --- a/libensemble/alloc_funcs/start_persistent_local_opt_gens.py +++ b/libensemble/alloc_funcs/start_persistent_local_opt_gens.py @@ -90,7 +90,9 @@ def start_persistent_local_opt_gens(W, H, sim_specs, gen_specs, alloc_specs, per break points_to_evaluate[sim_ids_to_send] = False - elif gen_count == 0 and not np.any(np.logical_and(W["active"] == EVAL_GEN_TAG, W["persis_state"] == 0)): + elif gen_count == 0 and not np.any( + np.logical_and((W["active"]), (W["persistent"] is False), (W["worker_type"] == EVAL_GEN_TAG)) + ): # Finally, generate points since there is nothing else to do (no resource sets req.) Work[wid] = support.gen_work(wid, gen_specs.get("in", []), [], persis_info[wid], rset_team=[]) gen_count += 1 From 63750588ac1a0921b7242258b0021732a1b53476 Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 27 Feb 2024 12:20:14 -0600 Subject: [PATCH 36/76] make alloc_f's libE_info additional worker option match libE_specs --- libensemble/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/manager.py b/libensemble/manager.py index ab430decb..5f8604f11 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -622,7 +622,7 @@ def _get_alloc_libE_info(self) -> dict: "use_resource_sets": self.use_resource_sets, "gen_num_procs": self.gen_num_procs, "gen_num_gpus": self.gen_num_gpus, - "manager_additional_worker": self.libE_specs.get("manager_runs_additional_worker", False), + "manager_runs_additional_worker": self.libE_specs.get("manager_runs_additional_worker", False), "gen_on_manager": self.libE_specs.get("gen_on_manager", False), } From c07a5659b081961f9756d73a52fb239e4940438f Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 28 Feb 2024 09:18:19 -0600 Subject: [PATCH 37/76] removes manager_runs_additional_worker in favor of gen_on_manager. pass in wrapped self.W to allocs --- docs/data_structures/libE_specs.rst | 7 ++----- libensemble/manager.py | 14 +++++++------- libensemble/specs.py | 9 +++------ .../test_persistent_uniform_sampling.py | 2 +- libensemble/tools/alloc_support.py | 4 ++-- 5 files changed, 15 insertions(+), 21 deletions(-) diff --git a/docs/data_structures/libE_specs.rst b/docs/data_structures/libE_specs.rst index 6d5dd879e..b2bb74d58 100644 --- a/docs/data_structures/libE_specs.rst +++ b/docs/data_structures/libE_specs.rst @@ -30,12 +30,9 @@ libEnsemble is primarily customized by setting options within a ``LibeSpecs`` cl **nworkers** [int]: Number of worker processes in ``"local"``, ``"threads"``, or ``"tcp"``. - **manager_runs_additional_worker** [bool] = False - Manager process launches an additional threaded Worker 0. - This worker can access/modify user objects by reference. - **gen_on_manager** Optional[bool] = False - Enable ``manager_runs_additional_worker`` and reserve that worker for a single generator. + Instructs Manager process to run generator functions. + This generator function can access/modify user objects by reference. **mpi_comm** [MPI communicator] = ``MPI.COMM_WORLD``: libEnsemble MPI communicator. 
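A minimal usage sketch for the option documented above, assuming the `LibeSpecs` fields shown in this patch (`comms`, `nworkers`, `gen_on_manager`); not part of the change itself:

    from libensemble.specs import LibeSpecs

    # Run the generator on the manager process so it can read and modify
    # user objects by reference; the workers then only run simulations.
    libE_specs = LibeSpecs(comms="local", nworkers=4, gen_on_manager=True)
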
diff --git a/libensemble/manager.py b/libensemble/manager.py index 5f8604f11..5d0dbf156 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -230,18 +230,19 @@ def __init__( (1, "stop_val", self.term_test_stop_val), ] - additional_worker = self.libE_specs.get("manager_runs_additional_worker", False) + gen_on_manager = self.libE_specs.get("gen_on_manager", False) - self.W = np.zeros(len(self.wcomms) + additional_worker, dtype=Manager.worker_dtype) - if additional_worker: + self.W = np.zeros(len(self.wcomms) + gen_on_manager, dtype=Manager.worker_dtype) + if gen_on_manager: self.W["worker_id"] = np.arange(len(self.wcomms) + 1) # [0, 1, 2, ...] + self.W[0]["worker_type"] = EVAL_GEN_TAG local_worker_comm = self._run_additional_worker(hist, sim_specs, gen_specs, libE_specs) self.wcomms = [local_worker_comm] + self.wcomms else: self.W["worker_id"] = np.arange(len(self.wcomms)) + 1 # [1, 2, 3, ...] - self.W = _WorkerIndexer(self.W, additional_worker) - self.wcomms = _WorkerIndexer(self.wcomms, additional_worker) + self.W = _WorkerIndexer(self.W, gen_on_manager) + self.wcomms = _WorkerIndexer(self.wcomms, gen_on_manager) temp_EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) self.resources = Resources.resources @@ -622,7 +623,6 @@ def _get_alloc_libE_info(self) -> dict: "use_resource_sets": self.use_resource_sets, "gen_num_procs": self.gen_num_procs, "gen_num_gpus": self.gen_num_gpus, - "manager_runs_additional_worker": self.libE_specs.get("manager_runs_additional_worker", False), "gen_on_manager": self.libE_specs.get("gen_on_manager", False), } @@ -636,7 +636,7 @@ def _alloc_work(self, H: npt.NDArray, persis_info: dict) -> dict: alloc_f = self.alloc_specs["alloc_f"] output = alloc_f( - self.W.iterable, + self.W, H, self.sim_specs, self.gen_specs, diff --git a/libensemble/specs.py b/libensemble/specs.py index 5c7990867..0073c6cd6 100644 --- a/libensemble/specs.py +++ b/libensemble/specs.py @@ -172,13 +172,10 @@ class LibeSpecs(BaseModel): nworkers: Optional[int] = 0 """ Number of worker processes in ``"local"``, ``"threads"``, or ``"tcp"``.""" - manager_runs_additional_worker: Optional[bool] = False - """ Manager process launches an additional threaded Worker 0. - This worker can access/modify user objects by reference. - """ - gen_on_manager: Optional[bool] = False - """ Enable ``manager_runs_additional_worker`` and reserve that worker for a single generator. """ + """ Instructs Manager process to run generator functions. + This generator function can access/modify user objects by reference. + """ mpi_comm: Optional[Any] = None """ libEnsemble MPI communicator. 
Default: ``MPI.COMM_WORLD``""" diff --git a/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py b/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py index e343ff991..5470b814d 100644 --- a/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py +++ b/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py @@ -87,7 +87,7 @@ sim_specs["in"] = ["x", "obj_component"] # sim_specs["out"] = [("f", float), ("grad", float, n)] elif run == 3: - libE_specs["manager_runs_additional_worker"] = True + libE_specs["gen_on_manager"] = True # Perform the run H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) diff --git a/libensemble/tools/alloc_support.py b/libensemble/tools/alloc_support.py index b8d9e98ce..3cda02079 100644 --- a/libensemble/tools/alloc_support.py +++ b/libensemble/tools/alloc_support.py @@ -5,7 +5,7 @@ from libensemble.message_numbers import EVAL_GEN_TAG, EVAL_SIM_TAG from libensemble.resources.resources import Resources from libensemble.resources.scheduler import InsufficientFreeResources, InsufficientResourcesError, ResourceScheduler -from libensemble.utils.misc import _WorkerIndexer, extract_H_ranges +from libensemble.utils.misc import extract_H_ranges logger = logging.getLogger(__name__) # For debug messages - uncomment @@ -47,7 +47,7 @@ def __init__( :param user_resources: (Optional) A user supplied ``resources`` object. :param user_scheduler: (Optional) A user supplied ``user_scheduler`` object. """ - self.W = _WorkerIndexer(W, libE_info.get("manager_runs_additional_worker", False)) + self.W = W self.persis_info = persis_info self.manage_resources = manage_resources self.resources = user_resources or Resources.resources From c46802e20d5b2dffdb2440874afa15ee0e34d6aa Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 28 Feb 2024 10:19:38 -0600 Subject: [PATCH 38/76] turning W["active"] back to an int --- libensemble/manager.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/libensemble/manager.py b/libensemble/manager.py index 5d0dbf156..c1fad1af5 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -160,7 +160,7 @@ class Manager: worker_dtype = [ ("worker_id", int), ("worker_type", int), - ("active", bool), + ("active", int), ("persistent", bool), ("active_recv", bool), ("gen_started_time", float), @@ -427,7 +427,7 @@ def _send_work_order(self, Work: dict, w: int) -> None: def _update_state_on_alloc(self, Work: dict, w: int): """Updates a workers' active/idle status following an allocation order""" - self.W[w]["active"] = True + self.W[w]["active"] = Work["tag"] self.W[w]["worker_type"] = Work["tag"] if "persistent" in Work["libE_info"]: self.W[w]["persistent"] = True @@ -469,7 +469,7 @@ def _update_state_on_worker_msg(self, persis_info: dict, D_recv: dict, w: int) - keep_state = D_recv["libE_info"].get("keep_state", False) if w not in self.persis_pending and not self.W[w]["active_recv"] and not keep_state: - self.W[w]["active"] = False + self.W[w]["active"] = 0 if calc_status in [FINISHED_PERSISTENT_SIM_TAG, FINISHED_PERSISTENT_GEN_TAG]: final_data = D_recv.get("calc_out", None) @@ -482,11 +482,11 @@ def _update_state_on_worker_msg(self, persis_info: dict, D_recv: dict, w: int) - logger.info(_PERSIS_RETURN_WARNING) self.W[w]["persistent"] = False if self.W[w]["active_recv"]: - self.W[w]["active"] = False + self.W[w]["active"] = 0 self.W[w]["active_recv"] = False if w in self.persis_pending: 
self.persis_pending.remove(w) - self.W[w]["active"] = False + self.W[w]["active"] = 0 self._freeup_resources(w) else: if calc_type == EVAL_SIM_TAG: @@ -514,7 +514,7 @@ def _handle_msg_from_worker(self, persis_info: dict, w: int) -> None: logger.debug(f"Finalizing message from Worker {w}") return if isinstance(D_recv, WorkerErrMsg): - self.W[w]["active"] = False + self.W[w]["active"] = 0 logger.debug(f"Manager received exception from worker {w}") if not self.WorkerExc: self.WorkerExc = True @@ -580,7 +580,7 @@ def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int): self.wcomms[w].send(PERSIS_STOP, MAN_SIGNAL_KILL) if not self.W[w]["active"]: # Re-activate if necessary - self.W[w]["active"] = self.W[w]["persistent"] + self.W[w]["active"] = self.W[w]["worker_type"] if self.W[w]["persistent"] else 0 self.persis_pending.append(w) exit_flag = 0 From 2ee94665845ca3874f282ae44acf908488a7a138 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 28 Feb 2024 11:41:20 -0600 Subject: [PATCH 39/76] experimenting with gen_on_manager with give_pregenerated_work - worker 0 shouldn't be given gen work --- libensemble/alloc_funcs/give_pregenerated_work.py | 2 +- .../tests/regression_tests/test_evaluate_mixed_sample.py | 1 + .../tests/unit_tests/test_allocation_funcs_and_support.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/libensemble/alloc_funcs/give_pregenerated_work.py b/libensemble/alloc_funcs/give_pregenerated_work.py index 1d6edb160..060046d27 100644 --- a/libensemble/alloc_funcs/give_pregenerated_work.py +++ b/libensemble/alloc_funcs/give_pregenerated_work.py @@ -23,7 +23,7 @@ def give_pregenerated_sim_work(W, H, sim_specs, gen_specs, alloc_specs, persis_i if persis_info["next_to_give"] >= len(H): return Work, persis_info, 1 - for i in support.avail_worker_ids(): + for i in support.avail_sim_worker_ids(): persis_info = support.skip_canceled_points(H, persis_info) # Give sim work diff --git a/libensemble/tests/regression_tests/test_evaluate_mixed_sample.py b/libensemble/tests/regression_tests/test_evaluate_mixed_sample.py index 38998baa7..1574e8d57 100644 --- a/libensemble/tests/regression_tests/test_evaluate_mixed_sample.py +++ b/libensemble/tests/regression_tests/test_evaluate_mixed_sample.py @@ -44,6 +44,7 @@ H0["sim_ended"][:500] = True sampling = Ensemble(parse_args=True) + sampling.libE_specs.gen_on_manager = True sampling.H0 = H0 sampling.sim_specs = SimSpecs(sim_f=sim_f, inputs=["x"], out=[("f", float)]) sampling.alloc_specs = AllocSpecs(alloc_f=alloc_f) diff --git a/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py b/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py index 8f5959ce9..d04f3fb88 100644 --- a/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py +++ b/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py @@ -21,7 +21,7 @@ dtype=[ ("worker_id", " Date: Wed, 28 Feb 2024 13:01:22 -0600 Subject: [PATCH 40/76] I think for sim workers, the only requirement is that they're not gen workers --- libensemble/alloc_funcs/start_only_persistent.py | 2 +- libensemble/tools/alloc_support.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/libensemble/alloc_funcs/start_only_persistent.py b/libensemble/alloc_funcs/start_only_persistent.py index 870973dc4..35dee7752 100644 --- a/libensemble/alloc_funcs/start_only_persistent.py +++ b/libensemble/alloc_funcs/start_only_persistent.py @@ -88,7 +88,7 @@ def only_persistent_gens(W, H, sim_specs, gen_specs, alloc_specs, persis_info, l # 
Now the give_sim_work_first part points_to_evaluate = ~H["sim_started"] & ~H["cancel_requested"] - avail_workers = support.avail_worker_ids(persistent=False, zero_resource_workers=False) + avail_workers = support.avail_sim_worker_ids(persistent=False, zero_resource_workers=False) if user.get("alt_type"): avail_workers = list( set(support.avail_worker_ids(persistent=False, zero_resource_workers=False)) diff --git a/libensemble/tools/alloc_support.py b/libensemble/tools/alloc_support.py index 3cda02079..d93ab9814 100644 --- a/libensemble/tools/alloc_support.py +++ b/libensemble/tools/alloc_support.py @@ -120,8 +120,10 @@ def fltr_recving(): return not wrk["active"] def fltr_worker_type(): - if worker_type: - return wrk["worker_type"] == worker_type + if worker_type == EVAL_SIM_TAG: + return wrk["worker_type"] != EVAL_GEN_TAG # only workers not given gen work *yet* + elif worker_type == EVAL_GEN_TAG: + return wrk["worker_type"] == EVAL_GEN_TAG # explicitly want gen_workers else: return True @@ -146,7 +148,7 @@ def avail_gen_worker_ids(self, persistent=False, active_recv=False, zero_resourc ) def avail_sim_worker_ids(self, persistent=False, active_recv=False, zero_resource_workers=None): - """Returns available generator workers as a list of IDs.""" + """Returns available non-generator workers as a list of IDs.""" return self.avail_worker_ids( persistent=persistent, active_recv=active_recv, From 09d030c866b83b193b58da80bf53a6fed22fa328 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 28 Feb 2024 14:09:32 -0600 Subject: [PATCH 41/76] fixing alloc unit test based on passing wrapped W into alloc --- .../unit_tests/test_allocation_funcs_and_support.py | 12 +++++++----- libensemble/tools/alloc_support.py | 6 +++--- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py b/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py index d04f3fb88..38e3ecee7 100644 --- a/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py +++ b/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py @@ -11,6 +11,7 @@ from libensemble.tools import add_unique_random_streams from libensemble.tools.alloc_support import AllocException, AllocSupport from libensemble.tools.fields_keys import libE_fields +from libensemble.utils.misc import _WorkerIndexer al = {"alloc_f": give_sim_work_first} libE_specs = {"comms": "local", "nworkers": 4} @@ -58,7 +59,7 @@ def test_decide_work_and_resources(): libE_info = {"sim_max_given": False, "any_idle_workers": True, "use_resource_sets": False} # Don't give out work when all workers are active - W["active"] = True + W["active"] = 1 Work, persis_info = al["alloc_f"](W, hist.H, sim_specs, gen_specs, al, {}, libE_info) assert len(Work) == 0 @@ -131,8 +132,8 @@ def test_als_worker_ids(): def test_als_evaluate_gens(): W_gens = W.copy() - W_gens["active"] = np.array([True, 0, True, 0]) - W_gens["worker_type"] = np.array([2, 0, 2, 0]) + W_gens["active"] = np.array([EVAL_GEN_TAG, 0, EVAL_GEN_TAG, 0]) + W_gens["worker_type"] = np.array([EVAL_GEN_TAG, 0, EVAL_GEN_TAG, 0]) als = AllocSupport(W_gens, True) assert als.count_gens() == 2, "count_gens() didn't return correct number of active generators" @@ -166,7 +167,8 @@ def test_als_sim_work(): W_ps = W.copy() W_ps["persistent"] = np.array([True, 0, 0, 0]) - als = AllocSupport(W_ps, True) + W_ps["zero_resource_worker"] = np.array([True, 0, 0, 0]) + als = AllocSupport(_WorkerIndexer(W_ps, False), True) Work = {} Work[1] = als.sim_work(1, H, ["x"], 
np.array([0, 1, 2, 3, 4]), persis_info[1], persistent=True) @@ -203,7 +205,7 @@ def test_als_gen_work(): W_ps = W.copy() W_ps["persistent"] = np.array([True, 0, 0, 0]) - als = AllocSupport(W_ps, True) + als = AllocSupport(_WorkerIndexer(W_ps, False), True) Work = {} Work[1] = als.gen_work(1, ["sim_id"], range(0, 5), persis_info[1], persistent=True) diff --git a/libensemble/tools/alloc_support.py b/libensemble/tools/alloc_support.py index d93ab9814..12216259a 100644 --- a/libensemble/tools/alloc_support.py +++ b/libensemble/tools/alloc_support.py @@ -117,7 +117,7 @@ def fltr_recving(): if active_recv: return wrk["active_recv"] else: - return not wrk["active"] + return wrk["active"] == 0 def fltr_worker_type(): if worker_type == EVAL_SIM_TAG: @@ -158,11 +158,11 @@ def avail_sim_worker_ids(self, persistent=False, active_recv=False, zero_resourc def count_gens(self): """Returns the number of active generators.""" - return sum(self.W["active"] & (self.W["worker_type"] == EVAL_GEN_TAG)) + return sum((self.W["active"] == EVAL_GEN_TAG) & (self.W["worker_type"] == EVAL_GEN_TAG)) def test_any_gen(self): """Returns ``True`` if a generator worker is active.""" - return any(self.W["active"] & (self.W["worker_type"] == EVAL_GEN_TAG)) + return any((self.W["active"] == EVAL_GEN_TAG) & (self.W["worker_type"] == EVAL_GEN_TAG)) def count_persis_gens(self): """Return the number of active persistent generators.""" From 2f631e095a62b38d26c2dd7e69656967079ebfd0 Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 29 Feb 2024 15:52:19 -0600 Subject: [PATCH 42/76] refactoring Worker array fields to more closely match develop. worker_type:int is now gen_worker:bool. revert allocs --- .../alloc_funcs/give_pregenerated_work.py | 2 +- .../alloc_funcs/inverse_bayes_allocf.py | 3 +- .../alloc_funcs/persistent_aposmm_alloc.py | 3 +- .../alloc_funcs/start_fd_persistent.py | 3 +- .../alloc_funcs/start_only_persistent.py | 6 +-- .../start_persistent_local_opt_gens.py | 6 +-- libensemble/manager.py | 29 ++++++----- .../test_allocation_funcs_and_support.py | 21 +++++--- libensemble/tools/alloc_support.py | 49 ++++++------------- 9 files changed, 53 insertions(+), 69 deletions(-) diff --git a/libensemble/alloc_funcs/give_pregenerated_work.py b/libensemble/alloc_funcs/give_pregenerated_work.py index 060046d27..1d6edb160 100644 --- a/libensemble/alloc_funcs/give_pregenerated_work.py +++ b/libensemble/alloc_funcs/give_pregenerated_work.py @@ -23,7 +23,7 @@ def give_pregenerated_sim_work(W, H, sim_specs, gen_specs, alloc_specs, persis_i if persis_info["next_to_give"] >= len(H): return Work, persis_info, 1 - for i in support.avail_sim_worker_ids(): + for i in support.avail_worker_ids(): persis_info = support.skip_canceled_points(H, persis_info) # Give sim work diff --git a/libensemble/alloc_funcs/inverse_bayes_allocf.py b/libensemble/alloc_funcs/inverse_bayes_allocf.py index dcc1e13d7..56a3f6e79 100644 --- a/libensemble/alloc_funcs/inverse_bayes_allocf.py +++ b/libensemble/alloc_funcs/inverse_bayes_allocf.py @@ -1,5 +1,6 @@ import numpy as np +from libensemble.message_numbers import EVAL_GEN_TAG from libensemble.tools.alloc_support import AllocSupport, InsufficientFreeResources @@ -24,7 +25,7 @@ def only_persistent_gens_for_inverse_bayes(W, H, sim_specs, gen_specs, alloc_spe # If wid is idle, but in persistent mode, and generated work has all returned # give output back to wid. 
Otherwise, give nothing to wid - for wid in support.avail_gen_worker_ids(persistent=True): + for wid in support.avail_worker_ids(persistent=EVAL_GEN_TAG): # if > 1 persistent generator, assign the correct work to it inds_generated_by_wid = H["gen_worker"] == wid if support.all_sim_ended(H, inds_generated_by_wid): diff --git a/libensemble/alloc_funcs/persistent_aposmm_alloc.py b/libensemble/alloc_funcs/persistent_aposmm_alloc.py index 47b584309..8327d3975 100644 --- a/libensemble/alloc_funcs/persistent_aposmm_alloc.py +++ b/libensemble/alloc_funcs/persistent_aposmm_alloc.py @@ -1,5 +1,6 @@ import numpy as np +from libensemble.message_numbers import EVAL_GEN_TAG from libensemble.tools.alloc_support import AllocSupport, InsufficientFreeResources @@ -39,7 +40,7 @@ def persistent_aposmm_alloc(W, H, sim_specs, gen_specs, alloc_specs, persis_info return Work, persis_info, 1 # If any persistent worker's calculated values have returned, give them back. - for wid in support.avail_gen_worker_ids(persistent=True): + for wid in support.avail_worker_ids(persistent=EVAL_GEN_TAG): if persis_info.get("sample_done") or sum(H["sim_ended"]) >= init_sample_size + persis_info["samples_in_H0"]: # Don't return if the initial sample is not complete persis_info["sample_done"] = True diff --git a/libensemble/alloc_funcs/start_fd_persistent.py b/libensemble/alloc_funcs/start_fd_persistent.py index 33af61765..0c2e939d3 100644 --- a/libensemble/alloc_funcs/start_fd_persistent.py +++ b/libensemble/alloc_funcs/start_fd_persistent.py @@ -1,5 +1,6 @@ import numpy as np +from libensemble.message_numbers import EVAL_GEN_TAG from libensemble.tools.alloc_support import AllocSupport, InsufficientFreeResources @@ -29,7 +30,7 @@ def finite_diff_alloc(W, H, sim_specs, gen_specs, alloc_specs, persis_info, libE # If wid is in persistent mode, and all of its calculated values have # returned, give them back to wid. Otherwise, give nothing to wid - for wid in support.avail_gen_worker_ids(persistent=True): + for wid in support.avail_worker_ids(persistent=EVAL_GEN_TAG): # What (x_ind, f_ind) pairs have all of the evaluation of all n_ind # values complete. 
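        # (Review note, not part of the patch: H["gen_informed"] flags rows
        # whose sim results were already sent back to the generator, so
        # ~H["gen_informed"] selects completed work the gen has not yet seen;
        # start_only_persistent below uses the same pattern via
        # `sim_ended & ~gen_informed`.)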
inds_not_sent_back = ~H["gen_informed"] diff --git a/libensemble/alloc_funcs/start_only_persistent.py b/libensemble/alloc_funcs/start_only_persistent.py index 35dee7752..6176a71ea 100644 --- a/libensemble/alloc_funcs/start_only_persistent.py +++ b/libensemble/alloc_funcs/start_only_persistent.py @@ -1,6 +1,6 @@ import numpy as np -from libensemble.message_numbers import EVAL_SIM_TAG +from libensemble.message_numbers import EVAL_GEN_TAG, EVAL_SIM_TAG from libensemble.tools.alloc_support import AllocSupport, InsufficientFreeResources @@ -70,7 +70,7 @@ def only_persistent_gens(W, H, sim_specs, gen_specs, alloc_specs, persis_info, l return Work, persis_info, 1 # Give evaluated results back to a running persistent gen - for wid in support.avail_gen_worker_ids(persistent=True, active_recv=active_recv_gen): + for wid in support.avail_worker_ids(persistent=EVAL_GEN_TAG, active_recv=active_recv_gen): gen_inds = H["gen_worker"] == wid returned_but_not_given = np.logical_and.reduce((H["sim_ended"], ~H["gen_informed"], gen_inds)) if np.any(returned_but_not_given): @@ -92,7 +92,7 @@ def only_persistent_gens(W, H, sim_specs, gen_specs, alloc_specs, persis_info, l if user.get("alt_type"): avail_workers = list( set(support.avail_worker_ids(persistent=False, zero_resource_workers=False)) - | set(support.avail_worker_ids(persistent=True, zero_resource_workers=False, worker_type=EVAL_SIM_TAG)) + | set(support.avail_worker_ids(persistent=EVAL_SIM_TAG, zero_resource_workers=False)) ) for wid in avail_workers: if not np.any(points_to_evaluate): diff --git a/libensemble/alloc_funcs/start_persistent_local_opt_gens.py b/libensemble/alloc_funcs/start_persistent_local_opt_gens.py index 1a16ea817..255663c0b 100644 --- a/libensemble/alloc_funcs/start_persistent_local_opt_gens.py +++ b/libensemble/alloc_funcs/start_persistent_local_opt_gens.py @@ -46,7 +46,7 @@ def start_persistent_local_opt_gens(W, H, sim_specs, gen_specs, alloc_specs, per # If wid is idle, but in persistent mode, and its calculated values have # returned, give them back to i. Otherwise, give nothing to wid - for wid in support.avail_gen_worker_ids(persistent=True): + for wid in support.avail_worker_ids(persistent=EVAL_GEN_TAG): gen_inds = H["gen_worker"] == wid if support.all_sim_ended(H, gen_inds): last_time_pos = np.argmax(H["sim_started_time"][gen_inds]) @@ -90,9 +90,7 @@ def start_persistent_local_opt_gens(W, H, sim_specs, gen_specs, alloc_specs, per break points_to_evaluate[sim_ids_to_send] = False - elif gen_count == 0 and not np.any( - np.logical_and((W["active"]), (W["persistent"] is False), (W["worker_type"] == EVAL_GEN_TAG)) - ): + elif gen_count == 0 and not np.any(np.logical_and((W["active"] == EVAL_GEN_TAG), (W["persis_state"] == 0))): # Finally, generate points since there is nothing else to do (no resource sets req.) 
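        # (Review note, not part of the patch: rset_team=[] is an *empty but
        # non-None* team, which stops the manager from assigning default
        # resource sets to this gen -- see the "non-None rset_team stops
        # manager giving default resources" comment in _update_rset_team
        # earlier in this series.)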
Work[wid] = support.gen_work(wid, gen_specs.get("in", []), [], persis_info[wid], rset_team=[]) gen_count += 1 diff --git a/libensemble/manager.py b/libensemble/manager.py index c1fad1af5..d228d089f 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -159,9 +159,9 @@ class Manager: worker_dtype = [ ("worker_id", int), - ("worker_type", int), + ("gen_worker", bool), ("active", int), - ("persistent", bool), + ("persis_state", int), ("active_recv", bool), ("gen_started_time", float), ("zero_resource_worker", bool), @@ -235,7 +235,7 @@ def __init__( self.W = np.zeros(len(self.wcomms) + gen_on_manager, dtype=Manager.worker_dtype) if gen_on_manager: self.W["worker_id"] = np.arange(len(self.wcomms) + 1) # [0, 1, 2, ...] - self.W[0]["worker_type"] = EVAL_GEN_TAG + self.W[0]["gen_worker"] = True local_worker_comm = self._run_additional_worker(hist, sim_specs, gen_specs, libE_specs) self.wcomms = [local_worker_comm] + self.wcomms else: @@ -428,9 +428,8 @@ def _update_state_on_alloc(self, Work: dict, w: int): """Updates a workers' active/idle status following an allocation order""" self.W[w]["active"] = Work["tag"] - self.W[w]["worker_type"] = Work["tag"] if "persistent" in Work["libE_info"]: - self.W[w]["persistent"] = True + self.W[w]["persis_state"] = Work["tag"] if Work["libE_info"].get("active_recv", False): self.W[w]["active_recv"] = True else: @@ -480,7 +479,7 @@ def _update_state_on_worker_msg(self, persis_info: dict, D_recv: dict, w: int) - self.hist.update_history_f(D_recv, self.kill_canceled_sims) else: logger.info(_PERSIS_RETURN_WARNING) - self.W[w]["persistent"] = False + self.W[w]["persis_state"] = 0 if self.W[w]["active_recv"]: self.W[w]["active"] = 0 self.W[w]["active_recv"] = False @@ -494,11 +493,11 @@ def _update_state_on_worker_msg(self, persis_info: dict, D_recv: dict, w: int) - if calc_type == EVAL_GEN_TAG: self.hist.update_history_x_in(w, D_recv["calc_out"], self.W[w]["gen_started_time"]) assert ( - len(D_recv["calc_out"]) or np.any(self.W["active"]) or self.W[w]["persistent"] + len(D_recv["calc_out"]) or np.any(self.W["active"]) or self.W[w]["persis_state"] ), "Gen must return work when is is the only thing active and not persistent." if "libE_info" in D_recv and "persistent" in D_recv["libE_info"]: # Now a waiting, persistent worker - self.W[w]["persistent"] = True + self.W[w]["persis_state"] = D_recv["calc_type"] else: self._freeup_resources(w) @@ -562,8 +561,8 @@ def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int): """ # Send a handshake signal to each persistent worker. 
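        # (Review note, not part of the patch: after this change, persis_state
        # holds 0 for a non-persistent worker and the calc tag (EVAL_SIM_TAG
        # or EVAL_GEN_TAG, both nonzero) for a persistent one, so
        # `any(self.W["persis_state"])` below doubles as a truthiness test.)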
- if any(self.W["persistent"]): - for w in self.W["worker_id"][self.W["persistent"]]: + if any(self.W["persis_state"]): + for w in self.W["worker_id"][self.W["persis_state"] > 0]: logger.debug(f"Manager sending PERSIS_STOP to worker {w}") if self.libE_specs.get("final_gen_send", False): rows_to_send = np.where(self.hist.H["sim_ended"] & ~self.hist.H["gen_informed"])[0] @@ -580,15 +579,15 @@ def _final_receive_and_kill(self, persis_info: dict) -> (dict, int, int): self.wcomms[w].send(PERSIS_STOP, MAN_SIGNAL_KILL) if not self.W[w]["active"]: # Re-activate if necessary - self.W[w]["active"] = self.W[w]["worker_type"] if self.W[w]["persistent"] else 0 + self.W[w]["active"] = self.W[w]["persis_state"] self.persis_pending.append(w) exit_flag = 0 - while (any(self.W["active"]) or any(self.W["persistent"])) and exit_flag == 0: + while (any(self.W["active"]) or any(self.W["persis_state"])) and exit_flag == 0: persis_info = self._receive_from_workers(persis_info) if self.term_test(logged=False) == 2: # Elapsed Wallclock has expired - if not any(self.W["persistent"]): + if not any(self.W["persis_state"]): if any(self.W["active"]): logger.manager_warning(_WALLCLOCK_MSG_ACTIVE) else: @@ -611,7 +610,7 @@ def _get_alloc_libE_info(self) -> dict: """Selected statistics useful for alloc_f""" return { - "any_idle_workers": any(~self.W["active"]), + "any_idle_workers": any(self.W["active"] == 0), "exit_criteria": self.exit_criteria, "elapsed_time": self.elapsed(), "gen_informed_count": self.hist.gen_informed_count, @@ -681,7 +680,7 @@ def run(self, persis_info: dict) -> (dict, int, int): self._send_work_order(Work[w], w) self._update_state_on_alloc(Work[w], w) assert self.term_test() or any( - self.W["active"] + self.W["active"] != 0 ), "alloc_f did not return any work, although all workers are idle." except WorkerException as e: report_worker_exc(e) diff --git a/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py b/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py index 38e3ecee7..41a9aad83 100644 --- a/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py +++ b/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py @@ -18,12 +18,17 @@ H0 = [] W = np.array( - [(1, 0, 0, 0, 0, False), (2, 0, 0, 0, 0, False), (3, 0, 0, 0, 0, False), (4, 0, 0, 0, 0, False)], + [ + (1, False, 0, 0, False, False), + (2, False, 0, 0, False, False), + (3, False, 0, 0, False, False), + (4, False, 0, 0, False, False), + ], dtype=[ ("worker_id", " Date: Fri, 1 Mar 2024 09:42:17 -0600 Subject: [PATCH 43/76] fix tests --- .../tests/unit_tests/test_allocation_funcs_and_support.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py b/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py index 41a9aad83..6d056b1e0 100644 --- a/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py +++ b/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py @@ -108,7 +108,7 @@ def test_als_worker_ids(): W_ps = W.copy() W_ps["persis_state"] = np.array([EVAL_GEN_TAG, 0, 0, 0]) als = AllocSupport(W_ps, True) - assert als.avail_worker_ids(persistent=True) == [ + assert als.avail_worker_ids(persistent=EVAL_GEN_TAG) == [ 1 ], "avail_worker_ids() didn't return expected persistent worker list." 
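# (Sketch of the semantics exercised above, assuming EVAL_GEN_TAG == 2 as in
# libensemble.message_numbers: an integer `persistent` argument is now matched
# directly against W["persis_state"], e.g.
#     als.avail_worker_ids(persistent=EVAL_GEN_TAG)  # persistent gens only)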
@@ -116,7 +116,7 @@ def test_als_worker_ids(): W_ar["active_recv"] = np.array([True, 0, 0, 0]) W_ar["persis_state"] = np.array([EVAL_GEN_TAG, 0, 0, 0]) als = AllocSupport(W_ar, True) - assert als.avail_worker_ids(persistent=True, active_recv=True) == [ + assert als.avail_worker_ids(persistent=EVAL_GEN_TAG, active_recv=True) == [ 1 ], "avail_worker_ids() didn't return expected persistent worker list." @@ -138,7 +138,6 @@ def test_als_worker_ids(): def test_als_evaluate_gens(): W_gens = W.copy() W_gens["active"] = np.array([EVAL_GEN_TAG, 0, EVAL_GEN_TAG, 0]) - W_gens["worker_type"] = np.array([EVAL_GEN_TAG, 0, EVAL_GEN_TAG, 0]) als = AllocSupport(W_gens, True) assert als.count_gens() == 2, "count_gens() didn't return correct number of active generators" From 550ca1fdc60756d6b5ccb7e679a5fc7abf9cc583 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 1 Mar 2024 10:13:03 -0600 Subject: [PATCH 44/76] missed a revert in alloc --- libensemble/alloc_funcs/start_only_persistent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/alloc_funcs/start_only_persistent.py b/libensemble/alloc_funcs/start_only_persistent.py index 6176a71ea..4eaf8fa1c 100644 --- a/libensemble/alloc_funcs/start_only_persistent.py +++ b/libensemble/alloc_funcs/start_only_persistent.py @@ -88,7 +88,7 @@ def only_persistent_gens(W, H, sim_specs, gen_specs, alloc_specs, persis_info, l # Now the give_sim_work_first part points_to_evaluate = ~H["sim_started"] & ~H["cancel_requested"] - avail_workers = support.avail_sim_worker_ids(persistent=False, zero_resource_workers=False) + avail_workers = support.avail_worker_ids(persistent=False, zero_resource_workers=False) if user.get("alt_type"): avail_workers = list( set(support.avail_worker_ids(persistent=False, zero_resource_workers=False)) From e7591b6e2a8dfdda4438a8b1a0573c1a795da6d5 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 1 Mar 2024 10:20:02 -0600 Subject: [PATCH 45/76] undo inconsequential tiny changes to allocs --- libensemble/alloc_funcs/start_only_persistent.py | 1 + libensemble/alloc_funcs/start_persistent_local_opt_gens.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/libensemble/alloc_funcs/start_only_persistent.py b/libensemble/alloc_funcs/start_only_persistent.py index 4eaf8fa1c..ee9d4105f 100644 --- a/libensemble/alloc_funcs/start_only_persistent.py +++ b/libensemble/alloc_funcs/start_only_persistent.py @@ -51,6 +51,7 @@ def only_persistent_gens(W, H, sim_specs, gen_specs, alloc_specs, persis_info, l if libE_info["sim_max_given"] or not libE_info["any_idle_workers"]: return {}, persis_info + # Initialize alloc_specs["user"] as user. user = alloc_specs.get("user", {}) manage_resources = libE_info["use_resource_sets"] diff --git a/libensemble/alloc_funcs/start_persistent_local_opt_gens.py b/libensemble/alloc_funcs/start_persistent_local_opt_gens.py index 255663c0b..12ad45100 100644 --- a/libensemble/alloc_funcs/start_persistent_local_opt_gens.py +++ b/libensemble/alloc_funcs/start_persistent_local_opt_gens.py @@ -90,7 +90,7 @@ def start_persistent_local_opt_gens(W, H, sim_specs, gen_specs, alloc_specs, per break points_to_evaluate[sim_ids_to_send] = False - elif gen_count == 0 and not np.any(np.logical_and((W["active"] == EVAL_GEN_TAG), (W["persis_state"] == 0))): + elif gen_count == 0 and not np.any(np.logical_and(W["active"] == EVAL_GEN_TAG, W["persis_state"] == 0)): # Finally, generate points since there is nothing else to do (no resource sets req.) 
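        # (Review note on the restored condition above, not part of the patch:
        # W["active"] == EVAL_GEN_TAG with persis_state == 0 marks an
        # in-flight non-persistent gen, so a fresh gen is only started when
        # none has been started in this alloc call and none is still active.)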
Work[wid] = support.gen_work(wid, gen_specs.get("in", []), [], persis_info[wid], rset_team=[]) gen_count += 1 From 68b991aa1c30c6281527734b1bc87805bf600ebb Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 1 Mar 2024 11:16:18 -0600 Subject: [PATCH 46/76] run each of the test_GPU_gen_resources tests also with the gen running on manager --- .../test_GPU_gen_resources.py | 58 +++++++++++-------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py index 6e692dfa2..a0ef24e15 100644 --- a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py +++ b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py @@ -42,6 +42,12 @@ from libensemble.sim_funcs.var_resources import gpu_variable_resources_from_gen as sim_f from libensemble.tools import add_unique_random_streams, parse_args +# TODO: multiple libE calls with gen-on-manager currently not supported with spawn on macOS +if sys.platform == "darwin": + from multiprocessing import set_start_method + + set_start_method("fork", force=True) + # from libensemble import logger # logger.set_level("DEBUG") # For testing the test @@ -100,30 +106,32 @@ libE_specs["resource_info"] = {"cores_on_node": (nworkers * 2, nworkers * 4), "gpus_on_node": nworkers} base_libE_specs = libE_specs.copy() - for run in range(5): - # reset - libE_specs = base_libE_specs.copy() - persis_info = add_unique_random_streams({}, nworkers + 1) - - if run == 0: - libE_specs["gen_num_procs"] = 2 - elif run == 1: - libE_specs["gen_num_gpus"] = 1 - elif run == 2: - persis_info["gen_num_gpus"] = 1 - elif run == 3: - # Two GPUs per resource set - libE_specs["resource_info"]["gpus_on_node"] = nworkers * 2 - persis_info["gen_num_gpus"] = 1 - elif run == 4: - # Two GPUs requested for gen - persis_info["gen_num_procs"] = 2 - persis_info["gen_num_gpus"] = 2 - gen_specs["user"]["max_procs"] = max(nworkers - 2, 1) - - # Perform the run - H, persis_info, flag = libE( - sim_specs, gen_specs, exit_criteria, persis_info, libE_specs=libE_specs, alloc_specs=alloc_specs - ) + for gen_on_manager in [False, True]: + for run in range(5): + # reset + libE_specs = base_libE_specs.copy() + libE_specs["gen_on_manager"] = gen_on_manager + persis_info = add_unique_random_streams({}, nworkers + 1) + + if run == 0: + libE_specs["gen_num_procs"] = 2 + elif run == 1: + libE_specs["gen_num_gpus"] = 1 + elif run == 2: + persis_info["gen_num_gpus"] = 1 + elif run == 3: + # Two GPUs per resource set + libE_specs["resource_info"]["gpus_on_node"] = nworkers * 2 + persis_info["gen_num_gpus"] = 1 + elif run == 4: + # Two GPUs requested for gen + persis_info["gen_num_procs"] = 2 + persis_info["gen_num_gpus"] = 2 + gen_specs["user"]["max_procs"] = max(nworkers - 2, 1) + + # Perform the run + H, persis_info, flag = libE( + sim_specs, gen_specs, exit_criteria, persis_info, libE_specs=libE_specs, alloc_specs=alloc_specs + ) # All asserts are in gen and sim funcs From c433ecb397b2c3c5c76f37beb6a371fe067473f6 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 6 Mar 2024 10:42:57 -0600 Subject: [PATCH 47/76] simply gen_workers parameter description for avail_worker_ids --- libensemble/tools/alloc_support.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/tools/alloc_support.py b/libensemble/tools/alloc_support.py index e514bad02..9b25a267d 100644 --- a/libensemble/tools/alloc_support.py +++ b/libensemble/tools/alloc_support.py @@ -93,7 +93,7 @@ def 
avail_worker_ids(self, persistent=None, active_recv=False, zero_resource_wor :param persistent: (Optional) Int. Only return workers with given ``persis_state`` (1=sim, 2=gen). :param active_recv: (Optional) Boolean. Only return workers with given active_recv state. :param zero_resource_workers: (Optional) Boolean. Only return workers that require no resources. - :param gen_workers: (Optional) Boolean. If True, return gen-only workers and manager's ID. + :param gen_workers: (Optional) Boolean. If True, return gen-only workers. :returns: List of worker IDs. If there are no zero resource workers defined, then the ``zero_resource_workers`` argument will From e78056b0acbc5572385e1618aa38ae928e7eb4d3 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 8 Mar 2024 13:33:08 -0600 Subject: [PATCH 48/76] debugging consecutive libE calls with gen_on_manager --- libensemble/comms/comms.py | 30 +++++++++---------- libensemble/comms/logs.py | 1 + libensemble/manager.py | 4 +++ .../test_GPU_gen_resources.py | 6 ---- 4 files changed, 20 insertions(+), 21 deletions(-) diff --git a/libensemble/comms/comms.py b/libensemble/comms/comms.py index bebca9344..51042c463 100644 --- a/libensemble/comms/comms.py +++ b/libensemble/comms/comms.py @@ -207,19 +207,6 @@ def result(self, timeout=None): raise RemoteException(self._exception.msg, self._exception.exc) return self._result - @staticmethod - def _qcomm_main(comm, main, *args, **kwargs): - """Main routine -- handles return values and exceptions.""" - try: - if not kwargs.get("user_function"): - _result = main(comm, *args, **kwargs) - else: - _result = main(*args) - comm.send(CommResult(_result)) - except Exception as e: - comm.send(CommResultErr(str(e), format_exc())) - raise e - @property def running(self): """Check if the thread/process is running.""" @@ -233,6 +220,19 @@ def __exit__(self, etype, value, traceback): self.handle.join() +def _qcomm_main(comm, main, *args, **kwargs): + """Main routine -- handles return values and exceptions.""" + try: + if not kwargs.get("user_function"): + _result = main(comm, *args, **kwargs) + else: + _result = main(*args) + comm.send(CommResult(_result)) + except Exception as e: + comm.send(CommResultErr(str(e), format_exc())) + raise e + + class QCommThread(QCommLocal): """Launch a user function in a thread with an attached QComm.""" @@ -241,7 +241,7 @@ def __init__(self, main, nworkers, *args, **kwargs): self.outbox = thread_queue.Queue() super().__init__(self, main, *args, **kwargs) comm = QComm(self.inbox, self.outbox, nworkers) - self.handle = Thread(target=QCommThread._qcomm_main, args=(comm, main) + args, kwargs=kwargs) + self.handle = Thread(target=_qcomm_main, args=(comm, main) + args, kwargs=kwargs) def terminate(self, timeout=None): """Terminate the thread. 
@@ -265,7 +265,7 @@ def __init__(self, main, nworkers, *args, **kwargs): self.outbox = Queue() super().__init__(self, main, *args, **kwargs) comm = QComm(self.inbox, self.outbox, nworkers) - self.handle = Process(target=QCommProcess._qcomm_main, args=(comm, main) + args, kwargs=kwargs) + self.handle = Process(target=_qcomm_main, args=(comm, main) + args, kwargs=kwargs) def terminate(self, timeout=None): """Terminate the process.""" diff --git a/libensemble/comms/logs.py b/libensemble/comms/logs.py index 10acbae07..47f85f351 100644 --- a/libensemble/comms/logs.py +++ b/libensemble/comms/logs.py @@ -203,6 +203,7 @@ def manager_logging_config(specs={}): def exit_logger(): stat_timer.stop() stat_logger.info(f"Exiting ensemble at: {stat_timer.date_end} Time Taken: {stat_timer.elapsed}") + stat_logger.handlers[0].close() # If closing logs - each libE() call will log to a new file. # fh.close() diff --git a/libensemble/manager.py b/libensemble/manager.py index d228d089f..094ef839b 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -302,6 +302,9 @@ def _kill_workers(self) -> None: """Kills the workers""" for w in self.W["worker_id"]: self.wcomms[w].send(STOP_TAG, MAN_SIGNAL_FINISH) + if w == 0: + self.wcomms[0].result() + self.wcomms[0] = None # --- Checkpointing logic @@ -691,6 +694,7 @@ def run(self, persis_info: dict) -> (dict, int, int): finally: # Return persis_info, exit_flag, elapsed time result = self._final_receive_and_kill(persis_info) + self.wcomms = None sys.stdout.flush() sys.stderr.flush() return result diff --git a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py index a0ef24e15..bd40d5c4c 100644 --- a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py +++ b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py @@ -42,12 +42,6 @@ from libensemble.sim_funcs.var_resources import gpu_variable_resources_from_gen as sim_f from libensemble.tools import add_unique_random_streams, parse_args -# TODO: multiple libE calls with gen-on-manager currently not supported with spawn on macOS -if sys.platform == "darwin": - from multiprocessing import set_start_method - - set_start_method("fork", force=True) - # from libensemble import logger # logger.set_level("DEBUG") # For testing the test From f30233c6a892aaa9f32d557dcd57b4f0ca870ef5 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 8 Mar 2024 15:20:59 -0600 Subject: [PATCH 49/76] debugging...... 
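Context for this work-in-progress commit: back-to-back ``libE()`` calls with
``gen_on_manager`` were failing, apparently because thread handles, queues,
and open log handlers linger between runs. A rough reproducer, with the specs
objects assumed to be defined as in the GPU test below:

    from libensemble.libE import libE

    for _ in range(2):  # second call previously failed with gen_on_manager
        H, persis_info, flag = libE(
            sim_specs, gen_specs, exit_criteria, persis_info, libE_specs=libE_specs
        )
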
--- libensemble/comms/comms.py | 6 ++++++ libensemble/comms/logs.py | 1 + libensemble/libE.py | 3 +++ .../tests/functionality_tests/test_GPU_gen_resources.py | 2 ++ 4 files changed, 12 insertions(+) diff --git a/libensemble/comms/comms.py b/libensemble/comms/comms.py index 51042c463..2b31cf5b9 100644 --- a/libensemble/comms/comms.py +++ b/libensemble/comms/comms.py @@ -255,6 +255,9 @@ def terminate(self, timeout=None): self.handle.join(timeout=timeout) if self.running: raise Timeout() + self.handle = None + self.inbox = None + self.outbox = None class QCommProcess(QCommLocal): @@ -274,3 +277,6 @@ def terminate(self, timeout=None): self.handle.join(timeout=timeout) if self.running: raise Timeout() + self.handle = None + self.inbox = None + self.outbox = None diff --git a/libensemble/comms/logs.py b/libensemble/comms/logs.py index 47f85f351..de2454f8d 100644 --- a/libensemble/comms/logs.py +++ b/libensemble/comms/logs.py @@ -204,6 +204,7 @@ def exit_logger(): stat_timer.stop() stat_logger.info(f"Exiting ensemble at: {stat_timer.date_end} Time Taken: {stat_timer.elapsed}") stat_logger.handlers[0].close() + print("Manager logger closed") # If closing logs - each libE() call will log to a new file. # fh.close() diff --git a/libensemble/libE.py b/libensemble/libE.py index b283a82b4..b5ddaa330 100644 --- a/libensemble/libE.py +++ b/libensemble/libE.py @@ -460,6 +460,9 @@ def kill_proc_team(wcomms, timeout): wcomm.result(timeout=timeout) except Timeout: wcomm.terminate() + wcomm.handle = None + wcomm.inbox = None + wcomm.outbox = None def libE_local(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs, H0): diff --git a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py index bd40d5c4c..0fc8192f7 100644 --- a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py +++ b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py @@ -110,6 +110,8 @@ if run == 0: libE_specs["gen_num_procs"] = 2 elif run == 1: + if gen_on_manager: + print("SECOND LIBE CALL WITH GEN ON MANAGER") libE_specs["gen_num_gpus"] = 1 elif run == 2: persis_info["gen_num_gpus"] = 1 From 6d0f9d2849c63f69fb14f3ec14d3f35e86dfed57 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 8 Mar 2024 16:08:56 -0600 Subject: [PATCH 50/76] cleaning up debugging, removing comm from Executor upon worker exiting --- libensemble/comms/comms.py | 6 ------ libensemble/comms/logs.py | 1 - libensemble/libE.py | 3 --- libensemble/manager.py | 1 - libensemble/worker.py | 1 + 5 files changed, 1 insertion(+), 11 deletions(-) diff --git a/libensemble/comms/comms.py b/libensemble/comms/comms.py index 2b31cf5b9..51042c463 100644 --- a/libensemble/comms/comms.py +++ b/libensemble/comms/comms.py @@ -255,9 +255,6 @@ def terminate(self, timeout=None): self.handle.join(timeout=timeout) if self.running: raise Timeout() - self.handle = None - self.inbox = None - self.outbox = None class QCommProcess(QCommLocal): @@ -277,6 +274,3 @@ def terminate(self, timeout=None): self.handle.join(timeout=timeout) if self.running: raise Timeout() - self.handle = None - self.inbox = None - self.outbox = None diff --git a/libensemble/comms/logs.py b/libensemble/comms/logs.py index de2454f8d..47f85f351 100644 --- a/libensemble/comms/logs.py +++ b/libensemble/comms/logs.py @@ -204,7 +204,6 @@ def exit_logger(): stat_timer.stop() stat_logger.info(f"Exiting ensemble at: {stat_timer.date_end} Time Taken: {stat_timer.elapsed}") stat_logger.handlers[0].close() - print("Manager 
logger closed") # If closing logs - each libE() call will log to a new file. # fh.close() diff --git a/libensemble/libE.py b/libensemble/libE.py index b5ddaa330..b283a82b4 100644 --- a/libensemble/libE.py +++ b/libensemble/libE.py @@ -460,9 +460,6 @@ def kill_proc_team(wcomms, timeout): wcomm.result(timeout=timeout) except Timeout: wcomm.terminate() - wcomm.handle = None - wcomm.inbox = None - wcomm.outbox = None def libE_local(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs, H0): diff --git a/libensemble/manager.py b/libensemble/manager.py index 094ef839b..69117916d 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -304,7 +304,6 @@ def _kill_workers(self) -> None: self.wcomms[w].send(STOP_TAG, MAN_SIGNAL_FINISH) if w == 0: self.wcomms[0].result() - self.wcomms[0] = None # --- Checkpointing logic diff --git a/libensemble/worker.py b/libensemble/worker.py index fcf0a5c57..1a96dbdd5 100644 --- a/libensemble/worker.py +++ b/libensemble/worker.py @@ -415,3 +415,4 @@ def run(self) -> None: self.gen_runner.shutdown() self.sim_runner.shutdown() self.EnsembleDirectory.copy_back() + Executor.executor.comm = None From 97c2c53aceed96b7b70e6501bd21661d510edb46 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 8 Mar 2024 16:12:10 -0600 Subject: [PATCH 51/76] clarification comment --- libensemble/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/worker.py b/libensemble/worker.py index 1a96dbdd5..bfbb82659 100644 --- a/libensemble/worker.py +++ b/libensemble/worker.py @@ -415,4 +415,4 @@ def run(self) -> None: self.gen_runner.shutdown() self.sim_runner.shutdown() self.EnsembleDirectory.copy_back() - Executor.executor.comm = None + Executor.executor.comm = None # so Executor can be pickled upon further libE calls From 73d4b4c6d1d0f92d86f7956351f6aa5b8cab7069 Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 11 Mar 2024 10:06:26 -0500 Subject: [PATCH 52/76] bugfix --- libensemble/worker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libensemble/worker.py b/libensemble/worker.py index bfbb82659..10823ad8a 100644 --- a/libensemble/worker.py +++ b/libensemble/worker.py @@ -415,4 +415,5 @@ def run(self) -> None: self.gen_runner.shutdown() self.sim_runner.shutdown() self.EnsembleDirectory.copy_back() - Executor.executor.comm = None # so Executor can be pickled upon further libE calls + if Executor.executor is not None: + Executor.executor.comm = None # so Executor can be pickled upon further libE calls From 13fecde93a3f147f0d47f3781cca565be0c23faf Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 13 Mar 2024 09:47:48 -0500 Subject: [PATCH 53/76] filter for gen_workers within avail_worker_ids, if set and there are gen_workers. solution resembles zrw, like shuds predicted all along! --- libensemble/alloc_funcs/start_only_persistent.py | 2 +- libensemble/tools/alloc_support.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/libensemble/alloc_funcs/start_only_persistent.py b/libensemble/alloc_funcs/start_only_persistent.py index ee9d4105f..78b17ab87 100644 --- a/libensemble/alloc_funcs/start_only_persistent.py +++ b/libensemble/alloc_funcs/start_only_persistent.py @@ -115,7 +115,7 @@ def only_persistent_gens(W, H, sim_specs, gen_specs, alloc_specs, persis_info, l # Start persistent gens if no worker to give out. Uses zero_resource_workers if defined. 
From 13fecde93a3f147f0d47f3781cca565be0c23faf Mon Sep 17 00:00:00 2001
From: jlnav
Date: Wed, 13 Mar 2024 09:47:48 -0500
Subject: [PATCH 53/76] filter for gen_workers within avail_worker_ids, if set
 and there are gen_workers. Solution resembles zrw, as shuds predicted all
 along!

---
 libensemble/alloc_funcs/start_only_persistent.py | 2 +-
 libensemble/tools/alloc_support.py | 11 ++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/libensemble/alloc_funcs/start_only_persistent.py b/libensemble/alloc_funcs/start_only_persistent.py
index ee9d4105f..78b17ab87 100644
--- a/libensemble/alloc_funcs/start_only_persistent.py
+++ b/libensemble/alloc_funcs/start_only_persistent.py
@@ -115,7 +115,7 @@ def only_persistent_gens(W, H, sim_specs, gen_specs, alloc_specs, persis_info, l
     # Start persistent gens if no worker to give out. Uses zero_resource_workers if defined.
     if not np.any(points_to_evaluate):
-        avail_workers = support.avail_worker_ids(persistent=False, zero_resource_workers=True)
+        avail_workers = support.avail_worker_ids(persistent=False, zero_resource_workers=True, gen_workers=True)

         for wid in avail_workers:
             if gen_count < user.get("num_active_gens", 1):
diff --git a/libensemble/tools/alloc_support.py b/libensemble/tools/alloc_support.py
index 9b25a267d..0d4ce91d8 100644
--- a/libensemble/tools/alloc_support.py
+++ b/libensemble/tools/alloc_support.py
@@ -87,13 +87,13 @@ def assign_resources(self, rsets_req, use_gpus=None, user_params=[]):
             rset_team = self.sched.assign_resources(rsets_req, use_gpus, user_params)
         return rset_team

-    def avail_worker_ids(self, persistent=None, active_recv=False, zero_resource_workers=None, gen_workers=False):
+    def avail_worker_ids(self, persistent=None, active_recv=False, zero_resource_workers=None, gen_workers=None):
         """Returns available workers as a list of IDs, filtered by the given options.

         :param persistent: (Optional) Int. Only return workers with given ``persis_state`` (1=sim, 2=gen).
         :param active_recv: (Optional) Boolean. Only return workers with given active_recv state.
         :param zero_resource_workers: (Optional) Boolean. Only return workers that require no resources.
-        :param gen_workers: (Optional) Boolean. If True, return gen-only workers.
+        :param gen_workers: (Optional) Boolean. If True, return gen-only workers. If False, return all other workers.

         :returns: List of worker IDs.

         If there are no zero resource workers defined, then the ``zero_resource_workers`` argument will
@@ -119,16 +119,17 @@ def fltr_recving():
             return wrk["active"] == 0

         def fltr_gen_workers():
-            if gen_workers:
-                return wrk["gen_worker"]
-            else:
+            if no_gen_workers or gen_workers is None:
                 return True
+            return wrk["gen_worker"] == gen_workers

         if active_recv and not persistent:
             raise AllocException("Cannot ask for non-persistent active receive workers")

         # If there are no zero resource workers - then ignore zrw (i.e., use only if they exist)
         no_zrw = not any(self.W["zero_resource_worker"])
+        no_gen_workers = not any(self.W["gen_worker"])
+
         wrks = []
         for wrk in self.W:
             if fltr_recving() and fltr_persis() and fltr_zrw() and fltr_gen_workers():

From 0bcfc798f9f9b79b00c05288b6819d5ad8d4c5f5 Mon Sep 17 00:00:00 2001
From: jlnav
Date: Wed, 13 Mar 2024 14:28:51 -0500
Subject: [PATCH 54/76] refactor give_sim_work_first for running on gen_workers
 if no points_to_evaluate. add test for mixed existing sample plus calling a
 gen

---
 .../alloc_funcs/give_sim_work_first.py | 10 ++-
 .../test_evaluate_existing_plus_gen.py | 70 +++++++++++++++++++
 2 files changed, 77 insertions(+), 3 deletions(-)
 create mode 100644 libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py

diff --git a/libensemble/alloc_funcs/give_sim_work_first.py b/libensemble/alloc_funcs/give_sim_work_first.py
index a7aa74d3b..1e528917b 100644
--- a/libensemble/alloc_funcs/give_sim_work_first.py
+++ b/libensemble/alloc_funcs/give_sim_work_first.py
@@ -64,15 +64,19 @@ def give_sim_work_first(
     Work = {}
     points_to_evaluate = ~H["sim_started"] & ~H["cancel_requested"]
-    for wid in support.avail_worker_ids():
-        if np.any(points_to_evaluate):
+
+    if np.any(points_to_evaluate):
+        for wid in support.avail_worker_ids(gen_workers=False):
             sim_ids_to_send = support.points_by_priority(H, points_avail=points_to_evaluate, batch=batch_give)
             try:
                 Work[wid] = support.sim_work(wid, H, sim_specs["in"], sim_ids_to_send, persis_info.get(wid))
             except InsufficientFreeResources:
                 break
             points_to_evaluate[sim_ids_to_send] = False
-        else:
+            if not np.any(points_to_evaluate):
+                break
+    else:
+        for wid in support.avail_worker_ids(gen_workers=True):
             # Allow at most num_active_gens active generator instances
             if gen_count >= user.get("num_active_gens", gen_count + 1):
                 break
diff --git a/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py b/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py
new file mode 100644
index 000000000..03d289574
--- /dev/null
+++ b/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py
@@ -0,0 +1,70 @@
+"""
+Test libEnsemble's capability to evalute existing points and then generate
+new samples via gen_on_manager.
+
+Execute via one of the following commands (e.g. 3 workers):
+   mpiexec -np 4 python test_evaluate_existing_sample.py
+   python test_evaluate_existing_sample.py --nworkers 3 --comms local
+   python test_evaluate_existing_sample.py --nworkers 3 --comms tcp
+
+The number of concurrent evaluations of the objective function will be 4-1=3.
+"""
+
+# Do not change these lines - they are parsed by run-tests.sh
+# TESTSUITE_COMMS: mpi local tcp
+# TESTSUITE_NPROCS: 2 4
+
+import numpy as np
+
+# Import libEnsemble items for this test
+from libensemble import Ensemble
+from libensemble.gen_funcs.sampling import latin_hypercube_sample as gen_f
+from libensemble.sim_funcs.six_hump_camel import six_hump_camel as sim_f
+from libensemble.specs import ExitCriteria, GenSpecs, SimSpecs
+from libensemble.tools import add_unique_random_streams
+
+
+def create_H0(persis_info, gen_specs, H0_size):
+    """Create an H0 for give_pregenerated_sim_work"""
+    # Manually creating H0
+    ub = gen_specs["user"]["ub"]
+    lb = gen_specs["user"]["lb"]
+    n = len(lb)
+    b = H0_size
+
+    H0 = np.zeros(b, dtype=[("x", float, 2), ("sim_id", int), ("sim_started", bool)])
+    H0["x"] = persis_info[0]["rand_stream"].uniform(lb, ub, (b, n))
+    H0["sim_id"] = range(b)
+    H0["sim_started"] = False
+    return H0
+
+
+# Main block is necessary only when using local comms with spawn start method (default on macOS and Windows).
+if __name__ == "__main__":
+
+    sampling = Ensemble(parse_args=True)
+    sampling.libE_specs.gen_on_manager = True
+    sampling.sim_specs = SimSpecs(sim_f=sim_f, inputs=["x"], out=[("f", float)])
+
+    gen_specs = {
+        "gen_f": gen_f,
+        "outputs": [("x", float, (2,))],
+        "user": {
+            "gen_batch_size": 50,
+            "lb": np.array([-3, -3]),
+            "ub": np.array([3, 3]),
+        },
+    }
+    sampling.gen_specs = GenSpecs(**gen_specs)
+    sampling.exit_criteria = ExitCriteria(sim_max=100)
+    sampling.persis_info = add_unique_random_streams({}, sampling.nworkers + 1)
+    sampling.H0 = create_H0(sampling.persis_info, gen_specs, 50)
+    sampling.run()
+
+    if sampling.is_manager:
+        assert len(sampling.H) == 2 * len(sampling.H0)
+        assert np.array_equal(sampling.H0["x"][:50], sampling.H["x"][:50])
+        assert np.all(sampling.H["sim_ended"])
+        assert np.all(sampling.H["gen_worker"] == 0)
+        print("\nlibEnsemble correctly didn't add anything to initial sample")
+        sampling.save_output(__file__)
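
The control flow patch 54 introduces is easiest to see in isolation: while any point in H is unevaluated, only non-gen workers receive sim work, and the gen branch is reached only once the backlog is empty. A pure-NumPy sketch of the masks used above (no libEnsemble imports; the field values are illustrative):

    import numpy as np

    H = np.zeros(5, dtype=[("sim_started", bool), ("cancel_requested", bool)])
    H["sim_started"][:3] = True      # three points already handed out
    H["cancel_requested"][4] = True  # one point cancelled

    points_to_evaluate = ~H["sim_started"] & ~H["cancel_requested"]
    print(points_to_evaluate)  # [False False False  True False]

    if np.any(points_to_evaluate):
        print("sim work -> avail_worker_ids(gen_workers=False)")
    else:
        print("gen work -> avail_worker_ids(gen_workers=True)")
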
From 45cbd1605ff83882c9be6c430c391642894989d6 Mon Sep 17 00:00:00 2001
From: jlnav
Date: Wed, 13 Mar 2024 17:10:27 -0500
Subject: [PATCH 55/76] it turns out that values set by validators are still
 considered "unset". So for updating purposes for libE_specs, we want to
 exclude fields that are still set to their defaults

---
 libensemble/ensemble.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libensemble/ensemble.py b/libensemble/ensemble.py
index b037d0bc3..6f47c3dfc 100644
--- a/libensemble/ensemble.py
+++ b/libensemble/ensemble.py
@@ -327,7 +327,7 @@ def libE_specs(self, new_specs):
             return

         # Cast new libE_specs temporarily to dict
         if not isinstance(new_specs, dict):
-            new_specs = specs_dump(new_specs, by_alias=True, exclude_none=True, exclude_unset=True)
+            new_specs = specs_dump(new_specs, by_alias=True, exclude_none=True, exclude_defaults=True)

         # Unset "comms" if we already have a libE_specs that contains that field, that came from parse_args
         if new_specs.get("comms") and hasattr(self._libE_specs, "comms") and self.parsed:

From 2bc504c79f7eb1ee52fd7f1a47414e5cded0004f Mon Sep 17 00:00:00 2001
From: jlnav
Date: Wed, 13 Mar 2024 17:44:27 -0500
Subject: [PATCH 56/76] starting to create unit test

---
 libensemble/tests/unit_tests/test_ensemble.py | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/libensemble/tests/unit_tests/test_ensemble.py b/libensemble/tests/unit_tests/test_ensemble.py
index 75e96a31e..0c7d4554c 100644
--- a/libensemble/tests/unit_tests/test_ensemble.py
+++ b/libensemble/tests/unit_tests/test_ensemble.py
@@ -3,6 +3,7 @@
 import numpy as np

 import libensemble.tests.unit_tests.setup as setup
+from libensemble.resources.platforms import PerlmutterGPU
 from libensemble.utils.misc import pydanticV1, specs_dump


@@ -166,6 +167,25 @@ def test_flakey_workflow():
     assert not flag, "should've caught input errors"


+def test_ensemble_specs_update_libE_specs():
+
+    from libensemble.ensemble import Ensemble
+    from libensemble.specs import LibeSpecs
+
+    platform_specs = PerlmutterGPU()
+
+    ensemble = Ensemble(
+        libE_specs=LibeSpecs(comms="local", nworkers=4),
+    )
+
+    ensemble.libE_specs = LibeSpecs(
+        num_resource_sets=ensemble.nworkers - 1,
+        resource_info={"gpus_on_node": 4},
+        use_workflow_dir=True,
+        platform_specs=platform_specs,
+    )
+
+
 if __name__ == "__main__":
     test_ensemble_init()
     test_ensemble_parse_args_false()
@@ -173,3 +193,4 @@
     test_bad_func_loads()
     test_full_workflow()
     test_flakey_workflow()
+    test_ensemble_specs_update_libE_specs()

From aa4db8a52f41e5aca1bc33b16e7bbe88b37b82d7 Mon Sep 17 00:00:00 2001
From: jlnav
Date: Thu, 14 Mar 2024 09:47:00 -0500
Subject: [PATCH 57/76] finish up unit test

---
 libensemble/tests/unit_tests/test_ensemble.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/libensemble/tests/unit_tests/test_ensemble.py b/libensemble/tests/unit_tests/test_ensemble.py
index 0c7d4554c..11b16524b 100644
--- a/libensemble/tests/unit_tests/test_ensemble.py
+++ b/libensemble/tests/unit_tests/test_ensemble.py
@@ -3,7 +3,6 @@
 import numpy as np

 import libensemble.tests.unit_tests.setup as setup
-from libensemble.resources.platforms import PerlmutterGPU
 from libensemble.utils.misc import pydanticV1, specs_dump


@@ -168,8 +167,9 @@ def test_flakey_workflow():

 def test_ensemble_specs_update_libE_specs():
-
+    """Test that libE_specs is updated as expected with .attribute setting"""
     from libensemble.ensemble import Ensemble
+    from libensemble.resources.platforms import PerlmutterGPU
     from libensemble.specs import LibeSpecs

     platform_specs = PerlmutterGPU()
@@ -185,6 +185,10 @@
         platform_specs=platform_specs,
     )

+    assert ensemble.libE_specs.num_resource_sets == ensemble.nworkers - 1
+    assert len(str(ensemble.libE_specs.workflow_dir_path)) > 1
+    assert ensemble.libE_specs.platform_specs == specs_dump(platform_specs, exclude_none=True)
+

 if __name__ == "__main__":
     test_ensemble_init()

From 429adb426ea221d939633ba81a1a479c37f3930f Mon Sep 17 00:00:00 2001
From: jlnav
Date: Wed, 13 Mar 2024 17:10:27 -0500
Subject: [PATCH 58/76] it turns out that values set by validators are still
 considered "unset". So for updating purposes for libE_specs, we want to
 exclude fields that are still set to their defaults

---
 libensemble/ensemble.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libensemble/ensemble.py b/libensemble/ensemble.py
index b037d0bc3..6f47c3dfc 100644
--- a/libensemble/ensemble.py
+++ b/libensemble/ensemble.py
@@ -327,7 +327,7 @@ def libE_specs(self, new_specs):
             return

         # Cast new libE_specs temporarily to dict
         if not isinstance(new_specs, dict):
-            new_specs = specs_dump(new_specs, by_alias=True, exclude_none=True, exclude_unset=True)
+            new_specs = specs_dump(new_specs, by_alias=True, exclude_none=True, exclude_defaults=True)

         # Unset "comms" if we already have a libE_specs that contains that field, that came from parse_args
         if new_specs.get("comms") and hasattr(self._libE_specs, "comms") and self.parsed:

From b1f9108e4791d193d28318315edc1ba231a28dd5 Mon Sep 17 00:00:00 2001
From: jlnav
Date: Wed, 13 Mar 2024 17:44:27 -0500
Subject: [PATCH 59/76] starting to create unit test

---
 libensemble/tests/unit_tests/test_ensemble.py | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/libensemble/tests/unit_tests/test_ensemble.py b/libensemble/tests/unit_tests/test_ensemble.py
index 75e96a31e..0c7d4554c 100644
--- a/libensemble/tests/unit_tests/test_ensemble.py
+++ b/libensemble/tests/unit_tests/test_ensemble.py
@@ -3,6 +3,7 @@
 import numpy as np

 import libensemble.tests.unit_tests.setup as setup
+from libensemble.resources.platforms import PerlmutterGPU
 from libensemble.utils.misc import pydanticV1, specs_dump


@@ -166,6 +167,25 @@ def test_flakey_workflow():
     assert not flag, "should've caught input errors"


+def test_ensemble_specs_update_libE_specs():
+
+    from libensemble.ensemble import Ensemble
+    from libensemble.specs import LibeSpecs
+
+    platform_specs = PerlmutterGPU()
+
+    ensemble = Ensemble(
+        libE_specs=LibeSpecs(comms="local", nworkers=4),
+    )
+
+    ensemble.libE_specs = LibeSpecs(
+        num_resource_sets=ensemble.nworkers - 1,
+        resource_info={"gpus_on_node": 4},
+        use_workflow_dir=True,
+        platform_specs=platform_specs,
+    )
+
+
 if __name__ == "__main__":
     test_ensemble_init()
     test_ensemble_parse_args_false()
@@ -173,3 +193,4 @@
     test_bad_func_loads()
     test_full_workflow()
     test_flakey_workflow()
+    test_ensemble_specs_update_libE_specs()

From 6fa18efbefbc3670a61a10375f72e6fab70e78e7 Mon Sep 17 00:00:00 2001
From: jlnav
Date: Thu, 14 Mar 2024 09:47:00 -0500
Subject: [PATCH 60/76] finish up unit test

---
 libensemble/tests/unit_tests/test_ensemble.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/libensemble/tests/unit_tests/test_ensemble.py b/libensemble/tests/unit_tests/test_ensemble.py
index 0c7d4554c..11b16524b 100644
--- a/libensemble/tests/unit_tests/test_ensemble.py
+++ b/libensemble/tests/unit_tests/test_ensemble.py
@@ -3,7 +3,6 @@
 import numpy as np

 import libensemble.tests.unit_tests.setup as setup
-from libensemble.resources.platforms import PerlmutterGPU
 from libensemble.utils.misc import pydanticV1, specs_dump


@@ -168,8 +167,9 @@ def test_flakey_workflow():

 def test_ensemble_specs_update_libE_specs():
-
+    """Test that libE_specs is updated as expected with .attribute setting"""
     from libensemble.ensemble import Ensemble
+    from libensemble.resources.platforms import PerlmutterGPU
     from libensemble.specs import LibeSpecs

     platform_specs = PerlmutterGPU()
@@ -185,6 +185,10 @@
         platform_specs=platform_specs,
     )

+    assert ensemble.libE_specs.num_resource_sets == ensemble.nworkers - 1
+    assert len(str(ensemble.libE_specs.workflow_dir_path)) > 1
+    assert ensemble.libE_specs.platform_specs == specs_dump(platform_specs, exclude_none=True)
+

 if __name__ == "__main__":
     test_ensemble_init()

From dbdf88f1e8992b648b323f95f1e2f4ce939d7e40 Mon Sep 17 00:00:00 2001
From: jlnav
Date: Thu, 14 Mar 2024 15:10:58 -0500
Subject: [PATCH 61/76] platform_specs sometimes seems to be at risk of
 disappearing when we convert LibeSpecs to dict, so let's save it and reinsert

---
 libensemble/ensemble.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/libensemble/ensemble.py b/libensemble/ensemble.py
index 6f47c3dfc..545443851 100644
--- a/libensemble/ensemble.py
+++ b/libensemble/ensemble.py
@@ -326,8 +326,14 @@ def libE_specs(self, new_specs):
             return

         # Cast new libE_specs temporarily to dict
-        if not isinstance(new_specs, dict):
-            new_specs = specs_dump(new_specs, by_alias=True, exclude_none=True, exclude_defaults=True)
+        if not isinstance(new_specs, dict):  # exclude_defaults should only be enabled with Pydantic v2
+            platform_specs_set = False
+            if new_specs.platform_specs != {}:  # bugginess across Pydantic versions for recursively casting to dict
+                platform_specs_set = True
+                platform_specs = new_specs.platform_specs
+            new_specs = specs_dump(new_specs, exclude_none=True, exclude_defaults=True)
+            if platform_specs_set:
+                new_specs["platform_specs"] = specs_dump(platform_specs, exclude_none=True)

         # Unset "comms" if we already have a libE_specs that contains that field, that came from parse_args
         if new_specs.get("comms") and hasattr(self._libE_specs, "comms") and self.parsed:

From e4d4b0863bbd69443a1157a9dff0c88adcfe1b2a Mon Sep 17 00:00:00 2001
From: jlnav
Date: Fri, 15 Mar 2024 10:26:16 -0500
Subject: [PATCH 62/76] refactor fast_alloc for gen workers

---
 libensemble/alloc_funcs/fast_alloc.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/libensemble/alloc_funcs/fast_alloc.py b/libensemble/alloc_funcs/fast_alloc.py
index ccb2fec56..8255df8a8 100644
--- a/libensemble/alloc_funcs/fast_alloc.py
+++ b/libensemble/alloc_funcs/fast_alloc.py
@@ -32,19 +32,20 @@ def give_sim_work_first(W, H, sim_specs, gen_specs, alloc_specs, persis_info, li
     Work = {}
     gen_in = gen_specs.get("in", [])

-    for wid in support.avail_worker_ids():
-        persis_info = support.skip_canceled_points(H, persis_info)
+    persis_info = support.skip_canceled_points(H, persis_info)

-        # Give sim work if possible
-        if persis_info["next_to_give"] < len(H):
+    # Give sim work if possible
+    if persis_info["next_to_give"] < len(H):
+        for wid in support.avail_worker_ids(gen_workers=False):
             try:
                 Work[wid] = support.sim_work(wid, H, sim_specs["in"], [persis_info["next_to_give"]], [])
             except InsufficientFreeResources:
                 break
             persis_info["next_to_give"] += 1

-        elif gen_count < user.get("num_active_gens", gen_count + 1):
-            # Give gen work
+    # Give gen work if possible
+    elif gen_count < user.get("num_active_gens", gen_count + 1):
+        for wid in support.avail_worker_ids(gen_workers=True):
             return_rows = range(len(H)) if gen_in else []
             try:
                 Work[wid] = support.gen_work(wid, gen_in, return_rows, persis_info.get(wid))

From ffbe6c90679ed521a8276181924c5aaaf9dd0237 Mon Sep 17 00:00:00 2001
From: jlnav
Date: Fri, 15 Mar 2024 10:27:48 -0500
Subject: [PATCH 63/76] better test comment

---
 .../functionality_tests/test_evaluate_existing_plus_gen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py b/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py
index 03d289574..f9c2f3403 100644
--- a/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py
+++ b/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py
@@ -66,5 +66,5 @@ def create_H0(persis_info, gen_specs, H0_size):
         assert np.array_equal(sampling.H0["x"][:50], sampling.H["x"][:50])
         assert np.all(sampling.H["sim_ended"])
         assert np.all(sampling.H["gen_worker"] == 0)
-        print("\nlibEnsemble correctly didn't add anything to initial sample")
+        print("\nlibEnsemble correctly appended to the initial sample via an additional gen.")
         sampling.save_output(__file__)
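
A side note on the exclude_unset/exclude_defaults switch in patches 55/58: the two dump modes answer different questions, which is what the commit messages are getting at. A hedged sketch using Pydantic v2 names directly (the repo's specs_dump wrapper abstracts over Pydantic versions, so exact behavior there may differ):

    from pydantic import BaseModel

    class Specs(BaseModel):
        comms: str = "mpi"
        nworkers: int = 0

    # comms is passed explicitly but happens to equal its default
    s = Specs(comms="mpi", nworkers=4)

    print(s.model_dump(exclude_unset=True))     # {'comms': 'mpi', 'nworkers': 4}
    print(s.model_dump(exclude_defaults=True))  # {'nworkers': 4}

When merging a user-supplied LibeSpecs into an existing one, exclude_defaults keeps only fields that genuinely differ from the class defaults, so default-valued fields (including ones a validator filled in) cannot clobber settings already in place.
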
From 14c8b1fbfdb448781ef51de82b5b590984e2e9a7 Mon Sep 17 00:00:00 2001
From: jlnav
Date: Fri, 15 Mar 2024 13:56:58 -0500
Subject: [PATCH 64/76] refactor inverse_bayes_allocf

---
 libensemble/alloc_funcs/inverse_bayes_allocf.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/libensemble/alloc_funcs/inverse_bayes_allocf.py b/libensemble/alloc_funcs/inverse_bayes_allocf.py
index 56a3f6e79..e0521df6f 100644
--- a/libensemble/alloc_funcs/inverse_bayes_allocf.py
+++ b/libensemble/alloc_funcs/inverse_bayes_allocf.py
@@ -42,8 +42,9 @@ def only_persistent_gens_for_inverse_bayes(W, H, sim_specs, gen_specs, alloc_spe
             Work[wid] = support.gen_work(wid, ["like"], inds_to_send_back, persis_info.get(wid), persistent=True)

     points_to_evaluate = ~H["sim_started"] & ~H["cancel_requested"]
-    for wid in support.avail_worker_ids(persistent=False):
-        if np.any(points_to_evaluate):
+    if np.any(points_to_evaluate):
+        for wid in support.avail_worker_ids(persistent=False, gen_workers=False):
+            # perform sim evaluations (if any point hasn't been given).
             sim_subbatches = H["subbatch"][points_to_evaluate]
             sim_inds = sim_subbatches == np.min(sim_subbatches)
@@ -54,13 +55,11 @@ def only_persistent_gens_for_inverse_bayes(W, H, sim_specs, gen_specs, alloc_spe
             except InsufficientFreeResources:
                 break
             points_to_evaluate[sim_ids_to_send] = False
-
-        elif gen_count == 0:
-            # Finally, generate points since there is nothing else to do.
-            try:
-                Work[wid] = support.gen_work(wid, gen_specs["in"], [], persis_info.get(wid), persistent=True)
-            except InsufficientFreeResources:
+            if not np.any(points_to_evaluate):
                 break
-            gen_count += 1
+
+    elif gen_count == 0:
+        wid = support.avail_worker_ids(persistent=False, gen_workers=True)[0]
+        Work[wid] = support.gen_work(wid, gen_specs["in"], [], persis_info.get(wid), persistent=True)

     return Work, persis_info

From 45e99b2a98d5b034824e18242ecc29ca22efe417 Mon Sep 17 00:00:00 2001
From: jlnav
Date: Fri, 15 Mar 2024 14:50:27 -0500
Subject: [PATCH 65/76] trying to refactor only_one_gen_alloc, but currently
 doesn't pass test_uniform_sampling_cancel

---
 libensemble/alloc_funcs/only_one_gen_alloc.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/libensemble/alloc_funcs/only_one_gen_alloc.py b/libensemble/alloc_funcs/only_one_gen_alloc.py
index fe9a0f59c..151ae0bbf 100644
--- a/libensemble/alloc_funcs/only_one_gen_alloc.py
+++ b/libensemble/alloc_funcs/only_one_gen_alloc.py
@@ -21,22 +21,26 @@ def ensure_one_active_gen(W, H, sim_specs, gen_specs, alloc_specs, persis_info,
     gen_flag = True
     gen_in = gen_specs.get("in", [])

-    for wid in support.avail_worker_ids():
-        persis_info = support.skip_canceled_points(H, persis_info)
+    persis_info = support.skip_canceled_points(H, persis_info)

-        if persis_info["next_to_give"] < len(H):
+    if persis_info["next_to_give"] < len(H):
+        for wid in support.avail_worker_ids(gen_workers=False):
             try:
                 Work[wid] = support.sim_work(wid, H, sim_specs["in"], [persis_info["next_to_give"]], [])
             except InsufficientFreeResources:
                 break
             persis_info["next_to_give"] += 1
+            if persis_info["next_to_give"] >= len(H):
+                break
+
+    elif not support.test_any_gen() and gen_flag:
+        # Give gen work
+        return_rows = range(len(H)) if gen_in else []
+        for wid in support.avail_worker_ids(gen_workers=True):

-        elif not support.test_any_gen() and gen_flag:
             if not support.all_sim_ended(H):
                 break

-            # Give gen work
-            return_rows = range(len(H)) if gen_in else []
             try:
                 Work[wid] = support.gen_work(wid, gen_in, return_rows, persis_info.get(wid))
             except InsufficientFreeResources:

From 6f713dc4788a613928e451f7487cafbb91f28f74 Mon Sep 17 00:00:00 2001
From: jlnav
Date: Fri, 15 Mar 2024 15:02:04 -0500
Subject: [PATCH 66/76] refactor aposmm alloc, move skip_canceled_points line

---
 libensemble/alloc_funcs/only_one_gen_alloc.py | 3 +--
 libensemble/alloc_funcs/persistent_aposmm_alloc.py | 8 ++++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/libensemble/alloc_funcs/only_one_gen_alloc.py b/libensemble/alloc_funcs/only_one_gen_alloc.py
index 151ae0bbf..cc4fc3e49 100644
--- a/libensemble/alloc_funcs/only_one_gen_alloc.py
+++ b/libensemble/alloc_funcs/only_one_gen_alloc.py
@@ -21,10 +21,9 @@ def ensure_one_active_gen(W, H, sim_specs, gen_specs, alloc_specs, persis_info,
     gen_flag = True
     gen_in = gen_specs.get("in", [])

-    persis_info = support.skip_canceled_points(H, persis_info)
-
     if persis_info["next_to_give"] < len(H):
         for wid in support.avail_worker_ids(gen_workers=False):
+            persis_info = support.skip_canceled_points(H, persis_info)
             try:
                 Work[wid] = support.sim_work(wid, H, sim_specs["in"], [persis_info["next_to_give"]], [])
             except InsufficientFreeResources:
diff --git a/libensemble/alloc_funcs/persistent_aposmm_alloc.py b/libensemble/alloc_funcs/persistent_aposmm_alloc.py
index 8327d3975..3b87d5b5b 100644
--- a/libensemble/alloc_funcs/persistent_aposmm_alloc.py
+++ b/libensemble/alloc_funcs/persistent_aposmm_alloc.py
@@ -53,7 +53,7 @@ def persistent_aposmm_alloc(W, H, sim_specs, gen_specs, alloc_specs, persis_info
         )
         returned_but_not_given[point_ids] = False

-    for wid in support.avail_worker_ids(persistent=False):
+    for wid in support.avail_worker_ids(persistent=False, gen_workers=False):
         persis_info = support.skip_canceled_points(H, persis_info)

         if persis_info["next_to_give"] < len(H):
@@ -63,8 +63,11 @@ def persistent_aposmm_alloc(W, H, sim_specs, gen_specs, alloc_specs, persis_info
             except InsufficientFreeResources:
                 break
             persis_info["next_to_give"] += 1
+            if persis_info["next_to_give"] >= len(H):
+                break

-        elif persis_info.get("gen_started") is None:
+    if persis_info.get("gen_started") is None:
+        for wid in support.avail_worker_ids(persistent=False, gen_workers=True):
             # Finally, call a persistent generator as there is nothing else to do.
             persis_info.get(wid)["nworkers"] = len(W)
             try:
@@ -74,5 +77,6 @@ def persistent_aposmm_alloc(W, H, sim_specs, gen_specs, alloc_specs, persis_info
             except InsufficientFreeResources:
                 break
             persis_info["gen_started"] = True  # Must set after - in case break on resources
+            break

     return Work, persis_info

From 4aa386f8e6e8a654743d079e63fb09bcae1d775c Mon Sep 17 00:00:00 2001
From: shudson
Date: Fri, 15 Mar 2024 15:53:47 -0500
Subject: [PATCH 67/76] Update fast_alloc

---
 libensemble/alloc_funcs/fast_alloc.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/libensemble/alloc_funcs/fast_alloc.py b/libensemble/alloc_funcs/fast_alloc.py
index 8255df8a8..f31dee07a 100644
--- a/libensemble/alloc_funcs/fast_alloc.py
+++ b/libensemble/alloc_funcs/fast_alloc.py
@@ -32,11 +32,10 @@ def give_sim_work_first(W, H, sim_specs, gen_specs, alloc_specs, persis_info, li
     Work = {}
     gen_in = gen_specs.get("in", [])

-    persis_info = support.skip_canceled_points(H, persis_info)
-
     # Give sim work if possible
     if persis_info["next_to_give"] < len(H):
         for wid in support.avail_worker_ids(gen_workers=False):
+            persis_info = support.skip_canceled_points(H, persis_info)
             try:
                 Work[wid] = support.sim_work(wid, H, sim_specs["in"], [persis_info["next_to_give"]], [])
             except InsufficientFreeResources:
@@ -44,14 +43,15 @@ def give_sim_work_first(W, H, sim_specs, gen_specs, alloc_specs, persis_info, li
             persis_info["next_to_give"] += 1

     # Give gen work if possible
-    elif gen_count < user.get("num_active_gens", gen_count + 1):
+    if persis_info["next_to_give"] >= len(H):
         for wid in support.avail_worker_ids(gen_workers=True):
-            return_rows = range(len(H)) if gen_in else []
-            try:
-                Work[wid] = support.gen_work(wid, gen_in, return_rows, persis_info.get(wid))
-            except InsufficientFreeResources:
-                break
-            gen_count += 1
-            persis_info["total_gen_calls"] += 1
+            if gen_count < user.get("num_active_gens", gen_count + 1):
+                return_rows = range(len(H)) if gen_in else []
+                try:
+                    Work[wid] = support.gen_work(wid, gen_in, return_rows, persis_info.get(wid))
+                except InsufficientFreeResources:
+                    break
+                gen_count += 1
+                persis_info["total_gen_calls"] += 1

     return Work, persis_info
From 0b12af2f7b06a1127c29ad752ed54e0f3ec4ab4a Mon Sep 17 00:00:00 2001
From: jlnav
Date: Fri, 15 Mar 2024 16:02:37 -0500
Subject: [PATCH 68/76] refactor start_fd_persistent

---
 libensemble/alloc_funcs/start_fd_persistent.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/libensemble/alloc_funcs/start_fd_persistent.py b/libensemble/alloc_funcs/start_fd_persistent.py
index 0c2e939d3..36fba0a73 100644
--- a/libensemble/alloc_funcs/start_fd_persistent.py
+++ b/libensemble/alloc_funcs/start_fd_persistent.py
@@ -49,8 +49,8 @@ def finite_diff_alloc(W, H, sim_specs, gen_specs, alloc_specs, persis_info, libE
     )

     points_to_evaluate = ~H["sim_started"] & ~H["cancel_requested"]
-    for wid in support.avail_worker_ids(persistent=False):
-        if np.any(points_to_evaluate):
+    if np.any(points_to_evaluate):
+        for wid in support.avail_worker_ids(persistent=False, gen_workers=False):
             # perform sim evaluations (if they exist in History).
             sim_ids_to_send = np.nonzero(points_to_evaluate)[0][0]  # oldest point
             try:
@@ -58,13 +58,12 @@ def finite_diff_alloc(W, H, sim_specs, gen_specs, alloc_specs, persis_info, libE
             except InsufficientFreeResources:
                 break
             points_to_evaluate[sim_ids_to_send] = False
-
-        elif gen_count == 0:
-            # Finally, call a persistent generator as there is nothing else to do.
-            try:
-                Work[wid] = support.gen_work(wid, gen_specs.get("in", []), [], persis_info.get(wid), persistent=True)
-            except InsufficientFreeResources:
+            if not np.any(points_to_evaluate):
                 break
-            gen_count += 1
+
+    if gen_count == 0:
+        wid = support.avail_worker_ids(persistent=False, gen_workers=True)[0]
+        Work[wid] = support.gen_work(wid, gen_specs.get("in", []), [], persis_info.get(wid), persistent=True)
+        gen_count += 1

     return Work, persis_info, 0

From 97cdfdb3f13b810581bd43144d6c53ea4575b842 Mon Sep 17 00:00:00 2001
From: jlnav
Date: Fri, 15 Mar 2024 16:25:24 -0500
Subject: [PATCH 69/76] refactor start_persistent_local_opt_gens

---
 .../alloc_funcs/start_persistent_local_opt_gens.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/libensemble/alloc_funcs/start_persistent_local_opt_gens.py b/libensemble/alloc_funcs/start_persistent_local_opt_gens.py
index 12ad45100..918ebbb75 100644
--- a/libensemble/alloc_funcs/start_persistent_local_opt_gens.py
+++ b/libensemble/alloc_funcs/start_persistent_local_opt_gens.py
@@ -54,7 +54,7 @@ def start_persistent_local_opt_gens(W, H, sim_specs, gen_specs, alloc_specs, per
             Work[wid] = support.gen_work(wid, gen_specs["persis_in"], last_ind, persis_info[wid], persistent=True)
             persis_info[wid]["run_order"].append(last_ind)

-    for wid in support.avail_worker_ids(persistent=False):
+    for wid in support.avail_worker_ids(persistent=False, gen_workers=True):
         # Find candidates to start local opt runs if a sample has been evaluated
         if np.any(np.logical_and(~H["local_pt"], H["sim_ended"], ~H["cancel_requested"])):
             n = len(H["x"][0])
@@ -78,7 +78,8 @@ def start_persistent_local_opt_gens(W, H, sim_specs, gen_specs, alloc_specs, per
             persis_info[wid]["run_order"] = [ind]
             gen_count += 1

-        elif np.any(points_to_evaluate):
+    if np.any(points_to_evaluate):
+        for wid in support.avail_worker_ids(persistent=False, gen_workers=False):
             # Perform sim evaluations from existing runs
             q_inds_logical = np.logical_and(points_to_evaluate, H["local_pt"])
             if not np.any(q_inds_logical):
@@ -89,10 +90,13 @@ def start_persistent_local_opt_gens(W, H, sim_specs, gen_specs, alloc_specs, per
             except InsufficientFreeResources:
                 break
             points_to_evaluate[sim_ids_to_send] = False
+            if not np.any(points_to_evaluate):
+                break

-        elif gen_count == 0 and not np.any(np.logical_and(W["active"] == EVAL_GEN_TAG, W["persis_state"] == 0)):
-            # Finally, generate points since there is nothing else to do (no resource sets req.)
-            Work[wid] = support.gen_work(wid, gen_specs.get("in", []), [], persis_info[wid], rset_team=[])
-            gen_count += 1
+    if gen_count == 0 and not np.any(np.logical_and(W["active"] == EVAL_GEN_TAG, W["persis_state"] == 0)):
+        # Finally, generate points since there is nothing else to do (no resource sets req.)
+        wid = support.avail_worker_ids(persistent=False, gen_workers=True)[0]
+        Work[wid] = support.gen_work(wid, gen_specs.get("in", []), [], persis_info[wid], rset_team=[])
+        gen_count += 1

     return Work, persis_info

From 77f880e1c64f0d98040a2795c7143dfa66a626bc Mon Sep 17 00:00:00 2001
From: Jeffrey Larson
Date: Fri, 15 Mar 2024 20:21:11 -0500
Subject: [PATCH 70/76] typo

---
 .../functionality_tests/test_evaluate_existing_plus_gen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py b/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py
index f9c2f3403..2b601efe3 100644
--- a/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py
+++ b/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py
@@ -1,5 +1,5 @@
 """
-Test libEnsemble's capability to evalute existing points and then generate
+Test libEnsemble's capability to evaluate existing points and then generate
 new samples via gen_on_manager.

 Execute via one of the following commands (e.g. 3 workers):

From 2780f106b4227d534fabac4f9e7287056fd481b3 Mon Sep 17 00:00:00 2001
From: jlnav
Date: Mon, 18 Mar 2024 12:31:31 -0500
Subject: [PATCH 71/76] fast_alloc alloc_f: don't overwrite sim_work with
 gen_work for a given worker

---
 libensemble/alloc_funcs/fast_alloc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libensemble/alloc_funcs/fast_alloc.py b/libensemble/alloc_funcs/fast_alloc.py
index f31dee07a..c74844384 100644
--- a/libensemble/alloc_funcs/fast_alloc.py
+++ b/libensemble/alloc_funcs/fast_alloc.py
@@ -45,7 +45,7 @@ def give_sim_work_first(W, H, sim_specs, gen_specs, alloc_specs, persis_info, li
     # Give gen work if possible
     if persis_info["next_to_give"] >= len(H):
         for wid in support.avail_worker_ids(gen_workers=True):
-            if gen_count < user.get("num_active_gens", gen_count + 1):
+            if wid not in Work and gen_count < user.get("num_active_gens", gen_count + 1):
                 return_rows = range(len(H)) if gen_in else []
                 try:
                     Work[wid] = support.gen_work(wid, gen_in, return_rows, persis_info.get(wid))

From 655a1bab78a23db5d5e2bdeb20be5cdd04ead9d4 Mon Sep 17 00:00:00 2001
From: jlnav
Date: Mon, 18 Mar 2024 13:37:21 -0500
Subject: [PATCH 72/76] return Work after packing up gen work

---
 libensemble/alloc_funcs/fast_alloc.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libensemble/alloc_funcs/fast_alloc.py b/libensemble/alloc_funcs/fast_alloc.py
index c74844384..04e4cd3f4 100644
--- a/libensemble/alloc_funcs/fast_alloc.py
+++ b/libensemble/alloc_funcs/fast_alloc.py
@@ -42,6 +42,8 @@ def give_sim_work_first(W, H, sim_specs, gen_specs, alloc_specs, persis_info, li
                 break
             persis_info["next_to_give"] += 1

+        return Work, persis_info
+
     # Give gen work if possible
     if persis_info["next_to_give"] >= len(H):
         for wid in support.avail_worker_ids(gen_workers=True):

From 15719c7e38fbbc0dfa8f9766571a44a4049c6d8f Mon Sep 17 00:00:00 2001
From: jlnav
Date: Mon, 18 Mar 2024 13:54:59 -0500
Subject: [PATCH 73/76] do next_to_give check within avail_worker_ids loop

---
 libensemble/alloc_funcs/fast_alloc.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/libensemble/alloc_funcs/fast_alloc.py b/libensemble/alloc_funcs/fast_alloc.py
index 04e4cd3f4..bb009740c 100644
--- a/libensemble/alloc_funcs/fast_alloc.py
+++ b/libensemble/alloc_funcs/fast_alloc.py
@@ -33,17 +33,15 @@ def give_sim_work_first(W, H, sim_specs, gen_specs, alloc_specs, persis_info, li
     gen_in = gen_specs.get("in", [])

     # Give sim work if possible
-    if persis_info["next_to_give"] < len(H):
-        for wid in support.avail_worker_ids(gen_workers=False):
-            persis_info = support.skip_canceled_points(H, persis_info)
+    for wid in support.avail_worker_ids(gen_workers=False):
+        persis_info = support.skip_canceled_points(H, persis_info)
+        if persis_info["next_to_give"] < len(H):
             try:
                 Work[wid] = support.sim_work(wid, H, sim_specs["in"], [persis_info["next_to_give"]], [])
             except InsufficientFreeResources:
                 break
             persis_info["next_to_give"] += 1

-        return Work, persis_info
-
     # Give gen work if possible
     if persis_info["next_to_give"] >= len(H):
         for wid in support.avail_worker_ids(gen_workers=True):
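
Before the next patch, note the failure mode that fast_alloc's two-pass loop structure can hit and that patch 71 guards against: a worker eligible for both passes would have its sim assignment silently overwritten by gen work. A toy reproduction of the guard (worker IDs are illustrative):

    Work = {}
    sim_ready = [1, 2]
    gen_ready = [2, 3]  # worker 2 shows up in both availability lists

    for wid in sim_ready:
        Work[wid] = "sim_work"
    for wid in gen_ready:
        if wid not in Work:  # the patch-71 guard: never clobber sim work
            Work[wid] = "gen_work"

    print(Work)  # {1: 'sim_work', 2: 'sim_work', 3: 'gen_work'}
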
From 3138a3958a89c47e962bdd1df3133c0335a185a6 Mon Sep 17 00:00:00 2001
From: jlnav
Date: Mon, 18 Mar 2024 14:32:25 -0500
Subject: [PATCH 74/76] add libE_specs["gen_workers"] option, adjust
 ensure_one_active_gen so multiple gen work orders aren't given out at once

---
 docs/data_structures/libE_specs.rst | 4 ++++
 libensemble/alloc_funcs/only_one_gen_alloc.py | 18 +++++++++---------
 libensemble/manager.py | 4 ++++
 libensemble/specs.py | 6 ++++++
 .../test_persistent_uniform_sampling.py | 5 ++++-
 5 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/docs/data_structures/libE_specs.rst b/docs/data_structures/libE_specs.rst
index b2bb74d58..105335ca4 100644
--- a/docs/data_structures/libE_specs.rst
+++ b/docs/data_structures/libE_specs.rst
@@ -55,6 +55,10 @@ libEnsemble is primarily customized by setting options within a ``LibeSpecs`` cl
         **disable_log_files** [bool] = ``False``:
             Disable ``ensemble.log`` and ``libE_stats.txt`` log files.

+        **gen_workers** [list of ints]:
+            List of workers that should only run generators. All other workers will only
+            run simulator functions.
+
     .. tab-item:: Directories

         .. tab-set::
diff --git a/libensemble/alloc_funcs/only_one_gen_alloc.py b/libensemble/alloc_funcs/only_one_gen_alloc.py
index cc4fc3e49..7eb6a91e0 100644
--- a/libensemble/alloc_funcs/only_one_gen_alloc.py
+++ b/libensemble/alloc_funcs/only_one_gen_alloc.py
@@ -35,16 +35,16 @@ def ensure_one_active_gen(W, H, sim_specs, gen_specs, alloc_specs, persis_info,
     elif not support.test_any_gen() and gen_flag:
         # Give gen work
         return_rows = range(len(H)) if gen_in else []
-        for wid in support.avail_worker_ids(gen_workers=True):
+        wid = support.avail_worker_ids(gen_workers=True)[0]

-            if not support.all_sim_ended(H):
-                break
+        if not support.all_sim_ended(H):
+            return Work, persis_info

-            try:
-                Work[wid] = support.gen_work(wid, gen_in, return_rows, persis_info.get(wid))
-            except InsufficientFreeResources:
-                break
-            gen_flag = False
-            persis_info["total_gen_calls"] += 1
+        try:
+            Work[wid] = support.gen_work(wid, gen_in, return_rows, persis_info.get(wid))
+        except InsufficientFreeResources:
+            return Work, persis_info
+        gen_flag = False
+        persis_info["total_gen_calls"] += 1

     return Work, persis_info
diff --git a/libensemble/manager.py b/libensemble/manager.py
index 69117916d..aa9c53b0c 100644
--- a/libensemble/manager.py
+++ b/libensemble/manager.py
@@ -254,6 +254,10 @@ def __init__(
             if wrk["worker_id"] in gresource.zero_resource_workers:
                 wrk["zero_resource_worker"] = True

+        for wrk in self.W:
+            if wrk["worker_id"] in self.libE_specs.get("gen_workers", []):
+                wrk["gen_worker"] = True
+
         try:
             temp_EnsembleDirectory.make_copyback()
         except AssertionError as e:  # Ensemble dir exists and isn't empty.
diff --git a/libensemble/specs.py b/libensemble/specs.py
index 0073c6cd6..f0a401fe6 100644
--- a/libensemble/specs.py
+++ b/libensemble/specs.py
@@ -457,6 +457,12 @@ class LibeSpecs(BaseModel):
     For use with supported allocation functions.
     """

+    gen_workers: Optional[List[int]] = []
+    """
+    List of workers that should only run generators. All other workers will only
+    run simulator functions.
+    """
+
     resource_info: Optional[dict] = {}
     """
     Resource information to override automatically detected resources.
diff --git a/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py b/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py
index 5470b814d..305521a02 100644
--- a/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py
+++ b/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py
@@ -62,7 +62,7 @@
     libE_specs["kill_canceled_sims"] = False

-    for run in range(4):
+    for run in range(5):
         persis_info = add_unique_random_streams({}, nworkers + 1)
         for i in persis_info:
             persis_info[i]["get_grad"] = True
@@ -88,6 +88,9 @@
             # sim_specs["out"] = [("f", float), ("grad", float, n)]
         elif run == 3:
             libE_specs["gen_on_manager"] = True
+        elif run == 4:
+            libE_specs["gen_on_manager"] = False
+            libE_specs["gen_workers"] = [2]

         # Perform the run
         H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs)

From 209362976c936561fccf05012e1abbc89c0b6895 Mon Sep 17 00:00:00 2001
From: jlnav
Date: Tue, 19 Mar 2024 14:00:10 -0500
Subject: [PATCH 75/76] update give_pregenerated_work and start_only_persistent
 to only give sim work on non-gen-workers

---
 libensemble/alloc_funcs/give_pregenerated_work.py | 2 +-
 libensemble/alloc_funcs/start_only_persistent.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libensemble/alloc_funcs/give_pregenerated_work.py b/libensemble/alloc_funcs/give_pregenerated_work.py
index 1d6edb160..d2b1ee7aa 100644
--- a/libensemble/alloc_funcs/give_pregenerated_work.py
+++ b/libensemble/alloc_funcs/give_pregenerated_work.py
@@ -23,7 +23,7 @@ def give_pregenerated_sim_work(W, H, sim_specs, gen_specs, alloc_specs, persis_i
     if persis_info["next_to_give"] >= len(H):
         return Work, persis_info, 1

-    for i in support.avail_worker_ids():
+    for i in support.avail_worker_ids(gen_workers=False):
         persis_info = support.skip_canceled_points(H, persis_info)

         # Give sim work
diff --git a/libensemble/alloc_funcs/start_only_persistent.py b/libensemble/alloc_funcs/start_only_persistent.py
index 78b17ab87..101a90f51 100644
--- a/libensemble/alloc_funcs/start_only_persistent.py
+++ b/libensemble/alloc_funcs/start_only_persistent.py
@@ -89,7 +89,7 @@ def only_persistent_gens(W, H, sim_specs, gen_specs, alloc_specs, persis_info, l

     # Now the give_sim_work_first part
     points_to_evaluate = ~H["sim_started"] & ~H["cancel_requested"]
-    avail_workers = support.avail_worker_ids(persistent=False, zero_resource_workers=False)
+    avail_workers = support.avail_worker_ids(persistent=False, zero_resource_workers=False, gen_workers=False)
     if user.get("alt_type"):
         avail_workers = list(
             set(support.avail_worker_ids(persistent=False, zero_resource_workers=False))

From 8a50e607e31e15ed91c7b94d5e7e863d2030ec99 Mon Sep 17 00:00:00 2001
From: jlnav
Date: Tue, 19 Mar 2024 14:21:50 -0500
Subject: [PATCH 76/76] refactor fast_alloc_and_pausing

---
 .../alloc_funcs/fast_alloc_and_pausing.py | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/libensemble/alloc_funcs/fast_alloc_and_pausing.py b/libensemble/alloc_funcs/fast_alloc_and_pausing.py
index 4a85f69fb..dfb747cc7 100644
--- a/libensemble/alloc_funcs/fast_alloc_and_pausing.py
+++ b/libensemble/alloc_funcs/fast_alloc_and_pausing.py
@@ -43,9 +43,10 @@ def give_sim_work_first(W, H, sim_specs, gen_specs, alloc_specs, persis_info, li
         for pt_id in persis_info["pt_ids"]:
             persis_info["inds_of_pt_ids"][pt_id] = H["pt_id"] == pt_id

-    idle_workers = support.avail_worker_ids()
+    idle_sim_workers = support.avail_worker_ids(gen_workers=False)
+    idle_gen_workers = support.avail_worker_ids(gen_workers=True)

-    while len(idle_workers):
+    while len(idle_sim_workers):
         pt_ids_to_pause = set()

         # Find indices of H that are not yet given out to be evaluated
@@ -106,15 +107,19 @@ def give_sim_work_first(W, H, sim_specs, gen_specs, alloc_specs, persis_info, li
         if len(persis_info["need_to_give"]) != 0:
             next_row = persis_info["need_to_give"].pop()
-            i = idle_workers[0]
+            i = idle_sim_workers[0]
             try:
                 Work[i] = support.sim_work(i, H, sim_specs["in"], [next_row], [])
             except InsufficientFreeResources:
                 persis_info["need_to_give"].add(next_row)
                 break
-            idle_workers = idle_workers[1:]
+            idle_sim_workers = idle_sim_workers[1:]

-        elif gen_count < alloc_specs["user"].get("num_active_gens", gen_count + 1):
+        else:
+            break
+
+    while len(idle_gen_workers):
+        if gen_count < alloc_specs["user"].get("num_active_gens", gen_count + 1):
             lw = persis_info["last_worker"]

             last_size = persis_info.get("last_size")
@@ -126,18 +131,18 @@ def give_sim_work_first(W, H, sim_specs, gen_specs, alloc_specs, persis_info, li
                 break

             # Give gen work
-            i = idle_workers[0]
+            i = idle_gen_workers[0]
             try:
                 Work[i] = support.gen_work(i, gen_specs["in"], range(len(H)), persis_info[lw])
             except InsufficientFreeResources:
                 break
-            idle_workers = idle_workers[1:]
+            idle_gen_workers = idle_gen_workers[1:]
             gen_count += 1
             persis_info["total_gen_calls"] += 1
             persis_info["last_worker"] = i
             persis_info["last_size"] = len(H)
         elif gen_count >= alloc_specs["user"].get("num_active_gens", gen_count + 1):
-            idle_workers = []
+            idle_gen_workers = []

     return Work, persis_info
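
Taken together, the series ends with a user-facing knob. A hedged end-to-end sketch of designating a dedicated gen worker, modeled on the run-4 case added to test_persistent_uniform_sampling.py in patch 74 (specs shown dictionary-style as in that test; sim/gen/alloc specs and the nworkers setting are assumed to be configured as elsewhere in the suite):

    libE_specs = {"comms": "local", "nworkers": 3}
    libE_specs["gen_on_manager"] = False
    libE_specs["gen_workers"] = [2]  # worker 2 runs only generators; workers 1 and 3 run only sims

    # H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria,
    #                             persis_info, alloc_specs, libE_specs)
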