From b37e93d21263a65d11a508ee52c9f63a91c9d273 Mon Sep 17 00:00:00 2001 From: Cristian Matiut Date: Thu, 15 Jan 2026 11:19:07 +0000 Subject: [PATCH 1/2] Increase default retry_on_error and get_status sleep time --- coriolis/providers/replicator.py | 2 +- coriolis/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/coriolis/providers/replicator.py b/coriolis/providers/replicator.py index b3ec9add..f69a251e 100644 --- a/coriolis/providers/replicator.py +++ b/coriolis/providers/replicator.py @@ -162,7 +162,7 @@ def _get_session(self): sess.verify = self._creds["ca_cert"] return sess - @utils.retry_on_error() + @utils.retry_on_error(sleep_seconds=10) def get_status(self, device=None, brief=True): uri = "%s/api/v1/dev" % (self._base_uri) if device is not None: diff --git a/coriolis/utils.py b/coriolis/utils.py index 097ce52e..21320467 100644 --- a/coriolis/utils.py +++ b/coriolis/utils.py @@ -166,7 +166,7 @@ def get_single_result(lis): return lis[0] -def retry_on_error(max_attempts=5, sleep_seconds=0, +def retry_on_error(max_attempts=5, sleep_seconds=1, terminal_exceptions=[]): def _retry_on_error(func): @functools.wraps(func) From ec8e2b8192b649ddb89493d20fa0a27a092cfba0 Mon Sep 17 00:00:00 2001 From: Cristian Matiut Date: Mon, 19 Jan 2026 13:31:20 +0000 Subject: [PATCH 2/2] Catch get_status errors, close ssh on connection error --- coriolis/providers/replicator.py | 65 +++++++++++++++++++------------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/coriolis/providers/replicator.py b/coriolis/providers/replicator.py index f69a251e..b4df88ff 100644 --- a/coriolis/providers/replicator.py +++ b/coriolis/providers/replicator.py @@ -162,7 +162,7 @@ def _get_session(self): sess.verify = self._creds["ca_cert"] return sess - @utils.retry_on_error(sleep_seconds=10) + @utils.retry_on_error(sleep_seconds=5) def get_status(self, device=None, brief=True): uri = "%s/api/v1/dev" % (self._base_uri) if device is not None: @@ -285,8 +285,14 @@ def _reconnect_ssh(self): return self._ssh def init_replicator(self): - self._credentials = utils.retry_on_error()( - self._setup_replicator)(self._ssh) + try: + self._credentials = utils.retry_on_error(sleep_seconds=5)( + self._setup_replicator)(self._ssh) + except Exception: + LOG.warn("Failed to setup replicator, trying to reconnect ssh") + self._reconnect_ssh() + self._credentials = utils.retry_on_error(sleep_seconds=5)( + self._setup_replicator)(self._ssh) utils.retry_on_error()( self._init_replicator_client)(self._credentials) LOG.debug( @@ -372,26 +378,29 @@ def attach_new_disk( new_disks_status = None new_device_paths = None for i in range(retry_count): - new_disks_status = self._cli.get_status() - new_device_paths = [dev['device-path'] - for dev in new_disks_status] - LOG.debug( - "Polled devices while waiting for disk '%s' to attach " - "(try %d/%d): %s", disk_id, i + 1, retry_count, - new_device_paths) - - # check for missing/multiple new device paths: - missing_device_paths = ( - set(previous_device_paths) - set(new_device_paths)) - if missing_device_paths: - LOG.warn( - "The following devices from the previous disk state qeury " - "are no longer detected: %s", [ - dev for dev in previous_disks_status - if dev['device-path'] in missing_device_paths]) - - new_device_paths = set( - new_device_paths) - set(previous_device_paths) + try: + new_disks_status = self._cli.get_status() + new_device_paths = [dev['device-path'] + for dev in new_disks_status] + LOG.debug( + "Polled devices while waiting for disk '%s' to attach " + "(try %d/%d): %s", disk_id, i + 1, retry_count, + new_device_paths) + + # check for missing/multiple new device paths: + missing_device_paths = ( + set(previous_device_paths) - set(new_device_paths)) + if missing_device_paths: + LOG.warn( + "The following devices from the previous disk state " + "qeury are no longer detected: %s", [ + dev for dev in previous_disks_status + if dev['device-path'] in missing_device_paths]) + + new_device_paths = set( + new_device_paths) - set(previous_device_paths) + except Exception: + LOG.debug("Failed to get new device status") if new_device_paths: break else: @@ -466,7 +475,7 @@ def update_state(self, state, restart=False): self.restart() self._cli._test_connection() - @utils.retry_on_error() + @utils.retry_on_error(sleep_seconds=5) def _get_ssh_client(self, args): """ gets a paramiko SSH client @@ -474,8 +483,12 @@ def _get_ssh_client(self, args): try: ssh = paramiko.SSHClient() ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - ssh.connect(**args) - return ssh + try: + ssh.connect(**args) + return ssh + except Exception: + ssh.close() + raise except paramiko.ssh_exception.SSHException as ex: raise exception.CoriolisException( "Failed to setup SSH client: %s" % str(ex)) from ex