diff --git a/.config/ansible-lint.yml b/.config/ansible-lint.yml index bd42e744..461cc4ae 100644 --- a/.config/ansible-lint.yml +++ b/.config/ansible-lint.yml @@ -11,3 +11,5 @@ exclude_paths: - .github/ - .idea/ - .tox/ + - collections/ + # Note: rhel-8 playbooks are not part of this test suite (not supported on RHEL 8) diff --git a/.config/cspell-words.txt b/.config/cspell-words.txt index 721fe055..be08cd52 100644 --- a/.config/cspell-words.txt +++ b/.config/cspell-words.txt @@ -55,4 +55,6 @@ ONESHELL HASAP selectattr equalto -crmsh \ No newline at end of file +crmsh +sapstart +INSTANCENAME \ No newline at end of file diff --git a/README.md b/README.md index 16e7fa3b..a430aa67 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,37 @@ It is designed to test S4/HANA Clusters comprising of 2 or more nodes where SAP Download the collection as part of the test project and make sure that the inventory file is modified to include your machines which are subjected to these tests. In order for the tests to work correctly ensure that SAP HA Interface for SAP ABAP application server instances managed by RHEL HA Add-On is configured. This link can be followed to configure the same: https://access.redhat.com/solutions/3606101 -## Collection Dependancies - -This collection is dependant on +### Setup and Configuration + +1. **Install Collection Dependencies:** + ```bash + ansible-galaxy collection install -r requirements.yml + ``` + +2. **Configure Inventory:** + - Edit `tests/inventory/x86_64.yml` or `tests/inventory/ppc64le.yml` to include your cluster nodes + - Ensure `ansible_user` is set correctly for each host (default is `root`) + +3. **Ansible Configuration:** + - The project includes an `ansible.cfg` file in the root directory with pre-configured paths + - All paths are relative, so no user-specific customization is needed + - The configuration automatically finds: + - Collections in the current directory and `./collections` subdirectory + - Roles in the collection structure + - System-wide collections and roles + +4. 
**Running Tests:** + ```bash + # Run a specific test + ansible-playbook -i tests/inventory/x86_64.yml ansible_collections/sap/cluster_qa/playbooks/test09.yml + + # Run all tests (test01 through test09, skipping test07) + ansible-playbook -i tests/inventory/x86_64.yml ansible_collections/sap/cluster_qa/playbooks/run_all_tests.yml + ``` + +## Collection Dependencies + +This collection is dependent on - `sap.sap_operations` collection modules ### Ansible Core Compatibility diff --git a/ansible.cfg b/ansible.cfg new file mode 100644 index 00000000..6bf6fbb3 --- /dev/null +++ b/ansible.cfg @@ -0,0 +1,23 @@ +[defaults] +# Inventory path relative to this config file location +inventory = tests/inventory +# Remote user for SSH connections - customize if needed +remote_user = root +ask_pass = false +timeout = 60 +force_color = 1 +# stdout_callback = yaml +bin_ansible_callbacks = true +# Collections paths: current directory (.), user home, system paths +# These paths are relative to where ansible.cfg is located +# Note: Collections should be installed via: ansible-galaxy collection install -r requirements.yml +collections_paths = .:~/.ansible/collections:/usr/share/ansible/collections +# Roles path: local collection roles, user home, system paths +roles_path = ansible_collections/sap/cluster_qa/roles:~/.ansible/roles:/usr/share/ansible/roles:/etc/ansible/roles + +[privilege_escalation] +become = true +become_method = sudo +become_user = root +become_ask_pass = false + diff --git a/ansible_collections/sap/cluster_qa/galaxy.yml b/ansible_collections/sap/cluster_qa/galaxy.yml index 3685f200..d98d9711 100644 --- a/ansible_collections/sap/cluster_qa/galaxy.yml +++ b/ansible_collections/sap/cluster_qa/galaxy.yml @@ -29,8 +29,6 @@ tags: dependencies: "sap.sap_operations": ">=2.10.0" - "ansible.posix": ">=1.5.4" - "community.general": ">=7.3.0" repository: https://github.com/redhat-sap/ha-cluster-qa diff --git a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/01-acquire-and-set-facts.yml b/ansible_collections/sap/cluster_qa/playbooks/rhel-8/01-acquire-and-set-facts.yml deleted file mode 100644 index e9e4c032..00000000 --- a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/01-acquire-and-set-facts.yml +++ /dev/null @@ -1,7 +0,0 @@ ---- -- name: Playbook to acquire the current state of RHEL HA Pacemaker cluster running SAP HANA with system Replication - hosts: all - roles: - - sap.cluster_qa.pcs_find_hana - - sap.cluster_qa.pcs_find_instance_number - - sap.cluster_qa.hana_cluster_init diff --git a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/02-fence-test.yml b/ansible_collections/sap/cluster_qa/playbooks/rhel-8/02-fence-test.yml deleted file mode 100644 index aa6754ed..00000000 --- a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/02-fence-test.yml +++ /dev/null @@ -1,60 +0,0 @@ ---- -- name: Playbook for testing the fencing - hosts: all - tasks: - - name: Acquiring Replication sync state - ansible.builtin.shell: | - set -o pipefail - crm_mon -A1 | - grep 'Node Attributes' -A24 | - grep hana_{{ sap_hana_sid | lower }}_sync_state - register: sap_hana_sync_state_t2 - changed_when: false - - - name: Verifying SOK state before performing test - ansible.builtin.debug: - msg: "SAP HANA Replication sync state is {{ sap_hana_sync_state_t2.stdout }}" - failed_when: "'SOK' not in sap_hana_sync_state_t2.stdout" - - - name: PERFORMING FENCE action of MASTER node from the slave node - ansible.builtin.command: "pcs stonith fence {{ sap_hana_master_node }}" - run_once: true - delegate_to: "{{ 
sap_hana_slave_node }}" - register: sap_hana_master_node_fenced - - - name: VERIFYING FENCE command successful - ansible.builtin.debug: - msg: "Success message \"{{ sap_hana_master_node_fenced.stdout }}\" observed" - failed_when: "'Node:' and 'fenced' not in sap_hana_master_node_fenced.stdout" - delegate_to: "{{ sap_hana_slave_node }}" - - - name: Waiting for commencement of the failover process - ansible.builtin.shell: "pcs resource status | grep {{ sap_hana_resource_name }} -A1 | tail -n1" - register: sap_hana_promote_operation - delegate_to: "{{ sap_hana_slave_node }}" - run_once: true - until: "'Promoting' not in sap_hana_promote_operation.stdout" - retries: 20 - delay: 5 - ignore_errors: true - changed_when: false - - - name: Waiting for failover process to be completed - ansible.builtin.shell: "pcs resource status | grep {{ sap_hana_resource_name }} -A1 | tail -n1" - register: sap_hana_promoted - delegate_to: "{{ sap_hana_slave_node }}" - run_once: true - until: "'Masters:' in sap_hana_promoted.stdout" - retries: 50 - delay: 10 - - - name: VERIFYING the failover process is completed - ansible.builtin.debug: - msg: "Secondary instance is now promoted to master" - failed_when: "'Masters:' not in sap_hana_promoted.stdout" - delegate_to: "{{ sap_hana_slave_node }}" - - - name: Wait 900 seconds, but only start checking after 10 seconds - ansible.builtin.wait_for_connection: - delay: 10 - timeout: 900 diff --git a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/03-hdb-stop-test.yml b/ansible_collections/sap/cluster_qa/playbooks/rhel-8/03-hdb-stop-test.yml deleted file mode 100644 index 8196fd97..00000000 --- a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/03-hdb-stop-test.yml +++ /dev/null @@ -1,96 +0,0 @@ ---- -- name: Playbook for testing HDB stop and failover - hosts: all - tasks: - - name: Acquiring Replication sync state - ansible.builtin.shell: | - set -o pipefail - crm_mon -A1 | - grep 'Node Attributes' -A24 | - grep hana_{{ sap_hana_sid | lower }}_sync_state - register: sap_hana_sync_state_t3 - changed_when: false - - - name: Verifying SOK state before performing test - ansible.builtin.debug: - msg: "SAP HANA Replication sync state is {{ sap_hana_sync_state_t3.stdout }}" - failed_when: "'SOK' not in sap_hana_sync_state_t3.stdout" - - - name: PERFORMING HDB stop on MASTER node - become: true - become_user: "{{ sap_hana_sid | lower }}adm" - ansible.builtin.command: "/usr/sap/{{ sap_hana_sid }}/HDB{{ sap_hana_instance_number }}/HDB stop" - delegate_to: "{{ sap_hana_master_node }}" - run_once: true - register: sap_hana_master_node_hdb_stopped - - - name: Verifying that HDB is fully stopped - delegate_to: "{{ sap_hana_master_node }}" - run_once: true - until: "'hdbdaemon is stopped' in sap_hana_master_node_hdb_stopped.stdout" - ansible.builtin.debug: - msg: "Success message \"hdbdaemon is stopped\" observed" - failed_when: "'hdbdaemon is stopped' not in sap_hana_master_node_hdb_stopped.stdout" - retries: 10 - delay: 5 - - - name: Waiting for Cluster to declare failure and start recovery - ansible.builtin.shell: | - set -o pipefail - pcs status | grep "Failed Resource Actions" -A1 - register: sap_hana_master_monitor_operation_failed - delegate_to: "{{ sap_hana_master_node }}" - run_once: true - until: "'Failed Resource Actions' in sap_hana_master_monitor_operation_failed.stdout" - retries: 122 - delay: 1 - changed_when: false - - - name: WAITING for recovery to complete on the same node - ansible.builtin.shell: | - set -o pipefail - pcs resource status | - grep "{{ 
sap_hana_resource_name }}" -A1 | - tail -n1 | xargs - register: sap_hana_resource_recovery_state - delegate_to: "{{ sap_hana_master_node }}" - run_once: true - until: "'Masters:' in sap_hana_resource_recovery_state.stdout" - when: "'PREFER_SITE_TAKEOVER=false' in prefer_site_takeover" - retries: 120 - delay: 5 - changed_when: false - - - name: VERIFYING the recovery on the same node - ansible.builtin.debug: - msg: "Current state of resource recovery is {{ sap_hana_resource_recovery_state.stdout }}" - when: "'PREFER_SITE_TAKEOVER=false' in prefer_site_takeover" - failed_when: "'Masters:' and sap_hana_master_node not in sap_hana_resource_recovery_state.stdout" - - - name: Cleaning up cluster for next test - ansible.builtin.command: pcs resource cleanup "{{ sap_hana_resource_name }}" - run_once: true - delegate_to: "{{ sap_hana_master_node }}" - when: "'PREFER_SITE_TAKEOVER=false' in prefer_site_takeover" - - - name: WAITING promotion to complete on the other node - ansible.builtin.shell: pcs resource status | grep "{{ sap_hana_resource_name }}" -A1 | tail -n1 | xargs - register: sap_hana_resource_recovery_state - delegate_to: "{{ sap_hana_slave_node }}" - run_once: true - until: "'Masters:' in sap_hana_resource_recovery_state.stdout" - when: "'PREFER_SITE_TAKEOVER=true' in prefer_site_takeover" - retries: 120 - delay: 5 - - - name: VERIFYING the promotion of the slave node - ansible.builtin.debug: - msg: "Current state of resource recovery is {{ sap_hana_resource_recovery_state.stdout }}" - when: "'PREFER_SITE_TAKEOVER=true' in prefer_site_takeover" - failed_when: "'Masters:' and sap_hana_slave_node not in sap_hana_resource_recovery_state.stdout" - - - name: Cleaning up cluster for next test - ansible.builtin.command: pcs resource cleanup "{{ sap_hana_resource_name }}" - run_once: true - delegate_to: "{{ sap_hana_slave_node }}" - when: "'PREFER_SITE_TAKEOVER=true' in prefer_site_takeover" diff --git a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/04-hdb-kill-test.yml b/ansible_collections/sap/cluster_qa/playbooks/rhel-8/04-hdb-kill-test.yml deleted file mode 100644 index ee7f51be..00000000 --- a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/04-hdb-kill-test.yml +++ /dev/null @@ -1,83 +0,0 @@ ---- -- name: Playbook for testing HDB kill and failover - hosts: all - tasks: - - name: Acquiring Replication sync state - ansible.builtin.shell: "crm_mon -A1 | grep 'Node Attributes' -A24 | grep hana_{{ sap_hana_sid | lower }}_sync_state" - register: sap_hana_sync_state_t4 - - - name: Verifying SOK state before performing test - ansible.builtin.debug: - msg: "SAP HANA Replication sync state is {{ sap_hana_sync_state_t4.stdout }}" - failed_when: "'SOK' not in sap_hana_sync_state_t4.stdout" - - - name: PERFORMING HDB kill on MASTER node - become: true - become_user: "{{ sap_hana_sid | lower }}adm" - ansible.builtin.command: "/usr/sap/{{ sap_hana_sid }}/HDB{{ sap_hana_instance_number }}/HDB kill" - delegate_to: "{{ sap_hana_master_node }}" - run_once: true - register: sap_hana_master_node_hdb_killed - - - name: Verifying that HDB is fully killed - delegate_to: "{{ sap_hana_master_node }}" - run_once: true - until: "'killing HDB processes' in sap_hana_master_node_hdb_killed.stdout" - ansible.builtin.debug: - msg: "Success message \"killing HDB processes\" observed" - failed_when: "'killing HDB processes' not in sap_hana_master_node_hdb_killed.stdout" - retries: 10 - delay: 5 - - - name: Waiting for Cluster to declare failure and start recovery - ansible.builtin.shell: pcs status | grep "Failed 
Resource Actions" -A1 - register: sap_hana_master_monitor_operation_failed - delegate_to: "{{ sap_hana_master_node }}" - run_once: true - until: "'Failed Resource Actions' in sap_hana_master_monitor_operation_failed.stdout" - retries: 120 - delay: 5 - - - name: WAITING for recovery to complete on the same node - ansible.builtin.shell: pcs resource status | grep "{{ sap_hana_resource_name }}" -A1 | tail -n1 | xargs - register: sap_hana_resource_recovery_state - delegate_to: "{{ sap_hana_master_node }}" - run_once: true - until: "'Masters:' in sap_hana_resource_recovery_state.stdout" - when: "'PREFER_SITE_TAKEOVER=false' in prefer_site_takeover" - retries: 120 - delay: 5 - - - name: VERIFYING the recovery on same node - ansible.builtin.debug: - msg: "Current state of resource recovery is {{ sap_hana_resource_recovery_state.stdout }}" - when: "'PREFER_SITE_TAKEOVER=false' in prefer_site_takeover" - failed_when: "'Masters:' and sap_hana_master_node not in sap_hana_resource_recovery_state.stdout" - - - name: Cleaning up cluster for next test - ansible.builtin.command: pcs resource cleanup "{{ sap_hana_resource_name }}" - run_once: true - delegate_to: "{{ sap_hana_master_node }}" - when: "'PREFER_SITE_TAKEOVER=false' in prefer_site_takeover" - - - name: WAITING for promotion to complete on the other node - ansible.builtin.shell: pcs resource status | grep "{{ sap_hana_resource_name }}" -A1 | tail -n1 | xargs - register: sap_hana_resource_recovery_state - delegate_to: "{{ sap_hana_slave_node }}" - run_once: true - until: "'Masters:' in sap_hana_resource_recovery_state.stdout" - when: "'PREFER_SITE_TAKEOVER=true' in prefer_site_takeover" - retries: 120 - delay: 5 - - - name: VERIFYING promotion of the slave node - ansible.builtin.debug: - msg: "Current state of resource recovery is {{ sap_hana_resource_recovery_state.stdout }}" - when: "'PREFER_SITE_TAKEOVER=true' in prefer_site_takeover" - failed_when: "'Masters:' and sap_hana_slave_node not in sap_hana_resource_recovery_state.stdout" - - - name: Cleaning up cluster for next test - ansible.builtin.command: pcs resource cleanup "{{ sap_hana_resource_name }}" - run_once: true - delegate_to: "{{ sap_hana_slave_node }}" - when: "'PREFER_SITE_TAKEOVER=true' in prefer_site_takeover" diff --git a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/05-node-crash-test.yml b/ansible_collections/sap/cluster_qa/playbooks/rhel-8/05-node-crash-test.yml deleted file mode 100644 index 05fdadeb..00000000 --- a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/05-node-crash-test.yml +++ /dev/null @@ -1,46 +0,0 @@ ---- -- name: Playbook for testing node crash and failover - hosts: all - tasks: - - name: Acquiring Replication sync state - ansible.builtin.shell: "crm_mon -A1 | grep 'Node Attributes' -A24 | grep hana_{{ sap_hana_sid | lower }}_sync_state" - register: sap_hana_sync_state_t5 - - - name: Verifying SOK state before performing test - ansible.builtin.debug: - msg: "SAP HANA Replication sync state is {{ sap_hana_sync_state_t5.stdout }}" - failed_when: "'SOK' not in sap_hana_sync_state_t5.stdout" - - - name: PERFORMING node crash on MASTER node - ansible.builtin.shell: "ssh root@{{ sap_hana_master_node }} 'echo c > /proc/sysrq-trigger' &" - delegate_to: localhost - run_once: true - failed_when: false - - - name: Verifying promotion process started on the other node - ansible.builtin.shell: pcs resource status | grep "{{ sap_hana_resource_name }}" -A1 - register: sap_hana_resource_promotion_state - delegate_to: "{{ sap_hana_slave_node }}" - run_once: true - 
until: "'Promoting' in sap_hana_resource_promotion_state.stdout" - retries: 120 - delay: 2 - - - name: WAITING for promotion to complete on the other node - ansible.builtin.shell: pcs resource status | grep "{{ sap_hana_resource_name }}" -A1 | tail -n1 | xargs - register: sap_hana_resource_recovery_state - delegate_to: "{{ sap_hana_slave_node }}" - run_once: true - until: "'Masters:' in sap_hana_resource_recovery_state.stdout" - retries: 120 - delay: 5 - - - name: VERIFYING the promotion of the slave node - ansible.builtin.debug: - msg: "Current state of resource recovery is {{ sap_hana_resource_recovery_state.stdout }}" - failed_when: "'Masters:' and sap_hana_slave_node not in sap_hana_resource_recovery_state.stdout" - - - name: Wait 900 seconds, but only start checking after 10 seconds - ansible.builtin.wait_for_connection: - delay: 10 - timeout: 900 diff --git a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/06-priority-fencing-check-and-setup.yml b/ansible_collections/sap/cluster_qa/playbooks/rhel-8/06-priority-fencing-check-and-setup.yml deleted file mode 100644 index ce58a7cd..00000000 --- a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/06-priority-fencing-check-and-setup.yml +++ /dev/null @@ -1,13 +0,0 @@ ---- -- name: Checking and verifying priority fencing to avoid fence race during network tests - hosts: all - tasks: - - name: Setting up default priority fencing value to 1 - ansible.builtin.command: "pcs resource defaults update priority=1" - delegate_to: "{{ sap_hana_master_node }}" - run_once: true - - - name: Setting up priority fencing delay of 10 Seconds - ansible.builtin.command: "pcs property set priority-fencing-delay=15s" - delegate_to: "{{ sap_hana_master_node }}" - run_once: true diff --git a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/07-firewalld-setup-for-network-tests.yml b/ansible_collections/sap/cluster_qa/playbooks/rhel-8/07-firewalld-setup-for-network-tests.yml deleted file mode 100644 index fca46253..00000000 --- a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/07-firewalld-setup-for-network-tests.yml +++ /dev/null @@ -1,88 +0,0 @@ ---- -- name: Configuring nodes for network tests - hosts: all - tasks: - - name: Installing firewalld package - ansible.builtin.dnf: - name: firewalld - state: present - - - name: Allowing High Availability service through firewall - ansible.posix.firewalld: - service: high-availability - state: enabled - permanent: true - - - name: Acquiring SAP HANA System Replication master ports - ansible.builtin.shell: netstat -tulpn | egrep -e "hdb|saps|saph" | awk '{print $4}' | cut -d ":" -f2 - register: sr_master_ports - run_once: true - delegate_to: "{{ sap_hana_master_node }}" - - - name: Opening Acquired ports of master on both nodes - ansible.posix.firewalld: - port: "{{ item }}/tcp" - permanent: true - state: enabled - loop: "{{ sr_master_ports.stdout_lines }}" - - - name: Acquiring SAP HANA System Replication slave ports - ansible.builtin.shell: netstat -tulpn | egrep -e "hdb|saps|saph" | awk '{print $4}' | cut -d ":" -f2 - register: sr_slave_ports - run_once: true - delegate_to: "{{ sap_hana_slave_node }}" - - - name: Opening Acquired ports of slave node on both nodes - ansible.posix.firewalld: - port: "{{ item }}/tcp" - permanent: true - state: enabled - loop: "{{ sr_slave_ports.stdout_lines }}" - - - name: Allowing custom system Replication Ports through firewall - ansible.posix.firewalld: - port: "4{{ sap_hana_instance_number }}00/tcp" - permanent: true - state: enabled - - - name: Stopping the cluster - 
community.general.pacemaker_cluster: - state: offline - timeout: 900 - failed_when: false - when: "'inactive' in firewalld_state" - - - name: Enabling and starting firewalld - ansible.builtin.service: - name: firewalld - enabled: true - state: started - when: "'inactive' in firewalld_state" - - - name: Starting the cluster for testing firewall accuracy - community.general.pacemaker_cluster: - state: online - timeout: 900 - failed_when: false - when: "'inactive' in firewalld_state" - - - name: Firewalld - Waiting for cluster to start SAP HANA resource on each node separately. - ansible.builtin.shell: pcs resource status | grep {{ sap_hana_resource_name }} -A2 | egrep -e 'Masters:|Slaves:' | awk '{print $0}' - register: sap_hana_resource_state - until: "'Masters:' and 'Slaves:' in sap_hana_resource_state.stdout" - retries: 120 - delay: 10 - - - name: Firewalld - Waiting for SOK Replication sync state - ansible.builtin.shell: "crm_mon -A1 | grep 'Node Attributes' -A24 | grep hana_{{ sap_hana_sid | lower }}_sync_state" - register: sap_hana_sync_state - delegate_to: "{{ sap_hana_slave_node }}" - run_once: true - until: "'SOK' in sap_hana_sync_state.stdout" - retries: 120 - delay: 2 - - - name: Firewalld - Verifying SOK state - ansible.builtin.debug: - msg: "SAP HANA Replication sync state is {{ sap_hana_sync_state.stdout }}" - failed_when: "'SOK' not in sap_hana_sync_state.stdout" diff --git a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/08-cluster-network-down-tests.yml b/ansible_collections/sap/cluster_qa/playbooks/rhel-8/08-cluster-network-down-tests.yml deleted file mode 100644 index 2ac5b1a6..00000000 --- a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/08-cluster-network-down-tests.yml +++ /dev/null @@ -1,67 +0,0 @@ ---- -- name: Playbook to test Network Failures by blocking firewall on the Primary side - hosts: all - tasks: - - name: Dropping corosync network connection on primary - ansible.builtin.command: "firewall-cmd --direct --add-rule ipv4 filter OUTPUT 2 -p udp --dport=5405 -j DROP" - delegate_to: "{{ sap_hana_master_node }}" - run_once: true - - - name: Waiting for slave node cluster to go offline - ansible.builtin.shell: "pcs status | grep 'Node List' -A2 | xargs" - register: slave_node_cluster_state - delegate_to: "{{ sap_hana_master_node }}" - run_once: true - until: "'OFFLINE:' in slave_node_cluster_state.stdout" - retries: 400 - delay: 1 - - - name: Acquiring the state of slave node - ansible.builtin.shell: "pcs status | grep 'Node List' -A2 | xargs" - register: slave_node_cluster_state - delegate_to: "{{ sap_hana_master_node }}" - run_once: true - - - name: Verifying Slave node in offline state - ansible.builtin.debug: - msg: "Current state of slave node is {{ slave_node_cluster_state.stdout }}" - failed_when: "'OFFLINE' not in slave_node_cluster_state.stdout" - - - name: Restoring Network connection on the Primary side - ansible.builtin.command: "firewall-cmd --direct --remove-rule ipv4 filter OUTPUT 2 -p udp --dport=5405 -j DROP" - delegate_to: "{{ sap_hana_master_node }}" - run_once: true - - - name: Wait 900 seconds, but only start checking after 10 seconds - ansible.builtin.wait_for_connection: - delay: 10 - timeout: 900 - - - name: Acquiring the state of slave node from master - ansible.builtin.shell: pcs status | grep "Node List" -A2 | grep "{{ sap_hana_slave_node }}" - register: slave_node_cluster_state - delegate_to: "{{ sap_hana_master_node }}" - run_once: true - - - name: Waiting for slave node cluster to come back online - ansible.builtin.shell: "pcs 
status | grep 'Node List' -A2 | xargs" - register: slave_node_cluster_state - delegate_to: "{{ sap_hana_master_node }}" - run_once: true - until: "'OFFLINE:' not in slave_node_cluster_state.stdout" - retries: 400 - delay: 1 - - - name: Verifying Slave node back to Online state - ansible.builtin.debug: - msg: "Current state of slave node is {{ slave_node_cluster_state.stdout }}" - failed_when: "'Online:' not in slave_node_cluster_state.stdout" - - - name: Waiting for SOK Replication sync state after restoring Network - ansible.builtin.shell: "crm_mon -A1 | grep 'Node Attributes' -A24 | grep hana_{{ sap_hana_sid | lower }}_sync_state" - register: sap_hana_sync_state - delegate_to: "{{ sap_hana_slave_node }}" - run_once: true - until: "'SOK' in sap_hana_sync_state.stdout" - retries: 480 - delay: 2 diff --git a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/10-full-sap-hana-ha-test.yml b/ansible_collections/sap/cluster_qa/playbooks/rhel-8/10-full-sap-hana-ha-test.yml deleted file mode 100644 index 21bb26d3..00000000 --- a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/10-full-sap-hana-ha-test.yml +++ /dev/null @@ -1,6 +0,0 @@ ---- -- name: Playbook to test the working state of the Pacemaker HANA cluster Nodes - hosts: all -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 08-cluster-network-down-tests.yml -- import_playbook: 01-acquire-and-set-facts.yml diff --git a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/100-pstv-to-fls.yml b/ansible_collections/sap/cluster_qa/playbooks/rhel-8/100-pstv-to-fls.yml deleted file mode 100644 index a23b2025..00000000 --- a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/100-pstv-to-fls.yml +++ /dev/null @@ -1,8 +0,0 @@ ---- -- name: Setting PREFER_SITE_TAKEOVER value to false - hosts: all - tasks: - - name: Setting up PREFER_SITE_TAKEOVER value to false - ansible.builtin.command: "pcs resource update {{ sap_hana_resource_name }} PREFER_SITE_TAKEOVER=false" - delegate_to: "{{ sap_hana_master_node }}" - run_once: true diff --git a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/101-ar-to-fls-pstv-to-tru.yml b/ansible_collections/sap/cluster_qa/playbooks/rhel-8/101-ar-to-fls-pstv-to-tru.yml deleted file mode 100644 index b70967d7..00000000 --- a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/101-ar-to-fls-pstv-to-tru.yml +++ /dev/null @@ -1,8 +0,0 @@ ---- -- name: Setting PREFER_SITE_TAKEOVER value to false - hosts: all - tasks: - - name: Setting up PREFER_SITE_TAKEOVER value to false - ansible.builtin.command: "pcs resource update {{ sap_hana_resource_name }} AUTOMATED_REGISTER=false PREFER_SITE_TAKEOVER=true" - delegate_to: "{{ sap_hana_master_node }}" - run_once: true diff --git a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/102-ar-to-tru-pstv-to-tru.yml b/ansible_collections/sap/cluster_qa/playbooks/rhel-8/102-ar-to-tru-pstv-to-tru.yml deleted file mode 100644 index ae6efc29..00000000 --- a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/102-ar-to-tru-pstv-to-tru.yml +++ /dev/null @@ -1,8 +0,0 @@ ---- -- name: Setting PREFER_SITE_TAKEOVER value to false - hosts: all - tasks: - - name: Setting up PREFER_SITE_TAKEOVER value to false - ansible.builtin.command: "pcs resource update {{ sap_hana_resource_name }} AUTOMATED_REGISTER=true PREFER_SITE_TAKEOVER=true" - delegate_to: "{{ sap_hana_master_node }}" - run_once: true diff --git a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/106-firewalld-sr-ports-disable.yml 
b/ansible_collections/sap/cluster_qa/playbooks/rhel-8/106-firewalld-sr-ports-disable.yml deleted file mode 100644 index 110dbe67..00000000 --- a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/106-firewalld-sr-ports-disable.yml +++ /dev/null @@ -1,87 +0,0 @@ ---- -- name: Configuring nodes for network tests - hosts: all - tasks: - - name: Acquiring Firewalld state ## noqa command-instead-of-module - ansible.builtin.command: "systemctl is-active firewalld" - register: firewalld_state - failed_when: false - changed_when: false - - - name: Verifying Firewalld state - ansible.builtin.debug: - msg: "firewalld is {{ firewalld_state.stdout }}" - - - name: Setting Firewalld state as fact - ansible.builtin.set_fact: - firewalld_state: "{{ firewalld_state.stdout }}" - - - name: Stopping the cluster for firewalld setup - community.general.pacemaker_cluster: - state: offline - timeout: 900 - when: "'inactive' in firewalld_state" - failed_when: false - - - name: Installing firewalld package - ansible.builtin.dnf: - name: firewalld - state: present - - - name: Enabling and starting firewalld - ansible.builtin.service: - name: firewalld - enabled: true - state: started - - - name: Allowing High Availability service through firewall - ansible.posix.firewalld: - service: high-availability - state: enabled - permanent: true - immediate: true - - - name: Starting the cluster for Acquiring HANA SR ports - community.general.pacemaker_cluster: - state: online - timeout: 900 - failed_when: false - - - name: Waiting for cluster to start SAP HANA resource. - ansible.builtin.shell: pcs resource status | grep {{ sap_hana_resource_name }} -A2 | egrep -e "Masters|Slaves" | awk '{print $0}' - register: sap_hana_resource_state - until: "'Masters:' and 'Slaves:' in sap_hana_resource_state.stdout" - retries: 120 - delay: 10 - - - name: Acquiring SAP HANA System Replication master ports - ansible.builtin.shell: netstat -tulpn | egrep -e "hdb|saps|saph" | awk '{print $4}' | cut -d ":" -f2 - register: sr_master_ports - run_once: true - delegate_to: "{{ sap_hana_master_node }}" - - - name: Closing Acquired ports of master on both nodes - ansible.posix.firewalld: - port: "{{ item }}/tcp" - permanent: true - state: disabled - loop: "{{ sr_master_ports.stdout_lines }}" - - - name: Acquiring SAP HANA System Replication slave ports - ansible.builtin.shell: netstat -tulpn | egrep -e "hdb|saps|saph" | awk '{print $4}' | cut -d ":" -f2 - register: sr_slave_ports - run_once: true - delegate_to: "{{ sap_hana_slave_node }}" - - - name: Closing Acquired ports of slave node on both nodes - ansible.posix.firewalld: - port: "{{ item }}/tcp" - permanent: true - state: disabled - loop: "{{ sr_slave_ports.stdout_lines }}" - - - name: Closing custom system Replication Ports through firewall - ansible.posix.firewalld: - port: "4{{ sap_hana_instance_number }}00/tcp" - permanent: true - state: disabled diff --git a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/20-full-sap-hana-ha-final-test.yml b/ansible_collections/sap/cluster_qa/playbooks/rhel-8/20-full-sap-hana-ha-final-test.yml deleted file mode 100644 index 0b3240ef..00000000 --- a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/20-full-sap-hana-ha-final-test.yml +++ /dev/null @@ -1,52 +0,0 @@ ---- -- name: Playbook to test the working state of the Pacemaker HANA cluster Nodes - hosts: all -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 102-ar-to-tru-pstv-to-tru.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 02-fence-test.yml -- 
import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 03-hdb-stop-test.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 04-hdb-kill-test.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 05-node-crash-test.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 08-cluster-network-down-tests.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 100-pstv-to-fls.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 02-fence-test.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 03-hdb-stop-test.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 04-hdb-kill-test.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 05-node-crash-test.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 08-cluster-network-down-tests.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 101-ar-to-fls-pstv-to-tru.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 02-fence-test.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 03-hdb-stop-test.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 04-hdb-kill-test.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 05-node-crash-test.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 08-cluster-network-down-tests.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 100-pstv-to-fls.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 02-fence-test.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 03-hdb-stop-test.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 04-hdb-kill-test.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 05-node-crash-test.yml -- import_playbook: 01-acquire-and-set-facts.yml -- import_playbook: 08-cluster-network-down-tests.yml -- import_playbook: 01-acquire-and-set-facts.yml diff --git a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/offline-sap-s4hana.yml b/ansible_collections/sap/cluster_qa/playbooks/rhel-8/offline-sap-s4hana.yml deleted file mode 100644 index cad5298d..00000000 --- a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/offline-sap-s4hana.yml +++ /dev/null @@ -1,17 +0,0 @@ ---- -- name: Playbook to test the working and state of the Pacemaker HANA cluster Nodes - hosts: all - vars: - sap_preconfigure_reboot_ok: true - sap_hana_preconfigure_enable_sap_hana_repos: true - sap_hana_preconfigure_set_minor_release: true - sap_hana_preconfigure_modify_grub_cmdline_linux: true - sap_hana_preconfigure_reboot_ok: true - sap_domain: example.com - sap_netweaver_preconfigure_fail_if_not_enough_swap_space_configured: false - sap_hana_preconfigure_fail_if_not_enough_swap_space_configured: false - sap_preconfigure_modify_etc_hosts: false - tasks: - - name: Verify or make the cluster online - community.general.pacemaker_cluster: - state: offline diff --git a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/pdf-driver-install-setup.yml b/ansible_collections/sap/cluster_qa/playbooks/rhel-8/pdf-driver-install-setup.yml deleted file mode 100644 index 34dc3335..00000000 --- a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/pdf-driver-install-setup.yml +++ /dev/null @@ -1,41 +0,0 @@ ---- -- name: Playbook to install and configure cups-pdf printer driver - hosts: localhost - become: true - gather_facts: true - 
vars: - callback_plugins: - - "{{ playbook_dir }}/callback_plugins" - tasks: - - name: Install cups-pdf package - ansible.builtin.dnf: - name: cups-pdf - state: present - - - name: Configure cups-pdf printer - ansible.builtin.lineinfile: - path: /etc/cups/cups-pdf.conf - regexp: '{{ item.regexp }}' - line: '{{ item.line }}' - with_items: - - {regexp: '^Out .*', line: 'Out ${HOME}/PDF'} - - {regexp: '^Annots .*', line: 'Annots yes'} - - {regexp: '^DefaultPrinter .*', line: 'DefaultPrinter PDF'} - - - name: Create PDF output directory - ansible.builtin.file: - path: /root/rhel-8-hsr-cluster-test/test-runs - state: directory - mode: "0600" - - - name: Create callback_plugins directory - ansible.builtin.file: - path: "{{ playbook_dir }}/callback_plugins" - state: directory - mode: "0600" - - - name: Copy PDF callback plugin to callback_plugins directory - ansible.builtin.copy: - src: pdf.py - dest: "{{ playbook_dir }}/callback_plugins" - mode: "0600" diff --git a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/restart-sap-s4hana.yml b/ansible_collections/sap/cluster_qa/playbooks/rhel-8/restart-sap-s4hana.yml deleted file mode 100644 index 2082ff85..00000000 --- a/ansible_collections/sap/cluster_qa/playbooks/rhel-8/restart-sap-s4hana.yml +++ /dev/null @@ -1,17 +0,0 @@ ---- -- name: Playbook to test the working and state of the Pacemaker HANA cluster Nodes - hosts: all - vars: - sap_preconfigure_reboot_ok: true - sap_hana_preconfigure_enable_sap_hana_repos: true - sap_hana_preconfigure_set_minor_release: true - sap_hana_preconfigure_modify_grub_cmdline_linux: true - sap_hana_preconfigure_reboot_ok: true - sap_domain: example.com - sap_netweaver_preconfigure_fail_if_not_enough_swap_space_configured: false - sap_hana_preconfigure_fail_if_not_enough_swap_space_configured: false - sap_preconfigure_modify_etc_hosts: false - tasks: - - name: Verify or make the cluster online - community.general.pacemaker_cluster: - state: restart diff --git a/ansible_collections/sap/cluster_qa/playbooks/run_all_tests.yml b/ansible_collections/sap/cluster_qa/playbooks/run_all_tests.yml new file mode 100644 index 00000000..5269d353 --- /dev/null +++ b/ansible_collections/sap/cluster_qa/playbooks/run_all_tests.yml @@ -0,0 +1,70 @@ +--- +- name: Run all test cases (test01 through test09, skipping test07) + hosts: all + gather_facts: false + become: true + become_user: root + any_errors_fatal: false + tasks: + - name: Display test suite information + ansible.builtin.debug: + msg: "Starting test suite execution: test01 through test09 (skipping test07)" + run_once: true + + - name: Run TEST01 - HA software name and version verification + block: + - name: Collect necessary gather_facts + ansible.builtin.setup: + gather_subset: + - min + + - name: Finding ASCS node name + ansible.builtin.include_role: + name: sap.cluster_qa.pcs_find_ascs + + - name: Running TEST01 test role on the ASCS Node + ansible.builtin.include_role: + name: sap.cluster_qa.test01 + when: ansible_hostname == sap_ascs_node_name + + - name: Finding ERS node name + ansible.builtin.include_role: + name: sap.cluster_qa.pcs_find_ers + + - name: Running TEST01 test role on the ERS Node + ansible.builtin.include_role: + name: sap.cluster_qa.test01 + when: ansible_hostname == sap_ers_node_name + + - name: Run TEST02 - HA configuration error check + ansible.builtin.include_role: + name: sap.cluster_qa.test02 + + - name: Run TEST03 - HA-Interface shared library loading verification + ansible.builtin.include_role: + name: sap.cluster_qa.test03 + + - name: Run 
TEST04 - Manual ASCS failover with transaction locks + ansible.builtin.include_role: + name: sap.cluster_qa.test04 + + - name: Run TEST05 + ansible.builtin.include_role: + name: sap.cluster_qa.test05 + + - name: Run TEST06 - ERS failover test + ansible.builtin.include_role: + name: sap.cluster_qa.test06 + + - name: Run TEST08 - ASCS node crash test + ansible.builtin.include_role: + name: sap.cluster_qa.test08 + + - name: Run TEST09 - Message Server automatic restart and HA interaction test + ansible.builtin.include_role: + name: sap.cluster_qa.test09 + + - name: Display test suite completion + ansible.builtin.debug: + msg: "Test suite execution completed: test01 through test09 (test07 skipped)" + run_once: true diff --git a/ansible_collections/sap/cluster_qa/playbooks/test09.yml b/ansible_collections/sap/cluster_qa/playbooks/test09.yml new file mode 100644 index 00000000..7e88d6a4 --- /dev/null +++ b/ansible_collections/sap/cluster_qa/playbooks/test09.yml @@ -0,0 +1,11 @@ +--- +- name: Running TEST09 test role on the S4/HANA Cluster + hosts: all + gather_facts: false + become: true + become_user: root + any_errors_fatal: false + tasks: + - name: Running TEST09 test role on the S4/HANA Cluster + ansible.builtin.include_role: + name: sap.cluster_qa.test09 diff --git a/ansible_collections/sap/cluster_qa/roles/hana_cluster_init/README.md b/ansible_collections/sap/cluster_qa/roles/hana_cluster_init/README.md deleted file mode 100644 index 225dd44b..00000000 --- a/ansible_collections/sap/cluster_qa/roles/hana_cluster_init/README.md +++ /dev/null @@ -1,38 +0,0 @@ -Role Name -========= - -A brief description of the role goes here. - -Requirements ------------- - -Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. - -Role Variables --------------- - -A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. - -Dependencies ------------- - -A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. - -Example Playbook ----------------- - -Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: - - - hosts: servers - roles: - - { role: username.rolename, x: 42 } - -License -------- - -BSD - -Author Information ------------------- - -An optional section for the role authors to include contact information, or a website (HTML is not allowed). 
diff --git a/ansible_collections/sap/cluster_qa/roles/hana_cluster_init/meta/main.yml b/ansible_collections/sap/cluster_qa/roles/hana_cluster_init/meta/main.yml deleted file mode 100644 index a5a8f8e7..00000000 --- a/ansible_collections/sap/cluster_qa/roles/hana_cluster_init/meta/main.yml +++ /dev/null @@ -1,13 +0,0 @@ ---- -galaxy_info: - author: Amir Memon (@amemon-redhat) - description: Ensure that HANA cluster is initialized for QA testing - min_ansible_version: "2.15" - platforms: - - name: EL - versions: - - "8" - - "9" - license: GPL-3.0-only - galaxy_tags: [] -dependencies: [] diff --git a/ansible_collections/sap/cluster_qa/roles/hana_cluster_init/tasks/main.yml b/ansible_collections/sap/cluster_qa/roles/hana_cluster_init/tasks/main.yml deleted file mode 100644 index b73c8a47..00000000 --- a/ansible_collections/sap/cluster_qa/roles/hana_cluster_init/tasks/main.yml +++ /dev/null @@ -1,453 +0,0 @@ ---- - -- name: Starting the cluster - community.general.pacemaker_cluster: - state: online - timeout: 600 - -- name: Checking Hardware CPU Architecture - ansible.builtin.debug: - msg: "Hardware Architecture of {{ ansible_hostname }} is {{ ansible_architecture }}" - -- name: Checking RHEL Version - ansible.builtin.debug: - msg: "Red Hat Release Version is {{ ansible_distribution_version }}" - -- name: Acquiring SAP HANA resource agent version - ansible.builtin.shell: | - set -o pipefail - rpm -qa | grep resource-agents-sap-hana - register: sap_hana_resource_agent - changed_when: false - -- name: Checking Installed SAP HANA Resource agent - ansible.builtin.debug: - msg: "Installed SAP HANA Resource agent is {{ sap_hana_resource_agent.stdout }}" - -- name: Acquiring HANA release version - become: true - become_user: "{{ sap_hana_sid | lower }}adm" - ansible.builtin.shell: | - set -o pipefail - /usr/sap/{{ sap_hana_sid }}/HDB{{ sap_hana_instance_number }}/HDB version | - egrep -e 'version:|branch:' | xargs - register: sap_hana_release_version - changed_when: false - -- name: Checking HANA Release Version - ansible.builtin.debug: - msg: "HANA {{ sap_hana_release_version.stdout }}" - -- name: Acquiring Control Node's Ansible Version - ansible.builtin.shell: | - set -o pipefail - ansible --version | head -n1 - delegate_to: localhost - run_once: true - register: control_node_ansible_version - changed_when: false - -- name: Checking Control Node's Ansible Version - ansible.builtin.debug: - msg: "{{ control_node_ansible_version.stdout }}" - delegate_to: localhost - run_once: true - -- name: Disable and stop firewalld service - ansible.builtin.service: - name: firewalld - state: stopped - enabled: false - -- name: Checking for failed resource actions - ansible.builtin.shell: | - set -o pipefail - pcs status | grep "Failed Resource Actions" - register: failed_resource_actions - run_once: true - ignore_errors: true - changed_when: false - -- name: Cleaning failed resource actions up if necessary - ansible.builtin.command: pcs resource cleanup - when: "'Failed Resource Actions' in failed_resource_actions.stdout" - run_once: true - changed_when: true - -- name: Checking for failed Fencing actions - ansible.builtin.shell: | - set -o pipefail - pcs status | grep "Failed Fencing Actions" - register: failed_fencing_actions - run_once: true - ignore_errors: true - changed_when: false - -- name: Cleaning failed fencing actions - ansible.builtin.command: stonith_admin --cleanup --history "*" - when: "'Failed Fencing Actions' in failed_fencing_actions.stdout" - run_once: true - changed_when: true - -- name: 
Acquiring SAPHanaTopology resource name - ansible.builtin.shell: | - set -o pipefail - pcs resource config | - grep type=SAPHanaTopology | - awk '{print $2}' - register: sap_hana_topology_resource_name - changed_when: false - -- name: Setting up SAPHanaTopology resource name as fact - ansible.builtin.set_fact: - sap_hana_topology_resource_name: "{{ sap_hana_topology_resource_name.stdout }}" - -- name: Acquiring the state of {{ sap_hana_topology_resource_name }} - ansible.builtin.shell: | - set -o pipefail - pcs resource status | - grep "{{ sap_hana_topology_resource_name }}" -A1 | tail -n1 | xargs - register: sap_hana_topology_resource_state - changed_when: false - -- name: Waiting for cluster to start SAP HANA Topology resource. - ansible.builtin.shell: | - set -o pipefail - pcs resource status | - grep {{ sap_hana_topology_resource_name }} -A1 | tail -n1 - register: sap_hana_topology_resource_state - until: "'Started' in sap_hana_topology_resource_state.stdout" - retries: 120 - delay: 10 - changed_when: false - -- name: VERIFYING the running state of SAPHanaTopology resource - ansible.builtin.debug: - msg: "Current state of SAP HANA Topology resource is {{ sap_hana_topology_resource_state.stdout }}" - failed_when: "'Started:' not in sap_hana_topology_resource_state.stdout" - -- name: Acquiring SAPHana resource name - ansible.builtin.shell: | - set -o pipefail - pcs resource config | - grep "AUTOMATED_REGISTER" -B1 | - grep "type=SAPHana" | awk '{print $2}' - register: sap_hana_resource_name - changed_when: false - -- name: Setting up SAPHana resource name as fact - ansible.builtin.set_fact: - sap_hana_resource_name: "{{ sap_hana_resource_name.stdout }}" - -- name: Acquiring SAP HANA resource state - ansible.builtin.shell: | - set -o pipefail - pcs status | - grep "{{ sap_hana_resource_name }}-clone" -A1 - register: sap_hana_resource_disabled - changed_when: false - -- name: Enabling the SAP HANA resource to start - ansible.builtin.command: pcs resource enable {{ sap_hana_resource_name }} - when: "'Stopped' and '(disabled)' in sap_hana_resource_disabled.stdout" - changed_when: true - -- name: Acquiring AUTOMATED_REGISTER value - ansible.builtin.shell: | - set -o pipefail - pcs resource config {{ sap_hana_resource_name }} | - grep "AUTOMATED_REGISTER" | - awk '{print $2}' | xargs - register: automated_register - changed_when: false - -- name: Setting AUTOMATED_REGISTER value as fact - ansible.builtin.set_fact: - automated_register: "{{ automated_register.stdout }}" - -- name: Acquiring PREFER_SITE_TAKEOVER value - ansible.builtin.shell: | - set -o pipefail - pcs resource config {{ sap_hana_resource_name }} | - grep "PREFER_SITE_TAKEOVER" | - awk '{print $5}' | xargs - register: prefer_site_takeover - changed_when: false - -- name: Setting PREFER_SITE_TAKEOVER value as fact - ansible.builtin.set_fact: - prefer_site_takeover: "{{ prefer_site_takeover.stdout }}" - -- name: Waiting for cluster to start SAP HANA resource on both nodes. 
- ansible.builtin.shell: | - set -o pipefail - pcs resource status | - grep {{ sap_hana_resource_name }} -A2 | - egrep -e "Masters|Slaves" | awk '{print $0}' - register: sap_hana_resource_state - until: "'Masters:' and 'Slaves:' in sap_hana_resource_state.stdout" - retries: 120 - delay: 10 - when: "'AUTOMATED_REGISTER=true' in automated_register" - changed_when: false - -- name: VERIFYING the running state of the SAP HANA Resource - ansible.builtin.debug: - msg: "Current node and state of SAP HANA master is {{ sap_hana_resource_state.stdout }}" - failed_when: "'Masters:' and 'Slaves:' not in sap_hana_resource_state.stdout" - when: "'AUTOMATED_REGISTER=true' in automated_register" - -- name: Waiting for cluster to start SAP HANA resource on master node. - ansible.builtin.shell: | - set -o pipefail - pcs resource status | - grep {{ sap_hana_resource_name }} -A2 | - egrep -e "Masters" | awk '{print $0}' - register: sap_hana_resource_state - until: "'Masters:' in sap_hana_resource_state.stdout" - retries: 120 - delay: 10 - when: "'AUTOMATED_REGISTER=false' in automated_register" - changed_when: false - -- name: VERIFYING the running state of the SAP HANA Resource - ansible.builtin.debug: - msg: "Current node and state of SAP HANA master is {{ sap_hana_resource_state.stdout }}" - failed_when: "'Masters:' not in sap_hana_resource_state.stdout" - when: "'AUTOMATED_REGISTER=false' in automated_register" - -- name: Acquiring SAP HANA master node name - ansible.builtin.shell: | - set -o pipefail - pcs status | grep "Masters:" | awk '{print $4}' - register: sap_hana_master_node - changed_when: false - -- name: Setting SAP HANA master node name as fact - ansible.builtin.set_fact: - sap_hana_master_node: "{{ sap_hana_master_node.stdout }}" - -- name: Waiting for 60 seconds to stabilize - ansible.builtin.pause: - seconds: 60 - when: "'AUTOMATED_REGISTER=false' in automated_register" - -- name: Acquiring SAP HANA slave node name - ansible.builtin.shell: | - set -o pipefail - pcs status | egrep -e "Slaves:|Stopped:" | awk '{print $4}' - register: sap_hana_slave_node - changed_when: false - -- name: Setting SAP HANA Slave node name as fact - ansible.builtin.set_fact: - sap_hana_slave_node: "{{ sap_hana_slave_node.stdout }}" - -- name: Acquiring SAP HANA VIP resource name - ansible.builtin.shell: | - set -o pipefail - pcs resource config | - grep "type=IPaddr2" | - grep "{{ sap_hana_sid }}_{{ sap_hana_instance_number }}" | - awk '{print $2}' - register: sap_hana_vip_name - changed_when: false - -- name: Setting up SAP HANA VIP name as fact - ansible.builtin.set_fact: - sap_hana_vip_name: "{{ sap_hana_vip_name.stdout }}" - -- name: Waiting for cluster to start the SAP HANA VIP Resource - ansible.builtin.shell: | - set -o pipefail - pcs resource status | grep "{{ sap_hana_vip_name }}" - register: sap_hana_vip_state - until: "'Started' in sap_hana_vip_state.stdout" - retries: 20 - delay: 2 - changed_when: false - -- name: VERIFYING the running/started state of the SAP HANA VIP - ansible.builtin.debug: - msg: "Current state of SAP HANA VIP is {{ sap_hana_vip_state.stdout }}" - failed_when: "'Started' not in sap_hana_vip_state.stdout" - -- name: Waiting for SOK Replication sync state - ansible.builtin.shell: | - set -o pipefail - crm_mon -A1 | - grep 'Node Attributes' -A24 | - grep hana_{{ sap_hana_sid | lower }}_sync_state - register: sap_hana_sync_state - delegate_to: "{{ sap_hana_slave_node }}" - run_once: true - until: "'SOK' in sap_hana_sync_state.stdout" - retries: 240 - delay: 2 - when: 
"'AUTOMATED_REGISTER=true' in automated_register" - changed_when: false - -- name: Verifying SOK state as {{ automated_register }} - ansible.builtin.debug: - msg: "SAP HANA Replication sync state is {{ sap_hana_sync_state.stdout }}" - failed_when: "'SOK' not in sap_hana_sync_state.stdout" - when: "'AUTOMATED_REGISTER=true' in automated_register" - -- name: Waiting for 10 seconds to stabilize - ansible.builtin.pause: - seconds: 10 - when: "'AUTOMATED_REGISTER=false' in automated_register" - -- name: Acquiring Slave Node DC Name - ansible.builtin.shell: | - set -o pipefail - pcs node attribute | - grep {{ sap_hana_slave_node }}: | - awk '{print $4}' | - cut -d '=' -f2 | xargs - delegate_to: "{{ sap_hana_slave_node }}" - run_once: true - register: sap_hana_slave_node_dc - when: "'AUTOMATED_REGISTER=false' in automated_register" - changed_when: false - -- name: Setting Slave Node DC Name as Fact - ansible.builtin.set_fact: - sap_hana_slave_node_dc: "{{ sap_hana_slave_node_dc.stdout }}" - when: "'AUTOMATED_REGISTER=false' in automated_register" - -- name: Acquiring Slave Node System Replication Mode - ansible.builtin.shell: | - set -o pipefail - pcs node attribute | grep {{ sap_hana_slave_node }}: | awk '{print $5}' | cut -d '=' -f2 | xargs - delegate_to: "{{ sap_hana_slave_node }}" - run_once: true - register: sap_hana_slave_node_srmode - when: "'AUTOMATED_REGISTER=false' in automated_register" - changed_when: false - -- name: Setting Slave Node System Replication Mode as Fact - ansible.builtin.set_fact: - sap_hana_slave_node_srmode: "{{ sap_hana_slave_node_srmode.stdout }}" - when: "'AUTOMATED_REGISTER=false' in automated_register" - -- name: Acquiring Slave Node Operation Mode - ansible.builtin.shell: | - set -o pipefail - pcs node attribute | - grep {{ sap_hana_slave_node }}: - | awk '{print $2}' - | cut -d '=' -f2 | xargs - delegate_to: "{{ sap_hana_slave_node }}" - run_once: true - register: sap_hana_slave_node_opmode - when: "'AUTOMATED_REGISTER=false' in automated_register" - changed_when: false - -- name: Setting Slave Node Operation Mode as Fact - ansible.builtin.set_fact: - sap_hana_slave_node_mode: "{{ sap_hana_slave_node_opmode.stdout }}" - when: "'AUTOMATED_REGISTER=false' in automated_register" - -- name: Acquiring current System Replication sync state - ansible.builtin.shell: | - set -o pipefail - crm_mon -A1 | - grep 'Node Attributes' -A24 | - egrep -e 'Node:|hana_{{ sap_hana_sid | lower }}_clone_state|hana_{{ sap_hana_sid | lower }}_sync_state' - register: sap_hana_sync_state - delegate_to: "{{ sap_hana_slave_node }}" - run_once: true - when: "'AUTOMATED_REGISTER=false' in automated_register" - changed_when: false - -- name: Re-Registering Former primary as Secondary to new primary - become: true - become_user: "{{ sap_hana_sid | lower }}adm" - ansible.builtin.shell: | - set -o pipefail - source /usr/sap/{{ sap_hana_sid }}/home/.sapenv.sh && \ - /usr/sap/{{ sap_hana_sid }}/HDB{{ sap_hana_instance_number }}/exe/hdbnsutil \ - -sr_register --remoteHost={{ sap_hana_master_node }} \ - --remoteInstance={{ sap_hana_instance_number }} \ - --replicationMode={{ sap_hana_slave_node_srmode }} \ - --operationMode={{ sap_hana_slave_node_opmode.stdout }} \ - --name={{ sap_hana_slave_node_dc }} --online - delegate_to: "{{ sap_hana_slave_node }}" - run_once: true - when: "'AUTOMATED_REGISTER=false' in automated_register and 'SFAIL' in sap_hana_sync_state.stdout" - register: secondary_set - changed_when: true - -- name: Checking for failed resource actions - ansible.builtin.shell: pcs status | 
grep "Failed Resource Actions" - register: failed_resource_actions - run_once: true - ignore_errors: true - changed_when: false - -- name: Cleaning failed resource actions up if necessary - ansible.builtin.command: pcs resource cleanup - when: "'Failed Resource Actions' in failed_resource_actions.stdout" - run_once: true - changed_when: true - -- name: Checking for failed Fencing actions - ansible.builtin.shell: pcs status | grep "Failed Fencing Actions" - register: failed_fencing_actions - run_once: true - ignore_errors: true - changed_when: false - -- name: Cleaning failed fencing actions - ansible.builtin.command: stonith_admin --cleanup --history "*" - when: "'Failed Fencing Actions' in failed_fencing_actions.stdout" - run_once: true - changed_when: true - -- name: Waiting for SOK Replication sync state after Re-register using Ansible - ansible.builtin.shell: | - set -o pipefail - crm_mon -A1 | - grep 'Node Attributes' -A24 | - grep hana_{{ sap_hana_sid | lower }}_sync_state - register: sap_hana_sync_state - delegate_to: "{{ sap_hana_slave_node }}" - run_once: true - until: "'SOK' in sap_hana_sync_state.stdout" - retries: 60 - delay: 1 - when: "'AUTOMATED_REGISTER=false' in automated_register" - ignore_errors: true - changed_when: false - -- name: Trying to start HANA DB manually since SOK state is still not observed - become: true - become_user: "{{ sap_hana_sid | lower }}adm" - ansible.builtin.shell: "source /usr/sap/{{ sap_hana_sid }}/home/.sapenv.sh && HDB start" - delegate_to: "{{ sap_hana_slave_node }}" - run_once: true - when: "'AUTOMATED_REGISTER=false' in automated_register and 'SFAIL' in sap_hana_sync_state.stdout" - register: secondary_set - changed_when: true - -- name: Waiting for SOK Replication sync state after Re-register using Ansible - ansible.builtin.shell: | - set -o pipefail - crm_mon -A1 | - grep 'Node Attributes' -A24 | - grep hana_{{ sap_hana_sid | lower }}_sync_state - register: sap_hana_sync_state - delegate_to: "{{ sap_hana_slave_node }}" - run_once: true - until: "'SOK' in sap_hana_sync_state.stdout" - retries: 240 - delay: 2 - when: "'AUTOMATED_REGISTER=false' in automated_register" - changed_when: false - -- name: Verifying SOK state as {{ automated_register }} - ansible.builtin.debug: - msg: "SAP HANA Replication sync state is {{ sap_hana_sync_state.stdout }}" - failed_when: "'SOK' not in sap_hana_sync_state.stdout" - when: "'AUTOMATED_REGISTER=false' in automated_register" diff --git a/ansible_collections/sap/cluster_qa/roles/pcs_find_ascs/tasks/main.yml b/ansible_collections/sap/cluster_qa/roles/pcs_find_ascs/tasks/main.yml index f1c62834..2088585f 100644 --- a/ansible_collections/sap/cluster_qa/roles/pcs_find_ascs/tasks/main.yml +++ b/ansible_collections/sap/cluster_qa/roles/pcs_find_ascs/tasks/main.yml @@ -40,6 +40,10 @@ changed_when: false failed_when: __pcs_find_ascs_sap_ascs_start_profile.rc != 0 +- name: Setting ASCS start profile path as fact + ansible.builtin.set_fact: + sap_ascs_start_profile: "{{ __pcs_find_ascs_sap_ascs_start_profile.stdout }}" + - name: Acquiring ASCS SID ansible.builtin.shell: |- set -o pipefail | diff --git a/ansible_collections/sap/cluster_qa/roles/pcs_find_hana/README.md b/ansible_collections/sap/cluster_qa/roles/pcs_find_hana/README.md deleted file mode 100644 index 225dd44b..00000000 --- a/ansible_collections/sap/cluster_qa/roles/pcs_find_hana/README.md +++ /dev/null @@ -1,38 +0,0 @@ -Role Name -========= - -A brief description of the role goes here. 
- -Requirements ------------- - -Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. - -Role Variables --------------- - -A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. - -Dependencies ------------- - -A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. - -Example Playbook ----------------- - -Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: - - - hosts: servers - roles: - - { role: username.rolename, x: 42 } - -License -------- - -BSD - -Author Information ------------------- - -An optional section for the role authors to include contact information, or a website (HTML is not allowed). diff --git a/ansible_collections/sap/cluster_qa/roles/pcs_find_hana/tasks/main.yml b/ansible_collections/sap/cluster_qa/roles/pcs_find_hana/tasks/main.yml deleted file mode 100644 index 895ea2bb..00000000 --- a/ansible_collections/sap/cluster_qa/roles/pcs_find_hana/tasks/main.yml +++ /dev/null @@ -1,20 +0,0 @@ ---- -- name: Acquiring SAP HANA SID - ansible.builtin.shell: | - set -o pipefail - pcs resource config | - grep -w SID | - head -n1 | - awk '{print $3}' | - cut -d '=' -f2 - register: __pcs_find_hana_sap_hana_sid - changed_when: false - failed_when: __pcs_find_hana_sap_hana_sid.rc != 0 - -- name: Print SAP HANA SID - ansible.builtin.debug: - msg: "SAP HANA SID is {{ __pcs_find_hana_sap_hana_sid.stdout }}" - -- name: Set fact for sap_hana_sid variable (SAP HANA SID) - ansible.builtin.set_fact: - sap_hana_sid: "{{ __pcs_find_hana_sap_hana_sid.stdout }}" diff --git a/ansible_collections/sap/cluster_qa/roles/pcs_find_instance_number/README.md b/ansible_collections/sap/cluster_qa/roles/pcs_find_instance_number/README.md deleted file mode 100644 index 225dd44b..00000000 --- a/ansible_collections/sap/cluster_qa/roles/pcs_find_instance_number/README.md +++ /dev/null @@ -1,38 +0,0 @@ -Role Name -========= - -A brief description of the role goes here. - -Requirements ------------- - -Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. - -Role Variables --------------- - -A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. - -Dependencies ------------- - -A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. 
- -Example Playbook ----------------- - -Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: - - - hosts: servers - roles: - - { role: username.rolename, x: 42 } - -License -------- - -BSD - -Author Information ------------------- - -An optional section for the role authors to include contact information, or a website (HTML is not allowed). diff --git a/ansible_collections/sap/cluster_qa/roles/pcs_find_instance_number/meta/main.yml b/ansible_collections/sap/cluster_qa/roles/pcs_find_instance_number/meta/main.yml deleted file mode 100644 index f5f3fdb3..00000000 --- a/ansible_collections/sap/cluster_qa/roles/pcs_find_instance_number/meta/main.yml +++ /dev/null @@ -1,13 +0,0 @@ ---- -galaxy_info: - author: Amir Memon (@amemon) - description: your role description - license: GPL-3.0-only - min_ansible_version: "2.15" - platforms: - - name: EL - versions: - - "8" - - "9" - galaxy_tags: [] -dependencies: [] diff --git a/ansible_collections/sap/cluster_qa/roles/pcs_find_instance_number/tasks/main.yml b/ansible_collections/sap/cluster_qa/roles/pcs_find_instance_number/tasks/main.yml deleted file mode 100644 index 14abb7b1..00000000 --- a/ansible_collections/sap/cluster_qa/roles/pcs_find_instance_number/tasks/main.yml +++ /dev/null @@ -1,18 +0,0 @@ ---- -- name: Acquiring SAP HANA Instance Number - ansible.builtin.shell: | - set -o pipefail - pcs resource config | - grep -w InstanceNumber | - awk '{print $2}' | head -n1 | cut -d '=' -f2 - register: __pcs_find_sap_hana_instance_number - changed_when: false - failed_when: __pcs_find_sap_hana_instance_number.rc != 0 - -- name: Print SAP HANA Instance Number - ansible.builtin.debug: - msg: "SAP HANA Instance Number is {{ __pcs_find_sap_hana_instance_number.stdout }}" - -- name: Setting up SAP HANA Instance Number as fact - ansible.builtin.set_fact: - sap_hana_instance_number: "{{ __pcs_find_sap_hana_instance_number.stdout }}" diff --git a/ansible_collections/sap/cluster_qa/roles/test02/molecule/default/prepare.yml b/ansible_collections/sap/cluster_qa/roles/test02/molecule/default/prepare.yml index 58d08134..fc10b0a6 100644 --- a/ansible_collections/sap/cluster_qa/roles/test02/molecule/default/prepare.yml +++ b/ansible_collections/sap/cluster_qa/roles/test02/molecule/default/prepare.yml @@ -4,8 +4,9 @@ gather_facts: false tasks: - name: Starting the cluster - community.general.pacemaker_cluster: - state: online - timeout: 900 + ansible.builtin.shell: | + pcs cluster start --all + pcs cluster enable --all become: true become_user: root + changed_when: false diff --git a/ansible_collections/sap/cluster_qa/roles/test09/README.md b/ansible_collections/sap/cluster_qa/roles/test09/README.md new file mode 100644 index 00000000..deab89d5 --- /dev/null +++ b/ansible_collections/sap/cluster_qa/roles/test09/README.md @@ -0,0 +1,100 @@ +test09 +========= + +This role tests the SAP Message Server automatic restart mechanism and its interaction with the HA solution. It verifies that recoverable Message Server outages are handled correctly by the SAP Start Service and that unrecoverable failures trigger appropriate HA responses. 
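+
+As a rough manual illustration of what this role automates (a sketch only; the instance number `00`, the `<sid>adm` user, and the PID placeholder are assumptions for the example, not values used by the role), the automatic restart can be observed by killing the process and re-checking its PID:
+
+```bash
+# On the ASCS node, as the <sid>adm user (placeholder instance number 00)
+sapcontrol -nr 00 -function GetProcessList | grep msg_server   # note the current PID
+kill -9 <msg_server_pid>                                       # simulate the Message Server outage
+sleep 30                                                       # give sapstartsrv time to react
+sapcontrol -nr 00 -function GetProcessList | grep msg_server   # a new PID indicates an automatic restart
+```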
+ +**Test Purpose:** +- **Verify Restart_Program parameter configuration** for Message Server (auto-configure if missing) +- Verify Message Server automatic restart functionality via SAP Start Service +- Test interaction between SAP automatic restart and HA solution +- Ensure HA solution responds appropriately when automatic restart fails +- Validate that ASCS failover respects ERS location constraints + +**Test Procedure:** +1. **Validate Restart_Program parameter** is configured in ASCS profile (auto-insert if missing) +2. Kill Message Server process repeatedly (up to 6 times by default) +3. Monitor SAP Start Service automatic restart behavior +4. Verify HA solution response when automatic restart threshold is exceeded +5. Ensure ASCS and ERS remain on different nodes throughout + +Requirements +------------ + +A 3 or more node pacemaker cluster managing S4/HANA ASCS and ERS Instances using the `SAPInstance` resource agent with the SAP HA interface for SAP ABAP application server instances as mentioned in: https://access.redhat.com/solutions/3606101. + +**Prerequisites:** +- **SAP Profile Parameter "Restart_Program" must be configured for Message Server** (auto-configured by test if missing) +- SAP system running in stable mode with HA solution activated +- 3+ node cluster setup required + +**Reference:** [SAP Support Content: Message Server Restart](https://help.sap.com/docs/SUPPORT_CONTENT/si/3362959619.html?locale=en-US) + +**Restart_Program Configuration Example:** +``` +Restart_Program_01 = local $(DIR_EXECUTABLE)/msg_server pf=$(DIR_PROFILE)/$(SAPSYSTEMNAME)_$(INSTANCE_NAME)_$(HOSTNAME) +``` + +Role Variables +-------------- + +This role uses variables provided by the `sap.cluster_qa.pcs_find_ascs` and `sap.cluster_qa.pcs_find_ers` roles: +- `sap_ascs_node_name` - The node where ASCS is currently running +- `sap_ers_node_name` - The node where ERS is currently running +- `sap_ascs_resource_name` - The name of the ASCS resource in the cluster +- `sap_ascs_instance_number` - The ASCS instance number +- `max_kill_attempts` - Maximum Message Server kill attempts (default: 6) +- `sap_ascs_start_profile` - Path to ASCS profile file (used for Restart_Program validation) + +**Expected Outcomes:** +- **Restart_Program parameter validation passes** (auto-configured if missing) +- Message Server restarts automatically via SAP Start Service (recoverable errors) +- Process ID changes with each restart +- Restart events logged in sapstartsrv.log/sapstart.log +- HA solution triggers ASCS restart/failover after restart threshold exceeded +- ASCS never moves to ERS node + +**Auto-Configuration Feature:** +If the `Restart_Program` parameter is not found, the test will automatically: +- Check for existing `Start_Program` parameter for Message Server +- **Replace `Start_Program` with `Restart_Program`** if found (to avoid conflicts) +- Insert `_MS = ms.sap$(SAPSYSTEMNAME)_$(INSTANCE_NAME)` variable definition if needed +- Add `Restart_Program_00 = local $(_MS) pf=$(_PF)` parameter if no existing Start_Program +- Create backup of original profile before modification +- **Restart sapstartsrv service** to apply the new configuration +- **Wait for cluster to detect ASCS resource failures** after service restart +- **Wait for ASCS resource to be fully started** by the cluster +- **Re-discover ASCS location** after cluster recovery (may cause failover) +- Verify successful configuration before proceeding + +**Important Note:** When the `Restart_Program` parameter is automatically configured, the sapstartsrv 
service will be restarted, which causes the cluster to detect resource failures and may trigger ASCS failover to another node. The test waits for complete cluster recovery before proceeding.
+
+**Configuration Logic:**
+- If `Start_Program_XX = local $(_MS) pf=$(_PF)` exists → Replace with `Restart_Program_00 = local $(_MS) pf=$(_PF)`
+- If no Start_Program exists → Add both `_MS` variable and `Restart_Program_00` parameter
+
+Dependencies
+------------
+
+- `sap.cluster_qa.pcs_find_ascs` - Required to locate the ASCS node and resource information
+- `sap.cluster_qa.pcs_find_ers` - Required to locate the ERS node and resource information
+- `sap.sap_operations` - Required for host_info and pcs_status_info modules
+
+Example Playbook
+----------------
+
+An example of how to use this role:
+
+    - hosts: servers
+      roles:
+         - sap.cluster_qa.test09
+
+License
+-------
+
+GPLv3
+
+Author Information
+------------------
+
+Amir Memon (@amemon-redhat)
+Kirill Satarin (@kksat)
\ No newline at end of file
diff --git a/ansible_collections/sap/cluster_qa/roles/pcs_find_hana/meta/main.yml b/ansible_collections/sap/cluster_qa/roles/test09/meta/main.yml
similarity index 60%
rename from ansible_collections/sap/cluster_qa/roles/pcs_find_hana/meta/main.yml
rename to ansible_collections/sap/cluster_qa/roles/test09/meta/main.yml
index 7959dfb7..02b62d8c 100644
--- a/ansible_collections/sap/cluster_qa/roles/pcs_find_hana/meta/main.yml
+++ b/ansible_collections/sap/cluster_qa/roles/test09/meta/main.yml
@@ -1,13 +1,14 @@
 ---
 galaxy_info:
   author: Amir Memon (@amemon-redhat)
-  description: Find HANA SID in pacemaker cluster
-  license: GPL-3.0-only
+  description: Run test09 - Message Server automatic restart and HA interaction test
+  license: GPL-3.0-only
   min_ansible_version: "2.15"
   platforms:
     - name: EL
       versions:
         - "8"
         - "9"
+        - "10"
   galaxy_tags: []
 dependencies: []
diff --git a/ansible_collections/sap/cluster_qa/roles/test09/tasks/kill_message_server.yml b/ansible_collections/sap/cluster_qa/roles/test09/tasks/kill_message_server.yml
new file mode 100644
index 00000000..c4ff024a
--- /dev/null
+++ b/ansible_collections/sap/cluster_qa/roles/test09/tasks/kill_message_server.yml
@@ -0,0 +1,125 @@
+---
+- name: Get current Message Server process info
+  sap.sap_operations.host_info:
+  register: current_ascs_host_info
+  when: ansible_hostname == sap_ascs_node_name_initial
+
+- name: Store current Message Server PID
+  ansible.builtin.set_fact:
+    current_msg_server_pid: >-
+      {{ (msg_server_process_list | selectattr('name', 'equalto', 'msg_server') | first)['pid']
+      if msg_server_process_list | length > 0 else 'NO_INSTANCE' }}
+    previous_msg_server_pid: "{{ previous_msg_server_pid | default('none') }}"
+    ascs_instance_found: "{{ ascs_instance_list | length > 0 }}"
+  vars:
+    ascs_instance_list: >-
+      {{ current_ascs_host_info.instances | default([]) | selectattr('mSystemNumber', 'equalto', sap_ascs_instance_number) | list | default([]) }}
+    msg_server_process_list: >-
+      {{ (ascs_instance_list | first | default({}))['ProcessList'] | default([]) | selectattr('name', 'equalto', 'msg_server') | list | default([]) }}
+  when:
+    - ansible_hostname == sap_ascs_node_name_initial
+    - current_ascs_host_info is defined
+    - current_ascs_host_info.instances is defined
+
+- name: Handle case when ASCS instance not found
+  ansible.builtin.set_fact:
+    current_msg_server_pid: "NO_INSTANCE"
+    ascs_instance_found: false
+    
msg_server_restarted: false + when: + - ansible_hostname == sap_ascs_node_name_initial + - current_ascs_host_info is defined + - (current_ascs_host_info.instances | default([]) | + selectattr('mSystemNumber', 'equalto', sap_ascs_instance_number) | list | length == 0) + +- name: Display Message Server PID info + ansible.builtin.debug: + msg: | + Kill attempt {{ kill_attempt }}: + - ASCS instance found: {{ ascs_instance_found | default(false) }} + - Current PID: {{ current_msg_server_pid | default('N/A') }} + - Previous PID: {{ previous_msg_server_pid | default('none') }} + {% if not (ascs_instance_found | default(false)) %} + - WARNING: ASCS instance {{ sap_ascs_instance_number }} not found in process list + {% endif %} + when: ansible_hostname == sap_ascs_node_name_initial + +- name: Killing the Message Server process + ansible.builtin.command: "kill -9 {{ current_msg_server_pid }}" + changed_when: true + when: + - ansible_hostname == sap_ascs_node_name_initial + - current_msg_server_pid is defined + - current_msg_server_pid != "NO_INSTANCE" + - ascs_instance_found | default(false) + +- name: Update kill counter + ansible.builtin.set_fact: + message_server_kill_count: "{{ kill_attempt }}" + previous_msg_server_pid: "{{ current_msg_server_pid | default('unknown') }}" + +- name: Wait for SAP automatic restart or HA intervention + ansible.builtin.pause: + seconds: 30 + prompt: >- + Waiting for SAP automatic restart or HA intervention after kill attempt {{ kill_attempt }} + +- name: Check if ASCS resource is still running on original node + sap.sap_operations.pcs_status_info: + register: ascs_status_check + run_once: true + +- name: Verify ASCS resource status + ansible.builtin.set_fact: + ascs_still_on_original_node: >- + {{ ascs_status_check | sap.sap_operations.pcs_resources_from_status(role='Started', id=sap_ascs_resource_name) | length > 0 }} + run_once: true + +- name: Check if Message Server process is running again + sap.sap_operations.host_info: + register: restart_check_host_info + failed_when: false + when: + - ansible_hostname == sap_ascs_node_name_initial + - ascs_still_on_original_node | bool + +- name: Determine if Message Server restarted automatically + ansible.builtin.set_fact: + msg_server_restarted: "{{ (restart_msg_server_list | length > 0) }}" + restart_ascs_instance_found: "{{ restart_ascs_instance_list | length > 0 }}" + vars: + restart_ascs_instance_list: >- + {{ restart_check_host_info.instances | default([]) | + selectattr('mSystemNumber', 'equalto', sap_ascs_instance_number) | list | default([]) }} + restart_msg_server_list: >- + {{ (restart_ascs_instance_list | first | default({}))['ProcessList'] | default([]) | + selectattr('name', 'equalto', 'msg_server') | list | default([]) }} + when: + - ansible_hostname == sap_ascs_node_name_initial + - ascs_still_on_original_node | bool + - restart_check_host_info is defined + - not (restart_check_host_info.failed | default(false)) + +- name: Set restart status to false if ASCS moved + ansible.builtin.set_fact: + msg_server_restarted: false + when: + - not (ascs_still_on_original_node | bool) + +- name: Display restart status + ansible.builtin.debug: + msg: | + After kill {{ kill_attempt }}: + - Message Server restarted: {{ msg_server_restarted | default(false) }} + - ASCS on original node: {{ ascs_still_on_original_node }} + - ASCS instance found during kill: {{ ascs_instance_found | default(false) }} + - ASCS instance found during restart check: {{ restart_ascs_instance_found | default(false) }} + when: ansible_hostname == 
sap_ascs_node_name_initial + +- name: Set global fact to stop further iterations if Message Server stopped restarting + ansible.builtin.set_fact: + msg_server_restarted: false + when: + - (not (msg_server_restarted | default(false) | bool)) or + (not (ascs_instance_found | default(true) | bool)) + - kill_attempt | int >= 2 diff --git a/ansible_collections/sap/cluster_qa/roles/test09/tasks/main.yml b/ansible_collections/sap/cluster_qa/roles/test09/tasks/main.yml new file mode 100644 index 00000000..23f8259f --- /dev/null +++ b/ansible_collections/sap/cluster_qa/roles/test09/tasks/main.yml @@ -0,0 +1,39 @@ +--- +- name: Clean-up of cluster + ansible.builtin.shell: |- + set -o pipefail | + pcs resource cleanup + run_once: true + +- name: Collect necessary gather_facts + ansible.builtin.setup: + gather_subset: + - min + +- name: Finding ASCS node name + ansible.builtin.include_role: + name: sap.cluster_qa.pcs_find_ascs + +- name: Store original ASCS node name before any modifications + ansible.builtin.set_fact: + sap_ascs_node_name_original: "{{ sap_ascs_node_name }}" + +- name: Prerequisite verification + ansible.builtin.include_tasks: + file: prerequisite.yml + +- name: Test setup + ansible.builtin.include_tasks: + file: test_setup.yml + +- name: Test execution + ansible.builtin.include_tasks: + file: test_execution.yml + +- name: Test verification + ansible.builtin.include_tasks: + file: test_verification.yml + +- name: Test summary + ansible.builtin.include_tasks: + file: test_summary.yml diff --git a/ansible_collections/sap/cluster_qa/roles/test09/tasks/prerequisite.yml b/ansible_collections/sap/cluster_qa/roles/test09/tasks/prerequisite.yml new file mode 100644 index 00000000..d370789f --- /dev/null +++ b/ansible_collections/sap/cluster_qa/roles/test09/tasks/prerequisite.yml @@ -0,0 +1,43 @@ +--- +- name: Check if ASCS profile file exists + ansible.builtin.stat: + path: "{{ sap_ascs_start_profile }}" + register: profile_file_stat + run_once: true + delegate_to: "{{ sap_ascs_node_name }}" + +- name: Check for Restart_Program parameter in ASCS profile + ansible.builtin.shell: | + grep "Restart_Program" "{{ sap_ascs_start_profile }}" | grep "_MS" || echo "NOT_FOUND" + register: restart_program_check + changed_when: false + run_once: true + delegate_to: "{{ sap_ascs_node_name }}" + when: profile_file_stat.stat.exists + +- name: Verify Restart_Program parameter is configured for Message Server + ansible.builtin.assert: + that: + - profile_file_stat.stat.exists + - restart_program_check.stdout != "NOT_FOUND" + - restart_program_check.stdout | length > 0 + fail_msg: | + PREREQUISITE FAILED: Restart_Program parameter for Message Server not found in ASCS profile. + Profile checked: {{ sap_ascs_start_profile }} + Profile exists: {{ profile_file_stat.stat.exists | default(false) }} + + Please manually configure it according to: https://help.sap.com/docs/SUPPORT_CONTENT/si/3362959619.html + + Expected configuration format: + _MS = ms.sap$(SAPSYSTEMNAME)_$(INSTANCE_NAME) + Restart_Program_00 = local $(_MS) pf=$(_PF) + + Or the expanded form: + Restart_Program_00 = local ms.sapSYSTEMNAME_INSTANCENAME pf=/path/to/profile + + TEST WILL STOP HERE - Fix the configuration and re-run the test. + success_msg: | + ✓ PREREQUISITE VERIFIED: Restart_Program parameter found for Message Server. + Configuration: {{ restart_program_check.stdout }} + ✓ Proceeding with Message Server kill test... 
+ run_once: true diff --git a/ansible_collections/sap/cluster_qa/roles/test09/tasks/test_execution.yml b/ansible_collections/sap/cluster_qa/roles/test09/tasks/test_execution.yml new file mode 100644 index 00000000..8f2657b6 --- /dev/null +++ b/ansible_collections/sap/cluster_qa/roles/test09/tasks/test_execution.yml @@ -0,0 +1,12 @@ +--- +- name: Execute Message Server kill attempts + ansible.builtin.include_tasks: kill_message_server.yml + loop: "{{ range(1, max_kill_attempts | int + 1) | list }}" + loop_control: + loop_var: kill_attempt + when: msg_server_restarted | bool + +- name: Wait for HA solution to respond to unrecoverable Message Server failure + ansible.builtin.pause: + seconds: 60 + prompt: "Waiting for HA solution to respond to repeated Message Server failures" diff --git a/ansible_collections/sap/cluster_qa/roles/test09/tasks/test_setup.yml b/ansible_collections/sap/cluster_qa/roles/test09/tasks/test_setup.yml new file mode 100644 index 00000000..b7e97834 --- /dev/null +++ b/ansible_collections/sap/cluster_qa/roles/test09/tasks/test_setup.yml @@ -0,0 +1,18 @@ +--- +- name: Finding ERS node name + ansible.builtin.include_role: + name: sap.cluster_qa.pcs_find_ers + +- name: Setting initial facts + ansible.builtin.set_fact: + sap_ascs_node_name_initial: "{{ sap_ascs_node_name }}" + sap_ers_node_name_initial: "{{ sap_ers_node_name }}" + message_server_kill_count: 0 + max_kill_attempts: 6 + msg_server_restarted: true + +- name: Verify ASCS and ERS are on different nodes initially + ansible.builtin.assert: + that: sap_ascs_node_name_initial != sap_ers_node_name_initial + fail_msg: "ASCS and ERS are on the same node initially, which violates HA setup requirements" + success_msg: "ASCS on {{ sap_ascs_node_name_initial }}, ERS on {{ sap_ers_node_name_initial }} - proper HA setup confirmed" diff --git a/ansible_collections/sap/cluster_qa/roles/test09/tasks/test_summary.yml b/ansible_collections/sap/cluster_qa/roles/test09/tasks/test_summary.yml new file mode 100644 index 00000000..ab85467f --- /dev/null +++ b/ansible_collections/sap/cluster_qa/roles/test09/tasks/test_summary.yml @@ -0,0 +1,16 @@ +--- +- name: Display test summary + ansible.builtin.debug: + msg: + - "===============================================" + - " TEST09 SUMMARY" + - "===============================================" + - "Message Server killed: {{ message_server_kill_count }} times" + - "Initial ASCS location: {{ sap_ascs_node_name_initial }}" + - "Final ASCS location: {{ sap_ascs_node_name_final }}" + - "Initial ERS location: {{ sap_ers_node_name_initial }}" + - "Final ERS location: {{ sap_ers_node_name_final }}" + - "HA Action taken: {{ 'ASCS Failover' if sap_ascs_node_name_initial != sap_ascs_node_name_final else 'ASCS Restart on same node' }}" + - "ASCS/ERS separation maintained: {{ 'YES' if sap_ascs_node_name_final != sap_ers_node_name_final else 'NO' }}" + - "===============================================" + run_once: true diff --git a/ansible_collections/sap/cluster_qa/roles/test09/tasks/test_verification.yml b/ansible_collections/sap/cluster_qa/roles/test09/tasks/test_verification.yml new file mode 100644 index 00000000..ed1fa11c --- /dev/null +++ b/ansible_collections/sap/cluster_qa/roles/test09/tasks/test_verification.yml @@ -0,0 +1,20 @@ +--- +- name: Check final ASCS location after HA intervention + ansible.builtin.include_role: + name: sap.cluster_qa.pcs_find_ascs + +- name: Check final ERS location + ansible.builtin.include_role: + name: sap.cluster_qa.pcs_find_ers + +- name: Set final location 
facts + ansible.builtin.set_fact: + sap_ascs_node_name_final: "{{ sap_ascs_node_name }}" + sap_ers_node_name_final: "{{ sap_ers_node_name }}" + +- name: Verify HA solution responded appropriately + ansible.builtin.assert: + that: + - sap_ascs_node_name_final != sap_ers_node_name_final + fail_msg: "HA solution failed: ASCS and ERS ended up on the same node ({{ sap_ascs_node_name_final }})" + success_msg: "HA solution succeeded: ASCS on {{ sap_ascs_node_name_final }}, ERS on {{ sap_ers_node_name_final }}" diff --git a/requirements.yml b/requirements.yml index c0e9d954..4e5be4c7 100644 --- a/requirements.yml +++ b/requirements.yml @@ -2,7 +2,3 @@ collections: - name: "sap.sap_operations" version: ">=2.10.0" - - name: "ansible.posix" - version: ">=1.5.4" - - name: "community.general" - version: ">=7.3.0" diff --git a/tests/ansible.cfg b/tests/ansible.cfg deleted file mode 100644 index 8ef7c7a2..00000000 --- a/tests/ansible.cfg +++ /dev/null @@ -1,14 +0,0 @@ -[defaults] -inventory = ./inventory -remote_user = root -ask_pass = false -timeout = 60 -force_color = 1 -# stdout_callback = yaml -bin_ansible_callbacks = true - -[privilege_escalation] -become = true -become_method = sudo -become_user = root -become_ask_pass = false diff --git a/tests/inventory/ppc64le.yml b/tests/inventory/ppc64le.yml index 7f8edcd8..4c4d6a88 100644 --- a/tests/inventory/ppc64le.yml +++ b/tests/inventory/ppc64le.yml @@ -4,9 +4,14 @@ all: s4hana-3n: hosts: lsh40410: + ansible_user: root lsh40411: + ansible_user: root lsh40412: + ansible_user: root s4hana-2n: hosts: lsh40410: + ansible_user: root lsh40411: + ansible_user: root diff --git a/tests/inventory/x86_64.yml b/tests/inventory/x86_64.yml index 79071865..af406547 100644 --- a/tests/inventory/x86_64.yml +++ b/tests/inventory/x86_64.yml @@ -4,9 +4,14 @@ all: s4hana-3n: hosts: s4hana17: + ansible_user: root s4hana18: + ansible_user: root s4hana19: + ansible_user: root s4hana-2n: hosts: s4hana17: + ansible_user: root s4hana18: + ansible_user: root