From e7d1504b138f100fa186f324373ff8b9e9ad5222 Mon Sep 17 00:00:00 2001 From: Zeeshan Lakhani Date: Fri, 19 Dec 2025 14:14:35 +0000 Subject: [PATCH] [perf-cleanup] remove redundant tests and update mcast polling --- .../tests/integration_tests/multicast/api.rs | 235 +----------- .../multicast/cache_invalidation.rs | 26 +- .../integration_tests/multicast/failures.rs | 238 +++++------- .../integration_tests/multicast/instances.rs | 342 +----------------- .../tests/integration_tests/multicast/mod.rs | 47 +-- .../multicast/networking_integration.rs | 262 +++----------- 6 files changed, 168 insertions(+), 982 deletions(-) diff --git a/nexus/tests/integration_tests/multicast/api.rs b/nexus/tests/integration_tests/multicast/api.rs index bf9a2ba5056..52240a45faa 100644 --- a/nexus/tests/integration_tests/multicast/api.rs +++ b/nexus/tests/integration_tests/multicast/api.rs @@ -461,7 +461,13 @@ async fn test_join_by_ip_ssm_with_sources(cptestctx: &ControlPlaneTestContext) { } /// Test SSM join-by-IP without sources should fail. +/// /// SSM addresses (232.0.0.0/8) require source IPs for implicit creation. +/// +/// This is the canonical test for SSM source validation. The validation +/// code path is shared regardless of how you join (by IP, name, or ID) - +/// all routes converge on the same `instance_multicast_group_join` logic +/// that checks `is_ssm_address()` and rejects joins without sources. #[nexus_test] async fn test_join_by_ip_ssm_without_sources_fails( cptestctx: &ControlPlaneTestContext, @@ -516,158 +522,6 @@ async fn test_join_by_ip_ssm_without_sources_fails( cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; } -/// Test joining an existing SSM group by ID without sources should fail. -/// -/// This tests the SSM validation for join-by-ID path: if an SSM group exists -/// (created by first instance with sources), a second instance cannot join -/// by group ID without providing sources. -#[nexus_test] -async fn test_join_existing_ssm_group_by_id_without_sources_fails( - cptestctx: &ControlPlaneTestContext, -) { - let client = &cptestctx.external_client; - let project_name = "ssm-id-fail-project"; - - // Setup: SSM pool - let (_, _, _ssm_pool) = ops::join3( - create_project(client, project_name), - create_default_ip_pool(client), - create_multicast_ip_pool_with_range( - client, - "ssm-id-fail-pool", - (232, 40, 0, 1), - (232, 40, 0, 255), - ), - ) - .await; - - create_instance(client, project_name, "ssm-id-inst-1").await; - create_instance(client, project_name, "ssm-id-inst-2").await; - - // First instance creates SSM group with sources - let ssm_ip = "232.40.0.100"; - let source_ip: IpAddr = "10.40.0.1".parse().unwrap(); - let join_url_1 = format!( - "/v1/instances/ssm-id-inst-1/multicast-groups/{ssm_ip}?project={project_name}" - ); - - let join_body_1 = - InstanceMulticastGroupJoin { source_ips: Some(vec![source_ip]) }; - let member_1: MulticastGroupMember = - put_upsert(client, &join_url_1, &join_body_1).await; - - let group_id = member_1.multicast_group_id; - - // Second instance tries to join by group ID WITHOUT sources - should fail - let join_url_by_id = format!( - "/v1/instances/ssm-id-inst-2/multicast-groups/{group_id}?project={project_name}" - ); - - let error = NexusRequest::new( - RequestBuilder::new(client, Method::PUT, &join_url_by_id) - .body(Some(&InstanceMulticastGroupJoin { - source_ips: None, // No sources! - })) - .expect_status(Some(StatusCode::BAD_REQUEST)), - ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .expect("Join by ID without sources should fail for SSM group"); - - let error_body: dropshot::HttpErrorResponseBody = - error.parsed_body().unwrap(); - assert!( - error_body.message.contains("SSM") - || error_body.message.contains("source"), - "Error should mention SSM or source IPs: {}", - error_body.message - ); - - let expected_group_name = format!("mcast-{}", ssm_ip.replace('.', "-")); - cleanup_instances( - cptestctx, - client, - project_name, - &["ssm-id-inst-1", "ssm-id-inst-2"], - ) - .await; - wait_for_group_deleted(client, &expected_group_name).await; -} - -/// Test joining an existing SSM group by NAME without sources should fail. -#[nexus_test] -async fn test_join_existing_ssm_group_by_name_without_sources_fails( - cptestctx: &ControlPlaneTestContext, -) { - let client = &cptestctx.external_client; - let project_name = "ssm-name-fail-project"; - - // Setup: SSM pool - let (_, _, _ssm_pool) = ops::join3( - create_project(client, project_name), - create_default_ip_pool(client), - create_multicast_ip_pool_with_range( - client, - "ssm-name-fail-pool", - (232, 45, 0, 1), - (232, 45, 0, 100), - ), - ) - .await; - - create_instance(client, project_name, "ssm-name-inst-1").await; - create_instance(client, project_name, "ssm-name-inst-2").await; - - // First instance creates SSM group with sources - let ssm_ip = "232.45.0.50"; - let join_url = format!( - "/v1/instances/ssm-name-inst-1/multicast-groups/{ssm_ip}?project={project_name}" - ); - let join_body = InstanceMulticastGroupJoin { - source_ips: Some(vec!["10.0.0.1".parse().unwrap()]), - }; - - put_upsert::<_, MulticastGroupMember>(client, &join_url, &join_body).await; - - // Get the group's auto-generated name - let expected_group_name = format!("mcast-{}", ssm_ip.replace('.', "-")); - - // Second instance tries to join by NAME without sources - should fail - let join_by_name_url = format!( - "/v1/instances/ssm-name-inst-2/multicast-groups/{expected_group_name}?project={project_name}" - ); - let join_body_no_sources = InstanceMulticastGroupJoin { source_ips: None }; - - let error = NexusRequest::new( - RequestBuilder::new(client, Method::PUT, &join_by_name_url) - .body(Some(&join_body_no_sources)) - .expect_status(Some(StatusCode::BAD_REQUEST)), - ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .expect("Join by name without sources should fail for SSM group"); - - let error_body: dropshot::HttpErrorResponseBody = - error.parsed_body().unwrap(); - assert!( - error_body.message.contains("SSM") - || error_body.message.contains("source"), - "Error should mention SSM or source IPs: {}", - error_body.message - ); - - cleanup_instances( - cptestctx, - client, - project_name, - &["ssm-name-inst-1", "ssm-name-inst-2"], - ) - .await; - wait_for_group_deleted(client, &expected_group_name).await; -} - /// Test that SSM join-by-IP with empty sources array fails. /// /// `source_ips: Some(vec![])` (empty array) is treated the same as @@ -725,83 +579,6 @@ async fn test_ssm_with_empty_sources_array_fails( cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; } -/// Test joining an existing SSM group by IP without sources fails. -/// -/// When an SSM group already exists (created by first instance with sources), -/// a second instance joining by IP should still fail without sources since -/// the group is SSM. -#[nexus_test] -async fn test_join_existing_ssm_group_by_ip_without_sources_fails( - cptestctx: &ControlPlaneTestContext, -) { - let client = &cptestctx.external_client; - let project_name = "ssm-ip-existing-fail-project"; - - // Setup: SSM pool - let (_, _, _ssm_pool) = ops::join3( - create_project(client, project_name), - create_default_ip_pool(client), - create_multicast_ip_pool_with_range( - client, - "ssm-ip-existing-fail-pool", - (232, 47, 0, 1), - (232, 47, 0, 100), - ), - ) - .await; - - create_instance(client, project_name, "ssm-ip-inst-1").await; - create_instance(client, project_name, "ssm-ip-inst-2").await; - - // First instance creates SSM group with sources - let ssm_ip = "232.47.0.50"; - let join_url = format!( - "/v1/instances/ssm-ip-inst-1/multicast-groups/{ssm_ip}?project={project_name}" - ); - let join_body = InstanceMulticastGroupJoin { - source_ips: Some(vec!["10.0.0.1".parse().unwrap()]), - }; - - put_upsert::<_, MulticastGroupMember>(client, &join_url, &join_body).await; - - let expected_group_name = format!("mcast-{}", ssm_ip.replace('.', "-")); - - // Second instance tries to join by IP without sources - should fail - // Even though the group exists, SSM still requires sources - let join_url_2 = format!( - "/v1/instances/ssm-ip-inst-2/multicast-groups/{ssm_ip}?project={project_name}" - ); - let join_body_no_sources = InstanceMulticastGroupJoin { source_ips: None }; - - let error = NexusRequest::new( - RequestBuilder::new(client, Method::PUT, &join_url_2) - .body(Some(&join_body_no_sources)) - .expect_status(Some(StatusCode::BAD_REQUEST)), - ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .expect("Join existing SSM group by IP without sources should fail"); - - let error_body: dropshot::HttpErrorResponseBody = - error.parsed_body().unwrap(); - assert!( - error_body.message.contains("SSM") - || error_body.message.contains("source"), - "Error should mention SSM or source IPs: {}", - error_body.message - ); - - cleanup_instances( - cptestctx, - client, - project_name, - &["ssm-ip-inst-1", "ssm-ip-inst-2"], - ) - .await; - wait_for_group_deleted(client, &expected_group_name).await; -} - /// Test join-by-IP with IP not in any pool should fail. #[nexus_test] async fn test_join_by_ip_not_in_pool_fails( diff --git a/nexus/tests/integration_tests/multicast/cache_invalidation.rs b/nexus/tests/integration_tests/multicast/cache_invalidation.rs index ccf8c69db2e..a0bb73b812c 100644 --- a/nexus/tests/integration_tests/multicast/cache_invalidation.rs +++ b/nexus/tests/integration_tests/multicast/cache_invalidation.rs @@ -284,15 +284,15 @@ async fn test_cache_ttl_driven_refresh() { "test_cache_ttl_driven_refresh", ) .customize_nexus_config(&|config| { - // Set short cache TTLs for testing (2 seconds for sled cache) + // Set short cache TTLs for testing config.pkg.background_tasks.multicast_reconciler.sled_cache_ttl_secs = - chrono::TimeDelta::seconds(2).to_std().unwrap(); + chrono::TimeDelta::milliseconds(500).to_std().unwrap(); config .pkg .background_tasks .multicast_reconciler .backplane_cache_ttl_secs = - chrono::TimeDelta::seconds(1).to_std().unwrap(); + chrono::TimeDelta::milliseconds(250).to_std().unwrap(); // Ensure multicast is enabled config.pkg.multicast.enabled = true; @@ -435,9 +435,8 @@ async fn test_cache_ttl_driven_refresh() { .await .expect("Should insert new inventory collection"); - // Wait for cache TTL to expire (sled_cache_ttl = 1 second) - // Sleep for 1.5 seconds to ensure TTL has expired - tokio::time::sleep(std::time::Duration::from_millis(1500)).await; + // Wait for cache TTL to expire (sled_cache_ttl = 500ms) + tokio::time::sleep(std::time::Duration::from_millis(600)).await; wait_for_condition_with_reconciler( &cptestctx.lockstep_client, @@ -487,19 +486,17 @@ async fn test_backplane_cache_ttl_expiry() { "test_backplane_cache_ttl_expiry", ) .customize_nexus_config(&|config| { - // Set backplane cache TTL to 1 second (shorter than sled cache to test - // independently) + // Set backplane cache TTL short (shorter than sled cache to test independently) config .pkg .background_tasks .multicast_reconciler .backplane_cache_ttl_secs = - chrono::TimeDelta::seconds(1).to_std().unwrap(); + chrono::TimeDelta::milliseconds(250).to_std().unwrap(); - // Keep sled cache TTL longer to ensure we're testing backplane cache - // expiry + // Keep sled cache TTL longer to ensure we're testing backplane cache expiry config.pkg.background_tasks.multicast_reconciler.sled_cache_ttl_secs = - chrono::TimeDelta::seconds(10).to_std().unwrap(); + chrono::TimeDelta::seconds(2).to_std().unwrap(); // Ensure multicast is enabled config.pkg.multicast.enabled = true; @@ -550,9 +547,8 @@ async fn test_backplane_cache_ttl_expiry() { .await .expect("Should verify initial port mapping"); - // Wait for backplane cache TTL to expire (500ms) but not sled cache (5 seconds) - // Sleep for 1 second to ensure backplane TTL has expired - tokio::time::sleep(std::time::Duration::from_secs(1)).await; + // Wait for backplane cache TTL to expire (250ms) but not sled cache (2 seconds) + tokio::time::sleep(std::time::Duration::from_millis(300)).await; // Force cache access by triggering reconciler // This will cause the reconciler to check backplane cache, find it expired, diff --git a/nexus/tests/integration_tests/multicast/failures.rs b/nexus/tests/integration_tests/multicast/failures.rs index bf67d634915..bd609746e56 100644 --- a/nexus/tests/integration_tests/multicast/failures.rs +++ b/nexus/tests/integration_tests/multicast/failures.rs @@ -50,14 +50,26 @@ use crate::integration_tests::instances::{ instance_simulate, instance_wait_for_state, }; +/// Test DPD communication failure and recovery with multiple groups. +/// +/// This is the canonical test for DPD failure/recovery behavior. It verifies: +/// - Multiple groups remain in "Creating" state when DPD is unavailable +/// - Members stay in "Joining" or "Left" state during DPD failure +/// - All groups recover to "Active" after DPD restart +/// - Running instance members recover to "Joined" state +/// +/// Uses multiple groups to verify the reconciler handles concurrent recovery +/// correctly, and one DPD restart cycle to test the full recovery path. #[nexus_test] async fn test_multicast_group_dpd_communication_failure_recovery( cptestctx: &ControlPlaneTestContext, ) { let client = &cptestctx.external_client; let project_name = "test-project"; - let group_name = "dpd-failure-group"; - let instance_name = "dpd-failure-instance"; + let group1_name = "dpd-failure-group-1"; + let group2_name = "dpd-failure-group-2"; + let instance1_name = "dpd-failure-instance-1"; + let instance2_name = "dpd-failure-instance-2"; // Setup: project, pools - parallelize creation ops::join3( @@ -67,190 +79,116 @@ async fn test_multicast_group_dpd_communication_failure_recovery( ) .await; - // Create instance first - create_instance(client, project_name, instance_name).await; - - // Stop DPD before implicit creation to test failure recovery - cptestctx.stop_dendrite(SwitchLocation::Switch0).await; - - // Add instance to multicast group via instance-centric API - multicast_group_attach(cptestctx, project_name, instance_name, group_name) - .await; - - // Verify group was implicitly created and is in "Creating" state since DPD is unavailable - // The reconciler can't progress the group to Active without DPD communication - let group_get_url = mcast_group_url(group_name); - let fetched_group: MulticastGroup = - object_get(client, &group_get_url).await; - - assert_eq!( - fetched_group.state, "Creating", - "Group should remain in Creating state when DPD is unavailable, found: {}", - fetched_group.state - ); - - // Verify group properties are maintained despite DPD issues - assert_eq!(fetched_group.identity.name.as_str(), group_name); - - // Case: Verify member state during DPD failure - // Members should be in "Joining" or "Left" state when DPD is unavailable - // (they can't transition to "Joined" without successful DPD programming) - let members = list_multicast_group_members(client, group_name).await; - assert_eq!(members.len(), 1, "Should have exactly one member"); - assert!( - members[0].state == "Joining" || members[0].state == "Left", - "Member should be in Joining or Left state when DPD is unavailable, got: {}", - members[0].state - ); - - // Start instance so it has a valid VMM state for recovery - let instance: Instance = object_get( - client, - &format!("/v1/instances/{instance_name}?project={project_name}"), - ) - .await; - let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); + // Create instance1 (will start running by default) + let instance1 = create_instance(client, project_name, instance1_name).await; + let instance1_id = InstanceUuid::from_untyped_uuid(instance1.identity.id); let nexus = &cptestctx.server.server_context().nexus; + instance_simulate(nexus, &instance1_id).await; + instance_wait_for_state(client, instance1_id, InstanceState::Running).await; - let start_url = - format!("/v1/instances/{instance_name}/start?project={project_name}"); - NexusRequest::new( - RequestBuilder::new(client, Method::POST, &start_url) - .body(None as Option<&serde_json::Value>) - .expect_status(Some(StatusCode::ACCEPTED)), - ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .expect("Should start instance"); - instance_simulate(nexus, &instance_id).await; - instance_wait_for_state(client, instance_id, InstanceState::Running).await; - - // Restart DPD and verify member recovers to Joined - cptestctx.restart_dendrite(SwitchLocation::Switch0).await; - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - - // Group should become Active and member should recover to Joined - wait_for_group_active(client, group_name).await; - wait_for_member_state( - cptestctx, - group_name, - instance.identity.id, - nexus_db_model::MulticastGroupMemberState::Joined, - ) - .await; - - let recovered_members = - list_multicast_group_members(client, group_name).await; - assert_eq!( - recovered_members[0].state, "Joined", - "Member should recover to Joined after DPD restart" - ); - - cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; - wait_for_group_deleted(client, group_name).await; -} - -#[nexus_test] -async fn test_multicast_reconciler_state_consistency_validation( - cptestctx: &ControlPlaneTestContext, -) { - let client = &cptestctx.external_client; - let project_name = "test-project"; - - // Setup: project and pools - let (_, _, _) = ops::join3( - create_project(&client, project_name), - create_default_ip_pool(&client), - create_multicast_ip_pool(&client, "mcast-pool"), + // Create instance2 as stopped (won't start) + create_instance_with( + client, + project_name, + instance2_name, + &InstanceNetworkInterfaceAttachment::Default, + vec![], + vec![], + false, // don't start + None, + None, + vec![], ) .await; - // Group names for implicit groups (implicitly created when first member joins) - let group_names = - ["consistency-group-1", "consistency-group-2", "consistency-group-3"]; - - // Create instances first (groups will be implicitly created when members attach) - let instance_names: Vec<_> = group_names - .iter() - .map(|&group_name| format!("instance-{group_name}")) - .collect(); - - // Create all instances in parallel - let create_futures = instance_names.iter().map(|instance_name| { - create_instance(client, project_name, instance_name) - }); - ops::join_all(create_futures).await; - - // Stop DPD before attaching members to test state consistency during failure - // Groups will be implicitly created but stay in "Creating" state + // Stop DPD before implicit creation to test failure recovery cptestctx.stop_dendrite(SwitchLocation::Switch0).await; - // Attach instances to their respective groups (triggers implicit creation for each group) - // Since DPD is down, groups will remain in "Creating" state - for (instance_name, &group_name) in instance_names.iter().zip(&group_names) - { + // Add instances to their respective groups via instance-centric API + ops::join2( multicast_group_attach( cptestctx, project_name, - instance_name, - group_name, - ) - .await; - } - - // Wait for reconciler to attempt processing (will fail due to DPD being down) - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; + instance1_name, + group1_name, + ), + multicast_group_attach( + cptestctx, + project_name, + instance2_name, + group2_name, + ), + ) + .await; - // Verify each group is in a consistent state (DPD failure prevents reconciliation) - for group_name in group_names.iter() { + // Verify both groups are in "Creating" state since DPD is unavailable + for group_name in [group1_name, group2_name] { let group_get_url = mcast_group_url(group_name); let fetched_group: MulticastGroup = object_get(client, &group_get_url).await; - // State should be "Creating" since DPD is down assert_eq!( fetched_group.state, "Creating", "Group {group_name} should remain in Creating state when DPD is unavailable, found: {}", fetched_group.state ); - // Case: Verify member state during DPD failure + // Verify member state during DPD failure let members = list_multicast_group_members(client, group_name).await; assert_eq!( members.len(), 1, - "Group {group_name} should have exactly one member" + "Group {group_name} should have one member" ); assert!( members[0].state == "Joining" || members[0].state == "Left", - "Member in group {group_name} should be Joining or Left when DPD unavailable, got: {}", + "Member in {group_name} should be Joining or Left when DPD unavailable, got: {}", members[0].state ); } - // Verify groups are still stuck in expected states before restarting DPD - // This explicitly validates that without DPD, groups cannot transition - for group_name in group_names.iter() { - verify_group_deleted_or_in_states(client, group_name, &["Creating"]) - .await; - } - - // Restart DPD before cleanup so instances can stop properly + // Restart DPD and verify recovery + // instance1 is already running (started before DPD was stopped) cptestctx.restart_dendrite(SwitchLocation::Switch0).await; + wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - let instance_name_refs: Vec<&str> = - instance_names.iter().map(|s| s.as_str()).collect(); - cleanup_instances(cptestctx, client, project_name, &instance_name_refs) - .await; + // Both groups should become Active + wait_for_group_active(client, group1_name).await; + wait_for_group_active(client, group2_name).await; - // With DPD now restored, groups should be cleaned up via implicit deletion - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; + // Running instance's member should recover to Joined + wait_for_member_state( + cptestctx, + group1_name, + instance1.identity.id, + nexus_db_model::MulticastGroupMemberState::Joined, + ) + .await; - // Verify groups are deleted (implicit deletion completes with DPD available) - for group_name in group_names.iter() { - wait_for_group_deleted(client, group_name).await; - } + let recovered_members = + list_multicast_group_members(client, group1_name).await; + assert_eq!( + recovered_members[0].state, "Joined", + "Running instance member should recover to Joined after DPD restart" + ); + + // Stopped instance's member should stay in Left (group is Active but instance not running) + let group2_members = + list_multicast_group_members(client, group2_name).await; + assert_eq!( + group2_members[0].state, "Left", + "Stopped instance member should be Left even after group becomes Active" + ); + + cleanup_instances( + cptestctx, + client, + project_name, + &[instance1_name, instance2_name], + ) + .await; + wait_for_group_deleted(client, group1_name).await; + wait_for_group_deleted(client, group2_name).await; } #[nexus_test] diff --git a/nexus/tests/integration_tests/multicast/instances.rs b/nexus/tests/integration_tests/multicast/instances.rs index a6e2c9dc62b..f76f3f85488 100644 --- a/nexus/tests/integration_tests/multicast/instances.rs +++ b/nexus/tests/integration_tests/multicast/instances.rs @@ -468,311 +468,6 @@ async fn test_multicast_group_attach_limits( .await; } -#[nexus_test] -async fn test_multicast_group_instance_state_transitions( - cptestctx: &ControlPlaneTestContext, -) { - let client = &cptestctx.external_client; - - // Create project and pools in parallel - let (_, _, _) = ops::join3( - create_default_ip_pool(&client), - create_project(client, PROJECT_NAME), - create_multicast_ip_pool(&client, "mcast-pool"), - ) - .await; - - // Create stopped instance (no multicast groups at creation) - let stopped_instance = instance_for_multicast_groups( - cptestctx, - PROJECT_NAME, - "state-test-instance", - false, - &[], - ) - .await; - - // Add instance to group (group implicitly creates if it doesn't exist) - multicast_group_attach( - cptestctx, - PROJECT_NAME, - "state-test-instance", - "state-test-group", - ) - .await; - - // Wait for group to become Active before proceeding - wait_for_group_active(client, "state-test-group").await; - - // Verify instance is stopped and in multicast group - assert_eq!(stopped_instance.runtime.run_state, InstanceState::Stopped); - - // Wait for member to reach "Left" state (stopped instance members start in "Left" state) - wait_for_member_state( - cptestctx, - "state-test-group", - stopped_instance.identity.id, - nexus_db_model::MulticastGroupMemberState::Left, - ) - .await; - - // Start the instance and verify multicast behavior - let instance_id = - InstanceUuid::from_untyped_uuid(stopped_instance.identity.id); - let nexus = &cptestctx.server.server_context().nexus; - - // Start the instance using direct POST request (not PUT) - let start_url = format!( - "/v1/instances/state-test-instance/start?project={PROJECT_NAME}" - ); - NexusRequest::new( - RequestBuilder::new(client, Method::POST, &start_url) - .body(None as Option<&serde_json::Value>) - .expect_status(Some(StatusCode::ACCEPTED)), - ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .unwrap() - .parsed_body::() - .unwrap(); - instance_simulate(nexus, &instance_id).await; - instance_wait_for_state(&client, instance_id, InstanceState::Running).await; - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - - // Stop the instance and verify multicast behavior persists - let stop_url = format!( - "/v1/instances/state-test-instance/stop?project={PROJECT_NAME}" - ); - NexusRequest::new( - RequestBuilder::new(client, Method::POST, &stop_url) - .body(None as Option<&serde_json::Value>) - .expect_status(Some(StatusCode::ACCEPTED)), - ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .unwrap() - .parsed_body::() - .unwrap(); - instance_simulate(nexus, &instance_id).await; - instance_wait_for_state(&client, instance_id, InstanceState::Stopped).await; - - // Verify control plane still shows membership regardless of instance state - let members_url = mcast_group_members_url("state-test-group"); - let final_members: Vec = - nexus_test_utils::http_testing::NexusRequest::iter_collection_authn( - client, - &members_url, - "", - None, - ) - .await - .unwrap() - .all_items; - - assert_eq!( - final_members.len(), - 1, - "Control plane should maintain multicast membership across instance state changes" - ); - assert_eq!(final_members[0].instance_id, stopped_instance.identity.id); - - object_delete( - client, - &format!("/v1/instances/state-test-instance?project={PROJECT_NAME}"), - ) - .await; - - wait_for_group_deleted(client, "state-test-group").await; -} - -/// Test that multicast group membership persists through instance stop/start cycles -/// (parallel to external IP persistence behavior) -#[nexus_test] -async fn test_multicast_group_persistence_through_stop_start( - cptestctx: &ControlPlaneTestContext, -) { - // Ensure inventory and DPD are ready before creating instances with multicast groups - ensure_multicast_test_ready(cptestctx).await; - - let client = &cptestctx.external_client; - - // Create project and pools in parallel - let (_, _, _) = ops::join3( - create_default_ip_pool(&client), - create_project(client, PROJECT_NAME), - create_multicast_ip_pool(&client, "mcast-pool"), - ) - .await; - - // Create instance and start it (no multicast groups at creation) - let instance = instance_for_multicast_groups( - cptestctx, - PROJECT_NAME, - "persist-test-instance", - true, - &[], - ) - .await; - - // Add instance to group (group implicitly creates if it doesn't exist) - multicast_group_attach( - cptestctx, - PROJECT_NAME, - "persist-test-instance", - "persist-test-group", - ) - .await; - - // Wait for group to become Active - wait_for_group_active(client, "persist-test-group").await; - - let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); - - // Simulate the instance transitioning to Running state - let nexus = &cptestctx.server.server_context().nexus; - instance_simulate(nexus, &instance_id).await; - instance_wait_for_state(client, instance_id, InstanceState::Running).await; - - // Wait for member to be joined (reconciler will process the sled_id set by instance start) - wait_for_member_state( - cptestctx, - "persist-test-group", - instance.identity.id, - nexus_db_model::MulticastGroupMemberState::Joined, - ) - .await; - - // Verify instance is in the group - let members_url = mcast_group_members_url("persist-test-group"); - let members_before_stop = - nexus_test_utils::http_testing::NexusRequest::iter_collection_authn::< - MulticastGroupMember, - >(client, &members_url, "", None) - .await - .expect("Should list group members before stop") - .all_items; - - assert_eq!( - members_before_stop.len(), - 1, - "Group should have 1 member before stop" - ); - assert_eq!(members_before_stop[0].instance_id, instance.identity.id); - - // Stop the instance - let instance_stop_url = format!( - "/v1/instances/persist-test-instance/stop?project={PROJECT_NAME}" - ); - nexus_test_utils::http_testing::NexusRequest::new( - nexus_test_utils::http_testing::RequestBuilder::new( - client, - http::Method::POST, - &instance_stop_url, - ) - .body(None as Option<&serde_json::Value>) - .expect_status(Some(http::StatusCode::ACCEPTED)), - ) - .authn_as(nexus_test_utils::http_testing::AuthnMode::PrivilegedUser) - .execute() - .await - .expect("Should stop instance"); - - // Simulate the stop transition - let nexus = &cptestctx.server.server_context().nexus; - instance_simulate(nexus, &instance_id).await; - - // Wait for instance to be stopped - instance_wait_for_state( - client, - instance_id, - omicron_common::api::external::InstanceState::Stopped, - ) - .await; - - // Verify multicast group membership persists while stopped - let members_while_stopped = - nexus_test_utils::http_testing::NexusRequest::iter_collection_authn::< - MulticastGroupMember, - >(client, &members_url, "", None) - .await - .expect("Should list group members while stopped") - .all_items; - - assert_eq!( - members_while_stopped.len(), - 1, - "Group membership should persist while instance is stopped" - ); - assert_eq!(members_while_stopped[0].instance_id, instance.identity.id); - - // Start the instance again - let instance_start_url = format!( - "/v1/instances/persist-test-instance/start?project={PROJECT_NAME}" - ); - nexus_test_utils::http_testing::NexusRequest::new( - nexus_test_utils::http_testing::RequestBuilder::new( - client, - http::Method::POST, - &instance_start_url, - ) - .body(None as Option<&serde_json::Value>) - .expect_status(Some(http::StatusCode::ACCEPTED)), - ) - .authn_as(nexus_test_utils::http_testing::AuthnMode::PrivilegedUser) - .execute() - .await - .expect("Should start instance"); - - // Simulate the instance transitioning back to "Running" state - let nexus = &cptestctx.server.server_context().nexus; - instance_simulate(nexus, &instance_id).await; - - // Wait for instance to be running again - instance_wait_for_state( - client, - instance_id, - omicron_common::api::external::InstanceState::Running, - ) - .await; - - // Verify multicast group membership still exists after restart - let members_after_restart = - nexus_test_utils::http_testing::NexusRequest::iter_collection_authn::< - MulticastGroupMember, - >(client, &members_url, "", None) - .await - .expect("Should list group members after restart") - .all_items; - - assert_eq!( - members_after_restart.len(), - 1, - "Group membership should persist after instance restart" - ); - assert_eq!(members_after_restart[0].instance_id, instance.identity.id); - - // Wait for member to be joined again after restart - wait_for_member_state( - cptestctx, - "persist-test-group", - instance.identity.id, - nexus_db_model::MulticastGroupMemberState::Joined, - ) - .await; - - cleanup_instances( - cptestctx, - client, - PROJECT_NAME, - &["persist-test-instance"], - ) - .await; - // Group is implicitly deleted when last member (instance) is removed - wait_for_group_deleted(client, "persist-test-group").await; -} - /// Verify concurrent multicast operations maintain correct member states. /// /// The system handles multiple instances joining simultaneously, rapid attach/detach @@ -2119,17 +1814,20 @@ async fn test_instance_reconfigure_add_new_ssm_without_sources_fails( cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; } -/// Test explicit member state transitions during reactivation (Left → Joining → Joined). +/// Test member state transitions through instance start/stop cycle. /// /// Verifies the 3-state lifecycle: /// - Create instance with multicast group → member in "Left" state (stopped) /// - Start instance → RPW transitions to "Joined" -/// - Stop instance → RPW transitions to "Left" -/// - Start instance again → RPW transitions Left → Joining → Joined (reactivation) +/// - Stop instance → RPW transitions back to "Left" +/// +/// This is the canonical test for the multicast member state machine. The +/// RPW reconciler handles all state transitions (Left ↔ Joining ↔ Joined) +/// using the same code path regardless of how the instance lifecycle is +/// triggered. Other tests (e.g., `test_source_ips_preserved_on_instance_restart`) +/// also exercise start/stop but focus on different invariants. #[nexus_test] -async fn test_member_state_transitions_on_reactivation( - cptestctx: &ControlPlaneTestContext, -) { +async fn test_member_state_transitions(cptestctx: &ControlPlaneTestContext) { let client = &cptestctx.external_client; let project_name = "state-transition-project"; let instance_name = "state-transition-inst"; @@ -2240,28 +1938,6 @@ async fn test_member_state_transitions_on_reactivation( ) .await; - // Start instance again (reactivation) - NexusRequest::new( - RequestBuilder::new(client, Method::POST, &start_url) - .expect_status(Some(StatusCode::ACCEPTED)), - ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .expect("Restart should succeed"); - - instance_simulate(nexus, &instance_id).await; - instance_wait_for_state(client, instance_id, InstanceState::Running).await; - - // Case: Reactivation complete -> member goes back to "Joined" state - wait_for_member_state( - cptestctx, - &expected_group_name, - member.instance_id, - nexus_db_model::MulticastGroupMemberState::Joined, - ) - .await; - // Cleanup cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; wait_for_group_deleted(client, &expected_group_name).await; diff --git a/nexus/tests/integration_tests/multicast/mod.rs b/nexus/tests/integration_tests/multicast/mod.rs index 563ba039b1e..73978803b4f 100644 --- a/nexus/tests/integration_tests/multicast/mod.rs +++ b/nexus/tests/integration_tests/multicast/mod.rs @@ -66,7 +66,7 @@ mod networking_integration; mod pool_selection; // Timeout constants for test operations -const POLL_INTERVAL: Duration = Duration::from_millis(80); +const POLL_INTERVAL: Duration = Duration::from_millis(50); const MULTICAST_OPERATION_TIMEOUT: Duration = Duration::from_secs(120); /// Generic helper for PUT upsert requests that return 201 Created. @@ -217,9 +217,7 @@ where F: Fn() -> Fut, Fut: Future>>, { - // Activate reconciler less frequently than we check the condition - // This reduces overhead while still driving state changes forward - const RECONCILER_ACTIVATION_INTERVAL: Duration = Duration::from_millis(500); + const RECONCILER_ACTIVATION_INTERVAL: Duration = Duration::from_millis(150); let last_reconciler_activation = Arc::new(Mutex::new(Instant::now())); @@ -331,8 +329,8 @@ pub(crate) async fn ensure_inventory_ready( Err(CondCheckError::::NotYet) } }, - &Duration::from_millis(500), // Check every 500ms - &Duration::from_secs(120), // Wait up to 120s + &Duration::from_millis(150), + &Duration::from_secs(120), ) .await { @@ -394,8 +392,8 @@ pub(crate) async fn ensure_dpd_ready(cptestctx: &ControlPlaneTestContext) { } } }, - &Duration::from_millis(200), // Check every 200ms - &Duration::from_secs(30), // Wait up to 30 seconds for switches + &Duration::from_millis(100), + &Duration::from_secs(30), ) .await { @@ -896,39 +894,6 @@ pub(crate) async fn wait_for_group_deleted( } } -/// Verify a group is either deleted or in one of the expected states. -/// -/// Useful when DPD is unavailable and groups can't complete state transitions. -/// For example, when DPD is down during deletion, groups may be stuck in -/// "Creating" or "Deleting" state rather than being fully deleted. -pub(crate) async fn verify_group_deleted_or_in_states( - client: &ClientTestContext, - group_name: &str, - expected_states: &[&str], -) { - let groups_result = - nexus_test_utils::resource_helpers::objects_list_page_authz::< - MulticastGroup, - >(client, "/v1/multicast-groups") - .await; - - let matching_groups: Vec<_> = groups_result - .items - .into_iter() - .filter(|g| g.identity.name == group_name) - .collect(); - - if !matching_groups.is_empty() { - // Group still exists - should be in one of the expected states - let actual_state = &matching_groups[0].state; - assert!( - expected_states.contains(&actual_state.as_str()), - "Group {group_name} should be in one of {expected_states:?} states, found: \"{actual_state}\"" - ); - } - // If group is gone, that's also valid - operation completed -} - /// Wait for a multicast group to be deleted from DPD (dataplane) with reconciler activation. /// /// This function waits for the DPD to report that the multicast group no longer exists diff --git a/nexus/tests/integration_tests/multicast/networking_integration.rs b/nexus/tests/integration_tests/multicast/networking_integration.rs index 0f397c5c1d1..c51e8e79352 100644 --- a/nexus/tests/integration_tests/multicast/networking_integration.rs +++ b/nexus/tests/integration_tests/multicast/networking_integration.rs @@ -34,40 +34,43 @@ use crate::integration_tests::instances::{ fetch_instance_external_ips, instance_simulate, instance_wait_for_state, }; -/// Verify instances can have both external IPs and multicast group membership. +/// Verify external IP allocation/deallocation works with multicast group members. /// -/// External IP allocation works for multicast group members, multicast state persists -/// through external IP operations, and no conflicts occur between external IP and multicast -/// DPD configuration. +/// External IP attach/detach doesn't affect multicast state, and dataplane +/// configuration remains consistent throughout the lifecycle. +/// +/// This is the canonical test for external IP + multicast coexistence. The +/// DPD configuration paths for external IPs and multicast are independent, +/// so testing one attach/detach cycle is sufficient to verify no interference. #[nexus_test] -async fn test_multicast_with_external_ip_basic( +async fn test_multicast_external_ip_lifecycle( cptestctx: &nexus_test_utils::ControlPlaneTestContext< omicron_nexus::Server, >, ) { let client = &cptestctx.external_client; - let project_name = "external-ip-mcast-project"; - let group_name = "external-ip-mcast-group"; - let instance_name = "external-ip-mcast-instance"; + let project_name = "external-ip-lifecycle-project"; + let group_name = "external-ip-lifecycle-group"; + let instance_name = "external-ip-lifecycle-instance"; - // Setup: project and IP pools in parallel + // Setup in parallel let (_, _, _) = ops::join3( create_project(client, project_name), - create_default_ip_pool(client), // For external IPs + create_default_ip_pool(client), create_multicast_ip_pool_with_range( client, - "external-ip-mcast-pool", - (224, 100, 0, 1), - (224, 100, 0, 255), + "external-ip-lifecycle-pool", + (224, 101, 0, 1), + (224, 101, 0, 255), ), ) .await; - // Create instance (will start by default) + // Create instance let instance_params = InstanceCreate { identity: IdentityMetadataCreateParams { name: instance_name.parse().unwrap(), - description: "Instance with external IP and multicast".to_string(), + description: "Instance for external IP lifecycle test".to_string(), }, ncpus: InstanceCpuCount::try_from(1).unwrap(), memory: ByteCount::from_gibibytes_u32(1), @@ -75,12 +78,12 @@ async fn test_multicast_with_external_ip_basic( user_data: vec![], ssh_public_keys: None, network_interfaces: InstanceNetworkInterfaceAttachment::Default, - external_ips: vec![], // Start without external IP + external_ips: vec![], multicast_groups: vec![], disks: vec![], boot_disk: None, cpu_platform: None, - start: true, // Start the instance + start: true, auto_restart_policy: Default::default(), anti_affinity_groups: Vec::new(), }; @@ -90,7 +93,7 @@ async fn test_multicast_with_external_ip_basic( object_create(client, &instance_url, &instance_params).await; let instance_id = instance.identity.id; - // Transition instance to Running state + // Start instance and add to multicast group let nexus = &cptestctx.server.server_context().nexus; let instance_uuid = InstanceUuid::from_untyped_uuid(instance_id); instance_simulate(nexus, &instance_uuid).await; @@ -106,7 +109,7 @@ async fn test_multicast_with_external_ip_basic( .await; wait_for_group_active(client, group_name).await; - // Wait for multicast member to reach "Joined" state + // Wait for member to transition from "Joining"->"Joined" wait_for_member_state( cptestctx, group_name, @@ -115,11 +118,14 @@ async fn test_multicast_with_external_ip_basic( ) .await; - // Verify member count - let members = list_multicast_group_members(client, group_name).await; - assert_eq!(members.len(), 1, "Should have one multicast member"); + // Verify initial multicast state + let initial_members = + list_multicast_group_members(client, group_name).await; + assert_eq!(initial_members.len(), 1); + assert_eq!(initial_members[0].state, "Joined"); - // Allocate ephemeral external IP to the same instance + // Test external IP allocation/deallocation cycle + // Allocate ephemeral external IP let ephemeral_ip_url = format!( "/v1/instances/{instance_name}/external-ips/ephemeral?project={project_name}" ); @@ -135,33 +141,31 @@ async fn test_multicast_with_external_ip_basic( .await .unwrap(); - // Verify both multicast and external IP work together + // Wait for dataplane configuration to settle + wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - // Check that multicast membership is preserved - let members_after_ip = + // Verify multicast state is preserved + let members_with_ip = list_multicast_group_members(client, group_name).await; assert_eq!( - members_after_ip.len(), + members_with_ip.len(), 1, - "Multicast member should still exist after external IP allocation" + "Multicast member should persist during external IP allocation" ); - assert_eq!(members_after_ip[0].instance_id, instance_id); assert_eq!( - members_after_ip[0].state, "Joined", - "Member state should remain Joined" + members_with_ip[0].state, "Joined", + "Member should remain Joined" ); - // Check that external IP is properly attached - let external_ips_after_attach = + // Verify external IP is attached + let external_ips_with_ip = fetch_instance_external_ips(client, instance_name, project_name).await; assert!( - !external_ips_after_attach.is_empty(), + !external_ips_with_ip.is_empty(), "Instance should have external IP" ); - // Note: external_ip.ip() from the response may differ from what's actually attached, - // so we just verify that an external IP exists - // Remove ephemeral external IP and verify multicast is unaffected + // Deallocate ephemeral external IP let external_ip_detach_url = format!( "/v1/instances/{instance_name}/external-ips/ephemeral?project={project_name}" ); @@ -170,197 +174,27 @@ async fn test_multicast_with_external_ip_basic( // Wait for operations to settle wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - // Verify multicast membership is still intact after external IP removal - let members_after_detach = + // Verify multicast state is still preserved + let members_without_ip = list_multicast_group_members(client, group_name).await; assert_eq!( - members_after_detach.len(), + members_without_ip.len(), 1, "Multicast member should persist after external IP removal" ); - assert_eq!(members_after_detach[0].instance_id, instance_id); assert_eq!( - members_after_detach[0].state, "Joined", - "Member should remain Joined" + members_without_ip[0].state, "Joined", + "Member should remain Joined after IP removal" ); // Verify ephemeral external IP is removed (SNAT IP may still be present) - let external_ips_after_detach = + let external_ips_without_ip = fetch_instance_external_ips(client, instance_name, project_name).await; - // Instance should have at most 1 IP left (the SNAT IP), not the ephemeral IP we attached assert!( - external_ips_after_detach.len() <= 1, + external_ips_without_ip.len() <= 1, "Instance should have at most SNAT IP remaining" ); - // Cleanup - cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; - // Implicit deletion model: group is implicitly deleted when last member (instance) is removed - wait_for_group_deleted(client, group_name).await; -} - -/// Verify external IP allocation/deallocation lifecycle for multicast group members. -/// -/// Multiple external IP attach/detach cycles don't affect multicast state, concurrent -/// operations don't cause race conditions, and dataplane configuration remains consistent -/// throughout the lifecycle. -#[nexus_test] -async fn test_multicast_external_ip_lifecycle( - cptestctx: &nexus_test_utils::ControlPlaneTestContext< - omicron_nexus::Server, - >, -) { - let client = &cptestctx.external_client; - let project_name = "external-ip-lifecycle-project"; - let group_name = "external-ip-lifecycle-group"; - let instance_name = "external-ip-lifecycle-instance"; - - // Setup in parallel - let (_, _, _) = ops::join3( - create_project(client, project_name), - create_default_ip_pool(client), - create_multicast_ip_pool_with_range( - client, - "external-ip-lifecycle-pool", - (224, 101, 0, 1), - (224, 101, 0, 255), - ), - ) - .await; - - // Create instance - let instance_params = InstanceCreate { - identity: IdentityMetadataCreateParams { - name: instance_name.parse().unwrap(), - description: "Instance for external IP lifecycle test".to_string(), - }, - ncpus: InstanceCpuCount::try_from(1).unwrap(), - memory: ByteCount::from_gibibytes_u32(1), - hostname: instance_name.parse().unwrap(), - user_data: vec![], - ssh_public_keys: None, - network_interfaces: InstanceNetworkInterfaceAttachment::Default, - external_ips: vec![], - multicast_groups: vec![], - disks: vec![], - boot_disk: None, - cpu_platform: None, - start: true, - auto_restart_policy: Default::default(), - anti_affinity_groups: Vec::new(), - }; - - let instance_url = format!("/v1/instances?project={project_name}"); - let instance: Instance = - object_create(client, &instance_url, &instance_params).await; - let instance_id = instance.identity.id; - - // Start instance and add to multicast group - let nexus = &cptestctx.server.server_context().nexus; - let instance_uuid = InstanceUuid::from_untyped_uuid(instance_id); - instance_simulate(nexus, &instance_uuid).await; - instance_wait_for_state(client, instance_uuid, InstanceState::Running) - .await; - - // Ensure multicast test prerequisites (inventory + DPD) are ready - ensure_multicast_test_ready(cptestctx).await; - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - - // Add instance to multicast group via instance-centric API - multicast_group_attach(cptestctx, project_name, instance_name, group_name) - .await; - wait_for_group_active(client, group_name).await; - - // Wait for member to transition from "Joining"->"Joined" - wait_for_member_state( - cptestctx, - group_name, - instance_id, - nexus_db_model::MulticastGroupMemberState::Joined, - ) - .await; - - // Verify initial multicast state - let initial_members = - list_multicast_group_members(client, group_name).await; - assert_eq!(initial_members.len(), 1); - assert_eq!(initial_members[0].state, "Joined"); - - // Test multiple external IP allocation/deallocation cycles - for cycle in 1..=3 { - // Allocate ephemeral external IP - let ephemeral_ip_url = format!( - "/v1/instances/{instance_name}/external-ips/ephemeral?project={project_name}" - ); - NexusRequest::new( - RequestBuilder::new(client, Method::POST, &ephemeral_ip_url) - .body(Some(&EphemeralIpCreate { - pool: None, // Use default pool - })) - .expect_status(Some(StatusCode::ACCEPTED)), - ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .unwrap(); - - // Wait for dataplane configuration to settle - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - - // Verify multicast state is preserved - let members_with_ip = - list_multicast_group_members(client, group_name).await; - assert_eq!( - members_with_ip.len(), - 1, - "Cycle {cycle}: Multicast member should persist during external IP allocation" - ); - assert_eq!( - members_with_ip[0].state, "Joined", - "Cycle {cycle}: Member should remain Joined" - ); - - // Verify external IP is attached - let external_ips_with_ip = - fetch_instance_external_ips(client, instance_name, project_name) - .await; - assert!( - !external_ips_with_ip.is_empty(), - "Cycle {cycle}: Instance should have external IP" - ); - - // Deallocate ephemeral external IP - let external_ip_detach_url = format!( - "/v1/instances/{instance_name}/external-ips/ephemeral?project={project_name}" - ); - object_delete(client, &external_ip_detach_url).await; - - // Wait for operations to settle - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - - // Verify multicast state is still preserved - let members_without_ip = - list_multicast_group_members(client, group_name).await; - assert_eq!( - members_without_ip.len(), - 1, - "Cycle {cycle}: Multicast member should persist after external IP removal" - ); - assert_eq!( - members_without_ip[0].state, "Joined", - "Cycle {cycle}: Member should remain Joined after IP removal" - ); - - // Verify ephemeral external IP is removed (SNAT IP may still be present) - let external_ips_without_ip = - fetch_instance_external_ips(client, instance_name, project_name) - .await; - assert!( - external_ips_without_ip.len() <= 1, - "Cycle {cycle}: Instance should have at most SNAT IP remaining" - ); - } - cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; wait_for_group_deleted(client, group_name).await; }