diff --git a/openhcl/openhcl_boot/src/host_params/dt/mod.rs b/openhcl/openhcl_boot/src/host_params/dt/mod.rs
index 8a08b2b5bf..865f68deb3 100644
--- a/openhcl/openhcl_boot/src/host_params/dt/mod.rs
+++ b/openhcl/openhcl_boot/src/host_params/dt/mod.rs
@@ -849,25 +849,21 @@ impl PartitionInfo {
         init_heap(params);
 
         let persisted_state_header = read_persisted_region_header(params);
-        let (topology, has_devices_that_should_disable_sidecar) =
-            if let Some(header) = persisted_state_header {
-                log!("found persisted state header");
-                let persisted_topology =
-                    topology_from_persisted_state(header, params, parsed, address_space)?;
-
-                (
-                    persisted_topology.topology,
-                    !(persisted_topology
-                        .cpus_with_mapped_interrupts_no_io
-                        .is_empty()
-                        && persisted_topology.cpus_with_outstanding_io.is_empty()),
-                )
-            } else {
-                (
-                    topology_from_host_dt(params, parsed, &options, address_space)?,
-                    false,
-                )
-            };
+        let (topology, cpus_with_mapped_interrupts) = if let Some(header) = persisted_state_header {
+            log!("found persisted state header");
+            let persisted_topology =
+                topology_from_persisted_state(header, params, parsed, address_space)?;
+
+            (
+                persisted_topology.topology,
+                persisted_topology.cpus_with_outstanding_io,
+            )
+        } else {
+            (
+                topology_from_host_dt(params, parsed, &options, address_space)?,
+                Vec::new(),
+            )
+        };
 
         let Self {
             vtl2_ram,
@@ -875,6 +871,7 @@ impl PartitionInfo {
             isolation,
             bsp_reg,
             cpus,
+            sidecar_cpu_overrides,
             vmbus_vtl0,
             vmbus_vtl2,
             cmdline: _,
@@ -888,20 +885,32 @@ impl PartitionInfo {
             boot_options,
         } = storage;
 
-        if let (SidecarOptions::Enabled { cpu_threshold, .. }, true) = (
+        if let (SidecarOptions::Enabled { .. }, true) = (
             &boot_options.sidecar,
-            has_devices_that_should_disable_sidecar,
+            !cpus_with_mapped_interrupts.is_empty(),
         ) {
-            if cpu_threshold.is_none()
-                || cpu_threshold
-                    .and_then(|threshold| threshold.try_into().ok())
-                    .is_some_and(|threshold| parsed.cpu_count() < threshold)
-            {
-                // If we are in the restore path, disable sidecar for small VMs, as the amortization
-                // benefits don't apply when devices are kept alive; the CPUs need to be powered on anyway
-                // to check for interrupts.
-                log!("disabling sidecar, as we are restoring from persisted state");
-                boot_options.sidecar = SidecarOptions::DisabledServicing;
+            let max_cpu_id = *cpus_with_mapped_interrupts
+                .iter()
+                .max()
+                .expect("non-empty vector") as usize;
+            if max_cpu_id < sidecar_cpu_overrides.sidecar_starts_cpu.len() {
+                sidecar_cpu_overrides.per_cpu_state_specified = true;
+                cpus_with_mapped_interrupts.iter().for_each(|&cpu_id| {
+                    sidecar_cpu_overrides.sidecar_starts_cpu[cpu_id as usize] = false;
+                });
+                log!(
+                    "disabling sidecar for CPUs {:?} due to mapped interrupts, per_cpu_state_specified={}, per_cpu_state={:?}",
+                    cpus_with_mapped_interrupts,
+                    sidecar_cpu_overrides.per_cpu_state_specified,
+                    &sidecar_cpu_overrides.sidecar_starts_cpu[..max_cpu_id + 1],
+                );
+            } else {
+                // Degenerate case: we'll need those VPs started by the time OpenHCL
+                // usermode restores anyway, so just disable sidecar.
+                log!(
+                    "too many CPUs with mapped interrupts ({}), disabling sidecar",
+                    cpus_with_mapped_interrupts.len()
+                );
+                boot_options.sidecar = SidecarOptions::DisabledServicing;
             }
         }
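
Aside (illustration only, not part of the patch; simplified stand-ins rather than the real openhcl_boot types): the restore-path decision above reduces to marking every CPU with outstanding I/O as "not started by sidecar", and giving up on sidecar entirely if any such CPU id falls outside the fixed-size per-CPU array. A minimal sketch:

    /// Simplified stand-in for sidecar_defs::NUM_CPUS_SUPPORTED_FOR_PER_CPU_STATE.
    const SUPPORTED: usize = 1000;

    /// Either carry per-CPU overrides, or disable sidecar for this servicing boot.
    pub enum Decision {
        Overrides { sidecar_starts_cpu: [bool; SUPPORTED] },
        DisableSidecar,
    }

    /// CPUs with outstanding I/O must stay with the main kernel; if any such CPU
    /// id is beyond the fixed-size array, fall back to disabling sidecar.
    pub fn apply_overrides(cpus_with_io: &[u32]) -> Decision {
        let mut sidecar_starts_cpu = [true; SUPPORTED];
        match cpus_with_io.iter().max() {
            Some(&max) if (max as usize) >= SUPPORTED => Decision::DisableSidecar,
            _ => {
                for &cpu in cpus_with_io {
                    sidecar_starts_cpu[cpu as usize] = false;
                }
                Decision::Overrides { sidecar_starts_cpu }
            }
        }
    }
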
diff --git a/openhcl/openhcl_boot/src/host_params/mod.rs b/openhcl/openhcl_boot/src/host_params/mod.rs
index 30f7c91173..7147b5578d 100644
--- a/openhcl/openhcl_boot/src/host_params/mod.rs
+++ b/openhcl/openhcl_boot/src/host_params/mod.rs
@@ -56,6 +56,8 @@ pub struct PartitionInfo {
     pub bsp_reg: u32,
     /// Cpu info for enabled cpus.
     pub cpus: ArrayVec<CpuEntry, MAX_CPU_COUNT>,
+    /// Per-CPU state to apply when starting the sidecar kernel.
+    pub sidecar_cpu_overrides: sidecar_defs::PerCpuState,
     /// VMBUS info for VTL2.
     pub vmbus_vtl2: VmbusInfo,
     /// VMBUS info for VTL0.
@@ -90,6 +92,10 @@ impl PartitionInfo {
             isolation: IsolationType::None,
             bsp_reg: 0,
             cpus: ArrayVec::new_const(),
+            sidecar_cpu_overrides: sidecar_defs::PerCpuState {
+                per_cpu_state_specified: false,
+                sidecar_starts_cpu: [true; sidecar_defs::NUM_CPUS_SUPPORTED_FOR_PER_CPU_STATE],
+            },
             vmbus_vtl2: VmbusInfo {
                 mmio: ArrayVec::new_const(),
                 connection_id: 0,
diff --git a/openhcl/openhcl_boot/src/main.rs b/openhcl/openhcl_boot/src/main.rs
index 414af5deaa..4b9a4b7c6a 100644
--- a/openhcl/openhcl_boot/src/main.rs
+++ b/openhcl/openhcl_boot/src/main.rs
@@ -889,6 +889,7 @@ mod test {
     use loader_defs::linux::e820entry;
     use memory_range::MemoryRange;
     use memory_range::subtract_ranges;
+    use sidecar_defs::PerCpuState;
     use zerocopy::FromZeros;
 
     const HIGH_MMIO_GAP_END: u64 = 0x1000000000; // 64 GiB
@@ -915,6 +916,10 @@ mod test {
             isolation: IsolationType::None,
             bsp_reg: cpus[0].reg as u32,
             cpus,
+            sidecar_cpu_overrides: PerCpuState {
+                per_cpu_state_specified: false,
+                sidecar_starts_cpu: [true; sidecar_defs::NUM_CPUS_SUPPORTED_FOR_PER_CPU_STATE],
+            },
             cmdline: ArrayString::new(),
             vmbus_vtl2: VmbusInfo {
                 mmio,
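
Aside (hypothetical, not in the patch): the two call sites above spell out the same "no overrides" literal. If a const constructor were added next to the struct in openhcl/sidecar_defs/src/lib.rs, they could share it; something like:

    impl PerCpuState {
        /// All CPUs default to being started by sidecar, and no per-CPU state
        /// has been specified.
        pub const fn new_unspecified() -> Self {
            Self {
                per_cpu_state_specified: false,
                sidecar_starts_cpu: [true; NUM_CPUS_SUPPORTED_FOR_PER_CPU_STATE],
            }
        }
    }

Both initializers would then read `sidecar_cpu_overrides: sidecar_defs::PerCpuState::new_unspecified(),`.
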
f.write_str("boot_cpus=")?; - let mut comma = ""; - for node in self.0.node_params { - write!(f, "{}{}", comma, node.base_vp)?; - comma = ","; + if self.0.per_cpu_state.per_cpu_state_specified { + let mut comma = ""; + for (cpu_index, &sidecar_starts) in self + .0 + .per_cpu_state + .sidecar_starts_cpu + .iter() + .take(self.0.num_cpus) + .enumerate() + { + if !sidecar_starts { + write!(f, "{}{}", comma, cpu_index)?; + comma = ","; + } + } + } else { + let mut comma = ""; + for node in self.0.node_params { + write!(f, "{}{}", comma, node.base_vp)?; + comma = ","; + } } Ok(()) } @@ -137,6 +156,7 @@ pub fn start_sidecar<'a>( enable_logging: _, node_count, nodes, + initial_state, } = sidecar_params; *hypercall_page = 0; @@ -189,6 +209,14 @@ pub fn start_sidecar<'a>( base_vp, vp_count: cpus.len() as u32, }; + *initial_state = partition_info.sidecar_cpu_overrides.clone(); + if initial_state.per_cpu_state_specified { + // If per-CPU state is specified, make sure to explicitly state that + // sidecar should not start the base vp of this node. + // The code that set per_cpu_state_specified should have already ensured that + // the array is large enough for any `base_vp` we might have here. + initial_state.sidecar_starts_cpu[base_vp as usize] = false; + } base_vp += cpus.len() as u32; *node_count += 1; total_ram += required_ram; @@ -217,9 +245,11 @@ pub fn start_sidecar<'a>( let SidecarOutput { nodes, error: _ } = sidecar_output; Some(SidecarConfig { + num_cpus: partition_info.cpus.len(), start_reftime: boot_start_reftime, end_reftime: boot_end_reftime, node_params: &sidecar_params.nodes[..node_count], nodes: &nodes[..node_count], + per_cpu_state: &sidecar_params.initial_state, }) } diff --git a/openhcl/sidecar/src/arch/x86_64/init.rs b/openhcl/sidecar/src/arch/x86_64/init.rs index 948218943b..48ecf8beb1 100644 --- a/openhcl/sidecar/src/arch/x86_64/init.rs +++ b/openhcl/sidecar/src/arch/x86_64/init.rs @@ -22,6 +22,7 @@ use core::hint::spin_loop; use core::mem::MaybeUninit; use core::ptr::addr_of; use core::ptr::addr_of_mut; +use core::sync::atomic::AtomicU8; use core::sync::atomic::AtomicU32; use core::sync::atomic::Ordering::Acquire; use core::sync::atomic::Ordering::Relaxed; @@ -185,6 +186,7 @@ fn init( enable_logging, node_count, ref nodes, + ref initial_state, } = params; ENABLE_LOG.store(enable_logging, Relaxed); @@ -318,9 +320,31 @@ fn init( *response_vector = 0.into(); *needs_attention = 0.into(); reserved.fill(0); + + // Default: the base VP is REMOVED (meaning the kernel will start it), the + // next `vp_count - 1` VPs are RUN, and any remaining VPs are REMOVED. cpu_status[0] = CpuStatus::REMOVED.0.into(); cpu_status[1..vp_count as usize].fill_with(|| CpuStatus::RUN.0.into()); cpu_status[vp_count as usize..].fill_with(|| CpuStatus::REMOVED.0.into()); + + // Override with the initial state from openhcl_boot. + if initial_state.per_cpu_state_specified { + for (i, &should_start) in initial_state + .sidecar_starts_cpu + .iter() + .skip(base_vp as usize) + .enumerate() + .take(vp_count as usize) + { + // Pragmatically, any CPU that has `RUN` should already match + // the default above, but set it anyway for clarity. + cpu_status[i] = if should_start { + CpuStatus::RUN.0.into() + } else { + CpuStatus::REMOVED.0.into() + }; + } + } } node_init.push(NodeInit { @@ -340,7 +364,8 @@ fn init( // // SAFETY: no concurrent mutators. 
diff --git a/openhcl/sidecar/src/arch/x86_64/init.rs b/openhcl/sidecar/src/arch/x86_64/init.rs
index 948218943b..48ecf8beb1 100644
--- a/openhcl/sidecar/src/arch/x86_64/init.rs
+++ b/openhcl/sidecar/src/arch/x86_64/init.rs
@@ -22,6 +22,7 @@ use core::hint::spin_loop;
 use core::mem::MaybeUninit;
 use core::ptr::addr_of;
 use core::ptr::addr_of_mut;
+use core::sync::atomic::AtomicU8;
 use core::sync::atomic::AtomicU32;
 use core::sync::atomic::Ordering::Acquire;
 use core::sync::atomic::Ordering::Relaxed;
@@ -185,6 +186,7 @@ fn init(
         enable_logging,
         node_count,
         ref nodes,
+        ref initial_state,
     } = params;
 
     ENABLE_LOG.store(enable_logging, Relaxed);
@@ -318,9 +320,31 @@ fn init(
             *response_vector = 0.into();
             *needs_attention = 0.into();
             reserved.fill(0);
+
+            // Default: the base VP is REMOVED (meaning the kernel will start it),
+            // the next `vp_count - 1` VPs are RUN, and any remaining VPs are REMOVED.
             cpu_status[0] = CpuStatus::REMOVED.0.into();
             cpu_status[1..vp_count as usize].fill_with(|| CpuStatus::RUN.0.into());
             cpu_status[vp_count as usize..].fill_with(|| CpuStatus::REMOVED.0.into());
+
+            // Override with the initial state from openhcl_boot.
+            if initial_state.per_cpu_state_specified {
+                for (i, &should_start) in initial_state
+                    .sidecar_starts_cpu
+                    .iter()
+                    .skip(base_vp as usize)
+                    .enumerate()
+                    .take(vp_count as usize)
+                {
+                    // In practice, any CPU that should be `RUN` already matches the
+                    // default above, but set it anyway for clarity.
+                    cpu_status[i] = if should_start {
+                        CpuStatus::RUN.0.into()
+                    } else {
+                        CpuStatus::REMOVED.0.into()
+                    };
+                }
+            }
         }
 
         node_init.push(NodeInit {
@@ -340,7 +364,8 @@ fn init(
     //
     // SAFETY: no concurrent mutators.
     let node_init = unsafe { &*addr_of!(NODE_INIT) };
-    start_aps(node_init, mapper);
+    let cpu_status: [AtomicU8; 1] = [0.into()]; // TODO: get the cpu_status for the BSP node
+    start_aps(node_init, mapper, &cpu_status);
 
     // Wait for all the APs to finish starting.
     {
@@ -366,7 +391,7 @@ struct NodeInit {
 
 static mut NODE_INIT: ArrayVec<NodeInit, MAX_NODES> = ArrayVec::new_const();
 
-fn start_aps(node_init: &[NodeInit], mapper: &mut temporary_map::Mapper) {
+fn start_aps(node_init: &[NodeInit], mapper: &mut temporary_map::Mapper, cpu_status: &[AtomicU8]) {
     for node in node_init {
         loop {
             let node_cpu_index = node.next_vp.fetch_add(1, Relaxed);
@@ -374,6 +399,20 @@ fn start_aps(node_init: &[NodeInit], mapper: &mut temporary_map::Mapper, cpu_status: &[AtomicU8]) {
             if node_cpu_index >= node.node.vp_count {
                 break;
             }
+
+            // See the init function for where the initial CPU status is set and why
+            // we can end up here with a REMOVED CPU. Entries beyond the slice are
+            // treated as RUN so the short per-node slice never panics.
+            // TODO: map the control page PA for this node rather than relying on the
+            // passed-in cpu_status slice (since that is only for the node that's
+            // actually starting something).
+            if cpu_status
+                .get(node_cpu_index as usize)
+                .is_some_and(|status| status.load(Relaxed) == CpuStatus::REMOVED.0.into())
+            {
+                log!(
+                    "skipping VP {} (node base {}, node idx {}) as REMOVED",
+                    node.node.base_vp + node_cpu_index,
+                    node.node.base_vp,
+                    node_cpu_index
+                );
+                continue;
+            }
+
             match node.node.start(mapper, node_cpu_index) {
                 Ok(()) => {}
                 Err(err) => {
@@ -392,12 +431,14 @@ fn start_aps(node_init: &[NodeInit], mapper: &mut temporary_map::Mapper) {
 /// Must be called as an AP entry point.
 unsafe fn ap_init() -> ! {
     // Start any other pending APs.
+    // mattkur: I'm confused why this works ... start_aps will start at node 0, just
+    // like the BSP. What's stopping each node from trampling over each other?
     {
         // SAFETY: `NODE_INIT` is set before this routine is called.
         let node_init = unsafe { &*addr_of!(NODE_INIT) };
         // SAFETY: nothing else on this CPU is using the temporary map.
         let mut mapper = unsafe { temporary_map::Mapper::new(0) };
-        start_aps(node_init, &mut mapper)
+        start_aps(node_init, &mut mapper, &super::vp::control().cpu_status); // See the TODO in start_aps for why cpu_status is wrong here.
     }
     // SAFETY: this is an entry point.
     unsafe { super::vp::ap_entry() }
diff --git a/openhcl/sidecar/src/arch/x86_64/vp.rs b/openhcl/sidecar/src/arch/x86_64/vp.rs
index 56343ea799..8c6fc1dbc6 100644
--- a/openhcl/sidecar/src/arch/x86_64/vp.rs
+++ b/openhcl/sidecar/src/arch/x86_64/vp.rs
@@ -224,7 +224,9 @@ fn ap_run(globals: &mut VpGlobals) {
     }
 }
 
-fn control() -> &'static ControlPage {
+/// Returns a reference to the control page for this VP.
+/// Only use this on AP startup paths.
+pub fn control() -> &'static ControlPage {
     // SAFETY: all mutable fields of the control page have interior mutability,
     // so this is a valid dereference.
     unsafe { &*addr_space::control_page() }
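
Aside on the question left in ap_init above: the per-node `next_vp` counter appears to be what prevents double-starts. Every caller of start_aps, BSP or AP, claims VP indices through the same `fetch_add`, so each index is handed out exactly once no matter which CPU reaches it first; callers that arrive late see the counter already past `vp_count` and move on. A small std illustration of that claim pattern (the sidecar itself is no_std; this is only a sketch):

    use std::sync::Arc;
    use std::sync::Mutex;
    use std::sync::atomic::{AtomicU32, Ordering::Relaxed};
    use std::thread;

    fn main() {
        // Stand-in for one node: 16 VPs, a shared next_vp counter, and a log of
        // which caller "started" which VP.
        let next_vp = Arc::new(AtomicU32::new(0));
        let started = Arc::new(Mutex::new(Vec::new()));
        let vp_count = 16u32;

        let handles: Vec<_> = (0..4)
            .map(|caller| {
                let next_vp = next_vp.clone();
                let started = started.clone();
                thread::spawn(move || loop {
                    // Same claim pattern as start_aps: whoever increments first
                    // owns that VP index; everyone else moves on.
                    let vp = next_vp.fetch_add(1, Relaxed);
                    if vp >= vp_count {
                        break;
                    }
                    started.lock().unwrap().push((caller, vp));
                })
            })
            .collect();
        for handle in handles {
            handle.join().unwrap();
        }

        let mut vps: Vec<u32> = started.lock().unwrap().iter().map(|&(_, vp)| vp).collect();
        vps.sort();
        // Every VP index was claimed exactly once, regardless of which caller won.
        assert_eq!(vps, (0..vp_count).collect::<Vec<_>>());
    }
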
diff --git a/openhcl/sidecar_defs/src/lib.rs b/openhcl/sidecar_defs/src/lib.rs
index 112b7de230..bbebe2656c 100644
--- a/openhcl/sidecar_defs/src/lib.rs
+++ b/openhcl/sidecar_defs/src/lib.rs
@@ -18,6 +18,23 @@ use zerocopy::Immutable;
 use zerocopy::IntoBytes;
 use zerocopy::KnownLayout;
 
+/// Starting state for each CPU, supplied by `openhcl_boot`. For various
+/// reasons, `openhcl_boot` may have decided to instruct the Linux kernel to
+/// start additional CPUs (notably, when we know we're about to spawn tasks on
+/// those CPUs right away to handle device interrupts).
+#[repr(C)]
+#[derive(FromZeros, Immutable, KnownLayout, Debug, Clone)]
+pub struct PerCpuState {
+    /// Whether the per-CPU state is specified, since `NUM_CPUS_SUPPORTED_FOR_PER_CPU_STATE`
+    /// is less than the maximum number of CPUs supported by OpenHCL and also by sidecar.
+    pub per_cpu_state_specified: bool,
+
+    /// Whether the CPU should be started by the sidecar kernel. If false, the
+    /// CPU will be started by (or remain with) the main kernel instead.
+    pub sidecar_starts_cpu: [bool; NUM_CPUS_SUPPORTED_FOR_PER_CPU_STATE],
+}
+
 /// Sidecar start input parameters.
 #[repr(C, align(4096))]
 #[derive(FromZeros, Immutable, KnownLayout)]
@@ -30,6 +47,8 @@ pub struct SidecarParams {
     pub node_count: u32,
     /// The node-specific input parameters.
     pub nodes: [SidecarNodeParams; MAX_NODES],
+    /// The initial start state for each CPU.
+    pub initial_state: PerCpuState,
 }
 
 /// Node-specific input parameters.
@@ -49,6 +68,10 @@ pub struct SidecarNodeParams {
 
 /// The maximum number of supported sidecar nodes.
 pub const MAX_NODES: usize = 128;
+/// The maximum number of CPUs for which per-CPU start state is carried.
+/// This odd number is chosen so that `SidecarParams` still fits within a
+/// single page.
+pub const NUM_CPUS_SUPPORTED_FOR_PER_CPU_STATE: usize = 1000;
 
 const _: () = assert!(size_of::<SidecarParams>() <= PAGE_SIZE);
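
Aside (hypothetical, not part of the patch): if sidecar_defs also grew a small query that folds the `per_cpu_state_specified` flag and the fixed-size array into one answer, consumers such as the cpu_status override loop in init.rs would not need to index the array directly, and out-of-range CPUs would fall back to the default of being started by sidecar:

    impl PerCpuState {
        /// Returns whether sidecar should start `cpu`. When no per-CPU state was
        /// supplied, or the CPU is beyond the tracked range, default to letting
        /// sidecar start it.
        pub fn should_sidecar_start(&self, cpu: u32) -> bool {
            if !self.per_cpu_state_specified {
                return true;
            }
            self.sidecar_starts_cpu
                .get(cpu as usize)
                .copied()
                .unwrap_or(true)
        }
    }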