diff --git a/dev_tests/src/ratchet.rs b/dev_tests/src/ratchet.rs index 4b0a21993..f962f5d69 100644 --- a/dev_tests/src/ratchet.rs +++ b/dev_tests/src/ratchet.rs @@ -72,7 +72,6 @@ fn ratchet_maybe_uninit() -> Result<()> { ("dev_tests/", 1), ("litebox/", 1), ("litebox_platform_linux_userland/", 3), - ("litebox_platform_lvbs/", 5), ("litebox_shim_linux/", 5), ("litebox_shim_optee/", 1), ], diff --git a/litebox/src/shim.rs b/litebox/src/shim.rs index b80aad8a1..468db9fce 100644 --- a/litebox/src/shim.rs +++ b/litebox/src/shim.rs @@ -90,12 +90,27 @@ pub trait EnterShim { } /// The operation to perform after returning from a shim handler +/// +/// - `ResumeGuest` and `ExitThread` cover the cases where the platform enters the shim +/// in response to events that occur during guest execution (e.g., a syscall). +/// - `ResumeKernelPlatform` and `ExceptionFixup` cover the cases where the **kernel platform** +/// enters the shim in response to events that occur during platform execution +/// (e.g., a user-space page fault triggered by a syscall handler). #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum ContinueOperation { /// Resume execution of the guest. ResumeGuest, /// Exit the current thread. ExitThread, + /// The shim successfully handled an exception which was triggered by + /// the kernel platform (e.g., a syscall handler's copy_from_user against + /// demand-pageable user memory); Resume the kernel platform's execution. + ResumeKernelPlatform, + /// The shim failed to handle the exception (e.g., invalid memory access). + /// The kernel platform will apply a fixup via + /// [`search_exception_tables`](crate::mm::exception_table::search_exception_tables) + /// if one exists. + ExceptionFixup, } /// Information about a hardware exception. @@ -109,6 +124,9 @@ pub struct ExceptionInfo { /// The value of the CR2 register at the time of the exception, if /// applicable (e.g., for page faults). pub cr2: usize, + /// Whether the exception occurred in kernel mode (e.g., a demand page + /// fault during a kernel-mode access to a user-space address). + pub kernel_mode: bool, } /// An x86 exception type. 
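For context on how the two new `ContinueOperation` variants are meant to be consumed, here is a minimal sketch (not part of this patch) of a kernel platform's post-exception dispatch. `FaultOutcome` and `after_shim_exception` are hypothetical names used only for illustration; `ContinueOperation` and `search_exception_tables` are the items added or referenced above.

// Illustrative sketch only; assumes the ContinueOperation semantics documented above.
use litebox::shim::ContinueOperation;

/// What the low-level fault stub should do next (hypothetical helper type).
enum FaultOutcome {
    /// Return (iretq) into the interrupted kernel-platform code.
    ResumePlatform,
    /// Return with the saved RIP patched to the exception-table fixup address.
    ResumeAtFixup(usize),
    /// Guest-originated outcomes keep their existing handling.
    GuestFlow(ContinueOperation),
}

fn after_shim_exception(op: ContinueOperation, faulting_rip: usize) -> FaultOutcome {
    match op {
        // The shim satisfied the kernel-mode fault (e.g., demand-paged the user
        // buffer that a copy_from_user touched): continue where the platform faulted.
        ContinueOperation::ResumeKernelPlatform => FaultOutcome::ResumePlatform,
        // The shim could not satisfy the fault: fall back to the exception table,
        // or treat the fault as fatal if no fixup entry covers this RIP.
        ContinueOperation::ExceptionFixup => {
            let fixup = litebox::mm::exception_table::search_exception_tables(faulting_rip)
                .expect("kernel-mode fault with no exception-table fixup");
            FaultOutcome::ResumeAtFixup(fixup)
        }
        // Events that originate from guest execution are unchanged.
        other @ (ContinueOperation::ResumeGuest | ContinueOperation::ExitThread) => {
            FaultOutcome::GuestFlow(other)
        }
    }
}

The LVBS platform below implements this flow directly in its `kernel_exception_callback` / `exception_handler` pair, while the Linux userland and SNP platforms treat the new variants as unreachable and panic if they ever see them.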
diff --git a/litebox_common_linux/src/vmap.rs b/litebox_common_linux/src/vmap.rs index e2c6e3d65..325ce75b3 100644 --- a/litebox_common_linux/src/vmap.rs +++ b/litebox_common_linux/src/vmap.rs @@ -172,4 +172,6 @@ pub enum PhysPointerError { UnsupportedOperation, #[error("Unsupported permissions: {0:#x}")] UnsupportedPermissions(u8), + #[error("Memory copy failed")] + CopyFailed, } diff --git a/litebox_platform_linux_kernel/src/host/snp/snp_impl.rs b/litebox_platform_linux_kernel/src/host/snp/snp_impl.rs index c6662aaa9..78dfda3e8 100644 --- a/litebox_platform_linux_kernel/src/host/snp/snp_impl.rs +++ b/litebox_platform_linux_kernel/src/host/snp/snp_impl.rs @@ -216,6 +216,12 @@ pub fn init_thread( match tls.shim.get().unwrap().init(pt_regs) { litebox::shim::ContinueOperation::ResumeGuest => {} litebox::shim::ContinueOperation::ExitThread => exit_thread(), + litebox::shim::ContinueOperation::ResumeKernelPlatform => { + panic!("ResumeKernelPlatform not expected in SNP init") + } + litebox::shim::ContinueOperation::ExceptionFixup => { + panic!("ExceptionFixup not expected in SNP init") + } } } @@ -238,6 +244,12 @@ pub fn handle_syscall(pt_regs: &mut litebox_common_linux::PtRegs) { match tls.shim.get().unwrap().syscall(pt_regs) { litebox::shim::ContinueOperation::ResumeGuest => {} litebox::shim::ContinueOperation::ExitThread => exit_thread(), + litebox::shim::ContinueOperation::ResumeKernelPlatform => { + panic!("ResumeKernelPlatform not expected in SNP syscall") + } + litebox::shim::ContinueOperation::ExceptionFixup => { + panic!("ExceptionFixup not expected in SNP syscall") + } } } diff --git a/litebox_platform_linux_userland/src/lib.rs b/litebox_platform_linux_userland/src/lib.rs index 038ebcf45..2b2c2a3d3 100644 --- a/litebox_platform_linux_userland/src/lib.rs +++ b/litebox_platform_linux_userland/src/lib.rs @@ -1596,6 +1596,7 @@ extern "C-unwind" fn exception_handler( exception: litebox::shim::Exception(trapno.try_into().unwrap()), error_code: error.try_into().unwrap(), cr2, + kernel_mode: false, }; thread_ctx.call_shim(|shim, ctx| shim.exception(ctx, &info)); } @@ -1632,6 +1633,12 @@ impl ThreadContext<'_> { match op { ContinueOperation::ResumeGuest => unsafe { switch_to_guest(self.ctx) }, ContinueOperation::ExitThread => {} + ContinueOperation::ResumeKernelPlatform => { + panic!("ResumeKernelPlatform not expected in linux_userland") + } + ContinueOperation::ExceptionFixup => { + panic!("ExceptionFixup not expected in linux_userland") + } } } } @@ -2228,6 +2235,25 @@ impl litebox::platform::CrngProvider for LinuxUserland { /// testing, or use a kernel module to provide this functionality (if needed). impl VmapManager for LinuxUserland {} +/// Dummy `VmemPageFaultHandler`. +/// +/// Page faults are handled transparently by the host Linux kernel. +/// Provided to satisfy trait bounds for `PageManager::handle_page_fault`. 
+impl litebox::mm::linux::VmemPageFaultHandler for LinuxUserland { + unsafe fn handle_page_fault( + &self, + _fault_addr: usize, + _flags: litebox::mm::linux::VmFlags, + _error_code: u64, + ) -> Result<(), litebox::mm::linux::PageFaultError> { + unreachable!("host kernel handles page faults for Linux userland") + } + + fn access_error(_error_code: u64, _flags: litebox::mm::linux::VmFlags) -> bool { + unreachable!("host kernel handles page faults for Linux userland") + } +} + #[cfg(test)] mod tests { use core::sync::atomic::AtomicU32; diff --git a/litebox_platform_lvbs/Cargo.toml b/litebox_platform_lvbs/Cargo.toml index 236d25b5b..190f28b13 100644 --- a/litebox_platform_lvbs/Cargo.toml +++ b/litebox_platform_lvbs/Cargo.toml @@ -37,7 +37,7 @@ object = { version = "0.36.7", default-features = false, features = ["pe"] } digest = { version = "0.10.7", default-features = false } aligned-vec = { version = "0.6.4", default-features = false } raw-cpuid = "11.6.0" -zerocopy = { version = "0.8", default-features = false } +zerocopy = { version = "0.8", default-features = false, features = ["derive"] } [target.'cfg(target_arch = "x86_64")'.dependencies] x86_64 = { version = "0.15.2", default-features = false, features = ["instructions"] } diff --git a/litebox_platform_lvbs/src/arch/x86/gdt.rs b/litebox_platform_lvbs/src/arch/x86/gdt.rs index dd405b97f..e86fcbe9f 100644 --- a/litebox_platform_lvbs/src/arch/x86/gdt.rs +++ b/litebox_platform_lvbs/src/arch/x86/gdt.rs @@ -82,10 +82,17 @@ impl Default for GdtWrapper { } fn setup_gdt_tss() { - let stack_top = with_per_cpu_variables_asm(PerCpuVariablesAsm::get_interrupt_stack_ptr); + let double_fault_stack_top = + with_per_cpu_variables_asm(PerCpuVariablesAsm::get_double_fault_stack_ptr); + let exception_stack_top = + with_per_cpu_variables_asm(PerCpuVariablesAsm::get_exception_stack_ptr); let mut tss = Box::new(AlignedTss(TaskStateSegment::new())); - tss.0.interrupt_stack_table[0] = VirtAddr::new(stack_top as u64); + // TSS.IST1: dedicated stack for double faults + tss.0.interrupt_stack_table[0] = VirtAddr::new(double_fault_stack_top as u64); + // TSS.RSP0: stack loaded by the CPU on Ring 3 -> Ring 0 transition when the IDT + // entry's IST index is 0. In our setup, all exceptions except for double faults. + tss.0.privilege_stack_table[0] = VirtAddr::new(exception_stack_top as u64); // `tss_segment()` requires `&'static TaskStateSegment`. Leaking `tss` is fine because // it will be used until the LVBS kernel resets. let tss = Box::leak(tss); diff --git a/litebox_platform_lvbs/src/arch/x86/interrupts.S b/litebox_platform_lvbs/src/arch/x86/interrupts.S index 23dcc3e94..e4a92e096 100644 --- a/litebox_platform_lvbs/src/arch/x86/interrupts.S +++ b/litebox_platform_lvbs/src/arch/x86/interrupts.S @@ -4,10 +4,29 @@ /* * Interrupt Service Routine (ISR) stubs for x86_64 * - * This file provides assembly stubs for interrupt handlers that: - * 1. Save all general-purpose registers in PtRegs layout - * 2. Call the appropriate Rust handler - * 3. Restore registers and return via iretq + * Each stub checks the saved CS RPL bits to determine whether the exception + * came from user mode (ring 3) or kernel mode (ring 0): + * + * - User-mode exceptions: push the vector number and jump to + * exception_callback (run_thread_arch), which swaps GS, saves the full + * CPU context, and routes to the shim's exception handler. + * + * - Kernel-mode exceptions: standard push_regs/call/pop_regs/iretq flow + * into a per-vector Rust handler. + * + * Stacks (Reference: Intel SDM Vol. 
3A, §6.12.1): + * + * Unless an IST entry is configured for the vector (i.e., #DF in our case), + * the CPU selects the stack based on the privilege transition: + * + * - User-mode (CPL change): the CPU loads RSP from TSS.RSP0. We set a + * dedicated per-CPU stack for this (gdt.rs). Since RSP0 is always + * reloaded from the TSS, we do not wipe stale data from old exceptions. + * + * - Kernel-mode (no CPL change): the CPU continues on the current stack. + * The ISR stub pushes registers onto it. Kernel code must ensure enough + * stack space before performing operations that might fault; otherwise + * the fault handler may overwrite live data or trigger a double fault. * * The x86_64 interrupt frame pushed by CPU: * [rsp+40] SS @@ -72,15 +91,14 @@ pop rdi .endm -/* - * ISR stub for interrupts WITHOUT an error code. - * The CPU does not push an error code, so we push a dummy 0. - */ -.macro isr_no_err_code name:req handler:req +/* ISR stub for interrupts WITHOUT an error code. */ +.macro isr_no_err_code name:req handler:req vector:req .global \name \name: cld push 0 /* Push dummy error code */ + test qword ptr [rsp + 16], 0x3 /* Check CS RPL bits */ + jnz .Luser_\name push_regs mov rbp, rsp /* Save stack pointer */ and rsp, -16 /* Align stack to 16 bytes for call */ @@ -90,16 +108,18 @@ pop_regs add rsp, 8 /* Skip error code */ iretq +.Luser_\name: + push \vector /* Pass vector number to exception_callback */ + jmp exception_callback .endm -/* - * ISR stub for interrupts WITH an error code. - * The CPU pushes the error code automatically. - */ -.macro isr_with_err_code name:req handler:req +/* ISR stub for interrupts WITH an error code. */ +.macro isr_with_err_code name:req handler:req vector:req .global \name \name: cld + test qword ptr [rsp + 16], 0x3 /* Check CS RPL bits */ + jnz .Luser_\name push_regs mov rbp, rsp /* Save stack pointer */ and rsp, -16 /* Align stack to 16 bytes for call */ @@ -109,51 +129,68 @@ pop_regs add rsp, 8 /* Skip error code */ iretq +.Luser_\name: + push \vector /* Pass vector number to exception_callback */ + jmp exception_callback .endm /* Exception handlers (vectors 0-31) */ /* Vector 0: Divide Error (#DE) - No error code */ -isr_no_err_code isr_divide_error divide_error_handler_impl +isr_no_err_code isr_divide_error divide_error_handler_impl 0 /* Vector 1: Debug (#DB) - No error code */ -isr_no_err_code isr_debug debug_handler_impl +isr_no_err_code isr_debug debug_handler_impl 1 /* Vector 3: Breakpoint (#BP) - No error code */ -isr_no_err_code isr_breakpoint breakpoint_handler_impl +isr_no_err_code isr_breakpoint breakpoint_handler_impl 3 /* Vector 4: Overflow (#OF) - No error code */ -isr_no_err_code isr_overflow overflow_handler_impl +isr_no_err_code isr_overflow overflow_handler_impl 4 /* Vector 5: Bound Range Exceeded (#BR) - No error code */ -isr_no_err_code isr_bound_range_exceeded bound_range_exceeded_handler_impl +isr_no_err_code isr_bound_range_exceeded bound_range_exceeded_handler_impl 5 /* Vector 6: Invalid Opcode (#UD) - No error code */ -isr_no_err_code isr_invalid_opcode invalid_opcode_handler_impl +isr_no_err_code isr_invalid_opcode invalid_opcode_handler_impl 6 /* Vector 7: Device Not Available (#NM) - No error code */ -isr_no_err_code isr_device_not_available device_not_available_handler_impl +isr_no_err_code isr_device_not_available device_not_available_handler_impl 7 /* Vector 8: Double Fault (#DF) - Error code (always 0) */ -isr_with_err_code isr_double_fault double_fault_handler_impl +isr_with_err_code isr_double_fault 
double_fault_handler_impl 8 /* Vector 12: Stack-Segment Fault (#SS) - Error code */ -isr_with_err_code isr_stack_segment_fault stack_segment_fault_handler_impl +isr_with_err_code isr_stack_segment_fault stack_segment_fault_handler_impl 12 /* Vector 13: General Protection Fault (#GP) - Error code */ -isr_with_err_code isr_general_protection_fault general_protection_fault_handler_impl +isr_with_err_code isr_general_protection_fault general_protection_fault_handler_impl 13 -/* Vector 14: Page Fault (#PF) - Error code */ -isr_with_err_code isr_page_fault page_fault_handler_impl +/* Vector 14: Page Fault (#PF) - Error code + * + * Both kernel-mode and user-mode page faults are routed through the shim's + * exception_handler, which handles demand paging, exception table fixup, + * and panic in order. The ISR stub just pushes the vector number and jumps + * to the appropriate callback. + */ +.global isr_page_fault +isr_page_fault: + cld + test qword ptr [rsp + 16], 0x3 /* Check CS RPL bits */ + push 14 /* Pass vector number (push does not affect flags) */ + jnz .Luser_isr_page_fault + jmp kernel_exception_callback +.Luser_isr_page_fault: + jmp exception_callback /* Vector 16: x87 Floating-Point Exception (#MF) - No error code */ -isr_no_err_code isr_x87_floating_point x87_floating_point_handler_impl +isr_no_err_code isr_x87_floating_point x87_floating_point_handler_impl 16 /* Vector 17: Alignment Check (#AC) - Error code */ -isr_with_err_code isr_alignment_check alignment_check_handler_impl +isr_with_err_code isr_alignment_check alignment_check_handler_impl 17 /* Vector 19: SIMD Floating-Point Exception (#XM) - No error code */ -isr_no_err_code isr_simd_floating_point simd_floating_point_handler_impl +isr_no_err_code isr_simd_floating_point simd_floating_point_handler_impl 19 /* * Hypervisor synthetic interrupt handler (vector 0xf3) diff --git a/litebox_platform_lvbs/src/arch/x86/interrupts.rs b/litebox_platform_lvbs/src/arch/x86/interrupts.rs index 94b6c2b18..3d03e0cc8 100644 --- a/litebox_platform_lvbs/src/arch/x86/interrupts.rs +++ b/litebox_platform_lvbs/src/arch/x86/interrupts.rs @@ -97,174 +97,99 @@ pub fn init_idt() { idt().load(); } -// TODO: carefully handle exceptions/interrupts. If an exception or interrupt is due to userspace code, -// we should destroy the corresponding user context rather than halt the entire kernel. +// TODO: Let's consider whether we can recover some of the below exceptions instead of panicking. -/// User-mode CS selector has RPL=3 (bits 0-1 set) -const USER_MODE_RPL_MASK: usize = 0x3; - -/// Check if the exception occurred in user mode by examining the saved CS register. -#[inline] -fn is_user_mode(regs: &PtRegs) -> bool { - (regs.cs & USER_MODE_RPL_MASK) == USER_MODE_RPL_MASK -} - -/// Get a string indicating the execution context (kernel or user mode). -#[inline] -fn mode_str(regs: &PtRegs) -> &'static str { - if is_user_mode(regs) { "USER" } else { "KERNEL" } -} - -/// Rust handler for divide error exception (vector 0). -/// Called from assembly stub with pointer to saved register state. +/// Kernel-mode handler for divide error exception (vector 0). #[unsafe(no_mangle)] extern "C" fn divide_error_handler_impl(regs: &PtRegs) { - todo!( - "EXCEPTION [{}]: DIVIDE BY ZERO\n{:#x?}", - mode_str(regs), - regs - ); + panic!("EXCEPTION: DIVIDE BY ZERO\n{:#x?}", regs); } -/// Rust handler for debug exception (vector 1). -/// Called from assembly stub with pointer to saved register state. +/// Kernel-mode handler for debug exception (vector 1). 
#[unsafe(no_mangle)] extern "C" fn debug_handler_impl(regs: &PtRegs) { - todo!("EXCEPTION [{}]: DEBUG\n{:#x?}", mode_str(regs), regs); + panic!("EXCEPTION: DEBUG\n{:#x?}", regs); } -/// Rust handler for breakpoint exception (vector 3). -/// Called from assembly stub with pointer to saved register state. +/// Kernel-mode handler for breakpoint exception (vector 3). #[unsafe(no_mangle)] extern "C" fn breakpoint_handler_impl(regs: &PtRegs) { - todo!("EXCEPTION [{}]: BREAKPOINT\n{:#x?}", mode_str(regs), regs); + panic!("EXCEPTION: BREAKPOINT\n{:#x?}", regs); } -/// Rust handler for overflow exception (vector 4). -/// Called from assembly stub with pointer to saved register state. +/// Kernel-mode handler for overflow exception (vector 4). #[unsafe(no_mangle)] extern "C" fn overflow_handler_impl(regs: &PtRegs) { - todo!("EXCEPTION [{}]: OVERFLOW\n{:#x?}", mode_str(regs), regs); + panic!("EXCEPTION: OVERFLOW\n{:#x?}", regs); } -/// Rust handler for bound range exceeded exception (vector 5). -/// Called from assembly stub with pointer to saved register state. +/// Kernel-mode handler for bound range exceeded exception (vector 5). #[unsafe(no_mangle)] extern "C" fn bound_range_exceeded_handler_impl(regs: &PtRegs) { - todo!( - "EXCEPTION [{}]: BOUND RANGE EXCEEDED\n{:#x?}", - mode_str(regs), - regs - ); + panic!("EXCEPTION: BOUND RANGE EXCEEDED\n{:#x?}", regs); } -/// Rust handler for invalid opcode exception (vector 6). -/// Called from assembly stub with pointer to saved register state. +/// Kernel-mode handler for invalid opcode exception (vector 6). #[unsafe(no_mangle)] extern "C" fn invalid_opcode_handler_impl(regs: &PtRegs) { - todo!( - "EXCEPTION [{}]: INVALID OPCODE at RIP {:#x}\n{:#x?}", - mode_str(regs), - regs.rip, - regs + panic!( + "EXCEPTION: INVALID OPCODE at RIP {:#x}\n{:#x?}", + regs.rip, regs ); } -/// Rust handler for device not available exception (vector 7). -/// Called from assembly stub with pointer to saved register state. +/// Kernel-mode handler for device not available exception (vector 7). #[unsafe(no_mangle)] extern "C" fn device_not_available_handler_impl(regs: &PtRegs) { - todo!( - "EXCEPTION [{}]: DEVICE NOT AVAILABLE (FPU/SSE)\n{:#x?}", - mode_str(regs), - regs - ); + panic!("EXCEPTION: DEVICE NOT AVAILABLE (FPU/SSE)\n{:#x?}", regs); } -/// Rust handler for double fault exception (vector 8). -/// Called from assembly stub with pointer to saved register state. +/// Kernel-mode handler for double fault exception (vector 8). #[unsafe(no_mangle)] extern "C" fn double_fault_handler_impl(regs: &PtRegs) { - // Double faults are always fatal - no recovery possible panic!( - "EXCEPTION [{}]: DOUBLE FAULT (Error Code: {:#x})\n{:#x?}", - mode_str(regs), - regs.orig_rax, - regs + "EXCEPTION: DOUBLE FAULT (Error Code: {:#x})\n{:#x?}", + regs.orig_rax, regs ); } -/// Rust handler for stack-segment fault exception (vector 12). -/// Called from assembly stub with pointer to saved register state. +/// Kernel-mode handler for stack-segment fault exception (vector 12). #[unsafe(no_mangle)] extern "C" fn stack_segment_fault_handler_impl(regs: &PtRegs) { - todo!( - "EXCEPTION [{}]: STACK-SEGMENT FAULT (Error Code: {:#x})\n{:#x?}", - mode_str(regs), - regs.orig_rax, - regs + panic!( + "EXCEPTION: STACK-SEGMENT FAULT (Error Code: {:#x})\n{:#x?}", + regs.orig_rax, regs ); } -/// Rust handler for general protection fault exception (vector 13). -/// Called from assembly stub with pointer to saved register state. +/// Kernel-mode handler for general protection fault exception (vector 13). 
#[unsafe(no_mangle)] extern "C" fn general_protection_fault_handler_impl(regs: &PtRegs) { - todo!( - "EXCEPTION [{}]: GENERAL PROTECTION FAULT (Error Code: {:#x})\n{:#x?}", - mode_str(regs), - regs.orig_rax, - regs - ); -} - -/// Rust handler for page fault exception (vector 14). -/// Called from assembly stub with pointer to saved register state. -#[unsafe(no_mangle)] -extern "C" fn page_fault_handler_impl(regs: &PtRegs) { - use x86_64::registers::control::Cr2; - - todo!( - "EXCEPTION [{}]: PAGE FAULT\nAccessed Address: {:?}\nError Code: {:#x}\n{:#x?}", - mode_str(regs), - Cr2::read(), - regs.orig_rax, - regs + panic!( + "EXCEPTION: GENERAL PROTECTION FAULT (Error Code: {:#x})\n{:#x?}", + regs.orig_rax, regs ); } -/// Rust handler for x87 floating-point exception (vector 16). -/// Called from assembly stub with pointer to saved register state. +/// Kernel-mode handler for x87 floating-point exception (vector 16). #[unsafe(no_mangle)] extern "C" fn x87_floating_point_handler_impl(regs: &PtRegs) { - todo!( - "EXCEPTION [{}]: x87 FLOATING-POINT ERROR\n{:#x?}", - mode_str(regs), - regs - ); + panic!("EXCEPTION: x87 FLOATING-POINT ERROR\n{:#x?}", regs); } -/// Rust handler for alignment check exception (vector 17). -/// Called from assembly stub with pointer to saved register state. +/// Kernel-mode handler for alignment check exception (vector 17). #[unsafe(no_mangle)] extern "C" fn alignment_check_handler_impl(regs: &PtRegs) { - todo!( - "EXCEPTION [{}]: ALIGNMENT CHECK (Error Code: {:#x})\n{:#x?}", - mode_str(regs), - regs.orig_rax, - regs + panic!( + "EXCEPTION: ALIGNMENT CHECK (Error Code: {:#x})\n{:#x?}", + regs.orig_rax, regs ); } -/// Rust handler for SIMD floating-point exception (vector 19). -/// Called from assembly stub with pointer to saved register state. +/// Kernel-mode handler for SIMD floating-point exception (vector 19). #[unsafe(no_mangle)] extern "C" fn simd_floating_point_handler_impl(regs: &PtRegs) { - todo!( - "EXCEPTION [{}]: SIMD FLOATING-POINT ERROR\n{:#x?}", - mode_str(regs), - regs - ); + panic!("EXCEPTION: SIMD FLOATING-POINT ERROR\n{:#x?}", regs); } // Note: isr_hyperv_sint is defined in interrupts.S as a minimal stub that only diff --git a/litebox_platform_lvbs/src/host/linux.rs b/litebox_platform_lvbs/src/host/linux.rs index e08fd73f5..44dcb6cd7 100644 --- a/litebox_platform_lvbs/src/host/linux.rs +++ b/litebox_platform_lvbs/src/host/linux.rs @@ -4,6 +4,7 @@ //! 
Linux Structs use crate::arch::MAX_CORES; +use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout}; /// Context saved when entering the kernel /// @@ -60,7 +61,7 @@ pub struct Timespec { const BITS_PER_LONG: usize = 64; #[repr(C)] -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, FromBytes, Immutable, KnownLayout)] pub struct CpuMask { bits: [u64; MAX_CORES.div_ceil(BITS_PER_LONG)], } @@ -103,7 +104,7 @@ pub enum PkeyIdType { /// `module_signature` from [Linux](https://elixir.bootlin.com/linux/v6.6.85/source/include/linux/module_signature.h#L33) #[repr(C)] -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, FromBytes, Immutable, KnownLayout)] pub struct ModuleSignature { pub algo: u8, pub hash: u8, @@ -133,9 +134,10 @@ impl ModuleSignature { /// `kexec_segment` from [Linux](https://elixir.bootlin.com/linux/v6.6.85/source/include/linux/kexec.h#L82) #[repr(C)] -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)] pub struct KexecSegment { - pub buf: *const core::ffi::c_void, + /// Pointer to buffer (stored as u64 since we don't dereference it) + pub buf: u64, pub bufsz: u64, pub mem: u64, pub memsz: u64, @@ -146,16 +148,17 @@ pub struct KexecSegment { /// we need for our use case, such as `nr_segments` and `segment`, and /// are not affected by the kernel build configurations like `CONFIG_KEXEC_FILE` and `CONFIG_IMA_KEXEC`. #[repr(C)] -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)] pub struct Kimage { head: u64, - entry: *const u64, - last_entry: *const u64, + /// Pointer fields stored as u64 since we don't dereference them + entry: u64, + last_entry: u64, start: u64, - control_code_page: *const core::ffi::c_void, // struct page* - swap_page: *const core::ffi::c_void, // struct page* - vmcoreinfo_page: *const core::ffi::c_void, // struct page* - vmcoreinfo_data_copy: *const core::ffi::c_void, + control_code_page: u64, // struct page* + swap_page: u64, // struct page* + vmcoreinfo_page: u64, // struct page* + vmcoreinfo_data_copy: u64, pub nr_segments: u64, pub segment: [KexecSegment; KEXEC_SEGMENT_MAX], // we do not need the rest of the fields for now @@ -163,9 +166,10 @@ pub struct Kimage { pub const KEXEC_SEGMENT_MAX: usize = 16; /// `list_head` from [Linux](https://elixir.bootlin.com/linux/v6.6.85/source/include/linux/types.h#L190) -#[derive(Clone, Copy, Debug)] +/// Pointer fields stored as u64 since we don't dereference them. 
+#[derive(Clone, Copy, Debug, FromBytes, IntoBytes, Immutable, KnownLayout)] #[repr(C)] pub struct ListHead { - pub next: *mut ListHead, - pub prev: *mut ListHead, + pub next: u64, + pub prev: u64, } diff --git a/litebox_platform_lvbs/src/host/per_cpu_variables.rs b/litebox_platform_lvbs/src/host/per_cpu_variables.rs index 7a3fc311c..34f730760 100644 --- a/litebox_platform_lvbs/src/host/per_cpu_variables.rs +++ b/litebox_platform_lvbs/src/host/per_cpu_variables.rs @@ -21,7 +21,8 @@ use litebox::utils::TruncateExt; use litebox_common_linux::{rdgsbase, wrgsbase}; use x86_64::VirtAddr; -pub const INTERRUPT_STACK_SIZE: usize = 2 * PAGE_SIZE; +pub const DOUBLE_FAULT_STACK_SIZE: usize = 2 * PAGE_SIZE; +pub const EXCEPTION_STACK_SIZE: usize = PAGE_SIZE; pub const KERNEL_STACK_SIZE: usize = 10 * PAGE_SIZE; /// Per-CPU VTL1 kernel variables @@ -30,8 +31,9 @@ pub const KERNEL_STACK_SIZE: usize = 10 * PAGE_SIZE; pub struct PerCpuVariables { hv_vp_assist_page: [u8; PAGE_SIZE], hv_simp_page: [u8; PAGE_SIZE], - interrupt_stack: [u8; INTERRUPT_STACK_SIZE], + double_fault_stack: [u8; DOUBLE_FAULT_STACK_SIZE], _guard_page_0: [u8; PAGE_SIZE], + exception_stack: [u8; EXCEPTION_STACK_SIZE], kernel_stack: [u8; KERNEL_STACK_SIZE], _guard_page_1: [u8; PAGE_SIZE], hvcall_input: [u8; PAGE_SIZE], @@ -52,8 +54,12 @@ impl PerCpuVariables { &raw const self.kernel_stack as u64 + (self.kernel_stack.len() - 1) as u64 } - pub(crate) fn interrupt_stack_top(&self) -> u64 { - &raw const self.interrupt_stack as u64 + (self.interrupt_stack.len() - 1) as u64 + pub(crate) fn double_fault_stack_top(&self) -> u64 { + &raw const self.double_fault_stack as u64 + (self.double_fault_stack.len() - 1) as u64 + } + + pub(crate) fn exception_stack_top(&self) -> u64 { + &raw const self.exception_stack as u64 + (self.exception_stack.len() - 1) as u64 } pub fn hv_vp_assist_page_as_ptr(&self) -> *const HvVpAssistPage { @@ -146,8 +152,9 @@ impl PerCpuVariables { static mut BSP_VARIABLES: PerCpuVariables = PerCpuVariables { hv_vp_assist_page: [0u8; PAGE_SIZE], hv_simp_page: [0u8; PAGE_SIZE], - interrupt_stack: [0u8; INTERRUPT_STACK_SIZE], + double_fault_stack: [0u8; DOUBLE_FAULT_STACK_SIZE], _guard_page_0: [0u8; PAGE_SIZE], + exception_stack: [0u8; EXCEPTION_STACK_SIZE], kernel_stack: [0u8; KERNEL_STACK_SIZE], _guard_page_1: [0u8; PAGE_SIZE], hvcall_input: [0u8; PAGE_SIZE], @@ -195,8 +202,10 @@ static mut BSP_VARIABLES: PerCpuVariables = PerCpuVariables { pub struct PerCpuVariablesAsm { /// Initial kernel stack pointer to reset the kernel stack on VTL switch kernel_stack_ptr: Cell, - /// Initial interrupt stack pointer for x86 IST - interrupt_stack_ptr: Cell, + /// Double fault stack pointer (TSS.IST1) + double_fault_stack_ptr: Cell, + /// Exception stack pointer (TSS.RSP0) + exception_stack_ptr: Cell, /// Return address for call-based VTL switching vtl_return_addr: Cell, /// Scratch pad @@ -231,17 +240,25 @@ pub struct PerCpuVariablesAsm { vtl1_kernel_xsaved: Cell, /// XSAVE/XRSTOR state tracking for VTL1 user (see `vtl1_kernel_xsaved` for state values and reset). 
vtl1_user_xsaved: Cell, + /// Exception info: exception vector number + exception_trapno: Cell, } impl PerCpuVariablesAsm { pub fn set_kernel_stack_ptr(&self, sp: usize) { self.kernel_stack_ptr.set(sp); } - pub fn set_interrupt_stack_ptr(&self, sp: usize) { - self.interrupt_stack_ptr.set(sp); + pub fn set_double_fault_stack_ptr(&self, sp: usize) { + self.double_fault_stack_ptr.set(sp); + } + pub fn get_double_fault_stack_ptr(&self) -> usize { + self.double_fault_stack_ptr.get() + } + pub fn set_exception_stack_ptr(&self, sp: usize) { + self.exception_stack_ptr.set(sp); } - pub fn get_interrupt_stack_ptr(&self) -> usize { - self.interrupt_stack_ptr.get() + pub fn get_exception_stack_ptr(&self) -> usize { + self.exception_stack_ptr.get() } pub fn set_vtl_return_addr(&self, addr: usize) { self.vtl_return_addr.set(addr); @@ -271,8 +288,11 @@ impl PerCpuVariablesAsm { pub const fn kernel_stack_ptr_offset() -> usize { offset_of!(PerCpuVariablesAsm, kernel_stack_ptr) } - pub const fn interrupt_stack_ptr_offset() -> usize { - offset_of!(PerCpuVariablesAsm, interrupt_stack_ptr) + pub const fn double_fault_stack_ptr_offset() -> usize { + offset_of!(PerCpuVariablesAsm, double_fault_stack_ptr) + } + pub const fn exception_stack_ptr_offset() -> usize { + offset_of!(PerCpuVariablesAsm, exception_stack_ptr) } pub const fn vtl_return_addr_offset() -> usize { offset_of!(PerCpuVariablesAsm, vtl_return_addr) @@ -319,6 +339,15 @@ impl PerCpuVariablesAsm { pub const fn vtl1_user_xsaved_offset() -> usize { offset_of!(PerCpuVariablesAsm, vtl1_user_xsaved) } + pub const fn exception_trapno_offset() -> usize { + offset_of!(PerCpuVariablesAsm, exception_trapno) + } + pub fn get_exception(&self) -> litebox::shim::Exception { + litebox::shim::Exception(self.exception_trapno.get()) + } + pub fn get_user_context_top_addr(&self) -> usize { + self.user_context_top_addr.get() + } /// Reset VTL1 xsaved flags to 0 at each VTL1 entry (OP-TEE SMC call). 
/// This ensures: /// - XRSTOR is skipped until XSAVE populates valid data (no spurious restores on fresh entry) @@ -350,7 +379,8 @@ impl RefCellWrapper { Self { pcv_asm: PerCpuVariablesAsm { kernel_stack_ptr: Cell::new(0), - interrupt_stack_ptr: Cell::new(0), + double_fault_stack_ptr: Cell::new(0), + exception_stack_ptr: Cell::new(0), vtl_return_addr: Cell::new(0), scratch: Cell::new(0), vtl0_state_top_addr: Cell::new(0), @@ -366,6 +396,7 @@ impl RefCellWrapper { vtl1_xsave_mask_hi: Cell::new(0), vtl1_kernel_xsaved: Cell::new(0), vtl1_user_xsaved: Cell::new(0), + exception_trapno: Cell::new(0), }, inner: RefCell::new(value), } @@ -551,14 +582,18 @@ pub fn init_per_cpu_variables() { with_per_cpu_variables_mut(|per_cpu_variables| { let kernel_sp = TruncateExt::::truncate(per_cpu_variables.kernel_stack_top()) & !(STACK_ALIGNMENT - 1); - let interrupt_sp = TruncateExt::::truncate(per_cpu_variables.interrupt_stack_top()) + let double_fault_sp = + TruncateExt::::truncate(per_cpu_variables.double_fault_stack_top()) + & !(STACK_ALIGNMENT - 1); + let exception_sp = TruncateExt::::truncate(per_cpu_variables.exception_stack_top()) & !(STACK_ALIGNMENT - 1); let vtl0_state_top_addr = TruncateExt::::truncate(&raw const per_cpu_variables.vtl0_state as u64) + core::mem::size_of::(); with_per_cpu_variables_asm(|pcv_asm| { pcv_asm.set_kernel_stack_ptr(kernel_sp); - pcv_asm.set_interrupt_stack_ptr(interrupt_sp); + pcv_asm.set_double_fault_stack_ptr(double_fault_sp); + pcv_asm.set_exception_stack_ptr(exception_sp); pcv_asm.set_vtl0_state_top_addr(vtl0_state_top_addr); }); }); diff --git a/litebox_platform_lvbs/src/lib.rs b/litebox_platform_lvbs/src/lib.rs index 4ba966eb5..1892e5f52 100644 --- a/litebox_platform_lvbs/src/lib.rs +++ b/litebox_platform_lvbs/src/lib.rs @@ -9,7 +9,7 @@ use crate::{host::per_cpu_variables::PerCpuVariablesAsm, mshv::vsm::Vtl0KernelInfo}; use core::{ arch::asm, - sync::atomic::{AtomicU32, AtomicU64, AtomicUsize, Ordering}, + sync::atomic::{AtomicU32, AtomicU64}, }; use hashbrown::HashMap; use litebox::platform::{ @@ -50,9 +50,35 @@ pub mod mshv; pub mod syscall_entry; +/// Allocate a zeroed `Box` directly on the heap, avoiding stack intermediaries +/// for large types (e.g., 4096-byte `HekiPage`). +/// +/// This is safe because `T: FromBytes` guarantees that all-zero bytes are a valid `T`. +/// +/// # Panics +/// +/// Panics if `T` is a zero-sized type, since `alloc_zeroed` with a zero-sized +/// layout is undefined behavior. +fn box_new_zeroed() -> alloc::boxed::Box { + assert!( + core::mem::size_of::() > 0, + "box_new_zeroed does not support zero-sized types" + ); + let layout = core::alloc::Layout::new::(); + // Safety: layout has a non-zero size and correct alignment for T. + let ptr = unsafe { alloc::alloc::alloc_zeroed(layout) }.cast::(); + if ptr.is_null() { + alloc::alloc::handle_alloc_error(layout); + } + // Safety: ptr is a valid, zeroed, properly aligned heap allocation for T. + // T: FromBytes guarantees all-zero is a valid bit pattern. + unsafe { alloc::boxed::Box::from_raw(ptr) } +} + static CPU_MHZ: AtomicU64 = AtomicU64::new(0); /// Special page table ID for the base (kernel-only) page table. +/// No real physical frame has address 0, so this is a safe sentinel. pub const BASE_PAGE_TABLE_ID: usize = 0; /// Maximum virtual address (exclusive) for user-space allocations. @@ -92,14 +118,8 @@ pub struct PageTableManager { base_page_table: mm::PageTable, /// Cached physical frame of the base page table (for fast CR3 comparison). 
base_page_table_frame: PhysFrame, - /// Task page tables indexed by their ID (starting from 1). - /// Each contains kernel mappings + task-specific user-space mappings. + /// Task page tables keyed by their P4 frame start address (the page table ID). task_page_tables: spin::Mutex>>>, - /// Reverse lookup: physical frame -> page table ID (for O(1) CR3 lookup). - /// Only contains task page tables (base page table is checked separately). - frame_to_id: spin::Mutex, usize>>, - /// Next available task page table ID. - next_task_pt_id: AtomicUsize, } impl PageTableManager { @@ -115,8 +135,6 @@ impl PageTableManager { base_page_table: base_pt, base_page_table_frame: base_frame, task_page_tables: spin::Mutex::new(HashMap::new()), - frame_to_id: spin::Mutex::new(HashMap::new()), - next_task_pt_id: AtomicUsize::new(1), } } @@ -139,25 +157,18 @@ impl PageTableManager { return &self.base_page_table; } - // Look up task page table by frame using the reverse lookup map - let task_pt_id = { - let frame_to_id = self.frame_to_id.lock(); - frame_to_id.get(&cr3_frame).copied() - }; - - if let Some(id) = task_pt_id { - let task_pts = self.task_page_tables.lock(); - if let Some(pt) = task_pts.get(&id) { - // SAFETY: Three invariants guarantee this reference remains valid: - // 1. The PageTable is Box-allocated, so HashMap rehashing does not - // move the PageTable itself (only the Box pointer moves). - // 2. This page table is the current CR3, so `delete_task_page_table` - // will refuse to remove it (returns EBUSY). - // 3. The PageTableManager is 'static, so neither it nor the HashMap - // will be deallocated. - let pt_ref: &mm::PageTable = pt; - return unsafe { &*core::ptr::from_ref(pt_ref) }; - } + let cr3_id: usize = cr3_frame.start_address().as_u64().truncate(); + let task_pts = self.task_page_tables.lock(); + if let Some(pt) = task_pts.get(&cr3_id) { + // SAFETY: Three invariants guarantee this reference remains valid: + // 1. The PageTable is Box-allocated, so HashMap rehashing does not + // move the PageTable itself (only the Box pointer moves). + // 2. This page table is the current CR3, so `delete_task_page_table` + // will refuse to remove it (returns EBUSY). + // 3. The PageTableManager is 'static, so neither it nor the HashMap + // will be deallocated. + let pt_ref: &mm::PageTable = pt; + return unsafe { &*core::ptr::from_ref(pt_ref) }; } // CR3 doesn't match any known page table - this shouldn't happen @@ -185,16 +196,8 @@ impl PageTableManager { return BASE_PAGE_TABLE_ID; } - let frame_to_id = self.frame_to_id.lock(); - if let Some(&id) = frame_to_id.get(&cr3_frame) { - return id; - } - - // CR3 doesn't match any known page table - this shouldn't happen - unreachable!( - "CR3 contains unknown page table: {:?}", - cr3_frame.start_address() - ); + // The task page table ID is the start address of the P4 frame. + cr3_frame.start_address().as_u64().truncate() } /// Returns `true` if the base page table is currently active. @@ -256,8 +259,8 @@ impl PageTableManager { /// /// # Returns /// - /// The ID of the newly created task page table, or `Err(Errno::ENOMEM)` if - /// allocation fails or the ID space is exhausted. + /// The ID of the newly created task page table (its P4 frame start address), + /// or `Err(Errno::ENOMEM)` if allocation fails. 
pub fn create_task_page_table( &self, vtl1_phys_frame_range: PhysFrameRange, @@ -273,21 +276,11 @@ impl PageTableManager { return Err(Errno::ENOMEM); } - let task_pt_id = self.next_task_pt_id.fetch_add(1, Ordering::Relaxed); - if task_pt_id == 0 { - // Wrapped around, which shouldn't happen in practice - return Err(Errno::ENOMEM); - } - let pt = alloc::boxed::Box::new(pt); - let phys_frame = pt.get_physical_frame(); + let task_pt_id: usize = pt.get_physical_frame().start_address().as_u64().truncate(); let mut task_pts = self.task_page_tables.lock(); task_pts.insert(task_pt_id, pt); - drop(task_pts); - - let mut frame_to_id = self.frame_to_id.lock(); - frame_to_id.insert(phys_frame, task_pt_id); Ok(task_pt_id) } @@ -319,20 +312,18 @@ impl PageTableManager { return Err(Errno::EINVAL); } - // Ensure we're not deleting the current page table (check CR3) - if self.current_page_table_id() == task_pt_id { + let mut task_pts = self.task_page_tables.lock(); + + // Check CR3 under the same lock to avoid TOCTOU with the removal below. + let (cr3_frame, _) = x86_64::registers::control::Cr3::read(); + let cr3_id: usize = cr3_frame.start_address().as_u64().truncate(); + if cr3_id == task_pt_id { return Err(Errno::EBUSY); } - let mut task_pts = self.task_page_tables.lock(); if let Some(pt) = task_pts.remove(&task_pt_id) { - let phys_frame = pt.get_physical_frame(); drop(task_pts); - let mut frame_to_id = self.frame_to_id.lock(); - frame_to_id.remove(&phys_frame); - drop(frame_to_id); - // Safety: We're about to delete this page table, so it's safe to unmap all pages. unsafe { pt.cleanup_user_mappings(Self::USER_ADDR_MIN, Self::USER_ADDR_MAX); @@ -522,42 +513,65 @@ impl LinuxKernel { } } + /// Map a VTL0 physical range and return a guard that unmaps on drop. + fn map_vtl0_guard( + &self, + phys_addr: x86_64::PhysAddr, + size: u64, + flags: PageTableFlags, + ) -> Option> { + let (page_addr, page_aligned_length) = self + .map_vtl0_phys_range(phys_addr, phys_addr + size, flags) + .ok()?; + let page_offset: usize = (phys_addr - phys_addr.align_down(Size4KiB::SIZE)).truncate(); + Some(Vtl0MappedGuard { + owner: self, + page_addr, + page_aligned_length, + ptr: page_addr.wrapping_add(page_offset), + size: size.truncate(), + }) + } + /// This function copies data from VTL0 physical memory to the VTL1 kernel through `Box`. /// Use this function instead of map/unmap functions to avoid potential TOCTTOU. 
- /// Better to replace this function with `::from_bytes()` or similar + /// /// # Safety /// /// The caller must ensure that the `phys_addr` is a valid VTL0 physical address - /// # Panics - /// - /// Panics if `phys_addr` is invalid or not properly aligned for `T` - pub unsafe fn copy_from_vtl0_phys( + pub unsafe fn copy_from_vtl0_phys( &self, phys_addr: x86_64::PhysAddr, ) -> Option> { - use alloc::boxed::Box; + if core::mem::size_of::() == 0 { + return Some(alloc::boxed::Box::new(T::new_zeroed())); + } - if let Ok((page_addr, length)) = self.map_vtl0_phys_range( + let src_guard = self.map_vtl0_guard( phys_addr, - phys_addr + core::mem::size_of::() as u64, + core::mem::size_of::() as u64, PageTableFlags::PRESENT, - ) { - let page_offset: usize = (phys_addr - phys_addr.align_down(Size4KiB::SIZE)).truncate(); - let src_ptr = page_addr.wrapping_add(page_offset).cast::(); - assert!(src_ptr.is_aligned(), "src_ptr is not properly aligned"); - - // Safety: src_ptr points to valid VTL0 memory that was just mapped - let boxed = Box::::new(unsafe { core::ptr::read_volatile(src_ptr) }); - - assert!( - self.unmap_vtl0_pages(page_addr, length).is_ok(), - "Failed to unmap VTL0 pages" - ); + )?; + + let mut boxed = box_new_zeroed::(); + // Use memcpy_fallible instead of ptr::copy_nonoverlapping to handle + // the race where another core unmaps this page (via a shared page + // table) between map_vtl0_guard and the copy. The mapping is valid + // at this point, so a fault is not expected in the common case. + // TODO: Once VTL0 page-range locking is in place, this fallible copy + // may become unnecessary since the lock would prevent concurrent + // unmapping. It could still serve as a safety net against callers + // that forget to acquire the lock. + let result = unsafe { + litebox::mm::exception_table::memcpy_fallible( + core::ptr::from_mut::(boxed.as_mut()).cast(), + src_guard.ptr, + src_guard.size, + ) + }; + debug_assert!(result.is_ok(), "fault copying from VTL0 mapped page"); - Some(boxed) - } else { - None - } + result.ok().map(|()| boxed) } /// This function copies data from the VTL1 kernel to VTL0 physical memory. @@ -565,34 +579,33 @@ impl LinuxKernel { /// # Safety /// /// The caller must ensure that the `phys_addr` is a valid VTL0 physical address - /// # Panics - /// - /// Panics if phys_addr is invalid or not properly aligned for `T` pub unsafe fn copy_to_vtl0_phys( &self, phys_addr: x86_64::PhysAddr, value: &T, ) -> bool { - if let Ok((page_addr, length)) = self.map_vtl0_phys_range( + if core::mem::size_of::() == 0 { + return true; + } + + let Some(dst_guard) = self.map_vtl0_guard( phys_addr, - phys_addr + core::mem::size_of::() as u64, + core::mem::size_of::() as u64, PageTableFlags::PRESENT | PageTableFlags::WRITABLE, - ) { - let page_offset: usize = (phys_addr - phys_addr.align_down(Size4KiB::SIZE)).truncate(); - let dst_ptr = page_addr.wrapping_add(page_offset).cast::(); - assert!(dst_ptr.is_aligned(), "dst_ptr is not properly aligned"); - - // Safety: dst_ptr points to valid VTL0 memory that was just mapped - unsafe { core::ptr::write_volatile(dst_ptr, *value) }; + ) else { + return false; + }; - assert!( - self.unmap_vtl0_pages(page_addr, length).is_ok(), - "Failed to unmap VTL0 pages" - ); - true - } else { - false - } + // Fallible: another core may unmap this page concurrently. 
+ let result = unsafe { + litebox::mm::exception_table::memcpy_fallible( + dst_guard.ptr, + core::ptr::from_ref::(value).cast::(), + dst_guard.size, + ) + }; + debug_assert!(result.is_ok(), "fault copying to VTL0 mapped page"); + result.is_ok() } /// This function copies a slice from the VTL1 kernel to VTL0 physical memory. @@ -601,40 +614,33 @@ impl LinuxKernel { /// # Safety /// /// The caller must ensure that the `phys_addr` is a valid VTL0 physical address. - /// - /// # Panics - /// - /// Panics if phys_addr is invalid or not properly aligned for `T` pub unsafe fn copy_slice_to_vtl0_phys( &self, phys_addr: x86_64::PhysAddr, value: &[T], ) -> bool { - if let Ok((page_addr, length)) = self.map_vtl0_phys_range( + if core::mem::size_of_val(value) == 0 { + return true; + } + + let Some(dst_guard) = self.map_vtl0_guard( phys_addr, - phys_addr + core::mem::size_of_val(value) as u64, + core::mem::size_of_val(value) as u64, PageTableFlags::PRESENT | PageTableFlags::WRITABLE, - ) { - let page_offset: usize = (phys_addr - phys_addr.align_down(Size4KiB::SIZE)).truncate(); - let dst_ptr = page_addr.wrapping_add(page_offset).cast::(); - assert!(dst_ptr.is_aligned(), "dst_ptr is not properly aligned"); - - // Safety: dst_ptr points to mapped VTL0 memory with enough space for value.len() - // elements. We use copy_nonoverlapping instead of creating a slice reference - // because VTL0 memory is external (similar to MMIO/DMA) and may be concurrently - // modified, which would violate Rust's aliasing model for references. - unsafe { - core::ptr::copy_nonoverlapping(value.as_ptr(), dst_ptr, value.len()); - } + ) else { + return false; + }; - assert!( - self.unmap_vtl0_pages(page_addr, length).is_ok(), - "Failed to unmap VTL0 pages" - ); - true - } else { - false - } + // Fallible: another core may unmap this page concurrently. + let result = unsafe { + litebox::mm::exception_table::memcpy_fallible( + dst_guard.ptr, + value.as_ptr().cast::(), + dst_guard.size, + ) + }; + debug_assert!(result.is_ok(), "fault copying to VTL0 mapped page"); + result.is_ok() } /// This function copies a slice from VTL0 physical memory to the VTL1 kernel. @@ -643,39 +649,33 @@ impl LinuxKernel { /// # Safety /// /// The caller must ensure that the `phys_addr` is a valid VTL0 physical address. - /// - /// # Panics - /// - /// Panics if phys_addr is invalid or not properly aligned for `T` pub unsafe fn copy_slice_from_vtl0_phys( &self, phys_addr: x86_64::PhysAddr, buf: &mut [T], ) -> bool { - if let Ok((page_addr, length)) = self.map_vtl0_phys_range( - phys_addr, - phys_addr + core::mem::size_of_val(buf) as u64, - PageTableFlags::PRESENT, - ) { - let page_offset: usize = (phys_addr - phys_addr.align_down(Size4KiB::SIZE)).truncate(); - let src_ptr = page_addr.wrapping_add(page_offset).cast::(); - assert!(src_ptr.is_aligned(), "src_ptr is not properly aligned"); - - // Safety: see copy_slice_to_vtl0_phys for why we use copy_nonoverlapping - // instead of creating a slice reference to VTL0 memory. - unsafe { - core::ptr::copy_nonoverlapping(src_ptr, buf.as_mut_ptr(), buf.len()); - } - - assert!( - self.unmap_vtl0_pages(page_addr, length).is_ok(), - "Failed to unmap VTL0 pages" - ); - + if core::mem::size_of_val(buf) == 0 { return true; } - false + let Some(src_guard) = self.map_vtl0_guard( + phys_addr, + core::mem::size_of_val(buf) as u64, + PageTableFlags::PRESENT, + ) else { + return false; + }; + + // Fallible: another core may unmap this page concurrently. 
+ let result = unsafe { + litebox::mm::exception_table::memcpy_fallible( + buf.as_mut_ptr().cast::(), + src_guard.ptr, + src_guard.size, + ) + }; + debug_assert!(result.is_ok(), "fault copying from VTL0 mapped page"); + result.is_ok() } /// Create a new task page table for VTL1 user space and returns its ID. @@ -759,6 +759,26 @@ impl LinuxKernel { } } +/// RAII guard that unmaps VTL0 physical pages when dropped. +struct Vtl0MappedGuard<'a, Host: HostInterface> { + owner: &'a LinuxKernel, + page_addr: *mut u8, + page_aligned_length: usize, + ptr: *mut u8, + size: usize, +} + +impl Drop for Vtl0MappedGuard<'_, Host> { + fn drop(&mut self) { + assert!( + self.owner + .unmap_vtl0_pages(self.page_addr, self.page_aligned_length) + .is_ok(), + "Failed to unmap VTL0 pages" + ); + } +} + impl RawMutexProvider for LinuxKernel { type RawMutex = RawMutex; } @@ -1521,14 +1541,79 @@ macro_rules! SAVE_SYSCALL_USER_CONTEXT_ASM { }; } -/// Restore user context from the memory area pointed by the current `rsp`. +/// Save user context after an ISR exception into the user context area. +/// +/// Similar to `SAVE_SYSCALL_USER_CONTEXT_ASM` but it preserves all GPRs. +/// The ISR stub pushes the vector number on top of the CPU-pushed error code +/// and iret frame. This macro copies them via a saved ISR stack pointer. /// -/// This macro uses the `pop` instructions (i.e., from low addresses up to high ones) such that -/// it requires the start address of the memory area (not the top one). +/// Prerequisites: +/// - `rsp` points to the top of the user context area (push target) +/// - `rax` points to the ISR stack: `[rax]`=vector, `[rax+8]`=error_code, +/// `[rax+16]`=RIP, `[rax+24]`=CS, `[rax+32]`=RFLAGS, `[rax+40]`=RSP, +/// `[rax+48]`=SS +/// - All GPRs except `rax` contain user-mode values +/// - User `rax` has been saved to per-CPU scratch +/// - `swapgs` has already been executed (GS = kernel) /// -/// Prerequisite: The memory area has `PtRegs` structure containing user context. +/// Clobbers: rax #[cfg(target_arch = "x86_64")] -macro_rules! RESTORE_USER_CONTEXT_ASM { +macro_rules! SAVE_PF_USER_CONTEXT_ASM { + () => { + " + push [rax + 48] // pt_regs->ss + push [rax + 40] // pt_regs->rsp + push [rax + 32] // pt_regs->eflags + push [rax + 24] // pt_regs->cs + push [rax + 16] // pt_regs->rip + push [rax + 8] // pt_regs->orig_rax (error code) + push rdi // pt_regs->rdi + push rsi // pt_regs->rsi + push rdx // pt_regs->rdx + push rcx // pt_regs->rcx + mov rax, gs:[{scratch_off}] + push rax // pt_regs->rax + push r8 // pt_regs->r8 + push r9 // pt_regs->r9 + push r10 // pt_regs->r10 + push r11 // pt_regs->r11 + push rbx // pt_regs->rbx + push rbp // pt_regs->rbp + push r12 // pt_regs->r12 + push r13 // pt_regs->r13 + push r14 // pt_regs->r14 + push r15 // pt_regs->r15 + " + }; +} + +/// Save all general-purpose registers onto the stack. +#[cfg(target_arch = "x86_64")] +macro_rules! SAVE_CPU_CONTEXT_ASM { + () => { + " + push rdi + push rsi + push rdx + push rcx + push rax + push r8 + push r9 + push r10 + push r11 + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + " + }; +} + +/// Restore all general-purpose registers and skip `orig_rax` from the stack. +#[cfg(target_arch = "x86_64")] +macro_rules! 
RESTORE_CPU_CONTEXT_ASM { () => { " pop r15 @@ -1595,13 +1680,65 @@ unsafe extern "C" fn run_thread_arch( "mov rdi, [rsp]", // pass `thread_ctx` "call {syscall_handler}", "jmp done", - // Exception and interrupt callback placeholders - // IDT handler functions will jump to these labels to - // handle user-mode exceptions/interrupts. - // Note that these two callbacks are not yet implemented and no code path jumps to them. + // Exception callback: entered from ISR stubs for user-mode exceptions. + // At this point: + // - rsp = ISR stack: [vector, error_code, rip, cs, rflags, rsp, ss] + // - All GPRs contain user-mode values + // - Interrupts are disabled (IDT gate clears IF) + // - GS = user (swapgs has NOT happened yet) ".globl exception_callback", "exception_callback:", + "swapgs", + "mov gs:[{scratch_off}], rax", // Save `rax` to per-CPU scratch + "mov al, [rsp]", + "mov gs:[{exception_trapno_off}], al", // vector number from ISR stack + "mov rax, rsp", // store ISR `rsp` in `rax` + "mov rsp, gs:[{user_context_top_off}]", // `rsp` points to the top address of user context area + SAVE_PF_USER_CONTEXT_ASM!(), + XSAVE_VTL1_ASM!({vtl1_user_xsave_area_off}, {vtl1_xsave_mask_lo_off}, {vtl1_xsave_mask_hi_off}, {vtl1_user_xsaved_off}), + "mov rbp, gs:[{cur_kernel_bp_off}]", + "mov rsp, gs:[{cur_kernel_sp_off}]", + "mov rdi, [rsp]", // pass `thread_ctx` + "xor esi, esi", // kernel_mode = false + "mov rdx, cr2", // cr2 (still valid — nothing overwrites it) + "call {exception_handler}", "jmp done", + // Kernel-mode exception callback (currently used for #PF demand paging + // and exception-table fixup). + // At entry: + // - rsp = ISR stack: [vector, error_code, rip, cs, rflags, rsp, ss] + // - All GPRs = kernel values at time of fault + // - Interrupts are disabled (IDT gate clears IF) + // - GS = kernel (no swapgs needed) + // + // Saves GPRs, then passes exception info (CR2, error code, faulting + // RIP) to exception_handler via registers. exception_handler will try + // demand paging, exception table fixup, and kernel panic in that order. + ".globl kernel_exception_callback", + "kernel_exception_callback:", + "add rsp, 8", // skip vector number + // Now stack: [error_code, rip, cs, rflags, rsp, ss] + SAVE_CPU_CONTEXT_ASM!(), + "mov rbp, rsp", + "and rsp, -16", + // Pass exception info via registers (SysV ABI args 1-5) + "mov rdi, gs:[{cur_kernel_sp_off}]", + "mov rdi, [rdi]", // arg1: thread_ctx + "mov esi, 1", // arg2: kernel_mode = true + "mov rdx, cr2", // arg3: cr2 (fault address) + "mov ecx, [rbp + 120]", // arg4: error_code (orig_rax slot) + "mov r8, [rbp + 128]", // arg5: faulting RIP (iret frame) + "call {exception_handler}", + // If demand paging failed, rax contains the exception table fixup + // address. Patch the saved RIP on the ISR stack so iretq resumes + // at the fixup instead of re-faulting. 
+ "test rax, rax", + "jz 5f", + "mov [rbp + 128], rax", // patch saved RIP (15 GPRs + error_code = 128) + "5:", + "mov rsp, rbp", + RESTORE_CPU_CONTEXT_ASM!(), + "iretq", ".globl interrupt_callback", "interrupt_callback:", "jmp done", @@ -1621,9 +1758,12 @@ unsafe extern "C" fn run_thread_arch( vtl1_kernel_xsaved_off = const { PerCpuVariablesAsm::vtl1_kernel_xsaved_offset() }, vtl1_user_xsaved_off = const { PerCpuVariablesAsm::vtl1_user_xsaved_offset() }, USER_CONTEXT_SIZE = const core::mem::size_of::(), + scratch_off = const { PerCpuVariablesAsm::scratch_offset() }, + exception_trapno_off = const { PerCpuVariablesAsm::exception_trapno_offset() }, init_handler = sym init_handler, reenter_handler = sym reenter_handler, syscall_handler = sym syscall_handler, + exception_handler = sym exception_handler, ); } @@ -1631,7 +1771,67 @@ unsafe extern "C" fn syscall_handler(thread_ctx: &mut ThreadContext) { thread_ctx.call_shim(|shim, ctx| shim.syscall(ctx)); } +/// Handles exceptions and routes to the shim's exception handler via `call_shim`. +/// +/// `cr2` is passed by both kernel- and user-mode assembly callbacks. +/// For kernel-mode exceptions, `error_code` and `faulting_rip` +/// are also passed from the ISR stack. +/// For user-mode exceptions, `error_code` is read from the saved +/// `orig_rax` in the user context and the vector number is read from +/// the per-CPU trapno variable. +/// +/// Returns 0 for normal flow (user-mode or successful demand paging), or +/// a fixup address when kernel-mode user-space demand paging fails and +/// an exception table entry exists. Panics if no fixup is found. +unsafe extern "C" fn exception_handler( + thread_ctx: &mut ThreadContext, + kernel_mode: bool, + cr2: usize, + error_code: usize, + faulting_rip: usize, +) -> usize { + let info = if kernel_mode { + use litebox::utils::TruncateExt as _; + litebox::shim::ExceptionInfo { + exception: litebox::shim::Exception::PAGE_FAULT, + error_code: error_code.truncate(), + cr2, + kernel_mode: true, + } + } else { + use crate::host::per_cpu_variables::{PerCpuVariablesAsm, with_per_cpu_variables_asm}; + use litebox::utils::TruncateExt as _; + litebox::shim::ExceptionInfo { + exception: with_per_cpu_variables_asm(PerCpuVariablesAsm::get_exception), + error_code: thread_ctx.ctx.orig_rax.truncate(), + cr2, + kernel_mode: false, + } + }; + match thread_ctx.call_shim(|shim, ctx| shim.exception(ctx, &info)) { + Some(val) => val, + None => { + // ExceptionFixup: look up exception table, panic if not found. + litebox::mm::exception_table::search_exception_tables(faulting_rip).unwrap_or_else( + || { + panic!( + "EXCEPTION: PAGE FAULT\n\ + Accessed Address: {:#x}\n\ + Error Code: {:#x}\n\ + Faulting RIP: {:#x}", + info.cr2, info.error_code, faulting_rip, + ) + }, + ) + } + } +} + /// Calls `f` in order to call into a shim entrypoint. +/// +/// Returns `Some(0)` for most operations. Returns `None` for +/// `ExceptionFixup` (caller is responsible for looking up the fixup). +/// For `ResumeGuest`, does not return (switches directly to user mode). 
impl ThreadContext<'_> { fn call_shim( &mut self, @@ -1639,11 +1839,12 @@ impl ThreadContext<'_> { &dyn litebox::shim::EnterShim, &mut litebox_common_linux::PtRegs, ) -> ContinueOperation, - ) { + ) -> Option { let op = f(self.shim, self.ctx); match op { ContinueOperation::ResumeGuest => unsafe { switch_to_user(self.ctx) }, - ContinueOperation::ExitThread => {} + ContinueOperation::ExitThread | ContinueOperation::ResumeKernelPlatform => Some(0), + ContinueOperation::ExceptionFixup => None, } } } @@ -1681,7 +1882,7 @@ unsafe extern "C" fn switch_to_user(_ctx: &litebox_common_linux::PtRegs) -> ! { XRSTOR_VTL1_ASM!({vtl1_user_xsave_area_off}, {vtl1_xsave_mask_lo_off}, {vtl1_xsave_mask_hi_off}, {vtl1_user_xsaved_off}), // Restore user context from ctx. "mov rsp, rdi", - RESTORE_USER_CONTEXT_ASM!(), + RESTORE_CPU_CONTEXT_ASM!(), // clear the GS base register (as the `KernelGsBase` MSR contains 0) // while writing the current GS base value to `KernelGsBase`. "swapgs", diff --git a/litebox_platform_lvbs/src/mshv/heki.rs b/litebox_platform_lvbs/src/mshv/heki.rs index 5314992c8..9b6d3f7e7 100644 --- a/litebox_platform_lvbs/src/mshv/heki.rs +++ b/litebox_platform_lvbs/src/mshv/heki.rs @@ -12,6 +12,7 @@ use x86_64::{ PhysAddr, VirtAddr, structures::paging::{PageSize, Size4KiB}, }; +use zerocopy::{FromBytes, FromZeros, Immutable, IntoBytes, KnownLayout}; bitflags::bitflags! { #[derive(Clone, Copy, Debug, PartialEq)] @@ -106,7 +107,7 @@ pub(crate) fn mod_mem_type_to_mem_attr(mod_mem_type: ModMemType) -> MemAttr { /// `HekiRange` is a generic container for various types of memory ranges. /// It has an `attributes` field which can be interpreted differently based on the context like /// `MemAttr`, `KdataType`, `ModMemType`, or `KexecType`. -#[derive(Default, Clone, Copy)] +#[derive(Default, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)] #[repr(C, packed)] pub struct HekiRange { pub va: u64, @@ -194,11 +195,12 @@ impl core::fmt::Debug for HekiRange { pub const HEKI_MAX_RANGES: usize = ((PAGE_SIZE as u32 - u64::BITS * 3 / 8) / core::mem::size_of::() as u32) as usize; -#[derive(Clone, Copy)] +#[derive(Clone, Copy, FromBytes, Immutable, KnownLayout)] #[repr(align(4096))] #[repr(C)] pub struct HekiPage { - pub next: *mut HekiPage, + /// Pointer to next page (stored as u64 since we don't dereference it) + pub next: u64, pub next_pa: u64, pub nranges: u64, pub ranges: [HekiRange; HEKI_MAX_RANGES], @@ -207,10 +209,8 @@ pub struct HekiPage { impl HekiPage { pub fn new() -> Self { - HekiPage { - next: core::ptr::null_mut(), - ..Default::default() - } + // Safety: all fields are valid when zeroed (u64 zeros, array of zeroed HekiRange) + Self::new_zeroed() } pub fn is_valid(&self) -> bool { @@ -234,7 +234,7 @@ impl HekiPage { impl Default for HekiPage { fn default() -> Self { - Self::new() + Self::new_zeroed() } } @@ -247,30 +247,20 @@ impl<'a> IntoIterator for &'a HekiPage { } } -#[derive(Default, Clone, Copy, Debug)] +#[derive(Default, Clone, Copy, Debug, FromBytes, IntoBytes, Immutable, KnownLayout)] #[repr(C)] pub struct HekiPatch { pub pa: [u64; 2], pub size: u8, pub code: [u8; POKE_MAX_OPCODE_SIZE], + _padding: [u8; 2], } pub const POKE_MAX_OPCODE_SIZE: usize = 5; impl HekiPatch { /// Creates a new `HekiPatch` with a given buffer. Returns `None` if any field is invalid. 
pub fn try_from_bytes(bytes: &[u8]) -> Option { - if bytes.len() != core::mem::size_of::() { - return None; - } - let mut patch = core::mem::MaybeUninit::::uninit(); - let patch = unsafe { - core::ptr::copy_nonoverlapping( - bytes.as_ptr().cast::(), - patch.as_mut_ptr().cast::(), - core::mem::size_of::(), - ); - patch.assume_init() - }; + let patch = Self::read_from_bytes(bytes).ok()?; if patch.is_valid() { Some(patch) } else { None } } @@ -312,12 +302,14 @@ pub enum HekiPatchType { Unknown = 0xffff_ffff, } -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, FromBytes, Immutable, KnownLayout)] #[repr(C)] pub struct HekiPatchInfo { - pub typ_: HekiPatchType, + /// Patch type stored as u32 for zerocopy compatibility (see `HekiPatchType`) + pub typ_: u32, list: ListHead, - mod_: *const core::ffi::c_void, // *const `struct module` + /// *const `struct module` (stored as u64 since we don't dereference it) + mod_: u64, pub patch_index: u64, pub max_patch_count: u64, // pub patch: [HekiPatch; *] @@ -326,23 +318,12 @@ pub struct HekiPatchInfo { impl HekiPatchInfo { /// Creates a new `HekiPatchInfo` with a given buffer. Returns `None` if any field is invalid. pub fn try_from_bytes(bytes: &[u8]) -> Option { - if bytes.len() != core::mem::size_of::() { - return None; - } - let mut info = core::mem::MaybeUninit::::uninit(); - let info = unsafe { - core::ptr::copy_nonoverlapping( - bytes.as_ptr().cast::(), - info.as_mut_ptr().cast::(), - core::mem::size_of::(), - ); - info.assume_init() - }; + let info = Self::read_from_bytes(bytes).ok()?; if info.is_valid() { Some(info) } else { None } } pub fn is_valid(&self) -> bool { - !(self.typ_ != HekiPatchType::JumpLabel + !(self.typ_ != HekiPatchType::JumpLabel as u32 || self.patch_index == 0 || self.patch_index > self.max_patch_count) } diff --git a/litebox_platform_lvbs/src/mshv/mem_integrity.rs b/litebox_platform_lvbs/src/mshv/mem_integrity.rs index 4f60649e6..a66182f4f 100644 --- a/litebox_platform_lvbs/src/mshv/mem_integrity.rs +++ b/litebox_platform_lvbs/src/mshv/mem_integrity.rs @@ -37,6 +37,7 @@ use x509_cert::{ Certificate, der::{Decode, Encode, oid::ObjectIdentifier}, }; +use zerocopy::FromBytes; /// This function validates the memory content of a loaded kernel module against the original ELF file. 
/// In particular, it checks whether the non-relocatable/patchable bytes of certain sections @@ -447,18 +448,11 @@ fn extract_module_data_and_signature( }) .ok_or(VerificationError::SignatureNotFound)?; - let mut module_signature = core::mem::MaybeUninit::::uninit(); - unsafe { - core::ptr::copy_nonoverlapping( - signed_module - .as_ptr() - .add(module_signature_offset) - .cast::(), - module_signature.as_mut_ptr().cast::(), - core::mem::size_of::(), - ); - } - let module_signature = unsafe { module_signature.assume_init() }; + let module_signature = ModuleSignature::read_from_bytes( + &signed_module[module_signature_offset + ..module_signature_offset + core::mem::size_of::()], + ) + .map_err(|_| VerificationError::InvalidSignature)?; if !module_signature.is_valid() { return Err(VerificationError::InvalidSignature); } diff --git a/litebox_platform_lvbs/src/mshv/vsm.rs b/litebox_platform_lvbs/src/mshv/vsm.rs index 51ee92afa..c338e8b1b 100644 --- a/litebox_platform_lvbs/src/mshv/vsm.rs +++ b/litebox_platform_lvbs/src/mshv/vsm.rs @@ -56,8 +56,9 @@ use x86_64::{ structures::paging::{PageSize, PhysFrame, Size4KiB, frame::PhysFrameRange}, }; use x509_cert::{Certificate, der::Decode}; +use zerocopy::{FromBytes, FromZeros, Immutable, IntoBytes, KnownLayout}; -#[derive(Copy, Clone)] +#[derive(Copy, Clone, FromBytes, Immutable, KnownLayout)] #[repr(align(4096))] struct AlignedPage([u8; PAGE_SIZE]); @@ -753,20 +754,13 @@ pub fn mshv_vsm_kexec_validate(pa: u64, nranges: u64, crash: u64) -> Result::uninit(); - let kimage_slice: &mut [u8] = unsafe { - core::slice::from_raw_parts_mut( - kimage.as_mut_ptr().cast::(), - core::mem::size_of::(), - ) - }; - kimage_slice.copy_from_slice(&kexec_image[..core::mem::size_of::()]); - let kimage = unsafe { kimage.assume_init() }; + let kimage = Kimage::read_from_bytes(&kexec_image[..core::mem::size_of::()]) + .map_err(|_| VsmError::KexecImageSegmentsInvalid)?; if kimage.nr_segments > KEXEC_SEGMENT_MAX as u64 { return Err(VsmError::KexecImageSegmentsInvalid); } for i in 0..usize::try_from(kimage.nr_segments).unwrap_or(0) { - let va = kimage.segment[i].buf as u64; + let va = kimage.segment[i].buf; let pa = kimage.segment[i].mem; if let Some(epa) = pa.checked_add(kimage.segment[i].memsz) { kexec_memory_metadata.insert_memory_range(KexecMemoryRange::new(va, pa, epa)); @@ -850,25 +844,19 @@ fn copy_heki_patch_from_vtl0(patch_pa_0: u64, patch_pa_1: u64) -> Result::uninit(); - let heki_patch_slice: &mut [u8] = unsafe { - core::slice::from_raw_parts_mut( - heki_patch.as_mut_ptr().cast::(), - core::mem::size_of::(), - ) - }; + let mut heki_patch = HekiPatch::new_zeroed(); + let heki_patch_bytes = heki_patch.as_mut_bytes(); unsafe { if !crate::platform_low().copy_slice_from_vtl0_phys( patch_pa_0, - heki_patch_slice.get_unchecked_mut(..bytes_in_first_page), + heki_patch_bytes.get_unchecked_mut(..bytes_in_first_page), ) || !crate::platform_low().copy_slice_from_vtl0_phys( patch_pa_1, - heki_patch_slice.get_unchecked_mut(bytes_in_first_page..), + heki_patch_bytes.get_unchecked_mut(bytes_in_first_page..), ) { return Err(VsmError::Vtl0CopyFailed); } } - let heki_patch = unsafe { heki_patch.assume_init() }; if heki_patch.is_valid() { Ok(heki_patch) } else { diff --git a/litebox_platform_windows_userland/src/lib.rs b/litebox_platform_windows_userland/src/lib.rs index 00a95b086..836c161ea 100644 --- a/litebox_platform_windows_userland/src/lib.rs +++ b/litebox_platform_windows_userland/src/lib.rs @@ -1713,6 +1713,7 @@ unsafe extern "C-unwind" fn exception_handler( exception, 
error_code, cr2, + kernel_mode: false, }; thread_ctx.call_shim(|shim, ctx, _interrupt| shim.exception(ctx, &info)); @@ -1753,6 +1754,12 @@ impl ThreadContext<'_> { match op { ContinueOperation::ResumeGuest => unsafe { switch_to_guest(self.ctx) }, ContinueOperation::ExitThread => {} + ContinueOperation::ResumeKernelPlatform => { + panic!("ResumeKernelPlatform not expected in windows_userland") + } + ContinueOperation::ExceptionFixup => { + panic!("ExceptionFixup not expected in windows_userland") + } } } } @@ -1795,6 +1802,25 @@ impl litebox::platform::CrngProvider for WindowsUserland { } } +/// Dummy `VmemPageFaultHandler`. +/// +/// Page faults are handled transparently by the host Windows kernel. +/// Provided to satisfy trait bounds for `PageManager::handle_page_fault`. +impl litebox::mm::linux::VmemPageFaultHandler for WindowsUserland { + unsafe fn handle_page_fault( + &self, + _fault_addr: usize, + _flags: litebox::mm::linux::VmFlags, + _error_code: u64, + ) -> Result<(), litebox::mm::linux::PageFaultError> { + unreachable!("host kernel handles page faults for Windows userland") + } + + fn access_error(_error_code: u64, _flags: litebox::mm::linux::VmFlags) -> bool { + unreachable!("host kernel handles page faults for Windows userland") + } +} + #[cfg(test)] mod tests { use core::sync::atomic::AtomicU32; diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index 53157c692..1ca9e28fd 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -10,6 +10,7 @@ use alloc::sync::Arc; use core::{ops::Neg, panic::PanicInfo}; use litebox::{ mm::linux::PAGE_SIZE, + platform::RawConstPointer, utils::{ReinterpretSignedExt, TruncateExt}, }; use litebox_common_linux::errno::Errno; @@ -39,9 +40,9 @@ use litebox_shim_optee::msg_handler::{ decode_ta_request, handle_optee_msg_args, handle_optee_smc_args, update_optee_msg_args, }; use litebox_shim_optee::session::{ - MAX_TA_INSTANCES, SessionManager, TaInstance, allocate_session_id, + MAX_TA_INSTANCES, SessionIdGuard, SessionManager, TaInstance, allocate_session_id, }; -use litebox_shim_optee::{NormalWorldConstPtr, NormalWorldMutPtr}; +use litebox_shim_optee::{NormalWorldConstPtr, NormalWorldMutPtr, UserConstPtr}; use once_cell::race::OnceBox; use spin::mutex::SpinMutex; @@ -401,8 +402,13 @@ fn open_session_single_instance( .try_lock() .ok_or(OpteeSmcReturnCode::EThreadLimit)?; - // Allocate session ID BEFORE calling load_ta_context so TA gets correct ID - let runner_session_id = allocate_session_id().ok_or(OpteeSmcReturnCode::EBusy)?; + // Allocate session ID BEFORE calling load_ta_context so TA gets correct ID. + // Use SessionIdGuard to ensure the ID is recycled on any error path + // (before it is registered with the session manager). + let session_id_guard = + SessionIdGuard::new(allocate_session_id().ok_or(OpteeSmcReturnCode::EBusy)?); + // Safe to unwrap: guard was just created with Some(id). 
+ let runner_session_id = session_id_guard.id().unwrap(); debug_serial_println!( "Reusing single-instance TA: uuid={:?}, task_pt_id={}, session_id={}", @@ -445,7 +451,9 @@ fn open_session_single_instance( .loaded_program .params_address .ok_or(OpteeSmcReturnCode::EBadAddr)?; - let ta_params = unsafe { *(params_address as *const UteeParams) }; + let ta_params = UserConstPtr::::from_usize(params_address) + .read_at_offset(0) + .ok_or(OpteeSmcReturnCode::EBadAddr)?; // Check the return code from the TA's OpenSession entry point let return_code: u32 = ctx.rax.truncate(); @@ -515,7 +523,9 @@ fn open_session_single_instance( return Ok(()); } - // Success: register session + // Success: register session and disarm the guard (ownership transfers to session map) + // Safe to unwrap: guard has not been disarmed yet. + let runner_session_id = session_id_guard.disarm().unwrap(); session_manager().register_session(runner_session_id, instance_arc.clone(), ta_uuid, ta_flags); write_msg_args_to_normal_world( @@ -672,7 +682,9 @@ fn open_session_new_instance( let params_address = loaded_program .params_address .ok_or(OpteeSmcReturnCode::EBadAddr)?; - let ta_params = unsafe { *(params_address as *const UteeParams) }; + let ta_params = UserConstPtr::::from_usize(params_address) + .read_at_offset(0) + .ok_or(OpteeSmcReturnCode::EBadAddr)?; // Check the return code from the TA's OpenSession entry point let return_code: u32 = ctx.rax.truncate(); @@ -804,7 +816,9 @@ fn handle_invoke_command( .loaded_program .params_address .ok_or(OpteeSmcReturnCode::EBadAddr)?; - let ta_params = unsafe { *(params_address as *const UteeParams) }; + let ta_params = UserConstPtr::::from_usize(params_address) + .read_at_offset(0) + .ok_or(OpteeSmcReturnCode::EBadAddr)?; let return_code: u32 = ctx.rax.truncate(); let return_code = TeeResult::try_from(return_code).unwrap_or(TeeResult::GenericError); diff --git a/litebox_runner_lvbs/x86_64_vtl1.ld b/litebox_runner_lvbs/x86_64_vtl1.ld index a7060f942..1123244a2 100644 --- a/litebox_runner_lvbs/x86_64_vtl1.ld +++ b/litebox_runner_lvbs/x86_64_vtl1.ld @@ -23,6 +23,13 @@ SECTIONS _data_start = .; *(.rodata .rodata.*) *(.data .data.*) + + /* Exception table for fallible memory operations (memcpy_fallible, etc.) */ + . = ALIGN(4); + __start_ex_table = .; + KEEP(*(ex_table)) + __stop_ex_table = .; + _data_end = .; . 
= ALIGN(0x1000); diff --git a/litebox_shim_linux/src/lib.rs b/litebox_shim_linux/src/lib.rs index f4dccc729..1db39dda4 100644 --- a/litebox_shim_linux/src/lib.rs +++ b/litebox_shim_linux/src/lib.rs @@ -94,6 +94,20 @@ impl litebox::shim::EnterShim for LinuxShimEntrypoints { ctx: &mut Self::ExecutionContext, info: &litebox::shim::ExceptionInfo, ) -> ContinueOperation { + if info.kernel_mode && info.exception == litebox::shim::Exception::PAGE_FAULT { + if unsafe { + self.task + .global + .pm + .handle_page_fault(info.cr2, info.error_code.into()) + } + .is_ok() + { + return ContinueOperation::ResumeKernelPlatform; + } else { + return ContinueOperation::ExceptionFixup; + } + } self.enter_shim(false, ctx, |task, _ctx| task.handle_exception_request(info)) } diff --git a/litebox_shim_linux/src/syscalls/signal/mod.rs b/litebox_shim_linux/src/syscalls/signal/mod.rs index a48b98dfa..9c21bb1c9 100644 --- a/litebox_shim_linux/src/syscalls/signal/mod.rs +++ b/litebox_shim_linux/src/syscalls/signal/mod.rs @@ -63,6 +63,7 @@ impl SignalState { exception: litebox::shim::Exception(0), error_code: 0, cr2: 0, + kernel_mode: false, }), } } diff --git a/litebox_shim_optee/src/lib.rs b/litebox_shim_optee/src/lib.rs index 453dd0595..b3d71781a 100644 --- a/litebox_shim_optee/src/lib.rs +++ b/litebox_shim_optee/src/lib.rs @@ -39,7 +39,7 @@ pub mod ptr; // Re-export session management types for convenience pub use session::{ MAX_TA_INSTANCES, SessionEntry, SessionManager, SessionMap, SingleInstanceCache, TaInstance, - allocate_session_id, recycle_session_id, + allocate_session_id, }; const MAX_KERNEL_BUF_SIZE: usize = 0x80_000; @@ -68,10 +68,31 @@ impl litebox::shim::EnterShim for OpteeShimEntrypoints { fn exception( &self, - _ctx: &mut Self::ExecutionContext, + ctx: &mut Self::ExecutionContext, info: &litebox::shim::ExceptionInfo, ) -> ContinueOperation { - todo!("Handle exception in OP-TEE shim: {:?}", info,); + if info.exception == litebox::shim::Exception::PAGE_FAULT { + let result = unsafe { + self.task + .global + .pm + .handle_page_fault(info.cr2, info.error_code.into()) + }; + if info.kernel_mode { + return if result.is_ok() { + ContinueOperation::ResumeKernelPlatform + } else { + ContinueOperation::ExceptionFixup + }; + } else if result.is_ok() { + return ContinueOperation::ResumeGuest; + } + // User-mode page fault that couldn't be resolved; + // fall through to kill the TA below. + } + // OP-TEE has no signal handling. Kill the TA on any non-PF exception. 
+ ctx.rax = (TeeResult::TargetDead as u32) as usize; + ContinueOperation::ExitThread } fn interrupt(&self, _ctx: &mut Self::ExecutionContext) -> ContinueOperation { @@ -742,7 +763,7 @@ impl Task { 0, tls_size, ProtFlags::PROT_READ | ProtFlags::PROT_WRITE, - MapFlags::MAP_PRIVATE | MapFlags::MAP_ANONYMOUS | MapFlags::MAP_POPULATE, + MapFlags::MAP_PRIVATE | MapFlags::MAP_ANONYMOUS, -1, 0, )?; @@ -953,16 +974,16 @@ impl TeeObjMap { match user_attrs[0].attribute_id { TeeAttributeType::SecretValue => { let key_addr: usize = user_attrs[0].a.truncate(); - let key_len = usize::try_from(user_attrs[0].b).unwrap(); + let key_len: usize = user_attrs[0].b.truncate(); // TODO: revisit buffer size limits based on OP-TEE spec and deployment constraints if key_len > MAX_KERNEL_BUF_SIZE { return Err(TeeResult::BadParameters); } let key_ptr = UserConstPtr::::from_usize(key_addr); - let key_slice = key_ptr - .to_owned_slice(key_len) - .ok_or(TeeResult::BadParameters)?; - tee_obj.set_key(&key_slice); + let Some(key_box) = key_ptr.to_owned_slice(key_len) else { + return Err(TeeResult::BadParameters); + }; + tee_obj.set_key(&key_box); } _ => todo!( "handle attribute ID: {}", @@ -1252,12 +1273,6 @@ struct Task { // TODO: OP-TEE supports global, persistent objects across sessions. Add these maps if needed. } -impl Drop for Task { - fn drop(&mut self) { - SessionIdPool::recycle(self.session_id); - } -} - struct ThreadState { init_state: Cell, /// Whether init has been called. This is used to ensure `handle_init_request` diff --git a/litebox_shim_optee/src/loader/elf.rs b/litebox_shim_optee/src/loader/elf.rs index c6b43c05e..47859eb09 100644 --- a/litebox_shim_optee/src/loader/elf.rs +++ b/litebox_shim_optee/src/loader/elf.rs @@ -81,7 +81,7 @@ impl litebox_common_linux::loader::MapMemory for ElfFileInMemory<'_> { super::DEFAULT_LOW_ADDR, mapping_len, ProtFlags::PROT_NONE, - MapFlags::MAP_ANONYMOUS | MapFlags::MAP_PRIVATE | MapFlags::MAP_POPULATE, + MapFlags::MAP_ANONYMOUS | MapFlags::MAP_PRIVATE, -1, 0, )? @@ -120,6 +120,8 @@ impl litebox_common_linux::loader::MapMemory for ElfFileInMemory<'_> { MapFlags::MAP_ANONYMOUS | MapFlags::MAP_PRIVATE | MapFlags::MAP_FIXED + // Pre-populate: ELF loading runs before run_thread_arch sets up + // the kernel-mode demand paging infrastructure. | MapFlags::MAP_POPULATE, -1, offset.truncate(), @@ -157,6 +159,8 @@ impl litebox_common_linux::loader::MapMemory for ElfFileInMemory<'_> { MapFlags::MAP_ANONYMOUS | MapFlags::MAP_PRIVATE | MapFlags::MAP_FIXED + // Pre-populate: ELF loading runs before run_thread_arch sets up + // the kernel-mode demand paging infrastructure. | MapFlags::MAP_POPULATE, -1, 0, diff --git a/litebox_shim_optee/src/loader/ta_stack.rs b/litebox_shim_optee/src/loader/ta_stack.rs index 4cf687c9c..c1cf3d10d 100644 --- a/litebox_shim_optee/src/loader/ta_stack.rs +++ b/litebox_shim_optee/src/loader/ta_stack.rs @@ -289,8 +289,8 @@ pub(crate) fn allocate_stack(task: &crate::Task, stack_base: Option) -> O .create_stack_pages( None, length, - // Use POPULATE_PAGES_IMMEDIATELY since some platforms (e.g., LVBS) - // do not support demand paging yet. + // Pre-populate: stack initialization runs before run_thread_arch + // sets up the kernel-mode demand paging infrastructure. CreatePagesFlags::POPULATE_PAGES_IMMEDIATELY, ) .ok()? 
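With the kernel-mode page-fault path in place, the OP-TEE shim drops `MAP_POPULATE` everywhere except for allocations made before `run_thread_arch` installs the exception handler (the ELF segments and TA stack above). A minimal sketch of the resulting first-touch behavior; `map_anonymous` and the `Result<usize, Errno>` return type are stand-ins for the shim's internal mmap path, whose name is not shown in these hunks.

```rust
// Sketch only: `map_anonymous` is a hypothetical stand-in; the flag and
// error types are the ones used elsewhere in this diff.
fn map_lazily(task: &Task, len: usize) -> Result<usize, Errno> {
    task.map_anonymous(
        0,
        len,
        ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
        // No MAP_POPULATE: backing pages are materialized on first touch.
        MapFlags::MAP_PRIVATE | MapFlags::MAP_ANONYMOUS,
        -1,
        0,
    )
    // First touch of a page in this mapping raises #PF:
    // - a guest (TA) access is resolved via pm.handle_page_fault and the
    //   shim returns ContinueOperation::ResumeGuest;
    // - a kernel-platform access (ExceptionInfo::kernel_mode == true)
    //   returns ResumeKernelPlatform on success, or ExceptionFixup so the
    //   platform falls back to the exception table.
}
```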
diff --git a/litebox_shim_optee/src/ptr.rs b/litebox_shim_optee/src/ptr.rs index b6b720203..a27105caa 100644 --- a/litebox_shim_optee/src/ptr.rs +++ b/litebox_shim_optee/src/ptr.rs @@ -183,38 +183,26 @@ impl PhysMutPtr { if count >= self.count { return Err(PhysPointerError::IndexOutOfBounds(count, self.count)); } - let src = match unsafe { - self.map_and_get_ptr( + let guard = unsafe { + self.map_and_get_ptr_guard( count, core::mem::size_of::(), PhysPageMapPermissions::READ, - ) - } { - Ok(ptr) => ptr, - Err(e) => { - let _ = unsafe { self.unmap() }; - return Err(e); - } + )? }; - let val = { - let mut buffer = core::mem::MaybeUninit::::uninit(); - if (src as usize).is_multiple_of(core::mem::align_of::()) { - unsafe { - core::ptr::copy_nonoverlapping(src, buffer.as_mut_ptr(), 1); - } - } else { - unsafe { - core::ptr::copy_nonoverlapping( - src.cast::(), - buffer.as_mut_ptr().cast::(), - core::mem::size_of::(), - ); - } - } - unsafe { buffer.assume_init() } + let mut buffer = core::mem::MaybeUninit::::uninit(); + // Fallible: another core may unmap this page concurrently. + let result = unsafe { + litebox::mm::exception_table::memcpy_fallible( + buffer.as_mut_ptr().cast::(), + guard.ptr.cast::(), + guard.size, + ) }; - let _ = unsafe { self.unmap() }; - Ok(alloc::boxed::Box::new(val)) + debug_assert!(result.is_ok(), "fault reading from mapped physical page"); + result.map_err(|_| PhysPointerError::CopyFailed)?; + // Safety: memcpy_fallible fully initialized the buffer on success. + Ok(alloc::boxed::Box::new(unsafe { buffer.assume_init() })) } /// Read a slice of values at the given offset from the physical pointer. @@ -235,33 +223,23 @@ impl PhysMutPtr { { return Err(PhysPointerError::IndexOutOfBounds(count, self.count)); } - let src = match unsafe { - self.map_and_get_ptr( + let guard = unsafe { + self.map_and_get_ptr_guard( count, core::mem::size_of_val(values), PhysPageMapPermissions::READ, + )? + }; + // Fallible: another core may unmap this page concurrently. + let result = unsafe { + litebox::mm::exception_table::memcpy_fallible( + values.as_mut_ptr().cast::(), + guard.ptr.cast::(), + guard.size, ) - } { - Ok(ptr) => ptr, - Err(e) => { - let _ = unsafe { self.unmap() }; - return Err(e); - } }; - if (src as usize).is_multiple_of(core::mem::align_of::()) { - unsafe { - core::ptr::copy_nonoverlapping(src, values.as_mut_ptr(), values.len()); - } - } else { - unsafe { - core::ptr::copy_nonoverlapping( - src.cast::(), - values.as_mut_ptr().cast::(), - core::mem::size_of_val(values), - ); - } - } - let _ = unsafe { self.unmap() }; + debug_assert!(result.is_ok(), "fault reading from mapped physical page"); + result.map_err(|_| PhysPointerError::CopyFailed)?; Ok(()) } @@ -280,25 +258,23 @@ impl PhysMutPtr { if count >= self.count { return Err(PhysPointerError::IndexOutOfBounds(count, self.count)); } - let dst = match unsafe { - self.map_and_get_ptr( + let guard = unsafe { + self.map_and_get_ptr_guard( count, core::mem::size_of::(), PhysPageMapPermissions::READ | PhysPageMapPermissions::WRITE, + )? + }; + // Fallible: another core may unmap this page concurrently. 
+ let result = unsafe { + litebox::mm::exception_table::memcpy_fallible( + guard.ptr.cast::(), + core::ptr::from_ref(&value).cast::(), + guard.size, ) - } { - Ok(ptr) => ptr, - Err(e) => { - let _ = unsafe { self.unmap() }; - return Err(e); - } }; - if (dst as usize).is_multiple_of(core::mem::align_of::()) { - unsafe { core::ptr::write(dst, value) }; - } else { - unsafe { core::ptr::write_unaligned(dst, value) }; - } - let _ = unsafe { self.unmap() }; + debug_assert!(result.is_ok(), "fault writing to mapped physical page"); + result.map_err(|_| PhysPointerError::CopyFailed)?; Ok(()) } @@ -320,38 +296,28 @@ impl PhysMutPtr { { return Err(PhysPointerError::IndexOutOfBounds(count, self.count)); } - let dst = match unsafe { - self.map_and_get_ptr( + let guard = unsafe { + self.map_and_get_ptr_guard( count, core::mem::size_of_val(values), PhysPageMapPermissions::READ | PhysPageMapPermissions::WRITE, + )? + }; + // Fallible: another core may unmap this page concurrently. + let result = unsafe { + litebox::mm::exception_table::memcpy_fallible( + guard.ptr.cast::(), + values.as_ptr().cast::(), + guard.size, ) - } { - Ok(ptr) => ptr, - Err(e) => { - let _ = unsafe { self.unmap() }; - return Err(e); - } }; - if (dst as usize).is_multiple_of(core::mem::align_of::()) { - unsafe { - core::ptr::copy_nonoverlapping(values.as_ptr(), dst, values.len()); - } - } else { - unsafe { - core::ptr::copy_nonoverlapping( - values.as_ptr().cast::(), - dst.cast::(), - core::mem::size_of_val(values), - ); - } - } - let _ = unsafe { self.unmap() }; + debug_assert!(result.is_ok(), "fault writing to mapped physical page"); + result.map_err(|_| PhysPointerError::CopyFailed)?; Ok(()) } - /// This is a helper function to map physical pages and get a pointer to the requested - /// data element at a given index. + /// This function maps physical pages for the requested data element at a given + /// index and returns a guard that unmaps on drop. /// /// It bridges element-level access (used by `read_at_offset`, `write_at_offset`, etc.) /// with page-level mapping. It determines which physical pages contain the requested @@ -364,13 +330,14 @@ impl PhysMutPtr { /// /// # Safety /// - /// Same safety requirements as `map_range`. - unsafe fn map_and_get_ptr( + /// Same as [`Self::map_range`]. The returned guard borrows `self` mutably, ensuring + /// the mapping is released when the guard goes out of scope. + unsafe fn map_and_get_ptr_guard( &mut self, count: usize, size: usize, perms: PhysPageMapPermissions, - ) -> Result<*mut T, PhysPointerError> { + ) -> Result, PhysPointerError> { let skip = self .offset .checked_add( @@ -388,7 +355,13 @@ impl PhysMutPtr { .map_info .as_ref() .ok_or(PhysPointerError::NoMappingInfo)?; - Ok(map_info.base.wrapping_add(skip % ALIGN).cast::()) + let ptr = map_info.base.wrapping_add(skip % ALIGN).cast::(); + let _ = map_info; + Ok(MappedGuard { + owner: self, + ptr, + size, + }) } /// Map the physical pages from `start` to `end` indexes. @@ -443,13 +416,28 @@ impl PhysMutPtr { } } -/// Ensures physical pages are unmapped when `PhysMutPtr` goes out of scope. +/// RAII guard that unmaps physical pages when dropped. /// -/// This type is designed for single-use access: create, read/write once, then drop. -/// The `Drop` implementation guarantees that mapped pages are always released, -/// preventing resource leaks and adhering to the "minimize persistent mapping" -/// security principle. Errors during unmapping are silently ignored since we -/// cannot propagate errors from `drop`. 
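From a caller's perspective, the rewritten accessors now surface a fault during the copy as `PhysPointerError::CopyFailed` instead of taking an unrecoverable exception. A hedged usage sketch follows; the `<T, ALIGN>` parameter list and the exact accessor signatures are assumed from the surrounding code, not spelled out in this hunk.

```rust
// Illustrative caller-side handling; signatures assumed as described above.
fn bump_counter<const ALIGN: usize>(
    ptr: &mut PhysMutPtr<u64, ALIGN>,
) -> Result<(), PhysPointerError> {
    // Either call fails with CopyFailed (rather than faulting) if another
    // core unmaps the page between mapping and the copy.
    let current: alloc::boxed::Box<u64> = ptr.read_at_offset(0)?;
    ptr.write_at_offset(0, *current + 1)
}
```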
+/// Created by `map_and_get_ptr_guard`. Holds a mutable borrow on the parent +/// `PhysMutPtr` and provides the mapped base pointer for the duration of the mapping. +struct MappedGuard<'a, T: Clone, const ALIGN: usize> { + owner: &'a mut PhysMutPtr, + ptr: *mut T, + size: usize, +} + +impl Drop for MappedGuard<'_, T, ALIGN> { + fn drop(&mut self) { + // SAFETY: The platform is expected to handle unmapping safely, including + // the case where pages were never mapped (returns Unmapped error, ignored). + let result = unsafe { self.owner.unmap() }; + debug_assert!( + result.is_ok() || matches!(result, Err(PhysPointerError::Unmapped(_))), + "unexpected error during unmap in drop: {result:?}", + ); + } +} + impl Drop for PhysMutPtr { fn drop(&mut self) { // SAFETY: The platform is expected to handle unmapping safely, including @@ -545,13 +533,6 @@ impl PhysConstPtr { } } -/// See [`Drop`] implementation for [`PhysMutPtr`] for details. -impl Drop for PhysConstPtr { - fn drop(&mut self) { - let _ = unsafe { self.inner.unmap() }; - } -} - impl core::fmt::Debug for PhysConstPtr { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.debug_struct("PhysConstPtr") diff --git a/litebox_shim_optee/src/session.rs b/litebox_shim_optee/src/session.rs index e6bc6eea0..86518a373 100644 --- a/litebox_shim_optee/src/session.rs +++ b/litebox_shim_optee/src/session.rs @@ -119,7 +119,7 @@ pub struct TaInstance { pub task_page_table_id: usize, } -// SAFETY: The shim is designed to be used in single-threaded contexts per-CPU. +// SAFETY: TaInstance is protected by SpinMutex and try_lock (`SessionEntry`) unsafe impl Send for TaInstance {} unsafe impl Sync for TaInstance {} @@ -274,6 +274,47 @@ pub fn recycle_session_id(session_id: u32) { SessionIdPool::recycle(session_id); } +/// RAII guard that recycles a session ID on drop unless disarmed. +/// +/// Session IDs are allocated before the TA is invoked and only registered on +/// success via [`SessionManager::register_session`]. This guard ensures it is +/// recycled on all error paths before this registration. +pub struct SessionIdGuard { + session_id: Option, +} + +impl SessionIdGuard { + /// Create a new guard that will recycle `session_id` on drop. + pub fn new(session_id: u32) -> Self { + Self { + session_id: Some(session_id), + } + } + + /// Return the guarded session ID, or `None` if already disarmed. + pub fn id(&self) -> Option { + self.session_id + } + + /// Disarm the guard so the session ID is **not** recycled on drop. + /// + /// Call this after the session ID has been successfully registered. + /// Once registered, [`SessionManager::unregister_session`] owns recycling. + /// + /// Returns `None` if the guard was already disarmed. + pub fn disarm(mut self) -> Option { + self.session_id.take() + } +} + +impl Drop for SessionIdGuard { + fn drop(&mut self) { + if let Some(id) = self.session_id { + recycle_session_id(id); + } + } +} + /// Session manager that coordinates session and instance lifecycle. /// /// This provides a unified interface for: @@ -338,9 +379,13 @@ impl SessionManager { .insert(session_id, instance, ta_uuid, ta_flags); } - /// Unregister a session and return its entry. + /// Unregister a session, recycle its session ID, and return the entry. pub fn unregister_session(&self, session_id: u32) -> Option { - self.sessions.remove(session_id) + let entry = self.sessions.remove(session_id); + if entry.is_some() { + recycle_session_id(session_id); + } + entry } /// Remove a single-instance TA from the cache. 
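For reference, the intended use of `SessionIdGuard` (mirroring `open_session_single_instance` earlier in this diff) is: allocate, keep the guard armed while the TA is loaded and its entry point runs, then disarm only after `register_session` succeeds. A minimal sketch; `some_fallible_setup` is a hypothetical placeholder for that work.

```rust
// Minimal illustration of the guard's drop/disarm semantics.
fn allocate_guarded() -> Option<u32> {
    let guard = SessionIdGuard::new(allocate_session_id()?);

    // Any early return from here on drops `guard`, which recycles the ID
    // automatically via recycle_session_id.
    if !some_fallible_setup() {
        return None; // ID recycled by Drop
    }

    // Success: keep the ID. From this point on, unregister_session (which
    // now recycles the ID itself) owns the cleanup.
    guard.disarm()
}
```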
diff --git a/litebox_shim_optee/src/syscalls/ldelf.rs b/litebox_shim_optee/src/syscalls/ldelf.rs
index 8b5ac945a..72f6a5ab4 100644
--- a/litebox_shim_optee/src/syscalls/ldelf.rs
+++ b/litebox_shim_optee/src/syscalls/ldelf.rs
@@ -57,14 +57,10 @@ impl Task {
         if addr.checked_add(total_size).is_none() {
             return Err(TeeResult::BadParameters);
         }
-        // `sys_map_zi` always creates read/writeable mapping
-        // Use MAP_POPULATE to ensure pages are allocated immediately (required for platforms
-        // that don't support demand paging, e.g., LVBS).
+        // `sys_map_zi` always creates read/writeable mapping.
         //
         // We map with PROT_READ_WRITE first, then mprotect padding regions to PROT_NONE.
-        // This is because our mmap with MAP_POPULATE and PROT_NONE create pages without
-        // USER_ACCESSIBLE bit, making them inaccessible even to mprotect.
-        let mut flags = MapFlags::MAP_PRIVATE | MapFlags::MAP_ANONYMOUS | MapFlags::MAP_POPULATE;
+        let mut flags = MapFlags::MAP_PRIVATE | MapFlags::MAP_ANONYMOUS;
         if addr != 0 {
             flags |= MapFlags::MAP_FIXED;
         }
@@ -187,13 +183,9 @@ impl Task {
         if addr.checked_add(total_size).is_none() {
             return Err(TeeResult::BadParameters);
         }
-        // Use MAP_POPULATE to ensure pages are allocated immediately (required for platforms
-        // that don't support demand paging, e.g., LVBS).
-        //
         // We map with PROT_READ_WRITE first, then mprotect padding regions to PROT_NONE as
         // explained in `sys_map_zi`.
-        let mut flags_internal =
-            MapFlags::MAP_PRIVATE | MapFlags::MAP_ANONYMOUS | MapFlags::MAP_POPULATE;
+        let mut flags_internal = MapFlags::MAP_PRIVATE | MapFlags::MAP_ANONYMOUS;
         if addr != 0 {
             flags_internal |= MapFlags::MAP_FIXED;
         }
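A recurring theme above (`HekiPatch`, `HekiPatchInfo`, `ModuleSignature`, `Kimage`, `copy_heki_patch_from_vtl0`) is replacing `MaybeUninit` plus `copy_nonoverlapping` with zerocopy. A condensed sketch of the two patterns, assuming the zerocopy 0.8-style API used in this diff (`read_from_bytes`, `new_zeroed`, `as_mut_bytes`); the `Example` struct and the `copy_from_source` callback are illustrative.

```rust
use zerocopy::{FromBytes, FromZeros, Immutable, IntoBytes, KnownLayout};

// Illustrative struct; the derives match the ones added to HekiPatch above.
// Explicit padding keeps the layout free of implicit padding bytes, which
// IntoBytes requires.
#[derive(FromBytes, IntoBytes, Immutable, KnownLayout)]
#[repr(C)]
struct Example {
    pa: u64,
    size: u32,
    _padding: u32,
}

/// Parse from an untrusted byte slice: the length is checked for us,
/// with no `unsafe` and no `assume_init`.
fn parse(bytes: &[u8]) -> Option<Example> {
    Example::read_from_bytes(bytes).ok()
}

/// Fill a zeroed value through its byte view, mirroring
/// `copy_heki_patch_from_vtl0` above. `copy_from_source` is a stand-in
/// for something like `copy_slice_from_vtl0_phys`.
fn fill(copy_from_source: impl Fn(&mut [u8]) -> bool) -> Option<Example> {
    let mut value = Example::new_zeroed();
    copy_from_source(value.as_mut_bytes()).then_some(value)
}
```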