From 62fba950ec0e298c81a40d660c47321d4d2b2c92 Mon Sep 17 00:00:00 2001
From: iximeow
Date: Wed, 10 Dec 2025 19:41:33 +0000
Subject: [PATCH 1/2] block/file: do pread/pwrite from Propolis heap instead
 of VM memory

As described in the comment on `MAPPING_IO_LIMIT_BYTES`, this is a gross
hack: if VM memory is used as the iovec for `p{read,write}v` to a block
device or zvol, we'll end up contending on `svmd->svmd_lock` pretty
heavily in higher-IOPS situations. This might arise when doing fewer,
larger I/Os to zvols or block devices as well, since we'll be forwarding
each of the 4k entries in guest PRP lists as a distinct iovec, but we
haven't tested that. Either way, operating from the Propolis heap avoids
the issue for now.

One could imagine this "should" be a configurable behavior on file block
backends, since many files (like disk images in propolis-standalone)
would not need this. The hope is that we can fix this in the kernel so
the hack is no longer needed for zvols or block devices, rather than
leaning further into properly supporting this copy hack.
---
 lib/propolis/src/vmm/mem.rs | 186 +++++++++++++++++++++++++++++++-----
 1 file changed, 161 insertions(+), 25 deletions(-)

diff --git a/lib/propolis/src/vmm/mem.rs b/lib/propolis/src/vmm/mem.rs
index f5ba1534b..881ede057 100644
--- a/lib/propolis/src/vmm/mem.rs
+++ b/lib/propolis/src/vmm/mem.rs
@@ -754,6 +754,28 @@ pub trait MappingExt {
     fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize>;
 }
 
+// Gross hack alert: since the mappings below are memory regions backed by
+// segvmm_ops, `zvol_{read,write}` and similar will end up contending on
+// `svmd->svmd_lock`. Instead, as long as the I/O is small enough that we'll
+// tolerate it, copy from guest memory to the Propolis heap. The segment
+// backing Propolis' heap has an `as_page{,un}lock` impl that avoids the more
+// expensive/contentious `as_fault()` fallback.
+//
+// This is an optimization until stlouis#871 can get things sorted, at
+// which point it should be strictly worse than directly using the
+// requested mappings.
+//
+// Beyond this limit, either fall back to using iovecs directly (and trust the
+// kernel to handle bizarre vecs) or error. This is an especially gross hack
+// because we could/should just set MDTS, which more naturally communicates to
+// the guest the largest I/O we're willing to handle (and gives an out for
+// too-large I/Os).
+//
+// The amount of memory used for temporary buffers is bounded by the number of
+// worker threads across all file-backed disks, times this threshold. It works
+// out to up to 128 MiB (8 worker threads) of buffers per disk by default as
+// of this writing.
+const MAPPING_IO_LIMIT_BYTES: usize = 16 * crate::common::MB;
+
 impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
     fn preadv(&self, fd: RawFd, offset: i64) -> Result<usize> {
         if !self
@@ -767,23 +789,92 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
             ));
         }
 
-        let iov = self
-            .as_ref()
-            .iter()
-            .map(|mapping| iovec {
-                iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
-                iov_len: mapping.len,
-            })
-            .collect::<Vec<_>>();
-
-        let read = unsafe {
-            libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
-        };
-        if read == -1 {
-            return Err(Error::last_os_error());
+        let mut total_capacity = 0;
+        for mapping in self.as_ref().iter() {
+            total_capacity += mapping.len;
         }
 
-        Ok(read as usize)
+        // Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
+        if total_capacity < MAPPING_IO_LIMIT_BYTES {
+            // If we're motivated to avoid the zero-fill via
+            // `Layout::with_size_align` + `GlobalAlloc::alloc`, we should
+            // probably avoid this gross hack entirely (see comment on
+            // MAPPING_IO_LIMIT_BYTES).
+            let mut buf = vec![0; total_capacity];
+
+            let iov = [iovec {
+                iov_base: buf.as_mut_ptr() as *mut libc::c_void,
+                iov_len: buf.len(),
+            }];
+
+            let read = unsafe {
+                libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
+            };
+            if read == -1 {
+                return Err(Error::last_os_error());
+            }
+            let read: usize = read.try_into().expect("read is positive");
+
+            // copy `read` bytes back into the iovecs and return
+            let mut remaining = read;
+            let mut offset = 0;
+            for mapping in self.as_ref().iter() {
+                let to_copy = std::cmp::min(remaining, mapping.len);
+                if to_copy == 0 {
+                    break;
+                }
+
+                // Safety: guest physical memory is READ|WRITE, and won't become
+                // unmapped as long as we hold a Mapping, which `self` implies.
+                //
+                // Sketchy: nothing guarantees the guest is not concurrently
+                // modifying this memory. Also, nothing prevents the guest from
+                // submitting the same ranges as part of another I/O where we
+                // might be doing this same operation on another thread on the
+                // same ptr/len pair. Because we are just moving u8's, it should
+                // be OK.
+                let guest_slice = unsafe {
+                    std::slice::from_raw_parts_mut(
+                        mapping.ptr.as_ptr(),
+                        mapping.len,
+                    )
+                };
+
+                guest_slice.copy_from_slice(&buf[offset..][..to_copy]);
+
+                remaining -= to_copy;
+                offset += to_copy;
+
+                if remaining == 0 {
+                    // Either we're at the last iov and we're finished copying
+                    // back into the guest, or `preadv` did a short read.
+                    break;
+                }
+            }
+
+            // We should never read more than the guest mappings could hold.
+            assert_eq!(remaining, 0);
+
+            Ok(read)
+        } else {
+            let iov = self
+                .as_ref()
+                .iter()
+                .map(|mapping| iovec {
+                    iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
+                    iov_len: mapping.len,
+                })
+                .collect::<Vec<_>>();
+
+            let read = unsafe {
+                libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
+            };
+            if read == -1 {
+                return Err(Error::last_os_error());
+            }
+            let read: usize = read.try_into().expect("read is positive");
+            Ok(read)
+        }
     }
 
     fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize> {
@@ -798,18 +889,63 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
             ));
         }
 
-        let iov = self
-            .as_ref()
-            .iter()
-            .map(|mapping| iovec {
-                iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
-                iov_len: mapping.len,
-            })
-            .collect::<Vec<_>>();
+        let mut total_capacity = 0;
+        for mapping in self.as_ref().iter() {
+            total_capacity += mapping.len;
+        }
 
-        let written = unsafe {
-            libc::pwritev(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
+        // Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
+        let written = if total_capacity < MAPPING_IO_LIMIT_BYTES {
+            let mut buf = Vec::with_capacity(total_capacity);
+            for mapping in self.as_ref().iter() {
+                // Safety: these pointer/length pairs are into guest physical
+                // memory, which the VM has no control over. Having a `{Sub}Mapping`
+                // is a commitment that the guest memory mapping is valid, and will
+                // remain so until `pwritev` returns. We're going to immediately
+                // read from this slice, but it would be just as valid to pass these
+                // as an iovec directly to pwritev.
+                let s = unsafe {
+                    std::slice::from_raw_parts(
+                        mapping.ptr.as_ptr() as *const u8,
+                        mapping.len,
+                    )
+                };
+                buf.extend_from_slice(s);
+            }
+
+            let iovs = [iovec {
+                iov_base: buf.as_mut_ptr() as *mut libc::c_void,
+                iov_len: buf.len(),
+            }];
+
+            unsafe {
+                libc::pwritev(
+                    fd,
+                    iovs.as_ptr(),
+                    iovs.len() as libc::c_int,
+                    offset,
+                )
+            }
+        } else {
+            let iovs = self
+                .as_ref()
+                .iter()
+                .map(|mapping| iovec {
+                    iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
+                    iov_len: mapping.len,
+                })
+                .collect::<Vec<_>>();
+
+            unsafe {
+                libc::pwritev(
+                    fd,
+                    iovs.as_ptr(),
+                    iovs.len() as libc::c_int,
+                    offset,
+                )
+            }
         };
+
         if written == -1 {
             return Err(Error::last_os_error());
         }

From c2ea48f21b7f8cdd13199ef9b6f2e1a804cad814 Mon Sep 17 00:00:00 2001
From: iximeow
Date: Fri, 12 Dec 2025 21:06:59 +0000
Subject: [PATCH 2/2] review feedback, use differently-fiddly copy

---
 lib/propolis/src/vmm/mem.rs | 95 +++++++++++++++++++++----------------
 1 file changed, 55 insertions(+), 40 deletions(-)

diff --git a/lib/propolis/src/vmm/mem.rs b/lib/propolis/src/vmm/mem.rs
index 881ede057..c1e07ae89 100644
--- a/lib/propolis/src/vmm/mem.rs
+++ b/lib/propolis/src/vmm/mem.rs
@@ -789,13 +789,19 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
             ));
         }
 
-        let mut total_capacity = 0;
+        let mut total_capacity: usize = 0;
         for mapping in self.as_ref().iter() {
-            total_capacity += mapping.len;
+            total_capacity =
+                total_capacity.checked_add(mapping.len).ok_or_else(|| {
+                    Error::new(
+                        ErrorKind::InvalidInput,
+                        "Total mapping too large",
+                    )
+                })?;
         }
 
         // Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
-        if total_capacity < MAPPING_IO_LIMIT_BYTES {
+        if total_capacity <= MAPPING_IO_LIMIT_BYTES {
             // If we're motivated to avoid the zero-fill via
             // `Layout::with_size_align` + `GlobalAlloc::alloc`, we should
             // probably avoid this gross hack entirely (see comment on
@@ -816,36 +822,29 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
             let read: usize = read.try_into().expect("read is positive");
 
             // copy `read` bytes back into the iovecs and return
-            let mut remaining = read;
-            let mut offset = 0;
+            let mut remaining = &mut buf[..read];
             for mapping in self.as_ref().iter() {
-                let to_copy = std::cmp::min(remaining, mapping.len);
-                if to_copy == 0 {
-                    break;
-                }
+                let to_copy = std::cmp::min(remaining.len(), mapping.len);
 
                 // Safety: guest physical memory is READ|WRITE, and won't become
                 // unmapped as long as we hold a Mapping, which `self` implies.
                 //
-                // Sketchy: nothing guarantees the guest is not concurrently
-                // modifying this memory. Also, nothing prevents the guest from
-                // submitting the same ranges as part of another I/O where we
-                // might be doing this same operation on another thread on the
-                // same ptr/len pair. Because we are just moving u8's, it should
-                // be OK.
-                let guest_slice = unsafe {
-                    std::slice::from_raw_parts_mut(
+                // The guest may be concurrently modifying this memory, we may
+                // be concurrently reading or writing this memory (if it is
+                // submitted for a read or write on another thread, for
+                // example), but `copy_nonoverlapping` has no compunctions about
+                // concurrent mutation.
+                unsafe {
+                    copy_nonoverlapping::<u8>(
+                        remaining.as_ptr(),
                         mapping.ptr.as_ptr(),
-                        mapping.len,
-                    )
-                };
-
-                guest_slice.copy_from_slice(&buf[offset..][..to_copy]);
+                        to_copy,
+                    );
+                }
 
-                remaining -= to_copy;
-                offset += to_copy;
+                remaining = remaining.split_at_mut(to_copy).1;
 
-                if remaining == 0 {
+                if remaining.len() == 0 {
                     // Either we're at the last iov and we're finished copying
                     // back into the guest, or `preadv` did a short read.
                     break;
                 }
@@ -853,7 +852,7 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
             }
 
             // We should never read more than the guest mappings could hold.
-            assert_eq!(remaining, 0);
+            assert_eq!(remaining.len(), 0);
 
             Ok(read)
         } else {
@@ -889,28 +888,44 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
             ));
         }
 
-        let mut total_capacity = 0;
+        let mut total_capacity: usize = 0;
         for mapping in self.as_ref().iter() {
-            total_capacity += mapping.len;
+            total_capacity =
+                total_capacity.checked_add(mapping.len).ok_or_else(|| {
+                    Error::new(
+                        ErrorKind::InvalidInput,
+                        "Total mapping too large",
+                    )
+                })?;
         }
 
         // Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
-        let written = if total_capacity < MAPPING_IO_LIMIT_BYTES {
-            let mut buf = Vec::with_capacity(total_capacity);
+        let written = if total_capacity <= MAPPING_IO_LIMIT_BYTES {
+            // If we're motivated to avoid the zero-fill via
+            // `Layout::with_size_align` + `GlobalAlloc::alloc`, we should
+            // probably avoid this gross hack entirely (see comment on
+            // MAPPING_IO_LIMIT_BYTES).
+            let mut buf = vec![0; total_capacity];
+
+            let mut remaining = buf.as_mut_slice();
             for mapping in self.as_ref().iter() {
-                // Safety: these pointer/length pairs are into guest physical
-                // memory, which the VM has no control over. Having a `{Sub}Mapping`
-                // is a commitment that the guest memory mapping is valid, and will
-                // remain so until `pwritev` returns. We're going to immediately
-                // read from this slice, but it would be just as valid to pass these
-                // as an iovec directly to pwritev.
-                let s = unsafe {
-                    std::slice::from_raw_parts(
+                // Safety: guest physical memory is READ|WRITE, and won't become
+                // unmapped as long as we hold a Mapping, which `self` implies.
+                //
+                // The guest may be concurrently modifying this memory, we may
+                // be concurrently reading or writing this memory (if it is
+                // submitted for a read or write on another thread, for
+                // example), but `copy_nonoverlapping` has no compunctions about
+                // concurrent mutation.
+                unsafe {
+                    copy_nonoverlapping::<u8>(
                         mapping.ptr.as_ptr() as *const u8,
+                        remaining.as_mut_ptr(),
                         mapping.len,
-                    )
-                };
-                buf.extend_from_slice(s);
+                    );
+                }
+
+                remaining = remaining.split_at_mut(mapping.len).1;
             }
 
             let iovs = [iovec {
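
For readers who want the shape of the change without the diff plumbing: both
patches implement the same bounce-buffer pattern around `p{read,write}v`.
Below is a minimal standalone sketch of the write side, assuming the `libc`
crate. `Segment`, `IO_LIMIT_BYTES`, and `pwritev_bounced` are hypothetical
stand-ins for `SubMapping`, `MAPPING_IO_LIMIT_BYTES`, and
`MappingExt::pwritev`; this is not Propolis code.

    use std::io::{Error, Result};
    use std::os::unix::io::RawFd;

    /// Illustrative stand-in for a guest-memory region.
    struct Segment {
        ptr: *mut u8,
        len: usize,
    }

    /// Illustrative stand-in for `MAPPING_IO_LIMIT_BYTES`.
    const IO_LIMIT_BYTES: usize = 16 * 1024 * 1024;

    /// Gather `segments` into one heap buffer and issue a single-iovec
    /// `pwritev`, falling back to per-segment iovecs for large totals.
    fn pwritev_bounced(
        fd: RawFd,
        segments: &[Segment],
        offset: i64,
    ) -> Result<usize> {
        let total: usize = segments.iter().map(|s| s.len).sum();

        let written = if total <= IO_LIMIT_BYTES {
            // Bounce: stage every segment in our own heap allocation so
            // the kernel only ever sees one iovec into our memory.
            let mut buf = Vec::with_capacity(total);
            for s in segments {
                // Assumed: each (ptr, len) is valid for reads for the
                // duration of this call.
                let src = unsafe { std::slice::from_raw_parts(s.ptr, s.len) };
                buf.extend_from_slice(src);
            }
            let iov = libc::iovec {
                iov_base: buf.as_mut_ptr() as *mut libc::c_void,
                iov_len: buf.len(),
            };
            unsafe { libc::pwritev(fd, &iov, 1, offset) }
        } else {
            // Direct: hand the kernel one iovec per segment.
            let iovs: Vec<libc::iovec> = segments
                .iter()
                .map(|s| libc::iovec {
                    iov_base: s.ptr as *mut libc::c_void,
                    iov_len: s.len,
                })
                .collect();
            unsafe {
                libc::pwritev(
                    fd,
                    iovs.as_ptr(),
                    iovs.len() as libc::c_int,
                    offset,
                )
            }
        };

        if written == -1 {
            return Err(Error::last_os_error());
        }
        Ok(written as usize)
    }

The read side is the mirror image: one `preadv` into the staging buffer,
then a scatter loop copying back out to each segment, with short reads
handled as in the patches above.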