diff --git a/lib/propolis/src/vmm/mem.rs b/lib/propolis/src/vmm/mem.rs
index f5ba1534b..c1e07ae89 100644
--- a/lib/propolis/src/vmm/mem.rs
+++ b/lib/propolis/src/vmm/mem.rs
@@ -754,6 +754,28 @@ pub trait MappingExt {
     fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize>;
 }
 
+// Gross hack alert: since the mappings below are memory regions backed by
+// segvmm_ops, `zvol_{read,write}` and similar will end up contending on
+// `svmd->svmd_lock`. Instead, as long as the I/O is small enough that we can
+// tolerate a copy, bounce it between guest memory and the Propolis heap. The
+// segment backing Propolis' heap has an `as_page{,un}lock` impl that avoids
+// the more expensive/contentious `as_fault()` fallback.
+//
+// This is an optimization until stlouis#871 gets things sorted out, at which
+// point copying through the heap should be strictly worse than using the
+// requested mappings directly.
+//
+// Past this limit, either fall back to using the iovecs directly (and trust
+// the kernel to handle bizarre vecs) or return an error. This is an
+// especially gross hack because we could/should just set MDTS, which more
+// naturally communicates to the guest the largest I/O we're willing to
+// handle (and gives it an out for too-large I/Os).
+//
+// The amount of memory used for temporary buffers is bounded by the number
+// of worker threads across all file-backed disks, times this threshold. With
+// the default of 8 worker threads, that works out to up to 128 MiB of
+// buffers per disk as of this writing.
+const MAPPING_IO_LIMIT_BYTES: usize = 16 * crate::common::MB;
+
 impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
     fn preadv(&self, fd: RawFd, offset: i64) -> Result<usize> {
         if !self
@@ -767,23 +789,91 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
             ));
         }
 
-        let iov = self
-            .as_ref()
-            .iter()
-            .map(|mapping| iovec {
-                iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
-                iov_len: mapping.len,
-            })
-            .collect::<Vec<_>>();
-
-        let read = unsafe {
-            libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
-        };
-        if read == -1 {
-            return Err(Error::last_os_error());
+        let mut total_capacity: usize = 0;
+        for mapping in self.as_ref().iter() {
+            total_capacity =
+                total_capacity.checked_add(mapping.len).ok_or_else(|| {
+                    Error::new(
+                        ErrorKind::InvalidInput,
+                        "Total mapping too large",
+                    )
+                })?;
         }
 
-        Ok(read as usize)
+        // Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
+        if total_capacity <= MAPPING_IO_LIMIT_BYTES {
+            // If we're ever motivated to avoid the zero-fill via
+            // `Layout::from_size_align` + `GlobalAlloc::alloc`, we should
+            // probably avoid this gross hack entirely (see the comment on
+            // `MAPPING_IO_LIMIT_BYTES`).
+            let mut buf = vec![0; total_capacity];
+
+            let iov = [iovec {
+                iov_base: buf.as_mut_ptr() as *mut libc::c_void,
+                iov_len: buf.len(),
+            }];
+
+            let read = unsafe {
+                libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
+            };
+            if read == -1 {
+                return Err(Error::last_os_error());
+            }
+            let read: usize = read.try_into().expect("read is non-negative");
+
+            // Copy `read` bytes back out into the guest mappings and return.
+            let mut remaining = &mut buf[..read];
+            for mapping in self.as_ref().iter() {
+                let to_copy = std::cmp::min(remaining.len(), mapping.len);
+
+                // Safety: guest physical memory is READ|WRITE, and won't
+                // become unmapped as long as we hold a Mapping, which `self`
+                // implies.
+                //
+                // The guest may be concurrently modifying this memory, and we
+                // may be concurrently reading or writing it (if it has been
+                // submitted for a read or write on another thread, for
+                // example), but `copy_nonoverlapping` has no compunctions
+                // about concurrent mutation.
+                unsafe {
+                    copy_nonoverlapping::<u8>(
+                        remaining.as_ptr(),
+                        mapping.ptr.as_ptr(),
+                        to_copy,
+                    );
+                }
+
+                remaining = remaining.split_at_mut(to_copy).1;
+
+                if remaining.is_empty() {
+                    // Either we're at the last mapping and we're finished
+                    // copying back into the guest, or `preadv` did a short
+                    // read.
+                    break;
+                }
+            }
+
+            // We should never read more than the guest mappings can hold.
+            assert_eq!(remaining.len(), 0);
+
+            Ok(read)
+        } else {
+            let iov = self
+                .as_ref()
+                .iter()
+                .map(|mapping| iovec {
+                    iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
+                    iov_len: mapping.len,
+                })
+                .collect::<Vec<_>>();
+
+            let read = unsafe {
+                libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
+            };
+            if read == -1 {
+                return Err(Error::last_os_error());
+            }
+            let read: usize = read.try_into().expect("read is non-negative");
+            Ok(read)
+        }
     }
 
     fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize> {
@@ -798,18 +888,79 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
             ));
         }
 
-        let iov = self
-            .as_ref()
-            .iter()
-            .map(|mapping| iovec {
-                iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
-                iov_len: mapping.len,
-            })
-            .collect::<Vec<_>>();
+        let mut total_capacity: usize = 0;
+        for mapping in self.as_ref().iter() {
+            total_capacity =
+                total_capacity.checked_add(mapping.len).ok_or_else(|| {
+                    Error::new(
+                        ErrorKind::InvalidInput,
+                        "Total mapping too large",
+                    )
+                })?;
+        }
 
-        let written = unsafe {
-            libc::pwritev(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
+        // Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
+        let written = if total_capacity <= MAPPING_IO_LIMIT_BYTES {
+            // If we're ever motivated to avoid the zero-fill via
+            // `Layout::from_size_align` + `GlobalAlloc::alloc`, we should
+            // probably avoid this gross hack entirely (see the comment on
+            // `MAPPING_IO_LIMIT_BYTES`).
+            let mut buf = vec![0; total_capacity];
+
+            let mut remaining = buf.as_mut_slice();
+            for mapping in self.as_ref().iter() {
+                // Safety: guest physical memory is READ|WRITE, and won't
+                // become unmapped as long as we hold a Mapping, which `self`
+                // implies.
+                //
+                // The guest may be concurrently modifying this memory, and we
+                // may be concurrently reading or writing it (if it has been
+                // submitted for a read or write on another thread, for
+                // example), but `copy_nonoverlapping` has no compunctions
+                // about concurrent mutation.
+                unsafe {
+                    copy_nonoverlapping::<u8>(
+                        mapping.ptr.as_ptr() as *const u8,
+                        remaining.as_mut_ptr(),
+                        mapping.len,
+                    );
+                }
+
+                remaining = remaining.split_at_mut(mapping.len).1;
+            }
+
+            let iovs = [iovec {
+                iov_base: buf.as_mut_ptr() as *mut libc::c_void,
+                iov_len: buf.len(),
+            }];
+
+            unsafe {
+                libc::pwritev(
+                    fd,
+                    iovs.as_ptr(),
+                    iovs.len() as libc::c_int,
+                    offset,
+                )
+            }
+        } else {
+            let iovs = self
+                .as_ref()
+                .iter()
+                .map(|mapping| iovec {
+                    iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
+                    iov_len: mapping.len,
+                })
+                .collect::<Vec<_>>();
+
+            unsafe {
+                libc::pwritev(
+                    fd,
+                    iovs.as_ptr(),
+                    iovs.len() as libc::c_int,
+                    offset,
+                )
+            }
         };
+
         if written == -1 {
             return Err(Error::last_os_error());
         }
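
(Illustration, not part of the patch: the subtle piece above is the copy-back
loop in the small-I/O preadv path, which scatters one contiguous bounce buffer
back across a list of guest mappings and must stop early on a short read. Below
is a minimal safe-Rust sketch of the same scheme; `scatter_into` and `segs` are
hypothetical names, and plain `Vec<u8>` segments stand in for the raw
`SubMapping` regions the patch fills with `copy_nonoverlapping`.)

    use std::cmp::min;

    /// Scatter the bytes of `src` across `segs` in order, returning how many
    /// bytes were copied. Segments past the end of `src` are left untouched,
    /// mirroring how a short preadv() must not clobber guest memory beyond
    /// what was actually read.
    fn scatter_into(mut src: &[u8], segs: &mut [Vec<u8>]) -> usize {
        let mut copied = 0;
        for seg in segs.iter_mut() {
            let to_copy = min(src.len(), seg.len());
            seg[..to_copy].copy_from_slice(&src[..to_copy]);
            copied += to_copy;
            src = &src[to_copy..];
            if src.is_empty() {
                // Either every segment is filled, or this was a short read.
                break;
            }
        }
        copied
    }

    fn main() {
        // A 5-byte read landing in two 4-byte segments: the second segment
        // is only partially filled.
        let bounce = [1u8, 2, 3, 4, 5];
        let mut segs = vec![vec![0u8; 4], vec![0u8; 4]];
        assert_eq!(scatter_into(&bounce, &mut segs), 5);
        assert_eq!(segs[0], [1, 2, 3, 4]);
        assert_eq!(segs[1], [5, 0, 0, 0]);
    }

Copying `min(src.len(), seg.len())` rather than `seg.len()` is what the patch's
`to_copy` accomplishes: a full `mapping.len` copy after a short read would write
stale bounce-buffer bytes (zeros here) into the tail of the guest's buffers.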