diff --git a/kernel/src/driver/block/cache/mod.rs b/kernel/src/driver/block/cache/mod.rs
index 1e2f87fe7..cf516b5d4 100644
--- a/kernel/src/driver/block/cache/mod.rs
+++ b/kernel/src/driver/block/cache/mod.rs
@@ -9,7 +9,7 @@ pub const BLOCK_SIZE_LOG: usize = 9;
 ///块大小这里固定为512
 pub const BLOCK_SIZE: usize = 1 << BLOCK_SIZE_LOG;
 ///这里规定Cache的threshold大小,单位为:MB
-pub const CACHE_THRESHOLD: usize = 64;
+pub const CACHE_THRESHOLD: usize = 2;
 
 pub enum BlockCacheError {
     BlockSizeError,
diff --git a/kernel/src/filesystem/vfs/iov.rs b/kernel/src/filesystem/vfs/iov.rs
index 5d095f1c2..b86b7485e 100644
--- a/kernel/src/filesystem/vfs/iov.rs
+++ b/kernel/src/filesystem/vfs/iov.rs
@@ -2,9 +2,15 @@ use alloc::vec::Vec;
 use system_error::SystemError;
 
 use crate::{
+    mm::verify_area,
     mm::VirtAddr,
-    syscall::user_access::{user_accessible_len, UserBufferReader, UserBufferWriter},
+    syscall::user_access::{
+        copy_from_user_protected, user_accessible_len, UserBufferReader, UserBufferWriter,
+    },
 };
+
+/// Linux UIO_MAXIOV: maximum number of iovec structures per syscall
+const IOV_MAX: usize = 1024;
 
 #[repr(C)]
 #[derive(Debug, Clone, Copy)]
 pub struct IoVec {
@@ -25,7 +31,15 @@ impl IoVecs {
     /// 获取IoVecs中所有缓冲区的总长度
     #[inline(never)]
     pub fn total_len(&self) -> usize {
-        self.0.iter().map(|x| x.iov_len).sum()
+        self.0
+            .iter()
+            .try_fold(0usize, |acc, x| acc.checked_add(x.iov_len))
+            .unwrap_or(usize::MAX)
+    }
+
+    /// Borrow the validated iovec list.
+    pub fn iovs(&self) -> &[IoVec] {
+        &self.0
     }
 
     /// Constructs `IoVecs` from an array of `IoVec` in userspace.
@@ -34,7 +48,7 @@ impl IoVecs {
     ///
     /// * `iov` - Pointer to the array of `IoVec` in userspace
     /// * `iovcnt` - Number of `IoVec` elements in the array
-    /// * `readv` - Whether this is for the `readv` syscall (currently unused)
+    /// * `readv` - Whether this is for the `readv` syscall (true = check write permission)
    ///
    /// # Returns
    ///
@@ -52,23 +66,53 @@ impl IoVecs {
         iovcnt: usize,
         _readv: bool,
     ) -> Result<Self, SystemError> {
-        let iovs_reader = UserBufferReader::new(iov, iovcnt * core::mem::size_of::<IoVec>(), true)?;
+        // Linux: iovcnt must be > 0 and not unreasonably large.
+        if iovcnt == 0 || iovcnt > IOV_MAX {
+            return Err(SystemError::EINVAL);
+        }
+
+        let elem_size = core::mem::size_of::<IoVec>();
+        let total_bytes = iovcnt.checked_mul(elem_size).ok_or(SystemError::EINVAL)?;
+
+        // Only does range check (user range) here.
+        let iovs_reader = UserBufferReader::new(iov, total_bytes, true)?;
+
+        // Use exception-table protected copy to avoid kernel faults when userspace pointer is bad.
+        let iovs_buf = iovs_reader.buffer_protected(0)?;
 
-        // 将用户空间的IoVec转换为引用(注意:这里的引用是静态的,因为用户空间的IoVec不会被释放)
-        let iovs = iovs_reader.buffer::<IoVec>(0)?;
+        let mut slices: Vec<IoVec> = Vec::with_capacity(iovcnt);
+        for idx in 0..iovcnt {
+            let offset = idx * elem_size;
+            let one: IoVec = iovs_buf.read_one(offset)?;
 
-        let mut slices: Vec<IoVec> = Vec::with_capacity(iovs.len());
+            // Linux behavior: always validate iov_base is a user pointer, even when iov_len==0.
+            // This matches Linux access_ok(addr, 0) behavior and is required by gVisor tests.
+            let base = VirtAddr::new(one.iov_base as usize);
 
-        for iov in iovs.iter() {
-            if iov.iov_len == 0 {
+            // Only do lightweight address range check (like Linux's access_ok).
+            // This checks that the address range is within user space limits,
+            // but does NOT traverse page tables or check actual mappings.
+            // Actual page mapping/permission checks happen during copy operations.
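+            // Illustrative example (hypothetical addresses, assuming the usual user/kernel
+            // split): verify_area(VirtAddr::new(0x1000), 0) passes even though nothing is
+            // mapped there yet, while a base inside the kernel half of the address space
+            // fails with EFAULT regardless of iov_len.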
+            verify_area(base, one.iov_len)?;
+
+            // Skip zero-length iovecs after validation
+            if one.iov_len == 0 {
                 continue;
             }
 
-            let _ = UserBufferWriter::new(iov.iov_base, iov.iov_len, true)?;
-            slices.push(*iov);
+            // If the first byte isn't writable/readable at all, fail early with EFAULT.
+            // Partial accessibility is handled by the syscall implementation.
+            // Note: user_accessible_len returns 0 for null pointers (addr.is_null() check),
+            // so null pointer detection is covered here.
+            let accessible = user_accessible_len(base, one.iov_len, _readv /* check_write */);
+            if accessible == 0 {
+                return Err(SystemError::EFAULT);
+            }
+
+            slices.push(one);
         }
 
-        return Ok(Self(slices));
+        Ok(Self(slices))
     }
 
     /// Aggregates data from all IoVecs into a single buffer.
@@ -87,35 +131,36 @@ impl IoVecs {
     /// read at all, `Err(SystemError::EFAULT)` is returned.
     pub fn gather(&self) -> Result<Vec<u8>, SystemError> {
         let mut buf = Vec::with_capacity(self.total_len());
+        let mut read_any = false;
 
         for iov in self.0.iter() {
+            let base = VirtAddr::new(iov.iov_base as usize);
             // 检查从 iov_base 开始有多少 bytes 在 vma 内部且实际可以访问
-            let accessible =
-                user_accessible_len(VirtAddr::new(iov.iov_base as usize), iov.iov_len, false);
-
-            // log::debug!(
-            //     "iov is {:?}. iov_len: {}; accessible len:{}",
-            //     iov,
-            //     iov.iov_len,
-            //     accessible
-            // );
+            let accessible = user_accessible_len(base, iov.iov_len, false /* read */);
 
             // 如果一个字节都不能访问
             if accessible == 0 {
-                if buf.is_empty() {
-                    // log::error!(
-                    //     "The first iov is empty, returning EFAULT. iov shape: {:?}",
-                    //     iov
-                    // );
+                if !read_any {
                     return Err(SystemError::EFAULT);
                 }
                 return Ok(buf);
             }
 
-            // 复制可访问的部分
-            unsafe {
-                let src = core::slice::from_raw_parts(iov.iov_base as *const u8, accessible);
-                buf.extend_from_slice(src);
+            // Use an exception-protected copy, consistent with scatter()
+            let mut chunk = alloc::vec![0u8; accessible];
+            match unsafe { copy_from_user_protected(&mut chunk, base) } {
+                Ok(_) => {
+                    buf.extend_from_slice(&chunk);
+                    read_any = true;
+                }
+                Err(SystemError::EFAULT) => {
+                    // Linux: return partial data if any bytes were copied.
+                    if !read_any {
+                        return Err(SystemError::EFAULT);
+                    }
+                    return Ok(buf);
+                }
+                Err(e) => return Err(e),
             }
 
             // 如果没有读取完整个 iov,说明遇到了不可访问的区域
@@ -144,21 +189,43 @@ impl IoVecs {
     /// let iovecs = IoVecs::from_user(/* ... */)?;
     /// iovecs.scatter(&[1, 2, 3, 4, 5]);
     /// ```
-    pub fn scatter(&self, data: &[u8]) {
-        let mut data: &[u8] = data;
-        for slice in self.0.iter() {
-            let len = core::cmp::min(slice.iov_len, data.len());
-            if len == 0 {
+    pub fn scatter(&self, data: &[u8]) -> Result<(), SystemError> {
+        let mut remaining = data;
+        let mut written_any = false;
+
+        for iov in self.0.iter() {
+            if remaining.is_empty() {
+                break;
+            }
+
+            let want = core::cmp::min(iov.iov_len, remaining.len());
+            if want == 0 {
                 continue;
             }
 
-            let mut buf_writer =
-                UserBufferWriter::new(slice.iov_base, slice.iov_len, true).unwrap();
-            let slice = buf_writer.buffer::<u8>(0).unwrap();
+            let base = VirtAddr::new(iov.iov_base as usize);
+            let accessible = user_accessible_len(base, want, true /*write*/);
+            if accessible == 0 {
+                if !written_any {
+                    return Err(SystemError::EFAULT);
+                }
+                break;
+            }
+
+            let mut writer = UserBufferWriter::new(iov.iov_base, accessible, true)?;
+            let mut user_buf = writer.buffer_protected(0)?;
+            user_buf.write_to_user(0, &remaining[..accessible])?;
 
-            slice[..len].copy_from_slice(&data[..len]);
-            data = &data[len..];
+            written_any = true;
+            remaining = &remaining[accessible..];
+
+            if accessible < want {
+                // Hit an unmapped/forbidden region; stop as Linux does.
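+                // Illustrative example (hypothetical layout): for an 8 KiB iovec whose
+                // second page is unmapped, only the first 4096 bytes are accepted; we
+                // stop here and the caller reports that short count instead of EFAULT.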
+                break;
+            }
         }
+
+        Ok(())
     }
 
     /// Creates a buffer with capacity equal to the total length of all IoVecs.
diff --git a/kernel/src/filesystem/vfs/syscall/sys_preadv.rs b/kernel/src/filesystem/vfs/syscall/sys_preadv.rs
index b7123db5e..77d94dd0d 100644
--- a/kernel/src/filesystem/vfs/syscall/sys_preadv.rs
+++ b/kernel/src/filesystem/vfs/syscall/sys_preadv.rs
@@ -79,7 +79,7 @@ pub fn do_preadv(fd: i32, iovecs: &IoVecs, offset: usize) -> Result Result {  return file.read(buf.len(), buf); }
+
+/// Read into a userspace buffer safely (exception-table protected) and in chunks.
+///
+/// Linux semantics: if a fault happens after some bytes are copied, return the number
+/// of bytes copied instead of -EFAULT.
+fn read_into_user_buffer(fd: i32, user_ptr: *mut u8, len: usize) -> Result<usize, SystemError> {
+    // Userspace side: compute the writable length first, so we never write straight into invalid user pages.
+    let accessible =
+        user_accessible_len(VirtAddr::new(user_ptr as usize), len, true /*write*/);
+    if accessible == 0 {
+        return Err(SystemError::EFAULT);
+    }
+
+    // Keep the kernel-side buffer modest to avoid huge allocations/long critical sections.
+    const CHUNK: usize = 64 * 1024;
+    let mut total = 0usize;
+
+    while total < accessible {
+        let remain = accessible - total;
+        let chunk_len = core::cmp::min(CHUNK, remain);
+
+        let mut kbuf = alloc::vec![0u8; chunk_len];
+        let n = do_read(fd, &mut kbuf[..])?;
+        if n == 0 {
+            break;
+        }
+
+        let dst = VirtAddr::new(user_ptr as usize + total);
+        let write_res = unsafe { copy_to_user_protected(dst, &kbuf[..n]) };
+        match write_res {
+            Ok(_) => {
+                total += n;
+            }
+            Err(SystemError::EFAULT) => {
+                if total == 0 {
+                    return Err(SystemError::EFAULT);
+                }
+                break;
+            }
+            Err(e) => return Err(e),
+        }
+
+        if n < chunk_len {
+            break;
+        }
+    }
+
+    Ok(total)
+}
diff --git a/kernel/src/filesystem/vfs/syscall/sys_readv.rs b/kernel/src/filesystem/vfs/syscall/sys_readv.rs
index 9a99d6f58..942a87de4 100644
--- a/kernel/src/filesystem/vfs/syscall/sys_readv.rs
+++ b/kernel/src/filesystem/vfs/syscall/sys_readv.rs
@@ -2,10 +2,14 @@ use system_error::SystemError;
 
 use crate::arch::interrupt::TrapFrame;
 use crate::arch::syscall::nr::SYS_READV;
+use crate::arch::MMArch;
 use crate::filesystem::vfs::iov::IoVec;
 use crate::filesystem::vfs::iov::IoVecs;
+use crate::mm::MemoryManagementArch;
+use crate::mm::VirtAddr;
 use crate::syscall::table::FormattedSyscallParam;
 use crate::syscall::table::Syscall;
+use crate::syscall::user_access::{copy_to_user_protected, user_accessible_len};
 use alloc::string::ToString;
 use alloc::vec::Vec;
 
@@ -27,16 +31,90 @@ impl Syscall for SysReadVHandle {
         let iov = Self::iov(args);
         let count = Self::count(args);
 
-        // IoVecs会进行用户态检验
+        // IoVecs performs the userspace validation (including the iov_base check for len == 0 entries).
         let iovecs = unsafe { IoVecs::from_user(iov, count, true) }?;
 
-        let mut data = vec![0; iovecs.total_len()];
+        // Linux: limit per readv() to MAX_RW_COUNT = INT_MAX & ~(PAGE_SIZE-1)
+        let max_rw_count = (i32::MAX as usize) & !(MMArch::PAGE_SIZE - 1);
 
-        let len = do_read(fd, &mut data)?;
+        let mut total_read: usize = 0;
 
-        iovecs.scatter(&data[..len]);
+        // Keep kernel-side buffer modest to avoid huge allocations.
+        // Also used as the granularity for accessibility checks to avoid
+        // traversing huge address ranges at once.
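+        // Worked example (illustrative numbers): a single 1 MiB iovec is handled as
+        // 16 passes of 64 KiB; a fault on the 9th pass still returns the 512 KiB
+        // already copied out instead of -EFAULT.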
+        const CHUNK: usize = 64 * 1024;
 
-        return Ok(len);
+        for one in iovecs.iovs().iter() {
+            // Check if we've reached MAX_RW_COUNT limit
+            if total_read >= max_rw_count {
+                break;
+            }
+
+            let remain = max_rw_count - total_read;
+            let want = core::cmp::min(one.iov_len, remain);
+            if want == 0 {
+                continue;
+            }
+
+            let mut copied_this_iov = 0usize;
+            while copied_this_iov < want {
+                // Calculate how much to process in this iteration
+                let remain_iov = want - copied_this_iov;
+                let chunk_len = core::cmp::min(CHUNK, remain_iov);
+
+                let current_base = one.iov_base as usize + copied_this_iov;
+
+                // Check accessibility for this chunk only (not the entire iovec).
+                // This avoids traversing huge address ranges at once.
+                let accessible = user_accessible_len(VirtAddr::new(current_base), chunk_len, true);
+                if accessible == 0 {
+                    if total_read == 0 && copied_this_iov == 0 {
+                        return Err(SystemError::EFAULT);
+                    }
+                    // Hit unmapped region, return what we've read so far
+                    return Ok(total_read);
+                }
+
+                // Read into kernel buffer
+                let to_read = core::cmp::min(accessible, chunk_len);
+                let mut kbuf = alloc::vec![0u8; to_read];
+                let n = do_read(fd, &mut kbuf[..])?;
+                if n == 0 {
+                    // EOF
+                    return Ok(total_read);
+                }
+
+                // Copy to user space
+                let dst = VirtAddr::new(current_base);
+                let write_res = unsafe { copy_to_user_protected(dst, &kbuf[..n]) };
+                match write_res {
+                    Ok(_) => {
+                        copied_this_iov += n;
+                        total_read = total_read.saturating_add(n);
+
+                        // Check MAX_RW_COUNT limit after each chunk
+                        if total_read >= max_rw_count {
+                            return Ok(total_read);
+                        }
+                    }
+                    Err(SystemError::EFAULT) => {
+                        // Linux: return partial count if any bytes were copied.
+                        if total_read == 0 {
+                            return Err(SystemError::EFAULT);
+                        }
+                        return Ok(total_read);
+                    }
+                    Err(e) => return Err(e),
+                }
+
+                // Stop on short read (EOF or error in underlying file)
+                if n < to_read {
+                    return Ok(total_read);
+                }
+            }
+        }
+
+        Ok(total_read)
     }
 
     fn entry_format(&self, args: &[usize]) -> Vec<FormattedSyscallParam> {
diff --git a/kernel/src/mm/page.rs b/kernel/src/mm/page.rs
index ce5215442..11ceb25c7 100644
--- a/kernel/src/mm/page.rs
+++ b/kernel/src/mm/page.rs
@@ -332,6 +332,15 @@ impl PageReclaimer {
         for page in victims {
             let mut guard = page.write_irqsave();
             if let PageType::File(info) = guard.page_type().clone() {
+                // Never evict a file-backed page that is still mapped into any VMA.
+                // Our eviction path removes the page from page_cache/page_manager; dropping a
+                // still-mapped page will trip InnerPage::drop assertions and can crash userland.
+                if guard.map_count() != 0 {
+                    drop(guard);
+                    page_reclaimer_lock_irqsave().insert_page(page.phys_address(), &page);
+                    continue;
+                }
+
                 let page_index = info.index;
                 let paddr = guard.phys_address();
 
diff --git a/kernel/src/net/syscall.rs b/kernel/src/net/syscall.rs
index ef76cb8b9..4bd55f150 100644
--- a/kernel/src/net/syscall.rs
+++ b/kernel/src/net/syscall.rs
@@ -359,7 +359,7 @@ impl Syscall {
         };
 
         // 将数据写入用户空间的iovecs
-        iovs.scatter(&buf[..recv_size]);
+        iovs.scatter(&buf[..recv_size])?;
 
         return Ok(recv_size);
     }
diff --git a/kernel/src/syscall/user_access.rs b/kernel/src/syscall/user_access.rs
index d72248dd8..90f120f5f 100644
--- a/kernel/src/syscall/user_access.rs
+++ b/kernel/src/syscall/user_access.rs
@@ -219,6 +219,11 @@ impl UserBufferReader<'_> {
     /// * Returns UserBufferReader instance on success, error code otherwise
     ///
     pub fn new<U>(addr: *const U, len: usize, from_user: bool) -> Result<Self, SystemError> {
+        // SAFETY: constructing a slice from a null pointer with non-zero length is UB.
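+        // Illustrative example: a hypothetical call such as
+        // UserBufferReader::new(core::ptr::null::<u8>(), 16, true) must return
+        // Err(EFAULT) here instead of going on to build a &[u8] from a null pointer.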
+ // Linux semantics: passing a null pointer for a non-empty user buffer should fail with EFAULT. + if len != 0 && (addr as usize) == 0 { + return Err(SystemError::EFAULT); + } if from_user && verify_area(VirtAddr::new(addr as usize), len).is_err() { return Err(SystemError::EFAULT); } @@ -549,6 +554,10 @@ impl<'a> UserBufferWriter<'a> { /// * Returns UserBufferWriter instance on success, error code otherwise /// pub fn new(addr: *mut U, len: usize, from_user: bool) -> Result { + // SAFETY: constructing a slice from a null pointer with non-zero length is UB. + if len != 0 && (addr as usize) == 0 { + return Err(SystemError::EFAULT); + } if from_user && verify_area(VirtAddr::new(addr as usize), len).is_err() { return Err(SystemError::EFAULT); } @@ -889,8 +898,10 @@ pub unsafe fn copy_from_user_protected( let dst_ptr = dst.as_mut_ptr(); let src_ptr = src.data() as *const u8; - // 快速路径: 页表检查 - if !check_user_access_by_page_table(src, len, false) { + // 预检查:只基于 VMA 做权限/范围检查(不要求 PTE 已经存在)。 + // Linux 语义:对懒分配/按需调页的用户页,拷贝过程中触发的缺页应由缺页处理器补齐。 + // 这里避免用“页表必须 present”作为判断条件,否则 read()/readv() 写入 mmap 区域会错误返回 -EFAULT。 + if user_accessible_len(src, len, false) < len { return Err(SystemError::EFAULT); } @@ -921,8 +932,10 @@ pub unsafe fn copy_to_user_protected(dest: VirtAddr, src: &[u8]) -> Result/dev/null) + if [ -z "$avail_kb" ]; then + avail_kb=$(/bin/busybox awk '/^MemFree:/ {print $2; exit}' /proc/meminfo 2>/dev/null) + fi + if [ -z "$avail_kb" ]; then + echo "[gvisor] cannot read MemAvailable, skip blocklist adjust" + return 0 + fi + + if [ ! -f "$BLOCKLIST_FILE" ]; then + echo "[gvisor] blocklist not found: $BLOCKLIST_FILE" + return 0 + fi + + if [ "$avail_kb" -ge "$THRESHOLD_KB" ]; then + echo "[gvisor] MemAvailable=${avail_kb}kB >= ${THRESHOLD_KB}kB, enable ${HEAVY_TEST}" + # Comment out the entry so runner will NOT block it. + /bin/busybox awk -v t="$HEAVY_TEST" ' + { + line=$0 + gsub(/[ \t\r]+$/, "", line) + if (line == t) { + print "# " t + next + } + print $0 + } + ' "$BLOCKLIST_FILE" > "$BLOCKLIST_FILE" + else + echo "[gvisor] MemAvailable=${avail_kb}kB < ${THRESHOLD_KB}kB, keep ${HEAVY_TEST} blocked" + fi + +} + +adjust_readv_blocklist + +cd "$SYSCALL_TEST_DIR" || exit 1 ./gvisor-test-runner --stdout diff --git a/user/apps/tests/syscall/gvisor/whitelist.txt b/user/apps/tests/syscall/gvisor/whitelist.txt index c9043b411..d641f4b14 100644 --- a/user/apps/tests/syscall/gvisor/whitelist.txt +++ b/user/apps/tests/syscall/gvisor/whitelist.txt @@ -31,6 +31,7 @@ fchdir_test pread64_test rename_test getdents_test +readv_test preadv_test pwrite64_test pwritev2_test