From db967844b8b16872e07f4f26b3488db898f0cb45 Mon Sep 17 00:00:00 2001 From: longjin Date: Sat, 20 Dec 2025 23:03:37 +0800 Subject: [PATCH 1/2] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E7=94=A8=E6=88=B7?= =?UTF-8?q?=E7=A9=BA=E9=97=B4=E5=86=85=E5=AD=98=E8=AE=BF=E9=97=AE=E5=92=8C?= =?UTF-8?q?=E9=A1=B5=E9=9D=A2=E5=9B=9E=E6=94=B6=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 修复 IoVecs 构造时对零长度缓冲区的验证,确保符合 Linux 语义 - 修复 scatter 方法在遇到不可访问内存时的错误处理,避免部分写入后返回错误 - 修复 readv/preadv 等系统调用,使其支持分块读取和部分成功写入 - 修复页面回收逻辑,避免回收仍被映射的文件页 - 修复 UserBufferReader/Writer 对空指针的检查,防止未定义行为 - 调整缓存阈值并添加 gVisor 测试的内存检测逻辑 Signed-off-by: longjin --- kernel/src/driver/block/cache/mod.rs | 2 +- kernel/src/filesystem/vfs/iov.rs | 115 ++++++++++++++---- .../src/filesystem/vfs/syscall/sys_preadv.rs | 2 +- .../src/filesystem/vfs/syscall/sys_preadv2.rs | 2 +- kernel/src/filesystem/vfs/syscall/sys_read.rs | 68 ++++++++--- .../src/filesystem/vfs/syscall/sys_readv.rs | 88 +++++++++++++- kernel/src/mm/page.rs | 9 ++ kernel/src/net/syscall.rs | 2 +- kernel/src/syscall/user_access.rs | 21 +++- .../syscall/gvisor/blocklists/readv_test | 3 + user/apps/tests/syscall/gvisor/run_tests.sh | 55 ++++++++- user/apps/tests/syscall/gvisor/whitelist.txt | 1 + 12 files changed, 319 insertions(+), 49 deletions(-) create mode 100644 user/apps/tests/syscall/gvisor/blocklists/readv_test diff --git a/kernel/src/driver/block/cache/mod.rs b/kernel/src/driver/block/cache/mod.rs index 1e2f87fe7..cf516b5d4 100644 --- a/kernel/src/driver/block/cache/mod.rs +++ b/kernel/src/driver/block/cache/mod.rs @@ -9,7 +9,7 @@ pub const BLOCK_SIZE_LOG: usize = 9; ///块大小这里固定为512 pub const BLOCK_SIZE: usize = 1 << BLOCK_SIZE_LOG; ///这里规定Cache的threshold大小,单位为:MB -pub const CACHE_THRESHOLD: usize = 64; +pub const CACHE_THRESHOLD: usize = 2; pub enum BlockCacheError { BlockSizeError, diff --git a/kernel/src/filesystem/vfs/iov.rs b/kernel/src/filesystem/vfs/iov.rs index 5d095f1c2..56cc9c642 100644 --- a/kernel/src/filesystem/vfs/iov.rs +++ b/kernel/src/filesystem/vfs/iov.rs @@ -2,9 +2,13 @@ use alloc::vec::Vec; use system_error::SystemError; use crate::{ + mm::verify_area, mm::VirtAddr, syscall::user_access::{user_accessible_len, UserBufferReader, UserBufferWriter}, }; + +/// Linux UIO_MAXIOV: maximum number of iovec structures per syscall +const IOV_MAX: usize = 1024; #[repr(C)] #[derive(Debug, Clone, Copy)] pub struct IoVec { @@ -25,7 +29,15 @@ impl IoVecs { /// 获取IoVecs中所有缓冲区的总长度 #[inline(never)] pub fn total_len(&self) -> usize { - self.0.iter().map(|x| x.iov_len).sum() + self.0 + .iter() + .try_fold(0usize, |acc, x| acc.checked_add(x.iov_len)) + .unwrap_or(usize::MAX) + } + + /// Borrow the validated iovec list. + pub fn iovs(&self) -> &[IoVec] { + &self.0 } /// Constructs `IoVecs` from an array of `IoVec` in userspace. @@ -34,7 +46,7 @@ impl IoVecs { /// /// * `iov` - Pointer to the array of `IoVec` in userspace /// * `iovcnt` - Number of `IoVec` elements in the array - /// * `readv` - Whether this is for the `readv` syscall (currently unused) + /// * `readv` - Whether this is for the `readv` syscall (true = check write permission) /// /// # Returns /// @@ -52,23 +64,62 @@ impl IoVecs { iovcnt: usize, _readv: bool, ) -> Result { - let iovs_reader = UserBufferReader::new(iov, iovcnt * core::mem::size_of::(), true)?; + // Linux: iovcnt must be > 0 and not unreasonably large. + if iovcnt == 0 || iovcnt > IOV_MAX { + return Err(SystemError::EINVAL); + } + + let elem_size = core::mem::size_of::(); + let total_bytes = iovcnt.checked_mul(elem_size).ok_or(SystemError::EINVAL)?; + + // Only does range check (user range) here. + let iovs_reader = UserBufferReader::new(iov, total_bytes, true)?; + + // Use exception-table protected copy to avoid kernel faults when userspace pointer is bad. + let iovs_buf = iovs_reader.buffer_protected(0)?; - // 将用户空间的IoVec转换为引用(注意:这里的引用是静态的,因为用户空间的IoVec不会被释放) - let iovs = iovs_reader.buffer::(0)?; + let mut slices: Vec = Vec::with_capacity(iovcnt); + for idx in 0..iovcnt { + let offset = idx * elem_size; + let one: IoVec = iovs_buf.read_one(offset)?; - let mut slices: Vec = Vec::with_capacity(iovs.len()); + // Linux behavior: always validate iov_base is a user pointer, even when iov_len==0. + // This matches Linux access_ok(addr, 0) behavior and is required by gVisor tests. + let base = VirtAddr::new(one.iov_base as usize); - for iov in iovs.iter() { - if iov.iov_len == 0 { + // Only do lightweight address range check (like Linux's access_ok). + // This checks that the address range is within user space limits, + // but does NOT traverse page tables or check actual mappings. + // Actual page mapping/permission checks happen during copy operations. + verify_area(base, one.iov_len)?; + + // Skip zero-length iovecs after validation + if one.iov_len == 0 { continue; } - let _ = UserBufferWriter::new(iov.iov_base, iov.iov_len, true)?; - slices.push(*iov); + // Range check (prevents kernel addresses / overflow). + verify_area(base, one.iov_len)?; + + // If the first byte isn't writable/readable at all, fail early with EFAULT. + // Partial accessibility is handled by the syscall implementation. + let accessible = user_accessible_len(base, one.iov_len, _readv /* check_write */); + if accessible == 0 { + return Err(SystemError::EFAULT); + } + + // Also ensure we can build a writer/reader wrapper for the range. + // (This is a cheap range check; mapping faults are handled elsewhere.) + if _readv { + let _ = UserBufferWriter::new(one.iov_base, one.iov_len, true)?; + } else { + let _ = UserBufferReader::new(one.iov_base, one.iov_len, true)?; + } + + slices.push(one); } - return Ok(Self(slices)); + Ok(Self(slices)) } /// Aggregates data from all IoVecs into a single buffer. @@ -144,21 +195,43 @@ impl IoVecs { /// let iovecs = IoVecs::from_user(/* ... */)?; /// iovecs.scatter(&[1, 2, 3, 4, 5]); /// ``` - pub fn scatter(&self, data: &[u8]) { - let mut data: &[u8] = data; - for slice in self.0.iter() { - let len = core::cmp::min(slice.iov_len, data.len()); - if len == 0 { + pub fn scatter(&self, data: &[u8]) -> Result<(), SystemError> { + let mut remaining = data; + let mut written_any = false; + + for iov in self.0.iter() { + if remaining.is_empty() { + break; + } + + let want = core::cmp::min(iov.iov_len, remaining.len()); + if want == 0 { continue; } - let mut buf_writer = - UserBufferWriter::new(slice.iov_base, slice.iov_len, true).unwrap(); - let slice = buf_writer.buffer::(0).unwrap(); + let base = VirtAddr::new(iov.iov_base as usize); + let accessible = user_accessible_len(base, want, true /*write*/); + if accessible == 0 { + if !written_any { + return Err(SystemError::EFAULT); + } + break; + } + + let mut writer = UserBufferWriter::new(iov.iov_base, accessible, true)?; + let mut user_buf = writer.buffer_protected(0)?; + user_buf.write_to_user(0, &remaining[..accessible])?; - slice[..len].copy_from_slice(&data[..len]); - data = &data[len..]; + written_any = true; + remaining = &remaining[accessible..]; + + if accessible < want { + // Hit an unmapped/forbidden region; stop as Linux does. + break; + } } + + Ok(()) } /// Creates a buffer with capacity equal to the total length of all IoVecs. diff --git a/kernel/src/filesystem/vfs/syscall/sys_preadv.rs b/kernel/src/filesystem/vfs/syscall/sys_preadv.rs index b7123db5e..77d94dd0d 100644 --- a/kernel/src/filesystem/vfs/syscall/sys_preadv.rs +++ b/kernel/src/filesystem/vfs/syscall/sys_preadv.rs @@ -79,7 +79,7 @@ pub fn do_preadv(fd: i32, iovecs: &IoVecs, offset: usize) -> Result Result { return file.read(buf.len(), buf); } + +/// Read into a userspace buffer safely (exception-table protected) and in chunks. +/// +/// Linux semantics: if a fault happens after some bytes are copied, return the number +/// of bytes copied instead of -EFAULT. +fn read_into_user_buffer(fd: i32, user_ptr: *mut u8, len: usize) -> Result { + // 用户态:先计算可写入长度,避免直接写入无效用户页。 + let accessible = + user_accessible_len(VirtAddr::new(user_ptr as usize), len, true /*write*/); + if accessible == 0 { + return Err(SystemError::EFAULT); + } + + // Keep the kernel-side buffer modest to avoid huge allocations/long critical sections. + const CHUNK: usize = 64 * 1024; + let mut total = 0usize; + + while total < accessible { + let remain = accessible - total; + let chunk_len = core::cmp::min(CHUNK, remain); + + let mut kbuf = alloc::vec![0u8; chunk_len]; + let n = do_read(fd, &mut kbuf[..])?; + if n == 0 { + break; + } + + let dst = VirtAddr::new(user_ptr as usize + total); + let write_res = unsafe { copy_to_user_protected(dst, &kbuf[..n]) }; + match write_res { + Ok(_) => { + total += n; + } + Err(SystemError::EFAULT) => { + if total == 0 { + return Err(SystemError::EFAULT); + } + break; + } + Err(e) => return Err(e), + } + + if n < chunk_len { + break; + } + } + + Ok(total) +} diff --git a/kernel/src/filesystem/vfs/syscall/sys_readv.rs b/kernel/src/filesystem/vfs/syscall/sys_readv.rs index 9a99d6f58..942a87de4 100644 --- a/kernel/src/filesystem/vfs/syscall/sys_readv.rs +++ b/kernel/src/filesystem/vfs/syscall/sys_readv.rs @@ -2,10 +2,14 @@ use system_error::SystemError; use crate::arch::interrupt::TrapFrame; use crate::arch::syscall::nr::SYS_READV; +use crate::arch::MMArch; use crate::filesystem::vfs::iov::IoVec; use crate::filesystem::vfs::iov::IoVecs; +use crate::mm::MemoryManagementArch; +use crate::mm::VirtAddr; use crate::syscall::table::FormattedSyscallParam; use crate::syscall::table::Syscall; +use crate::syscall::user_access::{copy_to_user_protected, user_accessible_len}; use alloc::string::ToString; use alloc::vec::Vec; @@ -27,16 +31,90 @@ impl Syscall for SysReadVHandle { let iov = Self::iov(args); let count = Self::count(args); - // IoVecs会进行用户态检验 + // IoVecs 会进行用户态检验(包含 len==0 的 iov_base 校验)。 let iovecs = unsafe { IoVecs::from_user(iov, count, true) }?; - let mut data = vec![0; iovecs.total_len()]; + // Linux: limit per readv() to MAX_RW_COUNT = INT_MAX & ~(PAGE_SIZE-1) + let max_rw_count = (i32::MAX as usize) & !(MMArch::PAGE_SIZE - 1); - let len = do_read(fd, &mut data)?; + let mut total_read: usize = 0; - iovecs.scatter(&data[..len]); + // Keep kernel-side buffer modest to avoid huge allocations. + // Also used as the granularity for accessibility checks to avoid + // traversing huge address ranges at once. + const CHUNK: usize = 64 * 1024; - return Ok(len); + for one in iovecs.iovs().iter() { + // Check if we've reached MAX_RW_COUNT limit + if total_read >= max_rw_count { + break; + } + + let remain = max_rw_count - total_read; + let want = core::cmp::min(one.iov_len, remain); + if want == 0 { + continue; + } + + let mut copied_this_iov = 0usize; + while copied_this_iov < want { + // Calculate how much to process in this iteration + let remain_iov = want - copied_this_iov; + let chunk_len = core::cmp::min(CHUNK, remain_iov); + + let current_base = one.iov_base as usize + copied_this_iov; + + // Check accessibility for this chunk only (not the entire iovec) + // This avoids traversing huge address ranges at once + let accessible = user_accessible_len(VirtAddr::new(current_base), chunk_len, true); + if accessible == 0 { + if total_read == 0 && copied_this_iov == 0 { + return Err(SystemError::EFAULT); + } + // Hit unmapped region, return what we've read so far + return Ok(total_read); + } + + // Read into kernel buffer + let to_read = core::cmp::min(accessible, chunk_len); + let mut kbuf = alloc::vec![0u8; to_read]; + let n = do_read(fd, &mut kbuf[..])?; + if n == 0 { + // EOF + return Ok(total_read); + } + + // Copy to user space + let dst = VirtAddr::new(current_base); + let write_res = unsafe { copy_to_user_protected(dst, &kbuf[..n]) }; + match write_res { + Ok(_) => { + copied_this_iov += n; + total_read = total_read.saturating_add(n); + + // Check MAX_RW_COUNT limit after each chunk + if total_read >= max_rw_count { + return Ok(total_read); + } + } + Err(SystemError::EFAULT) => { + // Linux: return partial count if any bytes were copied. + if total_read == 0 { + return Err(SystemError::EFAULT); + } + return Ok(total_read); + } + Err(e) => return Err(e), + } + + // Stop on short read (EOF or error in underlying file) + if n < to_read { + return Ok(total_read); + } + } + } + + Ok(total_read) } fn entry_format(&self, args: &[usize]) -> Vec { diff --git a/kernel/src/mm/page.rs b/kernel/src/mm/page.rs index ce5215442..11ceb25c7 100644 --- a/kernel/src/mm/page.rs +++ b/kernel/src/mm/page.rs @@ -332,6 +332,15 @@ impl PageReclaimer { for page in victims { let mut guard = page.write_irqsave(); if let PageType::File(info) = guard.page_type().clone() { + // Never evict a file-backed page that is still mapped into any VMA. + // Our eviction path removes the page from page_cache/page_manager; dropping a + // still-mapped page will trip InnerPage::drop assertions and can crash userland. + if guard.map_count() != 0 { + drop(guard); + page_reclaimer_lock_irqsave().insert_page(page.phys_address(), &page); + continue; + } + let page_index = info.index; let paddr = guard.phys_address(); diff --git a/kernel/src/net/syscall.rs b/kernel/src/net/syscall.rs index ef76cb8b9..4bd55f150 100644 --- a/kernel/src/net/syscall.rs +++ b/kernel/src/net/syscall.rs @@ -359,7 +359,7 @@ impl Syscall { }; // 将数据写入用户空间的iovecs - iovs.scatter(&buf[..recv_size]); + iovs.scatter(&buf[..recv_size])?; return Ok(recv_size); } diff --git a/kernel/src/syscall/user_access.rs b/kernel/src/syscall/user_access.rs index d72248dd8..90f120f5f 100644 --- a/kernel/src/syscall/user_access.rs +++ b/kernel/src/syscall/user_access.rs @@ -219,6 +219,11 @@ impl UserBufferReader<'_> { /// * Returns UserBufferReader instance on success, error code otherwise /// pub fn new(addr: *const U, len: usize, from_user: bool) -> Result { + // SAFETY: constructing a slice from a null pointer with non-zero length is UB. + // Linux semantics: passing a null pointer for a non-empty user buffer should fail with EFAULT. + if len != 0 && (addr as usize) == 0 { + return Err(SystemError::EFAULT); + } if from_user && verify_area(VirtAddr::new(addr as usize), len).is_err() { return Err(SystemError::EFAULT); } @@ -549,6 +554,10 @@ impl<'a> UserBufferWriter<'a> { /// * Returns UserBufferWriter instance on success, error code otherwise /// pub fn new(addr: *mut U, len: usize, from_user: bool) -> Result { + // SAFETY: constructing a slice from a null pointer with non-zero length is UB. + if len != 0 && (addr as usize) == 0 { + return Err(SystemError::EFAULT); + } if from_user && verify_area(VirtAddr::new(addr as usize), len).is_err() { return Err(SystemError::EFAULT); } @@ -889,8 +898,10 @@ pub unsafe fn copy_from_user_protected( let dst_ptr = dst.as_mut_ptr(); let src_ptr = src.data() as *const u8; - // 快速路径: 页表检查 - if !check_user_access_by_page_table(src, len, false) { + // 预检查:只基于 VMA 做权限/范围检查(不要求 PTE 已经存在)。 + // Linux 语义:对懒分配/按需调页的用户页,拷贝过程中触发的缺页应由缺页处理器补齐。 + // 这里避免用“页表必须 present”作为判断条件,否则 read()/readv() 写入 mmap 区域会错误返回 -EFAULT。 + if user_accessible_len(src, len, false) < len { return Err(SystemError::EFAULT); } @@ -921,8 +932,10 @@ pub unsafe fn copy_to_user_protected(dest: VirtAddr, src: &[u8]) -> Result/dev/null) + if [ -z "$avail_kb" ]; then + avail_kb=$(/bin/busybox awk '/^MemFree:/ {print $2; exit}' /proc/meminfo 2>/dev/null) + fi + if [ -z "$avail_kb" ]; then + echo "[gvisor] cannot read MemAvailable, skip blocklist adjust" + return 0 + fi + + if [ ! -f "$BLOCKLIST_FILE" ]; then + echo "[gvisor] blocklist not found: $BLOCKLIST_FILE" + return 0 + fi + + if [ "$avail_kb" -ge "$THRESHOLD_KB" ]; then + echo "[gvisor] MemAvailable=${avail_kb}kB >= ${THRESHOLD_KB}kB, enable ${HEAVY_TEST}" + # Comment out the entry so runner will NOT block it. + /bin/busybox awk -v t="$HEAVY_TEST" ' + { + line=$0 + gsub(/[ \t\r]+$/, "", line) + if (line == t) { + print "# " t + next + } + print $0 + } + ' "$BLOCKLIST_FILE" > "$BLOCKLIST_FILE" + else + echo "[gvisor] MemAvailable=${avail_kb}kB < ${THRESHOLD_KB}kB, keep ${HEAVY_TEST} blocked" + fi + +} + +adjust_readv_blocklist + +cd "$SYSCALL_TEST_DIR" || exit 1 ./gvisor-test-runner --stdout diff --git a/user/apps/tests/syscall/gvisor/whitelist.txt b/user/apps/tests/syscall/gvisor/whitelist.txt index c9043b411..d641f4b14 100644 --- a/user/apps/tests/syscall/gvisor/whitelist.txt +++ b/user/apps/tests/syscall/gvisor/whitelist.txt @@ -31,6 +31,7 @@ fchdir_test pread64_test rename_test getdents_test +readv_test preadv_test pwrite64_test pwritev2_test From 994614f785e07c14804e98bbf7c425510e88fc34 Mon Sep 17 00:00:00 2001 From: longjin Date: Mon, 22 Dec 2025 00:10:25 +0800 Subject: [PATCH 2/2] =?UTF-8?q?refactor(vfs):=20=E4=BC=98=E5=8C=96=20IoVec?= =?UTF-8?q?s=20=E7=9A=84=E7=94=A8=E6=88=B7=E7=A9=BA=E9=97=B4=E5=86=85?= =?UTF-8?q?=E5=AD=98=E8=AE=BF=E9=97=AE=E6=A3=80=E6=9F=A5=E4=B8=8E=E6=8B=B7?= =?UTF-8?q?=E8=B4=9D=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 移除冗余的 verify_area 和 UserBufferReader/Writer 检查,统一使用 user_accessible_len 进行访问性验证 - 在 gather 方法中使用 copy_from_user_protected 进行异常保护的拷贝,与 scatter 方法保持一致 - 改进错误处理逻辑,当部分数据已成功读取时返回已读取的数据,否则返回 EFAULT Signed-off-by: longjin --- kernel/src/filesystem/vfs/iov.rs | 54 ++++++++++++++------------------ 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/kernel/src/filesystem/vfs/iov.rs b/kernel/src/filesystem/vfs/iov.rs index 56cc9c642..b86b7485e 100644 --- a/kernel/src/filesystem/vfs/iov.rs +++ b/kernel/src/filesystem/vfs/iov.rs @@ -4,7 +4,9 @@ use system_error::SystemError; use crate::{ mm::verify_area, mm::VirtAddr, - syscall::user_access::{user_accessible_len, UserBufferReader, UserBufferWriter}, + syscall::user_access::{ + copy_from_user_protected, user_accessible_len, UserBufferReader, UserBufferWriter, + }, }; /// Linux UIO_MAXIOV: maximum number of iovec structures per syscall @@ -98,24 +100,15 @@ impl IoVecs { continue; } - // Range check (prevents kernel addresses / overflow). - verify_area(base, one.iov_len)?; - // If the first byte isn't writable/readable at all, fail early with EFAULT. // Partial accessibility is handled by the syscall implementation. + // Note: user_accessible_len returns 0 for null pointers (addr.is_null() check), + // so null pointer detection is covered here. let accessible = user_accessible_len(base, one.iov_len, _readv /* check_write */); if accessible == 0 { return Err(SystemError::EFAULT); } - // Also ensure we can build a writer/reader wrapper for the range. - // (This is a cheap range check; mapping faults are handled elsewhere.) - if _readv { - let _ = UserBufferWriter::new(one.iov_base, one.iov_len, true)?; - } else { - let _ = UserBufferReader::new(one.iov_base, one.iov_len, true)?; - } - slices.push(one); } @@ -138,35 +131,36 @@ impl IoVecs { /// read at all, `Err(SystemError::EFAULT)` is returned. pub fn gather(&self) -> Result, SystemError> { let mut buf = Vec::with_capacity(self.total_len()); + let mut read_any = false; for iov in self.0.iter() { + let base = VirtAddr::new(iov.iov_base as usize); // 检查从 iov_base 开始有多少 bytes 在 vma 内部且实际可以访问 - let accessible = - user_accessible_len(VirtAddr::new(iov.iov_base as usize), iov.iov_len, false); - - // log::debug!( - // "iov is {:?}. iov_len: {}; accessible len:{}", - // iov, - // iov.iov_len, - // accessible - // ); + let accessible = user_accessible_len(base, iov.iov_len, false /* read */); // 如果一个字节都不能访问 if accessible == 0 { - if buf.is_empty() { - // log::error!( - // "The first iov is empty, returning EFAULT. iov shape: {:?}", - // iov - // ); + if !read_any { return Err(SystemError::EFAULT); } return Ok(buf); } - // 复制可访问的部分 - unsafe { - let src = core::slice::from_raw_parts(iov.iov_base as *const u8, accessible); - buf.extend_from_slice(src); + // 使用异常保护的拷贝,与 scatter 保持一致 + let mut chunk = alloc::vec![0u8; accessible]; + match unsafe { copy_from_user_protected(&mut chunk, base) } { + Ok(_) => { + buf.extend_from_slice(&chunk); + read_any = true; + } + Err(SystemError::EFAULT) => { + // Linux: return partial data if any bytes were copied. + if !read_any { + return Err(SystemError::EFAULT); + } + return Ok(buf); + } + Err(e) => return Err(e), } // 如果没有读取完整个 iov,说明遇到了不可访问的区域