From 6676fee073378d60b4d4332f4f2d30520fcc7ccc Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 23 Sep 2025 14:52:27 +0100 Subject: [PATCH 1/8] not-for-upstream: use downstream vm_memory_vendored Copy the implementation downstream while we wait for upstream. Signed-off-by: Riccardo Mancini --- src/vmm/src/lib.rs | 2 + src/vmm/src/vm_memory_vendored.rs | 265 ++++++++++++++++++++++++++++++ 2 files changed, 267 insertions(+) create mode 100644 src/vmm/src/vm_memory_vendored.rs diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 7a168db0146..1abd5aad186 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -106,6 +106,8 @@ pub mod snapshot; pub mod test_utils; /// Utility functions and struct pub mod utils; +/// Vendored vm memory (vm-memory#312) +pub mod vm_memory_vendored; /// Wrappers over structures used to configure the VMM. pub mod vmm_config; /// Module with virtual state structs. diff --git a/src/vmm/src/vm_memory_vendored.rs b/src/vmm/src/vm_memory_vendored.rs new file mode 100644 index 00000000000..7e3ecb81682 --- /dev/null +++ b/src/vmm/src/vm_memory_vendored.rs @@ -0,0 +1,265 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Module temporarily containing vendored in-review vm-memory features +//! +//! TODO: To be removed once https://github.com/rust-vmm/vm-memory/pull/312 is merged + +#![allow(clippy::cast_possible_truncation)] // vm-memory has different clippy configuration + +use std::io::{Read, Write}; +use std::sync::Arc; +use std::sync::atomic::Ordering; + +use vm_memory::guest_memory::Result; +use vm_memory::{ + Address, AtomicAccess, Bytes, Error, GuestAddress, GuestMemory, GuestMemoryError, + GuestMemoryRegion, MemoryRegionAddress, +}; + +use crate::vstate::memory::GuestRegionMmapExt; + +/// [`GuestMemory`](trait.GuestMemory.html) implementation based on a homogeneous collection +/// of [`GuestMemoryRegion`] implementations. +/// +/// Represents a sorted set of non-overlapping physical guest memory regions. +#[derive(Debug)] +pub struct GuestRegionCollection { + regions: Vec>, +} + +impl Default for GuestRegionCollection { + fn default() -> Self { + Self { + regions: Vec::new(), + } + } +} + +impl Clone for GuestRegionCollection { + fn clone(&self) -> Self { + GuestRegionCollection { + regions: self.regions.iter().map(Arc::clone).collect(), + } + } +} + +impl GuestRegionCollection { + /// Creates a new [`GuestRegionCollection`] from a vector of regions. + /// + /// # Arguments + /// + /// * `regions` - The vector of regions. The regions shouldn't overlap, and they should be + /// sorted by the starting address. + pub fn from_regions(mut regions: Vec) -> std::result::Result { + Self::from_arc_regions(regions.drain(..).map(Arc::new).collect()) + } + + /// Creates a new [`GuestRegionCollection`] from a vector of Arc regions. + /// + /// Similar to the constructor `from_regions()` as it returns a + /// [`GuestRegionCollection`]. The need for this constructor is to provide a way for + /// consumer of this API to create a new [`GuestRegionCollection`] based on existing + /// regions coming from an existing [`GuestRegionCollection`] instance. + /// + /// # Arguments + /// + /// * `regions` - The vector of `Arc` regions. The regions shouldn't overlap and they should be + /// sorted by the starting address. + pub fn from_arc_regions(regions: Vec>) -> std::result::Result { + if regions.is_empty() { + return Err(Error::NoMemoryRegion); + } + + for window in regions.windows(2) { + let prev = &window[0]; + let next = &window[1]; + + if prev.start_addr() > next.start_addr() { + return Err(Error::UnsortedMemoryRegions); + } + + if prev.last_addr() >= next.start_addr() { + return Err(Error::MemoryRegionOverlap); + } + } + + Ok(Self { regions }) + } + + /// Insert a region into the `GuestMemoryMmap` object and return a new `GuestMemoryMmap`. + /// + /// # Arguments + /// * `region`: the memory region to insert into the guest memory object. + pub fn insert_region( + &self, + region: Arc, + ) -> std::result::Result, Error> { + let mut regions = self.regions.clone(); + regions.push(region); + regions.sort_by_key(|x| x.start_addr()); + + Self::from_arc_regions(regions) + } +} + +impl GuestMemory for GuestRegionCollection { + type R = R; + + fn num_regions(&self) -> usize { + self.regions.len() + } + + fn find_region(&self, addr: GuestAddress) -> Option<&R> { + let index = match self.regions.binary_search_by_key(&addr, |x| x.start_addr()) { + Ok(x) => Some(x), + // Within the closest region with starting address < addr + Err(x) if (x > 0 && addr <= self.regions[x - 1].last_addr()) => Some(x - 1), + _ => None, + }; + index.map(|x| self.regions[x].as_ref()) + } + + fn iter(&self) -> impl Iterator { + self.regions.iter().map(AsRef::as_ref) + } +} + +// This impl will be subsumed by the default impl in vm-memory#312 +impl Bytes for GuestRegionMmapExt { + type E = GuestMemoryError; + + /// # Examples + /// * Write a slice at guest address 0x1200. + /// + /// ``` + /// # #[cfg(feature = "backend-mmap")] + /// # use vm_memory::{Bytes, GuestAddress, GuestMemoryMmap}; + /// # + /// # #[cfg(feature = "backend-mmap")] + /// # { + /// # let start_addr = GuestAddress(0x1000); + /// # let mut gm = GuestMemoryMmap::<()>::from_ranges(&vec![(start_addr, 0x400)]) + /// # .expect("Could not create guest memory"); + /// # + /// let res = gm + /// .write(&[1, 2, 3, 4, 5], GuestAddress(0x1200)) + /// .expect("Could not write to guest memory"); + /// assert_eq!(5, res); + /// # } + /// ``` + fn write(&self, buf: &[u8], addr: MemoryRegionAddress) -> Result { + let maddr = addr.raw_value() as usize; + self.as_volatile_slice()? + .write(buf, maddr) + .map_err(Into::into) + } + + /// # Examples + /// * Read a slice of length 16 at guestaddress 0x1200. + /// + /// ``` + /// # #[cfg(feature = "backend-mmap")] + /// # use vm_memory::{Bytes, GuestAddress, GuestMemoryMmap}; + /// # + /// # #[cfg(feature = "backend-mmap")] + /// # { + /// # let start_addr = GuestAddress(0x1000); + /// # let mut gm = GuestMemoryMmap::<()>::from_ranges(&vec![(start_addr, 0x400)]) + /// # .expect("Could not create guest memory"); + /// # + /// let buf = &mut [0u8; 16]; + /// let res = gm + /// .read(buf, GuestAddress(0x1200)) + /// .expect("Could not read from guest memory"); + /// assert_eq!(16, res); + /// # } + /// ``` + fn read(&self, buf: &mut [u8], addr: MemoryRegionAddress) -> Result { + let maddr = addr.raw_value() as usize; + self.as_volatile_slice()? + .read(buf, maddr) + .map_err(Into::into) + } + + fn write_slice(&self, buf: &[u8], addr: MemoryRegionAddress) -> Result<()> { + let maddr = addr.raw_value() as usize; + self.as_volatile_slice()? + .write_slice(buf, maddr) + .map_err(Into::into) + } + + fn read_slice(&self, buf: &mut [u8], addr: MemoryRegionAddress) -> Result<()> { + let maddr = addr.raw_value() as usize; + self.as_volatile_slice()? + .read_slice(buf, maddr) + .map_err(Into::into) + } + + fn store( + &self, + val: T, + addr: MemoryRegionAddress, + order: Ordering, + ) -> Result<()> { + self.as_volatile_slice().and_then(|s| { + s.store(val, addr.raw_value() as usize, order) + .map_err(Into::into) + }) + } + + fn load(&self, addr: MemoryRegionAddress, order: Ordering) -> Result { + self.as_volatile_slice() + .and_then(|s| s.load(addr.raw_value() as usize, order).map_err(Into::into)) + } + + // All remaining functions are deprecated and have been removed in vm-memory/main. + // Firecracker does not use them, so no point in writing out implementations here. + fn read_from( + &self, + _addr: MemoryRegionAddress, + _src: &mut F, + _count: usize, + ) -> std::result::Result + where + F: Read, + { + unimplemented!() + } + + fn read_exact_from( + &self, + _addr: MemoryRegionAddress, + _src: &mut F, + _count: usize, + ) -> std::result::Result<(), Self::E> + where + F: Read, + { + unimplemented!() + } + + fn write_to( + &self, + _addr: MemoryRegionAddress, + _dst: &mut F, + _count: usize, + ) -> std::result::Result + where + F: Write, + { + unimplemented!() + } + + fn write_all_to( + &self, + _addr: MemoryRegionAddress, + _dst: &mut F, + _count: usize, + ) -> std::result::Result<(), Self::E> + where + F: Write, + { + unimplemented!() + } +} From b0f7c2a60d1499440465d7d194f04e9b7518d1a0 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 23 Sep 2025 15:34:34 +0100 Subject: [PATCH 2/8] refactor(vm-memory): use GuestMemoryCollection to store KVM slot This refactor enables us to add more information to the GuestMemory. To start off, I am adding KVM slot number and type of memory. So far we only support Dram (normal memory), but we will also add Hotpluggable region type in the future. Signed-off-by: Riccardo Mancini --- src/vmm/src/builder.rs | 4 +- src/vmm/src/device_manager/mmio.rs | 6 +- .../src/devices/virtio/block/virtio/io/mod.rs | 8 +- src/vmm/src/devices/virtio/vhost_user.rs | 11 +- src/vmm/src/persist.rs | 3 +- src/vmm/src/test_utils/mod.rs | 10 +- src/vmm/src/vstate/memory.rs | 181 ++++++++++++++---- src/vmm/src/vstate/vm.rs | 137 ++++++++----- 8 files changed, 251 insertions(+), 109 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 6bd81c46f18..9a91daf60d1 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -168,7 +168,7 @@ pub fn build_microvm_for_boot( // Build custom CPU config if a custom template is provided. let mut vm = Vm::new(&kvm)?; let (mut vcpus, vcpus_exit_evt) = vm.create_vcpus(vm_resources.machine_config.vcpu_count)?; - vm.register_memory_regions(guest_memory)?; + vm.register_dram_memory_regions(guest_memory)?; let mut device_manager = DeviceManager::new( event_manager, @@ -411,7 +411,7 @@ pub fn build_microvm_from_snapshot( .create_vcpus(vm_resources.machine_config.vcpu_count) .map_err(StartMicrovmError::Vm)?; - vm.register_memory_regions(guest_memory) + vm.restore_memory_regions(guest_memory, µvm_state.vm_state.memory) .map_err(StartMicrovmError::Vm)?; #[cfg(target_arch = "x86_64")] diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 46accb637b0..e560204c9d2 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -586,7 +586,7 @@ pub(crate) mod tests { let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); let mut vm = Vm::new(&kvm).unwrap(); - vm.register_memory_regions(guest_mem).unwrap(); + vm.register_dram_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); @@ -632,7 +632,7 @@ pub(crate) mod tests { let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); let mut vm = Vm::new(&kvm).unwrap(); - vm.register_memory_regions(guest_mem).unwrap(); + vm.register_dram_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); @@ -685,7 +685,7 @@ pub(crate) mod tests { let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); let mut vm = Vm::new(&kvm).unwrap(); - vm.register_memory_regions(guest_mem).unwrap(); + vm.register_dram_memory_regions(guest_mem).unwrap(); #[cfg(target_arch = "x86_64")] vm.setup_irqchip().unwrap(); diff --git a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs index 09cc7c4e31d..eef347b51ce 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs @@ -179,6 +179,7 @@ pub mod tests { #![allow(clippy::undocumented_unsafe_blocks)] use std::os::unix::ffi::OsStrExt; + use vm_memory::GuestMemoryRegion; use vmm_sys_util::tempfile::TempFile; use super::*; @@ -186,7 +187,7 @@ pub mod tests { use crate::utils::u64_to_usize; use crate::vmm_config::machine_config::HugePageConfig; use crate::vstate::memory; - use crate::vstate::memory::{Bitmap, Bytes, GuestMemory}; + use crate::vstate::memory::{Bitmap, Bytes, GuestMemory, GuestRegionMmapExt}; const FILE_LEN: u32 = 1024; // 2 pages of memory should be enough to test read/write ops and also dirty tracking. @@ -227,7 +228,10 @@ pub mod tests { true, HugePageConfig::None, ) - .unwrap(), + .unwrap() + .into_iter() + .map(|region| GuestRegionMmapExt::dram_from_mmap_region(region, 0)) + .collect(), ) .unwrap() } diff --git a/src/vmm/src/devices/virtio/vhost_user.rs b/src/vmm/src/devices/virtio/vhost_user.rs index bd1260fad40..8619b481edb 100644 --- a/src/vmm/src/devices/virtio/vhost_user.rs +++ b/src/vmm/src/devices/virtio/vhost_user.rs @@ -380,7 +380,7 @@ impl VhostUserHandleImpl { let vhost_user_net_reg = VhostUserMemoryRegionInfo { guest_phys_addr: region.start_addr().raw_value(), memory_size: region.len(), - userspace_addr: region.as_ptr() as u64, + userspace_addr: region.inner.as_ptr() as u64, mmap_offset, mmap_handle, }; @@ -480,7 +480,7 @@ pub(crate) mod tests { use crate::devices::virtio::test_utils::default_interrupt; use crate::test_utils::create_tmp_socket; use crate::vstate::memory; - use crate::vstate::memory::GuestAddress; + use crate::vstate::memory::{GuestAddress, GuestRegionMmapExt}; pub(crate) fn create_mem(file: File, regions: &[(GuestAddress, usize)]) -> GuestMemoryMmap { GuestMemoryMmap::from_regions( @@ -490,7 +490,10 @@ pub(crate) mod tests { Some(file), false, ) - .unwrap(), + .unwrap() + .into_iter() + .map(|region| GuestRegionMmapExt::dram_from_mmap_region(region, 0)) + .collect(), ) .unwrap() } @@ -800,7 +803,7 @@ pub(crate) mod tests { .map(|region| VhostUserMemoryRegionInfo { guest_phys_addr: region.start_addr().raw_value(), memory_size: region.len(), - userspace_addr: region.as_ptr() as u64, + userspace_addr: region.inner.as_ptr() as u64, mmap_offset: region.file_offset().unwrap().start(), mmap_handle: region.file_offset().unwrap().file().as_raw_fd(), }) diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 212b6105831..ee76bf6800b 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -588,7 +588,7 @@ mod tests { use crate::vmm_config::balloon::BalloonDeviceConfig; use crate::vmm_config::net::NetworkInterfaceConfig; use crate::vmm_config::vsock::tests::default_config; - use crate::vstate::memory::GuestMemoryRegionState; + use crate::vstate::memory::{GuestMemoryRegionState, GuestRegionType}; fn default_vmm_with_devices() -> Vmm { let mut event_manager = EventManager::new().expect("Cannot create EventManager"); @@ -693,6 +693,7 @@ mod tests { regions: vec![GuestMemoryRegionState { base_address: 0, size: 0x20000, + region_type: GuestRegionType::Dram, }], }; diff --git a/src/vmm/src/test_utils/mod.rs b/src/vmm/src/test_utils/mod.rs index 2cfcc274b5d..03b0ce16726 100644 --- a/src/vmm/src/test_utils/mod.rs +++ b/src/vmm/src/test_utils/mod.rs @@ -12,11 +12,12 @@ use crate::builder::build_microvm_for_boot; use crate::resources::VmResources; use crate::seccomp::get_empty_filters; use crate::test_utils::mock_resources::{MockBootSourceConfig, MockVmConfig, MockVmResources}; +use crate::vm_memory_vendored::GuestRegionCollection; use crate::vmm_config::boot_source::BootSourceConfig; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::HugePageConfig; use crate::vstate::memory; -use crate::vstate::memory::{GuestMemoryMmap, GuestRegionMmap}; +use crate::vstate::memory::{GuestMemoryMmap, GuestRegionMmap, GuestRegionMmapExt}; use crate::{EventManager, Vmm}; pub mod mock_resources; @@ -43,9 +44,12 @@ pub fn single_region_mem_at_raw(at: u64, size: usize) -> Vec { /// Creates a [`GuestMemoryMmap`] with multiple regions and without dirty page tracking. pub fn multi_region_mem(regions: &[(GuestAddress, usize)]) -> GuestMemoryMmap { - GuestMemoryMmap::from_regions( + GuestRegionCollection::from_regions( memory::anonymous(regions.iter().copied(), false, HugePageConfig::None) - .expect("Cannot initialize memory"), + .expect("Cannot initialize memory") + .into_iter() + .map(|region| GuestRegionMmapExt::dram_from_mmap_region(region, 0)) + .collect(), ) .unwrap() } diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index 38ee7cc2ce6..5fe49fa1b82 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -9,6 +9,7 @@ use std::fs::File; use std::io::SeekFrom; use std::sync::Arc; +use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, kvm_userspace_memory_region}; use serde::{Deserialize, Serialize}; pub use vm_memory::bitmap::{AtomicBitmap, BS, Bitmap, BitmapSlice}; pub use vm_memory::mmap::MmapRegionBuilder; @@ -17,7 +18,7 @@ pub use vm_memory::{ Address, ByteValued, Bytes, FileOffset, GuestAddress, GuestMemory, GuestMemoryRegion, GuestUsize, MemoryRegionAddress, MmapRegion, address, }; -use vm_memory::{Error as VmMemoryError, GuestMemoryError, WriteVolatile}; +use vm_memory::{Error as VmMemoryError, GuestMemoryError, VolatileSlice, WriteVolatile}; use vmm_sys_util::errno; use crate::DirtyBitmap; @@ -25,7 +26,7 @@ use crate::utils::{get_page_size, u64_to_usize}; use crate::vmm_config::machine_config::HugePageConfig; /// Type of GuestMemoryMmap. -pub type GuestMemoryMmap = vm_memory::GuestMemoryMmap>; +pub type GuestMemoryMmap = crate::vm_memory_vendored::GuestRegionCollection; /// Type of GuestRegionMmap. pub type GuestRegionMmap = vm_memory::GuestRegionMmap>; /// Type of GuestMmapRegion. @@ -50,6 +51,100 @@ pub enum MemoryError { OffsetTooLarge, } +/// Type of the guest region +#[derive(Copy, Clone, Debug, Eq, PartialEq, Serialize, Deserialize)] +pub enum GuestRegionType { + /// Guest DRAM + Dram, +} + +/// An extension to GuestMemoryRegion that stores the type of region, and the KVM slot +/// number. +#[derive(Debug)] +pub struct GuestRegionMmapExt { + /// the wrapped GuestRegionMmap + pub inner: GuestRegionMmap, + /// the type of region + pub region_type: GuestRegionType, + /// the KVM slot number assigned to this region + pub slot: u32, +} + +impl GuestRegionMmapExt { + pub(crate) fn dram_from_mmap_region(region: GuestRegionMmap, slot: u32) -> Self { + GuestRegionMmapExt { + inner: region, + region_type: GuestRegionType::Dram, + slot, + } + } + + pub(crate) fn from_state( + region: GuestRegionMmap, + state: &GuestMemoryRegionState, + slot: u32, + ) -> Result { + Ok(GuestRegionMmapExt { + inner: region, + region_type: state.region_type, + slot, + }) + } + + pub(crate) fn kvm_userspace_memory_region(&self) -> kvm_userspace_memory_region { + let flags = if self.inner.bitmap().is_some() { + KVM_MEM_LOG_DIRTY_PAGES + } else { + 0 + }; + + kvm_userspace_memory_region { + flags, + slot: self.slot, + guest_phys_addr: self.inner.start_addr().raw_value(), + memory_size: self.inner.len(), + userspace_addr: self.inner.as_ptr() as u64, + } + } +} + +#[allow(clippy::cast_possible_wrap)] +#[allow(clippy::cast_possible_truncation)] +impl GuestMemoryRegion for GuestRegionMmapExt { + type B = Option; + + fn len(&self) -> GuestUsize { + self.inner.len() + } + + fn start_addr(&self) -> GuestAddress { + self.inner.start_addr() + } + + fn bitmap(&self) -> &Self::B { + self.inner.bitmap() + } + + fn get_host_address( + &self, + addr: MemoryRegionAddress, + ) -> vm_memory::guest_memory::Result<*mut u8> { + self.inner.get_host_address(addr) + } + + fn file_offset(&self) -> Option<&FileOffset> { + self.inner.file_offset() + } + + fn get_slice( + &self, + offset: MemoryRegionAddress, + count: usize, + ) -> vm_memory::guest_memory::Result>> { + self.inner.get_slice(offset, count) + } +} + /// Creates a `Vec` of `GuestRegionMmap` with the given configuration pub fn create( regions: impl Iterator, @@ -169,6 +264,8 @@ pub struct GuestMemoryRegionState { pub base_address: u64, /// Region size. pub size: usize, + /// Region type + pub region_type: GuestRegionType, } /// Describes guest memory regions and their snapshot file mappings. @@ -196,6 +293,7 @@ impl GuestMemoryExtension for GuestMemoryMmap { guest_memory_state.regions.push(GuestMemoryRegionState { base_address: region.start_addr().0, size: u64_to_usize(region.len()), + region_type: region.region_type, }); }); guest_memory_state @@ -227,8 +325,8 @@ impl GuestMemoryExtension for GuestMemoryMmap { let mut writer_offset = 0; let page_size = get_page_size().map_err(MemoryError::PageSize)?; - let write_result = self.iter().zip(0..).try_for_each(|(region, slot)| { - let kvm_bitmap = dirty_bitmap.get(&slot).unwrap(); + let write_result = self.iter().try_for_each(|region| { + let kvm_bitmap = dirty_bitmap.get(®ion.slot).unwrap(); let firecracker_bitmap = region.bitmap(); let mut write_size = 0; let mut dirty_batch_start: u64 = 0; @@ -291,8 +389,8 @@ impl GuestMemoryExtension for GuestMemoryMmap { /// Stores the dirty bitmap inside into the internal bitmap fn store_dirty_bitmap(&self, dirty_bitmap: &DirtyBitmap, page_size: usize) { - self.iter().zip(0..).for_each(|(region, slot)| { - let kvm_bitmap = dirty_bitmap.get(&slot).unwrap(); + self.iter().for_each(|region| { + let kvm_bitmap = dirty_bitmap.get(®ion.slot).unwrap(); let firecracker_bitmap = region.bitmap(); for (i, v) in kvm_bitmap.iter().enumerate() { @@ -353,6 +451,17 @@ mod tests { use crate::snapshot::Snapshot; use crate::utils::{get_page_size, mib_to_bytes}; + fn into_region_ext(regions: Vec) -> GuestMemoryMmap { + GuestMemoryMmap::from_regions( + regions + .into_iter() + .zip(0u32..) // assign dummy slots + .map(|(region, slot)| GuestRegionMmapExt::dram_from_mmap_region(region, slot)) + .collect(), + ) + .unwrap() + } + #[test] fn test_anonymous() { for dirty_page_tracking in [true, false] { @@ -386,10 +495,8 @@ mod tests { (GuestAddress(region_size as u64), region_size), // pages 3-5 (GuestAddress(region_size as u64 * 2), region_size), // pages 6-8 ]; - let guest_memory = GuestMemoryMmap::from_regions( - anonymous(regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + let guest_memory = + into_region_ext(anonymous(regions.into_iter(), true, HugePageConfig::None).unwrap()); let dirty_map = [ // page 0: not dirty @@ -430,7 +537,7 @@ mod tests { } } - fn check_serde(guest_memory: &GuestMemoryMmap) { + fn check_serde(guest_memory: &M) { let mut snapshot_data = vec![0u8; 10000]; let original_state = guest_memory.describe(); Snapshot::new(&original_state) @@ -448,15 +555,14 @@ mod tests { let region_size = page_size * 3; // Test with a single region - let guest_memory = GuestMemoryMmap::from_regions( + let guest_memory = into_region_ext( anonymous( [(GuestAddress(0), region_size)].into_iter(), false, HugePageConfig::None, ) .unwrap(), - ) - .unwrap(); + ); check_serde(&guest_memory); // Test with some regions @@ -465,10 +571,8 @@ mod tests { (GuestAddress(region_size as u64), region_size), // pages 3-5 (GuestAddress(region_size as u64 * 2), region_size), // pages 6-8 ]; - let guest_memory = GuestMemoryMmap::from_regions( - anonymous(regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + let guest_memory = + into_region_ext(anonymous(regions.into_iter(), true, HugePageConfig::None).unwrap()); check_serde(&guest_memory); } @@ -481,20 +585,21 @@ mod tests { (GuestAddress(0), page_size), (GuestAddress(page_size as u64 * 2), page_size), ]; - let guest_memory = GuestMemoryMmap::from_regions( + let guest_memory = into_region_ext( anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + ); let expected_memory_state = GuestMemoryState { regions: vec![ GuestMemoryRegionState { base_address: 0, size: page_size, + region_type: GuestRegionType::Dram, }, GuestMemoryRegionState { base_address: page_size as u64 * 2, size: page_size, + region_type: GuestRegionType::Dram, }, ], }; @@ -507,20 +612,21 @@ mod tests { (GuestAddress(0), page_size * 3), (GuestAddress(page_size as u64 * 4), page_size * 3), ]; - let guest_memory = GuestMemoryMmap::from_regions( + let guest_memory = into_region_ext( anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + ); let expected_memory_state = GuestMemoryState { regions: vec![ GuestMemoryRegionState { base_address: 0, size: page_size * 3, + region_type: GuestRegionType::Dram, }, GuestMemoryRegionState { base_address: page_size as u64 * 4, size: page_size * 3, + region_type: GuestRegionType::Dram, }, ], }; @@ -541,10 +647,9 @@ mod tests { (region_1_address, region_size), (region_2_address, region_size), ]; - let guest_memory = GuestMemoryMmap::from_regions( + let guest_memory = into_region_ext( anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + ); // Check that Firecracker bitmap is clean. guest_memory.iter().for_each(|r| { assert!(!r.bitmap().dirty_at(0)); @@ -566,10 +671,8 @@ mod tests { let mut memory_file = TempFile::new().unwrap().into_file(); guest_memory.dump(&mut memory_file).unwrap(); - let restored_guest_memory = GuestMemoryMmap::from_regions( - snapshot_file(memory_file, memory_state.regions(), false).unwrap(), - ) - .unwrap(); + let restored_guest_memory = + into_region_ext(snapshot_file(memory_file, memory_state.regions(), false).unwrap()); // Check that the region contents are the same. let mut restored_region = vec![0u8; page_size * 2]; @@ -596,10 +699,9 @@ mod tests { (region_1_address, region_size), (region_2_address, region_size), ]; - let guest_memory = GuestMemoryMmap::from_regions( + let guest_memory = into_region_ext( anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + ); // Check that Firecracker bitmap is clean. guest_memory.iter().for_each(|r| { assert!(!r.bitmap().dirty_at(0)); @@ -628,10 +730,8 @@ mod tests { guest_memory.dump_dirty(&mut file, &dirty_bitmap).unwrap(); // We can restore from this because this is the first dirty dump. - let restored_guest_memory = GuestMemoryMmap::from_regions( - snapshot_file(file, memory_state.regions(), false).unwrap(), - ) - .unwrap(); + let restored_guest_memory = + into_region_ext(snapshot_file(file, memory_state.regions(), false).unwrap()); // Check that the region contents are the same. let mut restored_region = vec![0u8; region_size]; @@ -687,10 +787,9 @@ mod tests { (region_1_address, region_size), (region_2_address, region_size), ]; - let guest_memory = GuestMemoryMmap::from_regions( + let guest_memory = into_region_ext( anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + ); // Check that Firecracker bitmap is clean. guest_memory.iter().for_each(|r| { diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 27d43176367..a3752eb5ae8 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -15,8 +15,8 @@ use std::sync::{Arc, Mutex, MutexGuard}; #[cfg(target_arch = "x86_64")] use kvm_bindings::KVM_IRQCHIP_IOAPIC; use kvm_bindings::{ - KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MSI_VALID_DEVID, - KvmIrqRouting, kvm_irq_routing_entry, kvm_userspace_memory_region, + KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI, KVM_MSI_VALID_DEVID, KvmIrqRouting, + kvm_irq_routing_entry, kvm_userspace_memory_region, }; use kvm_ioctls::VmFd; use log::{debug, error}; @@ -36,7 +36,8 @@ use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vmm_config::snapshot::SnapshotType; use crate::vstate::memory::{ - Address, GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, GuestRegionMmap, + GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, GuestMemoryState, + GuestRegionMmap, GuestRegionMmapExt, MemoryError, }; use crate::vstate::resources::ResourceAllocator; use crate::vstate::vcpu::VcpuError; @@ -241,6 +242,8 @@ pub struct VmCommon { /// The KVM file descriptor used to access this Vm. pub fd: VmFd, max_memslots: u32, + /// Next free kvm slot + next_slot: u32, /// The guest memory of this Vm. pub guest_memory: GuestMemoryMmap, /// Interrupts used by Vm's devices @@ -275,7 +278,9 @@ pub enum VmError { /// Error calling mincore: {0} Mincore(vmm_sys_util::errno::Error), /// ResourceAllocator error: {0} - ResourceAllocator(#[from] vm_allocator::Error) + ResourceAllocator(#[from] vm_allocator::Error), + /// MemoryError error: {0} + MemoryError(#[from] MemoryError), } /// Contains Vm functions that are usable across CPU architectures @@ -321,6 +326,7 @@ impl Vm { Ok(VmCommon { fd, max_memslots: kvm.max_nr_memslots(), + next_slot: 0, guest_memory: GuestMemoryMmap::default(), interrupts: Mutex::new(HashMap::with_capacity(GSI_MSI_END as usize + 1)), resource_allocator: Mutex::new(ResourceAllocator::new()), @@ -348,53 +354,80 @@ impl Vm { Ok((vcpus, exit_evt)) } - /// Register a list of new memory regions to this [`Vm`]. - pub fn register_memory_regions( - &mut self, - regions: Vec, + /// Call set_user_memory_region on VM fd + pub(crate) fn set_user_memory_region( + &self, + region: kvm_userspace_memory_region, ) -> Result<(), VmError> { - for region in regions { - self.register_memory_region(region)? + // SAFETY: Safe because the fd is a valid KVM file descriptor. + unsafe { + self.fd() + .set_user_memory_region(region) + .map_err(VmError::SetUserMemoryRegion)?; } Ok(()) } - /// Register a new memory region to this [`Vm`]. - pub fn register_memory_region(&mut self, region: GuestRegionMmap) -> Result<(), VmError> { - let next_slot = self - .guest_memory() - .num_regions() - .try_into() - .expect("Number of existing memory regions exceeds u32::MAX"); - if self.common.max_memslots <= next_slot { - return Err(VmError::NotEnoughMemorySlots(self.common.max_memslots)); - } + /// Allocates num_slots consecutive slot IDs and returns the first one. + fn allocate_slot_ids(&mut self, num_slots: u32) -> Result { + let slot = self.common.next_slot; + self.common.next_slot += num_slots; - let flags = if region.bitmap().is_some() { - KVM_MEM_LOG_DIRTY_PAGES + if self.common.next_slot > self.common.max_memslots { + Err(VmError::NotEnoughMemorySlots(self.common.max_memslots)) } else { - 0 - }; + Ok(slot) + } + } - let memory_region = kvm_userspace_memory_region { - slot: next_slot, - guest_phys_addr: region.start_addr().raw_value(), - memory_size: region.len(), - userspace_addr: region.as_ptr() as u64, - flags, - }; + fn _register_memory_region(&mut self, region: Arc) -> Result<(), VmError> { + let new_guest_memory = self + .common + .guest_memory + .insert_region(Arc::clone(®ion))?; - let new_guest_memory = self.common.guest_memory.insert_region(Arc::new(region))?; + self.set_user_memory_region(region.kvm_userspace_memory_region())?; - // SAFETY: Safe because the fd is a valid KVM file descriptor. - unsafe { - self.fd() - .set_user_memory_region(memory_region) - .map_err(VmError::SetUserMemoryRegion)?; + self.common.guest_memory = new_guest_memory; + + Ok(()) + } + + /// Register a list of new memory regions to this [`Vm`]. + pub fn register_dram_memory_regions( + &mut self, + regions: Vec, + ) -> Result<(), VmError> { + for region in regions { + let arcd_region = Arc::new(GuestRegionMmapExt::dram_from_mmap_region( + region, + self.allocate_slot_ids(1)?, + )); + + self._register_memory_region(arcd_region)? } - self.common.guest_memory = new_guest_memory; + Ok(()) + } + + /// Register a list of new memory regions to this [`Vm`]. + /// + /// Note: regions and state.regions need to be in the same order. + pub fn restore_memory_regions( + &mut self, + regions: Vec, + state: &GuestMemoryState, + ) -> Result<(), VmError> { + for (region, state) in regions.into_iter().zip(state.regions.iter()) { + let arcd_region = Arc::new(GuestRegionMmapExt::from_state( + region, + state, + self.allocate_slot_ids(1)?, + )?); + + self._register_memory_region(arcd_region)? + } Ok(()) } @@ -419,28 +452,26 @@ impl Vm { /// Resets the KVM dirty bitmap for each of the guest's memory regions. pub fn reset_dirty_bitmap(&self) { - self.guest_memory() - .iter() - .zip(0u32..) - .for_each(|(region, slot)| { - let _ = self.fd().get_dirty_log(slot, u64_to_usize(region.len())); - }); + self.guest_memory().iter().for_each(|region| { + let _ = self + .fd() + .get_dirty_log(region.slot, u64_to_usize(region.len())); + }); } /// Retrieves the KVM dirty bitmap for each of the guest's memory regions. pub fn get_dirty_bitmap(&self) -> Result { self.guest_memory() .iter() - .zip(0u32..) - .map(|(region, slot)| { + .map(|region| { let bitmap = match region.bitmap() { Some(_) => self .fd() - .get_dirty_log(slot, u64_to_usize(region.len())) + .get_dirty_log(region.slot, u64_to_usize(region.len())) .map_err(VmError::GetDirtyLog)?, - None => mincore_bitmap(region)?, + None => mincore_bitmap(®ion.inner)?, }; - Ok((slot, bitmap)) + Ok((region.slot, bitmap)) }) .collect() } @@ -700,7 +731,7 @@ pub(crate) mod tests { pub(crate) fn setup_vm_with_memory(mem_size: usize) -> (Kvm, Vm) { let (kvm, mut vm) = setup_vm(); let gm = single_region_mem_raw(mem_size); - vm.register_memory_regions(gm).unwrap(); + vm.register_dram_memory_regions(gm).unwrap(); (kvm, vm) } @@ -718,14 +749,14 @@ pub(crate) mod tests { // Trying to set a memory region with a size that is not a multiple of GUEST_PAGE_SIZE // will result in error. let gm = single_region_mem_raw(0x10); - let res = vm.register_memory_regions(gm); + let res = vm.register_dram_memory_regions(gm); assert_eq!( res.unwrap_err().to_string(), "Cannot set the memory regions: Invalid argument (os error 22)" ); let gm = single_region_mem_raw(0x1000); - let res = vm.register_memory_regions(gm); + let res = vm.register_dram_memory_regions(gm); res.unwrap(); } @@ -760,7 +791,7 @@ pub(crate) mod tests { let region = GuestRegionMmap::new(region, GuestAddress(i as u64 * 0x1000)).unwrap(); - let res = vm.register_memory_region(region); + let res = vm.register_dram_memory_regions(vec![region]); if max_nr_regions <= i { assert!( From 5a54d4494bcdcfdf61088c3ec9ee96b2d5cc8271 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 23 Sep 2025 15:38:44 +0100 Subject: [PATCH 3/8] refactor(aarch64): do not use GuestMemory::last_addr() As we will add other guest memory regions, last_addr() will not return what we actually want: the last addr of the DRAM region. Use the region type introduced in an earlier refactor to find the DRAM region and use that instead. Signed-off-by: Riccardo Mancini --- src/vmm/src/arch/aarch64/fdt.rs | 22 +++++++++--------- src/vmm/src/arch/aarch64/mod.rs | 40 ++++++++++++++++----------------- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index 9946d3516cc..a84e457f37b 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -9,7 +9,7 @@ use std::ffi::CString; use std::fmt::Debug; use vm_fdt::{Error as VmFdtError, FdtWriter, FdtWriterNode}; -use vm_memory::GuestMemoryError; +use vm_memory::{GuestMemoryError, GuestMemoryRegion}; use super::cache_info::{CacheEntry, read_cache_config}; use super::gic::GICDevice; @@ -22,7 +22,7 @@ use crate::device_manager::mmio::MMIODeviceInfo; use crate::device_manager::pci_mngr::PciDevices; use crate::devices::acpi::vmgenid::{VMGENID_MEM_SIZE, VmGenId}; use crate::initrd::InitrdConfig; -use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap}; +use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap, GuestRegionType}; // This is a value for uniquely identifying the FDT node declaring the interrupt controller. const GIC_PHANDLE: u32 = 1; @@ -227,14 +227,16 @@ fn create_memory_node(fdt: &mut FdtWriter, guest_mem: &GuestMemoryMmap) -> Resul // The reason we do this is that Linux does not allow remapping system memory. However, without // remap, kernel drivers cannot get virtual addresses to read data from device memory. Leaving // this memory region out allows Linux kernel modules to remap and thus read this region. - let mem_size = guest_mem.last_addr().raw_value() - - super::layout::DRAM_MEM_START - - super::layout::SYSTEM_MEM_SIZE - + 1; - let mem_reg_prop = &[ - super::layout::DRAM_MEM_START + super::layout::SYSTEM_MEM_SIZE, - mem_size, - ]; + // Pick the first (and only) memory region + let dram_region = guest_mem + .iter() + .find(|region| region.region_type == GuestRegionType::Dram) + .unwrap(); + let start_addr = dram_region + .start_addr() + .unchecked_add(super::layout::SYSTEM_MEM_SIZE); + let mem_size = dram_region.last_addr().unchecked_offset_from(start_addr) + 1; + let mem_reg_prop = &[start_addr.raw_value(), mem_size]; let mem = fdt.begin_node("memory@ram")?; fdt.property_string("device_type", "memory")?; fdt.property_array_u64("reg", mem_reg_prop)?; diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index 74c5204af0e..4e82a7d3d56 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -22,7 +22,7 @@ use std::fs::File; use linux_loader::loader::pe::PE as Loader; use linux_loader::loader::{Cmdline, KernelLoader}; -use vm_memory::GuestMemoryError; +use vm_memory::{GuestMemoryError, GuestMemoryRegion}; use crate::arch::{BootProtocol, EntryPoint, arch_memory_regions_with_gap}; use crate::cpu_config::aarch64::{CpuConfiguration, CpuConfigurationError}; @@ -30,7 +30,9 @@ use crate::cpu_config::templates::CustomCpuTemplate; use crate::initrd::InitrdConfig; use crate::utils::{align_up, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; -use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap}; +use crate::vstate::memory::{ + Address, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap, GuestRegionType, +}; use crate::vstate::vcpu::KvmVcpuError; use crate::{DeviceManager, Kvm, Vcpu, VcpuConfig, Vm, logger}; @@ -151,31 +153,29 @@ pub fn initrd_load_addr(guest_mem: &GuestMemoryMmap, initrd_size: usize) -> Opti usize_to_u64(initrd_size), usize_to_u64(super::GUEST_PAGE_SIZE), ); - match GuestAddress(get_fdt_addr(guest_mem)).checked_sub(rounded_size) { - Some(offset) => { - if guest_mem.address_in_range(offset) { - Some(offset.raw_value()) - } else { - None - } - } - None => None, - } + GuestAddress(get_fdt_addr(guest_mem)) + .checked_sub(rounded_size) + .filter(|&addr| guest_mem.address_in_range(addr)) + .map(|addr| addr.raw_value()) } // Auxiliary function to get the address where the device tree blob is loaded. fn get_fdt_addr(mem: &GuestMemoryMmap) -> u64 { + // Find the first (and only) DRAM region. + let dram_region = mem + .iter() + .find(|region| region.region_type == GuestRegionType::Dram) + .unwrap(); + // If the memory allocated is smaller than the size allocated for the FDT, // we return the start of the DRAM so that // we allow the code to try and load the FDT. - - if let Some(addr) = mem.last_addr().checked_sub(layout::FDT_MAX_SIZE as u64 - 1) - && mem.address_in_range(addr) - { - return addr.raw_value(); - } - - layout::DRAM_MEM_START + dram_region + .last_addr() + .checked_sub(layout::FDT_MAX_SIZE as u64 - 1) + .filter(|&addr| mem.address_in_range(addr)) + .map(|addr| addr.raw_value()) + .unwrap_or(layout::DRAM_MEM_START) } /// Load linux kernel into guest memory. From 321fbd18f94753c21f6042d5d7ba9e245fd02d88 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 23 Sep 2025 15:40:22 +0100 Subject: [PATCH 4/8] refactor(x86): iterate over regions to populate memory map The way we populate the PVH and E820 memmap is by redoing all the math about the regions. Replace that duplicated logic with an iteration over the guest memory, filtering by Dram type (as we don't want to expose other regions we will add in the future). Furthermore, clamp the first 1MB as it's reserved for the system. Signed-off-by: Riccardo Mancini --- src/vmm/src/arch/x86_64/mod.rs | 78 +++++++++------------------------- 1 file changed, 21 insertions(+), 57 deletions(-) diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index b18267c6a1e..4d6b906768e 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -31,12 +31,13 @@ pub mod xstate; #[allow(missing_docs)] pub mod generated; +use std::cmp::max; use std::fs::File; use kvm::Kvm; use layout::{ - CMDLINE_START, FIRST_ADDR_PAST_32BITS, FIRST_ADDR_PAST_64BITS_MMIO, MMIO32_MEM_SIZE, - MMIO32_MEM_START, MMIO64_MEM_SIZE, MMIO64_MEM_START, PCI_MMCONFIG_SIZE, PCI_MMCONFIG_START, + CMDLINE_START, MMIO32_MEM_SIZE, MMIO32_MEM_START, MMIO64_MEM_SIZE, MMIO64_MEM_START, + PCI_MMCONFIG_SIZE, PCI_MMCONFIG_START, }; use linux_loader::configurator::linux::LinuxBootConfigurator; use linux_loader::configurator::pvh::PvhBootConfigurator; @@ -59,7 +60,7 @@ use crate::initrd::InitrdConfig; use crate::utils::{align_down, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; use crate::vstate::memory::{ - Address, GuestAddress, GuestMemory, GuestMemoryMmap, GuestMemoryRegion, + Address, GuestAddress, GuestMemory, GuestMemoryMmap, GuestMemoryRegion, GuestRegionType, }; use crate::vstate::vcpu::KvmVcpuConfigureError; use crate::{Vcpu, VcpuConfig, Vm, logger}; @@ -253,10 +254,6 @@ fn configure_pvh( initrd: &Option, ) -> Result<(), ConfigurationError> { const XEN_HVM_START_MAGIC_VALUE: u32 = 0x336e_c578; - let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS); - let end_32bit_gap_start = GuestAddress(MMIO32_MEM_START); - let first_addr_past_64bits = GuestAddress(FIRST_ADDR_PAST_64BITS_MMIO); - let end_64bit_gap_start = GuestAddress(MMIO64_MEM_START); let himem_start = GuestAddress(layout::HIMEM_START); // Vector to hold modules (currently either empty or holding initrd). @@ -294,36 +291,21 @@ fn configure_pvh( type_: E820_RESERVED, ..Default::default() }); - let last_addr = guest_mem.last_addr(); - if last_addr > first_addr_past_64bits { - memmap.push(hvm_memmap_table_entry { - addr: first_addr_past_64bits.raw_value(), - size: last_addr.unchecked_offset_from(first_addr_past_64bits) + 1, - type_: MEMMAP_TYPE_RAM, - ..Default::default() - }); - } - - if last_addr > first_addr_past_32bits { + for region in guest_mem + .iter() + .filter(|region| region.region_type == GuestRegionType::Dram) + { + // the first 1MB is reserved for the kernel + let addr = max(himem_start, region.start_addr()); memmap.push(hvm_memmap_table_entry { - addr: first_addr_past_32bits.raw_value(), - size: (end_64bit_gap_start.unchecked_offset_from(first_addr_past_32bits)) - .min(last_addr.unchecked_offset_from(first_addr_past_32bits) + 1), + addr: addr.raw_value(), + size: region.last_addr().unchecked_offset_from(addr) + 1, type_: MEMMAP_TYPE_RAM, ..Default::default() }); } - memmap.push(hvm_memmap_table_entry { - addr: himem_start.raw_value(), - size: end_32bit_gap_start - .unchecked_offset_from(himem_start) - .min(last_addr.unchecked_offset_from(himem_start) + 1), - type_: MEMMAP_TYPE_RAM, - ..Default::default() - }); - // Construct the hvm_start_info structure and serialize it into // boot_params. This will be stored at PVH_INFO_START address, and %rbx // will be initialized to contain PVH_INFO_START prior to starting the @@ -368,10 +350,6 @@ fn configure_64bit_boot( const KERNEL_HDR_MAGIC: u32 = 0x5372_6448; const KERNEL_LOADER_OTHER: u8 = 0xff; const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x0100_0000; // Must be non-zero. - let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS); - let end_32bit_gap_start = GuestAddress(MMIO32_MEM_START); - let first_addr_past_64bits = GuestAddress(FIRST_ADDR_PAST_64BITS_MMIO); - let end_64bit_gap_start = GuestAddress(MMIO64_MEM_START); let himem_start = GuestAddress(layout::HIMEM_START); @@ -409,35 +387,20 @@ fn configure_64bit_boot( E820_RESERVED, )?; - let last_addr = guest_mem.last_addr(); - - if last_addr > first_addr_past_64bits { - add_e820_entry( - &mut params, - first_addr_past_64bits.raw_value(), - last_addr.unchecked_offset_from(first_addr_past_64bits) + 1, - E820_RAM, - )?; - } - - if last_addr > first_addr_past_32bits { + for region in guest_mem + .iter() + .filter(|region| region.region_type == GuestRegionType::Dram) + { + // the first 1MB is reserved for the kernel + let addr = max(himem_start, region.start_addr()); add_e820_entry( &mut params, - first_addr_past_32bits.raw_value(), - (end_64bit_gap_start.unchecked_offset_from(first_addr_past_32bits)) - .min(last_addr.unchecked_offset_from(first_addr_past_32bits) + 1), + addr.raw_value(), + region.last_addr().unchecked_offset_from(addr) + 1, E820_RAM, )?; } - add_e820_entry( - &mut params, - himem_start.raw_value(), - (last_addr.unchecked_offset_from(himem_start) + 1) - .min(end_32bit_gap_start.unchecked_offset_from(himem_start)), - E820_RAM, - )?; - LinuxBootConfigurator::write_bootparams( &BootParams::new(¶ms, GuestAddress(layout::ZERO_PAGE_START)), guest_mem, @@ -573,6 +536,7 @@ mod tests { use linux_loader::loader::bootparam::boot_e820_entry; use super::*; + use crate::arch::x86_64::layout::FIRST_ADDR_PAST_32BITS; use crate::test_utils::{arch_mem, single_region_mem}; use crate::utils::mib_to_bytes; use crate::vstate::resources::ResourceAllocator; From 3970df8cff7874d394a6d4a7d0e8a004041d7ee0 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 23 Sep 2025 15:42:38 +0100 Subject: [PATCH 5/8] refactor(resources): split allocate_guest_memory in 2 Split the actual allocation from the calculations to find the right ranges. This will be used to create the hotpluggable region in the future. Signed-off-by: Riccardo Mancini --- src/vmm/src/resources.rs | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 0d2f4bbed22..89fab1f8302 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -6,6 +6,7 @@ use std::path::PathBuf; use std::sync::{Arc, Mutex, MutexGuard}; use serde::{Deserialize, Serialize}; +use vm_memory::GuestAddress; use crate::cpu_config::templates::CustomCpuTemplate; use crate::device_manager::persist::SharedDeviceType; @@ -463,11 +464,14 @@ impl VmResources { Ok(()) } - /// Allocates guest memory in a configuration most appropriate for these [`VmResources`]. + /// Allocates the given guest memory regions. /// /// If vhost-user-blk devices are in use, allocates memfd-backed shared memory, otherwise /// prefers anonymous memory for performance reasons. - pub fn allocate_guest_memory(&self) -> Result, MemoryError> { + fn allocate_memory_regions( + &self, + regions: &[(GuestAddress, usize)], + ) -> Result, MemoryError> { let vhost_user_device_used = self .block .devices @@ -483,22 +487,27 @@ impl VmResources { // because that would require running a backend process. If in the future we converge to // a single way of backing guest memory for vhost-user and non-vhost-user cases, // that would not be worth the effort. - let regions = - crate::arch::arch_memory_regions(mib_to_bytes(self.machine_config.mem_size_mib)); if vhost_user_device_used { memory::memfd_backed( - regions.as_ref(), + regions, self.machine_config.track_dirty_pages, self.machine_config.huge_pages, ) } else { memory::anonymous( - regions.into_iter(), + regions.iter().copied(), self.machine_config.track_dirty_pages, self.machine_config.huge_pages, ) } } + + /// Allocates guest memory in a configuration most appropriate for these [`VmResources`]. + pub fn allocate_guest_memory(&self) -> Result, MemoryError> { + let regions = + crate::arch::arch_memory_regions(mib_to_bytes(self.machine_config.mem_size_mib)); + self.allocate_memory_regions(®ions) + } } impl From<&VmResources> for VmmConfig { From e410345abf4e0a561598526d2016833e6fb6df56 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Wed, 24 Sep 2025 11:48:03 +0100 Subject: [PATCH 6/8] refactor(balloon): move remove_range to GuestMemoryExtension Move the function to discard guest memory under the GuestMemoryExtension trait. This is done so that it could be reused in other parts of the code. Furthermore, this also changes it to use try_access, which is the correct way to access a range of guest memory as it deals with the possibility of it spanning multiple regions. Signed-off-by: Riccardo Mancini --- src/vmm/src/devices/virtio/balloon/device.rs | 13 +- src/vmm/src/devices/virtio/balloon/util.rs | 133 -------------- src/vmm/src/test_utils/mod.rs | 3 +- src/vmm/src/vstate/memory.rs | 175 ++++++++++++++++++- 4 files changed, 182 insertions(+), 142 deletions(-) diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 87a82c4fa9d..09006fb96ae 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -14,7 +14,7 @@ use super::super::ActivateError; use super::super::device::{DeviceState, VirtioDevice}; use super::super::queue::Queue; use super::metrics::METRICS; -use super::util::{compact_page_frame_numbers, remove_range}; +use super::util::compact_page_frame_numbers; use super::{ BALLOON_DEV_ID, BALLOON_NUM_QUEUES, BALLOON_QUEUE_SIZES, DEFLATE_INDEX, INFLATE_INDEX, MAX_PAGE_COMPACT_BUFFER, MAX_PAGES_IN_DESC, MIB_TO_4K_PAGES, STATS_INDEX, @@ -32,7 +32,9 @@ use crate::devices::virtio::queue::InvalidAvailIdx; use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::logger::IncMetric; use crate::utils::u64_to_usize; -use crate::vstate::memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemoryMmap}; +use crate::vstate::memory::{ + Address, ByteValued, Bytes, GuestAddress, GuestMemoryExtension, GuestMemoryMmap, +}; use crate::{impl_device_type, mem_size_mib}; const SIZE_OF_U32: usize = std::mem::size_of::(); @@ -335,10 +337,9 @@ impl Balloon { let guest_addr = GuestAddress(u64::from(page_frame_number) << VIRTIO_BALLOON_PFN_SHIFT); - if let Err(err) = remove_range( - mem, - (guest_addr, u64::from(range_len) << VIRTIO_BALLOON_PFN_SHIFT), - self.restored_from_file, + if let Err(err) = mem.discard_range( + guest_addr, + usize::try_from(range_len).unwrap() << VIRTIO_BALLOON_PFN_SHIFT, ) { error!("Error removing memory range: {:?}", err); } diff --git a/src/vmm/src/devices/virtio/balloon/util.rs b/src/vmm/src/devices/virtio/balloon/util.rs index 35ef69f972f..1d00891783e 100644 --- a/src/vmm/src/devices/virtio/balloon/util.rs +++ b/src/vmm/src/devices/virtio/balloon/util.rs @@ -65,57 +65,6 @@ pub(crate) fn compact_page_frame_numbers(v: &mut [u32]) -> Vec<(u32, u32)> { result } -pub(crate) fn remove_range( - guest_memory: &GuestMemoryMmap, - range: (GuestAddress, u64), - restored_from_file: bool, -) -> Result<(), RemoveRegionError> { - let (guest_address, range_len) = range; - - if let Some(region) = guest_memory.find_region(guest_address) { - if guest_address.0 + range_len > region.start_addr().0 + region.len() { - return Err(RemoveRegionError::MalformedRange); - } - let phys_address = guest_memory - .get_host_address(guest_address) - .map_err(|_| RemoveRegionError::AddressTranslation)?; - - // Mmap a new anonymous region over the present one in order to create a hole. - // This workaround is (only) needed after resuming from a snapshot because the guest memory - // is mmaped from file as private and there is no `madvise` flag that works for this case. - if restored_from_file { - // SAFETY: The address and length are known to be valid. - let ret = unsafe { - libc::mmap( - phys_address.cast(), - u64_to_usize(range_len), - libc::PROT_READ | libc::PROT_WRITE, - libc::MAP_FIXED | libc::MAP_ANONYMOUS | libc::MAP_PRIVATE, - -1, - 0, - ) - }; - if ret == libc::MAP_FAILED { - return Err(RemoveRegionError::MmapFail(io::Error::last_os_error())); - } - }; - - // Madvise the region in order to mark it as not used. - // SAFETY: The address and length are known to be valid. - let ret = unsafe { - let range_len = u64_to_usize(range_len); - libc::madvise(phys_address.cast(), range_len, libc::MADV_DONTNEED) - }; - if ret < 0 { - return Err(RemoveRegionError::MadviseFail(io::Error::last_os_error())); - } - - Ok(()) - } else { - Err(RemoveRegionError::RegionNotFound) - } -} - #[cfg(test)] mod tests { use std::fmt::Debug; @@ -174,88 +123,6 @@ mod tests { ); } - #[test] - fn test_remove_range() { - let page_size: usize = 0x1000; - let mem = single_region_mem(2 * page_size); - - // Fill the memory with ones. - let ones = vec![1u8; 2 * page_size]; - mem.write(&ones[..], GuestAddress(0)).unwrap(); - - // Remove the first page. - remove_range(&mem, (GuestAddress(0), page_size as u64), false).unwrap(); - - // Check that the first page is zeroed. - let mut actual_page = vec![0u8; page_size]; - mem.read(actual_page.as_mut_slice(), GuestAddress(0)) - .unwrap(); - assert_eq!(vec![0u8; page_size], actual_page); - // Check that the second page still contains ones. - mem.read(actual_page.as_mut_slice(), GuestAddress(page_size as u64)) - .unwrap(); - assert_eq!(vec![1u8; page_size], actual_page); - - // Malformed range: the len is too big. - assert_match!( - remove_range(&mem, (GuestAddress(0), 0x10000), false).unwrap_err(), - RemoveRegionError::MalformedRange - ); - - // Region not mapped. - assert_match!( - remove_range(&mem, (GuestAddress(0x10000), 0x10), false).unwrap_err(), - RemoveRegionError::RegionNotFound - ); - - // Madvise fail: the guest address is not aligned to the page size. - assert_match!( - remove_range(&mem, (GuestAddress(0x20), page_size as u64), false).unwrap_err(), - RemoveRegionError::MadviseFail(_) - ); - } - - #[test] - fn test_remove_range_on_restored() { - let page_size: usize = 0x1000; - let mem = single_region_mem(2 * page_size); - - // Fill the memory with ones. - let ones = vec![1u8; 2 * page_size]; - mem.write(&ones[..], GuestAddress(0)).unwrap(); - - // Remove the first page. - remove_range(&mem, (GuestAddress(0), page_size as u64), true).unwrap(); - - // Check that the first page is zeroed. - let mut actual_page = vec![0u8; page_size]; - mem.read(actual_page.as_mut_slice(), GuestAddress(0)) - .unwrap(); - assert_eq!(vec![0u8; page_size], actual_page); - // Check that the second page still contains ones. - mem.read(actual_page.as_mut_slice(), GuestAddress(page_size as u64)) - .unwrap(); - assert_eq!(vec![1u8; page_size], actual_page); - - // Malformed range: the len is too big. - assert_match!( - remove_range(&mem, (GuestAddress(0), 0x10000), true).unwrap_err(), - RemoveRegionError::MalformedRange - ); - - // Region not mapped. - assert_match!( - remove_range(&mem, (GuestAddress(0x10000), 0x10), true).unwrap_err(), - RemoveRegionError::RegionNotFound - ); - - // Mmap fail: the guest address is not aligned to the page size. - assert_match!( - remove_range(&mem, (GuestAddress(0x20), page_size as u64), true).unwrap_err(), - RemoveRegionError::MmapFail(_) - ); - } - /// ------------------------------------- /// BEGIN PROPERTY BASED TESTING use proptest::prelude::*; diff --git a/src/vmm/src/test_utils/mod.rs b/src/vmm/src/test_utils/mod.rs index 03b0ce16726..6fe66cdbadb 100644 --- a/src/vmm/src/test_utils/mod.rs +++ b/src/vmm/src/test_utils/mod.rs @@ -16,8 +16,7 @@ use crate::vm_memory_vendored::GuestRegionCollection; use crate::vmm_config::boot_source::BootSourceConfig; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::HugePageConfig; -use crate::vstate::memory; -use crate::vstate::memory::{GuestMemoryMmap, GuestRegionMmap, GuestRegionMmapExt}; +use crate::vstate::memory::{self, GuestMemoryMmap, GuestRegionMmap, GuestRegionMmapExt}; use crate::{EventManager, Vmm}; pub mod mock_resources; diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index 5fe49fa1b82..949ccd48372 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -10,6 +10,7 @@ use std::io::SeekFrom; use std::sync::Arc; use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, kvm_userspace_memory_region}; +use log::error; use serde::{Deserialize, Serialize}; pub use vm_memory::bitmap::{AtomicBitmap, BS, Bitmap, BitmapSlice}; pub use vm_memory::mmap::MmapRegionBuilder; @@ -106,6 +107,53 @@ impl GuestRegionMmapExt { userspace_addr: self.inner.as_ptr() as u64, } } + + pub(crate) fn discard_range( + &self, + caddr: MemoryRegionAddress, + len: usize, + ) -> Result { + let phys_address = self.get_host_address(caddr)?; + + // If and only if we are resuming from a snapshot file, we have a file and it's mapped + // private + if self.inner.file_offset().is_some() && self.inner.flags() & libc::MAP_PRIVATE != 0 { + // Mmap a new anonymous region over the present one in order to create a hole + // with zero pages. + // This workaround is (only) needed after resuming from a snapshot file because the + // guest memory is mmaped from file as private. In this case, MADV_DONTNEED on the + // file only drops any anonymous pages in range, but subsequent accesses would read + // whatever page is stored on the backing file. Mmapping anonymous pages ensures it's + // zeroed. + // SAFETY: The address and length are known to be valid. + let ret = unsafe { + libc::mmap( + phys_address.cast(), + len, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_FIXED | libc::MAP_ANONYMOUS | libc::MAP_PRIVATE, + -1, + 0, + ) + }; + if ret == libc::MAP_FAILED { + let os_error = std::io::Error::last_os_error(); + error!("discard_range: mmap failed: {:?}", os_error); + return Err(GuestMemoryError::IOError(os_error)); + } + } + + // Madvise the region in order to mark it as not used. + // SAFETY: The address and length are known to be valid. + let ret = unsafe { libc::madvise(phys_address.cast(), len, libc::MADV_DONTNEED) }; + if ret < 0 { + let os_error = std::io::Error::last_os_error(); + error!("discard_range: madvise failed: {:?}", os_error); + Err(GuestMemoryError::IOError(os_error)) + } else { + Ok(len) + } + } } #[allow(clippy::cast_possible_wrap)] @@ -253,6 +301,9 @@ where /// Store the dirty bitmap in internal store fn store_dirty_bitmap(&self, dirty_bitmap: &DirtyBitmap, page_size: usize); + + /// Discards a memory range, freeing up memory pages + fn discard_range(&self, addr: GuestAddress, range_len: usize) -> Result<(), GuestMemoryError>; } /// State of a guest memory region saved to file/buffer. @@ -406,6 +457,25 @@ impl GuestMemoryExtension for GuestMemoryMmap { } }); } + + fn discard_range(&self, addr: GuestAddress, range_len: usize) -> Result<(), GuestMemoryError> { + let res = self.try_access( + range_len, + addr, + |_offset, len, caddr, region| -> Result { + region.discard_range(caddr, len) + }, + ); + + match res { + Ok(count) if count == range_len => Ok(()), + Ok(count) => Err(GuestMemoryError::PartialBuffer { + expected: range_len, + completed: count, + }), + Err(e) => Err(e), + } + } } fn create_memfd( @@ -443,12 +513,13 @@ mod tests { #![allow(clippy::undocumented_unsafe_blocks)] use std::collections::HashMap; - use std::io::{Read, Seek}; + use std::io::{Read, Seek, Write}; use vmm_sys_util::tempfile::TempFile; use super::*; use crate::snapshot::Snapshot; + use crate::test_utils::single_region_mem; use crate::utils::{get_page_size, mib_to_bytes}; fn into_region_ext(regions: Vec) -> GuestMemoryMmap { @@ -825,4 +896,106 @@ mod tests { seals.insert(memfd::FileSeal::SealGrow); memfd.add_seals(&seals).unwrap_err(); } + + /// This asserts that $lhs matches $rhs. + macro_rules! assert_match { + ($lhs:expr, $rhs:pat) => {{ assert!(matches!($lhs, $rhs)) }}; + } + + #[test] + fn test_discard_range() { + let page_size: usize = 0x1000; + let mem = single_region_mem(2 * page_size); + + // Fill the memory with ones. + let ones = vec![1u8; 2 * page_size]; + mem.write(&ones[..], GuestAddress(0)).unwrap(); + + // Remove the first page. + mem.discard_range(GuestAddress(0), page_size).unwrap(); + + // Check that the first page is zeroed. + let mut actual_page = vec![0u8; page_size]; + mem.read(actual_page.as_mut_slice(), GuestAddress(0)) + .unwrap(); + assert_eq!(vec![0u8; page_size], actual_page); + // Check that the second page still contains ones. + mem.read(actual_page.as_mut_slice(), GuestAddress(page_size as u64)) + .unwrap(); + assert_eq!(vec![1u8; page_size], actual_page); + + // Malformed range: the len is too big. + assert_match!( + mem.discard_range(GuestAddress(0), 0x10000).unwrap_err(), + GuestMemoryError::PartialBuffer { .. } + ); + + // Region not mapped. + assert_match!( + mem.discard_range(GuestAddress(0x10000), 0x10).unwrap_err(), + GuestMemoryError::InvalidGuestAddress(_) + ); + + // Madvise fail: the guest address is not aligned to the page size. + assert_match!( + mem.discard_range(GuestAddress(0x20), page_size) + .unwrap_err(), + GuestMemoryError::IOError(_) + ); + } + + #[test] + fn test_discard_range_on_file() { + let page_size: usize = 0x1000; + let mut memory_file = TempFile::new().unwrap().into_file(); + memory_file.set_len(2 * page_size as u64).unwrap(); + memory_file.write_all(&vec![2u8; 2 * page_size]).unwrap(); + let mem = into_region_ext( + snapshot_file( + memory_file, + std::iter::once((GuestAddress(0), 2 * page_size)), + false, + ) + .unwrap(), + ); + + // Fill the memory with ones. + let ones = vec![1u8; 2 * page_size]; + mem.write(&ones[..], GuestAddress(0)).unwrap(); + + // Remove the first page. + mem.discard_range(GuestAddress(0), page_size).unwrap(); + + // Check that the first page is zeroed. + let mut actual_page = vec![0u8; page_size]; + mem.read(actual_page.as_mut_slice(), GuestAddress(0)) + .unwrap(); + assert_eq!(vec![0u8; page_size], actual_page); + // Check that the second page still contains ones. + mem.read(actual_page.as_mut_slice(), GuestAddress(page_size as u64)) + .unwrap(); + assert_eq!(vec![1u8; page_size], actual_page); + + // Malformed range: the len is too big. + assert_match!( + mem.discard_range(GuestAddress(0), 0x10000).unwrap_err(), + GuestMemoryError::PartialBuffer { + expected: _, + completed: _ + } + ); + + // Region not mapped. + assert_match!( + mem.discard_range(GuestAddress(0x10000), 0x10).unwrap_err(), + GuestMemoryError::InvalidGuestAddress(_) + ); + + // Mmap fail: the guest address is not aligned to the page size. + assert_match!( + mem.discard_range(GuestAddress(0x20), page_size) + .unwrap_err(), + GuestMemoryError::IOError(_) + ); + } } From 0ad563a31456968454b40f154e2ca4512c573122 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Wed, 24 Sep 2025 11:57:36 +0100 Subject: [PATCH 7/8] refactor(balloon): do not madvise(MADV_DONTNEED) when resuming from file This operation is completely useless after we mmap-ed anonymous private zero pages on top of the file mapping. Also, add a TODO for memfd-backed memory, as it's currently not supported using madvise. Signed-off-by: Riccardo Mancini --- src/vmm/src/vstate/memory.rs | 81 ++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index 949ccd48372..1bf7cda6342 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -115,43 +115,52 @@ impl GuestRegionMmapExt { ) -> Result { let phys_address = self.get_host_address(caddr)?; - // If and only if we are resuming from a snapshot file, we have a file and it's mapped - // private - if self.inner.file_offset().is_some() && self.inner.flags() & libc::MAP_PRIVATE != 0 { - // Mmap a new anonymous region over the present one in order to create a hole - // with zero pages. - // This workaround is (only) needed after resuming from a snapshot file because the - // guest memory is mmaped from file as private. In this case, MADV_DONTNEED on the - // file only drops any anonymous pages in range, but subsequent accesses would read - // whatever page is stored on the backing file. Mmapping anonymous pages ensures it's - // zeroed. - // SAFETY: The address and length are known to be valid. - let ret = unsafe { - libc::mmap( - phys_address.cast(), - len, - libc::PROT_READ | libc::PROT_WRITE, - libc::MAP_FIXED | libc::MAP_ANONYMOUS | libc::MAP_PRIVATE, - -1, - 0, - ) - }; - if ret == libc::MAP_FAILED { - let os_error = std::io::Error::last_os_error(); - error!("discard_range: mmap failed: {:?}", os_error); - return Err(GuestMemoryError::IOError(os_error)); + match (self.inner.file_offset(), self.inner.flags()) { + // If and only if we are resuming from a snapshot file, we have a file and it's mapped + // private + (Some(_), flags) if flags & libc::MAP_PRIVATE != 0 => { + // Mmap a new anonymous region over the present one in order to create a hole + // with zero pages. + // This workaround is (only) needed after resuming from a snapshot file because the + // guest memory is mmaped from file as private. In this case, MADV_DONTNEED on the + // file only drops any anonymous pages in range, but subsequent accesses would read + // whatever page is stored on the backing file. Mmapping anonymous pages ensures + // it's zeroed. + // SAFETY: The address and length are known to be valid. + let ret = unsafe { + libc::mmap( + phys_address.cast(), + len, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_FIXED | libc::MAP_ANONYMOUS | libc::MAP_PRIVATE, + -1, + 0, + ) + }; + if ret == libc::MAP_FAILED { + let os_error = std::io::Error::last_os_error(); + error!("discard_range: mmap failed: {:?}", os_error); + Err(GuestMemoryError::IOError(os_error)) + } else { + Ok(len) + } + } + // TODO: madvise(MADV_DONTNEED) doesn't actually work with memfd + // (or in general MAP_SHARED of a fd). In those cases we should use + // fallocate64(FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE). + // We keep falling to the madvise branch to keep the previous behaviour. + (None, _) | (Some(_), _) => { + // Madvise the region in order to mark it as not used. + // SAFETY: The address and length are known to be valid. + let ret = unsafe { libc::madvise(phys_address.cast(), len, libc::MADV_DONTNEED) }; + if ret < 0 { + let os_error = std::io::Error::last_os_error(); + error!("discard_range: madvise failed: {:?}", os_error); + Err(GuestMemoryError::IOError(os_error)) + } else { + Ok(len) + } } - } - - // Madvise the region in order to mark it as not used. - // SAFETY: The address and length are known to be valid. - let ret = unsafe { libc::madvise(phys_address.cast(), len, libc::MADV_DONTNEED) }; - if ret < 0 { - let os_error = std::io::Error::last_os_error(); - error!("discard_range: madvise failed: {:?}", os_error); - Err(GuestMemoryError::IOError(os_error)) - } else { - Ok(len) } } } From fc5c4b0cf9fd818dc8ac1df7a475c02161a4e520 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Wed, 24 Sep 2025 12:11:42 +0100 Subject: [PATCH 8/8] refactor(balloon): remove unused restored_from_file This flag was used for determining how to discard memory, but we previously changed that logic to look at the region mmap and perform the right action depending on that. This commit removes all references to this flag. Signed-off-by: Riccardo Mancini --- src/vmm/src/builder.rs | 1 - src/vmm/src/device_manager/mod.rs | 4 --- src/vmm/src/device_manager/pci_mngr.rs | 8 +----- src/vmm/src/device_manager/persist.rs | 7 +---- src/vmm/src/devices/virtio/balloon/device.rs | 27 +++++++++---------- .../devices/virtio/balloon/event_handler.rs | 2 +- src/vmm/src/devices/virtio/balloon/persist.rs | 16 +++-------- src/vmm/src/resources.rs | 2 +- src/vmm/src/vmm_config/balloon.rs | 5 +--- 9 files changed, 20 insertions(+), 52 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 9a91daf60d1..aa9c9590dfc 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -462,7 +462,6 @@ pub fn build_microvm_from_snapshot( event_manager, vm_resources, instance_id: &instance_info.id, - restored_from_file: uffd.is_none(), vcpus_exit_evt: &vcpus_exit_evt, }; #[allow(unused_mut)] diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index d7052422a3a..5cdb6ad80ca 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -439,7 +439,6 @@ pub struct DeviceRestoreArgs<'a> { pub vcpus_exit_evt: &'a EventFd, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, - pub restored_from_file: bool, } impl std::fmt::Debug for DeviceRestoreArgs<'_> { @@ -449,7 +448,6 @@ impl std::fmt::Debug for DeviceRestoreArgs<'_> { .field("vm", &self.vm) .field("vm_resources", &self.vm_resources) .field("instance_id", &self.instance_id) - .field("restored_from_file", &self.restored_from_file) .finish() } } @@ -487,7 +485,6 @@ impl<'a> Persist<'a> for DeviceManager { event_manager: constructor_args.event_manager, vm_resources: constructor_args.vm_resources, instance_id: constructor_args.instance_id, - restored_from_file: constructor_args.restored_from_file, }; let mmio_devices = MMIODeviceManager::restore(mmio_ctor_args, &state.mmio_state)?; @@ -505,7 +502,6 @@ impl<'a> Persist<'a> for DeviceManager { mem: constructor_args.mem, vm_resources: constructor_args.vm_resources, instance_id: constructor_args.instance_id, - restored_from_file: constructor_args.restored_from_file, event_manager: constructor_args.event_manager, }; let pci_devices = PciDevices::restore(pci_ctor_args, &state.pci_state)?; diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index b8371d82a83..a600c4e19b3 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -247,7 +247,6 @@ pub struct PciDevicesConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, - pub restored_from_file: bool, pub event_manager: &'a mut EventManager, } @@ -258,7 +257,6 @@ impl<'a> Debug for PciDevicesConstructorArgs<'a> { .field("mem", &self.mem) .field("vm_resources", &self.vm_resources) .field("instance_id", &self.instance_id) - .field("restored_from_file", &self.restored_from_file) .finish() } } @@ -410,10 +408,7 @@ impl<'a> Persist<'a> for PciDevices { if let Some(balloon_state) = &state.balloon_device { let device = Arc::new(Mutex::new( Balloon::restore( - BalloonConstructorArgs { - mem: mem.clone(), - restored_from_file: constructor_args.restored_from_file, - }, + BalloonConstructorArgs { mem: mem.clone() }, &balloon_state.device_state, ) .unwrap(), @@ -668,7 +663,6 @@ mod tests { mem: vmm.vm.guest_memory(), vm_resources, instance_id: "microvm-id", - restored_from_file: true, event_manager: &mut event_manager, }; let _restored_dev_manager = diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index d6d46fff0f5..4414ddddd07 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -144,7 +144,6 @@ pub struct MMIODevManagerConstructorArgs<'a> { pub event_manager: &'a mut EventManager, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, - pub restored_from_file: bool, } impl fmt::Debug for MMIODevManagerConstructorArgs<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -412,10 +411,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { if let Some(balloon_state) = &state.balloon_device { let device = Arc::new(Mutex::new(Balloon::restore( - BalloonConstructorArgs { - mem: mem.clone(), - restored_from_file: constructor_args.restored_from_file, - }, + BalloonConstructorArgs { mem: mem.clone() }, &balloon_state.device_state, )?)); @@ -686,7 +682,6 @@ mod tests { event_manager: &mut event_manager, vm_resources, instance_id: "microvm-id", - restored_from_file: true, }; let _restored_dev_manager = MMIODeviceManager::restore(restore_args, &device_manager_state.mmio_state).unwrap(); diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 09006fb96ae..f0ecf77bc9e 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -173,7 +173,6 @@ pub struct Balloon { pub(crate) device_state: DeviceState, // Implementation specific fields. - pub(crate) restored_from_file: bool, pub(crate) stats_polling_interval_s: u16, pub(crate) stats_timer: TimerFd, // The index of the previous stats descriptor is saved because @@ -190,7 +189,6 @@ impl Balloon { amount_mib: u32, deflate_on_oom: bool, stats_polling_interval_s: u16, - restored_from_file: bool, ) -> Result { let mut avail_features = 1u64 << VIRTIO_F_VERSION_1; @@ -230,7 +228,6 @@ impl Balloon { queues, device_state: DeviceState::Inactive, activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?, - restored_from_file, stats_polling_interval_s, stats_timer, stats_desc_index: None, @@ -740,7 +737,7 @@ pub(crate) mod tests { // Test all feature combinations. for deflate_on_oom in [true, false].iter() { for stats_interval in [0, 1].iter() { - let mut balloon = Balloon::new(0, *deflate_on_oom, *stats_interval, false).unwrap(); + let mut balloon = Balloon::new(0, *deflate_on_oom, *stats_interval).unwrap(); assert_eq!(balloon.device_type(), VIRTIO_ID_BALLOON); let features: u64 = (1u64 << VIRTIO_F_VERSION_1) @@ -767,7 +764,7 @@ pub(crate) mod tests { #[test] fn test_virtio_read_config() { - let balloon = Balloon::new(0x10, true, 0, false).unwrap(); + let balloon = Balloon::new(0x10, true, 0).unwrap(); let cfg = BalloonConfig { amount_mib: 16, @@ -801,7 +798,7 @@ pub(crate) mod tests { #[test] fn test_virtio_write_config() { - let mut balloon = Balloon::new(0, true, 0, false).unwrap(); + let mut balloon = Balloon::new(0, true, 0).unwrap(); let expected_config_space: [u8; BALLOON_CONFIG_SPACE_SIZE] = [0x00, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]; @@ -827,7 +824,7 @@ pub(crate) mod tests { #[test] fn test_invalid_request() { - let mut balloon = Balloon::new(0, true, 0, false).unwrap(); + let mut balloon = Balloon::new(0, true, 0).unwrap(); let mem = default_mem(); let interrupt = default_interrupt(); // Only initialize the inflate queue to demonstrate invalid request handling. @@ -888,7 +885,7 @@ pub(crate) mod tests { #[test] fn test_inflate() { - let mut balloon = Balloon::new(0, true, 0, false).unwrap(); + let mut balloon = Balloon::new(0, true, 0).unwrap(); let mem = default_mem(); let interrupt = default_interrupt(); let infq = VirtQueue::new(GuestAddress(0), &mem, 16); @@ -960,7 +957,7 @@ pub(crate) mod tests { #[test] fn test_deflate() { - let mut balloon = Balloon::new(0, true, 0, false).unwrap(); + let mut balloon = Balloon::new(0, true, 0).unwrap(); let mem = default_mem(); let interrupt = default_interrupt(); let defq = VirtQueue::new(GuestAddress(0), &mem, 16); @@ -1010,7 +1007,7 @@ pub(crate) mod tests { #[test] fn test_stats() { - let mut balloon = Balloon::new(0, true, 1, false).unwrap(); + let mut balloon = Balloon::new(0, true, 1).unwrap(); let mem = default_mem(); let interrupt = default_interrupt(); let statsq = VirtQueue::new(GuestAddress(0), &mem, 16); @@ -1102,7 +1099,7 @@ pub(crate) mod tests { #[test] fn test_process_balloon_queues() { - let mut balloon = Balloon::new(0x10, true, 0, false).unwrap(); + let mut balloon = Balloon::new(0x10, true, 0).unwrap(); let mem = default_mem(); let interrupt = default_interrupt(); let infq = VirtQueue::new(GuestAddress(0), &mem, 16); @@ -1117,7 +1114,7 @@ pub(crate) mod tests { #[test] fn test_update_stats_interval() { - let mut balloon = Balloon::new(0, true, 0, false).unwrap(); + let mut balloon = Balloon::new(0, true, 0).unwrap(); let mem = default_mem(); let q = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, q.create_queue()); @@ -1130,7 +1127,7 @@ pub(crate) mod tests { ); balloon.update_stats_polling_interval(0).unwrap(); - let mut balloon = Balloon::new(0, true, 1, false).unwrap(); + let mut balloon = Balloon::new(0, true, 1).unwrap(); let mem = default_mem(); let q = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, q.create_queue()); @@ -1148,14 +1145,14 @@ pub(crate) mod tests { #[test] fn test_cannot_update_inactive_device() { - let mut balloon = Balloon::new(0, true, 0, false).unwrap(); + let mut balloon = Balloon::new(0, true, 0).unwrap(); // Assert that we can't update an inactive device. balloon.update_size(1).unwrap_err(); } #[test] fn test_num_pages() { - let mut balloon = Balloon::new(0, true, 0, false).unwrap(); + let mut balloon = Balloon::new(0, true, 0).unwrap(); // Switch the state to active. balloon.device_state = DeviceState::Activated(ActiveState { mem: single_region_mem(32 << 20), diff --git a/src/vmm/src/devices/virtio/balloon/event_handler.rs b/src/vmm/src/devices/virtio/balloon/event_handler.rs index 3922b4b8385..6fdd00c434c 100644 --- a/src/vmm/src/devices/virtio/balloon/event_handler.rs +++ b/src/vmm/src/devices/virtio/balloon/event_handler.rs @@ -142,7 +142,7 @@ pub mod tests { #[test] fn test_event_handler() { let mut event_manager = EventManager::new().unwrap(); - let mut balloon = Balloon::new(0, true, 10, false).unwrap(); + let mut balloon = Balloon::new(0, true, 10).unwrap(); let mem = default_mem(); let interrupt = default_interrupt(); let infq = VirtQueue::new(GuestAddress(0), &mem, 16); diff --git a/src/vmm/src/devices/virtio/balloon/persist.rs b/src/vmm/src/devices/virtio/balloon/persist.rs index c5fa227c620..e92356c394e 100644 --- a/src/vmm/src/devices/virtio/balloon/persist.rs +++ b/src/vmm/src/devices/virtio/balloon/persist.rs @@ -95,7 +95,6 @@ pub struct BalloonState { pub struct BalloonConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, - pub restored_from_file: bool, } impl Persist<'_> for Balloon { @@ -122,12 +121,7 @@ impl Persist<'_> for Balloon { ) -> Result { // We can safely create the balloon with arbitrary flags and // num_pages because we will overwrite them after. - let mut balloon = Balloon::new( - 0, - false, - state.stats_polling_interval_s, - constructor_args.restored_from_file, - )?; + let mut balloon = Balloon::new(0, false, state.stats_polling_interval_s)?; let mut num_queues = BALLOON_NUM_QUEUES; // As per the virtio 1.1 specification, the statistics queue @@ -184,7 +178,7 @@ mod tests { let mut mem = vec![0; 4096]; // Create and save the balloon device. - let balloon = Balloon::new(0x42, false, 2, false).unwrap(); + let balloon = Balloon::new(0x42, false, 2).unwrap(); Snapshot::new(balloon.save()) .save(&mut mem.as_mut_slice()) @@ -192,10 +186,7 @@ mod tests { // Deserialize and restore the balloon device. let restored_balloon = Balloon::restore( - BalloonConstructorArgs { - mem: guest_mem, - restored_from_file: true, - }, + BalloonConstructorArgs { mem: guest_mem }, &Snapshot::load_without_crc_check(mem.as_slice()) .unwrap() .data, @@ -203,7 +194,6 @@ mod tests { .unwrap(); assert_eq!(restored_balloon.device_type(), VIRTIO_ID_BALLOON); - assert!(restored_balloon.restored_from_file); assert_eq!(restored_balloon.acked_features, balloon.acked_features); assert_eq!(restored_balloon.avail_features, balloon.avail_features); diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 89fab1f8302..7d2318fa14d 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -1520,7 +1520,7 @@ mod tests { .unwrap(); let err = vm_resources .update_from_restored_device(SharedDeviceType::Balloon(Arc::new(Mutex::new( - Balloon::new(128, false, 0, true).unwrap(), + Balloon::new(128, false, 0).unwrap(), )))) .unwrap_err(); assert!( diff --git a/src/vmm/src/vmm_config/balloon.rs b/src/vmm/src/vmm_config/balloon.rs index 83d419c49db..e56430d6dc6 100644 --- a/src/vmm/src/vmm_config/balloon.rs +++ b/src/vmm/src/vmm_config/balloon.rs @@ -88,9 +88,6 @@ impl BalloonBuilder { cfg.amount_mib, cfg.deflate_on_oom, cfg.stats_polling_interval_s, - // `restored` flag is false because this code path - // is never called by snapshot restore functionality. - false, )?))); Ok(()) @@ -178,7 +175,7 @@ pub(crate) mod tests { #[test] fn test_set_device() { let mut builder = BalloonBuilder::new(); - let balloon = Balloon::new(0, true, 0, true).unwrap(); + let balloon = Balloon::new(0, true, 0).unwrap(); builder.set_device(Arc::new(Mutex::new(balloon))); assert!(builder.inner.is_some()); }