From 655fead81dd4b70b6c2d0a7eb9846ad8ecdc67e9 Mon Sep 17 00:00:00 2001 From: "tao.gan" Date: Tue, 21 Apr 2026 22:28:45 +0800 Subject: [PATCH 1/2] [vm]: prune stale device metadata on startVm afterReceiveVmDeviceInfoResponse was upsert-only, leaving VmInstanceResourceMetadataVO rows for devices (e.g. cdrom) detached while the VM was stopped. Add pruneStaleDeviceMetadata and invoke it after each start/sync so DB matches the libvirt domain. Rows with non-null metadataClass are preserved. Resolves: ZSV-11374 Change-Id: I687463656770616d6967746b6270767366756a64 --- ...VmInstanceResourceMetadataManagerImpl.java | 56 ++++++++++++++ .../VmInstanceResourceMetadataManager.java | 17 +++++ .../VirtualPciDeviceKvmExtensionPoint.java | 76 +++++++++++++++---- 3 files changed, 133 insertions(+), 16 deletions(-) diff --git a/compute/src/main/java/org/zstack/compute/vm/devices/VmInstanceResourceMetadataManagerImpl.java b/compute/src/main/java/org/zstack/compute/vm/devices/VmInstanceResourceMetadataManagerImpl.java index 3d78c146588..0c17a86ce6a 100644 --- a/compute/src/main/java/org/zstack/compute/vm/devices/VmInstanceResourceMetadataManagerImpl.java +++ b/compute/src/main/java/org/zstack/compute/vm/devices/VmInstanceResourceMetadataManagerImpl.java @@ -22,8 +22,11 @@ import java.util.ArrayList; import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; import static org.zstack.core.Platform.operr; @@ -343,6 +346,59 @@ public void updateVmResourceMetadataDeviceAddress(String vmInstanceUuid, String createOrUpdateVmResourceMetadata(resourceUuid, DeviceAddress.fromString(deviceAddress), vmInstanceUuid, null, null); } + @Override + public int pruneStaleDeviceMetadata(String vmInstanceUuid, Set survivingResourceUuids) { + if (deviceAddressRecordingDisabled()) { + return 0; + } + if (vmInstanceUuid == null) { + return 0; + } + + List candidates = Q.New(VmInstanceResourceMetadataVO.class) + .eq(VmInstanceResourceMetadataVO_.vmInstanceUuid, vmInstanceUuid) + .isNull(VmInstanceResourceMetadataVO_.metadataClass) + .list(); + if (candidates.isEmpty()) { + return 0; + } + + Set alwaysKeep = new HashSet<>(); + // double-guard: the vmXml archive row uses vmInstanceUuid as resourceUuid, + // normally it has non-null metadataClass so it never reaches here. + alwaysKeep.add(vmInstanceUuid); + alwaysKeep.add(MEM_BALLOON_UUID); + alwaysKeep.add(RESOURCE_CONFIG_UUID); + alwaysKeep.add(GUEST_TOOLS_RESOURCE_CONFIG_UUID); + + Set surviving = survivingResourceUuids == null + ? Collections.emptySet() : survivingResourceUuids; + + List toDelete = candidates.stream() + .map(VmInstanceResourceMetadataVO::getResourceUuid) + .filter(ru -> !alwaysKeep.contains(ru)) + .filter(ru -> !surviving.contains(ru)) + .collect(Collectors.toList()); + + if (toDelete.isEmpty()) { + return 0; + } + + logger.debug(String.format( + "prune stale VmInstanceResourceMetadataVO for vm[%s], resourceUuids=%s", + vmInstanceUuid, toDelete)); + + SQL.New("delete from VmInstanceResourceMetadataVO v" + + " where v.vmInstanceUuid = :vmUuid" + + " and v.metadataClass is null" + + " and v.resourceUuid in (:uuids)") + .param("vmUuid", vmInstanceUuid) + .param("uuids", toDelete) + .execute(); + + return toDelete.size(); + } + @Override public void deleteArchiveVmInstanceResourceMetadataGroup(String archiveForResourceUuid) { SQL.New(VmInstanceResourceMetadataGroupVO.class).eq(VmInstanceResourceMetadataGroupVO_.resourceUuid, archiveForResourceUuid).hardDelete(); diff --git a/header/src/main/java/org/zstack/header/vm/devices/VmInstanceResourceMetadataManager.java b/header/src/main/java/org/zstack/header/vm/devices/VmInstanceResourceMetadataManager.java index 29f2f81070d..d01533c858c 100644 --- a/header/src/main/java/org/zstack/header/vm/devices/VmInstanceResourceMetadataManager.java +++ b/header/src/main/java/org/zstack/header/vm/devices/VmInstanceResourceMetadataManager.java @@ -4,6 +4,7 @@ import java.util.List; import java.util.Map; +import java.util.Set; public interface VmInstanceResourceMetadataManager { String MEM_BALLOON_UUID = "4780bf6d2fa65700f22e36c27e8ff05c"; @@ -151,4 +152,20 @@ public interface VmInstanceResourceMetadataManager { List getArchivedResourceMetadataInfoFromArchiveForResourceUuid(String vmInstanceUuid, String archiveForResourceUuid, String metadataClass); void updateVmResourceMetadataDeviceAddress(String vmInstanceUuid, String resourceUuid, String deviceAddress); + + /** + * Drop "address-only" rows (metadataClass IS NULL) of this VM whose + * resourceUuid is NOT in survivingResourceUuids. Used by start-vm / sync-vm-device-info + * extension point to reconcile DB state with the libvirt domain actually running. + * + * Always preserved: + * - rows with non-null metadataClass (memorySnapshot / guesttools / ArchiveVmBundle ...) + * - row whose resourceUuid == vmInstanceUuid (vmXml archive, as a double-guard) + * - MEM_BALLOON_UUID / RESOURCE_CONFIG_UUID / GUEST_TOOLS_RESOURCE_CONFIG_UUID + * + * @param vmInstanceUuid VM uuid + * @param survivingResourceUuids resourceUuids that still exist in the libvirt domain + * @return number of rows deleted + */ + int pruneStaleDeviceMetadata(String vmInstanceUuid, Set survivingResourceUuids); } diff --git a/plugin/kvm/src/main/java/org/zstack/kvm/VirtualPciDeviceKvmExtensionPoint.java b/plugin/kvm/src/main/java/org/zstack/kvm/VirtualPciDeviceKvmExtensionPoint.java index e7c04f4e989..80ebc9c1f87 100644 --- a/plugin/kvm/src/main/java/org/zstack/kvm/VirtualPciDeviceKvmExtensionPoint.java +++ b/plugin/kvm/src/main/java/org/zstack/kvm/VirtualPciDeviceKvmExtensionPoint.java @@ -13,6 +13,9 @@ import org.zstack.utils.gson.JSONObjectUtil; import org.zstack.utils.logging.CLogger; +import java.util.HashSet; +import java.util.Set; + public class VirtualPciDeviceKvmExtensionPoint implements KVMStartVmExtensionPoint, KVMSyncVmDeviceInfoExtensionPoint { private static final CLogger logger = Utils.getLogger(VirtualPciDeviceKvmExtensionPoint.class); @@ -77,7 +80,11 @@ private String getVmXml(String vmUuid) { public void afterReceiveVmDeviceInfoResponse(VmInstanceInventory vm, KVMAgentCommands.VmDevicesInfoResponse rsp, VmInstanceSpec spec) { String vmUuid = spec != null ? spec.getVmInventory().getUuid() : vm.getUuid(); + Set surviving = new HashSet<>(); + vidManager.saveVmXmlMetadata(rsp.getVmXml(), vmUuid); + // the vmXml row uses vmUuid as resourceUuid; keep it + surviving.add(vmUuid); // only update pci address, metadata is not mandatory in normal usage // check its usage when create snapshot or backup @@ -85,30 +92,67 @@ public void afterReceiveVmDeviceInfoResponse(VmInstanceInventory vm, KVMAgentCom rsp.getVirtualDeviceInfoList().forEach(info -> { if (info.isValid()) { vidManager.createOrUpdateVmResourceMetadata(info, vmUuid); + surviving.add(info.getResourceUuid()); } }); } - if (rsp.getNicInfos() == null) { - return; - } - - rsp.getNicInfos().forEach(info -> { - VmNicInventory nic = (spec != null ? spec.getDestNics() : vm.getVmNics()) - .stream() - .filter(vmNicInventory -> vmNicInventory.getMac().equals(info.getMacAddress())) - .findFirst() - .orElse(null); - if (nic == null) { - return; - } + if (rsp.getNicInfos() != null) { + rsp.getNicInfos().forEach(info -> { + VmNicInventory nic = (spec != null ? spec.getDestNics() : vm.getVmNics()) + .stream() + .filter(vmNicInventory -> vmNicInventory.getMac().equals(info.getMacAddress())) + .findFirst() + .orElse(null); + if (nic == null) { + return; + } - vidManager.createOrUpdateVmResourceMetadata(new VirtualDeviceInfo(nic.getUuid(), info.getDeviceAddress()), vmUuid); - }); + vidManager.createOrUpdateVmResourceMetadata(new VirtualDeviceInfo(nic.getUuid(), info.getDeviceAddress()), vmUuid); + surviving.add(nic.getUuid()); + }); + } - if (!StringUtils.isEmpty(rsp.getMemBalloonInfo().getDeviceAddress().toString())) { + if (rsp.getMemBalloonInfo() != null + && rsp.getMemBalloonInfo().getDeviceAddress() != null + && !StringUtils.isEmpty(rsp.getMemBalloonInfo().getDeviceAddress().toString())) { vidManager.createOrUpdateVmResourceMetadata(new VirtualDeviceInfo(vidManager.MEM_BALLOON_UUID, DeviceAddress.fromString(rsp.getMemBalloonInfo().getDeviceAddress().toString())), vmUuid); + surviving.add(vidManager.MEM_BALLOON_UUID); + } + + // differential prune: drop stale address-only rows (metadataClass IS NULL) + // whose resourceUuid is no longer present in the libvirt domain. This keeps + // DB state consistent after devices like cdrom get detached/deleted while + // the VM was stopped. Rows written by other extension points (with non-null + // metadataClass) are preserved. + // + // Guard: only prune when the agent response actually carries an + // authoritative device snapshot. A response with all three device + // fields empty/null is not authoritative (e.g. certain mocked or + // partial agent replies) and must not be used as the basis for prune, + // otherwise legitimate metadata rows get wiped. + boolean hasAuthoritativeDeviceInfo = + (rsp.getVirtualDeviceInfoList() != null && !rsp.getVirtualDeviceInfoList().isEmpty()) + || (rsp.getNicInfos() != null && !rsp.getNicInfos().isEmpty()) + || (rsp.getMemBalloonInfo() != null + && rsp.getMemBalloonInfo().getDeviceAddress() != null + && !StringUtils.isEmpty(rsp.getMemBalloonInfo().getDeviceAddress().toString())); + if (!hasAuthoritativeDeviceInfo) { + logger.debug(String.format( + "skip pruneStaleDeviceMetadata for vm[%s]: response has no authoritative device info", + vmUuid)); + return; + } + try { + int pruned = vidManager.pruneStaleDeviceMetadata(vmUuid, surviving); + if (pruned > 0) { + logger.debug(String.format( + "pruned %d stale VmInstanceResourceMetadataVO row(s) for vm[%s]", + pruned, vmUuid)); + } + } catch (Exception e) { + logger.warn(String.format("failed to prune stale device metadata for vm[%s]", vmUuid), e); } } From 486fb4eeb574de56bca9cdf6c064d812c598dea0 Mon Sep 17 00:00:00 2001 From: "tao.gan" Date: Wed, 22 Apr 2026 17:27:49 +0800 Subject: [PATCH 2/2] [vm]: address review on prune stale device metadata 1. hasAuthoritativeDeviceInfo now tracks actually valid entries observed from the agent response instead of raw list non-emptiness, preventing a reply with all-invalid VirtualDeviceInfo from triggering prune with only vmUuid in surviving. 2. When rsp.nicInfos is null, populate surviving with every known nic uuid from spec/vm so the prune step cannot drop their address rows as collateral damage. 3. Extract ALWAYS_KEEP_UUIDS to a static final Set. 4. Remove stray uncompilable line left in prune method. Resolves: ZSV-11374 Change-Id: Ieb86e65494b0d957ef00267f4013605170f35adb --- ...VmInstanceResourceMetadataManagerImpl.java | 41 ++++++++------ .../VmInstanceResourceMetadataManager.java | 16 +++--- .../VirtualPciDeviceKvmExtensionPoint.java | 53 +++++++++++-------- 3 files changed, 65 insertions(+), 45 deletions(-) diff --git a/compute/src/main/java/org/zstack/compute/vm/devices/VmInstanceResourceMetadataManagerImpl.java b/compute/src/main/java/org/zstack/compute/vm/devices/VmInstanceResourceMetadataManagerImpl.java index 0c17a86ce6a..c88290fa26b 100644 --- a/compute/src/main/java/org/zstack/compute/vm/devices/VmInstanceResourceMetadataManagerImpl.java +++ b/compute/src/main/java/org/zstack/compute/vm/devices/VmInstanceResourceMetadataManagerImpl.java @@ -32,6 +32,19 @@ public class VmInstanceResourceMetadataManagerImpl implements VmInstanceResourceMetadataManager { private static final CLogger logger = Utils.getLogger(VmInstanceResourceMetadataManagerImpl.class); + + // Synthetic resourceUuids that must survive pruneStaleDeviceMetadata + // regardless of what the libvirt snapshot reports. These rows are owned + // by other subsystems (memory balloon, resource config, guest tools). + private static final Set ALWAYS_KEEP_UUIDS; + static { + Set s = new HashSet<>(); + s.add(MEM_BALLOON_UUID); + s.add(RESOURCE_CONFIG_UUID); + s.add(GUEST_TOOLS_RESOURCE_CONFIG_UUID); + ALWAYS_KEEP_UUIDS = Collections.unmodifiableSet(s); + } + @Autowired private DatabaseFacade dbf; @@ -357,26 +370,23 @@ public int pruneStaleDeviceMetadata(String vmInstanceUuid, Set surviving List candidates = Q.New(VmInstanceResourceMetadataVO.class) .eq(VmInstanceResourceMetadataVO_.vmInstanceUuid, vmInstanceUuid) - .isNull(VmInstanceResourceMetadataVO_.metadataClass) .list(); if (candidates.isEmpty()) { return 0; } - Set alwaysKeep = new HashSet<>(); - // double-guard: the vmXml archive row uses vmInstanceUuid as resourceUuid, - // normally it has non-null metadataClass so it never reaches here. - alwaysKeep.add(vmInstanceUuid); - alwaysKeep.add(MEM_BALLOON_UUID); - alwaysKeep.add(RESOURCE_CONFIG_UUID); - alwaysKeep.add(GUEST_TOOLS_RESOURCE_CONFIG_UUID); - + // Whitelist: always preserved regardless of surviving set. + // - vmInstanceUuid: vmXml archive row + // - MEM_BALLOON / RESOURCE_CONFIG / GUEST_TOOLS: synthetic uuids + // owned by other subsystems, never reported by the libvirt + // device snapshot. Set surviving = survivingResourceUuids == null ? Collections.emptySet() : survivingResourceUuids; List toDelete = candidates.stream() .map(VmInstanceResourceMetadataVO::getResourceUuid) - .filter(ru -> !alwaysKeep.contains(ru)) + .filter(ru -> !ALWAYS_KEEP_UUIDS.contains(ru)) + .filter(ru -> !vmInstanceUuid.equals(ru)) .filter(ru -> !surviving.contains(ru)) .collect(Collectors.toList()); @@ -388,13 +398,10 @@ public int pruneStaleDeviceMetadata(String vmInstanceUuid, Set surviving "prune stale VmInstanceResourceMetadataVO for vm[%s], resourceUuids=%s", vmInstanceUuid, toDelete)); - SQL.New("delete from VmInstanceResourceMetadataVO v" + - " where v.vmInstanceUuid = :vmUuid" + - " and v.metadataClass is null" + - " and v.resourceUuid in (:uuids)") - .param("vmUuid", vmInstanceUuid) - .param("uuids", toDelete) - .execute(); + SQL.New(VmInstanceResourceMetadataVO.class) + .eq(VmInstanceResourceMetadataVO_.vmInstanceUuid, vmInstanceUuid) + .in(VmInstanceResourceMetadataVO_.resourceUuid, toDelete) + .hardDelete(); return toDelete.size(); } diff --git a/header/src/main/java/org/zstack/header/vm/devices/VmInstanceResourceMetadataManager.java b/header/src/main/java/org/zstack/header/vm/devices/VmInstanceResourceMetadataManager.java index d01533c858c..16c2cc94be7 100644 --- a/header/src/main/java/org/zstack/header/vm/devices/VmInstanceResourceMetadataManager.java +++ b/header/src/main/java/org/zstack/header/vm/devices/VmInstanceResourceMetadataManager.java @@ -154,15 +154,19 @@ public interface VmInstanceResourceMetadataManager { void updateVmResourceMetadataDeviceAddress(String vmInstanceUuid, String resourceUuid, String deviceAddress); /** - * Drop "address-only" rows (metadataClass IS NULL) of this VM whose - * resourceUuid is NOT in survivingResourceUuids. Used by start-vm / sync-vm-device-info - * extension point to reconcile DB state with the libvirt domain actually running. + * Drop rows of this VM whose resourceUuid is NOT in survivingResourceUuids. + * Used by start-vm / sync-vm-device-info extension point to reconcile DB + * state with the libvirt domain actually running: any extra resourceUuid + * that the agent response does not report is considered stale and removed, + * regardless of metadataClass. * - * Always preserved: - * - rows with non-null metadataClass (memorySnapshot / guesttools / ArchiveVmBundle ...) - * - row whose resourceUuid == vmInstanceUuid (vmXml archive, as a double-guard) + * Always preserved (whitelist, independent of survivingResourceUuids): + * - row whose resourceUuid == vmInstanceUuid (vmXml archive) * - MEM_BALLOON_UUID / RESOURCE_CONFIG_UUID / GUEST_TOOLS_RESOURCE_CONFIG_UUID * + * Caller is responsible for adding any resourceUuid that must survive + * (e.g. every VM nic uuid when rsp.nicInfos is null) to survivingResourceUuids. + * * @param vmInstanceUuid VM uuid * @param survivingResourceUuids resourceUuids that still exist in the libvirt domain * @return number of rows deleted diff --git a/plugin/kvm/src/main/java/org/zstack/kvm/VirtualPciDeviceKvmExtensionPoint.java b/plugin/kvm/src/main/java/org/zstack/kvm/VirtualPciDeviceKvmExtensionPoint.java index 80ebc9c1f87..091423d34b9 100644 --- a/plugin/kvm/src/main/java/org/zstack/kvm/VirtualPciDeviceKvmExtensionPoint.java +++ b/plugin/kvm/src/main/java/org/zstack/kvm/VirtualPciDeviceKvmExtensionPoint.java @@ -81,6 +81,13 @@ public void afterReceiveVmDeviceInfoResponse(VmInstanceInventory vm, KVMAgentCom String vmUuid = spec != null ? spec.getVmInventory().getUuid() : vm.getUuid(); Set surviving = new HashSet<>(); + // Tracks whether the agent reply actually contains valid device info. + // Only flipped to true when we observe at least one valid virtual + // device / matched nic / balloon address from the response itself. + // This is the ONLY signal used to decide whether prune may run; + // resourceUuids added as defensive fallbacks (e.g. nic uuids copied + // from spec when nicInfos is null) must NOT influence this flag. + boolean hasAuthoritativeDeviceInfo = false; vidManager.saveVmXmlMetadata(rsp.getVmXml(), vmUuid); // the vmXml row uses vmUuid as resourceUuid; keep it @@ -89,28 +96,37 @@ public void afterReceiveVmDeviceInfoResponse(VmInstanceInventory vm, KVMAgentCom // only update pci address, metadata is not mandatory in normal usage // check its usage when create snapshot or backup if (rsp.getVirtualDeviceInfoList() != null) { - rsp.getVirtualDeviceInfoList().forEach(info -> { + for (VirtualDeviceInfo info : rsp.getVirtualDeviceInfoList()) { if (info.isValid()) { vidManager.createOrUpdateVmResourceMetadata(info, vmUuid); surviving.add(info.getResourceUuid()); + hasAuthoritativeDeviceInfo = true; } - }); + } } if (rsp.getNicInfos() != null) { - rsp.getNicInfos().forEach(info -> { + for (KVMAgentCommands.VmNicInfo info : rsp.getNicInfos()) { VmNicInventory nic = (spec != null ? spec.getDestNics() : vm.getVmNics()) .stream() .filter(vmNicInventory -> vmNicInventory.getMac().equals(info.getMacAddress())) .findFirst() .orElse(null); if (nic == null) { - return; + continue; } vidManager.createOrUpdateVmResourceMetadata(new VirtualDeviceInfo(nic.getUuid(), info.getDeviceAddress()), vmUuid); surviving.add(nic.getUuid()); - }); + hasAuthoritativeDeviceInfo = true; + } + } else { + // nicInfos is null => partial agent response; we do NOT have an + // authoritative nic snapshot. Preserve every known nic uuid of + // this VM so a prune step (triggered by other authoritative fields) + // cannot drop their address rows as collateral damage. + (spec != null ? spec.getDestNics() : vm.getVmNics()) + .forEach(n -> surviving.add(n.getUuid())); } if (rsp.getMemBalloonInfo() != null @@ -119,28 +135,21 @@ public void afterReceiveVmDeviceInfoResponse(VmInstanceInventory vm, KVMAgentCom vidManager.createOrUpdateVmResourceMetadata(new VirtualDeviceInfo(vidManager.MEM_BALLOON_UUID, DeviceAddress.fromString(rsp.getMemBalloonInfo().getDeviceAddress().toString())), vmUuid); surviving.add(vidManager.MEM_BALLOON_UUID); + hasAuthoritativeDeviceInfo = true; } - // differential prune: drop stale address-only rows (metadataClass IS NULL) - // whose resourceUuid is no longer present in the libvirt domain. This keeps - // DB state consistent after devices like cdrom get detached/deleted while - // the VM was stopped. Rows written by other extension points (with non-null - // metadataClass) are preserved. + // Differential prune of stale address-only rows (metadataClass IS NULL) + // whose resourceUuid is no longer present in the libvirt domain, so DB + // state stays consistent after devices (e.g. cdrom) are detached while + // the VM is stopped. Rows owned by other extension points (non-null + // metadataClass) are untouched. // - // Guard: only prune when the agent response actually carries an - // authoritative device snapshot. A response with all three device - // fields empty/null is not authoritative (e.g. certain mocked or - // partial agent replies) and must not be used as the basis for prune, - // otherwise legitimate metadata rows get wiped. - boolean hasAuthoritativeDeviceInfo = - (rsp.getVirtualDeviceInfoList() != null && !rsp.getVirtualDeviceInfoList().isEmpty()) - || (rsp.getNicInfos() != null && !rsp.getNicInfos().isEmpty()) - || (rsp.getMemBalloonInfo() != null - && rsp.getMemBalloonInfo().getDeviceAddress() != null - && !StringUtils.isEmpty(rsp.getMemBalloonInfo().getDeviceAddress().toString())); + // Guard: only prune when the response actually delivered authoritative + // device info. An empty / all-invalid reply gives no ground truth; we + // must not treat absence-of-data as proof-of-deletion. if (!hasAuthoritativeDeviceInfo) { logger.debug(String.format( - "skip pruneStaleDeviceMetadata for vm[%s]: response has no authoritative device info", + "skip pruneStaleDeviceMetadata for vm[%s]: empty device snapshot", vmUuid)); return; }