Source-Changes-HG archive
[src/trunk]: src/sys/dev/nvmm nvmm(4): Add suspend/resume support.
details: https://anonhg.NetBSD.org/src/rev/1b6f5bdaa449
branches: trunk
changeset: 370059:1b6f5bdaa449
user: riastradh <riastradh%NetBSD.org@localhost>
date: Tue Sep 13 20:10:04 2022 +0000
description:
nvmm(4): Add suspend/resume support.
New MD nvmm_impl callbacks:
- .suspend_interrupt forces all VMs on all physical CPUs to exit.
- .vcpu_suspend suspends an individual vCPU on a machine.
- .machine_suspend suspends an individual machine.
- .suspend suspends the whole system.
- .resume resumes the whole system.
- .machine_resume resumes an individual machine.
- .vcpu_resume resumes an individual vCPU on a machine.
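The nvmm_internal.h hunk falls beyond the truncation point below; judging
from the call sites in nvmm.c, the new struct nvmm_impl members presumably
look like this (a sketch inferred from the callers, not the literal
header):
	/* MD suspend/resume hooks, as called from nvmm.c below. */
	void (*suspend_interrupt)(void);
	void (*vcpu_suspend)(struct nvmm_machine *, struct nvmm_cpu *);
	void (*machine_suspend)(struct nvmm_machine *);
	void (*suspend)(void);
	void (*resume)(void);
	void (*machine_resume)(struct nvmm_machine *);
	void (*vcpu_resume)(struct nvmm_machine *, struct nvmm_cpu *);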
Suspending nvmm:
1. causes new VM operations (ioctl and close) to block until resumed,
2. uses .suspend_interrupt to interrupt any concurrent VM operations and
force them to return early, and then
3. uses the various suspend callbacks to suspend all vCPUs, machines,
and the whole system -- all vCPUs before the machine they're on,
and all machines before the system.
Resuming nvmm does the reverse of (3) -- resume system, resume each
machine and then the vCPUs on that machine -- and then unblocks
operations.
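Condensed from the nvmm.c changes below: every user-facing operation
brackets its work with the suspension gate, so step (1) is just this
wrapper around the old ioctl path (comments added here for explanation):
	static int
	nvmm_ioctl(struct file *fp, u_long cmd, void *data)
	{
		int error;

		error = nvmm_enter_sig();	/* blocks while suspended */
		if (error)
			return error;		/* interrupted by a signal */
		error = nvmm_ioctl_internal(fp, cmd, data);
		nvmm_exit();	/* last user out wakes nvmm_suspend() */

		return error;
	}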
Implemented only for x86-vmx for now:
- suspend_interrupt triggers a TLB IPI to cause VM exits;
- vcpu_suspend issues VMCLEAR to force any in-CPU state to be written
to memory;
- machine_suspend does nothing;
- suspend does VMXOFF on all CPUs;
- resume does VMXON on all CPUs;
- machine_resume does nothing; and
- vcpu_resume just marks each vCPU as valid but inactive so
subsequent use will clear it and load it with vmptrld.
x86-svm left as an exercise for the reader.
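The nvmm_x86_vmx.c hunks are also beyond the truncation point, so the
following is a rough sketch only: the helper names (vmx_vmclear,
vmx_vmxoff) and the cpudata layout are assumptions, not the committed
code; only the xc_broadcast(9) cross-call API is standard NetBSD.
	/* VMCLEAR forces any VMCS state cached in the CPU out to memory. */
	static void
	vmx_vcpu_suspend(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
	{
		struct vmx_cpudata *cpudata = vcpu->cpudata;	/* assumed layout */

		vmx_vmclear(&cpudata->vmcs_pa);
	}

	/* Cross-call (sys/xcall.h) run once on every CPU at suspend time. */
	static void
	vmx_suspend_xcall(void *arg1, void *arg2)
	{

		vmx_vmxoff();	/* leave VMX operation on this CPU */
	}

	static void
	vmx_suspend(void)
	{
		uint64_t xc;

		xc = xc_broadcast(0, vmx_suspend_xcall, NULL, NULL);
		xc_wait(xc);
	}
vmx_resume would mirror this with VMXON and each CPU's VMXON region, and
vcpu_resume only marks each vCPU as valid but inactive so the next entry
reloads it with vmptrld.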
diffstat:
sys/dev/nvmm/nvmm.c | 223 +++++++++++++++++++++++++++++++++++++++-
sys/dev/nvmm/nvmm_internal.h | 15 ++-
sys/dev/nvmm/x86/nvmm_x86_vmx.c | 85 +++++++++++++-
3 files changed, 310 insertions(+), 13 deletions(-)
diffs (truncated from 505 to 300 lines):
diff -r 2e7e4a1bf7fa -r 1b6f5bdaa449 sys/dev/nvmm/nvmm.c
--- a/sys/dev/nvmm/nvmm.c Tue Sep 13 13:09:16 2022 +0000
+++ b/sys/dev/nvmm/nvmm.c Tue Sep 13 20:10:04 2022 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: nvmm.c,v 1.46 2022/07/07 23:50:33 pgoyette Exp $ */
+/* $NetBSD: nvmm.c,v 1.47 2022/09/13 20:10:04 riastradh Exp $ */
 /*
  * Copyright (c) 2018-2020 Maxime Villard, m00nbsd.net
@@ -29,7 +29,7 @@
  */
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.46 2022/07/07 23:50:33 pgoyette Exp $");
+__KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.47 2022/09/13 20:10:04 riastradh Exp $");
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -59,6 +59,15 @@
 static struct nvmm_machine machines[NVMM_MAX_MACHINES];
 static volatile unsigned int nmachines __cacheline_aligned;
+static struct {
+	kmutex_t lock;
+	kcondvar_t suspendcv;
+	kcondvar_t resumecv;
+	unsigned users;
+} suspension;
+
+volatile bool nvmm_suspending;
+
 static const struct nvmm_impl *nvmm_impl_list[] = {
 #if defined(__x86_64__)
 	&nvmm_x86_svm,	/* x86 AMD SVM */
@@ -73,6 +82,50 @@
 /* -------------------------------------------------------------------------- */
 static int
+nvmm_enter_sig(void)
+{
+	int error;
+
+	mutex_enter(&suspension.lock);
+	while (nvmm_suspending) {
+		error = cv_wait_sig(&suspension.resumecv, &suspension.lock);
+		if (error)
+			goto out;
+	}
+	KASSERT(suspension.users < UINT_MAX);
+	suspension.users++;
+	error = 0;
+out:	mutex_exit(&suspension.lock);
+
+	return error;
+}
+
+static void
+nvmm_enter(void)
+{
+
+	mutex_enter(&suspension.lock);
+	while (nvmm_suspending)
+		cv_wait(&suspension.resumecv, &suspension.lock);
+	KASSERT(suspension.users < UINT_MAX);
+	suspension.users++;
+	mutex_exit(&suspension.lock);
+}
+
+static void
+nvmm_exit(void)
+{
+
+	mutex_enter(&suspension.lock);
+	KASSERT(suspension.users > 0);
+	if (--suspension.users == 0)
+		cv_signal(&suspension.suspendcv);
+	mutex_exit(&suspension.lock);
+}
+
+/* -------------------------------------------------------------------------- */
+
+static int
 nvmm_machine_alloc(struct nvmm_machine **ret)
 {
 	struct nvmm_machine *mach;
@@ -989,6 +1042,11 @@
 		}
 	}
+	mutex_init(&suspension.lock, MUTEX_DEFAULT, IPL_NONE);
+	cv_init(&suspension.suspendcv, "nvmmsus");
+	cv_init(&suspension.resumecv, "nvmmres");
+	suspension.users = 0;
+
 	(*nvmm_impl->init)();
 	return 0;
@@ -1080,7 +1138,11 @@
 	struct nvmm_owner *owner = fp->f_data;
 	KASSERT(owner != NULL);
+
+	nvmm_enter();
 	nvmm_kill_machines(owner);
+	nvmm_exit();
+
 	if (owner != &root_owner) {
 		kmem_free(owner, sizeof(*owner));
 	}
@@ -1126,7 +1188,7 @@
 }
 static int
-nvmm_ioctl(file_t *fp, u_long cmd, void *data)
+nvmm_ioctl_internal(file_t *fp, u_long cmd, void *data)
 {
 	struct nvmm_owner *owner = fp->f_data;
@@ -1170,11 +1232,27 @@
 	}
 }
+static int
+nvmm_ioctl(struct file *fp, u_long cmd, void *data)
+{
+	int error;
+
+	error = nvmm_enter_sig();
+	if (error)
+		return error;
+	error = nvmm_ioctl_internal(fp, cmd, data);
+	nvmm_exit();
+
+	return error;
+}
+
 /* -------------------------------------------------------------------------- */
 static int nvmm_match(device_t, cfdata_t, void *);
 static void nvmm_attach(device_t, device_t, void *);
 static int nvmm_detach(device_t, int);
+static bool nvmm_suspend(device_t, const pmf_qual_t *);
+static bool nvmm_resume(device_t, const pmf_qual_t *);
 extern struct cfdriver nvmm_cd;
@@ -1209,6 +1287,8 @@
panic("%s: impossible", __func__);
aprint_normal_dev(self, "attached, using backend %s\n",
nvmm_impl->name);
+ if (nvmm_impl->suspend != NULL && nvmm_impl->resume != NULL)
+ pmf_device_register(self, nvmm_suspend, nvmm_resume);
}
 static int
@@ -1216,10 +1296,147 @@
 {
 	if (atomic_load_relaxed(&nmachines) > 0)
 		return EBUSY;
+	pmf_device_deregister(self);
 	nvmm_fini();
 	return 0;
 }
+static void
+nvmm_suspend_vcpu(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+
+	mutex_enter(&vcpu->lock);
+	if (vcpu->present && nvmm_impl->vcpu_suspend)
+		(*nvmm_impl->vcpu_suspend)(mach, vcpu);
+	mutex_exit(&vcpu->lock);
+}
+
+static void
+nvmm_resume_vcpu(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+
+	mutex_enter(&vcpu->lock);
+	if (vcpu->present && nvmm_impl->vcpu_resume)
+		(*nvmm_impl->vcpu_resume)(mach, vcpu);
+	mutex_exit(&vcpu->lock);
+}
+
+static void
+nvmm_suspend_machine(struct nvmm_machine *mach)
+{
+
+	rw_enter(&mach->lock, RW_WRITER);
+	if (mach->present) {
+		if (nvmm_impl->vcpu_suspend) {
+			size_t cpuid;
+
+			for (cpuid = 0; cpuid < NVMM_MAX_VCPUS; cpuid++)
+				nvmm_suspend_vcpu(mach, &mach->cpus[cpuid]);
+		}
+		if (nvmm_impl->machine_suspend)
+			(*nvmm_impl->machine_suspend)(mach);
+	}
+	rw_exit(&mach->lock);
+}
+
+static void
+nvmm_resume_machine(struct nvmm_machine *mach)
+{
+
+	rw_enter(&mach->lock, RW_WRITER);
+	if (mach->present) {
+		if (nvmm_impl->vcpu_resume) {
+			size_t cpuid;
+
+			for (cpuid = 0; cpuid < NVMM_MAX_VCPUS; cpuid++)
+				nvmm_resume_vcpu(mach, &mach->cpus[cpuid]);
+		}
+		if (nvmm_impl->machine_resume)
+			(*nvmm_impl->machine_resume)(mach);
+	}
+	rw_exit(&mach->lock);
+}
+
+static bool
+nvmm_suspend(device_t self, const pmf_qual_t *qual)
+{
+	size_t i;
+
+	/*
+	 * Prevent new users (via ioctl) from starting.
+	 */
+	mutex_enter(&suspension.lock);
+	KASSERT(!nvmm_suspending);
+	atomic_store_relaxed(&nvmm_suspending, true);
+	mutex_exit(&suspension.lock);
+
+	/*
+	 * Interrupt any running VMs so they will break out of run
+	 * loops or anything else and not start up again until we've
+	 * resumed.
+	 */
+	if (nvmm_impl->suspend_interrupt)
+		(*nvmm_impl->suspend_interrupt)();
+
+	/*
+	 * Wait for any running VMs or other ioctls to finish running
+	 * or handling any other ioctls.
+	 */
+	mutex_enter(&suspension.lock);
+	while (suspension.users)
+		cv_wait(&suspension.suspendcv, &suspension.lock);
+	mutex_exit(&suspension.lock);
+
+	/*
+	 * Suspend all the machines.
+	 */
+	if (nvmm_impl->machine_suspend || nvmm_impl->vcpu_suspend) {
+		for (i = 0; i < NVMM_MAX_MACHINES; i++)
+			nvmm_suspend_machine(&machines[i]);
+	}
+
+	/*
+	 * Take any systemwide suspend action.
+	 */
+	if (nvmm_impl->suspend)
+		(*nvmm_impl->suspend)();
+
+	return true;
+}
+
+static bool
+nvmm_resume(device_t self, const pmf_qual_t *qual)
+{
+	size_t i;
+
+	KASSERT(atomic_load_relaxed(&nvmm_suspending));
+	KASSERT(suspension.users == 0);
+
+	/*
+	 * Take any systemwide resume action.
+	 */
+	if (nvmm_impl->resume)
+		(*nvmm_impl->resume)();
+
+	/*
+	 * Resume all the machines.
+	 */
+	if (nvmm_impl->machine_resume || nvmm_impl->vcpu_resume) {
+		for (i = 0; i < NVMM_MAX_MACHINES; i++)
+			nvmm_resume_machine(&machines[i]);
+	}
+
+	/*
+	 * Allow new users (via ioctl) to start again.
+	 */
+	mutex_enter(&suspension.lock);
+	atomic_store_relaxed(&nvmm_suspending, false);
+	cv_broadcast(&suspension.resumecv);
+	mutex_exit(&suspension.lock);
+
+	return true;
+}
+