Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/arch Large rewrite of the balloon driver. This one:



details:   https://anonhg.NetBSD.org/src/rev/751da6aa26f0
branches:  trunk
changeset: 764374:751da6aa26f0
user:      jym <jym%NetBSD.org@localhost>
date:      Mon Apr 18 01:36:24 2011 +0000

description:
Large rewrite of the balloon driver. This one:

- turns balloon into a driver that attaches to xenbus(4). This allows
disabling the functionality either at compile time or at boot time via
userconf(4). The driver can implement detach or pmf(9) hooks if deemed
necessary.

- keeps Cherry's locking model, but simplifies it a bit. There is now
only one target value serialized inside balloon; we do not feed an
alternative value back to Xenstore (clients are not expected to see its value
evolve behind their back, and can't do much about that either)

- implements min threshold; this is an admin-settable value that tells
the driver to "not balloon below this threshold." This can be used by a domain
to keep memory reservations, useful if activity is expected in the near
future.

- in addition to min threshold, the driver implements internally a
safeguard value (uvmexp.freemin + 1MiB), so that admin cannot
inadvertently set min to a very low value forcing domain into heavy
memory pressure and swapping.

- creates the sysctl(8) kern.xen.balloon tree. 4 nodes are actually present
(values are in KiB):
   - min: (rw) an admin-settable value that prevents ballooning below this
          mark
   - max: (ro) the maximum size for reservation, as set by xm(1) mem-max.
   - current: (ro) the current reservation for domain.
   - target:  (rw) the targeted reservation for domain.

- fixes a few limitations here and there, most notably the max_reservation
hypercall, and KiB vs pages representations at interfaces.

The driver is still turned off by default. Enabling it would need more
approval, especially from bouyer@, cherry@ and cegger@.

FWIW: tested it for two days, from an amd64 dom0 (with dom0 ballooning
enabled for xend) and a bunch of domUs. Did not notice anything suspicious.

XXX it still has one big limitation: it cannot hotplug memory pages in
uvm(9) if they were not present beforehand. Example: ballooning above
physmem will give more pages to the domain, but it won't use them to serve
allocations unless we teach uvm(9) how to handle the extra pages.

diffstat:

 sys/arch/amd64/conf/XEN3_DOM0      |     4 +-
 sys/arch/amd64/conf/XEN3_DOMU      |     4 +-
 sys/arch/i386/conf/XEN3_DOM0       |     4 +-
 sys/arch/i386/conf/XEN3_DOMU       |     4 +-
 sys/arch/xen/conf/files.xen        |    10 +-
 sys/arch/xen/xen/balloon.c         |  1179 ++++++++++++++++-------------------
 sys/arch/xen/xenbus/xenbus_probe.c |    23 +-
 7 files changed, 580 insertions(+), 648 deletions(-)

diffs (truncated from 1618 to 300 lines):

diff -r 20bcbad30d01 -r 751da6aa26f0 sys/arch/amd64/conf/XEN3_DOM0
--- a/sys/arch/amd64/conf/XEN3_DOM0     Mon Apr 18 00:47:24 2011 +0000
+++ b/sys/arch/amd64/conf/XEN3_DOM0     Mon Apr 18 01:36:24 2011 +0000
@@ -1,4 +1,4 @@
-# $NetBSD: XEN3_DOM0,v 1.65 2011/03/06 17:08:19 bouyer Exp $
+# $NetBSD: XEN3_DOM0,v 1.66 2011/04/18 01:36:24 jym Exp $
 
 include        "arch/amd64/conf/std.xen"
 
@@ -12,7 +12,6 @@
 
 #
 options        DOM0OPS
-#options         XEN_BALLOON     # Xen memory ballooning - Experimental
 
 # boot messages with MPBIOS, acpi and ioapic can be quite large
 options        MSGBUFSIZE=24576
@@ -204,6 +203,7 @@
 xenbus*                at hypervisor?          # Xen virtual bus
 
 xencons*       at hypervisor?          # Xen virtual console
+#balloon*      at xenbus?              # Xen balloon device
 
 acpi0          at hypervisor?
 #options       ACPI_ACTIVATE_DEV       # If set, activate inactive devices
diff -r 20bcbad30d01 -r 751da6aa26f0 sys/arch/amd64/conf/XEN3_DOMU
--- a/sys/arch/amd64/conf/XEN3_DOMU     Mon Apr 18 00:47:24 2011 +0000
+++ b/sys/arch/amd64/conf/XEN3_DOMU     Mon Apr 18 01:36:24 2011 +0000
@@ -1,4 +1,4 @@
-# $NetBSD: XEN3_DOMU,v 1.27 2011/03/06 17:08:19 bouyer Exp $
+# $NetBSD: XEN3_DOMU,v 1.28 2011/04/18 01:36:24 jym Exp $
 
 include        "arch/amd64/conf/std.xen"
 
@@ -13,7 +13,6 @@
 #
 options        MAXPHYS=32768   #xbd doesn't handle 64k transfers
 #options       DOM0OPS
-#options         XEN_BALLOON   # Xen memory ballooning - Experimental
 
 #options       VM86            # virtual 8086 emulation
 #options       USER_LDT        # user-settable LDT; used by WINE
@@ -158,6 +157,7 @@
 xenbus*                at hypervisor?          # Xen virtual bus
 xennet*                at xenbus?              # Xen virtual network interface
 xbd*           at xenbus?              # Xen virtual block device
+#balloon*      at xenbus?              # Xen balloon device
 
 xencons*       at hypervisor?          # Xen virtual console
 
diff -r 20bcbad30d01 -r 751da6aa26f0 sys/arch/i386/conf/XEN3_DOM0
--- a/sys/arch/i386/conf/XEN3_DOM0      Mon Apr 18 00:47:24 2011 +0000
+++ b/sys/arch/i386/conf/XEN3_DOM0      Mon Apr 18 01:36:24 2011 +0000
@@ -1,4 +1,4 @@
-#      $NetBSD: XEN3_DOM0,v 1.47 2011/03/06 17:08:26 bouyer Exp $
+#      $NetBSD: XEN3_DOM0,v 1.48 2011/04/18 01:36:25 jym Exp $
 #
 #      XEN3_0: Xen 3.0 domain0 kernel
 
@@ -23,7 +23,6 @@
 # making MCLBYTES = PAGE_SIZE avoids a copy when a mbuf cluster is sent
 # to a domU, at the expense of a higher memory usage by the network stack.
 #options       MCLSHIFT=12
-#options         XEN_BALLOON     # Xen memory ballooning - Experimental
 
 makeoptions    CPUFLAGS="-march=i686"
 
@@ -221,6 +220,7 @@
 
 xencons*       at hypervisor?          # Xen virtual console
 #xennet*       at hypervisor?          # Xen virtual network interface
+#balloon*      at xenbus?              # Xen balloon device
 
 #xbd*          at hypervisor?          # Xen virtual block device
 #wd*           at hypervisor?          # Xen vbd (wd identity)
diff -r 20bcbad30d01 -r 751da6aa26f0 sys/arch/i386/conf/XEN3_DOMU
--- a/sys/arch/i386/conf/XEN3_DOMU      Mon Apr 18 00:47:24 2011 +0000
+++ b/sys/arch/i386/conf/XEN3_DOMU      Mon Apr 18 01:36:24 2011 +0000
@@ -1,4 +1,4 @@
-# $NetBSD: XEN3_DOMU,v 1.30 2011/04/04 19:57:31 dyoung Exp $
+# $NetBSD: XEN3_DOMU,v 1.31 2011/04/18 01:36:25 jym Exp $
 
 include        "arch/xen/conf/std.xen"
 
@@ -13,7 +13,6 @@
 #
 options        XEN
 #options       DOM0OPS
-#options         XEN_BALLOON     # Xen memory ballooning - Experimental
 
 makeoptions    CPUFLAGS="-march=i686"
 
@@ -186,6 +185,7 @@
 xenbus*        at hypervisor?          # Xen virtual bus
 xennet*        at xenbus?              # Xen virtual network interface
 xbd*           at xenbus?              # Xen virtual block device
+#balloon*      at xenbus?              # Xen balloon device
 
 cinclude "arch/i386/conf/GENERIC.local"
 
diff -r 20bcbad30d01 -r 751da6aa26f0 sys/arch/xen/conf/files.xen
--- a/sys/arch/xen/conf/files.xen       Mon Apr 18 00:47:24 2011 +0000
+++ b/sys/arch/xen/conf/files.xen       Mon Apr 18 01:36:24 2011 +0000
@@ -1,4 +1,4 @@
-#      $NetBSD: files.xen,v 1.115 2011/04/11 08:56:17 cegger Exp $
+#      $NetBSD: files.xen,v 1.116 2011/04/18 01:36:25 jym Exp $
 #      NetBSD: files.x86,v 1.10 2003/10/08 17:30:00 bouyer Exp 
 #      NetBSD: files.i386,v 1.254 2004/03/25 23:32:10 jmc Exp 
 
@@ -217,6 +217,11 @@
 attach xpci at xenbus with xpci_xenbus
 file   arch/xen/xen/xpci_xenbus.c      xpci_xenbus
 
+# Xen Balloon driver
+device balloon
+attach balloon at xenbus
+file   arch/xen/xen/balloon.c          balloon
+
 # Non-Xen specific devices and options
 
 include        "dev/pckbport/files.pckbport"
@@ -400,7 +405,7 @@
 include        "dev/pcmcia/files.pcmcia"
 
 # Domain-0 operations
-defflag        opt_xen.h                       DOM0OPS XEN_COMPAT_030001 XEN_BALLOON
+defflag        opt_xen.h                       DOM0OPS XEN_COMPAT_030001
 file   arch/xen/xen/privcmd.c          dom0ops
 file   arch/xen/x86/xen_shm_machdep.c  dom0ops
 file   arch/x86/pci/pci_machdep.c      hypervisor & pci & dom0ops
@@ -410,7 +415,6 @@
 file   arch/xen/xen/xennetback_xenbus.c xvif
 file   arch/xen/xen/xennet_checksum.c  xvif | xennet
 file   arch/xen/xen/xbdback_xenbus.c xbdback
-file    arch/xen/xen/balloon.c          hypervisor
 
 ifdef i386
 include "arch/i386/conf/majors.i386"
diff -r 20bcbad30d01 -r 751da6aa26f0 sys/arch/xen/xen/balloon.c
--- a/sys/arch/xen/xen/balloon.c        Mon Apr 18 00:47:24 2011 +0000
+++ b/sys/arch/xen/xen/balloon.c        Mon Apr 18 01:36:24 2011 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: balloon.c,v 1.6 2010/11/12 13:18:59 uebayasi Exp $ */
+/* $NetBSD: balloon.c,v 1.7 2011/04/18 01:36:25 jym Exp $ */
 
 /*-
  * Copyright (c) 2010 The NetBSD Foundation, Inc.
@@ -31,40 +31,59 @@
  */
 
 /*
- * The Xen balloon driver enables growing and shrinking PV
- * domains on the fly, by allocating and freeing memory directly.
- */
-
-#define BALLOONDEBUG 1
-
-/*
- * sysctl TODOs:
- * xen.balloon
- * xen.balloon.current: DONE
- * xen.balloon.target: DONE
- * xen.balloon.low-balloon: In Progress
- * xen.balloon.high-balloon: In Progress
- * xen.balloon.limit: XXX
+ * The Xen balloon driver enables growing and shrinking PV domains
+ * memory on the fly, by allocating and freeing memory pages directly.
+ * This management needs domain cooperation to work properly, especially
+ * during balloon_inflate() operation where a domain gives back memory to
+ * the hypervisor.
+ *
+ * Shrinking memory on a live system is a difficult task, and may render
+ * it unstable or lead to crash. The driver takes a conservative approach
+ * there by doing memory operations in small steps of a few MiB each time. It
+ * will also refuse to decrease reservation below a certain threshold
+ * (XEN_RESERVATION_MIN), so as to avoid a complete kernel memory exhaustion.
  *
- * sysctl labels = { 'current'      : 'Current allocation',
- *           'target'       : 'Requested target',
- *           'low-balloon'  : 'Low-mem balloon',
- *           'high-balloon' : 'High-mem balloon',
- *           'limit'        : 'Xen hard limit' }
+ * The user can intervene at two different levels to manage the ballooning
+ * of a domain:
+ * - directly within the domain using a sysctl(9) interface.
+ * - through the Xentools, by modifying the memory/target entry associated
+ *   to a domain. This is usually done in dom0.
+ *
+ * Modification of the reservation is signaled by writing inside the 
+ * memory/target node in Xenstore. Writing new values will fire the xenbus
+ * watcher, and wakeup the balloon thread to inflate or deflate balloon.
+ *
+ * Both sysctl(9) nodes and memory/target entry assume that the values passed
+ * to them are in KiB. Internally, the driver will convert this value in
+ * pages (assuming a page is PAGE_SIZE bytes), and issue the correct hypercalls
+ * to decrease/increase domain's reservation accordingly.
+ *
+ * XXX Pages used by balloon are tracked through entries stored in a SLIST.
+ * This allows driver to conveniently add/remove wired pages from memory
+ * without the need to support these "memory gaps" inside uvm(9). Still, the
+ * driver does not currently "plug" new pages into uvm(9) when more memory
+ * is available than originally managed by balloon. For example, deflating
+ * balloon with a total number of pages above physmem is not supported for
+ * now. See balloon_deflate() for more details.
  *
  */
 
+#define BALLOONDEBUG 0
+
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: balloon.c,v 1.6 2010/11/12 13:18:59 uebayasi Exp $");
+__KERNEL_RCSID(0, "$NetBSD: balloon.c,v 1.7 2011/04/18 01:36:25 jym Exp $");
 
 #include <sys/inttypes.h>
+#include <sys/device.h>
 #include <sys/param.h>
 
+#include <sys/atomic.h>
 #include <sys/condvar.h>
 #include <sys/kernel.h>
 #include <sys/kmem.h>
 #include <sys/kthread.h>
 #include <sys/mutex.h>
+#include <sys/pool.h>
 #include <sys/queue.h>
 #include <sys/sysctl.h>
 
@@ -76,68 +95,182 @@
 #include <uvm/uvm.h>
 #include <xen/xenpmap.h>
 
-#define BALLOONINTERVALMS 100 /* milliseconds */
+#include "locators.h"
 
-#define BALLOON_DELTA 1024 /* The maximum increments allowed in a
-                           * single call of balloon_inflate() or
-                           * balloon_deflate
-                           */
-#define BALLOON_RETRIES 4  /* Number of time every (in|de)flate of
-                           * BALLOON_DELTA or less, occurs
-                           */
+/*
+ * Number of MFNs stored in the array passed back and forth between domain
+ * and balloon/hypervisor, during balloon_inflate() / balloon_deflate(). These
+ * should fit in a page, for performance reasons.
+ */
+#define BALLOON_DELTA (PAGE_SIZE / sizeof(xen_pfn_t))
 
-/* XXX: fix limits */
-#define BALLOON_BALLAST 256 /* In pages */
+/*
+ * Safeguard value. Refuse to go below this threshold, so that domain
+ * can keep some free pages for its own use. Value is arbitrary, and may
+ * evolve with time.
+ */
+#define BALLOON_BALLAST 256 /* In pages - 1MiB */
 #define XEN_RESERVATION_MIN (uvmexp.freemin + BALLOON_BALLAST) /* In pages */
-#define XEN_RESERVATION_MAX nkmempages /* In pages */
 
 /* KB <-> PAGEs */
-#define BALLOON_PAGES_TO_KB(_pg) (_pg * PAGE_SIZE / 1024)
-#define BALLOON_KB_TO_PAGES(_kb) (_kb * 1024 / PAGE_SIZE)
-#define BALLOON_PAGE_FLOOR(_kb) (_kb & PAGE_MASK)
+#define PAGE_SIZE_KB (PAGE_SIZE >> 10) /* page size in KB */
+#define BALLOON_PAGES_TO_KB(_pg) ((uint64_t)_pg * PAGE_SIZE_KB)
+#define BALLOON_KB_TO_PAGES(_kb) (roundup(_kb, PAGE_SIZE_KB) / PAGE_SIZE_KB)
 
-/* Forward declaration */
-static void xenbus_balloon_watcher(struct xenbus_watch *, const char **,
-                                  unsigned int);
-
+/*
+ * A balloon page entry. Needed to track pages put/reclaimed from balloon
+ */
 struct balloon_page_entry {
        struct vm_page *pg;
        SLIST_ENTRY(balloon_page_entry) entry;
 };
 
-static struct balloon_conf {
-       kmutex_t flaglock; /* Protects condvar (below) */
-       kcondvar_t cv_memchanged; /* Notifier flag for target (below) */
+struct balloon_xenbus_softc {
+       device_t sc_dev;
+       struct sysctllog *sc_log;
 
-       kmutex_t tgtlock; /* Spin lock, protects .target, below */
-       size_t target; /* Target VM reservation size, in pages. */
+       kmutex_t balloon_mtx;   /* Protects condvar and target (below) */
+       kcondvar_t balloon_cv;  /* Condvar variable for target (below) */
+       size_t balloon_target;  /* Target domain reservation size in pages. */
+       xen_pfn_t *sc_mfn_list; /* List of MFNs passed from/to balloon */
 
-       /* The following are not protected by above locks */
+       pool_cache_t bpge_pool; /* pool cache for balloon page entries */
+       /* linked list for tracking pages used by balloon */
        SLIST_HEAD(, balloon_page_entry) balloon_page_entries;
        size_t balloon_num_page_entries;
 
-       /* Balloon limits */



Home | Main Index | Thread Index | Old Index