Source-Changes-HG archive


[src/trunk]: src/sys/rump/librump/rumpkern Improve the CPU scheduler for a ho...



details:   https://anonhg.NetBSD.org/src/rev/f9acf7c31a73
branches:  trunk
changeset: 755227:f9acf7c31a73
user:      pooka <pooka@NetBSD.org>
date:      Fri May 28 16:44:14 2010 +0000

description:
Improve the CPU scheduler for a host MP system with multithreaded
access.  The old scheduler used a single global freelist, which
caused heavy cache-line contention when multiple host threads tried
to schedule a virtual CPU simultaneously.

The rump scheduler is different from a normal thread scheduler, so
it has different requirements.  First, we schedule a CPU for a
thread (which we get from the host scheduler) instead of scheduling
a thread onto a CPU.  Second, scheduling points are at every
entry/exit to/from the rump kernel, including (but not limited to)
syscall entry points and hypercalls.  This means scheduling happens
a lot more frequently than in a normal kernel.
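
To make the second point concrete, entry into the rump kernel looks
roughly like the wrapper sketched below: every call pays for one
schedule and one unschedule.  This is an illustration only, not code
from this change; rump_schedule()/rump_unschedule() are the librump
scheduler entry points, while the wrapper and do_frobnicate() are
hypothetical.

void	rump_schedule(void);	/* pick up a virtual CPU for curlwp */
void	rump_unschedule(void);	/* give it back on the way out */
int	do_frobnicate(int);	/* hypothetical in-kernel work */

int
rump_sys_frobnicate(int arg)
{
	int error;

	rump_schedule();
	error = do_frobnicate(arg);	/* runs with a rump CPU held */
	rump_unschedule();

	return error;
}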

For every lwp, cache the previously used CPU.  When scheduling,
attempt to reuse the same CPU.  If we get it, we can use it directly
without any memory barriers or expensive locks.  If the CPU is
taken, migrate.  Use a lock/wait only in the slowpath.  Be very
wary of walking the entire CPU array, since doing so is anything
but cache-friendly.
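
Both the fastpath and the migration fallback thus hinge on a single
compare-and-swap against a per-CPU "previous lwp" slot.  The
standalone C11 model below sketches that claim/release cycle; all
names in it are illustrative, and the real code in the scheduler.c
diff further down additionally blocks on a per-CPU mutex/condvar
when it decides not to migrate.

/*
 * Standalone model of the idea (C11 atomics).  Not the NetBSD code:
 * the real implementation uses atomic_cas_ptr() and the rumpuser
 * mutex/condvar hypercalls, and can sleep instead of retrying.
 */
#include <stdatomic.h>

#define NVCPU	4
#define BUSY	((void *)-1)		/* cf. RCPULWP_BUSY in the diff */

struct vcpu {
	_Atomic(void *) prevowner;	/* last thread to use this vcpu */
};

static struct vcpu vcpus[NVCPU];
static atomic_uint nextvcpu;		/* round-robin migration counter */
static _Thread_local char selfkey;	/* its address identifies this thread */
static _Thread_local int mycpu;		/* cached vcpu index, cf. l_target_cpu */

/* Claim a vcpu: one CAS on the cached vcpu in the fastpath, else migrate. */
static struct vcpu *
vcpu_schedule(void)
{
	void *self = &selfkey;

	for (;;) {
		struct vcpu *vc = &vcpus[mycpu];
		void *expect = self;

		/* Fastpath: we were the previous user of this vcpu. */
		if (atomic_compare_exchange_strong(&vc->prevowner,
		    &expect, BUSY))
			return vc;

		/* Free, but last used by someone else: take it anyway. */
		if (expect != BUSY &&
		    atomic_compare_exchange_strong(&vc->prevowner,
		    &expect, BUSY))
			return vc;

		/* Busy: migrate round-robin (the real code may sleep). */
		mycpu = (int)(atomic_fetch_add(&nextvcpu, 1) % NVCPU);
	}
}

/* Release: publish ourselves as previous owner so the next fastpath hits. */
static void
vcpu_unschedule(struct vcpu *vc)
{

	atomic_store(&vc->prevowner, (void *)&selfkey);
}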

The migration algorithm could probably benefit from improved
heuristics and tuning.  Even so, with the new scheduler an
application with two threads making rlimit syscalls in a tight
loop sees an almost 400% speedup.  The exact speedup is difficult
to pinpoint, though, since the old scheduler caused very jittery
results due to cache contention.  Also, the rump version is now
70% faster than the counterpart which calls the host kernel.
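
The benchmark itself is not part of the commit.  A minimal
reconstruction of the kind of test described above, assuming the
usual librump client interfaces (rump_init() and the generated
rump_sys_getrlimit() wrapper), could look like this:

/*
 * Hypothetical "two threads in a tight syscall loop" test.
 * Build against librump, e.g. cc bench.c -lrump -lrumpuser -lpthread,
 * and time it externally with time(1).
 */
#include <sys/resource.h>

#include <pthread.h>
#include <stddef.h>

#include <rump/rump.h>
#include <rump/rump_syscalls.h>

#define NITER	1000000

static void *
hammer(void *arg)
{
	struct rlimit rlim;
	int i;

	/* every iteration enters and exits the rump kernel once */
	for (i = 0; i < NITER; i++)
		rump_sys_getrlimit(RLIMIT_NOFILE, &rlim);
	return NULL;
}

int
main(void)
{
	pthread_t pt[2];
	int i;

	rump_init();		/* boot the rump kernel into this process */
	for (i = 0; i < 2; i++)
		pthread_create(&pt[i], NULL, hammer, NULL);
	for (i = 0; i < 2; i++)
		pthread_join(pt[i], NULL);
	return 0;
}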

diffstat:

 sys/rump/librump/rumpkern/rump.c      |    9 +-
 sys/rump/librump/rumpkern/scheduler.c |  272 +++++++++++++++++++++++----------
 sys/rump/librump/rumpkern/threads.c   |    7 +-
 3 files changed, 198 insertions(+), 90 deletions(-)

diffs (truncated from 493 to 300 lines):

diff -r 89f2fbff963d -r f9acf7c31a73 sys/rump/librump/rumpkern/rump.c
--- a/sys/rump/librump/rumpkern/rump.c  Fri May 28 15:45:11 2010 +0000
+++ b/sys/rump/librump/rumpkern/rump.c  Fri May 28 16:44:14 2010 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: rump.c,v 1.171 2010/05/11 14:57:20 pooka Exp $ */
+/*     $NetBSD: rump.c,v 1.172 2010/05/28 16:44:14 pooka Exp $ */
 
 /*
  * Copyright (c) 2007 Antti Kantee.  All Rights Reserved.
@@ -28,7 +28,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: rump.c,v 1.171 2010/05/11 14:57:20 pooka Exp $");
+__KERNEL_RCSID(0, "$NetBSD: rump.c,v 1.172 2010/05/28 16:44:14 pooka Exp $");
 
 #include <sys/systm.h>
 #define ELFSIZE ARCH_ELFSIZE
@@ -269,7 +269,7 @@
        /* init minimal lwp/cpu context */
        l = &lwp0;
        l->l_lid = 1;
-       l->l_cpu = rump_cpu;
+       l->l_cpu = l->l_target_cpu = rump_cpu;
        rumpuser_set_curlwp(l);
 
        mutex_init(&tty_lock, MUTEX_DEFAULT, IPL_NONE);
@@ -533,6 +533,7 @@
        l->l_lid = lid;
        l->l_fd = p->p_fd;
        l->l_cpu = NULL;
+       l->l_target_cpu = rump_cpu;
        lwp_initspecific(l);
        LIST_INSERT_HEAD(&alllwp, l, l_list);
 
@@ -545,7 +546,7 @@
        struct lwp *l = curlwp;
 
        rumpuser_set_curlwp(NULL);
-       newlwp->l_cpu = l->l_cpu;
+       newlwp->l_cpu = newlwp->l_target_cpu = l->l_cpu;
        newlwp->l_mutex = l->l_mutex;
        l->l_mutex = NULL;
        l->l_cpu = NULL;
diff -r 89f2fbff963d -r f9acf7c31a73 sys/rump/librump/rumpkern/scheduler.c
--- a/sys/rump/librump/rumpkern/scheduler.c     Fri May 28 15:45:11 2010 +0000
+++ b/sys/rump/librump/rumpkern/scheduler.c     Fri May 28 16:44:14 2010 +0000
@@ -1,10 +1,7 @@
-/*      $NetBSD: scheduler.c,v 1.14 2010/05/18 14:58:42 pooka Exp $    */
+/*      $NetBSD: scheduler.c,v 1.15 2010/05/28 16:44:14 pooka Exp $    */
 
 /*
- * Copyright (c) 2009 Antti Kantee.  All Rights Reserved.
- *
- * Development of this software was supported by
- * The Finnish Cultural Foundation.
+ * Copyright (c) 2010 Antti Kantee.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -29,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.14 2010/05/18 14:58:42 pooka Exp $");
+__KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.15 2010/05/28 16:44:14 pooka Exp $");
 
 #include <sys/param.h>
 #include <sys/cpu.h>
@@ -44,28 +41,62 @@
 
 #include "rump_private.h"
 
-/* should go for MAXCPUS at some point */
 static struct cpu_info rump_cpus[MAXCPUS];
 static struct rumpcpu {
+       /* needed in fastpath */
        struct cpu_info *rcpu_ci;
-       int rcpu_flags;
+       void *rcpu_prevlwp;
+
+       /* needed in slowpath */
+       struct rumpuser_mtx *rcpu_mtx;
        struct rumpuser_cv *rcpu_cv;
-       LIST_ENTRY(rumpcpu) rcpu_entries;
+       int rcpu_wanted;
+
+       /* offset 20 (P=4) or 36 (P=8) here */
+
+       /*
+        * Some stats.  Not really that necessary, but we should
+        * have room.  Note that these overflow quite fast, so need
+        * to be collected often.
+        */
+       unsigned int rcpu_fastpath;
+       unsigned int rcpu_slowpath;
+       unsigned int rcpu_migrated;
+
+       /* offset 32 (P=4) or 50 (P=8) */
+
+       int rcpu_align[0] __aligned(CACHE_LINE_SIZE);
 } rcpu_storage[MAXCPUS];
 struct cpu_info *rump_cpu = &rump_cpus[0];
 int ncpu;
 
-#define RCPU_WANTED    0x01    /* someone wants this specific CPU */
-#define RCPU_BUSY      0x02    /* CPU is busy */
-#define RCPU_FREELIST  0x04    /* CPU is on freelist */
+#define RCPULWP_BUSY   ((void *)-1)
+#define RCPULWP_WANTED ((void *)-2)
 
-static LIST_HEAD(,rumpcpu) cpu_freelist = LIST_HEAD_INITIALIZER(cpu_freelist);
-
-static struct rumpuser_mtx *schedmtx;
-static struct rumpuser_cv *schedcv, *lwp0cv;
+static struct rumpuser_mtx *lwp0mtx;
+static struct rumpuser_cv *lwp0cv;
+static unsigned nextcpu;
 
 static bool lwp0busy = false;
 
+/*
+ * Keep some stats.
+ *
+ * Keeping track of there is not really critical for speed, unless
+ * stats happen to be on a different cache line (CACHE_LINE_SIZE is
+ * really just a coarse estimate), so default for the performant case
+ * (i.e. no stats).
+ */
+#ifdef RUMPSCHED_STATS
+#define SCHED_FASTPATH(rcpu) rcpu->rcpu_fastpath++;
+#define SCHED_SLOWPATH(rcpu) rcpu->rcpu_slowpath++;
+#define SCHED_MIGRATED(rcpu) rcpu->rcpu_migrated++;
+#else
+#define SCHED_FASTPATH(rcpu)
+#define SCHED_SLOWPATH(rcpu)
+#define SCHED_MIGRATED(rcpu)
+#endif
+
 struct cpu_info *
 cpu_lookup(u_int index)
 {
@@ -73,6 +104,19 @@
        return &rump_cpus[index];
 }
 
+static inline struct rumpcpu *
+getnextcpu(void)
+{
+       unsigned newcpu;
+
+       newcpu = atomic_inc_uint_nv(&nextcpu);
+       if (__predict_false(ncpu > UINT_MAX/2))
+               atomic_and_uint(&nextcpu, 0);
+       newcpu = newcpu % ncpu;
+
+       return &rcpu_storage[newcpu];
+}
+
 /* this could/should be mi_attach_cpu? */
 void
 rump_cpus_bootstrap(int num)
@@ -103,8 +147,7 @@
        struct cpu_info *ci;
        int i;
 
-       rumpuser_mutex_init(&schedmtx);
-       rumpuser_cv_init(&schedcv);
+       rumpuser_mutex_init(&lwp0mtx);
        rumpuser_cv_init(&lwp0cv);
        for (i = 0; i < ncpu; i++) {
                rcpu = &rcpu_storage[i];
@@ -113,9 +156,9 @@
                ci->ci_schedstate.spc_mutex =
                    mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
                ci->ci_schedstate.spc_flags = SPCF_RUNNING;
-               LIST_INSERT_HEAD(&cpu_freelist, rcpu, rcpu_entries);
-               rcpu->rcpu_flags = RCPU_FREELIST;
+               rcpu->rcpu_wanted = 0;
                rumpuser_cv_init(&rcpu->rcpu_cv);
+               rumpuser_mutex_init(&rcpu->rcpu_mtx);
        }
 }
 
@@ -125,15 +168,22 @@
 void
 rump_schedlock_cv_wait(struct rumpuser_cv *cv)
 {
+       struct lwp *l = curlwp;
+       struct rumpcpu *rcpu = &rcpu_storage[l->l_cpu-&rump_cpus[0]];
 
-       rumpuser_cv_wait(cv, schedmtx);
+       /* mutex will be taken and released in cpu schedule/unschedule */
+       rumpuser_cv_wait(cv, rcpu->rcpu_mtx);
 }
 
 int
 rump_schedlock_cv_timedwait(struct rumpuser_cv *cv, const struct timespec *ts)
 {
+       struct lwp *l = curlwp;
+       struct rumpcpu *rcpu = &rcpu_storage[l->l_cpu-&rump_cpus[0]];
 
-       return rumpuser_cv_timedwait(cv, schedmtx, ts->tv_sec, ts->tv_nsec);
+       /* mutex will be taken and released in cpu schedule/unschedule */
+       return rumpuser_cv_timedwait(cv, rcpu->rcpu_mtx,
+           ts->tv_sec, ts->tv_nsec);
 }
 
 void
@@ -144,16 +194,18 @@
        /*
         * If there is no dedicated lwp, allocate a temp one and
         * set it to be free'd upon unschedule().  Use lwp0 context
-        * for reserving the necessary resources.
+        * for reserving the necessary resources.  Don't optimize
+        * for this case -- anyone who cares about performance will
+        * start a real thread.
         */
        l = rumpuser_get_curlwp();
        if (l == NULL) {
                /* busy lwp0 */
-               rumpuser_mutex_enter_nowrap(schedmtx);
+               rumpuser_mutex_enter_nowrap(lwp0mtx);
                while (lwp0busy)
-                       rumpuser_cv_wait_nowrap(lwp0cv, schedmtx);
+                       rumpuser_cv_wait_nowrap(lwp0cv, lwp0mtx);
                lwp0busy = true;
-               rumpuser_mutex_exit(schedmtx);
+               rumpuser_mutex_exit(lwp0mtx);
 
                /* schedule cpu and use lwp0 */
                rump_schedule_cpu(&lwp0);
@@ -162,10 +214,10 @@
 
                /* release lwp0 */
                rump_lwp_switch(l);
-               rumpuser_mutex_enter_nowrap(schedmtx);
+               rumpuser_mutex_enter_nowrap(lwp0mtx);
                lwp0busy = false;
                rumpuser_cv_signal(lwp0cv);
-               rumpuser_mutex_exit(schedmtx);
+               rumpuser_mutex_exit(lwp0mtx);
 
                /* mark new lwp as dead-on-exit */
                rump_lwp_release(l);
@@ -181,45 +233,87 @@
        rump_schedule_cpu_interlock(l, NULL);
 }
 
+/*
+ * Schedule a CPU.  This optimizes for the case where we schedule
+ * the same thread often, and we have nCPU >= nFrequently-Running-Thread
+ * (where CPU is virtual rump cpu, not host CPU).
+ */
 void
 rump_schedule_cpu_interlock(struct lwp *l, void *interlock)
 {
        struct rumpcpu *rcpu;
-       bool schedlock = interlock == schedmtx;
+       void *old;
+       bool domigrate;
+       bool bound = l->l_pflag & LP_BOUND;
 
-       if (!schedlock)
-               rumpuser_mutex_enter_nowrap(schedmtx);
+       /*
+        * First, try fastpath: if we were the previous user of the
+        * CPU, everything is in order cachewise and we can just
+        * proceed to use it.
+        *
+        * If we are a different thread (i.e. CAS fails), we must go
+        * through a memory barrier to ensure we get a truthful
+        * view of the world.
+        */
+
+       rcpu = &rcpu_storage[l->l_target_cpu-&rump_cpus[0]];
+       if (atomic_cas_ptr(&rcpu->rcpu_prevlwp, l, RCPULWP_BUSY) == l) {
+               if (__predict_true(interlock == rcpu->rcpu_mtx))
+                       rumpuser_mutex_exit(rcpu->rcpu_mtx);
+               SCHED_FASTPATH(rcpu);
+               /* jones, you're the man */
+               goto fastlane;
+       }
 
-       if (l->l_pflag & LP_BOUND) {
-               KASSERT(l->l_cpu != NULL);
-               rcpu = &rcpu_storage[l->l_cpu-&rump_cpus[0]];
-               if (rcpu->rcpu_flags & RCPU_BUSY) {
-                       KASSERT((rcpu->rcpu_flags & RCPU_FREELIST) == 0);
-                       while (rcpu->rcpu_flags & RCPU_BUSY) {
-                               rcpu->rcpu_flags |= RCPU_WANTED;
-                               rumpuser_cv_wait_nowrap(rcpu->rcpu_cv,
-                                   schedmtx);
+       /*
+        * Else, it's the slowpath for us.  First, determine if we
+        * can migrate.
+        */
+       if (ncpu == 1)
+               domigrate = false;
+       else


