Source-Changes-HG archive
[src/trunk]: src/sys Minor vnode locking changes:
details: https://anonhg.NetBSD.org/src/rev/5d3c8e71b890
branches: trunk
changeset: 465647:5d3c8e71b890
user: ad <ad%NetBSD.org@localhost>
date: Sun Dec 01 13:56:29 2019 +0000
description:
Minor vnode locking changes:
- Stop using atomics to manipulate v_usecount. It was a mistake to begin
with. It doesn't work as intended unless the XLOCK bit is incorporated in
v_usecount and we don't have that any more. When I introduced this 10+
years ago it was to reduce pressure on v_interlock but it doesn't do that,
it just makes stuff disappear from lockstat output and introduces problems
elsewhere. We could do atomic usecounts on vnodes, but there has to be a
well-thought-out scheme.
- Resurrect LK_UPGRADE/LK_DOWNGRADE which will be needed to work effectively
when there is increased use of shared locks on vnodes.
- Allocate the vnode lock using rw_obj_alloc() to reduce false sharing of
struct vnode.
- Put all of the LRU lists into a single cache line, and do not requeue a
vnode if it's already on the correct list and was requeued recently (less
than a second ago).
Kernel build before and after:
119.63s real 1453.16s user 2742.57s system
115.29s real 1401.52s user 2690.94s system
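
A minimal user-space sketch of the separate lock allocation idea described
above: when the lock lives on its own cache line, lock/unlock traffic does
not invalidate the lines holding the structure's other hot fields.  The
pthread/aligned_alloc code, the 64-byte line size and the struct/field
names are illustrative assumptions for the example, not the kernel's
rw_obj_alloc() interface.

#include <pthread.h>
#include <stdlib.h>

#define CACHE_LINE 64			/* assumed line size for the example */

struct object {
	/*
	 * The rwlock is reached through a pointer rather than embedded,
	 * so lock traffic dirties only the lock's own cache line and not
	 * the lines holding the object's other hot fields.
	 */
	pthread_rwlock_t *lock;
	long hot_field;
};

static int
object_init(struct object *o)
{
	/* Round the allocation up so aligned_alloc() gets a legal size. */
	size_t sz = (sizeof(*o->lock) + CACHE_LINE - 1) &
	    ~(size_t)(CACHE_LINE - 1);

	o->lock = aligned_alloc(CACHE_LINE, sz);
	if (o->lock == NULL)
		return -1;
	o->hot_field = 0;
	return pthread_rwlock_init(o->lock, NULL);
}

int
main(void)
{
	struct object o;

	return object_init(&o) == 0 ? 0 : 1;
}

In the kernel the analogous change is visible in the vfs_subr.c hunk
below, where the embedded lock reference &vip->vi_lock becomes the
pointer vip->vi_lock.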
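A second small sketch, this time of the LRU requeue dampening: skip the
move when the vnode is already on the right list and was put there less
than a second ago.  The names here (node_t, lru_requeue_dampened, ticks,
TICKS_PER_SEC) are stand-ins for the example; the real logic is
lru_requeue() in the vfs_vnode.c hunk below, keyed off hardclock_ticks
and hz.

#include <stddef.h>
#include <sys/queue.h>

#define TICKS_PER_SEC 100		/* stand-in for the kernel's hz */

typedef struct node node_t;
typedef TAILQ_HEAD(nodelist, node) list_t;

struct node {
	TAILQ_ENTRY(node) entry;
	list_t *onlist;			/* list this node currently sits on */
	int lasttm;			/* tick count at the last requeue */
};

static int ticks;			/* stand-in for hardclock_ticks */

/*
 * Move 'n' to the tail of 'target' unless it is already there and was
 * moved within the last second.  In the real lru_requeue() this check
 * is made before taking vdrain_lock, so the common case acquires no
 * lock and never dirties the list heads' cache line.
 */
void
lru_requeue_dampened(node_t *n, list_t *target)
{

	if (n->onlist == target && (ticks - n->lasttm) < TICKS_PER_SEC)
		return;

	if (n->onlist != NULL)
		TAILQ_REMOVE(n->onlist, n, entry);
	n->onlist = target;
	n->lasttm = ticks;
	TAILQ_INSERT_TAIL(target, n, entry);
}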
diffstat:
sys/kern/vfs_subr.c | 8 +-
sys/kern/vfs_vnode.c | 213 +++++++++++++++++++---------------------
sys/kern/vfs_vnops.c | 9 +-
sys/kern/vnode_if.sh | 6 +-
sys/miscfs/genfs/genfs_vnops.c | 54 +++++++---
sys/sys/vnode.h | 9 +-
sys/sys/vnode_impl.h | 10 +-
7 files changed, 163 insertions(+), 146 deletions(-)
diffs (truncated from 757 to 300 lines):
diff -r 4f8ad671aa1d -r 5d3c8e71b890 sys/kern/vfs_subr.c
--- a/sys/kern/vfs_subr.c Sun Dec 01 13:46:34 2019 +0000
+++ b/sys/kern/vfs_subr.c Sun Dec 01 13:56:29 2019 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: vfs_subr.c,v 1.474 2019/11/16 10:05:44 maxv Exp $ */
+/* $NetBSD: vfs_subr.c,v 1.475 2019/12/01 13:56:29 ad Exp $ */
/*-
* Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
@@ -68,7 +68,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.474 2019/11/16 10:05:44 maxv Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.475 2019/12/01 13:56:29 ad Exp $");
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
@@ -1110,7 +1110,7 @@
vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
(*pr)("%ssize %" PRIx64 " writesize %" PRIx64 " numoutput %d\n",
prefix, vp->v_size, vp->v_writesize, vp->v_numoutput);
- (*pr)("%sdata %p lock %p\n", prefix, vp->v_data, &vip->vi_lock);
+ (*pr)("%sdata %p lock %p\n", prefix, vp->v_data, vip->vi_lock);
(*pr)("%sstate %s key(%p %zd)", prefix, vstate_name(vip->vi_state),
vip->vi_key.vk_mount, vip->vi_key.vk_key_len);
@@ -1543,7 +1543,7 @@
for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp)) {
TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
- if (&vip->vi_lock != vlock)
+ if (vip->vi_lock != vlock)
continue;
vfs_vnode_print(VIMPL_TO_VNODE(vip), full, pr);
}
diff -r 4f8ad671aa1d -r 5d3c8e71b890 sys/kern/vfs_vnode.c
--- a/sys/kern/vfs_vnode.c Sun Dec 01 13:46:34 2019 +0000
+++ b/sys/kern/vfs_vnode.c Sun Dec 01 13:56:29 2019 +0000
@@ -1,7 +1,7 @@
-/* $NetBSD: vfs_vnode.c,v 1.103 2019/02/20 10:07:27 hannken Exp $ */
+/* $NetBSD: vfs_vnode.c,v 1.104 2019/12/01 13:56:29 ad Exp $ */
/*-
- * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
+ * Copyright (c) 1997-2011, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
@@ -143,20 +143,10 @@
* as vput(9), routines. Common points holding references are e.g.
* file openings, current working directory, mount points, etc.
*
- * Note on v_usecount and its locking
- *
- * At nearly all points it is known that v_usecount could be zero,
- * the vnode_t::v_interlock will be held. To change v_usecount away
- * from zero, the interlock must be held. To change from a non-zero
- * value to zero, again the interlock must be held.
- *
- * Changing the usecount from a non-zero value to a non-zero value can
- * safely be done using atomic operations, without the interlock held.
- *
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.103 2019/02/20 10:07:27 hannken Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.104 2019/12/01 13:56:29 ad Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
@@ -181,33 +171,39 @@
#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
+#include <uvm/uvm_stat.h>
/* Flags to vrelel. */
-#define VRELEL_ASYNC_RELE 0x0001 /* Always defer to vrele thread. */
-#define VRELEL_FORCE_RELE 0x0002 /* Must always succeed. */
+#define VRELEL_ASYNC 0x0001 /* Always defer to vrele thread. */
+#define VRELEL_FORCE 0x0002 /* Must always succeed. */
+#define VRELEL_NOINACT 0x0004 /* Don't bother calling VOP_INACTIVE(). */
-u_int numvnodes __cacheline_aligned;
+#define LRU_VRELE 0
+#define LRU_FREE 1
+#define LRU_HOLD 2
+#define LRU_COUNT 3
/*
* There are three lru lists: one holds vnodes waiting for async release,
- * one is for vnodes which have no buffer/page references and
- * one for those which do (i.e. v_holdcnt is non-zero).
+ * one is for vnodes which have no buffer/page references and one for those
+ * which do (i.e. v_holdcnt is non-zero). We put the lists into a single,
+ * private cache line as vnodes migrate between them while under the same
+ * lock (vdrain_lock).
*/
-static vnodelst_t lru_vrele_list __cacheline_aligned;
-static vnodelst_t lru_free_list __cacheline_aligned;
-static vnodelst_t lru_hold_list __cacheline_aligned;
+u_int numvnodes __cacheline_aligned;
+static vnodelst_t lru_list[LRU_COUNT] __cacheline_aligned;
static kmutex_t vdrain_lock __cacheline_aligned;
-static kcondvar_t vdrain_cv __cacheline_aligned;
+static kcondvar_t vdrain_cv;
static int vdrain_gen;
static kcondvar_t vdrain_gen_cv;
static bool vdrain_retry;
static lwp_t * vdrain_lwp;
SLIST_HEAD(hashhead, vnode_impl);
static kmutex_t vcache_lock __cacheline_aligned;
-static kcondvar_t vcache_cv __cacheline_aligned;
+static kcondvar_t vcache_cv;
static u_int vcache_hashsize;
static u_long vcache_hashmask;
-static struct hashhead *vcache_hashtab __cacheline_aligned;
+static struct hashhead *vcache_hashtab;
static pool_cache_t vcache_pool;
static void lru_requeue(vnode_t *, vnodelst_t *);
static vnodelst_t * lru_which(vnode_t *);
@@ -378,17 +374,16 @@
void
vfs_vnode_sysinit(void)
{
- int error __diagused;
+ int error __diagused, i;
dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
KASSERT(dead_rootmount != NULL);
dead_rootmount->mnt_iflag |= IMNT_MPSAFE;
mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE);
- TAILQ_INIT(&lru_free_list);
- TAILQ_INIT(&lru_hold_list);
- TAILQ_INIT(&lru_vrele_list);
-
+ for (i = 0; i < LRU_COUNT; i++) {
+ TAILQ_INIT(&lru_list[i]);
+ }
vcache_init();
cv_init(&vdrain_cv, "vdrain");
@@ -452,9 +447,9 @@
KASSERT(mutex_owned(vp->v_interlock));
if (vp->v_holdcnt > 0)
- return &lru_hold_list;
+ return &lru_list[LRU_HOLD];
else
- return &lru_free_list;
+ return &lru_list[LRU_FREE];
}
/*
@@ -466,19 +461,39 @@
lru_requeue(vnode_t *vp, vnodelst_t *listhd)
{
vnode_impl_t *vip;
+ int d;
+
+ /*
+ * If the vnode is on the correct list, and was put there recently,
+ * then leave it be, thus avoiding huge cache and lock contention.
+ */
+ vip = VNODE_TO_VIMPL(vp);
+ if (listhd == vip->vi_lrulisthd &&
+ (hardclock_ticks - vip->vi_lrulisttm) < hz) {
+ return;
+ }
mutex_enter(&vdrain_lock);
- vip = VNODE_TO_VIMPL(vp);
+ d = 0;
if (vip->vi_lrulisthd != NULL)
TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
else
- numvnodes++;
+ d++;
vip->vi_lrulisthd = listhd;
+ vip->vi_lrulisttm = hardclock_ticks;
if (vip->vi_lrulisthd != NULL)
TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
else
- numvnodes--;
- if (numvnodes > desiredvnodes || listhd == &lru_vrele_list)
+ d--;
+ if (d != 0) {
+ /*
+ * Looks strange? This is not a bug. Don't store
+ * numvnodes unless there is a change - avoid false
+ * sharing on MP.
+ */
+ numvnodes += d;
+ }
+ if (numvnodes > desiredvnodes || listhd == &lru_list[LRU_VRELE])
cv_broadcast(&vdrain_cv);
mutex_exit(&vdrain_lock);
}
@@ -491,33 +506,37 @@
vrele_flush(struct mount *mp)
{
vnode_impl_t *vip, *marker;
+ vnode_t *vp;
KASSERT(fstrans_is_owner(mp));
marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
mutex_enter(&vdrain_lock);
- TAILQ_INSERT_HEAD(&lru_vrele_list, marker, vi_lrulist);
+ TAILQ_INSERT_HEAD(&lru_list[LRU_VRELE], marker, vi_lrulist);
while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
- TAILQ_REMOVE(&lru_vrele_list, marker, vi_lrulist);
- TAILQ_INSERT_AFTER(&lru_vrele_list, vip, marker, vi_lrulist);
- if (vnis_marker(VIMPL_TO_VNODE(vip)))
+ TAILQ_REMOVE(&lru_list[LRU_VRELE], marker, vi_lrulist);
+ TAILQ_INSERT_AFTER(&lru_list[LRU_VRELE], vip, marker,
+ vi_lrulist);
+ vp = VIMPL_TO_VNODE(vip);
+ if (vnis_marker(vp))
continue;
- KASSERT(vip->vi_lrulisthd == &lru_vrele_list);
+ KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
- vip->vi_lrulisthd = &lru_hold_list;
+ vip->vi_lrulisthd = &lru_list[LRU_HOLD];
+ vip->vi_lrulisttm = hardclock_ticks;
TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
mutex_exit(&vdrain_lock);
- mutex_enter(VIMPL_TO_VNODE(vip)->v_interlock);
- vrelel(VIMPL_TO_VNODE(vip), VRELEL_FORCE_RELE);
+ mutex_enter(vp->v_interlock);
+ vrelel(vp, VRELEL_FORCE);
mutex_enter(&vdrain_lock);
}
- TAILQ_REMOVE(&lru_vrele_list, marker, vi_lrulist);
+ TAILQ_REMOVE(&lru_list[LRU_VRELE], marker, vi_lrulist);
mutex_exit(&vdrain_lock);
vnfree_marker(VIMPL_TO_VNODE(marker));
@@ -555,7 +574,7 @@
if (vcache_vget(vp) == 0) {
if (!vrecycle(vp)) {
mutex_enter(vp->v_interlock);
- vrelel(vp, VRELEL_FORCE_RELE);
+ vrelel(vp, VRELEL_FORCE);
}
}
fstrans_done(mp);
@@ -584,16 +603,17 @@
* will put it back onto the right list before
* its v_usecount reaches zero.
*/
- KASSERT(vip->vi_lrulisthd == &lru_vrele_list);
+ KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
- vip->vi_lrulisthd = &lru_hold_list;
+ vip->vi_lrulisthd = &lru_list[LRU_HOLD];
+ vip->vi_lrulisttm = hardclock_ticks;
TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
vdrain_retry = true;
mutex_exit(&vdrain_lock);
mutex_enter(vp->v_interlock);
- vrelel(vp, VRELEL_FORCE_RELE);
+ vrelel(vp, VRELEL_FORCE);
fstrans_done(mp);
mutex_enter(&vdrain_lock);
@@ -606,9 +626,6 @@
static void
vdrain_thread(void *cookie)
{
- vnodelst_t *listhd[] = {
- &lru_vrele_list, &lru_free_list, &lru_hold_list
- };
int i;
u_int target;
vnode_impl_t *vip, *marker;
@@ -621,22 +638,22 @@
vdrain_retry = false;
target = desiredvnodes - desiredvnodes/10;
- for (i = 0; i < __arraycount(listhd); i++) {
- TAILQ_INSERT_HEAD(listhd[i], marker, vi_lrulist);
+ for (i = 0; i < LRU_COUNT; i++) {
+ TAILQ_INSERT_HEAD(&lru_list[i], marker, vi_lrulist);
while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
- TAILQ_REMOVE(listhd[i], marker, vi_lrulist);
- TAILQ_INSERT_AFTER(listhd[i], vip, marker,
+ TAILQ_REMOVE(&lru_list[i], marker, vi_lrulist);
+ TAILQ_INSERT_AFTER(&lru_list[i], vip, marker,