Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/sys Minor vnode locking changes:
details: https://anonhg.NetBSD.org/src/rev/193cf7f24c21
branches: trunk
changeset: 461593:193cf7f24c21
user: ad <ad%NetBSD.org@localhost>
date: Sun Dec 01 13:56:29 2019 +0000
description:
Minor vnode locking changes:
- Stop using atomics to maniupulate v_usecount. It was a mistake to begin
with. It doesn't work as intended unless the XLOCK bit is incorporated in
v_usecount and we don't have that any more. When I introduced this 10+
years ago it was to reduce pressure on v_interlock but it doesn't do that,
it just makes stuff disappear from lockstat output and introduces problems
elsewhere. We could do atomic usecounts on vnodes but there has to be a
well thought out scheme.
- Resurrect LK_UPGRADE/LK_DOWNGRADE which will be needed to work effectively
when there is increased use of shared locks on vnodes.
- Allocate the vnode lock using rw_obj_alloc() to reduce false sharing of
struct vnode.
- Put all of the LRU lists into a single cache line, and do not requeue a
vnode if it's already on the correct list and was requeued recently (less
than a second ago).
Kernel build before and after:
119.63s real 1453.16s user 2742.57s system
115.29s real 1401.52s user 2690.94s system
diffstat:
sys/kern/vfs_subr.c | 8 +-
sys/kern/vfs_vnode.c | 213 +++++++++++++++++++---------------------
sys/kern/vfs_vnops.c | 9 +-
sys/kern/vnode_if.sh | 6 +-
sys/miscfs/genfs/genfs_vnops.c | 54 +++++++---
sys/sys/vnode.h | 9 +-
sys/sys/vnode_impl.h | 10 +-
7 files changed, 163 insertions(+), 146 deletions(-)
diffs (truncated from 757 to 300 lines):
diff -r df994fa35f75 -r 193cf7f24c21 sys/kern/vfs_subr.c
--- a/sys/kern/vfs_subr.c Sun Dec 01 13:46:34 2019 +0000
+++ b/sys/kern/vfs_subr.c Sun Dec 01 13:56:29 2019 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: vfs_subr.c,v 1.474 2019/11/16 10:05:44 maxv Exp $ */
+/* $NetBSD: vfs_subr.c,v 1.475 2019/12/01 13:56:29 ad Exp $ */
/*-
* Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
@@ -68,7 +68,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.474 2019/11/16 10:05:44 maxv Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.475 2019/12/01 13:56:29 ad Exp $");
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
@@ -1110,7 +1110,7 @@
vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
(*pr)("%ssize %" PRIx64 " writesize %" PRIx64 " numoutput %d\n",
prefix, vp->v_size, vp->v_writesize, vp->v_numoutput);
- (*pr)("%sdata %p lock %p\n", prefix, vp->v_data, &vip->vi_lock);
+ (*pr)("%sdata %p lock %p\n", prefix, vp->v_data, vip->vi_lock);
(*pr)("%sstate %s key(%p %zd)", prefix, vstate_name(vip->vi_state),
vip->vi_key.vk_mount, vip->vi_key.vk_key_len);
@@ -1543,7 +1543,7 @@
for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp)) {
TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
- if (&vip->vi_lock != vlock)
+ if (vip->vi_lock != vlock)
continue;
vfs_vnode_print(VIMPL_TO_VNODE(vip), full, pr);
}
diff -r df994fa35f75 -r 193cf7f24c21 sys/kern/vfs_vnode.c
--- a/sys/kern/vfs_vnode.c Sun Dec 01 13:46:34 2019 +0000
+++ b/sys/kern/vfs_vnode.c Sun Dec 01 13:56:29 2019 +0000
@@ -1,7 +1,7 @@
-/* $NetBSD: vfs_vnode.c,v 1.103 2019/02/20 10:07:27 hannken Exp $ */
+/* $NetBSD: vfs_vnode.c,v 1.104 2019/12/01 13:56:29 ad Exp $ */
/*-
- * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
+ * Copyright (c) 1997-2011, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
@@ -143,20 +143,10 @@
* as vput(9), routines. Common points holding references are e.g.
* file openings, current working directory, mount points, etc.
*
- * Note on v_usecount and its locking
- *
- * At nearly all points it is known that v_usecount could be zero,
- * the vnode_t::v_interlock will be held. To change v_usecount away
- * from zero, the interlock must be held. To change from a non-zero
- * value to zero, again the interlock must be held.
- *
- * Changing the usecount from a non-zero value to a non-zero value can
- * safely be done using atomic operations, without the interlock held.
- *
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.103 2019/02/20 10:07:27 hannken Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.104 2019/12/01 13:56:29 ad Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
@@ -181,33 +171,39 @@
#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
+#include <uvm/uvm_stat.h>
/* Flags to vrelel. */
-#define VRELEL_ASYNC_RELE 0x0001 /* Always defer to vrele thread. */
-#define VRELEL_FORCE_RELE 0x0002 /* Must always succeed. */
+#define VRELEL_ASYNC 0x0001 /* Always defer to vrele thread. */
+#define VRELEL_FORCE 0x0002 /* Must always succeed. */
+#define VRELEL_NOINACT 0x0004 /* Don't bother calling VOP_INACTIVE(). */
-u_int numvnodes __cacheline_aligned;
+#define LRU_VRELE 0
+#define LRU_FREE 1
+#define LRU_HOLD 2
+#define LRU_COUNT 3
/*
* There are three lru lists: one holds vnodes waiting for async release,
- * one is for vnodes which have no buffer/page references and
- * one for those which do (i.e. v_holdcnt is non-zero).
+ * one is for vnodes which have no buffer/page references and one for those
+ * which do (i.e. v_holdcnt is non-zero). We put the lists into a single,
+ * private cache line as vnodes migrate between them while under the same
+ * lock (vdrain_lock).
*/
-static vnodelst_t lru_vrele_list __cacheline_aligned;
-static vnodelst_t lru_free_list __cacheline_aligned;
-static vnodelst_t lru_hold_list __cacheline_aligned;
+u_int numvnodes __cacheline_aligned;
+static vnodelst_t lru_list[LRU_COUNT] __cacheline_aligned;
static kmutex_t vdrain_lock __cacheline_aligned;
-static kcondvar_t vdrain_cv __cacheline_aligned;
+static kcondvar_t vdrain_cv;
static int vdrain_gen;
static kcondvar_t vdrain_gen_cv;
static bool vdrain_retry;
static lwp_t * vdrain_lwp;
SLIST_HEAD(hashhead, vnode_impl);
static kmutex_t vcache_lock __cacheline_aligned;
-static kcondvar_t vcache_cv __cacheline_aligned;
+static kcondvar_t vcache_cv;
static u_int vcache_hashsize;
static u_long vcache_hashmask;
-static struct hashhead *vcache_hashtab __cacheline_aligned;
+static struct hashhead *vcache_hashtab;
static pool_cache_t vcache_pool;
static void lru_requeue(vnode_t *, vnodelst_t *);
static vnodelst_t * lru_which(vnode_t *);
@@ -378,17 +374,16 @@
void
vfs_vnode_sysinit(void)
{
- int error __diagused;
+ int error __diagused, i;
dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
KASSERT(dead_rootmount != NULL);
dead_rootmount->mnt_iflag |= IMNT_MPSAFE;
mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE);
- TAILQ_INIT(&lru_free_list);
- TAILQ_INIT(&lru_hold_list);
- TAILQ_INIT(&lru_vrele_list);
-
+ for (i = 0; i < LRU_COUNT; i++) {
+ TAILQ_INIT(&lru_list[i]);
+ }
vcache_init();
cv_init(&vdrain_cv, "vdrain");
@@ -452,9 +447,9 @@
KASSERT(mutex_owned(vp->v_interlock));
if (vp->v_holdcnt > 0)
- return &lru_hold_list;
+ return &lru_list[LRU_HOLD];
else
- return &lru_free_list;
+ return &lru_list[LRU_FREE];
}
/*
@@ -466,19 +461,39 @@
lru_requeue(vnode_t *vp, vnodelst_t *listhd)
{
vnode_impl_t *vip;
+ int d;
+
+ /*
+ * If the vnode is on the correct list, and was put there recently,
+ * then leave it be, thus avoiding huge cache and lock contention.
+ */
+ vip = VNODE_TO_VIMPL(vp);
+ if (listhd == vip->vi_lrulisthd &&
+ (hardclock_ticks - vip->vi_lrulisttm) < hz) {
+ return;
+ }
mutex_enter(&vdrain_lock);
- vip = VNODE_TO_VIMPL(vp);
+ d = 0;
if (vip->vi_lrulisthd != NULL)
TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
else
- numvnodes++;
+ d++;
vip->vi_lrulisthd = listhd;
+ vip->vi_lrulisttm = hardclock_ticks;
if (vip->vi_lrulisthd != NULL)
TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
else
- numvnodes--;
- if (numvnodes > desiredvnodes || listhd == &lru_vrele_list)
+ d--;
+ if (d != 0) {
+ /*
+ * Looks strange? This is not a bug. Don't store
+ * numvnodes unless there is a change - avoid false
+ * sharing on MP.
+ */
+ numvnodes += d;
+ }
+ if (numvnodes > desiredvnodes || listhd == &lru_list[LRU_VRELE])
cv_broadcast(&vdrain_cv);
mutex_exit(&vdrain_lock);
}
@@ -491,33 +506,37 @@
vrele_flush(struct mount *mp)
{
vnode_impl_t *vip, *marker;
+ vnode_t *vp;
KASSERT(fstrans_is_owner(mp));
marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
mutex_enter(&vdrain_lock);
- TAILQ_INSERT_HEAD(&lru_vrele_list, marker, vi_lrulist);
+ TAILQ_INSERT_HEAD(&lru_list[LRU_VRELE], marker, vi_lrulist);
while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
- TAILQ_REMOVE(&lru_vrele_list, marker, vi_lrulist);
- TAILQ_INSERT_AFTER(&lru_vrele_list, vip, marker, vi_lrulist);
- if (vnis_marker(VIMPL_TO_VNODE(vip)))
+ TAILQ_REMOVE(&lru_list[LRU_VRELE], marker, vi_lrulist);
+ TAILQ_INSERT_AFTER(&lru_list[LRU_VRELE], vip, marker,
+ vi_lrulist);
+ vp = VIMPL_TO_VNODE(vip);
+ if (vnis_marker(vp))
continue;
- KASSERT(vip->vi_lrulisthd == &lru_vrele_list);
+ KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
- vip->vi_lrulisthd = &lru_hold_list;
+ vip->vi_lrulisthd = &lru_list[LRU_HOLD];
+ vip->vi_lrulisttm = hardclock_ticks;
TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
mutex_exit(&vdrain_lock);
- mutex_enter(VIMPL_TO_VNODE(vip)->v_interlock);
- vrelel(VIMPL_TO_VNODE(vip), VRELEL_FORCE_RELE);
+ mutex_enter(vp->v_interlock);
+ vrelel(vp, VRELEL_FORCE);
mutex_enter(&vdrain_lock);
}
- TAILQ_REMOVE(&lru_vrele_list, marker, vi_lrulist);
+ TAILQ_REMOVE(&lru_list[LRU_VRELE], marker, vi_lrulist);
mutex_exit(&vdrain_lock);
vnfree_marker(VIMPL_TO_VNODE(marker));
@@ -555,7 +574,7 @@
if (vcache_vget(vp) == 0) {
if (!vrecycle(vp)) {
mutex_enter(vp->v_interlock);
- vrelel(vp, VRELEL_FORCE_RELE);
+ vrelel(vp, VRELEL_FORCE);
}
}
fstrans_done(mp);
@@ -584,16 +603,17 @@
* will put it back onto the right list before
* its v_usecount reaches zero.
*/
- KASSERT(vip->vi_lrulisthd == &lru_vrele_list);
+ KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
- vip->vi_lrulisthd = &lru_hold_list;
+ vip->vi_lrulisthd = &lru_list[LRU_HOLD];
+ vip->vi_lrulisttm = hardclock_ticks;
TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
vdrain_retry = true;
mutex_exit(&vdrain_lock);
mutex_enter(vp->v_interlock);
- vrelel(vp, VRELEL_FORCE_RELE);
+ vrelel(vp, VRELEL_FORCE);
fstrans_done(mp);
mutex_enter(&vdrain_lock);
@@ -606,9 +626,6 @@
static void
vdrain_thread(void *cookie)
{
- vnodelst_t *listhd[] = {
- &lru_vrele_list, &lru_free_list, &lru_hold_list
- };
int i;
u_int target;
vnode_impl_t *vip, *marker;
@@ -621,22 +638,22 @@
vdrain_retry = false;
target = desiredvnodes - desiredvnodes/10;
- for (i = 0; i < __arraycount(listhd); i++) {
- TAILQ_INSERT_HEAD(listhd[i], marker, vi_lrulist);
+ for (i = 0; i < LRU_COUNT; i++) {
+ TAILQ_INSERT_HEAD(&lru_list[i], marker, vi_lrulist);
while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
- TAILQ_REMOVE(listhd[i], marker, vi_lrulist);
- TAILQ_INSERT_AFTER(listhd[i], vip, marker,
+ TAILQ_REMOVE(&lru_list[i], marker, vi_lrulist);
+ TAILQ_INSERT_AFTER(&lru_list[i], vip, marker,
Home |
Main Index |
Thread Index |
Old Index