Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/ad-namecache]: src/sys namecache:
details: https://anonhg.NetBSD.org/src/rev/f8df200bb20b
branches: ad-namecache
changeset: 850518:f8df200bb20b
user: ad <ad%NetBSD.org@localhost>
date: Tue Jan 14 11:07:40 2020 +0000
description:
namecache:
This is working better than expected. It seems to cut system time for
build.sh by ~10% on my test machine and joerg@ is seeing better results with
pbulk. Improve it a bit more without changing the basic idea:
- Split cache_list_lock into a per-vnode rwlock for reverse lookup, and a
lightly contended global lock on LRU state (cache_lru_lock),
- For LRU replacement, imitate the VM system's page replacement algorithm.
This eliminates the writebacks to struct namecache (to track time of last
hit).
- Dynamically allocate the per-directory lock, preparing the way for having
a "struct nchdir" or similar which could contain stuff like different
structures for lookup, cached info to do the equivalent of VOP_ACCESS() in
cache, and so on.
diffstat:
sys/kern/vfs_cache.c | 526 +++++++++++++++++++++++++++++++-------------------
sys/sys/namei.src | 11 +-
sys/sys/vnode_impl.h | 7 +-
3 files changed, 338 insertions(+), 206 deletions(-)
diffs (truncated from 991 to 300 lines):
diff -r 16ed9f42586e -r f8df200bb20b sys/kern/vfs_cache.c
--- a/sys/kern/vfs_cache.c Mon Jan 13 08:51:06 2020 +0000
+++ b/sys/kern/vfs_cache.c Tue Jan 14 11:07:40 2020 +0000
@@ -1,7 +1,7 @@
-/* $NetBSD: vfs_cache.c,v 1.126.2.4 2020/01/13 08:51:07 ad Exp $ */
+/* $NetBSD: vfs_cache.c,v 1.126.2.5 2020/01/14 11:07:40 ad Exp $ */
/*-
- * Copyright (c) 2008, 2019 The NetBSD Foundation, Inc.
+ * Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
@@ -71,6 +71,10 @@
* DELETE, or NOCACHE is set (rewrite), and name is located in the
* cache, it will be dropped.
*
+ * Background:
+ *
+ * XXX add a bit of history
+ *
* Data structures:
*
* The original BSD implementation used a global hash table, which
@@ -86,24 +90,16 @@
* utimately make use of some other data structure, perhaps a Robin
* Hood hash. Insert blurb here when that happens.
*
+ * Replacement:
+ *
+ * XXX LRU blurb.
+ *
* Concurrency:
*
- * There are two locks that are of particular interest:
- *
- * nc_dvp->vi_nclock: a per-directory lock. This is taken mainly
- * during lookups.
+ * XXX need new blurb here
*
- * cache_list_lock: a global lock for all lists, including per-vnode
- * lists and the LRU queue. This is taken mainly during insertion and
- * removal, and when operating in the list -> tree direction.
- *
- * vp->v_interlock: per vnode interlock taken when acquiring a ref.
- *
- * Most all modifications are made holding both cache_list_lock and the
- * directory lock write locked. nc_hittime does not have any kind of
- * serialization appliet to it - updates are racy, but since it's only
- * used for pseudo-LRU replacement it doesn't matter. See definition
- * of "struct namecache" in src/sys/namei.src for the particulars.
+ * See definition of "struct namecache" in src/sys/namei.src for the
+ * particulars.
*
* Per-CPU statistics, and "numcache" are read unlocked, since an
* approximate value is OK. We maintain uintptr_t sized per-CPU
@@ -112,12 +108,14 @@
*
* Lock order:
*
- * 1) nc_dvp->vi_nclock
+ * 1) nc_dvp->vi_ncdlock
* 2) cache_list_lock
* 3) vp->v_interlock
*
* Ugly ASCII diagram:
*
+ * XXX replace tabs with spaces, make less ugly
+ *
* ...
* |
* -----o-----
@@ -150,7 +148,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.126.2.4 2020/01/13 08:51:07 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.126.2.5 2020/01/14 11:07:40 ad Exp $");
#define __NAMECACHE_PRIVATE
#ifdef _KERNEL_OPT
@@ -176,13 +174,22 @@
/* Per-CPU counters. */
struct nchstats_percpu _NAMEI_CACHE_STATS(uintptr_t);
-/* Global lock, lists and pool. */
-static kmutex_t cache_list_lock __cacheline_aligned;
-static pool_cache_t namecache_cache __read_mostly;
-static TAILQ_HEAD(, namecache) nclruhead __cacheline_aligned;
+/* Global pool cache. */
+static pool_cache_t cache_pool __read_mostly;
-/* Number of cache entries allocated. */
-static u_int numcache __cacheline_aligned;
+/* LRU replacement state. */
+enum cache_lru_id {
+ LRU_ACTIVE,
+ LRU_INACTIVE,
+ LRU_COUNT
+};
+
+static struct {
+ TAILQ_HEAD(, namecache) list[LRU_COUNT];
+ u_int count[LRU_COUNT];
+} cache_lru __cacheline_aligned;
+
+static kmutex_t cache_lru_lock __cacheline_aligned;
/* Cache effectiveness statistics. This holds total from per-cpu stats */
struct nchstats nchstats __cacheline_aligned;
@@ -195,14 +202,12 @@
} while (/* CONSTCOND */ 0);
/* Tunables */
-static const int cache_hottime = 5; /* number of seconds */
-static const int cache_maxscan = 128; /* reclaim: max entries to scan */
-static int doingcache = 1; /* 1 => enable the cache */
+static const int cache_lru_maxdeact = 2; /* max # to deactivate */
+static const int cache_lru_maxscan = 128; /* max # to scan/reclaim */
+static int doingcache = 1; /* 1 => enable the cache */
static void cache_reclaim(void);
-static int cache_ctor(void *, void *, int);
-static void cache_dtor(void *, void *);
-static void cache_remove(struct namecache *, bool);
+static void cache_remove(struct namecache *, const bool);
/* sysctl */
static struct sysctllog *sysctllog;
@@ -273,15 +278,16 @@
};
/*
- * Remove an entry from the cache. The directory must be locked, and if
- * "inreverse" is false, cache_list_lock will be acquired when removing the
- * entry from the global lists.
+ * Remove an entry from the cache. The directory lock must be held, and if
+ * "dirtovnode" is true (it usually will be), the vnode's cache lock will be
+ * acquired when removing the entry from the vnode list.
*/
static void
-cache_remove(struct namecache *ncp, bool inreverse)
+cache_remove(struct namecache *ncp, const bool dirtovnode)
{
+ krwlock_t *vlock;
- KASSERT(rw_write_held(VNODE_TO_VIMPL(ncp->nc_dvp)->vi_nclock));
+ KASSERT(rw_write_held(VNODE_TO_VIMPL(ncp->nc_dvp)->vi_ncdlock));
SDT_PROBE(vfs, namecache, invalidate, done, ncp->nc_dvp,
0, 0, 0, 0);
@@ -289,29 +295,88 @@
/* First remove from the directory's rbtree. */
rb_tree_remove_node(&VNODE_TO_VIMPL(ncp->nc_dvp)->vi_nctree, ncp);
- /* Then remove from the lists. */
- if (!inreverse) {
- mutex_enter(&cache_list_lock);
- }
- ncp->nc_dvp = NULL;
+ /* Then remove from the LRU lists. */
+ mutex_enter(&cache_lru_lock);
+ TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
+ cache_lru.count[ncp->nc_lrulist]--;
+ mutex_exit(&cache_lru_lock);
+
+ /* Then remove from the vnode list. */
if (ncp->nc_vp != NULL) {
+ vlock = VNODE_TO_VIMPL(ncp->nc_vp)->vi_ncvlock;
+ if (__predict_true(dirtovnode)) {
+ rw_enter(vlock, RW_WRITER);
+ }
TAILQ_REMOVE(&VNODE_TO_VIMPL(ncp->nc_vp)->vi_nclist, ncp,
nc_vlist);
+ if (__predict_true(dirtovnode)) {
+ rw_exit(vlock);
+ }
ncp->nc_vp = NULL;
}
- TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
- numcache--;
- if (!inreverse) {
- mutex_exit(&cache_list_lock);
- }
/* Finally, free it. */
if (ncp->nc_nlen > NCHNAMLEN) {
size_t sz = offsetof(struct namecache, nc_name[ncp->nc_nlen]);
kmem_free(ncp, sz);
} else {
- pool_cache_put(namecache_cache, ncp);
+ pool_cache_put(cache_pool, ncp);
+ }
+}
+
+/*
+ * Try to balance the LRU lists. Pick some victim entries, and re-queue
+ * them from the head of the ACTIVE list to the tail of the INACTIVE list.
+ */
+static void __noinline
+cache_deactivate(void)
+{
+ struct namecache *ncp;
+ int total, i;
+
+ KASSERT(mutex_owned(&cache_lru_lock));
+
+ /* If we're nowhere near budget yet, don't bother. */
+ total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE];
+ if (total < (desiredvnodes >> 1)) {
+ return;
+ }
+
+ /* If not out of balance, don't bother. */
+ if (cache_lru.count[LRU_ACTIVE] < cache_lru.count[LRU_INACTIVE]) {
+ return;
}
+
+ /* Move victim from head of ACTIVE list, to tail of INACTIVE. */
+ for (i = 0; i < cache_lru_maxdeact; i++) {
+ ncp = TAILQ_FIRST(&cache_lru.list[LRU_ACTIVE]);
+ if (ncp == NULL) {
+ break;
+ }
+ KASSERT(ncp->nc_lrulist == LRU_ACTIVE);
+ ncp->nc_lrulist = LRU_INACTIVE;
+ TAILQ_REMOVE(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
+ TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], ncp, nc_lru);
+ cache_lru.count[LRU_ACTIVE]--;
+ cache_lru.count[LRU_INACTIVE]++;
+ }
+}
+
+/*
+ * Re-queue an entry onto the correct LRU list, after it has scored a hit.
+ */
+static void __noinline
+cache_activate(struct namecache *ncp)
+{
+
+ mutex_enter(&cache_lru_lock);
+ /* Put on tail of ACTIVE queue, since it just scored a hit. */
+ TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
+ TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
+ cache_lru.count[ncp->nc_lrulist]--;
+ cache_lru.count[LRU_ACTIVE]++;
+ ncp->nc_lrulist = LRU_ACTIVE;
+ mutex_exit(&cache_lru_lock);
}
/*
@@ -324,7 +389,7 @@
struct namecache *ncp;
struct iovec iov;
- KASSERT(rw_lock_held(VNODE_TO_VIMPL(dvp)->vi_nclock));
+ KASSERT(rw_lock_held(VNODE_TO_VIMPL(dvp)->vi_ncdlock));
iov.iov_base = __UNCONST(name);
iov.iov_len = namelen;
@@ -332,14 +397,9 @@
if (ncp != NULL) {
KASSERT(ncp->nc_dvp == dvp);
- /*
- * Avoid false sharing: don't write back to nc_hittime
- * unless changed significantly. This is an unlocked
- * update and is racy, but it doesn't matter since it's
- * only used for pseudo-LRU replacement.
- */
- if (((ncp->nc_hittime ^ hardclock_ticks) & ~31) != 0) {
- ncp->nc_hittime = hardclock_ticks;
+ /* If the entry is on the wrong LRU list, requeue it. */
+ if (__predict_false(ncp->nc_lrulist != LRU_ACTIVE)) {
+ cache_activate(ncp);
}
SDT_PROBE(vfs, namecache, lookup, hit, dvp,
name, namelen, 0, 0);
@@ -407,7 +467,7 @@
{
struct namecache *ncp;
struct vnode *vp;
- krwlock_t *dirlock;
+ krwlock_t *dlock;
int error;
bool hit;
krw_t op;
@@ -438,11 +498,16 @@
op = RW_READER;
}
- dirlock = VNODE_TO_VIMPL(dvp)->vi_nclock;
- rw_enter(dirlock, op);
+ /* If no directory lock yet, there's nothing in cache. */
+ dlock = VNODE_TO_VIMPL(dvp)->vi_ncdlock;
+ if (__predict_false(dlock == NULL)) {
+ return false;
+ }
+
+ rw_enter(dlock, op);
Home |
Main Index |
Thread Index |
Old Index