Source-Changes-HG archive
[src/trunk]: src/sys/kern improve performance of journal writes by paralleliz...
details: https://anonhg.NetBSD.org/src/rev/d78de4831e7d
branches: trunk
changeset: 352615:d78de4831e7d
user: jdolecek <jdolecek%NetBSD.org@localhost>
date: Mon Apr 10 21:34:37 2017 +0000
description:
improve performance of journal writes by parallelizing the I/O - use 4 bufs
by default, and add the sysctl vfs.wapbl.journal_iobufs to control it
this also removes the need to allocate an iobuf during commit, so it
may help avoid deadlocks during memory shortages such as PR kern/47030
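In outline, the change pre-allocates a small pool of journal I/O buffers (four by
default) and rotates each one between a "free or filling" queue and a "busy"
(in-flight) queue, so several journal block writes can be outstanding at once
instead of one synchronous buffered write at a time. The following is only a
rough sketch of that rotation under assumed, simplified types, not the actual
wapbl code: the jbuf/jpool names and the start_write/wait_write callbacks are
placeholders for what the real diff does with geteblk(9), VOP_STRATEGY(9) and
biowait(9).

#include <sys/queue.h>
#include <stddef.h>

struct jbuf {
	TAILQ_ENTRY(jbuf) jb_list;	/* linkage on either queue */
	size_t jb_used;			/* bytes buffered so far */
};

struct jpool {
	TAILQ_HEAD(, jbuf) jp_free;	/* free or currently-filling bufs */
	TAILQ_HEAD(, jbuf) jp_busy;	/* bufs with a write in flight */
};

static void
jpool_init(struct jpool *jp)
{
	TAILQ_INIT(&jp->jp_free);
	TAILQ_INIT(&jp->jp_busy);
}

/*
 * Take the head of the free queue, start its asynchronous write and park
 * it on the busy queue; the caller keeps filling the next free buf while
 * the device works on this one.  Caller ensures the queue is non-empty,
 * as the real code does with a KASSERT after flushing.
 */
static void
jpool_issue(struct jpool *jp, void (*start_write)(struct jbuf *))
{
	struct jbuf *jb = TAILQ_FIRST(&jp->jp_free);

	TAILQ_REMOVE(&jp->jp_free, jb, jb_list);
	(*start_write)(jb);		/* stands in for VOP_STRATEGY() */
	TAILQ_INSERT_TAIL(&jp->jp_busy, jb, jb_list);
}

/*
 * Wait for the oldest in-flight write, then recycle its buffer back onto
 * the free queue for reuse.
 */
static void
jpool_reclaim(struct jpool *jp, void (*wait_write)(struct jbuf *))
{
	struct jbuf *jb = TAILQ_FIRST(&jp->jp_busy);

	TAILQ_REMOVE(&jp->jp_busy, jb, jb_list);
	(*wait_write)(jb);		/* stands in for biowait() */
	jb->jb_used = 0;
	TAILQ_INSERT_TAIL(&jp->jp_free, jb, jb_list);
}

The pool size can be adjusted through the new read-write sysctl, e.g.
"sysctl -w vfs.wapbl.journal_iobufs=8"; since the buffers are allocated when a
journal is set up, a changed value presumably applies to journals started
afterwards. The new per-mount event counters ("journal I/O bufs no wait" and
"journal I/O bufs biowait") should be visible in vmstat -e output.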
diffstat:
sys/kern/vfs_wapbl.c | 250 ++++++++++++++++++++++++++++++++++++++++----------
1 files changed, 198 insertions(+), 52 deletions(-)
diffs (truncated from 415 to 300 lines):
diff -r 442e604da80a -r d78de4831e7d sys/kern/vfs_wapbl.c
--- a/sys/kern/vfs_wapbl.c Mon Apr 10 19:52:38 2017 +0000
+++ b/sys/kern/vfs_wapbl.c Mon Apr 10 21:34:37 2017 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: vfs_wapbl.c,v 1.94 2017/04/10 19:52:38 jdolecek Exp $ */
+/* $NetBSD: vfs_wapbl.c,v 1.95 2017/04/10 21:34:37 jdolecek Exp $ */
/*-
* Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
@@ -36,7 +36,7 @@
#define WAPBL_INTERNAL
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.94 2017/04/10 19:52:38 jdolecek Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.95 2017/04/10 21:34:37 jdolecek Exp $");
#include <sys/param.h>
#include <sys/bitops.h>
@@ -72,6 +72,7 @@
static int wapbl_flush_disk_cache = 1;
static int wapbl_verbose_commit = 0;
static int wapbl_allow_fuadpo = 0; /* switched off by default for now */
+static int wapbl_journal_iobufs = 4;
static inline size_t wapbl_space_free(size_t, off_t, off_t);
@@ -191,6 +192,8 @@
char wl_ev_group[EVCNT_STRING_MAX]; /* r */
struct evcnt wl_ev_commit; /* l */
struct evcnt wl_ev_journalwrite; /* l */
+ struct evcnt wl_ev_jbufs_bio_nowait; /* l */
+ struct evcnt wl_ev_jbufs_bio_wait; /* l */
struct evcnt wl_ev_metawrite; /* lm */
struct evcnt wl_ev_cacheflush; /* l */
#endif
@@ -228,9 +231,9 @@
SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
accounting */
- u_char *wl_buffer; /* l: buffer for wapbl_buffered_write() */
- daddr_t wl_buffer_dblk; /* l: buffer disk block address */
- size_t wl_buffer_used; /* l: buffer current use */
+ /* buffers for wapbl_buffered_write() */
+ TAILQ_HEAD(, buf) wl_iobufs; /* l: Free or filling bufs */
+ TAILQ_HEAD(, buf) wl_iobufs_busy; /* l: In-transit bufs */
int wl_dkcache; /* r: disk cache flags */
#define WAPBL_USE_FUA(wl) \
@@ -360,6 +363,15 @@
if (rv)
return rv;
+ rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "journal_iobufs",
+ SYSCTL_DESCR("count of bufs used for journal I/O (max async count)"),
+ NULL, 0, &wapbl_journal_iobufs, 0,
+ CTL_CREATE, CTL_EOL);
+ if (rv)
+ return rv;
+
return rv;
}
@@ -401,6 +413,10 @@
NULL, wl->wl_ev_group, "commit");
evcnt_attach_dynamic(&wl->wl_ev_journalwrite, EVCNT_TYPE_MISC,
NULL, wl->wl_ev_group, "journal sync block write");
+ evcnt_attach_dynamic(&wl->wl_ev_jbufs_bio_nowait, EVCNT_TYPE_MISC,
+ NULL, wl->wl_ev_group, "journal I/O bufs no wait");
+ evcnt_attach_dynamic(&wl->wl_ev_jbufs_bio_wait, EVCNT_TYPE_MISC,
+ NULL, wl->wl_ev_group, "journal I/O bufs biowait");
evcnt_attach_dynamic(&wl->wl_ev_metawrite, EVCNT_TYPE_MISC,
NULL, wl->wl_ev_group, "metadata finished block write");
evcnt_attach_dynamic(&wl->wl_ev_cacheflush, EVCNT_TYPE_MISC,
@@ -412,6 +428,8 @@
{
evcnt_detach(&wl->wl_ev_commit);
evcnt_detach(&wl->wl_ev_journalwrite);
+ evcnt_detach(&wl->wl_ev_jbufs_bio_nowait);
+ evcnt_detach(&wl->wl_ev_jbufs_bio_wait);
evcnt_detach(&wl->wl_ev_metawrite);
evcnt_detach(&wl->wl_ev_cacheflush);
}
@@ -605,9 +623,6 @@
wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;
TAILQ_INIT(&wl->wl_dealloclist);
- wl->wl_buffer = wapbl_alloc(MAXPHYS);
- wl->wl_buffer_used = 0;
-
wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
wapbl_evcnt_init(wl);
@@ -630,6 +645,25 @@
wl->wl_wc_scratch = wapbl_alloc(len);
}
+ TAILQ_INIT(&wl->wl_iobufs);
+ TAILQ_INIT(&wl->wl_iobufs_busy);
+ for (int i = 0; i < wapbl_journal_iobufs; i++) {
+ struct buf *bp;
+
+ if ((bp = geteblk(MAXPHYS)) == NULL)
+ goto errout;
+
+ mutex_enter(&bufcache_lock);
+ mutex_enter(devvp->v_interlock);
+ bgetvp(devvp, bp);
+ mutex_exit(devvp->v_interlock);
+ mutex_exit(&bufcache_lock);
+
+ bp->b_dev = devvp->v_rdev;
+
+ TAILQ_INSERT_TAIL(&wl->wl_iobufs, bp, b_wapbllist);
+ }
+
/*
* if there was an existing set of unlinked but
* allocated inodes, preserve it in the new
@@ -656,7 +690,13 @@
wapbl_discard(wl);
wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
- wapbl_free(wl->wl_buffer, MAXPHYS);
+ while (!TAILQ_EMPTY(&wl->wl_iobufs)) {
+ struct buf *bp;
+
+ bp = TAILQ_FIRST(&wl->wl_iobufs);
+ TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
+ brelse(bp, BC_INVAL);
+ }
wapbl_inodetrk_free(wl);
wapbl_free(wl, sizeof(*wl));
@@ -832,10 +872,17 @@
KASSERT(wl->wl_inohashcnt == 0);
KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
KASSERT(wl->wl_dealloccnt == 0);
+ KASSERT(TAILQ_EMPTY(&wl->wl_iobufs_busy));
wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
- wapbl_free(wl->wl_buffer, MAXPHYS);
+ while (!TAILQ_EMPTY(&wl->wl_iobufs)) {
+ struct buf *bp;
+
+ bp = TAILQ_FIRST(&wl->wl_iobufs);
+ TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
+ brelse(bp, BC_INVAL);
+ }
wapbl_inodetrk_free(wl);
wapbl_evcnt_free(wl);
@@ -853,14 +900,10 @@
* Unbuffered disk I/O
*/
-static int
-wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
+static void
+wapbl_doio_accounting(struct vnode *devvp, int flags)
{
struct pstats *pstats = curlwp->l_proc->p_stats;
- struct buf *bp;
- int error;
-
- KASSERT(devvp->v_type == VBLK);
if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
mutex_enter(devvp->v_interlock);
@@ -871,6 +914,18 @@
pstats->p_ru.ru_inblock++;
}
+}
+
+static int
+wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
+{
+ struct buf *bp;
+ int error;
+
+ KASSERT(devvp->v_type == VBLK);
+
+ wapbl_doio_accounting(devvp, flags);
+
bp = getiobuf(devvp, true);
bp->b_flags = flags;
bp->b_cflags = BC_BUSY; /* mandatory, asserted by biowait() */
@@ -935,24 +990,77 @@
*/
/*
+ * wapbl_buffered_write_async(wl, bp)
+ *
+ * Send buffer for asynchronous write.
+ */
+static void
+wapbl_buffered_write_async(struct wapbl *wl, struct buf *bp)
+{
+ wapbl_doio_accounting(wl->wl_devvp, bp->b_flags);
+
+ KASSERT(TAILQ_FIRST(&wl->wl_iobufs) == bp);
+ TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
+
+ bp->b_flags = B_WRITE | WAPBL_JFLAGS(wl);
+ bp->b_cflags = BC_BUSY; /* mandatory, asserted by biowait() */
+ bp->b_oflags = 0;
+ bp->b_bcount = bp->b_resid;
+ BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
+
+ VOP_STRATEGY(wl->wl_devvp, bp);
+
+ wl->wl_ev_journalwrite.ev_count++;
+
+ TAILQ_INSERT_TAIL(&wl->wl_iobufs_busy, bp, b_wapbllist);
+}
+
+/*
* wapbl_buffered_flush(wl)
*
* Flush any buffered writes from wapbl_buffered_write.
*/
static int
-wapbl_buffered_flush(struct wapbl *wl)
+wapbl_buffered_flush(struct wapbl *wl, bool full)
{
- int error;
-
- if (wl->wl_buffer_used == 0)
- return 0;
-
- error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
- wl->wl_devvp, wl->wl_buffer_dblk,
- B_WRITE | WAPBL_JFLAGS(wl));
- wl->wl_buffer_used = 0;
-
- wl->wl_ev_journalwrite.ev_count++;
+ int error = 0;
+ struct buf *bp, *bnext;
+ bool only_done = true, found = false;
+
+ /* if there is outstanding buffered write, send it now */
+ if ((bp = TAILQ_FIRST(&wl->wl_iobufs)) && bp->b_resid > 0)
+ wapbl_buffered_write_async(wl, bp);
+
+ /* wait for I/O to complete */
+again:
+ TAILQ_FOREACH_SAFE(bp, &wl->wl_iobufs_busy, b_wapbllist, bnext) {
+ if (!full && only_done) {
+ /* skip unfinished */
+ if (!ISSET(bp->b_oflags, BO_DONE))
+ continue;
+ }
+
+ if (ISSET(bp->b_oflags, BO_DONE))
+ wl->wl_ev_jbufs_bio_nowait.ev_count++;
+ else
+ wl->wl_ev_jbufs_bio_wait.ev_count++;
+
+ TAILQ_REMOVE(&wl->wl_iobufs_busy, bp, b_wapbllist);
+ error = biowait(bp);
+
+ /* reset for reuse */
+ bp->b_blkno = bp->b_resid = 0;
+ TAILQ_INSERT_TAIL(&wl->wl_iobufs, bp, b_wapbllist);
+ found = true;
+
+ if (!full)
+ break;
+ }
+
+ if (!found && only_done && !TAILQ_EMPTY(&wl->wl_iobufs_busy)) {
+ only_done = false;
+ goto again;
+ }
return error;
}
@@ -967,49 +1075,63 @@
static int
wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn)
{
- int error;
size_t resid;
+ struct buf *bp;
+
+again:
+ bp = TAILQ_FIRST(&wl->wl_iobufs);
+
+ if (bp == NULL) {
+ /* No more buffers, wait for any previous I/O to finish. */
+ wapbl_buffered_flush(wl, false);
+
+ bp = TAILQ_FIRST(&wl->wl_iobufs);
+ KASSERT(bp != NULL);
+ }
/*