[src/trunk]: src/sys/dev fix several bugs, make nvme(4) MPSAFE by default and...
details: https://anonhg.NetBSD.org/src/rev/848d31e91441
branches: trunk
changeset: 347851:848d31e91441
user: jdolecek <jdolecek%NetBSD.org@localhost>
date: Sun Sep 18 21:19:39 2016 +0000
description:
fix several bugs, make nvme(4) MPSAFE by default, and bump the default
I/O queue size (nvme_ioq_size) from 128 to 1024 entries; tested with
VirtualBox and QEMU
* remove the NVME_INTMC/NVME_INTMS writes in the hw intr handler, as this
is not MPSAFE; fortunately they don't seem to be necessary, and dropping
them shaves off two register writes
* need to use full mutex_enter() in nvme_q_complete(), to avoid a small
race between one handler exiting the loop and another entering
* for MSI, hand over the command result processing to a softintr (see
the sketch after this list); unfortunately this can't easily be done for
INTx interrupts, as they require the doorbell write to deassert
* unlock/relock q->q_cq_mtx before calling ccb_done to avoid potential deadlocks
* make sure to destroy queue mutexes when destroying the queue (LOCKDEBUG)
* make ns ctx pool per-device, so that it's deallocated properly on module
unload
* handle ctx allocation failure in ld_nvme_dobio()
* remove the splbio() calls in ld_nvme_dobio() and the sync path; these
paths are exercised only for dump/shutdown, which already runs with
interrupts disabled
* free the ns ctx in ld_nvme_biodone() before calling lddone() to avoid
memory starvation, as lddone() can trigger another i/o request
* be more careful with PR_WAITOK; these paths can be called from
interrupt context, where we can't wait
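
The softintr handoff for MSI described above is mostly cut off by the
truncation of the diff below (the sys/dev/pci/nvme_pci.c hunks are
incomplete). As a rough illustration only, a minimal sketch of the
pattern using the standard NetBSD softint(9) API could look like the
following; the handler names and the q_softih/q_sc members are
hypothetical, not necessarily what the driver actually uses:

#include <sys/intr.h>	/* softint_establish, softint_schedule */

/* hard interrupt handler for MSI: defer CQ processing to softint */
static int
nvme_intr_msi(void *arg)
{
	struct nvme_queue *q = arg;

	/*
	 * MSI is edge-triggered, so nothing has to be written to the
	 * device to deassert the interrupt; draining the completion
	 * queue can be deferred to softint context.  INTx is
	 * level-triggered and stays asserted until the CQ head
	 * doorbell is written, hence it can't use this handoff.
	 */
	softint_schedule(q->q_softih);
	return 1;
}

/* soft interrupt handler: process completed commands */
static void
nvme_softintr_msi(void *arg)
{
	struct nvme_queue *q = arg;

	nvme_q_complete(q->q_sc, q);
}

/* established once per queue during attach, e.g.: */
q->q_softih = softint_establish(SOFTINT_BIO | SOFTINT_MPSAFE,
    nvme_softintr_msi, q);

SOFTINT_MPSAFE registers the soft interrupt handler as MP-safe, i.e. it
runs without the kernel lock, matching the MPSAFE goal of this change.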
diffstat:
sys/dev/ic/ld_nvme.c | 51 +++++++++++--------
sys/dev/ic/nvme.c | 123 +++++++++++++++++++++++++-----------------------
sys/dev/ic/nvmevar.h | 30 ++++++-----
sys/dev/pci/nvme_pci.c | 62 +++++++++++++++++++----
4 files changed, 160 insertions(+), 106 deletions(-)
diffs (truncated from 578 to 300 lines):
diff -r 64aac7d2c070 -r 848d31e91441 sys/dev/ic/ld_nvme.c
--- a/sys/dev/ic/ld_nvme.c Sun Sep 18 18:24:00 2016 +0000
+++ b/sys/dev/ic/ld_nvme.c Sun Sep 18 21:19:39 2016 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: ld_nvme.c,v 1.3 2016/09/16 15:24:47 jdolecek Exp $ */
+/* $NetBSD: ld_nvme.c,v 1.4 2016/09/18 21:19:39 jdolecek Exp $ */
/*-
* Copyright (C) 2016 NONAKA Kimihiro <nonaka%netbsd.org@localhost>
@@ -26,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ld_nvme.c,v 1.3 2016/09/16 15:24:47 jdolecek Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ld_nvme.c,v 1.4 2016/09/18 21:19:39 jdolecek Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -108,8 +108,8 @@
ld->sc_secsize = 1 << f->lbads;
ld->sc_secperunit = nsze;
- ld->sc_maxxfer = MAXPHYS;
- ld->sc_maxqueuecnt = naa->naa_qentries;
+ ld->sc_maxxfer = MAXPHYS; /* XXX set according to device */
+ ld->sc_maxqueuecnt = naa->naa_qentries; /* XXX set to max allowed by device, not current config */
ld->sc_start = ld_nvme_start;
ld->sc_dump = ld_nvme_dump;
ld->sc_flush = ld_nvme_flush;
@@ -156,9 +156,12 @@
{
struct nvme_ns_context *ctx;
int error;
- int s;
+ int waitok = (bp != NULL && !cpu_softintr_p() && !cpu_intr_p());
- ctx = nvme_ns_get_ctx(bp != NULL ? PR_WAITOK : PR_NOWAIT);
+ ctx = nvme_ns_get_ctx(sc, waitok ? PR_WAITOK : PR_NOWAIT);
+ if (ctx == NULL)
+ return EAGAIN;
+
ctx->nnc_cookie = sc;
ctx->nnc_nsid = sc->sc_nsid;
ctx->nnc_done = ld_nvme_biodone;
@@ -168,14 +171,12 @@
ctx->nnc_secsize = sc->sc_ld.sc_secsize;
ctx->nnc_blkno = blkno;
ctx->nnc_flags = dowrite ? 0 : NVME_NS_CTX_F_READ;
- if (bp == NULL) {
+ if (bp == NULL)
SET(ctx->nnc_flags, NVME_NS_CTX_F_POLL);
- s = splbio();
- }
+
error = nvme_ns_dobio(sc->sc_nvme, ctx);
- if (bp == NULL) {
- splx(s);
- }
+ if (error)
+ nvme_ns_put_ctx(sc, ctx);
return error;
}
@@ -187,6 +188,10 @@
struct buf *bp = ctx->nnc_buf;
int status = NVME_CQE_SC(ctx->nnc_status);
+ /* free before processing to avoid starvation, lddone() could trigger
+ * another i/o request */
+ nvme_ns_put_ctx(sc, ctx);
+
if (bp != NULL) {
if (status != NVME_CQE_SC_SUCCESS) {
bp->b_error = EIO;
@@ -201,7 +206,6 @@
aprint_error_dev(sc->sc_ld.sc_dv, "I/O error\n");
}
}
- nvme_ns_put_ctx(ctx);
}
static int
@@ -210,21 +214,23 @@
struct ld_nvme_softc *sc = device_private(ld->sc_dv);
struct nvme_ns_context *ctx;
int error;
- int s;
+ int waitok = (!ISSET(flags, LDFL_POLL)
+ && !cpu_softintr_p() && !cpu_intr_p());
- ctx = nvme_ns_get_ctx((flags & LDFL_POLL) ? PR_NOWAIT : PR_WAITOK);
+ ctx = nvme_ns_get_ctx(sc, waitok ? PR_WAITOK : PR_NOWAIT);
+ if (ctx == NULL)
+ return EAGAIN;
+
ctx->nnc_cookie = sc;
ctx->nnc_nsid = sc->sc_nsid;
ctx->nnc_done = ld_nvme_syncdone;
ctx->nnc_flags = 0;
- if (flags & LDFL_POLL) {
+ if (flags & LDFL_POLL)
SET(ctx->nnc_flags, NVME_NS_CTX_F_POLL);
- s = splbio();
- }
+
error = nvme_ns_sync(sc->sc_nvme, ctx);
- if (flags & LDFL_POLL) {
- splx(s);
- }
+ if (error)
+ nvme_ns_put_ctx(sc, ctx);
return error;
}
@@ -232,6 +238,7 @@
static void
ld_nvme_syncdone(struct nvme_ns_context *ctx)
{
+ struct ld_nvme_softc *sc = ctx->nnc_cookie;
- nvme_ns_put_ctx(ctx);
+ nvme_ns_put_ctx(sc, ctx);
}
diff -r 64aac7d2c070 -r 848d31e91441 sys/dev/ic/nvme.c
--- a/sys/dev/ic/nvme.c Sun Sep 18 18:24:00 2016 +0000
+++ b/sys/dev/ic/nvme.c Sun Sep 18 21:19:39 2016 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: nvme.c,v 1.8 2016/09/17 19:52:16 jdolecek Exp $ */
+/* $NetBSD: nvme.c,v 1.9 2016/09/18 21:19:39 jdolecek Exp $ */
/* $OpenBSD: nvme.c,v 1.49 2016/04/18 05:59:50 dlg Exp $ */
/*
@@ -18,7 +18,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: nvme.c,v 1.8 2016/09/17 19:52:16 jdolecek Exp $");
+__KERNEL_RCSID(0, "$NetBSD: nvme.c,v 1.9 2016/09/18 21:19:39 jdolecek Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -41,7 +41,7 @@
#include <dev/ic/nvmeio.h>
int nvme_adminq_size = 128;
-int nvme_ioq_size = 128;
+int nvme_ioq_size = 1024;
static int nvme_print(void *, const char *);
@@ -156,18 +156,6 @@
#define nvme_barrier(_s, _r, _l, _f) \
bus_space_barrier((_s)->sc_iot, (_s)->sc_ioh, (_r), (_l), (_f))
-pool_cache_t nvme_ns_ctx_cache;
-ONCE_DECL(nvme_init_once);
-
-static int
-nvme_init(void)
-{
- nvme_ns_ctx_cache = pool_cache_init(sizeof(struct nvme_ns_context),
- 0, 0, 0, "nvme_ns_ctx", NULL, IPL_BIO, NULL, NULL, NULL);
- KASSERT(nvme_ns_ctx_cache != NULL);
- return 0;
-}
-
static void
nvme_version(struct nvme_softc *sc, uint32_t ver)
{
@@ -350,8 +338,6 @@
int ioq_entries = nvme_ioq_size;
int i;
- RUN_ONCE(&nvme_init_once, nvme_init);
-
reg = nvme_read4(sc, NVME_VS);
if (reg == 0xffffffff) {
aprint_error_dev(sc->sc_dev, "invalid mapping\n");
@@ -431,8 +417,20 @@
if (!sc->sc_use_mq)
nvme_write4(sc, NVME_INTMC, 1);
+ snprintf(sc->sc_ctxpoolname, sizeof(sc->sc_ctxpoolname),
+ "%s_ns_ctx", device_xname(sc->sc_dev));
+ sc->sc_ctxpool = pool_cache_init(sizeof(struct nvme_ns_context),
+ 0, 0, 0, sc->sc_ctxpoolname, NULL, IPL_BIO, NULL, NULL, NULL);
+ if (sc->sc_ctxpool == NULL) {
+ aprint_error_dev(sc->sc_dev, "unable to create ctx pool\n");
+ goto free_q;
+ }
+
+ /* probe subdevices */
sc->sc_namespaces = kmem_zalloc(sizeof(*sc->sc_namespaces) * sc->sc_nn,
KM_SLEEP);
+ if (sc->sc_namespaces == NULL)
+ goto free_q;
for (i = 0; i < sc->sc_nn; i++) {
memset(&naa, 0, sizeof(naa));
naa.naa_nsid = i + 1;
@@ -485,6 +483,9 @@
if (error)
return error;
+ /* from now on we are committed to detach, following will never fail */
+ pool_cache_destroy(sc->sc_ctxpool);
+
for (i = 0; i < sc->sc_nq; i++)
nvme_q_free(sc, sc->sc_q[i]);
kmem_free(sc->sc_q, sizeof(*sc->sc_q) * sc->sc_nq);
@@ -564,7 +565,8 @@
KASSERT(nsid > 0);
ccb = nvme_ccb_get(sc->sc_admin_q);
- KASSERT(ccb != NULL);
+ if (ccb == NULL)
+ return EAGAIN;
mem = nvme_dmamem_alloc(sc, sizeof(*identify));
if (mem == NULL)
@@ -856,8 +858,9 @@
void *buf = NULL;
int error;
+ /* limit command size to maximum data transfer size */
if ((pt->buf == NULL && pt->len > 0) ||
- (pt->buf != NULL && pt->len == 0))
+ (pt->buf != NULL && (pt->len == 0 || pt->len > sc->sc_mdts)))
return EINVAL;
q = is_adminq ? sc->sc_admin_q : nvme_get_q(sc);
@@ -865,7 +868,8 @@
if (ccb == NULL)
return EBUSY;
- if (pt->buf != NULL && pt->len > 0) {
+ if (pt->buf != NULL) {
+ KASSERT(pt->len > 0);
buf = kmem_alloc(pt->len, KM_SLEEP);
if (buf == NULL) {
error = ENOMEM;
@@ -1022,35 +1026,42 @@
{
struct nvme_ccb *ccb;
struct nvme_cqe *ring = NVME_DMA_KVA(q->q_cq_dmamem), *cqe;
- uint32_t head;
uint16_t flags;
int rv = 0;
- if (!mutex_tryenter(&q->q_cq_mtx))
- return -1;
+ mutex_enter(&q->q_cq_mtx);
nvme_dmamem_sync(sc, q->q_cq_dmamem, BUS_DMASYNC_POSTREAD);
- head = q->q_cq_head;
for (;;) {
- cqe = &ring[head];
+ cqe = &ring[q->q_cq_head];
flags = lemtoh16(&cqe->flags);
if ((flags & NVME_CQE_PHASE) != q->q_cq_phase)
break;
ccb = &q->q_ccbs[cqe->cid];
- ccb->ccb_done(q, ccb, cqe);
- if (++head >= q->q_entries) {
- head = 0;
+ if (++q->q_cq_head >= q->q_entries) {
+ q->q_cq_head = 0;
q->q_cq_phase ^= NVME_CQE_PHASE;
}
rv = 1;
+
+ /*
+ * Unlock the mutex before calling the ccb_done callback
+ * and re-lock afterwards. The callback triggers lddone()
+ * which schedules another i/o, and also calls nvme_ccb_put().
+ * Unlock/relock avoids possibility of deadlock.
+ */
+ mutex_exit(&q->q_cq_mtx);
+ ccb->ccb_done(q, ccb, cqe);
+ mutex_enter(&q->q_cq_mtx);
}
nvme_dmamem_sync(sc, q->q_cq_dmamem, BUS_DMASYNC_PREREAD);
if (rv)
- nvme_write4(sc, q->q_cqhdbl, q->q_cq_head = head);
+ nvme_write4(sc, q->q_cqhdbl, q->q_cq_head);
+
mutex_exit(&q->q_cq_mtx);
return rv;
@@ -1121,7 +1132,7 @@
struct nvme_ccb *ccb;
int rv;
- if (sc->sc_use_mq && sc->sc_intr_establish(sc, q->q_id, q))
+ if (sc->sc_use_mq && sc->sc_intr_establish(sc, q->q_id, q) != 0)
return 1;