[src/trunk]: src/sys/dev fix several bugs, make nvme(4) MPSAFE by default and...
details: https://anonhg.NetBSD.org/src/rev/848d31e91441
branches: trunk
changeset: 347851:848d31e91441
user: jdolecek <jdolecek%NetBSD.org@localhost>
date: Sun Sep 18 21:19:39 2016 +0000
description:
fix several bugs, make nvme(4) MPSAFE by default, and bump the default
I/O queue size (nvme_ioq_size) from 128 to 1024 entries; tested with
VirtualBox and QEMU
* remove the NVME_INTMC/NVME_INTMS writes in the hw intr handler, as this
is not MPSAFE; fortunately they don't seem to be necessary, and dropping
them shaves off two register writes
* need to use full mutex_enter() in nvme_q_complete(), to avoid a small
race between one handler exiting the loop and another entering
* for MSI, hand over the command result processing to a softintr (see
the sketch after this list); unfortunately this can't easily be done for
INTx interrupts, as they require the doorbell write to deassert
* unlock/relock q->q_cq_mtx before calling ccb_done to avoid potential deadlocks
* make sure to destroy queue mutexes when destroying the queue (LOCKDEBUG)
* make ns ctx pool per-device, so that it's deallocated properly on module
unload
* handle ctx allocation failure in ld_nvme_dobio()
* remove the splbio() calls in ld_nvme_dobio() and the sync path; these
paths are exercised only for dump/shutdown, which already runs with
interrupts disabled
* free the ns ctx in ld_nvme_biodone() before calling lddone() to avoid
memory starvation, as lddone() can trigger another i/o request
* be more careful with PR_WAITOK; these paths can be called from
interrupt context, where we can't wait
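
The softintr handoff for MSI described above is mostly cut off by the
truncation of the diff below (the sys/dev/pci/nvme_pci.c hunks are
incomplete). As a rough illustration only, a minimal sketch of the
pattern using the standard NetBSD softint(9) API could look like the
following; the handler names and the q_softih/q_sc members are
hypothetical, not necessarily what the driver actually uses:

#include <sys/intr.h>	/* softint_establish, softint_schedule */

/* hard interrupt handler for MSI: defer CQ processing to softint */
static int
nvme_intr_msi(void *arg)
{
	struct nvme_queue *q = arg;

	/*
	 * MSI is edge-triggered, so nothing has to be written to the
	 * device to deassert the interrupt; draining the completion
	 * queue can be deferred to softint context.  INTx is
	 * level-triggered and stays asserted until the CQ head
	 * doorbell is written, hence it can't use this handoff.
	 */
	softint_schedule(q->q_softih);
	return 1;
}

/* soft interrupt handler: process completed commands */
static void
nvme_softintr_msi(void *arg)
{
	struct nvme_queue *q = arg;

	nvme_q_complete(q->q_sc, q);
}

/* established once per queue during attach, e.g.: */
q->q_softih = softint_establish(SOFTINT_BIO | SOFTINT_MPSAFE,
    nvme_softintr_msi, q);

SOFTINT_MPSAFE registers the soft interrupt handler as MP-safe, i.e. it
runs without the kernel lock, matching the MPSAFE goal of this change.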
diffstat:
sys/dev/ic/ld_nvme.c | 51 +++++++++++--------
sys/dev/ic/nvme.c | 123 +++++++++++++++++++++++++-----------------------
sys/dev/ic/nvmevar.h | 30 ++++++-----
sys/dev/pci/nvme_pci.c | 62 +++++++++++++++++++----
4 files changed, 160 insertions(+), 106 deletions(-)
diffs (truncated from 578 to 300 lines):
diff -r 64aac7d2c070 -r 848d31e91441 sys/dev/ic/ld_nvme.c
--- a/sys/dev/ic/ld_nvme.c Sun Sep 18 18:24:00 2016 +0000
+++ b/sys/dev/ic/ld_nvme.c Sun Sep 18 21:19:39 2016 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: ld_nvme.c,v 1.3 2016/09/16 15:24:47 jdolecek Exp $ */
+/* $NetBSD: ld_nvme.c,v 1.4 2016/09/18 21:19:39 jdolecek Exp $ */
/*-
* Copyright (C) 2016 NONAKA Kimihiro <nonaka%netbsd.org@localhost>
@@ -26,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ld_nvme.c,v 1.3 2016/09/16 15:24:47 jdolecek Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ld_nvme.c,v 1.4 2016/09/18 21:19:39 jdolecek Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -108,8 +108,8 @@
ld->sc_secsize = 1 << f->lbads;
ld->sc_secperunit = nsze;
- ld->sc_maxxfer = MAXPHYS;
- ld->sc_maxqueuecnt = naa->naa_qentries;
+ ld->sc_maxxfer = MAXPHYS; /* XXX set according to device */
+ ld->sc_maxqueuecnt = naa->naa_qentries; /* XXX set to max allowed by device, not current config */
ld->sc_start = ld_nvme_start;
ld->sc_dump = ld_nvme_dump;
ld->sc_flush = ld_nvme_flush;
@@ -156,9 +156,12 @@
{
struct nvme_ns_context *ctx;
int error;
- int s;
+ int waitok = (bp != NULL && !cpu_softintr_p() && !cpu_intr_p());
- ctx = nvme_ns_get_ctx(bp != NULL ? PR_WAITOK : PR_NOWAIT);
+ ctx = nvme_ns_get_ctx(sc, waitok ? PR_WAITOK : PR_NOWAIT);
+ if (ctx == NULL)
+ return EAGAIN;
+
ctx->nnc_cookie = sc;
ctx->nnc_nsid = sc->sc_nsid;
ctx->nnc_done = ld_nvme_biodone;
@@ -168,14 +171,12 @@
ctx->nnc_secsize = sc->sc_ld.sc_secsize;
ctx->nnc_blkno = blkno;
ctx->nnc_flags = dowrite ? 0 : NVME_NS_CTX_F_READ;
- if (bp == NULL) {
+ if (bp == NULL)
SET(ctx->nnc_flags, NVME_NS_CTX_F_POLL);
- s = splbio();
- }
+
error = nvme_ns_dobio(sc->sc_nvme, ctx);
- if (bp == NULL) {
- splx(s);
- }
+ if (error)
+ nvme_ns_put_ctx(sc, ctx);
return error;
}
@@ -187,6 +188,10 @@
struct buf *bp = ctx->nnc_buf;
int status = NVME_CQE_SC(ctx->nnc_status);
+ /* free before processing to avoid starvation, lddone() could trigger
+ * another i/o request */
+ nvme_ns_put_ctx(sc, ctx);
+
if (bp != NULL) {
if (status != NVME_CQE_SC_SUCCESS) {
bp->b_error = EIO;
@@ -201,7 +206,6 @@
aprint_error_dev(sc->sc_ld.sc_dv, "I/O error\n");
}
}
- nvme_ns_put_ctx(ctx);
}
static int
@@ -210,21 +214,23 @@
struct ld_nvme_softc *sc = device_private(ld->sc_dv);
struct nvme_ns_context *ctx;
int error;
- int s;
+ int waitok = (!ISSET(flags, LDFL_POLL)
+ && !cpu_softintr_p() && !cpu_intr_p());
- ctx = nvme_ns_get_ctx((flags & LDFL_POLL) ? PR_NOWAIT : PR_WAITOK);
+ ctx = nvme_ns_get_ctx(sc, waitok ? PR_WAITOK : PR_NOWAIT);
+ if (ctx == NULL)
+ return EAGAIN;
+
ctx->nnc_cookie = sc;
ctx->nnc_nsid = sc->sc_nsid;
ctx->nnc_done = ld_nvme_syncdone;
ctx->nnc_flags = 0;
- if (flags & LDFL_POLL) {
+ if (flags & LDFL_POLL)
SET(ctx->nnc_flags, NVME_NS_CTX_F_POLL);
- s = splbio();
- }
+
error = nvme_ns_sync(sc->sc_nvme, ctx);
- if (flags & LDFL_POLL) {
- splx(s);
- }
+ if (error)
+ nvme_ns_put_ctx(sc, ctx);
return error;
}
@@ -232,6 +238,7 @@
static void
ld_nvme_syncdone(struct nvme_ns_context *ctx)
{
+ struct ld_nvme_softc *sc = ctx->nnc_cookie;
- nvme_ns_put_ctx(ctx);
+ nvme_ns_put_ctx(sc, ctx);
}
diff -r 64aac7d2c070 -r 848d31e91441 sys/dev/ic/nvme.c
--- a/sys/dev/ic/nvme.c Sun Sep 18 18:24:00 2016 +0000
+++ b/sys/dev/ic/nvme.c Sun Sep 18 21:19:39 2016 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: nvme.c,v 1.8 2016/09/17 19:52:16 jdolecek Exp $ */
+/* $NetBSD: nvme.c,v 1.9 2016/09/18 21:19:39 jdolecek Exp $ */
/* $OpenBSD: nvme.c,v 1.49 2016/04/18 05:59:50 dlg Exp $ */
/*
@@ -18,7 +18,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: nvme.c,v 1.8 2016/09/17 19:52:16 jdolecek Exp $");
+__KERNEL_RCSID(0, "$NetBSD: nvme.c,v 1.9 2016/09/18 21:19:39 jdolecek Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -41,7 +41,7 @@
#include <dev/ic/nvmeio.h>
int nvme_adminq_size = 128;
-int nvme_ioq_size = 128;
+int nvme_ioq_size = 1024;
static int nvme_print(void *, const char *);
@@ -156,18 +156,6 @@
#define nvme_barrier(_s, _r, _l, _f) \
bus_space_barrier((_s)->sc_iot, (_s)->sc_ioh, (_r), (_l), (_f))
-pool_cache_t nvme_ns_ctx_cache;
-ONCE_DECL(nvme_init_once);
-
-static int
-nvme_init(void)
-{
- nvme_ns_ctx_cache = pool_cache_init(sizeof(struct nvme_ns_context),
- 0, 0, 0, "nvme_ns_ctx", NULL, IPL_BIO, NULL, NULL, NULL);
- KASSERT(nvme_ns_ctx_cache != NULL);
- return 0;
-}
-
static void
nvme_version(struct nvme_softc *sc, uint32_t ver)
{
@@ -350,8 +338,6 @@
int ioq_entries = nvme_ioq_size;
int i;
- RUN_ONCE(&nvme_init_once, nvme_init);
-
reg = nvme_read4(sc, NVME_VS);
if (reg == 0xffffffff) {
aprint_error_dev(sc->sc_dev, "invalid mapping\n");
@@ -431,8 +417,20 @@
if (!sc->sc_use_mq)
nvme_write4(sc, NVME_INTMC, 1);
+ snprintf(sc->sc_ctxpoolname, sizeof(sc->sc_ctxpoolname),
+ "%s_ns_ctx", device_xname(sc->sc_dev));
+ sc->sc_ctxpool = pool_cache_init(sizeof(struct nvme_ns_context),
+ 0, 0, 0, sc->sc_ctxpoolname, NULL, IPL_BIO, NULL, NULL, NULL);
+ if (sc->sc_ctxpool == NULL) {
+ aprint_error_dev(sc->sc_dev, "unable to create ctx pool\n");
+ goto free_q;
+ }
+
+ /* probe subdevices */
sc->sc_namespaces = kmem_zalloc(sizeof(*sc->sc_namespaces) * sc->sc_nn,
KM_SLEEP);
+ if (sc->sc_namespaces == NULL)
+ goto free_q;
for (i = 0; i < sc->sc_nn; i++) {
memset(&naa, 0, sizeof(naa));
naa.naa_nsid = i + 1;
@@ -485,6 +483,9 @@
if (error)
return error;
+ /* from now on we are committed to detach, following will never fail */
+ pool_cache_destroy(sc->sc_ctxpool);
+
for (i = 0; i < sc->sc_nq; i++)
nvme_q_free(sc, sc->sc_q[i]);
kmem_free(sc->sc_q, sizeof(*sc->sc_q) * sc->sc_nq);
@@ -564,7 +565,8 @@
KASSERT(nsid > 0);
ccb = nvme_ccb_get(sc->sc_admin_q);
- KASSERT(ccb != NULL);
+ if (ccb == NULL)
+ return EAGAIN;
mem = nvme_dmamem_alloc(sc, sizeof(*identify));
if (mem == NULL)
@@ -856,8 +858,9 @@
void *buf = NULL;
int error;
+ /* limit command size to maximum data transfer size */
if ((pt->buf == NULL && pt->len > 0) ||
- (pt->buf != NULL && pt->len == 0))
+ (pt->buf != NULL && (pt->len == 0 || pt->len > sc->sc_mdts)))
return EINVAL;
q = is_adminq ? sc->sc_admin_q : nvme_get_q(sc);
@@ -865,7 +868,8 @@
if (ccb == NULL)
return EBUSY;
- if (pt->buf != NULL && pt->len > 0) {
+ if (pt->buf != NULL) {
+ KASSERT(pt->len > 0);
buf = kmem_alloc(pt->len, KM_SLEEP);
if (buf == NULL) {
error = ENOMEM;
@@ -1022,35 +1026,42 @@
{
struct nvme_ccb *ccb;
struct nvme_cqe *ring = NVME_DMA_KVA(q->q_cq_dmamem), *cqe;
- uint32_t head;
uint16_t flags;
int rv = 0;
- if (!mutex_tryenter(&q->q_cq_mtx))
- return -1;
+ mutex_enter(&q->q_cq_mtx);
nvme_dmamem_sync(sc, q->q_cq_dmamem, BUS_DMASYNC_POSTREAD);
- head = q->q_cq_head;
for (;;) {
- cqe = &ring[head];
+ cqe = &ring[q->q_cq_head];
flags = lemtoh16(&cqe->flags);
if ((flags & NVME_CQE_PHASE) != q->q_cq_phase)
break;
ccb = &q->q_ccbs[cqe->cid];
- ccb->ccb_done(q, ccb, cqe);
- if (++head >= q->q_entries) {
- head = 0;
+ if (++q->q_cq_head >= q->q_entries) {
+ q->q_cq_head = 0;
q->q_cq_phase ^= NVME_CQE_PHASE;
}
rv = 1;
+
+ /*
+ * Unlock the mutex before calling the ccb_done callback
+ * and re-lock afterwards. The callback triggers lddone()
+ * which schedules another i/o, and also calls nvme_ccb_put().
+ * Unlock/relock avoids possibility of deadlock.
+ */
+ mutex_exit(&q->q_cq_mtx);
+ ccb->ccb_done(q, ccb, cqe);
+ mutex_enter(&q->q_cq_mtx);
}
nvme_dmamem_sync(sc, q->q_cq_dmamem, BUS_DMASYNC_PREREAD);
if (rv)
- nvme_write4(sc, q->q_cqhdbl, q->q_cq_head = head);
+ nvme_write4(sc, q->q_cqhdbl, q->q_cq_head);
+
mutex_exit(&q->q_cq_mtx);
return rv;
@@ -1121,7 +1132,7 @@
struct nvme_ccb *ccb;
int rv;
- if (sc->sc_use_mq && sc->sc_intr_establish(sc, q->q_id, q))
+ if (sc->sc_use_mq && sc->sc_intr_establish(sc, q->q_id, q) != 0)
return 1;