Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/sys/dev/ic Checking in changes to improve error handling. S...
details: https://anonhg.NetBSD.org/src/rev/cb2610ec1200
branches: trunk
changeset: 328386:cb2610ec1200
user: buhrow <buhrow%NetBSD.org@localhost>
date: Tue Apr 01 23:57:54 2014 +0000
description:
Checking in changes to improve error handling. Specifically:
- if commands time out, clear the queues to the card and perform a soft
reset on the LSI hardware since when these timeouts occur, the LSI firmware
is not graceful about recovering at all.
- Recover gracefully from more kinds of errors using the same recovery
mechanism listed above.
Also, implement mpt_ioctl() to handle bus reset requests from scsictl(8).
diffstat:
sys/dev/ic/mpt_netbsd.c | 257 +++++++++++++++++++++++++++++++++++++++++------
sys/dev/ic/mpt_netbsd.h | 4 +-
2 files changed, 226 insertions(+), 35 deletions(-)
diffs (truncated from 446 to 300 lines):
diff -r f71bac215fb3 -r cb2610ec1200 sys/dev/ic/mpt_netbsd.c
--- a/sys/dev/ic/mpt_netbsd.c Tue Apr 01 21:40:46 2014 +0000
+++ b/sys/dev/ic/mpt_netbsd.c Tue Apr 01 23:57:54 2014 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: mpt_netbsd.c,v 1.19 2012/09/23 01:13:21 chs Exp $ */
+/* $NetBSD: mpt_netbsd.c,v 1.20 2014/04/01 23:57:54 buhrow Exp $ */
/*
* Copyright (c) 2003 Wasabi Systems, Inc.
@@ -77,22 +77,28 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: mpt_netbsd.c,v 1.19 2012/09/23 01:13:21 chs Exp $");
+__KERNEL_RCSID(0, "$NetBSD: mpt_netbsd.c,v 1.20 2014/04/01 23:57:54 buhrow Exp $");
#include <dev/ic/mpt.h> /* pulls in all headers */
+#include <sys/scsiio.h>
static int mpt_poll(mpt_softc_t *, struct scsipi_xfer *, int);
static void mpt_timeout(void *);
+static void mpt_restart(mpt_softc_t *, request_t *);
static void mpt_done(mpt_softc_t *, uint32_t);
+static int mpt_drain_queue(mpt_softc_t *);
static void mpt_run_xfer(mpt_softc_t *, struct scsipi_xfer *);
static void mpt_set_xfer_mode(mpt_softc_t *, struct scsipi_xfer_mode *);
static void mpt_get_xfer_mode(mpt_softc_t *, struct scsipi_periph *);
static void mpt_ctlop(mpt_softc_t *, void *vmsg, uint32_t);
static void mpt_event_notify_reply(mpt_softc_t *, MSG_EVENT_NOTIFY_REPLY *);
+static void mpt_bus_reset(mpt_softc_t *);
static void mpt_scsipi_request(struct scsipi_channel *,
scsipi_adapter_req_t, void *);
static void mpt_minphys(struct buf *);
+static int mpt_ioctl(struct scsipi_channel *, u_long, void *, int,
+ struct proc *);
/*
* XXX - this assumes the device_private() of the attachement starts with
@@ -121,6 +127,7 @@
adapt->adapt_max_periph = maxq - 2;
adapt->adapt_request = mpt_scsipi_request;
adapt->adapt_minphys = mpt_minphys;
+ adapt->adapt_ioctl = mpt_ioctl;
/* Fill in the scsipi_channel. */
memset(chan, 0, sizeof(*chan));
@@ -138,7 +145,8 @@
chan->chan_ntargets = mpt->mpt_max_devices;
chan->chan_id = mpt->mpt_ini_id;
- (void) config_found(mpt->sc_dev, &mpt->sc_channel, scsiprint);
+/*Save the output of the config so we can rescan the bus in case of errors*/
+ mpt->sc_scsibus_dv = config_found(mpt->sc_dev, &mpt->sc_channel, scsiprint);
}
int
@@ -303,26 +311,11 @@
{
mpt_softc_t *mpt = arg;
int nrepl = 0;
- uint32_t reply;
if ((mpt_read(mpt, MPT_OFFSET_INTR_STATUS) & MPT_INTR_REPLY_READY) == 0)
return (0);
- reply = mpt_pop_reply_queue(mpt);
- while (reply != MPT_REPLY_EMPTY) {
- nrepl++;
- if (mpt->verbose > 1) {
- if ((reply & MPT_CONTEXT_REPLY) != 0) {
- /* Address reply; IOC has something to say */
- mpt_print_reply(MPT_REPLY_PTOV(mpt, reply));
- } else {
- /* Context reply; all went well */
- mpt_prt(mpt, "context %u reply OK", reply);
- }
- }
- mpt_done(mpt, reply);
- reply = mpt_pop_reply_queue(mpt);
- }
+nrepl = mpt_drain_queue(mpt);
return (nrepl != 0);
}
@@ -357,13 +350,20 @@
mpt_timeout(void *arg)
{
request_t *req = arg;
- struct scsipi_xfer *xs = req->xfer;
- struct scsipi_periph *periph = xs->xs_periph;
- mpt_softc_t *mpt = DEV_TO_MPT(
- periph->periph_channel->chan_adapter->adapt_dev);
- uint32_t oseq;
- int s;
-
+ struct scsipi_xfer *xs;
+ struct scsipi_periph *periph;
+ mpt_softc_t *mpt;
+ uint32_t oseq;
+ int s, nrepl = 0;
+
+if (req->xfer == NULL) {
+ printf("mpt_timeout: NULL xfer for request index 0x%x, sequenc 0x%x\n",
+ req->index, req->sequence);
+ return;
+ }
+ xs = req->xfer;
+ periph = xs->xs_periph;
+ mpt = (void *) periph->periph_channel->chan_adapter->adapt_dev;
scsipi_printaddr(periph);
printf("command timeout\n");
@@ -373,11 +373,28 @@
mpt->timeouts++;
if (mpt_intr(mpt)) {
if (req->sequence != oseq) {
+ mpt->success ++;
mpt_prt(mpt, "recovered from command timeout");
splx(s);
return;
}
}
+
+ /*
+ *Ensure the IOC is really done giving us data since it appears it can
+ *sometimes fail to give us interrupts under heavy load.
+ */
+ nrepl = mpt_drain_queue(mpt);
+ if (nrepl ) {
+ mpt_prt(mpt, "mpt_timeout: recovered %d commands",nrepl);
+ }
+
+ if (req->sequence != oseq) {
+ mpt->success ++;
+ splx(s);
+ return;
+ }
+
mpt_prt(mpt,
"timeout on request index = 0x%x, seq = 0x%08x",
req->index, req->sequence);
@@ -390,14 +407,83 @@
if (mpt->verbose > 1)
mpt_print_scsi_io_request((MSG_SCSI_IO_REQUEST *)req->req_vbuf);
- /* XXX WHAT IF THE IOC IS STILL USING IT?? */
- req->xfer = NULL;
- mpt_free_request(mpt, req);
+ xs->error = XS_TIMEOUT;
+ splx(s);
+ mpt_restart(mpt, req);
+}
+
+static void
+mpt_restart(mpt_softc_t *mpt, request_t *req0)
+{
+ int i, s, nreq;
+ request_t *req;
+ struct scsipi_xfer *xs;
+
+ /* first, reset the IOC, leaving stopped so all requests are idle */
+ if (mpt_soft_reset(mpt) != MPT_OK) {
+ mpt_prt(mpt, "soft reset failed");
+ /* don't try a hard reset since this mangles the PCI configuration registers */
+ return;
+ }
+
+ /* freeze the channel so scsipi doesn't queue more commands */
+ scsipi_channel_freeze(&mpt->sc_channel, 1);
- xs->error = XS_TIMEOUT;
- scsipi_done(xs);
+ /* return all pending requests to scsipi and de-allocate them */
+ s = splbio();
+ nreq = 0;
+ for (i = 0; i < MPT_MAX_REQUESTS(mpt); i++) {
+ req = &mpt->request_pool[i];
+ xs = req->xfer;
+ if (xs != NULL) {
+ if (xs->datalen != 0)
+ bus_dmamap_unload(mpt->sc_dmat, req->dmap);
+ req->xfer = NULL;
+ callout_stop(&xs->xs_callout);
+ if (req != req0) {
+ nreq++;
+ xs->error = XS_REQUEUE;
+ }
+ scsipi_done(xs);
+ /* don't really need to mpt_free_request() since mpt_init() below will free all requests anyway */
+ mpt_free_request(mpt, req);
+ }
+ }
+ splx(s);
+ if (nreq > 0)
+ mpt_prt(mpt, "re-queued %d requests", nreq);
- splx(s);
+ /* re-initialize the IOC (which restarts it) */
+ if (mpt_init(mpt, MPT_DB_INIT_HOST) == 0)
+ mpt_prt(mpt, "restart succeeded");
+ /* else error message already printed */
+
+ /* thaw the channel, causing scsipi to re-queue the commands */
+ scsipi_channel_thaw(&mpt->sc_channel, 1);
+}
+
+static
+int mpt_drain_queue(mpt_softc_t *mpt)
+{
+ int nrepl = 0;
+ uint32_t reply;
+
+ reply = mpt_pop_reply_queue(mpt);
+ while (reply != MPT_REPLY_EMPTY) {
+ nrepl++;
+ if (mpt->verbose > 1) {
+ if ((reply & MPT_CONTEXT_REPLY) != 0) {
+ /* Address reply; IOC has something to say */
+ mpt_print_reply(MPT_REPLY_PTOV(mpt, reply));
+ } else {
+ /* Context reply; all went well */
+ mpt_prt(mpt, "context %u reply OK", reply);
+ }
+ }
+ mpt_done(mpt, reply);
+ reply = mpt_pop_reply_queue(mpt);
+ }
+ return (nrepl);
}
static void
@@ -409,6 +495,7 @@
request_t *req;
MSG_REQUEST_HEADER *mpt_req;
MSG_SCSI_IO_REPLY *mpt_reply;
+ int restart = 0; /*nonzero if we need to restart the IOC*/
if (__predict_true((reply & MPT_CONTEXT_REPLY) == 0)) {
/* context reply (ok) */
@@ -468,6 +555,8 @@
if (__predict_false(mpt_req->Function == MPI_FUNCTION_SCSI_TASK_MGMT)) {
if (mpt->verbose > 1)
mpt_prt(mpt, "mpt_done: TASK MGMT");
+ KASSERT(req == mpt->mngt_req);
+ mpt->mngt_req = NULL;
goto done;
}
@@ -544,9 +633,10 @@
}
xs->status = mpt_reply->SCSIStatus;
- switch (le16toh(mpt_reply->IOCStatus)) {
+ switch ((le16toh(mpt_reply->IOCStatus) & MPI_IOCSTATUS_MASK)) {
case MPI_IOCSTATUS_SCSI_DATA_OVERRUN:
xs->error = XS_DRIVER_STUFFUP;
+ mpt_prt(mpt,"mpt_done: IOC overrun!");
break;
case MPI_IOCSTATUS_SCSI_DATA_UNDERRUN:
@@ -605,30 +695,56 @@
case MPI_IOCSTATUS_SCSI_RESIDUAL_MISMATCH:
xs->error = XS_DRIVER_STUFFUP;
+ mpt_prt(mpt,"mpt_done: IOC SCSI residual mismatch!");
+ restart = 1;
break;
case MPI_IOCSTATUS_SCSI_TASK_TERMINATED:
/* XXX What should we do here? */
+ mpt_prt(mpt,"mpt_done: IOC SCSI task terminated!");
+ restart = 1;
break;
case MPI_IOCSTATUS_SCSI_TASK_MGMT_FAILED:
/* XXX */
xs->error = XS_DRIVER_STUFFUP;
+ mpt_prt(mpt,"mpt_done: IOC SCSI task failed!");
+ restart = 1;
break;
case MPI_IOCSTATUS_SCSI_IOC_TERMINATED:
/* XXX */
xs->error = XS_DRIVER_STUFFUP;
+ mpt_prt(mpt,"mpt_done: IOC task terminated!");
+ restart = 1;
break;
case MPI_IOCSTATUS_SCSI_EXT_TERMINATED:
/* XXX This is a bus-reset */
xs->error = XS_DRIVER_STUFFUP;
+ mpt_prt(mpt,"mpt_done: IOC SCSI bus reset!");
+ restart = 1;
+ break;
+
+ case MPI_IOCSTATUS_SCSI_PROTOCOL_ERROR:
+ /*
+ *FreeBSD and Linux indicate this is a phase error between
+ *the IOC and the drive itself.
Home |
Main Index |
Thread Index |
Old Index