Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/dev/ic Checking in changes to improve error handling. S...



details:   https://anonhg.NetBSD.org/src/rev/cb2610ec1200
branches:  trunk
changeset: 328386:cb2610ec1200
user:      buhrow <buhrow%NetBSD.org@localhost>
date:      Tue Apr 01 23:57:54 2014 +0000

description:
Checking in changes to improve error handling.  Specifically:

- if commands time out, clear the queues to the card and perform a soft
reset on the LSI hardware since when these timeouts occur, the LSI firmware
is not graceful about recovering at all.

- Recover gracefully from more kinds of errors using the same recovery
mechanism listed above.

Also, implement mpt_ioctl() to handle bus reset requests from scsictl(8).

diffstat:

 sys/dev/ic/mpt_netbsd.c |  257 +++++++++++++++++++++++++++++++++++++++++------
 sys/dev/ic/mpt_netbsd.h |    4 +-
 2 files changed, 226 insertions(+), 35 deletions(-)

diffs (truncated from 446 to 300 lines):

diff -r f71bac215fb3 -r cb2610ec1200 sys/dev/ic/mpt_netbsd.c
--- a/sys/dev/ic/mpt_netbsd.c   Tue Apr 01 21:40:46 2014 +0000
+++ b/sys/dev/ic/mpt_netbsd.c   Tue Apr 01 23:57:54 2014 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: mpt_netbsd.c,v 1.19 2012/09/23 01:13:21 chs Exp $      */
+/*     $NetBSD: mpt_netbsd.c,v 1.20 2014/04/01 23:57:54 buhrow Exp $   */
 
 /*
  * Copyright (c) 2003 Wasabi Systems, Inc.
@@ -77,22 +77,28 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: mpt_netbsd.c,v 1.19 2012/09/23 01:13:21 chs Exp $");
+__KERNEL_RCSID(0, "$NetBSD: mpt_netbsd.c,v 1.20 2014/04/01 23:57:54 buhrow Exp $");
 
 #include <dev/ic/mpt.h>                        /* pulls in all headers */
+#include <sys/scsiio.h>
 
 static int     mpt_poll(mpt_softc_t *, struct scsipi_xfer *, int);
 static void    mpt_timeout(void *);
+static void    mpt_restart(mpt_softc_t *, request_t *);
 static void    mpt_done(mpt_softc_t *, uint32_t);
+static int     mpt_drain_queue(mpt_softc_t *);
 static void    mpt_run_xfer(mpt_softc_t *, struct scsipi_xfer *);
 static void    mpt_set_xfer_mode(mpt_softc_t *, struct scsipi_xfer_mode *);
 static void    mpt_get_xfer_mode(mpt_softc_t *, struct scsipi_periph *);
 static void    mpt_ctlop(mpt_softc_t *, void *vmsg, uint32_t);
 static void    mpt_event_notify_reply(mpt_softc_t *, MSG_EVENT_NOTIFY_REPLY *);
+static void  mpt_bus_reset(mpt_softc_t *);
 
 static void    mpt_scsipi_request(struct scsipi_channel *,
                    scsipi_adapter_req_t, void *);
 static void    mpt_minphys(struct buf *);
+static int     mpt_ioctl(struct scsipi_channel *, u_long, void *, int,
+       struct proc *);
 
 /*
  * XXX - this assumes the device_private() of the attachement starts with
@@ -121,6 +127,7 @@
        adapt->adapt_max_periph = maxq - 2;
        adapt->adapt_request = mpt_scsipi_request;
        adapt->adapt_minphys = mpt_minphys;
+       adapt->adapt_ioctl = mpt_ioctl;
 
        /* Fill in the scsipi_channel. */
        memset(chan, 0, sizeof(*chan));
@@ -138,7 +145,8 @@
        chan->chan_ntargets = mpt->mpt_max_devices;
        chan->chan_id = mpt->mpt_ini_id;
 
-       (void) config_found(mpt->sc_dev, &mpt->sc_channel, scsiprint);
+/*Save the output of the config so we can rescan the bus in case of errors*/
+       mpt->sc_scsibus_dv = config_found(mpt->sc_dev, &mpt->sc_channel, scsiprint);
 }
 
 int
@@ -303,26 +311,11 @@
 {
        mpt_softc_t *mpt = arg;
        int nrepl = 0;
-       uint32_t reply;
 
        if ((mpt_read(mpt, MPT_OFFSET_INTR_STATUS) & MPT_INTR_REPLY_READY) == 0)
                return (0);
 
-       reply = mpt_pop_reply_queue(mpt);
-       while (reply != MPT_REPLY_EMPTY) {
-               nrepl++;
-               if (mpt->verbose > 1) {
-                       if ((reply & MPT_CONTEXT_REPLY) != 0) {
-                               /* Address reply; IOC has something to say */
-                               mpt_print_reply(MPT_REPLY_PTOV(mpt, reply));
-                       } else {
-                               /* Context reply; all went well */
-                               mpt_prt(mpt, "context %u reply OK", reply);
-                       }
-               }
-               mpt_done(mpt, reply);
-               reply = mpt_pop_reply_queue(mpt);
-       }
+nrepl = mpt_drain_queue(mpt);
        return (nrepl != 0);
 }
 
@@ -357,13 +350,20 @@
 mpt_timeout(void *arg)
 {
        request_t *req = arg;
-       struct scsipi_xfer *xs = req->xfer;
-       struct scsipi_periph *periph = xs->xs_periph;
-       mpt_softc_t *mpt = DEV_TO_MPT(
-           periph->periph_channel->chan_adapter->adapt_dev);
-       uint32_t oseq;
-       int s;
-
+       struct scsipi_xfer *xs;
+       struct scsipi_periph *periph;
+       mpt_softc_t *mpt;
+       uint32_t oseq;
+       int s, nrepl = 0;
+ 
+if (req->xfer  == NULL) {
+               printf("mpt_timeout: NULL xfer for request index 0x%x, sequenc 0x%x\n",
+               req->index, req->sequence);
+               return;
+       }
+       xs = req->xfer;
+               periph = xs->xs_periph;
+       mpt = (void *) periph->periph_channel->chan_adapter->adapt_dev;
        scsipi_printaddr(periph);
        printf("command timeout\n");
 
@@ -373,11 +373,28 @@
        mpt->timeouts++;
        if (mpt_intr(mpt)) {
                if (req->sequence != oseq) {
+                       mpt->success ++;
                        mpt_prt(mpt, "recovered from command timeout");
                        splx(s);
                        return;
                }
        }
+
+       /*
+        *Ensure the IOC is really done giving us data since it appears it can
+        *sometimes fail to give us interrupts under heavy load.
+        */
+       nrepl = mpt_drain_queue(mpt);
+       if (nrepl ) {
+               mpt_prt(mpt, "mpt_timeout: recovered %d commands",nrepl);
+       }
+
+       if (req->sequence != oseq) {
+               mpt->success ++;
+               splx(s);
+               return;
+       }
+
        mpt_prt(mpt,
            "timeout on request index = 0x%x, seq = 0x%08x",
            req->index, req->sequence);
@@ -390,14 +407,83 @@
        if (mpt->verbose > 1)
                mpt_print_scsi_io_request((MSG_SCSI_IO_REQUEST *)req->req_vbuf);
 
-       /* XXX WHAT IF THE IOC IS STILL USING IT?? */
-       req->xfer = NULL;
-       mpt_free_request(mpt, req);
+       xs->error = XS_TIMEOUT;
+       splx(s);
+       mpt_restart(mpt, req);
+}
+
+static void
+mpt_restart(mpt_softc_t *mpt, request_t *req0)
+{
+       int i, s, nreq;
+       request_t *req;
+       struct scsipi_xfer *xs;
+
+       /* first, reset the IOC, leaving stopped so all requests are idle */
+       if (mpt_soft_reset(mpt) != MPT_OK) {
+               mpt_prt(mpt, "soft reset failed");
+               /* don't try a hard reset since this mangles the PCI configuration registers */
+               return;
+       }
+
+       /* freeze the channel so scsipi doesn't queue more commands */
+       scsipi_channel_freeze(&mpt->sc_channel, 1);
 
-       xs->error = XS_TIMEOUT;
-       scsipi_done(xs);
+       /* return all pending requests to scsipi and de-allocate them */
+       s = splbio();
+       nreq = 0;
+       for (i = 0; i < MPT_MAX_REQUESTS(mpt); i++) {
+               req = &mpt->request_pool[i];
+               xs = req->xfer;
+               if (xs != NULL) {
+                       if (xs->datalen != 0)
+                               bus_dmamap_unload(mpt->sc_dmat, req->dmap);
+                       req->xfer = NULL;
+                       callout_stop(&xs->xs_callout);
+                       if (req != req0) {
+                               nreq++;
+                               xs->error = XS_REQUEUE;
+                       }
+                       scsipi_done(xs);
+                       /* don't really need to mpt_free_request() since mpt_init() below will free all requests anyway */
+                       mpt_free_request(mpt, req);
+               }
+       }
+       splx(s);
+       if (nreq > 0)
+               mpt_prt(mpt, "re-queued %d requests", nreq);
 
-       splx(s);
+       /* re-initialize the IOC (which restarts it) */
+       if (mpt_init(mpt, MPT_DB_INIT_HOST) == 0)
+               mpt_prt(mpt, "restart succeeded");
+       /* else error message already printed */
+
+       /* thaw the channel, causing scsipi to re-queue the commands */
+       scsipi_channel_thaw(&mpt->sc_channel, 1);
+}
+
+static
+int mpt_drain_queue(mpt_softc_t *mpt)
+{
+       int nrepl = 0;
+       uint32_t reply;
+
+       reply = mpt_pop_reply_queue(mpt);
+       while (reply != MPT_REPLY_EMPTY) {
+               nrepl++;
+               if (mpt->verbose > 1) {
+                       if ((reply & MPT_CONTEXT_REPLY) != 0) {
+                               /* Address reply; IOC has something to say */
+                               mpt_print_reply(MPT_REPLY_PTOV(mpt, reply));
+                       } else {
+                               /* Context reply; all went well */
+                               mpt_prt(mpt, "context %u reply OK", reply);
+                       }
+               }
+               mpt_done(mpt, reply);
+               reply = mpt_pop_reply_queue(mpt);
+       }
+       return (nrepl);
 }
 
 static void
@@ -409,6 +495,7 @@
        request_t *req;
        MSG_REQUEST_HEADER *mpt_req;
        MSG_SCSI_IO_REPLY *mpt_reply;
+       int restart = 0; /*nonzero if we need to restart the IOC*/
 
        if (__predict_true((reply & MPT_CONTEXT_REPLY) == 0)) {
                /* context reply (ok) */
@@ -468,6 +555,8 @@
        if (__predict_false(mpt_req->Function == MPI_FUNCTION_SCSI_TASK_MGMT)) {
                if (mpt->verbose > 1)
                        mpt_prt(mpt, "mpt_done: TASK MGMT");
+                       KASSERT(req == mpt->mngt_req);
+                       mpt->mngt_req = NULL;
                goto done;
        }
 
@@ -544,9 +633,10 @@
        }
 
        xs->status = mpt_reply->SCSIStatus;
-       switch (le16toh(mpt_reply->IOCStatus)) {
+       switch ((le16toh(mpt_reply->IOCStatus) & MPI_IOCSTATUS_MASK)) {
        case MPI_IOCSTATUS_SCSI_DATA_OVERRUN:
                xs->error = XS_DRIVER_STUFFUP;
+               mpt_prt(mpt,"mpt_done: IOC overrun!");
                break;
 
        case MPI_IOCSTATUS_SCSI_DATA_UNDERRUN:
@@ -605,30 +695,56 @@
 
        case MPI_IOCSTATUS_SCSI_RESIDUAL_MISMATCH:
                xs->error = XS_DRIVER_STUFFUP;
+               mpt_prt(mpt,"mpt_done: IOC SCSI residual mismatch!");
+               restart = 1;
                break;
 
        case MPI_IOCSTATUS_SCSI_TASK_TERMINATED:
                /* XXX What should we do here? */
+               mpt_prt(mpt,"mpt_done: IOC SCSI task terminated!");
+               restart = 1;
                break;
 
        case MPI_IOCSTATUS_SCSI_TASK_MGMT_FAILED:
                /* XXX */
                xs->error = XS_DRIVER_STUFFUP;
+               mpt_prt(mpt,"mpt_done: IOC SCSI task failed!");
+               restart = 1;
                break;
 
        case MPI_IOCSTATUS_SCSI_IOC_TERMINATED:
                /* XXX */
                xs->error = XS_DRIVER_STUFFUP;
+               mpt_prt(mpt,"mpt_done: IOC task terminated!");
+               restart = 1;
                break;
 
        case MPI_IOCSTATUS_SCSI_EXT_TERMINATED:
                /* XXX This is a bus-reset */
                xs->error = XS_DRIVER_STUFFUP;
+               mpt_prt(mpt,"mpt_done: IOC SCSI bus reset!");
+               restart = 1;
+               break;
+
+               case MPI_IOCSTATUS_SCSI_PROTOCOL_ERROR:
+               /*
+                *FreeBSD and Linux indicate this is a phase error between
+                *the IOC and the drive itself. 



Home | Main Index | Thread Index | Old Index