Re: mpt(4) timeout recovery improvements

To: tech-kern%netbsd.org@localhost
Subject: Re: mpt(4) timeout recovery improvements
From: Edgar Fuß <ef%math.uni-bonn.de@localhost>
Date: Mon, 25 Nov 2013 15:31:21 +0100

OK, I have something working now (based on Brian Buhrow's patch).

Could somebody please review it, especially whether I got the locking 
etc. right?

My basic test is to run
        dd if=/dev/rsd2d of=/dev/null bs=1m
(where sd2 is an unused disc), then pull out and re-insert the disc.
With a stock kernel, that hangs after the timeout.
With the patch, it continues.

To test the impact on other concurrent SCSI commands, I background that dd 
and run
        dd if=/dev/rraid0e bs=1m | md5
(where rraid0e is a 20G unmounted partition) in parallel and compare the 
checksum to what I get when I run that command undisturbed.

I've seen "re-queued 11 requests", "re-queued 14 requests" and no re-queue message at all, and the checksums matched each time.

Index: ic/mpt_netbsd.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/mpt_netbsd.c,v
retrieving revision 1.17.2.1
diff -u -r1.17.2.1 mpt_netbsd.c
--- ic/mpt_netbsd.c     22 Nov 2012 17:19:56 -0000      1.17.2.1
+++ ic/mpt_netbsd.c     25 Nov 2013 14:19:27 -0000
@@ -82,6 +82,7 @@
 #include <dev/ic/mpt.h>                        /* pulls in all headers */
 
 static int     mpt_poll(mpt_softc_t *, struct scsipi_xfer *, int);
+static void    mpt_restart(mpt_softc_t *, request_t *);
 static void    mpt_timeout(void *);
 static void    mpt_done(mpt_softc_t *, uint32_t);
 static void    mpt_run_xfer(mpt_softc_t *, struct scsipi_xfer *);
@@ -347,6 +348,56 @@
 }
 
 static void
+mpt_restart(mpt_softc_t *mpt, request_t *req0)
+{
+       int i, s, nreq;
+       request_t *req;
+       struct scsipi_xfer *xs;
+
+       /* first, reset the IOC, leaving stopped so all requests are idle */
+       if (mpt_soft_reset(mpt) != MPT_OK) {
+               mpt_prt(mpt, "soft reset failed");
+               /* don't try a hard reset since this mangles the PCI configuration registers */
+               return;
+       }
+
+       /* freeze the channel so scsipi doesn't queue more commands */
+       scsipi_channel_freeze(&mpt->sc_channel, 1);
+
+       /* return all pending requests to scsipi and de-allocate them */
+       s = splbio();
+       nreq = 0;
+       for (i = 0; i < MPT_MAX_REQUESTS(mpt); i++) {
+               req = &mpt->request_pool[i];
+               xs = req->xfer;
+               if (xs != NULL) {
+                       if (xs->datalen != 0)
+                               bus_dmamap_unload(mpt->sc_dmat, req->dmap);
+                       req->xfer = NULL;
+                       callout_stop(&xs->xs_callout);
+                       if (req != req0) {
+                               nreq++;
+                               xs->error = XS_REQUEUE;
+                       }
+                       scsipi_done(xs);
+                       /* don't really need to mpt_free_request() since mpt_init() below will free all requests anyway */
+                       mpt_free_request(mpt, req);
+               }
+       }
+       splx(s);
+       if (nreq > 0)
+               mpt_prt(mpt, "re-queued %d requests", nreq);
+
+       /* re-initialize the IOC (which restarts it) */
+       if (mpt_init(mpt, MPT_DB_INIT_HOST) == 0)
+               mpt_prt(mpt, "restart succeeded");
+       /* else error message already printed */
+
+       /* thaw the channel, causing scsipi to re-queue the commands */
+       scsipi_channel_thaw(&mpt->sc_channel, 1);
+}
+
+static void
 mpt_timeout(void *arg)
 {
        request_t *req = arg;
@@ -383,14 +434,12 @@
        if (mpt->verbose > 1)
                mpt_print_scsi_io_request((MSG_SCSI_IO_REQUEST *)req->req_vbuf);
 
-       /* XXX WHAT IF THE IOC IS STILL USING IT?? */
-       req->xfer = NULL;
-       mpt_free_request(mpt, req);
+       splx(s);
 
+       /* restart the IOC since we don't know whether it's still using this request */
+       /* also, the IOC tends to lock up at least the respective device after a timeout */
        xs->error = XS_TIMEOUT;
-       scsipi_done(xs);
-
-       splx(s);
+       mpt_restart(mpt, req);
 }
 
 static void

Follow-Ups:
- Re: mpt(4) timeout recovery improvements
  - From: Edgar Fuß
- Re: mpt(4) timeout recovery improvements
  - From: Edgar Fuß
- Re: mpt(4) timeout recovery improvements
  - From: Brian Buhrow

References:
- mpii(4)
  - From: Edgar Fuß
- mpt(4) timeout recovery improvements
  - From: Edgar Fuß
- Re: mpt(4) timeout recovery improvements
  - From: Edgar Fuß
- Re: mpt(4) timeout recovery improvements
  - From: Manuel Bouyer

Prev by Date: Help for PR kern/46606 is needed
Next by Date: Re: Help for PR kern/46606 is needed
Previous by Thread: Re: mpt(4) timeout recovery improvements
Next by Thread: Re: mpt(4) timeout recovery improvements
Indexes:

Home | Main Index | Thread Index | Old Index