tech-kern archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
Re: mpt(4) timeout recovery improvements
OK, I have something working now (based on Brian Buhrow's patch).
Could somebody please review it, especially whether I got the locking
etc. right?
My basic test is to run
dd if=/dev/rsd2d of=/dev/null bs=1m
(where sd2 is an unused disc), then pull out and re-insert the disc.
With a stock kernel, that hangs after the timeout.
With the patch, it continues.
To test the impact on other concurrent SCSI commands, I background that dd
and run
dd if=/dev/rraid0e bs=1m | md5
(where rraid0e is a 20G unmounted partition) in parallel and compare the
checksum to what I get when I run that command undisturbed.
I've seen "re-queued 11 requests", "re-queued 14 requests" and no re-queue message at all, and the checksums match.
Index: ic/mpt_netbsd.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/mpt_netbsd.c,v
retrieving revision 1.17.2.1
diff -u -r1.17.2.1 mpt_netbsd.c
--- ic/mpt_netbsd.c 22 Nov 2012 17:19:56 -0000 1.17.2.1
+++ ic/mpt_netbsd.c 25 Nov 2013 14:19:27 -0000
@@ -82,6 +82,7 @@
#include <dev/ic/mpt.h> /* pulls in all headers */
static int mpt_poll(mpt_softc_t *, struct scsipi_xfer *, int);
+static void mpt_restart(mpt_softc_t *, request_t *);
static void mpt_timeout(void *);
static void mpt_done(mpt_softc_t *, uint32_t);
static void mpt_run_xfer(mpt_softc_t *, struct scsipi_xfer *);
@@ -347,6 +348,56 @@
}
static void
+mpt_restart(mpt_softc_t *mpt, request_t *req0)
+{
+ int i, s, nreq;
+ request_t *req;
+ struct scsipi_xfer *xs;
+
+ /* first, reset the IOC, leaving it stopped so all requests are idle */
+ if (mpt_soft_reset(mpt) != MPT_OK) {
+ mpt_prt(mpt, "soft reset failed");
+ /* don't try a hard reset since this mangles the PCI configuration registers */
+ return;
+ }
+
+ /* freeze the channel so scsipi doesn't queue more commands */
+ scsipi_channel_freeze(&mpt->sc_channel, 1);
+
+ /* return all pending requests to scsipi and de-allocate them */
+ s = splbio();
+ nreq = 0;
+ for (i = 0; i < MPT_MAX_REQUESTS(mpt); i++) {
+ req = &mpt->request_pool[i];
+ xs = req->xfer;
+ if (xs != NULL) {
+ if (xs->datalen != 0)
+ bus_dmamap_unload(mpt->sc_dmat, req->dmap);
+ req->xfer = NULL;
+ callout_stop(&xs->xs_callout);
+ if (req != req0) { /* req0's xfer keeps the error the caller already set; every other xfer is marked for re-queueing */
+ nreq++;
+ xs->error = XS_REQUEUE;
+ }
+ scsipi_done(xs);
+ /* don't really need to mpt_free_request() since mpt_init() below will free all requests anyway */
+ mpt_free_request(mpt, req);
+ }
+ }
+ splx(s);
+ if (nreq > 0)
+ mpt_prt(mpt, "re-queued %d requests", nreq);
+
+ /* re-initialize the IOC (which restarts it) */
+ if (mpt_init(mpt, MPT_DB_INIT_HOST) == 0)
+ mpt_prt(mpt, "restart succeeded");
+ /* else error message already printed */
+
+ /* thaw the channel, causing scsipi to re-queue the commands */
+ scsipi_channel_thaw(&mpt->sc_channel, 1);
+}
+
+static void
mpt_timeout(void *arg)
{
request_t *req = arg;
@@ -383,14 +434,12 @@
if (mpt->verbose > 1)
mpt_print_scsi_io_request((MSG_SCSI_IO_REQUEST *)req->req_vbuf);
- /* XXX WHAT IF THE IOC IS STILL USING IT?? */
- req->xfer = NULL;
- mpt_free_request(mpt, req);
+ splx(s);
+ /* restart the IOC since we don't know whether it's still using this request */
+ /* also, the IOC tends to lock up at least the respective device after a timeout */
xs->error = XS_TIMEOUT;
- scsipi_done(xs);
-
- splx(s);
+ mpt_restart(mpt, req);
}
static void
Home |
Main Index |
Thread Index |
Old Index