Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/dev/raidframe On an idea from Thor (tls@), do not fail a...



details:   https://anonhg.NetBSD.org/src/rev/e6c3201c1580
branches:  trunk
changeset: 571301:e6c3201c1580
user:      oster <oster%NetBSD.org@localhost>
date:      Tue Nov 16 16:45:51 2004 +0000

description:
On an idea from Thor (tls@), do not fail a component if doing so would
render the RAID set completely dead.  Instead, we retry the IO a
maximum of RF_RETRY_THRESHOLD times (currently '5'), and then just
return an IO error if the IO fails.  This should reduce the damage
caused by having multiple disks appear to fail when the culprit is
really something else (power, controllers, etc.)

diffstat:

 sys/dev/raidframe/rf_desc.h        |   3 ++-
 sys/dev/raidframe/rf_driver.c      |   5 +++--
 sys/dev/raidframe/rf_driver.h      |   6 +++++-
 sys/dev/raidframe/rf_netbsdkintf.c |  11 +++++++----
 sys/dev/raidframe/rf_states.c      |  24 ++++++++++++++++++------
 5 files changed, 35 insertions(+), 14 deletions(-)

diffs (150 lines):

diff -r 54af07ac19e6 -r e6c3201c1580 sys/dev/raidframe/rf_desc.h
--- a/sys/dev/raidframe/rf_desc.h       Tue Nov 16 14:42:19 2004 +0000
+++ b/sys/dev/raidframe/rf_desc.h       Tue Nov 16 16:45:51 2004 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: rf_desc.h,v 1.14 2004/06/02 22:58:28 drochner Exp $    */
+/*     $NetBSD: rf_desc.h,v 1.15 2004/11/16 16:45:51 oster Exp $       */
 /*
  * Copyright (c) 1995 Carnegie-Mellon University.
  * All rights reserved.
@@ -78,6 +78,7 @@
                                 * RAID operation has gotten */
        const RF_AccessState_t *states; /* array of states to be run */
        int     status;         /* pass/fail status of the last operation */
+       int     numRetries;     /* number of times this IO has been retried */
        RF_DagList_t *dagList;  /* list of dag lists, one list per stripe */
        RF_VoidPointerListElem_t *iobufs; /* iobufs that need to be cleaned 
                                             up at the end of this IO */
diff -r 54af07ac19e6 -r e6c3201c1580 sys/dev/raidframe/rf_driver.c
--- a/sys/dev/raidframe/rf_driver.c     Tue Nov 16 14:42:19 2004 +0000
+++ b/sys/dev/raidframe/rf_driver.c     Tue Nov 16 16:45:51 2004 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: rf_driver.c,v 1.104 2004/06/29 17:09:01 oster Exp $    */
+/*     $NetBSD: rf_driver.c,v 1.105 2004/11/16 16:45:51 oster Exp $    */
 /*-
  * Copyright (c) 1999 The NetBSD Foundation, Inc.
  * All rights reserved.
@@ -73,7 +73,7 @@
 
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: rf_driver.c,v 1.104 2004/06/29 17:09:01 oster Exp $");
+__KERNEL_RCSID(0, "$NetBSD: rf_driver.c,v 1.105 2004/11/16 16:45:51 oster Exp $");
 
 #include "opt_raid_diagnostic.h"
 
@@ -573,6 +573,7 @@
        desc->dagList = NULL;
 
        desc->status = 0;
+       desc->numRetries = 0;
 #if RF_ACC_TRACE > 0
        memset((char *) &desc->tracerec, 0, sizeof(RF_AccTraceEntry_t));
 #endif
diff -r 54af07ac19e6 -r e6c3201c1580 sys/dev/raidframe/rf_driver.h
--- a/sys/dev/raidframe/rf_driver.h     Tue Nov 16 14:42:19 2004 +0000
+++ b/sys/dev/raidframe/rf_driver.h     Tue Nov 16 16:45:51 2004 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: rf_driver.h,v 1.12 2004/06/02 22:58:30 drochner Exp $  */
+/*     $NetBSD: rf_driver.h,v 1.13 2004/11/16 16:45:52 oster Exp $     */
 /*
  * rf_driver.h
  */
@@ -37,6 +37,10 @@
 #include "rf_threadstuff.h"
 #include "rf_netbsd.h"
 
+#ifndef RF_RETRY_THRESHOLD
+#define RF_RETRY_THRESHOLD 5
+#endif
+
 RF_DECLARE_EXTERN_MUTEX(rf_printf_mutex)
 int rf_BootRaidframe(void);
 int rf_UnbootRaidframe(void);
diff -r 54af07ac19e6 -r e6c3201c1580 sys/dev/raidframe/rf_netbsdkintf.c
--- a/sys/dev/raidframe/rf_netbsdkintf.c        Tue Nov 16 14:42:19 2004 +0000
+++ b/sys/dev/raidframe/rf_netbsdkintf.c        Tue Nov 16 16:45:51 2004 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: rf_netbsdkintf.c,v 1.182 2004/10/28 07:07:44 yamt Exp $        */
+/*     $NetBSD: rf_netbsdkintf.c,v 1.183 2004/11/16 16:45:51 oster Exp $       */
 /*-
  * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
  * All rights reserved.
@@ -146,7 +146,7 @@
  ***********************************************************/
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.182 2004/10/28 07:07:44 yamt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.183 2004/11/16 16:45:51 oster Exp $");
 
 #include <sys/param.h>
 #include <sys/errno.h>
@@ -1947,8 +1947,11 @@
        if (bp->b_flags & B_ERROR) {
                /* Mark the disk as dead */
                /* but only mark it once... */
-               if (queue->raidPtr->Disks[queue->col].status ==
-                   rf_ds_optimal) {
+               /* and only if it wouldn't leave this RAID set 
+                  completely broken */
+               if ((queue->raidPtr->Disks[queue->col].status ==
+                   rf_ds_optimal) && (queue->raidPtr->numFailures < 
+                                      queue->raidPtr->Layout.map->faultsTolerated)) {
                        printf("raid%d: IO Error.  Marking %s as failed.\n",
                               queue->raidPtr->raidid,
                               queue->raidPtr->Disks[queue->col].devname);
diff -r 54af07ac19e6 -r e6c3201c1580 sys/dev/raidframe/rf_states.c
--- a/sys/dev/raidframe/rf_states.c     Tue Nov 16 14:42:19 2004 +0000
+++ b/sys/dev/raidframe/rf_states.c     Tue Nov 16 16:45:51 2004 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: rf_states.c,v 1.35 2004/03/23 13:09:18 oster Exp $     */
+/*     $NetBSD: rf_states.c,v 1.36 2004/11/16 16:45:52 oster Exp $     */
 /*
  * Copyright (c) 1995 Carnegie-Mellon University.
  * All rights reserved.
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: rf_states.c,v 1.35 2004/03/23 13:09:18 oster Exp $");
+__KERNEL_RCSID(0, "$NetBSD: rf_states.c,v 1.36 2004/11/16 16:45:52 oster Exp $");
 
 #include <sys/errno.h>
 
@@ -512,13 +512,18 @@
 
        desc->status = 0;       /* good status */
 
-       if (selectStatus) {
+       if (selectStatus || (desc->numRetries > RF_RETRY_THRESHOLD)) {
                /* failed to create a dag */
                /* this happens when there are too many faults or incomplete
                 * dag libraries */
-               printf("raid%d: failed to create a dag. "
-                      "Too many component failures.\n", 
-                      desc->raidPtr->raidid);
+               if (selectStatus) {
+                       printf("raid%d: failed to create a dag. "
+                              "Too many component failures.\n", 
+                              desc->raidPtr->raidid);
+               } else {
+                       printf("raid%d: IO failed after %d retries.\n",
+                              desc->raidPtr->raidid, RF_RETRY_THRESHOLD);
+               }
 
                desc->status = 1; /* bad status */ 
                /* skip straight to rf_State_Cleanup() */
@@ -624,6 +629,13 @@
                                rf_FreeDAGList(temp);
                        }
                        rf_MarkFailuresInASMList(raidPtr, asmh);
+
+                       /* note the retry so that we'll bail in
+                          rf_State_CreateDAG() once we've retried
+                          the IO RF_RETRY_THRESHOLD times */
+
+                       desc->numRetries++;
+
                        /* back up to rf_State_CreateDAG */
                        desc->state = desc->state - 2;
                        return RF_FALSE;



Home | Main Index | Thread Index | Old Index