Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/dev/raidframe Vastly improve the error handling in the c...



details:   https://anonhg.NetBSD.org/src/rev/6c75a96125db
branches:  trunk
changeset: 573704:6c75a96125db
user:      oster <oster%NetBSD.org@localhost>
date:      Sat Feb 05 23:32:43 2005 +0000

description:
Vastly improve the error handling in the case of a read/write error
that occurs during a reconstruction.  We go from zero error handling
and likely panicing if something goes amiss, to gracefully bailing and
leaving the system in the best, usable state possible.

- introduce rf_DrainReconEventQueue() to allow easy cleaning of the
reconstruction event queue

- change how we cleanup the floating recon buffers in
rf_FreeReconControl().  Detect the end of the list rather
than traversing according to a count.

- keep track of the number of pending reconstruction writes.  In the
event of a read error, use this to wait long enough for the pending
writes to (hopefully) drain.

- more cleanup is still needed on this code, but I didn't want to
start mixing major functional changes with minor cleanups.

XXX: There is a known issue with pool items left outstanding due to
the IO failure, and this can show up in the form of a panic at the
tail end of a shutdown.  This problem is much less severe than before
these changes, and the hope/plan is that this problem will go away
once this code gets overhauled again.

diffstat:

 sys/dev/raidframe/rf_reconstruct.c |  308 +++++++++++++++++++++++++++++++-----
 sys/dev/raidframe/rf_reconstruct.h |    8 +-
 sys/dev/raidframe/rf_reconutil.c   |   14 +-
 sys/dev/raidframe/rf_revent.c      |   29 +++-
 sys/dev/raidframe/rf_revent.h      |    4 +-
 5 files changed, 307 insertions(+), 56 deletions(-)

diffs (truncated from 666 to 300 lines):

diff -r 1db845a5d8f1 -r 6c75a96125db sys/dev/raidframe/rf_reconstruct.c
--- a/sys/dev/raidframe/rf_reconstruct.c        Sat Feb 05 20:19:00 2005 +0000
+++ b/sys/dev/raidframe/rf_reconstruct.c        Sat Feb 05 23:32:43 2005 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: rf_reconstruct.c,v 1.81 2005/01/22 02:24:31 oster Exp $        */
+/*     $NetBSD: rf_reconstruct.c,v 1.82 2005/02/05 23:32:43 oster Exp $        */
 /*
  * Copyright (c) 1995 Carnegie-Mellon University.
  * All rights reserved.
@@ -33,7 +33,7 @@
  ************************************************************/
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.81 2005/01/22 02:24:31 oster Exp $");
+__KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.82 2005/02/05 23:32:43 oster Exp $");
 
 #include <sys/time.h>
 #include <sys/buf.h>
@@ -94,6 +94,11 @@
 
 #endif /* RF_DEBUG_RECON */
 
+#define RF_RECON_DONE_READS   1
+#define RF_RECON_READ_ERROR   2
+#define RF_RECON_WRITE_ERROR  3
+#define RF_RECON_READ_STOPPED 4
+
 #define RF_MAX_FREE_RECONBUFFER 32
 #define RF_MIN_FREE_RECONBUFFER 16
 
@@ -320,6 +325,16 @@
                rf_update_component_labels(raidPtr, 
                                           RF_NORMAL_COMPONENT_UPDATE);
 
+       } else {
+               /* Reconstruct failed. */
+
+               RF_LOCK_MUTEX(raidPtr->mutex);
+               /* Failed disk goes back to "failed" status */
+               raidPtr->Disks[col].status = rf_ds_failed;
+
+               /* Spare disk goes back to "spare" status. */
+               spareDiskPtr->status = rf_ds_spare;
+               RF_UNLOCK_MUTEX(raidPtr->mutex);
        }
        return (rc);
 }
@@ -496,10 +511,6 @@
        reconDesc->maxReconExecTicks = 0;
        rc = rf_ContinueReconstructFailedDisk(reconDesc);
        
-       RF_LOCK_MUTEX(raidPtr->mutex);
-       raidPtr->reconInProgress--;
-       RF_UNLOCK_MUTEX(raidPtr->mutex);
-       
        if (!rc) {
                RF_LOCK_MUTEX(raidPtr->mutex);
                /* Need to set these here, as at this point it'll be claiming
@@ -536,8 +547,18 @@
 
                rf_update_component_labels(raidPtr, 
                                           RF_NORMAL_COMPONENT_UPDATE);
+       } else {
+               /* Reconstruct-in-place failed.  Disk goes back to
+                  "failed" status, regardless of what it was before.  */
+               RF_LOCK_MUTEX(raidPtr->mutex);
+               raidPtr->Disks[col].status = rf_ds_failed;
+               RF_UNLOCK_MUTEX(raidPtr->mutex);
+       }
 
-       }
+       RF_LOCK_MUTEX(raidPtr->mutex);
+       raidPtr->reconInProgress--;
+       RF_UNLOCK_MUTEX(raidPtr->mutex);
+       
        RF_SIGNAL_COND(raidPtr->waitForReconCond);
        return (rc);
 }
@@ -552,9 +573,12 @@
        RF_ReconMap_t *mapPtr;
        RF_ReconCtrl_t *tmp_reconctrl;
        RF_ReconEvent_t *event;
+       RF_CallbackDesc_t *p;
        struct timeval etime, elpsd;
        unsigned long xor_s, xor_resid_us;
        int     i, ds;
+       int status;
+       int recon_error, write_error;
 
        raidPtr->accumXorTimeUs = 0;
 #if RF_ACC_TRACE > 0
@@ -609,19 +633,65 @@
         * they've completed all work */
 
        mapPtr = raidPtr->reconControl->reconMap;
-       
+       recon_error = 0;
+       write_error = 0;
+
        while (reconDesc->numDisksDone < raidPtr->numCol - 1) {
                
                event = rf_GetNextReconEvent(reconDesc);
                RF_ASSERT(event);
-               
-               if (ProcessReconEvent(raidPtr, event))
+
+               status = ProcessReconEvent(raidPtr, event);
+
+               /* the normal case is that a read completes, and all is well. */
+               if (status == RF_RECON_DONE_READS) {
                        reconDesc->numDisksDone++;
+               } else if ((status == RF_RECON_READ_ERROR) ||
+                          (status == RF_RECON_WRITE_ERROR)) {
+                       /* an error was encountered while reconstructing... 
+                          Pretend we've finished this disk. 
+                       */
+                       recon_error = 1;
+                       raidPtr->reconControl->error = 1;
+
+                       /* bump the numDisksDone count for reads, 
+                          but not for writes */
+                       if (status == RF_RECON_READ_ERROR)
+                               reconDesc->numDisksDone++;
+
+                       /* write errors are special -- when we are
+                          done dealing with the reads that are
+                          finished, we don't want to wait for any
+                          writes */
+                       if (status == RF_RECON_WRITE_ERROR)
+                               write_error = 1;
+
+               } else if (status == RF_RECON_READ_STOPPED) {
+                       /* count this component as being "done" */
+                       reconDesc->numDisksDone++;
+               }
+
+               if (recon_error) {
+
+                       /* make sure any stragglers are woken up so that
+                          their theads will complete, and we can get out
+                          of here with all IO processed */
+
+                       while (raidPtr->reconControl->headSepCBList) {
+                               p = raidPtr->reconControl->headSepCBList;
+                               raidPtr->reconControl->headSepCBList = p->next;
+                               p->next = NULL;
+                               rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
+                               rf_FreeCallbackDesc(p);
+                       }
+               }
+
                raidPtr->reconControl->numRUsTotal = 
                        mapPtr->totalRUs;
                raidPtr->reconControl->numRUsComplete = 
                        mapPtr->totalRUs - 
                        rf_UnitsLeftToReconstruct(mapPtr);
+
 #if RF_DEBUG_RECON
                raidPtr->reconControl->percentComplete = 
                        (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
@@ -637,19 +707,107 @@
        }
        /* at this point all the reads have completed.  We now wait
         * for any pending writes to complete, and then we're done */
-       
-       while (rf_UnitsLeftToReconstruct(raidPtr->reconControl->reconMap) > 0) {
+
+       while (!recon_error && rf_UnitsLeftToReconstruct(raidPtr->reconControl->reconMap) > 0) {
                
                event = rf_GetNextReconEvent(reconDesc);
                RF_ASSERT(event);
-               
-               (void) ProcessReconEvent(raidPtr, event);       /* ignore return code */
+
+               status = ProcessReconEvent(raidPtr, event);
+               if (status == RF_RECON_WRITE_ERROR) {
+                       recon_error = 1;
+                       raidPtr->reconControl->error = 1; 
+                       /* an error was encountered at the very end... bail */
+               } else {
 #if RF_DEBUG_RECON
-               raidPtr->reconControl->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
-               if (rf_prReconSched) {
-                       rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
+                       raidPtr->reconControl->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
+                       if (rf_prReconSched) {
+                               rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
+                       }
+#endif
                }
+       }
+
+       if (recon_error) {
+               /* we've encountered an error in reconstructing. */
+               printf("raid%d: reconstruction failed.\n", raidPtr->raidid);
+               
+               /* we start by blocking IO to the RAID set. */
+               rf_SuspendNewRequestsAndWait(raidPtr);
+       
+               RF_LOCK_MUTEX(raidPtr->mutex);
+               /* mark set as being degraded, rather than
+                  rf_rs_reconstructing as we were before the problem.
+                  After this is done we can update status of the
+                  component disks without worrying about someone
+                  trying to read from a failed component.
+               */
+               raidPtr->status = rf_rs_degraded;
+               RF_UNLOCK_MUTEX(raidPtr->mutex);
+               
+               /* resume IO */
+               rf_ResumeNewRequests(raidPtr);  
+       
+               /* At this point there are two cases:
+                  1) If we've experienced a read error, then we've
+                  already waited for all the reads we're going to get,
+                  and we just need to wait for the writes.
+
+                  2) If we've experienced a write error, we've also
+                  already waited for all the reads to complete,
+                  but there is little point in waiting for the writes --
+                  when they do complete, they will just be ignored.
+
+                  So we just wait for writes to complete if we didn't have a 
+                  write error.
+               */
+
+               if (!write_error) {
+                       /* wait for writes to complete */
+                       while (raidPtr->reconControl->pending_writes > 0) {
+                               event = rf_GetNextReconEvent(reconDesc);
+                               status = ProcessReconEvent(raidPtr, event);
+
+                               if (status == RF_RECON_WRITE_ERROR) {
+                                       raidPtr->reconControl->error = 1; 
+                                       /* an error was encountered at the very end... bail.
+                                          This will be very bad news for the user, since
+                                          at this point there will have been a read error
+                                          on one component, and a write error on another!
+                                       */
+                                       break;
+                               }
+                       }
+               }
+
+               
+               /* cleanup */
+
+               /* drain the event queue - after waiting for the writes above,
+                  there shouldn't be much (if anything!) left in the queue. */
+
+               rf_DrainReconEventQueue(reconDesc);
+               
+               /* XXX  As much as we'd like to free the recon control structure
+                  and the reconDesc, we have no way of knowing if/when those will
+                  be touched by IO that has yet to occur.  It is rather poor to be
+                  basically causing a 'memory leak' here, but there doesn't seem to be
+                  a cleaner alternative at this time.  Perhaps when the reconstruct code
+                  gets a makeover this problem will go away.
+               */
+#if 0
+               rf_FreeReconControl(raidPtr);
 #endif
+
+#if RF_ACC_TRACE > 0
+               RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
+#endif
+               /* XXX see comment above */
+#if 0
+               FreeReconDesc(reconDesc);
+#endif
+
+               return (1);
        }
 
        /* Success:  mark the dead disk as reconstructed.  We quiesce
@@ -683,7 +841,6 @@
               (int) raidPtr->reconControl->starttime.tv_sec,
               (int) raidPtr->reconControl->starttime.tv_usec,
               (int) etime.tv_sec, (int) etime.tv_usec);
-       
 #if RF_RECON_STATS > 0
        printf("raid%d: Total head-sep stall count was %d\n",
               raidPtr->raidid, (int) reconDesc->hsStallCount);
@@ -695,11 +852,10 @@
        FreeReconDesc(reconDesc);
        
        return (0);
+
 }
 /*****************************************************************************
  * do the right thing upon each reconstruction event.
- * returns nonzero if and only if there is nothing left unread on the 
- * indicated disk
  *****************************************************************************/
 static int 
 ProcessReconEvent(RF_Raid_t *raidPtr, RF_ReconEvent_t *event)
@@ -708,6 +864,8 @@
        RF_ReconBuffer_t *rbuf;
        RF_SectorCount_t sectorsPerRU;
 



Home | Main Index | Thread Index | Old Index