tech-kern archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

RAIDframe nested autoconfiguration



I've enhanced RAIDframe autoconfiguration to handle RAID sets having
components contained within other RAID sets (as, for example, in the
"RAID on RAID" section of raidctl(8)).  This, in particular, allows
mounting the root filesystem from a multi-layered RAID.

My changes also handle cases where some of the nodes in the RAID tree
must be brought up in degraded mode (e.g., if one leaf disk goes
missing from a striped mirror or striped parity arrangement), as well
as unbalanced trees of RAIDs (e.g., a RAID 0 built from a RAID 1 and
a plain disk).  It is unclear if the latter case is actually used by
anyone, but I felt I should honor the principle of least surprise where
possible.

Known limitation: only RAIDs that are partitioned with disklabels are
searched for RAID components; wedges are not currently supported for
this, mainly because I haven't looked into how they work internally.

A patch against -current is attached; I've tested it, but I may still
have overlooked a corner case.  Comments?

-- 
(let ((C call-with-current-continuation)) (apply (lambda (x y) (x y)) (map
((lambda (r) ((C C) (lambda (s) (r (lambda l (apply (s s) l))))))  (lambda
(f) (lambda (l) (if (null? l) C (lambda (k) (display (car l)) ((f (cdr l))
(C k)))))))    '((#\J #\d #\D #\v #\s) (#\e #\space #\a #\i #\newline)))))
Index: rf_netbsdkintf.c
===================================================================
RCS file: /bag/nb/repo/src/sys/dev/raidframe/rf_netbsdkintf.c,v
retrieving revision 1.257
diff -u -p -r1.257 rf_netbsdkintf.c
--- rf_netbsdkintf.c    28 Feb 2009 23:11:11 -0000      1.257
+++ rf_netbsdkintf.c    15 Mar 2009 20:56:53 -0000
@@ -308,15 +308,22 @@ static void raidunlock(struct raid_softc
 static void rf_markalldirty(RF_Raid_t *);
 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
 
+typedef enum rf_enough {
+       RF_ENOUGH_NO,
+       RF_ENOUGH_DEGRADED,
+       RF_ENOUGH_ALL
+} rf_enough_t;
+
 void rf_ReconThread(struct rf_recon_req *);
 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
 void rf_CopybackThread(RF_Raid_t *raidPtr);
 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
 int rf_autoconfig(struct device *self);
-void rf_buildroothack(RF_ConfigSet_t *);
+RF_ConfigSet_t *rf_buildroothack(RF_ConfigSet_t *, RF_AutoConfig_t **, int);
 
 RF_AutoConfig_t *rf_find_raid_components(void);
-RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
+RF_AutoConfig_t *rf_find_raid_components_on(device_t, RF_AutoConfig_t*);
+RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *, RF_ConfigSet_t*);
 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
 static int rf_reasonable_label(RF_ComponentLabel_t *);
 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
@@ -324,7 +331,8 @@ int rf_set_autoconfig(RF_Raid_t *, int);
 int rf_set_rootpartition(RF_Raid_t *, int);
 void rf_release_all_vps(RF_ConfigSet_t *);
 void rf_cleanup_config_set(RF_ConfigSet_t *);
-int rf_have_enough_components(RF_ConfigSet_t *);
+void rf_dispose_cset_list(RF_ConfigSet_t *);
+rf_enough_t rf_have_enough_components(RF_ConfigSet_t *);
 int rf_auto_config_set(RF_ConfigSet_t *, int *);
 static int rf_sync_component_caches(RF_Raid_t *raidPtr);
 
@@ -430,65 +438,126 @@ rf_autoconfig(struct device *self)
        /* XXX This code can only be run once. */
        raidautoconfig = 0;
 
-       /* 1. locate all RAID components on the system */
+       /* 1. locate all RAID components on regular disks */
 #ifdef DEBUG
        printf("Searching for RAID components...\n");
 #endif
        ac_list = rf_find_raid_components();
 
-       /* 2. Sort them into their respective sets. */
-       config_sets = rf_create_auto_sets(ac_list);
+       config_sets = NULL;
+       do {
+               do {
+                       /* 2. Sort them into their respective sets. */
+                       config_sets =
+                           rf_create_auto_sets(ac_list, config_sets);
+                       
+                       /*
+                        * 3. Evaluate each set and configure the valid ones.
+                        * This gets done in rf_buildroothack().
+                        */
+                       ac_list = NULL;
+                       config_sets =
+                           rf_buildroothack(config_sets, &ac_list, 0);
+                       
+                       /* 
+                        * 4. If there were RAID components inside a RAID that
+                        * we just autoconfigured ("RAID on RAID"), loop and
+                        * see if those want to be autoconfigured.
+                        */
+               } while (ac_list != NULL);
+
+               /*
+                * 5. If a RAID was missing components, but not enough to
+                * prevent configuration, then it was not handled above, in
+                * case the missing components turned up inside another RAID.
+                * (This is an odd use case, but there's no reason not to
+                * support it.)  Configure such RAIDs, if any, now.
+                */
+               config_sets = rf_buildroothack(config_sets, &ac_list, 1);
+
+               /* 
+                * 6. However, the configurations performed in the
+                * previous step may have revealed new RAID components
+                * (for example, if one leaf of a striped mirror or
+                * striped parity configuration is missing); loop and
+                * see if this makes any more sets autoconfigurable.
+                */
+       } while(ac_list != NULL);
 
        /*
-        * 3. Evaluate each set andconfigure the valid ones.
-        * This gets done in rf_buildroothack().
+        * 7. There may remain autoconfigured sets that cannot be
+        * brought up at all; dispose of them.
         */
-       rf_buildroothack(config_sets);
+       rf_dispose_cset_list(config_sets);
 
        return 1;
 }
 
-void
-rf_buildroothack(RF_ConfigSet_t *config_sets)
+RF_ConfigSet_t *
+rf_buildroothack(RF_ConfigSet_t *config_sets, RF_AutoConfig_t **newdevs,
+       int degraded)
 {
        RF_ConfigSet_t *cset;
        RF_ConfigSet_t *next_cset;
+       RF_ConfigSet_t *leftovers;
        int retcode;
        int raidID;
        int rootID;
        int col;
        int num_root;
+       int enough;
        char *devname;
 
        rootID = 0;
        num_root = 0;
        cset = config_sets;
+       leftovers = NULL;
        while(cset != NULL ) {
                next_cset = cset->next;
-               if (rf_have_enough_components(cset) &&
-                   cset->ac->clabel->autoconfigure==1) {
-                       retcode = rf_auto_config_set(cset,&raidID);
-                       if (!retcode) {
+               if (cset->ac->clabel->autoconfigure != 1) {
+                       /* we're not autoconfiguring this set...
+                          release the associated resources */
+                       rf_release_all_vps(cset);
+                       goto cleanup;
+               }
+
+               enough = rf_have_enough_components(cset);
+               if (enough == RF_ENOUGH_NO ||
+                   (!degraded && enough == RF_ENOUGH_DEGRADED)) {
+                       /* Not enough components yet.  Save for later... */
 #ifdef DEBUG
-                               printf("raid%d: configured ok\n", raidID);
+                       printf("raid%d: not enough components yet\n", raidID);
 #endif
-                               if (cset->rootable) {
-                                       rootID = raidID;
-                                       num_root++;
-                               }
-                       } else {
-                               /* The autoconfig didn't work :( */
+                       cset->next = leftovers;
+                       leftovers = cset;
+                       cset = next_cset;
+                       continue;
+               }
+
+               /* Enough components, for some value of "enough"; configure. */
+               retcode = rf_auto_config_set(cset,&raidID);
+               if (!retcode) {
 #ifdef DEBUG
-                               printf("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
+                       printf("raid%d: configured ok\n", raidID);
 #endif
-                               rf_release_all_vps(cset);
+                       if (cset->rootable) {
+                               rootID = raidID;
+                               num_root++;
                        }
+                       /* Look for components of a nested RAID. */
+                       *newdevs = rf_find_raid_components_on(
+                               device_find_by_driver_unit("raid",
+                                   raidID), *newdevs);
                } else {
-                       /* we're not autoconfiguring this set...
-                          release the associated resources */
+                       /* The autoconfig didn't work :( */
+#ifdef DEBUG
+                       printf("Autoconfig failed with code %d"
+                           " for raid%d\n", retcode, raidID);
+#endif
                        rf_release_all_vps(cset);
                }
                /* cleanup */
+cleanup:
                rf_cleanup_config_set(cset);
                cset = next_cset;
        }
@@ -497,13 +566,17 @@ rf_buildroothack(RF_ConfigSet_t *config_
           then we don't touch booted_device or boothowto... */
 
        if (rootspec != NULL)
-               return;
+               return leftovers;
 
        /* we found something bootable... */
 
        if (num_root == 1) {
                booted_device = raid_softc[rootID].sc_dev;
        } else if (num_root > 1) {
+                /*
+                * (Note: if multiple bootable raids are discovered
+                * on different passes, the results may be unexpected.)
+                */
 
                /* 
                 * Maybe the MD code can help. If it cannot, then
@@ -515,7 +588,7 @@ rf_buildroothack(RF_ConfigSet_t *config_
                if (booted_device == NULL)
                        cpu_rootconf();
                if (booted_device == NULL) 
-                       return;
+                       return leftovers;
 
                num_root = 0;
                for (raidID = 0; raidID < numraid; raidID++) {
@@ -547,9 +620,9 @@ rf_buildroothack(RF_ConfigSet_t *config_
                        boothowto |= RB_ASKNAME;
                }
        }
+       return leftovers;
 }
 
-
 int
 raidsize(dev_t dev)
 {
@@ -2855,13 +2928,7 @@ oomem:
 RF_AutoConfig_t *
 rf_find_raid_components()
 {
-       struct vnode *vp;
-       struct disklabel label;
-       struct device *dv;
-       dev_t dev;
-       int bmajor, bminor, wedge;
-       int error;
-       int i;
+       device_t dv;
        RF_AutoConfig_t *ac_list;
 
 
@@ -2902,95 +2969,109 @@ rf_find_raid_components()
                        continue;
                }
 
-               /* need to find the device_name_to_block_device_major stuff */
-               bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
+               ac_list = rf_find_raid_components_on(dv, ac_list);
+       }
+       return ac_list;
+}
+
+RF_AutoConfig_t* 
+rf_find_raid_components_on(device_t dv, RF_AutoConfig_t* ac_list)
+{
+       struct vnode *vp;
+       struct disklabel label;
+       dev_t dev;
+       int bmajor, bminor, wedge;
+       int error, i;
+
+       /* need to find the device_name_to_block_device_major stuff */
+       bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
 
-               /* get a vnode for the raw partition of this disk */
+       /* get a vnode for the raw partition of this disk */
+       wedge = device_is_a(dv, "dk");
+       bminor = minor(device_unit(dv));
 
-               wedge = device_is_a(dv, "dk");
-               bminor = minor(device_unit(dv));
-               dev = wedge ? makedev(bmajor, bminor) :
-                   MAKEDISKDEV(bmajor, bminor, RAW_PART);
-               if (bdevvp(dev, &vp))
-                       panic("RAID can't alloc vnode");
+       dev = wedge ? makedev(bmajor, bminor) :
+           MAKEDISKDEV(bmajor, bminor, RAW_PART);
+       
+       if (bdevvp(dev, &vp))
+               panic("RAID can't alloc vnode");
 
-               error = VOP_OPEN(vp, FREAD, NOCRED);
+       error = VOP_OPEN(vp, FREAD, NOCRED);
 
+       if (error) {
+               /* "Who cares."  Continue looking
+                  for something that exists*/
+               vput(vp);
+               return ac_list;
+       }
+       
+       if (wedge) {
+               struct dkwedge_info dkw;
+               error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
+                   NOCRED);
                if (error) {
-                       /* "Who cares."  Continue looking
-                          for something that exists*/
+                       printf("RAIDframe: can't get wedge info for "
+                           "dev %s (%d)\n", device_xname(dv), error);
+                       vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+                       VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
                        vput(vp);
-                       continue;
+                       return ac_list;
                }
-
-               if (wedge) {
-                       struct dkwedge_info dkw;
-                       error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
-                           NOCRED);
-                       if (error) {
-                               printf("RAIDframe: can't get wedge info for "
-                                   "dev %s (%d)\n", device_xname(dv), error);
-                               vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-                               VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
-                               vput(vp);
-                               continue;
-                       }
-
-                       if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
-                               vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-                               VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
-                               vput(vp);
-                               continue;
-                       }
-                               
-                       ac_list = rf_get_component(ac_list, dev, vp,
-                           device_xname(dv), dkw.dkw_size);
-                       continue;
+               
+               if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
+                       vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+                       VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
+                       vput(vp);
+                       return ac_list;
                }
-
-               /* Ok, the disk exists.  Go get the disklabel. */
-               error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
+               
+               ac_list = rf_get_component(ac_list, dev, vp,
+                   device_xname(dv), dkw.dkw_size);
+               return ac_list;
+       }
+       
+       /* Ok, the disk exists.  Go get the disklabel. */
+       error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
+       if (error) {
+               /*
+                * XXX can't happen - open() would
+                * have errored out (or faked up one)
+                */
+               if (error != ENOTTY)
+                       printf("RAIDframe: can't get label for dev "
+                           "%s (%d)\n", device_xname(dv), error);
+       }
+       
+       /* don't need this any more.  We'll allocate it again
+          a little later if we really do... */
+       vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+       VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
+       vput(vp);
+       
+       if (error)
+               return ac_list;
+       
+       for (i = 0; i < label.d_npartitions; i++) {
+               char cname[sizeof(ac_list->devname)];
+               
+               /* We only support partitions marked as RAID */
+               if (label.d_partitions[i].p_fstype != FS_RAID)
+                       continue;
+               
+               dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
+               if (bdevvp(dev, &vp))
+                       panic("RAID can't alloc vnode");
+               
+               error = VOP_OPEN(vp, FREAD, NOCRED);
                if (error) {
-                       /*
-                        * XXX can't happen - open() would
-                        * have errored out (or faked up one)
-                        */
-                       if (error != ENOTTY)
-                               printf("RAIDframe: can't get label for dev "
-                                   "%s (%d)\n", device_xname(dv), error);
-               }
-
-               /* don't need this any more.  We'll allocate it again
-                  a little later if we really do... */
-               vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-               VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
-               vput(vp);
-
-               if (error)
+                       /* Whatever... */
+                       vput(vp);
                        continue;
-
-               for (i = 0; i < label.d_npartitions; i++) {
-                       char cname[sizeof(ac_list->devname)];
-
-                       /* We only support partitions marked as RAID */
-                       if (label.d_partitions[i].p_fstype != FS_RAID)
-                               continue;
-
-                       dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
-                       if (bdevvp(dev, &vp))
-                               panic("RAID can't alloc vnode");
-
-                       error = VOP_OPEN(vp, FREAD, NOCRED);
-                       if (error) {
-                               /* Whatever... */
-                               vput(vp);
-                               continue;
-                       }
-                       snprintf(cname, sizeof(cname), "%s%c",
-                           device_xname(dv), 'a' + i);
-                       ac_list = rf_get_component(ac_list, dev, vp, cname,
-                               label.d_partitions[i].p_size);
                }
+               snprintf(cname, sizeof(cname), "%s%c",
+                   device_xname(dv), 'a' + i);
+               ac_list = rf_get_component(ac_list, dev, vp, cname,
+                   label.d_partitions[i].p_size);
        }
        return ac_list;
 }
@@ -3048,16 +3129,12 @@ rf_print_component_label(RF_ComponentLab
 #endif
 
 RF_ConfigSet_t *
-rf_create_auto_sets(RF_AutoConfig_t *ac_list)
+rf_create_auto_sets(RF_AutoConfig_t *ac_list, RF_ConfigSet_t *config_sets)
 {
        RF_AutoConfig_t *ac;
-       RF_ConfigSet_t *config_sets;
        RF_ConfigSet_t *cset;
        RF_AutoConfig_t *ac_next;
 
-
-       config_sets = NULL;
-
        /* Go through the AutoConfig list, and figure out which components
           belong to what sets.  */
        ac = ac_list;
@@ -3170,7 +3247,7 @@ rf_does_it_fit(RF_ConfigSet_t *cset, RF_
        return(1);
 }
 
-int
+rf_enough_t
 rf_have_enough_components(RF_ConfigSet_t *cset)
 {
        RF_AutoConfig_t *ac;
@@ -3243,16 +3320,15 @@ rf_have_enough_components(RF_ConfigSet_t
                                            component, it's
                                            "Good Night, Charlie" */
                                        if (even_pair_failed == 1) {
-                                               return(0);
+                                               return RF_ENOUGH_NO;
                                        }
                                }
-                       } else {
-                               /* normal accounting */
-                               num_missing++;
                        }
+                       /* normal accounting */
+                       num_missing++;
                }
                if ((parity_type == '1') && (c%2 == 1)) {
-                               /* Just did an even component, and we didn't
+                               /* Just did an odd component, and we didn't
                                   bail.. reset the even_pair_failed flag,
                                   and go on to the next component.... */
                        even_pair_failed = 0;
@@ -3266,11 +3342,11 @@ rf_have_enough_components(RF_ConfigSet_t
            ((clabel->parityConfig == '5') && (num_missing > 1))) {
                /* XXX this needs to be made *much* more general */
                /* Too many failures */
-               return(0);
+               return RF_ENOUGH_NO;
        }
        /* otherwise, all is well, and we've got enough to take a kick
           at autoconfiguring this set */
-       return(1);
+       return (num_missing > 0) ? RF_ENOUGH_DEGRADED : RF_ENOUGH_ALL;
 }
 
 void
@@ -3412,6 +3488,16 @@ rf_cleanup_config_set(RF_ConfigSet_t *cs
        free(cset, M_RAIDFRAME);
 }
 
+void rf_dispose_cset_list(RF_ConfigSet_t *cset)
+{
+       while (cset) {
+               RF_ConfigSet_t *next_cset = cset->next;
+               
+               rf_release_all_vps(cset);
+               rf_cleanup_config_set(cset);
+               cset = next_cset;
+       }
+}
 
 void
 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)


Home | Main Index | Thread Index | Old Index