NetBSD-Bugs archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

kern/46895: raidframe autoconfiguration cannot handle layered devices



>Number:         46895
>Category:       kern
>Synopsis:       raidframe autoconfiguration cannot handle layered devices
>Confidential:   no
>Severity:       serious
>Priority:       high
>Responsible:    kern-bug-people
>State:          open
>Class:          change-request
>Submitter-Id:   net
>Arrival-Date:   Mon Sep 03 12:05:00 +0000 2012
>Originator:     Wolfgang Stukenbrock
>Release:        NetBSD 5.1_STABLE - also current
>Organization:
Dr. Nagler & Company GmbH
>Environment:
        
        
System: NetBSD s012 4.0 NetBSD 4.0 (NSW-S012) #12: Tue Jun 19 11:15:19 CEST 
2012 ncadmin@s012:/usr/src/sys/arch/amd64/compile/NSW-S012 amd64
Architecture: x86_64
Machine: amd64
>Description:
        As reported for 4.0 in PR-39784, the raidframe autoconfig code is still
not able to reconstruct layered raid devices
        automatically on reboot. This is very ugly in raid1/0 combinations used to
achieve large devices.
        The old patch in PR-39784 was not able to handle GPT partitions
correctly. The patch below now handles GPT
        partitions and makes it possible to create filesystems larger than 2TB on
layered raid-devices of any depth.
        The only restriction is that all components of a raid-device should be
on the same depth-level in order to avoid
        possible confusion during autoconfiguration. (See the comment in the patch
for further details.)
>How-To-Repeat:
        Set up a layered raid-device - e.g. a stripe of two mirrors - with
autoconfiguration turned on, and reboot the system.
        The two mirrors get configured again, but the stripe built on them will not
be configured, and mounting the filesystems
        on the stripe will fail.
>Fix:
        The following patch adds autoconfiguration for layered raid-devices.

--- rf_netbsdkintf.c.orig       2012-06-27 13:15:11.000000000 +0200
+++ rf_netbsdkintf.c    2012-08-29 11:52:45.000000000 +0200
@@ -214,6 +214,9 @@
 static void raid_attach(struct device *, struct device *, void *);
 static int raid_detach(struct device *, int);
 
+static RF_AutoConfig_t *rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, 
struct vnode *vp,
+    const char *cname, RF_SectorCount_t size,  uint64_t numsecs, unsigned 
secsize);
+
 static int raidread_component_area(dev_t, struct vnode *, void *, size_t, 
     daddr_t, daddr_t);
 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
@@ -319,7 +322,7 @@
 void rf_buildroothack(RF_ConfigSet_t *);
 
 RF_AutoConfig_t *rf_find_raid_components(void);
-RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
+RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *, RF_ConfigSet_t *);
 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
 static int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
@@ -436,7 +439,7 @@
        ac_list = rf_find_raid_components();
 
        /* 2. Sort them into their respective sets. */
-       config_sets = rf_create_auto_sets(ac_list);
+       config_sets = rf_create_auto_sets(ac_list, NULL);
 
        /*
         * 3. Evaluate each set andconfigure the valid ones.
@@ -447,11 +450,168 @@
        return 1;
 }
 
+static int rf_buildroot_sub(RF_ConfigSet_t *cset, int *r_id, int level)
+{
+       int retcode;    
+
+       if (cset->ac->clabel->autoconfigure != 1) {
+/* force dropping of no-autoconfigure devices from list as soon as possible */
+               rf_release_all_vps(cset);
+               return -2;
+       }
+       if (rf_have_enough_components(cset) >= level) {
+               if ((retcode = rf_auto_config_set(cset, r_id)) == 0) {
+                       aprint_debug("raid%d: configured ok\n", *r_id); 
+                       /* cleanup */
+                       if (cset->rootable)
+                               retcode = 1;
+                       rf_cleanup_config_set(cset);
+                       return retcode;
+               }       
+               printf("Autoconfig failed with code %d for raid%d\n", retcode, 
*r_id);
+               rf_release_all_vps(cset);
+               return -2; 
+       }
+       return -1;
+}                               
+
+static RF_AutoConfig_t *
+rf_buildroot_search(int raidID)
+{
+  int bmajor, bminor, error, i;
+  dev_t bdev;
+  struct vnode *vp;
+  struct disklabel label;
+  RF_AutoConfig_t *ac_list = NULL;
+  uint64_t numsecs;
+  unsigned secsize;
+  struct raid_softc *rs;
+
+  KASSERT(raidID >= 0 && raidID < numraid);
+  rs = &raid_softc[raidID];
+
+  if (rs->sc_dkdev.dk_nwedges != 0)
+    { // we have wedges and may not use the normal partion information ...
+      struct dkwedge_list wl;
+      struct dkwedge_info *wi;
+
+// XXX the interface for wedges is "strange" - sorry, but this is true
+// XXX need to allocate memory, because cannot access the structures itself ...
+// XXX need to use the returned strings to search in the array just read for 
additional data ....
+// XXX - OK, we do this only at boot time, so it is not time critical
+
+      wl.dkwl_bufsize = sizeof(struct dkwedge_info) * rs->sc_dkdev.dk_nwedges;
+      if ((wi = malloc(wl.dkwl_bufsize, M_RAIDFRAME, M_NOWAIT)) != NULL)
+       {
+         wl.dkwl_buf = wi;
+// remark: we are alone durint boot - so no one may changed the wedges in this 
device ...
+         if (dkwedge_list(&rs->sc_dkdev, &wl, NULL) == 0)
+           {
+             for (i = 0; i < wl.dkwl_ncopied; i++)
+               { /* need to find the device_name_to_block_device_major stuff */
+                 device_t dv;
+
+                 if (strcmp("raidframe", wi[i].dkw_ptype))
+                   {
+                     aprint_debug("Autoconfig wedge '%s' on raid device is not 
of type '%s' - ignored\n", wi[i].dkw_wname, wi[i].dkw_ptype);
+                     continue;
+                   }
+                 bmajor = devsw_name2blk(wi[i].dkw_devname, NULL, 0);
+                 if ((dv = dkwedge_find_by_wname(wi[i].dkw_wname)) == NULL) 
continue;
+                 bminor = minor(device_unit(dv));
+                 bdev = makedev(bmajor, bminor);
+                 if (bdevvp(bdev, &vp))
+                   {
+                     printf("RAID-autoconfig: can't alloc vnode for wedge '%s' 
on just configured RAID device raid%d -  ignored\n",
+                       wi[i].dkw_wname, raidID);
+                     continue; 
+                   }
+                 if ((error = VOP_OPEN(vp, FREAD, NOCRED)) != 0)
+                   {
+                     printf("RAID-autoconfig: failed to open wedge '%s' on 
just configured RAID device raid%d (%d) - ignored\n",
+                       wi[i].dkw_wname, raidID, error);
+                     goto wedge_drop;
+                   }
+                 if ((error = getdisksize(vp, &numsecs, &secsize)) != 0)
+                   {
+                     printf("RAID-autoconfig: failed to get size information 
for wedge '%s' on just configured RAID device raid%d (%d) - ignored\n",
+                       wi[i].dkw_wname, raidID, error);
+                     vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+                     VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
+                   wedge_drop:
+                     vput(vp);
+                     continue;
+                   }
+                 ac_list = rf_get_component(ac_list, bdev, vp, 
wi[i].dkw_wname, wi[i].dkw_size, numsecs, secsize);
+                }
+           }
+         free(wi, M_RAIDFRAME);
+       }
+      return ac_list;
+    }
+
+// remark: it is to complex  to use our interface routines directly, because 
we want the open() to read in the label ...
+    bmajor = devsw_name2blk("raid", NULL, 0);
+    bdev = MAKEDISKDEV(bmajor, raidID, RAW_PART);
+    if (bdevvp(bdev, &vp))
+      panic("RAID can't alloc vnode in rf_buildroot_search()");
+
+    if ((error = VOP_OPEN(vp, FREAD, NOCRED)) != 0)
+      {
+       printf("RAID-autoconfig: failed to open just configured RAID device 
raid%d (%d) - ignored\n", raidID, error);
+       vput(vp);
+       return NULL;
+      }
+       /* Ok, the disk exists.  Go get the size and disklabel. */
+    if ((error = getdisksize(vp, &numsecs, &secsize)) != 0)
+      {
+       printf("RAID-autoconfig: failed to get size information for just 
configured RAID device raid%d (%d) - ignored\n",
+         raidID, error);
+      }
+    else if ((error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED)) != 0)
+      {
+       /*
+        * XXX can't happen - open() would
+        * have errored out (or faked up one)
+        */
+       if (error != ENOTTY)
+         printf("RAIDframe: can't get disk-label for dev raid%d (%d)\n", 
raidID, error);
+      }
+
+/* don't need this any more.  We'll allocate it again a little later if we 
really do... */
+    vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+    VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
+    vput(vp);
+    if (error) return NULL;
+
+    for (i = 0; i < label.d_npartitions; i++)
+      {
+       char cname[sizeof(ac_list->devname)];
+
+       /* We only support partitions marked as RAID */
+       if (label.d_partitions[i].p_fstype != FS_RAID) continue;
+
+       bdev = MAKEDISKDEV(bmajor, raidID, i);
+       if (bdevvp(bdev, &vp)) panic("RAID can't alloc vnode");
+
+       if ((error = VOP_OPEN(vp, FREAD, NOCRED)) != 0)
+         { /* Whatever... */
+           vput(vp);
+           continue;
+         }
+       snprintf(cname, sizeof(cname), "raid%d%c", raidID, 'a' + i);
+       ac_list = rf_get_component(ac_list, bdev, vp, cname, 
label.d_partitions[i].p_size, numsecs, secsize);
+      }
+  return ac_list;
+}
+
 void
 rf_buildroothack(RF_ConfigSet_t *config_sets)
 {
        RF_ConfigSet_t *cset;
        RF_ConfigSet_t *next_cset;
+       RF_ConfigSet_t *prev_cset;
+       RF_AutoConfig_t *ac_list;
        int retcode;
        int raidID;
        int rootID;
@@ -459,34 +619,114 @@
        int num_root;
        char *devname;
 
+
        rootID = 0;
        num_root = 0;
+config_again:          /* first step: configure only "complete" devices */
        cset = config_sets;
+       prev_cset = NULL;
        while(cset != NULL ) {
                next_cset = cset->next;
-               if (rf_have_enough_components(cset) &&
-                   cset->ac->clabel->autoconfigure==1) {
-                       retcode = rf_auto_config_set(cset,&raidID);
-                       if (!retcode) {
-                               aprint_debug("raid%d: configured ok\n", raidID);
-                               if (cset->rootable) {
-                                       rootID = raidID;
-                                       num_root++;
-                               }
-                       } else {
-                               /* The autoconfig didn't work :( */
-                               aprint_debug("Autoconfig failed with code %d 
for raid%d\n", retcode, raidID);
-                               rf_release_all_vps(cset);
+               retcode = rf_buildroot_sub(cset, &raidID, 2); /* only complete 
raid devices ... */
+               if (retcode <= -2) {    /* aproach to configure failed in 
raidframe code - drop it - cannot succeed later ... */
+                       if (prev_cset == NULL) /* this set is dead - remove it 
from the list ... */
+                               config_sets = next_cset;
+                       else
+                               prev_cset->next = next_cset;
+                       rf_cleanup_config_set(cset);
+               } else if (retcode >= 0) {
+                       if (retcode >= 1) {
+                               rootID = raidID;
+                               num_root++;
                        }
-               } else {
-                       /* we're not autoconfiguring this set...
-                          release the associated resources */
-                       rf_release_all_vps(cset);
+                       if (prev_cset == NULL) /* this set has been configured 
and the config data is gone - remove it from the list ... */
+                               config_sets = next_cset;
+                       else
+                               prev_cset->next = next_cset;
+                       if ((ac_list = rf_buildroot_search(raidID)) != NULL) {
+               add_new_components:
+       /*
+        * XXXXXX
+        *
+        * We may have a problem with our new config list ...
+        *
+        * We may not add any component to the remaining sets if we have 
already configured that one.
+        * Otherwise we will add a new set information for the raid device 
already configured by this routine.
+        * This can only happen, if some components are layered raid devices 
and some not - see below too.
+        * 
+        * In worst case, this may lead to a mirror splitted into two degraded 
devices.
+        *
+        * This problem can only occure if there are failed (missing) 
components, because if all components
+        * are available, the first loop will handle everything and there this 
problem is only be relevant for spare devices.
+        * (in some setups hot-spare devices may get dropped from a raid during 
autoconfig if they come from a higher layer-level ...)
+        *
+        * On the other hand, we are talking only about layered raid devices 
here!
+        * And the described problem can only occure with such kind of 
components - components that reside on a raid device.
+        * The only realy usefull setup here (from my oppinion) would be a 
raid0 (stripe) device that is constructed from some
+        * raid 1 devices (and perhaps 4 or 5 - possible but less usefull) in 
order to increase the total size of the raid device.
+        * (remark: concatenation is not avaliable in raidframe ...)
+        * And a raid 0 device has no spare disks at all.
+        *
+        * If it is intended to catch all (strange) setup-cases, we need an 
addtional list of already configured devices and
+        * drop additional components for those devices from the list. Dropping 
should be done in a verbose way, because we
+        * are gooing to change the raid configuration during boot here!
+        * For now this list is not implemented.
+        *
+        * In order to avoid configuration of layered raid devices prior all 
"lower-level" raid devices have been configured, we
+        * will add new devices to the end of the list of all sets in 
rf_create_auto_sets() - see change below.
+        * So we have a stable and reliable autoconfig here, if all components 
of a raid device are at the same layer-level.
+        * And we can build up as much levels we ever want to and autoconfigure 
them during startup.
+        *
+        * W. Stukenbrock
+        */
+                               config_sets = rf_create_auto_sets(ac_list, 
config_sets);
+                               goto config_again;
+                       }
+               } else { /* ignore missing component errors here - retry later 
... */
+                       prev_cset = cset;
+               }
+               cset = next_cset;
+       }
+                       /* second step, try to configure the rest - on failure, 
give them an additional try later */
+                       /* perhaps some additional components are found and we 
will succeed later ... */
+       cset = config_sets;
+       prev_cset = NULL;
+       while(cset != NULL ) {
+               next_cset = cset->next;
+               retcode = rf_buildroot_sub(cset, &raidID, 1); /* all raid 
devices that may have a chance ... */
+               if (retcode <= -2) {    /* aproach to configure failed in 
raidframe code - drop it - cannot succeed later ... */
+                       if (prev_cset == NULL) /* this set is dead - remove it 
from the list ... */
+                               config_sets = next_cset;
+                       else
+                               prev_cset->next = next_cset;
+                       rf_cleanup_config_set(cset);
+               } else if (retcode >= 0) {
+                       if (retcode >= 1) {
+                               rootID = raidID;
+                               num_root++;
+                       }
+                       if (prev_cset == NULL) /* this set has been configured 
and the config data is gone - remove it from the list ... */
+                               config_sets = next_cset;
+                       else
+                               prev_cset->next = next_cset;
+                       if ((ac_list = rf_buildroot_search(raidID)) != NULL) 
goto add_new_components;
+               } else { /* ignore to few component errors here - we retry 
later ... */
+                       prev_cset = cset; /* not complete - just try next one 
in this loop ... */
                }
-               /* cleanup */
-               rf_cleanup_config_set(cset);
                cset = next_cset;
        }
+/* if we still have some config sets now, that cannot be configured at all - 
still missing components
+ * we can just dropt them now, because we wuold have configured them in the 
second loop, because we always start over
+ * if we find additional raid components
+ */
+                       /* third step, try to configure the rest - if still not 
possible, drop it */
+                       /* it makes no sense to search for additonal components 
anymore - has already been done in the second step */
+       while((cset = config_sets) != NULL ) {
+               config_sets = cset->next;
+               printf("Autoconfig failed with due to missing components for 
raid%d\n", raidID);
+               rf_release_all_vps(cset);
+               rf_cleanup_config_set(cset);
+       }
 
        /* if the user has specified what the root device should be
           then we don't touch booted_device or boothowto... */
@@ -542,7 +782,6 @@
        }
 }
 
-
 int
 raidsize(dev_t dev)
 {
@@ -3228,7 +3467,7 @@
 #endif
 
 RF_ConfigSet_t *
-rf_create_auto_sets(RF_AutoConfig_t *ac_list)
+rf_create_auto_sets(RF_AutoConfig_t *ac_list, RF_ConfigSet_t *start_cfg)
 {
        RF_AutoConfig_t *ac;
        RF_ConfigSet_t *config_sets;
@@ -3236,7 +3475,7 @@
        RF_AutoConfig_t *ac_next;
 
 
-       config_sets = NULL;
+       config_sets = start_cfg;
 
        /* Go through the AutoConfig list, and figure out which components
           belong to what sets.  */
@@ -3281,9 +3520,18 @@
                                }
                                cset->ac = ac;
                                ac->next = NULL;
-                               cset->next = config_sets;
                                cset->rootable = 0;
-                               config_sets = cset;
+
+                               if (start_cfg == NULL) { /* "initial setup" 
case - order does not mather ... */
+                                       cset->next = config_sets;
+                                       config_sets = cset;
+                               } else { /* append new sets to end of list - 
needed for layered devices that are added here ... */
+                                       RF_ConfigSet_t *cs;
+
+                                       cset->next = NULL;
+                                       for (cs = config_sets; cs->next != 
NULL; cs = cs->next);
+                                       cs->next = cset;
+                               }
                        }
                }
                ac = ac_next;
@@ -3451,7 +3699,8 @@
        }
        /* otherwise, all is well, and we've got enough to take a kick
           at autoconfiguring this set */
-       return(1);
+        if (num_missing > 0) return (1);
+       return(2);
 }
 
 void

>Unformatted:
        
        


Home | Main Index | Thread Index | Old Index