NetBSD-Bugs archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
kern/46895: raidframe autoconfiguration cannot handle layered devices
>Number: 46895
>Category: kern
>Synopsis: raidframe autoconfiguration cannot handle layered devices
>Confidential: no
>Severity: serious
>Priority: high
>Responsible: kern-bug-people
>State: open
>Class: change-request
>Submitter-Id: net
>Arrival-Date: Mon Sep 03 12:05:00 +0000 2012
>Originator: Wolfgang Stukenbrock
>Release: NetBSD 5.1_STABLE - also current
>Organization:
Dr. Nagler & Company GmbH
>Environment:
System: NetBSD s012 4.0 NetBSD 4.0 (NSW-S012) #12: Tue Jun 19 11:15:19 CEST
2012 ncadmin@s012:/usr/src/sys/arch/amd64/compile/NSW-S012 amd64
Architecture: x86_64
Machine: amd64
>Description:
As reported for 4.0 in PR-39784 the raidframe autoconfig code is still
not able to reconstruct layered raid devices
on reboot automatically. This is very ugly in raid1/0 combinations to
archive lagre devices.
The old patch inside of 39784 was not able to handle GPT partitions
correctly. The patch below now handles GPT
partition and is able to create filesystems larger 2TB on layered
raid-devices on any depth.
The only restriction is, that all components of a raid-device should be
on the same depth-level in order to avoid
possible confusion during autoconfiguration. (See comment in patch for
further details ...)
>How-To-Repeat:
Setup a layered raid-device - e.g. a stripe of two mirrors with
autoconfiguration turned on and reboot the system.
The two mirrors get configured again, the strip of them will no be
configured and the mount of the filesystems
on the stripe will fail.
>Fix:
The following patch adds autoconfiguration for layered raid-devices.
--- rf_netbsdkintf.c.orig 2012-06-27 13:15:11.000000000 +0200
+++ rf_netbsdkintf.c 2012-08-29 11:52:45.000000000 +0200
@@ -214,6 +214,9 @@
static void raid_attach(struct device *, struct device *, void *);
static int raid_detach(struct device *, int);
+static RF_AutoConfig_t *rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev,
struct vnode *vp,
+ const char *cname, RF_SectorCount_t size, uint64_t numsecs, unsigned
secsize);
+
static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
daddr_t, daddr_t);
static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
@@ -319,7 +322,7 @@
void rf_buildroothack(RF_ConfigSet_t *);
RF_AutoConfig_t *rf_find_raid_components(void);
-RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
+RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *, RF_ConfigSet_t *);
static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
static int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
@@ -436,7 +439,7 @@
ac_list = rf_find_raid_components();
/* 2. Sort them into their respective sets. */
- config_sets = rf_create_auto_sets(ac_list);
+ config_sets = rf_create_auto_sets(ac_list, NULL);
/*
* 3. Evaluate each set andconfigure the valid ones.
@@ -447,11 +450,168 @@
return 1;
}
+static int rf_buildroot_sub(RF_ConfigSet_t *cset, int *r_id, int level)
+{
+ int retcode;
+
+ if (cset->ac->clabel->autoconfigure != 1) {
+/* force dropping of no-autoconfigure devices from list as soon as possible */
+ rf_release_all_vps(cset);
+ return -2;
+ }
+ if (rf_have_enough_components(cset) >= level) {
+ if ((retcode = rf_auto_config_set(cset, r_id)) == 0) {
+ aprint_debug("raid%d: configured ok\n", *r_id);
+ /* cleanup */
+ if (cset->rootable)
+ retcode = 1;
+ rf_cleanup_config_set(cset);
+ return retcode;
+ }
+ printf("Autoconfig failed with code %d for raid%d\n", retcode,
*r_id);
+ rf_release_all_vps(cset);
+ return -2;
+ }
+ return -1;
+}
+
+static RF_AutoConfig_t *
+rf_buildroot_search(int raidID)
+{
+ int bmajor, bminor, error, i;
+ dev_t bdev;
+ struct vnode *vp;
+ struct disklabel label;
+ RF_AutoConfig_t *ac_list = NULL;
+ uint64_t numsecs;
+ unsigned secsize;
+ struct raid_softc *rs;
+
+ KASSERT(raidID >= 0 && raidID < numraid);
+ rs = &raid_softc[raidID];
+
+ if (rs->sc_dkdev.dk_nwedges != 0)
+ { // we have wedges and may not use the normal partion information ...
+ struct dkwedge_list wl;
+ struct dkwedge_info *wi;
+
+// XXX the interface for wedges is "strange" - sorry, but this is true
+// XXX need to allocate memory, because cannot access the structures itself ...
+// XXX need to use the returned strings to search in the array just read for
additional data ....
+// XXX - OK, we do this only at boot time, so it is not time critical
+
+ wl.dkwl_bufsize = sizeof(struct dkwedge_info) * rs->sc_dkdev.dk_nwedges;
+ if ((wi = malloc(wl.dkwl_bufsize, M_RAIDFRAME, M_NOWAIT)) != NULL)
+ {
+ wl.dkwl_buf = wi;
+// remark: we are alone durint boot - so no one may changed the wedges in this
device ...
+ if (dkwedge_list(&rs->sc_dkdev, &wl, NULL) == 0)
+ {
+ for (i = 0; i < wl.dkwl_ncopied; i++)
+ { /* need to find the device_name_to_block_device_major stuff */
+ device_t dv;
+
+ if (strcmp("raidframe", wi[i].dkw_ptype))
+ {
+ aprint_debug("Autoconfig wedge '%s' on raid device is not
of type '%s' - ignored\n", wi[i].dkw_wname, wi[i].dkw_ptype);
+ continue;
+ }
+ bmajor = devsw_name2blk(wi[i].dkw_devname, NULL, 0);
+ if ((dv = dkwedge_find_by_wname(wi[i].dkw_wname)) == NULL)
continue;
+ bminor = minor(device_unit(dv));
+ bdev = makedev(bmajor, bminor);
+ if (bdevvp(bdev, &vp))
+ {
+ printf("RAID-autoconfig: can't alloc vnode for wedge '%s'
on just configured RAID device raid%d - ignored\n",
+ wi[i].dkw_wname, raidID);
+ continue;
+ }
+ if ((error = VOP_OPEN(vp, FREAD, NOCRED)) != 0)
+ {
+ printf("RAID-autoconfig: failed to open wedge '%s' on
just configured RAID device raid%d (%d) - ignored\n",
+ wi[i].dkw_wname, raidID, error);
+ goto wedge_drop;
+ }
+ if ((error = getdisksize(vp, &numsecs, &secsize)) != 0)
+ {
+ printf("RAID-autoconfig: failed to get size information
for wedge '%s' on just configured RAID device raid%d (%d) - ignored\n",
+ wi[i].dkw_wname, raidID, error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
+ wedge_drop:
+ vput(vp);
+ continue;
+ }
+ ac_list = rf_get_component(ac_list, bdev, vp,
wi[i].dkw_wname, wi[i].dkw_size, numsecs, secsize);
+ }
+ }
+ free(wi, M_RAIDFRAME);
+ }
+ return ac_list;
+ }
+
+// remark: it is to complex to use our interface routines directly, because
we want the open() to read in the label ...
+ bmajor = devsw_name2blk("raid", NULL, 0);
+ bdev = MAKEDISKDEV(bmajor, raidID, RAW_PART);
+ if (bdevvp(bdev, &vp))
+ panic("RAID can't alloc vnode in rf_buildroot_search()");
+
+ if ((error = VOP_OPEN(vp, FREAD, NOCRED)) != 0)
+ {
+ printf("RAID-autoconfig: failed to open just configured RAID device
raid%d (%d) - ignored\n", raidID, error);
+ vput(vp);
+ return NULL;
+ }
+ /* Ok, the disk exists. Go get the size and disklabel. */
+ if ((error = getdisksize(vp, &numsecs, &secsize)) != 0)
+ {
+ printf("RAID-autoconfig: failed to get size information for just
configured RAID device raid%d (%d) - ignored\n",
+ raidID, error);
+ }
+ else if ((error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED)) != 0)
+ {
+ /*
+ * XXX can't happen - open() would
+ * have errored out (or faked up one)
+ */
+ if (error != ENOTTY)
+ printf("RAIDframe: can't get disk-label for dev raid%d (%d)\n",
raidID, error);
+ }
+
+/* don't need this any more. We'll allocate it again a little later if we
really do... */
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
+ vput(vp);
+ if (error) return NULL;
+
+ for (i = 0; i < label.d_npartitions; i++)
+ {
+ char cname[sizeof(ac_list->devname)];
+
+ /* We only support partitions marked as RAID */
+ if (label.d_partitions[i].p_fstype != FS_RAID) continue;
+
+ bdev = MAKEDISKDEV(bmajor, raidID, i);
+ if (bdevvp(bdev, &vp)) panic("RAID can't alloc vnode");
+
+ if ((error = VOP_OPEN(vp, FREAD, NOCRED)) != 0)
+ { /* Whatever... */
+ vput(vp);
+ continue;
+ }
+ snprintf(cname, sizeof(cname), "raid%d%c", raidID, 'a' + i);
+ ac_list = rf_get_component(ac_list, bdev, vp, cname,
label.d_partitions[i].p_size, numsecs, secsize);
+ }
+ return ac_list;
+}
+
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
RF_ConfigSet_t *cset;
RF_ConfigSet_t *next_cset;
+ RF_ConfigSet_t *prev_cset;
+ RF_AutoConfig_t *ac_list;
int retcode;
int raidID;
int rootID;
@@ -459,34 +619,114 @@
int num_root;
char *devname;
+
rootID = 0;
num_root = 0;
+config_again: /* first step: configure only "complete" devices */
cset = config_sets;
+ prev_cset = NULL;
while(cset != NULL ) {
next_cset = cset->next;
- if (rf_have_enough_components(cset) &&
- cset->ac->clabel->autoconfigure==1) {
- retcode = rf_auto_config_set(cset,&raidID);
- if (!retcode) {
- aprint_debug("raid%d: configured ok\n", raidID);
- if (cset->rootable) {
- rootID = raidID;
- num_root++;
- }
- } else {
- /* The autoconfig didn't work :( */
- aprint_debug("Autoconfig failed with code %d
for raid%d\n", retcode, raidID);
- rf_release_all_vps(cset);
+ retcode = rf_buildroot_sub(cset, &raidID, 2); /* only complete
raid devices ... */
+ if (retcode <= -2) { /* aproach to configure failed in
raidframe code - drop it - cannot succeed later ... */
+ if (prev_cset == NULL) /* this set is dead - remove it
from the list ... */
+ config_sets = next_cset;
+ else
+ prev_cset->next = next_cset;
+ rf_cleanup_config_set(cset);
+ } else if (retcode >= 0) {
+ if (retcode >= 1) {
+ rootID = raidID;
+ num_root++;
}
- } else {
- /* we're not autoconfiguring this set...
- release the associated resources */
- rf_release_all_vps(cset);
+ if (prev_cset == NULL) /* this set has been configured
and the config data is gone - remove it from the list ... */
+ config_sets = next_cset;
+ else
+ prev_cset->next = next_cset;
+ if ((ac_list = rf_buildroot_search(raidID)) != NULL) {
+ add_new_components:
+ /*
+ * XXXXXX
+ *
+ * We may have a problem with our new config list ...
+ *
+ * We may not add any component to the remaining sets if we have
already configured that one.
+ * Otherwise we will add a new set information for the raid device
already configured by this routine.
+ * This can only happen, if some components are layered raid devices
and some not - see below too.
+ *
+ * In worst case, this may lead to a mirror splitted into two degraded
devices.
+ *
+ * This problem can only occure if there are failed (missing)
components, because if all components
+ * are available, the first loop will handle everything and there this
problem is only be relevant for spare devices.
+ * (in some setups hot-spare devices may get dropped from a raid during
autoconfig if they come from a higher layer-level ...)
+ *
+ * On the other hand, we are talking only about layered raid devices
here!
+ * And the described problem can only occure with such kind of
components - components that reside on a raid device.
+ * The only realy usefull setup here (from my oppinion) would be a
raid0 (stripe) device that is constructed from some
+ * raid 1 devices (and perhaps 4 or 5 - possible but less usefull) in
order to increase the total size of the raid device.
+ * (remark: concatenation is not avaliable in raidframe ...)
+ * And a raid 0 device has no spare disks at all.
+ *
+ * If it is intended to catch all (strange) setup-cases, we need an
addtional list of already configured devices and
+ * drop additional components for those devices from the list. Dropping
should be done in a verbose way, because we
+ * are gooing to change the raid configuration during boot here!
+ * For now this list is not implemented.
+ *
+ * In order to avoid configuration of layered raid devices prior all
"lower-level" raid devices have been configured, we
+ * will add new devices to the end of the list of all sets in
rf_create_auto_sets() - see change below.
+ * So we have a stable and reliable autoconfig here, if all components
of a raid device are at the same layer-level.
+ * And we can build up as much levels we ever want to and autoconfigure
them during startup.
+ *
+ * W. Stukenbrock
+ */
+ config_sets = rf_create_auto_sets(ac_list,
config_sets);
+ goto config_again;
+ }
+ } else { /* ignore missing component errors here - retry later
... */
+ prev_cset = cset;
+ }
+ cset = next_cset;
+ }
+ /* second step, try to configure the rest - on failure,
give them an additional try later */
+ /* perhaps some additional components are found and we
will succeed later ... */
+ cset = config_sets;
+ prev_cset = NULL;
+ while(cset != NULL ) {
+ next_cset = cset->next;
+ retcode = rf_buildroot_sub(cset, &raidID, 1); /* all raid
devices that may have a chance ... */
+ if (retcode <= -2) { /* aproach to configure failed in
raidframe code - drop it - cannot succeed later ... */
+ if (prev_cset == NULL) /* this set is dead - remove it
from the list ... */
+ config_sets = next_cset;
+ else
+ prev_cset->next = next_cset;
+ rf_cleanup_config_set(cset);
+ } else if (retcode >= 0) {
+ if (retcode >= 1) {
+ rootID = raidID;
+ num_root++;
+ }
+ if (prev_cset == NULL) /* this set has been configured
and the config data is gone - remove it from the list ... */
+ config_sets = next_cset;
+ else
+ prev_cset->next = next_cset;
+ if ((ac_list = rf_buildroot_search(raidID)) != NULL)
goto add_new_components;
+ } else { /* ignore to few component errors here - we retry
later ... */
+ prev_cset = cset; /* not complete - just try next one
in this loop ... */
}
- /* cleanup */
- rf_cleanup_config_set(cset);
cset = next_cset;
}
+/* if we still have some config sets now, that cannot be configured at all -
still missing components
+ * we can just dropt them now, because we wuold have configured them in the
second loop, because we always start over
+ * if we find additional raid components
+ */
+ /* third step, try to configure the rest - if still not
possible, drop it */
+ /* it makes no sense to search for additonal components
anymore - has already been done in the second step */
+ while((cset = config_sets) != NULL ) {
+ config_sets = cset->next;
+ printf("Autoconfig failed with due to missing components for
raid%d\n", raidID);
+ rf_release_all_vps(cset);
+ rf_cleanup_config_set(cset);
+ }
/* if the user has specified what the root device should be
then we don't touch booted_device or boothowto... */
@@ -542,7 +782,6 @@
}
}
-
int
raidsize(dev_t dev)
{
@@ -3228,7 +3467,7 @@
#endif
RF_ConfigSet_t *
-rf_create_auto_sets(RF_AutoConfig_t *ac_list)
+rf_create_auto_sets(RF_AutoConfig_t *ac_list, RF_ConfigSet_t *start_cfg)
{
RF_AutoConfig_t *ac;
RF_ConfigSet_t *config_sets;
@@ -3236,7 +3475,7 @@
RF_AutoConfig_t *ac_next;
- config_sets = NULL;
+ config_sets = start_cfg;
/* Go through the AutoConfig list, and figure out which components
belong to what sets. */
@@ -3281,9 +3520,18 @@
}
cset->ac = ac;
ac->next = NULL;
- cset->next = config_sets;
cset->rootable = 0;
- config_sets = cset;
+
+ if (start_cfg == NULL) { /* "initial setup"
case - order does not mather ... */
+ cset->next = config_sets;
+ config_sets = cset;
+ } else { /* append new sets to end of list -
needed for layered devices that are added here ... */
+ RF_ConfigSet_t *cs;
+
+ cset->next = NULL;
+ for (cs = config_sets; cs->next !=
NULL; cs = cs->next);
+ cs->next = cset;
+ }
}
}
ac = ac_next;
@@ -3451,7 +3699,8 @@
}
/* otherwise, all is well, and we've got enough to take a kick
at autoconfiguring this set */
- return(1);
+ if (num_missing > 0) return (1);
+ return(2);
}
void
>Unformatted:
Home |
Main Index |
Thread Index |
Old Index