Subject: Re: RAIDframe: notify me when a drive fails?
To: Greg Oster <oster@cs.usask.ca>
From: Geert Hendrickx <ghen@netbsd.org>
List: netbsd-users
Date: 02/28/2006 19:06:23
On Tue, Feb 28, 2006 at 06:21:02PM +0100, Geert Hendrickx wrote:
> It doesn't have to be a separate daemon per se. Something I can (easily)
> check via a cron job is OK. The grep-for-failed approach works, but it
> would be more elegant if, e.g., raidctl -s returned an exit status >0
> when something is wrong and needs human intervention. Maybe combined
> with a -q flag (for no output), it would be as easy as
>
> raidctl -s -q raid0 || mail -s "RAID problem" ...
>
> in an hourly cronjob.
This simple patch (against 3.0) makes "raidctl -s" exit with the number of
failed components and/or spares in the array (i.e., normally 0).
Sorry if the style is bad, but you get the idea. :-)
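
With the patch applied, an hourly check could look something like this
in /etc/crontab (the patch only adds the exit code, not the -q flag, so
output is redirected instead; the schedule and address are placeholders):

    0 * * * *  root  raidctl -s raid0 >/dev/null || mail -s "RAID problem" root </dev/null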
Geert
[attachment: raidctl.diff]
Index: raidctl.8
===================================================================
RCS file: /pub/NetBSD-CVS/src/sbin/raidctl/raidctl.8,v
retrieving revision 1.49
diff -u -r1.49 raidctl.8
--- raidctl.8 28 Feb 2005 22:03:05 -0000 1.49
+++ raidctl.8 28 Feb 2006 17:59:51 -0000
@@ -253,6 +253,8 @@
.It Fl s Ar dev
Display the status of the RAIDframe device for each of the components
and spares.
+If any component or spare has failed, a non-zero exit code is returned;
+more specifically, the exit code is the number of failed disks.
.It Fl S Ar dev
Check the status of parity re-writing, component reconstruction, and
component copyback.
Index: raidctl.c
===================================================================
RCS file: /pub/NetBSD-CVS/src/sbin/raidctl/raidctl.c,v
retrieving revision 1.36.2.1
diff -u -r1.36.2.1 raidctl.c
--- raidctl.c 28 May 2005 13:50:16 -0000 1.36.2.1
+++ raidctl.c 28 Feb 2006 17:59:51 -0000
@@ -72,7 +72,7 @@
void do_ioctl(int, u_long, void *, const char *);
static void rf_configure(int, char*, int);
static const char *device_status(RF_DiskStatus_t);
-static void rf_get_device_status(int);
+static int	rf_get_device_status(int);
static void rf_output_configuration(int, const char *);
static void get_component_number(int, char *, int *, int *);
static void rf_fail_disk(int, char *, int);
@@ -113,6 +113,7 @@
int fd;
int force;
int openmode;
+ int returncode;
num_options = 0;
action = 0;
@@ -122,6 +123,7 @@
is_clean = 0;
force = 0;
openmode = O_RDWR; /* default to read/write */
+ returncode = 0;
while ((ch = getopt(argc, argv, "a:A:Bc:C:f:F:g:GiI:l:r:R:sSpPuv"))
!= -1)
@@ -312,7 +314,7 @@
if (do_output)
rf_output_configuration(fd, dev_name);
else
- rf_get_device_status(fd);
+ returncode = rf_get_device_status(fd);
break;
case RAIDFRAME_REBUILD_IN_PLACE:
rebuild_in_place(fd, component);
@@ -328,7 +330,7 @@
}
close(fd);
- exit(0);
+ exit(returncode);
}
void
@@ -397,15 +399,17 @@
/* NOTREACHED */
}
-static void
+static int
rf_get_device_status(int fd)
{
RF_DeviceConfig_t device_config;
void *cfg_ptr;
int is_clean;
int i;
+ int failed_disks;
cfg_ptr = &device_config;
+ failed_disks = 0;
do_ioctl(fd, RAIDFRAME_GET_INFO, &cfg_ptr, "RAIDFRAME_GET_INFO");
@@ -413,6 +417,9 @@
for(i=0; i < device_config.ndevs; i++) {
printf("%20s: %s\n", device_config.devs[i].devname,
device_status(device_config.devs[i].status));
+		if (device_config.devs[i].status == rf_ds_failed) {
+			failed_disks++;
+		}
}
if (device_config.nspares > 0) {
printf("Spares:\n");
@@ -420,6 +427,9 @@
printf("%20s: %s\n",
device_config.spares[i].devname,
device_status(device_config.spares[i].status));
+			if (device_config.spares[i].status == rf_ds_failed) {
+				failed_disks++;
+			}
}
} else {
printf("No spares.\n");
@@ -458,6 +468,7 @@
printf("Parity status: DIRTY\n");
}
check_status(fd,0);
+ return failed_disks;
}
static void
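
A script can also make use of the actual count; a minimal sketch,
assuming the patched raidctl above:

    raidctl -s raid0 >/dev/null
    failed=$?
    if [ $failed -gt 0 ]; then
        echo "raid0: $failed failed component(s) or spare(s)"
    fi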