netbsd-users: Re: RAIDframe: notify me when a drive fails?

Subject: Re: RAIDframe: notify me when a drive fails?
To: Greg Oster <oster@cs.usask.ca>
From: Geert Hendrickx <ghen@netbsd.org>
List: netbsd-users
Date: 02/28/2006 19:06:23
--2oS5YaxWCcQjTEyO
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

On Tue, Feb 28, 2006 at 06:21:02PM +0100, Geert Hendrickx wrote:
> It doesn't have to be a separate daemon per se.  Something I can (easily)
> check for via a cron-job is ok.  The grep-for-failed thing works, but it
> would be more elegant if e.g. raidctl -s would return an exit status >0 if
> something is wrong and needs human intervention.  Maybe combined with a -q
> flag (for no output), it would get as easy as 
> 
>   raidctl -s -q raid0 || mail -s "RAID problem" ...
> 
> in an hourly cronjob.  

This simple patch (against 3.0) makes "raidctl -s" return the number of
failed components and/or spares in the array (i.e., normally 0).  

Sorry if the style is bad, but you get the idea. :-)  

	Geert

--2oS5YaxWCcQjTEyO
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="raidctl.diff"

Index: raidctl.8
===================================================================
RCS file: /pub/NetBSD-CVS/src/sbin/raidctl/raidctl.8,v
retrieving revision 1.49
diff -u -r1.49 raidctl.8
--- raidctl.8	28 Feb 2005 22:03:05 -0000	1.49
+++ raidctl.8	28 Feb 2006 17:59:51 -0000
@@ -253,6 +253,8 @@
 .It Fl s Ar dev
 Display the status of the RAIDframe device for each of the components
 and spares.
+If any component or spare has failed, an exit code > 0 is returned
+(more specifically, the return code indicates the number of failed disks).
 .It Fl S Ar dev
 Check the status of parity re-writing, component reconstruction, and
 component copyback.
Index: raidctl.c
===================================================================
RCS file: /pub/NetBSD-CVS/src/sbin/raidctl/raidctl.c,v
retrieving revision 1.36.2.1
diff -u -r1.36.2.1 raidctl.c
--- raidctl.c	28 May 2005 13:50:16 -0000	1.36.2.1
+++ raidctl.c	28 Feb 2006 17:59:51 -0000
@@ -72,7 +72,7 @@
 void	do_ioctl(int, u_long, void *, const char *);
 static  void rf_configure(int, char*, int);
 static  const char *device_status(RF_DiskStatus_t);
-static  void rf_get_device_status(int);
+int	rf_get_device_status(int);
 static	void rf_output_configuration(int, const char *);
 static  void get_component_number(int, char *, int *, int *);
 static  void rf_fail_disk(int, char *, int);
@@ -113,6 +113,7 @@
 	int fd;
 	int force;
 	int openmode;
+	int returncode;
 
 	num_options = 0;
 	action = 0;
@@ -122,6 +123,7 @@
 	is_clean = 0;
 	force = 0;
 	openmode = O_RDWR;	/* default to read/write */
+	returncode = 0;
 
 	while ((ch = getopt(argc, argv, "a:A:Bc:C:f:F:g:GiI:l:r:R:sSpPuv")) 
 	       != -1)
@@ -312,7 +314,7 @@
 		if (do_output)
 			rf_output_configuration(fd, dev_name);
 		else
-			rf_get_device_status(fd);
+			returncode = rf_get_device_status(fd);
 		break;
 	case RAIDFRAME_REBUILD_IN_PLACE:
 		rebuild_in_place(fd, component);
@@ -328,7 +330,7 @@
 	}
 
 	close(fd);
-	exit(0);
+	exit(returncode);
 }
 
 void
@@ -397,15 +399,17 @@
 	/* NOTREACHED */
 }
 
-static void
+int
 rf_get_device_status(int fd)
 {
 	RF_DeviceConfig_t device_config;
 	void *cfg_ptr;
 	int is_clean;
 	int i;
+	int failed_disks;
 
 	cfg_ptr = &device_config;
+	failed_disks = 0;
 
 	do_ioctl(fd, RAIDFRAME_GET_INFO, &cfg_ptr, "RAIDFRAME_GET_INFO");
 
@@ -413,6 +417,9 @@
 	for(i=0; i < device_config.ndevs; i++) {
 		printf("%20s: %s\n", device_config.devs[i].devname, 
 		       device_status(device_config.devs[i].status));
+		if(device_config.devs[i].status == rf_ds_failed) {
+			failed_disks++;
+		}
 	}
 	if (device_config.nspares > 0) {
 		printf("Spares:\n");
@@ -420,6 +427,9 @@
 			printf("%20s: %s\n",
 			       device_config.spares[i].devname, 
 			       device_status(device_config.spares[i].status));
+			if(device_config.devs[i].status == rf_ds_failed) {
+				failed_disks++;
+			}
 		}
 	} else {
 		printf("No spares.\n");
@@ -458,6 +468,7 @@
 		printf("Parity status: DIRTY\n");
 	}
 	check_status(fd,0);
+	return failed_disks;
 }
 
 static void

--2oS5YaxWCcQjTEyO--