Subject: Re: Alternative approach for interface events
To: tech-net@netbsd.org, jdolecek@netbsd.org
From: Pavel Cahyna <pavel.cahyna@st.cuni.cz>
List: tech-net
Date: 09/21/2004 01:44:22
On Mon, 20 Sep 2004 21:23:54 +0000, Pavel Cahyna wrote:

> (Currently knote(9) can only add events to a kqueue opened from
> userspace, but extending this API to be able to run arbitrary kernel
> functions instead would probably be possible. I have an (untested)
> patch which should do exactly this.)

Here is the patch. It has been compile-tested and all of the kqueue
regression tests complete successfully. "options KCONT" is needed.

It renames kcont_run() to kcont_run_all() and adds a new kcont_run()
which runs only a single kcont (I believe this is a reasonable change to
the kcont API). It also adds a kn_kcont field to struct knote. Set this
field to any kcont you want (but always create the kcont with env_arg
set to the knote itself) and it will be run whenever knote(9) is called
on that knote. The kqueue code has been changed to set kn_kcont to a
kcont which enqueues the knote on its kqueue.
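To illustrate, an in-kernel consumer could look roughly like this (a
rough, untested sketch; my_event_fn is a made-up name, not part of the
patch, and note that the kqueue code below asserts that knotes it
registered keep kq_func as their kcont, so such a consumer would have
to attach its knotes through its own path):

static void
my_event_fn(void *obj, void *env, int status)
{
	struct knote *kn = env;	/* env_arg is always the knote itself */

	/* react to the event, e.g. wakeup() a worker thread */
}

	/* instead of letting kqueue_register() install kq_func: */
	kn->kn_kcont = kcont(pool_get(&kc_pool, PR_WAITOK),
	    my_event_fn, kn, KC_IPL_IMMED);

When knote(9) fires on kn, KNOTE_ACTIVATE() now calls
kcont_run(kn->kn_kcont, NULL, 0, KC_IPL_IMMED), which ends up in
my_event_fn() rather than in knote_enqueue().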

Index: kern/kern_event.c
===================================================================
RCS file: /home/pavel/cvs/src/sys/kern/kern_event.c,v
retrieving revision 1.19
retrieving revision 1.19.1500.4
diff -u -r1.19 -r1.19.1500.4
--- kern/kern_event.c	14 Feb 2004 11:56:28 -0000	1.19
+++ kern/kern_event.c	20 Sep 2004 22:00:33 -0000	1.19.1500.4
@@ -1,4 +1,4 @@
-/*	$NetBSD: kern_event.c,v 1.19 2004/02/14 11:56:28 jdolecek Exp $	*/
+/*	$NetBSD: kern_event.c,v 1.19.1500.4 2004/09/20 22:00:33 pavel Exp $	*/
 /*-
  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
  * All rights reserved.
@@ -28,7 +28,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.19 2004/02/14 11:56:28 jdolecek Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.19.1500.4 2004/09/20 22:00:33 pavel Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -53,6 +53,7 @@
 #include <sys/filedesc.h>
 #include <sys/sa.h>
 #include <sys/syscallargs.h>
+#include <sys/kcont.h>
 
 static int	kqueue_scan(struct file *fp, size_t maxevents,
 		    struct kevent *ulistp, const struct timespec *timeout,
@@ -113,13 +114,16 @@
 #define	KNOTE_ACTIVATE(kn)						\
 do {									\
 	kn->kn_status |= KN_ACTIVE;					\
-	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
-		knote_enqueue(kn);					\
+	if ((kn->kn_status & KN_DISABLED) == 0)				\
+		kcont_run(kn->kn_kcont, NULL, 0, KC_IPL_IMMED);		\
 } while(0)
 
 #define	KN_HASHSIZE		64		/* XXX should be tunable */
 #define	KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
 
+#define KN_KC_ASSERT(kn)	KASSERT((kn->kn_kcont->kc_fn == kq_func) && \
+				(kn->kn_kcont->kc_env_arg == kn))
+
 extern const struct filterops sig_filtops;
 
 /*
@@ -151,6 +155,15 @@
 static int		user_kfilterc;		/* current offset */
 static int		user_kfiltermaxc;	/* max size so far */
 
+static void
+kq_func(void * obj, void * env, int status)
+{
+	struct knote * kn = env;
+	KN_KC_ASSERT(kn);
+	if ((kn->kn_status & KN_QUEUED) == 0)
+		knote_enqueue(kn);			
+}
+
 /*
  * kqueue_init:
  *
@@ -819,6 +832,8 @@
 			kev->fflags = 0;
 			kev->data = 0;
 			kn->kn_kevent = *kev;
+			kn->kn_kcont = kcont(pool_get(&kc_pool, PR_WAITOK), 
+					     kq_func, kn, KC_IPL_IMMED);
 
 			knote_attach(kn, fdp);
 			if ((error = kfilter->filtops->f_attach(kn)) != 0) {
@@ -836,6 +851,7 @@
 			kn->kn_sfflags = kev->fflags;
 			kn->kn_sdata = kev->data;
 			kn->kn_kevent.udata = kev->udata;
+			KN_KC_ASSERT(kn);
 		}
 
 		s = splsched();
@@ -861,9 +877,8 @@
 	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
 		s = splsched();
 		kn->kn_status &= ~KN_DISABLED;
-		if ((kn->kn_status & KN_ACTIVE) &&
-		    ((kn->kn_status & KN_QUEUED) == 0))
-			knote_enqueue(kn);
+		if (kn->kn_status & KN_ACTIVE)
+			kcont_run(kn->kn_kcont, NULL, 0, KC_IPL_IMMED);
 		splx(s);
 	}
 
@@ -1193,6 +1208,8 @@
 			if (kq == kn->kn_kq) {
 				kn->kn_fop->f_detach(kn);
 				FILE_UNUSE(kn->kn_fp, p);
+				KN_KC_ASSERT(kn);
+				pool_put(&kc_pool, kn->kn_kcont);
 				pool_put(&knote_pool, kn);
 				*knp = kn0;
 			} else {
@@ -1210,6 +1227,8 @@
 				if (kq == kn->kn_kq) {
 					kn->kn_fop->f_detach(kn);
 					/* XXX non-fd release of kn->kn_ptr */
+					KN_KC_ASSERT(kn);
+					pool_put(&kc_pool, kn->kn_kcont);
 					pool_put(&knote_pool, kn);
 					*knp = kn0;
 				} else {
@@ -1384,6 +1403,8 @@
 		knote_dequeue(kn);
 	if (kn->kn_fop->f_isfd)
 		FILE_UNUSE(kn->kn_fp, p);
+	KN_KC_ASSERT(kn);
+	pool_put(&kc_pool, kn->kn_kcont);
 	pool_put(&knote_pool, kn);
 }
 
Index: kern/kern_kcont.c
===================================================================
RCS file: /home/pavel/cvs/src/sys/kern/kern_kcont.c,v
retrieving revision 1.9
retrieving revision 1.9.1500.3
diff -u -r1.9 -r1.9.1500.3
--- kern/kern_kcont.c	27 Mar 2004 00:42:38 -0000	1.9
+++ kern/kern_kcont.c	20 Sep 2004 21:54:32 -0000	1.9.1500.3
@@ -1,4 +1,4 @@
-/* $NetBSD: kern_kcont.c,v 1.9 2004/03/27 00:42:38 jonathan Exp $ */
+/* $NetBSD: kern_kcont.c,v 1.9.1500.3 2004/09/20 21:54:32 pavel Exp $ */
 
 /*
  * Copyright 2003 Jonathan Stone.
@@ -37,7 +37,7 @@
 /*
  */
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_kcont.c,v 1.9 2004/03/27 00:42:38 jonathan Exp $ ");
+__KERNEL_RCSID(0, "$NetBSD: kern_kcont.c,v 1.9.1500.3 2004/09/20 21:54:32 pavel Exp $ ");
 
 #include <sys/types.h>
 #include <sys/param.h>
@@ -86,7 +86,7 @@
 /*
  * Pool allocator structure.
  */
-static struct pool kc_pool;
+struct pool kc_pool;
 
 /*
  * Process-context continuation queue.
@@ -269,6 +269,48 @@
 	kcont_enqueue_atomic(kcq, kc);
 }
 
+/*
+ * Call (or hand off) one continuation function.  If the
+ * caller-provided IPL is the same as the requested IPL, deliver the
+ * callback.  If the caller-provided IPL is higher than the requested
+ * callback IPL, re-enqueue the continuation to a lower-priority
+ * queue.
+ */
+void
+kcont_run(struct kc *kc, void *obj, int status, int curipl)
+{
+
+
+	/* If execution of kc was already deferred, restore context. */
+	if (kc->kc_flags & KC_DEFERRED) {
+		KASSERT(obj == NULL);
+		obj = kc->kc_obj;
+		status = kc->kc_status;
+	}
+
+	/* Check whether to execute now or to defer. */
+	if (kc->kc_ipl == KC_IPL_IMMED || curipl <= kc->kc_ipl) {
+		int saved_flags = kc->kc_flags; /* XXX see below */
+
+		/* Satisfy our raison d'e^tre */
+		(*kc->kc_fn)(obj, kc->kc_env_arg, status);
+
+		/*
+		 * We must not touch (*kc) after calling
+		 * (*kc->kc_fn), unless we were specifically
+		 * asked to free it.  The memory for (*kc) may
+		 * be a sub-field of some other object (for example,
+		 * of kc->kc_env_arg) and (*kc_fn)() may already
+		 * have freed it by the time we get here.  So save
+		 * kc->kc_flags (above) and use that saved copy
+		 * to test for auto-free.
+		 */
+		if (saved_flags & KC_AUTOFREE)
+			pool_put(&kc_pool, kc);
+	} else {
+		kcont_defer(kc, obj, status);
+	}
+}
 
 /*
  * Run through a list of continuations, calling (or handing off)
@@ -279,42 +321,12 @@
  * callback IPL, re-enqueue the continuation to a lower-priority queue.
  */
 void
-kcont_run(kcq_t *kcq, void *obj, int status, int curipl)
+kcont_run_all(kcq_t *kcq, void *obj, int status, int curipl)
 {
 	struct kc *kc;
 
-	while ((kc = kcont_dequeue_atomic(kcq)) != NULL) {
-
-		/* If execution of kc was already deferred, restore context. */
-		if (kc->kc_flags & KC_DEFERRED) {
-			KASSERT(obj == NULL);
-			obj = kc->kc_obj;
-			status = kc->kc_status;
-		}
-
-		/* Check whether to execute now or to defer. */
-		if (kc->kc_ipl == KC_IPL_IMMED || curipl <= kc->kc_ipl) {
-			int saved_flags = kc->kc_flags; /* XXX see below */
-
-			/* Satisfy our raison d'e^tre */
-			(*kc->kc_fn)(obj, kc->kc_env_arg, status);
-
-			/*
-			 * We must not touch (*kc) after calling
-			 * (*kc->kc_fn), unless we were specifically
-			 * asked to free it.  The memory for (*kc) may
-			 * be a sub-field of some other object (for example,
-			 * of kc->kc_env_arg) and (*kc_fn)() may already
-			 * have freed it by the time we get here.  So save
-			 * kc->kc_flags (above) and use that saved copy
-			 * to test for auto-free.
-			 */
-			if (saved_flags & KC_AUTOFREE)
-				pool_put(&kc_pool, kc);
-		} else {
-			kcont_defer(kc, obj, status);
-		}
-	}
+	while ((kc = kcont_dequeue_atomic(kcq)) != NULL) 
+		kcont_run(kc, obj, status, curipl);
 }
 
 #ifdef __HAVE_GENERIC_SOFT_INTERRUPTS
@@ -325,21 +337,21 @@
 kcont_run_softclock(void *arg)
 {
 
-	kcont_run((struct kcqueue *)arg, NULL, 0, KC_IPL_DEFER_SOFTCLOCK);
+	kcont_run_all((struct kcqueue *)arg, NULL, 0, KC_IPL_DEFER_SOFTCLOCK);
 }
 
 static void
 kcont_run_softnet(void *arg)
 {
 
-	kcont_run((struct kcqueue *)arg, NULL, 0, KC_IPL_DEFER_SOFTNET);
+	kcont_run_all((struct kcqueue *)arg, NULL, 0, KC_IPL_DEFER_SOFTNET);
 }
 
 static void
 kcont_run_softserial(void *arg)
 {
 
-	kcont_run((struct kcqueue *)arg, NULL, 0, KC_IPL_DEFER_SOFTSERIAL);
+	kcont_run_all((struct kcqueue *)arg, NULL, 0, KC_IPL_DEFER_SOFTSERIAL);
 }
 #endif /* __HAVE_GENERIC_SOFT_INTERRUPTS */
 
@@ -366,7 +378,7 @@
 		status = ltsleep(&kcq_process_ctxt, PCATCH, "kcont", hz, NULL);
 		if (status != 0 && status != EWOULDBLOCK)
 			break;
-		kcont_run(&kcq_process_ctxt, NULL, 0, KC_IPL_DEFER_PROCESS);
+		kcont_run_all(&kcq_process_ctxt, NULL, 0, KC_IPL_DEFER_PROCESS);
 	}
 	kthread_exit(0);
 }
Index: sys/event.h
===================================================================
RCS file: /home/pavel/cvs/src/sys/sys/event.h,v
retrieving revision 1.12
retrieving revision 1.12.1500.1
diff -u -r1.12 -r1.12.1500.1
--- sys/event.h	22 Feb 2004 17:45:26 -0000	1.12
+++ sys/event.h	20 Sep 2004 13:12:46 -0000	1.12.1500.1
@@ -1,4 +1,4 @@
-/*	$NetBSD: event.h,v 1.12 2004/02/22 17:45:26 jdolecek Exp $	*/
+/*	$NetBSD: event.h,v 1.12.1500.1 2004/09/20 13:12:46 pavel Exp $	*/
 /*-
  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
  * All rights reserved.
@@ -184,6 +184,7 @@
 	} kn_ptr;
 	const struct filterops	*kn_fop;
 	void 			*kn_hook;
+	struct kc		*kn_kcont;
 
 #define	KN_ACTIVE	0x01			/* event has been triggered */
 #define	KN_QUEUED	0x02			/* event is on queue */
Index: sys/kcont.h
===================================================================
RCS file: /home/pavel/cvs/src/sys/sys/kcont.h,v
retrieving revision 1.3
retrieving revision 1.3.1500.4
diff -u -r1.3 -r1.3.1500.4
--- sys/kcont.h	25 Mar 2004 23:02:58 -0000	1.3
+++ sys/kcont.h	20 Sep 2004 21:54:32 -0000	1.3.1500.4
@@ -1,4 +1,4 @@
-/* $NetBSD: kcont.h,v 1.3 2004/03/25 23:02:58 enami Exp $ */
+/* $NetBSD: kcont.h,v 1.3.1500.4 2004/09/20 21:54:32 pavel Exp $ */
 
 /*
  * Copyright (c) 2003 The NetBSD Foundation, Inc.
@@ -59,6 +59,8 @@
  *    additional state beyond the "slept-upon" object.
  */
 
+extern struct pool kc_pool;
+
 typedef struct kc {
 	SIMPLEQ_ENTRY(kc) kc_next;	/* next kc in object's callback list */
 
@@ -150,6 +152,14 @@
  */
 void	kcont_enqueue(kcq_t * /*kcqueue*/, struct kc * /*kc*/);
 
+/*
+ * Execute (or defer) one continuation, either running the
+ * continuation with the given object and status; or saving the object
+ * and status and deferring the kcont to a software-interrupt priority
+ * kc_queue.
+ */
+void	kcont_run(kc_t * /*kc*/, void * /*obj*/,
+	    int /*status*/, int /*cur_ipl*/);
 
 /*
  * Execute (or defer) all continuations on an object's kcqueue
@@ -158,7 +168,7 @@
  * with the given object and status; or saving the object and status
  * and deferring the kconts to a software-interrupt priority kc_queue.
  */
-void	kcont_run(kcq_t * /*kcq*/, void * /*obj*/,
+void	kcont_run_all(kcq_t * /*kcq*/, void * /*obj*/,
 	    int /*status*/, int /*cur_ipl*/);
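
(For existing users of the queue-based interface, the change is only a
rename; a hypothetical driver that previously did

	kcont_run(&sc->sc_kcq, sc, 0, KC_IPL_DEFER_SOFTNET);

would now call

	kcont_run_all(&sc->sc_kcq, sc, 0, KC_IPL_DEFER_SOFTNET);

with unchanged arguments. sc->sc_kcq is just an illustrative softc
member, not something from the patch.)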