Subject: Prototype kernel continuation-passing for NetBSD
To: None <tech-kern@netbsd.org>
From: Jonathan Stone <jonathan@dsg.stanford.edu>
List: tech-kern
Date: 01/28/2004 13:04:00
The code below is a sample implementation of ``continuation-passing''
for the NetBSD kernel. If you're not familiar with languages that have
functions, closures, and continuations as first-class objects, think
of this as a framework for handling ``callback functions'': specifically
creating, enqueing, managinging, and deferring callbacks to some lower
interrupt priority.

One possible use of these `continuations' is to implement operations
like sendfile() or splice() inside the kernel, using socket callback
functions. If you try this (I have, during an internship at Compaq/DEC
WRL) you quickly find yourself needing to defer operation from one
socket's upcall function to a non-upcall context, in order to frob
state on the `peer' socket. Or see the note in spl(9) about not using
splvm(): this code does so, deliberately, in order to provide a
framework whereby other code need never do so.

Long-term, I have an interest in reworking the NFS code (primarily the
server side, but client-side is interesting too) to not depend on
having a process context to do I/O. The I/O rates achievable today
with high-end hardware (and thus in a couple of years with mid-range
hardware) are high enough that a context-switch per NFS I/O is
prohibitive.  Eventually, I'd like to replace or augment the
process-based I/O notification (processes sleeping on buffers) with a
kcont-based notification mechanism. In that world, I/O requests would
pass a kcont down through the I/O system, and each struct buf would
have a list of continuations to call when I/O on the buffer completes.
That's a ways off yet :-).

I have some immediate uses for a kcont-like interface; so any feedback
on this would be appreciated.



--- /dev/null	Wed Jan 21 00:09:46 2004
+++ sys/sys/kcont.h	Sat Dec 13 00:38:58 2003
@@ -0,0 +1,162 @@
+/* $NetBSD: */
+
+/*
+ * Copyright (c) 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Jonathan Stone, currently employed by Decru, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright 2003 Jonathan Stone.
+ * All rights reserved.
+ */
+
+#include <sys/queue.h>
+
+/*
+ * kcont -- Continuation-passing for BSD kernels.
+ * 
+ * This module defines structures that implement a
+ * continuation-passing model in C which can be used as a
+ * ``callback''-like  alternative to using  a process context to
+ * tsleep() on an object address, waiting for some event or status
+ * change on that object.  Since ANSI  CI provides neither
+ * lexically-nested functions nor heap allocation of activation
+ * records, we implement a continuation as a struct holding:
+ * 1. A function pointer, pointing ot the continuation.
+ * 2. A object pointer: the object being "slept" upon. 
+ * 3. An argument pointer: a pointer to private storage containing
+ *    other arguments to the continuation function, including any
+ *    additional state beyond the "slept-upon" object.
+ */
+
+typedef struct kc {
+	SIMPLEQ_ENTRY(kc) kc_next;	/* next kc in object's callback list */
+
+	void (*kc_fn) (void * /*obj*/, void * /*env_arg*/, int /* status*/);
+
+	void * kc_env_arg;	/* caller-supplied continuation argument */
+	void * kc_obj;		/* saved object, used for deferred calls */
+	int kc_status;		/* saved status, used for  deferred calls */
+  	ushort kc_ipl;		/* IPL at which to call kc_func */
+	ushort kc_flags;	/* Flags, private to kcont implemenation */
+} kc_t;
+
+/* kc_flags values. */
+#define KC_AUTOFREE 0x01	/* This struct kcont * was malloc'ed by
+				 * the kcont module; free it immediately 
+				 * after calling the continuation function.
+				 */
+#define KC_DEFERRED 0x02	/* Deferral has already saved object, status */
+
+/* kcont IPL values. */
+#define KC_IPL_DEFER_PROCESS	0x00	/* Hand off continuation fn to a
+					 * full process context (kthread).
+					 */
+
+#define KC_IPL_DEFER_SOFTCLOCK	0x01
+#define KC_IPL_DEFER_SOFTNET	0x02	/* run continatiuon in an
+					 * IPL_SOFTNET software callout. */
+#define KC_IPL_DEFER_SOFTSERIAL	0x03
+
+#define KC_IPL_IMMED	0x10	/*
+				 * Execute continuation fn immediately,
+				 * in the context of the object event,
+				 * with no completion. Continuation
+				 * assumed to be very quick.
+				 */
+
+/*
+ * Head of a list of pending continuations.
+ * For example, for continuation-based buffer I/O,
+ * add a kcq_t to  struct buf), and pass a kc_t *
+ * down through VFS and strategy layers.
+ */
+typedef SIMPLEQ_HEAD(kcqueue, kc)  kcq_t;
+		    
+
+/*
+ * Prepare a struct kcont continuation 
+ */
+kc_t * kcont(kc_t *,
+	    void (* /*fn*/) __P((void* /*obj*/, void */*arg*/, int /*sts*/)),
+	    void * /*env_arg*/, int /* continue_ipl*/);
+
+kc_t *kcont_malloc(int /*allocmflags*/,
+	    void (* /*fn*/) __P((void * /*obj*/, void */*arg*/, int /*sts*/)),
+	    void * /*env_arg*/, int /* continue_ipl*/);
+
+
+/*
+ * Use a kcont to defer execution to a (presumably) lower interropt
+ * priority.
+ */
+void  kcont_defer(struct kc *kc,
+	    void (* /*fn*/) __P((void* /*obj*/, void */*arg*/, int /*sts*/)),
+	    void * /*obj*/,
+	    void * /*env_arg*/, int /*status*/, int /* continue_ipl*/);
+
+/*
+ * Malloc a kcont to defer execution to a (presumably) lower interropt
+ * priority.
+ */
+void  kcont_defer_malloc(int, /* mallocflags */
+	    void (* /*fn*/) __P((void* /*obj*/, void */*arg*/, int /*sts*/)),
+	    void * /*obj*/,
+	    void * /*env_arg*/, int /*status*/, int /*continue_ipl*/);
+
+/*
+ * Enqueue a struct kcont * into the kc_queue* field of some kernel struct.
+ * The struct kcont * argument is typically passed as an argument to
+ * a functio which sequests an asynchronous operation on that kernel object.
+ * When the operation completes, or the  object otherwise decides to
+ * invoke a  wakeup() on itself, all continuations on the object's
+ * kcqueue will be called, either immediately or if a  continuation 
+ * requested it) after deferring to a kc_queue * servied at
+ *  software-interrupt priority.
+ */
+void	kcont_enqueue(kcq_t /*kcqueue*/, struct kc * /*kc*/,
+	   void (*/*func*/) __P((void*, void*, int)),
+	   void * /*env_arg*/, int /*contine_ipl*/);
+
+
+/*
+ * Execute (or defer) all continuations on an object's kcqueue
+ * when  an asynchronous operation completes. Runs through the
+ * struct kc_queue * of kc_ts, either running the continuations
+ * with the given objcet and status; or saving the object and status
+ * and deferring the kconts to a software-interrupt priority kc_queue.
+ */
+void	kcont_run __P((kcq_t * /*kcq*/, void * /*obj*/,
+	    int /*status*/, int /*cur_ipl*/));
+
+
+/* Initialize kcont framework at boot time. */
+void	kcont_init __P((void));
+
--- /dev/null	Wed Jan 21 00:09:46 2004
+++ sys/kern/kern_kcont.c	Sat Dec 13 09:40:04 2003
@@ -0,0 +1,369 @@
+/* $NetBSD:$ */
+
+/*
+ * Copyright (c) 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Jonathan Stone, currently employed by Decru, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright 2003 Jonathan Stone.
+ * All rights reserved.
+ */
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD:$ ");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/queue.h>
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/kthread.h>
+#include <sys/proc.h>
+
+#include <machine/intr.h>	/* IPL_*, and schedsoftnet() */
+				/* XXX: schedsofnet() should die. */
+
+#include <sys/kcont.h>
+
+
+MALLOC_DECLARE(M_KCONT);
+MALLOC_DEFINE(M_KCONT, "kcont", "Kernel non-process continuations");
+
+/* Accessors for struct kc_qeueue */
+static __inline struct kc * kc_set(struct kc *,
+	void (*func) __P((void *, void *, int)),
+	void *env_arg, int ipl);
+
+static __inline void kcont_enqueue_atomic(kcq_t *kcq, struct kc *kc);
+static __inline struct kc *kcont_dequeue_atomic(kcq_t *kcq);
+
+
+static void	kcont_defer_kc(struct kc * /*kc*/, void * /*obj*/,
+		    int /*status*/);
+static void	kcont_worker(void * /*arg*/);
+
+
+/*
+ * Software-interrupt priority continuation queues.
+ */
+static kcq_t kcq_softnet;
+static kcq_t kcq_softclock;
+static kcq_t kcq_softserial;
+
+static void *kc_si_softnet;
+static void *kc_si_softclock;
+static void *kc_si_softserial;
+
+/*
+ * Process-context continuatoin queue.
+ */
+static kcq_t kcq_process_ctxt;
+
+
+/*
+ * Insert a fully-formed struct kc * into the kc_queue *
+ * of some kernel object (e.g., a struct buf *).
+ */
+static __inline void
+kcont_enqueue_atomic(kcq_t *kcq, struct kc *kc)
+{
+	int s;
+
+	s =  splvm();
+	SIMPLEQ_INSERT_TAIL(kcq, kc, kc_next);
+	splx(s);
+}
+
+static __inline struct kc *
+kcont_dequeue_atomic(kcq_t *kcq)
+{
+	struct kc *kc;
+	int s;
+
+	s =  splvm();
+	kc = SIMPLEQ_FIRST(kcq);
+	if (kc != NULL) {
+		SIMPLEQ_REMOVE_HEAD(kcq, kc_next);
+		SIMPLEQ_NEXT(kc, kc_next) = NULL;
+	}		
+	splx(s);
+	return kc;
+}
+
+/*
+ * Construct a continuation object from pre-allocated memory.
+ * Used by functions that are about call an asynchronous operation, 
+ * to build a continuation to be called once the operation completes.
+ */
+static __inline struct kc *
+kc_set(struct kc *kc, void (*func) __P((void *, void *, int)),
+	void *env_arg, int ipl)
+{
+	kc->kc_fn = func;
+	kc->kc_env_arg = env_arg;
+	kc->kc_ipl = ipl;
+	kc->kc_flags = 0;
+#ifdef DEBUG
+	kc->kc_obj = NULL;
+	kc->kc_status = -1;
+	SIMPLEQ_NEXT(kc, kc_next) = NULL;
+#endif
+	return kc;
+}
+
+/*
+ * Request a continuation. Caller provides space for the struct kc *. 
+ */
+struct kc *
+kcont(struct kc *kc, void (*func) __P((void*, void*, int)),
+	void *env_arg, int continue_ipl)
+{
+	/* Just save the arguments in the kcont *. */
+	return kc_set(kc, func, env_arg, continue_ipl);
+}
+
+/*
+ * Request a malloc'ed/auto-freed continuation. The kcont framwork
+ * mallocs the struct kc, and initializes it with the caller-supplied args.
+ * Once the asynchronous operation completes and the continuation function
+ * has been called, the kcont framework will free the struct kc *
+ * immediately after the continuation function  returns.
+ */
+struct kc *
+kcont_malloc(int malloc_flags,
+	       void (*func) __P((void*, void*, int)),
+	       void *env_arg, int continue_ipl)
+{
+	struct kc *kc;
+
+	kc = malloc(sizeof (*kc), M_KCONT, malloc_flags);
+	if (kc ==  NULL)
+		return kc;
+	return kc_set(kc, func, env_arg, continue_ipl);
+}
+
+/*
+ * Dispatch a dequeed continuation which requested deferral
+ * into the appropriate lower-priority queue.
+ */
+static void
+kcont_defer_kc(struct kc *kc, void *obj, int status)
+{
+	/* 
+	 * IPL at which to synchronize access to object is
+	 * above the continuer's requested callback IPL,
+	 * (e.g., continuer wants IPL_SOFTNET but the object
+	 * holding this contuation incurred the wakeup()able
+	 * event whilst at IPL_BIO).
+	 * Defer this kc to a lower-priority kcq,pppppp
+	 * to be serviced slightly later at a lower IPL.
+	 */
+
+	/*
+	 * If we already deferred this kcont,don't clobber
+	 * the previously-saved kc_object and kc_status.
+	 */
+	if ((kc->kc_flags & KC_DEFERRED) == 0) {
+		kc->kc_flags |= KC_DEFERRED;
+		kc->kc_obj = obj;
+		kc->kc_status = status;
+	}
+  	switch (kc->kc_ipl) {
+#ifdef __HAVE_GENERIC_SOFT_INTERRUPTS
+	case KC_IPL_DEFER_SOFTCLOCK:
+		kcont_enqueue_atomic(&kcq_softclock, kc);
+		softintr_schedule(kc_si_softclock);
+		break;
+	case KC_IPL_DEFER_SOFTNET:
+		kcont_enqueue_atomic(&kcq_softnet, kc);
+		softintr_schedule(kc_si_softnet);
+  		break;
+	case KC_IPL_DEFER_SOFTSERIAL:
+		kcont_enqueue_atomic(&kcq_softnet, kc);
+		softintr_schedule(kc_si_softserial);
+		break;
+
+#else /*  !__HAVE_GENERIC_SOFT_INTERRUPTS */
+	/* What to do? For now,d. put to process context */
+	case KC_IPL_DEFER_SOFTCLOCK:
+	case KC_IPL_DEFER_SOFTSERIAL:
+	case KC_IPL_DEFER_SOFTNET:
+		/*FALLTHROUGH*/
+#endif /*  __HAVE_GENERIC_SOFT_INTERRUPTS */
+
+	case KC_IPL_DEFER_PROCESS:
+		kcont_enqueue_atomic(&kcq_process_ctxt, kc);
+		break;
+	default:
+		KASSERT(0);
+	}
+}
+
+void
+kcont_defer(struct kc *kc,
+	void (*func) __P((void*, void*, int)),
+	void * obj, void *env_arg, int status, int ipl)
+{
+	kcont_defer_kc(kc_set(kc, func, env_arg, ipl),  obj, status);
+}
+
+void
+kcont_defer_malloc(int mallocflags,
+	void (*func) __P((void*, void*, int)),
+	void * obj, void *env_arg, int status, int ipl)
+{
+	struct kc *kc;
+
+	kc = kcont_malloc(mallocflags, func, env_arg, ipl);
+	if (kc != NULL)
+		kcont_defer_kc(kc, obj, status);
+}
+
+ /*
+  * Run through a list of continuations, calling (or handing off)
+  * continuation functions.
+  * If the caller-provided IPL is the same as the requested  IPL,
+  * deliver the callback. 
+  * If the caller-provided IPL is higher than the requested 
+  * callback IPL, re-enqueue the continuation to a lower-priority queue.
+  */
+void
+kcont_run(kcq_t *kcq, void *obj, int status, int curipl)
+{
+	struct kc *kc;
+	while ((kc = kcont_dequeue_atomic(kcq)) != NULL) {
+
+		/* If exceution of kc was already deferred, restore context. */
+		if (kc->kc_flags & KC_DEFERRED) {
+			KASSERT(obj == NULL);
+			obj = kc->kc_obj;
+			status = kc->kc_status;
+		}
+
+		/* Check whether to execute now or to defer. */
+		if (kc->kc_ipl == KC_IPL_IMMED || curipl <= kc->kc_ipl) {
+			int saved_flags = kc->kc_flags; /* XXX see below */
+
+			/* satisfy our raison d'e^tre */
+			(*kc->kc_fn)(obj, kc->kc_env_arg, status);
+
+			/* XXX XXX:  We must not touch (*kc) after calling
+			 * (*kc->kc_fn),  unless we were specifically
+			 * asked to free it.  The memory for (*kc) may
+			 * be a sub-field of some other object (for example,
+			 * of kc->kc_env_arg) and (*kc_fn)()  may already
+			 * have freed it by the time we get here. So save
+			 * kc->kc_flags  (above) and use that saved copy
+			 * to test for auto-free.
+			 */
+			if (saved_flags & KC_AUTOFREE)
+				free(kc, M_KCONT);
+		} else {
+			kcont_defer_kc(kc, obj, status);
+		}
+	}
+}
+
+/*
+ * Trampolines for processing software-interrupt kcont queues.
+ */
+static void
+kcont_run_softclock(void *arg)
+{
+	kcont_run((struct kcqueue *)arg, NULL, 0, KC_IPL_DEFER_SOFTCLOCK);
+}
+
+static void
+kcont_run_softnet(void *arg)
+{
+	kcont_run((struct kcqueue *)arg, NULL, 0, KC_IPL_DEFER_SOFTNET);
+}
+
+static void
+kcont_run_softserial(void *arg)
+{
+	kcont_run((struct kcqueue *)arg, NULL, 0, KC_IPL_DEFER_SOFTSERIAL);
+}
+
+/*
+ * Main entrypoint for kcont worker kthreads to execute
+ * a continuation which requested deferral to process context.
+ */
+void
+kcont_worker(void *arg)
+{
+	int status;
+
+	(void)arg;	/* kill GCC warning */
+
+	while (1) {
+		status = ltsleep(&kcq_process_ctxt, PCATCH, "kcont", 0, NULL);
+		if (status != 0)
+		    break;
+		kcont_run(&kcq_process_ctxt, NULL, 0, KC_IPL_DEFER_PROCESS);
+    	}
+	kthread_exit(0);
+}
+
+
+/*
+ * Initialize kcont subsystem.
+ */
+void 
+kcont_init()
+{
+
+#ifdef __HAVE_GENERIC_SOFT_INTERRUPTS
+	/*
+	 * Initialize kc_queue and callout for soft-int deferred
+	 * continuations. (If not avaiable, deferrals  fall back
+	 * to deferring all the way to process context).
+	 */
+	SIMPLEQ_INIT(&kcq_softclock);
+	kc_si_softclock = softintr_establish(IPL_SOFTCLOCK,
+	    kcont_run_softclock, &kcq_softnet);
+
+	SIMPLEQ_INIT(&kcq_softnet);
+	kc_si_softnet = softintr_establish(IPL_SOFTNET,
+	    kcont_run_softnet, &kcq_softnet);
+
+	SIMPLEQ_INIT(&kcq_softserial);
+	kc_si_softserial = softintr_establish(IPL_SOFTSERIAL,
+	    kcont_run_softserial, &kcq_softserial);
+#endif	/* __HAVE_GENERIC_SOFT_INTERRUPTS */
+
+	/*
+	 * Create kc_queue for process-context continuations,
+	 * and a kthread to process the queue.
+	 * (Fine-grained smp locking should have at least one kthread per CPU).
+	 */
+	SIMPLEQ_INIT(&kcq_process_ctxt);
+	kthread_create(kcont_worker, NULL);
+}