Subject: Prototype kernel continuation-passing for NetBSD
To: None <tech-kern@netbsd.org>
From: Jonathan Stone <jonathan@dsg.stanford.edu>
List: tech-kern
Date: 01/28/2004 13:04:00
The code below is a sample implementation of ``continuation-passing''
for the NetBSD kernel. If you're not familiar with languages that have
functions, closures, and continuations as first-class objects, think
of this as a framework for handling ``callback functions'': specifically
creating, enqueueing, managing, and deferring callbacks to some lower
interrupt priority.
One possible use of these `continuations' is to implement operations
like sendfile() or splice() inside the kernel, using socket callback
functions. If you try this (I have, during an internship at Compaq/DEC
WRL) you quickly find yourself needing to defer operation from one
socket's upcall function to a non-upcall context, in order to frob
state on the `peer' socket. Or see the note in spl(9) about not using
splvm(): this code does so, deliberately, in order to provide a
framework whereby other code need never do so.
Long-term, I have an interest in reworking the NFS code (primarily the
server side, but client-side is interesting too) to not depend on
having a process context to do I/O. The I/O rates achievable today
with high-end hardware (and thus in a couple of years with mid-range
hardware) are high enough that a context-switch per NFS I/O is
prohibitive. Eventually, I'd like to replace or augment the
process-based I/O notification (processes sleeping on buffers) with a
kcont-based notification mechanism. In that world, I/O requests would
pass a kcont down through the I/O system, and each struct buf would
have a list of continuations to call when I/O on the buffer completes.
That's a ways off yet :-).
I have some immediate uses for a kcont-like interface; so any feedback
on this would be appreciated.
--- /dev/null Wed Jan 21 00:09:46 2004
+++ sys/sys/kcont.h Sat Dec 13 00:38:58 2003
@@ -0,0 +1,162 @@
+/* $NetBSD: */
+
+/*
+ * Copyright (c) 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Jonathan Stone, currently employed by Decru, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright 2003 Jonathan Stone.
+ * All rights reserved.
+ */
+
+#include <sys/queue.h>
+
+/*
+ * kcont -- Continuation-passing for BSD kernels.
+ *
+ * This module defines structures that implement a
+ * continuation-passing model in C which can be used as a
+ * ``callback''-like alternative to using a process context to
+ * tsleep() on an object address, waiting for some event or status
+ * change on that object. Since ANSI C provides neither
+ * lexically-nested functions nor heap allocation of activation
+ * records, we implement a continuation as a struct holding:
+ * 1. A function pointer, pointing to the continuation.
+ * 2. An object pointer: the object being "slept" upon.
+ * 3. An argument pointer: a pointer to private storage containing
+ * other arguments to the continuation function, including any
+ * additional state beyond the "slept-upon" object.
+ */
+
+typedef struct kc {
+ SIMPLEQ_ENTRY(kc) kc_next; /* next kc in object's callback list */
+
+ void (*kc_fn) (void * /*obj*/, void * /*env_arg*/, int /* status*/);
+
+ void * kc_env_arg; /* caller-supplied continuation argument */
+ void * kc_obj; /* saved object, used for deferred calls */
+ int kc_status; /* saved status, used for deferred calls */
+ ushort kc_ipl; /* IPL at which to call kc_fn */
+ ushort kc_flags; /* Flags, private to kcont implementation */
+} kc_t;
+
+/* kc_flags values. */
+#define KC_AUTOFREE 0x01 /* This struct kcont * was malloc'ed by
+ * the kcont module; free it immediately
+ * after calling the continuation function.
+ */
+#define KC_DEFERRED 0x02 /* Deferral has already saved object, status */
+
+/* kcont IPL values. */
+#define KC_IPL_DEFER_PROCESS 0x00 /* Hand off continuation fn to a
+ * full process context (kthread).
+ */
+
+#define KC_IPL_DEFER_SOFTCLOCK 0x01
+#define KC_IPL_DEFER_SOFTNET 0x02 /* run continuation in an
+ * IPL_SOFTNET software callout. */
+#define KC_IPL_DEFER_SOFTSERIAL 0x03
+
+#define KC_IPL_IMMED 0x10 /*
+ * Execute continuation fn immediately,
+ * in the context of the object event,
+ * with no completion. Continuation
+ * assumed to be very quick.
+ */
+
+/*
+ * Head of a list of pending continuations.
+ * For example, for continuation-based buffer I/O,
+ * add a kcq_t to struct buf, and pass a kc_t *
+ * down through VFS and strategy layers.
+ */
+typedef SIMPLEQ_HEAD(kcqueue, kc) kcq_t;
+
+
+/*
+ * Prepare a struct kcont continuation
+ */
+kc_t * kcont(kc_t *,
+ void (* /*fn*/) __P((void* /*obj*/, void */*arg*/, int /*sts*/)),
+ void * /*env_arg*/, int /* continue_ipl*/);
+
+kc_t *kcont_malloc(int /*allocmflags*/,
+ void (* /*fn*/) __P((void * /*obj*/, void */*arg*/, int /*sts*/)),
+ void * /*env_arg*/, int /* continue_ipl*/);
+
+
+/*
+ * Use a kcont to defer execution to a (presumably) lower interrupt
+ * priority.
+ */
+void kcont_defer(struct kc *kc,
+ void (* /*fn*/) __P((void* /*obj*/, void */*arg*/, int /*sts*/)),
+ void * /*obj*/,
+ void * /*env_arg*/, int /*status*/, int /* continue_ipl*/);
+
+/*
+ * Malloc a kcont to defer execution to a (presumably) lower interrupt
+ * priority.
+ */
+void kcont_defer_malloc(int, /* mallocflags */
+ void (* /*fn*/) __P((void* /*obj*/, void */*arg*/, int /*sts*/)),
+ void * /*obj*/,
+ void * /*env_arg*/, int /*status*/, int /*continue_ipl*/);
+
+/*
+ * Enqueue a struct kcont * into the kc_queue* field of some kernel struct.
+ * The struct kcont * argument is typically passed as an argument to
+ * a function which requests an asynchronous operation on that kernel object.
+ * When the operation completes, or the object otherwise decides to
+ * invoke a wakeup() on itself, all continuations on the object's
+ * kcqueue will be called, either immediately or (if a continuation
+ * requested it) after deferring to a kc_queue * serviced at
+ * software-interrupt priority.
+ */
+void kcont_enqueue(kcq_t /*kcqueue*/, struct kc * /*kc*/,
+ void (*/*func*/) __P((void*, void*, int)),
+ void * /*env_arg*/, int /*contine_ipl*/);
+
+
+/*
+ * Execute (or defer) all continuations on an object's kcqueue
+ * when an asynchronous operation completes. Runs through the
+ * struct kc_queue * of kc_ts, either running the continuations
+ * with the given object and status; or saving the object and status
+ * and deferring the kconts to a software-interrupt priority kc_queue.
+ */
+void kcont_run __P((kcq_t * /*kcq*/, void * /*obj*/,
+ int /*status*/, int /*cur_ipl*/));
+
+
+/* Initialize kcont framework at boot time. */
+void kcont_init __P((void));
+
--- /dev/null Wed Jan 21 00:09:46 2004
+++ sys/kern/kern_kcont.c Sat Dec 13 09:40:04 2003
@@ -0,0 +1,369 @@
+/* $NetBSD:$ */
+
+/*
+ * Copyright (c) 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Jonathan Stone, currently employed by Decru, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright 2003 Jonathan Stone.
+ * All rights reserved.
+ */
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD:$ ");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/queue.h>
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/kthread.h>
+#include <sys/proc.h>
+
+#include <machine/intr.h> /* IPL_*, and schedsoftnet() */
+ /* XXX: schedsofnet() should die. */
+
+#include <sys/kcont.h>
+
+
+MALLOC_DECLARE(M_KCONT);
+MALLOC_DEFINE(M_KCONT, "kcont", "Kernel non-process continuations");
+
+/* Accessors for struct kc_queue */
+static __inline struct kc * kc_set(struct kc *,
+ void (*func) __P((void *, void *, int)),
+ void *env_arg, int ipl);
+
+static __inline void kcont_enqueue_atomic(kcq_t *kcq, struct kc *kc);
+static __inline struct kc *kcont_dequeue_atomic(kcq_t *kcq);
+
+
+static void kcont_defer_kc(struct kc * /*kc*/, void * /*obj*/,
+ int /*status*/);
+static void kcont_worker(void * /*arg*/);
+
+
+/*
+ * Software-interrupt priority continuation queues.
+ */
+static kcq_t kcq_softnet;
+static kcq_t kcq_softclock;
+static kcq_t kcq_softserial;
+
+static void *kc_si_softnet;
+static void *kc_si_softclock;
+static void *kc_si_softserial;
+
+/*
+ * Process-context continuation queue.
+ */
+static kcq_t kcq_process_ctxt;
+
+
+/*
+ * Insert a fully-formed struct kc * into the kc_queue *
+ * of some kernel object (e.g., a struct buf *).
+ */
+static __inline void
+kcont_enqueue_atomic(kcq_t *kcq, struct kc *kc)
+{
+ int s;
+
+ /* splvm() deliberately blocks every IPL from which a kcq may be
+ * touched, so callers need never raise IPL themselves (see the
+ * spl(9) note in the cover letter). */
+ s = splvm();
+ SIMPLEQ_INSERT_TAIL(kcq, kc, kc_next);
+ splx(s);
+}
+
+/*
+ * Atomically remove and return the first continuation on a kc_queue,
+ * or NULL if the queue is empty. The dequeued kc's kc_next link is
+ * cleared so it can be safely re-enqueued later.
+ */
+static __inline struct kc *
+kcont_dequeue_atomic(kcq_t *kcq)
+{
+ struct kc *kc;
+ int s;
+
+ s = splvm();
+ kc = SIMPLEQ_FIRST(kcq);
+ if (kc != NULL) {
+ SIMPLEQ_REMOVE_HEAD(kcq, kc_next);
+ SIMPLEQ_NEXT(kc, kc_next) = NULL;
+ }
+ splx(s);
+ return kc;
+}
+
+/*
+ * Construct a continuation object from pre-allocated memory.
+ * Used by functions that are about to call an asynchronous operation,
+ * to build a continuation to be called once the operation completes.
+ */
+static __inline struct kc *
+kc_set(struct kc *kc, void (*func) __P((void *, void *, int)),
+ void *env_arg, int ipl)
+{
+ kc->kc_fn = func;
+ kc->kc_env_arg = env_arg;
+ kc->kc_ipl = ipl;
+ kc->kc_flags = 0;
+#ifdef DEBUG
+ /* Poison the fields that are only meaningful after deferral,
+ * so stale values are easier to spot in a debugger. */
+ kc->kc_obj = NULL;
+ kc->kc_status = -1;
+ SIMPLEQ_NEXT(kc, kc_next) = NULL;
+#endif
+ return kc;
+}
+
+/*
+ * Request a continuation. Caller provides space for the struct kc *.
+ */
+struct kc *
+kcont(struct kc *kc, void (*func) __P((void*, void*, int)),
+ void *env_arg, int continue_ipl)
+{
+ /* Just save the arguments in the kcont *. */
+ /* Storage is caller-owned: KC_AUTOFREE is never set here. */
+ return kc_set(kc, func, env_arg, continue_ipl);
+}
+
+/*
+ * Request a malloc'ed/auto-freed continuation. The kcont framework
+ * mallocs the struct kc, and initializes it with the caller-supplied args.
+ * Once the asynchronous operation completes and the continuation function
+ * has been called, the kcont framework will free the struct kc *
+ * immediately after the continuation function returns.
+ */
+struct kc *
+kcont_malloc(int malloc_flags,
+ void (*func) __P((void*, void*, int)),
+ void *env_arg, int continue_ipl)
+{
+ struct kc *kc;
+
+ kc = malloc(sizeof (*kc), M_KCONT, malloc_flags);
+ if (kc == NULL)
+ return kc;
+ kc_set(kc, func, env_arg, continue_ipl);
+ /* Bug fix: mark this kc for auto-free, as promised above.
+ * kc_set() zeroes kc_flags, so without setting KC_AUTOFREE here,
+ * kcont_run() would never free the kc and every malloc'ed
+ * continuation would leak. */
+ kc->kc_flags |= KC_AUTOFREE;
+ return kc;
+}
+
+/*
+ * Dispatch a dequeued continuation which requested deferral
+ * into the appropriate lower-priority queue.
+ */
+static void
+kcont_defer_kc(struct kc *kc, void *obj, int status)
+{
+	/*
+	 * The IPL at which to synchronize access to the object is
+	 * above the continuer's requested callback IPL
+	 * (e.g., the continuer wants IPL_SOFTNET but the object
+	 * holding this continuation incurred the wakeup()able
+	 * event whilst at IPL_BIO).
+	 * Defer this kc to a lower-priority kcq,
+	 * to be serviced slightly later at a lower IPL.
+	 */
+
+	/*
+	 * If we already deferred this kcont, don't clobber
+	 * the previously-saved kc_obj and kc_status.
+	 */
+	if ((kc->kc_flags & KC_DEFERRED) == 0) {
+		kc->kc_flags |= KC_DEFERRED;
+		kc->kc_obj = obj;
+		kc->kc_status = status;
+	}
+	switch (kc->kc_ipl) {
+#ifdef __HAVE_GENERIC_SOFT_INTERRUPTS
+	case KC_IPL_DEFER_SOFTCLOCK:
+		kcont_enqueue_atomic(&kcq_softclock, kc);
+		softintr_schedule(kc_si_softclock);
+		break;
+	case KC_IPL_DEFER_SOFTNET:
+		kcont_enqueue_atomic(&kcq_softnet, kc);
+		softintr_schedule(kc_si_softnet);
+		break;
+	case KC_IPL_DEFER_SOFTSERIAL:
+		/* Bug fix: this case enqueued to kcq_softnet, so the
+		 * continuation would have run at IPL_SOFTNET (or been
+		 * stranded on the wrong queue) instead of IPL_SOFTSERIAL. */
+		kcont_enqueue_atomic(&kcq_softserial, kc);
+		softintr_schedule(kc_si_softserial);
+		break;
+
+#else /* !__HAVE_GENERIC_SOFT_INTERRUPTS */
+	/* No generic software interrupts: fall back to process context. */
+	case KC_IPL_DEFER_SOFTCLOCK:
+	case KC_IPL_DEFER_SOFTSERIAL:
+	case KC_IPL_DEFER_SOFTNET:
+		/*FALLTHROUGH*/
+#endif /* __HAVE_GENERIC_SOFT_INTERRUPTS */
+
+	case KC_IPL_DEFER_PROCESS:
+		kcont_enqueue_atomic(&kcq_process_ctxt, kc);
+		/* Bug fix: wake the worker kthread ltsleep()ing on
+		 * &kcq_process_ctxt; without this wakeup(), continuations
+		 * deferred to process context were never executed. */
+		wakeup(&kcq_process_ctxt);
+		break;
+	default:
+		KASSERT(0);
+	}
+}
+
+/* Build a continuation in caller-provided storage and defer it at once. */
+void
+kcont_defer(struct kc *kc,
+ void (*func) __P((void*, void*, int)),
+ void * obj, void *env_arg, int status, int ipl)
+{
+ kcont_defer_kc(kc_set(kc, func, env_arg, ipl), obj, status);
+}
+
+/* Malloc, build, and defer a continuation in one call. */
+void
+kcont_defer_malloc(int mallocflags,
+ void (*func) __P((void*, void*, int)),
+ void * obj, void *env_arg, int status, int ipl)
+{
+ struct kc *kc;
+
+ kc = kcont_malloc(mallocflags, func, env_arg, ipl);
+ /* NOTE(review): if allocation fails (e.g. M_NOWAIT), the deferral
+ * is silently dropped and the caller has no way to detect it --
+ * confirm this best-effort semantic is intended. */
+ if (kc != NULL)
+ kcont_defer_kc(kc, obj, status);
+}
+
+ /*
+ * Run through a list of continuations, calling (or handing off)
+ * continuation functions.
+ * If the caller-provided IPL is the same as the requested IPL,
+ * deliver the callback.
+ * If the caller-provided IPL is higher than the requested
+ * callback IPL, re-enqueue the continuation to a lower-priority queue.
+ */
+void
+kcont_run(kcq_t *kcq, void *obj, int status, int curipl)
+{
+ struct kc *kc;
+ while ((kc = kcont_dequeue_atomic(kcq)) != NULL) {
+
+ /* If execution of kc was already deferred, restore context. */
+ if (kc->kc_flags & KC_DEFERRED) {
+ KASSERT(obj == NULL);
+ obj = kc->kc_obj;
+ status = kc->kc_status;
+ }
+
+ /* Check whether to execute now or to defer. */
+ if (kc->kc_ipl == KC_IPL_IMMED || curipl <= kc->kc_ipl) {
+ int saved_flags = kc->kc_flags; /* XXX see below */
+
+ /* satisfy our raison d'e^tre */
+ (*kc->kc_fn)(obj, kc->kc_env_arg, status);
+
+ /* XXX XXX: We must not touch (*kc) after calling
+ * (*kc->kc_fn), unless we were specifically
+ * asked to free it. The memory for (*kc) may
+ * be a sub-field of some other object (for example,
+ * of kc->kc_env_arg) and (*kc_fn)() may already
+ * have freed it by the time we get here. So save
+ * kc->kc_flags (above) and use that saved copy
+ * to test for auto-free.
+ */
+ if (saved_flags & KC_AUTOFREE)
+ free(kc, M_KCONT);
+ } else {
+ kcont_defer_kc(kc, obj, status);
+ }
+ }
+}
+
+/*
+ * Trampolines for processing software-interrupt kcont queues.
+ */
+/* Run continuations deferred to the softclock queue, at its IPL. */
+static void
+kcont_run_softclock(void *arg)
+{
+ kcont_run((struct kcqueue *)arg, NULL, 0, KC_IPL_DEFER_SOFTCLOCK);
+}
+
+/* Run continuations deferred to the softnet queue, at its IPL. */
+static void
+kcont_run_softnet(void *arg)
+{
+ kcont_run((struct kcqueue *)arg, NULL, 0, KC_IPL_DEFER_SOFTNET);
+}
+
+/* Run continuations deferred to the softserial queue, at its IPL. */
+static void
+kcont_run_softserial(void *arg)
+{
+ kcont_run((struct kcqueue *)arg, NULL, 0, KC_IPL_DEFER_SOFTSERIAL);
+}
+
+/*
+ * Main entrypoint for kcont worker kthreads to execute
+ * a continuation which requested deferral to process context.
+ */
+/*
+ * Consistency fix: define with `static' to match the forward
+ * declaration above; the symbol has internal linkage either way,
+ * but the mismatched storage class draws compiler warnings.
+ */
+static void
+kcont_worker(void *arg)
+{
+	int status;
+
+	(void)arg;	/* kill GCC warning */
+
+	while (1) {
+		/* NOTE(review): priority argument is bare PCATCH (i.e. 0);
+		 * conventionally this would be PZERO|PCATCH -- confirm. */
+		status = ltsleep(&kcq_process_ctxt, PCATCH, "kcont", 0, NULL);
+		if (status != 0)
+			break;
+		kcont_run(&kcq_process_ctxt, NULL, 0, KC_IPL_DEFER_PROCESS);
+	}
+	kthread_exit(0);
+}
+
+
+/*
+ * Initialize kcont subsystem.
+ */
+void
+kcont_init()
+{
+
+#ifdef __HAVE_GENERIC_SOFT_INTERRUPTS
+	/*
+	 * Initialize kc_queue and softint handler for soft-int deferred
+	 * continuations. (If not available, deferrals fall back
+	 * to deferring all the way to process context.)
+	 */
+	SIMPLEQ_INIT(&kcq_softclock);
+	/* Bug fix: the softclock handler was registered with
+	 * &kcq_softnet as its argument, so softclock deferrals would
+	 * have drained the softnet queue at the wrong IPL while the
+	 * softclock queue grew without bound. */
+	kc_si_softclock = softintr_establish(IPL_SOFTCLOCK,
+	    kcont_run_softclock, &kcq_softclock);
+
+	SIMPLEQ_INIT(&kcq_softnet);
+	kc_si_softnet = softintr_establish(IPL_SOFTNET,
+	    kcont_run_softnet, &kcq_softnet);
+
+	SIMPLEQ_INIT(&kcq_softserial);
+	kc_si_softserial = softintr_establish(IPL_SOFTSERIAL,
+	    kcont_run_softserial, &kcq_softserial);
+#endif /* __HAVE_GENERIC_SOFT_INTERRUPTS */
+
+	/*
+	 * Create kc_queue for process-context continuations,
+	 * and a kthread to process the queue.
+	 * (Fine-grained SMP locking should have at least one kthread per CPU.)
+	 */
+	SIMPLEQ_INIT(&kcq_process_ctxt);
+	kthread_create(kcont_worker, NULL);
+}