Subject: Sample splice(2) via kcont(9), work-in-progress towards sendfile()
To: None <tech-net@netbsd.org>
From: Jonathan Stone <jonathan@dsg.stanford.edu>
List: tech-net
Date: 10/05/2005 09:37:37
Below is a rough first cut at splice(2), based on the
API from Kevin Fall and Sally Floyd's work:

    Sally Floyd and Kevin Fall.
    Promoting the use of end-to-end congestion control in the Internet. 
    ACM/IEEE Transactions on Networking, 7(4):458-473, August 1999

The patch uses kcont(9) to stitch together the data from two TCP PCBs.
I've used the patch with a trivial userspace program to forward a
wire-speed gigabit ttcp stream, with the ttcp sender running on
machine A, the ttcp receiver on machine C, and machine B in the middle
using splice(2) to TCP-terminate A's traffic and forward on a separate
connection to machine C:

         ttcp                      ttcp
         sender                    receiver
	   A -------->  B --------> C
                    in-kernel
                    TCP splice

Before any thought of committing, I'd want to rework the API, to split
the overloaded length/flags arguments into two separate arguments: an
int for flags, and a u_long or off_t for the size to copy.  (though
the length was, historically, for UDP/record traffic...)
Oh, and the syscall numbers are for illustration only.


While that's interesting to me, as a prototype testbed for (for
example) 10GbE NICs, my guess is that implementing sendfile() using a
similar kcont scheme would be more interesting to a wider audience.

I'm sure the usual suspects will quickly notice that the patch
supports TCP only.  I'm also interested in reworking the
protocol-level APIs to allow a more protocol/PF-neutral interface for
layering upper-level protocols on top of in-kernel APIs, but below the
*userspace* socket API.

Other comments welcomed.


Index: kern/syscalls.master
===================================================================
RCS file: /cvsroot/src/sys/kern/syscalls.master,v
retrieving revision 1.145
diff -w -u -r1.145 syscalls.master
--- kern/syscalls.master	25 Feb 2005 19:53:56 -0000	1.145
+++ kern/syscalls.master	1 Sep 2005 23:55:20 -0000
@@ -751,3 +751,6 @@
 			    const sigset_t *mask); }
 374	STD		{ int sys_pollts(struct pollfd *fds, u_int nfds, \
 			    const struct timespec *ts, const sigset_t *mask); }
+375	STD		{ int sys_splice(int fd1, int fd2); }
+376     STD		{ int sys_sendfile(int fd, int s, off_t offset, size_t nbytes, \
+                                void *hdtr, off_t *sbytes, int flags); }
--- /dev/null	2005-09-07 17:06:10.000000000 -0700
+++ sys_splice.c	2005-06-06 13:18:03.000000000 -0700
@@ -0,0 +1,378 @@
+
+/*
+ * Copyright 2005 Jonathan Stone.
+ * All rights reserved.
+ */
+#include <sys/syscallargs.h>
+#include <sys/kcont.h>
+
+const char spliceio[] = "spliceio";	/* tsleep() wmesg, see sys_splice() */
+/* Forward declarations for the functions defined below. */
+int	sys_splice(struct lwp *l, void *v, register_t *retval);
+int	splice1(struct socket *s1, struct socket *s2, struct proc *p);
+int	splice_movedata(struct socket *from, struct socket *to,
+			    struct proc *p);
+void	splice_sowake_tramp(struct socket *, caddr_t, int);
+void	splice_sowake_continuation(void *, void *, int);
+
+typedef struct splice_state {
+	/* Per-splice bookkeeping shared by sys_splice() and the upcalls. */
+	/* Address of this struct, for userspace syscall to
+	 * wait upon for completion. But really for debugging.
+	 * (sys_splice() as written never assigns it.) */
+	struct splice_state * splc_self;
+
+	/* The two spliced sockets; thought: make these an explicit array? */
+	struct socket  *splc_so1, *splc_so2;
+	/* Process that issued splice(2); handed on to splice1(). */
+	struct proc * splc_proc;
+
+	/* Error (if any) meant to be returned to userspace upon completion. */
+	int splc_error;
+
+	/* Spliced socket state: */
+
+
+	/* kcont(s). again, perhaps make these an explicit array? */
+	/* One kcont per socket; selected in splice_sowake_tramp(). */
+	struct kc so1_kc;
+	struct kc so2_kc;
+
+} splice_state;
+
+
+
+/*
+ * Once data is ready to read, move that data from one socket's
+ * receive queue to the other socket's send queue. If the send
+ * socket has insufficient space, silently give up: we will try again
+ * later when sowwakeup() is called on the sending socket.
+ *
+ * Returns 0 when there is nothing to move; otherwise the soreceive()
+ * or PRU_SEND result.  A failed or empty receive shuts down both sockets.
+ */
+int
+splice_movedata(struct socket *from, struct socket *to,
+		    struct proc *p)
+{
+	long space, sspace;
+	struct mbuf *top;
+	struct uio uio;
+	int error, rcvflags;
+	int s;
+
+	/*
+	 * Compute min(send space, available data).
+	 * XXX: needs a rework for sockets with record boundaries
+	 * (SOCK_DGRAM, SOCK_SEQPACKET).
+	 */
+	space = sbspace(&to->so_snd);
+	sspace = from->so_rcv.sb_cc;
+	if (sspace < space)
+		space = sspace;
+	if (space == 0)
+		return 0;
+	if (space < 0) {
+		printf("splice: negative space %ld to-q %ld from-q %lu\n",
+		       space, sbspace(&to->so_snd), from->so_rcv.sb_cc);
+		return 0;
+	}
+	/* ... read off from socket ... */
+	top = NULL;
+	bzero(&uio, sizeof(uio));
+	uio.uio_resid = space;
+	uio.uio_segflg = UIO_SYSSPACE;
+	uio.uio_procp = NULL; 	/* XXX: p */
+	uio.uio_iov = NULL;
+	uio.uio_iovcnt = 0;
+	rcvflags = MSG_DONTWAIT;
+	/*
+	 * XXX JRS: try instead
+	 * (*from->so_recv)(from, &nam, &uio, &top, (struct mbuf**)0, &flags)
+	 * as in sys/nfs/nfs_socket.c
+	 */
+	error = soreceive(from,  /* &paddr */ NULL,  &uio,
+		  &top, /*controlp*/ NULL, &rcvflags);
+#if defined(DEBUG)
+	printf("asked for %ld got resid %ld\n", space, (long)uio.uio_resid);
+#endif
+	if (error != 0 || top == NULL) {
+		printf("splice: after recv, error = %d, top = %p\n",
+		       error, top);
+		/* Defensive: do not leak a partial chain on an error return. */
+		if (top != NULL)
+			m_freem(top);
+		soshutdown(from, SHUT_RDWR);
+		soshutdown(to, SHUT_RDWR);
+		goto done;
+	}
+
+	/* send it ... */
+#if 0
+	error = (*to->so_send)(to, (struct mbuf*)NULL, (struct uio *)NULL,
+			       top, NULL, 0, p);
+#else
+	s = splsoftnet();
+	error = (*to->so_proto->pr_usrreq)(to, PRU_SEND, top, NULL, NULL, p);
+	splx(s);
+#endif
+done:
+	return (error);
+}
+
+
+/*
+ * Attempt one transfer in each direction between a spliced pair:
+ * first s1 -> s2, then s2 -> s1.  A direction is tried only when
+ * its source is readable and its sink writable.  Returns the first
+ * non-zero error from splice_movedata(), or 0.
+ */
+int
+splice1(struct socket *s1, struct socket *s2, struct proc *p)
+{
+	int rv;
+
+	/* Forward direction: s1's receive queue into s2's send queue. */
+	rv = 0;
+	if (soreadable(s1) && sowritable(s2))
+		rv = splice_movedata(s1, s2, p);
+	if (rv != 0)
+		return rv;
+
+	/* Reverse direction; rv is still 0 if nothing is attempted. */
+	if (soreadable(s2) && sowritable(s1))
+		rv = splice_movedata(s2, s1, p);
+
+	return rv;
+}
+
+
+void
+splice_sowake_tramp(struct socket *so, caddr_t upcallarg, int mflags)
+{
+	struct splice_state *knot = (struct splice_state *)upcallarg;
+	struct kc *which;
+
+	/*
+	 * Socket upcall, invoked from protocol code at splsoftnet where
+	 * the so_snd/so_rcv queues must not be modified.  Select this
+	 * socket's kcont and defer to splice_sowake_continuation().
+	 */
+	which = &knot->so2_kc;
+	if (so == knot->splc_so1)
+		which = &knot->so1_kc;
+	kcont(which, splice_sowake_continuation, knot, KC_IPL_DEFER_SOFTNET);
+	kcont_defer(which, so, 0);
+}
+
+
+/*
+ * Called via kcont(9) from some suitable kcont level,
+ * after splice_sowake_tramp() has scheduled the kcont.
+ *
+ * When this function is called, we are no longer being called from
+ * protocol-specific code like (for example) tcp_input(); we can
+ * safely call soreceive() to fetch received data from one socket,
+ * then pass the received data to the other socket's protocol.
+ *
+ * obj is the socket that triggered the upcall, kc_env the splice_state.
+ * The first non-zero result of splice1() is kept in splc_error.
+ */
+void
+splice_sowake_continuation(void * obj, void * kc_env, int status)
+{
+	struct socket *so = obj;
+	struct splice_state * knot = kc_env;
+	struct socket *so1, *so2;
+	int error;
+
+	/* Which direction were we called in?  so1 is the waking socket. */
+	so1 = (so == knot->splc_so1) ? knot->splc_so1 : knot->splc_so2 ;
+	so2 = (so == knot->splc_so1) ? knot->splc_so2 : knot->splc_so1 ;
+
+	error = splice1(so1, so2, knot->splc_proc);
+	if (error != 0 && knot->splc_error == 0)
+		knot->splc_error = error;
+
+	/*
+	 * XXX: check for clean close in both directions.
+	 * when we are done, wakeup the userspace caller sleeping
+	 * on the address of our struct splice_state.
+	 */
+#define CANSENDFROMTO(s1, s2) \
+	( ( ((s1)->so_state & SS_CANTRCVMORE) == 0) &&	\
+	  ( ((s2)->so_state & SS_CANTSENDMORE) == 0)	\
+	)
+	if (!CANSENDFROMTO(so1, so2) && !CANSENDFROMTO(so2, so1)) {
+/*DBG*/	  printf("splice state %p so %p %p: done\n", knot, so1, so2);
+	  wakeup(knot);
+	}
+#undef CANSENDFROMTO
+}
+
+
+
+
+/*
+ * splice(2): join two connected SOCK_STREAM sockets inside the kernel
+ * and sleep until neither direction can carry more data.  Fails with
+ * EBADF, ENOTSOCK, ESOCKTNOSUPPORT or ENOTCONN for unsuitable
+ * descriptors; otherwise returns the error saved in the splice state.
+ */
+/* ARGSUSED */
+int
+sys_splice(struct lwp *l, void *v, register_t *retval)
+{
+	struct sys_splice_args /**/ {
+		syscallarg(int) fd1;
+		syscallarg(int) fd2;
+	} /* */ *uap = v;
+	int	fd1, fd2;
+	struct filedesc	*fdp;
+	struct file	*fp1, *fp2;
+	struct socket	*so1, *so2;
+	struct proc	*p;
+	int	error;
+	int	s;
+	struct splice_state *knot;
+
+	p = l->l_proc;
+	fdp = p->p_fd;
+	error = 0;
+
+	fd1 = SCARG(uap, fd1);
+	if ((fp1 = fd_getfile(fdp, fd1)) == NULL)
+		return (EBADF);
+	FILE_USE(fp1);
+
+	fd2 = SCARG(uap, fd2);
+	if ((fp2 = fd_getfile(fdp, fd2)) == NULL) {
+		FILE_UNUSE(fp1, p);
+		return (EBADF);
+	}
+	FILE_USE(fp2);
+
+	if (fp1->f_type != DTYPE_SOCKET ||
+	    fp2->f_type != DTYPE_SOCKET) {
+		error = ENOTSOCK;	/* XXX EINVAL? */
+		goto done;
+	}
+
+	so1 = (struct socket*) fp1->f_data;
+	so2 = (struct socket*) fp2->f_data;
+
+	/* Same underlying socket?
+	 * XXX: should handle splice-to-self as a special case, for
+	 * test/measurement purposes? I think all we have to do
+	 * is not acquire the sblock twice (and not release twice).
+	 */
+	if (so1 == so2) {
+		error = EBADF;
+		goto done;
+	}
+
+	if (so1->so_type != so2->so_type) {
+		error = ESOCKTNOSUPPORT;
+		goto done;
+	}
+
+	/* XXX: implementation is really only SOCK-STREAM for now */
+	if (so1->so_type != SOCK_STREAM) {
+		error = ESOCKTNOSUPPORT;
+		goto done;
+	}
+
+	if ((so1->so_state & SS_ISCONNECTED) == 0 ||
+	    (so2->so_state & SS_ISCONNECTED) == 0) {
+		error =  ENOTCONN;
+		goto done;
+	}
+
+	/*
+	 * Deadlock avoidance:  ensure we obtain locks in a well-ordered
+	 * manner irrespective of the order in which callers supply
+	 * sockets.
+	 */
+	if (so2 < so1) {
+		struct socket *sotmp = so2;
+		so2 = so1;
+		so1 = sotmp;
+	}
+
+	/* sblock() can fail while waiting; do not proceed unlocked. */
+	error = sblock(&so1->so_snd, M_WAITOK);
+	if (error != 0)
+		goto done;
+	error = sblock(&so2->so_snd, M_WAITOK);
+	if (error != 0)
+		goto unlock1;
+
+	/*
+	 * Allocate the splice state exactly once; a second malloc()
+	 * into the same pointer would leak the first allocation.
+	 */
+	knot = malloc(sizeof(*knot), M_TEMP, M_WAITOK|M_ZERO);
+	if (knot == NULL) {
+		error = ENOSPC;
+		goto unlock2;
+	}
+/*DBG*/	printf("..splice socks %p %p state %p: setting up\n", so1, so2, knot);
+
+	/*
+	 * XXX: set up  kconts to shuffle data from one socket
+	 * to another, whenever space permits...
+	 */
+	knot->splc_error = 0;
+	knot->splc_so1 = so1;
+	knot->splc_so2 = so2;
+	knot->splc_proc = p;
+
+	/* wire up socket upcalls */
+	s = splnet();
+	so1->so_upcall = splice_sowake_tramp;
+	so1->so_upcallarg = (caddr_t)knot;
+	so1->so_rcv.sb_flags |= SB_UPCALL;
+	so1->so_snd.sb_flags |= SB_UPCALL;
+
+	so2->so_upcall = splice_sowake_tramp;
+	so2->so_upcallarg = (caddr_t)knot;
+	so2->so_rcv.sb_flags |= SB_UPCALL;
+	so2->so_snd.sb_flags |= SB_UPCALL;
+	splx(s);
+
+	/* wait until both sockets are done */
+	/* XXX: a wakeup() issued before this tsleep() may be missed. */
+	tsleep((caddr_t)knot, PSOCK, spliceio, 0);
+/*DBG*/	printf("..splice done\n");
+
+	/* unwire upcalls, then drop back from splnet with splx() */
+	s = splnet();
+	so1->so_snd.sb_flags &= ~SB_UPCALL;
+	so1->so_rcv.sb_flags &= ~SB_UPCALL;
+	so1->so_upcall = NULL;
+	so1->so_upcallarg = NULL;
+
+	so2->so_snd.sb_flags &= ~SB_UPCALL;
+	so2->so_rcv.sb_flags &= ~SB_UPCALL;
+	so2->so_upcall = NULL;
+	so2->so_upcallarg = NULL;
+	splx(s);
+	/* Report whatever was recorded in the splice state (0 if nothing). */
+	error = knot->splc_error;
+	free(knot, M_TEMP);
+unlock2:
+	sbunlock(&so2->so_snd);
+unlock1:
+	sbunlock(&so1->so_snd);
+done:
+	FILE_UNUSE(fp1, p);
+	FILE_UNUSE(fp2, p);
+	return (error);
+}
+
+int
+sys_sendfile(struct lwp *l, void *v, register_t *retval)
+{
+	/* Placeholder: sendfile(2) is not implemented; always fails. */
+	return ENOSYS;
+}
Index: kern/init_sysent.c
===================================================================
RCS file: /cvsroot/src/sys/kern/init_sysent.c,v
retrieving revision 1.163
diff -w -u -r1.163 init_sysent.c
--- kern/init_sysent.c	27 Feb 2005 00:02:40 -0000	1.163
+++ kern/init_sysent.c	1 Sep 2005 23:55:20 -0000
@@ -1,4 +1,4 @@
-/* $NetBSD: init_sysent.c,v 1.163 2005/02/27 00:02:40 perry Exp $ */
+/* $NetBSD$ */
 
 /*
  * System call switch table.
@@ -8,7 +8,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: init_sysent.c,v 1.163 2005/02/27 00:02:40 perry Exp $");
+__KERNEL_RCSID(0, "$NetBSD$");
 
 #include "opt_ktrace.h"
 #include "opt_nfsserver.h"
@@ -994,10 +994,10 @@
 	    sys_pselect },			/* 373 = pselect */
 	{ 4, s(struct sys_pollts_args), 0,
 	    sys_pollts },			/* 374 = pollts */
-	{ 0, 0, 0,
-	    sys_nosys },			/* 375 = filler */
-	{ 0, 0, 0,
-	    sys_nosys },			/* 376 = filler */
+	{ 2, s(struct sys_splice_args), 0,
+	    sys_splice },			/* 375 = splice */
+	{ 7, s(struct sys_sendfile_args), 0,
+	    sys_sendfile },			/* 376 = sendfile */
 	{ 0, 0, 0,
 	    sys_nosys },			/* 377 = filler */
 	{ 0, 0, 0,
Index: kern/syscalls.c
===================================================================
RCS file: /cvsroot/src/sys/kern/syscalls.c,v
retrieving revision 1.158
diff -w -u -r1.158 syscalls.c
--- kern/syscalls.c	27 Feb 2005 00:02:40 -0000	1.158
+++ kern/syscalls.c	1 Sep 2005 23:55:20 -0000
@@ -1,4 +1,4 @@
-/* $NetBSD: syscalls.c,v 1.158 2005/02/27 00:02:40 perry Exp $ */
+/* $NetBSD$ */
 
 /*
  * System call names.
@@ -8,7 +8,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: syscalls.c,v 1.158 2005/02/27 00:02:40 perry Exp $");
+__KERNEL_RCSID(0, "$NetBSD$");
 
 #if defined(_KERNEL_OPT)
 #include "opt_ktrace.h"
@@ -510,4 +510,6 @@
 	"extattr_list_link",			/* 372 = extattr_list_link */
 	"pselect",			/* 373 = pselect */
 	"pollts",			/* 374 = pollts */
+	"splice",			/* 375 = splice */
+	"sendfile",			/* 376 = sendfile */
 };
Index: kern/uipc_syscalls.c
===================================================================
RCS file: /cvsroot/src/sys/kern/uipc_syscalls.c,v
retrieving revision 1.90
diff -w -u -r1.90 uipc_syscalls.c
--- kern/uipc_syscalls.c	26 Feb 2005 21:34:55 -0000	1.90
+++ kern/uipc_syscalls.c	1 Sep 2005 23:55:22 -0000
@@ -1116,3 +1116,5 @@
 	*fpp = fp;
 	return (0);
 }
+
+#include "/local/home/jonathan/sys_splice.c"
Index: sys/syscall.h
===================================================================
RCS file: /cvsroot/src/sys/sys/syscall.h,v
retrieving revision 1.156
diff -w -u -r1.156 syscall.h
--- sys/syscall.h	27 Feb 2005 00:03:25 -0000	1.156
+++ sys/syscall.h	1 Sep 2005 23:55:23 -0000
@@ -1,4 +1,4 @@
-/* $NetBSD: syscall.h,v 1.156 2005/02/27 00:03:25 perry Exp $ */
+/* $NetBSD$ */
 
 /*
  * System call numbers.
@@ -1025,5 +1025,11 @@
 /* syscall: "pollts" ret: "int" args: "struct pollfd *" "u_int" "const struct timespec *" "const sigset_t *" */
 #define	SYS_pollts	374
 
-#define	SYS_MAXSYSCALL	375
+/* syscall: "splice" ret: "int" args: "int" "int" */
+#define	SYS_splice	375
+
+/* syscall: "sendfile" ret: "int" args: "int" "int" "off_t" "size_t" "void *" "off_t *" "int" */
+#define	SYS_sendfile	376
+
+#define	SYS_MAXSYSCALL	377
 #define	SYS_NSYSENT	512
Index: sys/syscallargs.h
===================================================================
RCS file: /cvsroot/src/sys/sys/syscallargs.h,v
retrieving revision 1.138
diff -w -u -r1.138 syscallargs.h
--- sys/syscallargs.h	27 Feb 2005 00:03:25 -0000	1.138
+++ sys/syscallargs.h	1 Sep 2005 23:55:23 -0000
@@ -1,4 +1,4 @@
-/* $NetBSD: syscallargs.h,v 1.138 2005/02/27 00:03:25 perry Exp $ */
+/* $NetBSD$ */
 
 /*
  * System call argument lists.
@@ -1588,6 +1588,21 @@
 	syscallarg(const sigset_t *) mask;
 };
 
+struct sys_splice_args {
+	syscallarg(int) fd1;
+	syscallarg(int) fd2;
+};
+
+struct sys_sendfile_args {
+	syscallarg(int) fd;
+	syscallarg(int) s;
+	syscallarg(off_t) offset;
+	syscallarg(size_t) nbytes;
+	syscallarg(void *) hdtr;
+	syscallarg(off_t *) sbytes;
+	syscallarg(int) flags;
+};
+
 /*
  * System call prototypes.
  */
@@ -2251,4 +2266,8 @@
 
 int	sys_pollts(struct lwp *, void *, register_t *);
 
+int	sys_splice(struct lwp *, void *, register_t *);
+
+int	sys_sendfile(struct lwp *, void *, register_t *);
+
 #endif /* _SYS__SYSCALLARGS_H_ */