Subject: Enriched socket-buffer append API for `reliable' datagrams
To: None <tech-kern@netbsd.org>
From: Jonathan Stone <jonathan@dsg.stanford.edu>
List: tech-kern
Date: 05/21/2004 14:25:52
A well-known scalability bug in PF_KEY prompted me to revisit the
in-kernel API for delivering data to socket buffers. I have
already proposed a new socket-append function which appends
an entire preformatted chain of records in one go.

I propose to enrich at least the sbappendaddr() interface, to add an
extra `sbpriority' argument with roughly the following meanings:

SB_PRIO_NONE:
	Honour normal sb limits as before.
SB_PRIO_OVERDRAFT:
	Allows a small (2*MLEN) overflow over the normal socket limits.
	This priority is intended for messages notifying the socket
	owner of socket-buffer overflow in earlier normal-priority
	or lower-priority socket messages.
SB_PRIO_ONESHOT_OVERFLOW:
	If the socket has any space at all, insert the entire chain,
	possibly exceeding the socket limits. This priority is
	intended for large requests that should be delivered
	atomically (all of the input data, or none of it),
	with `best-effort' reliability.
SB_PRIO_BESTEFFORT: (really `reliable')
	Ignore the receive-buffer limits entirely.
	This priority is intended for kernel-generated messages only,
	and is permitted only for sockets bearing a new
	`best-effort reliability' socket option.
	It is up to the generator to avoid total mbuf resource exhaustion.

My preference is that the two last-listed priorities will be honoured
only for sockets where a special `best-effort reliable' socket flag
has been set. That flag would be settable only by the superuser.

I've sketched an outline of this approach privately to a few people.
The feedback I've gotten so far is that it's a reasonable approach to
the problem of trying to cram potentially-unbounded message streams
into a finite, datagram-ish socket.

The biggest caveat is that, with the existing socket-append API, it's
up to the caller to check any socket-level flags: sbappend* doesn't
get to see the socket.  Are we generally happy with that solution, or
would we rather refrob the socket-append API to pass in a struct
socket * argument? (If the priority is SB_PRIO_NONE, the struct
socket * argument is guaranteed to be ignored and may be NULL).

Any other comments?




Index: sys/socketvar.h
===================================================================
RCS file: /cvsroot/src/sys/sys/socketvar.h,v
retrieving revision 1.74
diff -u -r1.74 socketvar.h
--- sys/socketvar.h	22 Apr 2004 01:01:42 -0000	1.74
+++ sys/socketvar.h	21 May 2004 18:34:09 -0000
@@ -281,6 +281,8 @@
 void	sbappendstream(struct sockbuf *, struct mbuf *);
 int	sbappendaddr(struct sockbuf *, const struct sockaddr *, struct mbuf *,
 	    struct mbuf *);
+int	sbappendaddrchain(struct sockbuf *, const struct sockaddr *,
+	     struct mbuf *, int);
 int	sbappendcontrol(struct sockbuf *, struct mbuf *, struct mbuf *);
 void	sbappendrecord(struct sockbuf *, struct mbuf *);
 void	sbcheck(struct sockbuf *);
Index: kern/uipc_socket2.c
===================================================================
RCS file: /cvsroot/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.62
diff -u -r1.62 uipc_socket2.c
--- kern/uipc_socket2.c	19 Apr 2004 03:44:46 -0000	1.62
+++ kern/uipc_socket2.c	21 May 2004 18:50:48 -0000
@@ -511,15 +511,22 @@
 }
 #endif /* SOCKBUF_DEBUG */
 
-#define	SBLINKRECORD(sb, m0)						\
+/*
+ * Link a chain of records (first m0, last mlast) onto a socket buffer
+ */
+#define	SBLINKRECORDCHAIN(sb, m0, mlast)				\
 do {									\
 	if ((sb)->sb_lastrecord != NULL)				\
 		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
 	else								\
 		(sb)->sb_mb = (m0);					\
-	(sb)->sb_lastrecord = (m0);					\
+	(sb)->sb_lastrecord = (mlast);					\
 } while (/*CONSTCOND*/0)
 
+
+#define	SBLINKRECORD(sb, m0)						\
+    SBLINKRECORDCHAIN(sb, m0, m0)
+
 /*
  * Append mbuf chain m to the last record in the
  * socket buffer sb.  The additional space associated
@@ -764,6 +771,132 @@
 	return (1);
 }
 
+/*
+ * Helper for sbappendaddrchain: prepend a copy of the sockaddr asa,
+ * held in a newly allocated MT_SONAME packet-header mbuf, to the
+ * record m0.
+ *
+ * Returns the new head of the record, or NULL if no mbuf could be
+ * allocated, in which case m0 has not been modified.
+ */
+static __inline struct mbuf *
+m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0,
+		   const struct sockaddr *asa)
+{
+	struct mbuf *m;
+	const int salen = asa->sa_len;
+
+	/* The address must fit in the data area of a pkthdr mbuf. */
+	KASSERT(salen <= MHLEN);
+
+	/* only the first in each chain need be a pkthdr */
+	MGETHDR(m, M_DONTWAIT, MT_SONAME);
+	if (m == NULL)
+		return (NULL);
+	MCLAIM(m, sb->sb_mowner);
+
+	m->m_len = salen;
+	bcopy(asa, mtod(m, caddr_t), salen);
+	m->m_next = m0;
+	m->m_pkthdr.len = salen + m0->m_pkthdr.len;
+
+	return (m);
+}
+
+/*
+ * Append the chain of records m0 (linked through m_nextpkt) to sb,
+ * prepending a copy of the address asa to every record.
+ *
+ * Returns 1 if the whole chain was appended and 0 if nothing was.
+ * The input records are never freed here: on a 0 return they still
+ * belong to the caller, with their m_nextpkt linkage intact.
+ */
+int
+sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
+		  struct mbuf *m0, int sbprio)
+{
+	struct mbuf *m, *n, *n0, *nlast;
+
+	/*
+	 * XXX sbprio reserved for encoding priority of this request:
+	 *  SB_PRIO_NONE --> honour normal sb limits
+	 *  SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space,
+	 *	take whole chain. Intended for large requests
+	 *      that should be delivered atomically (all, or none).
+	 * SB_PRIO_OVERDRAFT -- allow a small (2*MLEN) overflow
+	 *       over normal socket limits, for messages indicating
+	 *       buffer overflow in earlier normal/lower-priority messages
+	 * SB_PRIO_BESTEFFORT -->  ignore limits entirely.
+	 *       Intended for  kernel-generated messages only.
+	 *        Up to generator to avoid total mbuf resource exhaustion.
+	 */
+	(void)sbprio;
+
+	/* An empty chain leaves nothing to link; do not touch sb. */
+	if (m0 == NULL)
+		return (0);
+	if ((m0->m_flags & M_PKTHDR) == 0)
+		panic("sbappendaddrchain");
+
+#ifdef notyet
+	/*
+	 * we are expected to violate socket-buffer limits: If the
+	 * socket has any rcv space available at all, deliver the
+	 * entire chain. Otherwise, deliver nothing at all.
+	 */
+	if (sbspace(sb) <= 0)
+		return (0);
+#endif
+
+	n0 = NULL;
+	nlast = NULL;
+	for (m = m0; m; m = m->m_nextpkt) {
+		struct mbuf *np;
+
+		/* Prepend sockaddr to this record (m) of input chain m0 */
+		n = m_prepend_sockaddr(sb, m, asa);
+		if (n == NULL)
+			goto bad;
+
+		/* Append record (asa+m) to end of new chain n0 */
+		if (n0 == NULL) {
+			n0 = n;
+		} else {
+			nlast->m_nextpkt = n;
+		}
+		/* Keep track of last record on new chain */
+		nlast = n;
+
+		for (np = n; np; np = np->m_next)
+			sballoc(sb, np);
+	}
+
+	/* Drop the entire chain of (asa+m) records onto the socket */
+	SBLINKRECORDCHAIN(sb, n0, nlast);
+	for (m = nlast; m->m_next; m = m->m_next)
+		;
+	sb->sb_mbtail = m;
+
+	return (1);
+
+bad:
+	/*
+	 * Back out the records built so far: undo their sballoc()
+	 * accounting and free only the prepended address mbufs.  The
+	 * data mbufs are the caller's, as documented above.
+	 */
+	while ((n = n0) != NULL) {
+		struct mbuf *np;
+
+		for (np = n; np; np = np->m_next)
+			sbfree(sb, np);
+		n0 = n->m_nextpkt;
+		(void)m_free(n);
+	}
+	return (0);
+}
+
+
 int
 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
 {