Subject: Fix for KAME PF_KEY DUMP-truncation marshalling bug
To: None <tech-net@netbsd.org>
From: Jonathan Stone <jonathan@dsg.stanford.edu>
List: tech-net
Date: 05/21/2004 12:08:35
The following patch fixes a longstanding bug in the KAME
implementation of PF_KEY: DUMP responses can be dropped (truncated) by
socket buffer limits, but there is no notification of the drop to the
application requesting the DUMP.  The application has no way
whatsoever to find the end of the dump stream. Various *BSD versions
will fail, deadlock, or use unacceptable kludges (which still fail
under load) to get around this bug.

The following patch introduces a new socket-append function in
sys/kern/uipc_socket2.c, which will atomically deliver either all of a
chain of records to a socket so_rcv buffer; or none of the chain.

The patch changes the usual RFC-2367 semantics: responses to DUMP 
requests are now unicast to just the socket requesting the dump.
Moreover, the stream of DUMP messages is deliveered via the new
``all-or-nothing'' implementation.

I have verified that the patch works for dumping 8192 SPD entries.
Its also markedly faster than the sysctl() version: 1.28 seconds on an
old P3 laptop, vs 8.82 for the sysctl()-based version.


But wait, there's more.

Source comments in uipc_socket2.c outline how the API to the new
function can be enriched, allowing callers to indicate the priority of
message. That enriched API could, in principle, be used to *reliably*
deliver an ENOBUFS message to the socket requesting a PF_KEY DUMP.
The enriched API would allow a range of (modified) PF_KEY behaviours,
whilst still guaranteeing to notify applications of DUMP requests.

This enriched API could also allow a better `best-effort' for PF_KEY
ACQUIRE messages. It's a hard call, but a reasonable approach to the
problem of messages which (for correct operation) must be reasonably
reliable; given the poor design choice of mapping a potentially
unbounded, high-reliability message stream onto a datagram socket.

Discussion of proposed enriched sbappend* API should go to tech-kern,
as it has wider scope than the networking stack. I will kick-start
that discussion by repost the enriched API to tech-kern.



Index: sys/socketvar.h
===================================================================
RCS file: /cvsroot/src/sys/sys/socketvar.h,v
retrieving revision 1.74
diff -u -r1.74 socketvar.h
--- sys/socketvar.h	22 Apr 2004 01:01:42 -0000	1.74
+++ sys/socketvar.h	21 May 2004 18:34:09 -0000
@@ -281,6 +281,8 @@
 void	sbappendstream(struct sockbuf *, struct mbuf *);
 int	sbappendaddr(struct sockbuf *, const struct sockaddr *, struct mbuf *,
 	    struct mbuf *);
+int	sbappendaddrchain(struct sockbuf *, const struct sockaddr *,
+	     struct mbuf *, int);
 int	sbappendcontrol(struct sockbuf *, struct mbuf *, struct mbuf *);
 void	sbappendrecord(struct sockbuf *, struct mbuf *);
 void	sbcheck(struct sockbuf *);
Index: kern/uipc_socket2.c
===================================================================
RCS file: /cvsroot/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.62
diff -u -r1.62 uipc_socket2.c
--- kern/uipc_socket2.c	19 Apr 2004 03:44:46 -0000	1.62
+++ kern/uipc_socket2.c	21 May 2004 18:50:48 -0000
@@ -511,15 +511,22 @@
 }
 #endif /* SOCKBUF_DEBUG */
 
-#define	SBLINKRECORD(sb, m0)						\
+/*
+ * Link a chain of records onto a socket buffer
+ */
+#define	SBLINKRECORDCHAIN(sb, m0, mlast)				\
 do {									\
 	if ((sb)->sb_lastrecord != NULL)				\
 		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
 	else								\
 		(sb)->sb_mb = (m0);					\
-	(sb)->sb_lastrecord = (m0);					\
+	(sb)->sb_lastrecord = (mlast);					\
 } while (/*CONSTCOND*/0)
 
+
+#define	SBLINKRECORD(sb, m0)						\
+    SBLINKRECORDCHAIN(sb, m0, m0)
+
 /*
  * Append mbuf chain m to the last record in the
  * socket buffer sb.  The additional space associated
@@ -764,6 +771,110 @@
 	return (1);
 }
 
+/*
+ * Helper for sbappendchainaddr: prepend a struct sockaddr* to
+ * an mbuf chain.
+ */
+static __inline struct mbuf *
+m_prepend_sockaddr(struct mbuf *m0, const struct sockaddr *asa)
+{
+	struct mbuf *m;
+	const int mlen = asa->sa_len;
+
+	/* only the first in each chain need be a pkthdr */
+	MGETHDR(m, M_DONTWAIT, MT_SONAME);
+	if (m == 0)
+		return (0);
+	MCLAIM(m, sb->sb_mowner);
+	KASSERT(mlen <= MLEN);
+
+	m->m_len = mlen;
+	bcopy((caddr_t)asa, mtod(m, caddr_t), mlen);
+	m->m_next = m0;
+	m->m_pkthdr.len = mlen + m0->m_pkthdr.len;
+
+	return m;
+}
+
+int
+sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
+		  struct mbuf *m0, int sbprio)
+{
+	int space;
+	struct mbuf *m, *n, *n0, *nlast;
+	int error;
+
+	/*
+	 * XXX sbprio reserved for encoding priority of this* request:
+	 *  SB_PRIO_NONE --> honour normal sb limits
+	 *  SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space,
+	 *	take whole chain. Intended for large requests
+	 *      that should be delivered atomically (all, or none).
+	 * SB_PRIO_OVERDRAFT -- allow a small (2*MLEN) overflow
+	 *       over normal socket limits, for messages indicating
+	 *       buffer overflow in earlier normal/lower-priority messages
+	 * SB_PRIO_BESTEFFORT -->  ignore limits entirely.
+	 *       Intended for  kernel-generated messages only.
+	 *        Up to generator to avoid total mbuf resource exhaustion.
+	 */
+	(void)sbprio;
+
+	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
+		panic("sbappendaddrchain");
+
+	space = sbspace(sb);
+	
+#ifdef notyet
+	/* 
+	 * we are expected to violate socket-buffer limits: If the
+	 * socket has any rcv space available at all, deliver the
+	 * entire chain. Otherwise, deliver nothing at all.
+	 */
+
+	if (space <= 0)
+		return (0);
+#endif
+
+	n0 = NULL;
+	nlast = NULL;
+	for (m = m0; m; m = m->m_nextpkt) {
+		struct mbuf *np;
+
+		/* Prepend sockaddr to this record (m) of input chain m0 */
+	  	n = m_prepend_sockaddr(m, asa);
+		if (n == NULL) {
+			error = ENOBUFS;
+			goto bad;
+		}
+
+		/* Append record (asa+m) to end of new chain n0 */
+		if (n0 == NULL) {
+			n0 = n;
+		} else {
+			nlast->m_nextpkt = n;
+		}
+		/* Keep track of last record on new chain */
+		nlast = n;
+
+		for (np = n; np; np = np->m_next)
+			sballoc(sb, np);
+	}
+
+	/* Drop the entire chain of (asa+m) records onto the socket */
+	SBLINKRECORDCHAIN(sb, n0, nlast);
+	for (m = nlast; m->m_next; m = m->m_next)
+		;
+	sb->sb_mbtail = m;
+	
+	return (1);
+
+bad:
+	if (n)
+		m_freem(n);
+	return 0;	
+}
+
+
 int
 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
 {
Index: netipsec/key.c
===================================================================
RCS file: /cvsroot/src/sys/netipsec/key.c,v
retrieving revision 1.15
diff -u -r1.15 key.c
--- netipsec/key.c	30 Apr 2004 01:08:35 -0000	1.15
+++ netipsec/key.c	21 May 2004 18:51:07 -0000
@@ -385,6 +385,8 @@
 	const struct sadb_msghdr *));
 static int key_spddump __P((struct socket *, struct mbuf *,
 	const struct sadb_msghdr *));
+static struct mbuf * key_setspddump __P((int *errorp));
+static struct mbuf * key_setspddump_chain __P((int *errorp, int *lenp));
 static struct mbuf *key_setdumpsp __P((struct secpolicy *,
 	u_int8_t, u_int32_t, u_int32_t));
 static u_int key_getspreqmsglen __P((struct secpolicy *));
@@ -2353,6 +2355,61 @@
 	return key_sendup_mbuf(so, m, KEY_SENDUP_ALL);
 }
 
+static struct sockaddr key_src = { 2, PF_KEY, };
+
+static struct mbuf *
+key_setspddump_chain(int *errorp, int *lenp)
+{
+	struct secpolicy *sp;
+	int cnt;
+	u_int dir;
+	struct mbuf *m, *n, *prev;
+	int totlen;
+
+	*lenp = 0;
+
+	/* search SPD entry and get buffer size. */
+	cnt = 0;
+	for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
+		LIST_FOREACH(sp, &sptree[dir], chain) {
+			cnt++;
+		}
+	}
+
+	if (cnt == 0) {
+		*errorp = ENOENT;
+		return (NULL);
+	}
+
+	m = NULL;
+	prev = m;
+	totlen = 0;
+	for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
+		LIST_FOREACH(sp, &sptree[dir], chain) {
+			--cnt;
+			n = key_setdumpsp(sp, SADB_X_SPDDUMP, cnt, 0);
+
+			if (!n) {
+				*errorp = ENOBUFS;
+				if (m) m_freem(m);
+				return (NULL);
+			}
+			totlen += n->m_pkthdr.len;
+
+			if (!m) {
+				m = n;
+			} else {
+				prev->m_nextpkt = n;
+			}
+			prev = n;
+		}
+	}
+
+	*lenp = totlen;
+	*errorp = 0;
+	return (m);
+}
+
 /*
  * SADB_SPDDUMP processing
  * receive
@@ -2365,44 +2422,65 @@
  * m will always be freed.
  */
 static int
-key_spddump(so, m, mhp)
+key_spddump(so, m0, mhp)
 	struct socket *so;
-	struct mbuf *m;
+	struct mbuf *m0;
 	const struct sadb_msghdr *mhp;
 {
-	struct secpolicy *sp;
-	int cnt;
-	u_int dir;
-	struct mbuf *n;
+	struct mbuf *m;
+	int error, len;
+	int ok, s;
 
 	/* sanity check */
-	if (so == NULL || m == NULL || mhp == NULL || mhp->msg == NULL)
+	if (so == NULL || m0 == NULL || mhp == NULL || mhp->msg == NULL)
 		panic("key_spddump: NULL pointer is passed.\n");
 
-	/* search SPD entry and get buffer size. */
-	cnt = 0;
-	for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
-		LIST_FOREACH(sp, &sptree[dir], chain) {
-			cnt++;
-		}
+
+	/*
+	 * If the requestor has insufficient socket-buffer space
+	 * for the entire chain, nobody gets any response to the DUMP.
+	 * XXX For now, only the requestor ever gets anything.
+	 * Moreover, if the requestor has any space at all, they receive
+	 * the entire chain, otherwise the request is refused with  ENOBUFS.
+	 */
+	if (sbspace(&so->so_rcv) <= 0) {
+		return key_senderror(so, m0, ENOBUFS);
 	}
 
-	if (cnt == 0)
-		return key_senderror(so, m, ENOENT);
+	s = splsoftnet();
+	m = key_setspddump_chain(&error, &len);
+	splx(s);
 
-	for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
-		LIST_FOREACH(sp, &sptree[dir], chain) {
-			--cnt;
-			n = key_setdumpsp(sp, SADB_X_SPDDUMP, cnt,
-			    mhp->msg->sadb_msg_pid);
+	if (m == NULL) {
+		return key_senderror(so, m0, ENOENT);
+	}
+	pfkeystat.in_total++;
+	pfkeystat.in_bytes += len;
 
-			if (n)
-				key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
-		}
+	/*
+	 * PF_KEY DUMP responses are no longer broadcast to all PF_KEY sockets.
+	 * The requestor receives either the entire chain, or an
+	 * error message with ENOBUFS.
+	 */
+
+	/*
+	 * sbappendchainwith record takes the chain of entries, one
+	 * packet-record per SPD entry, prepends the key_src sockaddr
+	 * to each packet-record, links the sockaddr mbufs into a new
+	 * list of records, then   appends the entire resulting
+	 * list to the requesting socket.
+	 */
+	ok = sbappendaddrchain(&so->so_rcv, (struct sockaddr *)&key_src,
+	        m, /*SB_PRIO_ONESHOT_OVERFLOW*/ 0);
+
+	if (!ok) {
+		pfkeystat.in_nomem++;
+		m_freem(m);
+		return key_senderror(so, m0, ENOBUFS);
 	}
+	/*XXX*/sorwakeup(so);
 
-	m_freem(m);
-	return 0;
+	return error;
 }
 
 static struct mbuf *