Subject: Wart in tcp_output(), IPv6-related?
To: None <tech-net@netbsd.org>
From: Jason R Thorpe <thorpej@wasabisystems.com>
List: tech-net
Date: 04/26/2002 16:22:47
--96YOpH+ONegL0A3E
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

When the KAME IPv6 code came in, we grew a wart on tcp_output()
(the snippet of code is in tcp_build_datapkt() in -current, so that it can
be profiled easily).  Specifically, when the outgoing TCP segment is being
built, the code checks to see if the packet is larger than a single mbuf
and smaller than a cluster, and if so, allocates a cluster for the packet.

This means that, for the vast majority of cases, we are now going to
pay for a data copy, since a TCP segment will always fit into a cluster
for a 1500 byte MTU.

If we remove this chunk of code (patch with instrumentation attached),
we fall into the case that does m_copy(), which will simply take a
reference to M_EXT mbufs rather than copying the data.  If an application
is doing large writes, the socket buffer is going to have M_EXT mbufs in
it; therefore we go from "almost always copy" to "almost never copy",
which is precisely what we want.  This results in 1MB/s better throughput
with ttcp, and will get us pretty damn close to zero-copy once I write a
page-loaning sosend for stream sockets.

Can someone (itojun?) explain to me why the MCLGET() was added to
tcp_output() in the first place?

-- 
        -- Jason R. Thorpe <thorpej@wasabisystems.com>

--96YOpH+ONegL0A3E
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=tcp-patch

Index: tcp_output.c
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/tcp_output.c,v
retrieving revision 1.78
diff -u -r1.78 tcp_output.c
--- tcp_output.c	2002/03/01 22:54:09	1.78
+++ tcp_output.c	2002/04/26 23:18:04
@@ -1,3 +1,4 @@
+#define TCP_OUTPUT_COUNTERS
 /*	$NetBSD: tcp_output.c,v 1.78 2002/03/01 22:54:09 thorpej Exp $	*/
 
 /*
@@ -200,6 +201,20 @@
 int	tcp_cwm = 1;
 int	tcp_cwm_burstsize = 4;
 
+#ifdef TCP_OUTPUT_COUNTERS
+#include <sys/device.h>
+
+extern struct evcnt tcp_output_copysmall;
+extern struct evcnt tcp_output_copybig;
+extern struct evcnt tcp_output_refbig;
+
+#define	TCP_OUTPUT_COUNTER_INCR(ev)	(ev)->ev_count++
+#else
+
+#define	TCP_OUTPUT_COUNTER_INCR(ev)	/* nothing */
+
+#endif /* TCP_OUTPUT_COUNTERS */
+
 static
 #ifndef GPROF
 __inline
@@ -384,15 +399,6 @@
 	m->m_data -= hdrlen;
 #else
 	MGETHDR(m, M_DONTWAIT, MT_HEADER);
-	if (m != NULL &&
-	    (max_linkhdr + hdrlen > MHLEN ||
-	     max_linkhdr + hdrlen + len <= MCLBYTES)) {
-		MCLGET(m, M_DONTWAIT);
-		if ((m->m_flags & M_EXT) == 0) {
-			m_freem(m);
-			m = NULL;
-		}
-	}
 	if (m == NULL)
 		return (ENOBUFS);
 	m->m_data += max_linkhdr;
@@ -401,12 +407,19 @@
 		m_copydata(so->so_snd.sb_mb, off, (int) len,
 		    mtod(m, caddr_t) + hdrlen);
 		m->m_len += len;
+		TCP_OUTPUT_COUNTER_INCR(&tcp_output_copysmall);
 	} else {
 		m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
 		if (m->m_next == NULL) {
 			m_freem(m);
 			return (ENOBUFS);
 		}
+#ifdef TCP_OUTPUT_COUNTERS
+		if (m->m_next->m_flags & M_EXT)
+			TCP_OUTPUT_COUNTER_INCR(&tcp_output_refbig);
+		else
+			TCP_OUTPUT_COUNTER_INCR(&tcp_output_copybig);
+#endif /* TCP_OUTPUT_COUNTERS */
 	}
 #endif
 
Index: tcp_subr.c
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/tcp_subr.c,v
retrieving revision 1.124
diff -u -r1.124 tcp_subr.c
--- tcp_subr.c	2002/03/15 09:25:41	1.124
+++ tcp_subr.c	2002/04/26 23:18:06
@@ -1,3 +1,4 @@
+#define TCP_OUTPUT_COUNTERS
 /*	$NetBSD: tcp_subr.c,v 1.124 2002/03/15 09:25:41 itojun Exp $	*/
 
 /*
@@ -227,6 +228,17 @@
     NULL, "tcp", "swcsum");
 #endif /* TCP_CSUM_COUNTERS */
 
+#ifdef TCP_OUTPUT_COUNTERS
+#include <sys/device.h>
+
+struct evcnt tcp_output_copysmall = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
+    NULL, "tcp", "output copy small");
+struct evcnt tcp_output_copybig = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
+    NULL, "tcp", "output copy big");
+struct evcnt tcp_output_refbig = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
+    NULL, "tcp", "output reference big");
+#endif /* TCP_OUTPUT_COUNTERS */
+
 /*
  * Tcp initialization
  */
@@ -271,6 +283,12 @@
 	evcnt_attach_static(&tcp_hwcsum_data);
 	evcnt_attach_static(&tcp_swcsum);
 #endif /* TCP_CSUM_COUNTERS */
+
+#ifdef TCP_OUTPUT_COUNTERS
+	evcnt_attach_static(&tcp_output_copysmall);
+	evcnt_attach_static(&tcp_output_copybig);
+	evcnt_attach_static(&tcp_output_refbig);
+#endif /* TCP_OUTPUT_COUNTERS */
 }
 
 /*

--96YOpH+ONegL0A3E--