Subject: Wart in tcp_output(), IPv6-related?
To: None <tech-net@netbsd.org>
From: Jason R Thorpe <thorpej@wasabisystems.com>
List: tech-net
Date: 04/26/2002 16:22:47
--96YOpH+ONegL0A3E
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
When the KAME IPv6 code came in, we grew a wart on tcp_output()
(the snippet of code is in tcp_build_datapkt() in -current, so that it can
be profiled easily). Specifically, when the outgoing TCP segment is being
built, the code checks to see if the packet is larger than a single mbuf
and smaller than a cluster, and if so, allocates a cluster for the packet.
This means that, for the vast majority of cases, we are now going to
pay for a data copy, since a TCP segment will always fit into a cluster
for a 1500 byte MTU.
By removing this chunk of code (patch with instrumentation attached),
we fall into the case that does m_copy(), which will simply do a
reference to M_EXT mbufs rather than copying the data. If an application
is doing large writes, the socket buffer is going to have M_EXT mbufs in
it, therefore we go from "almost always copy" to "almost never copy",
which is precisely what we want. This results in 1MB/s better throughput
with ttcp, and will get us pretty damn close to zero-copy once I write a
page-loaning sosend for stream sockets.
Can someone (itojun?) explain to me why the MCLGET() was added to
tcp_output() in the first place?
--
-- Jason R. Thorpe <thorpej@wasabisystems.com>
--96YOpH+ONegL0A3E
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=tcp-patch
Index: tcp_output.c
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/tcp_output.c,v
retrieving revision 1.78
diff -u -r1.78 tcp_output.c
--- tcp_output.c 2002/03/01 22:54:09 1.78
+++ tcp_output.c 2002/04/26 23:18:04
@@ -1,3 +1,4 @@
+#define TCP_OUTPUT_COUNTERS
/* $NetBSD: tcp_output.c,v 1.78 2002/03/01 22:54:09 thorpej Exp $ */
/*
@@ -200,6 +201,20 @@
int tcp_cwm = 1;
int tcp_cwm_burstsize = 4;
+#ifdef TCP_OUTPUT_COUNTERS
+#include <sys/device.h>
+
+extern struct evcnt tcp_output_copysmall;
+extern struct evcnt tcp_output_copybig;
+extern struct evcnt tcp_output_refbig;
+
+#define TCP_OUTPUT_COUNTER_INCR(ev) (ev)->ev_count++
+#else
+
+#define TCP_OUTPUT_COUNTER_INCR(ev) /* nothing */
+
+#endif /* TCP_OUTPUT_COUNTERS */
+
static
#ifndef GPROF
__inline
@@ -384,15 +399,6 @@
m->m_data -= hdrlen;
#else
MGETHDR(m, M_DONTWAIT, MT_HEADER);
- if (m != NULL &&
- (max_linkhdr + hdrlen > MHLEN ||
- max_linkhdr + hdrlen + len <= MCLBYTES)) {
- MCLGET(m, M_DONTWAIT);
- if ((m->m_flags & M_EXT) == 0) {
- m_freem(m);
- m = NULL;
- }
- }
if (m == NULL)
return (ENOBUFS);
m->m_data += max_linkhdr;
@@ -401,12 +407,19 @@
m_copydata(so->so_snd.sb_mb, off, (int) len,
mtod(m, caddr_t) + hdrlen);
m->m_len += len;
+ TCP_OUTPUT_COUNTER_INCR(&tcp_output_copysmall);
} else {
m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
if (m->m_next == NULL) {
m_freem(m);
return (ENOBUFS);
}
+#ifdef TCP_OUTPUT_COUNTERS
+ if (m->m_next->m_flags & M_EXT)
+ TCP_OUTPUT_COUNTER_INCR(&tcp_output_refbig);
+ else
+ TCP_OUTPUT_COUNTER_INCR(&tcp_output_copybig);
+#endif /* TCP_OUTPUT_COUNTERS */
}
#endif
Index: tcp_subr.c
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/tcp_subr.c,v
retrieving revision 1.124
diff -u -r1.124 tcp_subr.c
--- tcp_subr.c 2002/03/15 09:25:41 1.124
+++ tcp_subr.c 2002/04/26 23:18:06
@@ -1,3 +1,4 @@
+#define TCP_OUTPUT_COUNTERS
/* $NetBSD: tcp_subr.c,v 1.124 2002/03/15 09:25:41 itojun Exp $ */
/*
@@ -227,6 +228,17 @@
NULL, "tcp", "swcsum");
#endif /* TCP_CSUM_COUNTERS */
+#ifdef TCP_OUTPUT_COUNTERS
+#include <sys/device.h>
+
+struct evcnt tcp_output_copysmall = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
+ NULL, "tcp", "output copy small");
+struct evcnt tcp_output_copybig = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
+ NULL, "tcp", "output copy big");
+struct evcnt tcp_output_refbig = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
+ NULL, "tcp", "output reference big");
+#endif /* TCP_OUTPUT_COUNTERS */
+
/*
* Tcp initialization
*/
@@ -271,6 +283,12 @@
evcnt_attach_static(&tcp_hwcsum_data);
evcnt_attach_static(&tcp_swcsum);
#endif /* TCP_CSUM_COUNTERS */
+
+#ifdef TCP_OUTPUT_COUNTERS
+ evcnt_attach_static(&tcp_output_copysmall);
+ evcnt_attach_static(&tcp_output_copybig);
+ evcnt_attach_static(&tcp_output_refbig);
+#endif /* TCP_OUTPUT_COUNTERS */
}
/*
--96YOpH+ONegL0A3E--