Subject: Patch for limiting TCP MSS (i.e. for new PPPoE)
To: None <current-users@netbsd.org>
From: Rick Byers <rb-netbsd@BigScaryChildren.net>
List: tech-net
Date: 12/02/2001 14:37:11
Hi,
In order to work around buggy networks suffering from the PMTU blackhole
problem (see RFC 2923), I've written up a quick patch which adds a sysctl
to limit the advertised TCP MSS (I think this is preferable to lowering
the interface MTU).  Ideally, this could be configured per interface or
per route, or even auto-detected on a host-by-host basis - but all of
those options require much more work.

This should be most useful for machines behind a NetBSD box using the new
PPPoE implementation (which doesn't have any built-in support for TCP MSS
clamping like most other PPPoE software does).

The best way to work around this problem is debatable.  Ideally
we could just convince people to fix their networks, but that's not so
easy (I've spent over a month trying to convince the Bank of Montreal -
www.bmo.com - that they have a problem that needs to be fixed).  Anyway,
this is one possible workaround, and I post it here in the hopes that
others will find it useful.  If this is committed, it will close
kern/10943.

Rick

Patch against NetBSD-1.5Y
Index: sys/netinet/tcp_var.h
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/tcp_var.h,v
retrieving revision 1.87
diff -c -r1.87 tcp_var.h
*** tcp_var.h	2001/09/11 21:03:21	1.87
--- tcp_var.h	2001/12/02 19:06:00
***************
*** 559,565 ****
  #endif
  #define	TCPCTL_RSTPPSLIMIT	24	/* RST pps limit */
  #define	TCPCTL_DELACK_TICKS	25	/* # ticks to delay ACK */
! #define	TCPCTL_MAXID		26

  #define	TCPCTL_NAMES { \
  	{ 0, 0 }, \
--- 559,566 ----
  #endif
  #define	TCPCTL_RSTPPSLIMIT	24	/* RST pps limit */
  #define	TCPCTL_DELACK_TICKS	25	/* # ticks to delay ACK */
! #define TCPCTL_MSS_MAXADV	26	/* Maximum advertised MSS */
! #define	TCPCTL_MAXID		27

  #define	TCPCTL_NAMES { \
  	{ 0, 0 }, \
***************
*** 588,593 ****
--- 589,595 ----
  	{ 0, 0 }, \
  	{ "rstppslimit", CTLTYPE_INT }, \
  	{ "delack_ticks", CTLTYPE_INT }, \
+ 	{ "mss_maxadv", CTLTYPE_INT }, \
  }

  #ifdef _KERNEL
***************
*** 602,608 ****
  extern	int tcp_do_win_scale;	/* RFC1323 window scaling enabled/disabled? */
  extern	int tcp_do_timestamps;	/* RFC1323 timestamps enabled/disabled? */
  extern	int tcp_do_newreno;	/* Use the New Reno algorithms */
! extern	int tcp_mssdflt;	/* default seg size */
  extern	int tcp_init_win;	/* initial window */
  extern	int tcp_mss_ifmtu;	/* take MSS from interface, not in_maxmtu */
  extern	int tcp_compat_42;	/* work around ancient broken TCP peers */
--- 604,611 ----
  extern	int tcp_do_win_scale;	/* RFC1323 window scaling enabled/disabled? */
  extern	int tcp_do_timestamps;	/* RFC1323 timestamps enabled/disabled? */
  extern	int tcp_do_newreno;	/* Use the New Reno algorithms */
! extern	int tcp_mssdflt;	/* default maximum seg size */
! extern	int tcp_mss_maxadv;	/* maximum advertised mss */
  extern	int tcp_init_win;	/* initial window */
  extern	int tcp_mss_ifmtu;	/* take MSS from interface, not in_maxmtu */
  extern	int tcp_compat_42;	/* work around ancient broken TCP peers */
***************
*** 646,651 ****
--- 649,655 ----
  	{ 0 },					\
  	{ 1, 0, &tcp_rst_ppslim },		\
  	{ 1, 0, &tcp_delack_ticks },		\
+ 	{ 1, 0, &tcp_mss_maxadv },		\
  }

  int	 tcp_attach __P((struct socket *));
Index: sys/netinet/tcp_subr.c
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/tcp_subr.c,v
retrieving revision 1.122
diff -c -r1.122 tcp_subr.c
*** tcp_subr.c	2001/11/13 00:32:41	1.122
--- tcp_subr.c	2001/12/02 19:06:04
***************
*** 165,170 ****
--- 165,171 ----

  /* patchable/settable parameters for tcp */
  int 	tcp_mssdflt = TCP_MSS;
+ int	tcp_mss_maxadv = 0;
  int 	tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
  int	tcp_do_rfc1323 = 1;	/* window scaling / timestamps (obsolete) */
  #if NRND > 0
***************
*** 1573,1579 ****
--- 1574,1588 ----
  	if (mss > hdrsiz)
  		mss -= hdrsiz;

+ 	/* If a maximum MSS is set, don't advertise above it.
+ 	 */
+ 	if( tcp_mss_maxadv > 0 )
+ 	    mss = min(tcp_mss_maxadv, mss);
+
+ 	/* Advertise atleast the default MSS (is there a reason for this?)
+ 	 */
  	mss = max(tcp_mssdflt, mss);
+
  	return (mss);
  }

Index: sbin/sysctl/sysctl.8
===================================================================
RCS file: /cvsroot/basesrc/sbin/sysctl/sysctl.8,v
retrieving revision 1.70
diff -c -r1.70 sysctl.8
*** sysctl.8	2001/10/30 07:28:22	1.70
--- sysctl.8	2001/12/02 19:06:05
***************
*** 252,257 ****
--- 252,258 ----
  .It net.inet.tcp.keepintvl	integer	yes
  .It net.inet.tcp.log_refused	integer	yes
  .It net.inet.tcp.mss_ifmtu	integer	yes
+ .It net.inet.tcp.mss_maxadv	integer	yes
  .It net.inet.tcp.mssdflt	integer	yes
  .It net.inet.tcp.recvspace	integer	yes
  .It net.inet.tcp.rfc1323	integer	yes
Index: lib/libc/gen/sysctl.3
===================================================================
RCS file: /cvsroot/basesrc/lib/libc/gen/sysctl.3,v
retrieving revision 1.80
diff -c -r1.80 sysctl.3
*** sysctl.3	2001/10/30 06:43:21	1.80
--- sysctl.3	2001/12/02 19:06:08
***************
*** 705,710 ****
--- 705,711 ----
  .It tcp	newreno	integer	yes
  .It tcp	log_refused	integer	yes
  .It tcp	rstppslimit	integer	yes
+ .It tcp	mss_maxadv	integer	yes
  .It udp	checksum	integer	yes
  .It udp	sendspace	integer	yes
  .It udp	recvspace	integer	yes
***************
*** 796,801 ****
--- 797,808 ----
  and to use when the peer does not advertize a maximum segment size to
  us during connection setup.  Do not change this value unless you really
  know what you are doing.
+ .It Li tcp.mss_maxadv
+ Returns the maximum TCP MSS that will be advertized to peers, or 0 if there
+ is no limit.  Usually there is no need to change this setting, but it can
+ be useful for working around buggy networks which suffer from the PMTU
+ blackhole problem.  This setting does not affect the segment size for
+ outgoing packets.
  .It Li tcp.syn_cache_limit
  Returns the maximum number of entries allowed in the TCP compressed state
  engine.