Subject: EMSGSIZE and mtu's
To: None <tech-net@NetBSD.ORG>
From: Matt Thomas <matt@3am-software.com>
List: tech-net
Date: 10/01/1997 15:39:44
When using raw sockets with IP, one might need to set the DF bit in
the IP header to prevent fragmentation.  However, there's no current
way to return the MTU that should have been used.  This small
diff attempts to fix that.  It defines a new flag bit to ip_output
(IP_RETURNMTU) which, if set, indicates that there exists an extra
"int *" argument to ip_output in which the MTU would be returned if
ip_output returns EMSGSIZE.

I also added an extra getsockopt option, IP_ERRORMTU, which (for raw
sockets only, for now) returns the MTU of the failed send to the user.

	cc = sendto(...);
	if (cc < 0) {
	    if (errno == EMSGSIZE) {
		int mtu;
		int mtulen = sizeof(mtu);
		getsockopt(fd, IPPROTO_IP, IP_ERRORMTU, &mtu, &mtulen);
		...
	    }
	}

Note that the diffs below use if_mtu for now, but when Path MTU discovery
is really added, the path MTU should be returned instead.

I'd like to add this.  Anyone have any comments?  [I'd also like to add
a MSG_ATOMIC flag which could/would control whether packets may/may-not
be fragmented.  An M_ATOMIC mbuf header flag would also be added.]

Cheers,

Index: in_pcb.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/in_pcb.c,v
retrieving revision 1.38
diff -u -r1.38 in_pcb.c
--- in_pcb.c	1997/09/22 21:39:40	1.38
+++ in_pcb.c	1997/10/01 19:18:40
@@ -100,6 +100,7 @@
 	bzero((caddr_t)inp, sizeof(*inp));
 	inp->inp_table = table;
 	inp->inp_socket = so;
+	inp->inp_errormtu = -1;
 	so->so_pcb = inp;
 	s = splnet();
 	CIRCLEQ_INSERT_HEAD(&table->inpt_queue, inp, inp_queue);
Index: in_pcb.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/in_pcb.h,v
retrieving revision 1.21
diff -u -r1.21 in_pcb.h
--- in_pcb.h	1997/09/22 21:39:42	1.21
+++ in_pcb.h	1997/10/01 19:18:41
@@ -58,6 +58,7 @@
 	struct	  ip inp_ip;		/* header prototype; should have more */
 	struct	  mbuf *inp_options;	/* IP options */
 	struct	  ip_moptions *inp_moptions; /* IP multicast options */
+	int	  inp_errormtu;		/* MTU of last xmit status = EMSGSIZE */
 };
 #define	inp_faddr	inp_ip.ip_dst
 #define	inp_laddr	inp_ip.ip_src
Index: ip_output.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/ip_output.c,v
retrieving revision 1.39
diff -u -r1.39 ip_output.c
--- ip_output.c	1997/04/15 00:41:53	1.39
+++ ip_output.c	1997/10/01 19:18:43
@@ -91,6 +91,7 @@
 	struct mbuf *opt;
 	struct route *ro;
 	int flags;
+	int *mtu_p;
 	struct ip_moptions *imo;
 	va_list ap;
 #ifdef PFIL_HOOKS
@@ -104,6 +105,10 @@
 	ro = va_arg(ap, struct route *);
 	flags = va_arg(ap, int);
 	imo = va_arg(ap, struct ip_moptions *);
+	if (flags & IP_RETURNMTU)
+		mtu_p = va_arg(ap, int *);
+	else
+		mtu_p = NULL;
 	va_end(ap);
 
 #ifdef	DIAGNOSTIC
@@ -329,6 +334,8 @@
 	 * Must be able to put at least 8 bytes per fragment.
 	 */
 	if (ip->ip_off & IP_DF) {
+		if (flags & IP_RETURNMTU)
+			*mtu_p = ifp->if_mtu;
 		error = EMSGSIZE;
 		ipstat.ips_cantfrag++;
 		goto bad;
@@ -618,6 +625,7 @@
 		case IP_RECVRETOPTS:
 		case IP_RECVDSTADDR:
 		case IP_RECVIF:
+		case IP_ERRORMTU:
 			*mp = m = m_get(M_WAIT, MT_SOOPTS);
 			m->m_len = sizeof(int);
 			switch (optname) {
@@ -628,6 +636,10 @@
 
 			case IP_TTL:
 				optval = inp->inp_ip.ip_ttl;
+				break;
+
+			case IP_ERRORMTU:
+				optval = inp->inp_errormtu;
 				break;
 
 #define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
Index: in.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/in.h,v
retrieving revision 1.25
diff -u -r1.25 in.h
--- in.h	1997/08/27 17:05:34	1.25
+++ in.h	1997/10/01 19:18:43
@@ -187,6 +187,7 @@
 #define	IP_ADD_MEMBERSHIP	12   /* ip_mreq; add an IP group membership */
 #define	IP_DROP_MEMBERSHIP	13   /* ip_mreq; drop an IP group membership */
 #define	IP_RECVIF		20   /* bool; receive reception if w/dgram */
+#define	IP_ERRORMTU		21   /* int; get MTU of last xmit = EMSGSIZE */
 
 /*
  * Defaults and limits for options
Index: ip_var.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/ip_var.h,v
retrieving revision 1.20
diff -u -r1.20 ip_var.h
--- ip_var.h	1997/06/24 02:26:05	1.20
+++ ip_var.h	1997/10/01 19:18:44
@@ -146,6 +146,7 @@
 /* flags passed to ip_output as last parameter */
 #define	IP_FORWARDING		0x1		/* most of ip header exists */
 #define	IP_RAWOUTPUT		0x2		/* raw ip header exists */
+#define	IP_RETURNMTU		0x4		/* pass back mtu on EMSGSIZE */
 #define	IP_ROUTETOIF		SO_DONTROUTE	/* bypass routing tables */
 #define	IP_ALLOWBROADCAST	SO_BROADCAST	/* can send broadcast packets */
 
Index: raw_ip.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/raw_ip.c,v
retrieving revision 1.36
diff -u -r1.36 raw_ip.c
--- raw_ip.c	1997/01/11 05:21:13	1.36
+++ raw_ip.c	1997/10/01 19:18:44
@@ -181,7 +181,8 @@
 	va_end(ap);
 
 	flags =
-	    (inp->inp_socket->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST;
+	    (inp->inp_socket->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST
+	    | IP_RETURNMTU;
 
 	/*
 	 * If the user handed us a complete IP packet, use it.
@@ -215,7 +216,7 @@
 		flags |= IP_RAWOUTPUT;
 		ipstat.ips_rawout++;
 	}
-	return (ip_output(m, opts, &inp->inp_route, flags, inp->inp_moptions));
+	return (ip_output(m, opts, &inp->inp_route, flags, inp->inp_moptions, &inp->inp_errormtu));
 }
 
 /*


-- 
Matt Thomas               Internet:   matt@3am-software.com
3am Software Foundry      WWW URL:    http://www.3am-software.com/bio/matt.html
Nashua, NH                Disclaimer: I disavow all knowledge of this message