Subject: Splitting ip{,6}_output
To: None <tech-net@netbsd.org>
From: DEGROOTE Arnaud <degroote@NetBSD.org>
List: tech-net
Date: 03/02/2007 22:46:04
--+QahgC5+KEYLbs62
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

In order to better integrate the fast_ipsec with our ipv{4,6} processing,
we need to make some changes to the current way to deal in the output
processing. Currently, we do something like that

ip6_output calls ipsec6_process_packet which process the ipsec
transformation on an asynchronous way. When it has finished,
ipsec_process_done is called and the packet is reinjected in ip6_output
with dummy arguments.

There is two problems here :
    - we lose the current argument of ip6_output ( all the options in
	  particulary )
    - we process some things that we already have processed on the first
	 pass

The situation is quite the same on the v4 side, maybe worse because when we
call ipsec4_process_packet, we have already process most of the ip_output
function.

The solution I have previously proposed for the v6 side, during the SoC is
to split the stack in two ip6_output and ip6_output2, and the
ipsec_process_done will callback in the ip6 output code with ip6_output2.

I propose to do the same thing for the netinet stack. I don't expect
any performance decrease. 
		     
You can find the both patchs attached.  
		      
Can people comment it ? I'm interested too if you have any better ideas.
If nobody has strong objections, I will commit it next week.

Take cares

--
Arnaud Degroote

--+QahgC5+KEYLbs62
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="ip6_output.c.diff"

Index: ip6_output.c
===================================================================
RCS file: /cvsroot/src/sys/netinet6/ip6_output.c,v
retrieving revision 1.116
diff -u -r1.116 ip6_output.c
--- ip6_output.c	21 Feb 2007 23:00:08 -0000	1.116
+++ ip6_output.c	2 Mar 2007 22:37:49 -0000
@@ -141,6 +141,10 @@
     const struct in6_addr *, u_long *, int *);
 static int copypktopts __P((struct ip6_pktopts *, struct ip6_pktopts *, int));
 
+static int ip6_output2 (struct mbuf *, struct ip6_pktopts *, 
+    struct route_in6 *, int, struct ip6_moptions*, struct socket *, 
+    struct ifnet **, void *, struct ip6_exthdrs *, struct in6_addr *);
+
 #ifdef RFC2292
 static int ip6_pcbopts __P((struct ip6_pktopts **, struct mbuf *,
 	struct socket *));
@@ -173,23 +177,12 @@
     struct ifnet **ifpp		/* XXX: just for statistics */
 )
 {
-	struct ip6_hdr *ip6, *mhip6;
-	struct ifnet *ifp, *origifp;
+	struct ip6_hdr *ip6;
 	struct mbuf *m = m0;
-	int hlen, tlen, len, off;
-	bool tso;
-	struct route_in6 ip6route;
-	struct rtentry *rt = NULL;
-	struct sockaddr_in6 *dst, src_sa, dst_sa;
 	int error = 0;
-	struct in6_ifaddr *ia = NULL;
-	u_long mtu;
-	int alwaysfrag, dontfrag;
-	u_int32_t optlen = 0, plen = 0, unfragpartlen = 0;
+	u_int32_t optlen = 0, plen = 0;
 	struct ip6_exthdrs exthdrs;
-	struct in6_addr finaldst, src0, dst0;
-	u_int32_t zone;
-	struct route_in6 *ro_pmtu = NULL;
+	struct in6_addr finaldst; 
 	int hdrsplit = 0;
 	int needipsec = 0;
 #ifdef IPSEC
@@ -312,7 +305,6 @@
 	if (exthdrs.ip6e_hbh) optlen += exthdrs.ip6e_hbh->m_len;
 	if (exthdrs.ip6e_dest1) optlen += exthdrs.ip6e_dest1->m_len;
 	if (exthdrs.ip6e_rthdr) optlen += exthdrs.ip6e_rthdr->m_len;
-	unfragpartlen = optlen + sizeof(struct ip6_hdr);
 	/* NOTE: we don't add AH/ESP length here. do that later. */
 	if (exthdrs.ip6e_dest2) optlen += exthdrs.ip6e_dest2->m_len;
 
@@ -564,18 +556,6 @@
 
 	ip6stat.ip6s_localout++;
 
-	/*
-	 * Route packet.
-	 */
-	/* initialize cached route */
-	if (ro == NULL) {
-		memset(&ip6route, 0, sizeof(ip6route));
-		ro = &ip6route;
-	}
-	ro_pmtu = ro;
-	if (opt && opt->ip6po_rthdr)
-		ro = &opt->ip6po_route;
-	dst = (struct sockaddr_in6 *)&ro->ro_dst;
 
  	/*
 	 * if specified, try to fill in the traffic class field.
@@ -603,8 +583,105 @@
 			ip6->ip6_hlim = ip6_defmcasthlim;
 	}
 
+#ifdef FAST_IPSEC
+	if (needipsec) {
+		s = splsoftnet();
+		error = ipsec6_process_packet(m,sp->req);
+
+		/*
+		 * Preserve KAME behaviour: ENOENT can be returned
+		 * when an SA acquire is in progress.  Don't propagate
+		 * this to user-level; it confuses applications.
+		 * XXX this will go away when the SADB is redone.
+		 */
+		if (error == ENOENT)
+			error = 0;
+		splx(s);
+		goto done;
+    }
+#endif /* FAST_IPSEC */    
+
+#ifdef IPSEC
+	if (needipsectun) 
+		error = ip6_output2(m, opt, ro, flags, im6o, so,
+							ifpp, sp, &exthdrs, &finaldst);
+	else
+#endif
+		error = ip6_output2(m, opt, ro, flags, im6o, so, 
+							ifpp, NULL, &exthdrs, &finaldst);
+
+done:
+#ifdef IPSEC
+	if (sp != NULL)
+		key_freesp(sp);
+#endif /* IPSEC */
+#ifdef FAST_IPSEC
+	if (sp != NULL)
+		KEY_FREESP(&sp);
+#endif /* FAST_IPSEC */
+
+	return (error);
+
+freehdrs:
+	m_freem(exthdrs.ip6e_hbh);	/* m_freem will check if mbuf is 0 */
+	m_freem(exthdrs.ip6e_dest1);
+	m_freem(exthdrs.ip6e_rthdr);
+	m_freem(exthdrs.ip6e_dest2);
+	/* FALLTHROUGH */
+bad:
+	m_freem(m);
+	goto done;
+}
+
+static int
+ip6_output2(
+	struct mbuf *m0,
+	struct ip6_pktopts *opt,
+	struct route_in6 *ro,
+	int flags,
+	struct ip6_moptions *im6o,
+	struct socket *so,
+	struct ifnet **ifpp,		/* XXX: just for statistics */
+	void * spp,
+	struct ip6_exthdrs * exthdrs,
+	struct in6_addr *  finaldst)
+{
+	struct ip6_hdr *ip6, *mhip6;
+	struct ifnet *ifp, *origifp;
+	struct mbuf *m = m0;
+	int hlen, tlen, len, off;
+	bool tso;
+	struct route_in6 ip6route;
+	struct rtentry *rt = NULL;
+	struct sockaddr_in6 *dst, src_sa, dst_sa;
+	int error = 0;
+	struct in6_ifaddr *ia = NULL;
+	u_long mtu;
+	int alwaysfrag, dontfrag;
+	u_int32_t unfragpartlen = 0;
+	struct in6_addr  src0, dst0;
+	u_int32_t zone;
+	struct route_in6 *ro_pmtu = NULL;
+#ifdef IPSEC
+	struct secpolicy * sp = spp;
+#endif
+	
+
+	/*
+	 * Route packet.
+	 */
+	/* initialize cached route */
+	if (ro == NULL) {
+		memset(&ip6route, 0, sizeof(ip6route));
+		ro = &ip6route;
+	}
+	ro_pmtu = ro;
+	if (opt && opt->ip6po_rthdr)
+		ro = &opt->ip6po_route;
+	dst = (struct sockaddr_in6 *)&ro->ro_dst;
+
 #ifdef IPSEC
-	if (needipsec && needipsectun) {
+	if (sp) {
 		struct ipsec_output_state state;
 
 		/*
@@ -615,8 +692,8 @@
 		 *
 		 * IPv6 [ESP|AH] IPv6 [extension headers] payload
 		 */
-		bzero(&exthdrs, sizeof(exthdrs));
-		exthdrs.ip6e_ip6 = m;
+		bzero(exthdrs, sizeof(*exthdrs));
+		exthdrs->ip6e_ip6 = m;
 
 		bzero(&state, sizeof(state));
 		state.m = m;
@@ -650,28 +727,9 @@
 			goto bad;
 		}
 
-		exthdrs.ip6e_ip6 = m;
+		exthdrs->ip6e_ip6 = m;
 	}
 #endif /* IPSEC */
-#ifdef FAST_IPSEC
-	if (needipsec) {
-		s = splsoftnet();
-		error = ipsec6_process_packet(m,sp->req);
-
-		/*
-		 * Preserve KAME behaviour: ENOENT can be returned
-		 * when an SA acquire is in progress.  Don't propagate
-		 * this to user-level; it confuses applications.
-		 * XXX this will go away when the SADB is redone.
-		 */
-		if (error == ENOENT)
-			error = 0;
-		splx(s);
-		goto done;
-    }
-#endif /* FAST_IPSEC */    
-
-
 
 	/* adjust pointer */
 	ip6 = mtod(m, struct ip6_hdr *);
@@ -842,11 +900,11 @@
 		*ifpp = ifp;
 
 	/* Determine path MTU. */
-	if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, &finaldst, &mtu,
+	if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, finaldst, &mtu,
 	    &alwaysfrag)) != 0)
 		goto bad;
 #ifdef IPSEC
-	if (needipsectun)
+	if (sp)
 		mtu = IPV6_MMTU;
 #endif
 
@@ -886,8 +944,8 @@
 	 * it must be examined and processed even by the source node.
 	 * (RFC 2460, section 4.)
 	 */
-	if (exthdrs.ip6e_hbh) {
-		struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *);
+	if (exthdrs->ip6e_hbh) {
+		struct ip6_hbh *hbh = mtod(exthdrs->ip6e_hbh, struct ip6_hbh *);
 		u_int32_t dummy1; /* XXX unused */
 		u_int32_t dummy2; /* XXX unused */
 
@@ -1038,6 +1096,11 @@
 		u_int32_t mtu32;
 #endif
 
+		unfragpartlen = sizeof(struct ip6_hdr);
+		if (exthdrs->ip6e_hbh) unfragpartlen += exthdrs->ip6e_hbh->m_len;
+		if (exthdrs->ip6e_dest1) unfragpartlen += exthdrs->ip6e_dest1->m_len;
+		if (exthdrs->ip6e_rthdr) unfragpartlen += exthdrs->ip6e_rthdr->m_len;
+
 		/*
 		 * Too large for the destination or interface;
 		 * fragment if possible.
@@ -1079,15 +1142,15 @@
 		 * Change the next header field of the last header in the
 		 * unfragmentable part.
 		 */
-		if (exthdrs.ip6e_rthdr) {
-			nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *);
-			*mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
-		} else if (exthdrs.ip6e_dest1) {
-			nextproto = *mtod(exthdrs.ip6e_dest1, u_char *);
-			*mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
-		} else if (exthdrs.ip6e_hbh) {
-			nextproto = *mtod(exthdrs.ip6e_hbh, u_char *);
-			*mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
+		if (exthdrs->ip6e_rthdr) {
+			nextproto = *mtod(exthdrs->ip6e_rthdr, u_char *);
+			*mtod(exthdrs->ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
+		} else if (exthdrs->ip6e_dest1) {
+			nextproto = *mtod(exthdrs->ip6e_dest1, u_char *);
+			*mtod(exthdrs->ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
+		} else if (exthdrs->ip6e_hbh) {
+			nextproto = *mtod(exthdrs->ip6e_hbh, u_char *);
+			*mtod(exthdrs->ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
 		} else {
 			nextproto = ip6->ip6_nxt;
 			ip6->ip6_nxt = IPPROTO_FRAGMENT;
@@ -1192,34 +1255,17 @@
 	if (error == 0)
 		ip6stat.ip6s_fragmented++;
 
+bad:
+	m_freem(m);
 done:
+
 	/* XXX Second if is invariant? */
 	if (ro == &ip6route)
 		rtcache_free((struct route *)ro);
 	else if (ro_pmtu == &ip6route)
 		rtcache_free((struct route *)ro_pmtu);
 
-#ifdef IPSEC
-	if (sp != NULL)
-		key_freesp(sp);
-#endif /* IPSEC */
-#ifdef FAST_IPSEC
-	if (sp != NULL)
-		KEY_FREESP(&sp);
-#endif /* FAST_IPSEC */
-
-
-	return (error);
-
-freehdrs:
-	m_freem(exthdrs.ip6e_hbh);	/* m_freem will check if mbuf is 0 */
-	m_freem(exthdrs.ip6e_dest1);
-	m_freem(exthdrs.ip6e_rthdr);
-	m_freem(exthdrs.ip6e_dest2);
-	/* FALLTHROUGH */
-bad:
-	m_freem(m);
-	goto done;
+	return error;
 }
 
 static int

--+QahgC5+KEYLbs62
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="ip_output.c.diff"

Index: ip_output.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/ip_output.c,v
retrieving revision 1.178
diff -u -r1.178 ip_output.c
--- ip_output.c	17 Feb 2007 22:34:11 -0000	1.178
+++ ip_output.c	28 Feb 2007 22:13:02 -0000
@@ -153,6 +153,20 @@
 #include <netinet/udp.h>
 #endif
 
+struct ip_output_args {
+	struct mbuf * opt;
+	struct ip_moptions * imo;
+	struct socket * so;
+	int flags;
+	int * mtu_p;
+#ifdef IPSEC_NAT_T
+	int natt_frag;
+#endif
+};
+
+static int ip_output2(struct mbuf *, struct ifnet *, struct sockaddr_in *,
+					  struct route *, u_long, struct ip_output_args *);
+
 static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
 static struct ifnet *ip_multicast_if(struct in_addr *, int *);
 static void ip_mloopback(struct ifnet *, struct mbuf *, struct sockaddr_in *);
@@ -190,12 +204,13 @@
 	struct ifaddr *xifa;
 	struct mbuf *opt;
 	struct route *ro;
-	int flags, sw_csum;
+	int flags;
 	int *mtu_p;
 	u_long mtu;
 	struct ip_moptions *imo;
 	struct socket *so;
 	va_list ap;
+	struct ip_output_args args;
 #ifdef IPSEC_NAT_T
 	int natt_frag = 0;
 #endif
@@ -209,7 +224,6 @@
 	struct tdb_ident *tdbi;
 	int s;
 #endif
-	u_int16_t ip_len;
 
 	len = 0;
 	va_start(ap, m0);
@@ -281,10 +295,9 @@
 		}
 		ip->ip_hl = hlen >> 2;
 		ipstat.ips_localout++;
-	} else {
-		hlen = ip->ip_hl << 2;
 	}
-	/*
+
+ 	/*
 	 * Route packet.
 	 */
 	memset(&iproute, 0, sizeof(iproute));
@@ -507,9 +520,6 @@
 	    (ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) == 0)
 		ip->ip_off |= htons(IP_DF);
 
-	/* Remember the current ip_len */
-	ip_len = ntohs(ip->ip_len);
-
 #ifdef IPSEC
 	/* get SP for this packet */
 	if (so == NULL)
@@ -639,11 +649,6 @@
 		goto bad;
 	}
 
-	/* be sure to update variables that are affected by ipsec4_output() */
-	ip = mtod(m, struct ip *);
-	hlen = ip->ip_hl << 2;
-	ip_len = ntohs(ip->ip_len);
-
 	if (ro->ro_rt == NULL) {
 		if ((flags & IP_ROUTETOIF) == 0) {
 			printf("ip_output: "
@@ -806,6 +811,49 @@
 	}
 spd_done:
 #endif /* FAST_IPSEC */
+	
+	args.opt = opt;
+	args.imo = imo;
+	args.so = so;
+	args.flags = flags;
+	args.mtu_p = mtu_p;
+#ifdef IPSEC_NAT_T
+	args.natt_frag = natt_frag;
+#endif
+
+	error = ip_output2(m,ifp,dst,ro,mtu,&args);
+
+done:
+	rtcache_free(&iproute);
+
+#ifdef IPSEC
+	if (sp != NULL) {
+		KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
+			printf("DP ip_output call free SP:%p\n", sp));
+		key_freesp(sp);
+	}
+#endif /* IPSEC */
+#ifdef FAST_IPSEC
+	if (sp != NULL)
+		KEY_FREESP(&sp);
+#endif /* FAST_IPSEC */
+
+	return (error);
+bad:
+	m_freem(m);
+	goto done;
+}
+
+static int
+ip_output2(struct mbuf * m, struct ifnet * ifp, struct sockaddr_in * dst,
+		   struct route * ro, u_long mtu, struct ip_output_args * args)
+{
+	struct ip * ip;
+	struct mbuf * m0;
+	int hlen;
+	int error;
+	int sw_csum;
+	uint16_t ip_len;
 
 #ifdef PFIL_HOOKS
 	/*
@@ -815,11 +863,11 @@
 		goto done;
 	if (m == NULL)
 		goto done;
+#endif /* PFIL_HOOKS */
 
 	ip = mtod(m, struct ip *);
 	hlen = ip->ip_hl << 2;
 	ip_len = ntohs(ip->ip_len);
-#endif /* PFIL_HOOKS */
 
 	m->m_pkthdr.csum_data |= hlen << 16;
 
@@ -911,8 +959,8 @@
 	 * Must be able to put at least 8 bytes per fragment.
 	 */
 	if (ntohs(ip->ip_off) & IP_DF) {
-		if (flags & IP_RETURNMTU)
-			*mtu_p = mtu;
+		if (args->flags & IP_RETURNMTU)
+			*(args->mtu_p) = mtu;
 		error = EMSGSIZE;
 		ipstat.ips_cantfrag++;
 		goto bad;
@@ -945,9 +993,9 @@
 			 * fragmented, re-inject it in ip_output so that IPsec
 			 * processing can occur.
 			 */
-			if (natt_frag) {
-				error = ip_output(m, opt,
-				    ro, flags, imo, so, mtu_p);
+			if (args->natt_frag) {
+				error = ip_output(m, args->opt,
+				    ro, args->flags, args->imo, args->so, args->mtu_p);
 			} else
 #endif /* IPSEC_NAT_T */
 			{
@@ -962,25 +1010,12 @@
 
 	if (error == 0)
 		ipstat.ips_fragmented++;
-done:
-	rtcache_free(&iproute);
-
-#ifdef IPSEC
-	if (sp != NULL) {
-		KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
-			printf("DP ip_output call free SP:%p\n", sp));
-		key_freesp(sp);
-	}
-#endif /* IPSEC */
-#ifdef FAST_IPSEC
-	if (sp != NULL)
-		KEY_FREESP(&sp);
-#endif /* FAST_IPSEC */
 
-	return (error);
 bad:
-	m_freem(m);
-	goto done;
+	m_free(m);
+
+done:
+	return error;
 }
 
 int

--+QahgC5+KEYLbs62--