Subject: Re: Patch to make upper-layer network protocols align as needed
To: Jonathan Stone <jonathan@DSG.Stanford.EDU>
From: Jason R Thorpe <thorpej@wasabisystems.com>
List: tech-net
Date: 06/30/2002 15:22:46
--azLHFNyN32YCQGCU
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

On Mon, May 13, 2002 at 12:13:23PM -0700, Jonathan Stone wrote:

 > The idea is excellent. The name, however, er, sucks :).
 > It doesn't fit the style of the existing mbuf code at all.
 > (to me  sounds rather   Linux-like).
 > One alternative that (imho) fits a lot better is m_copyup().

Ok, I have renamed it to m_copyup().

I am committing the following diff.  The new code path will not be
used unless individual device drivers stop doing the alignment fixup
themselves, so the change is effectively a noop until those device
drivers are changed.

This change was good for an additional 30Mb/s or so on a DP83820 Gig-E
interface in a 533MHz Alpha.

-- 
        -- Jason R. Thorpe <thorpej@wasabisystems.com>

--azLHFNyN32YCQGCU
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=copyup-patch

Index: kern/uipc_mbuf.c
===================================================================
RCS file: /cvsroot/syssrc/sys/kern/uipc_mbuf.c,v
retrieving revision 1.59
diff -c -r1.59 uipc_mbuf.c
*** kern/uipc_mbuf.c	2002/03/09 01:46:33	1.59
--- kern/uipc_mbuf.c	2002/06/30 22:13:28
***************
*** 729,734 ****
--- 729,784 ----
  }
  
  /*
+  * Like m_pullup(), except a new mbuf is always allocated, and we allow
+  * the amount of empty space before the data in the new mbuf to be specified
+  * (in the event that the caller expects to prepend later).
+  */
+ int MSFail;
+ 
+ struct mbuf *
+ m_copyup(struct mbuf *n, int len, int dstoff)
+ {
+ 	struct mbuf *m;
+ 	int count, space;
+ 
+ 	if (len > (MHLEN - dstoff))
+ 		goto bad;
+ 	MGET(m, M_DONTWAIT, n->m_type);
+ 	if (m == NULL)
+ 		goto bad;
+ 	m->m_len = 0;
+ 	if (n->m_flags & M_PKTHDR) {
+ 		M_COPY_PKTHDR(m, n);
+ 		n->m_flags &= ~M_PKTHDR;
+ 	}
+ 	m->m_data += dstoff;
+ 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
+ 	do {
+ 		count = min(min(max(len, max_protohdr), space), n->m_len);
+ 		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
+ 		    (unsigned)count);
+ 		len -= count;
+ 		m->m_len += count;
+ 		n->m_len -= count;
+ 		space -= count;
+ 		if (n->m_len)
+ 			n->m_data += count;
+ 		else
+ 			n = m_free(n);
+ 	} while (len > 0 && n);
+ 	if (len > 0) {
+ 		(void) m_free(m);
+ 		goto bad;
+ 	}
+ 	m->m_next = n;
+ 	return (m);
+  bad:
+ 	m_freem(n);
+ 	MSFail++;
+ 	return (NULL);
+ }
+ 
+ /*
   * Partition an mbuf chain in two pieces, returning the tail --
   * all but the first len0 bytes.  In case of failure, it returns NULL and
   * attempts to restore the chain to its original state.
Index: netinet/icmp_var.h
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/icmp_var.h,v
retrieving revision 1.20
diff -c -r1.20 icmp_var.h
*** netinet/icmp_var.h	2002/06/09 16:33:37	1.20
--- netinet/icmp_var.h	2002/06/30 22:13:28
***************
*** 83,88 ****
--- 83,94 ----
  
  #ifdef _KERNEL
  struct	icmpstat icmpstat;
+ 
+ #ifdef __NO_STRICT_ALIGNMENT
+ #define	ICMP_HDR_ALIGNED_P(ic)	1
+ #else
+ #define	ICMP_HDR_ALIGNED_P(ic)	((((vaddr_t) (ic)) & 3) == 0)
  #endif
+ #endif /* _KERNEL_ */
  
  #endif /* _NETINET_ICMP_VAR_H_ */
Index: netinet/igmp.c
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/igmp.c,v
retrieving revision 1.29
diff -c -r1.29 igmp.c
*** netinet/igmp.c	2002/06/09 16:33:37	1.29
--- netinet/igmp.c	2002/06/30 22:13:29
***************
*** 180,185 ****
--- 180,186 ----
  	m->m_data += iphlen;
  	m->m_len -= iphlen;
  	igmp = mtod(m, struct igmp *);
+ 	/* No need to assert alignment here. */
  	if (in_cksum(m, ip->ip_len - iphlen)) {
  		++igmpstat.igps_rcv_badsum;
  		m_freem(m);
Index: netinet/igmp_var.h
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/igmp_var.h,v
retrieving revision 1.13
diff -c -r1.13 igmp_var.h
*** netinet/igmp_var.h	2002/05/29 01:33:45	1.13
--- netinet/igmp_var.h	2002/06/30 22:13:29
***************
*** 75,80 ****
--- 75,86 ----
   */
  #define	IGMP_RANDOM_DELAY(X)	(arc4random() % (X) + 1)
  
+ #ifdef __NO_STRICT_ALIGNMENT
+ #define	IGMP_HDR_ALIGNED_P(ig)	1
+ #else
+ #define	IGMP_HDR_ALIGNED_P(ig)	((((vaddr_t) (ig)) & 3) == 0)
+ #endif
+ 
  void	igmp_init __P((void));
  void	igmp_input __P((struct mbuf *, ...));
  void	igmp_joingroup __P((struct in_multi *));
Index: netinet/ip_flow.c
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/ip_flow.c,v
retrieving revision 1.24
diff -c -r1.24 ip_flow.c
*** netinet/ip_flow.c	2002/06/09 16:33:40	1.24
--- netinet/ip_flow.c	2002/06/30 22:13:30
***************
*** 143,149 ****
  ipflow_fastforward(
  	struct mbuf *m)
  {
! 	struct ip *ip;
  	struct ipflow *ipf;
  	struct rtentry *rt;
  	struct sockaddr *dst;
--- 143,149 ----
  ipflow_fastforward(
  	struct mbuf *m)
  {
! 	struct ip *ip, ip_store;
  	struct ipflow *ipf;
  	struct rtentry *rt;
  	struct sockaddr *dst;
***************
*** 166,172 ****
  	/*
  	 * IP header with no option and valid version and length
  	 */
! 	ip = mtod(m, struct ip *);
  	iplen = ntohs(ip->ip_len);
  	if (ip->ip_v != IPVERSION || ip->ip_hl != (sizeof(struct ip) >> 2) ||
  	    iplen < sizeof(struct ip) || iplen > m->m_pkthdr.len)
--- 166,177 ----
  	/*
  	 * IP header with no option and valid version and length
  	 */
! 	if (IP_HDR_ALIGNED_P(mtod(m, caddr_t)))
! 		ip = mtod(m, struct ip *);
! 	else {
! 		memcpy(&ip_store, mtod(m, caddr_t), sizeof(ip_store));
! 		ip = &ip_store;
! 	}
  	iplen = ntohs(ip->ip_len);
  	if (ip->ip_v != IPVERSION || ip->ip_hl != (sizeof(struct ip) >> 2) ||
  	    iplen < sizeof(struct ip) || iplen > m->m_pkthdr.len)
***************
*** 232,237 ****
--- 237,248 ----
  		ip->ip_sum -= ~htons(IPTTLDEC << 8);
  	else
  		ip->ip_sum += htons(IPTTLDEC << 8);
+ 
+ 	/*
+ 	 * Done modifying the header; copy it back, if necessary.
+ 	 */
+ 	if (IP_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0)
+ 		memcpy(mtod(m, caddr_t), &ip_store, sizeof(ip_store));
  
  	/*
  	 * Trim the packet in case it's too long..
Index: netinet/ip_icmp.c
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/ip_icmp.c,v
retrieving revision 1.68
diff -c -r1.68 ip_icmp.c
*** netinet/ip_icmp.c	2002/06/13 16:25:54	1.68
--- netinet/ip_icmp.c	2002/06/30 22:13:30
***************
*** 419,424 ****
--- 419,425 ----
  	m->m_len -= hlen;
  	m->m_data += hlen;
  	icp = mtod(m, struct icmp *);
+ 	/* Don't need to assert alignment, here. */
  	if (in_cksum(m, icmplen)) {
  		icmpstat.icps_checksum++;
  		goto freeit;
Index: netinet/ip_input.c
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/ip_input.c,v
retrieving revision 1.153
diff -c -r1.153 ip_input.c
*** netinet/ip_input.c	2002/06/13 16:25:54	1.153
--- netinet/ip_input.c	2002/06/30 22:13:32
***************
*** 420,429 ****
  	if (TAILQ_FIRST(&in_ifaddr) == 0)
  		goto bad;
  	ipstat.ips_total++;
! 	if (m->m_len < sizeof (struct ip) &&
! 	    (m = m_pullup(m, sizeof (struct ip))) == 0) {
! 		ipstat.ips_toosmall++;
! 		return;
  	}
  	ip = mtod(m, struct ip *);
  	if (ip->ip_v != IPVERSION) {
--- 420,443 ----
  	if (TAILQ_FIRST(&in_ifaddr) == 0)
  		goto bad;
  	ipstat.ips_total++;
! 	/*
! 	 * If the IP header is not aligned, slurp it up into a new
! 	 * mbuf with space for link headers, in the event we forward
! 	 * it.  Otherwise, if it is aligned, make sure the entire
! 	 * base IP header is in the first mbuf of the chain.
! 	 */
! 	if (IP_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) {
! 		if ((m = m_copyup(m, sizeof(struct ip),
! 				  (max_linkhdr + 3) & ~3)) == NULL) {
! 			/* XXXJRT new stat, please */
! 			ipstat.ips_toosmall++;
! 			return;
! 		}
! 	} else if (__predict_false(m->m_len < sizeof (struct ip))) {
! 		if ((m = m_pullup(m, sizeof (struct ip))) == NULL) {
! 			ipstat.ips_toosmall++;
! 			return;
! 		}
  	}
  	ip = mtod(m, struct ip *);
  	if (ip->ip_v != IPVERSION) {
Index: netinet/ip_var.h
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/ip_var.h,v
retrieving revision 1.47
diff -c -r1.47 ip_var.h
*** netinet/ip_var.h	2002/05/07 02:59:38	1.47
--- netinet/ip_var.h	2002/06/30 22:13:32
***************
*** 183,188 ****
--- 183,194 ----
  #define	IP_ALLOWBROADCAST	SO_BROADCAST	/* can send broadcast packets */
  #define	IP_MTUDISC		0x0400		/* Path MTU Discovery; set DF */
  
+ #ifdef __NO_STRICT_ALIGNMENT
+ #define	IP_HDR_ALIGNED_P(ip)	1
+ #else
+ #define	IP_HDR_ALIGNED_P(ip)	((((vaddr_t) (ip)) & 3) == 0)
+ #endif
+ 
  extern struct ipstat ipstat;		/* ip statistics */
  extern LIST_HEAD(ipqhead, ipq) ipq;	/* ip reass. queue */
  extern u_int16_t ip_id;			/* ip packet ctr, for ids */
Index: netinet/tcp_input.c
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/tcp_input.c,v
retrieving revision 1.145
diff -c -r1.145 tcp_input.c
*** netinet/tcp_input.c	2002/06/29 04:13:21	1.145
--- netinet/tcp_input.c	2002/06/30 22:13:36
***************
*** 921,926 ****
--- 921,928 ----
  		return;
  	}
  
+ 	KASSERT(TCP_HDR_ALIGNED_P(th));
+ 
  	/*
  	 * Check that TCP offset makes sense,
  	 * pull out TCP options and adjust length.		XXX
***************
*** 971,976 ****
--- 973,979 ----
  		 * (as they're before toff) and we don't need to update those.
  		 */
  #endif
+ 		KASSERT(TCP_HDR_ALIGNED_P(th));
  		optlen = off - sizeof (struct tcphdr);
  		optp = ((caddr_t)th) + sizeof(struct tcphdr);
  		/*
Index: netinet/tcp_var.h
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/tcp_var.h,v
retrieving revision 1.92
diff -c -r1.92 tcp_var.h
*** netinet/tcp_var.h	2002/06/09 16:33:45	1.92
--- netinet/tcp_var.h	2002/06/30 22:13:40
***************
*** 679,684 ****
--- 679,690 ----
  	{ 1, 0, &tcp_delack_ticks },		\
  }
  
+ #ifdef __NO_STRICT_ALIGNMENT
+ #define	TCP_HDR_ALIGNED_P(th)	1
+ #else
+ #define	TCP_HDR_ALIGNED_P(th)	((((vaddr_t) (th)) & 3) == 0)
+ #endif
+ 
  int	 tcp_attach __P((struct socket *));
  void	 tcp_canceltimers __P((struct tcpcb *));
  struct tcpcb *
Index: netinet/udp_usrreq.c
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/udp_usrreq.c,v
retrieving revision 1.94
diff -c -r1.94 udp_usrreq.c
*** netinet/udp_usrreq.c	2002/06/09 16:33:45	1.94
--- netinet/udp_usrreq.c	2002/06/30 22:13:42
***************
*** 262,267 ****
--- 262,268 ----
  		return;
  	}
  #endif
+ 	KASSERT(UDP_HDR_ALIGNED_P(uh));
  
  	/* destination port of 0 is illegal, based on RFC768. */
  	if (uh->uh_dport == 0)
***************
*** 418,423 ****
--- 419,425 ----
  		return IPPROTO_DONE;
  	}
  #endif
+ 	KASSERT(UDP_HDR_ALIGNED_P(uh));
  	ulen = ntohs((u_short)uh->uh_ulen);
  	/*
  	 * RFC2675 section 4: jumbograms will have 0 in the UDP header field,
Index: netinet/udp_var.h
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/udp_var.h,v
retrieving revision 1.18
diff -c -r1.18 udp_var.h
*** netinet/udp_var.h	2002/05/12 20:33:51	1.18
--- netinet/udp_var.h	2002/06/30 22:13:42
***************
*** 88,93 ****
--- 88,99 ----
  extern	struct	inpcbtable udbtable;
  extern	struct	udpstat udpstat;
  
+ #ifdef __NO_STRICT_ALIGNMENT
+ #define	UDP_HDR_ALIGNED_P(uh)	1
+ #else
+ #define	UDP_HDR_ALIGNED_P(uh)	((((vaddr_t) (uh)) & 3) == 0)
+ #endif
+ 
  void	 *udp_ctlinput __P((int, struct sockaddr *, void *));
  void	 udp_init __P((void));
  void	 udp_input __P((struct mbuf *, ...));
Index: netinet6/icmp6.c
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet6/icmp6.c,v
retrieving revision 1.82
diff -c -r1.82 icmp6.c
*** netinet6/icmp6.c	2002/06/09 14:43:11	1.82
--- netinet6/icmp6.c	2002/06/30 22:13:45
***************
*** 469,474 ****
--- 469,475 ----
  		return IPPROTO_DONE;
  	}
  #endif
+ 	KASSERT(IP6_HDR_ALIGNED_P(icmp6));
  	code = icmp6->icmp6_code;
  
  	if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) {
Index: netinet6/ip6_input.c
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet6/ip6_input.c,v
retrieving revision 1.56
diff -c -r1.56 ip6_input.c
*** netinet6/ip6_input.c	2002/06/09 14:43:12	1.56
--- netinet6/ip6_input.c	2002/06/30 22:13:47
***************
*** 269,277 ****
  	IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /*nothing*/);
  #endif
  
! 	if (m->m_len < sizeof(struct ip6_hdr)) {
! 		struct ifnet *inifp;
! 		inifp = m->m_pkthdr.rcvif;
  		if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
  			ip6stat.ip6s_toosmall++;
  			in6_ifstat_inc(inifp, ifs6_in_hdrerr);
--- 269,291 ----
  	IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /*nothing*/);
  #endif
  
! 	/*
! 	 * If the IPv6 header is not aligned, slurp it up into a new
! 	 * mbuf with space for link headers, in the event we forward
! 	 * it.  OTherwise, if it is aligned, make sure the entire base
! 	 * IPv6 header is in the first mbuf of the chain.
! 	 */
! 	if (IP6_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) {
! 		struct ifnet *inifp = m->m_pkthdr.rcvif;
! 		if ((m = m_copyup(m, sizeof(struct ip6_hdr),
! 				  (max_linkhdr + 3) & ~3)) == NULL) {
! 			/* XXXJRT new stat, please */
! 			ip6stat.ip6s_toosmall++;
! 			in6_ifstat_inc(inifp, ifs6_in_hdrerr);
! 			return;
! 		}
! 	} else if (__predict_false(m->m_len < sizeof(struct ip6_hdr))) {
! 		struct ifnet *inifp = m->m_pkthdr.rcvif;
  		if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
  			ip6stat.ip6s_toosmall++;
  			in6_ifstat_inc(inifp, ifs6_in_hdrerr);
***************
*** 625,630 ****
--- 639,645 ----
  			return;
  		}
  #endif
+ 		KASSERT(IP6_HDR_ALIGNED_P(hbh));
  		nxt = hbh->ip6h_nxt;
  
  		/*
***************
*** 789,794 ****
--- 804,810 ----
  		return -1;
  	}
  #endif
+ 	KASSERT(IP6_HDR_ALIGNED_P(hbh));
  	off += hbhlen;
  	hbhlen -= sizeof(struct ip6_hbh);
  	opt = (u_int8_t *)hbh + sizeof(struct ip6_hbh);
***************
*** 1150,1155 ****
--- 1166,1172 ----
  				return;
  			}
  #endif
+ 			KASSERT(IP6_HDR_ALIGNED_P(ip6e));
  
  			switch (nxt) {
  		        case IPPROTO_DSTOPTS:
Index: netinet6/ip6_var.h
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet6/ip6_var.h,v
retrieving revision 1.21
diff -c -r1.21 ip6_var.h
*** netinet6/ip6_var.h	2002/06/08 21:22:33	1.21
--- netinet6/ip6_var.h	2002/06/30 22:13:47
***************
*** 206,211 ****
--- 206,217 ----
  #define	IPV6_FORWARDING		0x02	/* most of IPv6 header exists */
  #define	IPV6_MINMTU		0x04	/* use minimum MTU (IPV6_USE_MIN_MTU) */
  
+ #ifdef __NO_STRICT_ALIGNMENT
+ #define	IP6_HDR_ALIGNED_P(ip)	1
+ #else
+ #define	IP6_HDR_ALIGNED_P(ip)	((((vaddr_t) (ip)) & 3) == 0)
+ #endif
+ 
  extern struct	ip6stat ip6stat;	/* statistics */
  extern u_int32_t ip6_id;		/* fragment identifier */
  extern int	ip6_defhlim;		/* default hop limit */
Index: sys/mbuf.h
===================================================================
RCS file: /cvsroot/syssrc/sys/sys/mbuf.h,v
retrieving revision 1.66
diff -c -r1.66 mbuf.h
*** sys/mbuf.h	2002/06/22 05:37:01	1.66
--- sys/mbuf.h	2002/06/30 22:13:49
***************
*** 651,656 ****
--- 651,657 ----
  struct	mbuf *m_prepend __P((struct mbuf *,int,int));
  struct	mbuf *m_pulldown __P((struct mbuf *, int, int, int *));
  struct	mbuf *m_pullup __P((struct mbuf *, int));
+ struct	mbuf *m_copyup __P((struct mbuf *, int, int));
  struct	mbuf *m_split __P((struct mbuf *,int,int));
  void	m_adj __P((struct mbuf *, int));
  void	m_cat __P((struct mbuf *,struct mbuf *));

--azLHFNyN32YCQGCU--