Subject: Re: bpf/pcap performance
To: Guy Harris <guy@alum.mit.edu>
From: Darren Reed <darrenr@reed.wattle.id.au>
List: tech-net
Date: 04/10/2004 21:26:43
In some email I received from Guy Harris, sie wrote:
> On Sat, Apr 10, 2004 at 07:30:15AM +1000, Darren Reed wrote:
> > A merged change of the above plus a copy of FreeBSD's changes from 1.86,
> > adapted for NetBSD are below.  I've not tested them yet beyond compiling
> > them up and making sure the kernel links cleanly :)
> 
> BTW, there's some odd code in "bpfread()":

Thanks, change applied.

> Also, one of the changes imported from FreeBSD:
> 
> > ***************
> > *** 1177,1182 ****
> > --- 1223,1233 ----
> >   	for (m0 = m; m0 != 0; m0 = m0->m_next)
> >   		pktlen += m0->m_len;
> >   
> > + 	if (pktlen == m->m_len) {
> > + 		bpf_tap(arg, mtod(m, u_char *), pktlen);
> > + 		return;
> > + 	}
> > + 
> >   	for (d = bp->bif_dlist; d != 0; d = d->bd_next) {
> >   		++d->bd_rcount;
> >   		slen = bpf_filter(d->bd_filter, (u_char *)m, pktlen, 0);
> 
> might introduce a bug - see FreeBSD PR kern/56441:
> 
> 	In file 'sys/net/bpf.c' have a error introduced in CVS revision
> 	1.95.  This error is critical for the programs with used flag
> 	BIOCGSEESENT.  Locally generated packet may be copied in user
> 	space if flag BIOCGSEESENT set to one.  Function 'bpf_tap' must
> 	be used only for incoming packets.  But function 'bpf_mtap' uses
> 	'bpf_tap'.  It is fast.  But it's wrong.

Hmm.  NetBSD doesn't yet support BIOCGSEESENT, but that's a SMOP.
Unfortunately the ioctl numbers for bpf won't match, so binary
compatibility will need tweaks in freebsd_compat.

Having looked at that change, I think the changes I've included
below encapsulate what the perceived speedup was in FreeBSD (and
so I should commit this over there at some point too) without
recreating the BIOCGSEESENT interaction problem.

Darren

Index: bpf.c
===================================================================
RCS file: /cvsroot/src/sys/net/bpf.c,v
retrieving revision 1.89
diff -c -r1.89 bpf.c
*** bpf.c	22 Jan 2004 00:32:41 -0000	1.89
--- bpf.c	10 Apr 2004 11:21:14 -0000
***************
*** 114,119 ****
--- 114,120 ----
  static void	bpf_attachd __P((struct bpf_d *, struct bpf_if *));
  static void	bpf_detachd __P((struct bpf_d *));
  static int	bpf_setif __P((struct bpf_d *, struct ifreq *));
+ static void	bpf_timed_out __P((void *));
  static __inline void
  		bpf_wakeup __P((struct bpf_d *));
  static void	catchpacket __P((struct bpf_d *, u_char *, u_int, u_int,
***************
*** 380,385 ****
--- 381,388 ----
  	/* Mark "free" and do most initialization. */
  	memset((char *)d, 0, sizeof(*d));
  	d->bd_bufsize = bpf_bufsize;
+ 	d->bd_seesent = 1;
+ 	callout_init(&d->bd_callout);
  
  	return (0);
  }
***************
*** 400,405 ****
--- 403,411 ----
  	int s;
  
  	s = splnet();
+ 	if (d->bd_state == BPF_WAITING)
+ 		callout_stop(&d->bd_callout);
+ 	d->bd_state = BPF_IDLE;
  	if (d->bd_bif)
  		bpf_detachd(d);
  	splx(s);
***************
*** 429,434 ****
--- 435,441 ----
  	int ioflag;
  {
  	struct bpf_d *d = &bpf_dtab[minor(dev)];
+ 	int timed_out;
  	int error;
  	int s;
  
***************
*** 440,456 ****
  		return (EINVAL);
  
  	s = splnet();
  	/*
  	 * If the hold buffer is empty, then do a timed sleep, which
  	 * ends when the timeout expires or when enough packets
  	 * have arrived to fill the store buffer.
  	 */
  	while (d->bd_hbuf == 0) {
! 		if (d->bd_immediate) {
! 			if (d->bd_slen == 0) {
! 				splx(s);
! 				return (EWOULDBLOCK);
! 			}
  			/*
  			 * A packet(s) either arrived since the previous
  			 * read or arrived while we were asleep.
--- 447,463 ----
  		return (EINVAL);
  
  	s = splnet();
+ 	if (d->bd_state == BPF_WAITING)
+ 		callout_stop(&d->bd_callout);
+ 	timed_out = (d->bd_state == BPF_TIMED_OUT);
+ 	d->bd_state = BPF_IDLE;
  	/*
  	 * If the hold buffer is empty, then do a timed sleep, which
  	 * ends when the timeout expires or when enough packets
  	 * have arrived to fill the store buffer.
  	 */
  	while (d->bd_hbuf == 0) {
! 		if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
  			/*
  			 * A packet(s) either arrived since the previous
  			 * read or arrived while we were asleep.
***************
*** 463,473 ****
  			error = tsleep((caddr_t)d, PRINET|PCATCH, "bpf",
  					  d->bd_rtout);
  		else {
! 			if (d->bd_rtout == -1) {
! 				/* User requested non-blocking I/O */
! 				error = EWOULDBLOCK;
! 			} else
! 				error = 0;
  		}
  		if (error == EINTR || error == ERESTART) {
  			splx(s);
--- 470,477 ----
  			error = tsleep((caddr_t)d, PRINET|PCATCH, "bpf",
  					  d->bd_rtout);
  		else {
! 			/* User requested non-blocking I/O */
! 			error = EWOULDBLOCK;
  		}
  		if (error == EINTR || error == ERESTART) {
  			splx(s);
***************
*** 535,540 ****
--- 539,562 ----
  	d->bd_sel.sel_pid = 0;
  }
  
+ 
+ static void
+ bpf_timed_out(arg)
+ 	void *arg;
+ {
+ 	struct bpf_d *d = (struct bpf_d *)arg;
+ 	int s;
+ 
+ 	s = splnet();
+ 	if (d->bd_state == BPF_WAITING) {
+ 		d->bd_state = BPF_TIMED_OUT;
+ 		if (d->bd_slen != 0)
+ 			bpf_wakeup(d);
+ 	}
+ 	splx(s);
+ }
+ 
+ 
  int
  bpfwrite(dev, uio, ioflag)
  	dev_t dev;
***************
*** 631,636 ****
--- 653,664 ----
  	struct bpf_insn **p;
  #endif
  
+ 	s = splnet();
+ 	if (d->bd_state == BPF_WAITING)
+ 		callout_stop(&d->bd_callout);
+ 	d->bd_state = BPF_IDLE;
+ 	splx(s);
+ 
  	switch (cmd) {
  
  	default:
***************
*** 853,858 ****
--- 881,900 ----
  		d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
  		break;
  
+ 	/*
+ 	 * Get "see sent packets" flag
+ 	 */
+ 	case BIOCGSEESENT:
+ 		*(u_int *)addr = d->bd_seesent;
+ 		break;
+ 
+ 	/*
+ 	 * Set "see sent" packets flag
+ 	 */
+ 	case BIOCSSEESENT:
+ 		d->bd_seesent = *(u_int *)addr;
+ 		break;
+ 
  	case FIONBIO:		/* Non-blocking I/O */
  		if (*(int *)addr)
  			d->bd_rtout = -1;
***************
*** 1040,1049 ****
--- 1082,1107 ----
  		/*
  		 * An imitation of the FIONREAD ioctl code.
  		 */
+ #if 0
  		if (d->bd_hlen != 0 || (d->bd_immediate && d->bd_slen != 0))
  			revents |= events & (POLLIN | POLLRDNORM);
  		else
  			selrecord(p, &d->bd_sel);
+ #else
+ 		if (d->bd_hlen != 0 ||
+ 		    ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
+ 		     d->bd_slen != 0))
+ 			revents |= events & (POLLIN | POLLRDNORM);
+ 		else {
+ 			selrecord(p, &d->bd_sel);
+ 			/* Start the read timeout if necessary */
+ 			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
+ 				callout_reset(&d->bd_callout, d->bd_rtout,
+ 					      bpf_timed_out, d);
+ 				d->bd_state = BPF_WAITING;
+ 			}
+ 		}
+ #endif
  	}
  
  	splx(s);
***************
*** 1168,1187 ****
  	caddr_t arg;
  	struct mbuf *m;
  {
  	struct bpf_if *bp = (struct bpf_if *)arg;
  	struct bpf_d *d;
! 	u_int pktlen, slen;
  	struct mbuf *m0;
  
  	pktlen = 0;
  	for (m0 = m; m0 != 0; m0 = m0->m_next)
  		pktlen += m0->m_len;
  
  	for (d = bp->bif_dlist; d != 0; d = d->bd_next) {
  		++d->bd_rcount;
! 		slen = bpf_filter(d->bd_filter, (u_char *)m, pktlen, 0);
  		if (slen != 0)
! 			catchpacket(d, (u_char *)m, pktlen, slen, bpf_mcpy);
  	}
  }
  
--- 1226,1259 ----
  	caddr_t arg;
  	struct mbuf *m;
  {
+ 	void *(*cpfn) __P((void *, const void *, size_t));
  	struct bpf_if *bp = (struct bpf_if *)arg;
  	struct bpf_d *d;
! 	u_int pktlen, slen, buflen;
  	struct mbuf *m0;
+ 	void *marg;
  
  	pktlen = 0;
  	for (m0 = m; m0 != 0; m0 = m0->m_next)
  		pktlen += m0->m_len;
  
+ 	if (pktlen == m->m_len) {
+ 		cpfn = memcpy;
+ 		marg = mtod(m, void *);
+ 		buflen = pktlen;
+ 	} else {
+ 		cpfn = bpf_mcpy;
+ 		marg = m;
+ 		buflen = 0;
+ 	}
+ 
  	for (d = bp->bif_dlist; d != 0; d = d->bd_next) {
+ 		if (!d->bd_seesent && (m->m_pkthdr.rcvif == NULL))
+ 			continue;
  		++d->bd_rcount;
! 		slen = bpf_filter(d->bd_filter, marg, pktlen, buflen);
  		if (slen != 0)
! 			catchpacket(d, marg, pktlen, slen, cpfn);
  	}
  }
  
***************
*** 1234,1240 ****
  		ROTATE_BUFFERS(d);
  		bpf_wakeup(d);
  		curlen = 0;
! 	}
  
  	/*
  	 * Append the bpf header.
--- 1306,1318 ----
  		ROTATE_BUFFERS(d);
  		bpf_wakeup(d);
  		curlen = 0;
! 	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
! 		/*
! 		 * Immediate mode is set, or the read timeout has
! 		 * already expired during a select call.  A packet
! 		 * arrived, so the reader should be woken up.
! 		 */
! 		bpf_wakeup(d);
  
  	/*
  	 * Append the bpf header.
***************
*** 1248,1261 ****
  	 */
  	(*cpfn)((u_char *)hp + hdrlen, pkt, (hp->bh_caplen = totlen - hdrlen));
  	d->bd_slen = curlen + totlen;
- 
- 	if (d->bd_immediate) {
- 		/*
- 		 * Immediate mode is set.  A packet arrived so any
- 		 * reads should be woken up.
- 		 */
- 		bpf_wakeup(d);
- 	}
  }
  
  /*
--- 1326,1331 ----
Index: bpf.h
===================================================================
RCS file: /cvsroot/src/sys/net/bpf.h,v
retrieving revision 1.33
diff -c -r1.33 bpf.h
*** bpf.h	22 Jan 2004 00:32:41 -0000	1.33
--- bpf.h	10 Apr 2004 11:21:15 -0000
***************
*** 102,126 ****
   * header files.  If your using gcc, we assume that you
   * have run fixincludes so the latter set should work.
   */
! #define	BIOCGBLEN	 _IOR('B',102, u_int)
! #define	BIOCSBLEN	_IOWR('B',102, u_int)
! #define	BIOCSETF	 _IOW('B',103, struct bpf_program)
! #define	BIOCFLUSH	  _IO('B',104)
! #define BIOCPROMISC	  _IO('B',105)
! #define	BIOCGDLT	 _IOR('B',106, u_int)
! #define BIOCGETIF	 _IOR('B',107, struct ifreq)
! #define BIOCSETIF	 _IOW('B',108, struct ifreq)
! #define BIOCSRTIMEOUT	 _IOW('B',109, struct timeval)
! #define BIOCGRTIMEOUT	 _IOR('B',110, struct timeval)
! #define BIOCGSTATS	 _IOR('B',111, struct bpf_stat)
! #define BIOCIMMEDIATE	 _IOW('B',112, u_int)
! #define BIOCVERSION	 _IOR('B',113, struct bpf_version)
! #define BIOCSTCPF	 _IOW('B',114, struct bpf_program)
! #define BIOCSUDPF	 _IOW('B',115, struct bpf_program)
! #define	BIOCGHDRCMPLT	 _IOR('B',116, u_int)
! #define	BIOCSHDRCMPLT	 _IOW('B',117, u_int)
! #define	BIOCSDLT	 _IOW('B',118, u_int)
! #define	BIOCGDLTLIST	_IOWR('B',119, struct bpf_dltlist)
  
  /*
   * Structure prepended to each packet.
--- 102,128 ----
   * header files.  If your using gcc, we assume that you
   * have run fixincludes so the latter set should work.
   */
! #define BIOCGBLEN	_IOR('B',102, u_int)
! #define BIOCSBLEN	_IOWR('B',102, u_int)
! #define BIOCSETF	_IOW('B',103, struct bpf_program)
! #define BIOCFLUSH	_IO('B',104)
! #define BIOCPROMISC	_IO('B',105)
! #define BIOCGDLT	_IOR('B',106, u_int)
! #define BIOCGETIF	_IOR('B',107, struct ifreq)
! #define BIOCSETIF	_IOW('B',108, struct ifreq)
! #define BIOCSRTIMEOUT	_IOW('B',109, struct timeval)
! #define BIOCGRTIMEOUT	_IOR('B',110, struct timeval)
! #define BIOCGSTATS	_IOR('B',111, struct bpf_stat)
! #define BIOCIMMEDIATE	_IOW('B',112, u_int)
! #define BIOCVERSION	_IOR('B',113, struct bpf_version)
! #define BIOCSTCPF	_IOW('B',114, struct bpf_program)
! #define BIOCSUDPF	_IOW('B',115, struct bpf_program)
! #define BIOCGHDRCMPLT	_IOR('B',116, u_int)
! #define BIOCSHDRCMPLT	_IOW('B',117, u_int)
! #define BIOCSDLT	_IOW('B',118, u_int)
! #define BIOCGDLTLIST	_IOWR('B',119, struct bpf_dltlist)
! #define BIOCGSEESENT	_IOR('B',120, u_int)
! #define BIOCSSEESENT	_IOW('B',121, u_int)
  
  /*
   * Structure prepended to each packet.
Index: bpfdesc.h
===================================================================
RCS file: /cvsroot/src/sys/net/bpfdesc.h,v
retrieving revision 1.16
diff -c -r1.16 bpfdesc.h
*** bpfdesc.h	7 Aug 2003 16:32:48 -0000	1.16
--- bpfdesc.h	10 Apr 2004 11:21:15 -0000
***************
*** 41,46 ****
--- 41,47 ----
  #ifndef _NET_BPFDESC_H_
  #define _NET_BPFDESC_H_
  
+ #include <sys/callout.h>
  #include <sys/select.h>
  
  /*
***************
*** 75,80 ****
--- 76,82 ----
  	u_char		bd_state;	/* idle, waiting, or timed out */
  	u_char		bd_immediate;	/* true to return on packet arrival */
  	int		bd_hdrcmplt;	/* false to fill in src lladdr */
+ 	int		bd_seesent;	/* true if bpf should see sent packets */
  	int		bd_async;	/* non-zero if packet reception should generate signal */
  	pid_t		bd_pgid;	/* process or group id for signal */
  #if BSD < 199103
***************
*** 85,92 ****
--- 87,101 ----
  	u_char		bd_pad;		/* explicit alignment */
  	struct selinfo	bd_sel;		/* bsd select info */
  #endif
+ 	struct callout	bd_callout;	/* for BPF timeouts with select */
  };
  
+ 
+ /* Values for bd_state */
+ #define BPF_IDLE	0		/* no select in progress */
+ #define BPF_WAITING	1		/* waiting for read timeout in select */
+ #define BPF_TIMED_OUT	2		/* read timeout has expired in select */
+ 
  /*
   * Descriptor associated with each attached hardware interface.
   */