Subject: Re: bpf/pcap performance
To: Guy Harris <guy@alum.mit.edu>
From: Darren Reed <darrenr@reed.wattle.id.au>
List: tech-net
Date: 04/10/2004 07:30:15
In some email I received from Guy Harris, sie wrote:
> > * the application is threaded, one thread uses select over all the
> >   NICs so it knows when to read data from BPF, the other writes to
> >   disk.
> 
> The original BPF implementation didn't correctly support "select()" on 
> BPF devices if you had a timeout on the device - "select()" wouldn't 
> consider the BPF device readable until the hold buffer was non-empty, 
> but the store buffer wasn't rotated into the hold buffer until it 
> filled up, so "select()" would wait until the store buffer filled.
> 
> FreeBSD fixed that somewhere in the 4.x timeframe, and I *think* 
> OpenBSD also has it fixed; NetBSD still doesn't have it fixed, as far 
> as I know.

Ok, I went looking.  I think the bug you are talking about here relates
to bpfread()?  NetBSD has:

        while (d->bd_hbuf == 0) {
                if (d->bd_immediate) {
                        if (d->bd_slen == 0) {
                                splx(s);
                                return (EWOULDBLOCK);
                        }

FreeBSD:
        while (d->bd_hbuf == 0) {
                if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
OpenBSD:
        while (d->bd_hbuf == 0) {
                if (d->bd_immediate && d->bd_slen != 0) {

FreeBSD also has a bunch of other changes involving the use of callouts
that, according to the commit comment, relate to threads:
http://www.freebsd.org/cgi/cvsweb.cgi/src/sys/net/bpf.c
- search for rev 1.86.

A merged change of the above plus a copy of FreeBSD's changes from 1.86,
adapted for NetBSD, is below.  I've not tested it yet beyond compiling
it and making sure the kernel links cleanly :)

Some feedback from other NetBSD types about whether or not this is a good
patch to apply would be nice.  If so, I'll commit it.

Cheers,
Darren

Index: bpf.c
===================================================================
RCS file: /cvsroot/src/sys/net/bpf.c,v
retrieving revision 1.89
diff -c -r1.89 bpf.c
*** bpf.c	22 Jan 2004 00:32:41 -0000	1.89
--- bpf.c	9 Apr 2004 20:17:08 -0000
***************
*** 39,45 ****
   */
  
  #include <sys/cdefs.h>
! __KERNEL_RCSID(0, "$NetBSD: bpf.c,v 1.89 2004/01/22 00:32:41 jonathan Exp $");
  
  #include "bpfilter.h"
  
--- 39,45 ----
   */
  
  #include <sys/cdefs.h>
! __KERNEL_RCSID(0, "$NetBSD: bpf.c,v 1.90 2004/03/24 15:34:54 atatat Exp $");
  
  #include "bpfilter.h"
  
***************
*** 114,119 ****
--- 114,120 ----
  static void	bpf_attachd __P((struct bpf_d *, struct bpf_if *));
  static void	bpf_detachd __P((struct bpf_d *));
  static int	bpf_setif __P((struct bpf_d *, struct ifreq *));
+ static void	bpf_timed_out __P((void *));
  static __inline void
  		bpf_wakeup __P((struct bpf_d *));
  static void	catchpacket __P((struct bpf_d *, u_char *, u_int, u_int,
***************
*** 380,385 ****
--- 381,387 ----
  	/* Mark "free" and do most initialization. */
  	memset((char *)d, 0, sizeof(*d));
  	d->bd_bufsize = bpf_bufsize;
+ 	callout_init(&d->bd_callout);
  
  	return (0);
  }
***************
*** 400,405 ****
--- 402,410 ----
  	int s;
  
  	s = splnet();
+ 	if (d->bd_state == BPF_WAITING)
+ 		callout_stop(&d->bd_callout);
+ 	d->bd_state = BPF_IDLE;
  	if (d->bd_bif)
  		bpf_detachd(d);
  	splx(s);
***************
*** 429,434 ****
--- 434,440 ----
  	int ioflag;
  {
  	struct bpf_d *d = &bpf_dtab[minor(dev)];
+ 	int timed_out;
  	int error;
  	int s;
  
***************
*** 440,456 ****
  		return (EINVAL);
  
  	s = splnet();
  	/*
  	 * If the hold buffer is empty, then do a timed sleep, which
  	 * ends when the timeout expires or when enough packets
  	 * have arrived to fill the store buffer.
  	 */
  	while (d->bd_hbuf == 0) {
! 		if (d->bd_immediate) {
! 			if (d->bd_slen == 0) {
! 				splx(s);
! 				return (EWOULDBLOCK);
! 			}
  			/*
  			 * A packet(s) either arrived since the previous
  			 * read or arrived while we were asleep.
--- 446,462 ----
  		return (EINVAL);
  
  	s = splnet();
+ 	if (d->bd_state == BPF_WAITING)
+ 		callout_stop(&d->bd_callout);
+ 	timed_out = (d->bd_state == BPF_TIMED_OUT);
+ 	d->bd_state = BPF_IDLE;
  	/*
  	 * If the hold buffer is empty, then do a timed sleep, which
  	 * ends when the timeout expires or when enough packets
  	 * have arrived to fill the store buffer.
  	 */
  	while (d->bd_hbuf == 0) {
! 		if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
  			/*
  			 * A packet(s) either arrived since the previous
  			 * read or arrived while we were asleep.
***************
*** 535,540 ****
--- 541,564 ----
  	d->bd_sel.sel_pid = 0;
  }
  
+ 
+ static void
+ bpf_timed_out(arg)
+ 	void *arg;
+ {
+ 	struct bpf_d *d = (struct bpf_d *)arg;
+ 	int s;
+ 
+ 	s = splnet();
+ 	if (d->bd_state == BPF_WAITING) {
+ 		d->bd_state = BPF_TIMED_OUT;
+ 		if (d->bd_slen != 0)
+ 			bpf_wakeup(d);
+ 	}
+ 	splx(s);
+ }
+ 
+ 
  int
  bpfwrite(dev, uio, ioflag)
  	dev_t dev;
***************
*** 631,636 ****
--- 655,666 ----
  	struct bpf_insn **p;
  #endif
  
+ 	s = splnet();
+ 	if (d->bd_state == BPF_WAITING)
+ 		callout_stop(&d->bd_callout);
+ 	d->bd_state = BPF_IDLE;
+ 	splx(s);
+ 
  	switch (cmd) {
  
  	default:
***************
*** 1040,1049 ****
--- 1070,1095 ----
  		/*
  		 * An imitation of the FIONREAD ioctl code.
  		 */
+ #if 0
  		if (d->bd_hlen != 0 || (d->bd_immediate && d->bd_slen != 0))
  			revents |= events & (POLLIN | POLLRDNORM);
  		else
  			selrecord(p, &d->bd_sel);
+ #else
+ 		if (d->bd_hlen != 0 ||
+ 		    ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
+ 		     d->bd_slen != 0))
+ 			revents |= events & (POLLIN | POLLRDNORM);
+ 		else {
+ 			selrecord(p, &d->bd_sel);
+ 			/* Start the read timeout if necessary */
+ 			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
+ 				callout_reset(&d->bd_callout, d->bd_rtout,
+ 					      bpf_timed_out, d);
+ 				d->bd_state = BPF_WAITING;
+ 			}
+ 		}
+ #endif
  	}
  
  	splx(s);
***************
*** 1177,1182 ****
--- 1223,1233 ----
  	for (m0 = m; m0 != 0; m0 = m0->m_next)
  		pktlen += m0->m_len;
  
+ 	if (pktlen == m->m_len) {
+ 		bpf_tap(arg, mtod(m, u_char *), pktlen);
+ 		return;
+ 	}
+ 
  	for (d = bp->bif_dlist; d != 0; d = d->bd_next) {
  		++d->bd_rcount;
  		slen = bpf_filter(d->bd_filter, (u_char *)m, pktlen, 0);
***************
*** 1234,1240 ****
  		ROTATE_BUFFERS(d);
  		bpf_wakeup(d);
  		curlen = 0;
! 	}
  
  	/*
  	 * Append the bpf header.
--- 1285,1297 ----
  		ROTATE_BUFFERS(d);
  		bpf_wakeup(d);
  		curlen = 0;
! 	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
! 		/*
! 		 * Immediate mode is set, or the read timeout has
! 		 * already expired during a select call.  A packet
! 		 * arrived, so the reader should be woken up.
! 		 */
! 		bpf_wakeup(d);
  
  	/*
  	 * Append the bpf header.
***************
*** 1248,1261 ****
  	 */
  	(*cpfn)((u_char *)hp + hdrlen, pkt, (hp->bh_caplen = totlen - hdrlen));
  	d->bd_slen = curlen + totlen;
- 
- 	if (d->bd_immediate) {
- 		/*
- 		 * Immediate mode is set.  A packet arrived so any
- 		 * reads should be woken up.
- 		 */
- 		bpf_wakeup(d);
- 	}
  }
  
  /*
--- 1305,1310 ----
Index: bpfdesc.h
===================================================================
RCS file: /cvsroot/src/sys/net/bpfdesc.h,v
retrieving revision 1.16
diff -c -r1.16 bpfdesc.h
*** bpfdesc.h	7 Aug 2003 16:32:48 -0000	1.16
--- bpfdesc.h	9 Apr 2004 20:17:08 -0000
***************
*** 41,46 ****
--- 41,47 ----
  #ifndef _NET_BPFDESC_H_
  #define _NET_BPFDESC_H_
  
+ #include <sys/callout.h>
  #include <sys/select.h>
  
  /*
***************
*** 85,92 ****
--- 86,100 ----
  	u_char		bd_pad;		/* explicit alignment */
  	struct selinfo	bd_sel;		/* bsd select info */
  #endif
+ 	struct callout	bd_callout;	/* for BPF timeouts with select */
  };
  
+ 
+ /* Values for bd_state */
+ #define BPF_IDLE	0		/* no select in progress */
+ #define BPF_WAITING	1		/* waiting for read timeout in select */
+ #define BPF_TIMED_OUT	2		/* read timeout has expired in select */
+ 
  /*
   * Descriptor associated with each attached hardware interface.
   */