Re: Improving the data supplied by BPF

To: NetBSD tech-net mailing list <tech-net%netbsd.org@localhost>
Subject: Re: Improving the data supplied by BPF
From: Darren Reed <darrenr%netbsd.org@localhost>
Date: Tue, 30 Dec 2008 11:04:33 +1100

Whilst letting the bits and bytes of this change digest over the
last few days, one aspect of the current BPF design gave me cause
to pause: the existing BPF header record for /dev/bpf is 18 bytes
long, meaning that when combined with ethernet, IP packets start
on a 32bit aligned boundary. This is significant for platforms
such as SPARC where thus access of IP addresses is via correctly
aligned addresses.

With this in mind, I added bf_exthdrlen. As per the comments in
the attached diff, it gets bumped up by an amount to ensure that
the header following the link layer header is on a 32bit aligned
address. For things such as gre/tun/loopback, which are using
bpf_mtap_af(), there would be no padding (link layer record is a
32bit int) but for ethernet, 2 bytes of padding is added.

Additionally, I've made some changes to force the buffer size
set via BIOCSBLEN to always be a multiple of 64bits.  This change
takes precedence over bpf_maxbufsize and BPF_MINBUFSIZE, so if
either of those would force the set length to be a number that
is not a multiple of 8, that is now lost.

The question I find myself asking is, are the changes to start
an IP (layer3) header on a 32bit aligned address warranted?
Or is that needless microbenchmarking?

Darren

Index: bpf.c
===================================================================
RCS file: /cvsroot/src/sys/net/bpf.c,v
retrieving revision 1.141
diff -u -r1.141 bpf.c
--- bpf.c       15 Jun 2008 16:37:21 -0000      1.141
+++ bpf.c       29 Dec 2008 23:53:06 -0000
@@ -127,7 +127,7 @@
 static int     bpf_allocbufs(struct bpf_d *);
 static void    bpf_deliver(struct bpf_if *,
                            void *(*cpfn)(void *, const void *, size_t),
-                           void *, u_int, u_int, struct ifnet *);
+                           void *, u_int, u_int, struct ifnet *, u_int);
 static void    bpf_freed(struct bpf_d *);
 static void    bpf_ifname(struct ifnet *, struct ifreq *);
 static void    *bpf_mcpy(void *, const void *, size_t);
@@ -140,7 +140,7 @@
 static inline void
                bpf_wakeup(struct bpf_d *);
 static void    catchpacket(struct bpf_d *, u_char *, u_int, u_int,
-    void *(*)(void *, const void *, size_t), struct timeval *);
+    void *(*)(void *, const void *, size_t), struct timeval *, u_int);
 static void    reset_d(struct bpf_d *);
 static int     bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
 static int     bpf_setdlt(struct bpf_d *, u_int);
@@ -399,6 +399,8 @@
 
        d = malloc(sizeof(*d), M_DEVBUF, M_WAITOK|M_ZERO);
        d->bd_bufsize = bpf_bufsize;
+       if (d->bd_bufsize & 7)
+               d->bd_bufsize += 8 - (d->bd_bufsize & 7);
        d->bd_seesent = 1;
        d->bd_pid = l->l_proc->p_pid;
        callout_init(&d->bd_callout, 0);
@@ -756,6 +758,11 @@
                                *(u_int *)addr = size = bpf_maxbufsize;
                        else if (size < BPF_MINBUFSIZE)
                                *(u_int *)addr = size = BPF_MINBUFSIZE;
+                       /*
+                        * Force the size to be a multiple of 64 bits.
+                        */
+                       if ((size & 7) != 0)
+                               size += 8 - (size & 7);
                        d->bd_bufsize = size;
                }
                break;
@@ -928,6 +935,14 @@
                *(u_int *)addr = d->bd_seesent;
                break;
 
+       case BIOCGEXTEND:
+               *(u_int *)addr = d->bd_extend;
+               break;
+
+       case BIOCSEXTEND:
+               d->bd_extend = *(u_int *)addr;
+               break;
+
        /*
         * Set "see sent" packets flag
         */
@@ -1204,7 +1219,7 @@
  * buffer.
  */
 void
-bpf_tap(void *arg, u_char *pkt, u_int pktlen)
+bpf_tap(void *arg, u_char *pkt, u_int pktlen, u_int flags)
 {
        struct bpf_if *bp;
        struct bpf_d *d;
@@ -1227,7 +1242,8 @@
                                microtime(&tv);
                                gottime = 1;
                        }
-               catchpacket(d, pkt, pktlen, slen, (void *)memcpy, &tv);
+                       catchpacket(d, pkt, pktlen, slen, (void *)memcpy,
+                           &tv, flags);
                }
        }
 }
@@ -1268,7 +1284,8 @@
  */
 static inline void
 bpf_deliver(struct bpf_if *bp, void *(*cpfn)(void *, const void *, size_t),
-           void *marg, u_int pktlen, u_int buflen, struct ifnet *rcvif)
+           void *marg, u_int pktlen, u_int buflen, struct ifnet *rcvif,
+           u_int flags)
 {
        u_int slen;
        struct bpf_d *d;
@@ -1286,7 +1303,7 @@
                                microtime(&tv);
                                gottime = 1;
                        }
-                       catchpacket(d, marg, pktlen, slen, cpfn, &tv);
+                       catchpacket(d, marg, pktlen, slen, cpfn, &tv, flags);
                }
        }
 }
@@ -1296,7 +1313,7 @@
  * a buffer, and the tail is in an mbuf chain.
  */
 void
-bpf_mtap2(void *arg, void *data, u_int dlen, struct mbuf *m)
+bpf_mtap2(void *arg, void *data, u_int dlen, struct mbuf *m, u_int flags)
 {
        struct bpf_if *bp = arg;
        u_int pktlen;
@@ -1314,14 +1331,14 @@
        mb.m_data = data;
        mb.m_len = dlen;
 
-       bpf_deliver(bp, bpf_mcpy, &mb, pktlen, 0, m->m_pkthdr.rcvif);
+       bpf_deliver(bp, bpf_mcpy, &mb, pktlen, 0, m->m_pkthdr.rcvif, flags);
 }
 
 /*
  * Incoming linkage from device drivers, when packet is in an mbuf chain.
  */
 void
-bpf_mtap(void *arg, struct mbuf *m)
+bpf_mtap(void *arg, struct mbuf *m, u_int flags)
 {
        void *(*cpfn)(void *, const void *, size_t);
        struct bpf_if *bp = arg;
@@ -1341,7 +1358,7 @@
                buflen = 0;
        }
 
-       bpf_deliver(bp, cpfn, marg, pktlen, buflen, m->m_pkthdr.rcvif);
+       bpf_deliver(bp, cpfn, marg, pktlen, buflen, m->m_pkthdr.rcvif, flags);
 }
 
 /*
@@ -1352,7 +1369,7 @@
  * try to free it or keep a pointer a to it).
  */
 void
-bpf_mtap_af(void *arg, uint32_t af, struct mbuf *m)
+bpf_mtap_af(void *arg, uint32_t af, struct mbuf *m, u_int flags)
 {
        struct mbuf m0;
 
@@ -1361,11 +1378,11 @@
        m0.m_len = 4;
        m0.m_data = (char *)&af;
 
-       bpf_mtap(arg, &m0);
+       bpf_mtap(arg, &m0, flags);
 }
 
 void
-bpf_mtap_et(void *arg, uint16_t et, struct mbuf *m)
+bpf_mtap_et(void *arg, uint16_t et, struct mbuf *m, u_int flags)
 {
        struct mbuf m0;
 
@@ -1379,7 +1396,7 @@
        ((uint32_t *)m0.m_data)[2] = 0;
        ((uint16_t *)m0.m_data)[6] = et;
 
-       bpf_mtap(arg, &m0);
+       bpf_mtap(arg, &m0, flags);
 }
 
 #if NSL > 0 || NSTRIP > 0
@@ -1404,7 +1421,7 @@
        (void)memcpy(&hp[SLX_CHDR], chdr, CHDR_LEN);
 
        s = splnet();
-       bpf_mtap(arg, *m);
+       bpf_mtap(arg, *m, 0);
        splx(s);
 
        m_adj(*m, SLIP_HDRLEN);
@@ -1433,7 +1450,7 @@
        (void)memcpy(&hp[SLX_CHDR], chdr, CHDR_LEN);
 
        s = splnet();
-       bpf_mtap(arg, &m0);
+       bpf_mtap(arg, &m0, EBPF_OUT);
        splx(s);
        m_freem(m);
 }
@@ -1449,12 +1466,20 @@
  */
 static void
 catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
-    void *(*cpfn)(void *, const void *, size_t), struct timeval *tv)
+    void *(*cpfn)(void *, const void *, size_t), struct timeval *tv,
+    u_int flags)
 {
+       int totlen, curlen, caplen;
        struct bpf_hdr *hp;
-       int totlen, curlen;
-       int hdrlen = d->bd_bif->bif_hdrlen;
        int do_wakeup = 0;
+       ebpf_rec_t *ehp;
+       int hdrlen;
+
+       if (d->bd_extend) {
+               hdrlen = d->bd_bif->bif_exthdrlen;
+       } else {
+               hdrlen = d->bd_bif->bif_hdrlen;
+       }
 
        ++d->bd_ccount;
        ++bpf_gstats.bs_capt;
@@ -1467,6 +1492,17 @@
        totlen = hdrlen + min(snaplen, pktlen);
        if (totlen > d->bd_bufsize)
                totlen = d->bd_bufsize;
+       caplen = totlen - hdrlen;
+       if (d->bd_extend && (totlen & 0x7)) {
+               /*
+                * Even though totlen is being rounded up, if it is less
+                * than bd_bufsize, rounding up to a 64bit number cannot
+                * cause it to become greater than bd_bufsize due to the
+                * rounding up of bd_bufsize to a multiple of 64bits in
+                * other parts of this file.
+                */
+               totlen += 8 - (totlen & 7);
+       }
 
        /*
         * Round up the end of the previous packet to the next longword.
@@ -1502,14 +1538,30 @@
        /*
         * Append the bpf header.
         */
-       hp = (struct bpf_hdr *)((char *)d->bd_sbuf + curlen);
-       hp->bh_tstamp = *tv;
-       hp->bh_datalen = pktlen;
-       hp->bh_hdrlen = hdrlen;
+       hp = (struct bpf_hdr *)((u_char *)d->bd_sbuf + curlen);
+       if (d->bd_extend) {
+               ehp = (ebpf_rec_t *)hp;
+               ehp->ebr_rlen = totlen;
+               ehp->ebr_seqno = d->bd_ccount;
+               ehp->ebr_pktoff = hdrlen;
+               ehp->ebr_flags = flags;
+               ehp->ebr_secs = tv->tv_sec;
+               ehp->ebr_nsecs = tv->tv_usec * 1000;
+               ehp->ebr_wlen = pktlen;
+               ehp->ebr_clen = caplen;
+               ehp->ebr_type = d->bd_bif->bif_dlt;
+               ehp->ebr_subtype = 0;
+       } else {
+               hp->bh_tstamp = *tv;
+               hp->bh_datalen = pktlen;
+               hp->bh_hdrlen = hdrlen;
+               hp->bh_caplen = caplen;
+       }
+
        /*
         * Copy the packet data into the store buffer and update its length.
         */
-       (*cpfn)((u_char *)hp + hdrlen, pkt, (hp->bh_caplen = totlen - hdrlen));
+       (*cpfn)((u_char *)hp + hdrlen, pkt, caplen);
        d->bd_slen = curlen + totlen;
 
        /*
@@ -1604,6 +1656,18 @@
         * performance reasons and to alleviate alignment restrictions).
         */
        bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
+       /*
+        * The common case is for ethernet, which has a header length of
+        * 14 bytes. If we copy the ethernet packet into a buffer where it
+        * starts on a 32bit boundary then the IP header will start on a
+        * 16bit boundary and accessing address information on platforms
+        * such as SPARC will result in a bus alignment trap (core dump.)
+        * To mitigate this, we pad the size of the header out to a
+        * multiple of 32bits.
+        */
+       bp->bif_exthdrlen = sizeof(ebpf_rec_t);
+       if (hdrlen & 3)
+               bp->bif_exthdrlen += 4 - (hdrlen & 3);
 
 #if 0
        printf("bpf: %s attached\n", ifp->if_xname);
@@ -1669,6 +1733,19 @@
         * performance reasons and to alleviate alignment restrictions).
         */
        bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
+
+       /*
+        * The common case is for ethernet, which has a header length of
+        * 14 bytes. If we copy the ethernet packet into a buffer where it
+        * starts on a 32bit boundary then the IP header will start on a
+        * 16bit boundary and accessing address information on platforms
+        * such as SPARC will result in a bus alignment trap (core dump.)
+        * To mitigate this, we pad the size of the header out to a
+        * multiple of 32bits.
+        */
+       bp->bif_exthdrlen = sizeof(ebpf_rec_t);
+       if (hdrlen & 3)
+               bp->bif_exthdrlen += 4 - (hdrlen & 3);
 }
 
 /*
Index: bpf.h
===================================================================
RCS file: /cvsroot/src/sys/net/bpf.h,v
retrieving revision 1.48
diff -u -r1.48 bpf.h
--- bpf.h       10 Dec 2005 23:21:38 -0000      1.48
+++ bpf.h       29 Dec 2008 23:53:07 -0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: bpf.h,v 1.48 2005/12/10 23:21:38 elad Exp $    */
+/*     $NetBSD: bpf.h,v 1.39 2004/05/29 14:18:33 darrenr Exp $ */
 
 /*
  * Copyright (c) 1990, 1991, 1993
@@ -134,6 +134,8 @@
 #define BIOCGDLTLIST   _IOWR('B',119, struct bpf_dltlist)
 #define BIOCGSEESENT    _IOR('B',120, u_int)
 #define BIOCSSEESENT    _IOW('B',121, u_int)
+#define BIOCGEXTEND     _IOR('B',122, u_int)
+#define BIOCSEXTEND     _IOW('B',123, u_int)
 
 /*
  * Structure prepended to each packet.
@@ -145,6 +147,7 @@
        uint16_t        bh_hdrlen;      /* length of bpf header (this struct
                                           plus alignment padding) */
 };
+
 /*
  * Because the structure above is not a multiple of 4 bytes, some compilers
  * will insist on inserting padding; hence, sizeof(struct bpf_hdr) won't work.
@@ -163,6 +166,34 @@
 #endif
 #endif
 
+/*
+ * Enhanced BPF packet record structure
+ *
+ * All lengths are in 32bit values because with the introduction of IPv6
+ * Jumbograms, 16bits is no longer big enough to represent the size of one
+ * entire packet.
+ *
+ * pktoff = offset from ebr_secs to the start of the packet data (may not be
+ *          the same as sizeof(ebr_rec_t))
+ */
+typedef struct ebpf_rec_s {
+       uint32_t        ebr_rlen;       /* Complete record length */
+       uint32_t        ebr_seqno;      /* sequence number in capture */
+       uint32_t        ebr_pktoff;     /* Offset to start of packet */
+       uint32_t        ebr_flags;
+       uint64_t        ebr_secs;       /* No more Y2k38 problem */
+       uint32_t        ebr_nsecs;      /* Note, 1ns = 1000us */
+       uint32_t        ebr_wlen;       /* Wire length of packet */
+       uint32_t        ebr_clen;       /* Captured length of packet */
+       uint16_t        ebr_type;       /* DLT_* type */
+       uint16_t        ebr_subtype;
+} ebpf_rec_t;
+
+/*
+ * flags defined for ebr_flags:
+ */
+#define        EBPF_OUT                0x00000001      /* Transmitted packet */
+
 /* Pull in data-link level type codes. */
 #include <net/dlt.h>
 
@@ -248,11 +279,11 @@
 
 #ifdef _KERNEL
 int     bpf_validate(struct bpf_insn *, int);
-void    bpf_tap(void *, u_char *, u_int);
-void    bpf_mtap(void *, struct mbuf *);
-void    bpf_mtap2(void *, void *, u_int, struct mbuf *);
-void    bpf_mtap_af(void *, uint32_t, struct mbuf *);
-void    bpf_mtap_et(void *, uint16_t, struct mbuf *);
+void    bpf_tap(void *, u_char *, u_int, u_int);
+void    bpf_mtap(void *, struct mbuf *, u_int);
+void    bpf_mtap2(void *, void *, u_int, struct mbuf *, u_int);
+void    bpf_mtap_af(void *, uint32_t, struct mbuf *, u_int);
+void    bpf_mtap_et(void *, uint16_t, struct mbuf *, u_int);
 void    bpf_mtap_sl_in(void *, u_char *, struct mbuf **);
 void    bpf_mtap_sl_out(void *, u_char *, struct mbuf *);
 void    bpfattach(struct ifnet *, u_int, u_int);

References:
- Improving the data supplied by BPF
  - From: Darren Reed

Prev by Date: Re: Improving the data supplied by BPF
Next by Date: Re: Changing the default subnet route.
Previous by Thread: Re: Improving the data supplied by BPF
Next by Thread: NetBSD TCP Congestion Control Configuration
Indexes:

Home | Main Index | Thread Index | Old Index