tech-net archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: Improving the data supplied by BPF



Whilst letting the bits and bytes of this change digest over the
last few days, one aspect of the current BPF design gave me cause
to pause: the existing BPF header record for /dev/bpf is 18 bytes
long, meaning that when combined with ethernet, IP packets start
on a 32bit aligned boundary. This is significant for platforms
such as SPARC where thus access of IP addresses is via correctly
aligned addresses.

With this in mind, I added bf_exthdrlen. As per the comments in
the attached diff, it gets bumped up by an amount to ensure that
the header following the link layer header is on a 32bit aligned
address. For things such as gre/tun/loopback, which are using
bpf_mtap_af(), there would be no padding (link layer record is a
32bit int) but for ethernet, 2 bytes of padding is added.

Additionally, I've made some changes to force the buffer size
set via BIOCSBLEN to always be a multiple of 64bits.  This change
takes precedence over bpf_maxbufsize and BPF_MINBUFSIZE, so if
either of those would force the set length to be a number that
is not a multiple of 8, that is now lost.

The question I find myself asking is, are the changes to start
an IP (layer3) header on a 32bit aligned address warranted?
Or is that needless microbenchmarking?

Darren

Index: bpf.c
===================================================================
RCS file: /cvsroot/src/sys/net/bpf.c,v
retrieving revision 1.141
diff -u -r1.141 bpf.c
--- bpf.c       15 Jun 2008 16:37:21 -0000      1.141
+++ bpf.c       29 Dec 2008 23:53:06 -0000
@@ -127,7 +127,7 @@
 static int     bpf_allocbufs(struct bpf_d *);
 static void    bpf_deliver(struct bpf_if *,
                            void *(*cpfn)(void *, const void *, size_t),
-                           void *, u_int, u_int, struct ifnet *);
+                           void *, u_int, u_int, struct ifnet *, u_int);
 static void    bpf_freed(struct bpf_d *);
 static void    bpf_ifname(struct ifnet *, struct ifreq *);
 static void    *bpf_mcpy(void *, const void *, size_t);
@@ -140,7 +140,7 @@
 static inline void
                bpf_wakeup(struct bpf_d *);
 static void    catchpacket(struct bpf_d *, u_char *, u_int, u_int,
-    void *(*)(void *, const void *, size_t), struct timeval *);
+    void *(*)(void *, const void *, size_t), struct timeval *, u_int);
 static void    reset_d(struct bpf_d *);
 static int     bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
 static int     bpf_setdlt(struct bpf_d *, u_int);
@@ -399,6 +399,8 @@
 
        d = malloc(sizeof(*d), M_DEVBUF, M_WAITOK|M_ZERO);
        d->bd_bufsize = bpf_bufsize;
+       if (d->bd_bufsize & 7)
+               d->bd_bufsize += 8 - (d->bd_bufsize & 7);
        d->bd_seesent = 1;
        d->bd_pid = l->l_proc->p_pid;
        callout_init(&d->bd_callout, 0);
@@ -756,6 +758,11 @@
                                *(u_int *)addr = size = bpf_maxbufsize;
                        else if (size < BPF_MINBUFSIZE)
                                *(u_int *)addr = size = BPF_MINBUFSIZE;
+                       /*
+                        * Force the size to be a multiple of 64 bits.
+                        */
+                       if ((size & 7) != 0)
+                               size += 8 - (size & 7);
                        d->bd_bufsize = size;
                }
                break;
@@ -928,6 +935,14 @@
                *(u_int *)addr = d->bd_seesent;
                break;
 
+       case BIOCGEXTEND:
+               *(u_int *)addr = d->bd_extend;
+               break;
+
+       case BIOCSEXTEND:
+               d->bd_extend = *(u_int *)addr;
+               break;
+
        /*
         * Set "see sent" packets flag
         */
@@ -1204,7 +1219,7 @@
  * buffer.
  */
 void
-bpf_tap(void *arg, u_char *pkt, u_int pktlen)
+bpf_tap(void *arg, u_char *pkt, u_int pktlen, u_int flags)
 {
        struct bpf_if *bp;
        struct bpf_d *d;
@@ -1227,7 +1242,8 @@
                                microtime(&tv);
                                gottime = 1;
                        }
-               catchpacket(d, pkt, pktlen, slen, (void *)memcpy, &tv);
+                       catchpacket(d, pkt, pktlen, slen, (void *)memcpy,
+                           &tv, flags);
                }
        }
 }
@@ -1268,7 +1284,8 @@
  */
 static inline void
 bpf_deliver(struct bpf_if *bp, void *(*cpfn)(void *, const void *, size_t),
-           void *marg, u_int pktlen, u_int buflen, struct ifnet *rcvif)
+           void *marg, u_int pktlen, u_int buflen, struct ifnet *rcvif,
+           u_int flags)
 {
        u_int slen;
        struct bpf_d *d;
@@ -1286,7 +1303,7 @@
                                microtime(&tv);
                                gottime = 1;
                        }
-                       catchpacket(d, marg, pktlen, slen, cpfn, &tv);
+                       catchpacket(d, marg, pktlen, slen, cpfn, &tv, flags);
                }
        }
 }
@@ -1296,7 +1313,7 @@
  * a buffer, and the tail is in an mbuf chain.
  */
 void
-bpf_mtap2(void *arg, void *data, u_int dlen, struct mbuf *m)
+bpf_mtap2(void *arg, void *data, u_int dlen, struct mbuf *m, u_int flags)
 {
        struct bpf_if *bp = arg;
        u_int pktlen;
@@ -1314,14 +1331,14 @@
        mb.m_data = data;
        mb.m_len = dlen;
 
-       bpf_deliver(bp, bpf_mcpy, &mb, pktlen, 0, m->m_pkthdr.rcvif);
+       bpf_deliver(bp, bpf_mcpy, &mb, pktlen, 0, m->m_pkthdr.rcvif, flags);
 }
 
 /*
  * Incoming linkage from device drivers, when packet is in an mbuf chain.
  */
 void
-bpf_mtap(void *arg, struct mbuf *m)
+bpf_mtap(void *arg, struct mbuf *m, u_int flags)
 {
        void *(*cpfn)(void *, const void *, size_t);
        struct bpf_if *bp = arg;
@@ -1341,7 +1358,7 @@
                buflen = 0;
        }
 
-       bpf_deliver(bp, cpfn, marg, pktlen, buflen, m->m_pkthdr.rcvif);
+       bpf_deliver(bp, cpfn, marg, pktlen, buflen, m->m_pkthdr.rcvif, flags);
 }
 
 /*
@@ -1352,7 +1369,7 @@
  * try to free it or keep a pointer a to it).
  */
 void
-bpf_mtap_af(void *arg, uint32_t af, struct mbuf *m)
+bpf_mtap_af(void *arg, uint32_t af, struct mbuf *m, u_int flags)
 {
        struct mbuf m0;
 
@@ -1361,11 +1378,11 @@
        m0.m_len = 4;
        m0.m_data = (char *)&af;
 
-       bpf_mtap(arg, &m0);
+       bpf_mtap(arg, &m0, flags);
 }
 
 void
-bpf_mtap_et(void *arg, uint16_t et, struct mbuf *m)
+bpf_mtap_et(void *arg, uint16_t et, struct mbuf *m, u_int flags)
 {
        struct mbuf m0;
 
@@ -1379,7 +1396,7 @@
        ((uint32_t *)m0.m_data)[2] = 0;
        ((uint16_t *)m0.m_data)[6] = et;
 
-       bpf_mtap(arg, &m0);
+       bpf_mtap(arg, &m0, flags);
 }
 
 #if NSL > 0 || NSTRIP > 0
@@ -1404,7 +1421,7 @@
        (void)memcpy(&hp[SLX_CHDR], chdr, CHDR_LEN);
 
        s = splnet();
-       bpf_mtap(arg, *m);
+       bpf_mtap(arg, *m, 0);
        splx(s);
 
        m_adj(*m, SLIP_HDRLEN);
@@ -1433,7 +1450,7 @@
        (void)memcpy(&hp[SLX_CHDR], chdr, CHDR_LEN);
 
        s = splnet();
-       bpf_mtap(arg, &m0);
+       bpf_mtap(arg, &m0, EBPF_OUT);
        splx(s);
        m_freem(m);
 }
@@ -1449,12 +1466,20 @@
  */
 static void
 catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
-    void *(*cpfn)(void *, const void *, size_t), struct timeval *tv)
+    void *(*cpfn)(void *, const void *, size_t), struct timeval *tv,
+    u_int flags)
 {
+       int totlen, curlen, caplen;
        struct bpf_hdr *hp;
-       int totlen, curlen;
-       int hdrlen = d->bd_bif->bif_hdrlen;
        int do_wakeup = 0;
+       ebpf_rec_t *ehp;
+       int hdrlen;
+
+       if (d->bd_extend) {
+               hdrlen = d->bd_bif->bif_exthdrlen;
+       } else {
+               hdrlen = d->bd_bif->bif_hdrlen;
+       }
 
        ++d->bd_ccount;
        ++bpf_gstats.bs_capt;
@@ -1467,6 +1492,17 @@
        totlen = hdrlen + min(snaplen, pktlen);
        if (totlen > d->bd_bufsize)
                totlen = d->bd_bufsize;
+       caplen = totlen - hdrlen;
+       if (d->bd_extend && (totlen & 0x7)) {
+               /*
+                * Even though totlen is being rounded up, if it is less
+                * than bd_bufsize, rounding up to a 64bit number cannot
+                * cause it to become greater than bd_bufsize due to the
+                * rounding up of bd_bufsize to a multiple of 64bits in
+                * other parts of this file.
+                */
+               totlen += 8 - (totlen & 7);
+       }
 
        /*
         * Round up the end of the previous packet to the next longword.
@@ -1502,14 +1538,30 @@
        /*
         * Append the bpf header.
         */
-       hp = (struct bpf_hdr *)((char *)d->bd_sbuf + curlen);
-       hp->bh_tstamp = *tv;
-       hp->bh_datalen = pktlen;
-       hp->bh_hdrlen = hdrlen;
+       hp = (struct bpf_hdr *)((u_char *)d->bd_sbuf + curlen);
+       if (d->bd_extend) {
+               ehp = (ebpf_rec_t *)hp;
+               ehp->ebr_rlen = totlen;
+               ehp->ebr_seqno = d->bd_ccount;
+               ehp->ebr_pktoff = hdrlen;
+               ehp->ebr_flags = flags;
+               ehp->ebr_secs = tv->tv_sec;
+               ehp->ebr_nsecs = tv->tv_usec * 1000;
+               ehp->ebr_wlen = pktlen;
+               ehp->ebr_clen = caplen;
+               ehp->ebr_type = d->bd_bif->bif_dlt;
+               ehp->ebr_subtype = 0;
+       } else {
+               hp->bh_tstamp = *tv;
+               hp->bh_datalen = pktlen;
+               hp->bh_hdrlen = hdrlen;
+               hp->bh_caplen = caplen;
+       }
+
        /*
         * Copy the packet data into the store buffer and update its length.
         */
-       (*cpfn)((u_char *)hp + hdrlen, pkt, (hp->bh_caplen = totlen - hdrlen));
+       (*cpfn)((u_char *)hp + hdrlen, pkt, caplen);
        d->bd_slen = curlen + totlen;
 
        /*
@@ -1604,6 +1656,18 @@
         * performance reasons and to alleviate alignment restrictions).
         */
        bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
+       /*
+        * The common case is for ethernet, which has a header length of
+        * 14 bytes. If we copy the ethernet packet into a buffer where it
+        * starts on a 32bit boundary then the IP header will start on a
+        * 16bit boundary and accessing address information on platforms
+        * such as SPARC will result in a bus alignment trap (core dump.)
+        * To mitigate this, we pad the size of the header out to a
+        * multiple of 32bits.
+        */
+       bp->bif_exthdrlen = sizeof(ebpf_rec_t);
+       if (hdrlen & 3)
+               bp->bif_exthdrlen += 4 - (hdrlen & 3);
 
 #if 0
        printf("bpf: %s attached\n", ifp->if_xname);
@@ -1669,6 +1733,19 @@
         * performance reasons and to alleviate alignment restrictions).
         */
        bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
+
+       /*
+        * The common case is for ethernet, which has a header length of
+        * 14 bytes. If we copy the ethernet packet into a buffer where it
+        * starts on a 32bit boundary then the IP header will start on a
+        * 16bit boundary and accessing address information on platforms
+        * such as SPARC will result in a bus alignment trap (core dump.)
+        * To mitigate this, we pad the size of the header out to a
+        * multiple of 32bits.
+        */
+       bp->bif_exthdrlen = sizeof(ebpf_rec_t);
+       if (hdrlen & 3)
+               bp->bif_exthdrlen += 4 - (hdrlen & 3);
 }
 
 /*
Index: bpf.h
===================================================================
RCS file: /cvsroot/src/sys/net/bpf.h,v
retrieving revision 1.48
diff -u -r1.48 bpf.h
--- bpf.h       10 Dec 2005 23:21:38 -0000      1.48
+++ bpf.h       29 Dec 2008 23:53:07 -0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: bpf.h,v 1.48 2005/12/10 23:21:38 elad Exp $    */
+/*     $NetBSD: bpf.h,v 1.39 2004/05/29 14:18:33 darrenr Exp $ */
 
 /*
  * Copyright (c) 1990, 1991, 1993
@@ -134,6 +134,8 @@
 #define BIOCGDLTLIST   _IOWR('B',119, struct bpf_dltlist)
 #define BIOCGSEESENT    _IOR('B',120, u_int)
 #define BIOCSSEESENT    _IOW('B',121, u_int)
+#define BIOCGEXTEND     _IOR('B',122, u_int)
+#define BIOCSEXTEND     _IOW('B',123, u_int)
 
 /*
  * Structure prepended to each packet.
@@ -145,6 +147,7 @@
        uint16_t        bh_hdrlen;      /* length of bpf header (this struct
                                           plus alignment padding) */
 };
+
 /*
  * Because the structure above is not a multiple of 4 bytes, some compilers
  * will insist on inserting padding; hence, sizeof(struct bpf_hdr) won't work.
@@ -163,6 +166,34 @@
 #endif
 #endif
 
+/*
+ * Enhanced BPF packet record structure
+ *
+ * All lengths are in 32bit values because with the introduction of IPv6
+ * Jumbograms, 16bits is no longer big enough to represent the size of one
+ * entire packet.
+ *
+ * pktoff = offset from ebr_secs to the start of the packet data (may not be
+ *          the same as sizeof(ebr_rec_t))
+ */
+typedef struct ebpf_rec_s {
+       uint32_t        ebr_rlen;       /* Complete record length */
+       uint32_t        ebr_seqno;      /* sequence number in capture */
+       uint32_t        ebr_pktoff;     /* Offset to start of packet */
+       uint32_t        ebr_flags;
+       uint64_t        ebr_secs;       /* No more Y2k38 problem */
+       uint32_t        ebr_nsecs;      /* Note, 1ns = 1000us */
+       uint32_t        ebr_wlen;       /* Wire length of packet */
+       uint32_t        ebr_clen;       /* Captured length of packet */
+       uint16_t        ebr_type;       /* DLT_* type */
+       uint16_t        ebr_subtype;
+} ebpf_rec_t;
+
+/*
+ * flags defined for ebr_flags:
+ */
+#define        EBPF_OUT                0x00000001      /* Transmitted packet */
+
 /* Pull in data-link level type codes. */
 #include <net/dlt.h>
 
@@ -248,11 +279,11 @@
 
 #ifdef _KERNEL
 int     bpf_validate(struct bpf_insn *, int);
-void    bpf_tap(void *, u_char *, u_int);
-void    bpf_mtap(void *, struct mbuf *);
-void    bpf_mtap2(void *, void *, u_int, struct mbuf *);
-void    bpf_mtap_af(void *, uint32_t, struct mbuf *);
-void    bpf_mtap_et(void *, uint16_t, struct mbuf *);
+void    bpf_tap(void *, u_char *, u_int, u_int);
+void    bpf_mtap(void *, struct mbuf *, u_int);
+void    bpf_mtap2(void *, void *, u_int, struct mbuf *, u_int);
+void    bpf_mtap_af(void *, uint32_t, struct mbuf *, u_int);
+void    bpf_mtap_et(void *, uint16_t, struct mbuf *, u_int);
 void    bpf_mtap_sl_in(void *, u_char *, struct mbuf **);
 void    bpf_mtap_sl_out(void *, u_char *, struct mbuf *);
 void    bpfattach(struct ifnet *, u_int, u_int);


Home | Main Index | Thread Index | Old Index