My colleague Beverly Schwartz has found a problem in if_bnx.c where bnx_start takes packets off ifp->if_snd without a mutex. bnx_start can be called from multiple places (the output routine, the tx complete routine, and the allocate-more-tx-buffers routine). We have in fact seen calls to bnx_start while it is running, resulting in the packet being queued twice and a double free. With pure netbsd-6, we can lock up a machine (i386) with multiple bnx interfaces by running multiple transfers in both directions on multiple interfaces at once. With the following patch, it is solid. (I am pretty sure we are running with only 1 core.) I'm sending this as early as I can, because whatever change we converge on will need to be pulled up to netbsd-6 and I'd like some more eyes on it. There are a splnet calls around ifq_enqueue. So perhaps the bug is instead that bnx_start is somehow called not at splnet. But bnx_intr should be running at splnet, which should serialize access. Author: Bev Schwartz <bschwart@repo> Date: Mon Mar 5 15:02:04 2012 -0500 Fix to race condition in bnx_start The bnx driver has a defect which causes the same mbuf to be queued in the bnx driver twice. bnx_start is re-entrant, and the ifp->if_snd queue was not protected with a mutex, thus the same head could be grabbed and passed to the bnx code twice. Added mutex to protect ifp->if_snd. In addition, the variable used_tx_bd was not protected by a mutex, but should be. Use of tx_pkt_mtx was expanded to include protection for tx_used_pkts. diff --git a/netbsd/src/sys/dev/pci/if_bnxvar.h b/netbsd/src/sys/dev/pci/if_bnxvar.h index 685b7ce..1397a8d 100644 --- a/netbsd/src/sys/dev/pci/if_bnxvar.h +++ b/netbsd/src/sys/dev/pci/if_bnxvar.h @@ -258,11 +258,28 @@ struct bnx_softc bus_dma_segment_t tx_mbuf_seg; int tx_mbuf_rseg; + /* + * ifq_pkt_mtx is for protecting the ifp->if_snd queue in bnx_start. + * bnx_start can be called from interface queue handling in the + * kernel, or from the bnx interrupt handler in bnx_intr. This + * mutex prevents the same packet being queued in the tx_used_pkts + * twice. Note: The tx_pkt_mtx will be used when ifq_pkt_mtx is + * held. + */ + kmutex_t ifq_pkt_mtx; + /* S/W maintained mbuf TX chain structure. */ + /* + * Note: tx_pkt_mutex protect the tx_free_pkts and tx_used_pkt + * queues as well as the counters tx_pkt_count and used_tx_bd. + * The ifq_pkt_mtx may already be taken when we attempt to get + * tx_pkt_mtx. Therefore, ifq_pkt_mtx must not be taken when + * tx_pkt_mtx is held. + */ kmutex_t tx_pkt_mtx; - u_int tx_pkt_count; - struct bnx_pkt_list tx_free_pkts; - struct bnx_pkt_list tx_used_pkts; + u_int tx_pkt_count; /* take_tx_pkt_mutex */ + struct bnx_pkt_list tx_free_pkts; /* take tx_pkt_mutex */ + struct bnx_pkt_list tx_used_pkts; /* take tx_pkt_mutex */ /* S/W maintained mbuf RX chain structure. */ bus_dmamap_t rx_mbuf_map[TOTAL_RX_BD]; @@ -271,7 +288,7 @@ struct bnx_softc /* Track the number of rx_bd and tx_bd's in use. */ u_int16_t free_rx_bd; u_int16_t max_rx_bd; - u_int16_t used_tx_bd; + u_int16_t used_tx_bd; /* take tx_pkt_mutex */ u_int16_t max_tx_bd; /* Provides access to hardware statistics through sysctl. */ diff --git a/netbsd/src/sys/dev/pci/if_bnx.c b/netbsd/src/sys/dev/pci/if_bnx.c index 7b7f315..1b88d10 100644 --- a/netbsd/src/sys/dev/pci/if_bnx.c +++ b/netbsd/src/sys/dev/pci/if_bnx.c @@ -2416,6 +2416,7 @@ bnx_dma_alloc(struct bnx_softc *sc) TAILQ_INIT(&sc->tx_used_pkts); sc->tx_pkt_count = 0; mutex_init(&sc->tx_pkt_mtx, MUTEX_DEFAULT, IPL_NET); + mutex_init(&sc->ifq_pkt_mtx, MUTEX_DEFAULT, IPL_NET); /* * Allocate DMA memory for the Rx buffer descriptor chain, @@ -4056,7 +4057,9 @@ bnx_free_tx_chain(struct bnx_softc *sc) BNX_TX_CHAIN_PAGE_SZ, BUS_DMASYNC_PREWRITE); } + mutex_enter(&sc->tx_pkt_mtx); sc->used_tx_bd = 0; + mutex_exit(&sc->tx_pkt_mtx); /* Check if we lost any mbufs in the process. */ DBRUNIF((sc->tx_mbuf_alloc), @@ -4676,9 +4679,9 @@ bnx_tx_intr(struct bnx_softc *sc) mutex_enter(&sc->tx_pkt_mtx); TAILQ_INSERT_TAIL(&sc->tx_free_pkts, pkt, pkt_entry); } - mutex_exit(&sc->tx_pkt_mtx); sc->used_tx_bd--; + mutex_exit(&sc->tx_pkt_mtx); DBPRINT(sc, BNX_INFO_SEND, "%s(%d) used_tx_bd %d\n", __FILE__, __LINE__, sc->used_tx_bd); @@ -4702,6 +4705,7 @@ bnx_tx_intr(struct bnx_softc *sc) ifp->if_timer = 0; /* Clear the tx hardware queue full flag. */ + mutex_enter(&sc->tx_pkt_mtx); if (sc->used_tx_bd < sc->max_tx_bd) { DBRUNIF((ifp->if_flags & IFF_OACTIVE), aprint_debug_dev(sc->bnx_dev, @@ -4709,6 +4713,7 @@ bnx_tx_intr(struct bnx_softc *sc) sc->used_tx_bd, sc->max_tx_bd)); ifp->if_flags &= ~IFF_OACTIVE; } + mutex_exit(&sc->tx_pkt_mtx); sc->tx_cons = sw_tx_cons; } @@ -4912,8 +4917,13 @@ bnx_tx_encap(struct bnx_softc *sc, struct mbuf *m) bus_dmamap_sync(sc->bnx_dmatag, map, 0, map->dm_mapsize, BUS_DMASYNC_PREWRITE); /* Make sure there's room in the chain */ - if (map->dm_nsegs > (sc->max_tx_bd - sc->used_tx_bd)) + mutex_enter(&sc->tx_pkt_mtx); + if (map->dm_nsegs > (sc->max_tx_bd - sc->used_tx_bd)) { + mutex_exit(&sc->tx_pkt_mtx); goto nospace; + } else { + mutex_exit(&sc->tx_pkt_mtx); + } /* prod points to an empty tx_bd at this point. */ prod_bseq = sc->tx_prod_bseq; @@ -4962,9 +4972,9 @@ bnx_tx_encap(struct bnx_softc *sc, struct mbuf *m) mutex_enter(&sc->tx_pkt_mtx); TAILQ_INSERT_TAIL(&sc->tx_used_pkts, pkt, pkt_entry); - mutex_exit(&sc->tx_pkt_mtx); sc->used_tx_bd += map->dm_nsegs; + mutex_exit(&sc->tx_pkt_mtx); DBPRINT(sc, BNX_INFO_SEND, "%s(%d) used_tx_bd %d\n", __FILE__, __LINE__, sc->used_tx_bd); @@ -5006,7 +5016,7 @@ bnx_start(struct ifnet *ifp) struct bnx_softc *sc = ifp->if_softc; struct mbuf *m_head = NULL; int count = 0; - u_int16_t tx_prod, tx_chain_prod; + u_int16_t tx_prod, tx_chain_prod, used_tx_bd; /* If there's no link or the transmit queue is empty then just exit. */ if ((ifp->if_flags & (IFF_OACTIVE|IFF_RUNNING)) != IFF_RUNNING) { @@ -5028,11 +5038,22 @@ bnx_start(struct ifnet *ifp) /* * Keep adding entries while there is space in the ring. */ - while (sc->used_tx_bd < sc->max_tx_bd) { + mutex_enter(&sc->tx_pkt_mtx); + used_tx_bd = sc->used_tx_bd; + mutex_exit(&sc->tx_pkt_mtx); + while (used_tx_bd < sc->max_tx_bd) { + + /* + * Take ifq_pkt_mtx to prevent the same + * mbuf being enqueued in bnx twice. + */ + mutex_enter(&sc->ifq_pkt_mtx); /* Check for any frames to send. */ IFQ_POLL(&ifp->if_snd, m_head); - if (m_head == NULL) + if (m_head == NULL) { + mutex_exit(&sc->ifq_pkt_mtx); break; + } /* * Pack the data into the transmit ring. If we @@ -5043,15 +5064,21 @@ bnx_start(struct ifnet *ifp) ifp->if_flags |= IFF_OACTIVE; DBPRINT(sc, BNX_INFO_SEND, "TX chain is closed for " "business! Total tx_bd used = %d\n", - sc->used_tx_bd); + used_tx_bd); + mutex_exit(&sc->ifq_pkt_mtx); break; } IFQ_DEQUEUE(&ifp->if_snd, m_head); + mutex_exit(&sc->ifq_pkt_mtx); count++; /* Send a copy of the frame to any BPF listeners. */ bpf_mtap(ifp, m_head); + + mutex_enter(&sc->tx_pkt_mtx); + used_tx_bd = sc->used_tx_bd; + mutex_exit(&sc->tx_pkt_mtx); } if (count == 0) {
Attachment:
pgpwW4evAJi9A.pgp
Description: PGP signature