Subject: Slight change to mbuf external storage infrastructure
To: None <tech-net@netbsd.org>
From: Jason R Thorpe <thorpej@wasabisystems.com>
List: tech-net
Date: 04/29/2002 10:28:18
--PpAOPzA3dXsRhoo+
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

I'm doing some work on zero-copy TCP, and have bumped into a problem
with how the external storage facility in our mbuf system works.

Basically, the current mbuf external storage implementation assumes that
it is safe to free external storage from any context where it is safe to
free an mbuf.  Now, this is currently true for all current uses of external
storage (cluster mbufs, malloc()'d data attached to mbuf, and the two uses
of external storage in in-tree network interface drivers).

However, this is not the case for external storage which is created using
uvm_loan(); freeing that storage in interrupt context would horribly break
the UVM page/object locking protocols.

What I need to do with uvm_loan()'d mbuf external storage is to defer the
releasing of that storage to a valid thread context; either the next time
an operation is performed on the socket, or when the pagedaemon runs to
free up resources.

In order to do that, I need some data structure to do bookkeeping on the
external storage.  The mbuf itself is a pretty convenient mechanism to do
this, and using it saves me from having to perform an allocation in order
to free something.

Unfortunately, the mbuf is always freed by MFREE(), such that I can't use
the mbuf to perform that bookkeeping task.

What I would like to do is change the protocol for how the mbuf is freed in
MFREE().  Basically, I want it to work like this:

	* If mbuf has no external storage, free mbuf.

	* Otherwise, determine how that storage was allocated:

		* If external storage referenced by another mbuf,
		  decrement reference count, free mbuf.

		* If external storage is a cluster, free cluster
		  and mbuf.

		* If external storage is plain malloc()'d storage,
		  free storage and mbuf.

		* Otherwise, we the external storage might require
		  extra bookkeeping to release.  Call (*ext_free)(),
		  and let it free the mbuf when it is safe to do so.

This requires adding an "mbuf *" argument to the (*ext_free)() function.  If
the new mbuf * argment is non-NULL, then (*ext_free)() is responsible for
freeing the mbuf when it is safe to do so.  The mbuf * argument will be NULL
if (*ext_free)() is invoked via MEXTREMOVE(); we will make the assumption that
that macro will be used only when safe to actually free the external storage
(it is meant for use by the subsystem that attached the external storage in
the first place; see how it is used in the NFS code).

The following patch implements this change to (*ext_free)().  Any objections?

-- 
        -- Jason R. Thorpe <thorpej@wasabisystems.com>

--PpAOPzA3dXsRhoo+
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=mbuf-patch

Index: sys/mbuf.h
===================================================================
RCS file: /cvsroot/syssrc/sys/sys/mbuf.h,v
retrieving revision 1.64
diff -u -r1.64 mbuf.h
--- sys/mbuf.h	2002/03/09 01:46:32	1.64
+++ sys/mbuf.h	2002/04/29 17:05:10
@@ -150,7 +150,7 @@
 struct m_ext {
 	caddr_t	ext_buf;		/* start of buffer */
 	void	(*ext_free)		/* free routine if not the usual */
-		__P((caddr_t, u_int, void *));
+		__P((struct mbuf *, caddr_t, u_int, void *));
 	void	*ext_arg;		/* argument for ext_free */
 	u_int	ext_size;		/* size of buffer, for ext_free */
 	int	ext_type;		/* malloc type */
@@ -392,24 +392,32 @@
 	MCLINITREFERENCE(m);						\
 } while (/* CONSTCOND */ 0)
 
-#define	_MEXTREMOVE(m)							\
+#define	MEXTREMOVE(m)							\
 do {									\
+	int _ms_ = splvm(); /* MBUFLOCK */				\
 	if (MCLISREFERENCED(m)) {					\
 		_MCLDEREFERENCE(m);					\
+		splx(_ms_);						\
 	} else if ((m)->m_flags & M_CLUSTER) {				\
 		pool_cache_put(&mclpool_cache, (m)->m_ext.ext_buf);	\
+		splx(_ms_);						\
 	} else if ((m)->m_ext.ext_free) {				\
-		(*((m)->m_ext.ext_free))((m)->m_ext.ext_buf,		\
+		/*							\
+		 * NOTE: We assume that MEXTREMOVE() is called from	\
+		 * code where it is safe to invoke the free routine	\
+		 * without the mbuf to perform bookkeeping.		\
+		 */							\
+		splx(_ms_);						\
+		(*((m)->m_ext.ext_free))(NULL, (m)->m_ext.ext_buf,	\
 		    (m)->m_ext.ext_size, (m)->m_ext.ext_arg);		\
 	} else {							\
-		free((m)->m_ext.ext_buf,(m)->m_ext.ext_type);		\
+		splx(_ms_);						\
+		free((m)->m_ext.ext_buf, (m)->m_ext.ext_type);		\
 	}								\
 	(m)->m_flags &= ~(M_CLUSTER|M_EXT);				\
 	(m)->m_ext.ext_size = 0;	/* why ??? */			\
 } while (/* CONSTCOND */ 0)
 
-#define	MEXTREMOVE(m)	MBUFLOCK(_MEXTREMOVE((m));)
-
 /*
  * Reset the data pointer on an mbuf.
  */
@@ -435,15 +443,36 @@
 #define	MFREE(m, n)							\
 	MBUFLOCK(							\
 		mbstat.m_mtypes[(m)->m_type]--;				\
-		if (((m)->m_flags & M_PKTHDR) != 0 && (m)->m_pkthdr.aux) { \
+		if (((m)->m_flags & M_PKTHDR) != 0 &&			\
+		    (m)->m_pkthdr.aux != NULL) {			\
 			m_freem((m)->m_pkthdr.aux);			\
 			(m)->m_pkthdr.aux = NULL;			\
 		}							\
-		if ((m)->m_flags & M_EXT) {				\
-			_MEXTREMOVE((m));				\
-		}							\
 		(n) = (m)->m_next;					\
-		pool_cache_put(&mbpool_cache, (m));			\
+		if ((m)->m_flags & M_EXT) {				\
+			if (MCLISREFERENCED(m)) {			\
+				_MCLDEREFERENCE(m);			\
+				pool_cache_put(&mbpool_cache, (m));	\
+			} else if ((m)->m_flags & M_CLUSTER) {		\
+				pool_cache_put(&mclpool_cache,		\
+				    (m)->m_ext.ext_buf);		\
+				pool_cache_put(&mbpool_cache, (m));	\
+			} else if ((m)->m_ext.ext_free) {		\
+				/*					\
+				 * (*ext_free)() is responsible for	\
+				 * freeing the mbuf when it is safe.	\
+				 */					\
+				(*((m)->m_ext.ext_free))((m),		\
+				    (m)->m_ext.ext_buf,			\
+				    (m)->m_ext.ext_size,		\
+				    (m)->m_ext.ext_arg);		\
+			} else {					\
+				free((m)->m_ext.ext_buf,		\
+				    (m)->m_ext.ext_type);		\
+				pool_cache_put(&mbpool_cache, (m));	\
+			}						\
+		} else							\
+			pool_cache_put(&mbpool_cache, (m));		\
 	)
 
 /*
Index: dev/pci/if_ti.c
===================================================================
RCS file: /cvsroot/syssrc/sys/dev/pci/if_ti.c,v
retrieving revision 1.46
diff -u -r1.46 if_ti.c
--- dev/pci/if_ti.c	2002/04/28 01:00:26	1.46
+++ dev/pci/if_ti.c	2002/04/29 17:05:13
@@ -195,7 +195,7 @@
 static void ti_handle_events	__P((struct ti_softc *));
 static int ti_alloc_jumbo_mem	__P((struct ti_softc *));
 static void *ti_jalloc		__P((struct ti_softc *));
-static void ti_jfree		__P((caddr_t, u_int, void *));
+static void ti_jfree		__P((struct mbuf *, caddr_t, u_int, void *));
 static int ti_newbuf_std	__P((struct ti_softc *, int, struct mbuf *, bus_dmamap_t));
 static int ti_newbuf_mini	__P((struct ti_softc *, int, struct mbuf *, bus_dmamap_t));
 static int ti_newbuf_jumbo	__P((struct ti_softc *, int, struct mbuf *));
@@ -682,13 +682,14 @@
 /*
  * Release a jumbo buffer.
  */
-static void ti_jfree(buf, size, arg)
+static void ti_jfree(m, buf, size, arg)
+	struct mbuf		*m;
 	caddr_t			buf;
 	u_int			size;
 	void *arg;
 {
 	struct ti_softc		*sc;
-	int		        i;
+	int		        i, s;
 	struct ti_jpool_entry   *entry;
 
 	/* Extract the softc struct pointer. */
@@ -704,6 +705,8 @@
 
 	if ((i < 0) || (i >= TI_JSLOTS))
 		panic("ti_jfree: asked to free buffer that we don't manage!");
+
+	s = splvm();
 	entry = SIMPLEQ_FIRST(&sc->ti_jinuse_listhead);
 	if (entry == NULL)
 		panic("ti_jfree: buffer not in use!");
@@ -713,7 +716,9 @@
 	SIMPLEQ_INSERT_HEAD(&sc->ti_jfree_listhead, 
 	     entry, jpool_entries);
 
-	return;
+	if (__predict_true(m != NULL))
+		pool_cache_put(&mbpool_cache, m);
+	splx(s);
 }
 
 
Index: arch/alpha/a12/if_ade.c
===================================================================
RCS file: /cvsroot/syssrc/sys/arch/alpha/a12/if_ade.c,v
retrieving revision 1.12
diff -u -r1.12 if_ade.c
--- arch/alpha/a12/if_ade.c	2001/07/12 23:35:42	1.12
+++ arch/alpha/a12/if_ade.c	2002/04/29 17:05:21
@@ -234,8 +234,16 @@
  * course, they won't be needing de(4) drivers.
  */
 static void
-donothing(caddr_t m, u_int p, void *q)
+donothing(struct mbuf *m, caddr_t buf, u_int size, void *arg)
 {
+	int s;
+
+	if (__predict_true(m != NULL)) {
+		s = splvm();
+		pool_cache_put(&mbpool_cache, m);
+		splx(s);
+	}
+
 }
 static void a12r2pb(void *vsrc, void *vdst, int len) {
 	long	bounce[9];

--PpAOPzA3dXsRhoo+--