Subject: mbuf external storage sharing
To: None <tech-net@netbsd.org>
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: tech-net
Date: 10/02/2004 19:32:10
--NextPart-20041002192117-2591600
Content-Type: Text/Plain; charset=us-ascii

hi,

the attached diffs change the way mbuf external storage is shared.
with the current linked-list method it's difficult to be mp-safe
without a global lock, because the flags etc. are duplicated in each
sharing mbuf rather than really shared.
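
roughly, the change replaces the per-mbuf circular list of sharers with a
separate, reference-counted header.  a simplified before/after sketch
(field subset only, names taken from the diff below, not the exact
declarations):

/* current: the ext fields live in each sharing mbuf; sharers are chained
 * on a circular list, so the flags are per-mbuf copies, not shared. */
struct _m_ext {
	caddr_t	ext_buf;		/* start of buffer */
	size_t	ext_size;
	/* ... */
	struct mbuf *ext_nextref;	/* circular list of sharing mbufs */
	struct mbuf *ext_prevref;
};
/* shared iff (m)->m_ext.ext_nextref != (m) */

/* proposed: one mexthdr per external buffer; every sharing mbuf points
 * at the same header, so the refcount and flags are genuinely shared. */
struct mexthdr {
	struct simplelock ext_lock;	/* protects ext_refcnt */
	int	ext_refcnt;		/* number of referencing mbufs */
	int	ext_flags;		/* M_EXT_* flags, moved out of m_flags */
	caddr_t	ext_buf;		/* start of buffer */
	size_t	ext_size;
	/* ... */
};
/* shared iff MEXT(m)->ext_refcnt != 1 */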

comments?

YAMAMOTO Takashi

--NextPart-20041002192117-2591600
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="extref.diff"

Index: sys/mbuf.h
===================================================================
--- sys/mbuf.h	(revision 900)
+++ sys/mbuf.h	(working copy)
@@ -177,34 +177,27 @@ struct	pkthdr {
 						 * is not yet 1s-complemented.
 						 */
 
+#if defined(_KERNEL)
 /*
  * Max # of pages we can attach to m_ext.  This is carefully chosen
  * to be able to handle SOSEND_LOAN_CHUNK with our minimum sized page.
  */
-#ifdef MIN_PAGE_SIZE
 #define	M_EXT_MAXPAGES		((65536 / MIN_PAGE_SIZE) + 1)
-#endif
 
 /* description of external storage mapped into mbuf, valid if M_EXT set */
-struct _m_ext {
+struct mexthdr {
+	struct simplelock ext_lock;
+	int	ext_refcnt;
+	int	ext_flags;		/* M_EXT_ flags */
 	caddr_t	ext_buf;		/* start of buffer */
 	void	(*ext_free)		/* free routine if not the usual */
 		(struct mbuf *, caddr_t, size_t, void *);
 	void	*ext_arg;		/* argument for ext_free */
 	size_t	ext_size;		/* size of buffer, for ext_free */
-	struct malloc_type *ext_type;	/* malloc type */
-	struct mbuf *ext_nextref;
-	struct mbuf *ext_prevref;
 	union {
 		paddr_t extun_paddr;	/* physical address (M_EXT_CLUSTER) */
 					/* pages (M_EXT_PAGES) */
-	/*
-	 * XXX This is gross, but it doesn't really matter; this is
-	 * XXX overlaid on top of the mbuf data area.
-	 */
-#ifdef M_EXT_MAXPAGES
 		struct vm_page *extun_pgs[M_EXT_MAXPAGES];
-#endif
 	} ext_un;
 #define	ext_paddr	ext_un.extun_paddr
 #define	ext_pgs		ext_un.extun_pgs
@@ -216,6 +209,22 @@ struct _m_ext {
 #endif
 };
 
+/*
+ * XXX a compatibility hack to make m->m_ext.ext_something work.
+ * it's recommended to use MEXT(m)->ext_something instead.
+ */
+struct mexthdr_compatwrapper {
+	struct mexthdr ext_hdr;
+};
+#define	m_ext	M_dat.MH.MH_dat.MH_ext.u.ext_hdr_compat->ext_hdr
+
+struct _m_ext {
+	union {
+		struct mexthdr *ext_hdr;
+		struct mexthdr_compatwrapper *ext_hdr_compat;
+	} u;
+};
+
 #define	M_PADDR_INVALID		POOL_PADDR_INVALID
 
 /*
@@ -245,7 +254,7 @@ struct _m_ext {
 #define	m_nextpkt	m_hdr.mh_nextpkt
 #define	m_paddr		m_hdr.mh_paddr
 #define	m_pkthdr	M_dat.MH.MH_pkthdr
-#define	m_ext		M_dat.MH.MH_dat.MH_ext
+#define	m_exthdr	M_dat.MH.MH_dat.MH_ext.u.ext_hdr
 #define	m_pktdat	M_dat.MH.MH_dat.MH_databuf
 #define	m_dat		M_dat.M_databuf
 
@@ -288,11 +297,12 @@ MBUF_DEFINE(mbuf, MHLEN, MLEN);
 #define	M_LINK2		0x4000	/* link layer specific flag */
 
 /* additional flags for M_EXT mbufs */
-#define	M_EXT_FLAGS	0xff000000
-#define	M_EXT_CLUSTER	0x01000000	/* ext is a cluster */
-#define	M_EXT_PAGES	0x02000000	/* ext_pgs is valid */
-#define	M_EXT_ROMAP	0x04000000	/* ext mapping is r-o at MMU */
-#define	M_EXT_RW	0x08000000	/* ext storage is writable */
+#define	M_EXT_FLAGS		0xff000000
+#define	M_EXT_CLUSTER		0x01000000	/* ext is a cluster */
+#define	M_EXT_PAGES		0x02000000	/* ext_pgs is valid */
+#define	M_EXT_ROMAP		0x04000000	/* ext mapping is r-o at MMU */
+#define	M_EXT_RW		0x08000000	/* ext storage is writable */
+#define	M_EXT_HDREMBEDDED	0x10000000	/* mexthdr is embedded */
 
 /* for source-level compatibility */
 #define	M_CLUSTER	M_EXT_CLUSTER
@@ -300,9 +310,6 @@ MBUF_DEFINE(mbuf, MHLEN, MLEN);
 /* flags copied when copying m_pkthdr */
 #define	M_COPYFLAGS	(M_PKTHDR|M_EOR|M_BCAST|M_MCAST|M_CANFASTFWD|M_ANYCAST6|M_LINK0|M_LINK1|M_LINK2|M_AUTHIPHDR|M_DECRYPTED|M_LOOP|M_AUTHIPDGM)
 
-/* flag copied when shallow-copying external storage */
-#define	M_EXTCOPYFLAGS	(M_EXT|M_EXT_FLAGS)
-
 /* mbuf types */
 #define	MT_FREE		0	/* should be on free list */
 #define	MT_DATA		1	/* dynamic (data) allocation */
@@ -362,10 +369,11 @@ do {									\
 #define	_MOWNERCLAIM(m, mowner)	do {					\
 	(m)->m_owner = (mowner);					\
 	(mowner)->mo_claims++;						\
-	if ((m)->m_flags & M_EXT)					\
+	if ((m)->m_flags & M_EXT) {					\
 		(mowner)->mo_ext_claims++;				\
-	if ((m)->m_flags & M_CLUSTER)					\
-		(mowner)->mo_cluster_claims++;				\
+		if (MEXT(m)->ext_flags & M_EXT_CLUSTER)			\
+			(mowner)->mo_cluster_claims++;			\
+	}								\
 } while (/* CONSTCOND */ 0)
 
 #define	MCLAIM(m, mowner) 						\
@@ -440,7 +448,6 @@ do {									\
 	}								\
 } while (/* CONSTCOND */ 0)
 
-#if defined(_KERNEL)
 #define	_M_
 /*
  * Macros for tracking external storage associated with an mbuf.
@@ -450,14 +457,14 @@ do {									\
 #ifdef DEBUG
 #define MCLREFDEBUGN(m, file, line)					\
 do {									\
-	(m)->m_ext.ext_nfile = (file);					\
-	(m)->m_ext.ext_nline = (line);					\
+	MEXT(m)->ext_nfile = (file);					\
+	MEXT(m)->ext_nline = (line);					\
 } while (/* CONSTCOND */ 0)
 
 #define MCLREFDEBUGO(m, file, line)					\
 do {									\
-	(m)->m_ext.ext_ofile = (file);					\
-	(m)->m_ext.ext_oline = (line);					\
+	MEXT(m)->ext_ofile = (file);					\
+	MEXT(m)->ext_oline = (line);					\
 } while (/* CONSTCOND */ 0)
 #else
 #define MCLREFDEBUGN(m, file, line)
@@ -465,35 +472,41 @@ do {									\
 #endif
 
 #define	MCLBUFREF(p)
-#define	MCLISREFERENCED(m)	((m)->m_ext.ext_nextref != (m))
+#define	MCLISREFERENCED(m)	(MEXT(m)->ext_refcnt != 1)
 #define	_MCLDEREFERENCE(m)						\
 do {									\
-	(m)->m_ext.ext_nextref->m_ext.ext_prevref =			\
-		(m)->m_ext.ext_prevref;					\
-	(m)->m_ext.ext_prevref->m_ext.ext_nextref =			\
-		(m)->m_ext.ext_nextref;					\
+	KASSERT((m)->m_flags & M_EXT);					\
+	KASSERT(MEXT(m)->ext_refcnt > 1);				\
+	MEXT(m)->ext_refcnt--;						\
 } while (/* CONSTCOND */ 0)
 
 #define	_MCLADDREFERENCE(o, n)						\
 do {									\
-	(n)->m_flags |= ((o)->m_flags & M_EXTCOPYFLAGS);		\
-	(n)->m_ext.ext_nextref = (o)->m_ext.ext_nextref;		\
-	(n)->m_ext.ext_prevref = (o);					\
-	(o)->m_ext.ext_nextref = (n);					\
-	(n)->m_ext.ext_nextref->m_ext.ext_prevref = (n);		\
+	KASSERT((o)->m_flags & M_EXT);					\
+	KASSERT(((n)->m_flags & M_EXT) == 0);				\
+	KASSERT(MEXT(o)->ext_refcnt >= 1);				\
+	(n)->m_flags |= M_EXT;						\
+	MEXT(o)->ext_refcnt++;						\
+	(n)->m_exthdr = (o)->m_exthdr;					\
 	_MOWNERREF((n), (n)->m_flags);					\
 	MCLREFDEBUGN((n), __FILE__, __LINE__);				\
 } while (/* CONSTCOND */ 0)
 
 #define	MCLINITREFERENCE(m)						\
 do {									\
-	(m)->m_ext.ext_prevref = (m);					\
-	(m)->m_ext.ext_nextref = (m);					\
+	MEXT(m)->ext_refcnt = 1;					\
+	MEXT_LOCK_INIT(MEXT(m));					\
 	MCLREFDEBUGO((m), __FILE__, __LINE__);				\
 	MCLREFDEBUGN((m), NULL, 0);					\
 } while (/* CONSTCOND */ 0)
 
-#define	MCLADDREFERENCE(o, n)	MBUFLOCK(_MCLADDREFERENCE((o), (n));)
+#define	MCLADDREFERENCE(o, n)						\
+	MBUFLOCK(							\
+		MEXT_LOCK(MEXT(o));					\
+		_MCLADDREFERENCE((o), (n));				\
+		MEXT_UNLOCK(MEXT(o));					\
+	)
+
 
 /*
  * Macros for mbuf external storage.
@@ -507,27 +520,19 @@ do {									\
  * MEXTADD adds pre-allocated external storage to
  * a normal mbuf; the flag M_EXT is set upon success.
  */
-#define	_MCLGET(m, pool_cache, size, how)				\
-do {									\
-	MBUFLOCK(							\
-		(m)->m_ext.ext_buf =					\
-		    pool_cache_get_paddr((pool_cache),			\
-		        (how) == M_WAIT ? (PR_WAITOK|PR_LIMITFAIL) : 0,	\
-			&(m)->m_ext.ext_paddr);				\
-		if ((m)->m_ext.ext_buf != NULL)				\
-			_MOWNERREF((m), M_EXT|M_CLUSTER);		\
-	);								\
-	if ((m)->m_ext.ext_buf != NULL) {				\
-		(m)->m_data = (m)->m_ext.ext_buf;			\
-		(m)->m_flags = ((m)->m_flags & ~M_EXTCOPYFLAGS) |	\
-				M_EXT|M_CLUSTER|M_EXT_RW;		\
-		(m)->m_ext.ext_size = (size);				\
-		(m)->m_ext.ext_free = NULL;				\
-		(m)->m_ext.ext_arg = (pool_cache);			\
-		/* ext_paddr initialized above */			\
-		MCLINITREFERENCE(m);					\
-	}								\
-} while (/* CONSTCOND */ 0)
+
+#define	MEXT(m)		((m)->m_exthdr)
+#define	MEXT_LOCK(e)	simple_lock(&(e)->ext_lock)
+#define	MEXT_UNLOCK(e)	simple_unlock(&(e)->ext_lock)
+#define	MEXT_LOCK_INIT(e) simple_lock_init(&(e)->ext_lock)
+#define	MEXTHDR_GET(how)	_mexthdr_get(how)
+#define	MEXTHDR_PUT(ext)	\
+	MBUFLOCK(pool_cache_put(&mexthdrpool_cache, (ext));)
+#define	_MEXTHDR_GET(m, how)	(m)->m_exthdr = MEXTHDR_GET(how)
+#define	_MEXTHDR_PUT(m)		MEXTHDR_PUT((m)->m_exthdr)
+
+#define	_MCLGET(m, pool_cache, size, how) \
+	_m_clget((m), (pool_cache), (size), (how))
 
 /*
  * The standard mbuf cluster pool.
@@ -536,41 +541,58 @@ do {									\
 
 #define	MEXTMALLOC(m, size, how)					\
 do {									\
-	(m)->m_ext.ext_buf =						\
-	    (caddr_t)malloc((size), mbtypes[(m)->m_type], (how));	\
-	if ((m)->m_ext.ext_buf != NULL) {				\
-		(m)->m_data = (m)->m_ext.ext_buf;			\
-		(m)->m_flags = ((m)->m_flags & ~M_EXTCOPYFLAGS) |	\
-				M_EXT|M_EXT_RW;				\
-		(m)->m_ext.ext_size = (size);				\
-		(m)->m_ext.ext_free = NULL;				\
-		(m)->m_ext.ext_arg = NULL;				\
-		(m)->m_ext.ext_type = mbtypes[(m)->m_type];		\
-		MCLINITREFERENCE(m);					\
-		MOWNERREF((m), M_EXT);					\
+	size_t realsize = ALIGN(size) + sizeof(struct mexthdr);		\
+	void *p = malloc((realsize), mbtypes[(m)->m_type], (how));	\
+	if (p != NULL) {						\
+		(m)->m_exthdr = (void *)((char *)p + ALIGN(size));	\
+		_MEXTADD((m), p, (size), M_EXT_RW | M_EXT_HDREMBEDDED,	\
+		    _mext_free_malloc, mbtypes[(m)->m_type]);		\
 	}								\
 } while (/* CONSTCOND */ 0)
 
-#define	MEXTADD(m, buf, size, type, free, arg)				\
+#define	MEXTADD(m, buf, size, extflags, free, arg, how)			\
+do {									\
+	KASSERT(((m)->m_flags & M_EXT) == 0);				\
+	_MEXTHDR_GET(m, how);						\
+	_MEXTADD((m), (buf), (size), (extflags), (free), (arg));	\
+} while (/* CONSTCOND */ 0)
+
+/* MEXTADD for malloc(9)'ed buffer */
+#define	MEXTADD_MALLOC(m, buf, size, extflags, type, how)		\
+	MEXTADD((m), (buf), (size), (extflags), _mext_free_malloc,	\
+	    (type), (how))
+
+/*
+ * MEXTADD2: MEXTADD with preallocated mexthdr
+ */
+#define	MEXTADD2(m, buf, size, extflags, free, arg, exthdr)		\
+do {									\
+	KASSERT(((m)->m_flags & M_EXT) == 0);				\
+	(m)->m_exthdr = (exthdr);					\
+	_MEXTADD((m), (buf), (size), (extflags), (free), (arg));	\
+} while (/* CONSTCOND */ 0)
+
+#define	_MEXTADD(m, buf, size, extflags, free, arg)			\
 do {									\
-	(m)->m_data = (m)->m_ext.ext_buf = (caddr_t)(buf);		\
-	(m)->m_flags = ((m)->m_flags & ~M_EXTCOPYFLAGS) | M_EXT;	\
-	(m)->m_ext.ext_size = (size);					\
-	(m)->m_ext.ext_free = (free);					\
-	(m)->m_ext.ext_arg = (arg);					\
-	(m)->m_ext.ext_type = (type);					\
-	MCLINITREFERENCE(m);						\
-	MOWNERREF((m), M_EXT);						\
+	if (MEXT(m)) {							\
+		MCLINITREFERENCE(m);					\
+		(m)->m_data = MEXT(m)->ext_buf = (caddr_t)(buf);	\
+		(m)->m_flags |= M_EXT;					\
+		MEXT(m)->ext_flags = (extflags);			\
+		MEXT(m)->ext_size = (size);				\
+		MEXT(m)->ext_free = (free);				\
+		MEXT(m)->ext_arg = (arg);				\
+		MOWNERREF((m), M_EXT);					\
+	}								\
 } while (/* CONSTCOND */ 0)
 
 #define	MEXTREMOVE(m)							\
 do {									\
 	int _ms_ = splvm(); /* MBUFLOCK */				\
 	_MOWNERREVOKE((m), 0, (m)->m_flags);				\
-	m_ext_free(m, FALSE);						\
+	_m_ext_free(m, FALSE);						\
 	splx(_ms_);							\
-	(m)->m_flags &= ~M_EXTCOPYFLAGS;				\
-	(m)->m_ext.ext_size = 0;	/* why ??? */			\
+	(m)->m_flags &= ~M_EXT;						\
 } while (/* CONSTCOND */ 0)
 
 /*
@@ -579,7 +601,7 @@ do {									\
 #define	MRESETDATA(m)							\
 do {									\
 	if ((m)->m_flags & M_EXT)					\
-		(m)->m_data = (m)->m_ext.ext_buf;			\
+		(m)->m_data = MEXT(m)->ext_buf;				\
 	else if ((m)->m_flags & M_PKTHDR)				\
 		(m)->m_data = (m)->m_pktdat;				\
 	else								\
@@ -592,6 +614,8 @@ do {									\
  * Place the successor, if any, in n.
  */
 #define	MFREE(m, n)							\
+do {									\
+	KASSERT(((m)->m_flags & M_EXT_FLAGS) == 0);			\
 	MBUFLOCK(							\
 		mbstat.m_mtypes[(m)->m_type]--;				\
 		if ((m)->m_flags & M_PKTHDR)				\
@@ -599,11 +623,12 @@ do {									\
 		(n) = (m)->m_next;					\
 		_MOWNERREVOKE((m), 1, m->m_flags);			\
 		if ((m)->m_flags & M_EXT) {				\
-			m_ext_free(m, TRUE);				\
+			_m_ext_free(m, TRUE);				\
 		} else {						\
 			pool_cache_put(&mbpool_cache, (m));		\
 		}							\
-	)
+	);								\
+} while (/* CONSTCOND */ 0)
 
 /*
  * Copy mbuf pkthdr from `from' to `to'.
@@ -643,21 +668,21 @@ do {									\
  */
 #define	M_READONLY(m)							\
 	(((m)->m_flags & M_EXT) != 0 &&					\
-	  (((m)->m_flags & (M_EXT_ROMAP|M_EXT_RW)) != M_EXT_RW ||	\
+	  ((MEXT(m)->ext_flags & (M_EXT_ROMAP|M_EXT_RW)) != M_EXT_RW ||	\
 	  MCLISREFERENCED(m)))
 
 /*
  * Determine if an mbuf's data area is read-only at the MMU.
  */
 #define	M_ROMAP(m)							\
-	(((m)->m_flags & (M_EXT|M_EXT_ROMAP)) == (M_EXT|M_EXT_ROMAP))
+	(((m)->m_flags & M_EXT) && (MEXT(m)->ext_flags & M_EXT_ROMAP))
 
 /*
  * Compute the amount of space available
  * before the current start of data in an mbuf.
  */
 #define	_M_LEADINGSPACE(m)						\
-	((m)->m_flags & M_EXT ? (m)->m_data - (m)->m_ext.ext_buf :	\
+	((m)->m_flags & M_EXT ? (m)->m_data - MEXT(m)->ext_buf :	\
 	 (m)->m_flags & M_PKTHDR ? (m)->m_data - (m)->m_pktdat :	\
 	 (m)->m_data - (m)->m_dat)
 
@@ -669,7 +694,7 @@ do {									\
  * after the end of data in an mbuf.
  */
 #define	_M_TRAILINGSPACE(m)						\
-	((m)->m_flags & M_EXT ? (m)->m_ext.ext_buf + (m)->m_ext.ext_size - \
+	((m)->m_flags & M_EXT ? MEXT(m)->ext_buf + MEXT(m)->ext_size -	\
 	 ((m)->m_data + (m)->m_len) :					\
 	 &(m)->m_dat[MLEN] - ((m)->m_data + (m)->m_len))
 
@@ -781,8 +806,10 @@ extern const int msize;			/* mbuf base s
 extern const int mclbytes;		/* mbuf cluster size */
 extern struct pool mbpool;
 extern struct pool mclpool;
+extern struct pool mexthdrpool;
 extern struct pool_cache mbpool_cache;
 extern struct pool_cache mclpool_cache;
+extern struct pool_cache mexthdrpool_cache;
 #ifdef MBUFTRACE
 LIST_HEAD(mownerhead, mowner);
 extern struct mownerhead mowners;
@@ -825,10 +852,16 @@ void	m_copydata(struct mbuf *, int, int,
 void	m_freem(struct mbuf *);
 void	m_reclaim(void *, int);
 void	mbinit(void);
+void	_mext_free_malloc(struct mbuf *, caddr_t, size_t, void *);
 
 /* Inline routines. */
 static __inline u_int m_length(struct mbuf *) __unused;
-static __inline void m_ext_free(struct mbuf *, boolean_t) __unused;
+static __inline void m_free_extdone(struct mbuf *) __unused;
+
+static __inline void _m_ext_free(struct mbuf *, boolean_t) __unused;
+static __inline struct mexthdr *_mexthdr_get(int) __unused;
+static __inline void _m_clget(struct mbuf *, struct pool_cache *, size_t, int)
+    __unused;
 
 /* Packet tag routines */
 struct	m_tag *m_tag_get(int, int, int);
@@ -887,31 +920,110 @@ m_length(struct mbuf *m)
 }
 
 /*
- * m_ext_free: release a reference to the mbuf external storage. 
+ * _m_ext_free: release a reference to the mbuf external storage. 
  *
  * => if 'dofree', free the mbuf m itsself as well.
  * => called at splvm.
  */
 static __inline void
-m_ext_free(struct mbuf *m, boolean_t dofree)
+_m_ext_free(struct mbuf *m, boolean_t dofree)
 {
+	struct mexthdr *ext;
 
+	KASSERT(m->m_flags & M_EXT);
+	ext = MEXT(m);
+	KASSERT(ext->ext_refcnt >= 1);
+
+	MEXT_LOCK(ext);
 	if (MCLISREFERENCED(m)) {
 		_MCLDEREFERENCE(m);
-	} else if (m->m_flags & M_CLUSTER) {
-		pool_cache_put_paddr(m->m_ext.ext_arg,
-		    m->m_ext.ext_buf, m->m_ext.ext_paddr);
-	} else if (m->m_ext.ext_free) {
-		(*m->m_ext.ext_free)(dofree ? m : NULL, m->m_ext.ext_buf,
-		    m->m_ext.ext_size, m->m_ext.ext_arg);
-		dofree = FALSE;
+		MEXT_UNLOCK(ext);
 	} else {
-		free(m->m_ext.ext_buf, m->m_ext.ext_type);
+		int extflags;
+		boolean_t puthdr;
+		MEXT_UNLOCK(ext);
+
+		/* dropping the last reference */
+		extflags = ext->ext_flags;
+		puthdr = (extflags & M_EXT_HDREMBEDDED) == 0;
+		if (extflags & M_EXT_CLUSTER) {
+			pool_cache_put_paddr(ext->ext_arg,
+			    ext->ext_buf, ext->ext_paddr);
+		} else {
+			(*ext->ext_free)(dofree ? m : NULL,
+			    ext->ext_buf, ext->ext_size, ext->ext_arg);
+			if (dofree)
+				dofree = puthdr = FALSE;
+		}
+		if (puthdr)
+			MEXTHDR_PUT(ext);
 	}
 	if (dofree)
 		pool_cache_put(&mbpool_cache, m);
 }
 
+static __inline struct mexthdr *
+_mexthdr_get(int how)
+{
+	struct mexthdr *ext;
+	int s;
+
+	s = splvm(); /* MBUFLOCK */
+	ext = pool_cache_get(&mexthdrpool_cache,
+	    (how) == M_WAIT ? PR_WAITOK : PR_NOWAIT);
+	splx(s);
+
+	return ext;
+}
+
+/*
+ * XXX it would be better to embed the mexthdr in the cluster data.
+ */
+static __inline void
+_m_clget(struct mbuf *m, struct pool_cache *pc, size_t size, int how)
+{
+	int s;
+	void *buf;
+	paddr_t pa;
+	const int prhow =
+	    (how == M_WAIT) ? (PR_WAITOK|PR_LIMITFAIL) : PR_NOWAIT;
+
+	KASSERT((m->m_flags & M_EXT) == 0);
+
+	s = splvm(); /* MBUFLOCK */
+	buf = pool_cache_get_paddr(pc, prhow, &pa);
+	splx(s);
+
+	if (buf == NULL)
+		return;
+
+	MEXTADD(m, buf, size, M_EXT_CLUSTER|M_EXT_RW, NULL, pc, how);
+	if ((m->m_flags & M_EXT) == 0) {
+		s = splvm(); /* MBUFLOCK */
+		pool_cache_put_paddr(pc, buf, pa);
+		splx(s);
+		return;
+	}
+	MEXT(m)->ext_paddr = pa;
+}
+
+/*
+ * m_free_extdone: free mbuf and its associated mexthdr.
+ *
+ * => intended to be used by ext_free callback routines.
+ * => called at splvm.
+ */
+static __inline void
+m_free_extdone(struct mbuf *m)
+{
+	struct mexthdr *ext;
+
+	KASSERT((m)->m_flags & M_EXT);
+	ext = MEXT(m);
+	if ((ext->ext_flags & M_EXT_HDREMBEDDED) == 0)
+		MEXTHDR_PUT(ext);
+	pool_cache_put(&mbpool_cache, (m));
+}
 
 #endif /* _KERNEL */
 #endif /* !_SYS_MBUF_H_ */
Index: sys/socketvar.h
===================================================================
--- sys/socketvar.h	(revision 878)
+++ sys/socketvar.h	(working copy)
@@ -206,7 +206,7 @@ do {									\
 	(sb)->sb_cc += (m)->m_len;					\
 	(sb)->sb_mbcnt += MSIZE;					\
 	if ((m)->m_flags & M_EXT)					\
-		(sb)->sb_mbcnt += (m)->m_ext.ext_size;			\
+		(sb)->sb_mbcnt += MEXT(m)->ext_size;			\
 } while (/* CONSTCOND */ 0)
 
 /* adjust counters in sb reflecting freeing of m */
@@ -215,7 +215,7 @@ do {									\
 	(sb)->sb_cc -= (m)->m_len;					\
 	(sb)->sb_mbcnt -= MSIZE;					\
 	if ((m)->m_flags & M_EXT)					\
-		(sb)->sb_mbcnt -= (m)->m_ext.ext_size;			\
+		(sb)->sb_mbcnt -= MEXT(m)->ext_size;			\
 } while (/* CONSTCOND */ 0)
 
 /*
Index: kern/uipc_usrreq.c
===================================================================
--- kern/uipc_usrreq.c	(revision 878)
+++ kern/uipc_usrreq.c	(working copy)
@@ -1017,9 +1017,9 @@ unp_internalize(struct mbuf *control, st
 	if (newcm) {
 		if (control->m_flags & M_EXT)
 			MEXTREMOVE(control);
-		MEXTADD(control, newcm,
+		MEXTADD_MALLOC(control, newcm,
 		    CMSG_SPACE(nfds * sizeof(struct file *)),
-		    M_MBUF, NULL, NULL);
+		    M_EXT_RW, M_MBUF, M_WAIT);
 		cm = newcm;
 	}
 
Index: kern/uipc_socket.c
===================================================================
--- kern/uipc_socket.c	(revision 878)
+++ kern/uipc_socket.c	(working copy)
@@ -332,19 +332,20 @@ sodopendfreel(struct socket *so)
 			break;
 		so_pendfree = NULL;
 		simple_unlock(&so_pendfree_slock);
-		/* XXX splx */
 
 		for (; m != NULL; m = next) {
+			struct mexthdr *ext;
+			KASSERT(m->m_flags & M_EXT);
+			KASSERT((MEXT(m)->ext_flags & M_EXT_CLUSTER) == 0);
 			next = m->m_next;
 
-			rv += m->m_ext.ext_size;
-			sodoloanfree((m->m_flags & M_EXT_PAGES) ?
-			    m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf,
-			    m->m_ext.ext_size);
-			pool_cache_put(&mbpool_cache, m);
+			ext = MEXT(m);
+			rv += ext->ext_size;
+			sodoloanfree((ext->ext_flags & M_EXT_PAGES) ?
+			    ext->ext_pgs : NULL, ext->ext_buf, ext->ext_size);
+			m_free_extdone(m);
 		}
 
-		/* XXX splvm */
 		simple_lock(&so_pendfree_slock);
 	}
 
@@ -366,6 +367,9 @@ soloanfree(struct mbuf *m, caddr_t buf, 
 		return;
 	}
 
+	KASSERT(m->m_flags & M_EXT);
+	KASSERT((MEXT(m)->ext_flags & M_EXT_CLUSTER) == 0);
+
 	/*
 	 * postpone freeing mbuf.
 	 *
@@ -391,6 +395,7 @@ sosend_loan(struct socket *so, struct ui
 	vsize_t len;
 	vaddr_t lva, va;
 	int npgs, i, error;
+	struct mexthdr *ext;
 
 	if (uio->uio_segflg != UIO_USERSPACE)
 		return (0);
@@ -413,22 +418,26 @@ sosend_loan(struct socket *so, struct ui
 	if (lva == 0)
 		return 0;
 
+	ext = MEXTHDR_GET(M_DONTWAIT);
+	if (ext == NULL)
+		return 0;
 	error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, sva, len,
-	    m->m_ext.ext_pgs, UVM_LOAN_TOPAGE);
+	    ext->ext_pgs, UVM_LOAN_TOPAGE);
 	if (error) {
 		sokvafree(lva, len);
+		MEXTHDR_PUT(ext);
 		return (0);
 	}
 
 	for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
-		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]),
+		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(ext->ext_pgs[i]),
 		    VM_PROT_READ);
 	pmap_update(pmap_kernel());
 
 	lva += (vaddr_t) iov->iov_base & PAGE_MASK;
 
-	MEXTADD(m, (caddr_t) lva, space, M_MBUF, soloanfree, so);
-	m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;
+	MEXTADD2(m, (caddr_t) lva, space, M_EXT_PAGES | M_EXT_ROMAP,
+	    soloanfree, so, ext);
 
 	uio->uio_resid -= space;
 	/* uio_offset not updated, not set/used for write(2) */
Index: kern/uipc_mbuf.c
===================================================================
--- kern/uipc_mbuf.c	(revision 900)
+++ kern/uipc_mbuf.c	(working copy)
@@ -94,9 +94,11 @@ __KERNEL_RCSID(0, "uipc_mbuf.c,v 1.84 20
 
 struct	pool mbpool;		/* mbuf pool */
 struct	pool mclpool;		/* mbuf cluster pool */
+struct	pool mexthdrpool;	/* mbuf ext header pool */
 
 struct pool_cache mbpool_cache;
 struct pool_cache mclpool_cache;
+struct pool_cache mexthdrpool_cache;
 
 struct mbstat mbstat;
 int	max_linkhdr;
@@ -155,12 +157,16 @@ mbinit(void)
 
 	pool_init(&mbpool, msize, 0, 0, 0, "mbpl", NULL);
 	pool_init(&mclpool, mclbytes, 0, 0, 0, "mclpl", &mclpool_allocator);
+	pool_init(&mexthdrpool, sizeof(struct mexthdr), 0, 0, 0, "mexthdr",
+	    NULL);
 
 	pool_set_drain_hook(&mbpool, m_reclaim, NULL);
 	pool_set_drain_hook(&mclpool, m_reclaim, NULL);
+	pool_set_drain_hook(&mexthdrpool, m_reclaim, NULL);
 
 	pool_cache_init(&mbpool_cache, &mbpool, mb_ctor, NULL, NULL);
 	pool_cache_init(&mclpool_cache, &mclpool, NULL, NULL, NULL);
+	pool_cache_init(&mexthdrpool_cache, &mexthdrpool, NULL, NULL, NULL);
 
 	/*
 	 * Set the hard limit on the mclpool to the number of
@@ -177,6 +183,7 @@ mbinit(void)
 	 */
 	pool_setlowat(&mbpool, mblowat);
 	pool_setlowat(&mclpool, mcllowat);
+	pool_setlowat(&mexthdrpool, mcllowat); /* XXX */
 
 #ifdef MBUFTRACE
 	{
@@ -567,7 +574,6 @@ m_copym0(struct mbuf *m, int off0, int l
 		if (m->m_flags & M_EXT) {
 			if (!deep) {
 				n->m_data = m->m_data + off;
-				n->m_ext = m->m_ext;
 				MCLADDREFERENCE(m, n);
 			} else {
 				/*
@@ -626,7 +632,6 @@ m_copypacket(struct mbuf *m, int how)
 	n->m_len = m->m_len;
 	if (m->m_flags & M_EXT) {
 		n->m_data = m->m_data;
-		n->m_ext = m->m_ext;
 		MCLADDREFERENCE(m, n);
 	} else {
 		memcpy(mtod(n, char *), mtod(m, char *), n->m_len);
@@ -645,7 +650,6 @@ m_copypacket(struct mbuf *m, int how)
 		n->m_len = m->m_len;
 		if (m->m_flags & M_EXT) {
 			n->m_data = m->m_data;
-			n->m_ext = m->m_ext;
 			MCLADDREFERENCE(m, n);
 		} else {
 			memcpy(mtod(n, char *), mtod(m, char *), n->m_len);
@@ -972,7 +976,6 @@ m_split0(struct mbuf *m0, int len0, int 
 	}
 extpacket:
 	if (m->m_flags & M_EXT) {
-		n->m_ext = m->m_ext;
 		MCLADDREFERENCE(m, n);
 		n->m_data = m->m_data + len;
 	} else {
@@ -1368,3 +1371,15 @@ m_getptr(struct mbuf *m, int loc, int *o
 
 	return (NULL);
 }
+
+/*
+ * ext_free callback routine for MEXTMALLOC and MEXTADD_MALLOC.
+ */
+void
+_mext_free_malloc(struct mbuf *m, caddr_t buf, size_t size, void *type)
+{
+
+	if (m)
+		m_free_extdone(m);
+	free(buf, type);
+}

--NextPart-20041002192117-2591600
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="x86.diff"

Index: arch/x86/x86/bus_dma.c
===================================================================
--- arch/x86/x86/bus_dma.c	(revision 786)
+++ arch/x86/x86/bus_dma.c	(working copy)
@@ -401,31 +401,34 @@ _bus_dmamap_load_mbuf(bus_dma_tag_t t, b
 		const struct vm_page * const *pgs;
 		paddr_t paddr;
 		int size;
+		struct mexthdr *ext;
 
 		if (m->m_len == 0)
 			continue;
-		switch (m->m_flags & (M_EXT|M_EXT_CLUSTER|M_EXT_PAGES)) {
-		case M_EXT|M_EXT_CLUSTER:
+		if ((m->m_flags & M_EXT) == 0) {
+			paddr = m->m_paddr + M_BUFOFFSET(m) +
+			    (m->m_data - M_BUFADDR(m));
+			size = m->m_len;
+			error = _bus_dmamap_load_paddr(t, map, paddr, size);
+		} else if ((ext = MEXT(m))->ext_flags & M_EXT_CLUSTER) {
 			/* XXX KDASSERT */
-			KASSERT(m->m_ext.ext_paddr != M_PADDR_INVALID);
-			paddr = m->m_ext.ext_paddr +
-			    (m->m_data - m->m_ext.ext_buf);
+			KASSERT(ext->ext_paddr != M_PADDR_INVALID);
+			paddr = ext->ext_paddr +
+			    (m->m_data - ext->ext_buf);
 			size = m->m_len;
 			error = _bus_dmamap_load_paddr(t, map, paddr, size);
-			break;
-
-		case M_EXT|M_EXT_PAGES:
-			KASSERT(m->m_ext.ext_buf <= m->m_data);
+		} else if (ext->ext_flags & M_EXT_PAGES) {
+			KASSERT(ext->ext_buf <= m->m_data);
 			KASSERT(m->m_data <=
-			    m->m_ext.ext_buf + m->m_ext.ext_size);
+			    ext->ext_buf + ext->ext_size);
 
 			offset = (vaddr_t)m->m_data -
-			    trunc_page((vaddr_t)m->m_ext.ext_buf);
+			    trunc_page((vaddr_t)ext->ext_buf);
 			remainbytes = m->m_len;
 
 			/* skip uninteresting pages */
 			pgs = (const struct vm_page * const *)
-			    m->m_ext.ext_pgs + (offset >> PAGE_SHIFT);
+			    ext->ext_pgs + (offset >> PAGE_SHIFT);
 
 			offset &= PAGE_MASK; /* offset in the first page */
 
@@ -446,16 +449,7 @@ _bus_dmamap_load_mbuf(bus_dma_tag_t t, b
 				offset = 0;
 				remainbytes -= size;
 			}
-			break;
-
-		case 0:
-			paddr = m->m_paddr + M_BUFOFFSET(m) +
-			    (m->m_data - M_BUFADDR(m));
-			size = m->m_len;
-			error = _bus_dmamap_load_paddr(t, map, paddr, size);
-			break;
-
-		default:
+		} else {
 			error = _bus_dmamap_load_buffer(t, map, m->m_data,
 			    m->m_len, NULL, flags);
 		}

--NextPart-20041002192117-2591600
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="dev.diff"

Index: dev/pci/if_ti.c
===================================================================
--- dev/pci/if_ti.c	(revision 904)
+++ dev/pci/if_ti.c	(working copy)
@@ -721,7 +721,7 @@ static void ti_jfree(m, buf, size, arg)
 	SIMPLEQ_INSERT_HEAD(&sc->ti_jfree_listhead, entry, jpool_entries);
 
 	if (__predict_true(m != NULL))
-		pool_cache_put(&mbpool_cache, m);
+		m_free_extdone(m);
 	splx(s);
 }
 
@@ -904,9 +904,13 @@ static int ti_newbuf_jumbo(sc, i, m)
 		}
 
 		/* Attach the buffer to the mbuf. */
-		MEXTADD(m_new, buf, ETHER_MAX_LEN_JUMBO,
-		    M_DEVBUF, ti_jfree, sc);
-		m_new->m_flags |= M_EXT_RW;
+		MEXTADD(m_new, (void *)buf, ETHER_MAX_LEN_JUMBO,
+		    M_EXT_RW, ti_jfree, sc, M_DONTWAIT);
+		if ((m_new->m_flags & M_EXT) == 0) {
+			m_freem(m_new);
+			ti_jfree(NULL, buf, ETHER_MAX_LEN_JUMBO, sc);
+			return(ENOBUFS);
+		}
 		m_new->m_len = m_new->m_pkthdr.len = ETHER_MAX_LEN_JUMBO;
 	} else {
 		m_new = m;
Index: dev/pci/if_dge.c
===================================================================
--- dev/pci/if_dge.c	(revision 900)
+++ dev/pci/if_dge.c	(working copy)
@@ -607,7 +607,7 @@ dge_freebuf(struct mbuf *m, caddr_t buf,
 	SLIST_INSERT_HEAD(&sc->sc_buglist, entry, rb_entry);
 
 	if (__predict_true(m != NULL))
-		pool_cache_put(&mbpool_cache, m);
+		m_free_extdone(m);
 	splx(s);
 }
 #endif
@@ -2128,8 +2128,12 @@ dge_add_rxbuf(struct dge_softc *sc, int 
 		return ENOBUFS;
 
 	m->m_len = m->m_pkthdr.len = DGE_BUFFER_SIZE;
-	MEXTADD(m, buf, DGE_BUFFER_SIZE, M_DEVBUF, dge_freebuf, sc);
-	m->m_flags |= M_EXT_RW;
+	MEXTADD(m, buf, DGE_BUFFER_SIZE, M_EXT_RW, dge_freebuf, sc, M_DONTWAIT);
+	if ((m->m_flags & M_EXT) == 0) {
+		m_freem(m);
+		dge_freebuf(NULL, buf, DGE_BUFFER_SIZE, sc);
+		return ENOBUFS;
+	}
 
 	if (rxs->rxs_mbuf != NULL)
 		bus_dmamap_unload(sc->sc_dmat, rxs->rxs_dmamap);
Index: dev/pci/if_bge.c
===================================================================
--- dev/pci/if_bge.c	(revision 904)
+++ dev/pci/if_bge.c	(working copy)
@@ -849,7 +849,7 @@ bge_jfree(m, buf, size, arg)
 	SLIST_INSERT_HEAD(&sc->bge_jfree_listhead, entry, jpool_entries);
 
 	if (__predict_true(m != NULL))
-  		pool_cache_put(&mbpool_cache, m);
+		m_free_extdone(m);
 	splx(s);
 }
 
@@ -953,9 +953,13 @@ bge_newbuf_jumbo(sc, i, m)
 
 		/* Attach the buffer to the mbuf. */
 		m_new->m_len = m_new->m_pkthdr.len = BGE_JUMBO_FRAMELEN;
-		MEXTADD(m_new, buf, BGE_JUMBO_FRAMELEN, M_DEVBUF,
-		    bge_jfree, sc);
-		m_new->m_flags |= M_EXT_RW;
+		MEXTADD(m_new, buf, BGE_JUMBO_FRAMELEN, M_EXT_RW,
+		    bge_jfree, sc, M_DONTWAIT);
+		if ((m_new->m_flags & M_EXT) == 0) {
+			m_freem(m_new);
+			bge_jfree(NULL, buf, BGE_JUMBO_FRAMELEN, sc);
+			return(ENOBUFS);
+		}
 	} else {
 		m_new = m;
 		m_new->m_data = m_new->m_ext.ext_buf;

--NextPart-20041002192117-2591600
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="nfs.diff"

Index: nfs/nfs_vnops.c
===================================================================
--- nfs/nfs_vnops.c	(revision 906)
+++ nfs/nfs_vnops.c	(working copy)
@@ -1340,7 +1340,7 @@ nfs_writerpc_extfree(struct mbuf *m, cad
 
 	KASSERT(m != NULL);
 	KASSERT(ctx != NULL);
-	pool_cache_put(&mbpool_cache, m);
+	m_free_extdone(m);
 	simple_lock(&ctx->nwc_slock);
 	if (--ctx->nwc_mbufcount == 0) {
 		wakeup(ctx);
@@ -1440,12 +1440,13 @@ retry:
 			 */
 			struct mbuf *m;
 			struct iovec *iovp = uiop->uio_iov;
+			struct mexthdr *ext;
 
 			m = m_get(M_WAIT, MT_DATA);
+			ext = MEXTHDR_GET(M_WAIT);
 			MCLAIM(m, &nfs_mowner);
-			MEXTADD(m, iovp->iov_base, len, M_MBUF,
-			    nfs_writerpc_extfree, &ctx);
-			m->m_flags |= M_EXT_ROMAP;
+			MEXTADD2(m, iovp->iov_base, len, M_EXT_ROMAP,
+			    nfs_writerpc_extfree, &ctx, ext);
 			m->m_len = len;
 			mb->m_next = m;
 			/*
Index: nfs/nfsm_subs.h
===================================================================
--- nfs/nfsm_subs.h	(revision 905)
+++ nfs/nfsm_subs.h	(working copy)
@@ -51,7 +51,7 @@
 
 #define	M_HASCL(m)	((m)->m_flags & M_EXT)
 #define	NFSMADV(m, s)	(m)->m_data += (s)
-#define	NFSMSIZ(m)	((M_HASCL(m)) ? (m)->m_ext.ext_size : \
+#define	NFSMSIZ(m)	((M_HASCL(m)) ? MEXT(m)->ext_size : \
 				(((m)->m_flags & M_PKTHDR) ? MHLEN : MLEN))
 
 /*
Index: nfs/nfs_subs.c
===================================================================
--- nfs/nfs_subs.c	(revision 900)
+++ nfs/nfs_subs.c	(working copy)
@@ -1020,7 +1020,6 @@ nfsm_disct(mdp, dposp, siz, left, cp2)
 			 * mbuf look empty.
 			 */
 			m2 = m_get(M_WAIT, MT_DATA);
-			m2->m_ext = m1->m_ext;
 			m2->m_data = src;
 			m2->m_len = left;
 			MCLADDREFERENCE(m1, m2);
Index: nfs/nfs_serv.c
===================================================================
--- nfs/nfs_serv.c	(revision 900)
+++ nfs/nfs_serv.c	(working copy)
@@ -666,6 +666,7 @@ nfsrv_read(nfsd, slp, procp, mrq)
 			voff_t pgoff = trunc_page(off);
 			int npages;
 			vaddr_t lva;
+			struct mexthdr *ext;
 
 			npages = (round_page(off + cnt) - pgoff) >> PAGE_SHIFT;
 			KASSERT(npages <= M_EXT_MAXPAGES); /* XXX */
@@ -680,7 +681,8 @@ nfsrv_read(nfsd, slp, procp, mrq)
 			/* allocate mbuf */
 			m = m_get(M_WAIT, MT_DATA);
 			MCLAIM(m, &nfs_mowner);
-			pgpp = m->m_ext.ext_pgs;
+			ext = MEXTHDR_GET(M_WAIT);
+			pgpp = ext->ext_pgs;
 
 			/* loan pages */
 			error = uvm_loanuobjpages(&vp->v_uobj, pgoff, npages,
@@ -688,13 +690,14 @@ nfsrv_read(nfsd, slp, procp, mrq)
 			if (error) {
 				sokvafree(lva, npages << PAGE_SHIFT);
 				m_free(m);
+				MEXTHDR_PUT(ext);
 				goto read_error;
 			}
 
 			/* associate kva to mbuf */
-			MEXTADD(m, (void *)(lva + ((vaddr_t)off & PAGE_MASK)),
-			    cnt, M_MBUF, soloanfree, slp->ns_so);
-			m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;
+			MEXTADD2(m, (void *)(lva + ((vaddr_t)off & PAGE_MASK)),
+			    cnt, M_EXT_PAGES | M_EXT_ROMAP,
+			    soloanfree, slp->ns_so, ext);
 			m->m_len = cnt;
 
 			/* map pages */

--NextPart-20041002192117-2591600--