Subject: percpu storage allocator
To: tech-kern@netbsd.org
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: tech-kern
Date: 11/12/2007 20:41:35
--NextPart-20071112203611-0619600
Content-Type: Text/Plain; charset=us-ascii

hi,

the attached files contain an implementation of percpu storage,
and some users of it.  (the diff converts the mbuf statistics and
the MBUFTRACE mowner counters to per-cpu counters.)
the patch is against the vmlocking branch.
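
to illustrate the api, here's a minimal sketch of the intended usage
(the "foo" names are made up for this example; the real users are in
the attached diff.  splvm is used to satisfy the "preemption disabled"
requirement, as the mbuf users do):

static percpu_t *foo_percpu;

void
foo_init(void)
{

	foo_percpu = percpu_alloc(sizeof(uint64_t));
}

/* increment this cpu's counter */
void
foo_count(void)
{
	uint64_t *p;
	int s;

	s = splvm();	/* percpu_getptr wants preemption disabled */
	p = percpu_getptr(foo_percpu);
	(*p)++;
	splx(s);
}

/* sum the counters of all cpus */
uint64_t
foo_total(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	uint64_t sum = 0;

	percpu_traverse_enter();	/* block resizing of the storage */
	for (CPU_INFO_FOREACH(cii, ci)) {
		uint64_t *p = percpu_getptr_remote(foo_percpu, ci);

		sum += *p;
	}
	percpu_traverse_exit();
	return sum;
}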

any comments?

YAMAMOTO Takashi

--NextPart-20071112203611-0619600
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="percpu.h"

/*	$NetBSD$	*/

/*-
 * Copyright (c)2007 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifndef _SYS_PERCPU_H_
#define	_SYS_PERCPU_H_

#include <sys/types.h>

struct cpu_info;
typedef struct percpu percpu_t;

typedef struct percpu_cpu {
	size_t pcc_size;
	void *pcc_data;
} percpu_cpu_t;
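
/*
 * a percpu_t names a piece of per-cpu storage of the size passed to
 * percpu_alloc().  percpu_getptr() returns the calling cpu's instance
 * and must be called with preemption disabled.  other cpus' instances
 * may only be accessed via percpu_getptr_remote() between
 * percpu_traverse_enter() and percpu_traverse_exit(), which prevent
 * the storage from being resized.  percpu_free() must be given the
 * same size as the corresponding percpu_alloc().
 */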

void percpu_init(void);
void percpu_init_cpu(struct cpu_info *);
percpu_t *percpu_alloc(size_t);
void percpu_free(percpu_t *, size_t);
void *percpu_getptr(percpu_t *);
void percpu_traverse_enter(void);
void percpu_traverse_exit(void);
void *percpu_getptr_remote(percpu_t *, struct cpu_info *);

#endif /* _SYS_PERCPU_H_ */

--NextPart-20071112203611-0619600
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="subr_percpu.c"

/*	$NetBSD$	*/

/*-
 * Copyright (c)2007 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD$");

#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/kernel.h>
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/rwlock.h>
#include <sys/vmem.h>
#include <sys/xcall.h>

static krwlock_t percpu_swap_lock;
static kmutex_t percpu_allocation_lock;
static vmem_t *percpu_offset_arena;
static unsigned int percpu_nextoff;

#define	PERCPU_QUANTUM_SIZE	(ALIGNBYTES + 1)	/* allocation granularity */

static percpu_cpu_t *
cpu_percpu(struct cpu_info *ci)
{

	return &ci->ci_data.cpu_percpu;
}
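
/*
 * percpu_offset: a percpu_t is not a real pointer; it encodes the
 * byte offset of the allocation within each cpu's pcc_data block.
 */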

static unsigned int
percpu_offset(percpu_t *pc)
{

	return (uintptr_t)pc;
}

/*
 * percpu_cpu_swap: cross-call handler for percpu_cpu_enlarge;
 * runs on the target cpu and swaps in the larger storage block
 */

static void
percpu_cpu_swap(void *p1, void *p2)
{
	struct cpu_info * const ci = p1;
	percpu_cpu_t * const newpcc = p2;
	percpu_cpu_t * const pcc = cpu_percpu(ci);

	rw_enter(&percpu_swap_lock, RW_WRITER);
	/* swap only if no one has beaten us to a larger size */
	if (newpcc->pcc_size > pcc->pcc_size) {
		percpu_cpu_t tmp;
		int s;

		s = splhigh();
		/* copy data to new storage */
		memcpy(newpcc->pcc_data, pcc->pcc_data, pcc->pcc_size);
		/* swap */
		tmp = *pcc;
		*pcc = *newpcc;
		splx(s);
		*newpcc = tmp;
	}
	rw_exit(&percpu_swap_lock);
}

/*
 * percpu_cpu_enlarge: ensure that the percpu_cpu_t of each cpu has
 * enough space
 */

static void
percpu_cpu_enlarge(size_t size)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	for (CPU_INFO_FOREACH(cii, ci)) {
		percpu_cpu_t pcc;

		pcc.pcc_data = kmem_zalloc(size, KM_SLEEP);
		pcc.pcc_size = size;
		if (!mp_online) {
			percpu_cpu_swap(ci, &pcc);
		} else {
			uint64_t where;

			where = xc_unicast(0, percpu_cpu_swap, ci, &pcc, ci);
			xc_wait(where);
		}
		/* <=, not <: a concurrent enlarge may have already won */
		KASSERT(pcc.pcc_size <= size);
		/* free the storage we replaced (or our own, if we lost) */
		if (pcc.pcc_data != NULL)
			kmem_free(pcc.pcc_data, pcc.pcc_size);
	}
}

/*
 * percpu_backend_alloc: vmem import callback for percpu_offset_arena
 */

static vmem_addr_t
percpu_backend_alloc(vmem_t *dummy, vmem_size_t size, vmem_size_t *resultsize,
    vm_flag_t vmflags)
{
	unsigned int offset;
	unsigned int nextoff;

	KASSERT(dummy == NULL);

	if ((vmflags & VM_NOSLEEP) != 0)
		return VMEM_ADDR_NULL;

	mutex_enter(&percpu_allocation_lock);
	offset = percpu_nextoff;
	percpu_nextoff = nextoff = percpu_nextoff + size;
	mutex_exit(&percpu_allocation_lock);

	percpu_cpu_enlarge(nextoff);

	*resultsize = size;
	return (vmem_addr_t)offset;
}

/*
 * percpu_init: subsystem initialization
 */

void
percpu_init(void)
{

	rw_init(&percpu_swap_lock);
	mutex_init(&percpu_allocation_lock, MUTEX_DEFAULT, IPL_NONE);

	percpu_offset_arena = vmem_create("percpu", 0, 0, PERCPU_QUANTUM_SIZE,
	    percpu_backend_alloc, NULL, NULL, PERCPU_QUANTUM_SIZE * 32,
	    VM_SLEEP, IPL_NONE);
}

void
percpu_init_cpu(struct cpu_info *ci)
{
	percpu_cpu_t * const pcc = cpu_percpu(ci);
	size_t size = percpu_nextoff;

	pcc->pcc_size = size;
	if (size != 0) {
		pcc->pcc_data = kmem_zalloc(pcc->pcc_size, KM_SLEEP);
	}
}

/*
 * percpu_alloc: allocate percpu data
 */

percpu_t *
percpu_alloc(size_t size)
{
	unsigned int offset;

	offset = vmem_alloc(percpu_offset_arena, size, VM_SLEEP | VM_BESTFIT);
	return (percpu_t *)(uintptr_t)offset;
}

/*
 * percpu_free: free percpu data
 */

void
percpu_free(percpu_t *pc, size_t size)
{

	vmem_free(percpu_offset_arena, (vmem_addr_t)percpu_offset(pc), size);
}

/*
 * percpu_getptr: return the current cpu's instance of the given percpu data
 *
 * => called with preemption disabled
 */

void *
percpu_getptr(percpu_t *pc)
{

	return percpu_getptr_remote(pc, curcpu());
}

/*
 * percpu_traverse_enter, percpu_traverse_exit, percpu_getptr_remote:
 * helpers to access remote cpu's percpu data.
 *
 * => called in thread context.
 * => typical usage would be:
 *
 *	sum = 0;
 *	percpu_traverse_enter();
 *	for (CPU_INFO_FOREACH(cii, ci)) {
 *		unsigned int *p = percpu_getptr_remote(pc, ci);
 *		sum += *p;
 *	}
 *	percpu_traverse_exit();
 */

void
percpu_traverse_enter(void)
{

	rw_enter(&percpu_swap_lock, RW_READER);
}

void
percpu_traverse_exit(void)
{

	rw_exit(&percpu_swap_lock);
}

void *
percpu_getptr_remote(percpu_t *pc, struct cpu_info *ci)
{

	return &((char *)cpu_percpu(ci)->pcc_data)[percpu_offset(pc)];
}

--NextPart-20071112203611-0619600
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="a.diff"

Index: sys/cpu_data.h
===================================================================
RCS file: /cvsroot/src/sys/sys/cpu_data.h,v
retrieving revision 1.7.6.10
diff -u -p -r1.7.6.10 cpu_data.h
--- sys/cpu_data.h	1 Nov 2007 21:58:24 -0000	1.7.6.10
+++ sys/cpu_data.h	12 Nov 2007 11:34:24 -0000
@@ -46,6 +46,7 @@ struct lwp;
 
 #include <sys/sched.h>	/* for schedstate_percpu */
 #include <sys/condvar.h>
+#include <sys/percpu.h>
 
 /*
  * MI per-cpu data
@@ -90,6 +91,7 @@ struct cpu_data {
 	kmutex_t	cpu_uarea_lock;		/* uarea alloc lock */
 	u_int		cpu_uarea_cnt;		/* count of free uareas */
 	vaddr_t		cpu_uarea_list;		/* free uareas */
+	percpu_cpu_t	cpu_percpu;		/* per-cpu data */
 };
 
 /* compat definitions */
Index: sys/mbuf.h
===================================================================
RCS file: /cvsroot/src/sys/sys/mbuf.h,v
retrieving revision 1.135.2.1
diff -u -p -r1.135.2.1 mbuf.h
--- sys/mbuf.h	1 Sep 2007 12:56:51 -0000	1.135.2.1
+++ sys/mbuf.h	12 Nov 2007 11:34:24 -0000
@@ -80,6 +80,7 @@
 #endif
 #include <sys/pool.h>
 #include <sys/queue.h>
+#include <sys/percpu.h>
 
 /* For offsetof() */
 #if defined(_KERNEL) || defined(_STANDALONE)
@@ -110,15 +111,33 @@ struct mowner {
 	char mo_name[16];		/* owner name (fxp0) */
 	char mo_descr[16];		/* owner description (input) */
 	LIST_ENTRY(mowner) mo_link;	/* */
-	u_long mo_claims;		/* # of small mbuf claimed */
-	u_long mo_releases;		/* # of small mbuf released */
-	u_long mo_cluster_claims;	/* # of M_CLUSTER mbuf claimed */
-	u_long mo_cluster_releases;	/* # of M_CLUSTER mbuf released */
-	u_long mo_ext_claims;		/* # of M_EXT mbuf claimed */
-	u_long mo_ext_releases;		/* # of M_EXT mbuf released */
+	percpu_t *mo_counters;
 };
 
-#define MOWNER_INIT(x, y) { x, y, { NULL, NULL }, 0, 0, 0, 0, 0, 0 }
+#define MOWNER_INIT(x, y) { .mo_name = x, .mo_descr = y }
+
+enum mowner_counter_index {
+	MOWNER_COUNTER_CLAIMS,		/* # of small mbuf claimed */
+	MOWNER_COUNTER_RELEASES,	/* # of small mbuf released */
+	MOWNER_COUNTER_CLUSTER_CLAIMS,	/* # of M_CLUSTER mbuf claimed */
+	MOWNER_COUNTER_CLUSTER_RELEASES,/* # of M_CLUSTER mbuf released */
+	MOWNER_COUNTER_EXT_CLAIMS,	/* # of M_EXT mbuf claimed */
+	MOWNER_COUNTER_EXT_RELEASES,	/* # of M_EXT mbuf released */
+
+	MOWNER_COUNTER_NCOUNTERS,
+};
+
+struct mowner_counter {
+	u_long mc_counter[MOWNER_COUNTER_NCOUNTERS];
+};
+
+/* userland-exported version */
+struct mowner_user {
+	char mo_name[16];		/* owner name (fxp0) */
+	char mo_descr[16];		/* owner description (input) */
+	LIST_ENTRY(mowner) mo_link;	/* */
+	u_long mo_counter[MOWNER_COUNTER_NCOUNTERS]; /* counters */
+};
 
 /*
  * Macros for type conversion
@@ -374,65 +393,28 @@ do {									\
 
 #ifdef MBUFTRACE
 /*
- * mbuf allocation tracing macros
- *
+ * mbuf allocation tracing
  */
-#define _MOWNERINIT(m, type)						\
-	((m)->m_owner = &unknown_mowners[(type)], (m)->m_owner->mo_claims++)
-
-#define	_MOWNERREF(m, flags)	do {					\
-	if ((flags) & M_EXT)						\
-		(m)->m_owner->mo_ext_claims++;				\
-	if ((flags) & M_CLUSTER)					\
-		(m)->m_owner->mo_cluster_claims++;			\
-} while (/* CONSTCOND */ 0)
-
-#define	MOWNERREF(m, flags)	MBUFLOCK( _MOWNERREF((m), (flags)); );
-
-#define	_MOWNERREVOKE(m, all, flags)	do {				\
-	if ((flags) & M_EXT)						\
-		(m)->m_owner->mo_ext_releases++;			\
-	if ((flags) & M_CLUSTER)					\
-		(m)->m_owner->mo_cluster_releases++;			\
-	if (all) {							\
-		(m)->m_owner->mo_releases++;				\
-		(m)->m_owner = &revoked_mowner;				\
-	}								\
-} while (/* CONSTCOND */ 0)
-
-#define	_MOWNERCLAIM(m, mowner)	do {					\
-	(m)->m_owner = (mowner);					\
-	(mowner)->mo_claims++;						\
-	if ((m)->m_flags & M_EXT)					\
-		(mowner)->mo_ext_claims++;				\
-	if ((m)->m_flags & M_CLUSTER)					\
-		(mowner)->mo_cluster_claims++;				\
-} while (/* CONSTCOND */ 0)
-
-#define	MCLAIM(m, mowner) 						\
-	MBUFLOCK(							\
-		if ((m)->m_owner != (mowner) && (mowner) != NULL) {	\
-			_MOWNERREVOKE((m), 1, (m)->m_flags);		\
-			_MOWNERCLAIM((m), (mowner));			\
-		}							\
-	)
-
-#define	MOWNER_ATTACH(mo)	LIST_INSERT_HEAD(&mowners, (mo), mo_link)
-#define	MOWNER_DETACH(mo)	LIST_REMOVE((mo), mo_link)
-#define MBUFTRACE_ASSERT(cond)	KASSERT(cond)
+void mowner_init(struct mbuf *, int);
+void mowner_ref(struct mbuf *, int);
+void m_claim(struct mbuf *, struct mowner *);
+void mowner_revoke(struct mbuf *, bool, int);
+void mowner_attach(struct mowner *);
+void mowner_detach(struct mowner *);
+void m_claimm(struct mbuf *, struct mowner *);
 #else
-#define _MOWNERINIT(m, type)		do { } while (/* CONSTCOND */ 0)
-#define	_MOWNERREF(m, flags)		do { } while (/* CONSTCOND */ 0)
-#define	MOWNERREF(m, flags)		do { } while (/* CONSTCOND */ 0)
-#define	_MOWNERREVOKE(m, all, flags)	do { } while (/* CONSTCOND */ 0)
-#define	_MOWNERCLAIM(m, mowner)		do { } while (/* CONSTCOND */ 0)
-#define	MCLAIM(m, mowner) 		do { } while (/* CONSTCOND */ 0)
-#define	MOWNER_ATTACH(mo)		do { } while (/* CONSTCOND */ 0)
-#define	MOWNER_DETACH(mo)		do { } while (/* CONSTCOND */ 0)
+#define mowner_init(m, type)		do { } while (/* CONSTCOND */ 0)
+#define	mowner_ref(m, flags)		do { } while (/* CONSTCOND */ 0)
+#define	mowner_revoke(m, all, flags)	do { } while (/* CONSTCOND */ 0)
+#define	m_claim(m, mowner) 		do { } while (/* CONSTCOND */ 0)
+#define	mowner_attach(mo)		do { } while (/* CONSTCOND */ 0)
+#define	mowner_detach(mo)		do { } while (/* CONSTCOND */ 0)
 #define	m_claimm(m, mo)			do { } while (/* CONSTCOND */ 0)
-#define MBUFTRACE_ASSERT(cond)		do { } while (/* CONSTCOND */ 0)
 #endif
 
+#define	MCLAIM(m, mo)		m_claim((m), (mo))
+#define	MOWNER_ATTACH(mo)	mowner_attach(mo)
+#define	MOWNER_DETACH(mo)	mowner_detach(mo)
 
 /*
  * mbuf allocation/deallocation macros:
@@ -447,39 +429,8 @@ do {									\
  * If 'how' is M_WAIT, these macros (and the corresponding functions)
  * are guaranteed to return successfully.
  */
-#define	MGET(m, how, type)						\
-MBUFLOCK(								\
-	(m) = pool_cache_get(mb_cache,					\
-		(how) == M_WAIT ? PR_WAITOK|PR_LIMITFAIL : 0);		\
-	if (m) {							\
-		mbstat.m_mtypes[type]++;				\
-		_MOWNERINIT((m), (type));				\
-		(m)->m_type = (type);					\
-		(m)->m_next = (struct mbuf *)NULL;			\
-		(m)->m_nextpkt = (struct mbuf *)NULL;			\
-		(m)->m_data = (m)->m_dat;				\
-		(m)->m_flags = 0;					\
-	}								\
-)
-
-#define	MGETHDR(m, how, type)						\
-MBUFLOCK(								\
-	(m) = pool_cache_get(mb_cache,					\
-	    (how) == M_WAIT ? PR_WAITOK|PR_LIMITFAIL : 0);		\
-	if (m) {							\
-		mbstat.m_mtypes[type]++;				\
-		_MOWNERINIT((m), (type));				\
-		(m)->m_type = (type);					\
-		(m)->m_next = (struct mbuf *)NULL;			\
-		(m)->m_nextpkt = (struct mbuf *)NULL;			\
-		(m)->m_data = (m)->m_pktdat;				\
-		(m)->m_flags = M_PKTHDR;				\
-		(m)->m_pkthdr.rcvif = NULL;				\
-		(m)->m_pkthdr.csum_flags = 0;				\
-		(m)->m_pkthdr.csum_data = 0;				\
-		SLIST_INIT(&(m)->m_pkthdr.tags);			\
-	}								\
-)
+#define	MGET(m, how, type)	m = m_get((how), (type))
+#define	MGETHDR(m, how, type)	m = m_gethdr((how), (type))
 
 #if defined(_KERNEL)
 #define	_M_
@@ -522,7 +473,7 @@ do {									\
 	(n)->m_ext.ext_prevref = (o);					\
 	(o)->m_ext.ext_nextref = (n);					\
 	(n)->m_ext.ext_nextref->m_ext.ext_prevref = (n);		\
-	_MOWNERREF((n), (n)->m_flags);					\
+	mowner_ref((n), (n)->m_flags);					\
 	MCLREFDEBUGN((n), __FILE__, __LINE__);				\
 } while (/* CONSTCOND */ 0)
 
@@ -550,15 +501,12 @@ do {									\
  */
 #define	_MCLGET(m, pool_cache, size, how)				\
 do {									\
-	MBUFLOCK(							\
-		(m)->m_ext.ext_buf =					\
-		    pool_cache_get_paddr((pool_cache),			\
-		        (how) == M_WAIT ? (PR_WAITOK|PR_LIMITFAIL) : 0,	\
-			&(m)->m_ext.ext_paddr);				\
-		if ((m)->m_ext.ext_buf != NULL)				\
-			_MOWNERREF((m), M_EXT|M_CLUSTER);		\
-	);								\
+	(m)->m_ext.ext_buf =						\
+	    pool_cache_get_paddr((pool_cache),				\
+		(how) == M_WAIT ? (PR_WAITOK|PR_LIMITFAIL) : 0,		\
+		&(m)->m_ext.ext_paddr);					\
 	if ((m)->m_ext.ext_buf != NULL) {				\
+		mowner_ref((m), M_EXT|M_CLUSTER);			\
 		(m)->m_data = (m)->m_ext.ext_buf;			\
 		(m)->m_flags = ((m)->m_flags & ~M_EXTCOPYFLAGS) |	\
 				M_EXT|M_CLUSTER|M_EXT_RW;		\
@@ -588,7 +536,7 @@ do {									\
 		(m)->m_ext.ext_arg = NULL;				\
 		(m)->m_ext.ext_type = mbtypes[(m)->m_type];		\
 		MCLINITREFERENCE(m);					\
-		MOWNERREF((m), M_EXT);					\
+		mowner_ref((m), M_EXT);					\
 	}								\
 } while (/* CONSTCOND */ 0)
 
@@ -601,13 +549,13 @@ do {									\
 	(m)->m_ext.ext_arg = (arg);					\
 	(m)->m_ext.ext_type = (type);					\
 	MCLINITREFERENCE(m);						\
-	MOWNERREF((m), M_EXT);						\
+	mowner_ref((m), M_EXT);						\
 } while (/* CONSTCOND */ 0)
 
 #define	MEXTREMOVE(m)							\
 do {									\
+	mowner_revoke((m), 0, (m)->m_flags);				\
 	int _ms_ = splvm(); /* MBUFLOCK */				\
-	_MOWNERREVOKE((m), 0, (m)->m_flags);				\
 	m_ext_free(m, FALSE);						\
 	splx(_ms_);							\
 	(m)->m_flags &= ~M_EXTCOPYFLAGS;				\
@@ -633,12 +581,12 @@ do {									\
  * Place the successor, if any, in n.
  */
 #define	MFREE(m, n)							\
+	mowner_revoke((m), 1, (m)->m_flags);				\
+	mbstat_type_add((m)->m_type, -1);				\
 	MBUFLOCK(							\
-		mbstat.m_mtypes[(m)->m_type]--;				\
 		if ((m)->m_flags & M_PKTHDR)				\
 			m_tag_delete_chain((m), NULL);			\
 		(n) = (m)->m_next;					\
-		_MOWNERREVOKE((m), 1, m->m_flags);			\
 		if ((m)->m_flags & M_EXT) {				\
 			m_ext_free(m, TRUE);				\
 		} else {						\
@@ -759,7 +707,8 @@ do {									\
 /* change mbuf to new type */
 #define MCHTYPE(m, t)							\
 do {									\
-	MBUFLOCK(mbstat.m_mtypes[(m)->m_type]--; mbstat.m_mtypes[t]++;); \
+	mbstat_type_add((m)->m_type, -1);				\
+	mbstat_type_add(t, 1);						\
 	(m)->m_type = t;						\
 } while (/* CONSTCOND */ 0)
 
@@ -847,6 +796,10 @@ struct mbstat {
 	u_short	m_mtypes[256];	/* type specific mbuf allocations */
 };
 
+struct mbstat_cpu {
+	u_int	m_mtypes[256];	/* type specific mbuf allocations */
+};
+
 /*
  * Mbuf sysctl variables.
  */
@@ -913,9 +866,6 @@ void	m_adj(struct mbuf *, int);
 int	m_apply(struct mbuf *, int, int,
 		int (*)(void *, void *, unsigned int), void *);
 void	m_cat(struct mbuf *,struct mbuf *);
-#ifdef MBUFTRACE
-void	m_claimm(struct mbuf *, struct mowner *);
-#endif
 void	m_clget(struct mbuf *, int);
 int	m_mballoc(int, int);
 void	m_copyback(struct mbuf *, int, int, const void *);
@@ -932,6 +882,9 @@ void	m_move_pkthdr(struct mbuf *to, stru
 static __inline u_int m_length(struct mbuf *) __unused;
 static __inline void m_ext_free(struct mbuf *, bool) __unused;
 
+/* Statistics */
+void mbstat_type_add(int, int);
+
 /* Packet tag routines */
 struct	m_tag *m_tag_get(int, int, int);
 void	m_tag_free(struct m_tag *);
Index: sys/systm.h
===================================================================
RCS file: /cvsroot/src/sys/sys/systm.h,v
retrieving revision 1.196.2.5
diff -u -p -r1.196.2.5 systm.h
--- sys/systm.h	23 Oct 2007 20:17:27 -0000	1.196.2.5
+++ sys/systm.h	12 Nov 2007 11:34:24 -0000
@@ -93,6 +93,7 @@ extern const char *rootspec;	/* how root
 
 extern int ncpu;		/* number of CPUs configured */
 extern int ncpuonline;		/* number of CPUs online */
+extern bool mp_online;		/* secondary processors are started */
 
 extern const char hexdigits[];	/* "0123456789abcdef" in subr_prf.c */
 extern const char HEXDIGITS[];	/* "0123456789ABCDEF" in subr_prf.c */
Index: kern/init_main.c
===================================================================
RCS file: /cvsroot/src/sys/kern/init_main.c,v
retrieving revision 1.299.2.28
diff -u -p -r1.299.2.28 init_main.c
--- kern/init_main.c	5 Nov 2007 17:08:31 -0000	1.299.2.28
+++ kern/init_main.c	12 Nov 2007 11:34:24 -0000
@@ -117,6 +117,7 @@ __KERNEL_RCSID(0, "$NetBSD: init_main.c,
 #include <sys/exec.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
+#include <sys/sysctl.h>
 #include <sys/reboot.h>
 #include <sys/user.h>
 #include <sys/sysctl.h>
@@ -294,6 +295,8 @@ main(void)
 
 	kmem_init();
 
+	percpu_init();
+
 	/* Initialize the extent manager. */
 	extent_init();
 
@@ -317,12 +320,6 @@ main(void)
 	/* Initialize the buffer cache */
 	bufinit();
 
-	/*
-	 * Initialize mbuf's.  Do this now because we might attempt to
-	 * allocate mbufs or mbuf clusters during autoconfiguration.
-	 */
-	mbinit();
-
 	/* Initialize sockets. */
 	soinit();
 
@@ -360,6 +357,12 @@ main(void)
 	error = mi_cpu_attach(curcpu());
 	KASSERT(error == 0);
 
+	/*
+	 * Initialize mbuf's.  Do this now because we might attempt to
+	 * allocate mbufs or mbuf clusters during autoconfiguration.
+	 */
+	mbinit();
+
 	/* Initialize the sysctl subsystem. */
 	sysctl_init();
 
@@ -662,13 +665,14 @@ main(void)
 
 	/* Create the aiodone daemon kernel thread. */
 	if (workqueue_create(&uvm.aiodone_queue, "aiodoned",
-	    uvm_aiodone_worker, NULL, PRI_VM, IPL_NONE, WQ_MPSAFE))
+	    uvm_aiodone_worker, NULL, PRI_VM, IPL_NONE, WQ_MPSAFE | WQ_PERCPU))
 		panic("fork aiodoned");
 
 	vmem_rehash_start();
 
-#if defined(MULTIPROCESSOR)
 	/* Boot the secondary processors. */
+	mp_online = true;
+#if defined(MULTIPROCESSOR)
 	cpu_boot_secondary_processors();
 #endif
 
Index: kern/kern_cpu.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_cpu.c,v
retrieving revision 1.2.2.9
diff -u -p -r1.2.2.9 kern_cpu.c
--- kern/kern_cpu.c	1 Nov 2007 21:58:16 -0000	1.2.2.9
+++ kern/kern_cpu.c	12 Nov 2007 11:34:24 -0000
@@ -98,6 +98,7 @@ const struct cdevsw cpuctl_cdevsw = {
 kmutex_t cpu_lock;
 int	ncpu;
 int	ncpuonline;
+bool	mp_online;
 
 int
 mi_cpu_attach(struct cpu_info *ci)
@@ -122,6 +123,7 @@ mi_cpu_attach(struct cpu_info *ci)
 	else
 		ci->ci_data.cpu_onproc = ci->ci_data.cpu_idlelwp;
 
+	percpu_init_cpu(ci);
 	softint_init(ci);
 	xc_init_cpu(ci);
 	pool_cache_cpu_init(ci);
Index: kern/uipc_mbuf.c
===================================================================
RCS file: /cvsroot/src/sys/kern/uipc_mbuf.c,v
retrieving revision 1.120.2.3
diff -u -p -r1.120.2.3 uipc_mbuf.c
--- kern/uipc_mbuf.c	1 Nov 2007 21:05:21 -0000	1.120.2.3
+++ kern/uipc_mbuf.c	12 Nov 2007 11:34:24 -0000
@@ -84,6 +84,7 @@ __KERNEL_RCSID(0, "$NetBSD: uipc_mbuf.c,
 #include <sys/syslog.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
+#include <sys/percpu.h>
 #include <sys/pool.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
@@ -126,6 +127,8 @@ static const char mclpool_warnmsg[] =
 
 MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf");
 
+static percpu_t *mbstat_percpu;
+
 #ifdef MBUFTRACE
 struct mownerhead mowners = LIST_HEAD_INITIALIZER(mowners);
 struct mowner unknown_mowners[] = {
@@ -171,6 +174,8 @@ mbinit(void)
 	 */
 	pool_cache_sethardlimit(mcl_cache, nmbclusters, mclpool_warnmsg, 60);
 
+	mbstat_percpu = percpu_alloc(sizeof(struct mbstat_cpu));
+
 	/*
 	 * Set a low water mark for both mbufs and clusters.  This should
 	 * help ensure that they can be allocated in a memory starvation
@@ -250,6 +255,31 @@ sysctl_kern_mbuf(SYSCTLFN_ARGS)
 }
 
 #ifdef MBUFTRACE
+static void
+mowner_convert_to_user(struct mowner *mo, struct mowner_user *mo_user)
+{
+	CPU_INFO_ITERATOR cii;
+	struct cpu_info *ci;
+
+	memset(mo_user, 0, sizeof(*mo_user));
+	KASSERT(sizeof(mo_user->mo_name) == sizeof(mo->mo_name));
+	KASSERT(sizeof(mo_user->mo_descr) == sizeof(mo->mo_descr));
+	memcpy(mo_user->mo_name, mo->mo_name, sizeof(mo->mo_name));
+	memcpy(mo_user->mo_descr, mo->mo_descr, sizeof(mo->mo_descr));
+
+	percpu_traverse_enter();
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		struct mowner_counter *mc;
+		int i;
+
+		mc = percpu_getptr_remote(mo->mo_counters, ci);
+		for (i = 0; i < MOWNER_COUNTER_NCOUNTERS; i++) {
+			mo_user->mo_counter[i] += mc->mc_counter[i];
+		}
+	}
+	percpu_traverse_exit();
+}
+
 static int
 sysctl_kern_mbuf_mowners(SYSCTLFN_ARGS)
 {
@@ -263,16 +293,21 @@ sysctl_kern_mbuf_mowners(SYSCTLFN_ARGS)
 		return (EPERM);
 
 	LIST_FOREACH(mo, &mowners, mo_link) {
+		struct mowner_user mo_user;
+
+		mowner_convert_to_user(mo, &mo_user);
+
 		if (oldp != NULL) {
-			if (*oldlenp - len < sizeof(*mo)) {
+			if (*oldlenp - len < sizeof(mo_user)) {
 				error = ENOMEM;
 				break;
 			}
-			error = copyout(mo, (char *)oldp + len, sizeof(*mo));
+			error = copyout(&mo_user, (char *)oldp + len,
+			    sizeof(mo_user));
 			if (error)
 				break;
 		}
-		len += sizeof(*mo);
+		len += sizeof(mo_user);
 	}
 
 	if (error == 0)
@@ -282,6 +317,40 @@ sysctl_kern_mbuf_mowners(SYSCTLFN_ARGS)
 }
 #endif /* MBUFTRACE */
 
+static void
+mbstat_convert_to_user(struct mbstat *mbs)
+{
+	CPU_INFO_ITERATOR cii;
+	struct cpu_info *ci;
+
+	memset(mbs, 0, sizeof(*mbs));
+	mbs->m_drain = mbstat.m_drain;
+	percpu_traverse_enter();
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		struct mbstat_cpu *mbsc;
+		int i;
+
+		mbsc = percpu_getptr_remote(mbstat_percpu, ci);
+		for (i = 0; i < __arraycount(mbs->m_mtypes); i++) {
+			mbs->m_mtypes[i] += mbsc->m_mtypes[i];
+		}
+	}
+	percpu_traverse_exit();
+}
+
+static int
+sysctl_kern_mbuf_stats(SYSCTLFN_ARGS)
+{
+	struct sysctlnode node;
+	struct mbstat mbs;
+
+	mbstat_convert_to_user(&mbs);
+	node = *rnode;
+	node.sysctl_data = &mbs;
+	node.sysctl_size = sizeof(mbs);
+	return sysctl_lookup(SYSCTLFN_CALL(&node));
+}
+
 SYSCTL_SETUP(sysctl_kern_mbuf_setup, "sysctl kern.mbuf subtree setup")
 {
 
@@ -331,7 +400,7 @@ SYSCTL_SETUP(sysctl_kern_mbuf_setup, "sy
 		       CTLFLAG_PERMANENT,
 		       CTLTYPE_STRUCT, "stats",
 		       SYSCTL_DESCR("mbuf allocation statistics"),
-		       NULL, 0, &mbstat, sizeof(mbstat),
+		       sysctl_kern_mbuf_stats, 0, NULL, 0,
 		       CTL_KERN, KERN_MBUF, MBUF_STATS, CTL_EOL);
 #ifdef MBUFTRACE
 	sysctl_createv(clog, 0, NULL, NULL,
@@ -407,8 +476,20 @@ m_get(int nowait, int type)
 {
 	struct mbuf *m;
 
-	MGET(m, nowait, type);
-	return (m);
+	m = pool_cache_get(mb_cache,
+	    nowait == M_WAIT ? PR_WAITOK|PR_LIMITFAIL : 0);
+	if (m == NULL)
+		return NULL;
+
+	mbstat_type_add(type, 1);
+	mowner_init(m, type);
+	m->m_type = type;
+	m->m_next = NULL;
+	m->m_nextpkt = NULL;
+	m->m_data = m->m_dat;
+	m->m_flags = 0;
+
+	return m;
 }
 
 struct mbuf *
@@ -416,8 +497,18 @@ m_gethdr(int nowait, int type)
 {
 	struct mbuf *m;
 
-	MGETHDR(m, nowait, type);
-	return (m);
+	m = m_get(nowait, type);
+	if (m == NULL)
+		return NULL;
+
+	m->m_data = m->m_pktdat;
+	m->m_flags = M_PKTHDR;
+	m->m_pkthdr.rcvif = NULL;
+	m->m_pkthdr.csum_flags = 0;
+	m->m_pkthdr.csum_data = 0;
+	SLIST_INIT(&m->m_pkthdr.tags);
+
+	return m;
 }
 
 struct mbuf *
@@ -1484,3 +1575,127 @@ nextchain:
 	}
 }
 #endif /* defined(DDB) */
+
+void
+mbstat_type_add(int type, int diff)
+{
+	struct mbstat_cpu *mb;
+	int s;
+
+	s = splvm();
+	mb = percpu_getptr(mbstat_percpu);
+	mb->m_mtypes[type] += diff;
+	splx(s);
+}
+
+#if defined(MBUFTRACE)
+void
+mowner_attach(struct mowner *mo)
+{
+
+	KASSERT(mo->mo_counters == NULL);
+	mo->mo_counters = percpu_alloc(sizeof(struct mowner_counter));
+
+	/* XXX lock */
+	LIST_INSERT_HEAD(&mowners, mo, mo_link);
+}
+
+void
+mowner_detach(struct mowner *mo)
+{
+
+	KASSERT(mo->mo_counters != NULL);
+
+	/* XXX lock */
+	LIST_REMOVE(mo, mo_link);
+
+	percpu_free(mo->mo_counters, sizeof(struct mowner_counter));
+	mo->mo_counters = NULL;
+}
+
+static struct mowner_counter *
+mowner_counter(struct mowner *mo)
+{
+
+	return percpu_getptr(mo->mo_counters);
+}
+
+void
+mowner_init(struct mbuf *m, int type)
+{
+	struct mowner_counter *mc;
+	struct mowner *mo;
+	int s;
+
+	m->m_owner = mo = &unknown_mowners[type];
+	s = splvm();
+	mc = mowner_counter(mo);
+	mc->mc_counter[MOWNER_COUNTER_CLAIMS]++;
+	splx(s);
+}
+
+void
+mowner_ref(struct mbuf *m, int flags)
+{
+	struct mowner *mo = m->m_owner;
+	struct mowner_counter *mc;
+	int s;
+
+	s = splvm();
+	mc = mowner_counter(mo);
+	if ((flags & M_EXT) != 0)
+		mc->mc_counter[MOWNER_COUNTER_EXT_CLAIMS]++;
+	if ((flags & M_CLUSTER) != 0)
+		mc->mc_counter[MOWNER_COUNTER_CLUSTER_CLAIMS]++;
+	splx(s);
+}
+
+void
+mowner_revoke(struct mbuf *m, bool all, int flags)
+{
+	struct mowner *mo = m->m_owner;
+	struct mowner_counter *mc;
+	int s;
+
+	s = splvm();
+	mc = mowner_counter(mo);
+	if ((flags & M_EXT) != 0)
+		mc->mc_counter[MOWNER_COUNTER_EXT_RELEASES]++;
+	if ((flags & M_CLUSTER) != 0)
+		mc->mc_counter[MOWNER_COUNTER_CLUSTER_RELEASES]++;
+	if (all)
+		mc->mc_counter[MOWNER_COUNTER_RELEASES]++;
+	splx(s);
+	if (all)
+		m->m_owner = &revoked_mowner;
+}
+
+static void
+mowner_claim(struct mbuf *m, struct mowner *mo)
+{
+	struct mowner_counter *mc;
+	int flags = m->m_flags;
+	int s;
+
+	s = splvm();
+	mc = mowner_counter(mo);
+	mc->mc_counter[MOWNER_COUNTER_CLAIMS]++;
+	if ((flags & M_EXT) != 0)
+		mc->mc_counter[MOWNER_COUNTER_EXT_CLAIMS]++;
+	if ((flags & M_CLUSTER) != 0)
+		mc->mc_counter[MOWNER_COUNTER_CLUSTER_CLAIMS]++;
+	splx(s);
+	m->m_owner = mo;
+}
+
+void
+m_claim(struct mbuf *m, struct mowner *mo)
+{
+
+	if (m->m_owner == mo || mo == NULL)
+		return;
+
+	mowner_revoke(m, true, m->m_flags);
+	mowner_claim(m, mo);
+}
+#endif /* defined(MBUFTRACE) */
Index: conf/files
===================================================================
RCS file: /cvsroot/src/sys/conf/files,v
retrieving revision 1.834.2.11
diff -u -p -r1.834.2.11 files
--- conf/files	23 Oct 2007 20:17:04 -0000	1.834.2.11
+++ conf/files	12 Nov 2007 11:34:24 -0000
@@ -1378,6 +1378,7 @@ file	kern/subr_hash.c
 file	kern/subr_kmem.c
 file	kern/subr_lockdebug.c
 file	kern/subr_log.c
+file	kern/subr_percpu.c
 file	kern/subr_pool.c
 file	kern/subr_prf.c
 file	kern/subr_prf2.c

--NextPart-20071112203611-0619600--