tech-kern archive

passive references



To make the network stack scale well to multiple cores, the packet-
processing path needs to share resources such as routes, tunnels,
pcbs, &c., between cores without incurring much interprocessor
synchronization.

It would be nice to use pserialize(9) for this, but many of these
resources are held by code paths during packet processing that may
sleep, which is not allowed in a pserialize read section.  The two
obvious ways to resolve this are:

- Change all of these paths so that they don't sleep and can be run
  inside a pserialize read section.  This is a major engineering
  effort, because the network stack is such a complex interdependent
  beast.

- Add a reference count to each route, tunnel, pcb, &c.  This would
  work to make the network stack *safe* to run on multiple cores, but
  it incurs interprocessor synchronization for each use and hence
  fails to make the network stack *scalable* to multiple cores.
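
  For concreteness, the second option amounts to something like the
  hypothetical per-object count sketched below (names invented for
  illustration, not actual netinet code).  Every acquire and release
  is an atomic read-modify-write on a cache line shared by all CPUs,
  so each use bounces that line between cores:

	#include <sys/atomic.h>

	struct rtentry_rc {			/* hypothetical refcounted route */
		volatile unsigned int	rtrc_refcnt;
		/* ... */
	};

	static void
	rtentry_rc_acquire(struct rtentry_rc *rt)
	{

		/* Atomic increment: interprocessor synchronization on every use.  */
		atomic_inc_uint(&rt->rtrc_refcnt);
	}

	static void
	rtentry_rc_release(struct rtentry_rc *rt)
	{

		/* Atomic decrement: the same cost again on every release.  */
		atomic_dec_uint(&rt->rtrc_refcnt);
	}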

Prompted by discussion with rmind@ and dyoung@, I threw together a
sketch for an abstraction rmind called `passive references' which can
be held across sleeps on a single CPU -- e.g., in a softint LWP or
CPU-bound kthread -- but which incur no interprocessor synchronization
to acquire and release.  This would serve as an intermediary between
the two options so that we can incrementally adapt the network stack.

The idea is that acquiring a reference puts an entry on a CPU-local
list, which can be done inside a pserialize read section.  Releasing
the reference removes the entry.  When an object is about to be
destroyed -- e.g., when you are unconfiguring a tunnel -- you mark it
as unusable so that nobody can acquire new references, and then wait
until there are no references on any CPU's list.
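
In terms of the API sketched in the attached file, an object's life
cycle looks roughly like this (obj and class are placeholders, and
error handling is elided):

	struct psref psref;
	int s;

	/* Reader (softint or CPU-bound LWP): */
	s = pserialize_read_enter();
	/* ...find obj in some pserialize-protected structure... */
	if (psref_acquire(&psref, &obj->obj_psref, class) != 0)
		...;				/* obj is going away: skip it */
	pserialize_read_exit(s);
	/* ...use obj, possibly sleeping, without switching CPUs... */
	psref_release(&psref, &obj->obj_psref, class);

	/* Destroyer: */
	psref_target_drain(&obj->obj_psref, class);	/* block new refs, wait */
	/* ...unlink obj, pserialize_perform(), destroy and free obj... */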

The attached file contains a summary of the design, an example of use,
and a sketch of an implementation, with input and proof-reading from
riz@.

Thoughts?


A variant of this approach which dyoung@ has used in the past is to
count the number of references, instead of putting them on a list, on
each CPU.  I first wrote a sketch with a count instead of a list,
thinking mainly of using this just for ip_encap tunnels, of which
there are likely relatively few, and not for routes or pcbs.

However, if there are many more objects than references -- as I expect
to be the case for most kinds of packet flow, of which the
packet-processing path will handle only one or two at a time -- it
would waste a lot of space to keep one count on each CPU for each
object, whereas the list of all references on each CPU (to any object)
would stay relatively short.
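
For comparison, the per-CPU count variant might look roughly like the
following (hypothetical names, not part of the attached sketch;
interrupt blocking elided).  Acquire and release touch only CPU-local
storage, but every object pays for one counter per CPU, and draining
still needs an xcall to check the counts:

	#include <sys/percpu.h>

	struct pcpu_ref {
		struct percpu	*pr_percpu;	/* unsigned int: refs per CPU */
	};

	static void
	pcpu_ref_acquire(struct pcpu_ref *pr)
	{
		unsigned int *countp;

		/* CPU-local increment: no interprocessor synchronization.  */
		countp = percpu_getref(pr->pr_percpu);
		(*countp)++;
		percpu_putref(pr->pr_percpu);
	}

	static void
	pcpu_ref_release(struct pcpu_ref *pr)
	{
		unsigned int *countp;

		countp = percpu_getref(pr->pr_percpu);
		(*countp)--;
		percpu_putref(pr->pr_percpu);
	}

That is one counter per CPU per object, whereas the list scheme costs
storage per outstanding reference, not per object.
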
/*
 * Passive references
 *
 *	Passive references are references to objects that guarantee the
 *	object will not be destroyed until the reference is released.
 *
 *	Passive references require no interprocessor synchronization to
 *	acquire or release.  However, destroying the target of passive
 *	references requires expensive interprocessor synchronization --
 *	xcalls to determine on which CPUs the object is still in use.
 *
 *	Passive references may be held only on a single CPU and by a
 *	single LWP.  They require the caller to allocate a little stack
 *	space, a struct psref object.  Sleeping while a passive
 *	reference is held is allowed, provided that the owner's LWP is
 *	bound to a CPU -- e.g., the owner is a softint or a bound
 *	kthread.  However, sleeping should be kept to a short duration,
 *	e.g. sleeping on an adaptive lock.
 *
 *	Passive references serve as an intermediate stage between
 *	reference counting and passive serialization (pserialize(9)):
 *
 *	- If you need references to transfer from CPU to CPU or LWP to
 *	  LWP, or if you need long-term references, you must use
 *	  reference counting, e.g. with atomic operations or locks,
 *	  which incurs interprocessor synchronization for every use --
 *	  cheaper than an xcall, but not scalable.
 *
 *	- If all users *guarantee* that they will not sleep, then it is
 *	  not necessary to use passive references: you may as well just
 *	  use the even cheaper pserialize(9), because you have
 *	  satisfied the requirements of a pserialize read section.
 */

#if EXAMPLE
struct frotz {
	struct psref_target	frotz_psref;
	LIST_ENTRY(frotz)	frotz_entry;
	...
};

static struct {
	kmutex_t		lock;
	pserialize_t		psz;
	LIST_HEAD(, frotz)	head;
	struct psref_class	*class;
} frobbotzim __cacheline_aligned;

static int
frobbotzim_init(void)
{

	mutex_init(&frobbotzim.lock, MUTEX_DEFAULT, IPL_NONE);
	frobbotzim.psz = pserialize_create();
	if (frobbotzim.psz == NULL)
		goto fail0;
	LIST_INIT(&frobbotzim.head);
	frobbotzim.class = psref_class_create("frotz", IPL_SOFTNET);
	if (frobbotzim.class == NULL)
		goto fail1;

	return 0;

fail2: __unused
	psref_class_destroy(frobbotzim.class);
fail1:	pserialize_destroy(frobbotzim.psz);
fail0:	mutex_destroy(&frobbotzim.lock);
	return ENOMEM;
}

static void
frobbotzim_exit(void)
{

	KASSERT(LIST_EMPTY(&frobbotzim.head));

	psref_class_destroy(frobbotzim.class);
	pserialize_destroy(frobbotzim.psz);
	mutex_destroy(&frobbotzim.lock);
}

static struct frotz *
frotz_create(...)
{
	struct frotz *frotz;

	frotz = kmem_alloc(sizeof(*frotz), KM_SLEEP);
	if (frotz == NULL)
		return NULL;

	psref_target_init(&frotz->frotz_psref, frobbotzim.class);
	...initialize fields...;

	mutex_enter(&frobbotzim.lock);
	/*
	 * Publish the frotz only after its fields are initialized;
	 * psref_target_init requires the caller to issue a
	 * membar_producer before exposing the target to other CPUs.
	 */
	membar_producer();
	LIST_INSERT_HEAD(&frobbotzim.head, frotz, frotz_entry);
	mutex_exit(&frobbotzim.lock);

	return frotz;
}

static void
frotz_destroy(struct frotz *frotz)
{

	psref_target_drain(&frotz->frotz_psref, frobbotzim.class);
	mutex_enter(&frobbotzim.lock);
	LIST_REMOVE(frotz, frotz_entry);
	pserialize_perform(frobbotzim.psz);
	mutex_exit(&frobbotzim.lock);

	psref_target_destroy(&frotz->frotz_psref, frobbotzim.class);
	...destroy fields...;

	kmem_free(frotz, sizeof(*frotz));
}

static struct frotz *
frotz_lookup(uint64_t key, struct psref *psref)
{
	struct frotz *frotz;
	int s;

	s = pserialize_read_enter();
	LIST_FOREACH(frotz, &frobbotzim.head, frotz_entry) {
		membar_datadep_consumer();
		if (!match(frotz, key))
			continue;
		if (psref_acquire(psref, &frotz->frotz_psref, frobbotzim.class)
		    != 0)
			continue;
		break;
	}
	pserialize_read_exit(s);

	return frotz;
}

static void
frotz_input(struct mbuf *m, ...)
{
	struct frotz *frotz;
	struct psref psref;

	...parse m...;
	frotz = frotz_lookup(key, &psref);
	if (frotz == NULL) {
		/* Drop packet.  */
		m_freem(m);
		return;
	}

	(*frotz->frotz_input)(m, ...);
	psref_release(&psref, &frotz->frotz_psref, frobbotzim.class);
}
#endif

#define	PSREF_DEBUG	0

/*
 * struct psref_target
 *
 *	Bookkeeping for an object to which users can acquire passive
 *	references.  This is compact so that it can easily be embedded
 *	into a great many objects, e.g. IP packet flows.
 */
struct psref_target {
	bool			prt_draining;
#if PSREF_DEBUG
	struct psref_class	*prt_class;
#endif
};

/*
 * struct psref
 *
 *	Bookkeeping for a single passive reference.  There should only
 *	be a few of these per CPU in the system at once, no matter how
 *	many targets are stored, so these are a bit larger than struct
 *	psref_target.
 */
struct psref {
	LIST_ENTRY(psref)	psref_entry;
	struct psref_target	*psref_target;
#if PSREF_DEBUG
	struct lwp		*psref_lwp;
	struct cpu_info		*psref_cpu;
#endif
};

/*
 * struct psref_class
 *
 *	Private global state for a class of passive reference targets.
 *	Opaque to callers.
 */
struct psref_class {
	kmutex_t		prc_lock;
	kcondvar_t		prc_cv;
	struct percpu		*prc_percpu; /* struct psref_cpu */
	ipl_cookie_t		prc_iplcookie;
};

/*
 * struct psref_cpu
 *
 *	Private per-CPU state for a class of passive reference targets.
 *	Not exposed by the API.
 */
struct psref_cpu {
	LIST_HEAD(, psref)		pcpu_head;
};

/*
 * psref_class_create(name, ipl)
 *
 *	Create a new passive reference class, with the given wchan name
 *	and ipl.
 */
struct psref_class *
psref_class_create(const char *name, int ipl)
{
	struct psref_class *class;

	class = kmem_alloc(sizeof(*class), KM_SLEEP);
	if (class == NULL)
		goto fail0;

	class->prc_percpu = percpu_alloc(sizeof(struct psref_cpu));
	if (class->prc_percpu == NULL)
		goto fail1;

	mutex_init(&class->prc_lock, MUTEX_DEFAULT, ipl);
	cv_init(&class->prc_cv, name);
	class->prc_iplcookie = makeiplcookie(ipl);

	return class;

fail1:	kmem_free(class, sizeof(*class));
fail0:	return NULL;
}

/*
 * psref_class_destroy(class)
 *
 *	Destroy a passive reference class and free memory associated
 *	with it.  All targets in this class must have been drained and
 *	destroyed already.
 */
void
psref_class_destroy(struct psref_class *class)
{

	cv_destroy(&class->prc_cv);
	mutex_destroy(&class->prc_lock);
	percpu_free(class->prc_percpu, sizeof(struct psref_cpu));
	kmem_free(class, sizeof(*class));
}

/*
 * psref_target_init(target, class)
 *
 *	Initialize a passive reference target in the specified class.
 *	The caller is responsible for issuing a membar_producer before
 *	exposing a pointer to the target to other CPUs.
 */
void
psref_target_init(struct psref_target *target, struct psref_class *class)
{

	target->prt_draining = false;
#if PSREF_DEBUG
	target->prt_class = class;
#endif
}

/*
 * psref_target_destroy(target, class)
 *
 *	Destroy a passive reference target.  It must have previously
 *	been drained.
 */
void
psref_target_destroy(struct psref_target *target, struct psref_class *class)
{

	KASSERT(target->prt_draining);
#if PSREF_DEBUG
	KASSERT(target->prt_class == class);
	target->prt_class = NULL;
#endif
}

/*
 * psref_acquire(psref, target, class)
 *
 *	Try to acquire a passive reference to the specified target,
 *	which must be in the specified class.  On success, returns
 *	zero; on failure, returns a nonzero error code.  If the target
 *	is draining, returns ENOENT.
 *
 *	The caller must guarantee that it will not switch CPUs before
 *	releasing the passive reference, either by disabling
 *	kpreemption and avoiding sleeps, or by being in a softint or in
 *	an LWP bound to a CPU.
 */
int
psref_acquire(struct psref *psref, struct psref_target *target,
    struct psref_class *class)
{
	struct psref_cpu *pcpu;
	int s, error;

	KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() ||
		ISSET(curlwp->l_pflag, LP_BOUND)),
	    "passive references are CPU-local,"
	    " but preemption is enabled and the caller is not"
	    " in a softint or CPU-bound LWP");

#if PSREF_DEBUG
	KASSERT(target->prt_class == class);
#endif

	/* Block interrupts and acquire the current CPU's reference list.  */
	s = splraiseipl(class->prc_iplcookie);
	pcpu = percpu_getref(class->prc_percpu);

	/* Is this target going away?  */
	if (__predict_false(target->prt_draining)) {
		/* Yes: fail.  */
		error = ENOENT;
	} else {
		/* No: record our reference.  */
		LIST_INSERT_HEAD(&pcpu->pcpu_head, psref, psref_entry);
		psref->psref_target = target;
#if PSREF_DEBUG
		psref->psref_lwp = curlwp;
		psref->psref_cpu = curcpu();
#endif
		error = 0;
	}

	/* Release the CPU list and restore interrupts.  */
	percpu_putref(class->prc_percpu);
	splx(s);

	return error;
}

/*
 * psref_release(psref, target, class)
 *
 *	Release a passive reference to the specified target, which must
 *	be in the specified class.
 *
 *	The caller must not have switched CPUs or LWPs since acquiring
 *	the passive reference.
 */
void
psref_release(struct psref *psref, struct psref_target *target,
    struct psref_class *class)
{
	int s;

	KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() ||
		ISSET(curlwp->l_pflag, LP_BOUND)),
	    "passive references are CPU-local,"
	    " but preemption is enabled and the caller is not"
	    " in a softint or CPU-bound LWP");

	KASSERT(psref->psref_target == target);
#if PSREF_DEBUG
	KASSERT(target->prt_class == class);
	KASSERTMSG((psref->psref_lwp == curlwp),
	    "passive reference transferred from lwp %p to lwp %p",
	    psref->psref_lwp, curlwp);
	KASSERTMSG((psref->psref_cpu == curcpu()),
	    "passive reference transferred from CPU %u to CPU %u",
	    cpu_index(psref->psref_cpu), cpu_index(curcpu()));
#endif

	/*
	 * Block interrupts and remove the psref from the current CPU's
	 * list.  No need to percpu_getref or get the head of the list,
	 * and the caller guarantees that we are bound to a CPU anyway
	 * (as does blocking interrupts).
	 */
	s = splraiseipl(class->prc_iplcookie);
	LIST_REMOVE(psref, psref_entry);
	splx(s);

	/* If someone is waiting for users to drain, notify 'em.  */
	if (__predict_false(target->prt_draining))
		cv_broadcast(&class->prc_cv);
}

struct psreffed {
	struct psref_class	*class;
	struct psref_target	*target;
	bool			ret;
};

static void
psreffed_p_xc(void *cookie0, void *cookie1 __unused)
{
	struct psreffed *P = cookie0;
	struct psref_class *class = P->class;
	struct psref_target *target = P->target;
	struct psref_cpu *pcpu;
	struct psref *psref;
	int s;

	/* Block interrupts and acquire the current CPU's reference list.  */
	s = splraiseipl(class->prc_iplcookie);
	pcpu = percpu_getref(class->prc_percpu);

	/*
	 * Check the CPU's reference list for any references to this
	 * target.  This loop shouldn't take very long because any
	 * single CPU should hold only a small number of references at
	 * any given time unless there is a bug.
	 */
	LIST_FOREACH(psref, &pcpu->pcpu_head, psref_entry) {
		if (psref->psref_target == target) {
			/*
			 * No need to lock anything here: every write
			 * transitions from false to true, so as long
			 * as any write goes through we're good.  No
			 * need for a memory barrier because this is
			 * read only after xc_wait, which has already
			 * issued any necessary memory barriers.
			 */
			P->ret = true;
			break;
		}
	}

	/* Release the CPU list and restore interrupts.  */
	percpu_putref(class->prc_percpu);
	splx(s);
}

static bool
psreffed_p(struct psref_target *target, struct psref_class *class)
{
	struct psreffed P = {
		.class = class,
		.target = target,
		.ret = false,
	};

	xc_wait(xc_broadcast(0, &psreffed_p_xc, &P, NULL));

	return P.ret;
}

/*
 * psref_target_drain(target, class)
 *
 *	Prevent new references to target and wait for existing ones to
 *	drain.  May sleep.
 */
void
psref_target_drain(struct psref_target *target, struct psref_class *class)
{

#if PSREF_DEBUG
	KASSERT(target->prt_class == class);
#endif

	KASSERT(!target->prt_draining);
	target->prt_draining = true;

	/* Wait until there are no more references on any CPU.  */
	while (psreffed_p(target, class)) {
		/*
		 * This enter/wait/exit business looks wrong, but it is
		 * both necessary, because psreffed_p performs a
		 * low-priority xcall and hence cannot run while a
		 * mutex is locked, and OK, because the wait is timed
		 * -- explicit wakeups are only an optimization.
		 */
		mutex_enter(&class->prc_lock);
		(void)cv_timedwait(&class->prc_cv, &class->prc_lock, hz);
		mutex_exit(&class->prc_lock);
	}
}

