Subject: some ideas how to make sysv_shm scale better
To: None <tech-kern@netbsd.org>
From: Matthias Drochner <M.Drochner@fz-juelich.de>
List: tech-kern
Date: 09/04/2003 19:33:47
This is a multipart MIME message.

--==_Exmh_5456351885340
Content-Type: text/plain; charset=us-ascii


Some X applications like to allocate many shm segments - eg for pixmaps.
Our default limit (128) is quickly hit. A linux box I just looked at
has a limit of 4096.
This is adjustable at kernel build time, of course. This is ugly and
should be changed. I don't see a good reason to allocate a fixed size
table early with allocsys().
A more serious problem imo is that the per-process table of attached
shm segments is a fixed size table too (vmspace->vm_shm). If we bump
the max number of segments allowed per process (shminfo.shmseg), every
process will allocate that much memory even if it attaches to only one shm
segment. That, and the fact that the large useless table is copied on
fork() even if an exec() follows immediately are dealt with by the
appended patch. It implements a linked list per process, and a simple
reference counting scheme.
"Works for me" as far as I tried.

Comments? (I know, there should be locking for SMP... but it is
at least not worse than before in this respect)

best regards
Matthias



--==_Exmh_5456351885340
Content-Type: text/plain ; name="shm.txt"; charset=us-ascii
Content-Description: shm.txt
Content-Disposition: attachment; filename="shm.txt"

--- sysv_shm.c.~1.68.~	Mon Feb 24 12:04:15 2003
+++ sysv_shm.c	Thu Sep  4 19:07:57 2003
@@ -82,6 +82,8 @@
 #include <sys/mount.h>		/* XXX for <sys/syscallargs.h> */
 #include <sys/sa.h>
 #include <sys/syscallargs.h>
+#include <sys/queue.h>
+#include <sys/pool.h>
 
 #include <uvm/uvm_extern.h>
 
@@ -113,18 +115,29 @@
 	struct uvm_object *shm_object;
 };
 
-struct shmmap_state {
+struct shmmap_entry {
+	SLIST_ENTRY(shmmap_entry) next;
 	vaddr_t va;
 	int shmid;
 };
 
+struct pool shmmap_entry_pool;
+
+struct shmmap_state {
+	unsigned int nitems;
+	unsigned int nrefs;
+	SLIST_HEAD(, shmmap_entry) entries;
+};
+
 static int shm_find_segment_by_key __P((key_t));
 static void shm_deallocate_segment __P((struct shmid_ds *));
-static void shm_delete_mapping __P((struct vmspace *, struct shmmap_state *));
+static void shm_delete_mapping __P((struct vmspace *, struct shmmap_state *,
+				    struct shmmap_entry *));
 static int shmget_existing __P((struct proc *, struct sys_shmget_args *,
 				int, int, register_t *));
 static int shmget_allocate_segment __P((struct proc *, struct sys_shmget_args *,
 					int, register_t *));
+static struct shmmap_state *shmmap_getprivate __P((struct proc *));
 
 static int
 shm_find_segment_by_key(key)
@@ -178,19 +191,22 @@
 }
 
 static void
-shm_delete_mapping(vm, shmmap_s)
+shm_delete_mapping(vm, shmmap_s, shmmap_se)
 	struct vmspace *vm;
 	struct shmmap_state *shmmap_s;
+	struct shmmap_entry *shmmap_se;
 {
 	struct shmid_ds *shmseg;
 	int segnum;
 	size_t size;
 	
-	segnum = IPCID_TO_IX(shmmap_s->shmid);
+	segnum = IPCID_TO_IX(shmmap_se->shmid);
 	shmseg = &shmsegs[segnum];
 	size = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET;
-	uvm_deallocate(&vm->vm_map, shmmap_s->va, size);
-	shmmap_s->shmid = -1;
+	uvm_deallocate(&vm->vm_map, shmmap_se->va, size);
+	SLIST_REMOVE(&shmmap_s->entries, shmmap_se, shmmap_entry, next);
+	shmmap_s->nitems--;
+	pool_put(&shmmap_entry_pool, shmmap_se);
 	shmseg->shm_dtime = time.tv_sec;
 	if ((--shmseg->shm_nattch <= 0) &&
 	    (shmseg->shm_perm.mode & SHMSEG_REMOVED)) {
@@ -199,6 +215,36 @@
 	}
 }
 
+static struct shmmap_state *
+shmmap_getprivate(struct proc *p)
+{
+	struct shmmap_state *oshmmap_s, *shmmap_s;
+	struct shmmap_entry *oshmmap_se, *shmmap_se;
+
+	oshmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm;
+	if (oshmmap_s && oshmmap_s->nrefs == 1)
+		return (oshmmap_s);
+
+	shmmap_s = malloc(sizeof(struct shmmap_state), M_SHM, M_WAITOK);
+	memset(shmmap_s, 0, sizeof(struct shmmap_state));
+	shmmap_s->nrefs = 1;
+	SLIST_INIT(&shmmap_s->entries);
+	p->p_vmspace->vm_shm = (caddr_t)shmmap_s;
+
+	if (!oshmmap_s)
+		return (shmmap_s);
+
+	SLIST_FOREACH(oshmmap_se, &oshmmap_s->entries, next) {
+		shmmap_se = pool_get(&shmmap_entry_pool, PR_WAITOK);
+		shmmap_se->va = oshmmap_se->va;
+		shmmap_se->shmid = oshmmap_se->shmid;
+		SLIST_INSERT_HEAD(&shmmap_s->entries, shmmap_se, next);
+	}
+	shmmap_s->nitems = oshmmap_s->nitems;
+	oshmmap_s->nrefs--;
+	return (shmmap_s);
+}
+
 int
 sys_shmdt(l, v, retval)
 	struct lwp *l;
@@ -210,20 +256,22 @@
 	} */ *uap = v;
 	struct proc *p = l->l_proc;
 	struct shmmap_state *shmmap_s;
-	int i;
+	struct shmmap_entry *shmmap_se;
 
 	shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm;
 	if (shmmap_s == NULL)
 		return EINVAL;
 
-	for (i = 0; i < shminfo.shmseg; i++, shmmap_s++)
-		if (shmmap_s->shmid != -1 &&
-		    shmmap_s->va == (vaddr_t)SCARG(uap, shmaddr))
-			break;
-	if (i == shminfo.shmseg)
-		return EINVAL;
-	shm_delete_mapping(p->p_vmspace, shmmap_s);
-	return 0;
+	shmmap_s = shmmap_getprivate(p);
+	SLIST_FOREACH(shmmap_se, &shmmap_s->entries, next) {
+		if (shmmap_se->va == (vaddr_t)SCARG(uap, shmaddr)) {
+			shm_delete_mapping(p->p_vmspace, shmmap_s, shmmap_se);
+			return 0;
+		}
+	}
+
+	/* not found */
+	return EINVAL;
 }
 
 int
@@ -258,23 +306,16 @@
 	vaddr_t *attachp;
 	int findremoved;
 {
-	int error, i, flags;
+	int error, flags;
 	struct ucred *cred = p->p_ucred;
 	struct shmid_ds *shmseg;
-	struct shmmap_state *shmmap_s = NULL;
+	struct shmmap_state *shmmap_s;
 	struct shm_handle *shm_handle;
 	vaddr_t attach_va;
 	vm_prot_t prot;
 	vsize_t size;
+	struct shmmap_entry *shmmap_se;
 
-	shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm;
-	if (shmmap_s == NULL) {
-		size = shminfo.shmseg * sizeof(struct shmmap_state);
-		shmmap_s = malloc(size, M_SHM, M_WAITOK);
-		for (i = 0; i < shminfo.shmseg; i++)
-			shmmap_s[i].shmid = -1;
-		p->p_vmspace->vm_shm = (caddr_t)shmmap_s;
-	}
 	shmseg = shm_find_segment_by_shmid(shmid, findremoved);
 	if (shmseg == NULL)
 		return EINVAL;
@@ -282,13 +323,11 @@
 		    (shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W);
 	if (error)
 		return error;
-	for (i = 0; i < shminfo.shmseg; i++) {
-		if (shmmap_s->shmid == -1)
-			break;
-		shmmap_s++;
-	}
-	if (i >= shminfo.shmseg)
+
+	shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm;
+	if (shmmap_s && shmmap_s->nitems >= shminfo.shmseg)
 		return EMFILE;
+
 	size = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET;
 	prot = VM_PROT_READ;
 	if ((shmflg & SHM_RDONLY) == 0)
@@ -315,8 +354,12 @@
 	if (error) {
 		return error;
 	}
-	shmmap_s->va = attach_va;
-	shmmap_s->shmid = shmid;
+	shmmap_se = pool_get(&shmmap_entry_pool, PR_WAITOK);
+	shmmap_se->va = attach_va;
+	shmmap_se->shmid = shmid;
+	shmmap_s = shmmap_getprivate(p);
+	SLIST_INSERT_HEAD(&shmmap_s->entries, shmmap_se, next);
+	shmmap_s->nitems++;
 	shmseg->shm_lpid = p->p_pid;
 	shmseg->shm_atime = time.tv_sec;
 	shmseg->shm_nattch++;
@@ -554,21 +597,18 @@
 	struct vmspace *vm1, *vm2;
 {
 	struct shmmap_state *shmmap_s;
-	size_t size;
-	int i;
+	struct shmmap_entry *shmmap_se;
+
+	vm2->vm_shm = vm1->vm_shm;
 
-	if (vm1->vm_shm == NULL) {
-		vm2->vm_shm = NULL;
+	if (vm1->vm_shm == NULL)
 		return;
-	}
 
-	size = shminfo.shmseg * sizeof(struct shmmap_state);
-	shmmap_s = malloc(size, M_SHM, M_WAITOK);
-	memcpy(shmmap_s, vm1->vm_shm, size);
-	vm2->vm_shm = (caddr_t)shmmap_s;
-	for (i = 0; i < shminfo.shmseg; i++, shmmap_s++)
-		if (shmmap_s->shmid != -1)
-			shmsegs[IPCID_TO_IX(shmmap_s->shmid)].shm_nattch++;
+	shmmap_s = (struct shmmap_state *)vm1->vm_shm;
+
+	SLIST_FOREACH(shmmap_se, &shmmap_s->entries, next)
+		shmsegs[IPCID_TO_IX(shmmap_se->shmid)].shm_nattch++;
+	shmmap_s->nrefs++;
 }
 
 void
@@ -576,16 +616,26 @@
 	struct vmspace *vm;
 {
 	struct shmmap_state *shmmap_s;
-	int i;
+	struct shmmap_entry *shmmap_se;
 
 	shmmap_s = (struct shmmap_state *)vm->vm_shm;
 	if (shmmap_s == NULL)
 		return;
-	for (i = 0; i < shminfo.shmseg; i++, shmmap_s++)
-		if (shmmap_s->shmid != -1)
-			shm_delete_mapping(vm, shmmap_s);
-	free(vm->vm_shm, M_SHM);
+
 	vm->vm_shm = NULL;
+
+	if (--shmmap_s->nrefs > 0) {
+		SLIST_FOREACH(shmmap_se, &shmmap_s->entries, next)
+			shmsegs[IPCID_TO_IX(shmmap_se->shmid)].shm_nattch--;
+		return;
+	}
+
+	while (!SLIST_EMPTY(&shmmap_s->entries)) {
+		shmmap_se = SLIST_FIRST(&shmmap_s->entries);
+		shm_delete_mapping(vm, shmmap_s, shmmap_se);
+	}
+	KASSERT(shmmap_s->nitems == 0);
+	free(shmmap_s, M_SHM);
 }
 
 void
@@ -602,4 +652,7 @@
 	shm_last_free = 0;
 	shm_nused = 0;
 	shm_committed = 0;
+
+	pool_init(&shmmap_entry_pool, sizeof(struct shmmap_entry), 0, 0, 0,
+		  "shmmp", 0);
 }

--==_Exmh_5456351885340--