Subject: make vnodes freeable
To: None <tech-kern@netbsd.org>
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: tech-kern
Date: 07/02/2003 23:36:58
hi,

The following patch makes vnodes freeable,
i.e. it allows kern.maxvnodes to be decreased.

If no one objects, I'll check it in in a few days.

YAMAMOTO Takashi


Index: kern/kern_sysctl.c
===================================================================
--- kern/kern_sysctl.c	(revision 182)
+++ kern/kern_sysctl.c	(working copy)
@@ -375,8 +375,11 @@ kern_sysctl(int *name, u_int namelen, vo
 		error = sysctl_int(oldp, oldlenp, newp, newlen, &desiredvnodes);
 		if (newp && !error) {
 			if (old_vnodes > desiredvnodes) {
-				desiredvnodes = old_vnodes;
-				return (EINVAL);
+				error = vfs_drainvnodes(desiredvnodes, p);
+				if (error) {
+					desiredvnodes = old_vnodes;
+					return error;
+				}
 			}
 			vfs_reinit();
 			nchreinit();
Index: kern/vfs_cache.c
===================================================================
--- kern/vfs_cache.c	(revision 154)
+++ kern/vfs_cache.c	(working copy)
@@ -97,6 +97,45 @@ int doingcache = 1;			/* 1 => enable the
 /* A single lock to protect cache insertion, removal and lookup */
 static struct simplelock namecache_slock = SIMPLELOCK_INITIALIZER;
 
+static void cache_remove(struct namecache *);
+static void cache_free(struct namecache *);
+
+static void
+cache_remove(struct namecache *ncp)
+{
+
+	LOCK_ASSERT(simple_lock_held(&namecache_slock));
+
+	ncp->nc_dvp = NULL;
+	ncp->nc_vp = NULL;
+
+	TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
+	if (ncp->nc_hash.le_prev != NULL) {
+		LIST_REMOVE(ncp, nc_hash);
+		ncp->nc_hash.le_prev = NULL;
+	}
+	if (ncp->nc_vhash.le_prev != NULL) {
+		LIST_REMOVE(ncp, nc_vhash);
+		ncp->nc_vhash.le_prev = NULL;
+	}
+	if (ncp->nc_vlist.le_prev != NULL) {
+		LIST_REMOVE(ncp, nc_vlist);
+		ncp->nc_vlist.le_prev = NULL;
+	}
+	if (ncp->nc_dvlist.le_prev != NULL) {
+		LIST_REMOVE(ncp, nc_dvlist);
+		ncp->nc_dvlist.le_prev = NULL;
+	}
+}
+
+static void
+cache_free(struct namecache *ncp)
+{
+
+	pool_put(&namecache_pool, ncp);
+	numcache--; /* XXX MP */
+}
+
 /*
  * Look for a the name in the cache. We don't do this
  * if the segment name is long, simply so the cache can avoid
@@ -370,15 +409,7 @@ cache_enter(struct vnode *dvp, struct vn
 		memset(ncp, 0, sizeof(*ncp));
 		simple_lock(&namecache_slock);
 	} else if ((ncp = TAILQ_FIRST(&nclruhead)) != NULL) {
-		TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
-		if (ncp->nc_hash.le_prev != NULL) {
-			LIST_REMOVE(ncp, nc_hash);
-			ncp->nc_hash.le_prev = NULL;
-		}
-		if (ncp->nc_vhash.le_prev != NULL) {
-			LIST_REMOVE(ncp, nc_vhash);
-			ncp->nc_vhash.le_prev = NULL;
-		}
+		cache_remove(ncp);
 	} else {
 		simple_unlock(&namecache_slock);
 		return;
@@ -396,6 +427,9 @@ cache_enter(struct vnode *dvp, struct vn
 	}
 	/* Fill in cache info. */
 	ncp->nc_dvp = dvp;
+	LIST_INSERT_HEAD(&dvp->v_dnclist, ncp, nc_dvlist);
+	if (vp)
+		LIST_INSERT_HEAD(&vp->v_nclist, ncp, nc_vlist);
 	ncp->nc_dvpid = dvp->v_id;
 	ncp->nc_nlen = cnp->cn_namelen;
 	memcpy(ncp->nc_name, cnp->cn_nameptr, (unsigned)ncp->nc_nlen);
@@ -494,11 +528,21 @@ nchreinit(void)
 void
 cache_purge(struct vnode *vp)
 {
-	struct namecache *ncp;
+	struct namecache *ncp, *ncnext;
 	struct nchashhead *ncpp;
 	static u_long nextvnodeid;
 
 	simple_lock(&namecache_slock);
+	for (ncp = LIST_FIRST(&vp->v_nclist); ncp != NULL; ncp = ncnext) {
+		ncnext = LIST_NEXT(ncp, nc_vlist);
+		cache_remove(ncp);
+		cache_free(ncp);
+	}
+	for (ncp = LIST_FIRST(&vp->v_dnclist); ncp != NULL; ncp = ncnext) {
+		ncnext = LIST_NEXT(ncp, nc_dvlist);
+		cache_remove(ncp);
+		cache_free(ncp);
+	}
 	vp->v_id = ++nextvnodeid;
 	if (nextvnodeid != 0)
 		goto out;
@@ -529,18 +573,8 @@ cache_purgevfs(struct mount *mp)
 			continue;
 		}
 		/* Free the resources we had. */
-		ncp->nc_vp = NULL;
-		ncp->nc_dvp = NULL;
-		TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
-		if (ncp->nc_hash.le_prev != NULL) {
-			LIST_REMOVE(ncp, nc_hash);
-			ncp->nc_hash.le_prev = NULL;
-		}
-		if (ncp->nc_vhash.le_prev != NULL) {
-			LIST_REMOVE(ncp, nc_vhash);
-			ncp->nc_vhash.le_prev = NULL;
-		}
-		TAILQ_INSERT_HEAD(&nclruhead, ncp, nc_lru);
+		cache_remove(ncp);
+		cache_free(ncp);
 	}
 	simple_unlock(&namecache_slock);
 }
Index: kern/vfs_subr.c
===================================================================
--- kern/vfs_subr.c	(revision 154)
+++ kern/vfs_subr.c	(working copy)
@@ -191,6 +191,7 @@ static int vfs_hang_addrlist __P((struct
 				  struct export_args *));
 static int vfs_free_netcred __P((struct radix_node *, void *));
 static void vfs_free_addrlist __P((struct netexport *));
+static struct vnode *getcleanvnode __P((struct proc *));
 
 #ifdef DEBUG
 void printlockedvnodes __P((void));
@@ -212,6 +213,80 @@ vntblinit()
 	vn_initialize_syncerd();
 }
 
+int
+vfs_drainvnodes(long target, struct proc *p)
+{
+
+	simple_lock(&vnode_free_list_slock);
+	while (numvnodes > target) {
+		struct vnode *vp;
+
+		vp = getcleanvnode(p);
+		if (vp == NULL)
+			return EBUSY; /* give up */
+		pool_put(&vnode_pool, vp);
+		simple_lock(&vnode_free_list_slock);
+		numvnodes--;
+	}
+	simple_unlock(&vnode_free_list_slock);
+
+	return 0;
+}
+
+/*
+ * grab a vnode from freelist and clean it.
+ */
+struct vnode *
+getcleanvnode(p)
+	struct proc *p;
+{
+	struct vnode *vp;
+	struct freelst *listhd;
+
+	LOCK_ASSERT(simple_lock_held(&vnode_free_list_slock));
+	if ((vp = TAILQ_FIRST(listhd = &vnode_free_list)) == NULL)
+		vp = TAILQ_FIRST(listhd = &vnode_hold_list);
+	for (; vp != NULL; vp = TAILQ_NEXT(vp, v_freelist)) {
+		if (simple_lock_try(&vp->v_interlock)) {
+			if ((vp->v_flag & VLAYER) == 0) {
+				break;
+			}
+			if (VOP_ISLOCKED(vp) == 0)
+				break;
+			else
+				simple_unlock(&vp->v_interlock);
+		}
+	}
+
+	if (vp == NULLVP) {
+		simple_unlock(&vnode_free_list_slock);
+		return NULLVP;
+	}
+
+	if (vp->v_usecount)
+		panic("free vnode isn't, vp %p", vp);
+	TAILQ_REMOVE(listhd, vp, v_freelist);
+	/* see comment on why 0xdeadb is set at end of vgone (below) */
+	vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
+	simple_unlock(&vnode_free_list_slock);
+	vp->v_lease = NULL;
+
+	if (vp->v_type != VBAD)
+		vgonel(vp, p);
+	else
+		simple_unlock(&vp->v_interlock);
+#ifdef DIAGNOSTIC
+	if (vp->v_data || vp->v_uobj.uo_npages ||
+	    TAILQ_FIRST(&vp->v_uobj.memq))
+		panic("cleaned vnode isn't, vp %p", vp);
+	if (vp->v_numoutput)
+		panic("clean vnode has pending I/O's, vp %p", vp);
+#endif
+	KASSERT((vp->v_flag & VONWORKLST) == 0);
+
+	return vp;
+}
+
 /*
  * Mark a mount point as busy. Used to synchronize access and to delay
  * unmounting. Interlock is not released on failure.
@@ -434,7 +509,6 @@ getnewvnode(tag, mp, vops, vpp)
 	extern struct uvm_pagerops uvm_vnodeops;
 	struct uvm_object *uobj;
 	struct proc *p = curproc;	/* XXX */
-	struct freelst *listhd;
 	static int toggle;
 	struct vnode *vp;
 	int error = 0, tryalloc;
@@ -494,26 +568,13 @@ getnewvnode(tag, mp, vops, vpp)
 		TAILQ_INIT(&uobj->memq);
 		numvnodes++;
 	} else {
-		if ((vp = TAILQ_FIRST(listhd = &vnode_free_list)) == NULL)
-			vp = TAILQ_FIRST(listhd = &vnode_hold_list);
-		for (; vp != NULL; vp = TAILQ_NEXT(vp, v_freelist)) {
-			if (simple_lock_try(&vp->v_interlock)) {
-				if ((vp->v_flag & VLAYER) == 0) {
-					break;
-				}
-				if (VOP_ISLOCKED(vp) == 0)
-					break;
-				else
-					simple_unlock(&vp->v_interlock);
-			}
-		}
+		vp = getcleanvnode(p);
 		/*
 		 * Unless this is a bad time of the month, at most
 		 * the first NCPUS items on the free list are
 		 * locked, so this is close enough to being empty.
 		 */
 		if (vp == NULLVP) {
-			simple_unlock(&vnode_free_list_slock);
 			if (mp && error != EDEADLK)
 				vfs_unbusy(mp);
 			if (tryalloc) {
@@ -526,26 +587,6 @@ getnewvnode(tag, mp, vops, vpp)
 			*vpp = 0;
 			return (ENFILE);
 		}
-		if (vp->v_usecount)
-			panic("free vnode isn't, vp %p", vp);
-		TAILQ_REMOVE(listhd, vp, v_freelist);
-		/* see comment on why 0xdeadb is set at end of vgone (below) */
-		vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
-		simple_unlock(&vnode_free_list_slock);
-		vp->v_lease = NULL;
-
-		if (vp->v_type != VBAD)
-			vgonel(vp, p);
-		else
-			simple_unlock(&vp->v_interlock);
-#ifdef DIAGNOSTIC
-		if (vp->v_data || vp->v_uobj.uo_npages ||
-		    TAILQ_FIRST(&vp->v_uobj.memq))
-			panic("cleaned vnode isn't, vp %p", vp);
-		if (vp->v_numoutput)
-			panic("clean vnode has pending I/O's, vp %p", vp);
-#endif
-		KASSERT((vp->v_flag & VONWORKLST) == 0);
 		vp->v_flag = 0;
 		vp->v_socket = NULL;
 #ifdef VERIFIED_EXEC
@@ -1759,7 +1800,7 @@ vgonel(vp, p)
 	 * If it is on the freelist and not already at the head,
 	 * move it to the head of the list. The test of the back
 	 * pointer and the reference count of zero is because
-	 * it will be removed from the free list by getnewvnode,
+	 * it will be removed from the free list by getcleanvnode,
 	 * but will not have its reference count incremented until
 	 * after calling vgone. If the reference count were
 	 * incremented first, vgone would (incorrectly) try to
@@ -1769,18 +1810,26 @@ vgonel(vp, p)
 	 * that we do not try to move it here.
 	 */
 
+	vp->v_type = VBAD;
 	if (vp->v_usecount == 0) {
+		boolean_t dofree;
+
 		simple_lock(&vnode_free_list_slock);
 		if (vp->v_holdcnt > 0)
 			panic("vgonel: not clean, vp %p", vp);
-		if (vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb &&
-		    TAILQ_FIRST(&vnode_free_list) != vp) {
+		/*
+		 * if it isn't on the freelist, we're called by getcleanvnode.
+		 * otherwise, we'll free it.
+		 */
+		dofree = vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb;
+		if (dofree) {
 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
-			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
+			numvnodes--;
 		}
 		simple_unlock(&vnode_free_list_slock);
+		if (dofree)
+			pool_put(&vnode_pool, vp);
 	}
-	vp->v_type = VBAD;
 }
 
 /*
Index: sys/namei.h
===================================================================
--- sys/namei.h	(revision 1)
+++ sys/namei.h	(working copy)
@@ -166,8 +166,10 @@ struct	namecache {
 	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
 	TAILQ_ENTRY(namecache) nc_lru;	/* LRU chain */
 	LIST_ENTRY(namecache) nc_vhash;	/* directory hash chain */
+	LIST_ENTRY(namecache) nc_dvlist;
 	struct	vnode *nc_dvp;		/* vnode of parent of name */
 	u_long	nc_dvpid;		/* capability number of nc_dvp */
+	LIST_ENTRY(namecache) nc_vlist;
 	struct	vnode *nc_vp;		/* vnode the name refers to */
 	u_long	nc_vpid;		/* capability number of nc_vp */
 	char	nc_nlen;		/* length of name */
Index: sys/vnode.h
===================================================================
--- sys/vnode.h	(revision 29)
+++ sys/vnode.h	(working copy)
@@ -48,6 +48,8 @@
 #include <uvm/uvm_object.h>	/* XXX */
 #include <uvm/uvm_extern.h>	/* XXX */
 
+struct namecache;
+
 /*
  * The vnode is the focus of all file activity in UNIX.  There is a
  * unique vnode allocated for each active file, each current directory,
@@ -100,6 +102,8 @@ struct vnode {
 	struct buflists	v_cleanblkhd;		/* clean blocklist head */
 	struct buflists	v_dirtyblkhd;		/* dirty blocklist head */
 	LIST_ENTRY(vnode) v_synclist;		/* vnodes with dirty buffers */
+	LIST_HEAD(, namecache) v_dnclist;	/* namecaches for children */
+	LIST_HEAD(, namecache) v_nclist;	/* namecaches for our parent */
 	union {
 		struct mount	*vu_mountedhere;/* ptr to mounted vfs (VDIR) */
 		struct socket	*vu_socket;	/* unix ipc (VSOCK) */
@@ -588,6 +592,7 @@ int	getvnode(struct filedesc *fdp, int f
 
 /* see vfssubr(9) */
 void	vfs_getnewfsid(struct mount *);
+int	vfs_drainvnodes(long target, struct proc *);
 #ifdef DDB
 void	vfs_vnode_print(struct vnode *, int, void (*)(const char *, ...));
 #endif /* DDB */