Subject: make vnodes freeable
To: tech-kern@netbsd.org
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: tech-kern
Date: 07/02/2003 23:36:58
hi,
the following patch makes vnodes freeable,
i.e. it lets you decrease kern.maxvnodes.
if no one objects, i'll check it in in a few days.
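
for illustration only (this is not part of the patch), here's a minimal
userland sketch of shrinking the limit via sysctl(2) with the
KERN_MAXVNODES mib; with the patch, a smaller value is drained via
vfs_drainvnodes() instead of being rejected with EINVAL, and EBUSY is
returned if enough clean vnodes can't be reclaimed:

/*
 * sketch only, not part of the patch: lower kern.maxvnodes from userland.
 * usage: ./shrinkvnodes <new-limit>   (needs root to set the new value)
 */
#include <sys/param.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(int argc, char *argv[])
{
	int mib[2] = { CTL_KERN, KERN_MAXVNODES };
	int oldval, newval;
	size_t oldlen = sizeof(oldval);

	if (argc != 2)
		errx(1, "usage: %s new-maxvnodes", argv[0]);
	newval = atoi(argv[1]);

	/*
	 * read the old limit and install the new one in a single call;
	 * with the patch, shrinking may fail with EBUSY if the kernel
	 * can't drain enough clean vnodes.
	 */
	if (sysctl(mib, 2, &oldval, &oldlen, &newval, sizeof(newval)) == -1)
		err(1, "sysctl kern.maxvnodes");

	printf("kern.maxvnodes: %d -> %d\n", oldval, newval);
	return 0;
}

(the same thing can of course be done with "sysctl -w kern.maxvnodes=N".)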
YAMAMOTO Takashi
Index: kern/kern_sysctl.c
===================================================================
--- kern/kern_sysctl.c (revision 182)
+++ kern/kern_sysctl.c (working copy)
@@ -375,8 +375,11 @@ kern_sysctl(int *name, u_int namelen, vo
error = sysctl_int(oldp, oldlenp, newp, newlen, &desiredvnodes);
if (newp && !error) {
if (old_vnodes > desiredvnodes) {
- desiredvnodes = old_vnodes;
- return (EINVAL);
+ error = vfs_drainvnodes(desiredvnodes, p);
+ if (error) {
+ desiredvnodes = old_vnodes;
+ return error;
+ }
}
vfs_reinit();
nchreinit();
Index: kern/vfs_cache.c
===================================================================
--- kern/vfs_cache.c (revision 154)
+++ kern/vfs_cache.c (working copy)
@@ -97,6 +97,45 @@ int doingcache = 1; /* 1 => enable the
/* A single lock to protect cache insertion, removal and lookup */
static struct simplelock namecache_slock = SIMPLELOCK_INITIALIZER;
+static void cache_remove(struct namecache *);
+static void cache_free(struct namecache *);
+
+static void
+cache_remove(struct namecache *ncp)
+{
+
+ LOCK_ASSERT(simple_lock_held(&namecache_slock));
+
+ ncp->nc_dvp = NULL;
+ ncp->nc_vp = NULL;
+
+ TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
+ if (ncp->nc_hash.le_prev != NULL) {
+ LIST_REMOVE(ncp, nc_hash);
+ ncp->nc_hash.le_prev = NULL;
+ }
+ if (ncp->nc_vhash.le_prev != NULL) {
+ LIST_REMOVE(ncp, nc_vhash);
+ ncp->nc_vhash.le_prev = NULL;
+ }
+ if (ncp->nc_vlist.le_prev != NULL) {
+ LIST_REMOVE(ncp, nc_vlist);
+ ncp->nc_vlist.le_prev = NULL;
+ }
+ if (ncp->nc_dvlist.le_prev != NULL) {
+ LIST_REMOVE(ncp, nc_dvlist);
+ ncp->nc_dvlist.le_prev = NULL;
+ }
+}
+
+static void
+cache_free(struct namecache *ncp)
+{
+
+ pool_put(&namecache_pool, ncp);
+ numcache--; /* XXX MP */
+}
+
/*
* Look for a the name in the cache. We don't do this
* if the segment name is long, simply so the cache can avoid
@@ -370,15 +409,7 @@ cache_enter(struct vnode *dvp, struct vn
memset(ncp, 0, sizeof(*ncp));
simple_lock(&namecache_slock);
} else if ((ncp = TAILQ_FIRST(&nclruhead)) != NULL) {
- TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
- if (ncp->nc_hash.le_prev != NULL) {
- LIST_REMOVE(ncp, nc_hash);
- ncp->nc_hash.le_prev = NULL;
- }
- if (ncp->nc_vhash.le_prev != NULL) {
- LIST_REMOVE(ncp, nc_vhash);
- ncp->nc_vhash.le_prev = NULL;
- }
+ cache_remove(ncp);
} else {
simple_unlock(&namecache_slock);
return;
@@ -396,6 +427,9 @@ cache_enter(struct vnode *dvp, struct vn
}
/* Fill in cache info. */
ncp->nc_dvp = dvp;
+ LIST_INSERT_HEAD(&dvp->v_dnclist, ncp, nc_dvlist);
+ if (vp)
+ LIST_INSERT_HEAD(&vp->v_nclist, ncp, nc_vlist);
ncp->nc_dvpid = dvp->v_id;
ncp->nc_nlen = cnp->cn_namelen;
memcpy(ncp->nc_name, cnp->cn_nameptr, (unsigned)ncp->nc_nlen);
@@ -494,11 +528,21 @@ nchreinit(void)
void
cache_purge(struct vnode *vp)
{
- struct namecache *ncp;
+ struct namecache *ncp, *ncnext;
struct nchashhead *ncpp;
static u_long nextvnodeid;
simple_lock(&namecache_slock);
+ for (ncp = LIST_FIRST(&vp->v_nclist); ncp != NULL; ncp = ncnext) {
+ ncnext = LIST_NEXT(ncp, nc_vlist);
+ cache_remove(ncp);
+ cache_free(ncp);
+ }
+ for (ncp = LIST_FIRST(&vp->v_dnclist); ncp != NULL; ncp = ncnext) {
+ ncnext = LIST_NEXT(ncp, nc_dvlist);
+ cache_remove(ncp);
+ cache_free(ncp);
+ }
vp->v_id = ++nextvnodeid;
if (nextvnodeid != 0)
goto out;
@@ -529,18 +573,8 @@ cache_purgevfs(struct mount *mp)
continue;
}
/* Free the resources we had. */
- ncp->nc_vp = NULL;
- ncp->nc_dvp = NULL;
- TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
- if (ncp->nc_hash.le_prev != NULL) {
- LIST_REMOVE(ncp, nc_hash);
- ncp->nc_hash.le_prev = NULL;
- }
- if (ncp->nc_vhash.le_prev != NULL) {
- LIST_REMOVE(ncp, nc_vhash);
- ncp->nc_vhash.le_prev = NULL;
- }
- TAILQ_INSERT_HEAD(&nclruhead, ncp, nc_lru);
+ cache_remove(ncp);
+ cache_free(ncp);
}
simple_unlock(&namecache_slock);
}
Index: kern/vfs_subr.c
===================================================================
--- kern/vfs_subr.c (revision 154)
+++ kern/vfs_subr.c (working copy)
@@ -191,6 +191,7 @@ static int vfs_hang_addrlist __P((struct
struct export_args *));
static int vfs_free_netcred __P((struct radix_node *, void *));
static void vfs_free_addrlist __P((struct netexport *));
+static struct vnode *getcleanvnode __P((struct proc *));
#ifdef DEBUG
void printlockedvnodes __P((void));
@@ -212,6 +213,80 @@ vntblinit()
vn_initialize_syncerd();
}
+int
+vfs_drainvnodes(long target, struct proc *p)
+{
+
+ simple_lock(&vnode_free_list_slock);
+ while (numvnodes > target) {
+ struct vnode *vp;
+
+ vp = getcleanvnode(p);
+ if (vp == NULL)
+ return EBUSY; /* give up */
+ pool_put(&vnode_pool, vp);
+ simple_lock(&vnode_free_list_slock);
+ numvnodes--;
+ }
+ simple_unlock(&vnode_free_list_slock);
+
+ return 0;
+}
+
+/*
+ * grab a vnode from freelist and clean it.
+ */
+struct vnode *
+getcleanvnode(p)
+ struct proc *p;
+{
+ struct vnode *vp;
+ struct freelst *listhd;
+
+ LOCK_ASSERT(simple_lock_held(&vnode_free_list_slock));
+ if ((vp = TAILQ_FIRST(listhd = &vnode_free_list)) == NULL)
+ vp = TAILQ_FIRST(listhd = &vnode_hold_list);
+ for (; vp != NULL; vp = TAILQ_NEXT(vp, v_freelist)) {
+ if (simple_lock_try(&vp->v_interlock)) {
+ if ((vp->v_flag & VLAYER) == 0) {
+ break;
+ }
+ if (VOP_ISLOCKED(vp) == 0)
+ break;
+ else
+ simple_unlock(&vp->v_interlock);
+ }
+ }
+
+ if (vp == NULLVP) {
+ simple_unlock(&vnode_free_list_slock);
+ return NULLVP;
+ }
+
+ if (vp->v_usecount)
+ panic("free vnode isn't, vp %p", vp);
+ TAILQ_REMOVE(listhd, vp, v_freelist);
+ /* see comment on why 0xdeadb is set at end of vgone (below) */
+ vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
+ simple_unlock(&vnode_free_list_slock);
+ vp->v_lease = NULL;
+
+ if (vp->v_type != VBAD)
+ vgonel(vp, p);
+ else
+ simple_unlock(&vp->v_interlock);
+#ifdef DIAGNOSTIC
+ if (vp->v_data || vp->v_uobj.uo_npages ||
+ TAILQ_FIRST(&vp->v_uobj.memq))
+ panic("cleaned vnode isn't, vp %p", vp);
+ if (vp->v_numoutput)
+ panic("clean vnode has pending I/O's, vp %p", vp);
+#endif
+ KASSERT((vp->v_flag & VONWORKLST) == 0);
+
+ return vp;
+}
+
/*
* Mark a mount point as busy. Used to synchronize access and to delay
* unmounting. Interlock is not released on failure.
@@ -434,7 +509,6 @@ getnewvnode(tag, mp, vops, vpp)
extern struct uvm_pagerops uvm_vnodeops;
struct uvm_object *uobj;
struct proc *p = curproc; /* XXX */
- struct freelst *listhd;
static int toggle;
struct vnode *vp;
int error = 0, tryalloc;
@@ -494,26 +568,13 @@ getnewvnode(tag, mp, vops, vpp)
TAILQ_INIT(&uobj->memq);
numvnodes++;
} else {
- if ((vp = TAILQ_FIRST(listhd = &vnode_free_list)) == NULL)
- vp = TAILQ_FIRST(listhd = &vnode_hold_list);
- for (; vp != NULL; vp = TAILQ_NEXT(vp, v_freelist)) {
- if (simple_lock_try(&vp->v_interlock)) {
- if ((vp->v_flag & VLAYER) == 0) {
- break;
- }
- if (VOP_ISLOCKED(vp) == 0)
- break;
- else
- simple_unlock(&vp->v_interlock);
- }
- }
+ vp = getcleanvnode(p);
/*
* Unless this is a bad time of the month, at most
* the first NCPUS items on the free list are
* locked, so this is close enough to being empty.
*/
if (vp == NULLVP) {
- simple_unlock(&vnode_free_list_slock);
if (mp && error != EDEADLK)
vfs_unbusy(mp);
if (tryalloc) {
@@ -526,26 +587,6 @@ getnewvnode(tag, mp, vops, vpp)
*vpp = 0;
return (ENFILE);
}
- if (vp->v_usecount)
- panic("free vnode isn't, vp %p", vp);
- TAILQ_REMOVE(listhd, vp, v_freelist);
- /* see comment on why 0xdeadb is set at end of vgone (below) */
- vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
- simple_unlock(&vnode_free_list_slock);
- vp->v_lease = NULL;
-
- if (vp->v_type != VBAD)
- vgonel(vp, p);
- else
- simple_unlock(&vp->v_interlock);
-#ifdef DIAGNOSTIC
- if (vp->v_data || vp->v_uobj.uo_npages ||
- TAILQ_FIRST(&vp->v_uobj.memq))
- panic("cleaned vnode isn't, vp %p", vp);
- if (vp->v_numoutput)
- panic("clean vnode has pending I/O's, vp %p", vp);
-#endif
- KASSERT((vp->v_flag & VONWORKLST) == 0);
vp->v_flag = 0;
vp->v_socket = NULL;
#ifdef VERIFIED_EXEC
@@ -1759,7 +1800,7 @@ vgonel(vp, p)
* If it is on the freelist and not already at the head,
* move it to the head of the list. The test of the back
* pointer and the reference count of zero is because
- * it will be removed from the free list by getnewvnode,
+ * it will be removed from the free list by getcleanvnode,
* but will not have its reference count incremented until
* after calling vgone. If the reference count were
* incremented first, vgone would (incorrectly) try to
@@ -1769,18 +1810,26 @@ vgonel(vp, p)
* that we do not try to move it here.
*/
+ vp->v_type = VBAD;
if (vp->v_usecount == 0) {
+ boolean_t dofree;
+
simple_lock(&vnode_free_list_slock);
if (vp->v_holdcnt > 0)
panic("vgonel: not clean, vp %p", vp);
- if (vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb &&
- TAILQ_FIRST(&vnode_free_list) != vp) {
+ /*
+ * if it isn't on the freelist, we're called by getcleanvnode.
+ * otherwise, we'll free it.
+ */
+ dofree = vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb;
+ if (dofree) {
TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
- TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
+ numvnodes--;
}
simple_unlock(&vnode_free_list_slock);
+ if (dofree)
+ pool_put(&vnode_pool, vp);
}
- vp->v_type = VBAD;
}
/*
Index: sys/namei.h
===================================================================
--- sys/namei.h (revision 1)
+++ sys/namei.h (working copy)
@@ -166,8 +166,10 @@ struct namecache {
LIST_ENTRY(namecache) nc_hash; /* hash chain */
TAILQ_ENTRY(namecache) nc_lru; /* LRU chain */
LIST_ENTRY(namecache) nc_vhash; /* directory hash chain */
+ LIST_ENTRY(namecache) nc_dvlist;
struct vnode *nc_dvp; /* vnode of parent of name */
u_long nc_dvpid; /* capability number of nc_dvp */
+ LIST_ENTRY(namecache) nc_vlist;
struct vnode *nc_vp; /* vnode the name refers to */
u_long nc_vpid; /* capability number of nc_vp */
char nc_nlen; /* length of name */
Index: sys/vnode.h
===================================================================
--- sys/vnode.h (revision 29)
+++ sys/vnode.h (working copy)
@@ -48,6 +48,8 @@
#include <uvm/uvm_object.h> /* XXX */
#include <uvm/uvm_extern.h> /* XXX */
+struct namecache;
+
/*
* The vnode is the focus of all file activity in UNIX. There is a
* unique vnode allocated for each active file, each current directory,
@@ -100,6 +102,8 @@ struct vnode {
struct buflists v_cleanblkhd; /* clean blocklist head */
struct buflists v_dirtyblkhd; /* dirty blocklist head */
LIST_ENTRY(vnode) v_synclist; /* vnodes with dirty buffers */
+ LIST_HEAD(, namecache) v_dnclist; /* namecaches for children */
+ LIST_HEAD(, namecache) v_nclist; /* namecaches for our parent */
union {
struct mount *vu_mountedhere;/* ptr to mounted vfs (VDIR) */
struct socket *vu_socket; /* unix ipc (VSOCK) */
@@ -588,6 +592,7 @@ int getvnode(struct filedesc *fdp, int f
/* see vfssubr(9) */
void vfs_getnewfsid(struct mount *);
+int vfs_drainvnodes(long target, struct proc *);
#ifdef DDB
void vfs_vnode_print(struct vnode *, int, void (*)(const char *, ...));
#endif /* DDB */