Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/fs/tmpfs - Rework and document inode reference counting....



details:   https://anonhg.NetBSD.org/src/rev/6718bfcf4de0
branches:  trunk
changeset: 765532:6718bfcf4de0
user:      rmind <rmind%NetBSD.org@localhost>
date:      Sun May 29 22:29:06 2011 +0000

description:
- Rework and document inode reference counting.  Also document inode life
  cycle (destruction part).  Perform link counting in tmpfs_dir_attach()
  and tmpfs_dir_detach(), instead of alloc/free and arbitrary places.
  Fixes PR/44285, PR/44288, PR/44657 and likely PR/42484.

- Fix the race between the lookup and inode destruction.  Fixes PR/43167
  and its duplicates PR/40088, PR/40757.

- Improve tmpfs_rename() locking a little, fix kqueue event notifications
  and also fix PR/43617.  Add simplistic tmpfs_parentcheck_p(); to be
  expanded and used for further rename() locking fixes.

- Cache directory entry "hint" in the tmpfs node, add tmpfs_dir_cached(),
  and thus avoid unnecessary lookup in tmpfs_remove() and tmpfs_rmdir().

- Set correct _PC_FILESIZEBITS value in tmpfs_pathconf().  Fixes PR/43576.

- A few minor fixes.

diffstat:

 sys/fs/tmpfs/tmpfs.h        |   65 +++++---
 sys/fs/tmpfs/tmpfs_subr.c   |  301 +++++++++++++++++++++++++++----------------
 sys/fs/tmpfs/tmpfs_vfsops.c |   46 +++--
 sys/fs/tmpfs/tmpfs_vnops.c  |  248 +++++++++++++++++++++--------------
 4 files changed, 405 insertions(+), 255 deletions(-)

diffs (truncated from 1233 to 300 lines):

diff -r 7e9416d8e12b -r 6718bfcf4de0 sys/fs/tmpfs/tmpfs.h
--- a/sys/fs/tmpfs/tmpfs.h      Sun May 29 22:14:53 2011 +0000
+++ b/sys/fs/tmpfs/tmpfs.h      Sun May 29 22:29:06 2011 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: tmpfs.h,v 1.43 2011/05/29 01:14:31 christos Exp $      */
+/*     $NetBSD: tmpfs.h,v 1.44 2011/05/29 22:29:06 rmind Exp $ */
 
 /*
  * Copyright (c) 2005, 2006, 2007 The NetBSD Foundation, Inc.
@@ -110,11 +110,29 @@
 typedef struct tmpfs_node {
        LIST_ENTRY(tmpfs_node)  tn_entries;
 
+       /*
+        * Each inode has a corresponding vnode.  It is a bi-directional
+        * association.  Whenever a vnode is allocated, its v_data field is
+        * set to the inode it references, and tmpfs_node_t::tn_vnode is
+        * set to point to the said vnode.
+        *
+        * Further attempts to allocate a vnode for this same node will
+        * result in returning a new reference to the value stored in
+        * tn_vnode.  It may be NULL when the node is unused (that is,
+        * no vnode has been allocated or it has been reclaimed).
+        */
+       kmutex_t                tn_vlock;
+       vnode_t *               tn_vnode;
+
+       /* Directory entry.  Only a hint, since hard link can have multiple. */
+       tmpfs_dirent_t *        tn_dirent_hint;
+
        /* The inode type: VBLK, VCHR, VDIR, VFIFO, VLNK, VREG or VSOCK. */
        enum vtype              tn_type;
 
-       /* Inode identifier. */
+       /* Inode identifier and generation number. */
        ino_t                   tn_id;
+       unsigned long           tn_gen;
 
        /* Inode status flags (for operations in delayed manner). */
        int                     tn_status;
@@ -132,25 +150,10 @@
        struct timespec         tn_mtime;
        struct timespec         tn_ctime;
        struct timespec         tn_birthtime;
-       unsigned long           tn_gen;
 
        /* Head of byte-level lock list (used by tmpfs_advlock). */
        struct lockf *          tn_lockf;
 
-       /*
-        * Each inode has a corresponding vnode.  It is a bi-directional
-        * association.  Whenever vnode is allocated, its v_data field is
-        * set to the inode it reference, and tmpfs_node_t::tn_vnode is
-        * set to point to the said vnode.
-        *
-        * Further attempts to allocate a vnode for this same node will
-        * result in returning a new reference to the value stored in
-        * tn_vnode.  It may be NULL when the node is unused (that is,
-        * no vnode has been allocated or it has been reclaimed).
-        */
-       kmutex_t                tn_vlock;
-       vnode_t *               tn_vnode;
-
        union {
                /* Type case: VBLK or VCHR. */
                struct {
@@ -200,6 +203,19 @@
 #define        TMPFS_NODE_STATUSALL    \
     (TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED)
 
+/*
+ * Bit indicating vnode reclamation.
+ * We abuse tmpfs_node_t::tn_gen for that.
+ */
+#define        TMPFS_NODE_GEN_MASK     (~0UL >> 1)
+#define        TMPFS_RECLAIMING_BIT    (~TMPFS_NODE_GEN_MASK)
+
+#define        TMPFS_NODE_RECLAIMING(node) \
+    (((node)->tn_gen & TMPFS_RECLAIMING_BIT) != 0)
+
+#define        TMPFS_NODE_GEN(node) \
+    ((node)->tn_gen & TMPFS_NODE_GEN_MASK)
+
 /* White-out inode indicator. */
 #define        TMPFS_NODE_WHITEOUT     ((tmpfs_node_t *)-1)
 
@@ -242,22 +258,23 @@
  */
 
 int            tmpfs_alloc_node(tmpfs_mount_t *, enum vtype, uid_t, gid_t,
-                   mode_t, tmpfs_node_t *, char *, dev_t, tmpfs_node_t **);
+                   mode_t, char *, dev_t, tmpfs_node_t **);
 void           tmpfs_free_node(tmpfs_mount_t *, tmpfs_node_t *);
 
 int            tmpfs_alloc_file(vnode_t *, vnode_t **, struct vattr *,
                    struct componentname *, char *);
 
-int            tmpfs_alloc_vp(struct mount *, tmpfs_node_t *, vnode_t **);
-void           tmpfs_free_vp(vnode_t *);
+int            tmpfs_vnode_get(struct mount *, tmpfs_node_t *, vnode_t **);
 
-int            tmpfs_alloc_dirent(tmpfs_mount_t *, tmpfs_node_t *,
-                   const char *, uint16_t, tmpfs_dirent_t **);
-void           tmpfs_free_dirent(tmpfs_mount_t *, tmpfs_dirent_t *, bool);
-void           tmpfs_dir_attach(vnode_t *, tmpfs_dirent_t *);
+int            tmpfs_alloc_dirent(tmpfs_mount_t *, const char *, uint16_t,
+                   tmpfs_dirent_t **);
+void           tmpfs_free_dirent(tmpfs_mount_t *, tmpfs_dirent_t *);
+void           tmpfs_dir_attach(vnode_t *, tmpfs_dirent_t *, tmpfs_node_t *);
 void           tmpfs_dir_detach(vnode_t *, tmpfs_dirent_t *);
 
 tmpfs_dirent_t *tmpfs_dir_lookup(tmpfs_node_t *, struct componentname *);
+tmpfs_dirent_t *tmpfs_dir_cached(tmpfs_node_t *);
+
 int            tmpfs_dir_getdotdent(tmpfs_node_t *, struct uio *);
 int            tmpfs_dir_getdotdotdent(tmpfs_node_t *, struct uio *);
 tmpfs_dirent_t *tmpfs_dir_lookupbycookie(tmpfs_node_t *, off_t);
diff -r 7e9416d8e12b -r 6718bfcf4de0 sys/fs/tmpfs/tmpfs_subr.c
--- a/sys/fs/tmpfs/tmpfs_subr.c Sun May 29 22:14:53 2011 +0000
+++ b/sys/fs/tmpfs/tmpfs_subr.c Sun May 29 22:29:06 2011 +0000
@@ -1,12 +1,12 @@
-/*     $NetBSD: tmpfs_subr.c,v 1.70 2011/05/25 02:03:22 rmind Exp $    */
+/*     $NetBSD: tmpfs_subr.c,v 1.71 2011/05/29 22:29:06 rmind Exp $    */
 
 /*
- * Copyright (c) 2005, 2006, 2007 The NetBSD Foundation, Inc.
+ * Copyright (c) 2005-2011 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
- * 2005 program.
+ * 2005 program, and by Mindaugas Rasiukevicius.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -31,12 +31,50 @@
  */
 
 /*
- * Efficient memory file system: functions for inode and directory entry
- * construction and destruction.
+ * Efficient memory file system: interfaces for inode and directory entry
+ * construction, destruction and manipulation.
+ *
+ * Reference counting
+ *
+ *     The link count of inode (tmpfs_node_t::tn_links) is used as a
+ *     reference counter.  However, it has slightly different semantics.
+ *
+ *     For directories, the link count represents the directory entries
+ *     which refer to directories.  In other words, it represents the
+ *     count of sub-directories.  It also takes into account the virtual
+ *     '.' entry (which has no real entry in the list).  For files, the
+ *     link count represents the hard links.  Since only empty directories
+ *     can be removed, the link count satisfies the reference counting
+ *     requirements.  Note: to check whether a directory is not empty, the
+ *     inode size (tmpfs_node_t::tn_size) can be used.
+ *
+ *     The inode itself, as an object, gathers its first reference when
+ *     a directory entry is attached via tmpfs_dir_attach(9).  For instance,
+ *     after regular tmpfs_create(), a file would have a link count of 1,
+ *     while directory after tmpfs_mkdir() would have 2 (due to '.').
+ *
+ * Reclamation
+ *
+ *     It should be noted that tmpfs inodes rely on a combination of vnode
+ *     reference counting and link counting.  That is, an inode can only be
+ *     destroyed if its associated vnode is inactive.  The destruction is
+ *     done on vnode reclamation, i.e. in tmpfs_reclaim().  Note that
+ *     tmpfs_node_t::tn_links being 0 is a destruction criterion.
+ *
+ *     If an inode has references within the file system (tn_links > 0) and
+ *     its inactive vnode gets reclaimed/recycled - then the association is
+ *     broken in tmpfs_reclaim().  In such case, an inode will always pass
+ *     tmpfs_lookup() and thus tmpfs_vnode_get() to associate a new vnode.
+ *
+ * Lock order
+ *
+ *     tmpfs_node_t::tn_vlock ->
+ *             vnode_t::v_vlock ->
+ *                     vnode_t::v_interlock
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: tmpfs_subr.c,v 1.70 2011/05/25 02:03:22 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: tmpfs_subr.c,v 1.71 2011/05/29 22:29:06 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/dirent.h>
@@ -65,9 +103,8 @@
  * insert it into the list of specified mount point.
  */
 int
-tmpfs_alloc_node(tmpfs_mount_t *tmp, enum vtype type, uid_t uid,
-    gid_t gid, mode_t mode, tmpfs_node_t *parent, char *target, dev_t rdev,
-    tmpfs_node_t **node)
+tmpfs_alloc_node(tmpfs_mount_t *tmp, enum vtype type, uid_t uid, gid_t gid,
+    mode_t mode, char *target, dev_t rdev, tmpfs_node_t **node)
 {
        tmpfs_node_t *nnode;
 
@@ -76,22 +113,25 @@
                return ENOSPC;
        }
 
+       /* Initially, no references and no associations. */
+       nnode->tn_links = 0;
+       nnode->tn_vnode = NULL;
+       nnode->tn_dirent_hint = NULL;
+
        /*
         * XXX Where the pool is backed by a map larger than (4GB *
         * sizeof(*nnode)), this may produce duplicate inode numbers
         * for applications that do not understand 64-bit ino_t.
         */
        nnode->tn_id = (ino_t)((uintptr_t)nnode / sizeof(*nnode));
-       nnode->tn_gen = arc4random();
+       nnode->tn_gen = TMPFS_NODE_GEN_MASK & arc4random();
 
        /* Generic initialization. */
        nnode->tn_type = type;
        nnode->tn_size = 0;
        nnode->tn_status = 0;
        nnode->tn_flags = 0;
-       nnode->tn_links = 0;
        nnode->tn_lockf = NULL;
-       nnode->tn_vnode = NULL;
 
        vfs_timestamp(&nnode->tn_atime);
        nnode->tn_birthtime = nnode->tn_atime;
@@ -112,18 +152,13 @@
                nnode->tn_spec.tn_dev.tn_rdev = rdev;
                break;
        case VDIR:
-               /*
-                * Directory.  Parent must be specified, unless allocating
-                * the root inode.
-                */
-               KASSERT(parent || tmp->tm_root == NULL);
-               KASSERT(parent != nnode);
-
+               /* Directory. */
                TAILQ_INIT(&nnode->tn_spec.tn_dir.tn_dir);
-               nnode->tn_spec.tn_dir.tn_parent =
-                   (parent == NULL) ? nnode : parent;
+               nnode->tn_spec.tn_dir.tn_parent = NULL;
                nnode->tn_spec.tn_dir.tn_readdir_lastn = 0;
                nnode->tn_spec.tn_dir.tn_readdir_lastp = NULL;
+
+               /* Extra link count for the virtual '.' entry. */
                nnode->tn_links++;
                break;
        case VFIFO:
@@ -200,8 +235,11 @@
                }
                break;
        case VDIR:
-               /* KASSERT(TAILQ_EMPTY(&node->tn_spec.tn_dir.tn_dir)); */
-               KASSERT(node->tn_spec.tn_dir.tn_parent || node == tmp->tm_root);
+               /*
+                * KASSERT(TAILQ_EMPTY(&node->tn_spec.tn_dir.tn_dir));
+                * KASSERT(node->tn_spec.tn_dir.tn_parent == NULL ||
+                *     node == tmp->tm_root);
+                */
                break;
        default:
                break;
@@ -212,28 +250,34 @@
 }
 
 /*
- * tmpfs_alloc_vp: allocate or reclaim a vnode for a specified inode.
+ * tmpfs_vnode_get: allocate or reclaim a vnode for a specified inode.
  *
+ * => Must be called with tmpfs_node_t::tn_vlock held.
  * => Returns vnode (*vpp) locked.
  */
 int
-tmpfs_alloc_vp(struct mount *mp, tmpfs_node_t *node, vnode_t **vpp)
+tmpfs_vnode_get(struct mount *mp, tmpfs_node_t *node, vnode_t **vpp)
 {
        vnode_t *vp;
        int error;
 again:
        /* If there is already a vnode, try to reclaim it. */
-       mutex_enter(&node->tn_vlock);
        if ((vp = node->tn_vnode) != NULL) {
+               atomic_or_ulong(&node->tn_gen, TMPFS_RECLAIMING_BIT);
                mutex_enter(&vp->v_interlock);
                mutex_exit(&node->tn_vlock);
                error = vget(vp, LK_EXCLUSIVE);
                if (error == ENOENT) {
+                       mutex_enter(&node->tn_vlock);
                        goto again;
                }
+               atomic_and_ulong(&node->tn_gen, ~TMPFS_RECLAIMING_BIT);
                *vpp = vp;
                return error;
        }



Home | Main Index | Thread Index | Old Index