Subject: Re: Now: Fs suspension take 2
To: Juergen Hannken-Illjes <hannken@eis.cs.tu-bs.de>
From: Bill Studenmund <wrstuden@netbsd.org>
List: tech-kern
Date: 08/18/2006 15:02:13
--GvXjxJ+pjyke8COw
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable

On Thu, Aug 03, 2006 at 04:11:29PM +0200, Juergen Hannken-Illjes wrote:
> On Wed, Jul 05, 2006 at 10:30:32AM -0700, Bill Studenmund wrote:
> > On Tue, Jul 04, 2006 at 04:23:55PM +0200, Juergen Hannken-Illjes wrote:
> > > On Mon, Jul 03, 2006 at 03:51:35PM -0700, Jason Thorpe wrote:
>=20
> [snip]
>=20
> > > 2) Put all gates inside the file systems as
> >=20
> > My preference.
>=20
> Ok, the attached diff puts transaction locks so we can suspend ffs file s=
ystems.

Thank you. This is a lot of work!

> - The fstrans implementation goes to files sys/fstrans.h and kern/vfs_tra=
ns.c.
>=20
> - Add new VFS operation VFS_SUSPENDCTL(MP, C) to request a file system to
>   suspend/resume.
>=20
> - Transaction locks are used if mnt_iflag has IMNT_HAS_TRANS.  Otherwise =
the
>   current (vn_start_write) are used.  With this diff IMNT_HAS_TRANS gets
>   set for ffs type file systems if kernel option NEWVNGATE is set.

How many file systems really supported the old suspension method other=20
than ffs? Or is this coexistance more of a smoothing of switching to the=20
new method for ffs?

i.e. do we need this?

> - The ufs_lock/ufs_unlock part may look strange but it is ok to ignore
>   vnode lock requests for the thread currently suspending.

I think it's probably fine to ignore lock requests while the system is=20
suspending. However we need to make sure that locking and unlocking stay=20
on the same side of the is-suspending state.

i.e. any locks we grab while suspending MUST be released before we=20
finish suspending. And any locks grabbed before suspending MUST be=20
released after we finish. Otherwise we can leak locks.

It could be that all we have to do is document this well.

> - I left the throttling part of my first attempt because it is only needed
>   for multiple softdep enabled file systems on the same disk.  This probl=
em
>   should be solved by a congestion control inside softdep.


> @@ -762,17 +817,21 @@ ufs_whiteout(void *v)
>  	int			error;
>  	struct ufsmount		*ump =3D VFSTOUFS(dvp->v_mount);
> =20
>  	error =3D 0;
> +
>  	switch (ap->a_flags) {
>  	case LOOKUP:
>  		/* 4.4 format directories support whiteout operations */
> +		fstrans_done(dvp->v_mount);

Why do we call fstrans_done() here? CREATE and DELETE _start_=20
transactions, so I don't see where the transaction we're _done'ing starts.

>  		if (ump->um_maxsymlinklen > 0)
>  			return (0);
>  		return (EOPNOTSUPP);
> =20
>  	case CREATE:
>  		/* create a new directory whiteout */
> +		if ((error =3D fstrans_start(dvp->v_mount, FSTRANS_SHARED)) !=3D 0)
> +			return error;
>  #ifdef DIAGNOSTIC
>  		if ((cnp->cn_flags & SAVENAME) =3D=3D 0)
>  			panic("ufs_whiteout: missing name");
>  		if (ump->um_maxsymlinklen <=3D 0)
> @@ -791,8 +850,10 @@ ufs_whiteout(void *v)
>  		break;
> =20
>  	case DELETE:
>  		/* remove an existing directory whiteout */
> +		if ((error =3D fstrans_start(dvp->v_mount, FSTRANS_SHARED)) !=3D 0)
> +			return error;
>  #ifdef DIAGNOSTIC
>  		if (ump->um_maxsymlinklen <=3D 0)
>  			panic("ufs_whiteout: old format filesystem");
>  #endif
> @@ -807,8 +868,9 @@ ufs_whiteout(void *v)
>  	if (cnp->cn_flags & HASBUF) {
>  		PNBUF_PUT(cnp->cn_pnbuf);
>  		cnp->cn_flags &=3D ~HASBUF;
>  	}
> +	fstrans_done(dvp->v_mount);
>  	return (error);
>  }


> +/*
> + * Return whether or not the node is locked.
> + */
> +int
> +ufs_islocked(void *v)
> +{
> +	struct vop_islocked_args /* {
> +		struct vnode *a_vp;
> +	} */ *ap =3D v;
> +	struct vnode *vp =3D ap->a_vp;
> +
> +	return (lockstatus(vp->v_vnlock));
> +}

This routine looks like the genfs routine. Do we really need a separate=20
one?

> Index: sys/ufs/ffs/ffs_snapshot.c
> =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
> RCS file: /cvsroot/src/sys/ufs/ffs/ffs_snapshot.c,v
> retrieving revision 1.30
> diff -p -u -4 -r1.30 ffs_snapshot.c
> --- sys/ufs/ffs/ffs_snapshot.c	7 Jun 2006 22:34:19 -0000	1.30
> +++ sys/ufs/ffs/ffs_snapshot.c	30 Jul 2006 16:54:09 -0000
> @@ -58,8 +58,9 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_snapshot
>  #include <sys/resource.h>
>  #include <sys/resourcevar.h>
>  #include <sys/vnode.h>
>  #include <sys/kauth.h>
> +#include <sys/fstrans.h>
> =20
>  #include <miscfs/specfs/specdev.h>
> =20
>  #include <ufs/ufs/quota.h>
> @@ -284,9 +285,9 @@ ffs_snapshot(struct mount *mp, struct vn
>  	 * All allocations are done, so we can now snapshot the system.
>  	 *
>  	 * Suspend operation on filesystem.
>  	 */
> -	if ((error =3D vfs_write_suspend(vp->v_mount, PUSER|PCATCH, 0)) !=3D 0)=
 {
> +	if ((error =3D vfs_suspend(vp->v_mount, 0)) !=3D 0) {
>  		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
>  		goto out;
>  	}
>  	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
> @@ -378,25 +379,33 @@ loop:
>  			VI_UNLOCK(xvp);
>  			MNT_ILOCK(mp);
>  			continue;
>  		}
> +#ifndef NEWVNGATE
>  		if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) !=3D 0) {
>  			MNT_ILOCK(mp);
>  			goto loop;
>  		}
> +#else /* NEWVNGATE */
> +		VI_UNLOCK(xvp);
> +#endif /* NEWVNGATE */

VI_UNLOCK()?

> Index: sys/kern/vfs_subr.c
> =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
> RCS file: /cvsroot/src/sys/kern/vfs_subr.c,v
> retrieving revision 1.267
> diff -p -u -4 -r1.267 vfs_subr.c
> --- sys/kern/vfs_subr.c	23 Jun 2006 14:13:02 -0000	1.267
> +++ sys/kern/vfs_subr.c	30 Jul 2006 16:54:06 -0000

> @@ -1203,9 +1204,16 @@ vget(struct vnode *vp, int flags)
>  	}
>  #endif
>  	if (flags & LK_TYPE_MASK) {
>  		if ((error =3D vn_lock(vp, flags | LK_INTERLOCK))) {
> -			vrele(vp);
> +			simple_lock(&vp->v_interlock);
> +			if (vp->v_usecount > 1) {
> +				vp->v_usecount--;
> +				simple_unlock(&vp->v_interlock);
> +			} else {
> +				simple_unlock(&vp->v_interlock);
> +				vrele(vp);
> +			}
>  		}
>  		return (error);
>  	}
>  	simple_unlock(&vp->v_interlock);

Why doe we need the above?

> Index: sys/uvm/uvm_pdaemon.c
> =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
> RCS file: /cvsroot/src/sys/uvm/uvm_pdaemon.c,v
> retrieving revision 1.76
> diff -p -u -4 -r1.76 uvm_pdaemon.c
> --- sys/uvm/uvm_pdaemon.c	14 Feb 2006 15:06:27 -0000	1.76
> +++ sys/uvm/uvm_pdaemon.c	30 Jul 2006 16:54:13 -0000
> @@ -82,8 +82,9 @@ __KERNEL_RCSID(0, "$NetBSD: uvm_pdaemon.
>  #include <sys/kernel.h>
>  #include <sys/pool.h>
>  #include <sys/buf.h>
>  #include <sys/vnode.h>
> +#include <sys/fstrans.h>
> =20
>  #include <uvm/uvm.h>
> =20
>  /*
> @@ -174,13 +175,8 @@ uvmpd_tune(void)
>  	UVMHIST_FUNC("uvmpd_tune"); UVMHIST_CALLED(pdhist);
> =20
>  	uvmexp.freemin =3D uvmexp.npages / 20;
> =20
> -	/* between 16k and 256k */
> -	/* XXX:  what are these values good for? */
> -	uvmexp.freemin =3D MAX(uvmexp.freemin, (16*1024) >> PAGE_SHIFT);
> -	uvmexp.freemin =3D MIN(uvmexp.freemin, (256*1024) >> PAGE_SHIFT);
> -
>  	/* Make sure there's always a user page free. */
>  	if (uvmexp.freemin < uvmexp.reserve_kernel + 1)
>  		uvmexp.freemin =3D uvmexp.reserve_kernel + 1;

Why make the above change? It looks like an unrelated policy change, that=
=20
at least should be a separate checkin.

> Index: common/lib/libc/gmon/mcount.c
> =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
> RCS file: /cvsroot/src/common/lib/libc/gmon/mcount.c,v
> retrieving revision 1.5
> diff -p -u -4 -r1.5 mcount.c
> --- common/lib/libc/gmon/mcount.c	8 Jan 2006 07:46:39 -0000	1.5
> +++ common/lib/libc/gmon/mcount.c	30 Jul 2006 16:52:50 -0000
> @@ -94,9 +94,9 @@ struct gmonparam *_m_gmon_alloc(void);
>  #endif
> =20
>  _MCOUNT_DECL __P((u_long, u_long))
>  #ifdef _KERNEL
> -    __attribute__((__unused__,__no_instrument_function__));	/* see below=
. */
> +    __attribute__((__used__,__no_instrument_function__));	/* see below. =
*/
>  #else
>  #ifdef __vax__
>      __attribute__((__unused__));	/* see below. */
>  #else

??

> Index: sys/arch/sparc64/dev/psm.c
> =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
> RCS file: /cvsroot/src/sys/arch/sparc64/dev/psm.c,v
> retrieving revision 1.2
> diff -p -u -4 -r1.2 psm.c
> --- sys/arch/sparc64/dev/psm.c	12 Jul 2006 15:03:24 -0000	1.2
> +++ sys/arch/sparc64/dev/psm.c	30 Jul 2006 16:54:02 -0000
> @@ -127,9 +127,9 @@ STATIC void psm_attach(struct device *,=20
>  CFATTACH_DECL(psm, sizeof(struct psm_softc),
>      psm_match, psm_attach, NULL, NULL);
> =20
> =20
> -static int
> +STATIC int
>  psm_match(struct device *parent, struct cfdata *cf, void *aux)
>  {
>  	struct ebus_attach_args *ea =3D aux;
> =20
> @@ -137,9 +137,9 @@ psm_match(struct device *parent, struct=20
>  		return (0);
>  	return (1);
>  }
> =20
> -static void
> +STATIC void
>  psm_attach(struct device *parent, struct device *self, void *aux)
>  {
>  	struct psm_softc	*sc =3D (struct psm_softc *)self;
>  	struct ebus_attach_args	*ea =3D aux;

??

Take care,

Bill

--GvXjxJ+pjyke8COw
Content-Type: application/pgp-signature
Content-Disposition: inline

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.3 (NetBSD)

iD8DBQFE5jjlWz+3JHUci9cRArvVAJwPJ0E7pPUZVaI7Mef9XDG4A/TwHwCeMOiB
9fH09fqQWL3f9Z3V/deyYsE=
=DgNg
-----END PGP SIGNATURE-----

--GvXjxJ+pjyke8COw--