Subject: Re: Now: Fs suspension take 2
To: Juergen Hannken-Illjes <hannken@eis.cs.tu-bs.de>
From: Bill Studenmund <wrstuden@netbsd.org>
List: tech-kern
Date: 08/18/2006 15:02:13
--GvXjxJ+pjyke8COw
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable
On Thu, Aug 03, 2006 at 04:11:29PM +0200, Juergen Hannken-Illjes wrote:
> On Wed, Jul 05, 2006 at 10:30:32AM -0700, Bill Studenmund wrote:
> > On Tue, Jul 04, 2006 at 04:23:55PM +0200, Juergen Hannken-Illjes wrote:
> > > On Mon, Jul 03, 2006 at 03:51:35PM -0700, Jason Thorpe wrote:
>=20
> [snip]
>=20
> > > 2) Put all gates inside the file systems as
> >=20
> > My preference.
>=20
> Ok, the attached diff puts transaction locks so we can suspend ffs file s=
ystems.
Thank you. This is a lot of work!
> - The fstrans implementation goes to files sys/fstrans.h and kern/vfs_tra=
ns.c.
>=20
> - Add new VFS operation VFS_SUSPENDCTL(MP, C) to request a file system to
> suspend/resume.
>=20
> - Transaction locks are used if mnt_iflag has IMNT_HAS_TRANS. Otherwise =
the
> current (vn_start_write) are used. With this diff IMNT_HAS_TRANS gets
> set for ffs type file systems if kernel option NEWVNGATE is set.
How many file systems really supported the old suspension method other=20
than ffs? Or is this coexistance more of a smoothing of switching to the=20
new method for ffs?
i.e. do we need this?
> - The ufs_lock/ufs_unlock part may look strange but it is ok to ignore
> vnode lock requests for the thread currently suspending.
I think it's probably fine to ignore lock requests while the system is=20
suspending. However we need to make sure that locking and unlocking stay=20
on the same side of the is-suspending state.
i.e. any locks we grab while suspending MUST be released before we=20
finish suspending. And any locks grabbed before suspending MUST be=20
released after we finish. Otherwise we can leak locks.
It could be that all we have to do is document this well.
> - I left the throttling part of my first attempt because it is only needed
> for multiple softdep enabled file systems on the same disk. This probl=
em
> should be solved by a congestion control inside softdep.
> @@ -762,17 +817,21 @@ ufs_whiteout(void *v)
> int error;
> struct ufsmount *ump =3D VFSTOUFS(dvp->v_mount);
> =20
> error =3D 0;
> +
> switch (ap->a_flags) {
> case LOOKUP:
> /* 4.4 format directories support whiteout operations */
> + fstrans_done(dvp->v_mount);
Why do we call fstrans_done() here? CREATE and DELETE _start_=20
transactions, so I don't see where the transaction we're _done'ing starts.
> if (ump->um_maxsymlinklen > 0)
> return (0);
> return (EOPNOTSUPP);
> =20
> case CREATE:
> /* create a new directory whiteout */
> + if ((error =3D fstrans_start(dvp->v_mount, FSTRANS_SHARED)) !=3D 0)
> + return error;
> #ifdef DIAGNOSTIC
> if ((cnp->cn_flags & SAVENAME) =3D=3D 0)
> panic("ufs_whiteout: missing name");
> if (ump->um_maxsymlinklen <=3D 0)
> @@ -791,8 +850,10 @@ ufs_whiteout(void *v)
> break;
> =20
> case DELETE:
> /* remove an existing directory whiteout */
> + if ((error =3D fstrans_start(dvp->v_mount, FSTRANS_SHARED)) !=3D 0)
> + return error;
> #ifdef DIAGNOSTIC
> if (ump->um_maxsymlinklen <=3D 0)
> panic("ufs_whiteout: old format filesystem");
> #endif
> @@ -807,8 +868,9 @@ ufs_whiteout(void *v)
> if (cnp->cn_flags & HASBUF) {
> PNBUF_PUT(cnp->cn_pnbuf);
> cnp->cn_flags &=3D ~HASBUF;
> }
> + fstrans_done(dvp->v_mount);
> return (error);
> }
> +/*
> + * Return whether or not the node is locked.
> + */
> +int
> +ufs_islocked(void *v)
> +{
> + struct vop_islocked_args /* {
> + struct vnode *a_vp;
> + } */ *ap =3D v;
> + struct vnode *vp =3D ap->a_vp;
> +
> + return (lockstatus(vp->v_vnlock));
> +}
This routine looks like the genfs routine. Do we really need a separate=20
one?
> Index: sys/ufs/ffs/ffs_snapshot.c
> =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
> RCS file: /cvsroot/src/sys/ufs/ffs/ffs_snapshot.c,v
> retrieving revision 1.30
> diff -p -u -4 -r1.30 ffs_snapshot.c
> --- sys/ufs/ffs/ffs_snapshot.c 7 Jun 2006 22:34:19 -0000 1.30
> +++ sys/ufs/ffs/ffs_snapshot.c 30 Jul 2006 16:54:09 -0000
> @@ -58,8 +58,9 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_snapshot
> #include <sys/resource.h>
> #include <sys/resourcevar.h>
> #include <sys/vnode.h>
> #include <sys/kauth.h>
> +#include <sys/fstrans.h>
> =20
> #include <miscfs/specfs/specdev.h>
> =20
> #include <ufs/ufs/quota.h>
> @@ -284,9 +285,9 @@ ffs_snapshot(struct mount *mp, struct vn
> * All allocations are done, so we can now snapshot the system.
> *
> * Suspend operation on filesystem.
> */
> - if ((error =3D vfs_write_suspend(vp->v_mount, PUSER|PCATCH, 0)) !=3D 0)=
{
> + if ((error =3D vfs_suspend(vp->v_mount, 0)) !=3D 0) {
> vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
> goto out;
> }
> vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
> @@ -378,25 +379,33 @@ loop:
> VI_UNLOCK(xvp);
> MNT_ILOCK(mp);
> continue;
> }
> +#ifndef NEWVNGATE
> if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) !=3D 0) {
> MNT_ILOCK(mp);
> goto loop;
> }
> +#else /* NEWVNGATE */
> + VI_UNLOCK(xvp);
> +#endif /* NEWVNGATE */
VI_UNLOCK()?
> Index: sys/kern/vfs_subr.c
> =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
> RCS file: /cvsroot/src/sys/kern/vfs_subr.c,v
> retrieving revision 1.267
> diff -p -u -4 -r1.267 vfs_subr.c
> --- sys/kern/vfs_subr.c 23 Jun 2006 14:13:02 -0000 1.267
> +++ sys/kern/vfs_subr.c 30 Jul 2006 16:54:06 -0000
> @@ -1203,9 +1204,16 @@ vget(struct vnode *vp, int flags)
> }
> #endif
> if (flags & LK_TYPE_MASK) {
> if ((error =3D vn_lock(vp, flags | LK_INTERLOCK))) {
> - vrele(vp);
> + simple_lock(&vp->v_interlock);
> + if (vp->v_usecount > 1) {
> + vp->v_usecount--;
> + simple_unlock(&vp->v_interlock);
> + } else {
> + simple_unlock(&vp->v_interlock);
> + vrele(vp);
> + }
> }
> return (error);
> }
> simple_unlock(&vp->v_interlock);
Why doe we need the above?
> Index: sys/uvm/uvm_pdaemon.c
> =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
> RCS file: /cvsroot/src/sys/uvm/uvm_pdaemon.c,v
> retrieving revision 1.76
> diff -p -u -4 -r1.76 uvm_pdaemon.c
> --- sys/uvm/uvm_pdaemon.c 14 Feb 2006 15:06:27 -0000 1.76
> +++ sys/uvm/uvm_pdaemon.c 30 Jul 2006 16:54:13 -0000
> @@ -82,8 +82,9 @@ __KERNEL_RCSID(0, "$NetBSD: uvm_pdaemon.
> #include <sys/kernel.h>
> #include <sys/pool.h>
> #include <sys/buf.h>
> #include <sys/vnode.h>
> +#include <sys/fstrans.h>
> =20
> #include <uvm/uvm.h>
> =20
> /*
> @@ -174,13 +175,8 @@ uvmpd_tune(void)
> UVMHIST_FUNC("uvmpd_tune"); UVMHIST_CALLED(pdhist);
> =20
> uvmexp.freemin =3D uvmexp.npages / 20;
> =20
> - /* between 16k and 256k */
> - /* XXX: what are these values good for? */
> - uvmexp.freemin =3D MAX(uvmexp.freemin, (16*1024) >> PAGE_SHIFT);
> - uvmexp.freemin =3D MIN(uvmexp.freemin, (256*1024) >> PAGE_SHIFT);
> -
> /* Make sure there's always a user page free. */
> if (uvmexp.freemin < uvmexp.reserve_kernel + 1)
> uvmexp.freemin =3D uvmexp.reserve_kernel + 1;
Why make the above change? It looks like an unrelated policy change, that=
=20
at least should be a separate checkin.
> Index: common/lib/libc/gmon/mcount.c
> =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
> RCS file: /cvsroot/src/common/lib/libc/gmon/mcount.c,v
> retrieving revision 1.5
> diff -p -u -4 -r1.5 mcount.c
> --- common/lib/libc/gmon/mcount.c 8 Jan 2006 07:46:39 -0000 1.5
> +++ common/lib/libc/gmon/mcount.c 30 Jul 2006 16:52:50 -0000
> @@ -94,9 +94,9 @@ struct gmonparam *_m_gmon_alloc(void);
> #endif
> =20
> _MCOUNT_DECL __P((u_long, u_long))
> #ifdef _KERNEL
> - __attribute__((__unused__,__no_instrument_function__)); /* see below=
. */
> + __attribute__((__used__,__no_instrument_function__)); /* see below. =
*/
> #else
> #ifdef __vax__
> __attribute__((__unused__)); /* see below. */
> #else
??
> Index: sys/arch/sparc64/dev/psm.c
> =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
> RCS file: /cvsroot/src/sys/arch/sparc64/dev/psm.c,v
> retrieving revision 1.2
> diff -p -u -4 -r1.2 psm.c
> --- sys/arch/sparc64/dev/psm.c 12 Jul 2006 15:03:24 -0000 1.2
> +++ sys/arch/sparc64/dev/psm.c 30 Jul 2006 16:54:02 -0000
> @@ -127,9 +127,9 @@ STATIC void psm_attach(struct device *,=20
> CFATTACH_DECL(psm, sizeof(struct psm_softc),
> psm_match, psm_attach, NULL, NULL);
> =20
> =20
> -static int
> +STATIC int
> psm_match(struct device *parent, struct cfdata *cf, void *aux)
> {
> struct ebus_attach_args *ea =3D aux;
> =20
> @@ -137,9 +137,9 @@ psm_match(struct device *parent, struct=20
> return (0);
> return (1);
> }
> =20
> -static void
> +STATIC void
> psm_attach(struct device *parent, struct device *self, void *aux)
> {
> struct psm_softc *sc =3D (struct psm_softc *)self;
> struct ebus_attach_args *ea =3D aux;
??
Take care,
Bill
--GvXjxJ+pjyke8COw
Content-Type: application/pgp-signature
Content-Disposition: inline
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.3 (NetBSD)
iD8DBQFE5jjlWz+3JHUci9cRArvVAJwPJ0E7pPUZVaI7Mef9XDG4A/TwHwCeMOiB
9fH09fqQWL3f9Z3V/deyYsE=
=DgNg
-----END PGP SIGNATURE-----
--GvXjxJ+pjyke8COw--