Subject: PR 32535
To: None <tech-kern@netbsd.org>
From: Bill Studenmund <wrstuden@netbsd.org>
List: tech-kern
Date: 10/23/2006 20:29:12
--2OzUYMsT4j3Kc+NU
Content-Type: multipart/mixed; boundary="Jtds+vpI57xq70EV"
Content-Disposition: inline
--Jtds+vpI57xq70EV
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable
I have a proposed fix for PR 32535, and I'd like other folks to look it=20
over.
The issue is that when we are doing a directory lookup, after we get and
lock the child vnode, we vrele() the parent. Normally this is great. =20
If the use count goes to zero, we lock the vnode and call VOP_INACTIVE()=20
on it.
Technically locking the parent with the child locked violates the locking=
=20
hierarchy. But if nothing is using the parent, there is no issue.
With layered file systems, though, we run into a problem. We try to lock=20
the parent when the layer vnode's use count goes to zero. But something=20
can be trying to use the underlying vnode (or if there are multiple null=20
layers, using another layer will count). Specifically, something can have=
=20
locked the parrent and be looking for the child.
My proposed fix is to add vrele2(), which is a form of vrele() that is=20
careful if we need to lock the being-released vnode. We pass it the to-be-=
=20
released vnode and another vnode that we have locked. If the reference=20
count goes to zero and the vnode's lock is held, we release the lock on=20
the other vnode, lock the being-released-one, VOP_INACTIVE(), and re-lock=
=20
the other vnode.
I also moved some code around. There were places where we release both=20
nodes, so I made it so we vput() the child before vrele()'ing the parent.=
=20
We then will have no problems (and it seems silly to do the relock dance=20
if we're about to vput()).
However I'd like other folks to look at this!
Once it's all 100%, pullup requests will follow. :-)
Take care,
Bill
--Jtds+vpI57xq70EV
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="pr-32535.patch"
Content-Transfer-Encoding: quoted-printable
? cvslog
? diffie
? arch/i386/compile/GENERIC
Index: kern/vfs_lookup.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/kern/vfs_lookup.c,v
retrieving revision 1.71
diff -p -u -r1.71 vfs_lookup.c
--- kern/vfs_lookup.c 23 Jul 2006 22:06:12 -0000 1.71
+++ kern/vfs_lookup.c 23 Oct 2006 20:40:36 -0000
@@ -364,8 +364,9 @@ namei(struct nameidata *ndp)
}
}
PNBUF_PUT(cnp->cn_pnbuf);
- vrele(ndp->ni_dvp);
+ /* release out of hierarchical order so we don't need vrele2() */
vput(ndp->ni_vp);
+ vrele(ndp->ni_dvp);
ndp->ni_vp =3D NULL;
return (error);
}
@@ -756,7 +757,11 @@ nextname:
*/
if (!(cnp->cn_flags & ISLASTCN)) {
cnp->cn_nameptr =3D ndp->ni_next;
- vrele(ndp->ni_dvp);
+ if ((error =3D vrele2(ndp->ni_dvp, dp))) {
+ /* Oops! there was an error re-locking dp! */
+ dpunlocked =3D 1;
+ goto bad;
+ }
goto dirloop;
}
=20
@@ -773,28 +778,44 @@ terminal:
error =3D EROFS;
goto bad2;
}
+ if ((cnp->cn_flags & LOCKLEAF) =3D=3D 0)
+ VOP_UNLOCK(dp, 0);
if (ndp->ni_dvp !=3D NULL) {
if (cnp->cn_flags & SAVESTART) {
ndp->ni_startdir =3D ndp->ni_dvp;
VREF(ndp->ni_startdir);
}
- if (!wantparent)
- vrele(ndp->ni_dvp);
+ if (!wantparent) {
+ if ((cnp->cn_flags & LOCKLEAF) =3D=3D 0)
+ vrele(ndp->ni_dvp);
+ else
+ vrele2(ndp->ni_dvp, dp);
+ }
}
- if ((cnp->cn_flags & LOCKLEAF) =3D=3D 0)
- VOP_UNLOCK(dp, 0);
return (0);
=20
bad2:
if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN) &&
((cnp->cn_flags & PDIRUNLOCK) =3D=3D 0))
- VOP_UNLOCK(ndp->ni_dvp, 0);
- vrele(ndp->ni_dvp);
+ vput(ndp->ni_dvp);
+ else if (!(dpunlocked)) {
+ /*
+ * We're about to vrele(ndp->ni_dvp) while dp is locked. This
+ * strictly-speaking needs vrele2(). However we will then
+ * vput(dp). So instead, do operations in the opposite order,
+ * making vrele(ndp->ni_dvp) sufficient.
+ */
+ vput(dp);
+ vrele(ndp->ni_dvp);
+ goto bad1;
+ } else
+ vrele(ndp->ni_dvp);
bad:
if (dpunlocked)
vrele(dp);
else
vput(dp);
+bad1:
ndp->ni_vp =3D NULL;
return (error);
}
@@ -909,16 +930,29 @@ relookup(struct vnode *dvp, struct vnode
/* ASSERT(dvp =3D=3D ndp->ni_startdir) */
if (cnp->cn_flags & SAVESTART)
VREF(dvp);
- if (!wantparent)
- vrele(dvp);
- if ((cnp->cn_flags & LOCKLEAF) =3D=3D 0)
+ /*
+ * vrele(dvp), catching the case where we want the leaf locked.
+ * In that case, be careful about deadlock in vrele().
+ */
+ if ((cnp->cn_flags & LOCKLEAF) =3D=3D 0) {
VOP_UNLOCK(dp, 0);
+ if (!wantparent)
+ vrele(dvp);
+ } else {
+ if (!wantparent)
+ return (vrele2(dvp, dp));
+ }
return (0);
=20
bad2:
+ vput(dp);
+ *vpp =3D NULL;
if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN))
- VOP_UNLOCK(dvp, 0);
- vrele(dvp);
+ vput(dvp);
+ else
+ vrele(dvp);
+ return (error);
+
bad:
vput(dp);
*vpp =3D NULL;
Index: kern/vfs_subr.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/kern/vfs_subr.c,v
retrieving revision 1.274
diff -p -u -r1.274 vfs_subr.c
--- kern/vfs_subr.c 22 Oct 2006 00:48:14 -0000 1.274
+++ kern/vfs_subr.c 23 Oct 2006 20:40:38 -0000
@@ -1304,6 +1304,68 @@ vrele(struct vnode *vp)
}
=20
/*
+ * Vnode release with another vnode locked.
+ * If count drops to zero, call inactive routine and return to freelist.
+ * Same as vrele(), except that if we have to lock the vnode for
+ * VOP_INACTIVE(), unlock ovp before sleeping for vp's lock. Return
+ * value is either 0 or the return value from re-locking ovp.
+ */
+int
+vrele2(struct vnode *vp, struct vnode *ovp)
+{
+ int r;
+ struct lwp *l =3D curlwp; /* XXX */
+
+#ifdef DIAGNOSTIC
+ if (vp =3D=3D NULL)
+ panic("vrele: null vp");
+#endif
+ simple_lock(&vp->v_interlock);
+ vp->v_usecount--;
+ if (vp->v_usecount > 0) {
+ simple_unlock(&vp->v_interlock);
+ return 0;
+ }
+#ifdef DIAGNOSTIC
+ if (vp->v_usecount < 0 || vp->v_writecount !=3D 0) {
+ vprint("vrele: bad ref count", vp);
+ panic("vrele: ref cnt vp %p", vp);
+ }
+#endif
+ /*
+ * Insert at tail of LRU list.
+ */
+ simple_lock(&vnode_free_list_slock);
+ if (vp->v_holdcnt > 0)
+ TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
+ else
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+ simple_unlock(&vnode_free_list_slock);
+ if (vp->v_flag & VEXECMAP) {
+ uvmexp.execpages -=3D vp->v_uobj.uo_npages;
+ uvmexp.filepages +=3D vp->v_uobj.uo_npages;
+ }
+ vp->v_flag &=3D ~(VTEXT|VEXECMAP|VWRITEMAP|VMAPPED);
+ r =3D vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT);
+ if (r =3D=3D 0) {
+ /* No issues aquiring lock, so VOP_INACTIVE() and exit */
+ VOP_INACTIVE(vp, l);
+ return 0;
+ }
+ /* Skip VOP_INACTIVE() if there's an error locking. */
+ if (r !=3D EBUSY)
+ return 0;
+ /*
+ * At this point we know there's contention for vp, so do
+ * the unlock/lock dance for ovp.
+ */
+ VOP_UNLOCK(ovp, 0);
+ if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) =3D=3D 0)
+ VOP_INACTIVE(vp, l);
+ return (vn_lock(ovp, LK_EXCLUSIVE));
+}
+
+/*
* Page or buffer structure gets a reference.
* Called with v_interlock held.
*/
Index: sys/vnode.h
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/sys/vnode.h,v
retrieving revision 1.159
diff -p -u -r1.159 vnode.h
--- sys/vnode.h 22 Oct 2006 22:49:38 -0000 1.159
+++ sys/vnode.h 23 Oct 2006 20:40:40 -0000
@@ -545,6 +545,7 @@ void vprint(const char *, struct vnode *
void vput(struct vnode *);
int vrecycle(struct vnode *, struct simplelock *, struct lwp *);
void vrele(struct vnode *);
+int vrele2(struct vnode *, struct vnode *);
int vtruncbuf(struct vnode *, daddr_t, int, int);
void vwakeup(struct buf *);
=20
--Jtds+vpI57xq70EV--
--2OzUYMsT4j3Kc+NU
Content-Type: application/pgp-signature
Content-Disposition: inline
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.3 (NetBSD)
iD8DBQFFPYiIWz+3JHUci9cRAj+DAJ0X2By3uN+6BAtsRE9a7vHKBoB07ACdEagk
PTAXlRlVhnpiiK4hxmDjHRA=
=t+UT
-----END PGP SIGNATURE-----
--2OzUYMsT4j3Kc+NU--