Subject: split LFS vnode locks
To: None <perseant@netbsd.org>
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: tech-kern
Date: 12/15/2002 22:46:25
--NextPart-20021215224130-0409300
Content-Type: Text/Plain; charset=us-ascii

hi.

currently, since lfs_markv has to acquire vnode locks,
lfs_reserve has to release vnode locks before waiting for lfs_avail.
however, releasing locks in lfs_reserve isn't safe because
we can't keep the vnodes/inodes from being modified without holding the locks.

i'd like to separate the metadata (i.e. block pointers) lock from the vnode lock
so that lfs_reserve no longer needs to release vnode locks, as in the attached patch.
is it ok?

thanks.

YAMAMOTO Takashi

--NextPart-20021215224130-0409300
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="lfs.metalock.diff"

Index: ufs/inode.h
===================================================================
RCS file: /cvs/NetBSD/syssrc/sys/ufs/ufs/inode.h,v
retrieving revision 1.31
diff -u -p -r1.31 inode.h
--- ufs/inode.h	2002/12/01 00:12:12	1.31
+++ ufs/inode.h	2002/12/15 13:25:30
@@ -62,6 +62,9 @@ struct lfs_inode_ext {
 	off_t	  lfs_osize;		/* size of file on disk */
 	u_int32_t lfs_effnblocks;  /* number of blocks when i/o completes */
 	size_t    lfs_fragsize[NDADDR]; /* size of on-disk direct blocks */
+	int lfs_writecount;
+	int lfs_readcount;
+	int lfs_writerpid;
 };
 
 /*
@@ -118,6 +121,9 @@ struct inode {
 #define i_lfs_effnblks		inode_ext.lfs.lfs_effnblocks
 #define i_lfs_fragsize		inode_ext.lfs.lfs_fragsize
 #define i_lfs_osize		inode_ext.lfs.lfs_osize
+#define i_lfs_writecount	inode_ext.lfs.lfs_writecount
+#define i_lfs_readcount		inode_ext.lfs.lfs_readcount
+#define i_lfs_writerpid		inode_ext.lfs.lfs_writerpid
 	/*
 	 * The on-disk dinode itself.
 	 */
Index: ufs/ufs_readwrite.c
===================================================================
RCS file: /cvs/NetBSD/syssrc/sys/ufs/ufs/ufs_readwrite.c,v
retrieving revision 1.44
diff -u -p -r1.44 ufs_readwrite.c
--- ufs/ufs_readwrite.c	2002/10/23 09:15:08	1.44
+++ ufs/ufs_readwrite.c	2002/12/15 13:25:30
@@ -415,6 +415,12 @@ WRITE(void *v)
 		else
 			flags &= ~B_CLRBUF;
 
+#ifdef LFS_READWRITE
+		error = lfs_reserve(fs, vp,
+		    btofsb(fs, (NIADDR + 1) << fs->lfs_bshift));
+		if (error)
+			break;
+#endif
 		error = VOP_BALLOC(vp, uio->uio_offset, xfersize,
 		    ap->a_cred, flags, &bp);
 
@@ -442,11 +448,10 @@ WRITE(void *v)
 			break;
 		}
 #ifdef LFS_READWRITE
-		if (!error)
-			error = lfs_reserve(fs, vp, btofsb(fs, (NIADDR + 1) << fs->lfs_bshift));
 		(void)VOP_BWRITE(bp);
 		if (!error)
-			lfs_reserve(fs, vp, -btofsb(fs, (NIADDR + 1) << fs->lfs_bshift));
+			lfs_reserve(fs, vp,
+			    -btofsb(fs, (NIADDR + 1) << fs->lfs_bshift));
 #else
 		if (ioflag & IO_SYNC)
 			(void)bwrite(bp);
Index: lfs/lfs.h
===================================================================
RCS file: /cvs/NetBSD/syssrc/sys/ufs/lfs/lfs.h,v
retrieving revision 1.42
diff -u -p -r1.42 lfs.h
--- lfs/lfs.h	2002/12/01 00:12:10	1.42
+++ lfs/lfs.h	2002/12/15 13:25:30
@@ -111,6 +111,35 @@
 #define LFS_WAIT_BYTES      (((bufpages >> 1) - (bufpages >> 3) - 10) * NBPG)
 #define LFS_BUFWAIT         2
 
+/*
+ * lfs inode meta data lock.
+ *
+ * this protects direct/indirect block pointers.
+ */
+#define	LFS_INITLOCKMETA(vp)	lfs_initmetalock(vp)
+/* are we holding exclusive lock? */
+#define	_LFS_OWN_LOCK_META(vp) \
+	(VTOI(vp)->i_lfs_writerpid == curproc->p_pid)
+#define	LFS_OWN_LOCK_META(vp) \
+	(VTOI(vp)->i_lfs_writecount != 0 && _LFS_OWN_LOCK_META(vp))
+
+/* acquire/release write lock */
+#define	LFS_LOCK_META(vp)	lfs_lockmeta(vp)
+#define	LFS_UNLOCK_META(vp)	lfs_unlockmeta(vp)
+
+/* acquire/release read lock */
+#define	LFS_READ_LOCK_META(vp)	lfs_readlockmeta(vp)
+#define	LFS_READ_UNLOCK_META(vp) lfs_readunlockmeta(vp)
+
+/* assert we're (not) holding read or write lock */
+#define	LFS_ASSERT_LOCK_META(vp) \
+	KASSERT(VTOI(vp)->i_lfs_readcount != 0 || LFS_OWN_LOCK_META(vp))
+#define	LFS_ASSERT_UNLOCK_META(vp) \
+	KASSERT(!LFS_OWN_LOCK_META(vp))
+
+/* macros for passing locks to another thread */
+#define	LFS_PASS_METALOCK_TO_AIODONED(vp) /* nothing */
+
 #define LFS_LOCK_BUF(bp) do {						\
 	if (((bp)->b_flags & (B_LOCKED | B_CALL)) == 0) {		\
 		++locked_queue_count;       				\
Index: lfs/lfs_alloc.c
===================================================================
RCS file: /cvs/NetBSD/syssrc/sys/ufs/lfs/lfs_alloc.c,v
retrieving revision 1.57
diff -u -p -r1.57 lfs_alloc.c
--- lfs/lfs_alloc.c	2002/11/24 08:43:26	1.57
+++ lfs/lfs_alloc.c	2002/12/15 13:25:30
@@ -490,6 +490,8 @@ lfs_vcreate(struct mount *mp, ino_t ino,
 	/* Why was IN_MODIFIED ever set here? */
 	/* LFS_SET_UINO(ip, IN_CHANGE | IN_MODIFIED); */
 
+	LFS_INITLOCKMETA(vp);
+
 #ifdef DEBUG_LFS_VNLOCK
 	if (ino == LFS_IFILE_INUM)
 		vp->v_vnlock->lk_wmesg = "inlock";
Index: lfs/lfs_balloc.c
===================================================================
RCS file: /cvs/NetBSD/syssrc/sys/ufs/lfs/lfs_balloc.c,v
retrieving revision 1.34
diff -u -p -r1.34 lfs_balloc.c
--- lfs/lfs_balloc.c	2002/12/11 13:34:14	1.34
+++ lfs/lfs_balloc.c	2002/12/15 13:25:30
@@ -134,6 +134,7 @@ lfs_balloc(void *v)
 	ufs_daddr_t	lbn, lastblock;
 	int bb, bcount;
 	int error, frags, i, nsize, osize, num;
+	int needunlockmeta = 0;
 
 	vp = ap->a_vp;	
 	ip = VTOI(vp);
@@ -169,7 +170,7 @@ lfs_balloc(void *v)
 			if ((error = lfs_fragextend(vp, osize, fs->lfs_bsize,
 						    lastblock, &bp,
 						    ap->a_cred)))
-				return (error);
+				goto out;
 			ip->i_ffs_size = (lastblock + 1) * fs->lfs_bsize;
 			uvm_vnp_setsize(vp, ip->i_ffs_size);
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
@@ -202,22 +203,25 @@ lfs_balloc(void *v)
 			if (nsize <= osize) {
 				/* No need to extend */
 				if ((error = bread(vp, lbn, osize, NOCRED, &bp)))
-					return error;
+					goto out;
 			} else {
 				/* Extend existing block */
 				if ((error =
 				     lfs_fragextend(vp, osize, nsize, lbn, &bp,
 						    ap->a_cred)))
-					return error;
+					goto out;
 			}
 			*ap->a_bpp = bp;
 		}
-		return 0;
+		error = 0;
+		goto out;
 	}
 
+	LFS_LOCK_META(vp);
+	needunlockmeta = 1;
 	error = ufs_bmaparray(vp, lbn, &daddr, &indirs[0], &num, NULL );
 	if (error)
-		return (error);
+		goto out;
 	/*
 	 * Do byte accounting all at once, so we can gracefully fail *before*
 	 * we start assigning blocks.
@@ -236,7 +240,8 @@ lfs_balloc(void *v)
 		ip->i_lfs->lfs_bfree -= bcount;
 		ip->i_lfs_effnblks += bcount;
 	} else {
-		return ENOSPC;
+		error = ENOSPC;
+		goto out;
 	}
 
 	if (daddr == UNASSIGNED) {
@@ -270,7 +275,7 @@ lfs_balloc(void *v)
 					UNWRITTEN;
 			idaddr = ((daddr_t *)ibp->b_data)[indirs[i].in_off];
 			if ((error = VOP_BWRITE(ibp))) {
-				return error;
+				goto out;
 			}
 		}
 	}	
@@ -328,11 +333,16 @@ lfs_balloc(void *v)
 			bp->b_blkno = daddr;
 			bp->b_flags |= B_READ;
 			VOP_STRATEGY(bp);
-			return (biowait(bp));
+			error = biowait(bp);
+			goto out;
 		}
 	}
 	
-	return (0);
+	error = 0;
+out:
+	if (needunlockmeta)
+		LFS_UNLOCK_META(vp);
+	return (error);
 }
 
 /* VOP_BWRITE 1 time */
Index: lfs/lfs_bio.c
===================================================================
RCS file: /cvs/NetBSD/syssrc/sys/ufs/lfs/lfs_bio.c,v
retrieving revision 1.47
diff -u -p -r1.47 lfs_bio.c
--- lfs/lfs_bio.c	2002/11/27 11:36:40	1.47
+++ lfs/lfs_bio.c	2002/12/15 13:25:30
@@ -110,20 +110,10 @@ extern int lfs_dostats;
 
 /*
  * Try to reserve some blocks, prior to performing a sensitive operation that
- * requires the vnode lock to be honored.  If there is not enough space, give
- * up the vnode lock temporarily and wait for the space to become available.
+ * requires the vnode lock to be honored.  If there is not enough space, wait
+ * for the space to become available.
  *
  * Called with vp locked.  (Note nowever that if fsb < 0, vp is ignored.)
- *
- * XXX YAMT - it isn't safe to unlock vp here
- * because the node might be modified while we sleep.
- * (eg. cached states like i_offset might be stale,
- *  the vnode might be truncated, etc..)
- * maybe we should have a way to restart the vnode op. (EVOPRESTART?)
- *
- * XXX YAMT - we unlock the vnode so that cleaner can lock it.
- * but it isn't enough. eg. for VOP_REMOVE, we should unlock the vnode that
- * is going to be removed as well.
  */
 int
 lfs_reserve(struct lfs *fs, struct vnode *vp, int fsb)
@@ -132,10 +122,11 @@ lfs_reserve(struct lfs *fs, struct vnode
 	struct buf *bp;
 	int error, slept;
 
+	LFS_ASSERT_UNLOCK_META(vp);
+
 	slept = 0;
 	while (fsb > 0 && !lfs_fits(fs, fsb + fs->lfs_ravail) &&
 	    vp != fs->lfs_unlockvp) {
-		VOP_UNLOCK(vp, 0);
 
 		if (!slept) {
 #ifdef DEBUG
@@ -155,7 +146,6 @@ lfs_reserve(struct lfs *fs, struct vnode
 			
 		error = tsleep(&fs->lfs_avail, PCATCH | PUSER, "lfs_reserve",
 			       0);
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* XXX use lockstatus */
 		if (error)
 			return error;
 	}
@@ -261,6 +251,8 @@ lfs_bwrite_ext(struct buf *bp, int flags
 	struct inode *ip;
 	int fsb, error, s;
 	
+	KASSERT(bp->b_flags & B_BUSY);
+	KASSERT(flags & BW_CLEAN || !(bp->b_flags & B_CALL));
 	/*
 	 * Don't write *any* blocks if we're mounted read-only.
 	 * In particular the cleaner can't write blocks either.
@@ -300,7 +292,7 @@ lfs_bwrite_ext(struct buf *bp, int flags
 		}
 		
 		ip = VTOI(bp->b_vp);
-		if (bp->b_flags & B_CALL) {
+		if (flags & BW_CLEAN) {
 			LFS_SET_UINO(ip, IN_CLEANING);
 		} else {
 			LFS_SET_UINO(ip, IN_MODIFIED);
@@ -405,6 +397,8 @@ lfs_check(struct vnode *vp, ufs_daddr_t 
 	struct inode *ip;
 	extern int lfs_dirvcount;
 
+	LFS_ASSERT_UNLOCK_META(vp);
+
 	error = 0;
 	ip = VTOI(vp);
 	
@@ -591,4 +585,94 @@ lfs_countlocked(int *count, long *bytes,
 	*count = n;
 	*bytes = size;
 	return;
+}
+
+/*
+ * XXX we don't use lockmgr locks here because:
+ * - we want to acquire readlock recursively. (lfs_strategy)
+ *   but doing so isn't safe with lockmgr LK_SHARED lock.
+ * - we want to unlock readlock in interrupt context. (lfs_strategy_iodone)
+ *
+ * XXX should have interlocks.
+ */
+void
+lfs_lockmeta(struct vnode *vp)
+{
+	struct inode *ip = VTOI(vp);
+	int s = splbio();
+
+	KASSERT(curproc != NULL);
+
+	while (ip->i_lfs_readcount ||
+	    (ip->i_lfs_writecount && !_LFS_OWN_LOCK_META(vp))) {
+		ltsleep(&ip->i_lfs_writecount, PINOD, "lfsmetawrite", 0, 0);
+	}
+	KASSERT(ip->i_lfs_writerpid == NO_PID ||
+	    ip->i_lfs_writerpid == curproc->p_pid);
+	ip->i_lfs_writecount++;
+	ip->i_lfs_writerpid = curproc->p_pid;
+
+	splx(s);
+}
+
+void
+lfs_unlockmeta(struct vnode *vp)
+{
+	struct inode *ip = VTOI(vp);
+	int s = splbio();
+
+	KASSERT(curproc != NULL);
+	KASSERT(ip->i_lfs_readcount == 0);
+	KASSERT(ip->i_lfs_writecount > 0);
+	KASSERT(ip->i_lfs_writerpid == curproc->p_pid);
+
+	if ((--ip->i_lfs_writecount) == 0) {
+		ip->i_lfs_writerpid = NO_PID;
+		wakeup(&ip->i_lfs_writecount);
+	}
+
+	splx(s);
+}
+
+void
+lfs_readlockmeta(struct vnode *vp)
+{
+	struct inode *ip = VTOI(vp);
+	int s = splbio();
+
+	KASSERT(curproc != NULL);
+
+	while (ip->i_lfs_writecount) {
+		ltsleep(&ip->i_lfs_writecount, PINOD, "lfsmetaread", 0, 0);
+	}
+	KASSERT(ip->i_lfs_writerpid == NO_PID);
+	ip->i_lfs_readcount++;
+
+	splx(s);
+}
+
+/*
+ * lfs_readunlockmeta can be called in interrupt context.
+ */
+void
+lfs_readunlockmeta(struct vnode *vp)
+{
+	struct inode *ip = VTOI(vp);
+
+	KASSERT(ip->i_lfs_readcount > 0);
+	KASSERT(ip->i_lfs_writecount == 0);
+	KASSERT(ip->i_lfs_writerpid == NO_PID);
+
+	if ((--ip->i_lfs_readcount) == 0)
+		wakeup(&ip->i_lfs_writecount);
+}
+
+void
+lfs_initmetalock(struct vnode *vp)
+{
+	struct inode *ip = VTOI(vp);
+
+	ip->i_lfs_readcount = 0;
+	ip->i_lfs_writecount = 0;
+	ip->i_lfs_writerpid = NO_PID;
 }
Index: lfs/lfs_extern.h
===================================================================
RCS file: /cvs/NetBSD/syssrc/sys/ufs/lfs/lfs_extern.h,v
retrieving revision 1.32
diff -u -p -r1.32 lfs_extern.h
--- lfs/lfs_extern.h	2002/12/01 00:12:11	1.32
+++ lfs/lfs_extern.h	2002/12/15 13:25:30
@@ -136,6 +136,11 @@ struct buf *lfs_newbuf(struct lfs *, str
 #endif
 void lfs_countlocked(int *, long *, char *);
 int lfs_reserve(struct lfs *, struct vnode *, int);
+void lfs_lockmeta(struct vnode *);
+void lfs_unlockmeta(struct vnode *);
+void lfs_readlockmeta(struct vnode *);
+void lfs_readunlockmeta(struct vnode *);
+void lfs_initmetalock(struct vnode *);
 
 /* lfs_cksum.c */
 u_int32_t cksum(void *, size_t);
@@ -178,6 +183,7 @@ void lfs_shellsort(struct buf **, ufs_da
 int lfs_vref(struct vnode *);
 void lfs_vunref(struct vnode *);
 void lfs_vunref_head(struct vnode *);
+void lfs_generic_callback(struct buf *, void (*)(struct buf *));
 
 /* lfs_subr.c */
 void lfs_seglock(struct lfs *, unsigned long);
@@ -235,6 +241,7 @@ int lfs_write	 (void *);
 int lfs_whiteout (void *);
 int lfs_getpages (void *);
 int lfs_putpages (void *);
+int lfs_strategy (void *);
 
 __END_DECLS
 extern int lfs_mount_type;
Index: lfs/lfs_inode.c
===================================================================
RCS file: /cvs/NetBSD/syssrc/sys/ufs/lfs/lfs_inode.c,v
retrieving revision 1.60
diff -u -p -r1.60 lfs_inode.c
--- lfs/lfs_inode.c	2002/09/27 15:38:05	1.60
+++ lfs/lfs_inode.c	2002/12/15 13:25:30
@@ -187,6 +187,7 @@ lfs_update(void *v)
 	
 	/* If sync, push back the vnode and any dirty blocks it may have. */
 	if ((ap->a_flags & (UPDATE_WAIT|UPDATE_DIROP)) == UPDATE_WAIT) {
+		int error;
 		/* Avoid flushing VDIROP. */
 		++fs->lfs_diropwait;
 		while (vp->v_flag & VDIROP) {
@@ -205,7 +206,8 @@ lfs_update(void *v)
 			twice? */
 		}
 		--fs->lfs_diropwait;
-		return lfs_vflush(vp);
+		error = lfs_vflush(vp);
+		return error;
         }
 	return 0;
 }
@@ -379,6 +381,7 @@ lfs_truncate(void *v)
 	 * freeing blocks.  lastiblock values are also normalized to -1
 	 * for calls to lfs_indirtrunc below.
 	 */
+	LFS_LOCK_META(ovp);
 	memcpy((caddr_t)newblks, (caddr_t)&oip->i_ffs_db[0], sizeof newblks);
 	for (level = TRIPLE; level >= SINGLE; level--)
 		if (lastiblock[level] < 0) {
@@ -497,6 +500,7 @@ done:
 	oip->i_lfs_effnblks -= blocksreleased;
 	oip->i_ffs_blocks -= real_released;
 	fs->lfs_bfree += blocksreleased;
+	LFS_UNLOCK_META(ovp);
 #ifdef DIAGNOSTIC
 	if (oip->i_ffs_size == 0 && oip->i_ffs_blocks != 0) {
 		printf("lfs_truncate: truncate to 0 but %d blocks on inode\n",
@@ -584,6 +588,8 @@ lfs_indirtrunc(struct inode *ip, ufs_dad
 	int nblocks, blocksreleased = 0, real_released = 0;
 	int error = 0, allerror = 0;
 
+	LFS_ASSERT_LOCK_META(ITOV(ip));
+
 	/*
 	 * Calculate index in current block of last
 	 * block to be kept.  -1 indicates the entire
@@ -706,6 +712,7 @@ lfs_vtruncbuf(struct vnode *vp, daddr_t 
 	int s, error;
 	struct lfs *fs;
 
+	LFS_ASSERT_LOCK_META(vp);
 	fs = VTOI(vp)->i_lfs;
 	s = splbio();
 
Index: lfs/lfs_segment.c
===================================================================
RCS file: /cvs/NetBSD/syssrc/sys/ufs/lfs/lfs_segment.c,v
retrieving revision 1.84
diff -u -p -r1.84 lfs_segment.c
--- lfs/lfs_segment.c	2002/12/13 14:40:02	1.84
+++ lfs/lfs_segment.c	2002/12/15 13:25:30
@@ -109,7 +109,6 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_segment.
 extern int count_lock_queue(void);
 extern struct simplelock vnode_free_list_slock;		/* XXX */
 
-static void lfs_generic_callback(struct buf *, void (*)(struct buf *));
 static void lfs_super_aiodone(struct buf *);
 static void lfs_cluster_aiodone(struct buf *);
 static void lfs_cluster_callback(struct buf *);
@@ -232,8 +231,7 @@ lfs_vflush(struct vnode *vp)
 					{
 						fs->lfs_avail += btofsb(fs, bp->b_bcount);
 						wakeup(&fs->lfs_avail);
-						lfs_freebuf(bp);
-						bp = NULL;
+						(*bp->b_iodone)(bp);
 						break;
 					}
 				}
@@ -270,8 +268,7 @@ lfs_vflush(struct vnode *vp)
 			/* Copied from lfs_writeseg */
 			if (bp->b_flags & B_CALL) {
 				/* if B_CALL, it was created with newbuf */
-				lfs_freebuf(bp);
-				bp = NULL;
+				(*bp->b_iodone)(bp);
 			} else {
 				bremfree(bp);
 				LFS_UNLOCK_BUF(bp);
@@ -404,6 +401,10 @@ lfs_writevnodes(struct lfs *fs, struct m
 			printf("lfs_writevnodes: starting over\n");
 			goto loop;
 		}
+		
+		if (vp->v_type == VNON) {
+			continue;
+		}
 
 		ip = VTOI(vp);
 		if ((op == VN_DIROP && !(vp->v_flag & VDIROP)) ||
@@ -416,10 +417,6 @@ lfs_writevnodes(struct lfs *fs, struct m
 			vndebug(vp,"empty");
 			continue;
 		}
-		
-		if (vp->v_type == VNON) {
-			continue;
-		}
 
 		if (op == VN_CLEAN && ip->i_number != LFS_IFILE_INUM
 		   && vp != fs->lfs_flushvp
@@ -434,20 +431,8 @@ lfs_writevnodes(struct lfs *fs, struct m
 		}
 
 		needs_unlock = 0;
-		if (VOP_ISLOCKED(vp)) {
-			if (vp != fs->lfs_ivnode &&
-			    vp->v_lock.lk_lockholder != curproc->p_pid) {
-#ifdef DEBUG_LFS
-				printf("lfs_writevnodes: not writing ino %d,"
-				       " locked by pid %d\n",
-				       VTOI(vp)->i_number,
-				       vp->v_lock.lk_lockholder);
-#endif
-				lfs_vunref(vp);
-				continue;
-			}
-		} else if (vp != fs->lfs_ivnode) {
-			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+		if (vp != fs->lfs_ivnode) {
+			LFS_LOCK_META(vp);
 			needs_unlock = 1;
 		}
 
@@ -481,8 +466,9 @@ lfs_writevnodes(struct lfs *fs, struct m
 			inodes_written++;
 		}
 
-		if (needs_unlock)
-			VOP_UNLOCK(vp, 0);
+		if (needs_unlock) {
+			LFS_UNLOCK_META(vp);
+		}
 
 		if (lfs_clean_vnhead && only_cleaning)
 			lfs_vunref_head(vp);
@@ -642,7 +628,7 @@ lfs_segwrite(struct mount *mp, int flags
 			fs->lfs_flags &= ~LFS_IFDIRTY;
 
 			ip = VTOI(vp);
-			/* if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) */
+			if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL)
 				lfs_writefile(fs, sp, vp);
 			if (ip->i_flag & IN_ALLMOD)
 				++did_ckp;
@@ -787,6 +773,11 @@ lfs_writefile(struct lfs *fs, struct seg
 	if (frag > 1)
 		panic("lfs_writefile: more than one fragment!");
 #endif
+#ifdef DEBUG_LFS
+	if (frag != 0 && ip->i_ffs_size > fs->lfs_bsize * NDADDR)
+		printf("lfs_writefile: not writing indirect blocks for ino %d due to "
+		    "%d fragments, size=%lu\n", ip->i_number, frag, (ulong)ip->i_ffs_size);
+#endif
 	if (IS_FLUSHING(fs, vp) ||
 	    (frag == 0 && (lfs_writeindir || (sp->seg_flags & SEGM_CKP)))) {
 		lfs_gather(fs, sp, vp, lfs_match_indir);
@@ -919,9 +910,12 @@ lfs_writeinode(struct lfs *fs, struct se
 #endif
 	}
 
-	if (ip->i_number == LFS_IFILE_INUM) /* We know sp->idp == NULL */
+	if (ip->i_number == LFS_IFILE_INUM) {
+		/* We know sp->idp == NULL */
+		KASSERT(sp->idp == NULL);
 		sp->idp = ((struct dinode *)bp->b_data) + 
 			(sp->ninodes % INOPB(fs));
+	}
 	if (gotblk) {
 		LFS_LOCK_BUF(bp);
 		brelse(bp);
@@ -1170,6 +1164,7 @@ lfs_updatemeta(struct segment *sp)
 	if (vp == NULL || nblocks == 0) 
 		return;
 	
+	LFS_LOCK_META(vp);
 	/* Sort the blocks. */
 	/*
 	 * XXX KS - We have to sort even if the blocks come from the
@@ -1333,6 +1328,7 @@ lfs_updatemeta(struct segment *sp)
 		if (lbn >= 0 && lbn < NDADDR)
 			ip->i_lfs_fragsize[lbn] = sbp->b_bcount;
 	}
+	LFS_UNLOCK_META(vp);
 }
 
 /*
@@ -1618,7 +1614,7 @@ lfs_writeseg(struct lfs *fs, struct segm
 	 */
 	if ((nblocks = sp->cbpp - sp->bpp) == 1)
 		return (0);
-	
+
 	i_dev = VTOI(fs->lfs_ivnode)->i_dev;
 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
 
@@ -1725,8 +1721,7 @@ lfs_writeseg(struct lfs *fs, struct segm
 			if (changed) {
 				bp->b_flags &= ~(B_ERROR | B_GATHERED);
 				if (bp->b_flags & B_CALL) {
-					lfs_freebuf(bp);
-					bp = NULL;
+					(*bp->b_iodone)(bp);
 				} else {
 					/* Still on free list, leave it there */
 					s = splbio();
@@ -1746,8 +1741,7 @@ lfs_writeseg(struct lfs *fs, struct segm
 				bp->b_flags &= ~(B_ERROR | B_READ | B_DELWRI |
 						 B_GATHERED);
 				if (bp->b_flags & B_CALL) {
-					lfs_freebuf(bp);
-					bp = NULL;
+					(*bp->b_iodone)(bp);
 				} else {
 					bremfree(bp);
 					bp->b_flags |= B_DONE;
@@ -2085,7 +2079,7 @@ lfs_match_data(struct lfs *fs, struct bu
 int
 lfs_match_indir(struct lfs *fs, struct buf *bp)
 {
-	int lbn;
+	daddr_t lbn;
 
 	lbn = bp->b_lblkno;
 	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 0);
@@ -2094,7 +2088,7 @@ lfs_match_indir(struct lfs *fs, struct b
 int
 lfs_match_dindir(struct lfs *fs, struct buf *bp)
 {
-	int lbn;
+	daddr_t lbn;
 
 	lbn = bp->b_lblkno;
 	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 1);
@@ -2103,7 +2097,7 @@ lfs_match_dindir(struct lfs *fs, struct 
 int
 lfs_match_tindir(struct lfs *fs, struct buf *bp)
 {
-	int lbn;
+	daddr_t lbn;
 
 	lbn = bp->b_lblkno;
 	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 2);
@@ -2261,7 +2255,7 @@ lfs_cluster_aiodone(struct buf *bp)
 	free(cl, M_SEGMENT);
 }
 
-static void
+void
 lfs_generic_callback(struct buf *bp, void (*aiodone)(struct buf *))
 {
 	/* reset b_iodone for when this is a single-buf i/o. */
Index: lfs/lfs_syscalls.c
===================================================================
RCS file: /cvs/NetBSD/syssrc/sys/ufs/lfs/lfs_syscalls.c,v
retrieving revision 1.73
diff -u -p -r1.73 lfs_syscalls.c
--- lfs/lfs_syscalls.c	2002/11/24 16:39:13	1.73
+++ lfs/lfs_syscalls.c	2002/12/15 13:25:30
@@ -94,8 +94,9 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_syscalls
 #include <ufs/lfs/lfs_extern.h>
 
 /* Flags for return from lfs_fastvget */
-#define FVG_UNLOCK 0x01	  /* Needs to be unlocked */
-#define FVG_PUT	   0x02	  /* Needs to be vput() */
+#define FVG_UNLOCK	0x01	  /* Needs to be VOP_UNLOCK'ed */
+#define FVG_PUT	   	0x02	  /* Needs to be vput() */
+#define FVG_METAUNLOCK	0x04	  /* Needs to be LFS_READ_UNLOCK_META'ed */
 
 /* Max block count for lfs_markv() */
 #define MARKV_MAXBLKCNT		65536
@@ -128,7 +129,9 @@ extern TAILQ_HEAD(bqueues, buf) bufqueue
 
 static int lfs_bmapv(struct proc *, fsid_t *, BLOCK_INFO *, int);
 static int lfs_markv(struct proc *, fsid_t *, BLOCK_INFO *, int);
+#if 0
 static void lfs_fakebuf_iodone(struct buf *);
+#endif
 
 /*
  * sys_lfs_markv:
@@ -259,7 +262,7 @@ lfs_markv(struct proc *p, fsid_t *fsidp,
 #ifdef CHECK_COPYIN
 	int i;
 #endif /* CHECK_COPYIN */
-	int numlocked = 0, numrefed = 0;
+	int numlocked = 0, numrefed = 0, nummetalocked = 0;
 	ino_t maxino;
 	size_t obsize;
 
@@ -329,7 +332,11 @@ lfs_markv(struct proc *p, fsid_t *fsidp,
 				if (ip->i_flag & (IN_MODIFIED|IN_CLEANING))
 					iwritten++;
 #endif
-				if (lfs_fastvget_unlock) {
+				if (lfs_fastvget_unlock & FVG_METAUNLOCK) {
+					LFS_READ_UNLOCK_META(vp);
+					nummetalocked--;
+				}
+				if (lfs_fastvget_unlock & FVG_UNLOCK) {
 					VOP_UNLOCK(vp, 0);
 					numlocked--;
 				}
@@ -368,8 +375,12 @@ lfs_markv(struct proc *p, fsid_t *fsidp,
 					    ? blkp->bi_bp
 					    : NULL),
 					   &lfs_fastvget_unlock);
-			if (lfs_fastvget_unlock)
-				numlocked++;
+			if (lfs_fastvget_unlock & FVG_METAUNLOCK)
+				nummetalocked++;
+			if (lfs_fastvget_unlock & FVG_UNLOCK) {
+				lfs_fastvget_unlock &= ~FVG_UNLOCK;
+				VOP_UNLOCK(vp, 0);
+			}
 
 			if (!error) {
 				numrefed++;
@@ -493,6 +504,7 @@ lfs_markv(struct proc *p, fsid_t *fsidp,
 				    " size=%d\n", blkp->bi_size);
 			bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0);
 			if (!(bp->b_flags & (B_DONE|B_DELWRI))) { /* B_CACHE */
+				KASSERT(!(bp->b_flags & B_LOCKED));
 				/*
 				 * The block in question was not found
 				 * in the cache; i.e., the block that
@@ -522,13 +534,25 @@ lfs_markv(struct proc *p, fsid_t *fsidp,
 		if (ip->i_flag & (IN_MODIFIED|IN_CLEANING))
 			iwritten++;
 #endif
-		if (lfs_fastvget_unlock) {
+		if (lfs_fastvget_unlock & FVG_METAUNLOCK) {
+			LFS_READ_UNLOCK_META(vp);
+			nummetalocked--;
+		}
+		if (lfs_fastvget_unlock & FVG_UNLOCK) {
 			VOP_UNLOCK(vp, 0);
 			numlocked--;
 		}
 		lfs_vunref(vp);
 		numrefed--;
 	}
+
+#ifdef DEBUG_LFS
+	printf("%d]",iwritten);
+	if (numlocked != 0 || numrefed != 0 || nummetalocked != 0) {
+		panic("lfs_markv: numlocked=%d numrefed=%d nummetalocked = %d",
+		    numlocked, numrefed, nummetalocked);
+	}
+#endif
 	
 	/*
 	 * The last write has to be SEGM_SYNC, because of calling semantics.
@@ -539,13 +563,6 @@ lfs_markv(struct proc *p, fsid_t *fsidp,
 	lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);
 	
 	lfs_segunlock(fs);
-
-#ifdef DEBUG_LFS
-	printf("%d]",iwritten);
-	if (numlocked != 0 || numrefed != 0) {
-		panic("lfs_markv: numlocked=%d numrefed=%d", numlocked, numrefed);
-	}
-#endif
 	
 	vfs_unbusy(mntp);
 	if (error)
@@ -556,13 +573,18 @@ lfs_markv(struct proc *p, fsid_t *fsidp,
 	return 0;
 	
  err2:
+	KASSERT(v_daddr != LFS_UNUSED_DADDR);
 	printf("lfs_markv err2\n");
-	if (lfs_fastvget_unlock) {
+	if (lfs_fastvget_unlock & FVG_METAUNLOCK) {
+		LFS_READ_UNLOCK_META(vp);
+		nummetalocked--;
+	}
+	if (lfs_fastvget_unlock & FVG_UNLOCK) {
 		VOP_UNLOCK(vp, 0);
-		--numlocked;
+		numlocked--;
 	}
 	lfs_vunref(vp);
-	--numrefed;
+	numrefed--;
 
 	/* Free up fakebuffers -- have to take these from the LOCKED list */
  again:
@@ -588,8 +610,9 @@ lfs_markv(struct proc *p, fsid_t *fsidp,
 	lfs_segunlock(fs);
 	vfs_unbusy(mntp);
 #ifdef DEBUG_LFS
-	if (numlocked != 0 || numrefed != 0) {
-		panic("lfs_markv: numlocked=%d numrefed=%d", numlocked, numrefed);
+	if (numlocked != 0 || numrefed != 0 || nummetalocked != 0) {
+		panic("lfs_markv: numlocked=%d numrefed=%d nummetalocked=%d",
+		    numlocked, numrefed, nummetalocked);
 	}
 #endif
 
@@ -711,7 +734,7 @@ lfs_bmapv(struct proc *p, fsid_t *fsidp,
 	ino_t lastino;
 	ufs_daddr_t v_daddr;
 	int cnt, error, need_unlock = 0;
-	int numlocked = 0, numrefed = 0;
+	int numlocked = 0, numrefed = 0, nummetalocked = 0;
 
 	lfs_cleaner_pid = p->p_pid;
 	
@@ -744,7 +767,11 @@ lfs_bmapv(struct proc *p, fsid_t *fsidp,
 			 * v_daddr.
 			 */
 			if (v_daddr != LFS_UNUSED_DADDR) {
-				if (need_unlock) {
+				if (need_unlock & FVG_METAUNLOCK) {
+					LFS_READ_UNLOCK_META(vp);
+					nummetalocked--;
+				}
+				if (need_unlock & FVG_UNLOCK) {
 					VOP_UNLOCK(vp, 0);
 					numlocked--;
 				}
@@ -773,27 +800,15 @@ lfs_bmapv(struct proc *p, fsid_t *fsidp,
 			 */
 			vp = ufs_ihashlookup(ump->um_dev, blkp->bi_inode);
 			if (vp != NULL && !(vp->v_flag & VXLOCK)) {
-				ip = VTOI(vp);
 				if (lfs_vref(vp)) {
 					v_daddr = LFS_UNUSED_DADDR;
 					need_unlock = 0;
 					continue;
 				}
 				numrefed++;
-				if (VOP_ISLOCKED(vp)) {
-#ifdef DEBUG_LFS
-					printf("lfs_bmapv: inode %d inlocked\n",ip->i_number);
-#endif
-					v_daddr = LFS_UNUSED_DADDR;
-					need_unlock = 0;
-					lfs_vunref(vp);
-					--numrefed;
-					continue;
-				} else {
-					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-					need_unlock = FVG_UNLOCK;
-					numlocked++;
-				}
+				LFS_READ_LOCK_META(vp);
+				nummetalocked++;
+				need_unlock = FVG_METAUNLOCK;
 			} else {
 				error = VFS_VGET(mntp, blkp->bi_inode, &vp);
 				if (error) {
@@ -804,8 +819,10 @@ lfs_bmapv(struct proc *p, fsid_t *fsidp,
 					need_unlock = 0;
 					continue;
 				} else {
-					need_unlock = FVG_PUT;
-					numlocked++;
+					LFS_READ_LOCK_META(vp);
+					VOP_UNLOCK(vp, 0);
+					need_unlock = FVG_METAUNLOCK;
+					nummetalocked++;
 					numrefed++;
 				}
 			}
@@ -855,7 +872,11 @@ lfs_bmapv(struct proc *p, fsid_t *fsidp,
 	 * of a usable vnode in vp is signaled by a valid v_daddr.
 	 */
 	if (v_daddr != LFS_UNUSED_DADDR) {
-		if (need_unlock) {
+		if (need_unlock & FVG_METAUNLOCK) {
+			LFS_READ_UNLOCK_META(vp);
+			nummetalocked--;
+		}
+		if (need_unlock & FVG_UNLOCK) {
 			VOP_UNLOCK(vp, 0);
 			numlocked--;
 		}
@@ -863,9 +884,9 @@ lfs_bmapv(struct proc *p, fsid_t *fsidp,
 		numrefed--;
 	}
 	
-	if (numlocked != 0 || numrefed != 0) {
-		panic("lfs_bmapv: numlocked=%d numrefed=%d", numlocked,
-		      numrefed);
+	if (numlocked != 0 || numrefed != 0 || nummetalocked != 0) {
+		panic("lfs_bmapv: numlocked=%d numrefed=%d nummetalocked=%d",
+		    numlocked, numrefed, nummetalocked);
 	}
 	
 	vfs_unbusy(mntp);
@@ -1051,7 +1072,6 @@ extern struct lock ufs_hashlock;
 int
 lfs_fasthashget(dev_t dev, ino_t ino, int *need_unlock, struct vnode **vpp)
 {
-	struct inode *ip;
 
 	/*
 	 * This is playing fast and loose.  Someone may have the inode
@@ -1067,25 +1087,13 @@ lfs_fasthashget(dev_t dev, ino_t ino, in
 			return EAGAIN;
 #endif
 		}
-		ip = VTOI(*vpp);
 		if (lfs_vref(*vpp)) {
 			clean_inlocked++;
 			return EAGAIN;
 		}
-		if (VOP_ISLOCKED(*vpp)) {
-#ifdef DEBUG_LFS
-			printf("lfs_fastvget: ino %d inlocked by pid %d\n",
-			       ip->i_number, (*vpp)->v_lock.lk_lockholder);
-#endif
-			clean_inlocked++;
-#ifdef LFS_EAGAIN_FAIL
-			lfs_vunref(*vpp);
-			return EAGAIN;
-#endif /* LFS_EAGAIN_FAIL */
-		} else {
-			vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
-			*need_unlock |= FVG_UNLOCK;
-		}
+		KASSERT((*need_unlock & FVG_METAUNLOCK) == 0);
+		LFS_READ_LOCK_META(*vpp);
+		*need_unlock |= FVG_METAUNLOCK;
 	} else
 		*vpp = NULL;
 
@@ -1242,7 +1250,11 @@ lfs_fastvget(struct mount *mp, ino_t ino
 	ip->i_devvp = ump->um_devvp;
 	VREF(ip->i_devvp);
 	*vpp = vp;
-	*need_unlock |= FVG_PUT;
+	LFS_READ_LOCK_META(vp);
+	*need_unlock |= FVG_METAUNLOCK;
+
+	KASSERT(VOP_ISLOCKED(vp));
+	*need_unlock |= FVG_UNLOCK;
 
 	uvm_vnp_setsize(vp, ip->i_ffs_size);
 
@@ -1283,6 +1295,8 @@ lfs_fakebuf(struct lfs *fs, struct vnode
 	bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size);
 	error = copyin(uaddr, bp->b_data, size);
 	if (error) {
+		panic("lfs_newbuf: copyin failed: uaddr=%p, error=%d\n",
+		    (void *)uaddr, error);
 		lfs_freebuf(bp);
 		return NULL;
 	}
Index: lfs/lfs_vnops.c
===================================================================
RCS file: /cvs/NetBSD/syssrc/sys/ufs/lfs/lfs_vnops.c,v
retrieving revision 1.69
diff -u -p -r1.69 lfs_vnops.c
--- lfs/lfs_vnops.c	2002/11/24 08:23:41	1.69
+++ lfs/lfs_vnops.c	2002/12/15 13:25:30
@@ -138,7 +138,7 @@ const struct vnodeopv_entry_desc lfs_vno
 	{ &vop_lock_desc, ufs_lock },			/* lock */
 	{ &vop_unlock_desc, ufs_unlock },		/* unlock */
 	{ &vop_bmap_desc, ufs_bmap },			/* bmap */
-	{ &vop_strategy_desc, ufs_strategy },		/* strategy */
+	{ &vop_strategy_desc, lfs_strategy },		/* strategy */
 	{ &vop_print_desc, ufs_print },			/* print */
 	{ &vop_islocked_desc, ufs_islocked },		/* islocked */
 	{ &vop_pathconf_desc, ufs_pathconf },		/* pathconf */
@@ -339,6 +339,16 @@ lfs_inactive(void *v)
 	} */ *ap = v;
 	struct inode *ip = VTOI(ap->a_vp);
 
+	/*
+	 * wait until meta locks go away.
+	 */
+	while (ip->i_lfs_writecount || ip->i_lfs_readcount) {
+		printf("lfs_inactive: waiting for vp=%p, ino=%d\n",
+		    ap->a_vp, ip->i_number);
+		ltsleep(&ip->i_lfs_writecount, PVFS, "lfs_inactive", 0, 0);
+	}
+	KASSERT(ip->i_lfs_writerpid == NO_PID);
+
 	if (ip->i_flag & IN_ADIROP)
 		--ip->i_lfs->lfs_nadirop;
 	ip->i_flag &= ~IN_ADIROP;
@@ -367,6 +377,8 @@ lfs_set_dirop(struct vnode *vp)
 	struct lfs *fs;
 	int error;
 
+	LFS_ASSERT_UNLOCK_META(vp);
+
 	fs = VTOI(vp)->i_lfs;
 	/*
 	 * We might need one directory block plus supporting indirect blocks,
@@ -977,4 +989,104 @@ lfs_putpages(void *v)
 
 	error = genfs_putpages(v);
 	return error;
+}
+
+static void
+lfs_strategy_iodone(struct buf *bp)
+{
+	struct vnode *vp = bp->b_vp;
+
+	KASSERT(!(bp->b_flags & B_CALL));
+	KASSERT(bp->b_flags & B_BUSY);
+	KASSERT(vp->v_tag == VT_LFS);
+
+	LFS_READ_UNLOCK_META(vp);
+
+	/* followings are copied from biodone */
+	if (bp->b_flags & B_ASYNC)	/* if async, release */
+		brelse(bp);
+	else {				/* or just wakeup the buffer */
+		bp->b_flags &= ~B_WANTED;
+		wakeup(bp);
+	}
+}
+
+/*
+ * Calculate the logical to physical mapping if not done already,
+ * then call the device strategy routine.
+ */
+int
+lfs_strategy(void *v)
+{
+	struct vop_strategy_args /* {
+		struct buf *a_bp;
+	} */ *ap = v;
+	struct buf	*bp;
+	struct vnode	*vp;
+	struct inode	*ip;
+	int		error;
+	int needunlock;
+
+	bp = ap->a_bp;
+	vp = bp->b_vp;
+	ip = VTOI(vp);
+	if (vp->v_type == VBLK || vp->v_type == VCHR)
+		panic("lfs_strategy: spec");
+	KASSERT(bp->b_bcount != 0);
+
+	/*
+	 * lfs uses strategy routine only for read.
+	 */
+	KASSERT(bp->b_flags & B_READ);
+
+	if (LFS_OWN_LOCK_META(vp)) {
+		KASSERT(!(bp->b_flags & B_ASYNC));
+		/*
+		 * if we're holding write lock,
+		 * no need to acquire read lock.
+		 * XXX ugly
+		 */
+		needunlock = 0;
+	}
+	else {
+		LFS_READ_LOCK_META(vp);
+		needunlock = 1;
+	}
+	if (bp->b_lblkno == bp->b_blkno) {
+		error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
+		if (error) {
+			bp->b_error = error;
+			bp->b_flags |= B_ERROR;
+			goto done;
+		}
+	}
+	if ((long)bp->b_blkno == -1) /* no valid data */
+		clrbuf(bp);
+	if ((long)bp->b_blkno < 0) { /* block is not on disk */
+		error = 0;
+		goto done;
+	}
+	if (needunlock) {
+		/*
+		 * aiodone routine will unlock metadata.
+		 */
+		KASSERT((bp->b_flags & B_CALL) == 0);
+		bp->b_flags |= B_CALL;
+		bp->b_iodone = lfs_strategy_iodone;
+		/*
+		 * pass lock to aiodoned.
+		 */
+		LFS_PASS_METALOCK_TO_AIODONED(vp);
+	}
+	vp = ip->i_devvp;
+	bp->b_dev = vp->v_rdev;
+	VOCALL (vp->v_op, VOFFSET(vop_strategy), ap);
+	return (0);
+
+done:
+	if (needunlock) {
+		LFS_READ_UNLOCK_META(vp);
+	}
+	biodone(bp);
+	return (error);
 }

--NextPart-20021215224130-0409300--