Subject: Re: genfs_putpages with PGO_SYNCIO
To: None <enami@sm.sony.co.jp>
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: tech-kern
Date: 01/16/2003 23:19:09
--NextPart-20030116231758-0032500
Content-Type: Text/Plain; charset=us-ascii
hi.
> > - introduce B_STABLE so that nfs doesn't have to (ab)use B_ASYNC to determine
> > if stable write is needed or not.
> > - always do async i/o for GOP_WRITE.
>
> Is an error correctly reported? I guess caller of
> VOP_{PUT,GET}PAGES() expects an error to be returned on synchronous
> I/O.
no. thanks.
how about attached one?
it introduces third level nested buf so that we can get error via B_ERROR.
YAMAMOTO Takashi
--NextPart-20030116231758-0032500
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="genfs.syncio3.diff"
Index: sys/buf.h
===================================================================
RCS file: /cvs/NetBSD/src/sys/sys/buf.h,v
retrieving revision 1.55
diff -u -p -r1.55 buf.h
--- sys/buf.h 2002/10/06 17:05:56 1.55
+++ sys/buf.h 2003/01/16 14:16:55
@@ -217,6 +217,7 @@ struct buf {
#define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */
#define B_XXX 0x02000000 /* Debugging flag. */
#define B_VFLUSH 0x04000000 /* Buffer is being synced. */
+#define B_STABLE 0x08000000 /* Data should go to stable storage */
/*
* This structure describes a clustered I/O. It is stored in the b_saveaddr
Index: miscfs/genfs/genfs_node.h
===================================================================
RCS file: /cvs/NetBSD/src/sys/miscfs/genfs/genfs_node.h,v
retrieving revision 1.3
diff -u -p -r1.3 genfs_node.h
--- miscfs/genfs/genfs_node.h 2001/12/18 07:49:36 1.3
+++ miscfs/genfs/genfs_node.h 2003/01/16 14:16:55
@@ -38,15 +38,16 @@ struct vm_page;
struct genfs_ops {
void (*gop_size)(struct vnode *, off_t, off_t *);
int (*gop_alloc)(struct vnode *, off_t, off_t, int, struct ucred *);
- int (*gop_write)(struct vnode *, struct vm_page **, int, int);
+ int (*gop_write)(struct vnode *, struct vm_page **, int, int,
+ struct buf *);
};
#define GOP_SIZE(vp, size, eobp) \
(*VTOG(vp)->g_op->gop_size)((vp), (size), (eobp))
#define GOP_ALLOC(vp, off, len, flags, cred) \
(*VTOG(vp)->g_op->gop_alloc)((vp), (off), (len), (flags), (cred))
-#define GOP_WRITE(vp, pgs, npages, flags) \
- (*VTOG(vp)->g_op->gop_write)((vp), (pgs), (npages), (flags))
+#define GOP_WRITE(vp, pgs, npages, flags, pbp) \
+ (*VTOG(vp)->g_op->gop_write)((vp), (pgs), (npages), (flags), (pbp))
struct genfs_node {
struct genfs_ops *g_op; /* ops vector */
@@ -57,7 +58,9 @@ struct genfs_node {
void genfs_size(struct vnode *, off_t, off_t *);
void genfs_node_init(struct vnode *, struct genfs_ops *);
-int genfs_gop_write(struct vnode *, struct vm_page **, int, int);
-int genfs_compat_gop_write(struct vnode *, struct vm_page **, int, int);
+int genfs_gop_write(struct vnode *, struct vm_page **, int, int,
+ struct buf *);
+int genfs_compat_gop_write(struct vnode *, struct vm_page **, int, int,
+ struct buf *);
#endif /* _MISCFS_GENFS_GENFS_NODE_H_ */
Index: miscfs/genfs/genfs_vnops.c
===================================================================
RCS file: /cvs/NetBSD/src/sys/miscfs/genfs/genfs_vnops.c,v
retrieving revision 1.68
diff -u -p -r1.68 genfs_vnops.c
--- miscfs/genfs/genfs_vnops.c 2002/11/15 14:01:57 1.68
+++ miscfs/genfs/genfs_vnops.c 2003/01/16 14:16:55
@@ -1024,6 +1024,7 @@ genfs_putpages(void *v)
struct vnode *vp = ap->a_vp;
struct uvm_object *uobj = &vp->v_uobj;
struct simplelock *slock = &uobj->vmobjlock;
+ struct buf *pbp = NULL;
off_t startoff = ap->a_offlo;
off_t endoff = ap->a_offhi;
off_t off;
@@ -1035,6 +1036,7 @@ genfs_putpages(void *v)
boolean_t wasclean, by_list, needs_clean, yield;
boolean_t async = (flags & PGO_SYNCIO) == 0;
boolean_t pagedaemon = curproc == uvm.pagedaemon_proc;
+ boolean_t metbusy = FALSE;
UVMHIST_FUNC("genfs_putpages"); UVMHIST_CALLED(ubchist);
KASSERT(flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE));
@@ -1102,18 +1104,23 @@ genfs_putpages(void *v)
KASSERT(pg == NULL ||
(pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
(pg->flags & PG_BUSY) != 0);
+
if (by_list) {
if (pg == &endmp) {
break;
}
if (pg->offset < startoff || pg->offset >= endoff ||
pg->flags & (PG_RELEASED|PG_PAGEOUT)) {
+ if (pg->flags & (PG_RELEASED|PG_PAGEOUT))
+ metbusy = TRUE;
pg = TAILQ_NEXT(pg, listq);
continue;
}
off = pg->offset;
} else if (pg == NULL ||
pg->flags & (PG_RELEASED|PG_PAGEOUT)) {
+ if (pg != NULL)
+ metbusy = TRUE;
off += PAGE_SIZE;
if (off < endoff) {
pg = uvm_pagelookup(uobj, off);
@@ -1275,13 +1282,33 @@ genfs_putpages(void *v)
* start the i/o. if we're traversing by list,
* keep our place in the list with a marker page.
*/
+ if (!async && pbp == NULL) {
+ s = splbio();
+ vp->v_numoutput++;
+ pbp = pool_get(&bufpool, PR_WAITOK);
+ splx(s);
+ pbp->b_bufsize = 0;
+ pbp->b_data = NULL;
+ pbp->b_bcount = 0;
+ pbp->b_flags = B_BUSY|B_WRITE;
+ pbp->b_iodone = NULL;
+ pbp->b_vp = vp;
+ LIST_INIT(&pbp->b_dep);
+
+ /*
+ * set b_resid to 1 now so that i/o completion
+ * can't occur before we issue all writes.
+ * see below.
+ */
+ pbp->b_resid = 1;
+ }
if (by_list) {
TAILQ_INSERT_AFTER(&uobj->memq, pg, &curmp,
listq);
}
simple_unlock(slock);
- error = GOP_WRITE(vp, pgs, npages, flags);
+ error = GOP_WRITE(vp, pgs, npages, flags, pbp);
simple_lock(slock);
if (by_list) {
pg = TAILQ_NEXT(&curmp, listq);
@@ -1333,22 +1360,49 @@ genfs_putpages(void *v)
LIST_REMOVE(vp, v_synclist);
}
splx(s);
- if (!wasclean && !async) {
+ if (pbp != NULL) {
+ KASSERT(!async);
+ KASSERT(!wasclean);
+
s = splbio();
- while (vp->v_numoutput != 0) {
- vp->v_flag |= VBWAIT;
- UVM_UNLOCK_AND_WAIT(&vp->v_numoutput, slock, FALSE,
- "genput2", 0);
- simple_lock(slock);
+ pbp->b_resid--;
+ /*
+ * b_resid == 0 here means all i/o has already completed.
+ */
+ if (pbp->b_resid == 0) {
+ biodone(pbp);
+ }
+
+ /*
+ * If we met pages being paged out by other threads,
+ * we should wait for all outputs.
+ */
+ if (metbusy) {
+ while (vp->v_numoutput != 0) {
+ vp->v_flag |= VBWAIT;
+ UVM_UNLOCK_AND_WAIT(&vp->v_numoutput, slock,
+ FALSE, "genput2", 0);
+ simple_lock(slock);
+ }
+ KASSERT(pbp->b_resid == 0);
}
splx(s);
}
- simple_unlock(&uobj->vmobjlock);
+ simple_unlock(slock);
+ if (pbp != NULL) {
+ KASSERT(pbp->b_resid >= 0);
+
+ error = biowait(pbp);
+ s = splbio();
+ pool_put(&bufpool, pbp);
+ splx(s);
+ }
return (error);
}
int
-genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
+genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags,
+ struct buf *pbp)
{
int s, error, run;
int fs_bshift, dev_bshift;
@@ -1361,6 +1415,7 @@ genfs_gop_write(struct vnode *vp, struct
struct vnode *devvp;
boolean_t async = (flags & PGO_SYNCIO) == 0;
UVMHIST_FUNC("genfs_gop_write"); UVMHIST_CALLED(ubchist);
+ KASSERT((async && pbp == NULL) || (!async && pbp != NULL));
UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x",
vp, pgs, npages, flags);
@@ -1392,10 +1447,19 @@ genfs_gop_write(struct vnode *vp, struct
mbp->b_bufsize = npages << PAGE_SHIFT;
mbp->b_data = (void *)kva;
mbp->b_resid = mbp->b_bcount = bytes;
- mbp->b_flags = B_BUSY|B_WRITE|B_AGE| (async ? (B_CALL|B_ASYNC) : 0);
- mbp->b_iodone = uvm_aio_biodone;
+ mbp->b_flags = B_BUSY|B_WRITE|B_AGE|B_CALL|B_ASYNC;
mbp->b_vp = vp;
LIST_INIT(&mbp->b_dep);
+ if (pbp) {
+ mbp->b_iodone = uvm_aio_biodone2;
+ mbp->b_private = pbp;
+ s = splbio();
+ pbp->b_resid += mbp->b_bcount;
+ splx(s);
+ }
+ else {
+ mbp->b_iodone = uvm_aio_biodone;
+ }
bp = NULL;
for (offset = startoffset;
@@ -1432,11 +1496,11 @@ genfs_gop_write(struct vnode *vp, struct
bp->b_resid = bp->b_bcount = iobytes;
bp->b_flags = B_BUSY|B_WRITE|B_CALL|B_ASYNC;
bp->b_iodone = uvm_aio_biodone1;
+ bp->b_private = mbp;
bp->b_vp = vp;
LIST_INIT(&bp->b_dep);
}
bp->b_lblkno = 0;
- bp->b_private = mbp;
if (devvp->v_type == VBLK) {
bp->b_dev = devvp->v_rdev;
}
@@ -1444,6 +1508,9 @@ genfs_gop_write(struct vnode *vp, struct
/* adjust physical blkno for partial blocks */
bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
dev_bshift);
+ if (!async) {
+ bp->b_flags |= B_STABLE;
+ }
UVMHIST_LOG(ubchist,
"vp %p offset 0x%x bcount 0x%x blkno 0x%x",
vp, offset, bp->b_bcount, bp->b_blkno);
@@ -1466,9 +1533,10 @@ genfs_gop_write(struct vnode *vp, struct
UVMHIST_LOG(ubchist, "returning 0 (async)", 0,0,0,0);
return (0);
}
- UVMHIST_LOG(ubchist, "waiting for mbp %p", mbp,0,0,0);
- error = biowait(mbp);
- uvm_aio_aiodone(mbp);
+ /*
+ * We don't have to wait for I/O completions even for PGO_SYNCIO
+ * because genfs_putpages will wait for us.
+ */
UVMHIST_LOG(ubchist, "returning, error %d", error,0,0,0);
return (error);
}
@@ -1601,9 +1669,10 @@ genfs_compat_getpages(void *v)
return (error);
}
+/* ARGSUSED */
int
genfs_compat_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
- int flags)
+ int flags, struct buf *pbp)
{
off_t offset;
struct iovec iov;
Index: nfs/nfs_bio.c
===================================================================
RCS file: /cvs/NetBSD/src/sys/nfs/nfs_bio.c,v
retrieving revision 1.85
diff -u -p -r1.85 nfs_bio.c
--- nfs/nfs_bio.c 2002/10/29 10:15:16 1.85
+++ nfs/nfs_bio.c 2003/01/16 14:16:55
@@ -989,7 +989,7 @@ nfs_doio(bp, p)
struct vm_page *pgs[npages];
boolean_t needcommit = TRUE;
- if ((bp->b_flags & B_ASYNC) != 0 && NFS_ISV3(vp)) {
+ if ((bp->b_flags & B_STABLE) == 0 && NFS_ISV3(vp)) {
iomode = NFSV3WRITE_UNSTABLE;
} else {
iomode = NFSV3WRITE_FILESYNC;
Index: nfs/nfs_node.c
===================================================================
RCS file: /cvs/NetBSD/src/sys/nfs/nfs_node.c,v
retrieving revision 1.56
diff -u -p -r1.56 nfs_node.c
--- nfs/nfs_node.c 2002/12/01 23:02:10 1.56
+++ nfs/nfs_node.c 2003/01/16 14:16:55
@@ -79,7 +79,7 @@ extern int prtactive;
void nfs_gop_size(struct vnode *, off_t, off_t *);
int nfs_gop_alloc(struct vnode *, off_t, off_t, int, struct ucred *);
-int nfs_gop_write(struct vnode *, struct vm_page **, int, int);
+int nfs_gop_write(struct vnode *, struct vm_page **, int, int, struct buf *);
struct genfs_ops nfs_genfsops = {
nfs_gop_size,
@@ -323,12 +323,13 @@ nfs_gop_alloc(struct vnode *vp, off_t of
}
int
-nfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
+nfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags,
+ struct buf *pbp)
{
int i;
for (i = 0; i < npages; i++) {
pmap_page_protect(pgs[i], VM_PROT_READ);
}
- return genfs_gop_write(vp, pgs, npages, flags);
+ return genfs_gop_write(vp, pgs, npages, flags, pbp);
}
Index: nfs/nfsnode.h
===================================================================
RCS file: /cvs/NetBSD/src/sys/nfs/nfsnode.h,v
retrieving revision 1.37
diff -u -p -r1.37 nfsnode.h
--- nfs/nfsnode.h 2002/12/01 23:02:11 1.37
+++ nfs/nfsnode.h 2003/01/16 14:16:55
@@ -244,7 +244,8 @@ int nfs_truncate __P((void *));
int nfs_update __P((void *));
int nfs_getpages __P((void *));
int nfs_putpages __P((void *));
-int nfs_gop_write(struct vnode *, struct vm_page **, int, int);
+int nfs_gop_write(struct vnode *, struct vm_page **, int, int,
+ struct buf *);
int nfs_kqfilter __P((void *));
extern int (**nfsv2_vnodeop_p) __P((void *));
Index: uvm/uvm_extern.h
===================================================================
RCS file: /cvs/NetBSD/src/sys/uvm/uvm_extern.h,v
retrieving revision 1.75
diff -u -p -r1.75 uvm_extern.h
--- uvm/uvm_extern.h 2002/12/11 07:10:20 1.75
+++ uvm/uvm_extern.h 2003/01/16 14:16:55
@@ -660,6 +660,7 @@ void uvm_setpagesize __P((void));
/* uvm_pager.c */
void uvm_aio_biodone1 __P((struct buf *));
+void uvm_aio_biodone2 __P((struct buf *));
void uvm_aio_biodone __P((struct buf *));
void uvm_aio_aiodone __P((struct buf *));
Index: uvm/uvm_pager.c
===================================================================
RCS file: /cvs/NetBSD/src/sys/uvm/uvm_pager.c,v
retrieving revision 1.59
diff -u -p -r1.59 uvm_pager.c
--- uvm/uvm_pager.c 2002/11/09 20:09:52 1.59
+++ uvm/uvm_pager.c 2003/01/16 14:16:55
@@ -238,13 +238,13 @@ uvm_pagermapout(kva, npages)
}
/*
- * interrupt-context iodone handler for nested i/o bufs.
+ * common part of iodone handler for 'children' of nested i/o buf.
*
* => must be at splbio().
*/
void
-uvm_aio_biodone1(bp)
+uvm_aio_nestedbiodone(bp)
struct buf *bp;
{
struct buf *mbp = bp->b_private;
@@ -255,10 +255,40 @@ uvm_aio_biodone1(bp)
mbp->b_error = bp->b_error;
}
mbp->b_resid -= bp->b_bcount;
- pool_put(&bufpool, bp);
+ KASSERT(mbp->b_resid >= 0);
if (mbp->b_resid == 0) {
biodone(mbp);
}
+}
+
+/*
+ * interrupt-context iodone handler for nested i/o bufs.
+ *
+ * => must be at splbio().
+ */
+
+void
+uvm_aio_biodone1(bp)
+ struct buf *bp;
+{
+
+ uvm_aio_nestedbiodone(bp);
+ pool_put(&bufpool, bp);
+}
+
+/*
+ * interrupt-context iodone handler for middle level bufs of nested i/o.
+ *
+ * => must be at splbio().
+ */
+
+void
+uvm_aio_biodone2(bp)
+ struct buf *bp;
+{
+
+ uvm_aio_nestedbiodone(bp);
+ uvm_aio_biodone(bp);
}
/*
Index: uvm/uvm_pager.h
===================================================================
RCS file: /cvs/NetBSD/src/sys/uvm/uvm_pager.h,v
retrieving revision 1.25
diff -u -p -r1.25 uvm_pager.h
--- uvm/uvm_pager.h 2002/03/25 02:08:10 1.25
+++ uvm/uvm_pager.h 2003/01/16 14:16:55
@@ -150,6 +150,7 @@ void uvm_pager_init __P((void));
PAGER_INLINE struct vm_page *uvm_pageratop __P((vaddr_t));
vaddr_t uvm_pagermapin __P((struct vm_page **, int, int));
void uvm_pagermapout __P((vaddr_t, int));
+void uvm_aio_nestedbiodone __P((struct buf *));
/* Flags to uvm_pagermapin() */
#define UVMPAGER_MAPIN_WAITOK 0x01 /* it's okay to wait */
--NextPart-20030116231758-0032500--