Subject: nfsd page loaning
To: tech-kern@netbsd.org
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: tech-kern
Date: 04/23/2003 00:07:52
hi,
i made a patch to make nfsd use page-loaning for READ requests.
can anyone please review it?
(it includes a fix to make UBC pay attention to loaned pages.)
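
in rough terms, the new READ path in nfsrv_read() works like this
(a simplified sketch of the attached diff; error handling, locking and
the M_EXT_MAXPAGES check are left out):

	/* sketch only -- the real code is in the attached nfs.loan.diff */
	npages = (round_page(off + cnt) - trunc_page(off)) >> PAGE_SHIFT;

	/* reserve socket kva (new so_kvaalloc/so_kvafree helpers,
	   factored out of the existing sosend page-loaning code) */
	lva = so_kvaalloc(npages << PAGE_SHIFT, slp->ns_so);

	/* read via the page cache instead of VOP_READ into mbufs */
	VOP_GETPAGES(vp, trunc_page(off), pgpp, &npages, 0, VM_PROT_READ,
	    0, PGO_SYNCIO);

	/* loan the still-busy pages O->K and unbusy them
	   (new uvm_loanuobjpages, split out of uvm_loanuobj) */
	uvm_loanuobjpages(pgpp, npages);

	/* map the loaned pages read-only into the reserved kva */
	for (i = 0; i < npages; i++)
		pmap_kenter_pa(lva + (i << PAGE_SHIFT),
		    VM_PAGE_TO_PHYS(pgpp[i]), VM_PROT_READ);

	/* hand the mapping to an external mbuf; soloanfree drops the
	   loan and the kva once the reply has been sent */
	MEXTADD(m, (void *)(lva + (off & PAGE_MASK)), cnt, M_MBUF,
	    soloanfree, slp->ns_so);
	m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;

the uvm_bio.c/uvm_fault.c part is what i meant by the UBC fix: ubc_fault()
now maps loaned pages read-only where it can, and otherwise breaks the loan
with uvm_loanbreak(), which is the old loan-breaking code moved out of
uvm_fault().  the nfs_subs.c change keeps the "fiddled m_adj()" trimming
code from writing its null padding into such read-only (M_ROMAP) mbufs by
putting the padding into a small separate mbuf instead.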
thanks.
YAMAMOTO Takashi
Content-Disposition: attachment; filename="nfs.loan.diff"
Index: kern/uipc_socket.c
===================================================================
--- kern/uipc_socket.c (revision 47)
+++ kern/uipc_socket.c (working copy)
@@ -153,6 +153,48 @@
#define SOCK_LOAN_THRESH 4096
#define SOCK_LOAN_CHUNK 65536
+/* XXX socketvar.h */
+vaddr_t so_kvaalloc(vsize_t, struct socket *);
+void so_kvafree(vaddr_t, vsize_t);
+void soloanfree(struct mbuf *, caddr_t, size_t, void *);
+
+static size_t sodopendfree(struct socket *);
+
+vaddr_t
+so_kvaalloc(vsize_t len, struct socket *so)
+{
+ vaddr_t lva;
+ int s;
+
+ while (socurkva + len > somaxkva) {
+ if (sodopendfree(so))
+ continue;
+ SOSEND_COUNTER_INCR(&sosend_kvalimit);
+ s = splvm();
+ sokvawaiters++;
+ (void) tsleep(&socurkva, PVM, "sokva", 0);
+ sokvawaiters--;
+ splx(s);
+ }
+
+ lva = uvm_km_valloc_wait(kernel_map, len);
+ if (lva == 0)
+ return (0);
+ socurkva += len;
+
+ return lva;
+}
+
+void
+so_kvafree(vaddr_t sva, vsize_t len)
+{
+
+ uvm_km_free(kernel_map, sva, len);
+ socurkva -= len;
+ if (sokvawaiters)
+ wakeup(&socurkva);
+}
+
static void
sodoloanfree(struct vm_page **pgs, caddr_t buf, size_t size)
{
@@ -179,10 +221,7 @@
pmap_kremove(sva, len);
pmap_update(pmap_kernel());
uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE);
- uvm_km_free(kernel_map, sva, len);
- socurkva -= len;
- if (sokvawaiters)
- wakeup(&socurkva);
+ so_kvafree(sva, len);
}
static size_t
@@ -228,7 +267,7 @@
return (rv);
}
-static void
+void
soloanfree(struct mbuf *m, caddr_t buf, size_t size, void *arg)
{
struct socket *so = arg;
@@ -254,7 +293,7 @@
vaddr_t sva, eva;
vsize_t len;
vaddr_t lva, va;
- int npgs, s, i, error;
+ int npgs, i, error;
if (uio->uio_segflg != UIO_USERSPACE)
return (0);
@@ -272,21 +311,9 @@
/* XXX KDASSERT */
KASSERT(npgs <= M_EXT_MAXPAGES);
- while (socurkva + len > somaxkva) {
- if (sodopendfree(so))
- continue;
- SOSEND_COUNTER_INCR(&sosend_kvalimit);
- s = splvm();
- sokvawaiters++;
- (void) tsleep(&socurkva, PVM, "sokva", 0);
- sokvawaiters--;
- splx(s);
- }
-
- lva = uvm_km_valloc_wait(kernel_map, len);
+ lva = so_kvaalloc(len, so);
if (lva == 0)
- return (0);
- socurkva += len;
+ return 0;
error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, sva, len,
m->m_ext.ext_pgs, UVM_LOAN_TOPAGE);
Index: uvm/uvm_bio.c
===================================================================
--- uvm/uvm_bio.c (revision 1)
+++ uvm/uvm_bio.c (working copy)
@@ -294,22 +294,23 @@
va = ufi->orig_rvaddr;
eva = ufi->orig_rvaddr + (npages << PAGE_SHIFT);
- /*
- * for virtually-indexed, virtually-tagged caches we should avoid
- * creating writable mappings when we don't absolutely need them,
- * since the "compatible alias" trick doesn't work on such caches.
- * otherwise, we can always map the pages writable.
- */
+ UVMHIST_LOG(ubchist, "va 0x%lx eva 0x%lx", va, eva, 0, 0);
+ simple_lock(&uobj->vmobjlock);
+ uvm_lock_pageq();
+ for (i = 0; va < eva; i++, va += PAGE_SIZE) {
+ /*
+ * for virtually-indexed, virtually-tagged caches we should
+ * avoid creating writable mappings when we don't absolutely
+ * need them, since the "compatible alias" trick doesn't work
+ * on such caches. otherwise, we can always map the pages
+ * writable.
+ */
#ifdef PMAP_CACHE_VIVT
- prot = VM_PROT_READ | access_type;
+ prot = VM_PROT_READ | access_type;
#else
- prot = VM_PROT_READ | VM_PROT_WRITE;
+ prot = VM_PROT_READ | VM_PROT_WRITE;
#endif
- UVMHIST_LOG(ubchist, "va 0x%lx eva 0x%lx", va, eva, 0, 0);
- simple_lock(&uobj->vmobjlock);
- uvm_lock_pageq();
- for (i = 0; va < eva; i++, va += PAGE_SIZE) {
UVMHIST_LOG(ubchist, "pgs[%d] = %p", i, pgs[i], 0, 0);
pg = pgs[i];
@@ -324,6 +325,21 @@
uvm_pagefree(pg);
continue;
}
+ if (pg->loan_count != 0) {
+ /*
+ * avoid unneeded loan break
+ */
+ if ((access_type & VM_PROT_WRITE) == 0)
+ prot &= ~VM_PROT_WRITE;
+
+ if (prot & VM_PROT_WRITE) {
+ uvm_unlock_pageq();
+ pg = uvm_loanbreak(pg);
+ uvm_lock_pageq();
+ if (pg == NULL)
+ continue; /* will re-fault */
+ }
+ }
KASSERT(access_type == VM_PROT_READ ||
(pg->flags & PG_RDONLY) == 0);
pmap_enter(ufi->orig_map->pmap, va, VM_PAGE_TO_PHYS(pg),
@@ -517,6 +533,7 @@
KASSERT(rv);
pgs[i] = PHYS_TO_VM_PAGE(pa);
pgs[i]->flags &= ~(PG_FAKE|PG_CLEAN);
+ KASSERT(pgs[i]->loan_count == 0);
uvm_pageactivate(pgs[i]);
}
uvm_unlock_pageq();
Index: uvm/uvm_loan.c
===================================================================
--- uvm/uvm_loan.c (revision 1)
+++ uvm/uvm_loan.c (working copy)
@@ -419,6 +419,42 @@
}
/*
+ * uvm_loanuobjpages: loan pages from a uobj out (O->K)
+ *
+ * => called with uobj locked.
+ * => caller should own the pages.
+ */
+void
+uvm_loanuobjpages(pgpp, npages)
+ struct vm_page **pgpp;
+ int npages;
+{
+ int i;
+
+ for (i = 0; i < npages; i++) {
+ struct vm_page *pg = pgpp[i];
+
+ KASSERT(pg->uobject != NULL);
+ KASSERT(!(pg->flags & (PG_RELEASED|PG_PAGEOUT)));
+ LOCK_ASSERT(simple_lock_held(&pg->uobject->vmobjlock));
+ KASSERT(pg->flags & PG_BUSY);
+
+ uvm_lock_pageq();
+ if (pg->loan_count == 0) {
+ pmap_page_protect(pg, VM_PROT_READ);
+ }
+ pg->loan_count++;
+ uvm_pagedequeue(pg);
+ uvm_unlock_pageq();
+ if (pg->flags & PG_WANTED) {
+ wakeup(pg);
+ }
+ pg->flags &= ~(PG_WANTED|PG_BUSY);
+ UVM_PAGE_OWN(pg, NULL);
+ }
+}
+
+/*
* uvm_loanuobj: loan a page from a uobj out
*
* => called with map, amap, uobj locked
@@ -545,18 +581,7 @@
*/
if ((flags & UVM_LOAN_TOANON) == 0) {
- uvm_lock_pageq();
- if (pg->loan_count == 0) {
- pmap_page_protect(pg, VM_PROT_READ);
- }
- pg->loan_count++;
- uvm_pagedequeue(pg);
- uvm_unlock_pageq();
- if (pg->flags & PG_WANTED) {
- wakeup(pg);
- }
- pg->flags &= ~(PG_WANTED|PG_BUSY);
- UVM_PAGE_OWN(pg, NULL);
+ uvm_loanuobjpages(&pg, 1);
**output = pg;
(*output)++;
return (1);
@@ -905,3 +930,75 @@
TAILQ_INIT(&uvm_loanzero_object.memq);
uvm_loanzero_object.pgops = &ulz_pager;
}
+
+/*
+ * uvm_loanbreak: break loan on a uobj page
+ *
+ * => called with uobj locked
+ * => the page should be busy
+ * => return value:
+ * a newly allocated page on success, or NULL on failure
+ */
+struct vm_page *
+uvm_loanbreak(struct vm_page *uobjpage)
+{
+ struct vm_page *pg;
+ struct uvm_object *uobj = uobjpage->uobject;
+ voff_t offset;
+
+ KASSERT(uobj != NULL);
+ LOCK_ASSERT(simple_lock_held(&uobj->vmobjlock));
+ KASSERT(uobjpage->flags & PG_BUSY);
+
+ /* alloc new un-owned page */
+ pg = uvm_pagealloc(NULL, 0, NULL, 0);
+ if (pg == NULL)
+ return NULL;
+
+ /*
+ * copy the data from the old page to the new
+ * one and clear the fake/clean flags on the
+ * new page (keep it busy). force a reload
+ * of the old page by clearing it from all
+ * pmaps. then lock the page queues to
+ * rename the pages.
+ */
+
+ uvm_pagecopy(uobjpage, pg); /* old -> new */
+ pg->flags &= ~(PG_FAKE|PG_CLEAN);
+ pmap_page_protect(uobjpage, VM_PROT_NONE);
+ if (uobjpage->flags & PG_WANTED)
+ wakeup(uobjpage);
+ /* uobj still locked */
+ uobjpage->flags &= ~(PG_WANTED|PG_BUSY);
+ UVM_PAGE_OWN(uobjpage, NULL);
+
+ uvm_lock_pageq();
+ offset = uobjpage->offset;
+ uvm_pagerealloc(uobjpage, NULL, 0);
+
+ /*
+ * if the page is no longer referenced by
+ * an anon (i.e. we are breaking an O->K
+ * loan), then remove it from any pageq's.
+ */
+ if (uobjpage->uanon == NULL)
+ uvm_pagedequeue(uobjpage);
+
+ /*
+ * at this point we have absolutely no
+ * control over uobjpage
+ */
+
+ /* install new page */
+ uvm_pageactivate(pg);
+ uvm_pagerealloc(pg, uobj, offset);
+ uvm_unlock_pageq();
+
+ /*
+ * done! loan is broken and "pg" is
+ * PG_BUSY. it can now replace uobjpage.
+ */
+
+ return pg;
+}
Index: uvm/uvm_loan.h
===================================================================
--- uvm/uvm_loan.h (revision 1)
+++ uvm/uvm_loan.h (working copy)
@@ -53,6 +53,8 @@
void uvm_loan_init __P((void));
int uvm_loan __P((struct vm_map *, vaddr_t, vsize_t, void *, int));
void uvm_unloan __P((void *, int, int));
+void uvm_loanuobjpages __P((struct vm_page **, int));
+struct vm_page *uvm_loanbreak __P((struct vm_page *));
#endif /* _KERNEL */
Index: uvm/uvm_fault.c
===================================================================
--- uvm/uvm_fault.c (revision 34)
+++ uvm/uvm_fault.c (working copy)
@@ -535,7 +535,7 @@
vm_prot_t enter_prot, check_prot;
boolean_t wired, narrow, promote, locked, shadowed, wire_fault, cow_now;
int npages, nback, nforw, centeridx, error, lcv, gotpages;
- vaddr_t startva, objaddr, currva, offset;
+ vaddr_t startva, objaddr, currva;
voff_t uoff;
paddr_t pa;
struct vm_amap *amap;
@@ -1451,9 +1451,7 @@
} else {
/* write fault: must break the loan here */
- /* alloc new un-owned page */
- pg = uvm_pagealloc(NULL, 0, NULL, 0);
-
+ pg = uvm_loanbreak(uobjpage);
if (pg == NULL) {
/*
@@ -1475,52 +1473,6 @@
uvm_wait("flt_noram4");
goto ReFault;
}
-
- /*
- * copy the data from the old page to the new
- * one and clear the fake/clean flags on the
- * new page (keep it busy). force a reload
- * of the old page by clearing it from all
- * pmaps. then lock the page queues to
- * rename the pages.
- */
-
- uvm_pagecopy(uobjpage, pg); /* old -> new */
- pg->flags &= ~(PG_FAKE|PG_CLEAN);
- pmap_page_protect(uobjpage, VM_PROT_NONE);
- if (uobjpage->flags & PG_WANTED)
- wakeup(uobjpage);
- /* uobj still locked */
- uobjpage->flags &= ~(PG_WANTED|PG_BUSY);
- UVM_PAGE_OWN(uobjpage, NULL);
-
- uvm_lock_pageq();
- offset = uobjpage->offset;
- uvm_pagerealloc(uobjpage, NULL, 0);
-
- /*
- * if the page is no longer referenced by
- * an anon (i.e. we are breaking an O->K
- * loan), then remove it from any pageq's.
- */
- if (uobjpage->uanon == NULL)
- uvm_pagedequeue(uobjpage);
-
- /*
- * at this point we have absolutely no
- * control over uobjpage
- */
-
- /* install new page */
- uvm_pageactivate(pg);
- uvm_pagerealloc(pg, uobj, offset);
- uvm_unlock_pageq();
-
- /*
- * done! loan is broken and "pg" is
- * PG_BUSY. it can now replace uobjpage.
- */
-
uobjpage = pg;
}
}
Index: nfs/nfs_subs.c
===================================================================
--- nfs/nfs_subs.c (revision 46)
+++ nfs/nfs_subs.c (working copy)
@@ -2119,7 +2119,7 @@
}
/*
- * A fiddled version of m_adj() that ensures null fill to a long
+ * A fiddled version of m_adj() that ensures null fill to a 32-bit
* boundary and only trims off the back end
*/
void
@@ -2150,6 +2150,18 @@
if (m->m_len > len) {
m->m_len -= len;
if (nul > 0) {
+ if (M_ROMAP(m)) {
+ struct mbuf *n;
+
+ KDASSERT(MLEN >= nul);
+ n = m_get(M_WAIT, MT_DATA);
+ MCLAIM(n, &nfs_mowner);
+ n->m_len = nul;
+ n->m_next = m->m_next;
+ m->m_len -= nul;
+ m->m_next = n;
+ m = n;
+ }
cp = mtod(m, caddr_t)+m->m_len-nul;
for (i = 0; i < nul; i++)
*cp++ = '\0';
@@ -2168,6 +2180,18 @@
if (m->m_len >= count) {
m->m_len = count;
if (nul > 0) {
+ if (M_ROMAP(m)) {
+ struct mbuf *n;
+
+ KDASSERT(MLEN >= nul);
+ n = m_get(M_WAIT, MT_DATA);
+ MCLAIM(n, &nfs_mowner);
+ n->m_len = nul;
+ n->m_next = m->m_next;
+ m->m_len -= nul;
+ m->m_next = n;
+ m = n;
+ }
cp = mtod(m, caddr_t)+m->m_len-nul;
for (i = 0; i < nul; i++)
*cp++ = '\0';
Index: nfs/nfs_serv.c
===================================================================
--- nfs/nfs_serv.c (revision 20)
+++ nfs/nfs_serv.c (working copy)
@@ -76,6 +76,7 @@
#include <sys/kernel.h>
#include <ufs/ufs/dir.h>
+#include <uvm/uvm_loan.h>
#include <uvm/uvm_extern.h>
#include <nfs/nfsproto.h>
@@ -565,6 +566,11 @@
nfsm_srvdone;
}
+/* XXX socketvar.h */
+vaddr_t so_kvaalloc(vsize_t, struct socket *);
+void so_kvafree(vaddr_t, vsize_t);
+void soloanfree(struct mbuf *, caddr_t, size_t, void *);
+
/*
* nfs read service
*/
@@ -579,20 +585,17 @@
struct mbuf *nam = nfsd->nd_nam;
caddr_t dpos = nfsd->nd_dpos;
struct ucred *cred = &nfsd->nd_cr;
- struct iovec *iv;
- struct iovec *iv2;
struct mbuf *m;
struct nfs_fattr *fp;
u_int32_t *tl;
int32_t t1;
int i;
caddr_t bpos;
- int error = 0, rdonly, cache, cnt, len, left, siz, tlen, getret;
+ int error = 0, rdonly, cache, cnt, len, left, tlen, getret;
int v3 = (nfsd->nd_flag & ND_NFSV3);
uint32_t reqlen;
char *cp2;
struct mbuf *mb, *mreq;
- struct mbuf *m2;
struct vnode *vp;
nfsfh_t nfh;
fhandle_t *fhp;
@@ -659,6 +662,70 @@
}
len = left = cnt;
if (cnt > 0) {
+#if 1 /* XXX */
+ struct vm_page **pgpp;
+ voff_t pgoff = trunc_page(off);
+ int orignpages, npages;
+ vaddr_t lva;
+
+ npages = orignpages = (round_page(off + cnt) - pgoff)
+ >> PAGE_SHIFT;
+ KASSERT(npages <= M_EXT_MAXPAGES); /* XXX */
+
+ lva = so_kvaalloc(npages << PAGE_SHIFT, slp->ns_so);
+ if (lva == 0) {
+ /* XXX is it worth falling back to VOP_READ? */
+ error = ENOMEM;
+ goto fail;
+ }
+
+ m = m_get(M_WAIT, MT_DATA);
+ pgpp = m->m_ext.ext_pgs;
+again:
+ simple_lock(&vp->v_interlock);
+ error = VOP_GETPAGES(vp, pgoff, pgpp, &npages, 0, VM_PROT_READ,
+ 0, PGO_SYNCIO);
+ if (error == EAGAIN) {
+ tsleep(&lbolt, PVM, "nfsread", 0);
+ goto again;
+ }
+ if (error) {
+ so_kvafree(lva, orignpages << PAGE_SHIFT);
+ m_free(m);
+ goto fail;
+ }
+ KASSERT(npages == orignpages);
+
+ /* loan and unbusy pages */
+ simple_lock(&vp->v_interlock);
+ /* XXX should check PG_RELEASED here? */
+ uvm_loanuobjpages(pgpp, npages);
+ simple_unlock(&vp->v_interlock);
+
+ /* map pages */
+ for (i = 0; i < npages; i++) {
+ pmap_kenter_pa(lva + (i << PAGE_SHIFT),
+ VM_PAGE_TO_PHYS(pgpp[i]), VM_PROT_READ);
+ }
+
+ lva += off & PAGE_MASK;
+
+ MCLAIM(m, &nfs_mowner);
+ MEXTADD(m, (void *)lva, cnt, M_MBUF, soloanfree, slp->ns_so);
+ m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;
+ m->m_len = cnt;
+
+ pmap_update(pmap_kernel());
+ mb->m_next = m;
+ mb = m;
+ error = 0;
+ uiop->uio_resid = 0;
+fail:
+#else
+ struct iovec *iv;
+ struct iovec *iv2;
+ struct mbuf *m2;
+ int siz;
/*
* Generate the mbuf list with the uio_iov ref. to it.
*/
@@ -706,6 +773,7 @@
error = VOP_READ(vp, uiop, IO_NODELOCKED, cred);
off = uiop->uio_offset;
free((caddr_t)iv2, M_TEMP);
+#endif
if (error || (getret = VOP_GETATTR(vp, &va, cred, procp)) != 0){
if (!error)
error = getret;