Subject: nfsd page loaning
To: None <tech-kern@netbsd.org>
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: tech-kern
Date: 04/23/2003 00:07:52

hi,

i made a patch to make nfsd use page-loaning for READ requests.
can anyone please review it?
(it includes a fix to make UBC pay attention to loaned pages.)
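
(for reviewers who want the gist first: here is a rough sketch of what the
new nfsrv_read() path in the attached nfs_serv.c hunk does instead of the
old iovec + VOP_READ path.  it is only a paraphrase of the diff -- error
handling, the EAGAIN retry around VOP_GETPAGES, MCLAIM and the cleanup path
are left out, so don't expect it to compile as-is; all names come from the
diff below.)

	/*
	 * sketch only: reserve socket kva, get the vnode's pages,
	 * loan them out (uobj -> kernel), map them read-only and
	 * hang them off an external mbuf freed via soloanfree().
	 */
	voff_t pgoff = trunc_page(off);
	int npages = (round_page(off + cnt) - pgoff) >> PAGE_SHIFT;
	vaddr_t lva = so_kvaalloc(npages << PAGE_SHIFT, slp->ns_so);

	m = m_get(M_WAIT, MT_DATA);
	simple_lock(&vp->v_interlock);
	error = VOP_GETPAGES(vp, pgoff, m->m_ext.ext_pgs, &npages, 0,
	    VM_PROT_READ, 0, PGO_SYNCIO);	/* releases v_interlock */

	simple_lock(&vp->v_interlock);
	uvm_loanuobjpages(m->m_ext.ext_pgs, npages);	/* new; see uvm_loan.c */
	simple_unlock(&vp->v_interlock);

	for (i = 0; i < npages; i++)
		pmap_kenter_pa(lva + (i << PAGE_SHIFT),
		    VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]), VM_PROT_READ);
	pmap_update(pmap_kernel());

	MEXTADD(m, (void *)(lva + (off & PAGE_MASK)), cnt, M_MBUF,
	    soloanfree, slp->ns_so);
	m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;
	m->m_len = cnt;

the uvm side of the patch is what makes this safe: ubc_fault() now calls
the new uvm_loanbreak() (split out of uvm_fault()) before entering a
writable mapping on a loaned page, and nfsrv_adj() pads into a freshly
allocated mbuf instead of writing into an M_ROMAP mbuf that points at
loaned pages.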

thanks.

YAMAMOTO Takashi


Content-Disposition: attachment; filename="nfs.loan.diff"

Index: kern/uipc_socket.c
===================================================================
--- kern/uipc_socket.c	(revision 47)
+++ kern/uipc_socket.c	(working copy)
@@ -153,6 +153,48 @@
 #define	SOCK_LOAN_THRESH	4096
 #define	SOCK_LOAN_CHUNK		65536
 
+/* XXX socketvar.h */
+vaddr_t so_kvaalloc(vsize_t, struct socket *);
+void so_kvafree(vaddr_t, vsize_t);
+void soloanfree(struct mbuf *, caddr_t, size_t, void *);
+
+static size_t sodopendfree(struct socket *);
+
+vaddr_t
+so_kvaalloc(vsize_t len, struct socket *so)
+{
+	vaddr_t lva;
+	int s;
+
+	while (socurkva + len > somaxkva) {
+		if (sodopendfree(so))
+			continue;
+		SOSEND_COUNTER_INCR(&sosend_kvalimit);
+		s = splvm();
+		sokvawaiters++;
+		(void) tsleep(&socurkva, PVM, "sokva", 0);
+		sokvawaiters--;
+		splx(s);
+	}
+
+	lva = uvm_km_valloc_wait(kernel_map, len);
+	if (lva == 0)
+		return (0);
+	socurkva += len;
+
+	return lva;
+}
+
+void
+so_kvafree(vaddr_t sva, vsize_t len)
+{
+
+	uvm_km_free(kernel_map, sva, len);
+	socurkva -= len;
+	if (sokvawaiters)
+		wakeup(&socurkva);
+}
+
 static void
 sodoloanfree(struct vm_page **pgs, caddr_t buf, size_t size)
 {
@@ -179,10 +221,7 @@
 	pmap_kremove(sva, len);
 	pmap_update(pmap_kernel());
 	uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE);
-	uvm_km_free(kernel_map, sva, len);
-	socurkva -= len;
-	if (sokvawaiters)
-		wakeup(&socurkva);
+	so_kvafree(sva, len);
 }
 
 static size_t
@@ -228,7 +267,7 @@
 	return (rv);
 }
 
-static void
+void
 soloanfree(struct mbuf *m, caddr_t buf, size_t size, void *arg)
 {
 	struct socket *so = arg;
@@ -254,7 +293,7 @@
 	vaddr_t sva, eva;
 	vsize_t len;
 	vaddr_t lva, va;
-	int npgs, s, i, error;
+	int npgs, i, error;
 
 	if (uio->uio_segflg != UIO_USERSPACE)
 		return (0);
@@ -272,21 +311,9 @@
 	/* XXX KDASSERT */
 	KASSERT(npgs <= M_EXT_MAXPAGES);
 
-	while (socurkva + len > somaxkva) {
-		if (sodopendfree(so))
-			continue;
-		SOSEND_COUNTER_INCR(&sosend_kvalimit);
-		s = splvm();
-		sokvawaiters++;
-		(void) tsleep(&socurkva, PVM, "sokva", 0);
-		sokvawaiters--;
-		splx(s);
-	}
-
-	lva = uvm_km_valloc_wait(kernel_map, len);
+	lva = so_kvaalloc(len, so);
 	if (lva == 0)
-		return (0);
-	socurkva += len;
+		return 0;
 
 	error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, sva, len,
 	    m->m_ext.ext_pgs, UVM_LOAN_TOPAGE);
Index: uvm/uvm_bio.c
===================================================================
--- uvm/uvm_bio.c	(revision 1)
+++ uvm/uvm_bio.c	(working copy)
@@ -294,22 +294,23 @@
 	va = ufi->orig_rvaddr;
 	eva = ufi->orig_rvaddr + (npages << PAGE_SHIFT);
 
-	/*
-	 * for virtually-indexed, virtually-tagged caches we should avoid
-	 * creating writable mappings when we don't absolutely need them,
-	 * since the "compatible alias" trick doesn't work on such caches.
-	 * otherwise, we can always map the pages writable.
-	 */
+	UVMHIST_LOG(ubchist, "va 0x%lx eva 0x%lx", va, eva, 0, 0);
+	simple_lock(&uobj->vmobjlock);
+	uvm_lock_pageq();
+	for (i = 0; va < eva; i++, va += PAGE_SIZE) {
+		/*
+		 * for virtually-indexed, virtually-tagged caches we should
+		 * avoid creating writable mappings when we don't absolutely
+		 * need them, since the "compatible alias" trick doesn't work
+		 * on such caches.  otherwise, we can always map the pages
+		 * writable.
+		 */
 
 #ifdef PMAP_CACHE_VIVT
-	prot = VM_PROT_READ | access_type;
+		prot = VM_PROT_READ | access_type;
 #else
-	prot = VM_PROT_READ | VM_PROT_WRITE;
+		prot = VM_PROT_READ | VM_PROT_WRITE;
 #endif
-	UVMHIST_LOG(ubchist, "va 0x%lx eva 0x%lx", va, eva, 0, 0);
-	simple_lock(&uobj->vmobjlock);
-	uvm_lock_pageq();
-	for (i = 0; va < eva; i++, va += PAGE_SIZE) {
 		UVMHIST_LOG(ubchist, "pgs[%d] = %p", i, pgs[i], 0, 0);
 		pg = pgs[i];
 
@@ -324,6 +325,21 @@
 			uvm_pagefree(pg);
 			continue;
 		}
+		if (pg->loan_count != 0) {
+			/*
+			 * avoid unneeded loan break
+			 */
+			if ((access_type & VM_PROT_WRITE) == 0)
+				prot &= ~VM_PROT_WRITE;
+
+			if (prot & VM_PROT_WRITE) {
+				uvm_unlock_pageq();
+				pg = uvm_loanbreak(pg);
+				uvm_lock_pageq();
+				if (pg == NULL)
+					continue; /* will re-fault */
+			}
+		}
 		KASSERT(access_type == VM_PROT_READ ||
 		    (pg->flags & PG_RDONLY) == 0);
 		pmap_enter(ufi->orig_map->pmap, va, VM_PAGE_TO_PHYS(pg),
@@ -517,6 +533,7 @@
 			KASSERT(rv);
 			pgs[i] = PHYS_TO_VM_PAGE(pa);
 			pgs[i]->flags &= ~(PG_FAKE|PG_CLEAN);
+			KASSERT(pgs[i]->loan_count == 0);
 			uvm_pageactivate(pgs[i]);
 		}
 		uvm_unlock_pageq();
Index: uvm/uvm_loan.c
===================================================================
--- uvm/uvm_loan.c	(revision 1)
+++ uvm/uvm_loan.c	(working copy)
@@ -419,6 +419,42 @@
 }
 
 /*
+ * uvm_loanuobjpages: loan pages from a uobj out (O->K)
+ *
+ * => called with uobj locked.
+ * => caller should own the pages.
+ */
+void
+uvm_loanuobjpages(pgpp, npages)
+	struct vm_page **pgpp;
+	int npages;
+{
+	int i;
+
+	for (i = 0; i < npages; i++) {
+		struct vm_page *pg = pgpp[i];
+
+		KASSERT(pg->uobject != NULL);
+		KASSERT(!(pg->flags & (PG_RELEASED|PG_PAGEOUT)));
+		LOCK_ASSERT(simple_lock_held(&pg->uobject->vmobjlock));
+		KASSERT(pg->flags & PG_BUSY);
+
+		uvm_lock_pageq();
+		if (pg->loan_count == 0) {
+			pmap_page_protect(pg, VM_PROT_READ);
+		}
+		pg->loan_count++;
+		uvm_pagedequeue(pg);
+		uvm_unlock_pageq();
+		if (pg->flags & PG_WANTED) {
+			wakeup(pg);
+		}
+		pg->flags &= ~(PG_WANTED|PG_BUSY);
+		UVM_PAGE_OWN(pg, NULL);
+	}
+}
+
+/*
  * uvm_loanuobj: loan a page from a uobj out
  *
  * => called with map, amap, uobj locked
@@ -545,18 +581,7 @@
 	 */
 
 	if ((flags & UVM_LOAN_TOANON) == 0) {
-		uvm_lock_pageq();
-		if (pg->loan_count == 0) {
-			pmap_page_protect(pg, VM_PROT_READ);
-		}
-		pg->loan_count++;
-		uvm_pagedequeue(pg);
-		uvm_unlock_pageq();
-		if (pg->flags & PG_WANTED) {
-			wakeup(pg);
-		}
-		pg->flags &= ~(PG_WANTED|PG_BUSY);
-		UVM_PAGE_OWN(pg, NULL);
+		uvm_loanuobjpages(&pg, 1);
 		**output = pg;
 		(*output)++;
 		return (1);
@@ -905,3 +930,75 @@
 	TAILQ_INIT(&uvm_loanzero_object.memq);
 	uvm_loanzero_object.pgops = &ulz_pager;
 }
+
+/*
+ * uvm_loanbreak: break loan on a uobj page
+ *
+ * => called with uobj locked
+ * => the page should be busy
+ * => return value:
+ *	newly allocated page if succeeded
+ */
+struct vm_page *
+uvm_loanbreak(struct vm_page *uobjpage)
+{
+	struct vm_page *pg;
+	struct uvm_object *uobj = uobjpage->uobject;
+	voff_t offset;
+
+	KASSERT(uobj != NULL);
+	LOCK_ASSERT(simple_lock_held(&uobj->vmobjlock));
+	KASSERT(uobjpage->flags & PG_BUSY);
+
+	/* alloc new un-owned page */
+	pg = uvm_pagealloc(NULL, 0, NULL, 0);
+	if (pg == NULL)
+		return NULL;
+
+	/*
+	 * copy the data from the old page to the new
+	 * one and clear the fake/clean flags on the
+	 * new page (keep it busy).  force a reload
+	 * of the old page by clearing it from all
+	 * pmaps.  then lock the page queues to
+	 * rename the pages.
+	 */
+
+	uvm_pagecopy(uobjpage, pg);	/* old -> new */
+	pg->flags &= ~(PG_FAKE|PG_CLEAN);
+	pmap_page_protect(uobjpage, VM_PROT_NONE);
+	if (uobjpage->flags & PG_WANTED)
+		wakeup(uobjpage);
+	/* uobj still locked */
+	uobjpage->flags &= ~(PG_WANTED|PG_BUSY);
+	UVM_PAGE_OWN(uobjpage, NULL);
+
+	uvm_lock_pageq();
+	offset = uobjpage->offset;
+	uvm_pagerealloc(uobjpage, NULL, 0);
+
+	/*
+	 * if the page is no longer referenced by
+	 * an anon (i.e. we are breaking an O->K
+	 * loan), then remove it from any pageq's.
+	 */
+	if (uobjpage->uanon == NULL)
+		uvm_pagedequeue(uobjpage);
+
+	/*
+	 * at this point we have absolutely no
+	 * control over uobjpage
+	 */
+
+	/* install new page */
+	uvm_pageactivate(pg);
+	uvm_pagerealloc(pg, uobj, offset);
+	uvm_unlock_pageq();
+
+	/*
+	 * done!  loan is broken and "pg" is
+	 * PG_BUSY.   it can now replace uobjpage.
+	 */
+
+	return pg;
+}
Index: uvm/uvm_loan.h
===================================================================
--- uvm/uvm_loan.h	(revision 1)
+++ uvm/uvm_loan.h	(working copy)
@@ -53,6 +53,8 @@
 void uvm_loan_init __P((void));
 int uvm_loan __P((struct vm_map *, vaddr_t, vsize_t, void *, int));
 void uvm_unloan __P((void *, int, int));
+void uvm_loanuobjpages __P((struct vm_page **, int));
+struct vm_page *uvm_loanbreak __P((struct vm_page *));
 
 #endif /* _KERNEL */
 
Index: uvm/uvm_fault.c
===================================================================
--- uvm/uvm_fault.c	(revision 34)
+++ uvm/uvm_fault.c	(working copy)
@@ -535,7 +535,7 @@
 	vm_prot_t enter_prot, check_prot;
 	boolean_t wired, narrow, promote, locked, shadowed, wire_fault, cow_now;
 	int npages, nback, nforw, centeridx, error, lcv, gotpages;
-	vaddr_t startva, objaddr, currva, offset;
+	vaddr_t startva, objaddr, currva;
 	voff_t uoff;
 	paddr_t pa;
 	struct vm_amap *amap;
@@ -1451,9 +1451,7 @@
 			} else {
 				/* write fault: must break the loan here */
 
-				/* alloc new un-owned page */
-				pg = uvm_pagealloc(NULL, 0, NULL, 0);
-
+				pg = uvm_loanbreak(uobjpage);
 				if (pg == NULL) {
 
 					/*
@@ -1475,52 +1473,6 @@
 					uvm_wait("flt_noram4");
 					goto ReFault;
 				}
-
-				/*
-				 * copy the data from the old page to the new
-				 * one and clear the fake/clean flags on the
-				 * new page (keep it busy).  force a reload
-				 * of the old page by clearing it from all
-				 * pmaps.  then lock the page queues to
-				 * rename the pages.
-				 */
-
-				uvm_pagecopy(uobjpage, pg);	/* old -> new */
-				pg->flags &= ~(PG_FAKE|PG_CLEAN);
-				pmap_page_protect(uobjpage, VM_PROT_NONE);
-				if (uobjpage->flags & PG_WANTED)
-					wakeup(uobjpage);
-				/* uobj still locked */
-				uobjpage->flags &= ~(PG_WANTED|PG_BUSY);
-				UVM_PAGE_OWN(uobjpage, NULL);
-
-				uvm_lock_pageq();
-				offset = uobjpage->offset;
-				uvm_pagerealloc(uobjpage, NULL, 0);
-
-				/*
-				 * if the page is no longer referenced by
-				 * an anon (i.e. we are breaking an O->K
-				 * loan), then remove it from any pageq's.
-				 */
-				if (uobjpage->uanon == NULL)
-					uvm_pagedequeue(uobjpage);
-
-				/*
-				 * at this point we have absolutely no
-				 * control over uobjpage
-				 */
-
-				/* install new page */
-				uvm_pageactivate(pg);
-				uvm_pagerealloc(pg, uobj, offset);
-				uvm_unlock_pageq();
-
-				/*
-				 * done!  loan is broken and "pg" is
-				 * PG_BUSY.   it can now replace uobjpage.
-				 */
-
 				uobjpage = pg;
 			}
 		}
Index: nfs/nfs_subs.c
===================================================================
--- nfs/nfs_subs.c	(revision 46)
+++ nfs/nfs_subs.c	(working copy)
@@ -2119,7 +2119,7 @@
 }
 
 /*
- * A fiddled version of m_adj() that ensures null fill to a long
+ * A fiddled version of m_adj() that ensures null fill to a 32-bit
  * boundary and only trims off the back end
  */
 void
@@ -2150,6 +2150,18 @@
 	if (m->m_len > len) {
 		m->m_len -= len;
 		if (nul > 0) {
+			if (M_ROMAP(m)) {
+				struct mbuf *n;
+
+				KDASSERT(MLEN >= nul);
+				n = m_get(M_WAIT, MT_DATA);
+				MCLAIM(n, &nfs_mowner);
+				n->m_len = nul;
+				n->m_next = m->m_next;
+				m->m_len -= nul;
+				m->m_next = n;
+				m = n;
+			}
 			cp = mtod(m, caddr_t)+m->m_len-nul;
 			for (i = 0; i < nul; i++)
 				*cp++ = '\0';
@@ -2168,6 +2180,18 @@
 		if (m->m_len >= count) {
 			m->m_len = count;
 			if (nul > 0) {
+				if (M_ROMAP(m)) {
+					struct mbuf *n;
+
+					KDASSERT(MLEN >= nul);
+					n = m_get(M_WAIT, MT_DATA);
+					MCLAIM(n, &nfs_mowner);
+					n->m_len = nul;
+					n->m_next = m->m_next;
+					m->m_len -= nul;
+					m->m_next = n;
+					m = n;
+				}
 				cp = mtod(m, caddr_t)+m->m_len-nul;
 				for (i = 0; i < nul; i++)
 					*cp++ = '\0';
Index: nfs/nfs_serv.c
===================================================================
--- nfs/nfs_serv.c	(revision 20)
+++ nfs/nfs_serv.c	(working copy)
@@ -76,6 +76,7 @@
 #include <sys/kernel.h>
 #include <ufs/ufs/dir.h>
 
+#include <uvm/uvm_loan.h>
 #include <uvm/uvm_extern.h>
 
 #include <nfs/nfsproto.h>
@@ -565,6 +566,11 @@
 	nfsm_srvdone;
 }
 
+/* XXX socketvar.h */
+vaddr_t so_kvaalloc(vsize_t, struct socket *);
+void so_kvafree(vaddr_t, vsize_t);
+void soloanfree(struct mbuf *, caddr_t, size_t, void *);
+
 /*
  * nfs read service
  */
@@ -579,20 +585,17 @@
 	struct mbuf *nam = nfsd->nd_nam;
 	caddr_t dpos = nfsd->nd_dpos;
 	struct ucred *cred = &nfsd->nd_cr;
-	struct iovec *iv;
-	struct iovec *iv2;
 	struct mbuf *m;
 	struct nfs_fattr *fp;
 	u_int32_t *tl;
 	int32_t t1;
 	int i;
 	caddr_t bpos;
-	int error = 0, rdonly, cache, cnt, len, left, siz, tlen, getret;
+	int error = 0, rdonly, cache, cnt, len, left, tlen, getret;
 	int v3 = (nfsd->nd_flag & ND_NFSV3);
 	uint32_t reqlen;
 	char *cp2;
 	struct mbuf *mb, *mreq;
-	struct mbuf *m2;
 	struct vnode *vp;
 	nfsfh_t nfh;
 	fhandle_t *fhp;
@@ -659,6 +662,70 @@
 	}
 	len = left = cnt;
 	if (cnt > 0) {
+#if 1 /* XXX */
+		struct vm_page **pgpp;
+		voff_t pgoff = trunc_page(off);
+		int orignpages, npages;
+		vaddr_t lva;
+
+		npages = orignpages = (round_page(off + cnt) - pgoff)
+		    >> PAGE_SHIFT;
+		KASSERT(npages <= M_EXT_MAXPAGES); /* XXX */
+
+		lva = so_kvaalloc(npages << PAGE_SHIFT, slp->ns_so);
+		if (lva == 0) {
+			/* XXX is it worth falling back to VOP_READ? */
+			error = ENOMEM;
+			goto fail;
+		}
+
+		m = m_get(M_WAIT, MT_DATA);
+		pgpp = m->m_ext.ext_pgs;
+again:
+		simple_lock(&vp->v_interlock);
+		error = VOP_GETPAGES(vp, pgoff, pgpp, &npages, 0, VM_PROT_READ,
+		    0, PGO_SYNCIO);
+		if (error == EAGAIN) {
+			tsleep(&lbolt, PVM, "nfsread", 0);
+			goto again;
+		}
+		if (error) {
+			so_kvafree(lva, orignpages << PAGE_SHIFT);
+			m_free(m);
+			goto fail;
+		}
+		KASSERT(npages == orignpages);
+
+		/* loan and unbusy pages */
+		simple_lock(&vp->v_interlock);
+		/* XXX should check PG_RELEASED here? */
+		uvm_loanuobjpages(pgpp, npages);
+		simple_unlock(&vp->v_interlock);
+
+		/* map pages */
+		for (i = 0; i < npages; i++) {
+			pmap_kenter_pa(lva + (i << PAGE_SHIFT),
+			    VM_PAGE_TO_PHYS(pgpp[i]), VM_PROT_READ);
+		}
+
+		lva += off & PAGE_MASK;
+
+		MCLAIM(m, &nfs_mowner);
+		MEXTADD(m, (void *)lva, cnt, M_MBUF, soloanfree, slp->ns_so);
+		m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;
+		m->m_len = cnt;
+
+		pmap_update(pmap_kernel());
+		mb->m_next = m;
+		mb = m;
+		error = 0;
+		uiop->uio_resid = 0;
+fail:
+#else
+		struct iovec *iv;
+		struct iovec *iv2;
+		struct mbuf *m2;
+		int siz;
 		/*
 		 * Generate the mbuf list with the uio_iov ref. to it.
 		 */
@@ -706,6 +773,7 @@
 		error = VOP_READ(vp, uiop, IO_NODELOCKED, cred);
 		off = uiop->uio_offset;
 		free((caddr_t)iv2, M_TEMP);
+#endif
 		if (error || (getret = VOP_GETATTR(vp, &va, cred, procp)) != 0){
 			if (!error)
 				error = getret;
