Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys convert the (still disabled) 'direct write' for pipes to...



details:   https://anonhg.NetBSD.org/src/rev/1547e0edd6ef
branches:  trunk
changeset: 362440:1547e0edd6ef
user:      jdolecek <jdolecek%NetBSD.org@localhost>
date:      Sun Jun 10 17:54:51 2018 +0000

description:
convert the (still disabled) 'direct write' for pipes to use the
experimental PMAP_DIRECT if available; the direct code paths now survive
longer than the pmap_enter() variant, but still trigger a panic during
a build.sh tools run; remove some obsolete sysctls

add some XXXs to mark places which need attention to make this more stable

Note: the loan case is now actually significantly slower than the
non-loan case on MP systems, due to the synchronous IPIs triggered when
uvm_loan() marks the page read-only; this is being discussed
in the email thread
https://mail-index.netbsd.org/tech-kern/2018/05/21/msg023441.html

this is basically the same issue that caused loaning to be disabled
for sosend()

diffstat:

 sys/kern/sys_pipe.c |  180 ++++++++++++++++++++++-----------------------------
 sys/sys/pipe.h      |   15 ++--
 2 files changed, 84 insertions(+), 111 deletions(-)

diffs (truncated from 374 to 300 lines):

diff -r b4c363728414 -r 1547e0edd6ef sys/kern/sys_pipe.c
--- a/sys/kern/sys_pipe.c       Sun Jun 10 17:52:20 2018 +0000
+++ b/sys/kern/sys_pipe.c       Sun Jun 10 17:54:51 2018 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: sys_pipe.c,v 1.145 2018/05/19 11:39:37 jdolecek Exp $  */
+/*     $NetBSD: sys_pipe.c,v 1.146 2018/06/10 17:54:51 jdolecek Exp $  */
 
 /*-
  * Copyright (c) 2003, 2007, 2008, 2009 The NetBSD Foundation, Inc.
@@ -68,7 +68,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.145 2018/05/19 11:39:37 jdolecek Exp $");
+__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.146 2018/06/10 17:54:51 jdolecek Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -102,6 +102,12 @@
 
 #ifndef PIPE_NODIRECT
 #include <uvm/uvm.h>
+
+#if !defined(PMAP_DIRECT)
+#  define PIPE_NODIRECT                /* Direct map interface not available */
+#endif
+
+bool pipe_direct = true;
 #endif
 
 static int     pipe_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
@@ -136,20 +142,6 @@
 #define        MAXPIPESIZE     (2 * PIPE_SIZE / 3)
 
 /*
- * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
- * is there so that on large systems, we don't exhaust it.
- */
-#define        MAXPIPEKVA      (8 * 1024 * 1024)
-static u_int   maxpipekva = MAXPIPEKVA;
-
-/*
- * Limit for direct transfers, we cannot, of course limit
- * the amount of kva for pipes in general though.
- */
-#define        LIMITPIPEKVA    (16 * 1024 * 1024)
-static u_int   limitpipekva = LIMITPIPEKVA;
-
-/*
  * Limit the number of "big" pipes
  */
 #define        LIMITBIGPIPES   32
@@ -177,6 +169,7 @@
 #ifndef PIPE_NODIRECT
 static int     pipe_loan_alloc(struct pipe *, int);
 static void    pipe_loan_free(struct pipe *);
+static int     pipe_direct_process_read(void *, size_t, void *);
 #endif /* PIPE_NODIRECT */
 
 static pool_cache_t    pipe_wr_cache;
@@ -446,6 +439,16 @@
        fownsignal(sigp->pipe_pgid, SIGIO, code, band, selp);
 }
 
+#ifndef PIPE_NODIRECT
+static int
+pipe_direct_process_read(void *va, size_t len, void *arg)
+{
+       struct uio *uio = (struct uio *)arg;
+
+       return uiomove(va, len, uio);
+}
+#endif
+
 static int
 pipe_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
     int flags)
@@ -507,30 +510,39 @@
 #ifndef PIPE_NODIRECT
                if ((rpipe->pipe_state & PIPE_DIRECTR) != 0) {
                        struct pipemapping * const rmap = &rpipe->pipe_map;
+                       voff_t pgoff;
+                       u_int pgst, npages;
+
                        /*
                         * Direct copy, bypassing a kernel buffer.
                         */
-                       void *va;
-
                        KASSERT(rpipe->pipe_state & PIPE_DIRECTW);
 
-                       size = rmap->cnt;
-                       if (size > uio->uio_resid)
-                               size = uio->uio_resid;
+                       size = MIN(rmap->cnt, uio->uio_resid);
+
+                       if (size > 0) {
+                               KASSERT(size > 0);
+                               mutex_exit(lock);
 
-                       va = (char *)rmap->kva + rmap->pos;
-                       mutex_exit(lock);
-                       error = uiomove(va, size, uio);
-                       mutex_enter(lock);
-                       if (error)
-                               break;
-                       nread += size;
-                       rmap->pos += size;
-                       rmap->cnt -= size;
+                               pgst = rmap->pos >> PAGE_SHIFT;
+                               pgoff = rmap->pos & PAGE_MASK;
+                               npages = (size + pgoff + PAGE_SIZE - 1) >> PAGE_SHIFT;
+                               KASSERTMSG(npages > 0 && (pgst + npages) <= rmap->npages, "npages %u pgst %u rmap->npages %u", npages, pgst, rmap->npages);
+                               
+                               error = uvm_direct_process(&rmap->pgs[pgst], npages,
+                                   pgoff, size, pipe_direct_process_read, uio);
+                               mutex_enter(lock);
+
+                               nread += size;
+                               rmap->pos += size;
+                               rmap->cnt -= size;
+                       }
+
                        if (rmap->cnt == 0) {
                                rpipe->pipe_state &= ~PIPE_DIRECTR;
                                cv_broadcast(&rpipe->pipe_wcv);
                        }
+
                        continue;
                }
 #endif
@@ -630,18 +642,20 @@
 pipe_loan_alloc(struct pipe *wpipe, int npages)
 {
        struct pipemapping * const wmap = &wpipe->pipe_map;
-       const vsize_t len = ptoa(npages);
+
+       KASSERT(wmap->npages == 0);
 
-       atomic_add_int(&amountpipekva, len);
-       wmap->kva = uvm_km_alloc(kernel_map, len, 0,
-           UVM_KMF_COLORMATCH | UVM_KMF_VAONLY | UVM_KMF_WAITVA);
-       if (wmap->kva == 0) {
-               atomic_add_int(&amountpipekva, -len);
-               return (ENOMEM);
+       if (npages > wmap->maxpages) {
+               pipe_loan_free(wpipe);
+
+               wmap->pgs = kmem_alloc(npages * sizeof(struct vm_page *), KM_NOSLEEP);
+               if (wmap->pgs == NULL)
+                       return ENOMEM;
+               wmap->maxpages = npages;
        }
 
        wmap->npages = npages;
-       wmap->pgs = kmem_alloc(npages * sizeof(struct vm_page *), KM_SLEEP);
+
        return (0);
 }
 
@@ -652,18 +666,16 @@
 pipe_loan_free(struct pipe *wpipe)
 {
        struct pipemapping * const wmap = &wpipe->pipe_map;
-       const vsize_t len = ptoa(wmap->npages);
 
-       uvm_km_free(kernel_map, wmap->kva, len, UVM_KMF_VAONLY);
-       wmap->kva = 0;
-       atomic_add_int(&amountpipekva, -len);
-       kmem_free(wmap->pgs, wmap->npages * sizeof(struct vm_page *));
-       wmap->pgs = NULL;
-#if 0
+       if (wmap->maxpages > 0) {
+               kmem_free(wmap->pgs, wmap->maxpages * sizeof(struct vm_page *));
+               wmap->pgs = NULL;
+               wmap->maxpages = 0;
+       }
+
        wmap->npages = 0;
        wmap->pos = 0;
        wmap->cnt = 0;
-#endif
 }
 
 /*
@@ -681,20 +693,18 @@
 {
        struct pipemapping * const wmap = &wpipe->pipe_map;
        kmutex_t * const lock = wpipe->pipe_lock;
-       struct vm_page **pgs;
        vaddr_t bbase, base, bend;
        vsize_t blen, bcnt;
        int error, npages;
        voff_t bpos;
-       u_int starting_color;
 
-       KASSERT(mutex_owned(wpipe->pipe_lock));
+       KASSERT(mutex_owned(lock));
        KASSERT(wmap->cnt == 0);
 
        mutex_exit(lock);
 
        /*
-        * Handle first PIPE_CHUNK_SIZE bytes of buffer. Deal with buffers
+        * Handle first PIPE_DIRECT_CHUNK bytes of buffer. Deal with buffers
         * not aligned to PAGE_SIZE.
         */
        bbase = (vaddr_t)uio->uio_iov->iov_base;
@@ -711,43 +721,28 @@
                bcnt = uio->uio_iov->iov_len;
        }
        npages = atop(blen);
-       starting_color = atop(base) & uvmexp.colormask;
+
+       KASSERT((wpipe->pipe_state & (PIPE_DIRECTW | PIPE_DIRECTR)) == 0);
+       KASSERT(wmap->npages == 0);
 
-       /*
-        * Free the old kva if we need more pages than we have
-        * allocated.
-        */
-       if (wmap->kva != 0 && starting_color + npages > wmap->npages)
-               pipe_loan_free(wpipe);
-
-       /* Allocate new kva. */
-       if (wmap->kva == 0) {
-               error = pipe_loan_alloc(wpipe, starting_color + npages);
-               if (error) {
-                       mutex_enter(lock);
-                       return (error);
-               }
+       /* Make sure page array is big enough */
+       error = pipe_loan_alloc(wpipe, npages);
+       if (error) {
+               mutex_enter(lock);
+               return (error);
        }
 
        /* Loan the write buffer memory from writer process */
-       pgs = wmap->pgs + starting_color;
        error = uvm_loan(&uio->uio_vmspace->vm_map, base, blen,
-                        pgs, UVM_LOAN_TOPAGE);
+                        wmap->pgs, UVM_LOAN_TOPAGE);
        if (error) {
                pipe_loan_free(wpipe);
                mutex_enter(lock);
                return (ENOMEM); /* so that caller fallback to ordinary write */
        }
 
-       /* Enter the loaned pages to kva */
-       vaddr_t kva = wpipe->pipe_map.kva;
-       for (int j = 0; j < npages; j++, kva += PAGE_SIZE) {
-               pmap_kenter_pa(kva, VM_PAGE_TO_PHYS(pgs[j]), VM_PROT_READ, 0);
-       }
-       pmap_update(pmap_kernel());
-
        /* Now we can put the pipe in direct write mode */
-       wmap->pos = bpos + ptoa(starting_color);
+       wmap->pos = bpos;
        wmap->cnt = bcnt;
 
        /*
@@ -783,17 +778,13 @@
 
        /* Acquire the pipe lock and cleanup */
        (void)pipelock(wpipe, false);
-       mutex_exit(lock);
 
-       if (pgs != NULL) {
-               pmap_kremove(wpipe->pipe_map.kva, blen);
-               pmap_update(pmap_kernel());
-               uvm_unloan(pgs, npages, UVM_LOAN_TOPAGE);
-       }
-       if (error || amountpipekva > maxpipekva)
-               pipe_loan_free(wpipe);
+       mutex_exit(lock);
+       /* XXX what happens if the writer process exits without waiting for reader?
+        * XXX FreeBSD does a clone in this case */
+       uvm_unloan(wmap->pgs, npages, UVM_LOAN_TOPAGE);
+       mutex_enter(lock);
 
-       mutex_enter(lock);
        if (error) {
                pipeselwakeup(wpipe, wpipe, POLL_ERR);
 
@@ -914,7 +905,7 @@
                 */
                if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
                    (fp->f_flag & FNONBLOCK) == 0 &&
-                   (wpipe->pipe_map.kva || (amountpipekva < limitpipekva))) {
+                   pipe_direct) {
                        error = pipe_direct_write(fp, wpipe, uio);
 
                        /*
@@ -1271,12 +1262,8 @@
                pipe->pipe_buffer.buffer = NULL;
        }
 #ifndef PIPE_NODIRECT
-       if (pipe->pipe_map.kva != 0) {
+       if (pipe->pipe_map.npages > 0)



Home | Main Index | Thread Index | Old Index