Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/net Make bpf MP-safe



details:   https://anonhg.NetBSD.org/src/rev/dd8a64d09d86
branches:  trunk
changeset: 351307:dd8a64d09d86
user:      ozaki-r <ozaki-r%NetBSD.org@localhost>
date:      Thu Feb 09 09:30:26 2017 +0000

description:
Make bpf MP-safe

By the change, bpf_mtap can run without any locks as long as its bpf filter
doesn't match a target packet. Pushing data to a bpf buffer still needs
a lock. Removing the lock requires big changes and it's a future work.

Another known issue is that we need to remain some obsolete variables to
avoid breaking kvm(3) users such as netstat and fstat. One problem for
MP-ification is that in order to keep statistic counters of bpf_d we need
to use atomic operations for them. Once we retire the kvm(3) users, we
should make the counters per-CPU and remove the atomic operations.

diffstat:

 sys/net/bpf.c     |  409 +++++++++++++++++++++++++++++++++++------------------
 sys/net/bpfdesc.h |   21 ++-
 sys/net/if.c      |   16 +-
 3 files changed, 287 insertions(+), 159 deletions(-)

diffs (truncated from 1235 to 300 lines):

diff -r dde81632a419 -r dd8a64d09d86 sys/net/bpf.c
--- a/sys/net/bpf.c     Thu Feb 09 08:38:25 2017 +0000
+++ b/sys/net/bpf.c     Thu Feb 09 09:30:26 2017 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: bpf.c,v 1.212 2017/02/01 08:18:33 ozaki-r Exp $        */
+/*     $NetBSD: bpf.c,v 1.213 2017/02/09 09:30:26 ozaki-r Exp $        */
 
 /*
  * Copyright (c) 1990, 1991, 1993
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: bpf.c,v 1.212 2017/02/01 08:18:33 ozaki-r Exp $");
+__KERNEL_RCSID(0, "$NetBSD: bpf.c,v 1.213 2017/02/09 09:30:26 ozaki-r Exp $");
 
 #if defined(_KERNEL_OPT)
 #include "opt_bpf.h"
@@ -77,6 +77,8 @@
 #include <sys/kauth.h>
 #include <sys/syslog.h>
 #include <sys/percpu.h>
+#include <sys/pserialize.h>
+#include <sys/lwp.h>
 
 #include <net/if.h>
 #include <net/slip.h>
@@ -132,11 +134,48 @@
        }
 
 /*
+ * Locking notes:
+ * - bpf_mtx (adaptive mutex) protects:
+ *   - Gobal lists: bpf_iflist and bpf_dlist
+ *   - struct bpf_if
+ *   - bpf_close
+ *   - bpf_psz (pserialize)
+ * - struct bpf_d has two mutexes:
+ *   - bd_buf_mtx (spin mutex) protects the buffers that can be accessed
+ *     on packet tapping
+ *   - bd_mtx (adaptive mutex) protects member variables other than the buffers
+ * - Locking order: bpf_mtx => bpf_d#bd_mtx => bpf_d#bd_buf_mtx
+ * - struct bpf_d obtained via fp->f_bpf in bpf_read and bpf_write is
+ *   never freed because struct bpf_d is only freed in bpf_close and
+ *   bpf_close never be called while executing bpf_read and bpf_write
+ * - A filter that is assigned to bpf_d can be replaced with another filter
+ *   while tapping packets, so it needs to be done atomically
+ * - struct bpf_d is iterated on bpf_dlist with psz
+ * - struct bpf_if is iterated on bpf_iflist with psz or psref
+ */
+/*
  * Use a mutex to avoid a race condition between gathering the stats/peers
  * and opening/closing the device.
  */
 static kmutex_t bpf_mtx;
 
+static struct psref_class      *bpf_psref_class __read_mostly;
+static pserialize_t            bpf_psz;
+
+static inline void
+bpf_if_acquire(struct bpf_if *bp, struct psref *psref)
+{
+
+       psref_acquire(psref, &bp->bif_psref, bpf_psref_class);
+}
+
+static inline void
+bpf_if_release(struct bpf_if *bp, struct psref *psref)
+{
+
+       psref_release(psref, &bp->bif_psref, bpf_psref_class);
+}
+
 /*
  *  bpf_iflist is the list of interfaces; each corresponds to an ifnet
  *  bpf_dtab holds the descriptors, indexed by minor device #
@@ -201,6 +240,7 @@
                            void *(*cpfn)(void *, const void *, size_t),
                            void *, u_int, u_int, const bool);
 static void    bpf_freed(struct bpf_d *);
+static void    bpf_free_filter(struct bpf_filter *);
 static void    bpf_ifname(struct ifnet *, struct ifreq *);
 static void    *bpf_mcpy(void *, const void *, size_t);
 static int     bpf_movein(struct uio *, int, uint64_t,
@@ -404,7 +444,9 @@
 static void
 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
 {
+
        KASSERT(mutex_owned(&bpf_mtx));
+       KASSERT(mutex_owned(d->bd_mtx));
        /*
         * Point d at bp, and add d to the interface's list of listeners.
         * Finally, point the driver's bpf cookie at the interface so
@@ -425,6 +467,7 @@
        struct bpf_if *bp;
 
        KASSERT(mutex_owned(&bpf_mtx));
+       KASSERT(mutex_owned(d->bd_mtx));
 
        bp = d->bd_bif;
        /*
@@ -442,7 +485,13 @@
                 * the interface was configured down, so only panic
                 * if we don't get an unexpected error.
                 */
+#ifndef NET_MPSAFE
+               KERNEL_LOCK(1, NULL);
+#endif
                error = ifpromisc(bp->bif_ifp, 0);
+#ifndef NET_MPSAFE
+               KERNEL_UNLOCK_ONE(NULL);
+#endif
 #ifdef DIAGNOSTIC
                if (error)
                        printf("%s: ifpromisc failed: %d", __func__, error);
@@ -452,11 +501,8 @@
        /* Remove d from the interface's descriptor list. */
        BPFIF_DLIST_WRITER_REMOVE(d);
 
-       /* TODO pserialize_perform(); */
-       /* TODO psref_target_destroy(); */
-       BPFIF_DLIST_ENTRY_DESTROY(d);
+       pserialize_perform(bpf_psz);
 
-       /* XXX NOMPSAFE? */
        if (BPFIF_DLIST_WRITER_EMPTY(bp)) {
                /*
                 * Let the driver know that there are no more listeners.
@@ -471,6 +517,8 @@
 {
 
        mutex_init(&bpf_mtx, MUTEX_DEFAULT, IPL_NONE);
+       bpf_psz = pserialize_create();
+       bpf_psref_class = psref_class_create("bpf", IPL_SOFTNET);
 
        PSLIST_INIT(&bpf_iflist);
        PSLIST_INIT(&bpf_dlist);
@@ -521,9 +569,11 @@
        selinit(&d->bd_sel);
        d->bd_sih = softint_establish(SOFTINT_CLOCK, bpf_softintr, d);
        d->bd_jitcode = NULL;
+       d->bd_filter = NULL;
        BPF_DLIST_ENTRY_INIT(d);
        BPFIF_DLIST_ENTRY_INIT(d);
-       d->bd_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NET);
+       d->bd_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SOFTNET);
+       d->bd_buf_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NET);
        cv_init(&d->bd_cv, "bpf");
 
        mutex_enter(&bpf_mtx);
@@ -542,14 +592,11 @@
 bpf_close(struct file *fp)
 {
        struct bpf_d *d;
-       int s;
 
-       KERNEL_LOCK(1, NULL);
        mutex_enter(&bpf_mtx);
 
        if ((d = fp->f_bpf) == NULL) {
                mutex_exit(&bpf_mtx);
-               KERNEL_UNLOCK_ONE(NULL);
                return 0;
        }
 
@@ -558,28 +605,28 @@
         */
        d->bd_pid = curproc->p_pid;
 
-       s = splnet();
+       mutex_enter(d->bd_mtx);
        if (d->bd_state == BPF_WAITING)
-               callout_stop(&d->bd_callout);
+               callout_halt(&d->bd_callout, d->bd_mtx);
        d->bd_state = BPF_IDLE;
        if (d->bd_bif)
                bpf_detachd(d);
-       splx(s);
-       bpf_freed(d);
+       mutex_exit(d->bd_mtx);
+
        BPF_DLIST_WRITER_REMOVE(d);
-       fp->f_bpf = NULL;
 
+       pserialize_perform(bpf_psz);
        mutex_exit(&bpf_mtx);
-       KERNEL_UNLOCK_ONE(NULL);
 
-       /* TODO pserialize_perform(); */
-       /* TODO psref_target_destroy(); */
+       BPFIF_DLIST_ENTRY_DESTROY(d);
        BPF_DLIST_ENTRY_DESTROY(d);
-
+       fp->f_bpf = NULL;
+       bpf_freed(d);
        callout_destroy(&d->bd_callout);
        seldestroy(&d->bd_sel);
        softint_disestablish(d->bd_sih);
        mutex_obj_free(d->bd_mtx);
+       mutex_obj_free(d->bd_buf_mtx);
        cv_destroy(&d->bd_cv);
 
        kmem_free(d, sizeof(*d));
@@ -608,7 +655,6 @@
        struct bpf_d *d = fp->f_bpf;
        int timed_out;
        int error;
-       int s;
 
        getnanotime(&d->bd_atime);
        /*
@@ -618,17 +664,18 @@
        if (uio->uio_resid != d->bd_bufsize)
                return (EINVAL);
 
-       KERNEL_LOCK(1, NULL);
-       s = splnet();
+       mutex_enter(d->bd_mtx);
        if (d->bd_state == BPF_WAITING)
-               callout_stop(&d->bd_callout);
+               callout_halt(&d->bd_callout, d->bd_buf_mtx);
        timed_out = (d->bd_state == BPF_TIMED_OUT);
        d->bd_state = BPF_IDLE;
+       mutex_exit(d->bd_mtx);
        /*
         * If the hold buffer is empty, then do a timed sleep, which
         * ends when the timeout expires or when enough packets
         * have arrived to fill the store buffer.
         */
+       mutex_enter(d->bd_buf_mtx);
        while (d->bd_hbuf == NULL) {
                if (fp->f_flag & FNONBLOCK) {
                        if (d->bd_slen == 0) {
@@ -649,9 +696,7 @@
                        break;
                }
 
-               mutex_enter(d->bd_mtx);
-               error = cv_timedwait_sig(&d->bd_cv, d->bd_mtx, d->bd_rtout);
-               mutex_exit(d->bd_mtx);
+               error = cv_timedwait_sig(&d->bd_cv, d->bd_buf_mtx, d->bd_rtout);
 
                if (error == EINTR || error == ERESTART)
                        goto out;
@@ -683,7 +728,7 @@
        /*
         * At this point, we know we have something in the hold slot.
         */
-       splx(s);
+       mutex_exit(d->bd_buf_mtx);
 
        /*
         * Move data from hold buffer into user space.
@@ -692,13 +737,12 @@
         */
        error = uiomove(d->bd_hbuf, d->bd_hlen, uio);
 
-       s = splnet();
+       mutex_enter(d->bd_buf_mtx);
        d->bd_fbuf = d->bd_hbuf;
        d->bd_hbuf = NULL;
        d->bd_hlen = 0;
 out:
-       splx(s);
-       KERNEL_UNLOCK_ONE(NULL);
+       mutex_exit(d->bd_buf_mtx);
        return (error);
 }
 
@@ -710,9 +754,9 @@
 bpf_wakeup(struct bpf_d *d)
 {
 
-       mutex_enter(d->bd_mtx);
+       mutex_enter(d->bd_buf_mtx);
        cv_broadcast(&d->bd_cv);
-       mutex_exit(d->bd_mtx);
+       mutex_exit(d->bd_buf_mtx);
 
        if (d->bd_async)
                softint_schedule(d->bd_sih);
@@ -733,15 +777,14 @@
 bpf_timed_out(void *arg)
 {
        struct bpf_d *d = arg;
-       int s;
 
-       s = splnet();
+       mutex_enter(d->bd_mtx);
        if (d->bd_state == BPF_WAITING) {
                d->bd_state = BPF_TIMED_OUT;
                if (d->bd_slen != 0)
                        bpf_wakeup(d);
        }
-       splx(s);
+       mutex_exit(d->bd_mtx);
 }
 
 



Home | Main Index | Thread Index | Old Index