Source-Changes-HG archive
[src/trunk]: src/sys/net Make bpf MP-safe
details: https://anonhg.NetBSD.org/src/rev/dd8a64d09d86
branches: trunk
changeset: 351307:dd8a64d09d86
user: ozaki-r <ozaki-r%NetBSD.org@localhost>
date: Thu Feb 09 09:30:26 2017 +0000
description:
Make bpf MP-safe
With this change, bpf_mtap can run without any locks as long as its BPF
filter doesn't match a target packet. Pushing data to a bpf buffer still
needs a lock; removing that lock requires big changes and is left as
future work.

Another known issue is that we need to retain some obsolete variables to
avoid breaking kvm(3) users such as netstat and fstat. One problem for
MP-ification is that keeping the statistics counters of bpf_d consistent
requires atomic operations on them. Once we retire the kvm(3) users, we
should make the counters per-CPU and remove the atomic operations.
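As a minimal sketch of the interim scheme, a shared bpf_d counter can be
bumped with atomic_ops(3); the helper name below is hypothetical, for
illustration only, and is not part of the commit:

	#include <sys/atomic.h>

	/* Hypothetical helper: record one received packet on descriptor d.
	 * bd_rcount is updated from multiple CPUs, so a plain
	 * d->bd_rcount++ could lose increments; atomic_inc_ulong()
	 * keeps the counter consistent without taking a lock. */
	static inline void
	bpf_count_recv(struct bpf_d *d)
	{

		atomic_inc_ulong(&d->bd_rcount);
	}

A percpu(9) counter would avoid the cross-CPU cache-line traffic of the
atomic increment, but its per-CPU pieces are not laid out for kvm(3)
readers, hence the atomic interim until those users are retired.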
diffstat:
sys/net/bpf.c | 409 +++++++++++++++++++++++++++++++++++------------------
sys/net/bpfdesc.h | 21 ++-
sys/net/if.c | 16 +-
3 files changed, 287 insertions(+), 159 deletions(-)
diffs (truncated from 1235 to 300 lines):
diff -r dde81632a419 -r dd8a64d09d86 sys/net/bpf.c
--- a/sys/net/bpf.c Thu Feb 09 08:38:25 2017 +0000
+++ b/sys/net/bpf.c Thu Feb 09 09:30:26 2017 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: bpf.c,v 1.212 2017/02/01 08:18:33 ozaki-r Exp $ */
+/* $NetBSD: bpf.c,v 1.213 2017/02/09 09:30:26 ozaki-r Exp $ */
/*
* Copyright (c) 1990, 1991, 1993
@@ -39,7 +39,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: bpf.c,v 1.212 2017/02/01 08:18:33 ozaki-r Exp $");
+__KERNEL_RCSID(0, "$NetBSD: bpf.c,v 1.213 2017/02/09 09:30:26 ozaki-r Exp $");
#if defined(_KERNEL_OPT)
#include "opt_bpf.h"
@@ -77,6 +77,8 @@
#include <sys/kauth.h>
#include <sys/syslog.h>
#include <sys/percpu.h>
+#include <sys/pserialize.h>
+#include <sys/lwp.h>
#include <net/if.h>
#include <net/slip.h>
@@ -132,11 +134,48 @@
}
/*
+ * Locking notes:
+ * - bpf_mtx (adaptive mutex) protects:
+ * - Global lists: bpf_iflist and bpf_dlist
+ * - struct bpf_if
+ * - bpf_close
+ * - bpf_psz (pserialize)
+ * - struct bpf_d has two mutexes:
+ * - bd_buf_mtx (spin mutex) protects the buffers that can be accessed
+ * on packet tapping
+ * - bd_mtx (adaptive mutex) protects member variables other than the buffers
+ * - Locking order: bpf_mtx => bpf_d#bd_mtx => bpf_d#bd_buf_mtx
+ * - struct bpf_d obtained via fp->f_bpf in bpf_read and bpf_write is
+ * never freed because struct bpf_d is only freed in bpf_close and
+ * bpf_close is never called while bpf_read and bpf_write are executing
+ * - A filter that is assigned to bpf_d can be replaced with another filter
+ * while tapping packets, so it needs to be done atomically
+ * - struct bpf_d is iterated on bpf_dlist with psz
+ * - struct bpf_if is iterated on bpf_iflist with psz or psref
+ */
+/*
* Use a mutex to avoid a race condition between gathering the stats/peers
* and opening/closing the device.
*/
static kmutex_t bpf_mtx;
+static struct psref_class *bpf_psref_class __read_mostly;
+static pserialize_t bpf_psz;
+
+static inline void
+bpf_if_acquire(struct bpf_if *bp, struct psref *psref)
+{
+
+ psref_acquire(psref, &bp->bif_psref, bpf_psref_class);
+}
+
+static inline void
+bpf_if_release(struct bpf_if *bp, struct psref *psref)
+{
+
+ psref_release(psref, &bp->bif_psref, bpf_psref_class);
+}
+
/*
* bpf_iflist is the list of interfaces; each corresponds to an ifnet
* bpf_dtab holds the descriptors, indexed by minor device #
@@ -201,6 +240,7 @@
void *(*cpfn)(void *, const void *, size_t),
void *, u_int, u_int, const bool);
static void bpf_freed(struct bpf_d *);
+static void bpf_free_filter(struct bpf_filter *);
static void bpf_ifname(struct ifnet *, struct ifreq *);
static void *bpf_mcpy(void *, const void *, size_t);
static int bpf_movein(struct uio *, int, uint64_t,
@@ -404,7 +444,9 @@
static void
bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
{
+
KASSERT(mutex_owned(&bpf_mtx));
+ KASSERT(mutex_owned(d->bd_mtx));
/*
* Point d at bp, and add d to the interface's list of listeners.
* Finally, point the driver's bpf cookie at the interface so
@@ -425,6 +467,7 @@
struct bpf_if *bp;
KASSERT(mutex_owned(&bpf_mtx));
+ KASSERT(mutex_owned(d->bd_mtx));
bp = d->bd_bif;
/*
@@ -442,7 +485,13 @@
* the interface was configured down, so only panic
* if we don't get an unexpected error.
*/
+#ifndef NET_MPSAFE
+ KERNEL_LOCK(1, NULL);
+#endif
error = ifpromisc(bp->bif_ifp, 0);
+#ifndef NET_MPSAFE
+ KERNEL_UNLOCK_ONE(NULL);
+#endif
#ifdef DIAGNOSTIC
if (error)
printf("%s: ifpromisc failed: %d", __func__, error);
@@ -452,11 +501,8 @@
/* Remove d from the interface's descriptor list. */
BPFIF_DLIST_WRITER_REMOVE(d);
- /* TODO pserialize_perform(); */
- /* TODO psref_target_destroy(); */
- BPFIF_DLIST_ENTRY_DESTROY(d);
+ pserialize_perform(bpf_psz);
- /* XXX NOMPSAFE? */
if (BPFIF_DLIST_WRITER_EMPTY(bp)) {
/*
* Let the driver know that there are no more listeners.
@@ -471,6 +517,8 @@
{
mutex_init(&bpf_mtx, MUTEX_DEFAULT, IPL_NONE);
+ bpf_psz = pserialize_create();
+ bpf_psref_class = psref_class_create("bpf", IPL_SOFTNET);
PSLIST_INIT(&bpf_iflist);
PSLIST_INIT(&bpf_dlist);
@@ -521,9 +569,11 @@
selinit(&d->bd_sel);
d->bd_sih = softint_establish(SOFTINT_CLOCK, bpf_softintr, d);
d->bd_jitcode = NULL;
+ d->bd_filter = NULL;
BPF_DLIST_ENTRY_INIT(d);
BPFIF_DLIST_ENTRY_INIT(d);
- d->bd_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NET);
+ d->bd_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SOFTNET);
+ d->bd_buf_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NET);
cv_init(&d->bd_cv, "bpf");
mutex_enter(&bpf_mtx);
@@ -542,14 +592,11 @@
bpf_close(struct file *fp)
{
struct bpf_d *d;
- int s;
- KERNEL_LOCK(1, NULL);
mutex_enter(&bpf_mtx);
if ((d = fp->f_bpf) == NULL) {
mutex_exit(&bpf_mtx);
- KERNEL_UNLOCK_ONE(NULL);
return 0;
}
@@ -558,28 +605,28 @@
*/
d->bd_pid = curproc->p_pid;
- s = splnet();
+ mutex_enter(d->bd_mtx);
if (d->bd_state == BPF_WAITING)
- callout_stop(&d->bd_callout);
+ callout_halt(&d->bd_callout, d->bd_mtx);
d->bd_state = BPF_IDLE;
if (d->bd_bif)
bpf_detachd(d);
- splx(s);
- bpf_freed(d);
+ mutex_exit(d->bd_mtx);
+
BPF_DLIST_WRITER_REMOVE(d);
- fp->f_bpf = NULL;
+ pserialize_perform(bpf_psz);
mutex_exit(&bpf_mtx);
- KERNEL_UNLOCK_ONE(NULL);
- /* TODO pserialize_perform(); */
- /* TODO psref_target_destroy(); */
+ BPFIF_DLIST_ENTRY_DESTROY(d);
BPF_DLIST_ENTRY_DESTROY(d);
-
+ fp->f_bpf = NULL;
+ bpf_freed(d);
callout_destroy(&d->bd_callout);
seldestroy(&d->bd_sel);
softint_disestablish(d->bd_sih);
mutex_obj_free(d->bd_mtx);
+ mutex_obj_free(d->bd_buf_mtx);
cv_destroy(&d->bd_cv);
kmem_free(d, sizeof(*d));
@@ -608,7 +655,6 @@
struct bpf_d *d = fp->f_bpf;
int timed_out;
int error;
- int s;
getnanotime(&d->bd_atime);
/*
@@ -618,17 +664,18 @@
if (uio->uio_resid != d->bd_bufsize)
return (EINVAL);
- KERNEL_LOCK(1, NULL);
- s = splnet();
+ mutex_enter(d->bd_mtx);
if (d->bd_state == BPF_WAITING)
- callout_stop(&d->bd_callout);
+ callout_halt(&d->bd_callout, d->bd_buf_mtx);
timed_out = (d->bd_state == BPF_TIMED_OUT);
d->bd_state = BPF_IDLE;
+ mutex_exit(d->bd_mtx);
/*
* If the hold buffer is empty, then do a timed sleep, which
* ends when the timeout expires or when enough packets
* have arrived to fill the store buffer.
*/
+ mutex_enter(d->bd_buf_mtx);
while (d->bd_hbuf == NULL) {
if (fp->f_flag & FNONBLOCK) {
if (d->bd_slen == 0) {
@@ -649,9 +696,7 @@
break;
}
- mutex_enter(d->bd_mtx);
- error = cv_timedwait_sig(&d->bd_cv, d->bd_mtx, d->bd_rtout);
- mutex_exit(d->bd_mtx);
+ error = cv_timedwait_sig(&d->bd_cv, d->bd_buf_mtx, d->bd_rtout);
if (error == EINTR || error == ERESTART)
goto out;
@@ -683,7 +728,7 @@
/*
* At this point, we know we have something in the hold slot.
*/
- splx(s);
+ mutex_exit(d->bd_buf_mtx);
/*
* Move data from hold buffer into user space.
@@ -692,13 +737,12 @@
*/
error = uiomove(d->bd_hbuf, d->bd_hlen, uio);
- s = splnet();
+ mutex_enter(d->bd_buf_mtx);
d->bd_fbuf = d->bd_hbuf;
d->bd_hbuf = NULL;
d->bd_hlen = 0;
out:
- splx(s);
- KERNEL_UNLOCK_ONE(NULL);
+ mutex_exit(d->bd_buf_mtx);
return (error);
}
@@ -710,9 +754,9 @@
bpf_wakeup(struct bpf_d *d)
{
- mutex_enter(d->bd_mtx);
+ mutex_enter(d->bd_buf_mtx);
cv_broadcast(&d->bd_cv);
- mutex_exit(d->bd_mtx);
+ mutex_exit(d->bd_buf_mtx);
if (d->bd_async)
softint_schedule(d->bd_sih);
@@ -733,15 +777,14 @@
bpf_timed_out(void *arg)
{
struct bpf_d *d = arg;
- int s;
- s = splnet();
+ mutex_enter(d->bd_mtx);
if (d->bd_state == BPF_WAITING) {
d->bd_state = BPF_TIMED_OUT;
if (d->bd_slen != 0)
bpf_wakeup(d);
}
- splx(s);
+ mutex_exit(d->bd_mtx);
}
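The locking notes and the bpf_if_acquire()/bpf_if_release() helpers above
follow the usual pserialize(9)/psref(9) reader pattern. A minimal sketch of
how a reader might pin a bpf_if while walking bpf_iflist (illustrative
only; the foreach macro name is assumed here, standing in for the commit's
PSLIST-based accessors):

	struct bpf_if *bp;
	struct psref psref;
	int s, bound;

	bound = curlwp_bind();		/* keep the psref on this LWP */
	s = pserialize_read_enter();	/* enter read section */
	BPF_IFLIST_READER_FOREACH(bp) {
		if (bp->bif_ifp == ifp) {
			/* pin bp so it survives leaving the read section */
			bpf_if_acquire(bp, &psref);
			break;
		}
	}
	pserialize_read_exit(s);

	if (bp != NULL) {
		/* bp is safe to use here, even across a sleep */
		bpf_if_release(bp, &psref);
	}
	curlwp_bindx(bound);

pserialize_perform(bpf_psz), as called from bpf_detachd() and bpf_close()
above, waits for all such read sections to drain before the writer unlinks
or frees the object.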