Subject: port-next68k/16798: race condition in the ethernet code
To: None <gnats-bugs@gnats.netbsd.org>
From: None <chris@Pin.LU>
List: netbsd-bugs
Date: 05/14/2002 04:05:27
>Number: 16798
>Category: port-next68k
>Synopsis: race condition in the ethernet code
>Confidential: no
>Severity: serious
>Priority: medium
>Responsible: port-next68k-maintainer
>State: open
>Class: sw-bug
>Submitter-Id: net
>Arrival-Date: Mon May 13 19:46:00 PDT 2002
>Closed-Date:
>Last-Modified:
>Originator: Christian Limpach
>Release: current (as of May 14 2002)
>Organization:
>Environment:
System: NetBSD clapper 1.5ZC NetBSD 1.5ZC (CLAPPER) #46: Mon May 13 20:51:50 CEST 2002 root@marble:/devel/netbsd/src-current/sys/arch/next68k/compile/CLAPPER next68k
>Description:
There is a race condition in the ethernet code. Most of the network
code runs at IPL3 (splnet). mb8795_txdma_shutdown, which runs at IPL6
(spldma, since it's a handler for DMA interrupts), calls mb8795_start
which does an IF_DEQUEUE. If there's a packet being queued at the
same moment, the ifp->if_snd queue can get corrupted: ifq_head is NULL
while ifq_tail is not, this blocks sending forever and once ifq_maxlen
packets are in the queue the upper layers will get errors but nothing
but a reboot will clear the error condition.
>How-To-Repeat:
I've been running build.sh on a next to build the toolchain and this
would usually trigger the race condition within 1 hour. Once this
happens, "xe0: No packet to start" error messages are printed and once
the ifp->if_snd is full, "nfs-timer: ignoring error 55" messages are
printed with a debug kernel.
>Fix:
I have tried 2 ways to fix this, both seem to fix the problem
equally well.
- increase splnet's level to IPL6
- mb8795_start dequeues the packets from the ifp->if_snd queue and
queues them in the new sc->sc_tx_snd queue. The sc->sc_tx_snd queue
is only accessed at IPL6 and thus safe from the mb8795_txdma_shutdown
handler. A new mb8795_start_dma is called from mb8795_start and
mb8795_txdma_shutdown to dequeue the packets from the sc->sc_tx_snd
queue and start the dma.
I've implemented the 2nd solution since I'm not too sure if raising
splnet's level is a good idea. The following patch implements the 2nd
solution.
Index: arch/next68k/dev/mb8795.c
===================================================================
RCS file: /cvsroot/syssrc/sys/arch/next68k/dev/mb8795.c,v
retrieving revision 1.24
diff -u -r1.24 mb8795.c
--- mb8795.c 2001/06/16 09:18:46 1.24
+++ mb8795.c 2002/05/14 01:12:45
@@ -124,6 +124,7 @@
void mb8795_rxdma_shutdown __P((void *));
void mb8795_txdma_shutdown __P((void *));
bus_dmamap_t mb8795_txdma_restart __P((bus_dmamap_t,void *));
+void mb8795_start_dma __P((struct ifnet *));
void
mb8795_config(sc)
@@ -551,8 +552,8 @@
nextdma_start(sc->sc_rx_nd, DMACSR_SETREAD);
- if (ifp->if_snd.ifq_head != NULL) {
- mb8795_start(ifp);
+ if (! IF_IS_EMPTY(&sc->sc_tx_snd)) {
+ mb8795_start_dma(ifp);
}
}
@@ -710,44 +711,89 @@
*/
void
mb8795_start(ifp)
- struct ifnet *ifp;
+ struct ifnet *ifp;
{
- int error;
- struct mb8795_softc *sc = ifp->if_softc;
+ struct mb8795_softc *sc = ifp->if_softc;
+ struct mbuf *m;
+ int s;
- if ((ifp->if_flags & (IFF_RUNNING | IFF_OACTIVE)) != IFF_RUNNING)
- return;
+ DPRINTF(("%s: mb8795_start()\n",sc->sc_dev.dv_xname));
+
+ while (1) {
+ if ((ifp->if_flags & (IFF_RUNNING | IFF_OACTIVE)) != IFF_RUNNING)
+ return;
+
+#if 0
+ return; /* @@@ Turn off xmit for debugging */
+#endif
+
+ ifp->if_flags |= IFF_OACTIVE;
+
+ IFQ_DEQUEUE(&ifp->if_snd, m);
+ if (m == 0) {
+ DPRINTF(("%s: No packet to start\n",
+ sc->sc_dev.dv_xname));
+ ifp->if_flags &= ~IFF_OACTIVE;
+ return;
+ }
+
+#if NBPFILTER > 0
+ /*
+ * Pass packet to bpf if there is a listener.
+ */
+ if (ifp->if_bpf)
+ bpf_mtap(ifp->if_bpf, m);
+#endif
+
+ s = spldma();
+ IF_ENQUEUE(&sc->sc_tx_snd, m);
+ if (sc->sc_tx_loaded == 0)
+ mb8795_start_dma(ifp);
+ splx(s);
+
+ ifp->if_flags &= ~IFF_OACTIVE;
+ }
+
+}
+
+void
+mb8795_start_dma(ifp)
+ struct ifnet *ifp;
+{
+ int error;
+ struct mb8795_softc *sc = ifp->if_softc;
- DPRINTF(("%s: mb8795_start()\n",sc->sc_dev.dv_xname));
+ DPRINTF(("%s: mb8795_start_dma()\n",sc->sc_dev.dv_xname));
#if (defined(DIAGNOSTIC))
- {
- u_char txstat;
- txstat = bus_space_read_1(sc->sc_bst,sc->sc_bsh, XE_TXSTAT);
- if (!(txstat & XE_TXSTAT_READY)) {
+ {
+ u_char txstat;
+ txstat = bus_space_read_1(sc->sc_bst,sc->sc_bsh, XE_TXSTAT);
+ if (!(txstat & XE_TXSTAT_READY)) {
/* @@@ I used to panic here, but then it paniced once.
* Let's see if I can just reset instead. [ dbj 980706.1900 ]
*/
- printf("%s: transmitter not ready\n", sc->sc_dev.dv_xname);
+ printf("%s: transmitter not ready\n",
+ sc->sc_dev.dv_xname);
mb8795_reset(sc);
return;
- }
- }
+ }
+ }
#endif
#if 0
return; /* @@@ Turn off xmit for debugging */
#endif
- bus_space_write_1(sc->sc_bst,sc->sc_bsh, XE_TXSTAT, XE_TXSTAT_CLEAR);
-
- IF_DEQUEUE(&ifp->if_snd, sc->sc_tx_mb_head);
- if (sc->sc_tx_mb_head == 0) {
- printf("%s: No packet to start\n",
- sc->sc_dev.dv_xname);
- return;
- }
-
+ bus_space_write_1(sc->sc_bst,sc->sc_bsh, XE_TXSTAT, XE_TXSTAT_CLEAR);
+
+ IF_DEQUEUE(&sc->sc_tx_snd, sc->sc_tx_mb_head);
+ if (sc->sc_tx_mb_head == 0) {
+ DPRINTF(("%s: No packet to start\n",
+ sc->sc_dev.dv_xname));
+ return;
+ }
+
ifp->if_timer = 5;
/* The following is a next specific hack that should
@@ -762,10 +808,10 @@
&~(DMA_ENDALIGNMENT-1)))-(s);}
#if 0
- error = bus_dmamap_load_mbuf(sc->sc_tx_dmat,
- sc->sc_tx_dmamap,
- sc->sc_tx_mb_head,
- BUS_DMA_NOWAIT);
+ error = bus_dmamap_load_mbuf(sc->sc_tx_dmat,
+ sc->sc_tx_dmamap,
+ sc->sc_tx_mb_head,
+ BUS_DMA_NOWAIT);
#else
{
u_char *buf = sc->sc_txbuf;
@@ -788,38 +834,30 @@
}
error = bus_dmamap_load(sc->sc_tx_dmat, sc->sc_tx_dmamap,
- buf,buflen,NULL,BUS_DMA_NOWAIT);
+ buf,buflen,NULL,BUS_DMA_NOWAIT);
}
#endif
- if (error) {
- printf("%s: can't load mbuf chain, error = %d\n",
- sc->sc_dev.dv_xname, error);
- m_freem(sc->sc_tx_mb_head);
- sc->sc_tx_mb_head = NULL;
- return;
- }
+ if (error) {
+ printf("%s: can't load mbuf chain, error = %d\n",
+ sc->sc_dev.dv_xname, error);
+ m_freem(sc->sc_tx_mb_head);
+ sc->sc_tx_mb_head = NULL;
+ return;
+ }
#ifdef DIAGNOSTIC
if (sc->sc_tx_loaded != 0) {
- panic("%s: sc->sc_tx_loaded is %d",sc->sc_dev.dv_xname,
- sc->sc_tx_loaded);
+ panic("%s: sc->sc_tx_loaded is %d",sc->sc_dev.dv_xname,
+ sc->sc_tx_loaded);
}
#endif
- ifp->if_flags |= IFF_OACTIVE;
-
- bus_dmamap_sync(sc->sc_tx_dmat, sc->sc_tx_dmamap, 0,
+ bus_dmamap_sync(sc->sc_tx_dmat, sc->sc_tx_dmamap, 0,
sc->sc_tx_dmamap->dm_mapsize, BUS_DMASYNC_PREWRITE);
nextdma_start(sc->sc_tx_nd, DMACSR_SETWRITE);
-
-#if NBPFILTER > 0
- /*
- * Pass packet to bpf if there is a listener.
- */
- if (ifp->if_bpf)
- bpf_mtap(ifp->if_bpf, sc->sc_tx_mb_head);
-#endif
+
+ ifp->if_opackets++;
}
@@ -880,12 +918,10 @@
}
#endif
- ifp->if_flags &= ~IFF_OACTIVE;
-
ifp->if_timer = 0;
- if (ifp->if_snd.ifq_head != NULL) {
- mb8795_start(ifp);
+ if (! IF_IS_EMPTY(&sc->sc_tx_snd)) {
+ mb8795_start_dma(ifp);
}
}
Index: arch/next68k/dev/mb8795var.h
===================================================================
RCS file: /cvsroot/syssrc/sys/arch/next68k/dev/mb8795var.h,v
retrieving revision 1.3
diff -u -r1.3 mb8795var.h
--- mb8795var.h 2001/04/02 05:29:43 1.3
+++ mb8795var.h 2002/05/14 01:12:45
@@ -55,9 +55,11 @@
struct mbuf *sc_tx_mb_head; /* pointer to data for this command */
int sc_tx_loaded;
- u_char *sc_txbuf; /* to solve alignment problems, we
- * copy the mbuf into this buffer before
- * trying to dma it */
+ u_char *sc_txbuf; /* to solve alignment problems, we
+ * copy the mbuf into this buffer before
+ * trying to dma it */
+
+ struct ifaltq sc_tx_snd;
bus_dma_tag_t sc_rx_dmat;
bus_dmamap_t sc_rx_dmamap[MB8795_NRXBUFS];
>Release-Note:
>Audit-Trail:
>Unformatted: