Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/sys/dev/pci/ixgbe Revert new watchdog timer com...
details: https://anonhg.NetBSD.org/src/rev/8aeebf38b492
branches: trunk
changeset: 319129:8aeebf38b492
user: msaitoh <msaitoh%NetBSD.org@localhost>
date: Fri May 18 10:09:02 2018 +0000
description:
Revert the new watchdog timer commits. The new watchdog timer made stability
worse than before; it seems that some unknown problems exist.
http://mail-index.netbsd.org/source-changes/2018/05/08/msg095020.html
http://mail-index.netbsd.org/source-changes/2018/05/16/msg095240.html
diffstat:
sys/dev/pci/ixgbe/ix_txrx.c | 42 ++++++----
sys/dev/pci/ixgbe/ixgbe.c | 158 ++++++++++++++++---------------------------
sys/dev/pci/ixgbe/ixgbe.h | 10 +-
sys/dev/pci/ixgbe/ixv.c | 140 ++++++++++++++------------------------
4 files changed, 143 insertions(+), 207 deletions(-)
diffs (truncated from 578 to 300 lines):
diff -r 23f7c77afb1c -r 8aeebf38b492 sys/dev/pci/ixgbe/ix_txrx.c
--- a/sys/dev/pci/ixgbe/ix_txrx.c Fri May 18 06:39:58 2018 +0000
+++ b/sys/dev/pci/ixgbe/ix_txrx.c Fri May 18 10:09:02 2018 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: ix_txrx.c,v 1.44 2018/05/16 08:08:24 msaitoh Exp $ */
+/* $NetBSD: ix_txrx.c,v 1.45 2018/05/18 10:09:02 msaitoh Exp $ */
/******************************************************************************
@@ -130,10 +130,9 @@
int
ixgbe_legacy_start_locked(struct ifnet *ifp, struct tx_ring *txr)
{
+ int rc;
struct mbuf *m_head;
struct adapter *adapter = txr->adapter;
- int enqueued = 0;
- int rc;
IXGBE_TX_LOCK_ASSERT(txr);
@@ -159,7 +158,6 @@
if ((rc = ixgbe_xmit(txr, m_head)) == EAGAIN) {
break;
}
- enqueued++;
IFQ_DEQUEUE(&ifp->if_snd, m_head);
if (rc != 0) {
m_freem(m_head);
@@ -170,11 +168,6 @@
bpf_mtap(ifp, m_head);
}
- if (enqueued) {
- txr->lastsent = time_uptime;
- txr->sending = true;
- }
-
return IXGBE_SUCCESS;
} /* ixgbe_legacy_start_locked */
@@ -323,11 +316,6 @@
break;
}
- if (enqueued) {
- txr->lastsent = time_uptime;
- txr->sending = true;
- }
-
if (txr->tx_avail < IXGBE_TX_CLEANUP_THRESHOLD(txr->adapter))
ixgbe_txeof(txr);
@@ -552,6 +540,10 @@
if (m_head->m_flags & M_MCAST)
ifp->if_omcasts++;
+ /* Mark queue as having work */
+ if (txr->busy == 0)
+ txr->busy = 1;
+
return (0);
} /* ixgbe_xmit */
@@ -677,7 +669,6 @@
/* Free any existing tx buffers. */
txbuf = txr->tx_buffers;
for (int i = 0; i < txr->num_desc; i++, txbuf++) {
- txr->sending = false;
if (txbuf->m_head != NULL) {
bus_dmamap_sync(txr->txtag->dt_dmat, txbuf->map,
0, txbuf->m_head->m_pkthdr.len,
@@ -1138,7 +1129,7 @@
#endif /* DEV_NETMAP */
if (txr->tx_avail == txr->num_desc) {
- txr->sending = false;
+ txr->busy = 0;
return false;
}
@@ -1220,8 +1211,25 @@
work += txr->num_desc;
txr->next_to_clean = work;
+ /*
+ * Queue Hang detection, we know there's
+ * work outstanding or the first return
+ * would have been taken, so increment busy
+ * if nothing managed to get cleaned, then
+ * in local_timer it will be checked and
+ * marked as HUNG if it exceeds a MAX attempt.
+ */
+ if ((processed == 0) && (txr->busy != IXGBE_QUEUE_HUNG))
+ ++txr->busy;
+ /*
+ * If anything gets cleaned we reset state to 1,
+ * note this will turn off HUNG if its set.
+ */
+ if (processed)
+ txr->busy = 1;
+
if (txr->tx_avail == txr->num_desc)
- txr->sending = false;
+ txr->busy = 0;
return ((limit > 0) ? false : true);
} /* ixgbe_txeof */
diff -r 23f7c77afb1c -r 8aeebf38b492 sys/dev/pci/ixgbe/ixgbe.c
--- a/sys/dev/pci/ixgbe/ixgbe.c Fri May 18 06:39:58 2018 +0000
+++ b/sys/dev/pci/ixgbe/ixgbe.c Fri May 18 10:09:02 2018 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: ixgbe.c,v 1.152 2018/05/15 09:30:56 msaitoh Exp $ */
+/* $NetBSD: ixgbe.c,v 1.153 2018/05/18 10:09:02 msaitoh Exp $ */
/******************************************************************************
@@ -184,8 +184,6 @@
static void ixgbe_free_pci_resources(struct adapter *);
static void ixgbe_local_timer(void *);
static void ixgbe_local_timer1(void *);
-static void ixgbe_watchdog(struct ifnet *);
-static bool ixgbe_watchdog_txq(struct ifnet *, struct tx_ring *, bool *);
static int ixgbe_setup_interface(device_t, struct adapter *);
static void ixgbe_config_gpie(struct adapter *);
static void ixgbe_config_dmac(struct adapter *);
@@ -4293,8 +4291,11 @@
ixgbe_local_timer1(void *arg)
{
struct adapter *adapter = arg;
+ device_t dev = adapter->dev;
struct ix_queue *que = adapter->queues;
+ u64 queues = 0;
u64 v0, v1, v2, v3, v4, v5, v6, v7;
+ int hung = 0;
int i;
KASSERT(mutex_owned(&adapter->core_mtx));
@@ -4331,94 +4332,64 @@
adapter->enomem_tx_dma_setup.ev_count = v6;
adapter->tso_err.ev_count = v7;
- ixgbe_watchdog(adapter->ifp);
+ /*
+ * Check the TX queues status
+ * - mark hung queues so we don't schedule on them
+ * - watchdog only if all queues show hung
+ */
+ que = adapter->queues;
+ for (i = 0; i < adapter->num_queues; i++, que++) {
+ /* Keep track of queues with work for soft irq */
+ if (que->txr->busy)
+ queues |= ((u64)1 << que->me);
+ /*
+ * Each time txeof runs without cleaning, but there
+ * are uncleaned descriptors it increments busy. If
+ * we get to the MAX we declare it hung.
+ */
+ if (que->busy == IXGBE_QUEUE_HUNG) {
+ ++hung;
+ /* Mark the queue as inactive */
+ adapter->active_queues &= ~((u64)1 << que->me);
+ continue;
+ } else {
+ /* Check if we've come back from hung */
+ if ((adapter->active_queues & ((u64)1 << que->me)) == 0)
+ adapter->active_queues |= ((u64)1 << que->me);
+ }
+ if (que->busy >= IXGBE_MAX_TX_BUSY) {
+ device_printf(dev,
+ "Warning queue %d appears to be hung!\n", i);
+ que->txr->busy = IXGBE_QUEUE_HUNG;
+ ++hung;
+ }
+ }
+
+ /* Only truely watchdog if all queues show hung */
+ if (hung == adapter->num_queues)
+ goto watchdog;
+ else if (queues != 0) { /* Force an IRQ on queues with work */
+ que = adapter->queues;
+ for (i = 0; i < adapter->num_queues; i++, que++) {
+ mutex_enter(&que->dc_mtx);
+ if (que->disabled_count == 0)
+ ixgbe_rearm_queues(adapter,
+ queues & ((u64)1 << i));
+ mutex_exit(&que->dc_mtx);
+ }
+ }
out:
callout_reset(&adapter->timer, hz, ixgbe_local_timer, adapter);
+ return;
+
+watchdog:
+ device_printf(adapter->dev, "Watchdog timeout -- resetting\n");
+ adapter->ifp->if_flags &= ~IFF_RUNNING;
+ adapter->watchdog_events.ev_count++;
+ ixgbe_init_locked(adapter);
} /* ixgbe_local_timer */
-static void
-ixgbe_watchdog(struct ifnet *ifp)
-{
- struct adapter *adapter = ifp->if_softc;
- struct ix_queue *que;
- struct tx_ring *txr;
- u64 queues = 0;
- bool hung = false;
- bool sending = false;
- int i;
-
- txr = adapter->tx_rings;
- for (i = 0; i < adapter->num_queues; i++, txr++) {
- hung = ixgbe_watchdog_txq(ifp, txr, &sending);
- if (hung)
- break;
- else if (sending)
- queues |= ((u64)1 << txr->me);
- }
-
- if (hung) {
- ifp->if_flags &= ~IFF_RUNNING;
- ifp->if_oerrors++;
- adapter->watchdog_events.ev_count++;
- ixgbe_init_locked(adapter);
- } else if (queues != 0) {
- /*
- * Force an IRQ on queues with work.
- *
- * It's supporsed not to be called ixgbe_rearm_queues() if
- * any chips have no bug. In reality, ixgbe_rearm_queues() is
- * required on 82599 and newer chip AND other than queue 0 to
- * prevent device timeout. When it occured, packet was sent but
- * the descriptor's DD bot wasn't set even though
- * IXGBE_TXD_CMD_EOP and IXGBE_TXD_CMD_RS were set. After
- * forcing interrupt by writing EICS register in
- * ixgbe_rearm_queues(), DD is set. Why? Is this an
- * undocumented errata? It might be possible not call
- * rearm_queues on 82598 or queue 0, we call in any cases in
- * case the problem occurs.
- */
- que = adapter->queues;
- for (i = 0; i < adapter->num_queues; i++, que++) {
- u64 index = queues & ((u64)1 << i);
-
- mutex_enter(&que->dc_mtx);
- if ((index != 0) && (que->disabled_count == 0))
- ixgbe_rearm_queues(adapter, index);
- mutex_exit(&que->dc_mtx);
- }
- }
-}
-
-static bool
-ixgbe_watchdog_txq(struct ifnet *ifp, struct tx_ring *txr, bool *sending)
-{
- struct adapter *adapter = ifp->if_softc;
- device_t dev = adapter->dev;
- bool hung = false;
- bool more = false;
-
- IXGBE_TX_LOCK(txr);
- *sending = txr->sending;
- if (*sending && ((time_uptime - txr->lastsent) > IXGBE_TX_TIMEOUT)) {
- /*
- * Since we're using delayed interrupts, sweep up before we
- * report an error.
- */
- do {
- more = ixgbe_txeof(txr);
- } while (more);
- hung = true;
- device_printf(dev,
- "Watchdog timeout (queue %d%s)-- resetting\n", txr->me,
- (txr->tx_avail == txr->num_desc)
- ? ", lost interrupt?" : "");
- }
- IXGBE_TX_UNLOCK(txr);
-
- return hung;
-}
-
/************************************************************************
* ixgbe_sfp_probe
*
@@ -4577,8 +4548,6 @@
struct ifnet *ifp;
struct adapter *adapter = arg;
struct ixgbe_hw *hw = &adapter->hw;
- struct tx_ring *txr;
- int i;
ifp = adapter->ifp;
@@ -4588,13 +4557,6 @@
ixgbe_disable_intr(adapter);
callout_stop(&adapter->timer);
- txr = adapter->tx_rings;
Home |
Main Index |
Thread Index |
Old Index