Re: NetBSD 5.1 TCP performance issue (lots of ACK)
On Thu, Oct 27, 2011 at 07:51:43PM +0200, Manuel Bouyer wrote:
> On Thu, Oct 27, 2011 at 12:00:33PM -0400, Thor Lancelot Simon wrote:
> > It's possible this has to do with the interrupt moderation tuning.  I
> > believe the checkin of better values than the ones I worked out from
> > the documentation has been pending for quite some time -- there were
> > highly unobvious performance effects with small buffers.  Simon did
> > a bunch of testing and concluded, as I recall, that the values used
> > by Intel in the Linux driver were "magic" and that we should use
> > those, not mine.
> >
> > If this hasn't been adjusted to match the Linux driver, you might
> > want to take a quick look at the values it uses and see whether
> > they yield better small-buffer performance in your case.
>
> I looked quickly at this and came up with the attached patch.
>
> With this (installed on both NetBSD hosts) I get mixed results:
> - the NetBSD client against the Linux server gets degraded and unstable
>   performance; several runs give large variations in speed
> - the NetBSD client against the NetBSD server gets better performance
>   on average (but still not in the 90MB/s range), also with large
>   variations between runs
> - the Linux client against the NetBSD server gets a little boost and
>   the speed is still stable between runs
> - ttcp performance between NetBSD hosts gets a little boost too,
>   and the speed is still stable between runs
>
> But I do get Ierrs on both NetBSD hosts now, with the ttcp or glusterfs
> test. I don't know where these errors come from; Linux has no errors.
> I don't think it's wm_add_rxbuf(): netstat -m and vmstat -m show
> no issues with mbuf allocations.
> So I guess these are errors at the adapter level; we may need to change
> more things to match these values.
> Also, Linux seems to be using more advanced features for these adapters;
> this is something we may have to look at too.
Here is an updated patch. The key point to avoid the receive errors is
to do another BUS_DMASYNC after reading wrx_status, before reading the
other values, so that e.g. len is not read before status has been
updated. The errors were caused by 0-length receive descriptors.
With this I get 113MB/s with the ttcp test, and between 70 and 90MB/s
with glusterfs. The NetBSD client now gets the same speed against a
NetBSD or Linux server.
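
For clarity, this is the descriptor-read ordering the patch enforces
inside the receive descriptor loop of wm_rxintr(), shown as a
consolidated sketch (illustrative only; the real code is in the hunks
below):

	/* sync the descriptor before looking at it */
	WM_CDRXSYNC(sc, i, BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE);
	status = sc->sc_rxdescs[i].wrx_status;
	if ((status & WRX_ST_DD) == 0) {
		/* descriptor not done yet; hand it back and stop */
		WM_CDRXSYNC(sc, i, BUS_DMASYNC_PREREAD);
		break;
	}
	/*
	 * sync again so errors/len/special are read only after status;
	 * otherwise a half-written descriptor can be consumed with
	 * len == 0, which is what caused the Ierrs.
	 */
	WM_CDRXSYNC(sc, i, BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE);
	errors = sc->sc_rxdescs[i].wrx_errors;
	len = le16toh(sc->sc_rxdescs[i].wrx_len);
	vlantag = sc->sc_rxdescs[i].wrx_special;
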
The patch also contains changes for the WM_F_NEWQUEUE adapters, but they
may not be correct. When using WM_F_NEWQUEUE for the i80003 (which
the Linux driver does), performance is a little lower and I get
a high interrupt rate - just as if interrupt mitigation was not
working.
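
As a side note on the interrupt rate: the ITR register counts in units
of 256 ns, so the values in the patch work out as below (my arithmetic,
consistent with the existing "2604 ints/sec" comment in if_wm.c; the
small program is just an illustration, not driver code):

	#include <stdio.h>

	int
	main(void)
	{
		/* current if_wm.c value vs. the linux default the patch uses */
		unsigned old_itr = 1500;
		unsigned new_itr = 1000000000 / (20000 * 256);	/* = 195 */

		/* ITR is in 256 ns units: max rate = 1e9 / (ITR * 256) */
		printf("old ITR %u -> %u ints/sec max\n",
		    old_itr, 1000000000 / (old_itr * 256));	/* ~2604 */
		printf("new ITR %u -> %u ints/sec max\n",
		    new_itr, 1000000000 / (new_itr * 256));	/* ~20032 */
		return 0;
	}
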
--
Manuel Bouyer <bouyer%antioche.eu.org@localhost>
     NetBSD: 26 years of experience will always make the difference
--
Index: sys/dev/pci/if_wm.c
===================================================================
RCS file: /cvsroot/src/sys/dev/pci/if_wm.c,v
retrieving revision 1.162.4.15
diff -u -p -u -r1.162.4.15 if_wm.c
--- sys/dev/pci/if_wm.c 7 Mar 2011 04:14:19 -0000 1.162.4.15
+++ sys/dev/pci/if_wm.c 28 Oct 2011 14:03:33 -0000
@@ -327,6 +327,15 @@ struct wm_softc {
struct evcnt sc_ev_rx_xoff; /* Rx PAUSE(!0) frames */
struct evcnt sc_ev_rx_xon; /* Rx PAUSE(0) frames */
struct evcnt sc_ev_rx_macctl; /* Rx Unsupported */
+
+ struct evcnt sc_ev_errcrc; /* WMREG_CRCERRS */
+ struct evcnt sc_ev_erralg; /* WMREG_ALGNERRC */
+ struct evcnt sc_ev_errsym; /* WMREG_SYMERRC */
+ struct evcnt sc_ev_errrx; /* WMREG_RXERRC */
+ struct evcnt sc_ev_errsec; /* WMREG_SEC */
+ struct evcnt sc_ev_errcext; /* WMREG_CEXTERR */
+ struct evcnt sc_ev_errrlec; /* WMREG_RLEC */
+ struct evcnt sc_ev_erralloc; /* mbuf alloc failed */
#endif /* WM_EVENT_COUNTERS */
bus_addr_t sc_tdt_reg; /* offset of TDT register */
@@ -1116,7 +1125,8 @@ wm_attach(device_t parent, device_t self
}
if ((sc->sc_type == WM_T_82575) || (sc->sc_type == WM_T_82576)
- || (sc->sc_type == WM_T_82580) || (sc->sc_type == WM_T_82580ER))
+ || (sc->sc_type == WM_T_82580) || (sc->sc_type == WM_T_82580ER)
+ /* || (sc->sc_type == WM_T_80003) */)
sc->sc_flags |= WM_F_NEWQUEUE;
/* Set device properties (mactype) */
@@ -1924,6 +1934,24 @@ wm_attach(device_t parent, device_t self
NULL, xname, "rx_xon");
evcnt_attach_dynamic(&sc->sc_ev_rx_macctl, EVCNT_TYPE_MISC,
NULL, xname, "rx_macctl");
+
+
+ evcnt_attach_dynamic(&sc->sc_ev_errcrc, EVCNT_TYPE_MISC,
+ NULL, xname, "errcrc");
+ evcnt_attach_dynamic(&sc->sc_ev_erralg, EVCNT_TYPE_MISC,
+ NULL, xname, "erralg");
+ evcnt_attach_dynamic(&sc->sc_ev_errsym, EVCNT_TYPE_MISC,
+ NULL, xname, "errsym");
+ evcnt_attach_dynamic(&sc->sc_ev_errrx, EVCNT_TYPE_MISC,
+ NULL, xname, "errrx");
+ evcnt_attach_dynamic(&sc->sc_ev_errsec, EVCNT_TYPE_MISC,
+ NULL, xname, "errsec");
+ evcnt_attach_dynamic(&sc->sc_ev_errcext, EVCNT_TYPE_MISC,
+ NULL, xname, "errcext");
+ evcnt_attach_dynamic(&sc->sc_ev_errrlec, EVCNT_TYPE_MISC,
+ NULL, xname, "errrlec");
+ evcnt_attach_dynamic(&sc->sc_ev_erralloc, EVCNT_TYPE_MISC,
+ NULL, xname, "erralloc");
#endif /* WM_EVENT_COUNTERS */
if (!pmf_device_register(self, NULL, NULL))
@@ -2879,11 +2907,7 @@ wm_rxintr(struct wm_softc *sc)
device_xname(sc->sc_dev), i));
WM_CDRXSYNC(sc, i, BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE);
-
status = sc->sc_rxdescs[i].wrx_status;
- errors = sc->sc_rxdescs[i].wrx_errors;
- len = le16toh(sc->sc_rxdescs[i].wrx_len);
- vlantag = sc->sc_rxdescs[i].wrx_special;
if ((status & WRX_ST_DD) == 0) {
/*
@@ -2892,6 +2916,14 @@ wm_rxintr(struct wm_softc *sc)
WM_CDRXSYNC(sc, i, BUS_DMASYNC_PREREAD);
break;
}
+ /*
+ * sync again, to make sure the values below have been read
+ * after status.
+ */
+ WM_CDRXSYNC(sc, i, BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE);
+ errors = sc->sc_rxdescs[i].wrx_errors;
+ len = le16toh(sc->sc_rxdescs[i].wrx_len);
+ vlantag = sc->sc_rxdescs[i].wrx_special;
if (__predict_false(sc->sc_rxdiscard)) {
DPRINTF(WM_DEBUG_RX,
@@ -2924,6 +2956,7 @@ wm_rxintr(struct wm_softc *sc)
* far, and discard the rest of the packet.
*/
ifp->if_ierrors++;
+ WM_EVCNT_INCR(&sc->sc_ev_erralloc);
bus_dmamap_sync(sc->sc_dmat, rxs->rxs_dmamap, 0,
rxs->rxs_dmamap->dm_mapsize, BUS_DMASYNC_PREREAD);
WM_INIT_RXDESC(sc, i);
@@ -3232,6 +3265,7 @@ wm_tick(void *arg)
{
struct wm_softc *sc = arg;
struct ifnet *ifp = &sc->sc_ethercom.ec_if;
+ uint32_t reg;
int s;
s = splnet();
@@ -3245,14 +3279,20 @@ wm_tick(void *arg)
}
ifp->if_collisions += CSR_READ(sc, WMREG_COLC);
- ifp->if_ierrors += 0ULL + /* ensure quad_t */
- + CSR_READ(sc, WMREG_CRCERRS)
- + CSR_READ(sc, WMREG_ALGNERRC)
- + CSR_READ(sc, WMREG_SYMERRC)
- + CSR_READ(sc, WMREG_RXERRC)
- + CSR_READ(sc, WMREG_SEC)
- + CSR_READ(sc, WMREG_CEXTERR)
- + CSR_READ(sc, WMREG_RLEC);
+
+#define WM_IFERR(r, c) { \
+ reg = CSR_READ(sc, (r)); \
+ ifp->if_ierrors += 0ULL /* ensure quad_t */ + reg; \
+ WM_EVCNT_ADD((c), reg);\
+}
+ WM_IFERR(WMREG_CRCERRS, &sc->sc_ev_errcrc);
+ WM_IFERR(WMREG_ALGNERRC, &sc->sc_ev_erralg);
+ WM_IFERR(WMREG_SYMERRC, &sc->sc_ev_errsym);
+ WM_IFERR(WMREG_RXERRC, &sc->sc_ev_errrx);
+ WM_IFERR(WMREG_SEC, &sc->sc_ev_errsec);
+ WM_IFERR(WMREG_CEXTERR, &sc->sc_ev_errcext);
+ WM_IFERR(WMREG_RLEC, &sc->sc_ev_errrlec);
+#undef WM_IFERR
if (sc->sc_flags & WM_F_HAS_MII)
mii_tick(&sc->sc_mii);
@@ -3612,7 +3652,9 @@ wm_init(struct ifnet *ifp)
/* update statistics before reset */
ifp->if_collisions += CSR_READ(sc, WMREG_COLC);
- ifp->if_ierrors += CSR_READ(sc, WMREG_RXERRC);
+ reg = CSR_READ(sc, WMREG_RXERRC);
+ ifp->if_ierrors += 0ULL + reg;
+ WM_EVCNT_ADD(&sc->sc_ev_errrx, reg);
/* Reset the chip to a known state. */
wm_reset(sc);
@@ -3664,14 +3706,23 @@ wm_init(struct ifnet *ifp)
CSR_WRITE(sc, WMREG_TDLEN, WM_TXDESCSIZE(sc));
CSR_WRITE(sc, WMREG_TDH, 0);
CSR_WRITE(sc, WMREG_TDT, 0);
+#if 0
CSR_WRITE(sc, WMREG_TIDV, 375); /* ITR / 4 */
CSR_WRITE(sc, WMREG_TADV, 375); /* should be same */
+#endif
+ CSR_WRITE(sc, WMREG_TIDV, 8); /* linux magic */
+ CSR_WRITE(sc, WMREG_TADV, 32); /* ditto */
- if ((sc->sc_flags & WM_F_NEWQUEUE) != 0)
- CSR_WRITE(sc, WMREG_TXDCTL, TXDCTL_QUEUE_ENABLE
- | TXDCTL_PTHRESH(0) | TXDCTL_HTHRESH(0)
- | TXDCTL_WTHRESH(0));
- else {
+ if ((sc->sc_flags & WM_F_NEWQUEUE) != 0) {
+ CSR_WRITE(sc, WMREG_TXDCTL_1,
+ TXDCTL_COUNT_DESC | TXDCTL_DESC_GRAN
+ | TXDCTL_PTHRESH(0x1f) | TXDCTL_HTHRESH(1)
+ | TXDCTL_WTHRESH(1));
+ CSR_WRITE(sc, WMREG_TXDCTL,
+ TXDCTL_COUNT_DESC | TXDCTL_DESC_GRAN
+ | TXDCTL_PTHRESH(0x1f) | TXDCTL_HTHRESH(1)
+ | TXDCTL_WTHRESH(1));
+ } else {
CSR_WRITE(sc, WMREG_TXDCTL, TXDCTL_PTHRESH(0) |
TXDCTL_HTHRESH(0) | TXDCTL_WTHRESH(0));
CSR_WRITE(sc, WMREG_RXDCTL, RXDCTL_PTHRESH(0) |
@@ -3716,14 +3767,23 @@ wm_init(struct ifnet *ifp)
panic("%s: MCLBYTES %d unsupported for i2575 or
higher\n", __func__, MCLBYTES);
CSR_WRITE(sc, WMREG_SRRCTL, SRRCTL_DESCTYPE_LEGACY
| (MCLBYTES >> SRRCTL_BSIZEPKT_SHIFT));
- CSR_WRITE(sc, WMREG_RXDCTL, RXDCTL_QUEUE_ENABLE
- | RXDCTL_PTHRESH(16) | RXDCTL_HTHRESH(8)
- | RXDCTL_WTHRESH(1));
+ CSR_WRITE(sc, WMREG_RXDCTL, RXDCTL_GRAN
+ | RXDCTL_PTHRESH(4) | RXDCTL_HTHRESH(0x20)
+ | RXDCTL_WTHRESH(4));
+ CSR_WRITE(sc, WMREG_RXDCTL_1, RXDCTL_GRAN
+ | RXDCTL_PTHRESH(4) | RXDCTL_HTHRESH(0x20)
+ | RXDCTL_WTHRESH(4));
+ CSR_WRITE(sc, WMREG_RDTR, 0x20);/* linux magic */
+ CSR_WRITE(sc, WMREG_RADV, 0x20); /* ditto */
} else {
CSR_WRITE(sc, WMREG_RDH, 0);
CSR_WRITE(sc, WMREG_RDT, 0);
+#if 0
 		CSR_WRITE(sc, WMREG_RDTR, 375 | RDTR_FPD);	/* ITR/4 */
 		CSR_WRITE(sc, WMREG_RADV, 375);	/* MUST be same */
+#endif
+		CSR_WRITE(sc, WMREG_RDTR, 0 | RDTR_FPD);	/* linux magic */
+ CSR_WRITE(sc, WMREG_RADV, 8); /* ditto */
}
}
for (i = 0; i < WM_NRXDESC; i++) {
@@ -3892,7 +3952,10 @@ wm_init(struct ifnet *ifp)
* divided by 4 to get "simple timer" behavior.
*/
+#if 0
sc->sc_itr = 1500; /* 2604 ints/sec */
+#endif
+ sc->sc_itr = 1000000000 / (20000 * 256); /* linux defaults */
CSR_WRITE(sc, WMREG_ITR, sc->sc_itr);
}
Index: sys/dev/pci/if_wmreg.h
===================================================================
RCS file: /cvsroot/src/sys/dev/pci/if_wmreg.h,v
retrieving revision 1.24.20.5
diff -u -p -u -r1.24.20.5 if_wmreg.h
--- sys/dev/pci/if_wmreg.h 19 Nov 2010 23:40:28 -0000 1.24.20.5
+++ sys/dev/pci/if_wmreg.h 28 Oct 2011 14:03:33 -0000
@@ -495,9 +495,10 @@ struct livengood_tcpip_ctxdesc {
#define WMREG_RDT_2 0x0c18 /* for 82576 ... */
#define WMREG_RXDCTL 0x2828 /* Receive Descriptor Control */
+#define	WMREG_RXDCTL_1	0x2928	/* Receive Descriptor Control - queue 1 */
#define WMREG_RXDCTL_2 0x0c28 /* for 82576 ... */
-#define RXDCTL_PTHRESH(x) ((x) << 0) /* prefetch threshold */
-#define RXDCTL_HTHRESH(x) ((x) << 8) /* host threshold */
+#define RXDCTL_PTHRESH(x) ((x) << 8) /* prefetch threshold */
+#define RXDCTL_HTHRESH(x) ((x) << 0) /* host threshold */
#define RXDCTL_WTHRESH(x) ((x) << 16) /* write back threshold */
#define	RXDCTL_GRAN	(1U << 24)	/* 0 = cacheline, 1 = descriptor */
/* flags used starting with 82575 ... */
@@ -625,10 +626,13 @@ struct livengood_tcpip_ctxdesc {
#define WMREG_TIDV 0x3820
#define WMREG_TXDCTL 0x3828 /* Trandmit Descriptor Control */
+#define	WMREG_TXDCTL_1	0x3928	/* Trandmit Descriptor Control - queue 1 */
#define TXDCTL_PTHRESH(x) ((x) << 0) /* prefetch threshold */
#define TXDCTL_HTHRESH(x) ((x) << 8) /* host threshold */
#define TXDCTL_WTHRESH(x) ((x) << 16) /* write back threshold */
/* flags used starting with 82575 ... */
+#define TXDCTL_COUNT_DESC 0x00400000 /* enable counting of desc left */
+#define TXDCTL_DESC_GRAN 0x01000000 /* descriptor granularity */
#define TXDCTL_QUEUE_ENABLE 0x02000000 /* Enable specific Tx Queue */
#define TXDCTL_SWFLSH 0x04000000 /* Tx Desc. write-back flushing */
#define TXDCTL_PRIORITY 0x08000000