Subject: Re: RTL8169 hw IP4CSUM_Tx workaround
To: None <tech-kern@NetBSD.org, tech-net@NetBSD.org>
From: Izumi Tsutsui <tsutsui@ceres.dti.ne.jp>
List: tech-net
Date: 10/24/2006 04:11:35
mouse@Rodents.Montreal.QC.CA wrote:
> >> + /*
> >> + * Walk packet chain to find last mbuf. We will either
> >> + * pad there, or append a new mbuf and pad it.
> >> + */
> >> + for (m_last = pkt; m_last->m_next != NULL; m_last = m_last->m_next)
> >> + continue;
> > I'm not exactly thrilled with walking the packet chain...
Well, I'm not a network guy (tm) so I just pull the code from if_bge.c
> I don't see it as a big deal. THis is done only for packets small
> enough to need padding; the mbuf chain is unlikely to contain more than
> one or two mbufs, no?
I've just take simple statistics on my macppc (patch attached),
and it shows:
---
% vmstat -e | grep re0
re0 total TX packets 1324966 552 misc
re0 total padded packets 252 0 misc
re0 padded 1 seg packets 243 0 misc
re0 padded 1 seg packets no trailing space 243 0 misc
re0 padded more seg packets 9 0 misc
%
---
with a quick glance, I can see:
- 0.02% of TX packets require padding
- 96% of short packets have no chain
- short packets without chain have no trailing space for padding
- fragmented short packets seem to have trailing space for padding
- no short packet which has 2~5 fragments
> Also, I'm not sure what the alternative is. Get corrupted packets
> fast? I don't see that as better.
One possible alternative is to allocalte DMA memory for padded length
and use it on setting up TX DMA descriptors against such short packets
as src/sys/dev/ic/dp83932.c does, but I wonder which way is better.
IMHO, if there is quite few packets which require padding, the latter
way seems a bit complicated, IMHO.
---
Izumi Tsutsui
Index: rtl8169.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/rtl8169.c,v
retrieving revision 1.47
diff -u -r1.47 rtl8169.c
--- rtl8169.c 22 Oct 2006 01:47:53 -0000 1.47
+++ rtl8169.c 23 Oct 2006 17:17:11 -0000
@@ -152,8 +152,11 @@
#include <dev/ic/rtl8169var.h>
+#define ETHER_PAD_LEN (ETHER_MIN_LEN - ETHER_CRC_LEN)
+
static int re_encap(struct rtk_softc *, struct mbuf *, int *);
+static inline int re_cksum_pad(struct rtk_softc *, struct mbuf *);
static int re_newbuf(struct rtk_softc *, int, struct mbuf *);
static int re_rx_list_init(struct rtk_softc *);
@@ -758,7 +761,7 @@
*/
ifp->if_capabilities |=
- /* IFCAP_CSUM_IPv4_Tx | */ IFCAP_CSUM_IPv4_Rx |
+ IFCAP_CSUM_IPv4_Tx | IFCAP_CSUM_IPv4_Rx |
IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_TCPv4_Rx |
IFCAP_CSUM_UDPv4_Tx | IFCAP_CSUM_UDPv4_Rx |
IFCAP_TSOv4;
@@ -791,6 +794,38 @@
if_attach(ifp);
ether_ifattach(ifp, eaddr);
+#if 1
+#define EVCNT_INCR(ev) (ev)->ev_count++
+ evcnt_attach_dynamic(&sc->sc_ev_totalsentpackets, EVCNT_TYPE_MISC,
+ NULL, sc->sc_dev.dv_xname, "total TX packets");
+ evcnt_attach_dynamic(&sc->sc_ev_paddedpackets, EVCNT_TYPE_MISC,
+ NULL, sc->sc_dev.dv_xname, "total padded packets");
+ evcnt_attach_dynamic(&sc->sc_ev_padded1seg, EVCNT_TYPE_MISC,
+ NULL, sc->sc_dev.dv_xname, "padded 1 seg packets");
+ evcnt_attach_dynamic(&sc->sc_ev_padded1segnospace, EVCNT_TYPE_MISC,
+ NULL, sc->sc_dev.dv_xname, "padded 1 seg packets no trailing space");
+ evcnt_attach_dynamic(&sc->sc_ev_padded2seg, EVCNT_TYPE_MISC,
+ NULL, sc->sc_dev.dv_xname, "padded 2 seg packets");
+ evcnt_attach_dynamic(&sc->sc_ev_padded2segnospace, EVCNT_TYPE_MISC,
+ NULL, sc->sc_dev.dv_xname, "padded 2 seg packets no trailing space");
+ evcnt_attach_dynamic(&sc->sc_ev_padded3seg, EVCNT_TYPE_MISC,
+ NULL, sc->sc_dev.dv_xname, "padded 3 seg packets");
+ evcnt_attach_dynamic(&sc->sc_ev_padded3segnospace, EVCNT_TYPE_MISC,
+ NULL, sc->sc_dev.dv_xname, "padded 3 seg packets no trailing space");
+ evcnt_attach_dynamic(&sc->sc_ev_padded4seg, EVCNT_TYPE_MISC,
+ NULL, sc->sc_dev.dv_xname, "padded 4 seg packets");
+ evcnt_attach_dynamic(&sc->sc_ev_padded4segnospace, EVCNT_TYPE_MISC,
+ NULL, sc->sc_dev.dv_xname, "padded 4 seg packets no trailing space");
+ evcnt_attach_dynamic(&sc->sc_ev_padded5seg, EVCNT_TYPE_MISC,
+ NULL, sc->sc_dev.dv_xname, "padded 5 seg packets");
+ evcnt_attach_dynamic(&sc->sc_ev_padded5segnospace, EVCNT_TYPE_MISC,
+ NULL, sc->sc_dev.dv_xname, "padded 5 seg packets no trailing space");
+ evcnt_attach_dynamic(&sc->sc_ev_paddedmoreseg, EVCNT_TYPE_MISC,
+ NULL, sc->sc_dev.dv_xname, "padded more seg packets");
+ evcnt_attach_dynamic(&sc->sc_ev_paddedmoresegnospace, EVCNT_TYPE_MISC,
+ NULL, sc->sc_dev.dv_xname, "padded more seg packets no trailing space");
+#endif
+
/*
* Make sure the interface is shutdown during reboot.
@@ -1499,6 +1534,89 @@
return handled;
}
+/*
+ * This function is just taken and modified from bge(4) driver.
+ *
+ * Although re(4) chips support auto-padding small packets,
+ * but it seems to cause a bug on IPv4 hardware checksum offload.
+ * To avoid the bug, pad packets manually if ip4csum is enabled.
+ */
+static inline int
+re_cksum_pad(struct rtk_softc *sc, struct mbuf *pkt)
+{
+ struct mbuf *m_last, *m_pad;
+ int padlen;
+ int nsegs;
+
+ padlen = ETHER_PAD_LEN - pkt->m_pkthdr.len;
+
+ /*
+ * Walk packet chain to find last mbuf. We will either
+ * pad there, or append a new mbuf and pad it.
+ */
+ m_pad = NULL;
+ nsegs = 0;
+ for (m_last = pkt; m_last->m_next != NULL; m_last = m_last->m_next)
+ nsegs++;
+
+ /* `m_last' now points to last in chain. */
+ if (M_TRAILINGSPACE(m_last) < padlen) {
+ /*
+ * Allocate a new empty mbuf chain for padding.
+ *
+ * XXX
+ * Is it better to allocate a new mbuf by MCLGET(9)
+ * and copy whole data to avoid one more fragment
+ * since the packet size is small enough in this case?
+ */
+ MGET(m_pad, M_DONTWAIT, MT_DATA);
+ if (m_pad == NULL)
+ return ENOMEM;
+ m_last->m_next = m_pad;
+ m_last = m_pad;
+ }
+
+ memset(mtod(m_last, char *) + m_last->m_len, 0, padlen);
+ m_last->m_len += padlen;
+ pkt->m_pkthdr.len += padlen;
+
+ EVCNT_INCR(&sc->sc_ev_paddedpackets);
+ switch (nsegs) {
+ case 1:
+ EVCNT_INCR(&sc->sc_ev_padded1seg);
+ if (m_pad)
+ EVCNT_INCR(&sc->sc_ev_padded1segnospace);
+ break;
+ case 2:
+ EVCNT_INCR(&sc->sc_ev_padded2seg);
+ if (m_pad)
+ EVCNT_INCR(&sc->sc_ev_padded2segnospace);
+ break;
+ case 3:
+ EVCNT_INCR(&sc->sc_ev_padded3seg);
+ if (m_pad)
+ EVCNT_INCR(&sc->sc_ev_padded3segnospace);
+ break;
+ case 4:
+ EVCNT_INCR(&sc->sc_ev_padded4seg);
+ if (m_pad)
+ EVCNT_INCR(&sc->sc_ev_padded4segnospace);
+ break;
+ case 5:
+ EVCNT_INCR(&sc->sc_ev_padded5seg);
+ if (m_pad)
+ EVCNT_INCR(&sc->sc_ev_padded5segnospace);
+ break;
+ default:
+ EVCNT_INCR(&sc->sc_ev_paddedmoreseg);
+ if (m_pad)
+ EVCNT_INCR(&sc->sc_ev_paddedmoresegnospace);
+ break;
+ }
+
+ return 0;
+}
+
static int
re_encap(struct rtk_softc *sc, struct mbuf *m, int *idx)
{
@@ -1515,6 +1633,10 @@
return EFBIG;
}
+#if 1
+ EVCNT_INCR(&sc->sc_ev_totalsentpackets);
+#endif
+
/*
* Set up checksum offload. Note: checksum offload bits must
* appear in all descriptors of a multi-descriptor transmit
@@ -1539,6 +1661,15 @@
if ((m->m_pkthdr.csum_flags &
(M_CSUM_IPv4 | M_CSUM_TCPv4 | M_CSUM_UDPv4)) != 0) {
rtk_flags |= RTK_TDESC_CMD_IPCSUM;
+ /*
+ * Pad small packets explicitly if ip4csum is enabled
+ * to avoid a hardware bug around IPv4 outboard cksum.
+ */
+ if (m->m_pkthdr.len < ETHER_PAD_LEN) {
+ error = re_cksum_pad(sc, m);
+ if (error != 0)
+ return error;
+ }
if (m->m_pkthdr.csum_flags & M_CSUM_TCPv4) {
rtk_flags |= RTK_TDESC_CMD_TCPCSUM;
} else if (m->m_pkthdr.csum_flags & M_CSUM_UDPv4) {
Index: rtl81x9var.h
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/rtl81x9var.h,v
retrieving revision 1.25
diff -u -r1.25 rtl81x9var.h
--- rtl81x9var.h 22 Oct 2006 01:47:53 -0000 1.25
+++ rtl81x9var.h 23 Oct 2006 17:17:11 -0000
@@ -164,6 +164,22 @@
#if NRND > 0
rndsource_element_t rnd_source;
#endif
+#if 1
+ struct evcnt sc_ev_totalsentpackets;
+ struct evcnt sc_ev_paddedpackets;
+ struct evcnt sc_ev_padded1seg;
+ struct evcnt sc_ev_padded1segnospace;
+ struct evcnt sc_ev_padded2seg;
+ struct evcnt sc_ev_padded2segnospace;
+ struct evcnt sc_ev_padded3seg;
+ struct evcnt sc_ev_padded3segnospace;
+ struct evcnt sc_ev_padded4seg;
+ struct evcnt sc_ev_padded4segnospace;
+ struct evcnt sc_ev_padded5seg;
+ struct evcnt sc_ev_padded5segnospace;
+ struct evcnt sc_ev_paddedmoreseg;
+ struct evcnt sc_ev_paddedmoresegnospace;
+#endif
};
#define RTK_TX_DESC_CNT(sc) \