Subject: Re: RTL8169 hw IP4CSUM_Tx workaround
To: None <tech-kern@NetBSD.org, tech-net@NetBSD.org>
From: Izumi Tsutsui <tsutsui@ceres.dti.ne.jp>
List: tech-kern
Date: 10/24/2006 04:11:35
mouse@Rodents.Montreal.QC.CA wrote:

> >> +	/*
> >> +	 * Walk packet chain to find last mbuf. We will either
> >> +	 * pad there, or append a new mbuf and pad it.
> >> +	 */
> >> +	for (m_last = pkt; m_last->m_next != NULL; m_last = m_last->m_next)
> >> +		continue;
> > I'm not exactly thrilled with walking the packet chain...

Well, I'm not a network guy (tm) so I just pull the code from if_bge.c

> I don't see it as a big deal.  THis is done only for packets small
> enough to need padding; the mbuf chain is unlikely to contain more than
> one or two mbufs, no?

I've just take simple statistics on my macppc (patch attached),
and it shows:

---
% vmstat -e | grep re0
re0 total TX packets                        1324966      552 misc
re0 total padded packets                        252        0 misc
re0 padded 1 seg packets                        243        0 misc
re0 padded 1 seg packets no trailing space      243        0 misc
re0 padded more seg packets                       9        0 misc
%

---
with a quick glance, I can see:
- 0.02% of TX packets require padding
- 96% of short packets have no chain
- short packets without chain have no trailing space for padding
- fragmented short packets seem to have trailing space for padding
- no short packet which has 2~5 fragments

> Also, I'm not sure what the alternative is.  Get corrupted packets
> fast?  I don't see that as better.

One possible alternative is to allocalte DMA memory for padded length
and use it on setting up TX DMA descriptors against such short packets
as src/sys/dev/ic/dp83932.c does, but I wonder which way is better.
IMHO, if there is quite few packets which require padding, the latter
way seems a bit complicated, IMHO.
---
Izumi Tsutsui


Index: rtl8169.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/rtl8169.c,v
retrieving revision 1.47
diff -u -r1.47 rtl8169.c
--- rtl8169.c	22 Oct 2006 01:47:53 -0000	1.47
+++ rtl8169.c	23 Oct 2006 17:17:11 -0000
@@ -152,8 +152,11 @@
 
 #include <dev/ic/rtl8169var.h>
 
+#define	ETHER_PAD_LEN	(ETHER_MIN_LEN - ETHER_CRC_LEN)
+
 
 static int re_encap(struct rtk_softc *, struct mbuf *, int *);
+static inline int re_cksum_pad(struct rtk_softc *, struct mbuf *);
 
 static int re_newbuf(struct rtk_softc *, int, struct mbuf *);
 static int re_rx_list_init(struct rtk_softc *);
@@ -758,7 +761,7 @@
 	 */
 
 	ifp->if_capabilities |=
-	    /* IFCAP_CSUM_IPv4_Tx | */ IFCAP_CSUM_IPv4_Rx |
+	    IFCAP_CSUM_IPv4_Tx | IFCAP_CSUM_IPv4_Rx |
 	    IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_TCPv4_Rx |
 	    IFCAP_CSUM_UDPv4_Tx | IFCAP_CSUM_UDPv4_Rx |
 	    IFCAP_TSOv4;
@@ -791,6 +794,38 @@
 	if_attach(ifp);
 	ether_ifattach(ifp, eaddr);
 
+#if 1
+#define EVCNT_INCR(ev)	(ev)->ev_count++
+	evcnt_attach_dynamic(&sc->sc_ev_totalsentpackets, EVCNT_TYPE_MISC,
+	    NULL, sc->sc_dev.dv_xname, "total TX packets");
+	evcnt_attach_dynamic(&sc->sc_ev_paddedpackets, EVCNT_TYPE_MISC,
+	    NULL, sc->sc_dev.dv_xname, "total padded packets");
+	evcnt_attach_dynamic(&sc->sc_ev_padded1seg, EVCNT_TYPE_MISC,
+	    NULL, sc->sc_dev.dv_xname, "padded 1 seg packets");
+	evcnt_attach_dynamic(&sc->sc_ev_padded1segnospace, EVCNT_TYPE_MISC,
+	    NULL, sc->sc_dev.dv_xname, "padded 1 seg packets no trailing space");
+	evcnt_attach_dynamic(&sc->sc_ev_padded2seg, EVCNT_TYPE_MISC,
+	    NULL, sc->sc_dev.dv_xname, "padded 2 seg packets");
+	evcnt_attach_dynamic(&sc->sc_ev_padded2segnospace, EVCNT_TYPE_MISC,
+	    NULL, sc->sc_dev.dv_xname, "padded 2 seg packets no trailing space");
+	evcnt_attach_dynamic(&sc->sc_ev_padded3seg, EVCNT_TYPE_MISC,
+	    NULL, sc->sc_dev.dv_xname, "padded 3 seg packets");
+	evcnt_attach_dynamic(&sc->sc_ev_padded3segnospace, EVCNT_TYPE_MISC,
+	    NULL, sc->sc_dev.dv_xname, "padded 3 seg packets no trailing space");
+	evcnt_attach_dynamic(&sc->sc_ev_padded4seg, EVCNT_TYPE_MISC,
+	    NULL, sc->sc_dev.dv_xname, "padded 4 seg packets");
+	evcnt_attach_dynamic(&sc->sc_ev_padded4segnospace, EVCNT_TYPE_MISC,
+	    NULL, sc->sc_dev.dv_xname, "padded 4 seg packets no trailing space");
+	evcnt_attach_dynamic(&sc->sc_ev_padded5seg, EVCNT_TYPE_MISC,
+	    NULL, sc->sc_dev.dv_xname, "padded 5 seg packets");
+	evcnt_attach_dynamic(&sc->sc_ev_padded5segnospace, EVCNT_TYPE_MISC,
+	    NULL, sc->sc_dev.dv_xname, "padded 5 seg packets no trailing space");
+	evcnt_attach_dynamic(&sc->sc_ev_paddedmoreseg, EVCNT_TYPE_MISC,
+	    NULL, sc->sc_dev.dv_xname, "padded more seg packets");
+	evcnt_attach_dynamic(&sc->sc_ev_paddedmoresegnospace, EVCNT_TYPE_MISC,
+	    NULL, sc->sc_dev.dv_xname, "padded more seg packets no trailing space");
+#endif
+
 
 	/*
 	 * Make sure the interface is shutdown during reboot.
@@ -1499,6 +1534,89 @@
 	return handled;
 }
 
+/*
+ * This function is just taken and modified from bge(4) driver.
+ *
+ * Although re(4) chips support auto-padding small packets,
+ * but it seems to cause a bug on IPv4 hardware checksum offload.
+ * To avoid the bug, pad packets manually if ip4csum is enabled.
+ */
+static inline int
+re_cksum_pad(struct rtk_softc *sc, struct mbuf *pkt)
+{
+	struct mbuf *m_last, *m_pad;
+	int padlen;
+	int nsegs;
+
+	padlen = ETHER_PAD_LEN - pkt->m_pkthdr.len;
+
+	/*
+	 * Walk packet chain to find last mbuf. We will either
+	 * pad there, or append a new mbuf and pad it.
+	 */
+	m_pad = NULL;
+	nsegs = 0;
+	for (m_last = pkt; m_last->m_next != NULL; m_last = m_last->m_next)
+		nsegs++;
+
+	/* `m_last' now points to last in chain. */
+	if (M_TRAILINGSPACE(m_last) < padlen) {
+		/*
+		 * Allocate a new empty mbuf chain for padding.
+		 *
+		 * XXX
+		 * Is it better to allocate a new mbuf by MCLGET(9)
+		 * and copy whole data to avoid one more fragment
+		 * since the packet size is small enough in this case?
+		 */
+		MGET(m_pad, M_DONTWAIT, MT_DATA);
+		if (m_pad == NULL)
+			return ENOMEM;
+		m_last->m_next = m_pad;
+		m_last = m_pad;
+	}
+
+	memset(mtod(m_last, char *) + m_last->m_len, 0, padlen);
+	m_last->m_len += padlen;
+	pkt->m_pkthdr.len += padlen;
+
+	EVCNT_INCR(&sc->sc_ev_paddedpackets);
+	switch (nsegs) {
+	case 1:
+		EVCNT_INCR(&sc->sc_ev_padded1seg);
+		if (m_pad)
+			EVCNT_INCR(&sc->sc_ev_padded1segnospace);
+		break;
+	case 2:
+		EVCNT_INCR(&sc->sc_ev_padded2seg);
+		if (m_pad)
+			EVCNT_INCR(&sc->sc_ev_padded2segnospace);
+		break;
+	case 3:
+		EVCNT_INCR(&sc->sc_ev_padded3seg);
+		if (m_pad)
+			EVCNT_INCR(&sc->sc_ev_padded3segnospace);
+		break;
+	case 4:
+		EVCNT_INCR(&sc->sc_ev_padded4seg);
+		if (m_pad)
+			EVCNT_INCR(&sc->sc_ev_padded4segnospace);
+		break;
+	case 5:
+		EVCNT_INCR(&sc->sc_ev_padded5seg);
+		if (m_pad)
+			EVCNT_INCR(&sc->sc_ev_padded5segnospace);
+		break;
+	default:
+		EVCNT_INCR(&sc->sc_ev_paddedmoreseg);
+		if (m_pad)
+			EVCNT_INCR(&sc->sc_ev_paddedmoresegnospace);
+		break;
+	}
+
+	return 0;
+}
+
 static int
 re_encap(struct rtk_softc *sc, struct mbuf *m, int *idx)
 {
@@ -1515,6 +1633,10 @@
 		return EFBIG;
 	}
 
+#if 1
+	EVCNT_INCR(&sc->sc_ev_totalsentpackets);
+#endif
+
 	/*
 	 * Set up checksum offload. Note: checksum offload bits must
 	 * appear in all descriptors of a multi-descriptor transmit
@@ -1539,6 +1661,15 @@
 		if ((m->m_pkthdr.csum_flags &
 		    (M_CSUM_IPv4 | M_CSUM_TCPv4 | M_CSUM_UDPv4)) != 0) {
 			rtk_flags |= RTK_TDESC_CMD_IPCSUM;
+			/*
+			 * Pad small packets explicitly if ip4csum is enabled
+			 * to avoid a hardware bug around IPv4 outboard cksum.
+			 */
+			if (m->m_pkthdr.len < ETHER_PAD_LEN) {
+				error = re_cksum_pad(sc, m);
+				if (error != 0)
+					return error;
+			}
 			if (m->m_pkthdr.csum_flags & M_CSUM_TCPv4) {
 				rtk_flags |= RTK_TDESC_CMD_TCPCSUM;
 			} else if (m->m_pkthdr.csum_flags & M_CSUM_UDPv4) {
Index: rtl81x9var.h
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/rtl81x9var.h,v
retrieving revision 1.25
diff -u -r1.25 rtl81x9var.h
--- rtl81x9var.h	22 Oct 2006 01:47:53 -0000	1.25
+++ rtl81x9var.h	23 Oct 2006 17:17:11 -0000
@@ -164,6 +164,22 @@
 #if NRND > 0
 	rndsource_element_t     rnd_source;
 #endif
+#if 1
+	struct evcnt sc_ev_totalsentpackets;
+	struct evcnt sc_ev_paddedpackets;
+	struct evcnt sc_ev_padded1seg;
+	struct evcnt sc_ev_padded1segnospace;
+	struct evcnt sc_ev_padded2seg;  
+	struct evcnt sc_ev_padded2segnospace;
+	struct evcnt sc_ev_padded3seg;  
+	struct evcnt sc_ev_padded3segnospace;
+	struct evcnt sc_ev_padded4seg;
+	struct evcnt sc_ev_padded4segnospace;
+	struct evcnt sc_ev_padded5seg;
+	struct evcnt sc_ev_padded5segnospace; 
+	struct evcnt sc_ev_paddedmoreseg;
+	struct evcnt sc_ev_paddedmoresegnospace;
+#endif
 };
 
 #define	RTK_TX_DESC_CNT(sc)	\