Subject: Re: VAX code generation problem wrt IPSEC?
To: None <port-vax@netbsd.org, current-users@netbsd.org>
From: Olaf Seibert <rhialto@polderland.nl>
List: current-users
Date: 11/12/2001 02:30:11
--opJtzjQTFsWo+cga
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

I haven't tested yet whether just compiling in_cksum.c with -O1 or -O2 fixes
the IP checksum problem; first I looked at the assembly code generated by gcc.
And I found something curious in the code generation for a loop:

	while ((mlen -= 32) >= 0) {
	    sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
	    sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7];
	    sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11];
	    sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15];
	    w += 16;
	}

is (optimised) turned into 

	L68:
	#APP
	    addl2 (r3)+,r6		I presume this adds 4 bytes
	    adwc  (r3)+,r6		at a time, otherwise I can't see
	    adwc  (r3)+,r6		how 16 += are turned into
	    adwc  (r3)+,r6		8 adwc (r3)+
	    adwc  (r3)+,r6
	    adwc  (r3)+,r6
	    adwc  (r3)+,r6
	    adwc  (r3)+,r6
	    adwc     $0,r6
	#NO_APP
	L80:
	    subl2 $32,r2
	    jgeq L68

but unoptimised into

	L65:
	    subl3 $32,-12(fp),r0
	    movl r0,r1
	    movl r1,-12(fp)
	    tstl r1
	    jgeq L67
	    jbr L66
	L67:
	    movl -8(fp),r5		r5 is sum, apparently
	    movl -4(fp),r4		r4 is w
	#APP
	    addl2 (r4)+,r5		add a word, w[0]
	#NO_APP
	    movl r5,-8(fp)		store the sum
	    movl -8(fp),r5		and get it back

	    movl -4(fp),r4		next addition: now we should do
	#APP				w[1] but the increment has been lost!
	    adwc  (r4)+,r5		so once again we add w[0]
	#NO_APP
(repeated, also 8 times adwc, which is add word with carry, according
to a quick google search (boo, hiss, I should have proper vax assembly
docs - at least I know pdp-11), anyway using a 32-bits adwc for 16-bits
additions seems quite a liberty by the compiler in itself)

so that looks like it is a big problem.

Now first I'll go to sleep, then tomorrow I will check if I have not
been writing nonsense.

-Olaf.
-- 
___ Olaf 'Rhialto' Seibert - rhialto@     --Soep van de dag, wat zal dat zijn
\X/ xs4all.nl --wat kan dat wezen, beter maar het ergste vrezen -Boy Bensdorp

--opJtzjQTFsWo+cga
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="in_cksum-O2.s"

#NO_APP
gcc2_compiled.:
___gnu_compiled_c:
.text
LC0:
	.ascii "cksum: out of data\12\0"
	.align 1
# _in_cksum, optimised build: sums the data of a linked chain of buffers and
# returns the folded, complemented 16-bit result in r0.
#
# Register roles as they appear in this listing:
#   r4  first argument, 4(ap): current chain node.  The code reads a link at
#       offset 0, a data pointer at offset 8 and a byte count at offset 12
#       (presumably m_next / m_data / m_len of struct mbuf -- confirm).
#   r5  second argument, 8(ap): number of bytes still to be summed.
#   r6  running sum.
#   r7  flag, toggled each time a single odd byte is consumed.
#   r8  constant 16, used as both field position and field size for extzv.
#   r2  bytes left in the current node;  r3  read pointer into its data.
#
# Comments are kept on lines of their own because the listing is emitted in
# NO_APP mode.
.globl _in_cksum
	.type	 _in_cksum,@function
_in_cksum:
# Procedure entry mask: bits 6-9 are set, so r6-r9 are saved on entry.
	.word 0x3c0
	movl 4(ap),r4
	movl 8(ap),r5
	clrl r6
	clrl r7
# Empty chain: go directly to the "out of data" check.
	tstl r4
	jeql L55
	movl $16,r8
L76:
# Top of the per-node loop: finish as soon as the requested length is used up.
	tstl r5
	jeql L77
# r2 = byte count of this node.  movl sets the condition codes, so the jeql
# skips a node whose count is zero.
	movl 12(r4),r2
	jeql L56
	movl 8(r4),r3
# r2 = min(r2, r5), then subtract those bytes from the remaining length.
	cmpl r5,r2
	jgeq L60
	movl r5,r2
L60:
	subl3 r2,r5,r5
# Fewer than 16 bytes in this node: use the small-block tail only.
	cmpl r2,$15
	jleq L62
# Low two pointer bits clear (longword aligned): enter the 32-byte loop test.
	bicl3 $-4,r3,r0
	jeql L80
# Fold the sum to (low 16 bits) + (bits 16-31) before the alignment fix-up.
	bicl3 $-65536,r6,r0
	extzv r8,r8,r6,r1
	addl3 r0,r1,r6
# Odd address: add one byte, shift the sum left by 8 and toggle the r7 flag.
	jlbc r3,L64
	movzbl (r3)+,r0
	addl2 r0,r6
	ashl $8,r6,r6
	xorl2 $1,r7
	decl r2
L64:
# Bit 1 of the address still set: add one 16-bit word to reach longword
# alignment.
	jbc $1,r3,L80
	movzwl (r3)+,r0
	addl2 r0,r6
	subl2 $2,r2
	jbr L80
# 32-byte block: eight auto-incrementing longword adds chained through the
# carry flag; the closing "adwc $0" adds the last carry back into the sum.
L68:
#APP
	addl2 (r3)+,r6
	adwc  (r3)+,r6
	adwc  (r3)+,r6
	adwc  (r3)+,r6
	adwc  (r3)+,r6
	adwc  (r3)+,r6
	adwc  (r3)+,r6
	adwc  (r3)+,r6
	adwc     $0,r6
#NO_APP
L80:
# Loop test: take another 32-byte block while r2 - 32 stays non-negative.
	subl2 $32,r2
	jgeq L68
# Undo the last subtraction, then handle one optional 16-byte block.
	addl2 $32,r2
	cmpl r2,$15
	jleq L62
#APP
	addl2 (r3)+,r6
	adwc  (r3)+,r6
	adwc  (r3)+,r6
	adwc  (r3)+,r6
	adwc     $0,r6
#NO_APP
	subl2 $16,r2
L62:
# One optional 8-byte block.
	cmpl r2,$7
	jleq L71
#APP
	addl2 (r3)+,r6
	adwc  (r3)+,r6
	adwc     $0,r6
#NO_APP
	subl2 $8,r2
L71:
# One optional 4-byte block.
	cmpl r2,$3
	jleq L72
#APP
	addl2 (r3)+,r6
	adwc     $0,r6
#NO_APP
	subl2 $4,r2
L72:
# Nothing left in this node: advance to the next one.
	tstl r2
	jleq L56
# Up to three bytes remain: fold the sum to 16 + 16 bits first.
	bicl3 $-65536,r6,r1
	extzv r8,r8,r6,r0
	addl3 r1,r0,r6
# Two or more bytes left: add one 16-bit word.
	cmpl r2,$1
	jleq L74
	movzwl (r3)+,r0
	addl2 r0,r6
	subl2 $2,r2
L74:
# A single trailing byte: add it, shift the sum left by 8 and toggle r7.
	tstl r2
	jleq L56
	movzbl (r3),r0
	addl2 r0,r6
	ashl $8,r6,r6
	xorl2 $1,r7
L56:
# Follow the link at offset 0; a non-zero link continues the node loop.
	movl (r4),r4
	jneq L76
L55:
# Chain exhausted: report if bytes were still requested.
	tstl r5
	jeql L77
	pushab LC0
	calls $1,_printf
L77:
# If the r7 flag is set, rotate the sum by 8 bits.
	tstl r7
	jeql L78
#APP
	rotl  $8,r6,r6
#NO_APP
L78:
# Final fold: r6 = (low 16 bits) + (bits 16-31); if that exceeds 65535,
# subtract 65535.
	bicl3 $-65536,r6,r1
	movl $16,r0
	extzv r0,r0,r6,r0
	addl3 r1,r0,r6
	movzwl $65535,r0
	cmpl r6,r0
	jlequ L79
	subl3 r0,r6,r6
L79:
# Return value: r0 = r6 XOR 0xffff.
	xorl3 $65535,r6,r0
	ret
Lfe1:
	.size	 _in_cksum,Lfe1-_in_cksum

--opJtzjQTFsWo+cga
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="in_cksum-O0.s"

#NO_APP
gcc2_compiled.:
___gnu_compiled_c:
.text
LC0:
	.ascii "cksum: out of data\12\0"
	.align 1
# _in_cksum, unoptimised build: same routine with every variable kept in the
# stack frame and reloaded around each statement.
#
# Frame and argument slots as they are used in this listing:
#   4(ap)    first argument: current chain node.  The code reads a link at
#            offset 0, a data pointer at offset 8 and a byte count at
#            offset 12 (presumably m_next / m_data / m_len -- confirm).
#   8(ap)    second argument: number of bytes still to be summed.
#   -4(fp)   read pointer into the current node's data.
#   -8(fp)   running sum.
#   -12(fp)  bytes left in the current node.
#   -16(fp)  flag, toggled each time a single odd byte is consumed.
#
# NOTE(review): in every inline-asm block below the pointer register is
# reloaded from -4(fp) before each statement, and the auto-incremented
# register value is never stored back to -4(fp).  As written, each add in a
# block therefore reads from the same address, and -4(fp) is not advanced
# past the block.
#
# Comments are kept on lines of their own because the listing is emitted in
# NO_APP mode.
.globl _in_cksum
	.type	 _in_cksum,@function
_in_cksum:
# Procedure entry mask 0: no registers are saved on entry.
	.word 0x0
# Reserve 16 bytes of locals; clear the sum, node byte count and flag.
# The pointer slot -4(fp) is not initialised here.
	subl2 $16,sp
	clrl -8(fp)
	clrl -12(fp)
	clrl -16(fp)
L53:
# Node loop condition: continue only while both the node pointer and the
# remaining length are non-zero.
	tstl 4(ap)
	jeql L57
	tstl 8(ap)
	jneq L56
	jbr L57
L57:
	jbr L54
L56:
# Load this node's byte count; a zero count skips to the next node.
	movl 4(ap),r0
	movl 12(r0),r1
	movl r1,-12(fp)
	tstl r1
	jneq L58
	jbr L55
L58:
# Load the data pointer, clamp the count to the remaining length and
# subtract it from that length.
	movl 4(ap),r0
	movl 8(r0),-4(fp)
	cmpl 8(ap),-12(fp)
	jgeq L59
	movl 8(ap),-12(fp)
L59:
	subl3 -12(fp),8(ap),8(ap)
# More than 15 bytes: take the large-block path, otherwise the small tail.
	cmpl -12(fp),$15
	jgtr L60
	jbr L61
L60:
# Low two pointer bits clear (longword aligned): no alignment fix-up needed.
	bicl3 $-4,-4(fp),r0
	tstl r0
	jeql L62
# Fold the sum: r3 = 16, r2 = 32 - 16, extzv extracts bits 16-31, and the
# result (low 16 bits) + (bits 16-31) is stored back.
	bicl3 $-65536,-8(fp),r0
	movl -8(fp),r1
	movl $16,r3
	subb3 r3,$32,r2
	extzv r3,r2,r1,r1
	addl3 r0,r1,-8(fp)
# Odd address: add one byte, shift the sum left by 8, toggle the flag and
# advance the pointer slot explicitly.
	bicl3 $-2,-4(fp),r0
	tstl r0
	jeql L63
	movzbl *-4(fp),r0
	addl2 r0,-8(fp)
	ashl $8,-8(fp),-8(fp)
	xorl2 $1,-16(fp)
	incl -4(fp)
	decl -12(fp)
L63:
# Bit 1 of the address set: add one 16-bit word and advance the pointer slot
# by 2.
	bicl3 $-3,-4(fp),r0
	tstl r0
	jeql L62
	movzwl *-4(fp),r0
	addl2 r0,-8(fp)
	addl2 $2,-4(fp)
	subl2 $2,-12(fp)
L64:
L62:
	nop
L65:
# 32-byte loop test: subtract 32 from the byte count and run the body while
# the result is non-negative.
	subl3 $32,-12(fp),r0
	movl r0,r1
	movl r1,-12(fp)
	tstl r1
	jgeq L67
	jbr L66
L67:
# Loop body: r5 = sum, r4 = pointer.  Each statement stores r5 back to
# -8(fp) and then reloads both registers; r4 comes from the unchanged
# -4(fp) every time (see the review note in the header).
	movl -8(fp),r5
	movl -4(fp),r4
#APP
	addl2 (r4)+,r5
#NO_APP
	movl r5,-8(fp)
	movl -8(fp),r5
	movl -4(fp),r4
#APP
	adwc  (r4)+,r5
#NO_APP
	movl r5,-8(fp)
	movl -8(fp),r5
	movl -4(fp),r4
#APP
	adwc  (r4)+,r5
#NO_APP
	movl r5,-8(fp)
	movl -8(fp),r5
	movl -4(fp),r4
#APP
	adwc  (r4)+,r5
#NO_APP
	movl r5,-8(fp)
	movl -8(fp),r5
	movl -4(fp),r4
#APP
	adwc  (r4)+,r5
#NO_APP
	movl r5,-8(fp)
	movl -8(fp),r5
	movl -4(fp),r4
#APP
	adwc  (r4)+,r5
#NO_APP
	movl r5,-8(fp)
	movl -8(fp),r5
	movl -4(fp),r4
#APP
	adwc  (r4)+,r5
#NO_APP
	movl r5,-8(fp)
	movl -8(fp),r5
	movl -4(fp),r4
#APP
	adwc  (r4)+,r5
#NO_APP
	movl r5,-8(fp)
	movl -8(fp),r5
# Add the final carry back into the sum.
#APP
	adwc     $0,r5
#NO_APP
	movl r5,-8(fp)
	jbr L65
L66:
# Undo the last subtraction, then handle one optional 16-byte block.
# The register roles are swapped here: r4 = sum, r5 = pointer.
	addl2 $32,-12(fp)
	cmpl -12(fp),$15
	jleq L68
	movl -8(fp),r4
	movl -4(fp),r5
#APP
	addl2 (r5)+,r4
#NO_APP
	movl r4,-8(fp)
	movl -8(fp),r4
	movl -4(fp),r5
#APP
	adwc  (r5)+,r4
#NO_APP
	movl r4,-8(fp)
	movl -8(fp),r4
	movl -4(fp),r5
#APP
	adwc  (r5)+,r4
#NO_APP
	movl r4,-8(fp)
	movl -8(fp),r4
	movl -4(fp),r5
#APP
	adwc  (r5)+,r4
#NO_APP
	movl r4,-8(fp)
	movl -8(fp),r4
#APP
	adwc     $0,r4
#NO_APP
	movl r4,-8(fp)
	subl2 $16,-12(fp)
L68:
	nop
L61:
# One optional 8-byte block (r5 = sum, r4 = pointer).
	cmpl -12(fp),$7
	jleq L69
	movl -8(fp),r5
	movl -4(fp),r4
#APP
	addl2 (r4)+,r5
#NO_APP
	movl r5,-8(fp)
	movl -8(fp),r5
	movl -4(fp),r4
#APP
	adwc  (r4)+,r5
#NO_APP
	movl r5,-8(fp)
	movl -8(fp),r5
#APP
	adwc     $0,r5
#NO_APP
	movl r5,-8(fp)
	subl2 $8,-12(fp)
L69:
# One optional 4-byte block (r4 = sum, r5 = pointer).
	cmpl -12(fp),$3
	jleq L70
	movl -8(fp),r4
	movl -4(fp),r5
#APP
	addl2 (r5)+,r4
#NO_APP
	movl r4,-8(fp)
	movl -8(fp),r4
#APP
	adwc     $0,r4
#NO_APP
	movl r4,-8(fp)
	subl2 $4,-12(fp)
L70:
# Nothing left in this node: advance to the next one.
	tstl -12(fp)
	jleq L55
# Up to three bytes remain: fold the sum to 16 + 16 bits first.
	bicl3 $-65536,-8(fp),r0
	movl -8(fp),r1
	movl $16,r3
	subb3 r3,$32,r2
	extzv r3,r2,r1,r1
	addl3 r0,r1,-8(fp)
# Two or more bytes left: add one 16-bit word through the pointer slot and
# advance the slot by 2.
	cmpl -12(fp),$1
	jleq L72
	movzwl *-4(fp),r0
	addl2 r0,-8(fp)
	addl2 $2,-4(fp)
	subl2 $2,-12(fp)
L72:
# A single trailing byte: add it, shift the sum left by 8, toggle the flag.
	tstl -12(fp)
	jleq L55
	movzbl *-4(fp),r0
	addl2 r0,-8(fp)
	ashl $8,-8(fp),-8(fp)
	xorl2 $1,-16(fp)
L73:
L71:
L55:
# Replace the node argument with the link stored at its offset 0 and loop.
	movl *4(ap),4(ap)
	jbr L53
L54:
# Chain finished: report if bytes were still requested.
	tstl 8(ap)
	jeql L74
	pushab LC0
	calls $1,_printf
L74:
# If the flag is set, rotate the sum by 8 bits.
	tstl -16(fp)
	jeql L75
	movl -8(fp),r4
#APP
	rotl  $8,r4,r4
#NO_APP
	movl r4,-8(fp)
L75:
# Final fold: sum = (low 16 bits) + (bits 16-31); if that exceeds 65535,
# subtract 65535.
	bicl3 $-65536,-8(fp),r0
	movl -8(fp),r1
	movl $16,r3
	subb3 r3,$32,r2
	extzv r3,r2,r1,r1
	addl3 r0,r1,-8(fp)
	cmpl -8(fp),$65535
	jlequ L76
	addl2 $-65535,-8(fp)
L76:
# Return value: r0 = sum XOR 0xffff.  The second ret is never reached.
	xorl3 $65535,-8(fp),r1
	movl r1,r0
	ret
	ret
Lfe1:
	.size	 _in_cksum,Lfe1-_in_cksum

--opJtzjQTFsWo+cga
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="in_cksum.c"

/*	$NetBSD: in_cksum.c,v 1.14 2000/03/30 13:24:55 augustss Exp $	*/

/*
 * Copyright (c) 1988, 1992, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)in_cksum.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <netinet/in.h>

/*
 * Checksum routine for Internet Protocol family headers (Portable Version).
 *
 * This routine is very heavily used in the network
 * code and should be modified for each CPU to be as fast as possible.
 */

/*
 * ADDCARRY(x): if x exceeds 65535, subtract 65535 from it in place.
 * The argument is expanded several times without parentheses and is
 * assigned to, so it must be a plain lvalue with no side effects.
 */
#define ADDCARRY(x)  (x > 65535 ? x -= 65535 : x)
/*
 * REDUCE: fold the 32-bit `sum' to the sum of its two 16-bit halves (read
 * through the l_util union) and apply ADDCARRY.  Expands to a braced block
 * that uses the local variables `sum' and `l_util' of the enclosing
 * function.
 */
#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);}

/*
 * in_cksum: compute the checksum over the first `len' bytes of data held in
 * the mbuf chain starting at `m'.
 *
 *	m	first mbuf of the chain; the chain is walked via m_next.
 *	len	number of bytes to sum.
 *
 * Returns the low 16 bits of the complement of the folded sum.  If the
 * chain ends while `len' is still non-zero, "cksum: out of data" is printed
 * and the checksum of the data that was present is returned.
 */
int
in_cksum(m, len)
	struct mbuf *m;
	int len;
{
	u_int16_t *w;		/* read pointer into the current mbuf */
	int sum = 0;		/* running sum, folded by REDUCE */
	/*
	 * Bytes left in the current mbuf.  The value -1 is also used as a
	 * marker that the previous mbuf ended in the middle of a 16-bit
	 * word, whose first byte is held in s_util.c[0].
	 */
	int mlen = 0;
	/* Set while the sum is shifted by 8 bits after an odd start address. */
	int byte_swapped = 0;

	/* Assembles one 16-bit word from two separately read bytes. */
	union {
		u_int8_t  c[2];
		u_int16_t s;
	} s_util;
	/* Gives REDUCE access to the two 16-bit halves of the sum. */
	union {
		u_int16_t s[2];
		u_int32_t l;
	} l_util;

	for (;m && len; m = m->m_next) {
		/* Empty mbufs contribute nothing and leave mlen untouched. */
		if (m->m_len == 0)
			continue;
		w = mtod(m, u_int16_t *);
		if (mlen == -1) {
			/*
			 * The first byte of this mbuf is the continuation
			 * of a word spanning between this mbuf and the
			 * last mbuf.
			 *
			 * s_util.c[0] is already saved when scanning previous 
			 * mbuf.
			 */
			s_util.c[1] = *(u_int8_t *)w;
			sum += s_util.s;
			w = (u_int16_t *)((u_int8_t *)w + 1);
			mlen = m->m_len - 1;
			len--;
		} else
			mlen = m->m_len;
		/* Never sum more than the caller asked for. */
		if (len < mlen)
			mlen = len;
		len -= mlen;
		/*
		 * Force to even boundary.
		 *
		 * The odd leading byte is parked in s_util.c[0] and the
		 * folded sum is shifted left by 8 bits; the shift is undone
		 * in the byte_swapped branch at the end of this iteration.
		 */
		if ((1 & (long) w) && (mlen > 0)) {
			REDUCE;
			sum <<= 8;
			s_util.c[0] = *(u_int8_t *)w;
			w = (u_int16_t *)((int8_t *)w + 1);
			mlen--;
			byte_swapped = 1;
		}
		/*
		 * Unroll the loop to make overhead from
		 * branches &c small.
		 */
		/* 32 bytes (sixteen 16-bit words) per iteration. */
		while ((mlen -= 32) >= 0) {
			sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
			sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7];
			sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11];
			sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15];
			w += 16;
		}
		/* Undo the subtraction that terminated the loop. */
		mlen += 32;
		/* 8 bytes (four 16-bit words) per iteration. */
		while ((mlen -= 8) >= 0) {
			sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
			w += 4;
		}
		mlen += 8;
		/* Nothing left and no pending byte swap: next mbuf. */
		if (mlen == 0 && byte_swapped == 0)
			continue;
		REDUCE;
		/*
		 * Remaining whole words.  This loop exits with mlen == -2
		 * when the data ended on a word boundary and with mlen == -1
		 * when one trailing byte is left at *w.
		 */
		while ((mlen -= 2) >= 0) {
			sum += *w++;
		}
		if (byte_swapped) {
			/* Undo the 8-bit shift applied for the odd start. */
			REDUCE;
			sum <<= 8;
			byte_swapped = 0;
			if (mlen == -1) {
				/*
				 * The trailing byte completes the word whose
				 * first byte was parked in s_util.c[0].
				 */
				s_util.c[1] = *(u_int8_t *)w;
				sum += s_util.s;
				mlen = 0;
			} else
				/*
				 * No trailing byte: the parked byte is still
				 * pending, so flag it for the next mbuf.
				 */
				mlen = -1;
		} else if (mlen == -1)
			/* Park the trailing byte for the next mbuf. */
			s_util.c[0] = *(u_int8_t *)w;
	}
	if (len)
		printf("cksum: out of data\n");
	if (mlen == -1) {
		/* The last mbuf has odd # of bytes. Follow the
		   standard (the odd byte may be shifted left by 8 bits
		   or not as determined by endian-ness of the machine) */
		s_util.c[1] = 0;
		sum += s_util.s;
	}
	REDUCE;
	return (~sum & 0xffff);
}

--opJtzjQTFsWo+cga--