Subject: New in_cksum/in4_cksum implementation
To: None <port-arm@netbsd.org>
From: Steve Woodford <scw@wasabisystems.com>
List: port-arm
Date: 09/11/2003 09:17:27
Hi folks,
I've been doing some Xscale optimisation work recently for Wasabi,
part of which involved rewriting in_cksum/in4_cksum in assembly.
While the resulting code is hand-crafted for Xscale, I've added the
necessary tweaks to support vanilla ARM CPUs too. Thanks to Chris
Gilbert for useful feedback on that side of things.
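For anyone who wants a refresher, here is roughly what the assembly
computes, boiled down to C. This is just an illustrative sketch of the
standard RFC 1071 ones-complement sum over a flat buffer (the names are
mine, and the real code below also walks mbuf chains and folds in the
IPv4 pseudo header):

    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative only; not the committed code. */
    uint16_t
    cksum_ref(const uint8_t *buf, size_t len)
    {
        uint32_t sum = 0;

        /* Sum the buffer as 16-bit big-endian words. */
        while (len > 1) {
            sum += ((uint32_t)buf[0] << 8) | buf[1];
            buf += 2;
            len -= 2;
        }

        /* An odd trailing byte is padded with zero. */
        if (len == 1)
            sum += (uint32_t)buf[0] << 8;

        /* Fold the carries back in, then take the ones complement. */
        while (sum >> 16)
            sum = (sum & 0xffff) + (sum >> 16);

        return (uint16_t)~sum;
    }

The assembly gets its speed from doing this 32 bits (or 64, via ldrd)
at a time with add-with-carry, which gives the same result because the
carries are folded back in at the end.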
Benchmark tests with a gigabit Ethernet card (using
pkgsrc/benchmarks/nttcp) show a 7% to 29% improvement in throughput
over the old code, depending on data size. I don't have figures for
regular ARM CPUs, since I don't have an ARM board with fast enough
Ethernet, but I'd still expect to see some improvement.
Wasabi would like to contribute this code back to NetBSD. If there are
no objections, I'd like to commit the attached code to the NetBSD
tree ASAP. I'd also like to see some figures from non-Xscale machines
with decent Ethernet. :)
Comments?
Cheers, Steve
--
Wasabi Systems Inc. - The NetBSD Company -
http://www.wasabisystems.com/
[Attachment: in_cksum_arm.S]
/* $NetBSD$ */
/*
* Copyright 2003 Wasabi Systems, Inc.
* All rights reserved.
*
* Written by Steve C. Woodford for Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the NetBSD Project by
* Wasabi Systems, Inc.
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
* or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Hand-optimised in_cksum() and in4_cksum() implementations for ARM/Xscale
*/
#include "opt_inet.h"
#include <machine/asm.h>
#include "assym.h"
/*
* int in_cksum(struct mbuf *m, int len)
*
* Entry:
* r0 m
* r1 len
*
* NOTE: Assumes 'm' is *never* NULL.
*/
/* LINTSTUB: Func: int in_cksum(struct mbuf *, int) */
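/*
 * Register usage in the loop below:
 *   r8  - running 32-bit partial sum
 *   r9  - bytes of 'len' still to be summed
 *   r10 - count of bytes summed so far (tracks byte-lane parity)
 *   ip  - walks the mbuf chain
 */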
ENTRY(in_cksum)
stmfd sp!, {r4-r11,lr}
mov r8, #0x00
mov r9, r1
mov r10, #0x00
mov ip, r0
.Lin_cksum_loop:
ldr r1, [ip, #(M_LEN)]
ldr r0, [ip, #(M_DATA)]
ldr ip, [ip, #(M_NEXT)]
.Lin_cksum_entry4:
cmp r9, r1
movlt r1, r9
sub r9, r9, r1
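/*
 * If the parity of this chunk's start address differs from the
 * parity of the number of bytes summed so far (r10), the partial
 * sum returned by L_cksumdata will have its bytes in the wrong
 * lanes; note that in r11 so it can be fixed up with the rotate
 * below.
 */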
eor r11, r10, r0
add r10, r10, r1
adds r2, r1, #0x00
blne _ASM_LABEL(L_cksumdata)
tst r11, #0x01
movne r2, r2, ror #8
adds r8, r8, r2
adc r8, r8, #0x00
cmp ip, #0x00
bne .Lin_cksum_loop
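/*
 * End of chain: fold the 32-bit sum in r8 down to 16 bits, adding
 * the carries back in, and return its ones complement.
 */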
mov r1, #0xff
orr r1, r1, #0xff00
and r0, r8, r1
add r0, r0, r8, lsr #16
add r0, r0, r0, lsr #16
and r0, r0, r1
eor r0, r0, r1
ldmfd sp!, {r4-r11,pc}
#ifdef INET
/*
* int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len)
*
* Entry:
* r0 m
* r1 nxt
* r2 off
* r3 len
*/
/* LINTSTUB: Func: int in4_cksum(struct mbuf *, u_int8_t, int, int) */
ENTRY(in4_cksum)
stmfd sp!, {r4-r11,lr}
mov r8, #0x00 /* Accumulate sum in r8 */
/*
* First, deal with a pseudo header, if present
*/
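/*
 * The pseudo header contributes the source and destination addresses
 * plus the 'nxt' protocol and 'len' values to the sum.
 */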
ldr r6, [r0, #(M_DATA)]
cmp r1, #0x00
beq .Lin4_cksum_skip_entry
add r8, r1, r3 /* sum = nxt + len */
add r1, r6, #(IP_SRC)
tst r1, #0x03 /* Data 32-bit aligned? */
ldreq r5, [r6, #(IP_SRC)]
ldreq r4, [r6, #(IP_DST)]
beq .Lin4_cksum_add_ips
/*
 * It would be nice to use this for more than just Xscale, but not all
 * ARM CPUs (and acorn32's bus) can deal with half-word loads.
 */
#ifdef __XSCALE__
tst r1, #0x01 /* Data 16-bit aligned? */
ldreqh r5, [r6, #(IP_SRC)]
ldreqh r7, [r6, #(IP_DST + 2)]
ldreq r4, [r6, #(IP_SRC + 2)]
orreq r5, r5, r7, lsl #16
beq .Lin4_cksum_add_ips
#endif
/* Data is aligned to an odd address. Bummer. Do it the slow way. */
ldrb r4, [r6, #(IP_SRC + 0)]
ldrb r1, [r6, #(IP_SRC + 1)]
ldrb r7, [r6, #(IP_SRC + 2)]
ldrb r9, [r6, #(IP_SRC + 3)]
#ifndef __ARMEB__
orr r4, r4, r1, lsl #8 /* ..10 */
orr r4, r4, r7, lsl #16 /* .210 */
orr r4, r4, r9, lsl #24 /* 3210 */
#else
orr r4, r9, r4, lsl #24 /* 0..3 */
orr r4, r4, r1, lsl #16 /* 01.3 */
orr r4, r4, r7, lsl #8 /* 0123 */
#endif
ldrb r5, [r6, #(IP_DST + 0)]
ldrb r1, [r6, #(IP_DST + 1)]
ldrb r7, [r6, #(IP_DST + 2)]
ldrb r9, [r6, #(IP_DST + 3)]
#ifndef __ARMEB__
orr r5, r5, r1, lsl #8 /* ..10 */
orr r5, r5, r7, lsl #16 /* .210 */
orr r5, r5, r9, lsl #24 /* 3210 */
#else
orr r5, r9, r5, lsl #24 /* 0..3 */
orr r5, r5, r1, lsl #16 /* 01.3 */
orr r5, r5, r7, lsl #8 /* 0123 */
#endif
.Lin4_cksum_add_ips:
adds r5, r5, r4
#ifndef __ARMEB__
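/*
 * Little-endian: shift (nxt + len) left by 8 so its bytes land in the
 * same lanes as the byte-swapped data words; the byte shifted into
 * the top half is recovered by the final fold.
 */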
adcs r8, r5, r8, lsl #8
#else
adcs r8, r5, r8
#endif
adc r8, r8, #0x00
mov r1, #0x00
b .Lin4_cksum_skip_entry
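/*
 * Skip over the first 'off' bytes of the mbuf chain to find the start
 * of the data to be summed, then join the in_cksum() loop above.
 */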
.Lin4_cksum_skip_loop:
ldr r1, [r0, #(M_LEN)]
ldr r6, [r0, #(M_DATA)]
ldr r0, [r0, #(M_NEXT)]
.Lin4_cksum_skip_entry:
subs r2, r2, r1
blt .Lin4_cksum_skip_done
cmp r0, #0x00
bne .Lin4_cksum_skip_loop
b .Lin4_cksum_whoops
.Lin4_cksum_skip_done:
mov ip, r0
add r0, r2, r6
add r0, r0, r1
rsb r1, r2, #0x00
mov r9, r3
mov r10, #0x00
b .Lin_cksum_entry4
.Lin4_cksum_whoops:
adr r0, .Lin4_cksum_whoops_str
bl _C_LABEL(panic)
.Lin4_cksum_whoops_str:
.asciz "in4_cksum: out of mbufs\n"
.align 5
#endif /* INET */
/*
* The main in*_cksum() workhorse...
*
* Entry parameters:
* r0 Pointer to buffer
* r1 Buffer length
* lr Return address
*
* Returns:
* r2 Accumulated 32-bit sum
*
* Clobbers:
* r0-r7
*/
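/*
 * The strategy is to accumulate 32-bit words using add-with-carry and
 * fold the carries back in; since ones-complement addition is order-
 * independent, this is equivalent to the 16-bit sum the callers
 * eventually fold the result down to.
 */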
/* LINTSTUB: Ignore */
ASENTRY_NP(L_cksumdata)
#ifdef __XSCALE__
pld [r0] /* Pre-fetch the start of the buffer */
#endif
mov r2, #0
mov r3, #0
/* We first have to word-align the buffer. */
ands r7, r0, #0x03
beq .Lcksumdata_wordaligned
rsb r7, r7, #0x04
cmp r1, r7 /* Enough bytes left to make it? */
blt .Lcksumdata_endgame
cmp r7, #0x02
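/*
 * The flags from the cmp above also encode the address parity: eq
 * (two bytes needed) means the buffer starts on an even address, ne
 * an odd one. The byte merges below rely on this.
 */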
ldrb r4, [r0], #0x01 /* Fetch 1st byte */
ldrgeb r5, [r0], #0x01 /* Fetch 2nd byte */
movlt r5, #0x00
ldrgtb r6, [r0], #0x01 /* Fetch 3rd byte */
movle r6, #0x00
/* Combine the three bytes depending on endianness and alignment */
#ifdef __ARMEB__
orreq r2, r5, r4, lsl #8
orreq r2, r2, r6, lsl #24
orrne r2, r4, r5, lsl #8
orrne r2, r2, r6, lsl #16
#else
orreq r2, r4, r5, lsl #8
orreq r2, r2, r6, lsl #16
orrne r2, r5, r4, lsl #8
orrne r2, r2, r6, lsl #24
#endif
subs r1, r1, r7 /* Update length */
moveq pc, lr /* All done? */
/* Buffer is now word aligned */
.Lcksumdata_wordaligned:
#ifdef __XSCALE__
cmp r1, #0x04 /* Less than 4 bytes left? */
blt .Lcksumdata_endgame /* Yup */
tst r0, #0x04 /* Now try to quad-align */
subne r1, r1, #0x04
ldrne r3, [r0], #0x04
subs r1, r1, #0x40
blt .Lcksumdata_bigloop_end
/*
* Buffer is now quad aligned. Sum 64 bytes at a time.
* Note: First ldrd is hoisted above the loop so that
* subsequent iterations avoid an additional one-cycle
* result penalty.
*/
ldrd r4, [r0], #0x08
.Lcksumdata_bigloop:
pld [r0, #0x18]
ldrd r6, [r0], #0x08
adds r2, r2, r4
adcs r2, r2, r5
ldrd r4, [r0], #0x08
adcs r2, r2, r6
adcs r2, r2, r7
ldrd r6, [r0], #0x08
adcs r2, r2, r4
adcs r2, r2, r5
ldrd r4, [r0], #0x08
adcs r2, r2, r6
adcs r2, r2, r7
pld [r0, #0x18]
ldrd r6, [r0], #0x08
adcs r2, r2, r4
adcs r2, r2, r5
ldrd r4, [r0], #0x08
adcs r2, r2, r6
adcs r2, r2, r7
ldrd r6, [r0], #0x08
adcs r2, r2, r4
adcs r2, r2, r5
adcs r2, r2, r6 /* XXX: 1-cycle result penalty :-/ */
adcs r2, r2, r7
adc r2, r2, #0x00
subs r1, r1, #0x40
ldrged r4, [r0], #0x08
bge .Lcksumdata_bigloop
#else /* !__XSCALE__ */
subs r1, r1, #0x40
blt .Lcksumdata_bigloop_end
.Lcksumdata_bigloop:
ldmia r0!, {r4, r5, r6, r7}
adds r2, r2, r4
adcs r2, r2, r5
adcs r2, r2, r6
adcs r2, r2, r7
ldmia r0!, {r4, r5, r6, r7}
adcs r2, r2, r4
adcs r2, r2, r5
adcs r2, r2, r6
adcs r2, r2, r7
ldmia r0!, {r4, r5, r6, r7}
adcs r2, r2, r4
adcs r2, r2, r5
adcs r2, r2, r6
adcs r2, r2, r7
ldmia r0!, {r4, r5, r6, r7}
adcs r2, r2, r4
adcs r2, r2, r5
adcs r2, r2, r6
adcs r2, r2, r7
adc r2, r2, #0x00
subs r1, r1, #0x40
bge .Lcksumdata_bigloop
#endif
.Lcksumdata_bigloop_end:
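/*
 * Fold in r3 (the word consumed while quad-aligning, if any) and add
 * back the 0x40 subtracted by the loop control.
 */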
adds r2, r2, r3
adc r2, r2, #0x00
adds r1, r1, #0x40
moveq pc, lr
cmp r1, #0x20
blt .Lcksumdata_less_than_32
#ifdef __XSCALE__
ldrd r4, [r0], #0x08
pld [r0, #0x18]
ldrd r6, [r0], #0x08
adds r2, r2, r4
adcs r2, r2, r5
ldrd r4, [r0], #0x08
adcs r2, r2, r6
adcs r2, r2, r7
ldrd r6, [r0], #0x08
#else
ldmia r0!, {r4, r5, r6, r7}
adds r2, r2, r4
adcs r2, r2, r5
adcs r2, r2, r6
adcs r2, r2, r7
ldmia r0!, {r4, r5, r6, r7}
#endif
adcs r2, r2, r4
adcs r2, r2, r5
adcs r2, r2, r6
adcs r2, r2, r7
adc r2, r2, #0x00
subs r1, r1, #0x20
moveq pc, lr
.Lcksumdata_less_than_32:
/* Fewer than 32 bytes remain */
and r3, r1, #0x18
rsb r4, r3, #0x18
sub r1, r1, r3
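/*
 * Computed branch into the ldm ladder below. Each 8-byte step is
 * three instructions (12 bytes), so advance pc by r4 * 3/2, where r4
 * is the byte count (a multiple of 8) we do not need to sum.
 */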
adds r4, r4, r4, lsr #1 /* Side effect: Clear carry flag */
add pc, pc, r4
nop
/*
* Note: We use ldm here, even on Xscale, since the combined issue/result
* latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
*/
/* At least 24 bytes remaining... */
ldmia r0!, {r4, r5}
adcs r2, r2, r4
adcs r2, r2, r5
/* At least 16 bytes remaining... */
ldmia r0!, {r4, r5}
adcs r2, r2, r4
adcs r2, r2, r5
/* At least 8 bytes remaining... */
ldmia r0!, {r4, r5}
adcs r2, r2, r4
adcs r2, r2, r5
/* Less than 8 bytes remaining... */
adc r2, r2, #0x00
subs r1, r1, #0x04
blt .Lcksumdata_lessthan4
ldr r4, [r0], #0x04
sub r1, r1, #0x04
adds r2, r2, r4
adc r2, r2, #0x00
/* Deal with < 4 bytes remaining */
.Lcksumdata_lessthan4:
adds r1, r1, #0x04
moveq pc, lr
/* Deal with 1 to 3 remaining bytes, possibly misaligned */
.Lcksumdata_endgame:
ldrb r3, [r0] /* Fetch first byte */
cmp r1, #0x02
ldrgeb r4, [r0, #0x01] /* Fetch 2nd and 3rd as necessary */
movlt r4, #0x00
ldrgtb r5, [r0, #0x02]
movle r5, #0x00
/* Combine the three bytes depending on endianness and alignment */
tst r0, #0x01
#ifdef __ARMEB__
orreq r3, r4, r3, lsl #8
orreq r3, r3, r5, lsl #24
orrne r3, r3, r4, lsl #8
orrne r3, r3, r5, lsl #16
#else
orreq r3, r3, r4, lsl #8
orreq r3, r3, r5, lsl #16
orrne r3, r4, r3, lsl #8
orrne r3, r3, r5, lsl #24
#endif
adds r2, r2, r3
adc r2, r2, #0x00
mov pc, lr