Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/sys/arch/arm/cortex Add preliminary version of a NEON based ...
details: https://anonhg.NetBSD.org/src/rev/25a66c8c5482
branches: trunk
changeset: 783369:25a66c8c5482
user: matt <matt%NetBSD.org@localhost>
date: Mon Dec 17 00:44:03 2012 +0000
description:
Add preliminary version of a NEON based in_cksum routine.
diffstat:
sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S | 141 ++++++++++++++++++++++++++++
sys/arch/arm/cortex/cpu_in_cksum_neon.c | 124 ++++++++++++++++++++++++
sys/arch/arm/cortex/files.cortex | 7 +-
3 files changed, 271 insertions(+), 1 deletions(-)
diffs (287 lines):
diff -r 6be0c6fff1e6 -r 25a66c8c5482 sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S Mon Dec 17 00:44:03 2012 +0000
@@ -0,0 +1,141 @@
+/*-
+ * Copyright (c) 2012 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Matt Thomas of 3am Software Foundry.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+#include "assym.h"
+
+RCSID("$NetBSD: cpu_in_cksum_asm_neon.S,v 1.1 2012/12/17 00:44:03 matt Exp $")
+
+/*
+ * uint32_t
+ * cpu_in_cksum_neon(const void *dptr, size_t dlen)
+ *
+ * Add up the dlen bytes at dptr as 16-bit words, taken as they are laid
+ * out in memory from the enclosing 16-byte (qword) boundary, and return
+ * a partially folded sum.
+ *
+ * In:	r0 = dptr
+ *	r1 = dlen
+ * Out:	r0 = low 16 bits + high 16 bits of the 32-bit total.  This can
+ *	     exceed 0xffff, so the caller must do the final fold.
+ *
+ * Register use:
+ *	ip = current qword pointer (dptr rounded down to 16 bytes)
+ *	r1 = number of leading bytes to ignore in the first qword
+ *	r2 = number of valid bytes in the final, partial qword
+ *	r3 = end pointer, later rounded down to a qword boundary
+ *	q3 = accumulator of four 32-bit partial sums
+ *	r0, q0, q2 = scratch
+ *
+ * Pointers and lengths are unsigned quantities, so they are compared
+ * with the unsigned conditions (lo/hs); the signed ones give the wrong
+ * answer when the two operands differ in bit 31.
+ */
+ENTRY(cpu_in_cksum_neon)
+	str	lr, [sp, #-8]!		/* save lr */
+	mov	ip, r0			/* leave r0 as temp */
+	add	r3, r1, ip		/* get end pointer */
+	ands	r1, ip, #15		/* get qword offset (sets Z) */
+	bic	ip, ip, #15		/* start on a qword boundary */
+	veor	q3, q3, q3		/* clear accumulator */
+	beq	.Lpre_main_loop		/* ya, qword boundary start */
+
+	sub	r0, r3, ip		/* get length to qword start */
+	cmp	r0, #16			/* do we have at least a qword? */
+	andlo	r2, r3, #15		/* no, factor in trailing bytes */
+	blo	.Ltrailing_bytes	/* and do the last partial qword */
+	mov	r2, #0			/* yes, no trailing bytes */
+	bl	partial_qword		/* do the partial initial qword */
+	mov	r1, #0			/* no more leading bytes */
+
+.Lpre_main_loop:
+	and	r2, r3, #15		/* trailing bytes */
+	bic	r3, r3, #15		/* last partial or empty qword */
+	cmp	ip, r3			/* at or past the end? */
+	bhs	.Ltrailing_bytes	/* yes, deal with any trailing bytes */
+
+.Lmain_loop:
+	vld1.64	{d4-d5}, [ip:128]!	/* fetch one full qword */
+	vmovl.u16 q0, d4		/* 4 U16 -> 4 U32 */
+	vadd.u32 q3, q3, q0		/* add 4 U32 to accumulator */
+	vmovl.u16 q0, d5		/* 4 U16 -> 4 U32 */
+	vadd.u32 q3, q3, q0		/* add 4 U32 to accumulator */
+	cmp	ip, r3			/* reached the last full qword? */
+	blo	.Lmain_loop		/* no, keep going */
+
+.Ltrailing_bytes:
+	cmp	r2, #0			/* any trailing bytes? */
+	blne	partial_qword		/* yes, do final qword */
+	ldr	lr, [sp], #8		/* fetch LR */
+
+.Lfold_csum:
+	/*
+	 * We now have four 32-bit partial sums in q3.
+	 * Reduce them to a single 32-bit sum and then add its two
+	 * 16-bit halves together.  cpu_in_cksum_neon_v4hdr branches
+	 * here as well, so nothing below may depend on the stack.
+	 */
+	vadd.u32 d6, d6, d7		/* 4 I32 -> 2 I32 */
+	vmovl.u32 q3, d6		/* split two I32 into two I64 */
+	vadd.u32 d6, d6, d7		/* 2 I32 -> 1 I32 */
+	vmovl.u16 q3, d6		/* split two I16 into two I32 */
+	vmovl.u32 q3, d6		/* split two I32 into two I64 */
+	vadd.u32 d6, d6, d7		/* 2 I16 -> 1 I32 */
+	vmov	r0, s12			/* fetch csum from d6/q3 */
+	/*
+	 * The result is the sum of two 16-bit halves, so it can be
+	 * 0x10000 or slightly more; we expect the caller to fold it.
+	 */
+	RET
+END(cpu_in_cksum_neon)
+
+/*
+ * partial_qword: add the wanted bytes of one aligned qword to the sum.
+ *
+ * Handling partial qwords is tricky: the whole qword at ip is loaded and
+ * then ANDed with up to two masks before it is accumulated.
+ *
+ * In:  ip = qword to load (post-incremented by the load)
+ *      r1 = number of leading bytes; when non-zero, r1 * 8 is passed in
+ *           r0 to __neon_leading_qword_bitmask
+ *      r2 = number of trailing bytes, handled the same way
+ *      q3 = accumulator of four 32-bit sums
+ * Out: q3 updated; r0, q0 and q2 are overwritten.
+ *
+ * NOTE(review): __neon_leading_qword_bitmask is not visible here.  This
+ * code presumably relies on it returning its mask in q0 and preserving
+ * r1-r3, ip and q2-q3 -- confirm against its definition.
+ */
+ .type partial_qword, %function
+partial_qword:
+ str lr, [sp, #-8]! /* save LR, we may call out below */
+ vld1.64 {d4-d5}, [ip:128]! /* fetch data into q2 */
+ veor q0, q0, q0 /* create a null mask */
+ movs r0, r1, lsl #3 /* any leading bytes? (r0 = bit count) */
+ blne _C_LABEL(__neon_leading_qword_bitmask) /* yes, call out for a mask */
+ vmvn.u64 q0, q0 /* invert leading mask to trailing */
+ vand.u32 q2, q2, q0 /* AND the mask into the data */
+ vmvn.u64 q0, #0 /* default mask: all bits set */
+ movs r0, r2, lsl #3 /* any trailing bytes? (r0 = bit count) */
+ blne _C_LABEL(__neon_leading_qword_bitmask) /* yes, call out for a mask */
+ vand.u32 q2, q2, q0 /* AND the mask into the data */
+ ldr lr, [sp], #8 /* Fetch LR */
+ vmovl.u16 q0, d4 /* 4 U16 -> 4 U32 */
+ vadd.u32 q3, q3, q0 /* add 4 U32 to accumulator */
+ vmovl.u16 q0, d5 /* 4 U16 -> 4 U32 */
+ vadd.u32 q3, q3, q0 /* add 4 U32 to accumulator */
+ RET
+ .size partial_qword, . - partial_qword
+
+/*
+ * uint32_t cpu_in_cksum_neon_v4hdr(const void *dptr)
+ *
+ * Sum the five 32-bit words (ten 16-bit words) starting at dptr and
+ * return the partially folded result via .Lfold_csum.
+ *
+ * In:	r0 = dptr.  Only bit 2 of the address is examined, so dptr is
+ *	     expected to be at least 32-bit aligned.
+ * Out:	r0 = low 16 bits + high 16 bits of the total; it can exceed
+ *	     0xffff, so the caller must do the final fold.
+ *
+ * 24 bytes (s0-s5) are loaded from dptr rounded down to a 64-bit
+ * boundary, so exactly one of the six words is not part of the data:
+ * s5 when dptr is 64-bit aligned, s0 otherwise.
+ *
+ * Clobbers ip, q0-q3 and the flags.  Uses no stack.
+ */
+ENTRY(cpu_in_cksum_neon_v4hdr)
+	veor	q3, q3, q3		/* clear accumulator */
+	bic	ip, r0, #7		/* round down to a 64-bit boundary */
+	vld1.32	{d0-d2}, [ip]		/* it must be in 24 bytes */
+	tst	r0, #4			/* do the data start at s1? */
+	beq	1f			/* no, s0-s4; s5 is the extra word */
+	vmov	s0, s5			/* move last U32 to first U32 */
+1:	vmovl.u32 q1, d2		/* move s5 to d3 and clear s5 */
+	vmovl.u16 q2, d0		/* 4 U16 -> 4 U32 */
+	vadd.u32 q3, q3, q2		/* add 4 U32 to accumulator */
+	vmovl.u16 q2, d1		/* 4 U16 -> 4 U32 */
+	vadd.u32 q3, q3, q2		/* add 4 U32 to accumulator */
+	vmovl.u16 q2, d2		/* 4 U16 -> 4 U32 (upper two are 0) */
+	vadd.u32 q3, q3, q2		/* add 4 U32 to accumulator */
+	b	.Lfold_csum		/* reduce q3 to one value and return */
+END(cpu_in_cksum_neon_v4hdr)
diff -r 6be0c6fff1e6 -r 25a66c8c5482 sys/arch/arm/cortex/cpu_in_cksum_neon.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/arch/arm/cortex/cpu_in_cksum_neon.c Mon Dec 17 00:44:03 2012 +0000
@@ -0,0 +1,124 @@
+/*-
+ * Copyright (c) 2012 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Matt Thomas of 3am Software Foundry.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+
+__KERNEL_RCSID(0, "$NetBSD: cpu_in_cksum_neon.c,v 1.1 2012/12/17 00:44:03 matt Exp $");
+
+#include <sys/param.h>
+#include <sys/cpu.h>
+#include <sys/mbuf.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+
+uint32_t cpu_in_cksum_neon(const void *, size_t);
+uint32_t cpu_in_cksum_neon_v4hdr(const void *);
+
+/*
+ * Compute the 16-bit one's complement sum of len bytes of the mbuf chain
+ * m, starting off bytes into the chain and seeded with initial_sum,
+ * using the NEON helper routines.
+ *
+ * Returns the folded sum in the range 1..0xffff; 0 is never returned
+ * (0xffff is returned in its place).
+ *
+ * NOTE(review): the sum is returned as-is, not complemented; confirm
+ * that this is what the callers of cpu_in_cksum() expect.
+ */
+int
+cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
+{
+	uint32_t csum = initial_sum;
+	int odd = 0;
+
+	/*
+	 * Take control of the NEON unit.
+	 */
+	vfp_hijack();
+
+	/*
+	 * Fast path for the normal ip_header
+	 */
+	if (off == 0
+	    && csum == 0
+	    && len == sizeof(struct ip)
+	    && ((uintptr_t)m->m_data & 3) == 0
+	    && m->m_len >= len) {
+		csum = cpu_in_cksum_neon_v4hdr(m->m_data);
+
+		/*
+		 * We are now done with NEON.
+		 */
+		vfp_surrender();
+
+		/*
+		 * The assembly returns low16 + high16 of the total, which
+		 * for ten 16-bit words can be anything up to 0x10008, not
+		 * just 0x10000, so fold once more instead of special
+		 * casing a single value (0x10000 still becomes 1).
+		 */
+		csum = (csum >> 16) + (csum & 0xffff);
+		return csum == 0 ? 0xffff : csum; /* never return 0. */
+	}
+
+	/*
+	 * Skip the initial mbufs that lie entirely before the offset.
+	 * The length of the mbuf being skipped is consumed before we
+	 * advance to the next one.  With nothing to sum there is no need
+	 * to walk the chain at all.
+	 */
+	while (len > 0 && off >= m->m_len) {
+		off -= m->m_len;
+		m = m->m_next;
+		KASSERT(m != NULL);
+	}
+
+	for (; len > 0; m = m->m_next, off = 0) {
+		KASSERT(m != NULL);
+		int dlen = MIN(m->m_len - off, len);
+		const void *dptr = m->m_data + off;
+		/*
+		 * This routine will add based on the memory layout so
+		 * if the previous len was odd or this buffer starts
+		 * on an odd address, shift the csum by 8 so it's properly
+		 * aligned.  It will be taken care of when we do the final
+		 * checksum fold.
+		 */
+		uint32_t tmpsum = cpu_in_cksum_neon(dptr, dlen);
+		if (odd ^ ((uintptr_t)dptr & 1))
+			tmpsum <<= 8;
+		/*
+		 * Accumulate checksum, folding will be done later.
+		 * A carry out of 32 bits is worth 1 in one's complement
+		 * arithmetic (2^32 mod 0xffff == 1), so add it back in
+		 * rather than losing it.
+		 */
+		csum += tmpsum;
+		if (csum < tmpsum)
+			csum++;
+		odd ^= dlen & 1;
+		len -= dlen;
+	}
+
+	/*
+	 * We are now done with NEON.
+	 */
+	vfp_surrender();
+
+	/*
+	 * Time to fold the checksum
+	 */
+	csum = (csum >> 16) + (csum & 0xffff);
+	/*
+	 * Now it could be 0x1xxxx so fold again
+	 */
+	csum = (csum >> 16) + (csum & 0xffff);
+
+	KASSERT(csum <= 0x10000);
+	if (csum == 0x10000)	/* note 0x10000 - 0xffff == 1 */
+		return 1;
+	return csum == 0 ? 0xffff : csum; /* never return 0. */
+}
diff -r 6be0c6fff1e6 -r 25a66c8c5482 sys/arch/arm/cortex/files.cortex
--- a/sys/arch/arm/cortex/files.cortex Sun Dec 16 23:47:56 2012 +0000
+++ b/sys/arch/arm/cortex/files.cortex Mon Dec 17 00:44:03 2012 +0000
@@ -1,4 +1,9 @@
-# $NetBSD: files.cortex,v 1.2 2012/09/02 16:55:10 matt Exp $
+# $NetBSD: files.cortex,v 1.3 2012/12/17 00:44:03 matt Exp $
+
+defflag opt_cpu_in_cksum.h NEON_IN_CKSUM
+
+file arch/arm/cortex/cpu_in_cksum_neon.c (inet | inet6) & neon_in_cksum
+file arch/arm/cortex/cpu_in_cksum_asm_neon.S (inet | inet6) & neon_in_cksum
device armperiph {}
attach armperiph at mainbus
Home |
Main Index |
Thread Index |
Old Index