Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/arch/arm/cortex Add preliminary version of a NEON based ...



details:   https://anonhg.NetBSD.org/src/rev/25a66c8c5482
branches:  trunk
changeset: 783369:25a66c8c5482
user:      matt <matt%NetBSD.org@localhost>
date:      Mon Dec 17 00:44:03 2012 +0000

description:
Add preliminary version of a NEON based in_cksum routine.

diffstat:

 sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S |  141 ++++++++++++++++++++++++++++
 sys/arch/arm/cortex/cpu_in_cksum_neon.c     |  124 ++++++++++++++++++++++++
 sys/arch/arm/cortex/files.cortex            |    7 +-
 3 files changed, 271 insertions(+), 1 deletions(-)

diffs (287 lines):

diff -r 6be0c6fff1e6 -r 25a66c8c5482 sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S       Mon Dec 17 00:44:03 2012 +0000
@@ -0,0 +1,141 @@
+/*-
+ * Copyright (c) 2012 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Matt Thomas of 3am Software Foundry.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+#include "assym.h"
+
+RCSID("$NetBSD: cpu_in_cksum_asm_neon.S,v 1.1 2012/12/17 00:44:03 matt Exp $")
+
+/*
+ * uint32_t
+ * cpu_in_cksum_neon(const void *dptr, size_t dlen)
+ *
+ *     r0 = dptr, r1 = dlen; r0 returns a sum the caller must still fold.
+ *     Inside: ip = qword cursor, r3 = end, r1/r2 = lead/trail byte counts.
+ */
+ENTRY(cpu_in_cksum_neon)
+       str             lr, [sp, #-8]!  /* save lr, we call partial_qword */
+       mov             ip, r0          /* leave r0 as temp */
+       add             r3, r1, ip      /* get end pointer */
+       ands            r1, ip, #15     /* get qword offset */
+       bic             ip, ip, #15     /* start on a qword boundary */
+       veor            q3, q3, q3      /* clear accumulator */
+       beq             .Lpre_main_loop /* ya, qword boundary start */
+
+       sub             r0, r3, ip      /* bytes from qword start to end */
+       cmp             r0, #16         /* do we have at least a qword? */
+       andlo           r2, r3, #15     /* no, factor in trailing bytes */
+       blo             .Ltrailing_bytes /*   and do the last partial qword */
+       mov             r2, #0          /* yes, no trailing bytes */
+       bl              partial_qword   /* do the partial initial qword */
+       mov             r1, #0          /* no more leading bytes */
+
+.Lpre_main_loop:
+       and             r2, r3, #15     /* trailing bytes */
+       bic             r3, r3, #15     /* last partial or empty qword */
+       cmp             ip, r3          /* at or past the end? (unsigned) */
+       bhs             .Ltrailing_bytes /* yes, deal with any trailing bytes */
+
+.Lmain_loop:
+       vld1.64         {d4-d5}, [ip:128]!
+       vmovl.u16       q0, d4          /* 4 U16 -> 4 U32 */
+       vadd.u32        q3, q3, q0      /* add 4 U32 to accumulator */
+       vmovl.u16       q0, d5          /* 4 U16 -> 4 U32 */
+       vadd.u32        q3, q3, q0      /* add 4 U32 to accumulator */
+       cmp             ip, r3          /* more whole qwords? */
+       blo             .Lmain_loop     /* addresses compare unsigned */
+
+.Ltrailing_bytes:
+       cmp             r2, #0          /* any trailing bytes? */
+       blne            partial_qword   /* yes, do final qword */
+       ldr             lr, [sp], #8    /* fetch LR */
+
+.Lfold_csum:
+       /*
+        * q3 holds 4 32-bit partial sums.  Add them into one 32-bit sum,
+        * then add that sum's two 16-bit halves together.
+        */
+       vadd.u32        d6, d6, d7      /* 4 I32 -> 2 I32 */
+       vmovl.u32       q3, d6          /* split two I32 into two I64 */
+       vadd.u32        d6, d6, d7      /* 2 I32 -> 1 I32 */
+       vmovl.u16       q3, d6          /* split I32 into its I16 halves */
+       vmovl.u32       q3, d6          /* one half each in d6 and d7 */
+       vadd.u32        d6, d6, d7      /* low half + high half */
+       vmov            r0, s12         /* fetch csum from d6/q3 */
+       /*
+        * Adding the halves can carry past bit 15, so the result may
+        * exceed 0xffff; we expect the caller to fold it again.
+        */
+       RET
+END(cpu_in_cksum_neon)
+
+/*
+ * Add the qword at [ip] to q3, masked by byte counts r1 (lead), r2 (trail).
+ */
+       .type           partial_qword, %function
+partial_qword:
+       str             lr, [sp, #-8]!  /* save LR, the mask helper is called */
+       vld1.64         {d4-d5}, [ip:128]!      /* fetch qword, advance ip */
+       veor            q0, q0, q0      /* create a null mask */
+       movs            r0, r1, lsl #3  /* leading bits; any leading bytes? */
+       blne            _C_LABEL(__neon_leading_qword_bitmask)
+       vmvn.u64        q0, q0          /* invert leading mask to trailing */
+       vand.u32        q2, q2, q0      /* drop the leading bytes */
+       vmvn.u64        q0, #0          /* start from an all-ones mask */
+       movs            r0, r2, lsl #3  /* trailing bits; zero means none */
+       blne            _C_LABEL(__neon_leading_qword_bitmask)
+       vand.u32        q2, q2, q0      /* keep only the masked bytes */
+       ldr             lr, [sp], #8    /* restore LR */
+       vmovl.u16       q0, d4          /* low 4 U16 -> 4 U32 */
+       vadd.u32        q3, q3, q0      /* add 4 U32 to accumulator */
+       vmovl.u16       q0, d5          /* high 4 U16 -> 4 U32 */
+       vadd.u32        q3, q3, q0      /* add 4 U32 to accumulator */
+       RET
+       .size           partial_qword, . - partial_qword
+
+/*
+ * uint32_t cpu_in_cksum_neon_v4hdr(void *dptr)
+ *     r0 = dptr, a 32-bit aligned 20 byte header; sum returned in r0.
+ */
+ENTRY(cpu_in_cksum_neon_v4hdr)
+       veor            q3, q3, q3      /* clear accumulator */
+       bic             ip, r0, #7      /* round down to 64-bit boundary */
+       vld1.32         {d0-d2},[ip]    /* it must be in 24 bytes */
+       tst             r0, #4          /* depending on 64-bit alignment */
+       beq             1f              /* aligned: data is s0-s4 */
+       vmov            s0, s5          /* else s1-s5: move last U32 to first */
+1:     vmovl.u32       q1, d2          /* move s5 to d3 and clear s5 */
+       vmovl.u16       q2, d0          /* 4 U16 -> 4 U32 */
+       vadd.u32        q3, q3, q2      /* add 4 U32 to accumulator */
+       vmovl.u16       q2, d1          /* 4 U16 -> 4 U32 */
+       vadd.u32        q3, q3, q2      /* add 4 U32 to accumulator */
+       vmovl.u16       q2, d2          /* last U32 plus the cleared s5 */
+       vadd.u32        q3, q3, q2      /* add 4 U32 to accumulator */
+       b               .Lfold_csum     /* fold q3 and return in r0 */
+END(cpu_in_cksum_neon_v4hdr)
diff -r 6be0c6fff1e6 -r 25a66c8c5482 sys/arch/arm/cortex/cpu_in_cksum_neon.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/arch/arm/cortex/cpu_in_cksum_neon.c   Mon Dec 17 00:44:03 2012 +0000
@@ -0,0 +1,124 @@
+/*-
+ * Copyright (c) 2012 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Matt Thomas of 3am Software Foundry.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+
+__KERNEL_RCSID(0, "$NetBSD: cpu_in_cksum_neon.c,v 1.1 2012/12/17 00:44:03 matt Exp $");
+
+#include <sys/param.h>
+#include <sys/cpu.h>
+#include <sys/mbuf.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+
+uint32_t cpu_in_cksum_neon(const void *, size_t);
+uint32_t cpu_in_cksum_neon_v4hdr(const void *);
+
+/*
+ * Sum "len" bytes of the mbuf chain "m", starting "off" bytes in and
+ * seeded with "initial_sum", using the NEON helpers declared above.
+ *
+ * Returns the sum folded to 16 bits, uncomplemented, with a zero sum
+ * reported as 0xffff.
+ * NOTE(review): confirm callers of cpu_in_cksum() expect an uncomplemented
+ * value.
+ */
+int
+cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
+{
+       uint32_t csum = initial_sum;
+       int odd = 0;
+
+       /* Take control of the NEON unit. */
+       vfp_hijack();
+
+       /*
+        * Fast path for the normal ip_header: no offset, no seed, 32-bit
+        * aligned and entirely within the first mbuf.
+        */
+       if (off == 0
+           && csum == 0
+           && len == sizeof(struct ip)
+           && ((uintptr_t)m->m_data & 3) == 0
+           && m->m_len >= len) {
+               csum = cpu_in_cksum_neon_v4hdr(m->m_data);
+
+               /* We are now done with NEON. */
+               vfp_surrender();
+
+               /*
+                * The assembly adds the two 16-bit halves of its sum, so
+                * the carry is not limited to 0x10000; fold it back in.
+                */
+               csum = (csum >> 16) + (csum & 0xffff);
+               csum = (csum >> 16) + (csum & 0xffff);
+               return csum == 0 ? 0xffff : csum;       /* never return 0. */
+       }
+
+       /*
+        * Skip the mbufs that lie entirely before the starting offset,
+        * leaving "off" relative to the mbuf we stop on.
+        */
+       while (off >= m->m_len) {
+               off -= m->m_len;
+               m = m->m_next;
+               KASSERT(m != NULL);
+       }
+
+       for (; len > 0; m = m->m_next, off = 0) {
+               KASSERT(m != NULL);
+               int dlen = MIN(m->m_len - off, len);
+               const void *dptr = m->m_data + off;
+               /*
+                * This routine will add based on the memory layout so
+                * if the previous len was odd or this buffer starts
+                * on an odd address, shift the csum by 8 so it's properly
+                * aligned.  It will be taken care of when we do the final
+                * checksum fold.
+                */
+               uint32_t tmpsum = cpu_in_cksum_neon(dptr, dlen);
+               if (odd ^ ((uintptr_t)dptr & 1))
+                       tmpsum <<= 8;
+               /* Accumulate checksum, folding will be done later. */
+               csum += tmpsum;
+               odd ^= dlen & 1;
+               len -= dlen;
+       }
+
+       /* We are now done with NEON. */
+       vfp_surrender();
+
+       /* Fold twice: the first fold can leave 0x1xxxx, the second cannot. */
+       csum = (csum >> 16) + (csum & 0xffff);
+       csum = (csum >> 16) + (csum & 0xffff);
+
+       KASSERT(csum <= 0xffff);
+       return csum == 0 ? 0xffff : csum;       /* never return 0. */
+}
diff -r 6be0c6fff1e6 -r 25a66c8c5482 sys/arch/arm/cortex/files.cortex
--- a/sys/arch/arm/cortex/files.cortex  Sun Dec 16 23:47:56 2012 +0000
+++ b/sys/arch/arm/cortex/files.cortex  Mon Dec 17 00:44:03 2012 +0000
@@ -1,4 +1,9 @@
-# $NetBSD: files.cortex,v 1.2 2012/09/02 16:55:10 matt Exp $
+# $NetBSD: files.cortex,v 1.3 2012/12/17 00:44:03 matt Exp $
+
+defflag opt_cpu_in_cksum.h                     NEON_IN_CKSUM
+
+file   arch/arm/cortex/cpu_in_cksum_neon.c     (inet | inet6) & neon_in_cksum
+file   arch/arm/cortex/cpu_in_cksum_asm_neon.S (inet | inet6) & neon_in_cksum
 
 device armperiph {}
 attach armperiph at mainbus



Home | Main Index | Thread Index | Old Index