Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/common/lib/libc/arch/arm/string A version of memset that can...



details:   https://anonhg.NetBSD.org/src/rev/709b33332aeb
branches:  trunk
changeset: 783926:709b33332aeb
user:      matt <matt%NetBSD.org@localhost>
date:      Sat Jan 12 20:27:13 2013 +0000

description:
A version of memset that can do NEON, VFP as well as normal arm instructions

diffstat:

 common/lib/libc/arch/arm/string/memset_arm.S |  173 +++++++++++++++++++++++++++
 1 files changed, 173 insertions(+), 0 deletions(-)

diffs (177 lines):

diff -r 41df4801cced -r 709b33332aeb common/lib/libc/arch/arm/string/memset_arm.S
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/common/lib/libc/arch/arm/string/memset_arm.S      Sat Jan 12 20:27:13 2013 +0000
@@ -0,0 +1,173 @@
+/*     $NetBSD: memset_arm.S,v 1.1 2013/01/12 20:27:13 matt Exp $      */
+
+/*-
+ * Copyright (c) 2012 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Matt Thomas of 3am Software Foundry.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <machine/asm.h>
+
+#if defined(NEON)      /* STOREn: write n fill bytes at ip, advance ip by n */
+#define        STORE8          vst1.32         {d0}, [ip:64]!
+#define        STORE16         vst1.32         {d0-d1}, [ip:64]!
+#define        STORE32         vst1.32         {d0-d3}, [ip:64]!
+#elif defined(VFP)     /* same contract, fill taken from d0-d3 */
+#define        STORE8          vstmia          ip!, {d0}
+#define        STORE16         vstmia          ip!, {d0-d1}
+#define        STORE32         vstmia          ip!, {d0-d3}
+#else                  /* integer registers only: fill must be in r2 and r3 */
+# if defined(_ARM_ARCH_DWORD_OK)
+#  define      STORE8          strd            r2, [ip], #8
+# else
+#  define      STORE8          stmia           ip!, {r2,r3}
+# endif
+#define        STORE16         STORE8; STORE8          /* two 8-byte stores */
+#define        STORE32         STORE16; STORE16        /* four 8-byte stores */
+#endif
+/*
+ * memset: Sets a block of memory to the specified value
+ * using NEON, VFP, or plain ARM stores (chosen at build time)
+ *
+ * On entry:
+ *   r0 - dest address
+ *   r1 - byte to write
+ *   r2 - number of bytes to write
+ * Internally: ip - write cursor, r1 - bytes left, r3 - replicated fill
+ * On exit:
+ *   r0 - dest address
+ */
+/* LINTSTUB: Func: void *memset(void *, int, size_t) */
+ENTRY(memset)
+       ands            r3, r1, #0xff   /* We deal with bytes */
+       orrne           r3, r3, r3, lsl #8      /* replicate to low halfword */
+       orrne           r3, r3, r3, lsl #16     /* replicate to all four bytes */
+       movs            r1, r2          /* length to r1; frees r2 as scratch */
+       RETc(eq)                        /* return if length is 0 */
+       mov             ip, r0          /* r0 needs to stay the same */
+
+       cmp             r1, #12         /* is this a small memset? */
+       blt             .Lbyte_by_byte  /*   then do it byte by byte */
+
+       /* Ok first we will dword align the address */
+       ands            r2, ip, #7      /* grab the bottom three bits */
+       beq             .Lmemset_dwordaligned   /* The addr is dword aligned */
+
+       rsb             r2, r2, #8      /* how far until dword aligned? */
+       sub             r1, r1, r2      /* subtract it from remaining length */
+       mov             r2, r3          /* duplicate fill value */
+
+       tst             ip, #1          /* halfword aligned? */
+       strneb          r3, [ip], #1    /*   no, write a byte */
+       tst             ip, #2          /* word aligned? */
+       strneh          r3, [ip], #2    /*   no, write a halfword */
+       tst             ip, #4          /* dword aligned? */
+       strne           r3, [ip], #4    /*   no, write a word */
+
+       /* We are now doubleword aligned */
+.Lmemset_dwordaligned:
+#if defined(NEON)
+       vdup.8          q0, r3          /* move fill to SIMD */
+       vmov            q1, q0          /* put fill in q1 (d2-d3) */
+#elif defined(VFP)
+       mov             r2, r3          /* duplicate fill value */
+       vmov            d0, r2, r3      /* move to VFP */
+       vmov            d1, r2, r3
+       vmov            d2, r2, r3
+       vmov            d3, r2, r3
+#endif
+
+#if 1
+       cmp             r1, #128
+       blt             .Lmemset_mainloop
+       ands            r2, ip, #63     /* check for 64-byte alignment */
+       beq             .Lmemset_mainloop
+       /*
+        * Let's align to a 64-byte boundary so that stores don't cross
+        * cacheline boundaries.  We also know we have at least 128-bytes to
+        * set so we don't have to worry about the length at the moment.
+        */
+       rsb             r2, r2, #64     /* how many bytes until 64 bytes */
+       sub             r1, r1, r2      /* subtract from remaining length */
+#if !defined(NEON) && !defined(VFP)
+       mov             r2, r3          /* put fill back in r2 */
+#endif
+
+       tst             ip, #8          /* quadword aligned? */
+       beq             1f              /*   yes */
+       STORE8                          /*   no, store a dword */
+1:     tst             ip, #16         /* octaword aligned? */
+       beq             2f              /*   yes */
+       STORE16                         /*   no, store a quadword */
+2:     tst             ip, #32         /* 64-byte aligned? */
+       beq             .Lmemset_mainloop               /*   yes */
+       STORE32                         /*   no, make 64-byte aligned */
+#endif
+
+.Lmemset_mainloop:
+#if !defined(NEON) && !defined(VFP)
+       mov             r2, r3          /* put fill back in r2 */
+#endif
+       subs            r1, r1, #64     /* subtract an initial 64 */
+       blt             .Lmemset_lessthan_64bytes
+
+3:     STORE32                         /* store first octaword */
+       STORE32                         /* store second octaword */
+       RETc(eq)                        /* return if done (flags from subs) */
+       subs            r1, r1, #64     /* subtract another 64 */
+       bge             3b              /* and do other if still >= 0 */
+.Lmemset_lessthan_64bytes:     /* r1 < 0 here; its low 6 bits = bytes left */
+       tst             r1, #32         /* do we have 32 bytes left? */
+       beq             .Lmemset_lessthan_32bytes
+       STORE32                         /*    yes, store an octaword */
+       ands            r1, r1, #31     /* keep only the bytes still to do */
+       RETc(eq)                        /* return if length is 0 */
+.Lmemset_lessthan_32bytes:
+       tst             r1, #16         /* do we have 16 bytes left? */
+       beq             .Lmemset_lessthan_16bytes
+       STORE16                         /*   yes, store a quadword */
+       ands            r1, r1, #15     /* keep only the bytes still to do */
+       RETc(eq)                        /* return if length is 0 */
+.Lmemset_lessthan_16bytes:
+       tst             r1, #8          /* do we have 8 bytes left? */
+       beq             .Lmemset_lessthan_8bytes/*   no */
+       STORE8                          /*   yes, store a dword */
+       ands            r1, r1, #7      /* keep only the bytes still to do */
+       RETc(eq)                        /* return if length is 0 */
+.Lmemset_lessthan_8bytes:      /* use r3: r2 is not the fill in every build */
+       tst             r1, #4          /* do we have a word left? */
+       strne           r3, [ip], #4    /*   yes, so write one */
+       tst             r1, #2          /* do we have a halfword left? */
+       strneh          r3, [ip], #2    /*   yes, so write one */
+       tst             r1, #1          /* do we have a byte left? */
+       strneb          r3, [ip], #1    /*   yes, so write one */
+       RET                             /* return */
+
+.Lbyte_by_byte:
+       subs            r1, r1, #1      /* can we write a byte? */
+       RETc(lt)                        /*   no, we're done */
+       strb            r3, [ip], #1    /*   yes, so do it */
+       b               .Lbyte_by_byte  /* try next byte */
+END(memset)



Home | Main Index | Thread Index | Old Index