Subject: Re: Performance of various memcpy()'s
To: None <tech-perform@netbsd.org>
From: Bang Jun-Young <junyoung@mogua.com>
List: port-i386
Date: 10/28/2002 14:33:03
--Dxnq1zWXvFF0Q93v
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
On Wed, Oct 23, 2002 at 11:54:42PM +0900, Bang Jun-Young wrote:
> In this test, non-temporal movntq instruction was obviously a big win.
> Since it doesn't pollute cache lines, you can get 2x performance for
> copying data not in cache.
This time I implemented i686_copyin() and i686_copyout() using
non-temporal MOVNTQ instruction. Benchmark results are as follows:
memcpy 128B -- 8192 loops
aligned blocks
libc memcpy 2.871323 s
i686_copyin (MOVQ, FNSAVE/FRSTOR) 3.784806 s
i686_copyin3 (MOVNTQ, FNSAVE/FRSTOR) 3.375719 s
MMX memcpy using MOVQ 2.746474 s
memcpy 256B -- 4096 loops
aligned blocks
libc memcpy 2.857081 s
i686_copyin (MOVQ, FNSAVE/FRSTOR) 2.859079 s
i686_copyin3 (MOVNTQ, FNSAVE/FRSTOR) 2.503540 s
MMX memcpy using MOVQ 2.692716 s
memcpy 512B -- 2048 loops
aligned blocks
libc memcpy 2.858101 s
i686_copyin (MOVQ, FNSAVE/FRSTOR) 2.741855 s
i686_copyin3 (MOVNTQ, FNSAVE/FRSTOR) 1.982627 s
MMX memcpy using MOVQ 2.653495 s
memcpy 1024B -- 1024 loops
aligned blocks
libc memcpy 2.859076 s
i686_copyin (MOVQ, FNSAVE/FRSTOR) 2.679616 s
i686_copyin3 (MOVNTQ, FNSAVE/FRSTOR) 1.857517 s
MMX memcpy using MOVQ 2.643854 s
I'd appreciate it if someone would apply the patch and perform
a "real world" benchmark with it.
Jun-Young
--
Bang Jun-Young <junyoung@mogua.com>
--Dxnq1zWXvFF0Q93v
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="locore.s.i686.diff"
Index: locore.s
===================================================================
RCS file: /cvsroot/syssrc/sys/arch/i386/i386/locore.s,v
retrieving revision 1.267
diff -u -r1.267 locore.s
--- locore.s 2002/10/23 03:28:34 1.267
+++ locore.s 2002/10/28 05:11:13
@@ -951,8 +951,8 @@
#define DEFAULT_COPYOUT _C_LABEL(i486_copyout) /* XXX */
#define DEFAULT_COPYIN _C_LABEL(i386_copyin) /* XXX */
#elif defined(I686_CPU)
-#define DEFAULT_COPYOUT _C_LABEL(i486_copyout) /* XXX */
-#define DEFAULT_COPYIN _C_LABEL(i386_copyin) /* XXX */
+#define DEFAULT_COPYOUT _C_LABEL(i686_copyout) /* XXX */
+#define DEFAULT_COPYIN _C_LABEL(i686_copyin) /* XXX */
#endif
.data
@@ -1108,6 +1108,101 @@
ret
#endif /* I486_CPU || I586_CPU || I686_CPU */
+#if defined(I686_CPU)
+/* LINTSTUB: Func: int i686_copyout(const void *kaddr, void *uaddr, size_t len) */
+ENTRY(i686_copyout)
+ pushl %esi
+ pushl %edi
+ pushl %ebx
+
+ movl 16(%esp),%esi
+ movl 20(%esp),%edi
+ movl 24(%esp),%eax
+
+ /*
+ * We check that the end of the destination buffer is not past the end
+ * of the user's address space.
+ */
+ movl %edi,%edx
+ addl %eax,%edx
+ jc _C_LABEL(i686_copy_efault)
+ cmpl $VM_MAXUSER_ADDRESS,%edx
+ ja _C_LABEL(i686_copy_efault)
+
+ GET_CURPCB(%edx)
+ movl $_C_LABEL(i686_copy_fault),PCB_ONFAULT(%edx)
+
+ cmpl $512,%eax
+ jb 2f
+
+ xorl %ebx,%ebx
+ movl %eax,%edx
+ shrl $6,%edx
+
+ /*
+ * Save FPU state in stack.
+ */
+ smsw %cx
+ clts
+ subl $108,%esp
+ fnsave 0(%esp)
+
+1: movq (%esi),%mm0
+ movq 8(%esi),%mm1
+ movq 16(%esi),%mm2
+ movq 24(%esi),%mm3
+ movq 32(%esi),%mm4
+ movq 40(%esi),%mm5
+ movq 48(%esi),%mm6
+ movq 56(%esi),%mm7
+ movntq %mm0,(%edi)
+ movntq %mm1,8(%edi)
+ movntq %mm2,16(%edi)
+ movntq %mm3,24(%edi)
+ movntq %mm4,32(%edi)
+ movntq %mm5,40(%edi)
+ movntq %mm6,48(%edi)
+ movntq %mm7,56(%edi)
+
+ addl $64,%esi
+ addl $64,%edi
+ incl %ebx
+ cmpl %edx,%ebx
+ jb 1b
+
+ /*
+ * Restore FPU state.
+ */
+ frstor 0(%esp)
+ addl $108,%esp
+ lmsw %cx
+ sfence
+ emms
+
+ andl $63,%eax
+ jz 3f
+
+2: /* plain old bcopy(%esi, %edi, %eax); */
+ cld
+ movl %eax,%ecx
+ shrl $2,%ecx
+ rep
+ movsl
+ andl $3,%eax
+ jz 3f
+ movl %eax,%ecx
+ rep
+ movsb
+
+3: GET_CURPCB(%edx)
+ xorl %eax,%eax
+ popl %ebx
+ popl %edi
+ popl %esi
+ movl %eax,PCB_ONFAULT(%edx)
+ ret
+#endif /* I686_CPU */
+
/*
* int copyin(const void *from, void *to, size_t len);
* Copy len bytes from the user's address space.
@@ -1160,6 +1255,114 @@
xorl %eax,%eax
ret
#endif /* I386_CPU || I486_CPU || I586_CPU || I686_CPU */
+
+#if defined(I686_CPU)
+/* LINTSTUB: Func: int i686_copyin(const void *uaddr, void *kaddr, size_t len) */
+ENTRY(i686_copyin)
+ pushl %esi
+ pushl %edi
+ pushl %ebx
+ GET_CURPCB(%eax)
+ movl $_C_LABEL(i686_copy_fault),PCB_ONFAULT(%eax)
+
+ movl 16(%esp),%esi
+ movl 20(%esp),%edi
+ movl 24(%esp),%eax
+
+ /*
+ * We check that the end of the source buffer is not past the end
+ * of the user's address space. If it's not, then we only need to
+ * check that each page is readable, and the CPU will do that for us.
+ */
+ movl %esi,%edx
+ addl %eax,%edx
+ jc _C_LABEL(i686_copy_efault)
+ cmpl $VM_MAXUSER_ADDRESS,%edx
+ ja _C_LABEL(i686_copy_efault)
+
+ cmpl $512,%eax
+ jb 2f
+
+ xorl %ebx,%ebx
+ movl %eax,%edx
+ shrl $6,%edx
+
+ /*
+ * Save FPU state in stack.
+ */
+ smsw %cx
+ clts
+ subl $108,%esp
+ fnsave 0(%esp)
+
+1: movq (%esi),%mm0
+ movq 8(%esi),%mm1
+ movq 16(%esi),%mm2
+ movq 24(%esi),%mm3
+ movq 32(%esi),%mm4
+ movq 40(%esi),%mm5
+ movq 48(%esi),%mm6
+ movq 56(%esi),%mm7
+ movntq %mm0,(%edi)
+ movntq %mm1,8(%edi)
+ movntq %mm2,16(%edi)
+ movntq %mm3,24(%edi)
+ movntq %mm4,32(%edi)
+ movntq %mm5,40(%edi)
+ movntq %mm6,48(%edi)
+ movntq %mm7,56(%edi)
+
+ addl $64,%esi
+ addl $64,%edi
+ incl %ebx
+ cmpl %edx,%ebx
+ jb 1b
+
+ /*
+ * Restore FPU state.
+ */
+ frstor 0(%esp)
+ addl $108,%esp
+ lmsw %cx
+ sfence
+ emms
+
+ andl $63,%eax
+ jz 3f
+
+2: /* plain old bcopy(%esi, %edi, %eax); */
+ cld
+ movl %eax,%ecx
+ shrl $2,%ecx
+ rep
+ movsl
+ andl $3,%eax
+ jz 3f
+ movl %eax,%ecx
+ rep
+ movsb
+
+3: GET_CURPCB(%edx)
+ xorl %eax,%eax
+ popl %ebx
+ popl %edi
+ popl %esi
+ movl %eax,PCB_ONFAULT(%edx)
+ ret
+
+/* LINTSTUB: Ignore */
+NENTRY(i686_copy_efault)
+ movl $EFAULT,%eax
+
+/* LINTSTUB: Ignore */
+NENTRY(i686_copy_fault)
+ GET_CURPCB(%edx)
+ movl %eax,PCB_ONFAULT(%edx)
+ popl %ebx
+ popl %edi
+ popl %esi
+ ret
+#endif /* I686_CPU */
/* LINTSTUB: Ignore */
NENTRY(copy_efault)
--Dxnq1zWXvFF0Q93v--