pkgsrc-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[pkgsrc/trunk]: pkgsrc/www/seamonkey/patches evert previous.
details: https://anonhg.NetBSD.org/pkgsrc/rev/45a6a812265e
branches: trunk
changeset: 352908:45a6a812265e
user: wiz <wiz%pkgsrc.org@localhost>
date: Mon Sep 19 22:39:21 2016 +0000
description:
evert previous.
diffstat:
www/seamonkey/patches/patch-mozilla_gfx_ycbcr_yuv__row__arm.S | 317 ++++++++++
1 files changed, 317 insertions(+), 0 deletions(-)
diffs (truncated from 321 to 300 lines):
diff -r cb06fcb696ce -r 45a6a812265e www/seamonkey/patches/patch-mozilla_gfx_ycbcr_yuv__row__arm.S
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/www/seamonkey/patches/patch-mozilla_gfx_ycbcr_yuv__row__arm.S Mon Sep 19 22:39:21 2016 +0000
@@ -0,0 +1,317 @@
+$NetBSD: patch-mozilla_gfx_ycbcr_yuv__row__arm.S,v 1.3 2016/09/19 22:39:21 wiz Exp $
+
+--- mozilla/gfx/ycbcr/yuv_row_arm.S.orig 2015-01-01 17:44:52.000000000 +0000
++++ mozilla/gfx/ycbcr/yuv_row_arm.S
+@@ -0,0 +1,312 @@
++/* This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#if defined(__ARM_EABI__) && !defined(__ARM_DWARF_EH__)
++#define UNWIND
++#else
++#define UNWIND @
++#endif
++
++ .arch armv7-a
++ .fpu neon
++/* Allow to build on targets not supporting neon, and force the object file
++ * target to avoid bumping the final binary target */
++ .object_arch armv4t
++ .text
++ .align
++
++ .balign 64
++YCbCr42xToRGB565_DITHER03_CONSTS_NEON:
++ .short -14240
++ .short -14240+384
++ .short 8672
++ .short 8672+192
++ .short -17696
++ .short -17696+384
++ .byte 102
++ .byte 25
++ .byte 52
++ .byte 129
++YCbCr42xToRGB565_DITHER12_CONSTS_NEON:
++ .short -14240+128
++ .short -14240+256
++ .short 8672+64
++ .short 8672+128
++ .short -17696+128
++ .short -17696+256
++ .byte 102
++ .byte 25
++ .byte 52
++ .byte 129
++YCbCr42xToRGB565_DITHER21_CONSTS_NEON:
++ .short -14240+256
++ .short -14240+128
++ .short 8672+128
++ .short 8672+64
++ .short -17696+256
++ .short -17696+128
++ .byte 102
++ .byte 25
++ .byte 52
++ .byte 129
++YCbCr42xToRGB565_DITHER30_CONSTS_NEON:
++ .short -14240+384
++ .short -14240
++ .short 8672+192
++ .short 8672
++ .short -17696+384
++ .short -17696
++ .byte 102
++ .byte 25
++ .byte 52
++ .byte 129
++
++@ void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON(
++@ yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);
++@
++@ ctx = {
++@ uint16_t *rgb_row; /*r0*/
++@ const uint8_t *y_row; /*r1*/
++@ const uint8_t *u_row; /*r2*/
++@ const uint8_t *v_row; /*r3*/
++@ int y_yweight; /*r4*/
++@ int y_pitch; /*r5*/
++@ int width; /*r6*/
++@ int source_x0_q16; /*r7*/
++@ int source_dx_q16; /*r8*/
++@ int source_uv_xoffs_q16; /*r9*/
++@ };
++ .global ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
++ .type ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, %function
++ .balign 64
++ .cfi_startproc
++ UNWIND .fnstart
++ScaleYCbCr42xToRGB565_BilinearY_Row_NEON:
++ STMFD r13!,{r4-r9,r14} @ 8 words.
++ ADR r14,YCbCr42xToRGB565_DITHER03_CONSTS_NEON
++ VPUSH {Q4-Q7} @ 16 words.
++ ADD r14,r14,r1, LSL #4 @ Select the dither table to use
++ LDMIA r0, {r0-r9}
++ @ Set up image index registers.
++ ADD r12,r8, r8
++ VMOV.I32 D16,#0 @ Q8 = < 2| 2| 0| 0>*source_dx_q16
++ VDUP.32 D17,r12
++ ADD r12,r12,r12
++ VTRN.32 D16,D17 @ Q2 = < 2| 0| 2| 0>*source_dx_q16
++ VDUP.32 D19,r12 @ Q9 = < 4| 4| ?| ?>*source_dx_q16
++ ADD r12,r12,r12
++ VDUP.32 Q0, r7 @ Q0 = < 1| 1| 1| 1>*source_x0_q16
++ VADD.I32 D17,D17,D19 @ Q8 = < 6| 4| 2| 0>*source_dx_q16
++ CMP r8, #0 @ If source_dx_q16 is negative...
++ VDUP.32 Q9, r12 @ Q9 = < 8| 8| 8| 8>*source_dx_q16
++ ADDLT r7, r7, r8, LSL #4 @ Make r7 point to the end of the block
++ VADD.I32 Q0, Q0, Q8 @ Q0 = < 6| 4| 2| 0>*source_dx_q16+source_x0_q16
++ SUBLT r7, r7, r8 @ (i.e., the lowest address we'll use)
++ VADD.I32 Q1, Q0, Q9 @ Q1 = <14|12|10| 8>*source_dx_q16+source_x0_q16
++ VDUP.I32 Q9, r8 @ Q8 = < 1| 1| 1| 1>*source_dx_q16
++ VADD.I32 Q2, Q0, Q9 @ Q2 = < 7| 5| 3| 1>*source_dx_q16+source_x0_q16
++ VADD.I32 Q3, Q1, Q9 @ Q3 = <15|13|11| 9>*source_dx_q16+source_x0_q16
++ VLD1.64 {D30,D31},[r14,:128] @ Load some constants
++ VMOV.I8 D28,#52
++ VMOV.I8 D29,#129
++ @ The basic idea here is to do aligned loads of a block of data and then
++ @ index into it using VTBL to extract the data from the source X
++ @ coordinate corresponding to each destination pixel.
++ @ This is significantly less code and significantly fewer cycles than doing
++ @ a series of single-lane loads, but it means that the X step between
++ @ pixels must be limited to 2.0 or less, otherwise we couldn't guarantee
++ @ that we could read 8 pixels from a single aligned 32-byte block of data.
++ @ Q0...Q3 contain the 16.16 fixed-point X coordinates of each pixel,
++ @ separated into even pixels and odd pixels to make extracting offsets and
++ @ weights easier.
++ @ We then pull out two bytes from the middle of each coordinate: the top
++ @ byte corresponds to the integer part of the X coordinate, and the bottom
++ @ byte corresponds to the weight to use for bilinear blending.
++ @ These are separated out into different registers with VTRN.
++ @ Then by subtracting the integer X coordinate of the first pixel in the
++ @ data block we loaded, we produce an index register suitable for use by
++ @ VTBL.
++s42xbily_neon_loop:
++ @ Load the Y' data.
++ MOV r12,r7, ASR #16
++ VRSHRN.S32 D16,Q0, #8
++ AND r12,r12,#~15 @ Read 16-byte aligned blocks
++ VDUP.I8 D20,r12
++ ADD r12,r1, r12 @ r12 = y_row+(source_x&~7)
++ VRSHRN.S32 D17,Q1, #8
++ PLD [r12,#64]
++ VLD1.64 {D8, D9, D10,D11},[r12,:128],r5 @ Load Y' top row
++ ADD r14,r7, r8, LSL #3
++ VRSHRN.S32 D18,Q2, #8
++ MOV r14,r14,ASR #16
++ VRSHRN.S32 D19,Q3, #8
++ AND r14,r14,#~15 @ Read 16-byte aligned blocks
++ VLD1.64 {D12,D13,D14,D15},[r12,:128] @ Load Y' bottom row
++ PLD [r12,#64]
++ VDUP.I8 D21,r14
++ ADD r14,r1, r14 @ r14 = y_row+(source_x&~7)
++ VMOV.I8 Q13,#1
++ PLD [r14,#64]
++ VTRN.8 Q8, Q9 @ Q8 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
++ @ Q9 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
++ VSUB.S8 Q9, Q9, Q10 @ Make offsets relative to the data we loaded.
++ @ First 8 Y' pixels
++ VTBL.8 D20,{D8, D9, D10,D11},D18 @ Index top row at source_x
++ VTBL.8 D24,{D12,D13,D14,D15},D18 @ Index bottom row at source_x
++ VADD.S8 Q13,Q9, Q13 @ Add 1 to source_x
++ VTBL.8 D22,{D8, D9, D10,D11},D26 @ Index top row at source_x+1
++ VTBL.8 D26,{D12,D13,D14,D15},D26 @ Index bottom row at source_x+1
++ @ Next 8 Y' pixels
++ VLD1.64 {D8, D9, D10,D11},[r14,:128],r5 @ Load Y' top row
++ VLD1.64 {D12,D13,D14,D15},[r14,:128] @ Load Y' bottom row
++ PLD [r14,#64]
++ VTBL.8 D21,{D8, D9, D10,D11},D19 @ Index top row at source_x
++ VTBL.8 D25,{D12,D13,D14,D15},D19 @ Index bottom row at source_x
++ VTBL.8 D23,{D8, D9, D10,D11},D27 @ Index top row at source_x+1
++ VTBL.8 D27,{D12,D13,D14,D15},D27 @ Index bottom row at source_x+1
++ @ Blend Y'.
++ VDUP.I16 Q9, r4 @ Load the y weights.
++ VSUBL.U8 Q4, D24,D20 @ Q5:Q4 = c-a
++ VSUBL.U8 Q5, D25,D21
++ VSUBL.U8 Q6, D26,D22 @ Q7:Q6 = d-b
++ VSUBL.U8 Q7, D27,D23
++ VMUL.S16 Q4, Q4, Q9 @ Q5:Q4 = (c-a)*yweight
++ VMUL.S16 Q5, Q5, Q9
++ VMUL.S16 Q6, Q6, Q9 @ Q7:Q6 = (d-b)*yweight
++ VMUL.S16 Q7, Q7, Q9
++ VMOVL.U8 Q12,D16 @ Promote the x weights to 16 bits.
++ VMOVL.U8 Q13,D17 @ Sadly, there's no VMULW.
++ VRSHRN.S16 D8, Q4, #8 @ Q4 = (c-a)*yweight+128>>8
++ VRSHRN.S16 D9, Q5, #8
++ VRSHRN.S16 D12,Q6, #8 @ Q6 = (d-b)*yweight+128>>8
++ VRSHRN.S16 D13,Q7, #8
++ VADD.I8 Q10,Q10,Q4 @ Q10 = a+((c-a)*yweight+128>>8)
++ VADD.I8 Q11,Q11,Q6 @ Q11 = b+((d-b)*yweight+128>>8)
++ VSUBL.U8 Q4, D22,D20 @ Q5:Q4 = b-a
++ VSUBL.U8 Q5, D23,D21
++ VMUL.S16 Q4, Q4, Q12 @ Q5:Q4 = (b-a)*xweight
++ VMUL.S16 Q5, Q5, Q13
++ VRSHRN.S16 D8, Q4, #8 @ Q4 = (b-a)*xweight+128>>8
++ ADD r12,r7, r9
++ VRSHRN.S16 D9, Q5, #8
++ MOV r12,r12,ASR #17
++ VADD.I8 Q8, Q10,Q4 @ Q8 = a+((b-a)*xweight+128>>8)
++ @ Start extracting the chroma x coordinates, and load Cb and Cr.
++ AND r12,r12,#~15 @ Read 16-byte aligned blocks
++ VDUP.I32 Q9, r9 @ Q9 = source_uv_xoffs_q16 x 4
++ ADD r14,r2, r12
++ VADD.I32 Q10,Q0, Q9
++ VLD1.64 {D8, D9, D10,D11},[r14,:128] @ Load Cb
++ PLD [r14,#64]
++ VADD.I32 Q11,Q1, Q9
++ ADD r14,r3, r12
++ VADD.I32 Q12,Q2, Q9
++ VLD1.64 {D12,D13,D14,D15},[r14,:128] @ Load Cr
++ PLD [r14,#64]
++ VADD.I32 Q13,Q3, Q9
++ VRSHRN.S32 D20,Q10,#9 @ Q10 = <xEwExCwCxAwAx8w8x6w6x4w4x2w2x0w0>
++ VRSHRN.S32 D21,Q11,#9
++ VDUP.I8 Q9, r12
++ VRSHRN.S32 D22,Q12,#9 @ Q11 = <xFwFxDwDxBwBx9w9x7w7x5w5x3w3x1w1>
++ VRSHRN.S32 D23,Q13,#9
++ @ We don't actually need the x weights, but we get them for free.
++ @ Free ALU slot
++ VTRN.8 Q10,Q11 @ Q10 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
++ @ Free ALU slot @ Q11 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
++ VSUB.S8 Q11,Q11,Q9 @ Make offsets relative to the data we loaded.
++ VTBL.8 D18,{D8, D9, D10,D11},D22 @ Index Cb at source_x
++ VMOV.I8 D24,#74
++ VTBL.8 D19,{D8, D9, D10,D11},D23
++ VMOV.I8 D26,#102
++ VTBL.8 D20,{D12,D13,D14,D15},D22 @ Index Cr at source_x
++ VMOV.I8 D27,#25
++ VTBL.8 D21,{D12,D13,D14,D15},D23
++ @ We now have Y' in Q8, Cb in Q9, and Cr in Q10
++ @ We use VDUP to expand constants, because it's a permute instruction, so
++ @ it can dual issue on the A8.
++ SUBS r6, r6, #16 @ width -= 16
++ VMULL.U8 Q4, D16,D24 @ Q5:Q4 = Y'*74
++ VDUP.32 Q6, D30[1] @ Q7:Q6 = bias_G
++ VMULL.U8 Q5, D17,D24
++ VDUP.32 Q7, D30[1]
++ VMLSL.U8 Q6, D18,D27 @ Q7:Q6 = -25*Cb+bias_G
++ VDUP.32 Q11,D30[0] @ Q12:Q11 = bias_R
++ VMLSL.U8 Q7, D19,D27
++ VDUP.32 Q12,D30[0]
++ VMLAL.U8 Q11,D20,D26 @ Q12:Q11 = 102*Cr+bias_R
++ VDUP.32 Q8, D31[0] @ Q13:Q8 = bias_B
++ VMLAL.U8 Q12,D21,D26
++ VDUP.32 Q13,D31[0]
++ VMLAL.U8 Q8, D18,D29 @ Q13:Q8 = 129*Cb+bias_B
++ VMLAL.U8 Q13,D19,D29
++ VMLSL.U8 Q6, D20,D28 @ Q7:Q6 = -25*Cb-52*Cr+bias_G
++ VMLSL.U8 Q7, D21,D28
++ VADD.S16 Q11,Q4, Q11 @ Q12:Q11 = 74*Y'+102*Cr+bias_R
++ VADD.S16 Q12,Q5, Q12
++ VQADD.S16 Q8, Q4, Q8 @ Q13:Q8 = 74*Y'+129*Cr+bias_B
++ VQADD.S16 Q13,Q5, Q13
++ VADD.S16 Q6, Q4, Q6 @ Q7:Q6 = 74*Y'-25*Cb-52*Cr+bias_G
++ VADD.S16 Q7, Q5, Q7
++ @ Push each value to the top of its word and saturate it.
++ VQSHLU.S16 Q11,Q11,#2
++ VQSHLU.S16 Q12,Q12,#2
++ VQSHLU.S16 Q6, Q6, #2
++ VQSHLU.S16 Q7, Q7, #2
++ VQSHLU.S16 Q8, Q8, #2
++ VQSHLU.S16 Q13,Q13,#2
++ @ Merge G and B into R.
++ VSRI.U16 Q11,Q6, #5
++ VSRI.U16 Q12,Q7, #5
++ VSRI.U16 Q11,Q8, #11
++ MOV r14,r8, LSL #4
++ VSRI.U16 Q12,Q13,#11
++ BLT s42xbily_neon_tail
++ VDUP.I32 Q13,r14
++ @ Store the result.
++ VST1.16 {D22,D23,D24,D25},[r0]!
++ BEQ s42xbily_neon_done
++ @ Advance the x coordinates.
++ VADD.I32 Q0, Q0, Q13
++ VADD.I32 Q1, Q1, Q13
++ ADD r7, r14
++ VADD.I32 Q2, Q2, Q13
++ VADD.I32 Q3, Q3, Q13
++ B s42xbily_neon_loop
++s42xbily_neon_tail:
++ @ We have between 1 and 15 pixels left to write.
++ @ -r6 == the number of pixels we need to skip writing.
++ @ Adjust r0 to point to the last one we need to write, because we're going
++ @ to write them in reverse order.
++ ADD r0, r0, r6, LSL #1
++ MOV r14,#-2
++ ADD r0, r0, #30
++ @ Skip past the ones we don't need to write.
++ SUB PC, PC, r6, LSL #2
++ ORR r0, r0, r0
++ VST1.16 {D25[3]},[r0,:16],r14
++ VST1.16 {D25[2]},[r0,:16],r14
++ VST1.16 {D25[1]},[r0,:16],r14
++ VST1.16 {D25[0]},[r0,:16],r14
++ VST1.16 {D24[3]},[r0,:16],r14
Home |
Main Index |
Thread Index |
Old Index