pkgsrc-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[pkgsrc/trunk]: pkgsrc/www Consistently move and patch yuv_row_arm.S.



details:   https://anonhg.NetBSD.org/pkgsrc/rev/df0bd178cd54
branches:  trunk
changeset: 354593:df0bd178cd54
user:      joerg <joerg%pkgsrc.org@localhost>
date:      Thu Nov 03 22:46:43 2016 +0000

description:
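Consistently move and patch yuv_row_arm.S: rename the upstream
yuv_row_arm.s to yuv_row_arm.S in post-extract and patch the renamed file
in place, instead of carrying a full copy of the file inside the patch.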

diffstat:

 www/firefox/Makefile                                          |    5 +-
 www/firefox/distinfo                                          |    4 +-
 www/firefox/patches/patch-gfx_ycbcr_yuv__row__arm.S           |  331 +---------
 www/seamonkey/Makefile                                        |    5 +-
 www/seamonkey/distinfo                                        |    5 +-
 www/seamonkey/patches/patch-mozilla_gfx_ycbcr_yuv__row__arm.S |  330 +---------
 www/seamonkey/patches/patch-mozilla_gfx_ycbcr_yuv__row__arm.s |   37 -
 7 files changed, 64 insertions(+), 653 deletions(-)

diffs (truncated from 812 to 300 lines):

diff -r 1da27a02694c -r df0bd178cd54 www/firefox/Makefile
--- a/www/firefox/Makefile      Thu Nov 03 21:25:55 2016 +0000
+++ b/www/firefox/Makefile      Thu Nov 03 22:46:43 2016 +0000
@@ -1,4 +1,4 @@
-# $NetBSD: Makefile,v 1.268 2016/10/26 20:23:27 ryoon Exp $
+# $NetBSD: Makefile,v 1.269 2016/11/03 22:46:43 joerg Exp $
 
 FIREFOX_VER=   ${MOZ_BRANCH}${MOZ_BRANCH_MINOR}
 MOZ_BRANCH=    49.0
@@ -65,6 +65,9 @@
 SUBST_FILES.sys-dic=   extensions/spellcheck/hunspell/glue/mozHunspell.cpp
 SUBST_VARS.sys-dic=    PREFIX
 
+post-extract:
+       mv ${WRKSRC}/gfx/ycbcr/yuv_row_arm.s ${WRKSRC}/gfx/ycbcr/yuv_row_arm.S
+
 pre-configure:
        cd ${WRKSRC} && autoconf
        cd ${WRKSRC}/js/src && autoconf
diff -r 1da27a02694c -r df0bd178cd54 www/firefox/distinfo
--- a/www/firefox/distinfo      Thu Nov 03 21:25:55 2016 +0000
+++ b/www/firefox/distinfo      Thu Nov 03 22:46:43 2016 +0000
@@ -1,4 +1,4 @@
-$NetBSD: distinfo,v 1.257 2016/10/30 01:10:10 kamil Exp $
+$NetBSD: distinfo,v 1.258 2016/11/03 22:46:43 joerg Exp $
 
 SHA1 (firefox-49.0.2.source.tar.xz) = 52d527f06c522c95e2fcf4008dce1a9913379aaf
 RMD160 (firefox-49.0.2.source.tar.xz) = b0c7bda2f551ea025bb75b0f9a58722f7322ea2d
@@ -28,7 +28,7 @@
 SHA1 (patch-gfx_graphite2_src_Bidi.cpp) = 9b357196b795f7698f0763cb6cfcd39b4aea6420
 SHA1 (patch-gfx_skia_skia_src_core_SkUtilsArm.cpp) = 94a5a88f1177e09ef7b8dbdb6439153933004356
 SHA1 (patch-gfx_ycbcr_moz.build) = 705c36b972ef1533330e4a180002cef1c22755bf
-SHA1 (patch-gfx_ycbcr_yuv__row__arm.S) = f3bf72cb9b52b0c64d8ea5d3a25a797409da9d5a
+SHA1 (patch-gfx_ycbcr_yuv__row__arm.S) = 79587891c2a1716a27d4dca0e5b5880069a430eb
 SHA1 (patch-image_decoders_nsJPEGDecoder.cpp) = fb650d1ae95321a6fc7565ffe3375944d06f95a9
 SHA1 (patch-intl_hyphenation_glue_hnjalloc.h) = abe01bea5872a57f3d00bbbf89f958621f08a655
 SHA1 (patch-ipc_chromium_src_base_atomicops.h) = 24b63a6e51d9ab27f2788ee02f2ffa7e1c36f29a
diff -r 1da27a02694c -r df0bd178cd54 www/firefox/patches/patch-gfx_ycbcr_yuv__row__arm.S
--- a/www/firefox/patches/patch-gfx_ycbcr_yuv__row__arm.S       Thu Nov 03 21:25:55 2016 +0000
+++ b/www/firefox/patches/patch-gfx_ycbcr_yuv__row__arm.S       Thu Nov 03 22:46:43 2016 +0000
@@ -1,319 +1,42 @@
-$NetBSD: patch-gfx_ycbcr_yuv__row__arm.S,v 1.2 2016/06/16 12:08:21 ryoon Exp $
+$NetBSD: patch-gfx_ycbcr_yuv__row__arm.S,v 1.3 2016/11/03 22:46:43 joerg Exp $
 
 * Copy from yuv_row_arm.s to process it with pre-processor
 
---- gfx/ycbcr/yuv_row_arm.S.orig       2014-12-01 14:53:14.000000000 +0000
+--- gfx/ycbcr/yuv_row_arm.S.orig       2016-05-12 17:13:08.000000000 +0000
 +++ gfx/ycbcr/yuv_row_arm.S
-@@ -0,0 +1,312 @@
-+/* This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
+@@ -2,6 +2,12 @@
+  * License, v. 2.0. If a copy of the MPL was not distributed with this
+  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+ 
 +#if defined(__ARM_EABI__) && !defined(__ARM_DWARF_EH__)
 +#define UNWIND
 +#else
 +#define UNWIND @
 +#endif
 +
-+    .arch   armv7-a
-+    .fpu    neon
-+/* Allow to build on targets not supporting neon, and force the object file
-+ * target to avoid bumping the final binary target */
-+    .object_arch armv4t
-+    .text
-+    .align
-+
-+    .balign 64
-+YCbCr42xToRGB565_DITHER03_CONSTS_NEON:
-+    .short -14240
-+    .short -14240+384
-+    .short   8672
-+    .short   8672+192
-+    .short -17696
-+    .short -17696+384
-+    .byte 102
-+    .byte  25
-+    .byte  52
-+    .byte 129
-+YCbCr42xToRGB565_DITHER12_CONSTS_NEON:
-+    .short -14240+128
-+    .short -14240+256
-+    .short   8672+64
-+    .short   8672+128
-+    .short -17696+128
-+    .short -17696+256
-+    .byte 102
-+    .byte  25
-+    .byte  52
-+    .byte 129
-+YCbCr42xToRGB565_DITHER21_CONSTS_NEON:
-+    .short -14240+256
-+    .short -14240+128
-+    .short   8672+128
-+    .short   8672+64
-+    .short -17696+256
-+    .short -17696+128
-+    .byte 102
-+    .byte  25
-+    .byte  52
-+    .byte 129
-+YCbCr42xToRGB565_DITHER30_CONSTS_NEON:
-+    .short -14240+384
-+    .short -14240
-+    .short   8672+192
-+    .short   8672
-+    .short -17696+384
-+    .short -17696
-+    .byte 102
-+    .byte  25
-+    .byte  52
-+    .byte 129
-+
-+@ void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON(
-+@  yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);
-+@
-+@ ctx = {
-+@   uint16_t *rgb_row;       /*r0*/
-+@   const uint8_t *y_row;    /*r1*/
-+@   const uint8_t *u_row;    /*r2*/
-+@   const uint8_t *v_row;    /*r3*/
-+@   int y_yweight;           /*r4*/
-+@   int y_pitch;             /*r5*/
-+@   int width;               /*r6*/
-+@   int source_x0_q16;       /*r7*/
-+@   int source_dx_q16;       /*r8*/
-+@   int source_uv_xoffs_q16; /*r9*/
-+@ };
-+    .global ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
-+    .type   ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, %function
-+    .balign 64
+     .arch   armv7-a
+     .fpu    neon
+ /* Allow to build on targets not supporting neon, and force the object file
+@@ -74,7 +80,8 @@ YCbCr42xToRGB565_DITHER30_CONSTS_NEON:
+     .global ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
+     .type   ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, %function
+     .balign 64
+-    .fnstart
 +    .cfi_startproc
 +    UNWIND .fnstart
-+ScaleYCbCr42xToRGB565_BilinearY_Row_NEON:
-+    STMFD       r13!,{r4-r9,r14}       @ 8 words.
-+    ADR         r14,YCbCr42xToRGB565_DITHER03_CONSTS_NEON
-+    VPUSH       {Q4-Q7}                @ 16 words.
-+    ADD         r14,r14,r1, LSL #4     @ Select the dither table to use
-+    LDMIA       r0, {r0-r9}
-+    @ Set up image index registers.
-+    ADD         r12,r8, r8
-+    VMOV.I32    D16,#0         @ Q8 = < 2| 2| 0| 0>*source_dx_q16
-+    VDUP.32     D17,r12
-+    ADD         r12,r12,r12
-+    VTRN.32     D16,D17        @ Q2 = < 2| 0| 2| 0>*source_dx_q16
-+    VDUP.32     D19,r12        @ Q9 = < 4| 4| ?| ?>*source_dx_q16
-+    ADD         r12,r12,r12
-+    VDUP.32     Q0, r7         @ Q0 = < 1| 1| 1| 1>*source_x0_q16
-+    VADD.I32    D17,D17,D19    @ Q8 = < 6| 4| 2| 0>*source_dx_q16
-+    CMP         r8, #0                 @ If source_dx_q16 is negative...
-+    VDUP.32     Q9, r12        @ Q9 = < 8| 8| 8| 8>*source_dx_q16
-+    ADDLT       r7, r7, r8, LSL #4     @ Make r7 point to the end of the block
-+    VADD.I32    Q0, Q0, Q8     @ Q0 = < 6| 4| 2| 0>*source_dx_q16+source_x0_q16
-+    SUBLT       r7, r7, r8             @ (i.e., the lowest address we'll use)
-+    VADD.I32    Q1, Q0, Q9     @ Q1 = <14|12|10| 8>*source_dx_q16+source_x0_q16
-+    VDUP.I32    Q9, r8         @ Q8 = < 1| 1| 1| 1>*source_dx_q16
-+    VADD.I32    Q2, Q0, Q9     @ Q2 = < 7| 5| 3| 1>*source_dx_q16+source_x0_q16
-+    VADD.I32    Q3, Q1, Q9     @ Q3 = <15|13|11| 9>*source_dx_q16+source_x0_q16
-+    VLD1.64     {D30,D31},[r14,:128]   @ Load some constants
-+    VMOV.I8     D28,#52
-+    VMOV.I8     D29,#129
-+    @ The basic idea here is to do aligned loads of a block of data and then
-+    @  index into it using VTBL to extract the data from the source X
-+    @  coordinate corresponding to each destination pixel.
-+    @ This is significantly less code and significantly fewer cycles than doing
-+    @  a series of single-lane loads, but it means that the X step between
-+    @  pixels must be limited to 2.0 or less, otherwise we couldn't guarantee
-+    @  that we could read 8 pixels from a single aligned 32-byte block of data.
-+    @ Q0...Q3 contain the 16.16 fixed-point X coordinates of each pixel,
-+    @  separated into even pixels and odd pixels to make extracting offsets and
-+    @  weights easier.
-+    @ We then pull out two bytes from the middle of each coordinate: the top
-+    @  byte corresponds to the integer part of the X coordinate, and the bottom
-+    @  byte corresponds to the weight to use for bilinear blending.
-+    @ These are separated out into different registers with VTRN.
-+    @ Then by subtracting the integer X coordinate of the first pixel in the
-+    @  data block we loaded, we produce an index register suitable for use by
-+    @  VTBL.
-+s42xbily_neon_loop:
-+    @ Load the Y' data.
-+    MOV         r12,r7, ASR #16
-+    VRSHRN.S32  D16,Q0, #8
-+    AND         r12,r12,#~15   @ Read 16-byte aligned blocks
-+    VDUP.I8     D20,r12
-+    ADD         r12,r1, r12    @ r12 = y_row+(source_x&~7)
-+    VRSHRN.S32  D17,Q1, #8
-+    PLD         [r12,#64]
-+    VLD1.64     {D8, D9, D10,D11},[r12,:128],r5        @ Load Y' top row
-+    ADD         r14,r7, r8, LSL #3
-+    VRSHRN.S32  D18,Q2, #8
-+    MOV         r14,r14,ASR #16
-+    VRSHRN.S32  D19,Q3, #8
-+    AND         r14,r14,#~15   @ Read 16-byte aligned blocks
-+    VLD1.64     {D12,D13,D14,D15},[r12,:128]           @ Load Y' bottom row
-+    PLD         [r12,#64]
-+    VDUP.I8     D21,r14
-+    ADD         r14,r1, r14    @ r14 = y_row+(source_x&~7)
-+    VMOV.I8     Q13,#1
-+    PLD         [r14,#64]
-+    VTRN.8      Q8, Q9         @ Q8  = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
-+                               @ Q9  = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
-+    VSUB.S8     Q9, Q9, Q10    @ Make offsets relative to the data we loaded.
-+    @ First 8 Y' pixels
-+    VTBL.8      D20,{D8, D9, D10,D11},D18      @ Index top row at source_x
-+    VTBL.8      D24,{D12,D13,D14,D15},D18      @ Index bottom row at source_x
-+    VADD.S8     Q13,Q9, Q13                    @ Add 1 to source_x
-+    VTBL.8      D22,{D8, D9, D10,D11},D26      @ Index top row at source_x+1
-+    VTBL.8      D26,{D12,D13,D14,D15},D26      @ Index bottom row at source_x+1
-+    @ Next 8 Y' pixels
-+    VLD1.64     {D8, D9, D10,D11},[r14,:128],r5        @ Load Y' top row
-+    VLD1.64     {D12,D13,D14,D15},[r14,:128]           @ Load Y' bottom row
-+    PLD         [r14,#64]
-+    VTBL.8      D21,{D8, D9, D10,D11},D19      @ Index top row at source_x
-+    VTBL.8      D25,{D12,D13,D14,D15},D19      @ Index bottom row at source_x
-+    VTBL.8      D23,{D8, D9, D10,D11},D27      @ Index top row at source_x+1
-+    VTBL.8      D27,{D12,D13,D14,D15},D27      @ Index bottom row at source_x+1
-+    @ Blend Y'.
-+    VDUP.I16    Q9, r4         @ Load the y weights.
-+    VSUBL.U8    Q4, D24,D20    @ Q5:Q4 = c-a
-+    VSUBL.U8    Q5, D25,D21
-+    VSUBL.U8    Q6, D26,D22    @ Q7:Q6 = d-b
-+    VSUBL.U8    Q7, D27,D23
-+    VMUL.S16    Q4, Q4, Q9     @ Q5:Q4 = (c-a)*yweight
-+    VMUL.S16    Q5, Q5, Q9
-+    VMUL.S16    Q6, Q6, Q9     @ Q7:Q6 = (d-b)*yweight
-+    VMUL.S16    Q7, Q7, Q9
-+    VMOVL.U8    Q12,D16        @ Promote the x weights to 16 bits.
-+    VMOVL.U8    Q13,D17        @ Sadly, there's no VMULW.
-+    VRSHRN.S16  D8, Q4, #8     @ Q4 = (c-a)*yweight+128>>8
-+    VRSHRN.S16  D9, Q5, #8
-+    VRSHRN.S16  D12,Q6, #8     @ Q6 = (d-b)*yweight+128>>8
-+    VRSHRN.S16  D13,Q7, #8
-+    VADD.I8     Q10,Q10,Q4     @ Q10 = a+((c-a)*yweight+128>>8)
-+    VADD.I8     Q11,Q11,Q6     @ Q11 = b+((d-b)*yweight+128>>8)
-+    VSUBL.U8    Q4, D22,D20    @ Q5:Q4 = b-a
-+    VSUBL.U8    Q5, D23,D21
-+    VMUL.S16    Q4, Q4, Q12    @ Q5:Q4 = (b-a)*xweight
-+    VMUL.S16    Q5, Q5, Q13
-+    VRSHRN.S16  D8, Q4, #8     @ Q4 = (b-a)*xweight+128>>8
-+    ADD         r12,r7, r9
-+    VRSHRN.S16  D9, Q5, #8
-+    MOV         r12,r12,ASR #17
-+    VADD.I8     Q8, Q10,Q4     @ Q8 = a+((b-a)*xweight+128>>8)
-+    @ Start extracting the chroma x coordinates, and load Cb and Cr.
-+    AND         r12,r12,#~15   @ Read 16-byte aligned blocks
-+    VDUP.I32    Q9, r9         @ Q9 = source_uv_xoffs_q16 x 4
-+    ADD         r14,r2, r12
-+    VADD.I32    Q10,Q0, Q9
-+    VLD1.64     {D8, D9, D10,D11},[r14,:128]   @ Load Cb
-+    PLD         [r14,#64]
-+    VADD.I32    Q11,Q1, Q9
-+    ADD         r14,r3, r12
-+    VADD.I32    Q12,Q2, Q9
-+    VLD1.64     {D12,D13,D14,D15},[r14,:128]   @ Load Cr
-+    PLD         [r14,#64]
-+    VADD.I32    Q13,Q3, Q9
-+    VRSHRN.S32  D20,Q10,#9     @ Q10 = <xEwExCwCxAwAx8w8x6w6x4w4x2w2x0w0>
-+    VRSHRN.S32  D21,Q11,#9
-+    VDUP.I8     Q9, r12
-+    VRSHRN.S32  D22,Q12,#9     @ Q11 = <xFwFxDwDxBwBx9w9x7w7x5w5x3w3x1w1>
-+    VRSHRN.S32  D23,Q13,#9
-+    @ We don't actually need the x weights, but we get them for free.
-+    @ Free ALU slot
-+    VTRN.8      Q10,Q11        @ Q10 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
-+    @ Free ALU slot            @ Q11 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
-+    VSUB.S8     Q11,Q11,Q9     @ Make offsets relative to the data we loaded.
-+    VTBL.8      D18,{D8, D9, D10,D11},D22      @ Index Cb at source_x
-+    VMOV.I8     D24,#74
-+    VTBL.8      D19,{D8, D9, D10,D11},D23
-+    VMOV.I8     D26,#102
-+    VTBL.8      D20,{D12,D13,D14,D15},D22      @ Index Cr at source_x
-+    VMOV.I8     D27,#25
-+    VTBL.8      D21,{D12,D13,D14,D15},D23
-+    @ We now have Y' in Q8, Cb in Q9, and Cr in Q10
-+    @ We use VDUP to expand constants, because it's a permute instruction, so
-+    @  it can dual issue on the A8.
-+    SUBS        r6, r6, #16    @ width -= 16
-+    VMULL.U8    Q4, D16,D24    @  Q5:Q4  = Y'*74
-+    VDUP.32     Q6, D30[1]     @  Q7:Q6  = bias_G
-+    VMULL.U8    Q5, D17,D24
-+    VDUP.32     Q7, D30[1]
-+    VMLSL.U8    Q6, D18,D27    @  Q7:Q6  = -25*Cb+bias_G
-+    VDUP.32     Q11,D30[0]     @ Q12:Q11 = bias_R
-+    VMLSL.U8    Q7, D19,D27
-+    VDUP.32     Q12,D30[0]
-+    VMLAL.U8    Q11,D20,D26    @ Q12:Q11 = 102*Cr+bias_R
-+    VDUP.32     Q8, D31[0]     @ Q13:Q8  = bias_B



Home | Main Index | Thread Index | Old Index