summaryrefslogtreecommitdiff
path: root/ffmpeg1/libavcodec/x86/h264_weight_10bit.asm
diff options
context:
space:
mode:
authorTim Redfern <tim@eclectronics.org>2013-08-26 15:10:18 +0100
committerTim Redfern <tim@eclectronics.org>2013-08-26 15:10:18 +0100
commit150c9823e71a161e97003849cf8b2f55b21520bd (patch)
tree3559c840cf403d1386708b2591d58f928c7b160d /ffmpeg1/libavcodec/x86/h264_weight_10bit.asm
parentb4b1e2630c95d5e6014463f7608d59dc2322a3b8 (diff)
adding ffmpeg specific version
Diffstat (limited to 'ffmpeg1/libavcodec/x86/h264_weight_10bit.asm')
-rw-r--r--ffmpeg1/libavcodec/x86/h264_weight_10bit.asm282
1 files changed, 282 insertions, 0 deletions
diff --git a/ffmpeg1/libavcodec/x86/h264_weight_10bit.asm b/ffmpeg1/libavcodec/x86/h264_weight_10bit.asm
new file mode 100644
index 0000000..3b09e42
--- /dev/null
+++ b/ffmpeg1/libavcodec/x86/h264_weight_10bit.asm
@@ -0,0 +1,282 @@
+;*****************************************************************************
+;* SSE2/SSE4-optimized 10-bit H.264 weighted prediction code
+;*****************************************************************************
+;* Copyright (C) 2005-2011 x264 project
+;*
+;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32 ; read-only data section via the x86inc macro; the 32 is presumably the alignment - confirm against x86inc
+
+pw_pixel_max: times 8 dw ((1 << 10)-1) ; eight words of 1023, the largest 10-bit sample value; used as the upper clamp bound
+sq_1: dq 1 ; 128-bit constant: low qword = 1 ...
+ dq 0 ; ... high qword = 0; added to the shift-count register to turn log2_denom into log2_denom+1
+
+cextern pw_1 ; word constant defined outside this file; loaded as a full vector by WEIGHT_SETUP
+
+SECTION .text ; everything below is code
+
+;-----------------------------------------------------------------------------
+; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
+; int weight, int offset);
+;-----------------------------------------------------------------------------
+%macro WEIGHT_PROLOGUE 0 ; shared entry sequence of the h264_weight_* functions: set up registers, then fetch the arguments
+.prologue: ; local label; nothing in this file jumps to it
+ PROLOGUE 0,6,8 ; x86inc PROLOGUE; arguments are presumably auto-loaded args, GPR count, vector register count - confirm against x86inc
+ movifnidn r0, r0mp ; r0 = dst (pointer-sized); movifnidn presumably emits nothing when the argument already lives in that register
+ movifnidn r1d, r1m ; r1d = stride
+ movifnidn r2d, r2m ; r2d = height
+ movifnidn r4d, r4m ; r4d = weight; log2_denom (argument 3) is not loaded here, WEIGHT_SETUP reads it directly from r3m
+ movifnidn r5d, r5m ; r5d = offset
+%endmacro
+
+%macro WEIGHT_SETUP 0 ; builds the loop-invariant vectors m0, m2, m3, m4 (plus m7 when not SSE4); overwrites r5
+ mova m0, [pw_1] ; start from the external pw_1 constant
+ movd m2, r3m ; m2 = log2_denom, used as a vector shift count
+ pslld m0, m2 ; 1<<log2_denom
+ SPLATW m0, m0 ; x86util macro; presumably broadcasts the low word to every word lane - confirm
+ shl r5, 19 ; *8, move to upper half of dword
+ lea r5, [r5+r4*2+0x10000] ; adds weight*2 (low word) and 1<<16 (the +1 in the high word); plain addition, nothing is masked - NOTE(review): confirm a negative weight cannot disturb the high word
+ movd m3, r5d ; weight<<1 | 1+(offset<<(3))
+ pshufd m3, m3, 0 ; replicate that dword into all four dword lanes
+ mova m4, [pw_pixel_max] ; m4 = upper clamp bound (1023 per word)
+ paddw m2, [sq_1] ; log2_denom+1
+%if notcpuflag(sse4)
+ pxor m7, m7 ; m7 = 0, the lower clamp bound handed to CLIPW in WEIGHT_OP; the sse4 path does not use m7
+%endif
+%endmacro
+
+%macro WEIGHT_OP 1-2 ; weights 8 pixels read relative to r0 and leaves the clamped words in m5; clobbers m6; expects the vectors from WEIGHT_SETUP
+%if %0==1 ; one-argument form: 8 consecutive pixels at r0 + first argument
+ mova m5, [r0+%1]
+ punpckhwd m6, m5, m0 ; m6 = upper 4 pixels, each paired with the word held in m0
+ punpcklwd m5, m0 ; m5 = lower 4 pixels, paired the same way
+%else ; two-argument form: 4 pixels from each of two addresses
+ movq m5, [r0+%1]
+ movq m6, [r0+%2]
+ punpcklwd m5, m0 ; pixels from the first address paired with the m0 word
+ punpcklwd m6, m0 ; pixels from the second address paired with the m0 word
+%endif
+ pmaddwd m5, m3 ; per dword: pixel * (low word of m3) + (m0 word) * (high word of m3)
+ pmaddwd m6, m3
+ psrad m5, m2 ; arithmetic right shift by the count in m2 (log2_denom+1)
+ psrad m6, m2
+%if cpuflag(sse4)
+ packusdw m5, m6 ; dwords to words with unsigned saturation, so negative results become 0
+ pminsw m5, m4 ; cap at pw_pixel_max
+%else
+ packssdw m5, m6 ; signed-saturating pack; negative results are still present afterwards
+ CLIPW m5, m7, m4 ; x86util macro; clamps the words between m7 (zero) and m4 (pw_pixel_max)
+%endif
+%endmacro
+
+%macro WEIGHT_FUNC_DBL 0 ; emits h264_weight_16_10 for the instruction set chosen by the preceding INIT_XMM
+cglobal h264_weight_16_10 ; 16 pixels (32 bytes) per row, weighted in place
+ WEIGHT_PROLOGUE
+ WEIGHT_SETUP
+.nextrow: ; one iteration per row; r2d holds the rows still to do
+ WEIGHT_OP 0 ; pixels 0-7 of the row
+ mova [r0 ], m5 ; write the result back over the source pixels
+ WEIGHT_OP 16 ; pixels 8-15 (byte offset 16)
+ mova [r0+16], m5
+ add r0, r1 ; step to the next row; full-width add although stride was loaded as r1d - NOTE(review): confirm the upper half of r1 is valid on 64-bit targets
+ dec r2d
+ jnz .nextrow ; the count is tested after the body, so a height of 0 is not handled
+ REP_RET
+%endmacro
+
+INIT_XMM sse2 ; instantiate the SSE2 variant
+WEIGHT_FUNC_DBL
+INIT_XMM sse4 ; instantiate the SSE4 variant
+WEIGHT_FUNC_DBL
+
+
+%macro WEIGHT_FUNC_MM 0 ; emits h264_weight_8_10 for the instruction set chosen by the preceding INIT_XMM
+cglobal h264_weight_8_10 ; 8 pixels (16 bytes) per row, weighted in place
+ WEIGHT_PROLOGUE
+ WEIGHT_SETUP
+.nextrow: ; one iteration per row; r2d holds the rows still to do
+ WEIGHT_OP 0 ; the whole 8-pixel row ends up in m5
+ mova [r0], m5 ; write the result back over the source pixels
+ add r0, r1 ; step to the next row; full-width add although stride was loaded as r1d - NOTE(review): confirm the upper half of r1 is valid on 64-bit targets
+ dec r2d
+ jnz .nextrow ; the count is tested after the body, so a height of 0 is not handled
+ REP_RET
+%endmacro
+
+INIT_XMM sse2 ; instantiate the SSE2 variant
+WEIGHT_FUNC_MM
+INIT_XMM sse4 ; instantiate the SSE4 variant
+WEIGHT_FUNC_MM
+
+
+%macro WEIGHT_FUNC_HALF_MM 0 ; emits h264_weight_4_10: 4 pixels per row, two rows handled per loop iteration
+cglobal h264_weight_4_10
+ WEIGHT_PROLOGUE
+ sar r2d, 1 ; iteration count = height/2 because each pass covers two rows; an odd trailing row would not be processed
+ WEIGHT_SETUP
+ lea r3, [r1*2] ; r3 = 2*stride; done after WEIGHT_SETUP, which is the only reader of r3m (log2_denom)
+.nextrow:
+ WEIGHT_OP 0, r1 ; low half of m5 = 4 pixels of the row at r0, high half = 4 pixels of the row at r0+r1
+ movh [r0], m5 ; store the first row (movh is presumably the low-half store from x86inc - confirm)
+ movhps [r0+r1], m5 ; store the high 8 bytes to the second row
+ add r0, r3 ; advance two rows
+ dec r2d
+ jnz .nextrow ; the count is tested after the body, so heights below 2 are not handled
+ REP_RET
+%endmacro
+
+INIT_XMM sse2 ; instantiate the SSE2 variant
+WEIGHT_FUNC_HALF_MM
+INIT_XMM sse4 ; instantiate the SSE4 variant
+WEIGHT_FUNC_HALF_MM
+
+
+;-----------------------------------------------------------------------------
+; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
+; int log2_denom, int weightd, int weights, int offset);
+;-----------------------------------------------------------------------------
+%if ARCH_X86_32 ; pick the register that t0 (used for the biweight offset) stands for
+DECLARE_REG_TMP 3 ; 32-bit x86: t0 is r3, so BIWEIGHT_SETUP can only load height into r3d after it is done with t0
+%else
+DECLARE_REG_TMP 7 ; otherwise: t0 is r7
+%endif
+
+%macro BIWEIGHT_PROLOGUE 0 ; shared entry sequence of the h264_biweight_* functions: set up registers, then fetch the arguments
+.prologue: ; local label; nothing in this file jumps to it
+ PROLOGUE 0,8,8 ; x86inc PROLOGUE; arguments are presumably auto-loaded args, GPR count, vector register count - confirm against x86inc
+ movifnidn r0, r0mp ; r0 = dst (pointer-sized)
+ movifnidn r1, r1mp ; r1 = src (pointer-sized)
+ movifnidn r2d, r2m ; r2d = stride
+ movifnidn r5d, r5m ; r5d = weightd
+ movifnidn r6d, r6m ; r6d = weights
+ movifnidn t0d, r7m ; t0d = offset; height (r3m) and log2_denom (r4m) are fetched later by BIWEIGHT_SETUP
+%endmacro
+
+%macro BIWEIGHT_SETUP 0 ; builds the loop-invariant vectors m3, m4, m5, m6 (plus m7 when not SSE4) and loads height into r3d; overwrites r5, r6, t0
+ lea t0, [t0*4+1] ; (offset<<2)+1
+ or t0, 1 ; bit 0 is already set by the lea above, so this leaves t0 unchanged
+ shl r6, 16 ; move weights into the upper word
+ or r5, r6 ; combine with weightd in the lower word; r5 is not masked first - NOTE(review): confirm weightd cannot be negative here
+ movd m4, r5d ; weightd | weights
+ movd m5, t0d ; ((offset<<2)+1)|1
+ movd m6, r4m ; log2_denom
+ pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom
+ paddd m6, [sq_1] ; shift count becomes log2_denom+1
+ pshufd m4, m4, 0 ; replicate the weight pair into all four dword lanes
+ pshufd m5, m5, 0 ; replicate the rounding/offset term into all four dword lanes
+ mova m3, [pw_pixel_max] ; m3 = upper clamp bound (1023 per word)
+ movifnidn r3d, r3m ; r3d = height; loaded this late because t0 is r3 on 32-bit x86 and held the offset until the movd above
+%if notcpuflag(sse4)
+ pxor m7, m7 ; m7 = 0, the lower clamp bound handed to CLIPW in BIWEIGHT; the sse4 path does not use m7
+%endif
+%endmacro
+
+%macro BIWEIGHT 1-2 ; blends 8 pixels from r0 (dst) and r1 (src) and leaves the clamped words in m0; clobbers m1, m2; expects the vectors from BIWEIGHT_SETUP
+%if %0==1 ; one-argument form: 8 consecutive pixels at the given byte offset
+ mova m0, [r0+%1] ; dst pixels
+ mova m1, [r1+%1] ; src pixels
+ punpckhwd m2, m0, m1 ; m2 = upper 4 (dst, src) word pairs
+ punpcklwd m0, m1 ; m0 = lower 4 (dst, src) word pairs
+%else ; two-argument form: 4 pixels at each of two byte offsets
+ movq m0, [r0+%1]
+ movq m1, [r1+%1]
+ punpcklwd m0, m1 ; (dst, src) pairs for the first offset
+ movq m2, [r0+%2]
+ movq m1, [r1+%2] ; m1 is reused as scratch for the second offset
+ punpcklwd m2, m1 ; (dst, src) pairs for the second offset
+%endif
+ pmaddwd m0, m4 ; per dword: dst * (low word of m4) + src * (high word of m4)
+ pmaddwd m2, m4
+ paddd m0, m5 ; add the pre-shifted rounding/offset term
+ paddd m2, m5
+ psrad m0, m6 ; arithmetic right shift by the count in m6 (log2_denom+1)
+ psrad m2, m6
+%if cpuflag(sse4)
+ packusdw m0, m2 ; dwords to words with unsigned saturation, so negative results become 0
+ pminsw m0, m3 ; cap at pw_pixel_max
+%else
+ packssdw m0, m2 ; signed-saturating pack; negative results are still present afterwards
+ CLIPW m0, m7, m3 ; x86util macro; clamps the words between m7 (zero) and m3 (pw_pixel_max)
+%endif
+%endmacro
+
+%macro BIWEIGHT_FUNC_DBL 0 ; emits h264_biweight_16_10 for the instruction set chosen by the preceding INIT_XMM
+cglobal h264_biweight_16_10 ; 16 pixels (32 bytes) per row; the result overwrites dst
+ BIWEIGHT_PROLOGUE
+ BIWEIGHT_SETUP
+.nextrow: ; one iteration per row; r3d holds the rows still to do
+ BIWEIGHT 0 ; pixels 0-7 of the row
+ mova [r0 ], m0 ; store into dst
+ BIWEIGHT 16 ; pixels 8-15 (byte offset 16)
+ mova [r0+16], m0
+ add r0, r2 ; dst and src advance by the same stride; full-width add although stride was loaded as r2d - NOTE(review): confirm the upper half of r2 is valid on 64-bit targets
+ add r1, r2
+ dec r3d
+ jnz .nextrow ; the count is tested after the body, so a height of 0 is not handled
+ REP_RET
+%endmacro
+
+INIT_XMM sse2 ; instantiate the SSE2 variant
+BIWEIGHT_FUNC_DBL
+INIT_XMM sse4 ; instantiate the SSE4 variant
+BIWEIGHT_FUNC_DBL
+
+%macro BIWEIGHT_FUNC 0 ; emits h264_biweight_8_10 for the instruction set chosen by the preceding INIT_XMM
+cglobal h264_biweight_8_10 ; 8 pixels (16 bytes) per row; the result overwrites dst
+ BIWEIGHT_PROLOGUE
+ BIWEIGHT_SETUP
+.nextrow: ; one iteration per row; r3d holds the rows still to do
+ BIWEIGHT 0 ; the whole 8-pixel row ends up in m0
+ mova [r0], m0 ; store into dst
+ add r0, r2 ; dst and src advance by the same stride; full-width add although stride was loaded as r2d - NOTE(review): confirm the upper half of r2 is valid on 64-bit targets
+ add r1, r2
+ dec r3d
+ jnz .nextrow ; the count is tested after the body, so a height of 0 is not handled
+ REP_RET
+%endmacro
+
+INIT_XMM sse2 ; instantiate the SSE2 variant
+BIWEIGHT_FUNC
+INIT_XMM sse4 ; instantiate the SSE4 variant
+BIWEIGHT_FUNC
+
+%macro BIWEIGHT_FUNC_HALF 0 ; emits h264_biweight_4_10: 4 pixels per row, two rows handled per loop iteration
+cglobal h264_biweight_4_10
+ BIWEIGHT_PROLOGUE
+ BIWEIGHT_SETUP
+ sar r3d, 1 ; iteration count = height/2 because each pass covers two rows; an odd trailing row would not be processed
+ lea r4, [r2*2] ; r4 = 2*stride; done after BIWEIGHT_SETUP, which reads log2_denom from r4m
+.nextrow:
+ BIWEIGHT 0, r2 ; low half of m0 = 4 pixels of the current row, high half = 4 pixels of the row one stride below
+ movh [r0 ], m0 ; store the first row (movh is presumably the low-half store from x86inc - confirm)
+ movhps [r0+r2], m0 ; store the high 8 bytes to the second row
+ add r0, r4 ; dst and src both advance two rows
+ add r1, r4
+ dec r3d
+ jnz .nextrow ; the count is tested after the body, so heights below 2 are not handled
+ REP_RET
+%endmacro
+
+INIT_XMM sse2 ; instantiate the SSE2 variant
+BIWEIGHT_FUNC_HALF
+INIT_XMM sse4 ; instantiate the SSE4 variant
+BIWEIGHT_FUNC_HALF