diff options
| author | Tim Redfern <tim@eclectronics.org> | 2013-08-26 15:10:18 +0100 |
|---|---|---|
| committer | Tim Redfern <tim@eclectronics.org> | 2013-08-26 15:10:18 +0100 |
| commit | 150c9823e71a161e97003849cf8b2f55b21520bd (patch) | |
| tree | 3559c840cf403d1386708b2591d58f928c7b160d /ffmpeg1/libavcodec/x86/h264_weight_10bit.asm | |
| parent | b4b1e2630c95d5e6014463f7608d59dc2322a3b8 (diff) | |
adding ffmpeg specific version
Diffstat (limited to 'ffmpeg1/libavcodec/x86/h264_weight_10bit.asm')
| -rw-r--r-- | ffmpeg1/libavcodec/x86/h264_weight_10bit.asm | 282 |
1 files changed, 282 insertions, 0 deletions
diff --git a/ffmpeg1/libavcodec/x86/h264_weight_10bit.asm b/ffmpeg1/libavcodec/x86/h264_weight_10bit.asm new file mode 100644 index 0000000..3b09e42 --- /dev/null +++ b/ffmpeg1/libavcodec/x86/h264_weight_10bit.asm @@ -0,0 +1,282 @@ +;***************************************************************************** +;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code +;***************************************************************************** +;* Copyright (C) 2005-2011 x264 project +;* +;* Authors: Daniel Kang <daniel.d.kang@gmail.com> +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +pw_pixel_max: times 8 dw ((1 << 10)-1) +sq_1: dq 1 + dq 0 + +cextern pw_1 + +SECTION .text + +;----------------------------------------------------------------------------- +; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom, +; int weight, int offset); +;----------------------------------------------------------------------------- +%macro WEIGHT_PROLOGUE 0 +.prologue: + PROLOGUE 0,6,8 + movifnidn r0, r0mp + movifnidn r1d, r1m + movifnidn r2d, r2m + movifnidn r4d, r4m + movifnidn r5d, r5m +%endmacro + +%macro WEIGHT_SETUP 0 + mova m0, [pw_1] + movd m2, r3m + pslld m0, m2 ; 1<<log2_denom + SPLATW m0, m0 + shl r5, 19 ; *8, move to upper half of dword + lea r5, [r5+r4*2+0x10000] + movd m3, r5d ; weight<<1 | 1+(offset<<(3)) + pshufd m3, m3, 0 + mova m4, [pw_pixel_max] + paddw m2, [sq_1] ; log2_denom+1 +%if notcpuflag(sse4) + pxor m7, m7 +%endif +%endmacro + +%macro WEIGHT_OP 1-2 +%if %0==1 + mova m5, [r0+%1] + punpckhwd m6, m5, m0 + punpcklwd m5, m0 +%else + movq m5, [r0+%1] + movq m6, [r0+%2] + punpcklwd m5, m0 + punpcklwd m6, m0 +%endif + pmaddwd m5, m3 + pmaddwd m6, m3 + psrad m5, m2 + psrad m6, m2 +%if cpuflag(sse4) + packusdw m5, m6 + pminsw m5, m4 +%else + packssdw m5, m6 + CLIPW m5, m7, m4 +%endif +%endmacro + +%macro WEIGHT_FUNC_DBL 0 +cglobal h264_weight_16_10 + WEIGHT_PROLOGUE + WEIGHT_SETUP +.nextrow: + WEIGHT_OP 0 + mova [r0 ], m5 + WEIGHT_OP 16 + mova [r0+16], m5 + add r0, r1 + dec r2d + jnz .nextrow + REP_RET +%endmacro + +INIT_XMM sse2 +WEIGHT_FUNC_DBL +INIT_XMM sse4 +WEIGHT_FUNC_DBL + + +%macro WEIGHT_FUNC_MM 0 +cglobal h264_weight_8_10 + WEIGHT_PROLOGUE + WEIGHT_SETUP +.nextrow: + WEIGHT_OP 0 + mova [r0], m5 + add r0, r1 + dec r2d + jnz .nextrow + REP_RET +%endmacro + +INIT_XMM sse2 +WEIGHT_FUNC_MM +INIT_XMM sse4 +WEIGHT_FUNC_MM + + +%macro WEIGHT_FUNC_HALF_MM 0 +cglobal h264_weight_4_10 + WEIGHT_PROLOGUE + sar r2d, 1 + WEIGHT_SETUP + lea r3, [r1*2] +.nextrow: + WEIGHT_OP 0, r1 + movh [r0], m5 + movhps [r0+r1], m5 + add r0, r3 + dec r2d + jnz .nextrow + REP_RET +%endmacro + +INIT_XMM sse2 +WEIGHT_FUNC_HALF_MM +INIT_XMM sse4 +WEIGHT_FUNC_HALF_MM + + +;----------------------------------------------------------------------------- +; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height, +; int log2_denom, int weightd, int weights, int offset); +;----------------------------------------------------------------------------- +%if ARCH_X86_32 +DECLARE_REG_TMP 3 +%else +DECLARE_REG_TMP 7 +%endif + +%macro BIWEIGHT_PROLOGUE 0 +.prologue: + PROLOGUE 0,8,8 + movifnidn r0, r0mp + movifnidn r1, r1mp + movifnidn r2d, r2m + movifnidn r5d, r5m + movifnidn r6d, r6m + movifnidn t0d, r7m +%endmacro + +%macro BIWEIGHT_SETUP 0 + lea t0, [t0*4+1] ; (offset<<2)+1 + or t0, 1 + shl r6, 16 + or r5, r6 + movd m4, r5d ; weightd | weights + movd m5, t0d ; (offset+1)|1 + movd m6, r4m ; log2_denom + pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom + paddd m6, [sq_1] + pshufd m4, m4, 0 + pshufd m5, m5, 0 + mova m3, [pw_pixel_max] + movifnidn r3d, r3m +%if notcpuflag(sse4) + pxor m7, m7 +%endif +%endmacro + +%macro BIWEIGHT 1-2 +%if %0==1 + mova m0, [r0+%1] + mova m1, [r1+%1] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 +%else + movq m0, [r0+%1] + movq m1, [r1+%1] + punpcklwd m0, m1 + movq m2, [r0+%2] + movq m1, [r1+%2] + punpcklwd m2, m1 +%endif + pmaddwd m0, m4 + pmaddwd m2, m4 + paddd m0, m5 + paddd m2, m5 + psrad m0, m6 + psrad m2, m6 +%if cpuflag(sse4) + packusdw m0, m2 + pminsw m0, m3 +%else + packssdw m0, m2 + CLIPW m0, m7, m3 +%endif +%endmacro + +%macro BIWEIGHT_FUNC_DBL 0 +cglobal h264_biweight_16_10 + BIWEIGHT_PROLOGUE + BIWEIGHT_SETUP +.nextrow: + BIWEIGHT 0 + mova [r0 ], m0 + BIWEIGHT 16 + mova [r0+16], m0 + add r0, r2 + add r1, r2 + dec r3d + jnz .nextrow + REP_RET +%endmacro + +INIT_XMM sse2 +BIWEIGHT_FUNC_DBL +INIT_XMM sse4 +BIWEIGHT_FUNC_DBL + +%macro BIWEIGHT_FUNC 0 +cglobal h264_biweight_8_10 + BIWEIGHT_PROLOGUE + BIWEIGHT_SETUP +.nextrow: + BIWEIGHT 0 + mova [r0], m0 + add r0, r2 + add r1, r2 + dec r3d + jnz .nextrow + REP_RET +%endmacro + +INIT_XMM sse2 +BIWEIGHT_FUNC +INIT_XMM sse4 +BIWEIGHT_FUNC + +%macro BIWEIGHT_FUNC_HALF 0 +cglobal h264_biweight_4_10 + BIWEIGHT_PROLOGUE + BIWEIGHT_SETUP + sar r3d, 1 + lea r4, [r2*2] +.nextrow: + BIWEIGHT 0, r2 + movh [r0 ], m0 + movhps [r0+r2], m0 + add r0, r4 + add r1, r4 + dec r3d + jnz .nextrow + REP_RET +%endmacro + +INIT_XMM sse2 +BIWEIGHT_FUNC_HALF +INIT_XMM sse4 +BIWEIGHT_FUNC_HALF |
