Diffstat (limited to 'ffmpeg1/libavcodec/x86')
95 files changed, 0 insertions, 39398 deletions
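Editorial note (not part of the commit): the files removed below are x86 SIMD implementations of DSP routines; their prototypes appear in the deleted init files such as ac3dsp_init.c. As a concrete reference point, here is a minimal scalar sketch of what the deleted ff_float_to_fixed24_{3dnow,sse,sse2} routines in ac3dsp.asm compute. The signature matches the declarations in ac3dsp_init.c below; the C body and the function name are illustrative, and only the scale-by-2^24-and-round semantics (pf_1_24 = 16777216.0f; the 3DNow! variant truncates instead of rounding, as the assembly comment notes) come from the deleted sources.

/* Scalar sketch of float_to_fixed24: scale float samples by 2^24 and
 * round to the nearest 32-bit integer. The SIMD versions removed below
 * process 8/16/32 samples per loop iteration instead. */
#include <math.h>
#include <stdint.h>

static void float_to_fixed24_scalar(int32_t *dst, const float *src,
                                    unsigned int len)
{
    while (len--)
        *dst++ = (int32_t)lrintf(*src++ * 16777216.0f);
}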
diff --git a/ffmpeg1/libavcodec/x86/Makefile b/ffmpeg1/libavcodec/x86/Makefile deleted file mode 100644 index 38ef867..0000000 --- a/ffmpeg1/libavcodec/x86/Makefile +++ /dev/null @@ -1,95 +0,0 @@ -OBJS += x86/fmtconvert_init.o \ - x86/constants.o - -OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o -OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o -OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o -OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o -OBJS-$(CONFIG_FFT) += x86/fft_init.o -OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o -OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o -OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o -OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o -OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o -OBJS-$(CONFIG_LPC) += x86/lpc.o -OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o -OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodec.o -OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o -OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o -OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o -OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o -OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o -OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp_init.o -OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp_init.o \ - x86/rv40dsp_init.o -OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o -OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o -OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_init.o -OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o -OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o -OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o -OBJS-$(CONFIG_VP5_DECODER) += x86/vp56dsp_init.o -OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp_init.o -OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp_init.o -OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o - -MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o \ - x86/fdct.o \ - x86/idct_mmx_xvid.o \ - x86/idct_sse2_xvid.o \ - x86/simple_idct.o \ - -MMX-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o \ - x86/motion_est.o -MMX-OBJS-$(CONFIG_DIRAC_DECODER) += x86/dirac_dwt.o -MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o -MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o -MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o - -YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o -YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o -YASM-OBJS-$(CONFIG_DCT) += x86/dct32.o -YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_mmx.o x86/diracdsp_yasm.o\ - x86/dwt_yasm.o -YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o -YASM-OBJS-$(CONFIG_FFT) += x86/fft.o -YASM-OBJS-$(CONFIG_H263_DECODER) += x86/h263_loopfilter.o -YASM-OBJS-$(CONFIG_H263_ENCODER) += x86/h263_loopfilter.o -YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \ - x86/h264_chromamc_10bit.o -YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \ - x86/h264_deblock_10bit.o \ - x86/h264_idct.o \ - x86/h264_idct_10bit.o \ - x86/h264_weight.o \ - x86/h264_weight_10bit.o -YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \ - x86/h264_intrapred_10bit.o -YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \ - x86/h264_qpel_10bit.o \ - x86/qpelbase.o \ - x86/fpelbase.o -YASM-OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp.o \ - x86/fpelbase.o -YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o -YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o -YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o -YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o -YASM-OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp.o -YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp.o \ - x86/rv40dsp.o -YASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o -YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o 
-YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o -YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o -YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o -YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp.o -YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o - -YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \ - x86/mpeg4qpel.o \ - x86/qpelbase.o \ - x86/fpelbase.o - -YASM-OBJS += x86/deinterlace.o \ - x86/fmtconvert.o diff --git a/ffmpeg1/libavcodec/x86/ac3dsp.asm b/ffmpeg1/libavcodec/x86/ac3dsp.asm deleted file mode 100644 index 98fb446..0000000 --- a/ffmpeg1/libavcodec/x86/ac3dsp.asm +++ /dev/null @@ -1,457 +0,0 @@ -;***************************************************************************** -;* x86-optimized AC-3 DSP utils -;* Copyright (c) 2011 Justin Ruggles -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -; 16777216.0f - used in ff_float_to_fixed24() -pf_1_24: times 4 dd 0x4B800000 - -; used in ff_ac3_compute_mantissa_size() -cextern ac3_bap_bits -pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768 -pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7 - -; used in ff_ac3_extract_exponents() -pd_1: times 4 dd 1 -pd_151: times 4 dd 151 - -SECTION .text - -;----------------------------------------------------------------------------- -; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs) -;----------------------------------------------------------------------------- - -%macro AC3_EXPONENT_MIN 0 -cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset - shl reuse_blksq, 8 - jz .end - LOOP_ALIGN -.nextexp: - mov offsetq, reuse_blksq - mova m0, [expq+offsetq] - sub offsetq, 256 - LOOP_ALIGN -.nextblk: - PMINUB m0, [expq+offsetq], m1 - sub offsetq, 256 - jae .nextblk - mova [expq], m0 - add expq, mmsize - sub expnq, mmsize - jg .nextexp -.end: - REP_RET -%endmacro - -%define LOOP_ALIGN -INIT_MMX mmx -AC3_EXPONENT_MIN -%if HAVE_MMXEXT_EXTERNAL -%define LOOP_ALIGN ALIGN 16 -INIT_MMX mmxext -AC3_EXPONENT_MIN -%endif -%if HAVE_SSE2_EXTERNAL -INIT_XMM sse2 -AC3_EXPONENT_MIN -%endif -%undef LOOP_ALIGN - -;----------------------------------------------------------------------------- -; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len) -; -; This function uses 2 different methods to calculate a valid result. -; 1) logical 'or' of abs of each element -; This is used for ssse3 because of the pabsw instruction. -; It is also used for mmx because of the lack of min/max instructions. -; 2) calculate min/max for the array, then or(abs(min),abs(max)) -; This is used for mmxext and sse2 because they have pminsw/pmaxsw. 
-;----------------------------------------------------------------------------- - -; logical 'or' of 4 or 8 words in an mmx or xmm register into the low word -%macro OR_WORDS_HORIZ 2 ; src, tmp -%if cpuflag(sse2) - movhlps %2, %1 - por %1, %2 - pshuflw %2, %1, q0032 - por %1, %2 - pshuflw %2, %1, q0001 - por %1, %2 -%elif cpuflag(mmxext) - pshufw %2, %1, q0032 - por %1, %2 - pshufw %2, %1, q0001 - por %1, %2 -%else ; mmx - movq %2, %1 - psrlq %2, 32 - por %1, %2 - movq %2, %1 - psrlq %2, 16 - por %1, %2 -%endif -%endmacro - -%macro AC3_MAX_MSB_ABS_INT16 1 -cglobal ac3_max_msb_abs_int16, 2,2,5, src, len - pxor m2, m2 - pxor m3, m3 -.loop: -%ifidn %1, min_max - mova m0, [srcq] - mova m1, [srcq+mmsize] - pminsw m2, m0 - pminsw m2, m1 - pmaxsw m3, m0 - pmaxsw m3, m1 -%else ; or_abs -%if notcpuflag(ssse3) - mova m0, [srcq] - mova m1, [srcq+mmsize] - ABS2 m0, m1, m3, m4 -%else ; ssse3 - ; using memory args is faster for ssse3 - pabsw m0, [srcq] - pabsw m1, [srcq+mmsize] -%endif - por m2, m0 - por m2, m1 -%endif - add srcq, mmsize*2 - sub lend, mmsize - ja .loop -%ifidn %1, min_max - ABS2 m2, m3, m0, m1 - por m2, m3 -%endif - OR_WORDS_HORIZ m2, m0 - movd eax, m2 - and eax, 0xFFFF - RET -%endmacro - -INIT_MMX mmx -AC3_MAX_MSB_ABS_INT16 or_abs -INIT_MMX mmxext -AC3_MAX_MSB_ABS_INT16 min_max -INIT_XMM sse2 -AC3_MAX_MSB_ABS_INT16 min_max -INIT_XMM ssse3 -AC3_MAX_MSB_ABS_INT16 or_abs - -;----------------------------------------------------------------------------- -; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32() -;----------------------------------------------------------------------------- - -%macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction, instruction set -cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift - movd m0, shiftd -.loop: - mova m1, [srcq ] - mova m2, [srcq+mmsize ] - mova m3, [srcq+mmsize*2] - mova m4, [srcq+mmsize*3] - %3 m1, m0 - %3 m2, m0 - %3 m3, m0 - %3 m4, m0 - mova [srcq ], m1 - mova [srcq+mmsize ], m2 - mova [srcq+mmsize*2], m3 - mova [srcq+mmsize*3], m4 - add srcq, mmsize*4 - sub lend, mmsize*32/%2 - ja .loop -.end: - REP_RET -%endmacro - -;----------------------------------------------------------------------------- -; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift) -;----------------------------------------------------------------------------- - -INIT_MMX mmx -AC3_SHIFT l, 16, psllw -INIT_XMM sse2 -AC3_SHIFT l, 16, psllw - -;----------------------------------------------------------------------------- -; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift) -;----------------------------------------------------------------------------- - -INIT_MMX mmx -AC3_SHIFT r, 32, psrad -INIT_XMM sse2 -AC3_SHIFT r, 32, psrad - -;----------------------------------------------------------------------------- -; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len) -;----------------------------------------------------------------------------- - -; The 3DNow! version is not bit-identical because pf2id uses truncation rather -; than round-to-nearest. 
-INIT_MMX 3dnow -cglobal float_to_fixed24, 3, 3, 0, dst, src, len - movq m0, [pf_1_24] -.loop: - movq m1, [srcq ] - movq m2, [srcq+8 ] - movq m3, [srcq+16] - movq m4, [srcq+24] - pfmul m1, m0 - pfmul m2, m0 - pfmul m3, m0 - pfmul m4, m0 - pf2id m1, m1 - pf2id m2, m2 - pf2id m3, m3 - pf2id m4, m4 - movq [dstq ], m1 - movq [dstq+8 ], m2 - movq [dstq+16], m3 - movq [dstq+24], m4 - add srcq, 32 - add dstq, 32 - sub lend, 8 - ja .loop - femms - RET - -INIT_XMM sse -cglobal float_to_fixed24, 3, 3, 3, dst, src, len - movaps m0, [pf_1_24] -.loop: - movaps m1, [srcq ] - movaps m2, [srcq+16] - mulps m1, m0 - mulps m2, m0 - cvtps2pi mm0, m1 - movhlps m1, m1 - cvtps2pi mm1, m1 - cvtps2pi mm2, m2 - movhlps m2, m2 - cvtps2pi mm3, m2 - movq [dstq ], mm0 - movq [dstq+ 8], mm1 - movq [dstq+16], mm2 - movq [dstq+24], mm3 - add srcq, 32 - add dstq, 32 - sub lend, 8 - ja .loop - emms - RET - -INIT_XMM sse2 -cglobal float_to_fixed24, 3, 3, 9, dst, src, len - movaps m0, [pf_1_24] -.loop: - movaps m1, [srcq ] - movaps m2, [srcq+16 ] - movaps m3, [srcq+32 ] - movaps m4, [srcq+48 ] -%ifdef m8 - movaps m5, [srcq+64 ] - movaps m6, [srcq+80 ] - movaps m7, [srcq+96 ] - movaps m8, [srcq+112] -%endif - mulps m1, m0 - mulps m2, m0 - mulps m3, m0 - mulps m4, m0 -%ifdef m8 - mulps m5, m0 - mulps m6, m0 - mulps m7, m0 - mulps m8, m0 -%endif - cvtps2dq m1, m1 - cvtps2dq m2, m2 - cvtps2dq m3, m3 - cvtps2dq m4, m4 -%ifdef m8 - cvtps2dq m5, m5 - cvtps2dq m6, m6 - cvtps2dq m7, m7 - cvtps2dq m8, m8 -%endif - movdqa [dstq ], m1 - movdqa [dstq+16 ], m2 - movdqa [dstq+32 ], m3 - movdqa [dstq+48 ], m4 -%ifdef m8 - movdqa [dstq+64 ], m5 - movdqa [dstq+80 ], m6 - movdqa [dstq+96 ], m7 - movdqa [dstq+112], m8 - add srcq, 128 - add dstq, 128 - sub lenq, 32 -%else - add srcq, 64 - add dstq, 64 - sub lenq, 16 -%endif - ja .loop - REP_RET - -;------------------------------------------------------------------------------ -; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16]) -;------------------------------------------------------------------------------ - -%macro PHADDD4 2 ; xmm src, xmm tmp - movhlps %2, %1 - paddd %1, %2 - pshufd %2, %1, 0x1 - paddd %1, %2 -%endmacro - -INIT_XMM sse2 -cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum - movdqa m0, [mant_cntq ] - movdqa m1, [mant_cntq+ 1*16] - paddw m0, [mant_cntq+ 2*16] - paddw m1, [mant_cntq+ 3*16] - paddw m0, [mant_cntq+ 4*16] - paddw m1, [mant_cntq+ 5*16] - paddw m0, [mant_cntq+ 6*16] - paddw m1, [mant_cntq+ 7*16] - paddw m0, [mant_cntq+ 8*16] - paddw m1, [mant_cntq+ 9*16] - paddw m0, [mant_cntq+10*16] - paddw m1, [mant_cntq+11*16] - pmaddwd m0, [ac3_bap_bits ] - pmaddwd m1, [ac3_bap_bits+16] - paddd m0, m1 - PHADDD4 m0, m1 - movd sumd, m0 - movdqa m3, [pw_bap_mul1] - movhpd m0, [mant_cntq +2] - movlpd m0, [mant_cntq+1*32+2] - movhpd m1, [mant_cntq+2*32+2] - movlpd m1, [mant_cntq+3*32+2] - movhpd m2, [mant_cntq+4*32+2] - movlpd m2, [mant_cntq+5*32+2] - pmulhuw m0, m3 - pmulhuw m1, m3 - pmulhuw m2, m3 - paddusw m0, m1 - paddusw m0, m2 - pmaddwd m0, [pw_bap_mul2] - PHADDD4 m0, m1 - movd eax, m0 - add eax, sumd - RET - -;------------------------------------------------------------------------------ -; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs) -;------------------------------------------------------------------------------ - -%macro PABSD 1-2 ; src/dst, unused -%if cpuflag(ssse3) - pabsd %1, %1 -%else ; src/dst, tmp - pxor %2, %2 - pcmpgtd %2, %1 - pxor %1, %2 - psubd %1, %2 -%endif -%endmacro - -%if HAVE_AMD3DNOW_EXTERNAL -INIT_MMX 3dnow 
-cglobal ac3_extract_exponents, 3, 3, 0, exp, coef, len - add expq, lenq - lea coefq, [coefq+4*lenq] - neg lenq - movq m3, [pd_1] - movq m4, [pd_151] -.loop: - movq m0, [coefq+4*lenq ] - movq m1, [coefq+4*lenq+8] - PABSD m0, m2 - PABSD m1, m2 - pslld m0, 1 - por m0, m3 - pi2fd m2, m0 - psrld m2, 23 - movq m0, m4 - psubd m0, m2 - pslld m1, 1 - por m1, m3 - pi2fd m2, m1 - psrld m2, 23 - movq m1, m4 - psubd m1, m2 - packssdw m0, m0 - packuswb m0, m0 - packssdw m1, m1 - packuswb m1, m1 - punpcklwd m0, m1 - movd [expq+lenq], m0 - add lenq, 4 - jl .loop - REP_RET -%endif - -%macro AC3_EXTRACT_EXPONENTS 0 -cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len - add expq, lenq - lea coefq, [coefq+4*lenq] - neg lenq - mova m2, [pd_1] - mova m3, [pd_151] -.loop: - ; move 4 32-bit coefs to xmm0 - mova m0, [coefq+4*lenq] - ; absolute value - PABSD m0, m1 - ; convert to float and extract exponents - pslld m0, 1 - por m0, m2 - cvtdq2ps m1, m0 - psrld m1, 23 - mova m0, m3 - psubd m0, m1 - ; move the lowest byte in each of 4 dwords to the low dword - ; NOTE: We cannot just extract the low bytes with pshufb because the dword - ; result for 16777215 is -1 due to float inaccuracy. Using packuswb - ; clips this to 0, which is the correct exponent. - packssdw m0, m0 - packuswb m0, m0 - movd [expq+lenq], m0 - - add lenq, 4 - jl .loop - REP_RET -%endmacro - -%if HAVE_SSE2_EXTERNAL -INIT_XMM sse2 -AC3_EXTRACT_EXPONENTS -%endif -%if HAVE_SSSE3_EXTERNAL -INIT_XMM ssse3 -AC3_EXTRACT_EXPONENTS -%endif diff --git a/ffmpeg1/libavcodec/x86/ac3dsp_init.c b/ffmpeg1/libavcodec/x86/ac3dsp_init.c deleted file mode 100644 index e2a190e..0000000 --- a/ffmpeg1/libavcodec/x86/ac3dsp_init.c +++ /dev/null @@ -1,232 +0,0 @@ -/* - * x86-optimized AC-3 DSP utils - * Copyright (c) 2011 Justin Ruggles - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/mem.h" -#include "libavutil/x86/asm.h" -#include "libavutil/x86/cpu.h" -#include "dsputil_mmx.h" -#include "libavcodec/ac3.h" -#include "libavcodec/ac3dsp.h" - -extern void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int nb_coefs); -extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs); -extern void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs); - -extern int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len); -extern int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len); -extern int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len); -extern int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len); - -extern void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift); -extern void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift); - -extern void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift); -extern void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift); - -extern void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len); -extern void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len); -extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len); - -extern int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]); - -extern void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs); -extern void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs); -extern void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs); - -#if ARCH_X86_32 && defined(__INTEL_COMPILER) -# undef HAVE_7REGS -# define HAVE_7REGS 0 -#endif - -#if HAVE_SSE_INLINE && HAVE_7REGS - -#define IF1(x) x -#define IF0(x) - -#define MIX5(mono, stereo) \ - __asm__ volatile ( \ - "movss 0(%1), %%xmm5 \n" \ - "movss 8(%1), %%xmm6 \n" \ - "movss 24(%1), %%xmm7 \n" \ - "shufps $0, %%xmm5, %%xmm5 \n" \ - "shufps $0, %%xmm6, %%xmm6 \n" \ - "shufps $0, %%xmm7, %%xmm7 \n" \ - "1: \n" \ - "movaps (%0, %2), %%xmm0 \n" \ - "movaps (%0, %3), %%xmm1 \n" \ - "movaps (%0, %4), %%xmm2 \n" \ - "movaps (%0, %5), %%xmm3 \n" \ - "movaps (%0, %6), %%xmm4 \n" \ - "mulps %%xmm5, %%xmm0 \n" \ - "mulps %%xmm6, %%xmm1 \n" \ - "mulps %%xmm5, %%xmm2 \n" \ - "mulps %%xmm7, %%xmm3 \n" \ - "mulps %%xmm7, %%xmm4 \n" \ - stereo("addps %%xmm1, %%xmm0 \n") \ - "addps %%xmm1, %%xmm2 \n" \ - "addps %%xmm3, %%xmm0 \n" \ - "addps %%xmm4, %%xmm2 \n" \ - mono("addps %%xmm2, %%xmm0 \n") \ - "movaps %%xmm0, (%0, %2) \n" \ - stereo("movaps %%xmm2, (%0, %3) \n") \ - "add $16, %0 \n" \ - "jl 1b \n" \ - : "+&r"(i) \ - : "r"(matrix), \ - "r"(samples[0] + len), \ - "r"(samples[1] + len), \ - "r"(samples[2] + len), \ - "r"(samples[3] + len), \ - "r"(samples[4] + len) \ - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ - "%xmm4", "%xmm5", "%xmm6", "%xmm7",) \ - "memory" \ - ); - -#define MIX_MISC(stereo) \ - __asm__ volatile ( \ - "mov %5, %2 \n" \ - "1: \n" \ - "mov -%c7(%6, %2, %c8), %3 \n" \ - "movaps (%3, %0), %%xmm0 \n" \ - stereo("movaps %%xmm0, %%xmm1 \n") \ - "mulps %%xmm4, %%xmm0 \n" \ - stereo("mulps %%xmm5, %%xmm1 \n") \ - "2: \n" \ - "mov (%6, %2, %c8), %1 
\n" \ - "movaps (%1, %0), %%xmm2 \n" \ - stereo("movaps %%xmm2, %%xmm3 \n") \ - "mulps (%4, %2, 8), %%xmm2 \n" \ - stereo("mulps 16(%4, %2, 8), %%xmm3 \n") \ - "addps %%xmm2, %%xmm0 \n" \ - stereo("addps %%xmm3, %%xmm1 \n") \ - "add $4, %2 \n" \ - "jl 2b \n" \ - "mov %5, %2 \n" \ - stereo("mov (%6, %2, %c8), %1 \n") \ - "movaps %%xmm0, (%3, %0) \n" \ - stereo("movaps %%xmm1, (%1, %0) \n") \ - "add $16, %0 \n" \ - "jl 1b \n" \ - : "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m) \ - : "r"(matrix_simd + in_ch), \ - "g"((intptr_t) - 4 * (in_ch - 1)), \ - "r"(samp + in_ch), \ - "i"(sizeof(float *)), "i"(sizeof(float *)/4) \ - : "memory" \ - ); - -static void ac3_downmix_sse(float **samples, float (*matrix)[2], - int out_ch, int in_ch, int len) -{ - int (*matrix_cmp)[2] = (int(*)[2])matrix; - intptr_t i, j, k, m; - - i = -len * sizeof(float); - if (in_ch == 5 && out_ch == 2 && - !(matrix_cmp[0][1] | matrix_cmp[2][0] | - matrix_cmp[3][1] | matrix_cmp[4][0] | - (matrix_cmp[1][0] ^ matrix_cmp[1][1]) | - (matrix_cmp[0][0] ^ matrix_cmp[2][1]))) { - MIX5(IF0, IF1); - } else if (in_ch == 5 && out_ch == 1 && - matrix_cmp[0][0] == matrix_cmp[2][0] && - matrix_cmp[3][0] == matrix_cmp[4][0]) { - MIX5(IF1, IF0); - } else { - DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4]; - float *samp[AC3_MAX_CHANNELS]; - - for (j = 0; j < in_ch; j++) - samp[j] = samples[j] + len; - - j = 2 * in_ch * sizeof(float); - __asm__ volatile ( - "1: \n" - "sub $8, %0 \n" - "movss (%2, %0), %%xmm4 \n" - "movss 4(%2, %0), %%xmm5 \n" - "shufps $0, %%xmm4, %%xmm4 \n" - "shufps $0, %%xmm5, %%xmm5 \n" - "movaps %%xmm4, (%1, %0, 4) \n" - "movaps %%xmm5, 16(%1, %0, 4) \n" - "jg 1b \n" - : "+&r"(j) - : "r"(matrix_simd), "r"(matrix) - : "memory" - ); - if (out_ch == 2) { - MIX_MISC(IF1); - } else { - MIX_MISC(IF0); - } - } -} - -#endif /* HAVE_SSE_INLINE && HAVE_7REGS */ - -av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact) -{ - int mm_flags = av_get_cpu_flags(); - - if (EXTERNAL_MMX(mm_flags)) { - c->ac3_exponent_min = ff_ac3_exponent_min_mmx; - c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx; - c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx; - c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx; - } - if (EXTERNAL_AMD3DNOW(mm_flags)) { - c->extract_exponents = ff_ac3_extract_exponents_3dnow; - if (!bit_exact) { - c->float_to_fixed24 = ff_float_to_fixed24_3dnow; - } - } - if (EXTERNAL_MMXEXT(mm_flags)) { - c->ac3_exponent_min = ff_ac3_exponent_min_mmxext; - c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext; - } - if (EXTERNAL_SSE(mm_flags)) { - c->float_to_fixed24 = ff_float_to_fixed24_sse; - } - if (EXTERNAL_SSE2(mm_flags)) { - c->ac3_exponent_min = ff_ac3_exponent_min_sse2; - c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2; - c->float_to_fixed24 = ff_float_to_fixed24_sse2; - c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2; - c->extract_exponents = ff_ac3_extract_exponents_sse2; - if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) { - c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2; - c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2; - } - } - if (EXTERNAL_SSSE3(mm_flags)) { - c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3; - if (!(mm_flags & AV_CPU_FLAG_ATOM)) { - c->extract_exponents = ff_ac3_extract_exponents_ssse3; - } - } - -#if HAVE_SSE_INLINE && HAVE_7REGS - if (INLINE_SSE(mm_flags)) { - c->downmix = ac3_downmix_sse; - } -#endif -} diff --git a/ffmpeg1/libavcodec/x86/cabac.h b/ffmpeg1/libavcodec/x86/cabac.h deleted file mode 100644 index 2c9f77e..0000000 --- 
a/ffmpeg1/libavcodec/x86/cabac.h +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_CABAC_H -#define AVCODEC_X86_CABAC_H - -#include "libavcodec/cabac.h" -#include "libavutil/attributes.h" -#include "libavutil/x86/asm.h" -#include "libavutil/internal.h" -#include "config.h" - -#if HAVE_INLINE_ASM - -#ifdef BROKEN_RELOCATIONS -#define TABLES_ARG , "r"(tables) - -#if HAVE_FAST_CMOV -#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \ - "cmp "low" , "tmp" \n\t"\ - "cmova %%ecx , "range" \n\t"\ - "sbb %%rcx , %%rcx \n\t"\ - "and %%ecx , "tmp" \n\t"\ - "xor %%rcx , "retq" \n\t"\ - "sub "tmp" , "low" \n\t" -#else /* HAVE_FAST_CMOV */ -#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \ -/* P4 Prescott has crappy cmov,sbb,64bit shift so avoid them */ \ - "sub "low" , "tmp" \n\t"\ - "sar $31 , "tmp" \n\t"\ - "sub %%ecx , "range" \n\t"\ - "and "tmp" , "range" \n\t"\ - "add %%ecx , "range" \n\t"\ - "shl $17 , %%ecx \n\t"\ - "and "tmp" , %%ecx \n\t"\ - "sub %%ecx , "low" \n\t"\ - "xor "tmp" , "ret" \n\t"\ - "movslq "ret" , "retq" \n\t" -#endif /* HAVE_FAST_CMOV */ - -#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \ - "movzbl "statep" , "ret" \n\t"\ - "mov "range" , "tmp" \n\t"\ - "and $0xC0 , "range" \n\t"\ - "lea ("ret", "range", 2), %%ecx \n\t"\ - "movzbl "lps_off"("tables", %%rcx), "range" \n\t"\ - "sub "range" , "tmp" \n\t"\ - "mov "tmp" , %%ecx \n\t"\ - "shl $17 , "tmp" \n\t"\ - BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \ - "movzbl "norm_off"("tables", "rangeq"), %%ecx \n\t"\ - "shl %%cl , "range" \n\t"\ - "movzbl "mlps_off"+128("tables", "retq"), "tmp" \n\t"\ - "shl %%cl , "low" \n\t"\ - "mov "tmpbyte" , "statep" \n\t"\ - "test "lowword" , "lowword" \n\t"\ - "jnz 2f \n\t"\ - "mov "byte" , %%"REG_c" \n\t"\ - "add"OPSIZE" $2 , "byte" \n\t"\ - "movzwl (%%"REG_c") , "tmp" \n\t"\ - "lea -1("low") , %%ecx \n\t"\ - "xor "low" , %%ecx \n\t"\ - "shr $15 , %%ecx \n\t"\ - "bswap "tmp" \n\t"\ - "shr $15 , "tmp" \n\t"\ - "movzbl "norm_off"("tables", %%rcx), %%ecx \n\t"\ - "sub $0xFFFF , "tmp" \n\t"\ - "neg %%ecx \n\t"\ - "add $7 , %%ecx \n\t"\ - "shl %%cl , "tmp" \n\t"\ - "add "tmp" , "low" \n\t"\ - "2: \n\t" - -#else /* BROKEN_RELOCATIONS */ -#define TABLES_ARG -#define RIP_ARG - -#if HAVE_FAST_CMOV -#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\ - "mov "tmp" , %%ecx \n\t"\ - "shl $17 , "tmp" \n\t"\ - "cmp "low" , "tmp" \n\t"\ - "cmova %%ecx , "range" \n\t"\ - "sbb %%ecx , %%ecx \n\t"\ - "and %%ecx , "tmp" \n\t"\ - "xor %%ecx , "ret" \n\t"\ - "sub "tmp" , "low" \n\t" -#else /* HAVE_FAST_CMOV */ -#define 
BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\ - "mov "tmp" , %%ecx \n\t"\ - "shl $17 , "tmp" \n\t"\ - "sub "low" , "tmp" \n\t"\ - "sar $31 , "tmp" \n\t" /*lps_mask*/\ - "sub %%ecx , "range" \n\t" /*RangeLPS - range*/\ - "and "tmp" , "range" \n\t" /*(RangeLPS - range)&lps_mask*/\ - "add %%ecx , "range" \n\t" /*new range*/\ - "shl $17 , %%ecx \n\t"\ - "and "tmp" , %%ecx \n\t"\ - "sub %%ecx , "low" \n\t"\ - "xor "tmp" , "ret" \n\t" -#endif /* HAVE_FAST_CMOV */ - -#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \ - "movzbl "statep" , "ret" \n\t"\ - "mov "range" , "tmp" \n\t"\ - "and $0xC0 , "range" \n\t"\ - "movzbl "MANGLE(ff_h264_cabac_tables)"+"lps_off"("ret", "range", 2), "range" \n\t"\ - "sub "range" , "tmp" \n\t"\ - BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp) \ - "movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"("range"), %%ecx \n\t"\ - "shl %%cl , "range" \n\t"\ - "movzbl "MANGLE(ff_h264_cabac_tables)"+"mlps_off"+128("ret"), "tmp" \n\t"\ - "shl %%cl , "low" \n\t"\ - "mov "tmpbyte" , "statep" \n\t"\ - "test "lowword" , "lowword" \n\t"\ - " jnz 2f \n\t"\ - "mov "byte" , %%"REG_c" \n\t"\ - "add"OPSIZE" $2 , "byte" \n\t"\ - "movzwl (%%"REG_c") , "tmp" \n\t"\ - "lea -1("low") , %%ecx \n\t"\ - "xor "low" , %%ecx \n\t"\ - "shr $15 , %%ecx \n\t"\ - "bswap "tmp" \n\t"\ - "shr $15 , "tmp" \n\t"\ - "movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"(%%ecx), %%ecx \n\t"\ - "sub $0xFFFF , "tmp" \n\t"\ - "neg %%ecx \n\t"\ - "add $7 , %%ecx \n\t"\ - "shl %%cl , "tmp" \n\t"\ - "add "tmp" , "low" \n\t"\ - "2: \n\t" - -#endif /* BROKEN_RELOCATIONS */ - - -#if HAVE_7REGS && !(defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\ - && !( !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1) -#define get_cabac_inline get_cabac_inline_x86 -static av_always_inline int get_cabac_inline_x86(CABACContext *c, - uint8_t *const state) -{ - int bit, tmp; -#ifdef BROKEN_RELOCATIONS - void *tables; - - __asm__ volatile( - "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t" - : "=&r"(tables) - ); -#endif - - __asm__ volatile( - BRANCHLESS_GET_CABAC("%0", "%q0", "(%4)", "%1", "%w1", - "%2", "%q2", "%3", "%b3", - "%c6(%5)", "%c7(%5)", - AV_STRINGIFY(H264_NORM_SHIFT_OFFSET), - AV_STRINGIFY(H264_LPS_RANGE_OFFSET), - AV_STRINGIFY(H264_MLPS_STATE_OFFSET), - "%8") - : "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp) - : "r"(state), "r"(c), - "i"(offsetof(CABACContext, bytestream)), - "i"(offsetof(CABACContext, bytestream_end)) - TABLES_ARG - ,"1"(c->low), "2"(c->range) - : "%"REG_c, "memory" - ); - return bit & 1; -} -#endif /* HAVE_7REGS */ - -#define get_cabac_bypass_sign get_cabac_bypass_sign_x86 -static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val) -{ - x86_reg tmp; - __asm__ volatile( - "movl %c6(%2), %k1 \n\t" - "movl %c3(%2), %%eax \n\t" - "shl $17, %k1 \n\t" - "add %%eax, %%eax \n\t" - "sub %k1, %%eax \n\t" - "cltd \n\t" - "and %%edx, %k1 \n\t" - "add %k1, %%eax \n\t" - "xor %%edx, %%ecx \n\t" - "sub %%edx, %%ecx \n\t" - "test %%ax, %%ax \n\t" - "jnz 1f \n\t" - "mov %c4(%2), %1 \n\t" - "subl $0xFFFF, %%eax \n\t" - "movzwl (%1), %%edx \n\t" - "bswap %%edx \n\t" - "shrl $15, %%edx \n\t" - "add $2, %1 \n\t" - "addl %%edx, %%eax \n\t" - "mov %1, %c4(%2) \n\t" - "1: \n\t" - "movl %%eax, %c3(%2) \n\t" - - : "+c"(val), "=&r"(tmp) - : "r"(c), - "i"(offsetof(CABACContext, low)), - 
"i"(offsetof(CABACContext, bytestream)), - "i"(offsetof(CABACContext, bytestream_end)), - "i"(offsetof(CABACContext, range)) - : "%eax", "%edx", "memory" - ); - return val; -} - -#endif /* HAVE_INLINE_ASM */ -#endif /* AVCODEC_X86_CABAC_H */ diff --git a/ffmpeg1/libavcodec/x86/cavsdsp.c b/ffmpeg1/libavcodec/x86/cavsdsp.c deleted file mode 100644 index deeb5cf..0000000 --- a/ffmpeg1/libavcodec/x86/cavsdsp.c +++ /dev/null @@ -1,512 +0,0 @@ -/* - * Chinese AVS video (AVS1-P2, JiZhun profile) decoder. - * Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de> - * - * MMX-optimized DSP functions, based on H.264 optimizations by - * Michael Niedermayer and Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/common.h" -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/cavsdsp.h" -#include "dsputil_mmx.h" -#include "config.h" - -#if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) - -/* in/out: mma=mma+mmb, mmb=mmb-mma */ -#define SUMSUB_BA( a, b ) \ - "paddw "#b", "#a" \n\t"\ - "paddw "#b", "#b" \n\t"\ - "psubw "#a", "#b" \n\t" - -/***************************************************************************** - * - * inverse transform - * - ****************************************************************************/ - -static inline void cavs_idct8_1d(int16_t *block, uint64_t bias) -{ - __asm__ volatile( - "movq 112(%0), %%mm4 \n\t" /* mm4 = src7 */ - "movq 16(%0), %%mm5 \n\t" /* mm5 = src1 */ - "movq 80(%0), %%mm2 \n\t" /* mm2 = src5 */ - "movq 48(%0), %%mm7 \n\t" /* mm7 = src3 */ - "movq %%mm4, %%mm0 \n\t" - "movq %%mm5, %%mm3 \n\t" - "movq %%mm2, %%mm6 \n\t" - "movq %%mm7, %%mm1 \n\t" - - "paddw %%mm4, %%mm4 \n\t" /* mm4 = 2*src7 */ - "paddw %%mm3, %%mm3 \n\t" /* mm3 = 2*src1 */ - "paddw %%mm6, %%mm6 \n\t" /* mm6 = 2*src5 */ - "paddw %%mm1, %%mm1 \n\t" /* mm1 = 2*src3 */ - "paddw %%mm4, %%mm0 \n\t" /* mm0 = 3*src7 */ - "paddw %%mm3, %%mm5 \n\t" /* mm5 = 3*src1 */ - "paddw %%mm6, %%mm2 \n\t" /* mm2 = 3*src5 */ - "paddw %%mm1, %%mm7 \n\t" /* mm7 = 3*src3 */ - "psubw %%mm4, %%mm5 \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */ - "paddw %%mm6, %%mm7 \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */ - "psubw %%mm2, %%mm1 \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */ - "paddw %%mm0, %%mm3 \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */ - - "movq %%mm5, %%mm4 \n\t" - "movq %%mm7, %%mm6 \n\t" - "movq %%mm3, %%mm0 \n\t" - "movq %%mm1, %%mm2 \n\t" - SUMSUB_BA( %%mm7, %%mm5 ) /* mm7 = a0 + a1 mm5 = a0 - a1 */ - "paddw %%mm3, %%mm7 \n\t" /* mm7 = a0 + a1 + a3 */ - "paddw %%mm1, %%mm5 \n\t" /* mm5 = a0 - a1 + a2 */ - "paddw %%mm7, %%mm7 \n\t" - "paddw %%mm5, %%mm5 \n\t" - "paddw %%mm6, %%mm7 \n\t" /* mm7 = b4 */ - "paddw %%mm4, %%mm5 \n\t" /* mm5 = b5 */ - - SUMSUB_BA( %%mm1, %%mm3 ) /* mm1 = a3 + 
a2 mm3 = a3 - a2 */ - "psubw %%mm1, %%mm4 \n\t" /* mm4 = a0 - a2 - a3 */ - "movq %%mm4, %%mm1 \n\t" /* mm1 = a0 - a2 - a3 */ - "psubw %%mm6, %%mm3 \n\t" /* mm3 = a3 - a2 - a1 */ - "paddw %%mm1, %%mm1 \n\t" - "paddw %%mm3, %%mm3 \n\t" - "psubw %%mm2, %%mm1 \n\t" /* mm1 = b7 */ - "paddw %%mm0, %%mm3 \n\t" /* mm3 = b6 */ - - "movq 32(%0), %%mm2 \n\t" /* mm2 = src2 */ - "movq 96(%0), %%mm6 \n\t" /* mm6 = src6 */ - "movq %%mm2, %%mm4 \n\t" - "movq %%mm6, %%mm0 \n\t" - "psllw $2, %%mm4 \n\t" /* mm4 = 4*src2 */ - "psllw $2, %%mm6 \n\t" /* mm6 = 4*src6 */ - "paddw %%mm4, %%mm2 \n\t" /* mm2 = 5*src2 */ - "paddw %%mm6, %%mm0 \n\t" /* mm0 = 5*src6 */ - "paddw %%mm2, %%mm2 \n\t" - "paddw %%mm0, %%mm0 \n\t" - "psubw %%mm0, %%mm4 \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */ - "paddw %%mm2, %%mm6 \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */ - - "movq (%0), %%mm2 \n\t" /* mm2 = src0 */ - "movq 64(%0), %%mm0 \n\t" /* mm0 = src4 */ - SUMSUB_BA( %%mm0, %%mm2 ) /* mm0 = src0+src4 mm2 = src0-src4 */ - "psllw $3, %%mm0 \n\t" - "psllw $3, %%mm2 \n\t" - "paddw %1, %%mm0 \n\t" /* add rounding bias */ - "paddw %1, %%mm2 \n\t" /* add rounding bias */ - - SUMSUB_BA( %%mm6, %%mm0 ) /* mm6 = a4 + a6 mm0 = a4 - a6 */ - SUMSUB_BA( %%mm4, %%mm2 ) /* mm4 = a5 + a7 mm2 = a5 - a7 */ - SUMSUB_BA( %%mm7, %%mm6 ) /* mm7 = dst0 mm6 = dst7 */ - SUMSUB_BA( %%mm5, %%mm4 ) /* mm5 = dst1 mm4 = dst6 */ - SUMSUB_BA( %%mm3, %%mm2 ) /* mm3 = dst2 mm2 = dst5 */ - SUMSUB_BA( %%mm1, %%mm0 ) /* mm1 = dst3 mm0 = dst4 */ - :: "r"(block), "m"(bias) - ); -} - -static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) -{ - int i; - DECLARE_ALIGNED(8, int16_t, b2)[64]; - - for(i=0; i<2; i++){ - DECLARE_ALIGNED(8, uint64_t, tmp); - - cavs_idct8_1d(block+4*i, ff_pw_4.a); - - __asm__ volatile( - "psraw $3, %%mm7 \n\t" - "psraw $3, %%mm6 \n\t" - "psraw $3, %%mm5 \n\t" - "psraw $3, %%mm4 \n\t" - "psraw $3, %%mm3 \n\t" - "psraw $3, %%mm2 \n\t" - "psraw $3, %%mm1 \n\t" - "psraw $3, %%mm0 \n\t" - "movq %%mm7, %0 \n\t" - TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) - "movq %%mm0, 8(%1) \n\t" - "movq %%mm6, 24(%1) \n\t" - "movq %%mm7, 40(%1) \n\t" - "movq %%mm4, 56(%1) \n\t" - "movq %0, %%mm7 \n\t" - TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 ) - "movq %%mm7, (%1) \n\t" - "movq %%mm1, 16(%1) \n\t" - "movq %%mm0, 32(%1) \n\t" - "movq %%mm3, 48(%1) \n\t" - : "=m"(tmp) - : "r"(b2+32*i) - : "memory" - ); - } - - for(i=0; i<2; i++){ - cavs_idct8_1d(b2+4*i, ff_pw_64.a); - - __asm__ volatile( - "psraw $7, %%mm7 \n\t" - "psraw $7, %%mm6 \n\t" - "psraw $7, %%mm5 \n\t" - "psraw $7, %%mm4 \n\t" - "psraw $7, %%mm3 \n\t" - "psraw $7, %%mm2 \n\t" - "psraw $7, %%mm1 \n\t" - "psraw $7, %%mm0 \n\t" - "movq %%mm7, (%0) \n\t" - "movq %%mm5, 16(%0) \n\t" - "movq %%mm3, 32(%0) \n\t" - "movq %%mm1, 48(%0) \n\t" - "movq %%mm0, 64(%0) \n\t" - "movq %%mm2, 80(%0) \n\t" - "movq %%mm4, 96(%0) \n\t" - "movq %%mm6, 112(%0) \n\t" - :: "r"(b2+4*i) - : "memory" - ); - } - - ff_add_pixels_clamped_mmx(b2, dst, stride); -} - -/***************************************************************************** - * - * motion compensation - * - ****************************************************************************/ - -/* vertical filter [-1 -2 96 42 -7 0] */ -#define QPEL_CAVSV1(A,B,C,D,E,F,OP,MUL2) \ - "movd (%0), "#F" \n\t"\ - "movq "#C", %%mm6 \n\t"\ - "pmullw %5, %%mm6 \n\t"\ - "movq "#D", %%mm7 \n\t"\ - "pmullw "MANGLE(MUL2)", %%mm7\n\t"\ - "psllw $3, "#E" \n\t"\ - "psubw "#E", %%mm6 \n\t"\ - "psraw $3, "#E" \n\t"\ - "paddw %%mm7, %%mm6 \n\t"\ - "paddw "#E", %%mm6 \n\t"\ - 
"paddw "#B", "#B" \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, "#F" \n\t"\ - "psubw "#B", %%mm6 \n\t"\ - "psraw $1, "#B" \n\t"\ - "psubw "#A", %%mm6 \n\t"\ - "paddw %4, %%mm6 \n\t"\ - "psraw $7, %%mm6 \n\t"\ - "packuswb %%mm6, %%mm6 \n\t"\ - OP(%%mm6, (%1), A, d) \ - "add %3, %1 \n\t" - -/* vertical filter [ 0 -1 5 5 -1 0] */ -#define QPEL_CAVSV2(A,B,C,D,E,F,OP,MUL2) \ - "movd (%0), "#F" \n\t"\ - "movq "#C", %%mm6 \n\t"\ - "paddw "#D", %%mm6 \n\t"\ - "pmullw %5, %%mm6 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, "#F" \n\t"\ - "psubw "#B", %%mm6 \n\t"\ - "psubw "#E", %%mm6 \n\t"\ - "paddw %4, %%mm6 \n\t"\ - "psraw $3, %%mm6 \n\t"\ - "packuswb %%mm6, %%mm6 \n\t"\ - OP(%%mm6, (%1), A, d) \ - "add %3, %1 \n\t" - -/* vertical filter [ 0 -7 42 96 -2 -1] */ -#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \ - "movd (%0), "#F" \n\t"\ - "movq "#C", %%mm6 \n\t"\ - "pmullw "MANGLE(MUL2)", %%mm6\n\t"\ - "movq "#D", %%mm7 \n\t"\ - "pmullw %5, %%mm7 \n\t"\ - "psllw $3, "#B" \n\t"\ - "psubw "#B", %%mm6 \n\t"\ - "psraw $3, "#B" \n\t"\ - "paddw %%mm7, %%mm6 \n\t"\ - "paddw "#B", %%mm6 \n\t"\ - "paddw "#E", "#E" \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, "#F" \n\t"\ - "psubw "#E", %%mm6 \n\t"\ - "psraw $1, "#E" \n\t"\ - "psubw "#F", %%mm6 \n\t"\ - "paddw %4, %%mm6 \n\t"\ - "psraw $7, %%mm6 \n\t"\ - "packuswb %%mm6, %%mm6 \n\t"\ - OP(%%mm6, (%1), A, d) \ - "add %3, %1 \n\t" - - -#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\ - int w= 2;\ - src -= 2*srcStride;\ - \ - while(w--){\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\ - VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\ - VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\ - VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\ - VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\ - VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\ - VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\ - VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\ - : "memory"\ - );\ - if(h==16){\ - __asm__ volatile(\ - VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\ - VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\ - VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\ - VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\ - VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\ - VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\ - VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\ - VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\ - : "memory"\ - );\ - }\ - src += 4-(h+5)*srcStride;\ - dst += 4-h*dstStride;\ - } - -#define QPEL_CAVS(OPNAME, OP, MMX)\ -static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq %5, %%mm6 \n\t"\ - "1: \n\t"\ - "movq (%0), %%mm0 
\n\t"\ - "movq 1(%0), %%mm2 \n\t"\ - "movq %%mm0, %%mm1 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpckhbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "pmullw %%mm6, %%mm0 \n\t"\ - "pmullw %%mm6, %%mm1 \n\t"\ - "movq -1(%0), %%mm2 \n\t"\ - "movq 2(%0), %%mm4 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "movq %%mm4, %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - "punpckhbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm4, %%mm2 \n\t"\ - "paddw %%mm3, %%mm5 \n\t"\ - "psubw %%mm2, %%mm0 \n\t"\ - "psubw %%mm5, %%mm1 \n\t"\ - "movq %6, %%mm5 \n\t"\ - "paddw %%mm5, %%mm0 \n\t"\ - "paddw %%mm5, %%mm1 \n\t"\ - "psraw $3, %%mm0 \n\t"\ - "psraw $3, %%mm1 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm5, q) \ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+m"(h)\ - : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\ - : "memory"\ - );\ -}\ -\ -static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \ -}\ -\ -static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5) \ -}\ -\ -static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \ -}\ -\ -static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -}\ -\ -static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -}\ -\ -static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -}\ -\ -static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\ - src += 8*srcStride;\ - dst += 8*dstStride;\ - OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\ -}\ - -#define CAVS_MC(OPNAME, SIZE, MMX) \ -static void ff_ 
## OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\ -}\ - -#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" -#define AVG_3DNOW_OP(a,b,temp, size) \ -"mov" #size " " #b ", " #temp " \n\t"\ -"pavgusb " #temp ", " #a " \n\t"\ -"mov" #size " " #a ", " #b " \n\t" -#define AVG_MMXEXT_OP(a, b, temp, size) \ -"mov" #size " " #b ", " #temp " \n\t"\ -"pavgb " #temp ", " #a " \n\t"\ -"mov" #size " " #a ", " #b " \n\t" - -#endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */ - -#if HAVE_MMXEXT_INLINE -QPEL_CAVS(put_, PUT_OP, mmxext) -QPEL_CAVS(avg_, AVG_MMXEXT_OP, mmxext) - -CAVS_MC(put_, 8, mmxext) -CAVS_MC(put_, 16, mmxext) -CAVS_MC(avg_, 8, mmxext) -CAVS_MC(avg_, 16, mmxext) - -static av_cold void ff_cavsdsp_init_mmxext(CAVSDSPContext *c, - AVCodecContext *avctx) -{ -#define dspfunc(PFX, IDX, NUM) \ - c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmxext; \ - c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_mmxext; \ - c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_mmxext; \ - c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_mmxext; \ - c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_mmxext; \ - - dspfunc(put_cavs_qpel, 0, 16); - dspfunc(put_cavs_qpel, 1, 8); - dspfunc(avg_cavs_qpel, 0, 16); - dspfunc(avg_cavs_qpel, 1, 8); -#undef dspfunc - c->cavs_idct8_add = cavs_idct8_add_mmx; - c->idct_perm = FF_TRANSPOSE_IDCT_PERM; -} -#endif /* HAVE_MMXEXT_INLINE */ - -#if HAVE_AMD3DNOW_INLINE -QPEL_CAVS(put_, PUT_OP, 3dnow) -QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow) - -CAVS_MC(put_, 8, 3dnow) -CAVS_MC(put_, 16,3dnow) -CAVS_MC(avg_, 8, 3dnow) -CAVS_MC(avg_, 16,3dnow) - -static av_cold void ff_cavsdsp_init_3dnow(CAVSDSPContext *c, - AVCodecContext *avctx) -{ -#define dspfunc(PFX, IDX, NUM) \ - c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmxext; \ - c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \ - c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \ - c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_3dnow; \ - c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_3dnow; \ - - dspfunc(put_cavs_qpel, 0, 16); - dspfunc(put_cavs_qpel, 1, 8); - dspfunc(avg_cavs_qpel, 0, 16); - dspfunc(avg_cavs_qpel, 1, 8); -#undef dspfunc - c->cavs_idct8_add = cavs_idct8_add_mmx; - c->idct_perm = FF_TRANSPOSE_IDCT_PERM; -} -#endif /* HAVE_AMD3DNOW_INLINE */ - -av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx) -{ - int mm_flags = av_get_cpu_flags(); - -#if HAVE_MMXEXT_INLINE - if (mm_flags & AV_CPU_FLAG_MMXEXT) ff_cavsdsp_init_mmxext(c, avctx); -#endif /* HAVE_MMXEXT_INLINE */ -#if HAVE_AMD3DNOW_INLINE - if (mm_flags & AV_CPU_FLAG_3DNOW) ff_cavsdsp_init_3dnow(c, avctx); -#endif /* HAVE_AMD3DNOW_INLINE */ -} diff --git a/ffmpeg1/libavcodec/x86/constants.c 
b/ffmpeg1/libavcodec/x86/constants.c deleted file mode 100644 index 821d73f..0000000 --- a/ffmpeg1/libavcodec/x86/constants.c +++ /dev/null @@ -1,39 +0,0 @@ -/* - * MMX/SSE constants used across x86 dsp optimizations. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/mem.h" -#include "libavutil/x86/asm.h" // for xmm_reg - -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL }; - -DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL }; diff --git a/ffmpeg1/libavcodec/x86/dct32.asm b/ffmpeg1/libavcodec/x86/dct32.asm deleted file mode 100644 index 6fd5ba3..0000000 --- a/ffmpeg1/libavcodec/x86/dct32.asm +++ /dev/null @@ -1,490 +0,0 @@ -;****************************************************************************** -;* 32 point SSE-optimized DCT transform -;* Copyright (c) 2010 Vitor Sessak -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA 32 - -align 32 -ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043 - dd 0.553104, 0.582935, 0.622504, 0.674808 - dd -10.190008, -3.407609, -2.057781, -1.484165 - dd -1.169440, -0.972568, -0.839350, -0.744536 - dd 0.502419, 0.522499, 0.566944, 0.646822 - dd 0.788155, 1.060678, 1.722447, 5.101149 - dd 0.509796, 0.601345, 0.899976, 2.562916 - dd 0.509796, 0.601345, 0.899976, 2.562916 - dd 1.000000, 1.000000, 1.306563, 0.541196 - dd 1.000000, 1.000000, 1.306563, 0.541196 - dd 1.000000, 0.707107, 1.000000, -0.707107 - dd 1.000000, 0.707107, 1.000000, -0.707107 - dd 0.707107, 0.707107, 0.707107, 0.707107 - -align 32 -ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000 - -%macro BUTTERFLY 4 - subps %4, %1, %2 - addps %2, %2, %1 - mulps %1, %4, %3 -%endmacro - -%macro BUTTERFLY0 5 -%if cpuflag(sse2) && notcpuflag(avx) - pshufd %4, %1, %5 - xorps %1, %2 - addps %1, %4 - mulps %1, %3 -%else - shufps %4, %1, %1, %5 - xorps %1, %1, %2 - addps %4, %4, %1 - mulps %1, %4, %3 -%endif -%endmacro - -%macro BUTTERFLY2 4 - BUTTERFLY0 %1, %2, %3, %4, 0x1b -%endmacro - -%macro BUTTERFLY3 4 - BUTTERFLY0 %1, %2, %3, %4, 0xb1 -%endmacro - -%macro BUTTERFLY3V 5 - movaps m%5, m%1 - addps m%1, m%2 - subps m%5, m%2 - SWAP %2, %5 - mulps m%2, [ps_cos_vec+192] - movaps m%5, m%3 - addps m%3, m%4 - subps m%4, m%5 - mulps m%4, [ps_cos_vec+192] -%endmacro - -%macro PASS6_AND_PERMUTE 0 - mov tmpd, [outq+4] - movss m7, [outq+72] - addss m7, [outq+76] - movss m3, [outq+56] - addss m3, [outq+60] - addss m4, m3 - movss m2, [outq+52] - addss m2, m3 - movss m3, [outq+104] - addss m3, [outq+108] - addss m1, m3 - addss m5, m4 - movss [outq+ 16], m1 - movss m1, [outq+100] - addss m1, m3 - movss m3, [outq+40] - movss [outq+ 48], m1 - addss m3, [outq+44] - movss m1, [outq+100] - addss m4, m3 - addss m3, m2 - addss m1, [outq+108] - movss [outq+ 40], m3 - addss m2, [outq+36] - movss m3, [outq+8] - movss [outq+ 56], m2 - addss m3, [outq+12] - movss [outq+ 32], m3 - movss m3, [outq+80] - movss [outq+ 8], m5 - movss [outq+ 80], m1 - movss m2, [outq+52] - movss m5, [outq+120] - addss m5, [outq+124] - movss m1, [outq+64] - addss m2, [outq+60] - addss m0, m5 - addss m5, [outq+116] - mov [outq+64], tmpd - addss m6, m0 - addss m1, m6 - mov tmpd, [outq+12] - mov [outq+ 96], tmpd - movss [outq+ 4], m1 - movss m1, [outq+24] - movss [outq+ 24], m4 - movss m4, [outq+88] - addss m4, [outq+92] - addss m3, m4 - addss m4, [outq+84] - mov tmpd, [outq+108] - addss m1, [outq+28] - addss m0, m1 - addss m1, m5 - addss m6, m3 - addss m3, m0 - addss m0, m7 - addss m5, [outq+20] - addss m7, m1 - movss [outq+ 12], m6 - mov [outq+112], tmpd - movss m6, [outq+28] - movss [outq+ 28], m0 - movss m0, [outq+36] - movss [outq+ 36], m7 - addss m1, m4 - movss m7, [outq+116] - addss m0, m2 - addss m7, [outq+124] - movss [outq+ 72], m0 - movss m0, [outq+44] - addss m2, m0 - movss [outq+ 44], m1 - movss [outq+ 88], m2 - addss m0, [outq+60] - mov tmpd, [outq+60] - mov [outq+120], tmpd - movss [outq+104], m0 - addss m4, m5 - addss m5, [outq+68] - movss [outq+52], m4 - movss [outq+60], m5 - movss m4, [outq+68] - movss m5, [outq+20] - movss [outq+ 20], m3 - addss m5, m7 - addss m7, m6 - addss m4, m5 - 
movss m2, [outq+84] - addss m2, [outq+92] - addss m5, m2 - movss [outq+ 68], m4 - addss m2, m7 - movss m4, [outq+76] - movss [outq+ 84], m2 - movss [outq+ 76], m5 - addss m7, m4 - addss m6, [outq+124] - addss m4, m6 - addss m6, [outq+92] - movss [outq+100], m4 - movss [outq+108], m6 - movss m6, [outq+92] - movss [outq+92], m7 - addss m6, [outq+124] - movss [outq+116], m6 -%endmacro - -INIT_YMM avx -SECTION_TEXT -%if HAVE_AVX_EXTERNAL -; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in) -cglobal dct32_float, 2,3,8, out, in, tmp - ; pass 1 - vmovaps m4, [inq+0] - vinsertf128 m5, m5, [inq+96], 1 - vinsertf128 m5, m5, [inq+112], 0 - vshufps m5, m5, m5, 0x1b - BUTTERFLY m4, m5, [ps_cos_vec], m6 - - vmovaps m2, [inq+64] - vinsertf128 m6, m6, [inq+32], 1 - vinsertf128 m6, m6, [inq+48], 0 - vshufps m6, m6, m6, 0x1b - BUTTERFLY m2, m6, [ps_cos_vec+32], m0 - - ; pass 2 - - BUTTERFLY m5, m6, [ps_cos_vec+64], m0 - BUTTERFLY m4, m2, [ps_cos_vec+64], m7 - - - ; pass 3 - vperm2f128 m3, m6, m4, 0x31 - vperm2f128 m1, m6, m4, 0x20 - vshufps m3, m3, m3, 0x1b - - BUTTERFLY m1, m3, [ps_cos_vec+96], m6 - - - vperm2f128 m4, m5, m2, 0x20 - vperm2f128 m5, m5, m2, 0x31 - vshufps m5, m5, m5, 0x1b - - BUTTERFLY m4, m5, [ps_cos_vec+96], m6 - - ; pass 4 - vmovaps m6, [ps_p1p1m1m1+0] - vmovaps m2, [ps_cos_vec+128] - - BUTTERFLY2 m5, m6, m2, m7 - BUTTERFLY2 m4, m6, m2, m7 - BUTTERFLY2 m1, m6, m2, m7 - BUTTERFLY2 m3, m6, m2, m7 - - - ; pass 5 - vshufps m6, m6, m6, 0xcc - vmovaps m2, [ps_cos_vec+160] - - BUTTERFLY3 m5, m6, m2, m7 - BUTTERFLY3 m4, m6, m2, m7 - BUTTERFLY3 m1, m6, m2, m7 - BUTTERFLY3 m3, m6, m2, m7 - - vperm2f128 m6, m3, m3, 0x31 - vmovaps [outq], m3 - - vextractf128 [outq+64], m5, 1 - vextractf128 [outq+32], m5, 0 - - vextractf128 [outq+80], m4, 1 - vextractf128 [outq+48], m4, 0 - - vperm2f128 m0, m1, m1, 0x31 - vmovaps [outq+96], m1 - - vzeroupper - - ; pass 6, no SIMD... 
-INIT_XMM - PASS6_AND_PERMUTE - RET -%endif - -%if ARCH_X86_64 -%define SPILL SWAP -%define UNSPILL SWAP - -%macro PASS5 0 - nop ; FIXME code alignment - SWAP 5, 8 - SWAP 4, 12 - SWAP 6, 14 - SWAP 7, 13 - SWAP 0, 15 - PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13 - TRANSPOSE4x4PS 8, 9, 10, 11, 0 - BUTTERFLY3V 8, 9, 10, 11, 0 - addps m10, m11 - TRANSPOSE4x4PS 12, 13, 14, 15, 0 - BUTTERFLY3V 12, 13, 14, 15, 0 - addps m14, m15 - addps m12, m14 - addps m14, m13 - addps m13, m15 -%endmacro - -%macro PASS6 0 - SWAP 9, 12 - SWAP 11, 14 - movss [outq+0x00], m8 - pshuflw m0, m8, 0xe - movss [outq+0x10], m9 - pshuflw m1, m9, 0xe - movss [outq+0x20], m10 - pshuflw m2, m10, 0xe - movss [outq+0x30], m11 - pshuflw m3, m11, 0xe - movss [outq+0x40], m12 - pshuflw m4, m12, 0xe - movss [outq+0x50], m13 - pshuflw m5, m13, 0xe - movss [outq+0x60], m14 - pshuflw m6, m14, 0xe - movaps [outq+0x70], m15 - pshuflw m7, m15, 0xe - addss m0, m1 - addss m1, m2 - movss [outq+0x08], m0 - addss m2, m3 - movss [outq+0x18], m1 - addss m3, m4 - movss [outq+0x28], m2 - addss m4, m5 - movss [outq+0x38], m3 - addss m5, m6 - movss [outq+0x48], m4 - addss m6, m7 - movss [outq+0x58], m5 - movss [outq+0x68], m6 - movss [outq+0x78], m7 - - PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7 - movhlps m0, m1 - pshufd m1, m1, 3 - SWAP 0, 2, 4, 6, 8, 10, 12, 14 - SWAP 1, 3, 5, 7, 9, 11, 13, 15 -%rep 7 - movhlps m0, m1 - pshufd m1, m1, 3 - addss m15, m1 - SWAP 0, 2, 4, 6, 8, 10, 12, 14 - SWAP 1, 3, 5, 7, 9, 11, 13, 15 -%endrep -%assign i 4 -%rep 15 - addss m0, m1 - movss [outq+i], m0 - SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - %assign i i+8 -%endrep -%endmacro - -%else ; ARCH_X86_32 -%macro SPILL 2 ; xmm#, mempos - movaps [outq+(%2-8)*16], m%1 -%endmacro -%macro UNSPILL 2 - movaps m%1, [outq+(%2-8)*16] -%endmacro - -%define PASS6 PASS6_AND_PERMUTE -%macro PASS5 0 - movaps m2, [ps_cos_vec+160] - shufps m3, m3, 0xcc - - BUTTERFLY3 m5, m3, m2, m1 - SPILL 5, 8 - - UNSPILL 1, 9 - BUTTERFLY3 m1, m3, m2, m5 - SPILL 1, 14 - - BUTTERFLY3 m4, m3, m2, m5 - SPILL 4, 12 - - BUTTERFLY3 m7, m3, m2, m5 - SPILL 7, 13 - - UNSPILL 5, 10 - BUTTERFLY3 m5, m3, m2, m7 - SPILL 5, 10 - - UNSPILL 4, 11 - BUTTERFLY3 m4, m3, m2, m7 - SPILL 4, 11 - - BUTTERFLY3 m6, m3, m2, m7 - SPILL 6, 9 - - BUTTERFLY3 m0, m3, m2, m7 - SPILL 0, 15 -%endmacro -%endif - - -; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in) -%macro DCT32_FUNC 0 -cglobal dct32_float, 2, 3, 16, out, in, tmp - ; pass 1 - - movaps m0, [inq+0] - LOAD_INV m1, [inq+112] - BUTTERFLY m0, m1, [ps_cos_vec], m3 - - movaps m7, [inq+64] - LOAD_INV m4, [inq+48] - BUTTERFLY m7, m4, [ps_cos_vec+32], m3 - - ; pass 2 - movaps m2, [ps_cos_vec+64] - BUTTERFLY m1, m4, m2, m3 - SPILL 1, 11 - SPILL 4, 8 - - ; pass 1 - movaps m1, [inq+16] - LOAD_INV m6, [inq+96] - BUTTERFLY m1, m6, [ps_cos_vec+16], m3 - - movaps m4, [inq+80] - LOAD_INV m5, [inq+32] - BUTTERFLY m4, m5, [ps_cos_vec+48], m3 - - ; pass 2 - BUTTERFLY m0, m7, m2, m3 - - movaps m2, [ps_cos_vec+80] - BUTTERFLY m6, m5, m2, m3 - - BUTTERFLY m1, m4, m2, m3 - - ; pass 3 - movaps m2, [ps_cos_vec+96] - shufps m1, m1, 0x1b - BUTTERFLY m0, m1, m2, m3 - SPILL 0, 15 - SPILL 1, 14 - - UNSPILL 0, 8 - shufps m5, m5, 0x1b - BUTTERFLY m0, m5, m2, m3 - - UNSPILL 1, 11 - shufps m6, m6, 0x1b - BUTTERFLY m1, m6, m2, m3 - SPILL 1, 11 - - shufps m4, m4, 0x1b - BUTTERFLY m7, m4, m2, m3 - - ; pass 4 - movaps m3, [ps_p1p1m1m1+0] - movaps m2, [ps_cos_vec+128] - - BUTTERFLY2 m5, m3, m2, m1 - - BUTTERFLY2 m0, m3, m2, m1 - SPILL 0, 9 - - 
BUTTERFLY2 m6, m3, m2, m1 - SPILL 6, 10 - - UNSPILL 0, 11 - BUTTERFLY2 m0, m3, m2, m1 - SPILL 0, 11 - - BUTTERFLY2 m4, m3, m2, m1 - - BUTTERFLY2 m7, m3, m2, m1 - - UNSPILL 6, 14 - BUTTERFLY2 m6, m3, m2, m1 - - UNSPILL 0, 15 - BUTTERFLY2 m0, m3, m2, m1 - - PASS5 - PASS6 - RET -%endmacro - -%macro LOAD_INV 2 -%if cpuflag(sse2) - pshufd %1, %2, 0x1b -%elif cpuflag(sse) - movaps %1, %2 - shufps %1, %1, 0x1b -%endif -%endmacro - -INIT_XMM sse -DCT32_FUNC -INIT_XMM sse2 -DCT32_FUNC diff --git a/ffmpeg1/libavcodec/x86/deinterlace.asm b/ffmpeg1/libavcodec/x86/deinterlace.asm deleted file mode 100644 index 3812dbe..0000000 --- a/ffmpeg1/libavcodec/x86/deinterlace.asm +++ /dev/null @@ -1,82 +0,0 @@ -;****************************************************************************** -;* MMX optimized deinterlacing functions -;* Copyright (c) 2010 Vitor Sessak -;* Copyright (c) 2002 Michael Niedermayer -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -cextern pw_4 - -SECTION .text - -%macro DEINTERLACE 1 -%ifidn %1, inplace -;void ff_deinterlace_line_inplace_mmx(const uint8_t *lum_m4, const uint8_t *lum_m3, const uint8_t *lum_m2, const uint8_t *lum_m1, const uint8_t *lum, int size) -cglobal deinterlace_line_inplace_mmx, 6,6,7, lum_m4, lum_m3, lum_m2, lum_m1, lum, size -%else -;void ff_deinterlace_line_mmx(uint8_t *dst, const uint8_t *lum_m4, const uint8_t *lum_m3, const uint8_t *lum_m2, const uint8_t *lum_m1, const uint8_t *lum, int size) -cglobal deinterlace_line_mmx, 7,7,7, dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size -%endif - pxor mm7, mm7 - movq mm6, [pw_4] -.nextrow: - movd mm0, [lum_m4q] - movd mm1, [lum_m3q] - movd mm2, [lum_m2q] -%ifidn %1, inplace - movd [lum_m4q], mm2 -%endif - movd mm3, [lum_m1q] - movd mm4, [lumq] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpcklbw mm2, mm7 - punpcklbw mm3, mm7 - punpcklbw mm4, mm7 - paddw mm1, mm3 - psllw mm2, 1 - paddw mm0, mm4 - psllw mm1, 2 - paddw mm2, mm6 - paddw mm1, mm2 - psubusw mm1, mm0 - psrlw mm1, 3 - packuswb mm1, mm7 -%ifidn %1, inplace - movd [lum_m2q], mm1 -%else - movd [dstq], mm1 - add dstq, 4 -%endif - add lum_m4q, 4 - add lum_m3q, 4 - add lum_m2q, 4 - add lum_m1q, 4 - add lumq, 4 - sub sized, 4 - jg .nextrow - REP_RET -%endmacro - -DEINTERLACE "" - -DEINTERLACE inplace diff --git a/ffmpeg1/libavcodec/x86/dirac_dwt.c b/ffmpeg1/libavcodec/x86/dirac_dwt.c deleted file mode 100644 index fbb25a4..0000000 --- a/ffmpeg1/libavcodec/x86/dirac_dwt.c +++ /dev/null @@ -1,202 +0,0 @@ -/* - * MMX optimized discrete wavelet transform - * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> - * Copyright (c) 2010 David Conrad - * - * This file is part of FFmpeg. 
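(Before the Dirac DWT code continues: the MMX deinterlacing loop above evaluates a 5-tap vertical filter four pixels at a time. A scalar sketch of what one output pixel works out to, read off the asm rather than quoted from FFmpeg's C reference:

    #include <stdint.h>

    /* out = clip_uint8((4*l_m3 + 2*l_m2 + 4*l_m1 + 4 - (l_m4 + l)) >> 3);
     * psubusw saturates at zero before the shift, hence the early-out below. */
    static uint8_t deinterlace_pixel(uint8_t lm4, uint8_t lm3, uint8_t lm2,
                                     uint8_t lm1, uint8_t lum)
    {
        int acc = 4 * lm3 + 2 * lm2 + 4 * lm1 + 4;
        int sub = lm4 + lum;
        int v   = acc > sub ? (acc - sub) >> 3 : 0;   /* unsigned saturating subtract */
        return v > 255 ? 255 : (uint8_t)v;            /* packuswb clamp */
    }
)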
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/x86/asm.h" -#include "dsputil_mmx.h" -#include "dirac_dwt.h" - -#define COMPOSE_VERTICAL(ext, align) \ -void ff_vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \ -void ff_vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \ -void ff_vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \ -void ff_vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \ -void ff_vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width); \ -void ff_horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\ -void ff_horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\ -\ -static void vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \ -{ \ - int i, width_align = width&~(align-1); \ -\ - for(i=width_align; i<width; i++) \ - b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \ -\ - ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \ -} \ -\ -static void vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \ -{ \ - int i, width_align = width&~(align-1); \ -\ - for(i=width_align; i<width; i++) \ - b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \ -\ - ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \ -} \ -\ -static void vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \ - IDWTELEM *b3, IDWTELEM *b4, int width) \ -{ \ - int i, width_align = width&~(align-1); \ -\ - for(i=width_align; i<width; i++) \ - b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \ -\ - ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \ -} \ -\ -static void vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \ - IDWTELEM *b3, IDWTELEM *b4, int width) \ -{ \ - int i, width_align = width&~(align-1); \ -\ - for(i=width_align; i<width; i++) \ - b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \ -\ - ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \ -} \ -static void vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width) \ -{ \ - int i, width_align = width&~(align-1); \ -\ - for(i=width_align; i<width; i++) { \ - b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \ - b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \ - } \ -\ - ff_vertical_compose_haar##ext(b0, b1, width_align); \ -} \ -static void horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\ -{\ - int w2= w>>1;\ - int x= w2 - (w2&(align-1));\ - ff_horizontal_compose_haar0i##ext(b, tmp, w);\ -\ - for (; x < w2; x++) {\ - b[2*x ] = tmp[x];\ - b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\ - }\ -}\ -static void 
horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\ -{\ - int w2= w>>1;\ - int x= w2 - (w2&(align-1));\ - ff_horizontal_compose_haar1i##ext(b, tmp, w);\ -\ - for (; x < w2; x++) {\ - b[2*x ] = (tmp[x] + 1)>>1;\ - b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\ - }\ -}\ -\ - -#if HAVE_YASM -#if !ARCH_X86_64 -COMPOSE_VERTICAL(_mmx, 4) -#endif -COMPOSE_VERTICAL(_sse2, 8) - - -void ff_horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w); - -static void horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w) -{ - int w2= w>>1; - int x= w2 - (w2&7); - ff_horizontal_compose_dd97i_ssse3(b, tmp, w); - - for (; x < w2; x++) { - b[2*x ] = (tmp[x] + 1)>>1; - b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1; - } -} -#endif - -void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type) -{ -#if HAVE_YASM - int mm_flags = av_get_cpu_flags(); - -#if !ARCH_X86_64 - if (!(mm_flags & AV_CPU_FLAG_MMX)) - return; - - switch (type) { - case DWT_DIRAC_DD9_7: - d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx; - d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx; - break; - case DWT_DIRAC_LEGALL5_3: - d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx; - d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx; - break; - case DWT_DIRAC_DD13_7: - d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx; - d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx; - break; - case DWT_DIRAC_HAAR0: - d->vertical_compose = (void*)vertical_compose_haar_mmx; - d->horizontal_compose = horizontal_compose_haar0i_mmx; - break; - case DWT_DIRAC_HAAR1: - d->vertical_compose = (void*)vertical_compose_haar_mmx; - d->horizontal_compose = horizontal_compose_haar1i_mmx; - break; - } -#endif - - if (!(mm_flags & AV_CPU_FLAG_SSE2)) - return; - - switch (type) { - case DWT_DIRAC_DD9_7: - d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2; - d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2; - break; - case DWT_DIRAC_LEGALL5_3: - d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2; - d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2; - break; - case DWT_DIRAC_DD13_7: - d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2; - d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2; - break; - case DWT_DIRAC_HAAR0: - d->vertical_compose = (void*)vertical_compose_haar_sse2; - d->horizontal_compose = horizontal_compose_haar0i_sse2; - break; - case DWT_DIRAC_HAAR1: - d->vertical_compose = (void*)vertical_compose_haar_sse2; - d->horizontal_compose = horizontal_compose_haar1i_sse2; - break; - } - - if (!(mm_flags & AV_CPU_FLAG_SSSE3)) - return; - - switch (type) { - case DWT_DIRAC_DD9_7: - d->horizontal_compose = horizontal_compose_dd97i_ssse3; - break; - } -#endif // HAVE_YASM -} diff --git a/ffmpeg1/libavcodec/x86/dirac_dwt.h b/ffmpeg1/libavcodec/x86/dirac_dwt.h deleted file mode 100644 index 126b290..0000000 --- a/ffmpeg1/libavcodec/x86/dirac_dwt.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
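(The COMPOSE_VERTICAL wrappers above all follow the same split: the yasm kernel handles the largest multiple of the vector width and plain C finishes the remaining 1..align-1 columns. A minimal self-contained sketch of that pattern for the 5/3 low-pass case; simd_compose53iL0 is a stand-in for the ff_vertical_compose53iL0* kernels, and IDWTELEM is taken to be int16_t here:

    #include <stdint.h>

    typedef int16_t IDWTELEM;

    /* stand-in for the yasm kernel; the real one is only ever handed a width
     * that is already a multiple of the vector width */
    static void simd_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width)
    {
        for (int i = 0; i < width; i++)
            b1[i] = b1[i] - ((b0[i] + b2[i] + 2) >> 2);
    }

    static void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
                                      int width, int align)
    {
        int width_align = width & ~(align - 1);

        /* scalar tail first, exactly as in the macro above */
        for (int i = width_align; i < width; i++)
            b1[i] = b1[i] - ((b0[i] + b2[i] + 2) >> 2);   /* assumed COMPOSE_53iL0 */

        simd_compose53iL0(b0, b1, b2, width_align);       /* aligned body */
    }
)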
See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_DIRAC_DWT_H -#define AVCODEC_X86_DIRAC_DWT_H - -#include "libavcodec/dirac_dwt.h" - -void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x); -void ff_horizontal_compose_haar1i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x); -void ff_horizontal_compose_haar0i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x); - -void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type); - -#endif diff --git a/ffmpeg1/libavcodec/x86/diracdsp_mmx.c b/ffmpeg1/libavcodec/x86/diracdsp_mmx.c deleted file mode 100644 index cb6465f..0000000 --- a/ffmpeg1/libavcodec/x86/diracdsp_mmx.c +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (C) 2010 David Conrad - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "dsputil_mmx.h" -#include "diracdsp_mmx.h" - -void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); -void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); -void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); -void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); - -#define HPEL_FILTER(MMSIZE, EXT) \ - void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int); \ - void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int); \ - \ - static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \ - const uint8_t *src, int stride, int width, int height) \ - { \ - while( height-- ) \ - { \ - ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \ - ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width); \ - ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width); \ - \ - dsth += stride; \ - dstv += stride; \ - dstc += stride; \ - src += stride; \ - } \ - } - -#if !ARCH_X86_64 -HPEL_FILTER(8, mmx) -#endif -HPEL_FILTER(16, sse2) - -#define PIXFUNC(PFX, IDX, EXT) \ - /*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/ \ - c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \ - c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT - -void ff_diracdsp_init_mmx(DiracDSPContext* c) -{ - int mm_flags = av_get_cpu_flags(); - - if (!(mm_flags & AV_CPU_FLAG_MMX)) - return; - -#if HAVE_YASM - c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx; -#if 
!ARCH_X86_64 - c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx; - c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx; - c->dirac_hpel_filter = dirac_hpel_filter_mmx; - c->add_rect_clamped = ff_add_rect_clamped_mmx; - c->put_signed_rect_clamped = ff_put_signed_rect_clamped_mmx; -#endif -#endif - -#if HAVE_MMX_INLINE - PIXFUNC(put, 0, mmx); - PIXFUNC(avg, 0, mmx); -#endif - -#if HAVE_MMXEXT_INLINE - if (mm_flags & AV_CPU_FLAG_MMX2) { - PIXFUNC(avg, 0, mmxext); - } -#endif - - if (mm_flags & AV_CPU_FLAG_SSE2) { -#if HAVE_YASM - c->dirac_hpel_filter = dirac_hpel_filter_sse2; - c->add_rect_clamped = ff_add_rect_clamped_sse2; - c->put_signed_rect_clamped = ff_put_signed_rect_clamped_sse2; - - c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2; - c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2; -#endif -#if HAVE_SSE2_INLINE - c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2; - c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2; - c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2; - c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2; -#endif - } -} diff --git a/ffmpeg1/libavcodec/x86/diracdsp_mmx.h b/ffmpeg1/libavcodec/x86/diracdsp_mmx.h deleted file mode 100644 index 8985854..0000000 --- a/ffmpeg1/libavcodec/x86/diracdsp_mmx.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2010 David Conrad - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
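(ff_diracdsp_init_mmx() just above follows libavcodec's usual runtime-dispatch shape: query the CPU flags once, then overwrite default function pointers with progressively more capable SIMD versions so the last matching check wins. A self-contained sketch of that pattern; ExampleDSP and the filter_* names are made up for illustration, while av_get_cpu_flags() and the AV_CPU_FLAG_* values are the real libavutil API:

    #include <stdint.h>
    #include "libavutil/cpu.h"

    typedef struct ExampleDSP {
        void (*filter)(uint8_t *dst, const uint8_t *src, int len);
    } ExampleDSP;

    static void filter_c(uint8_t *d, const uint8_t *s, int n)    { /* portable fallback   */ }
    static void filter_mmx(uint8_t *d, const uint8_t *s, int n)  { /* would wrap MMX asm  */ }
    static void filter_sse2(uint8_t *d, const uint8_t *s, int n) { /* would wrap SSE2 asm */ }

    static void example_dsp_init(ExampleDSP *c)
    {
        int flags = av_get_cpu_flags();

        c->filter = filter_c;
        if (flags & AV_CPU_FLAG_MMX)
            c->filter = filter_mmx;
        if (flags & AV_CPU_FLAG_SSE2)
            c->filter = filter_sse2;   /* later, stricter checks override earlier ones */
    }
)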
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_DIRACDSP_H -#define AVCODEC_X86_DIRACDSP_H - -#include "libavcodec/diracdsp.h" - -void ff_diracdsp_init_mmx(DiracDSPContext* c); - -DECL_DIRAC_PIXOP(put, mmx); -DECL_DIRAC_PIXOP(avg, mmx); -DECL_DIRAC_PIXOP(avg, mmxext); - -void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); -void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); -void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); -void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); - -void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int); -void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int); - -void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); -void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); -void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); - -void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); -void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); - -#endif diff --git a/ffmpeg1/libavcodec/x86/diracdsp_yasm.asm b/ffmpeg1/libavcodec/x86/diracdsp_yasm.asm deleted file mode 100644 index 3e9765b..0000000 --- a/ffmpeg1/libavcodec/x86/diracdsp_yasm.asm +++ /dev/null @@ -1,264 +0,0 @@ -;****************************************************************************** -;* Copyright (c) 2010 David Conrad -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
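(For the ff_add_dirac_obmc* prototypes above, the yasm implementation further down suggests the following scalar behaviour: 8-bit source samples are weighted by 8-bit OBMC weights and accumulated into a 16-bit destination, with the weight rows advancing by a fixed 32 bytes. A hedged reference sketch; xblen stands for the 8/16/32 block width baked into each entry point:

    #include <stdint.h>

    static void add_dirac_obmc_c(uint16_t *dst, const uint8_t *src, int stride,
                                 const uint8_t *obmc_weight, int xblen, int yblen)
    {
        for (int y = 0; y < yblen; y++) {
            for (int x = 0; x < xblen; x++)
                dst[x] += src[x] * obmc_weight[x];
            dst         += stride;     /* asm advances dstq by 2*stride bytes */
            src         += stride;
            obmc_weight += 32;         /* matches "add obmcq, 32" per row */
        }
    }
)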
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA -pw_3: times 8 dw 3 -pw_7: times 8 dw 7 -pw_16: times 8 dw 16 -pw_32: times 8 dw 32 -pb_128: times 16 db 128 - -section .text - -%macro UNPACK_ADD 6 - mov%5 %1, %3 - mov%6 m5, %4 - mova m4, %1 - mova %2, m5 - punpcklbw %1, m7 - punpcklbw m5, m7 - punpckhbw m4, m7 - punpckhbw %2, m7 - paddw %1, m5 - paddw %2, m4 -%endmacro - -%macro HPEL_FILTER 1 -; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width); -cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3 - mov src0q, srcq - lea stridex3q, [3*strideq] - sub src0q, stridex3q - pxor m7, m7 -.loop: - ; 7*(src[0] + src[1]) - UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a - pmullw m0, [pw_7] - pmullw m1, [pw_7] - - ; 3*( ... + src[-2] + src[3]) - UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a - paddw m0, m2 - paddw m1, m3 - pmullw m0, [pw_3] - pmullw m1, [pw_3] - - ; ... - 7*(src[-1] + src[2]) - UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a - pmullw m2, [pw_7] - pmullw m3, [pw_7] - psubw m0, m2 - psubw m1, m3 - - ; ... - (src[-3] + src[4]) - UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a - psubw m0, m2 - psubw m1, m3 - - paddw m0, [pw_16] - paddw m1, [pw_16] - psraw m0, 5 - psraw m1, 5 - packuswb m0, m1 - mova [dstq], m0 - add dstq, mmsize - add srcq, mmsize - add src0q, mmsize - sub widthd, mmsize - jg .loop - RET - -; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width); -cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width - dec widthd - pxor m7, m7 - and widthd, ~(mmsize-1) -.loop: - ; 7*(src[0] + src[1]) - UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u - pmullw m0, [pw_7] - pmullw m1, [pw_7] - - ; 3*( ... + src[-2] + src[3]) - UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u - paddw m0, m2 - paddw m1, m3 - pmullw m0, [pw_3] - pmullw m1, [pw_3] - - ; ... - 7*(src[-1] + src[2]) - UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u - pmullw m2, [pw_7] - pmullw m3, [pw_7] - psubw m0, m2 - psubw m1, m3 - - ; ... 
- (src[-3] + src[4]) - UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u - psubw m0, m2 - psubw m1, m3 - - paddw m0, [pw_16] - paddw m1, [pw_16] - psraw m0, 5 - psraw m1, 5 - packuswb m0, m1 - mova [dstq + widthq], m0 - sub widthd, mmsize - jge .loop - RET -%endmacro - -%macro PUT_RECT 1 -; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height) -cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2 - mova m0, [pb_128] - add wd, (mmsize-1) - and wd, ~(mmsize-1) - -%if ARCH_X86_64 - movsxd dst_strideq, dst_strided - movsxd src_strideq, src_strided - mov r7d, r5m - mov r8d, wd - %define wspill r8d - %define hd r7d -%else - mov r4m, wd - %define wspill r4m - %define hd r5mp -%endif - -.loopy - lea src2q, [srcq+src_strideq*2] - lea dst2q, [dstq+dst_strideq] -.loopx: - sub wd, mmsize - mova m1, [srcq +2*wq] - mova m2, [src2q+2*wq] - packsswb m1, [srcq +2*wq+mmsize] - packsswb m2, [src2q+2*wq+mmsize] - paddb m1, m0 - paddb m2, m0 - mova [dstq +wq], m1 - mova [dst2q+wq], m2 - jg .loopx - - lea srcq, [srcq+src_strideq*4] - lea dstq, [dstq+dst_strideq*2] - sub hd, 2 - mov wd, wspill - jg .loopy - RET -%endm - -%macro ADD_RECT 1 -; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height) -cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h - mova m0, [pw_32] - add wd, (mmsize-1) - and wd, ~(mmsize-1) - -%if ARCH_X86_64 - movsxd strideq, strided - movsxd idwt_strideq, idwt_strided - mov r8d, wd - %define wspill r8d -%else - mov r5m, wd - %define wspill r5m -%endif - -.loop: - sub wd, mmsize - movu m1, [srcq +2*wq] ; FIXME: ensure alignment - paddw m1, m0 - psraw m1, 6 - movu m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment - paddw m2, m0 - psraw m2, 6 - paddw m1, [idwtq+2*wq] - paddw m2, [idwtq+2*wq+mmsize] - packuswb m1, m2 - mova [dstq +wq], m1 - jg .loop - - lea srcq, [srcq + 2*strideq] - add dstq, strideq - lea idwtq, [idwtq+ 2*idwt_strideq] - sub hd, 1 - mov wd, wspill - jg .loop - RET -%endm - -%macro ADD_OBMC 2 -; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen) -cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen - pxor m4, m4 -.loop: -%assign i 0 -%rep %1 / mmsize - mova m0, [srcq+i] - mova m1, m0 - punpcklbw m0, m4 - punpckhbw m1, m4 - mova m2, [obmcq+i] - mova m3, m2 - punpcklbw m2, m4 - punpckhbw m3, m4 - pmullw m0, m2 - pmullw m1, m3 - movu m2, [dstq+2*i] - movu m3, [dstq+2*i+mmsize] - paddw m0, m2 - paddw m1, m3 - movu [dstq+2*i], m0 - movu [dstq+2*i+mmsize], m1 -%assign i i+mmsize -%endrep - lea srcq, [srcq+strideq] - lea dstq, [dstq+2*strideq] - add obmcq, 32 - sub yblend, 1 - jg .loop - RET -%endm - -INIT_MMX -%if ARCH_X86_64 == 0 -PUT_RECT mmx -ADD_RECT mmx - -HPEL_FILTER mmx -ADD_OBMC 32, mmx -ADD_OBMC 16, mmx -%endif -ADD_OBMC 8, mmx - -INIT_XMM -PUT_RECT sse2 -ADD_RECT sse2 - -HPEL_FILTER sse2 -ADD_OBMC 32, sse2 -ADD_OBMC 16, sse2 diff --git a/ffmpeg1/libavcodec/x86/dnxhdenc.c b/ffmpeg1/libavcodec/x86/dnxhdenc.c deleted file mode 100644 index 349fbb0..0000000 --- a/ffmpeg1/libavcodec/x86/dnxhdenc.c +++ /dev/null @@ -1,66 +0,0 @@ -/* - * VC3/DNxHD SIMD functions - * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com> - * - * VC-3 encoder funded by the British Broadcasting Corporation - * - * This file is part of FFmpeg. 
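(The dirac_hpel_filter_h/_v kernels in the file above both evaluate the same 8-tap half-pel interpolation, just along different axes. Folding the piecewise comments back together, one output sample appears to be the following; this is a reading of the asm, not a quote of the Dirac specification:

    #include <stdint.h>

    static uint8_t dirac_hpel_pixel(const uint8_t *s)    /* s points at src[0] */
    {
        int v = 21 * (s[0]  + s[1])
              +  3 * (s[-2] + s[3])
              -  7 * (s[-1] + s[2])
              -      (s[-3] + s[4]);
        v = (v + 16) >> 5;                               /* paddw pw_16, psraw 5 */
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;   /* packuswb clamp */
    }

The asm reaches the 21 indirectly, as 3*(7*(s[0]+s[1]) + s[-2]+s[3]), which is why the 7 and 3 constants appear twice in the loop.)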
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/x86/asm.h" -#include "libavcodec/dnxhdenc.h" - -#if HAVE_SSE2_INLINE - -static void get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels, int line_size) -{ - __asm__ volatile( - "pxor %%xmm5, %%xmm5 \n\t" - "movq (%0), %%xmm0 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm1 \n\t" - "movq (%0, %2), %%xmm2 \n\t" - "movq (%0, %2,2), %%xmm3 \n\t" - "punpcklbw %%xmm5, %%xmm0 \n\t" - "punpcklbw %%xmm5, %%xmm1 \n\t" - "punpcklbw %%xmm5, %%xmm2 \n\t" - "punpcklbw %%xmm5, %%xmm3 \n\t" - "movdqa %%xmm0, (%1) \n\t" - "movdqa %%xmm1, 16(%1) \n\t" - "movdqa %%xmm2, 32(%1) \n\t" - "movdqa %%xmm3, 48(%1) \n\t" - "movdqa %%xmm3 , 64(%1) \n\t" - "movdqa %%xmm2 , 80(%1) \n\t" - "movdqa %%xmm1 , 96(%1) \n\t" - "movdqa %%xmm0, 112(%1) \n\t" - : "+r" (pixels) - : "r" (block), "r" ((x86_reg)line_size) - ); -} - -#endif /* HAVE_SSE2_INLINE */ - -av_cold void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx) -{ -#if HAVE_SSE2_INLINE - if (av_get_cpu_flags() & AV_CPU_FLAG_SSE2) { - if (ctx->cid_table->bit_depth == 8) - ctx->get_pixels_8x4_sym = get_pixels_8x4_sym_sse2; - } -#endif /* HAVE_SSE2_INLINE */ -} diff --git a/ffmpeg1/libavcodec/x86/dsputil.asm b/ffmpeg1/libavcodec/x86/dsputil.asm deleted file mode 100644 index 9970c02..0000000 --- a/ffmpeg1/libavcodec/x86/dsputil.asm +++ /dev/null @@ -1,652 +0,0 @@ -;****************************************************************************** -;* MMX optimized DSP utils -;* Copyright (c) 2008 Loren Merritt -;* Copyright (c) 2003-2013 Michael Niedermayer -;* Copyright (c) 2013 Daniel Kang -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA -pb_f: times 16 db 15 -pb_zzzzzzzz77777777: times 8 db -1 -pb_7: times 8 db 7 -pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 -pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 -pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0 -pd_16384: times 4 dd 16384 -pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 - -SECTION_TEXT - -%macro SCALARPRODUCT 0 -; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order) -cglobal scalarproduct_int16, 3,3,3, v1, v2, order - shl orderq, 1 - add v1q, orderq - add v2q, orderq - neg orderq - pxor m2, m2 -.loop: - movu m0, [v1q + orderq] - movu m1, [v1q + orderq + mmsize] - pmaddwd m0, [v2q + orderq] - pmaddwd m1, [v2q + orderq + mmsize] - paddd m2, m0 - paddd m2, m1 - add orderq, mmsize*2 - jl .loop -%if mmsize == 16 - movhlps m0, m2 - paddd m2, m0 - pshuflw m0, m2, 0x4e -%else - pshufw m0, m2, 0x4e -%endif - paddd m2, m0 - movd eax, m2 - RET - -; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul) -cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul - shl orderq, 1 - movd m7, mulm -%if mmsize == 16 - pshuflw m7, m7, 0 - punpcklqdq m7, m7 -%else - pshufw m7, m7, 0 -%endif - pxor m6, m6 - add v1q, orderq - add v2q, orderq - add v3q, orderq - neg orderq -.loop: - movu m0, [v2q + orderq] - movu m1, [v2q + orderq + mmsize] - mova m4, [v1q + orderq] - mova m5, [v1q + orderq + mmsize] - movu m2, [v3q + orderq] - movu m3, [v3q + orderq + mmsize] - pmaddwd m0, m4 - pmaddwd m1, m5 - pmullw m2, m7 - pmullw m3, m7 - paddd m6, m0 - paddd m6, m1 - paddw m2, m4 - paddw m3, m5 - mova [v1q + orderq], m2 - mova [v1q + orderq + mmsize], m3 - add orderq, mmsize*2 - jl .loop -%if mmsize == 16 - movhlps m0, m6 - paddd m6, m0 - pshuflw m0, m6, 0x4e -%else - pshufw m0, m6, 0x4e -%endif - paddd m6, m0 - movd eax, m6 - RET -%endmacro - -INIT_MMX mmxext -SCALARPRODUCT -INIT_XMM sse2 -SCALARPRODUCT - -%macro SCALARPRODUCT_LOOP 1 -align 16 -.loop%1: - sub orderq, mmsize*2 -%if %1 - mova m1, m4 - mova m4, [v2q + orderq] - mova m0, [v2q + orderq + mmsize] - palignr m1, m0, %1 - palignr m0, m4, %1 - mova m3, m5 - mova m5, [v3q + orderq] - mova m2, [v3q + orderq + mmsize] - palignr m3, m2, %1 - palignr m2, m5, %1 -%else - mova m0, [v2q + orderq] - mova m1, [v2q + orderq + mmsize] - mova m2, [v3q + orderq] - mova m3, [v3q + orderq + mmsize] -%endif - %define t0 [v1q + orderq] - %define t1 [v1q + orderq + mmsize] -%if ARCH_X86_64 - mova m8, t0 - mova m9, t1 - %define t0 m8 - %define t1 m9 -%endif - pmaddwd m0, t0 - pmaddwd m1, t1 - pmullw m2, m7 - pmullw m3, m7 - paddw m2, t0 - paddw m3, t1 - paddd m6, m0 - paddd m6, m1 - mova [v1q + orderq], m2 - mova [v1q + orderq + mmsize], m3 - jg .loop%1 -%if %1 - jmp .end -%endif -%endmacro - -; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul) -INIT_XMM ssse3 -cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul - shl orderq, 1 - movd m7, mulm - pshuflw m7, m7, 0 - punpcklqdq m7, m7 - pxor m6, m6 - mov r4d, v2d - and r4d, 15 - and v2q, ~15 - and v3q, ~15 - mova m4, [v2q + orderq] - mova m5, [v3q + orderq] - ; linear is faster than 
branch tree or jump table, because the branches taken are cyclic (i.e. predictable) - cmp r4d, 0 - je .loop0 - cmp r4d, 2 - je .loop2 - cmp r4d, 4 - je .loop4 - cmp r4d, 6 - je .loop6 - cmp r4d, 8 - je .loop8 - cmp r4d, 10 - je .loop10 - cmp r4d, 12 - je .loop12 -SCALARPRODUCT_LOOP 14 -SCALARPRODUCT_LOOP 12 -SCALARPRODUCT_LOOP 10 -SCALARPRODUCT_LOOP 8 -SCALARPRODUCT_LOOP 6 -SCALARPRODUCT_LOOP 4 -SCALARPRODUCT_LOOP 2 -SCALARPRODUCT_LOOP 0 -.end: - movhlps m0, m6 - paddd m6, m0 - pshuflw m0, m6, 0x4e - paddd m6, m0 - movd eax, m6 - RET - - -;----------------------------------------------------------------------------- -; void ff_apply_window_int16(int16_t *output, const int16_t *input, -; const int16_t *window, unsigned int len) -;----------------------------------------------------------------------------- - -%macro REVERSE_WORDS 1-2 -%if cpuflag(ssse3) && notcpuflag(atom) - pshufb %1, %2 -%elif cpuflag(sse2) - pshuflw %1, %1, 0x1B - pshufhw %1, %1, 0x1B - pshufd %1, %1, 0x4E -%elif cpuflag(mmxext) - pshufw %1, %1, 0x1B -%endif -%endmacro - -%macro MUL16FIXED 3 -%if cpuflag(ssse3) ; dst, src, unused -; dst = ((dst * src) + (1<<14)) >> 15 - pmulhrsw %1, %2 -%elif cpuflag(mmxext) ; dst, src, temp -; dst = (dst * src) >> 15 -; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back -; in from the pmullw result. - mova %3, %1 - pmulhw %1, %2 - pmullw %3, %2 - psrlw %3, 15 - psllw %1, 1 - por %1, %3 -%endif -%endmacro - -%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version -%if %1 -cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2 -%else -cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2 -%endif - lea offset2q, [offsetq-mmsize] -%if cpuflag(ssse3) && notcpuflag(atom) - mova m5, [pb_revwords] - ALIGN 16 -%elif %1 - mova m5, [pd_16384] -%endif -.loop: -%if cpuflag(ssse3) - ; This version does the 16x16->16 multiplication in-place without expanding - ; to 32-bit. The ssse3 version is bit-identical. - mova m0, [windowq+offset2q] - mova m1, [ inputq+offset2q] - pmulhrsw m1, m0 - REVERSE_WORDS m0, m5 - pmulhrsw m0, [ inputq+offsetq ] - mova [outputq+offset2q], m1 - mova [outputq+offsetq ], m0 -%elif %1 - ; This version expands 16-bit to 32-bit, multiplies by the window, - ; adds 16384 for rounding, right shifts 15, then repacks back to words to - ; save to the output. The window is reversed for the second half. - mova m3, [windowq+offset2q] - mova m4, [ inputq+offset2q] - pxor m0, m0 - punpcklwd m0, m3 - punpcklwd m1, m4 - pmaddwd m0, m1 - paddd m0, m5 - psrad m0, 15 - pxor m2, m2 - punpckhwd m2, m3 - punpckhwd m1, m4 - pmaddwd m2, m1 - paddd m2, m5 - psrad m2, 15 - packssdw m0, m2 - mova [outputq+offset2q], m0 - REVERSE_WORDS m3 - mova m4, [ inputq+offsetq] - pxor m0, m0 - punpcklwd m0, m3 - punpcklwd m1, m4 - pmaddwd m0, m1 - paddd m0, m5 - psrad m0, 15 - pxor m2, m2 - punpckhwd m2, m3 - punpckhwd m1, m4 - pmaddwd m2, m1 - paddd m2, m5 - psrad m2, 15 - packssdw m0, m2 - mova [outputq+offsetq], m0 -%else - ; This version does the 16x16->16 multiplication in-place without expanding - ; to 32-bit. The mmxext and sse2 versions do not use rounding, and - ; therefore are not bit-identical to the C version. 
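(To make the "not bit-identical" remark concrete, here are scalar models of the two MUL16FIXED variants above: the ssse3 pmulhrsw path rounds to nearest, while the mmxext/sse2 pmulhw+pmullw combination truncates. The function names are mine, used only for illustration:

    #include <stdint.h>

    static int16_t mul16_round(int16_t a, int16_t w)   /* pmulhrsw */
    {
        return (int16_t)(((int32_t)a * w + (1 << 14)) >> 15);
    }

    static int16_t mul16_trunc(int16_t a, int16_t w)   /* pmulhw/pmullw/psrlw/psllw/por */
    {
        return (int16_t)(((int32_t)a * w) >> 15);
    }

The one-LSB difference between the two is why only the rounding variants can be checked bit-exactly against the C reference.)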
- mova m0, [windowq+offset2q] - mova m1, [ inputq+offset2q] - mova m2, [ inputq+offsetq ] - MUL16FIXED m1, m0, m3 - REVERSE_WORDS m0 - MUL16FIXED m2, m0, m3 - mova [outputq+offset2q], m1 - mova [outputq+offsetq ], m2 -%endif - add offsetd, mmsize - sub offset2d, mmsize - jae .loop - REP_RET -%endmacro - -INIT_MMX mmxext -APPLY_WINDOW_INT16 0 -INIT_XMM sse2 -APPLY_WINDOW_INT16 0 - -INIT_MMX mmxext -APPLY_WINDOW_INT16 1 -INIT_XMM sse2 -APPLY_WINDOW_INT16 1 -INIT_XMM ssse3 -APPLY_WINDOW_INT16 1 -INIT_XMM ssse3, atom -APPLY_WINDOW_INT16 1 - - -; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) -INIT_MMX mmxext -cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top - movq mm0, [topq] - movq mm2, mm0 - movd mm4, [left_topq] - psllq mm2, 8 - movq mm1, mm0 - por mm4, mm2 - movd mm3, [leftq] - psubb mm0, mm4 ; t-tl - add dstq, wq - add topq, wq - add diffq, wq - neg wq - jmp .skip -.loop: - movq mm4, [topq+wq] - movq mm0, mm4 - psllq mm4, 8 - por mm4, mm1 - movq mm1, mm0 ; t - psubb mm0, mm4 ; t-tl -.skip: - movq mm2, [diffq+wq] -%assign i 0 -%rep 8 - movq mm4, mm0 - paddb mm4, mm3 ; t-tl+l - movq mm5, mm3 - pmaxub mm3, mm1 - pminub mm5, mm1 - pminub mm3, mm4 - pmaxub mm3, mm5 ; median - paddb mm3, mm2 ; +residual -%if i==0 - movq mm7, mm3 - psllq mm7, 56 -%else - movq mm6, mm3 - psrlq mm7, 8 - psllq mm6, 56 - por mm7, mm6 -%endif -%if i<7 - psrlq mm0, 8 - psrlq mm1, 8 - psrlq mm2, 8 -%endif -%assign i i+1 -%endrep - movq [dstq+wq], mm7 - add wq, 8 - jl .loop - movzx r2d, byte [dstq-1] - mov [leftq], r2d - movzx r2d, byte [topq-1] - mov [left_topq], r2d - RET - - -%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned - add srcq, wq - add dstq, wq - neg wq -%%.loop: -%if %2 - mova m1, [srcq+wq] -%else - movu m1, [srcq+wq] -%endif - mova m2, m1 - psllw m1, 8 - paddb m1, m2 - mova m2, m1 - pshufb m1, m3 - paddb m1, m2 - pshufb m0, m5 - mova m2, m1 - pshufb m1, m4 - paddb m1, m2 -%if mmsize == 16 - mova m2, m1 - pshufb m1, m6 - paddb m1, m2 -%endif - paddb m0, m1 -%if %1 - mova [dstq+wq], m0 -%else - movq [dstq+wq], m0 - movhps [dstq+wq+8], m0 -%endif - add wq, mmsize - jl %%.loop - mov eax, mmsize-1 - sub eax, wd - movd m1, eax - pshufb m0, m1 - movd eax, m0 - RET -%endmacro - -; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left) -INIT_MMX ssse3 -cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left -.skip_prologue: - mova m5, [pb_7] - mova m4, [pb_zzzz3333zzzzbbbb] - mova m3, [pb_zz11zz55zz99zzdd] - movd m0, leftm - psllq m0, 56 - ADD_HFYU_LEFT_LOOP 1, 1 - -INIT_XMM sse4 -cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left - mova m5, [pb_f] - mova m6, [pb_zzzzzzzz77777777] - mova m4, [pb_zzzz3333zzzzbbbb] - mova m3, [pb_zz11zz55zz99zzdd] - movd m0, leftm - pslldq m0, 15 - test srcq, 15 - jnz .src_unaligned - test dstq, 15 - jnz .dst_unaligned - ADD_HFYU_LEFT_LOOP 1, 1 -.dst_unaligned: - ADD_HFYU_LEFT_LOOP 0, 1 -.src_unaligned: - ADD_HFYU_LEFT_LOOP 0, 0 - -;----------------------------------------------------------------------------- -; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min, -; int32_t max, unsigned int len) -;----------------------------------------------------------------------------- - -; %1 = number of xmm registers used -; %2 = number of inline load/process/store loops per asm loop -; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop -; %4 = CLIPD function takes min/max as float instead of int 
(CLIPD_SSE2) -; %5 = suffix -%macro VECTOR_CLIP_INT32 4-5 -cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len -%if %4 - cvtsi2ss m4, minm - cvtsi2ss m5, maxm -%else - movd m4, minm - movd m5, maxm -%endif - SPLATD m4 - SPLATD m5 -.loop: -%assign %%i 1 -%rep %2 - mova m0, [srcq+mmsize*0*%%i] - mova m1, [srcq+mmsize*1*%%i] - mova m2, [srcq+mmsize*2*%%i] - mova m3, [srcq+mmsize*3*%%i] -%if %3 - mova m7, [srcq+mmsize*4*%%i] - mova m8, [srcq+mmsize*5*%%i] - mova m9, [srcq+mmsize*6*%%i] - mova m10, [srcq+mmsize*7*%%i] -%endif - CLIPD m0, m4, m5, m6 - CLIPD m1, m4, m5, m6 - CLIPD m2, m4, m5, m6 - CLIPD m3, m4, m5, m6 -%if %3 - CLIPD m7, m4, m5, m6 - CLIPD m8, m4, m5, m6 - CLIPD m9, m4, m5, m6 - CLIPD m10, m4, m5, m6 -%endif - mova [dstq+mmsize*0*%%i], m0 - mova [dstq+mmsize*1*%%i], m1 - mova [dstq+mmsize*2*%%i], m2 - mova [dstq+mmsize*3*%%i], m3 -%if %3 - mova [dstq+mmsize*4*%%i], m7 - mova [dstq+mmsize*5*%%i], m8 - mova [dstq+mmsize*6*%%i], m9 - mova [dstq+mmsize*7*%%i], m10 -%endif -%assign %%i %%i+1 -%endrep - add srcq, mmsize*4*(%2+%3) - add dstq, mmsize*4*(%2+%3) - sub lend, mmsize*(%2+%3) - jg .loop - REP_RET -%endmacro - -INIT_MMX mmx -%define CLIPD CLIPD_MMX -VECTOR_CLIP_INT32 0, 1, 0, 0 -INIT_XMM sse2 -VECTOR_CLIP_INT32 6, 1, 0, 0, _int -%define CLIPD CLIPD_SSE2 -VECTOR_CLIP_INT32 6, 2, 0, 1 -INIT_XMM sse4 -%define CLIPD CLIPD_SSE41 -%ifdef m8 -VECTOR_CLIP_INT32 11, 1, 1, 0 -%else -VECTOR_CLIP_INT32 6, 1, 0, 0 -%endif - -; %1 = aligned/unaligned -%macro BSWAP_LOOPS 1 - mov r3, r2 - sar r2, 3 - jz .left4_%1 -.loop8_%1: - mov%1 m0, [r1 + 0] - mov%1 m1, [r1 + 16] -%if cpuflag(ssse3) - pshufb m0, m2 - pshufb m1, m2 - mova [r0 + 0], m0 - mova [r0 + 16], m1 -%else - pshuflw m0, m0, 10110001b - pshuflw m1, m1, 10110001b - pshufhw m0, m0, 10110001b - pshufhw m1, m1, 10110001b - mova m2, m0 - mova m3, m1 - psllw m0, 8 - psllw m1, 8 - psrlw m2, 8 - psrlw m3, 8 - por m2, m0 - por m3, m1 - mova [r0 + 0], m2 - mova [r0 + 16], m3 -%endif - add r0, 32 - add r1, 32 - dec r2 - jnz .loop8_%1 -.left4_%1: - mov r2, r3 - and r3, 4 - jz .left - mov%1 m0, [r1] -%if cpuflag(ssse3) - pshufb m0, m2 - mova [r0], m0 -%else - pshuflw m0, m0, 10110001b - pshufhw m0, m0, 10110001b - mova m2, m0 - psllw m0, 8 - psrlw m2, 8 - por m2, m0 - mova [r0], m2 -%endif - add r1, 16 - add r0, 16 -%endmacro - -; void bswap_buf(uint32_t *dst, const uint32_t *src, int w); -%macro BSWAP32_BUF 0 -%if cpuflag(ssse3) -cglobal bswap32_buf, 3,4,3 - mov r3, r1 - mova m2, [pb_bswap32] -%else -cglobal bswap32_buf, 3,4,5 - mov r3, r1 -%endif - and r3, 15 - jz .start_align - BSWAP_LOOPS u - jmp .left -.start_align: - BSWAP_LOOPS a -.left: -%if cpuflag(ssse3) - mov r3, r2 - and r2, 2 - jz .left1 - movq m0, [r1] - pshufb m0, m2 - movq [r0], m0 - add r1, 8 - add r0, 8 -.left1: - and r3, 1 - jz .end - mov r2d, [r1] - bswap r2d - mov [r0], r2d -%else - and r2, 3 - jz .end -.loop2: - mov r3d, [r1] - bswap r3d - mov [r0], r3d - add r1, 4 - add r0, 4 - dec r2 - jnz .loop2 -%endif -.end: - RET -%endmacro - -INIT_XMM sse2 -BSWAP32_BUF - -INIT_XMM ssse3 -BSWAP32_BUF diff --git a/ffmpeg1/libavcodec/x86/dsputil_mmx.c b/ffmpeg1/libavcodec/x86/dsputil_mmx.c deleted file mode 100644 index fe59d22..0000000 --- a/ffmpeg1/libavcodec/x86/dsputil_mmx.c +++ /dev/null @@ -1,1636 +0,0 @@ -/* - * MMX optimized DSP utils - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. 
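(The VECTOR_CLIP_INT32 template above only unrolls and vectorizes a plain per-element clamp; a scalar equivalent, as far as the macro itself shows:

    #include <stdint.h>

    static void vector_clip_int32_c(int32_t *dst, const int32_t *src,
                                    int32_t min, int32_t max, unsigned int len)
    {
        for (unsigned int i = 0; i < len; i++) {
            int32_t v = src[i];
            dst[i] = v < min ? min : v > max ? max : v;   /* CLIPD per lane */
        }
    }
)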
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/h264dsp.h" -#include "libavcodec/mpegvideo.h" -#include "libavcodec/simple_idct.h" -#include "libavcodec/videodsp.h" -#include "dsputil_mmx.h" -#include "idct_xvid.h" -#include "diracdsp_mmx.h" - -//#undef NDEBUG -//#include <assert.h> - -/* pixel operations */ -DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; - -DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL }; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL }; - -DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL; - -DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 }; -DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; - - -#if HAVE_YASM -void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, - int dstStride, int src1Stride, int h); -void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, - uint8_t *src2, int dstStride, - int src1Stride, int h); -void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, - int dstStride, int src1Stride, int h); -void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, - int dstStride, int src1Stride, int h); -void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, - int dstStride, int src1Stride, int h); -void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, - int dstStride, int src1Stride, int h); -void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); - -void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - ff_put_pixels8_mmxext(block, pixels, line_size, h); - ff_put_pixels8_mmxext(block + 8, 
pixels + 8, line_size, h); -} - -void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride, int h); -void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride, int h); -void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride, - int h); -void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride, int h); -void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride, int h); -void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride, - int h); -void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride); -void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride); -void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride); -void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride); -void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride); -void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride); -#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext -#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext -#endif /* HAVE_YASM */ - - -#if HAVE_INLINE_ASM - -#define JUMPALIGN() __asm__ volatile (".p2align 3"::) -#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::) - -#define MOVQ_BFE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%"#regd", %%"#regd" \n\t" \ - "paddb %%"#regd", %%"#regd" \n\t" ::) - -#ifndef PIC -#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone)) -#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo)) -#else -// for shared library it's better to use this way for accessing constants -// pcmpeqd -> -1 -#define MOVQ_BONE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%"#regd", %%"#regd" \n\t" \ - "psrlw $15, %%"#regd" \n\t" \ - "packuswb %%"#regd", %%"#regd" \n\t" ::) - -#define MOVQ_WTWO(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%"#regd", %%"#regd" \n\t" \ - "psrlw $15, %%"#regd" \n\t" \ - "psllw $1, %%"#regd" \n\t"::) - -#endif - -// using regr as temporary and for the output result -// first argument is unmodifed and second is trashed -// regfe is supposed to contain 0xfefefefefefefefe -#define PAVGB_MMX(rega, regb, regr, regfe) \ - "movq "#rega", "#regr" \n\t" \ - "por "#regb", "#regr" \n\t" \ - "pxor "#rega", "#regb" \n\t" \ - "pand "#regfe", "#regb" \n\t" \ - "psrlq $1, "#regb" \n\t" \ - "psubb "#regb", "#regr" \n\t" - -// mm6 is supposed to contain 0xfefefefefefefefe -#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ - "movq "#rega", "#regr" \n\t" \ - "movq "#regc", "#regp" \n\t" \ - "por "#regb", "#regr" \n\t" \ - "por "#regd", "#regp" \n\t" \ - "pxor "#rega", "#regb" \n\t" \ - "pxor "#regc", "#regd" \n\t" \ - "pand %%mm6, "#regb" \n\t" \ - "pand %%mm6, "#regd" \n\t" \ - "psrlq $1, "#regd" \n\t" \ - "psrlq $1, "#regb" \n\t" \ - "psubb "#regb", "#regr" \n\t" \ - "psubb "#regd", "#regp" \n\t" - -/***********************************/ -/* MMX rounding */ - -#define DEF(x, y) x ## _ ## y ## _mmx -#define SET_RND MOVQ_WTWO -#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) -#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) -#define OP_AVG(a, b, 
c, e) PAVGB_MMX(a, b, c, e) - -#include "dsputil_rnd_template.c" - -#undef DEF -#undef SET_RND -#undef PAVGBP -#undef PAVGB -#undef OP_AVG - -#endif /* HAVE_INLINE_ASM */ - - -#if HAVE_YASM - -/***********************************/ -/* MMXEXT specific */ - -//FIXME the following could be optimized too ... -static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels, - int line_size, int h) -{ - ff_avg_pixels8_mmxext(block, pixels, line_size, h); - ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h); -} - -#endif /* HAVE_YASM */ - - -#if HAVE_INLINE_ASM -/***********************************/ -/* standard MMX */ - -void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, - int line_size) -{ - const int16_t *p; - uint8_t *pix; - - /* read the pixels */ - p = block; - pix = pixels; - /* unrolled loop */ - __asm__ volatile ( - "movq (%3), %%mm0 \n\t" - "movq 8(%3), %%mm1 \n\t" - "movq 16(%3), %%mm2 \n\t" - "movq 24(%3), %%mm3 \n\t" - "movq 32(%3), %%mm4 \n\t" - "movq 40(%3), %%mm5 \n\t" - "movq 48(%3), %%mm6 \n\t" - "movq 56(%3), %%mm7 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "packuswb %%mm7, %%mm6 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm2, (%0, %1) \n\t" - "movq %%mm4, (%0, %1, 2) \n\t" - "movq %%mm6, (%0, %2) \n\t" - :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), - "r"(p) - : "memory"); - pix += line_size * 4; - p += 32; - - // if here would be an exact copy of the code above - // compiler would generate some very strange code - // thus using "r" - __asm__ volatile ( - "movq (%3), %%mm0 \n\t" - "movq 8(%3), %%mm1 \n\t" - "movq 16(%3), %%mm2 \n\t" - "movq 24(%3), %%mm3 \n\t" - "movq 32(%3), %%mm4 \n\t" - "movq 40(%3), %%mm5 \n\t" - "movq 48(%3), %%mm6 \n\t" - "movq 56(%3), %%mm7 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "packuswb %%mm7, %%mm6 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm2, (%0, %1) \n\t" - "movq %%mm4, (%0, %1, 2) \n\t" - "movq %%mm6, (%0, %2) \n\t" - :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p) - : "memory"); -} - -#define put_signed_pixels_clamped_mmx_half(off) \ - "movq "#off"(%2), %%mm1 \n\t" \ - "movq 16 + "#off"(%2), %%mm2 \n\t" \ - "movq 32 + "#off"(%2), %%mm3 \n\t" \ - "movq 48 + "#off"(%2), %%mm4 \n\t" \ - "packsswb 8 + "#off"(%2), %%mm1 \n\t" \ - "packsswb 24 + "#off"(%2), %%mm2 \n\t" \ - "packsswb 40 + "#off"(%2), %%mm3 \n\t" \ - "packsswb 56 + "#off"(%2), %%mm4 \n\t" \ - "paddb %%mm0, %%mm1 \n\t" \ - "paddb %%mm0, %%mm2 \n\t" \ - "paddb %%mm0, %%mm3 \n\t" \ - "paddb %%mm0, %%mm4 \n\t" \ - "movq %%mm1, (%0) \n\t" \ - "movq %%mm2, (%0, %3) \n\t" \ - "movq %%mm3, (%0, %3, 2) \n\t" \ - "movq %%mm4, (%0, %1) \n\t" - -void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, - int line_size) -{ - x86_reg line_skip = line_size; - x86_reg line_skip3; - - __asm__ volatile ( - "movq "MANGLE(ff_pb_80)", %%mm0 \n\t" - "lea (%3, %3, 2), %1 \n\t" - put_signed_pixels_clamped_mmx_half(0) - "lea (%0, %3, 4), %0 \n\t" - put_signed_pixels_clamped_mmx_half(64) - : "+&r"(pixels), "=&r"(line_skip3) - : "r"(block), "r"(line_skip) - : "memory"); -} - -void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, - int line_size) -{ - const int16_t *p; - uint8_t *pix; - int i; - - /* read the pixels */ - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - i = 4; - do { - __asm__ volatile ( - "movq (%2), %%mm0 \n\t" - "movq 8(%2), %%mm1 \n\t" - "movq 16(%2), %%mm2 \n\t" 
- "movq 24(%2), %%mm3 \n\t" - "movq %0, %%mm4 \n\t" - "movq %1, %%mm6 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddsw %%mm4, %%mm0 \n\t" - "paddsw %%mm5, %%mm1 \n\t" - "movq %%mm6, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm6 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddsw %%mm6, %%mm2 \n\t" - "paddsw %%mm5, %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "movq %%mm0, %0 \n\t" - "movq %%mm2, %1 \n\t" - : "+m"(*pix), "+m"(*(pix + line_size)) - : "r"(p) - : "memory"); - pix += line_size * 2; - p += 16; - } while (--i); -} - -static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - __asm__ volatile ( - "lea (%3, %3), %%"REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r"(pixels), "+r"(block) - : "r"((x86_reg)line_size) - : "%"REG_a, "memory" - ); -} - -static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - __asm__ volatile ( - "lea (%3, %3), %%"REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq 8(%1 ), %%mm4 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1, %3), %%mm5 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm4, 8(%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq 8(%1 ), %%mm4 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1, %3), %%mm5 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm4, 8(%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r"(pixels), "+r"(block) - : "r"((x86_reg)line_size) - : "%"REG_a, "memory" - ); -} - -#define CLEAR_BLOCKS(name, n) \ -static void name(int16_t *blocks) \ -{ \ - __asm__ volatile ( \ - "pxor %%mm7, %%mm7 \n\t" \ - "mov %1, %%"REG_a" \n\t" \ - "1: \n\t" \ - "movq %%mm7, (%0, %%"REG_a") \n\t" \ - "movq %%mm7, 8(%0, %%"REG_a") \n\t" \ - "movq %%mm7, 16(%0, %%"REG_a") \n\t" \ - "movq %%mm7, 24(%0, %%"REG_a") \n\t" \ - "add $32, %%"REG_a" \n\t" \ - "js 1b \n\t" \ - :: "r"(((uint8_t *)blocks) + 128 * n), \ - "i"(-128 * n) \ - : "%"REG_a \ - ); \ -} -CLEAR_BLOCKS(clear_blocks_mmx, 6) -CLEAR_BLOCKS(clear_block_mmx, 1) - -static void clear_block_sse(int16_t *block) -{ - __asm__ volatile ( - "xorps %%xmm0, %%xmm0 \n" - "movaps %%xmm0, (%0) \n" - "movaps %%xmm0, 16(%0) \n" - "movaps %%xmm0, 32(%0) \n" - "movaps %%xmm0, 48(%0) \n" - "movaps %%xmm0, 64(%0) \n" - "movaps %%xmm0, 80(%0) \n" - "movaps %%xmm0, 96(%0) \n" - "movaps %%xmm0, 112(%0) \n" - :: "r"(block) - : "memory" - ); -} - -static void clear_blocks_sse(int16_t *blocks) -{ - __asm__ volatile ( - "xorps %%xmm0, %%xmm0 \n" - "mov %1, %%"REG_a" \n" - "1: \n" - "movaps %%xmm0, (%0, %%"REG_a") \n" - "movaps %%xmm0, 16(%0, %%"REG_a") \n" - "movaps %%xmm0, 32(%0, %%"REG_a") \n" - "movaps %%xmm0, 48(%0, %%"REG_a") \n" - "movaps %%xmm0, 64(%0, %%"REG_a") \n" - "movaps %%xmm0, 80(%0, %%"REG_a") \n" - "movaps %%xmm0, 96(%0, %%"REG_a") \n" - "movaps %%xmm0, 112(%0, %%"REG_a") \n" - "add $128, %%"REG_a" \n" - "js 1b \n" 
- :: "r"(((uint8_t *)blocks) + 128 * 6), - "i"(-128 * 6) - : "%"REG_a - ); -} - -static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w) -{ - x86_reg i = 0; - __asm__ volatile ( - "jmp 2f \n\t" - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq (%2, %0), %%mm1 \n\t" - "paddb %%mm0, %%mm1 \n\t" - "movq %%mm1, (%2, %0) \n\t" - "movq 8(%1, %0), %%mm0 \n\t" - "movq 8(%2, %0), %%mm1 \n\t" - "paddb %%mm0, %%mm1 \n\t" - "movq %%mm1, 8(%2, %0) \n\t" - "add $16, %0 \n\t" - "2: \n\t" - "cmp %3, %0 \n\t" - "js 1b \n\t" - : "+r"(i) - : "r"(src), "r"(dst), "r"((x86_reg)w - 15) - ); - for ( ; i < w; i++) - dst[i + 0] += src[i + 0]; -} - -#if HAVE_7REGS -static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, - const uint8_t *diff, int w, - int *left, int *left_top) -{ - x86_reg w2 = -w; - x86_reg x; - int l = *left & 0xff; - int tl = *left_top & 0xff; - int t; - __asm__ volatile ( - "mov %7, %3 \n" - "1: \n" - "movzbl (%3, %4), %2 \n" - "mov %2, %k3 \n" - "sub %b1, %b3 \n" - "add %b0, %b3 \n" - "mov %2, %1 \n" - "cmp %0, %2 \n" - "cmovg %0, %2 \n" - "cmovg %1, %0 \n" - "cmp %k3, %0 \n" - "cmovg %k3, %0 \n" - "mov %7, %3 \n" - "cmp %2, %0 \n" - "cmovl %2, %0 \n" - "add (%6, %4), %b0 \n" - "mov %b0, (%5, %4) \n" - "inc %4 \n" - "jl 1b \n" - : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2) - : "r"(dst + w), "r"(diff + w), "rm"(top + w) - ); - *left = l; - *left_top = tl; -} -#endif -#endif /* HAVE_INLINE_ASM */ - -void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale); -void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale); - -#if HAVE_INLINE_ASM -/* Draw the edges of width 'w' of an image of size width, height - * this MMX version can only handle w == 8 || w == 16. */ -static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, - int w, int h, int sides) -{ - uint8_t *ptr, *last_line; - int i; - - last_line = buf + (height - 1) * wrap; - /* left and right */ - ptr = buf; - if (w == 8) { - __asm__ volatile ( - "1: \n\t" - "movd (%0), %%mm0 \n\t" - "punpcklbw %%mm0, %%mm0 \n\t" - "punpcklwd %%mm0, %%mm0 \n\t" - "punpckldq %%mm0, %%mm0 \n\t" - "movq %%mm0, -8(%0) \n\t" - "movq -8(%0, %2), %%mm1 \n\t" - "punpckhbw %%mm1, %%mm1 \n\t" - "punpckhwd %%mm1, %%mm1 \n\t" - "punpckhdq %%mm1, %%mm1 \n\t" - "movq %%mm1, (%0, %2) \n\t" - "add %1, %0 \n\t" - "cmp %3, %0 \n\t" - "jb 1b \n\t" - : "+r"(ptr) - : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height) - ); - } else if(w==16){ - __asm__ volatile ( - "1: \n\t" - "movd (%0), %%mm0 \n\t" - "punpcklbw %%mm0, %%mm0 \n\t" - "punpcklwd %%mm0, %%mm0 \n\t" - "punpckldq %%mm0, %%mm0 \n\t" - "movq %%mm0, -8(%0) \n\t" - "movq %%mm0, -16(%0) \n\t" - "movq -8(%0, %2), %%mm1 \n\t" - "punpckhbw %%mm1, %%mm1 \n\t" - "punpckhwd %%mm1, %%mm1 \n\t" - "punpckhdq %%mm1, %%mm1 \n\t" - "movq %%mm1, (%0, %2) \n\t" - "movq %%mm1, 8(%0, %2) \n\t" - "add %1, %0 \n\t" - "cmp %3, %0 \n\t" - "jb 1b \n\t" - : "+r"(ptr) - : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height) - ); - } else { - av_assert1(w == 4); - __asm__ volatile ( - "1: \n\t" - "movd (%0), %%mm0 \n\t" - "punpcklbw %%mm0, %%mm0 \n\t" - "punpcklwd %%mm0, %%mm0 \n\t" - "movd %%mm0, -4(%0) \n\t" - "movd -4(%0, %2), %%mm1 \n\t" - "punpcklbw %%mm1, %%mm1 \n\t" - "punpckhwd %%mm1, %%mm1 \n\t" - "punpckhdq %%mm1, %%mm1 \n\t" - "movd %%mm1, (%0, %2) \n\t" - "add %1, %0 \n\t" - "cmp %3, %0 \n\t" - "jb 1b \n\t" - : "+r"(ptr) - : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height) - ); - } - - /* top and bottom (and hopefully 
also the corners) */ - if (sides & EDGE_TOP) { - for (i = 0; i < h; i += 4) { - ptr = buf - (i + 1) * wrap - w; - __asm__ volatile ( - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm0, (%0, %2) \n\t" - "movq %%mm0, (%0, %2, 2) \n\t" - "movq %%mm0, (%0, %3) \n\t" - "add $8, %0 \n\t" - "cmp %4, %0 \n\t" - "jb 1b \n\t" - : "+r"(ptr) - : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap), - "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w) - ); - } - } - - if (sides & EDGE_BOTTOM) { - for (i = 0; i < h; i += 4) { - ptr = last_line + (i + 1) * wrap - w; - __asm__ volatile ( - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm0, (%0, %2) \n\t" - "movq %%mm0, (%0, %2, 2) \n\t" - "movq %%mm0, (%0, %3) \n\t" - "add $8, %0 \n\t" - "cmp %4, %0 \n\t" - "jb 1b \n\t" - : "+r"(ptr) - : "r"((x86_reg)last_line - (x86_reg)ptr - w), - "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3), - "r"(ptr + width + 2 * w) - ); - } - } -} -#endif /* HAVE_INLINE_ASM */ - - -#if HAVE_YASM -#define QPEL_OP(OPNAME, ROUNDER, RND, MMX) \ -static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t temp[8]; \ - uint8_t * const half = (uint8_t*)temp; \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \ - stride, 8); \ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \ - stride, stride, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \ - stride, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t temp[8]; \ - uint8_t * const half = (uint8_t*)temp; \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \ - stride, 8); \ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \ - stride, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t temp[8]; \ - uint8_t * const half = (uint8_t*)temp; \ - ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \ - 8, stride); \ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \ - stride, stride, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \ - stride, stride); \ -} \ - \ -static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t temp[8]; \ - uint8_t * const half = (uint8_t*)temp; \ - ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \ - 8, stride); \ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\ - stride, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[8 + 9]; \ - uint8_t * const halfH = ((uint8_t*)half) + 64; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \ - stride, 9); \ - ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \ - stride, 8, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc31_ ## 
MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[8 + 9]; \ - uint8_t * const halfH = ((uint8_t*)half) + 64; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ - stride, 9); \ - ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \ - stride, 8, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[8 + 9]; \ - uint8_t * const halfH = ((uint8_t*)half) + 64; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \ - stride, 9); \ - ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \ - stride, 8, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[8 + 9]; \ - uint8_t * const halfH = ((uint8_t*)half) + 64; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ - stride, 9); \ - ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \ - stride, 8, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[8 + 9]; \ - uint8_t * const halfH = ((uint8_t*)half) + 64; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \ - stride, 8, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[8 + 9]; \ - uint8_t * const halfH = ((uint8_t*)half) + 64; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \ - stride, 8, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[8 + 9]; \ - uint8_t * const halfH = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \ - 8, stride, 9); \ - ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \ - stride, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[8 + 9]; \ - uint8_t * const halfH = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ - stride, 9); \ - ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \ - stride, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[9]; \ - uint8_t * const halfH = ((uint8_t*)half); \ - 
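Every mcXY case that QPEL_OP generates follows the same recipe: run the block through the MPEG-4 horizontal and/or vertical lowpass helpers into small temporaries, then blend planes with the pixels8_l2 helpers, which average two sources byte by byte. A scalar sketch of that blend and of the mc10 composition follows; the names and the assumption that the l2 helpers are a rounded per-byte average (the no_rnd builds drop the +1) are this sketch's, and the lowpass filter itself is left as a callback rather than spelled out.

    #include <stdint.h>
    #include <stddef.h>

    /* Assumed semantics of the ff_*_pixels8_l2 helpers: rounded average of two
     * 8-pixel-wide sources over h rows. */
    static void put_pixels8_l2_sketch(uint8_t *dst, const uint8_t *src1,
                                      const uint8_t *src2, ptrdiff_t dst_stride,
                                      ptrdiff_t src1_stride, ptrdiff_t src2_stride,
                                      int h)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 8; x++)
                dst[x] = (src1[x] + src2[x] + 1) >> 1;
            dst  += dst_stride;
            src1 += src1_stride;
            src2 += src2_stride;
        }
    }

    /* mc10 (dx = 1/4, dy = 0) as composed above: half-pel filter horizontally
     * into a temporary plane, then average with the unfiltered source so the
     * result lands a quarter pel to the right of the full-pel position. */
    static void qpel8_mc10_sketch(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
                                  void (*h_lowpass)(uint8_t *dst, const uint8_t *src,
                                                    ptrdiff_t dst_stride,
                                                    ptrdiff_t src_stride, int h))
    {
        uint8_t half[8 * 8];

        h_lowpass(half, src, 8, stride, 8);                          /* half-pel in x */
        put_pixels8_l2_sketch(dst, src, half, stride, stride, 8, 8); /* blend to 1/4  */
    }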
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \ - stride, 8); \ -} \ - \ -static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t temp[32]; \ - uint8_t * const half = (uint8_t*)temp; \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \ - stride, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \ - stride, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \ - stride, stride, 16);\ -} \ - \ -static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t temp[32]; \ - uint8_t * const half = (uint8_t*)temp; \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \ - stride, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \ - stride, stride, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t temp[32]; \ - uint8_t * const half = (uint8_t*)temp; \ - ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \ - stride); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \ - stride, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \ - stride, stride); \ -} \ - \ -static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t temp[32]; \ - uint8_t * const half = (uint8_t*)temp; \ - ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \ - stride); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \ - stride, stride, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[16 * 2 + 17 * 2]; \ - uint8_t * const halfH = ((uint8_t*)half) + 256; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \ - stride, 17); \ - ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ - 16, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \ - stride, 16, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[16 * 2 + 17 * 2]; \ - uint8_t * const halfH = ((uint8_t*)half) + 256; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \ - stride, 17); \ - ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ - 16, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \ - stride, 16, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[16 * 2 + 17 * 2]; \ - uint8_t * const halfH = ((uint8_t*)half) + 256; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 
17); \ - ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \ - stride, 17); \ - ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ - 16, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \ - stride, 16, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[16 * 2 + 17 * 2]; \ - uint8_t * const halfH = ((uint8_t*)half) + 256; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \ - stride, 17); \ - ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ - 16, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \ - stride, 16, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[16 * 2 + 17 * 2]; \ - uint8_t * const halfH = ((uint8_t*)half) + 256; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ - 16, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \ - stride, 16, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[16 * 2 + 17 * 2]; \ - uint8_t * const halfH = ((uint8_t*)half) + 256; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ - 16, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \ - stride, 16, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[17 * 2]; \ - uint8_t * const halfH = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \ - stride, 17); \ - ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \ - stride, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[17 * 2]; \ - uint8_t * const halfH = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \ - stride, 17); \ - ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \ - stride, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[17 * 2]; \ - uint8_t * const halfH = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \ - stride, 16); \ -} - -QPEL_OP(put_, ff_pw_16, _, mmxext) -QPEL_OP(avg_, ff_pw_16, _, mmxext) -QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, mmxext) -#endif /* HAVE_YASM */ - - -#if HAVE_INLINE_ASM -void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride) -{ - put_pixels8_xy2_mmx(dst, src, stride, 8); -} -void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride) -{ - put_pixels16_xy2_mmx(dst, src, stride, 16); -} -void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride) -{ - 
avg_pixels8_xy2_mmx(dst, src, stride, 8); -} -void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride) -{ - avg_pixels16_xy2_mmx(dst, src, stride, 16); -} - -typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src, - ptrdiff_t linesize, int block_w, int block_h, - int src_x, int src_y, int w, int h); - -static av_always_inline void gmc(uint8_t *dst, uint8_t *src, - int stride, int h, int ox, int oy, - int dxx, int dxy, int dyx, int dyy, - int shift, int r, int width, int height, - emulated_edge_mc_func *emu_edge_fn) -{ - const int w = 8; - const int ix = ox >> (16 + shift); - const int iy = oy >> (16 + shift); - const int oxs = ox >> 4; - const int oys = oy >> 4; - const int dxxs = dxx >> 4; - const int dxys = dxy >> 4; - const int dyxs = dyx >> 4; - const int dyys = dyy >> 4; - const uint16_t r4[4] = { r, r, r, r }; - const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys }; - const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys }; - const uint64_t shift2 = 2 * shift; -#define MAX_STRIDE 4096U -#define MAX_H 8U - uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE]; - int x, y; - - const int dxw = (dxx - (1 << (16 + shift))) * (w - 1); - const int dyh = (dyy - (1 << (16 + shift))) * (h - 1); - const int dxh = dxy * (h - 1); - const int dyw = dyx * (w - 1); - int need_emu = (unsigned)ix >= width - w || - (unsigned)iy >= height - h; - - if ( // non-constant fullpel offset (3% of blocks) - ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) | - (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift) - // uses more than 16 bits of subpel mv (only at huge resolution) - || (dxx | dxy | dyx | dyy) & 15 - || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) { - // FIXME could still use mmx for some of the rows - ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, - shift, r, width, height); - return; - } - - src += ix + iy * stride; - if (need_emu) { - emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height); - src = edge_buf; - } - - __asm__ volatile ( - "movd %0, %%mm6 \n\t" - "pxor %%mm7, %%mm7 \n\t" - "punpcklwd %%mm6, %%mm6 \n\t" - "punpcklwd %%mm6, %%mm6 \n\t" - :: "r"(1<<shift) - ); - - for (x = 0; x < w; x += 4) { - uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0), - oxs - dxys + dxxs * (x + 1), - oxs - dxys + dxxs * (x + 2), - oxs - dxys + dxxs * (x + 3) }; - uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0), - oys - dyys + dyxs * (x + 1), - oys - dyys + dyxs * (x + 2), - oys - dyys + dyxs * (x + 3) }; - - for (y = 0; y < h; y++) { - __asm__ volatile ( - "movq %0, %%mm4 \n\t" - "movq %1, %%mm5 \n\t" - "paddw %2, %%mm4 \n\t" - "paddw %3, %%mm5 \n\t" - "movq %%mm4, %0 \n\t" - "movq %%mm5, %1 \n\t" - "psrlw $12, %%mm4 \n\t" - "psrlw $12, %%mm5 \n\t" - : "+m"(*dx4), "+m"(*dy4) - : "m"(*dxy4), "m"(*dyy4) - ); - - __asm__ volatile ( - "movq %%mm6, %%mm2 \n\t" - "movq %%mm6, %%mm1 \n\t" - "psubw %%mm4, %%mm2 \n\t" - "psubw %%mm5, %%mm1 \n\t" - "movq %%mm2, %%mm0 \n\t" - "movq %%mm4, %%mm3 \n\t" - "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy) - "pmullw %%mm5, %%mm3 \n\t" // dx * dy - "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy - "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy) - - "movd %4, %%mm5 \n\t" - "movd %3, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy - "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy - - "movd %2, %%mm5 \n\t" - "movd %1, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "pmullw %%mm5, 
%%mm1 \n\t" // src[1, 0] * dx * (s - dy) - "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy) - "paddw %5, %%mm1 \n\t" - "paddw %%mm3, %%mm2 \n\t" - "paddw %%mm1, %%mm0 \n\t" - "paddw %%mm2, %%mm0 \n\t" - - "psrlw %6, %%mm0 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "movd %%mm0, %0 \n\t" - - : "=m"(dst[x + y * stride]) - : "m"(src[0]), "m"(src[1]), - "m"(src[stride]), "m"(src[stride + 1]), - "m"(*r4), "m"(shift2) - ); - src += stride; - } - src += 4 - h * stride; - } -} - -#if CONFIG_VIDEODSP -#if HAVE_YASM -#if ARCH_X86_32 -static void gmc_mmx(uint8_t *dst, uint8_t *src, - int stride, int h, int ox, int oy, - int dxx, int dxy, int dyx, int dyy, - int shift, int r, int width, int height) -{ - gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, - width, height, &ff_emulated_edge_mc_8); -} -#endif -static void gmc_sse(uint8_t *dst, uint8_t *src, - int stride, int h, int ox, int oy, - int dxx, int dxy, int dyx, int dyy, - int shift, int r, int width, int height) -{ - gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, - width, height, &ff_emulated_edge_mc_8); -} -#else -static void gmc_mmx(uint8_t *dst, uint8_t *src, - int stride, int h, int ox, int oy, - int dxx, int dxy, int dyx, int dyy, - int shift, int r, int width, int height) -{ - gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, - width, height, &ff_emulated_edge_mc_8); -} -#endif -#endif - -#endif /* HAVE_INLINE_ASM */ - -void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); - -#if HAVE_INLINE_ASM - -/* CAVS-specific */ -void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride) -{ - put_pixels8_mmx(dst, src, stride, 8); -} - -void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride) -{ - avg_pixels8_mmx(dst, src, stride, 8); -} - -void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride) -{ - put_pixels16_mmx(dst, src, stride, 16); -} - -void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride) -{ - avg_pixels16_mmx(dst, src, stride, 16); -} - -/* VC-1-specific */ -void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride, int rnd) -{ - put_pixels8_mmx(dst, src, stride, 8); -} - -#if CONFIG_DIRAC_DECODER -#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\ -void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\ -{\ - if (h&3)\ - ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\ - else\ - OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\ -}\ -void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\ -{\ - if (h&3)\ - ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\ - else\ - OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\ -}\ -void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\ -{\ - if (h&3) {\ - ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\ - } else {\ - OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\ - OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\ - }\ -} - -#if HAVE_MMX_INLINE -DIRAC_PIXOP(put, put, mmx) -DIRAC_PIXOP(avg, avg, mmx) -#endif - -#if HAVE_YASM -DIRAC_PIXOP(avg, ff_avg, mmxext) - -void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) -{ - if (h&3) - ff_put_dirac_pixels16_c(dst, src, stride, h); 
- else - ff_put_pixels16_sse2(dst, src[0], stride, h); -} -void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) -{ - if (h&3) - ff_avg_dirac_pixels16_c(dst, src, stride, h); - else - ff_avg_pixels16_sse2(dst, src[0], stride, h); -} -void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) -{ - if (h&3) { - ff_put_dirac_pixels32_c(dst, src, stride, h); - } else { - ff_put_pixels16_sse2(dst , src[0] , stride, h); - ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h); - } -} -void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) -{ - if (h&3) { - ff_avg_dirac_pixels32_c(dst, src, stride, h); - } else { - ff_avg_pixels16_sse2(dst , src[0] , stride, h); - ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h); - } -} -#endif -#endif - -static void vector_clipf_sse(float *dst, const float *src, - float min, float max, int len) -{ - x86_reg i = (len - 16) * 4; - __asm__ volatile ( - "movss %3, %%xmm4 \n\t" - "movss %4, %%xmm5 \n\t" - "shufps $0, %%xmm4, %%xmm4 \n\t" - "shufps $0, %%xmm5, %%xmm5 \n\t" - "1: \n\t" - "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel - "movaps 16(%2, %0), %%xmm1 \n\t" - "movaps 32(%2, %0), %%xmm2 \n\t" - "movaps 48(%2, %0), %%xmm3 \n\t" - "maxps %%xmm4, %%xmm0 \n\t" - "maxps %%xmm4, %%xmm1 \n\t" - "maxps %%xmm4, %%xmm2 \n\t" - "maxps %%xmm4, %%xmm3 \n\t" - "minps %%xmm5, %%xmm0 \n\t" - "minps %%xmm5, %%xmm1 \n\t" - "minps %%xmm5, %%xmm2 \n\t" - "minps %%xmm5, %%xmm3 \n\t" - "movaps %%xmm0, (%1, %0) \n\t" - "movaps %%xmm1, 16(%1, %0) \n\t" - "movaps %%xmm2, 32(%1, %0) \n\t" - "movaps %%xmm3, 48(%1, %0) \n\t" - "sub $64, %0 \n\t" - "jge 1b \n\t" - : "+&r"(i) - : "r"(dst), "r"(src), "m"(min), "m"(max) - : "memory" - ); -} - -#endif /* HAVE_INLINE_ASM */ - -int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2, - int order); -int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, - int order); -int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2, - const int16_t *v3, - int order, int mul); -int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, - const int16_t *v3, - int order, int mul); -int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, - const int16_t *v3, - int order, int mul); - -void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input, - const int16_t *window, unsigned int len); -void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input, - const int16_t *window, unsigned int len); -void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input, - const int16_t *window, unsigned int len); -void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input, - const int16_t *window, unsigned int len); -void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input, - const int16_t *window, unsigned int len); -void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input, - const int16_t *window, unsigned int len); - -void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w); -void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w); - -void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, - const uint8_t *diff, int w, - int *left, int *left_top); -int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, - int w, int left); -int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, - int w, int left); - -void 
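vector_clipf_sse above clamps a float buffer into [min, max], sixteen values per iteration: maxps against the lower bound, then minps against the upper bound. Its scalar counterpart is simply per-element clamping, sketched below (the SSE version additionally relies on 16-byte alignment and a length it can walk in blocks of 16 floats, which this sketch ignores):

    /* Per-element clamp of src into [min_val, max_val]; plain-C counterpart of
     * the maxps/minps loop above. */
    static void vector_clipf_sketch(float *dst, const float *src,
                                    float min_val, float max_val, int len)
    {
        for (int i = 0; i < len; i++) {
            float v = src[i];
            if (v < min_val) v = min_val;  /* maxps step */
            if (v > max_val) v = max_val;  /* minps step */
            dst[i] = v;
        }
    }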
ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src, - int32_t min, int32_t max, unsigned int len); -void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src, - int32_t min, int32_t max, unsigned int len); -void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, - int32_t min, int32_t max, unsigned int len); -void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src, - int32_t min, int32_t max, unsigned int len); - -#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \ - do { \ - c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \ - } while (0) - -static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, - int mm_flags) -{ - const int high_bit_depth = avctx->bits_per_raw_sample > 8; - -#if HAVE_INLINE_ASM - c->put_pixels_clamped = ff_put_pixels_clamped_mmx; - c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx; - c->add_pixels_clamped = ff_add_pixels_clamped_mmx; - - if (!high_bit_depth) { - c->clear_block = clear_block_mmx; - c->clear_blocks = clear_blocks_mmx; - c->draw_edges = draw_edges_mmx; - } - -#if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM) - c->gmc = gmc_mmx; -#endif - - c->add_bytes = add_bytes_mmx; -#endif /* HAVE_INLINE_ASM */ - -#if HAVE_YASM - if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { - c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx; - c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx; - } - - c->vector_clip_int32 = ff_vector_clip_int32_mmx; -#endif - -} - -static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx, - int mm_flags) -{ - -#if HAVE_YASM - SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, ); - SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, ); - - SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, ); - SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, ); - SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, ); - SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, ); -#endif /* HAVE_YASM */ - -#if HAVE_MMXEXT_EXTERNAL - /* slower than cmov version on AMD */ - if (!(mm_flags & AV_CPU_FLAG_3DNOW)) - c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext; - - c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext; - c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext; - - if (avctx->flags & CODEC_FLAG_BITEXACT) { - c->apply_window_int16 = ff_apply_window_int16_mmxext; - } else { - c->apply_window_int16 = 
ff_apply_window_int16_round_mmxext; - } -#endif /* HAVE_MMXEXT_EXTERNAL */ -} - -static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, - int mm_flags) -{ - const int high_bit_depth = avctx->bits_per_raw_sample > 8; - -#if HAVE_INLINE_ASM - if (!high_bit_depth) { - if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) { - /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */ - c->clear_block = clear_block_sse; - c->clear_blocks = clear_blocks_sse; - } - } - - c->vector_clipf = vector_clipf_sse; -#endif /* HAVE_INLINE_ASM */ - -#if HAVE_YASM -#if HAVE_INLINE_ASM && CONFIG_VIDEODSP - c->gmc = gmc_sse; -#endif -#endif /* HAVE_YASM */ -} - -static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, - int mm_flags) -{ - const int bit_depth = avctx->bits_per_raw_sample; - const int high_bit_depth = bit_depth > 8; - -#if HAVE_SSE2_INLINE - if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) { - c->idct_put = ff_idct_xvid_sse2_put; - c->idct_add = ff_idct_xvid_sse2_add; - c->idct = ff_idct_xvid_sse2; - c->idct_permutation_type = FF_SSE2_IDCT_PERM; - } -#endif /* HAVE_SSE2_INLINE */ - -#if HAVE_SSE2_EXTERNAL - c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; - c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; - if (mm_flags & AV_CPU_FLAG_ATOM) { - c->vector_clip_int32 = ff_vector_clip_int32_int_sse2; - } else { - c->vector_clip_int32 = ff_vector_clip_int32_sse2; - } - if (avctx->flags & CODEC_FLAG_BITEXACT) { - c->apply_window_int16 = ff_apply_window_int16_sse2; - } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) { - c->apply_window_int16 = ff_apply_window_int16_round_sse2; - } - c->bswap_buf = ff_bswap32_buf_sse2; -#endif /* HAVE_SSE2_EXTERNAL */ -} - -static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx, - int mm_flags) -{ -#if HAVE_SSSE3_EXTERNAL - c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3; - if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe - c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4; - - if (mm_flags & AV_CPU_FLAG_ATOM) - c->apply_window_int16 = ff_apply_window_int16_ssse3_atom; - else - c->apply_window_int16 = ff_apply_window_int16_ssse3; - if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit - c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3; - c->bswap_buf = ff_bswap32_buf_ssse3; -#endif /* HAVE_SSSE3_EXTERNAL */ -} - -static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx, - int mm_flags) -{ -#if HAVE_SSE4_EXTERNAL - c->vector_clip_int32 = ff_vector_clip_int32_sse4; -#endif /* HAVE_SSE4_EXTERNAL */ -} - -av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx) -{ - int mm_flags = av_get_cpu_flags(); - -#if HAVE_7REGS && HAVE_INLINE_ASM - if (mm_flags & AV_CPU_FLAG_CMOV) - c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov; -#endif - - if (mm_flags & AV_CPU_FLAG_MMX) { -#if HAVE_INLINE_ASM - const int idct_algo = avctx->idct_algo; - - if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) { - if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) { - c->idct_put = ff_simple_idct_put_mmx; - c->idct_add = ff_simple_idct_add_mmx; - c->idct = ff_simple_idct_mmx; - c->idct_permutation_type = FF_SIMPLE_IDCT_PERM; - } else if (idct_algo == FF_IDCT_XVIDMMX) { - if (mm_flags & AV_CPU_FLAG_SSE2) { - c->idct_put = ff_idct_xvid_sse2_put; - c->idct_add = ff_idct_xvid_sse2_add; - c->idct = ff_idct_xvid_sse2; - 
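The idct selection being walked through here, and the dsputil_init_* functions above, all follow one dispatch pattern: query the CPU feature flags once, start from the portable baseline, and let each higher ISA level overwrite the function pointers installed by the previous one. A minimal self-contained sketch of that shape; the struct, flag names and stub bodies below are illustrative stand-ins for DSPContext and the real kernels (the actual code uses AV_CPU_FLAG_* bits from av_get_cpu_flags()):

    /* Illustrative flag bits, not the real AV_CPU_FLAG_* values. */
    enum { SKETCH_CPU_MMX = 1, SKETCH_CPU_SSE2 = 2 };

    typedef struct {
        void (*clear_block)(short *block);
    } sketch_dsp;

    static void clear_block_c_sketch(short *block)
    {
        for (int i = 0; i < 64; i++)
            block[i] = 0;
    }

    /* Stand-ins for the MMX/SSE2 bodies; only the selection logic matters here. */
    static void clear_block_mmx_sketch(short *block)  { clear_block_c_sketch(block); }
    static void clear_block_sse2_sketch(short *block) { clear_block_c_sketch(block); }

    /* Later assignments win, so the best available version ends up installed. */
    static void sketch_dsp_init(sketch_dsp *c, int cpu_flags)
    {
        c->clear_block = clear_block_c_sketch;
        if (cpu_flags & SKETCH_CPU_MMX)
            c->clear_block = clear_block_mmx_sketch;
        if (cpu_flags & SKETCH_CPU_SSE2)
            c->clear_block = clear_block_sse2_sketch;
    }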
c->idct_permutation_type = FF_SSE2_IDCT_PERM; - } else if (mm_flags & AV_CPU_FLAG_MMXEXT) { - c->idct_put = ff_idct_xvid_mmxext_put; - c->idct_add = ff_idct_xvid_mmxext_add; - c->idct = ff_idct_xvid_mmxext; - } else { - c->idct_put = ff_idct_xvid_mmx_put; - c->idct_add = ff_idct_xvid_mmx_add; - c->idct = ff_idct_xvid_mmx; - } - } - } -#endif /* HAVE_INLINE_ASM */ - - dsputil_init_mmx(c, avctx, mm_flags); - } - - if (mm_flags & AV_CPU_FLAG_MMXEXT) - dsputil_init_mmxext(c, avctx, mm_flags); - - if (mm_flags & AV_CPU_FLAG_SSE) - dsputil_init_sse(c, avctx, mm_flags); - - if (mm_flags & AV_CPU_FLAG_SSE2) - dsputil_init_sse2(c, avctx, mm_flags); - - if (mm_flags & AV_CPU_FLAG_SSSE3) - dsputil_init_ssse3(c, avctx, mm_flags); - - if (mm_flags & AV_CPU_FLAG_SSE4) - dsputil_init_sse4(c, avctx, mm_flags); - - if (CONFIG_ENCODERS) - ff_dsputilenc_init_mmx(c, avctx); -} diff --git a/ffmpeg1/libavcodec/x86/dsputil_mmx.h b/ffmpeg1/libavcodec/x86/dsputil_mmx.h deleted file mode 100644 index 28b0078..0000000 --- a/ffmpeg1/libavcodec/x86/dsputil_mmx.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * MMX optimized DSP utils - * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_DSPUTIL_MMX_H -#define AVCODEC_X86_DSPUTIL_MMX_H - -#include <stddef.h> -#include <stdint.h> - -#include "libavcodec/dsputil.h" -#include "libavutil/x86/asm.h" - -extern const uint64_t ff_bone; -extern const uint64_t ff_wtwo; - -extern const xmm_reg ff_pw_3; -extern const xmm_reg ff_pw_4; -extern const xmm_reg ff_pw_5; -extern const xmm_reg ff_pw_8; -extern const uint64_t ff_pw_15; -extern const xmm_reg ff_pw_16; -extern const xmm_reg ff_pw_18; -extern const uint64_t ff_pw_20; -extern const xmm_reg ff_pw_32; -extern const uint64_t ff_pw_42; -extern const uint64_t ff_pw_53; -extern const xmm_reg ff_pw_64; -extern const uint64_t ff_pw_96; -extern const uint64_t ff_pw_128; -extern const uint64_t ff_pw_255; - -extern const xmm_reg ff_pb_1; -extern const xmm_reg ff_pb_3; -extern const uint64_t ff_pb_3F; -extern const xmm_reg ff_pb_F8; -extern const uint64_t ff_pb_FC; - -extern const double ff_pd_1[2]; -extern const double ff_pd_2[2]; - -#define SBUTTERFLY(a,b,t,n,m)\ - "mov" #m " " #a ", " #t " \n\t" /* abcd */\ - "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ - "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\ - -#define TRANSPOSE4(a,b,c,d,t)\ - SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\ - SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\ - SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\ - SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */ - -#define MOVQ_WONE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ - "psrlw $15, %%" #regd ::) - -void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx); -void ff_dsputil_init_pix_mmx(DSPContext* c, 
AVCodecContext *avctx); - -void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size); -void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size); -void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size); - -void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); - -void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride); -void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride); -void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride); -void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride); - -void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd); - -void ff_put_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, ptrdiff_t stride); -void ff_put_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, ptrdiff_t stride); -void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, ptrdiff_t stride); -void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, ptrdiff_t stride); - -void ff_mmx_idct(int16_t *block); -void ff_mmxext_idct(int16_t *block); - - -void ff_deinterlace_line_mmx(uint8_t *dst, - const uint8_t *lum_m4, const uint8_t *lum_m3, - const uint8_t *lum_m2, const uint8_t *lum_m1, - const uint8_t *lum, - int size); - -void ff_deinterlace_line_inplace_mmx(const uint8_t *lum_m4, - const uint8_t *lum_m3, - const uint8_t *lum_m2, - const uint8_t *lum_m1, - const uint8_t *lum, int size); - -#endif /* AVCODEC_X86_DSPUTIL_MMX_H */ diff --git a/ffmpeg1/libavcodec/x86/dsputil_qns_template.c b/ffmpeg1/libavcodec/x86/dsputil_qns_template.c deleted file mode 100644 index 77a41b9..0000000 --- a/ffmpeg1/libavcodec/x86/dsputil_qns_template.c +++ /dev/null @@ -1,101 +0,0 @@ -/* - * DSP utils : QNS functions are compiled 3 times for mmx/3dnow/ssse3 - * Copyright (c) 2004 Michael Niedermayer - * - * MMX optimization by Michael Niedermayer <michaelni@gmx.at> - * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? 
SCALE_OFFSET : 0)) - -static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale) -{ - x86_reg i=0; - - assert(FFABS(scale) < MAX_ABS); - scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; - - SET_RND(mm6); - __asm__ volatile( - "pxor %%mm7, %%mm7 \n\t" - "movd %4, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq 8(%1, %0), %%mm1 \n\t" - PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6) - "paddw (%2, %0), %%mm0 \n\t" - "paddw 8(%2, %0), %%mm1 \n\t" - "psraw $6, %%mm0 \n\t" - "psraw $6, %%mm1 \n\t" - "pmullw (%3, %0), %%mm0 \n\t" - "pmullw 8(%3, %0), %%mm1 \n\t" - "pmaddwd %%mm0, %%mm0 \n\t" - "pmaddwd %%mm1, %%mm1 \n\t" - "paddd %%mm1, %%mm0 \n\t" - "psrld $4, %%mm0 \n\t" - "paddd %%mm0, %%mm7 \n\t" - "add $16, %0 \n\t" - "cmp $128, %0 \n\t" //FIXME optimize & bench - " jb 1b \n\t" - PHADDD(%%mm7, %%mm6) - "psrld $2, %%mm7 \n\t" - "movd %%mm7, %0 \n\t" - - : "+r" (i) - : "r"(basis), "r"(rem), "r"(weight), "g"(scale) - ); - return i; -} - -static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale) -{ - x86_reg i=0; - - if(FFABS(scale) < MAX_ABS){ - scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; - SET_RND(mm6); - __asm__ volatile( - "movd %3, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq 8(%1, %0), %%mm1 \n\t" - PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6) - "paddw (%2, %0), %%mm0 \n\t" - "paddw 8(%2, %0), %%mm1 \n\t" - "movq %%mm0, (%2, %0) \n\t" - "movq %%mm1, 8(%2, %0) \n\t" - "add $16, %0 \n\t" - "cmp $128, %0 \n\t" // FIXME optimize & bench - " jb 1b \n\t" - - : "+r" (i) - : "r"(basis), "r"(rem), "g"(scale) - ); - }else{ - for(i=0; i<8*8; i++){ - rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT); - } - } -} diff --git a/ffmpeg1/libavcodec/x86/dsputil_rnd_template.c b/ffmpeg1/libavcodec/x86/dsputil_rnd_template.c deleted file mode 100644 index 1a89b77..0000000 --- a/ffmpeg1/libavcodec/x86/dsputil_rnd_template.c +++ /dev/null @@ -1,221 +0,0 @@ -/* - * DSP utils mmx functions are compiled twice for rnd/no_rnd - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at> - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> - * and improved by Zdenek Kabelac <kabi@users.sf.net> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
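The QNS template just above compiles try_8x8basis and add_8x8basis once per instruction set; the scalar fallback in add_8x8basis (the else branch) is the clearest statement of what the MMX path computes: add a scaled basis function into the residual block, with the rounding and shift converting from the BASIS_SHIFT precision of the basis down to the RECON_SHIFT precision of the residual, both supplied by the including file. That fallback, restated as a standalone sketch with the shift amounts passed in explicitly (the parameterisation is this sketch's, not the template's):

    #include <stdint.h>

    /* Scalar form of DEF(add_8x8basis):
     *   rem[i] += (basis[i] * scale + rounding) >> shift,
     * where shift = BASIS_SHIFT - RECON_SHIFT in the template above. */
    static void add_8x8basis_sketch(int16_t rem[64], const int16_t basis[64],
                                    int scale, int basis_shift, int recon_shift)
    {
        const int shift = basis_shift - recon_shift;
        const int round = 1 << (shift - 1);

        for (int i = 0; i < 64; i++)
            rem[i] += (basis[i] * scale + round) >> shift;
    }

try_8x8basis evaluates essentially the same candidate update without storing it, weighting and squaring the result to return a distortion score (see the pmullw/pmaddwd accumulation above).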
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -// put_pixels -static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_ZERO(mm7); - SET_RND(mm6); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm4 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "xor %%"REG_a", %%"REG_a" \n\t" - "add %3, %1 \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddusw %%mm2, %%mm0 \n\t" - "paddusw %%mm3, %%mm1 \n\t" - "paddusw %%mm6, %%mm4 \n\t" - "paddusw %%mm6, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "psrlw $2, %%mm4 \n\t" - "psrlw $2, %%mm5 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "movq %%mm4, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 - "movq 1(%1, %%"REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm3, %%mm5 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm6, %%mm1 \n\t" - "paddusw %%mm4, %%mm0 \n\t" - "paddusw %%mm5, %%mm1 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "movq %%mm0, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "subl $2, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels) - :"D"(block), "r"((x86_reg)line_size) - :REG_a, "memory"); -} - -// in case more speed is needed - unroling would certainly help -static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %0, %%mm0 \n\t" - "movq %1, %%mm1 \n\t" - OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, %0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } - while (--h); -} - -static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %0, %%mm0 \n\t" - "movq %1, %%mm1 \n\t" - OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, %0 \n\t" - "movq 8%0, %%mm0 \n\t" - "movq 8%1, %%mm1 \n\t" - OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, 8%0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } - while (--h); -} - -// this routine is 'slightly' suboptimal but mostly unused -static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_ZERO(mm7); - SET_RND(mm6); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm4 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - 
"punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "xor %%"REG_a", %%"REG_a" \n\t" - "add %3, %1 \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddusw %%mm2, %%mm0 \n\t" - "paddusw %%mm3, %%mm1 \n\t" - "paddusw %%mm6, %%mm4 \n\t" - "paddusw %%mm6, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "psrlw $2, %%mm4 \n\t" - "psrlw $2, %%mm5 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "pcmpeqd %%mm2, %%mm2 \n\t" - "paddb %%mm2, %%mm2 \n\t" - OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2) - "movq %%mm5, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 - "movq 1(%1, %%"REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm3, %%mm5 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm6, %%mm1 \n\t" - "paddusw %%mm4, %%mm0 \n\t" - "paddusw %%mm5, %%mm1 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "pcmpeqd %%mm2, %%mm2 \n\t" - "paddb %%mm2, %%mm2 \n\t" - OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2) - "movq %%mm1, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "subl $2, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels) - :"D"(block), "r"((x86_reg)line_size) - :REG_a, "memory"); -} - -//FIXME optimize -static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){ - DEF(put, pixels8_xy2)(block , pixels , line_size, h); - DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h); -} - -static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){ - DEF(avg, pixels8_xy2)(block , pixels , line_size, h); - DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h); -} diff --git a/ffmpeg1/libavcodec/x86/dsputilenc.asm b/ffmpeg1/libavcodec/x86/dsputilenc.asm deleted file mode 100644 index 1839bee..0000000 --- a/ffmpeg1/libavcodec/x86/dsputilenc.asm +++ /dev/null @@ -1,487 +0,0 @@ -;***************************************************************************** -;* MMX optimized DSP utils -;***************************************************************************** -;* Copyright (c) 2000, 2001 Fabrice Bellard -;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;***************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION .text - -%macro DIFF_PIXELS_1 4 - movh %1, %3 - movh %2, %4 - punpcklbw %2, %1 - punpcklbw %1, %1 - psubw %1, %2 -%endmacro - -; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3 -; %6=temporary storage location -; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64) -%macro DIFF_PIXELS_8 6 - DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3] - DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3] - DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3] - add %1, %5 - add %2, %5 - DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3] - DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3] - DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3] - DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3] -%ifdef m8 - DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3] -%else - mova [%6], m0 - DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3] - mova m0, [%6] -%endif - sub %1, %5 - sub %2, %5 -%endmacro - -%macro HADAMARD8 0 - SUMSUB_BADC w, 0, 1, 2, 3 - SUMSUB_BADC w, 4, 5, 6, 7 - SUMSUB_BADC w, 0, 2, 1, 3 - SUMSUB_BADC w, 4, 6, 5, 7 - SUMSUB_BADC w, 0, 4, 1, 5 - SUMSUB_BADC w, 2, 6, 3, 7 -%endmacro - -%macro ABS1_SUM 3 - ABS1 %1, %2 - paddusw %3, %1 -%endmacro - -%macro ABS2_SUM 6 - ABS2 %1, %2, %3, %4 - paddusw %5, %1 - paddusw %6, %2 -%endmacro - -%macro ABS_SUM_8x8_64 1 - ABS2 m0, m1, m8, m9 - ABS2_SUM m2, m3, m8, m9, m0, m1 - ABS2_SUM m4, m5, m8, m9, m0, m1 - ABS2_SUM m6, m7, m8, m9, m0, m1 - paddusw m0, m1 -%endmacro - -%macro ABS_SUM_8x8_32 1 - mova [%1], m7 - ABS1 m0, m7 - ABS1 m1, m7 - ABS1_SUM m2, m7, m0 - ABS1_SUM m3, m7, m1 - ABS1_SUM m4, m7, m0 - ABS1_SUM m5, m7, m1 - ABS1_SUM m6, m7, m0 - mova m2, [%1] - ABS1_SUM m2, m7, m1 - paddusw m0, m1 -%endmacro - -; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to -; about 100k on extreme inputs. But that's very unlikely to occur in natural video, -; and it's even more unlikely to not have any alternative mvs/modes with lower cost. 
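Taken together, the DIFF_PIXELS, HADAMARD8 and ABS_SUM macros above implement an 8x8 SATD: subtract the two blocks, apply an 8-point Hadamard butterfly network along rows and then columns, and sum the absolute values of the transform coefficients (the HSUM macro below then folds the vector accumulator into a scalar). A plain-C model of the whole computation; unlike the saturating paddusw accumulation discussed in the comment above, this sketch keeps full integer precision, and its butterfly ordering differs from the asm, which does not change the sum of absolute values:

    #include <stdint.h>

    /* Unnormalized 8-point Walsh-Hadamard transform, three butterfly passes. */
    static void hadamard8_1d_sketch(int v[8])
    {
        for (int len = 1; len < 8; len <<= 1)
            for (int i = 0; i < 8; i += len << 1)
                for (int j = i; j < i + len; j++) {
                    int a = v[j], b = v[j + len];
                    v[j]       = a + b;
                    v[j + len] = a - b;
                }
    }

    /* SATD of the difference of two 8x8 blocks, the quantity hadamard8_diff returns. */
    static int hadamard8_diff_sketch(const uint8_t *src1, const uint8_t *src2,
                                     int stride)
    {
        int m[8][8], col[8], sum = 0;

        for (int y = 0; y < 8; y++) {
            for (int x = 0; x < 8; x++)
                m[y][x] = src1[y * stride + x] - src2[y * stride + x];
            hadamard8_1d_sketch(m[y]);           /* transform rows    */
        }
        for (int x = 0; x < 8; x++) {
            for (int y = 0; y < 8; y++)
                col[y] = m[y][x];
            hadamard8_1d_sketch(col);            /* then columns      */
            for (int y = 0; y < 8; y++)
                sum += col[y] < 0 ? -col[y] : col[y];
        }
        return sum;
    }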
-%macro HSUM 3 -%if cpuflag(sse2) - movhlps %2, %1 - paddusw %1, %2 - pshuflw %2, %1, 0xE - paddusw %1, %2 - pshuflw %2, %1, 0x1 - paddusw %1, %2 - movd %3, %1 -%elif cpuflag(mmxext) - pshufw %2, %1, 0xE - paddusw %1, %2 - pshufw %2, %1, 0x1 - paddusw %1, %2 - movd %3, %1 -%elif cpuflag(mmx) - mova %2, %1 - psrlq %1, 32 - paddusw %1, %2 - mova %2, %1 - psrlq %1, 16 - paddusw %1, %2 - movd %3, %1 -%endif -%endmacro - -%macro STORE4 5 - mova [%1+mmsize*0], %2 - mova [%1+mmsize*1], %3 - mova [%1+mmsize*2], %4 - mova [%1+mmsize*3], %5 -%endmacro - -%macro LOAD4 5 - mova %2, [%1+mmsize*0] - mova %3, [%1+mmsize*1] - mova %4, [%1+mmsize*2] - mova %5, [%1+mmsize*3] -%endmacro - -%macro hadamard8_16_wrapper 2 -cglobal hadamard8_diff, 4, 4, %1 -%ifndef m8 - %assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) - SUB rsp, pad -%endif - call hadamard8x8_diff %+ SUFFIX -%ifndef m8 - ADD rsp, pad -%endif - RET - -cglobal hadamard8_diff16, 5, 6, %1 -%ifndef m8 - %assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) - SUB rsp, pad -%endif - - call hadamard8x8_diff %+ SUFFIX - mov r5d, eax - - add r1, 8 - add r2, 8 - call hadamard8x8_diff %+ SUFFIX - add r5d, eax - - cmp r4d, 16 - jne .done - - lea r1, [r1+r3*8-8] - lea r2, [r2+r3*8-8] - call hadamard8x8_diff %+ SUFFIX - add r5d, eax - - add r1, 8 - add r2, 8 - call hadamard8x8_diff %+ SUFFIX - add r5d, eax - -.done: - mov eax, r5d -%ifndef m8 - ADD rsp, pad -%endif - RET -%endmacro - -%macro HADAMARD8_DIFF 0-1 -%if cpuflag(sse2) -hadamard8x8_diff %+ SUFFIX: - lea r0, [r3*3] - DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize - HADAMARD8 -%if ARCH_X86_64 - TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 -%else - TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize] -%endif - HADAMARD8 - ABS_SUM_8x8 rsp+gprsize - HSUM m0, m1, eax - and eax, 0xFFFF - ret - -hadamard8_16_wrapper %1, 3 -%elif cpuflag(mmx) -ALIGN 16 -; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, -; int stride, int h) -; r0 = void *s = unused, int h = unused (always 8) -; note how r1, r2 and r3 are not clobbered in this function, so 16x16 -; can simply call this 2x2x (and that's why we access rsp+gprsize -; everywhere, which is rsp of calling func -hadamard8x8_diff %+ SUFFIX: - lea r0, [r3*3] - - ; first 4x8 pixels - DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60 - HADAMARD8 - mova [rsp+gprsize+0x60], m7 - TRANSPOSE4x4W 0, 1, 2, 3, 7 - STORE4 rsp+gprsize, m0, m1, m2, m3 - mova m7, [rsp+gprsize+0x60] - TRANSPOSE4x4W 4, 5, 6, 7, 0 - STORE4 rsp+gprsize+0x40, m4, m5, m6, m7 - - ; second 4x8 pixels - DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60 - HADAMARD8 - mova [rsp+gprsize+0x60], m7 - TRANSPOSE4x4W 0, 1, 2, 3, 7 - STORE4 rsp+gprsize+0x20, m0, m1, m2, m3 - mova m7, [rsp+gprsize+0x60] - TRANSPOSE4x4W 4, 5, 6, 7, 0 - - LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3 - HADAMARD8 - ABS_SUM_8x8_32 rsp+gprsize+0x60 - mova [rsp+gprsize+0x60], m0 - - LOAD4 rsp+gprsize , m0, m1, m2, m3 - LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7 - HADAMARD8 - ABS_SUM_8x8_32 rsp+gprsize - paddusw m0, [rsp+gprsize+0x60] - - HSUM m0, m1, eax - and rax, 0xFFFF - ret - -hadamard8_16_wrapper 0, 14 -%endif -%endmacro - -INIT_MMX mmx -HADAMARD8_DIFF - -INIT_MMX mmxext -HADAMARD8_DIFF - -INIT_XMM sse2 -%if ARCH_X86_64 -%define ABS_SUM_8x8 ABS_SUM_8x8_64 -%else -%define ABS_SUM_8x8 ABS_SUM_8x8_32 -%endif -HADAMARD8_DIFF 10 - -INIT_XMM ssse3 -%define ABS_SUM_8x8 ABS_SUM_8x8_64 -HADAMARD8_DIFF 9 - -INIT_XMM sse2 -; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) -cglobal sse16, 5, 5, 8 - shr 
r4d, 1 - pxor m0, m0 ; mm0 = 0 - pxor m7, m7 ; mm7 holds the sum - -.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned - movu m1, [r1 ] ; mm1 = pix1[0][0-15] - movu m2, [r2 ] ; mm2 = pix2[0][0-15] - movu m3, [r1+r3] ; mm3 = pix1[1][0-15] - movu m4, [r2+r3] ; mm4 = pix2[1][0-15] - - ; todo: mm1-mm2, mm3-mm4 - ; algo: subtract mm1 from mm2 with saturation and vice versa - ; OR the result to get the absolute difference - mova m5, m1 - mova m6, m3 - psubusb m1, m2 - psubusb m3, m4 - psubusb m2, m5 - psubusb m4, m6 - - por m2, m1 - por m4, m3 - - ; now convert to 16-bit vectors so we can square them - mova m1, m2 - mova m3, m4 - - punpckhbw m2, m0 - punpckhbw m4, m0 - punpcklbw m1, m0 ; mm1 not spread over (mm1,mm2) - punpcklbw m3, m0 ; mm4 not spread over (mm3,mm4) - - pmaddwd m2, m2 - pmaddwd m4, m4 - pmaddwd m1, m1 - pmaddwd m3, m3 - - lea r1, [r1+r3*2] ; pix1 += 2*line_size - lea r2, [r2+r3*2] ; pix2 += 2*line_size - - paddd m1, m2 - paddd m3, m4 - paddd m7, m1 - paddd m7, m3 - - dec r4 - jnz .next2lines - - mova m1, m7 - psrldq m7, 8 ; shift hi qword to lo - paddd m7, m1 - mova m1, m7 - psrldq m7, 4 ; shift hi dword to lo - paddd m7, m1 - movd eax, m7 ; return value - RET - -INIT_MMX mmx -; get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size) -cglobal get_pixels, 3,4 - movsxdifnidn r2, r2d - add r0, 128 - mov r3, -128 - pxor m7, m7 -.loop: - mova m0, [r1] - mova m2, [r1+r2] - mova m1, m0 - mova m3, m2 - punpcklbw m0, m7 - punpckhbw m1, m7 - punpcklbw m2, m7 - punpckhbw m3, m7 - mova [r0+r3+ 0], m0 - mova [r0+r3+ 8], m1 - mova [r0+r3+16], m2 - mova [r0+r3+24], m3 - lea r1, [r1+r2*2] - add r3, 32 - js .loop - REP_RET - -INIT_XMM sse2 -cglobal get_pixels, 3, 4 - movsxdifnidn r2, r2d - lea r3, [r2*3] - pxor m4, m4 - movh m0, [r1] - movh m1, [r1+r2] - movh m2, [r1+r2*2] - movh m3, [r1+r3] - lea r1, [r1+r2*4] - punpcklbw m0, m4 - punpcklbw m1, m4 - punpcklbw m2, m4 - punpcklbw m3, m4 - mova [r0], m0 - mova [r0+0x10], m1 - mova [r0+0x20], m2 - mova [r0+0x30], m3 - movh m0, [r1] - movh m1, [r1+r2*1] - movh m2, [r1+r2*2] - movh m3, [r1+r3] - punpcklbw m0, m4 - punpcklbw m1, m4 - punpcklbw m2, m4 - punpcklbw m3, m4 - mova [r0+0x40], m0 - mova [r0+0x50], m1 - mova [r0+0x60], m2 - mova [r0+0x70], m3 - RET - -INIT_MMX mmx -; diff_pixels_mmx(int16_t *block, const uint8_t *s1, const unint8_t *s2, stride) -cglobal diff_pixels, 4,5 - movsxdifnidn r3, r3d - pxor m7, m7 - add r0, 128 - mov r4, -128 -.loop: - mova m0, [r1] - mova m2, [r2] - mova m1, m0 - mova m3, m2 - punpcklbw m0, m7 - punpckhbw m1, m7 - punpcklbw m2, m7 - punpckhbw m3, m7 - psubw m0, m2 - psubw m1, m3 - mova [r0+r4+0], m0 - mova [r0+r4+8], m1 - add r1, r3 - add r2, r3 - add r4, 16 - jne .loop - REP_RET - -INIT_MMX mmx -; pix_sum16_mmx(uint8_t * pix, int line_size) -cglobal pix_sum16, 2, 3 - movsxdifnidn r1, r1d - mov r2, r1 - neg r2 - shl r2, 4 - sub r0, r2 - pxor m7, m7 - pxor m6, m6 -.loop: - mova m0, [r0+r2+0] - mova m1, [r0+r2+0] - mova m2, [r0+r2+8] - mova m3, [r0+r2+8] - punpcklbw m0, m7 - punpckhbw m1, m7 - punpcklbw m2, m7 - punpckhbw m3, m7 - paddw m1, m0 - paddw m3, m2 - paddw m3, m1 - paddw m6, m3 - add r2, r1 - js .loop - mova m5, m6 - psrlq m6, 32 - paddw m6, m5 - mova m5, m6 - psrlq m6, 16 - paddw m6, m5 - movd eax, m6 - and eax, 0xffff - RET - -INIT_MMX mmx -; pix_norm1_mmx(uint8_t *pix, int line_size) -cglobal pix_norm1, 2, 4 - movsxdifnidn r1, r1d - mov r2, 16 - pxor m0, m0 - pxor m7, m7 -.loop: - mova m2, [r0+0] - mova m3, [r0+8] - mova m1, m2 - punpckhbw m1, m0 - punpcklbw m2, m0 - mova 
m4, m3 - punpckhbw m3, m0 - punpcklbw m4, m0 - pmaddwd m1, m1 - pmaddwd m2, m2 - pmaddwd m3, m3 - pmaddwd m4, m4 - paddd m2, m1 - paddd m4, m3 - paddd m7, m2 - add r0, r1 - paddd m7, m4 - dec r2 - jne .loop - mova m1, m7 - psrlq m7, 32 - paddd m1, m7 - movd eax, m1 - RET - diff --git a/ffmpeg1/libavcodec/x86/dsputilenc_mmx.c b/ffmpeg1/libavcodec/x86/dsputilenc_mmx.c deleted file mode 100644 index a3f268e..0000000 --- a/ffmpeg1/libavcodec/x86/dsputilenc_mmx.c +++ /dev/null @@ -1,1060 +0,0 @@ -/* - * MMX optimized DSP utils - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/dct.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/mpegvideo.h" -#include "libavcodec/mathops.h" -#include "dsputil_mmx.h" - -void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size); -void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size); -void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride); -int ff_pix_sum16_mmx(uint8_t * pix, int line_size); -int ff_pix_norm1_mmx(uint8_t *pix, int line_size); - -#if HAVE_INLINE_ASM - -static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { - int tmp; - __asm__ volatile ( - "movl %4,%%ecx\n" - "shr $1,%%ecx\n" - "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ - "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ - "1:\n" - "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */ - "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */ - "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */ - "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */ - - /* todo: mm1-mm2, mm3-mm4 */ - /* algo: subtract mm1 from mm2 with saturation and vice versa */ - /* OR the results to get absolute difference */ - "movq %%mm1,%%mm5\n" - "movq %%mm3,%%mm6\n" - "psubusb %%mm2,%%mm1\n" - "psubusb %%mm4,%%mm3\n" - "psubusb %%mm5,%%mm2\n" - "psubusb %%mm6,%%mm4\n" - - "por %%mm1,%%mm2\n" - "por %%mm3,%%mm4\n" - - /* now convert to 16-bit vectors so we can square them */ - "movq %%mm2,%%mm1\n" - "movq %%mm4,%%mm3\n" - - "punpckhbw %%mm0,%%mm2\n" - "punpckhbw %%mm0,%%mm4\n" - "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ - "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ - - "pmaddwd %%mm2,%%mm2\n" - "pmaddwd %%mm4,%%mm4\n" - "pmaddwd %%mm1,%%mm1\n" - "pmaddwd %%mm3,%%mm3\n" - - "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */ - "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */ - - "paddd %%mm2,%%mm1\n" - "paddd %%mm4,%%mm3\n" - "paddd %%mm1,%%mm7\n" - "paddd %%mm3,%%mm7\n" - - "decl 
%%ecx\n" - "jnz 1b\n" - - "movq %%mm7,%%mm1\n" - "psrlq $32, %%mm7\n" /* shift hi dword to lo */ - "paddd %%mm7,%%mm1\n" - "movd %%mm1,%2\n" - : "+r" (pix1), "+r" (pix2), "=r"(tmp) - : "r" ((x86_reg)line_size) , "m" (h) - : "%ecx"); - return tmp; -} - -static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { - int tmp; - __asm__ volatile ( - "movl %4,%%ecx\n" - "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ - "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ - "1:\n" - "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */ - "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */ - "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */ - "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */ - - /* todo: mm1-mm2, mm3-mm4 */ - /* algo: subtract mm1 from mm2 with saturation and vice versa */ - /* OR the results to get absolute difference */ - "movq %%mm1,%%mm5\n" - "movq %%mm3,%%mm6\n" - "psubusb %%mm2,%%mm1\n" - "psubusb %%mm4,%%mm3\n" - "psubusb %%mm5,%%mm2\n" - "psubusb %%mm6,%%mm4\n" - - "por %%mm1,%%mm2\n" - "por %%mm3,%%mm4\n" - - /* now convert to 16-bit vectors so we can square them */ - "movq %%mm2,%%mm1\n" - "movq %%mm4,%%mm3\n" - - "punpckhbw %%mm0,%%mm2\n" - "punpckhbw %%mm0,%%mm4\n" - "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ - "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ - - "pmaddwd %%mm2,%%mm2\n" - "pmaddwd %%mm4,%%mm4\n" - "pmaddwd %%mm1,%%mm1\n" - "pmaddwd %%mm3,%%mm3\n" - - "add %3,%0\n" - "add %3,%1\n" - - "paddd %%mm2,%%mm1\n" - "paddd %%mm4,%%mm3\n" - "paddd %%mm1,%%mm7\n" - "paddd %%mm3,%%mm7\n" - - "decl %%ecx\n" - "jnz 1b\n" - - "movq %%mm7,%%mm1\n" - "psrlq $32, %%mm7\n" /* shift hi dword to lo */ - "paddd %%mm7,%%mm1\n" - "movd %%mm1,%2\n" - : "+r" (pix1), "+r" (pix2), "=r"(tmp) - : "r" ((x86_reg)line_size) , "m" (h) - : "%ecx"); - return tmp; -} - -static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { - int tmp; - __asm__ volatile ( - "movl %3,%%ecx\n" - "pxor %%mm7,%%mm7\n" - "pxor %%mm6,%%mm6\n" - - "movq (%0),%%mm0\n" - "movq %%mm0, %%mm1\n" - "psllq $8, %%mm0\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm0\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7,%%mm0\n" - "punpcklbw %%mm7,%%mm1\n" - "punpckhbw %%mm7,%%mm2\n" - "punpckhbw %%mm7,%%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw %%mm3, %%mm2\n" - - "add %2,%0\n" - - "movq (%0),%%mm4\n" - "movq %%mm4, %%mm1\n" - "psllq $8, %%mm4\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm4\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7,%%mm4\n" - "punpcklbw %%mm7,%%mm1\n" - "punpckhbw %%mm7,%%mm5\n" - "punpckhbw %%mm7,%%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2,%0\n" - "1:\n" - - "movq (%0),%%mm0\n" - "movq %%mm0, %%mm1\n" - "psllq $8, %%mm0\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm0\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7,%%mm0\n" - "punpcklbw %%mm7,%%mm1\n" - "punpckhbw %%mm7,%%mm2\n" - "punpckhbw %%mm7,%%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw %%mm3, %%mm2\n" - "psubw %%mm0, %%mm4\n" - "psubw %%mm2, %%mm5\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm4, %%mm3\n\t" - "pcmpgtw %%mm5, %%mm1\n\t" - "pxor %%mm3, %%mm4\n" - "pxor %%mm1, %%mm5\n" - "psubw %%mm3, %%mm4\n" - "psubw %%mm1, %%mm5\n" - "paddw 
%%mm4, %%mm5\n" - "paddw %%mm5, %%mm6\n" - - "add %2,%0\n" - - "movq (%0),%%mm4\n" - "movq %%mm4, %%mm1\n" - "psllq $8, %%mm4\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm4\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7,%%mm4\n" - "punpcklbw %%mm7,%%mm1\n" - "punpckhbw %%mm7,%%mm5\n" - "punpckhbw %%mm7,%%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2,%0\n" - "subl $2, %%ecx\n" - " jnz 1b\n" - - "movq %%mm6, %%mm0\n" - "punpcklwd %%mm7,%%mm0\n" - "punpckhwd %%mm7,%%mm6\n" - "paddd %%mm0, %%mm6\n" - - "movq %%mm6,%%mm0\n" - "psrlq $32, %%mm6\n" - "paddd %%mm6,%%mm0\n" - "movd %%mm0,%1\n" - : "+r" (pix1), "=r"(tmp) - : "r" ((x86_reg)line_size) , "g" (h-2) - : "%ecx"); - return tmp; -} - -static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { - int tmp; - uint8_t * pix= pix1; - __asm__ volatile ( - "movl %3,%%ecx\n" - "pxor %%mm7,%%mm7\n" - "pxor %%mm6,%%mm6\n" - - "movq (%0),%%mm0\n" - "movq 1(%0),%%mm1\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7,%%mm0\n" - "punpcklbw %%mm7,%%mm1\n" - "punpckhbw %%mm7,%%mm2\n" - "punpckhbw %%mm7,%%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw %%mm3, %%mm2\n" - - "add %2,%0\n" - - "movq (%0),%%mm4\n" - "movq 1(%0),%%mm1\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7,%%mm4\n" - "punpcklbw %%mm7,%%mm1\n" - "punpckhbw %%mm7,%%mm5\n" - "punpckhbw %%mm7,%%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2,%0\n" - "1:\n" - - "movq (%0),%%mm0\n" - "movq 1(%0),%%mm1\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7,%%mm0\n" - "punpcklbw %%mm7,%%mm1\n" - "punpckhbw %%mm7,%%mm2\n" - "punpckhbw %%mm7,%%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw %%mm3, %%mm2\n" - "psubw %%mm0, %%mm4\n" - "psubw %%mm2, %%mm5\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm4, %%mm3\n\t" - "pcmpgtw %%mm5, %%mm1\n\t" - "pxor %%mm3, %%mm4\n" - "pxor %%mm1, %%mm5\n" - "psubw %%mm3, %%mm4\n" - "psubw %%mm1, %%mm5\n" - "paddw %%mm4, %%mm5\n" - "paddw %%mm5, %%mm6\n" - - "add %2,%0\n" - - "movq (%0),%%mm4\n" - "movq 1(%0),%%mm1\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7,%%mm4\n" - "punpcklbw %%mm7,%%mm1\n" - "punpckhbw %%mm7,%%mm5\n" - "punpckhbw %%mm7,%%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2,%0\n" - "subl $2, %%ecx\n" - " jnz 1b\n" - - "movq %%mm6, %%mm0\n" - "punpcklwd %%mm7,%%mm0\n" - "punpckhwd %%mm7,%%mm6\n" - "paddd %%mm0, %%mm6\n" - - "movq %%mm6,%%mm0\n" - "psrlq $32, %%mm6\n" - "paddd %%mm6,%%mm0\n" - "movd %%mm0,%1\n" - : "+r" (pix1), "=r"(tmp) - : "r" ((x86_reg)line_size) , "g" (h-2) - : 
"%ecx"); - return tmp + hf_noise8_mmx(pix+8, line_size, h); -} - -static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { - MpegEncContext *c = p; - int score1, score2; - - if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h); - else score1 = sse16_mmx(c, pix1, pix2, line_size, h); - score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h); - - if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; - else return score1 + FFABS(score2)*8; -} - -static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { - MpegEncContext *c = p; - int score1= sse8_mmx(c, pix1, pix2, line_size, h); - int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h); - - if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; - else return score1 + FFABS(score2)*8; -} - -static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { - int tmp; - - av_assert2( (((int)pix) & 7) == 0); - av_assert2((line_size &7) ==0); - -#define SUM(in0, in1, out0, out1) \ - "movq (%0), %%mm2\n"\ - "movq 8(%0), %%mm3\n"\ - "add %2,%0\n"\ - "movq %%mm2, " #out0 "\n"\ - "movq %%mm3, " #out1 "\n"\ - "psubusb " #in0 ", %%mm2\n"\ - "psubusb " #in1 ", %%mm3\n"\ - "psubusb " #out0 ", " #in0 "\n"\ - "psubusb " #out1 ", " #in1 "\n"\ - "por %%mm2, " #in0 "\n"\ - "por %%mm3, " #in1 "\n"\ - "movq " #in0 ", %%mm2\n"\ - "movq " #in1 ", %%mm3\n"\ - "punpcklbw %%mm7, " #in0 "\n"\ - "punpcklbw %%mm7, " #in1 "\n"\ - "punpckhbw %%mm7, %%mm2\n"\ - "punpckhbw %%mm7, %%mm3\n"\ - "paddw " #in1 ", " #in0 "\n"\ - "paddw %%mm3, %%mm2\n"\ - "paddw %%mm2, " #in0 "\n"\ - "paddw " #in0 ", %%mm6\n" - - - __asm__ volatile ( - "movl %3,%%ecx\n" - "pxor %%mm6,%%mm6\n" - "pxor %%mm7,%%mm7\n" - "movq (%0),%%mm0\n" - "movq 8(%0),%%mm1\n" - "add %2,%0\n" - "jmp 2f\n" - "1:\n" - - SUM(%%mm4, %%mm5, %%mm0, %%mm1) - "2:\n" - SUM(%%mm0, %%mm1, %%mm4, %%mm5) - - "subl $2, %%ecx\n" - "jnz 1b\n" - - "movq %%mm6,%%mm0\n" - "psrlq $32, %%mm6\n" - "paddw %%mm6,%%mm0\n" - "movq %%mm0,%%mm6\n" - "psrlq $16, %%mm0\n" - "paddw %%mm6,%%mm0\n" - "movd %%mm0,%1\n" - : "+r" (pix), "=r"(tmp) - : "r" ((x86_reg)line_size) , "m" (h) - : "%ecx"); - return tmp & 0xFFFF; -} -#undef SUM - -static int vsad_intra16_mmxext(void *v, uint8_t *pix, uint8_t *dummy, - int line_size, int h) -{ - int tmp; - - av_assert2( (((int)pix) & 7) == 0); - av_assert2((line_size &7) ==0); - -#define SUM(in0, in1, out0, out1) \ - "movq (%0), " #out0 "\n"\ - "movq 8(%0), " #out1 "\n"\ - "add %2,%0\n"\ - "psadbw " #out0 ", " #in0 "\n"\ - "psadbw " #out1 ", " #in1 "\n"\ - "paddw " #in1 ", " #in0 "\n"\ - "paddw " #in0 ", %%mm6\n" - - __asm__ volatile ( - "movl %3,%%ecx\n" - "pxor %%mm6,%%mm6\n" - "pxor %%mm7,%%mm7\n" - "movq (%0),%%mm0\n" - "movq 8(%0),%%mm1\n" - "add %2,%0\n" - "jmp 2f\n" - "1:\n" - - SUM(%%mm4, %%mm5, %%mm0, %%mm1) - "2:\n" - SUM(%%mm0, %%mm1, %%mm4, %%mm5) - - "subl $2, %%ecx\n" - "jnz 1b\n" - - "movd %%mm6,%1\n" - : "+r" (pix), "=r"(tmp) - : "r" ((x86_reg)line_size) , "m" (h) - : "%ecx"); - return tmp; -} -#undef SUM - -static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { - int tmp; - - av_assert2( (((int)pix1) & 7) == 0); - av_assert2( (((int)pix2) & 7) == 0); - av_assert2((line_size &7) ==0); - -#define SUM(in0, in1, out0, out1) \ - "movq (%0),%%mm2\n"\ - "movq (%1)," #out0 "\n"\ - "movq 8(%0),%%mm3\n"\ - "movq 8(%1)," #out1 "\n"\ - "add %3,%0\n"\ - "add %3,%1\n"\ - "psubb " #out0 ", %%mm2\n"\ - "psubb " #out1 ", %%mm3\n"\ - 
"pxor %%mm7, %%mm2\n"\ - "pxor %%mm7, %%mm3\n"\ - "movq %%mm2, " #out0 "\n"\ - "movq %%mm3, " #out1 "\n"\ - "psubusb " #in0 ", %%mm2\n"\ - "psubusb " #in1 ", %%mm3\n"\ - "psubusb " #out0 ", " #in0 "\n"\ - "psubusb " #out1 ", " #in1 "\n"\ - "por %%mm2, " #in0 "\n"\ - "por %%mm3, " #in1 "\n"\ - "movq " #in0 ", %%mm2\n"\ - "movq " #in1 ", %%mm3\n"\ - "punpcklbw %%mm7, " #in0 "\n"\ - "punpcklbw %%mm7, " #in1 "\n"\ - "punpckhbw %%mm7, %%mm2\n"\ - "punpckhbw %%mm7, %%mm3\n"\ - "paddw " #in1 ", " #in0 "\n"\ - "paddw %%mm3, %%mm2\n"\ - "paddw %%mm2, " #in0 "\n"\ - "paddw " #in0 ", %%mm6\n" - - - __asm__ volatile ( - "movl %4,%%ecx\n" - "pxor %%mm6,%%mm6\n" - "pcmpeqw %%mm7,%%mm7\n" - "psllw $15, %%mm7\n" - "packsswb %%mm7, %%mm7\n" - "movq (%0),%%mm0\n" - "movq (%1),%%mm2\n" - "movq 8(%0),%%mm1\n" - "movq 8(%1),%%mm3\n" - "add %3,%0\n" - "add %3,%1\n" - "psubb %%mm2, %%mm0\n" - "psubb %%mm3, %%mm1\n" - "pxor %%mm7, %%mm0\n" - "pxor %%mm7, %%mm1\n" - "jmp 2f\n" - "1:\n" - - SUM(%%mm4, %%mm5, %%mm0, %%mm1) - "2:\n" - SUM(%%mm0, %%mm1, %%mm4, %%mm5) - - "subl $2, %%ecx\n" - "jnz 1b\n" - - "movq %%mm6,%%mm0\n" - "psrlq $32, %%mm6\n" - "paddw %%mm6,%%mm0\n" - "movq %%mm0,%%mm6\n" - "psrlq $16, %%mm0\n" - "paddw %%mm6,%%mm0\n" - "movd %%mm0,%2\n" - : "+r" (pix1), "+r" (pix2), "=r"(tmp) - : "r" ((x86_reg)line_size) , "m" (h) - : "%ecx"); - return tmp & 0x7FFF; -} -#undef SUM - -static int vsad16_mmxext(void *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int tmp; - - av_assert2( (((int)pix1) & 7) == 0); - av_assert2( (((int)pix2) & 7) == 0); - av_assert2((line_size &7) ==0); - -#define SUM(in0, in1, out0, out1) \ - "movq (%0)," #out0 "\n"\ - "movq (%1),%%mm2\n"\ - "movq 8(%0)," #out1 "\n"\ - "movq 8(%1),%%mm3\n"\ - "add %3,%0\n"\ - "add %3,%1\n"\ - "psubb %%mm2, " #out0 "\n"\ - "psubb %%mm3, " #out1 "\n"\ - "pxor %%mm7, " #out0 "\n"\ - "pxor %%mm7, " #out1 "\n"\ - "psadbw " #out0 ", " #in0 "\n"\ - "psadbw " #out1 ", " #in1 "\n"\ - "paddw " #in1 ", " #in0 "\n"\ - "paddw " #in0 ", %%mm6\n" - - __asm__ volatile ( - "movl %4,%%ecx\n" - "pxor %%mm6,%%mm6\n" - "pcmpeqw %%mm7,%%mm7\n" - "psllw $15, %%mm7\n" - "packsswb %%mm7, %%mm7\n" - "movq (%0),%%mm0\n" - "movq (%1),%%mm2\n" - "movq 8(%0),%%mm1\n" - "movq 8(%1),%%mm3\n" - "add %3,%0\n" - "add %3,%1\n" - "psubb %%mm2, %%mm0\n" - "psubb %%mm3, %%mm1\n" - "pxor %%mm7, %%mm0\n" - "pxor %%mm7, %%mm1\n" - "jmp 2f\n" - "1:\n" - - SUM(%%mm4, %%mm5, %%mm0, %%mm1) - "2:\n" - SUM(%%mm0, %%mm1, %%mm4, %%mm5) - - "subl $2, %%ecx\n" - "jnz 1b\n" - - "movd %%mm6,%2\n" - : "+r" (pix1), "+r" (pix2), "=r"(tmp) - : "r" ((x86_reg)line_size) , "m" (h) - : "%ecx"); - return tmp; -} -#undef SUM - -static void diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){ - x86_reg i=0; - if(w>=16) - __asm__ volatile( - "1: \n\t" - "movq (%2, %0), %%mm0 \n\t" - "movq (%1, %0), %%mm1 \n\t" - "psubb %%mm0, %%mm1 \n\t" - "movq %%mm1, (%3, %0) \n\t" - "movq 8(%2, %0), %%mm0 \n\t" - "movq 8(%1, %0), %%mm1 \n\t" - "psubb %%mm0, %%mm1 \n\t" - "movq %%mm1, 8(%3, %0) \n\t" - "add $16, %0 \n\t" - "cmp %4, %0 \n\t" - " jb 1b \n\t" - : "+r" (i) - : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15) - ); - for(; i<w; i++) - dst[i+0] = src1[i+0]-src2[i+0]; -} - -static void sub_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *src1, - const uint8_t *src2, int w, - int *left, int *left_top) -{ - x86_reg i=0; - uint8_t l, lt; - - __asm__ volatile( - "movq (%1, %0), %%mm0 \n\t" // LT - "psllq $8, %%mm0 \n\t" - "1: \n\t" - "movq (%1, %0), %%mm1 \n\t" // T - 
"movq -1(%2, %0), %%mm2 \n\t" // L - "movq (%2, %0), %%mm3 \n\t" // X - "movq %%mm2, %%mm4 \n\t" // L - "psubb %%mm0, %%mm2 \n\t" - "paddb %%mm1, %%mm2 \n\t" // L + T - LT - "movq %%mm4, %%mm5 \n\t" // L - "pmaxub %%mm1, %%mm4 \n\t" // max(T, L) - "pminub %%mm5, %%mm1 \n\t" // min(T, L) - "pminub %%mm2, %%mm4 \n\t" - "pmaxub %%mm1, %%mm4 \n\t" - "psubb %%mm4, %%mm3 \n\t" // dst - pred - "movq %%mm3, (%3, %0) \n\t" - "add $8, %0 \n\t" - "movq -1(%1, %0), %%mm0 \n\t" // LT - "cmp %4, %0 \n\t" - " jb 1b \n\t" - : "+r" (i) - : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w) - ); - - l= *left; - lt= *left_top; - - dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF); - - *left_top= src1[w-1]; - *left = src2[w-1]; -} - -#define MMABS_MMX(a,z)\ - "pxor " #z ", " #z " \n\t"\ - "pcmpgtw " #a ", " #z " \n\t"\ - "pxor " #z ", " #a " \n\t"\ - "psubw " #z ", " #a " \n\t" - -#define MMABS_MMXEXT(a, z) \ - "pxor " #z ", " #z " \n\t"\ - "psubw " #a ", " #z " \n\t"\ - "pmaxsw " #z ", " #a " \n\t" - -#define MMABS_SSSE3(a,z)\ - "pabsw " #a ", " #a " \n\t" - -#define MMABS_SUM(a,z, sum)\ - MMABS(a,z)\ - "paddusw " #a ", " #sum " \n\t" - -/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to - * about 100k on extreme inputs. But that's very unlikely to occur in natural video, - * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */ -#define HSUM_MMX(a, t, dst)\ - "movq "#a", "#t" \n\t"\ - "psrlq $32, "#a" \n\t"\ - "paddusw "#t", "#a" \n\t"\ - "movq "#a", "#t" \n\t"\ - "psrlq $16, "#a" \n\t"\ - "paddusw "#t", "#a" \n\t"\ - "movd "#a", "#dst" \n\t"\ - -#define HSUM_MMXEXT(a, t, dst) \ - "pshufw $0x0E, "#a", "#t" \n\t"\ - "paddusw "#t", "#a" \n\t"\ - "pshufw $0x01, "#a", "#t" \n\t"\ - "paddusw "#t", "#a" \n\t"\ - "movd "#a", "#dst" \n\t"\ - -#define HSUM_SSE2(a, t, dst)\ - "movhlps "#a", "#t" \n\t"\ - "paddusw "#t", "#a" \n\t"\ - "pshuflw $0x0E, "#a", "#t" \n\t"\ - "paddusw "#t", "#a" \n\t"\ - "pshuflw $0x01, "#a", "#t" \n\t"\ - "paddusw "#t", "#a" \n\t"\ - "movd "#a", "#dst" \n\t"\ - -#define DCT_SAD4(m,mm,o)\ - "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\ - "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\ - "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\ - "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\ - MMABS_SUM(mm##2, mm##6, mm##0)\ - MMABS_SUM(mm##3, mm##7, mm##1)\ - MMABS_SUM(mm##4, mm##6, mm##0)\ - MMABS_SUM(mm##5, mm##7, mm##1)\ - -#define DCT_SAD_MMX\ - "pxor %%mm0, %%mm0 \n\t"\ - "pxor %%mm1, %%mm1 \n\t"\ - DCT_SAD4(q, %%mm, 0)\ - DCT_SAD4(q, %%mm, 8)\ - DCT_SAD4(q, %%mm, 64)\ - DCT_SAD4(q, %%mm, 72)\ - "paddusw %%mm1, %%mm0 \n\t"\ - HSUM(%%mm0, %%mm1, %0) - -#define DCT_SAD_SSE2\ - "pxor %%xmm0, %%xmm0 \n\t"\ - "pxor %%xmm1, %%xmm1 \n\t"\ - DCT_SAD4(dqa, %%xmm, 0)\ - DCT_SAD4(dqa, %%xmm, 64)\ - "paddusw %%xmm1, %%xmm0 \n\t"\ - HSUM(%%xmm0, %%xmm1, %0) - -#define DCT_SAD_FUNC(cpu) \ -static int sum_abs_dctelem_##cpu(int16_t *block){\ - int sum;\ - __asm__ volatile(\ - DCT_SAD\ - :"=r"(sum)\ - :"r"(block)\ - );\ - return sum&0xFFFF;\ -} - -#define DCT_SAD DCT_SAD_MMX -#define HSUM(a,t,dst) HSUM_MMX(a,t,dst) -#define MMABS(a,z) MMABS_MMX(a,z) -DCT_SAD_FUNC(mmx) -#undef MMABS -#undef HSUM - -#define HSUM(a,t,dst) HSUM_MMXEXT(a,t,dst) -#define MMABS(a,z) MMABS_MMXEXT(a,z) -DCT_SAD_FUNC(mmxext) -#undef HSUM -#undef DCT_SAD - -#define DCT_SAD DCT_SAD_SSE2 -#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst) -DCT_SAD_FUNC(sse2) -#undef MMABS - -#if HAVE_SSSE3_INLINE -#define MMABS(a,z) MMABS_SSSE3(a,z) -DCT_SAD_FUNC(ssse3) -#undef MMABS -#endif -#undef HSUM -#undef DCT_SAD - -static int 
ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){ - int sum; - x86_reg i=size; - __asm__ volatile( - "pxor %%mm4, %%mm4 \n" - "1: \n" - "sub $8, %0 \n" - "movq (%2,%0), %%mm2 \n" - "movq (%3,%0,2), %%mm0 \n" - "movq 8(%3,%0,2), %%mm1 \n" - "punpckhbw %%mm2, %%mm3 \n" - "punpcklbw %%mm2, %%mm2 \n" - "psraw $8, %%mm3 \n" - "psraw $8, %%mm2 \n" - "psubw %%mm3, %%mm1 \n" - "psubw %%mm2, %%mm0 \n" - "pmaddwd %%mm1, %%mm1 \n" - "pmaddwd %%mm0, %%mm0 \n" - "paddd %%mm1, %%mm4 \n" - "paddd %%mm0, %%mm4 \n" - "jg 1b \n" - "movq %%mm4, %%mm3 \n" - "psrlq $32, %%mm3 \n" - "paddd %%mm3, %%mm4 \n" - "movd %%mm4, %1 \n" - :"+r"(i), "=r"(sum) - :"r"(pix1), "r"(pix2) - ); - return sum; -} - -#define PHADDD(a, t)\ - "movq "#a", "#t" \n\t"\ - "psrlq $32, "#a" \n\t"\ - "paddd "#t", "#a" \n\t" -/* - pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31] - pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31] - pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30] - */ -#define PMULHRW(x, y, s, o)\ - "pmulhw " #s ", "#x " \n\t"\ - "pmulhw " #s ", "#y " \n\t"\ - "paddw " #o ", "#x " \n\t"\ - "paddw " #o ", "#y " \n\t"\ - "psraw $1, "#x " \n\t"\ - "psraw $1, "#y " \n\t" -#define DEF(x) x ## _mmx -#define SET_RND MOVQ_WONE -#define SCALE_OFFSET 1 - -#include "dsputil_qns_template.c" - -#undef DEF -#undef SET_RND -#undef SCALE_OFFSET -#undef PMULHRW - -#define DEF(x) x ## _3dnow -#define SET_RND(x) -#define SCALE_OFFSET 0 -#define PMULHRW(x, y, s, o)\ - "pmulhrw " #s ", "#x " \n\t"\ - "pmulhrw " #s ", "#y " \n\t" - -#include "dsputil_qns_template.c" - -#undef DEF -#undef SET_RND -#undef SCALE_OFFSET -#undef PMULHRW - -#if HAVE_SSSE3_INLINE -#undef PHADDD -#define DEF(x) x ## _ssse3 -#define SET_RND(x) -#define SCALE_OFFSET -1 -#define PHADDD(a, t)\ - "pshufw $0x0E, "#a", "#t" \n\t"\ - "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */ -#define PMULHRW(x, y, s, o)\ - "pmulhrsw " #s ", "#x " \n\t"\ - "pmulhrsw " #s ", "#y " \n\t" - -#include "dsputil_qns_template.c" - -#undef DEF -#undef SET_RND -#undef SCALE_OFFSET -#undef PMULHRW -#undef PHADDD -#endif /* HAVE_SSSE3_INLINE */ - -#endif /* HAVE_INLINE_ASM */ - -int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h); - -#define hadamard_func(cpu) \ -int ff_hadamard8_diff_##cpu (void *s, uint8_t *src1, uint8_t *src2, \ - int stride, int h); \ -int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \ - int stride, int h); - -hadamard_func(mmx) -hadamard_func(mmxext) -hadamard_func(sse2) -hadamard_func(ssse3) - -av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx) -{ - int mm_flags = av_get_cpu_flags(); - int bit_depth = avctx->bits_per_raw_sample; - -#if HAVE_YASM - if (EXTERNAL_MMX(mm_flags)) { - if (bit_depth <= 8) - c->get_pixels = ff_get_pixels_mmx; - c->diff_pixels = ff_diff_pixels_mmx; - c->pix_sum = ff_pix_sum16_mmx; - - c->pix_norm1 = ff_pix_norm1_mmx; - } - if (EXTERNAL_SSE2(mm_flags)) - if (bit_depth <= 8) - c->get_pixels = ff_get_pixels_sse2; -#endif /* HAVE_YASM */ - -#if HAVE_INLINE_ASM - if (mm_flags & AV_CPU_FLAG_MMX) { - const int dct_algo = avctx->dct_algo; - if (avctx->bits_per_raw_sample <= 8 && - (dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX)) { - if(mm_flags & AV_CPU_FLAG_SSE2){ - c->fdct = ff_fdct_sse2; - } else if (mm_flags & AV_CPU_FLAG_MMXEXT) { - c->fdct = ff_fdct_mmxext; - }else{ - c->fdct = ff_fdct_mmx; - } - } - - - c->diff_bytes= diff_bytes_mmx; - c->sum_abs_dctelem= sum_abs_dctelem_mmx; - - c->sse[0] = sse16_mmx; - c->sse[1] = sse8_mmx; - 
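    /* Comparison metrics wired up here: sse[] is the plain sum of squared
     * errors; vsad[4] (intra) sums absolute differences between vertically
     * adjacent lines of a single source, while vsad[0] does the same on the
     * pix1 - pix2 difference and is only registered below when
     * CODEC_FLAG_BITEXACT is unset; nsse[] adds the absolute difference of
     * the two blocks' hf_noise* scores, weighted by avctx->nsse_weight,
     * on top of the SSE score. */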
c->vsad[4]= vsad_intra16_mmx; - - c->nsse[0] = nsse16_mmx; - c->nsse[1] = nsse8_mmx; - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->vsad[0] = vsad16_mmx; - } - - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->try_8x8basis= try_8x8basis_mmx; - } - c->add_8x8basis= add_8x8basis_mmx; - - c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx; - - if (mm_flags & AV_CPU_FLAG_MMXEXT) { - c->sum_abs_dctelem = sum_abs_dctelem_mmxext; - c->vsad[4] = vsad_intra16_mmxext; - - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->vsad[0] = vsad16_mmxext; - } - - c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_mmxext; - } - - if(mm_flags & AV_CPU_FLAG_SSE2){ - c->sum_abs_dctelem= sum_abs_dctelem_sse2; - } - -#if HAVE_SSSE3_INLINE - if(mm_flags & AV_CPU_FLAG_SSSE3){ - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->try_8x8basis= try_8x8basis_ssse3; - } - c->add_8x8basis= add_8x8basis_ssse3; - c->sum_abs_dctelem= sum_abs_dctelem_ssse3; - } -#endif - - if(mm_flags & AV_CPU_FLAG_3DNOW){ - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->try_8x8basis= try_8x8basis_3dnow; - } - c->add_8x8basis= add_8x8basis_3dnow; - } - } -#endif /* HAVE_INLINE_ASM */ - - if (EXTERNAL_MMX(mm_flags)) { - c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx; - c->hadamard8_diff[1] = ff_hadamard8_diff_mmx; - - if (EXTERNAL_MMXEXT(mm_flags)) { - c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext; - c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext; - } - - if (EXTERNAL_SSE2(mm_flags)) { - c->sse[0] = ff_sse16_sse2; - -#if HAVE_ALIGNED_STACK - c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2; - c->hadamard8_diff[1] = ff_hadamard8_diff_sse2; -#endif - } - - if (EXTERNAL_SSSE3(mm_flags) && HAVE_ALIGNED_STACK) { - c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3; - c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3; - } - } - - ff_dsputil_init_pix_mmx(c, avctx); -} diff --git a/ffmpeg1/libavcodec/x86/dwt_yasm.asm b/ffmpeg1/libavcodec/x86/dwt_yasm.asm deleted file mode 100644 index 5253abc..0000000 --- a/ffmpeg1/libavcodec/x86/dwt_yasm.asm +++ /dev/null @@ -1,306 +0,0 @@ -;****************************************************************************** -;* MMX optimized discrete wavelet trasnform -;* Copyright (c) 2010 David Conrad -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA -pw_1: times 8 dw 1 -pw_2: times 8 dw 2 -pw_8: times 8 dw 8 -pw_16: times 8 dw 16 -pw_1991: times 4 dw 9,-1 - -section .text - -; %1 -= (%2 + %3 + 2)>>2 %4 is pw_2 -%macro COMPOSE_53iL0 4 - paddw %2, %3 - paddw %2, %4 - psraw %2, 2 - psubw %1, %2 -%endm - -; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4 -; if %4 is supplied, %1 is loaded unaligned from there -; m2: clobbered m3: pw_8 m4: pw_1991 -%macro COMPOSE_DD97iH0 3-4 - paddw m0, %3 - paddw m1, %2 - psubw m0, m3 - mova m2, m1 - punpcklwd m1, m0 - punpckhwd m2, m0 - pmaddwd m1, m4 - pmaddwd m2, m4 -%if %0 > 3 - movu %1, %4 -%endif - psrad m1, 4 - psrad m2, 4 - packssdw m1, m2 - paddw m1, %1 -%endm - -%macro COMPOSE_VERTICAL 1 -; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, -; int width) -cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width - mova m2, [pw_2] -%if ARCH_X86_64 - mov widthd, widthd -%endif -.loop: - sub widthq, mmsize/2 - mova m1, [b0q+2*widthq] - mova m0, [b1q+2*widthq] - COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2 - mova [b1q+2*widthq], m0 - jg .loop - REP_RET - -; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, -; int width) -cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width - mova m1, [pw_1] -%if ARCH_X86_64 - mov widthd, widthd -%endif -.loop: - sub widthq, mmsize/2 - mova m0, [b0q+2*widthq] - paddw m0, [b2q+2*widthq] - paddw m0, m1 - psraw m0, 1 - paddw m0, [b1q+2*widthq] - mova [b1q+2*widthq], m0 - jg .loop - REP_RET - -; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, -; IDWTELEM *b3, IDWTELEM *b4, int width) -cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width - mova m3, [pw_8] - mova m4, [pw_1991] -%if ARCH_X86_64 - mov widthd, widthd -%endif -.loop: - sub widthq, mmsize/2 - mova m0, [b0q+2*widthq] - mova m1, [b1q+2*widthq] - COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq] - mova [b2q+2*widthq], m1 - jg .loop - REP_RET - -; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, -; IDWTELEM *b3, IDWTELEM *b4, int width) -cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width - mova m3, [pw_16] - mova m4, [pw_1991] -%if ARCH_X86_64 - mov widthd, widthd -%endif -.loop: - sub widthq, mmsize/2 - mova m0, [b0q+2*widthq] - mova m1, [b1q+2*widthq] - mova m5, [b2q+2*widthq] - paddw m0, [b4q+2*widthq] - paddw m1, [b3q+2*widthq] - psubw m0, m3 - mova m2, m1 - punpcklwd m1, m0 - punpckhwd m2, m0 - pmaddwd m1, m4 - pmaddwd m2, m4 - psrad m1, 5 - psrad m2, 5 - packssdw m1, m2 - psubw m5, m1 - mova [b2q+2*widthq], m5 - jg .loop - REP_RET - -; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width) -cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width - mova m3, [pw_1] -%if ARCH_X86_64 - mov widthd, widthd -%endif -.loop: - sub widthq, mmsize/2 - mova m1, [b1q+2*widthq] - mova m0, [b0q+2*widthq] - mova m2, m1 - paddw m1, m3 - psraw m1, 1 - psubw m0, m1 - mova [b0q+2*widthq], m0 - paddw m2, m0 - mova [b1q+2*widthq], m2 - jg .loop - REP_RET -%endmacro - -; extend the left and right edges of the tmp array by %1 and %2 respectively -%macro EDGE_EXTENSION 3 - mov %3, [tmpq] -%assign %%i 1 -%rep %1 - mov [tmpq-2*%%i], 
%3 - %assign %%i %%i+1 -%endrep - mov %3, [tmpq+2*w2q-2] -%assign %%i 0 -%rep %2 - mov [tmpq+2*w2q+2*%%i], %3 - %assign %%i %%i+1 -%endrep -%endmacro - - -%macro HAAR_HORIZONTAL 2 -; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width) -cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2 - mov w2d, wd - xor xq, xq - shr w2d, 1 - lea b_w2q, [bq+wq] - mova m3, [pw_1] -.lowpass_loop: - movu m1, [b_w2q + 2*xq] - mova m0, [bq + 2*xq] - paddw m1, m3 - psraw m1, 1 - psubw m0, m1 - mova [tmpq + 2*xq], m0 - add xq, mmsize/2 - cmp xq, w2q - jl .lowpass_loop - - xor xq, xq - and w2q, ~(mmsize/2 - 1) - cmp w2q, mmsize/2 - jl .end - -.highpass_loop: - movu m1, [b_w2q + 2*xq] - mova m0, [tmpq + 2*xq] - paddw m1, m0 - - ; shift and interleave -%if %2 == 1 - paddw m0, m3 - paddw m1, m3 - psraw m0, 1 - psraw m1, 1 -%endif - mova m2, m0 - punpcklwd m0, m1 - punpckhwd m2, m1 - mova [bq+4*xq], m0 - mova [bq+4*xq+mmsize], m2 - - add xq, mmsize/2 - cmp xq, w2q - jl .highpass_loop -.end: - REP_RET -%endmacro - - -INIT_XMM -; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width) -cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2 - mov w2d, wd - xor xd, xd - shr w2d, 1 - lea b_w2q, [bq+wq] - movu m4, [bq+wq] - mova m7, [pw_2] - pslldq m4, 14 -.lowpass_loop: - movu m1, [b_w2q + 2*xq] - mova m0, [bq + 2*xq] - mova m2, m1 - palignr m1, m4, 14 - mova m4, m2 - COMPOSE_53iL0 m0, m1, m2, m7 - mova [tmpq + 2*xq], m0 - add xd, mmsize/2 - cmp xd, w2d - jl .lowpass_loop - - EDGE_EXTENSION 1, 2, xw - ; leave the last up to 7 (sse) or 3 (mmx) values for C - xor xd, xd - and w2d, ~(mmsize/2 - 1) - cmp w2d, mmsize/2 - jl .end - - mova m7, [tmpq-mmsize] - mova m0, [tmpq] - mova m5, [pw_1] - mova m3, [pw_8] - mova m4, [pw_1991] -.highpass_loop: - mova m6, m0 - palignr m0, m7, 14 - mova m7, [tmpq + 2*xq + 16] - mova m1, m7 - mova m2, m7 - palignr m1, m6, 2 - palignr m2, m6, 4 - COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq] - mova m0, m7 - mova m7, m6 - - ; shift and interleave - paddw m6, m5 - paddw m1, m5 - psraw m6, 1 - psraw m1, 1 - mova m2, m6 - punpcklwd m6, m1 - punpckhwd m2, m1 - mova [bq+4*xq], m6 - mova [bq+4*xq+mmsize], m2 - - add xd, mmsize/2 - cmp xd, w2d - jl .highpass_loop -.end: - REP_RET - - -%if ARCH_X86_64 == 0 -INIT_MMX -COMPOSE_VERTICAL mmx -HAAR_HORIZONTAL mmx, 0 -HAAR_HORIZONTAL mmx, 1 -%endif - -;;INIT_XMM -INIT_XMM -COMPOSE_VERTICAL sse2 -HAAR_HORIZONTAL sse2, 0 -HAAR_HORIZONTAL sse2, 1 diff --git a/ffmpeg1/libavcodec/x86/fdct.c b/ffmpeg1/libavcodec/x86/fdct.c deleted file mode 100644 index d35245d..0000000 --- a/ffmpeg1/libavcodec/x86/fdct.c +++ /dev/null @@ -1,586 +0,0 @@ -/* - * MMX optimized forward DCT - * The gcc porting is Copyright (c) 2001 Fabrice Bellard. - * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> - * SSE2 optimization is Copyright (c) 2004 Denes Balatoni. - * - * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT - * - * Intel Application Note AP-922 - fast, precise implementation of DCT - * http://developer.intel.com/vtune/cbts/appnotes.htm - * - * Also of inspiration: - * a page about fdct at http://www.geocities.com/ssavekar/dct.htm - * Skal's fdct at http://skal.planet-d.net/coding/dct.html - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/common.h" -#include "libavutil/x86/asm.h" -#include "libavcodec/dct.h" - -#if HAVE_INLINE_ASM - -////////////////////////////////////////////////////////////////////// -// -// constants for the forward DCT -// ----------------------------- -// -// Be sure to check that your compiler is aligning all constants to QWORD -// (8-byte) memory boundaries! Otherwise the unaligned memory access will -// severely stall MMX execution. -// -////////////////////////////////////////////////////////////////////// - -#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy -#define SHIFT_FRW_COL BITS_FRW_ACC -#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) -#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) -//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) - -#define X8(x) x,x,x,x,x,x,x,x - -//concatenated table, for forward DCT transformation -DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = { - X8(13036), // tg * (2<<16) + 0.5 - X8(27146), // tg * (2<<16) + 0.5 - X8(-21746) // tg * (2<<16) + 0.5 -}; - -DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = { - X8(23170) //cos * (2<<15) + 0.5 -}; - -DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) }; - -DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW }; - -static const struct -{ - DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4]; -} fdct_r_row_sse2 = -{{ - RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW -}}; -//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; - -DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table - 16384, 16384, 22725, 19266, - 16384, 16384, 12873, 4520, - 21407, 8867, 19266, -4520, - -8867, -21407, -22725, -12873, - 16384, -16384, 12873, -22725, - -16384, 16384, 4520, 19266, - 8867, -21407, 4520, -12873, - 21407, -8867, 19266, -22725, - - 22725, 22725, 31521, 26722, - 22725, 22725, 17855, 6270, - 29692, 12299, 26722, -6270, - -12299, -29692, -31521, -17855, - 22725, -22725, 17855, -31521, - -22725, 22725, 6270, 26722, - 12299, -29692, 6270, -17855, - 29692, -12299, 26722, -31521, - - 21407, 21407, 29692, 25172, - 21407, 21407, 16819, 5906, - 27969, 11585, 25172, -5906, - -11585, -27969, -29692, -16819, - 21407, -21407, 16819, -29692, - -21407, 21407, 5906, 25172, - 11585, -27969, 5906, -16819, - 27969, -11585, 25172, -29692, - - 19266, 19266, 26722, 22654, - 19266, 19266, 15137, 5315, - 25172, 10426, 22654, -5315, - -10426, -25172, -26722, -15137, - 19266, -19266, 15137, -26722, - -19266, 19266, 5315, 22654, - 10426, -25172, 5315, -15137, - 25172, -10426, 22654, -26722, - - 16384, 16384, 22725, 19266, - 16384, 16384, 12873, 4520, - 21407, 8867, 19266, -4520, - -8867, -21407, -22725, -12873, - 16384, -16384, 12873, 
-22725, - -16384, 16384, 4520, 19266, - 8867, -21407, 4520, -12873, - 21407, -8867, 19266, -22725, - - 19266, 19266, 26722, 22654, - 19266, 19266, 15137, 5315, - 25172, 10426, 22654, -5315, - -10426, -25172, -26722, -15137, - 19266, -19266, 15137, -26722, - -19266, 19266, 5315, 22654, - 10426, -25172, 5315, -15137, - 25172, -10426, 22654, -26722, - - 21407, 21407, 29692, 25172, - 21407, 21407, 16819, 5906, - 27969, 11585, 25172, -5906, - -11585, -27969, -29692, -16819, - 21407, -21407, 16819, -29692, - -21407, 21407, 5906, 25172, - 11585, -27969, 5906, -16819, - 27969, -11585, 25172, -29692, - - 22725, 22725, 31521, 26722, - 22725, 22725, 17855, 6270, - 29692, 12299, 26722, -6270, - -12299, -29692, -31521, -17855, - 22725, -22725, 17855, -31521, - -22725, 22725, 6270, 26722, - 12299, -29692, 6270, -17855, - 29692, -12299, 26722, -31521, -}; - -static const struct -{ - DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256]; -} tab_frw_01234567_sse2 = -{{ -//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table -#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ - C4, C4, C5, C7, C2, C6, C3, -C7, \ - -C4, C4, C7, C3, C6, -C2, C7, -C5, \ - C4, -C4, C5, -C1, C2, -C6, C3, -C1, -// c1..c7 * cos(pi/4) * 2^15 -#define C1 22725 -#define C2 21407 -#define C3 19266 -#define C4 16384 -#define C5 12873 -#define C6 8867 -#define C7 4520 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 31521 -#define C2 29692 -#define C3 26722 -#define C4 22725 -#define C5 17855 -#define C6 12299 -#define C7 6270 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 29692 -#define C2 27969 -#define C3 25172 -#define C4 21407 -#define C5 16819 -#define C6 11585 -#define C7 5906 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 26722 -#define C2 25172 -#define C3 22654 -#define C4 19266 -#define C5 15137 -#define C6 10426 -#define C7 5315 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 22725 -#define C2 21407 -#define C3 19266 -#define C4 16384 -#define C5 12873 -#define C6 8867 -#define C7 4520 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 26722 -#define C2 25172 -#define C3 22654 -#define C4 19266 -#define C5 15137 -#define C6 10426 -#define C7 5315 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 29692 -#define C2 27969 -#define C3 25172 -#define C4 21407 -#define C5 16819 -#define C6 11585 -#define C7 5906 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 31521 -#define C2 29692 -#define C3 26722 -#define C4 22725 -#define C5 17855 -#define C6 12299 -#define C7 6270 -TABLE_SSE2 -}}; - -#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long - -#define FDCT_COL(cpu, mm, mov)\ -static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\ -{\ - __asm__ volatile (\ - #mov" 16(%0), %%"#mm"0 \n\t" \ - #mov" 96(%0), %%"#mm"1 \n\t" \ - #mov" %%"#mm"0, %%"#mm"2 \n\t" \ - #mov" 32(%0), %%"#mm"3 \n\t" \ - "paddsw %%"#mm"1, %%"#mm"0 \n\t" \ - #mov" 80(%0), %%"#mm"4 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \ - #mov" (%0), %%"#mm"5 \n\t" \ - "paddsw %%"#mm"3, %%"#mm"4 \n\t" \ - "paddsw 112(%0), %%"#mm"5 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \ - #mov" 
%%"#mm"0, %%"#mm"6 \n\t" \ - "psubsw %%"#mm"1, %%"#mm"2 \n\t" \ - #mov" 16(%1), %%"#mm"1 \n\t" \ - "psubsw %%"#mm"4, %%"#mm"0 \n\t" \ - #mov" 48(%0), %%"#mm"7 \n\t" \ - "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \ - "paddsw 64(%0), %%"#mm"7 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \ - "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \ - #mov" %%"#mm"5, %%"#mm"4 \n\t" \ - "psubsw %%"#mm"7, %%"#mm"5 \n\t" \ - "paddsw %%"#mm"5, %%"#mm"1 \n\t" \ - "paddsw %%"#mm"7, %%"#mm"4 \n\t" \ - "por (%2), %%"#mm"1 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \ - "pmulhw 16(%1), %%"#mm"5 \n\t" \ - #mov" %%"#mm"4, %%"#mm"7 \n\t" \ - "psubsw 80(%0), %%"#mm"3 \n\t" \ - "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ - #mov" %%"#mm"1, 32(%3) \n\t" \ - "paddsw %%"#mm"6, %%"#mm"7 \n\t" \ - #mov" 48(%0), %%"#mm"1 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \ - "psubsw 64(%0), %%"#mm"1 \n\t" \ - #mov" %%"#mm"2, %%"#mm"6 \n\t" \ - #mov" %%"#mm"4, 64(%3) \n\t" \ - "paddsw %%"#mm"3, %%"#mm"2 \n\t" \ - "pmulhw (%4), %%"#mm"2 \n\t" \ - "psubsw %%"#mm"3, %%"#mm"6 \n\t" \ - "pmulhw (%4), %%"#mm"6 \n\t" \ - "psubsw %%"#mm"0, %%"#mm"5 \n\t" \ - "por (%2), %%"#mm"5 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \ - "por (%2), %%"#mm"2 \n\t" \ - #mov" %%"#mm"1, %%"#mm"4 \n\t" \ - #mov" (%0), %%"#mm"3 \n\t" \ - "paddsw %%"#mm"6, %%"#mm"1 \n\t" \ - "psubsw 112(%0), %%"#mm"3 \n\t" \ - "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ - #mov" (%1), %%"#mm"0 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \ - #mov" 32(%1), %%"#mm"6 \n\t" \ - "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \ - #mov" %%"#mm"7, (%3) \n\t" \ - "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \ - #mov" %%"#mm"5, 96(%3) \n\t" \ - #mov" %%"#mm"3, %%"#mm"7 \n\t" \ - #mov" 32(%1), %%"#mm"5 \n\t" \ - "psubsw %%"#mm"2, %%"#mm"7 \n\t" \ - "paddsw %%"#mm"2, %%"#mm"3 \n\t" \ - "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \ - "paddsw %%"#mm"3, %%"#mm"0 \n\t" \ - "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ - "pmulhw (%1), %%"#mm"3 \n\t" \ - "por (%2), %%"#mm"0 \n\t" \ - "paddsw %%"#mm"7, %%"#mm"5 \n\t" \ - "psubsw %%"#mm"6, %%"#mm"7 \n\t" \ - #mov" %%"#mm"0, 16(%3) \n\t" \ - "paddsw %%"#mm"4, %%"#mm"5 \n\t" \ - #mov" %%"#mm"7, 48(%3) \n\t" \ - "psubsw %%"#mm"1, %%"#mm"3 \n\t" \ - #mov" %%"#mm"5, 80(%3) \n\t" \ - #mov" %%"#mm"3, 112(%3) \n\t" \ - : \ - : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \ - "r" (out + offset), "r" (ocos_4_16)); \ -} - -FDCT_COL(mmx, mm, movq) -FDCT_COL(sse2, xmm, movdqa) - -static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) -{ - __asm__ volatile( -#define FDCT_ROW_SSE2_H1(i,t) \ - "movq " #i "(%0), %%xmm2 \n\t" \ - "movq " #i "+8(%0), %%xmm0 \n\t" \ - "movdqa " #t "+32(%1), %%xmm3 \n\t" \ - "movdqa " #t "+48(%1), %%xmm7 \n\t" \ - "movdqa " #t "(%1), %%xmm4 \n\t" \ - "movdqa " #t "+16(%1), %%xmm5 \n\t" - -#define FDCT_ROW_SSE2_H2(i,t) \ - "movq " #i "(%0), %%xmm2 \n\t" \ - "movq " #i "+8(%0), %%xmm0 \n\t" \ - "movdqa " #t "+32(%1), %%xmm3 \n\t" \ - "movdqa " #t "+48(%1), %%xmm7 \n\t" - -#define FDCT_ROW_SSE2(i) \ - "movq %%xmm2, %%xmm1 \n\t" \ - "pshuflw $27, %%xmm0, %%xmm0 \n\t" \ - "paddsw %%xmm0, %%xmm1 \n\t" \ - "psubsw %%xmm0, %%xmm2 \n\t" \ - "punpckldq %%xmm2, %%xmm1 \n\t" \ - "pshufd $78, %%xmm1, %%xmm2 \n\t" \ - "pmaddwd %%xmm2, %%xmm3 \n\t" \ - "pmaddwd %%xmm1, %%xmm7 \n\t" \ - "pmaddwd %%xmm5, %%xmm2 \n\t" \ - "pmaddwd %%xmm4, %%xmm1 \n\t" \ - "paddd %%xmm7, %%xmm3 \n\t" \ - "paddd %%xmm2, %%xmm1 \n\t" \ - "paddd %%xmm6, %%xmm3 \n\t" \ - "paddd %%xmm6, %%xmm1 \n\t" \ - "psrad %3, %%xmm3 \n\t" \ - 
"psrad %3, %%xmm1 \n\t" \ - "packssdw %%xmm3, %%xmm1 \n\t" \ - "movdqa %%xmm1, " #i "(%4) \n\t" - - "movdqa (%2), %%xmm6 \n\t" - FDCT_ROW_SSE2_H1(0,0) - FDCT_ROW_SSE2(0) - FDCT_ROW_SSE2_H2(64,0) - FDCT_ROW_SSE2(64) - - FDCT_ROW_SSE2_H1(16,64) - FDCT_ROW_SSE2(16) - FDCT_ROW_SSE2_H2(112,64) - FDCT_ROW_SSE2(112) - - FDCT_ROW_SSE2_H1(32,128) - FDCT_ROW_SSE2(32) - FDCT_ROW_SSE2_H2(96,128) - FDCT_ROW_SSE2(96) - - FDCT_ROW_SSE2_H1(48,192) - FDCT_ROW_SSE2(48) - FDCT_ROW_SSE2_H2(80,192) - FDCT_ROW_SSE2(80) - : - : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), - "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) - XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7") - ); -} - -static av_always_inline void fdct_row_mmxext(const int16_t *in, int16_t *out, - const int16_t *table) -{ - __asm__ volatile ( - "pshufw $0x1B, 8(%0), %%mm5 \n\t" - "movq (%0), %%mm0 \n\t" - "movq %%mm0, %%mm1 \n\t" - "paddsw %%mm5, %%mm0 \n\t" - "psubsw %%mm5, %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "punpckldq %%mm1, %%mm0 \n\t" - "punpckhdq %%mm1, %%mm2 \n\t" - "movq (%1), %%mm1 \n\t" - "movq 8(%1), %%mm3 \n\t" - "movq 16(%1), %%mm4 \n\t" - "movq 24(%1), %%mm5 \n\t" - "movq 32(%1), %%mm6 \n\t" - "movq 40(%1), %%mm7 \n\t" - "pmaddwd %%mm0, %%mm1 \n\t" - "pmaddwd %%mm2, %%mm3 \n\t" - "pmaddwd %%mm0, %%mm4 \n\t" - "pmaddwd %%mm2, %%mm5 \n\t" - "pmaddwd %%mm0, %%mm6 \n\t" - "pmaddwd %%mm2, %%mm7 \n\t" - "pmaddwd 48(%1), %%mm0 \n\t" - "pmaddwd 56(%1), %%mm2 \n\t" - "paddd %%mm1, %%mm3 \n\t" - "paddd %%mm4, %%mm5 \n\t" - "paddd %%mm6, %%mm7 \n\t" - "paddd %%mm0, %%mm2 \n\t" - "movq (%2), %%mm0 \n\t" - "paddd %%mm0, %%mm3 \n\t" - "paddd %%mm0, %%mm5 \n\t" - "paddd %%mm0, %%mm7 \n\t" - "paddd %%mm0, %%mm2 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" - "packssdw %%mm5, %%mm3 \n\t" - "packssdw %%mm2, %%mm7 \n\t" - "movq %%mm3, (%3) \n\t" - "movq %%mm7, 8(%3) \n\t" - : - : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out)); -} - -static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table) -{ - //FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...) 
- __asm__ volatile( - "movd 12(%0), %%mm1 \n\t" - "punpcklwd 8(%0), %%mm1 \n\t" - "movq %%mm1, %%mm2 \n\t" - "psrlq $0x20, %%mm1 \n\t" - "movq 0(%0), %%mm0 \n\t" - "punpcklwd %%mm2, %%mm1 \n\t" - "movq %%mm0, %%mm5 \n\t" - "paddsw %%mm1, %%mm0 \n\t" - "psubsw %%mm1, %%mm5 \n\t" - "movq %%mm0, %%mm2 \n\t" - "punpckldq %%mm5, %%mm0 \n\t" - "punpckhdq %%mm5, %%mm2 \n\t" - "movq 0(%1), %%mm1 \n\t" - "movq 8(%1), %%mm3 \n\t" - "movq 16(%1), %%mm4 \n\t" - "movq 24(%1), %%mm5 \n\t" - "movq 32(%1), %%mm6 \n\t" - "movq 40(%1), %%mm7 \n\t" - "pmaddwd %%mm0, %%mm1 \n\t" - "pmaddwd %%mm2, %%mm3 \n\t" - "pmaddwd %%mm0, %%mm4 \n\t" - "pmaddwd %%mm2, %%mm5 \n\t" - "pmaddwd %%mm0, %%mm6 \n\t" - "pmaddwd %%mm2, %%mm7 \n\t" - "pmaddwd 48(%1), %%mm0 \n\t" - "pmaddwd 56(%1), %%mm2 \n\t" - "paddd %%mm1, %%mm3 \n\t" - "paddd %%mm4, %%mm5 \n\t" - "paddd %%mm6, %%mm7 \n\t" - "paddd %%mm0, %%mm2 \n\t" - "movq (%2), %%mm0 \n\t" - "paddd %%mm0, %%mm3 \n\t" - "paddd %%mm0, %%mm5 \n\t" - "paddd %%mm0, %%mm7 \n\t" - "paddd %%mm0, %%mm2 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" - "packssdw %%mm5, %%mm3 \n\t" - "packssdw %%mm2, %%mm7 \n\t" - "movq %%mm3, 0(%3) \n\t" - "movq %%mm7, 8(%3) \n\t" - : - : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out)); -} - -void ff_fdct_mmx(int16_t *block) -{ - DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; - int16_t * block1= (int16_t*)align_tmp; - const int16_t *table= tab_frw_01234567; - int i; - - fdct_col_mmx(block, block1, 0); - fdct_col_mmx(block, block1, 4); - - for(i=8;i>0;i--) { - fdct_row_mmx(block1, block, table); - block1 += 8; - table += 32; - block += 8; - } -} - -void ff_fdct_mmxext(int16_t *block) -{ - DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; - int16_t *block1= (int16_t*)align_tmp; - const int16_t *table= tab_frw_01234567; - int i; - - fdct_col_mmx(block, block1, 0); - fdct_col_mmx(block, block1, 4); - - for(i=8;i>0;i--) { - fdct_row_mmxext(block1, block, table); - block1 += 8; - table += 32; - block += 8; - } -} - -void ff_fdct_sse2(int16_t *block) -{ - DECLARE_ALIGNED(16, int64_t, align_tmp)[16]; - int16_t * const block1= (int16_t*)align_tmp; - - fdct_col_sse2(block, block1, 0); - fdct_row_sse2(block1, block); -} - -#endif /* HAVE_INLINE_ASM */ diff --git a/ffmpeg1/libavcodec/x86/fft.asm b/ffmpeg1/libavcodec/x86/fft.asm deleted file mode 100644 index 5071741..0000000 --- a/ffmpeg1/libavcodec/x86/fft.asm +++ /dev/null @@ -1,1093 +0,0 @@ -;****************************************************************************** -;* FFT transform with SSE/3DNow optimizations -;* Copyright (c) 2008 Loren Merritt -;* Copyright (c) 2011 Vitor Sessak -;* -;* This algorithm (though not any of the implementation details) is -;* based on libdjbfft by D. J. Bernstein. -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -; These functions are not individually interchangeable with the C versions. -; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results -; in blocks as conventient to the vector size. -; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) - -%include "libavutil/x86/x86util.asm" - -%if ARCH_X86_64 -%define pointer resq -%else -%define pointer resd -%endif - -SECTION_RODATA - -struc FFTContext - .nbits: resd 1 - .reverse: resd 1 - .revtab: pointer 1 - .tmpbuf: pointer 1 - .mdctsize: resd 1 - .mdctbits: resd 1 - .tcos: pointer 1 - .tsin: pointer 1 - .fftperm: pointer 1 - .fftcalc: pointer 1 - .imdctcalc:pointer 1 - .imdcthalf:pointer 1 -endstruc - -%define M_SQRT1_2 0.70710678118654752440 -%define M_COS_PI_1_8 0.923879532511287 -%define M_COS_PI_3_8 0.38268343236509 - -align 32 -ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8 -ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8 - -ps_root2: times 8 dd M_SQRT1_2 -ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 -ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0 - -perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01 -perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03 -ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 -ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31 -ps_m1m1m1m1: times 4 dd 1<<31 -ps_m1p1: dd 1<<31, 0 - -%assign i 16 -%rep 13 -cextern cos_ %+ i -%assign i i<<1 -%endrep - -%if ARCH_X86_64 - %define pointer dq -%else - %define pointer dd -%endif - -%macro IF0 1+ -%endmacro -%macro IF1 1+ - %1 -%endmacro - -SECTION_TEXT - -%macro T2_3DNOW 4 ; z0, z1, mem0, mem1 - mova %1, %3 - mova %2, %1 - pfadd %1, %4 - pfsub %2, %4 -%endmacro - -%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1 - mova %5, %3 - pfsub %3, %4 - pfadd %5, %4 ; {t6,t5} - pxor %3, [ps_m1p1] ; {t8,t7} - mova %6, %1 - movd [r0+12], %3 - punpckhdq %3, [r0+8] - pfadd %1, %5 ; {r0,i0} - pfsub %6, %5 ; {r2,i2} - mova %4, %2 - pfadd %2, %3 ; {r1,i1} - pfsub %4, %3 ; {r3,i3} - SWAP %3, %6 -%endmacro - -; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6} -; %2 = {r1,i1,r3,i3,r5,i5,r7,i7} -; %3, %4, %5 tmp -; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3} -; %2 = {r4,r5,r6,r7,i4,i5,i6,i7} -%macro T8_AVX 5 - vsubps %5, %1, %2 ; v = %1 - %2 - vaddps %3, %1, %2 ; w = %1 + %2 - vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1 - vpermilps %2, %2, [perm1] - vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6} - vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5} - vsubps %4, %5, %1 ; s = r - q - vaddps %1, %5, %1 ; u = r + q - vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8} - vshufps %5, %4, %1, 0xbb - vshufps %3, %4, %1, 0xee - vperm2f128 %3, %3, %5, 0x13 - vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1} - vshufps %2, %1, %4, 0xdd - vshufps %1, %1, %4, 0x88 - vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4} - vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7} - vsubps %5, %1, %3 - vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8} - vsubps %2, %4, %1 ; %2 = v - w - vaddps %1, %4, %1 ; %1 = v + w 
-%endmacro - -; In SSE mode do one fft4 transforms -; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3} -; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} -; -; In AVX mode do two fft4 transforms -; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7} -; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7} -%macro T4_SSE 3 - subps %3, %1, %2 ; {t3,t4,-t8,t7} - addps %1, %1, %2 ; {t1,t2,t6,t5} - xorps %3, %3, [ps_p1p1m1p1] - shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8} - shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4} - subps %3, %1, %2 ; {r2,i2,r3,i3} - addps %1, %1, %2 ; {r0,i0,r1,i1} - shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3} - shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3} -%endmacro - -; In SSE mode do one FFT8 -; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7} -; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %1={r4,r5,r6,r7} %2={i4,i5,i6,i7} -; -; In AVX mode do two FFT8 -; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11} -; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15} -; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11} -; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15} -%macro T8_SSE 6 - addps %6, %3, %4 ; {t1,t2,t3,t4} - subps %3, %3, %4 ; {r5,i5,r7,i7} - shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7} - mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} - mulps %4, %4, [ps_root2] - addps %3, %3, %4 ; {t8,t7,ta,t9} - shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta} - shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8} - subps %3, %6, %4 ; {t6,t5,tc,tb} - addps %6, %6, %4 ; {t1,t2,t9,ta} - shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc} - shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb} - subps %3, %1, %6 ; {r4,r5,r6,r7} - addps %1, %1, %6 ; {r0,r1,r2,r3} - subps %4, %2, %5 ; {i4,i5,i6,i7} - addps %2, %2, %5 ; {i0,i1,i2,i3} -%endmacro - -; scheduled for cpu-bound sizes -%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim -IF%1 mova m4, Z(4) -IF%1 mova m5, Z(5) - mova m0, %2 ; wre - mova m1, %3 ; wim - mulps m2, m4, m0 ; r2*wre -IF%1 mova m6, Z2(6) - mulps m3, m5, m1 ; i2*wim -IF%1 mova m7, Z2(7) - mulps m4, m4, m1 ; r2*wim - mulps m5, m5, m0 ; i2*wre - addps m2, m2, m3 ; r2*wre + i2*wim - mulps m3, m1, m7 ; i3*wim - subps m5, m5, m4 ; i2*wre - r2*wim - mulps m1, m1, m6 ; r3*wim - mulps m4, m0, m6 ; r3*wre - mulps m0, m0, m7 ; i3*wre - subps m4, m4, m3 ; r3*wre - i3*wim - mova m3, Z(0) - addps m0, m0, m1 ; i3*wre + r3*wim - subps m1, m4, m2 ; t3 - addps m4, m4, m2 ; t5 - subps m3, m3, m4 ; r2 - addps m4, m4, Z(0) ; r0 - mova m6, Z(2) - mova Z(4), m3 - mova Z(0), m4 - subps m3, m5, m0 ; t4 - subps m4, m6, m3 ; r3 - addps m3, m3, m6 ; r1 - mova Z2(6), m4 - mova Z(2), m3 - mova m2, Z(3) - addps m3, m5, m0 ; t6 - subps m2, m2, m1 ; i3 - mova m7, Z(1) - addps m1, m1, Z(3) ; i1 - mova Z2(7), m2 - mova Z(3), m1 - subps m4, m7, m3 ; i2 - addps m3, m3, m7 ; i0 - mova Z(5), m4 - mova Z(1), m3 -%endmacro - -; scheduled to avoid store->load aliasing -%macro PASS_BIG 1 ; (!interleave) - mova m4, Z(4) ; r2 - mova m5, Z(5) ; i2 - mova m0, [wq] ; wre - mova m1, [wq+o1q] ; wim - mulps m2, m4, m0 ; r2*wre - mova m6, Z2(6) ; r3 - mulps m3, m5, m1 ; i2*wim - mova m7, Z2(7) ; i3 - mulps m4, m4, m1 ; r2*wim - mulps m5, m5, m0 ; i2*wre - addps m2, m2, m3 ; r2*wre + i2*wim - mulps m3, m1, m7 ; i3*wim - mulps m1, m1, m6 ; r3*wim - subps m5, m5, m4 ; i2*wre - r2*wim - mulps m4, m0, m6 ; r3*wre - mulps m0, m0, m7 ; i3*wre - subps m4, m4, m3 ; r3*wre - i3*wim - mova m3, Z(0) - addps m0, m0, m1 ; i3*wre + r3*wim - subps m1, m4, m2 ; t3 - addps m4, m4, m2 ; t5 - subps m3, m3, m4 ; r2 - addps 
m4, m4, Z(0) ; r0 - mova m6, Z(2) - mova Z(4), m3 - mova Z(0), m4 - subps m3, m5, m0 ; t4 - subps m4, m6, m3 ; r3 - addps m3, m3, m6 ; r1 -IF%1 mova Z2(6), m4 -IF%1 mova Z(2), m3 - mova m2, Z(3) - addps m5, m5, m0 ; t6 - subps m2, m2, m1 ; i3 - mova m7, Z(1) - addps m1, m1, Z(3) ; i1 -IF%1 mova Z2(7), m2 -IF%1 mova Z(3), m1 - subps m6, m7, m5 ; i2 - addps m5, m5, m7 ; i0 -IF%1 mova Z(5), m6 -IF%1 mova Z(1), m5 -%if %1==0 - INTERL m1, m3, m7, Z, 2 - INTERL m2, m4, m0, Z2, 6 - - mova m1, Z(0) - mova m2, Z(4) - - INTERL m5, m1, m3, Z, 0 - INTERL m6, m2, m7, Z, 4 -%endif -%endmacro - -%macro PUNPCK 3 - mova %3, %1 - punpckldq %1, %2 - punpckhdq %3, %2 -%endmacro - -%define Z(x) [r0+mmsize*x] -%define Z2(x) [r0+mmsize*x] -%define ZH(x) [r0+mmsize*x+mmsize/2] - -INIT_YMM avx - -%if HAVE_AVX_EXTERNAL -align 16 -fft8_avx: - mova m0, Z(0) - mova m1, Z(1) - T8_AVX m0, m1, m2, m3, m4 - mova Z(0), m0 - mova Z(1), m1 - ret - - -align 16 -fft16_avx: - mova m2, Z(2) - mova m3, Z(3) - T4_SSE m2, m3, m7 - - mova m0, Z(0) - mova m1, Z(1) - T8_AVX m0, m1, m4, m5, m7 - - mova m4, [ps_cos16_1] - mova m5, [ps_cos16_2] - vmulps m6, m2, m4 - vmulps m7, m3, m5 - vaddps m7, m7, m6 - vmulps m2, m2, m5 - vmulps m3, m3, m4 - vsubps m3, m3, m2 - vblendps m2, m7, m3, 0xf0 - vperm2f128 m3, m7, m3, 0x21 - vaddps m4, m2, m3 - vsubps m2, m3, m2 - vperm2f128 m2, m2, m2, 0x01 - vsubps m3, m1, m2 - vaddps m1, m1, m2 - vsubps m5, m0, m4 - vaddps m0, m0, m4 - vextractf128 Z(0), m0, 0 - vextractf128 ZH(0), m1, 0 - vextractf128 Z(1), m0, 1 - vextractf128 ZH(1), m1, 1 - vextractf128 Z(2), m5, 0 - vextractf128 ZH(2), m3, 0 - vextractf128 Z(3), m5, 1 - vextractf128 ZH(3), m3, 1 - ret - -align 16 -fft32_avx: - call fft16_avx - - mova m0, Z(4) - mova m1, Z(5) - - T4_SSE m0, m1, m4 - - mova m2, Z(6) - mova m3, Z(7) - - T8_SSE m0, m1, m2, m3, m4, m6 - ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11} - ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15} - - vperm2f128 m4, m0, m2, 0x20 - vperm2f128 m5, m1, m3, 0x20 - vperm2f128 m6, m0, m2, 0x31 - vperm2f128 m7, m1, m3, 0x31 - - PASS_SMALL 0, [cos_32], [cos_32+32] - - ret - -fft32_interleave_avx: - call fft32_avx - mov r2d, 32 -.deint_loop: - mova m2, Z(0) - mova m3, Z(1) - vunpcklps m0, m2, m3 - vunpckhps m1, m2, m3 - vextractf128 Z(0), m0, 0 - vextractf128 ZH(0), m1, 0 - vextractf128 Z(1), m0, 1 - vextractf128 ZH(1), m1, 1 - add r0, mmsize*2 - sub r2d, mmsize/4 - jg .deint_loop - ret - -%endif - -INIT_XMM sse - -align 16 -fft4_avx: -fft4_sse: - mova m0, Z(0) - mova m1, Z(1) - T4_SSE m0, m1, m2 - mova Z(0), m0 - mova Z(1), m1 - ret - -align 16 -fft8_sse: - mova m0, Z(0) - mova m1, Z(1) - T4_SSE m0, m1, m2 - mova m2, Z(2) - mova m3, Z(3) - T8_SSE m0, m1, m2, m3, m4, m5 - mova Z(0), m0 - mova Z(1), m1 - mova Z(2), m2 - mova Z(3), m3 - ret - -align 16 -fft16_sse: - mova m0, Z(0) - mova m1, Z(1) - T4_SSE m0, m1, m2 - mova m2, Z(2) - mova m3, Z(3) - T8_SSE m0, m1, m2, m3, m4, m5 - mova m4, Z(4) - mova m5, Z(5) - mova Z(0), m0 - mova Z(1), m1 - mova Z(2), m2 - mova Z(3), m3 - T4_SSE m4, m5, m6 - mova m6, Z2(6) - mova m7, Z2(7) - T4_SSE m6, m7, m0 - PASS_SMALL 0, [cos_16], [cos_16+16] - ret - - -%macro FFT48_3DNOW 0 -align 16 -fft4 %+ SUFFIX: - T2_3DNOW m0, m1, Z(0), Z(1) - mova m2, Z(2) - mova m3, Z(3) - T4_3DNOW m0, m1, m2, m3, m4, m5 - PUNPCK m0, m1, m4 - PUNPCK m2, m3, m5 - mova Z(0), m0 - mova Z(1), m4 - mova Z(2), m2 - mova Z(3), m5 - ret - -align 16 -fft8 %+ SUFFIX: - T2_3DNOW m0, m1, Z(0), Z(1) - mova m2, Z(2) - mova m3, Z(3) - T4_3DNOW m0, m1, m2, m3, 
m4, m5 - mova Z(0), m0 - mova Z(2), m2 - T2_3DNOW m4, m5, Z(4), Z(5) - T2_3DNOW m6, m7, Z2(6), Z2(7) - PSWAPD m0, m5 - PSWAPD m2, m7 - pxor m0, [ps_m1p1] - pxor m2, [ps_m1p1] - pfsub m5, m0 - pfadd m7, m2 - pfmul m5, [ps_root2] - pfmul m7, [ps_root2] - T4_3DNOW m1, m3, m5, m7, m0, m2 - mova Z(5), m5 - mova Z2(7), m7 - mova m0, Z(0) - mova m2, Z(2) - T4_3DNOW m0, m2, m4, m6, m5, m7 - PUNPCK m0, m1, m5 - PUNPCK m2, m3, m7 - mova Z(0), m0 - mova Z(1), m5 - mova Z(2), m2 - mova Z(3), m7 - PUNPCK m4, Z(5), m5 - PUNPCK m6, Z2(7), m7 - mova Z(4), m4 - mova Z(5), m5 - mova Z2(6), m6 - mova Z2(7), m7 - ret -%endmacro - -%if ARCH_X86_32 -INIT_MMX 3dnowext -FFT48_3DNOW - -INIT_MMX 3dnow -FFT48_3DNOW -%endif - -%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)] -%define Z2(x) [zcq + o3q + mmsize*(x&1)] -%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2] -%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2] - -%macro DECL_PASS 2+ ; name, payload -align 16 -%1: -DEFINE_ARGS zc, w, n, o1, o3 - lea o3q, [nq*3] - lea o1q, [nq*8] - shl o3q, 4 -.loop: - %2 - add zcq, mmsize*2 - add wq, mmsize - sub nd, mmsize/8 - jg .loop - rep ret -%endmacro - -%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs - lea r2, [dispatch_tab%1] - mov r2, [r2 + (%2q-2)*gprsize] -%ifdef PIC - lea r3, [$$] - add r2, r3 -%endif - call r2 -%endmacro ; FFT_DISPATCH - -INIT_YMM avx - -%if HAVE_AVX_EXTERNAL -%macro INTERL_AVX 5 - vunpckhps %3, %2, %1 - vunpcklps %2, %2, %1 - vextractf128 %4(%5), %2, 0 - vextractf128 %4 %+ H(%5), %3, 0 - vextractf128 %4(%5 + 1), %2, 1 - vextractf128 %4 %+ H(%5 + 1), %3, 1 -%endmacro - -%define INTERL INTERL_AVX - -DECL_PASS pass_avx, PASS_BIG 1 -DECL_PASS pass_interleave_avx, PASS_BIG 0 - -cglobal fft_calc, 2,5,8 - mov r3d, [r0 + FFTContext.nbits] - mov r0, r1 - mov r1, r3 - FFT_DISPATCH _interleave %+ SUFFIX, r1 - REP_RET - -%endif - -INIT_XMM sse - -%macro INTERL_SSE 5 - mova %3, %2 - unpcklps %2, %1 - unpckhps %3, %1 - mova %4(%5), %2 - mova %4(%5+1), %3 -%endmacro - -%define INTERL INTERL_SSE - -DECL_PASS pass_sse, PASS_BIG 1 -DECL_PASS pass_interleave_sse, PASS_BIG 0 - -%macro FFT_CALC_FUNC 0 -cglobal fft_calc, 2,5,8 - mov r3d, [r0 + FFTContext.nbits] - PUSH r1 - PUSH r3 - mov r0, r1 - mov r1, r3 - FFT_DISPATCH _interleave %+ SUFFIX, r1 - POP rcx - POP r4 - cmp rcx, 3+(mmsize/16) - jg .end - mov r2, -1 - add rcx, 3 - shl r2, cl - sub r4, r2 -.loop: -%if mmsize == 8 - PSWAPD m0, [r4 + r2 + 4] - mova [r4 + r2 + 4], m0 -%else - movaps xmm0, [r4 + r2] - movaps xmm1, xmm0 - unpcklps xmm0, [r4 + r2 + 16] - unpckhps xmm1, [r4 + r2 + 16] - movaps [r4 + r2], xmm0 - movaps [r4 + r2 + 16], xmm1 -%endif - add r2, mmsize*2 - jl .loop -.end: -%if cpuflag(3dnow) - femms - RET -%else - REP_RET -%endif -%endmacro - -%if ARCH_X86_32 -INIT_MMX 3dnow -FFT_CALC_FUNC -INIT_MMX 3dnowext -FFT_CALC_FUNC -%endif -INIT_XMM sse -FFT_CALC_FUNC - -cglobal fft_permute, 2,7,1 - mov r4, [r0 + FFTContext.revtab] - mov r5, [r0 + FFTContext.tmpbuf] - mov ecx, [r0 + FFTContext.nbits] - mov r2, 1 - shl r2, cl - xor r0, r0 -%if ARCH_X86_32 - mov r1, r1m -%endif -.loop: - movaps xmm0, [r1 + 8*r0] - movzx r6, word [r4 + 2*r0] - movzx r3, word [r4 + 2*r0 + 2] - movlps [r5 + 8*r6], xmm0 - movhps [r5 + 8*r3], xmm0 - add r0, 2 - cmp r0, r2 - jl .loop - shl r2, 3 - add r1, r2 - add r5, r2 - neg r2 -; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B -.loopcopy: - movaps xmm0, [r5 + r2] - movaps xmm1, [r5 + r2 + 16] - movaps [r1 + r2], xmm0 - movaps [r1 + r2 + 16], xmm1 - add r2, 32 - jl .loopcopy - REP_RET - -%macro IMDCT_CALC_FUNC 0 
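For readers skimming the deleted asm: the FFT_DISPATCH macro above picks a transform kernel by indexing the per-ISA dispatch_tab with nbits - 2, since the smallest kernel handles 1 << 2 points. A minimal C sketch of that lookup pattern, with purely illustrative stand-in names (not FFmpeg API):

    #include <stdio.h>

    typedef void (*fft_fn)(float *z);

    /* stand-ins for the real fft4_sse / fft8_sse / fft16_sse kernels */
    static void fft4_stub(float *z)  { (void)z; puts("4-point kernel");  }
    static void fft8_stub(float *z)  { (void)z; puts("8-point kernel");  }
    static void fft16_stub(float *z) { (void)z; puts("16-point kernel"); }

    static fft_fn const dispatch_tab_stub[] = { fft4_stub, fft8_stub, fft16_stub };

    static void fft_dispatch_stub(float *z, int nbits)
    {
        /* same indexing as FFT_DISPATCH: the table starts at nbits == 2 */
        dispatch_tab_stub[nbits - 2](z);
    }

    int main(void)
    {
        float z[2 * 8];            /* 8 interleaved re/im pairs */
        fft_dispatch_stub(z, 3);   /* runs the 8-point stub */
        return 0;
    }

In the asm, DECL_FFT emits the real dispatch_tab_* tables; in PIC builds each entry is stored relative to the section start ($$) and FFT_DISPATCH rebases it with lea r3, [$$] before the indirect call.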
-cglobal imdct_calc, 3,5,3 - mov r3d, [r0 + FFTContext.mdctsize] - mov r4, [r0 + FFTContext.imdcthalf] - add r1, r3 - PUSH r3 - PUSH r1 -%if ARCH_X86_32 - push r2 - push r1 - push r0 -%else - sub rsp, 8 -%endif - call r4 -%if ARCH_X86_32 - add esp, 12 -%else - add rsp, 8 -%endif - POP r1 - POP r3 - lea r0, [r1 + 2*r3] - mov r2, r3 - sub r3, mmsize - neg r2 - mova m2, [ps_m1m1m1m1] -.loop: -%if mmsize == 8 - PSWAPD m0, [r1 + r3] - PSWAPD m1, [r0 + r2] - pxor m0, m2 -%else - mova m0, [r1 + r3] - mova m1, [r0 + r2] - shufps m0, m0, 0x1b - shufps m1, m1, 0x1b - xorps m0, m2 -%endif - mova [r0 + r3], m1 - mova [r1 + r2], m0 - sub r3, mmsize - add r2, mmsize - jl .loop -%if cpuflag(3dnow) - femms - RET -%else - REP_RET -%endif -%endmacro - -%if ARCH_X86_32 -INIT_MMX 3dnow -IMDCT_CALC_FUNC -INIT_MMX 3dnowext -IMDCT_CALC_FUNC -%endif - -INIT_XMM sse -IMDCT_CALC_FUNC - -%if ARCH_X86_32 -INIT_MMX 3dnow -%define mulps pfmul -%define addps pfadd -%define subps pfsub -%define unpcklps punpckldq -%define unpckhps punpckhdq -DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q] -DECL_PASS pass_interleave_3dnow, PASS_BIG 0 -%define pass_3dnowext pass_3dnow -%define pass_interleave_3dnowext pass_interleave_3dnow -%endif - -%ifdef PIC -%define SECTION_REL - $$ -%else -%define SECTION_REL -%endif - -%macro DECL_FFT 1-2 ; nbits, suffix -%ifidn %0, 1 -%xdefine fullsuffix SUFFIX -%else -%xdefine fullsuffix %2 %+ SUFFIX -%endif -%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL -%if %1>=5 -%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL -%endif -%if %1>=6 -%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL -%endif - -%assign n 1<<%1 -%rep 17-%1 -%assign n2 n/2 -%assign n4 n/4 -%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL - -align 16 -fft %+ n %+ fullsuffix: - call fft %+ n2 %+ SUFFIX - add r0, n*4 - (n&(-2<<%1)) - call fft %+ n4 %+ SUFFIX - add r0, n*2 - (n2&(-2<<%1)) - call fft %+ n4 %+ SUFFIX - sub r0, n*6 + (n2&(-2<<%1)) - lea r1, [cos_ %+ n] - mov r2d, n4/2 - jmp pass %+ fullsuffix - -%assign n n*2 -%endrep -%undef n - -align 8 -dispatch_tab %+ fullsuffix: pointer list_of_fft -%endmacro ; DECL_FFT - -%if HAVE_AVX_EXTERNAL -INIT_YMM avx -DECL_FFT 6 -DECL_FFT 6, _interleave -%endif -INIT_XMM sse -DECL_FFT 5 -DECL_FFT 5, _interleave -%if ARCH_X86_32 -INIT_MMX 3dnow -DECL_FFT 4 -DECL_FFT 4, _interleave -INIT_MMX 3dnowext -DECL_FFT 4 -DECL_FFT 4, _interleave -%endif - -INIT_XMM sse -%undef mulps -%undef addps -%undef subps -%undef unpcklps -%undef unpckhps - -%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8 -%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8 - PSWAPD m0, [%3+%2*4] - movq m2, [%3+%1*4-8] - movq m3, m0 - punpckldq m0, m2 - punpckhdq m2, m3 - movd m1, [%4+%1*2-4] ; tcos[j] - movd m3, [%4+%2*2] ; tcos[n4-j-1] - punpckldq m1, [%5+%1*2-4] ; tsin[j] - punpckldq m3, [%5+%2*2] ; tsin[n4-j-1] - - mova m4, m0 - PSWAPD m5, m1 - pfmul m0, m1 - pfmul m4, m5 - mova m6, m2 - PSWAPD m5, m3 - pfmul m2, m3 - pfmul m6, m5 -%if cpuflag(3dnowext) - pfpnacc m0, m4 - pfpnacc m2, m6 -%else - SBUTTERFLY dq, 0, 4, 1 - SBUTTERFLY dq, 2, 6, 3 - pxor m4, m7 - pxor m6, m7 - pfadd m0, m4 - pfadd m2, m6 -%endif -%else - movaps xmm0, [%3+%2*4] - movaps xmm1, [%3+%1*4-0x10] - movaps xmm2, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm1, xmm2, 0x77 - movlps xmm4, [%4+%2*2] - movlps xmm5, [%5+%2*2+0x0] - movhps xmm4, [%4+%1*2-0x8] - movhps xmm5, [%5+%1*2-0x8] - movaps xmm2, xmm0 - movaps xmm3, xmm1 - mulps xmm0, xmm5 - mulps xmm1, xmm4 - 
mulps xmm2, xmm4 - mulps xmm3, xmm5 - subps xmm1, xmm0 - addps xmm2, xmm3 - movaps xmm0, xmm1 - unpcklps xmm1, xmm2 - unpckhps xmm0, xmm2 -%endif -%endmacro - -%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 - mulps m6, %3, [%5+%1] - mulps m7, %2, [%5+%1] - mulps %2, %2, [%6+%1] - mulps %3, %3, [%6+%1] - subps %2, %2, m6 - addps %3, %3, m7 -%endmacro - -%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8 -.post: - vmovaps ymm1, [%3+%1*2] - vmovaps ymm0, [%3+%1*2+0x20] - vmovaps ymm3, [%3+%2*2] - vmovaps ymm2, [%3+%2*2+0x20] - - CMUL %1, ymm0, ymm1, %3, %4, %5 - CMUL %2, ymm2, ymm3, %3, %4, %5 - vshufps ymm1, ymm1, ymm1, 0x1b - vshufps ymm3, ymm3, ymm3, 0x1b - vperm2f128 ymm1, ymm1, ymm1, 0x01 - vperm2f128 ymm3, ymm3, ymm3, 0x01 - vunpcklps ymm6, ymm2, ymm1 - vunpckhps ymm4, ymm2, ymm1 - vunpcklps ymm7, ymm0, ymm3 - vunpckhps ymm5, ymm0, ymm3 - - vextractf128 [%3+%1*2], ymm7, 0 - vextractf128 [%3+%1*2+0x10], ymm5, 0 - vextractf128 [%3+%1*2+0x20], ymm7, 1 - vextractf128 [%3+%1*2+0x30], ymm5, 1 - - vextractf128 [%3+%2*2], ymm6, 0 - vextractf128 [%3+%2*2+0x10], ymm4, 0 - vextractf128 [%3+%2*2+0x20], ymm6, 1 - vextractf128 [%3+%2*2+0x30], ymm4, 1 - sub %2, 0x20 - add %1, 0x20 - jl .post -%endmacro - -%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 -.post: - movaps xmm1, [%3+%1*2] - movaps xmm0, [%3+%1*2+0x10] - CMUL %1, xmm0, xmm1, %3, %4, %5 - movaps xmm5, [%3+%2*2] - movaps xmm4, [%3+%2*2+0x10] - CMUL %2, xmm4, xmm5, %3, %4, %5 - shufps xmm1, xmm1, 0x1b - shufps xmm5, xmm5, 0x1b - movaps xmm6, xmm4 - unpckhps xmm4, xmm1 - unpcklps xmm6, xmm1 - movaps xmm2, xmm0 - unpcklps xmm0, xmm5 - unpckhps xmm2, xmm5 - movaps [%3+%2*2], xmm6 - movaps [%3+%2*2+0x10], xmm4 - movaps [%3+%1*2], xmm0 - movaps [%3+%1*2+0x10], xmm2 - sub %2, 0x10 - add %1, 0x10 - jl .post -%endmacro - -%macro CMUL_3DNOW 6 - mova m6, [%1+%2*2] - mova %3, [%1+%2*2+8] - mova %4, m6 - mova m7, %3 - pfmul m6, [%5+%2] - pfmul %3, [%6+%2] - pfmul %4, [%6+%2] - pfmul m7, [%5+%2] - pfsub %3, m6 - pfadd %4, m7 -%endmacro - -%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8 -.post: - CMUL_3DNOW %3, %1, m0, m1, %4, %5 - CMUL_3DNOW %3, %2, m2, m3, %4, %5 - movd [%3+%1*2+ 0], m0 - movd [%3+%2*2+12], m1 - movd [%3+%2*2+ 0], m2 - movd [%3+%1*2+12], m3 - psrlq m0, 32 - psrlq m1, 32 - psrlq m2, 32 - psrlq m3, 32 - movd [%3+%1*2+ 8], m0 - movd [%3+%2*2+ 4], m1 - movd [%3+%2*2+ 8], m2 - movd [%3+%1*2+ 4], m3 - sub %2, 8 - add %1, 8 - jl .post -%endmacro - -%macro DECL_IMDCT 1 -cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input -%if ARCH_X86_64 -%define rrevtab r7 -%define rtcos r8 -%define rtsin r9 -%else -%define rrevtab r6 -%define rtsin r6 -%define rtcos r5 -%endif - mov r3d, [r0+FFTContext.mdctsize] - add r2, r3 - shr r3, 1 - mov rtcos, [r0+FFTContext.tcos] - mov rtsin, [r0+FFTContext.tsin] - add rtcos, r3 - add rtsin, r3 -%if ARCH_X86_64 == 0 - push rtcos - push rtsin -%endif - shr r3, 1 - mov rrevtab, [r0+FFTContext.revtab] - add rrevtab, r3 -%if ARCH_X86_64 == 0 - push rrevtab -%endif - -%if mmsize == 8 - sub r3, 2 -%else - sub r3, 4 -%endif -%if ARCH_X86_64 || mmsize == 8 - xor r4, r4 - sub r4, r3 -%endif -%if notcpuflag(3dnowext) && mmsize == 8 - movd m7, [ps_m1m1m1m1] -%endif -.pre: -%if ARCH_X86_64 == 0 -;unspill -%if mmsize != 8 - xor r4, r4 - sub r4, r3 -%endif - mov rtcos, [esp+8] - mov rtsin, [esp+4] -%endif - - PREROTATER r4, r3, r2, rtcos, rtsin -%if mmsize == 8 - mov r6, [esp] ; rrevtab = ptr+n8 - movzx r5, word [rrevtab+r4-2] ; rrevtab[j] - movzx r6, word [rrevtab+r3] ; rrevtab[n4-j-1] - 
mova [r1+r5*8], m0 - mova [r1+r6*8], m2 - add r4, 2 - sub r3, 2 -%else -%if ARCH_X86_64 - movzx r5, word [rrevtab+r4-4] - movzx r6, word [rrevtab+r4-2] - movzx r10, word [rrevtab+r3] - movzx r11, word [rrevtab+r3+2] - movlps [r1+r5 *8], xmm0 - movhps [r1+r6 *8], xmm0 - movlps [r1+r10*8], xmm1 - movhps [r1+r11*8], xmm1 - add r4, 4 -%else - mov r6, [esp] - movzx r5, word [r6+r4-4] - movzx r4, word [r6+r4-2] - movlps [r1+r5*8], xmm0 - movhps [r1+r4*8], xmm0 - movzx r5, word [r6+r3] - movzx r4, word [r6+r3+2] - movlps [r1+r5*8], xmm1 - movhps [r1+r4*8], xmm1 -%endif - sub r3, 4 -%endif - jns .pre - - mov r5, r0 - mov r6, r1 - mov r0, r1 - mov r1d, [r5+FFTContext.nbits] - - FFT_DISPATCH SUFFIX, r1 - - mov r0d, [r5+FFTContext.mdctsize] - add r6, r0 - shr r0, 1 -%if ARCH_X86_64 == 0 -%define rtcos r2 -%define rtsin r3 - mov rtcos, [esp+8] - mov rtsin, [esp+4] -%endif - neg r0 - mov r1, -mmsize - sub r1, r0 - %1 r0, r1, r6, rtcos, rtsin -%if ARCH_X86_64 == 0 - add esp, 12 -%endif -%if mmsize == 8 - femms -%endif - RET -%endmacro - -DECL_IMDCT POSROTATESHUF - -%if ARCH_X86_32 -INIT_MMX 3dnow -DECL_IMDCT POSROTATESHUF_3DNOW - -INIT_MMX 3dnowext -DECL_IMDCT POSROTATESHUF_3DNOW -%endif - -INIT_YMM avx - -%if HAVE_AVX_EXTERNAL -DECL_IMDCT POSROTATESHUF_AVX -%endif diff --git a/ffmpeg1/libavcodec/x86/fft.h b/ffmpeg1/libavcodec/x86/fft.h deleted file mode 100644 index 3f8b21d..0000000 --- a/ffmpeg1/libavcodec/x86/fft.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
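The imdct_half bodies emitted by DECL_IMDCT above follow the usual decomposition: pre-rotate the input by the tcos/tsin twiddles into bit-reversed order (PREROTATER plus the revtab scatter stores), run an N/4-point complex FFT via FFT_DISPATCH, then post-rotate in place (POSROTATESHUF*). A scalar sketch of the pre-rotation step, mirroring the structure of the SIMD code rather than replacing it; the helper name is illustrative:

    #include <stdint.h>

    typedef struct { float re, im; } cplx;

    /* Scalar counterpart of PREROTATER + the revtab stores above.
     * n is the full MDCT size; z receives n/4 complex values. */
    static void imdct_half_prerotate(cplx *z, const float *input,
                                     const float *tcos, const float *tsin,
                                     const uint16_t *revtab, int n)
    {
        int n2 = n >> 1, n4 = n >> 2;
        for (int k = 0; k < n4; k++) {
            float in1 = input[2 * k];          /* walks forward over even taps  */
            float in2 = input[n2 - 1 - 2 * k]; /* walks backward over odd taps  */
            int   j   = revtab[k];
            z[j].re = in2 * tcos[k] - in1 * tsin[k];
            z[j].im = in2 * tsin[k] + in1 * tcos[k];
        }
    }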
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_FFT_H -#define AVCODEC_X86_FFT_H - -#include "libavcodec/fft.h" - -void ff_fft_permute_sse(FFTContext *s, FFTComplex *z); -void ff_fft_calc_avx(FFTContext *s, FFTComplex *z); -void ff_fft_calc_sse(FFTContext *s, FFTComplex *z); -void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z); -void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z); - -void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_dct32_float_sse(FFTSample *out, const FFTSample *in); -void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in); -void ff_dct32_float_avx(FFTSample *out, const FFTSample *in); - -#endif /* AVCODEC_X86_FFT_H */ diff --git a/ffmpeg1/libavcodec/x86/fft_init.c b/ffmpeg1/libavcodec/x86/fft_init.c deleted file mode 100644 index bfa7947..0000000 --- a/ffmpeg1/libavcodec/x86/fft_init.c +++ /dev/null @@ -1,68 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/cpu.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/dct.h" -#include "fft.h" - -av_cold void ff_fft_init_x86(FFTContext *s) -{ - int has_vectors = av_get_cpu_flags(); -#if ARCH_X86_32 - if (EXTERNAL_AMD3DNOW(has_vectors)) { - /* 3DNow! 
for K6-2/3 */ - s->imdct_calc = ff_imdct_calc_3dnow; - s->imdct_half = ff_imdct_half_3dnow; - s->fft_calc = ff_fft_calc_3dnow; - } - if (EXTERNAL_AMD3DNOWEXT(has_vectors)) { - /* 3DNowEx for K7 */ - s->imdct_calc = ff_imdct_calc_3dnowext; - s->imdct_half = ff_imdct_half_3dnowext; - s->fft_calc = ff_fft_calc_3dnowext; - } -#endif - if (EXTERNAL_SSE(has_vectors)) { - /* SSE for P3/P4/K8 */ - s->imdct_calc = ff_imdct_calc_sse; - s->imdct_half = ff_imdct_half_sse; - s->fft_permute = ff_fft_permute_sse; - s->fft_calc = ff_fft_calc_sse; - s->fft_permutation = FF_FFT_PERM_SWAP_LSBS; - } - if (EXTERNAL_AVX(has_vectors) && s->nbits >= 5) { - /* AVX for SB */ - s->imdct_half = ff_imdct_half_avx; - s->fft_calc = ff_fft_calc_avx; - s->fft_permutation = FF_FFT_PERM_AVX; - } -} - -#if CONFIG_DCT -av_cold void ff_dct_init_x86(DCTContext *s) -{ - int has_vectors = av_get_cpu_flags(); - if (EXTERNAL_SSE(has_vectors)) - s->dct32 = ff_dct32_float_sse; - if (EXTERNAL_SSE2(has_vectors)) - s->dct32 = ff_dct32_float_sse2; - if (EXTERNAL_AVX(has_vectors)) - s->dct32 = ff_dct32_float_avx; -} -#endif diff --git a/ffmpeg1/libavcodec/x86/fmtconvert.asm b/ffmpeg1/libavcodec/x86/fmtconvert.asm deleted file mode 100644 index 1bd13fc..0000000 --- a/ffmpeg1/libavcodec/x86/fmtconvert.asm +++ /dev/null @@ -1,429 +0,0 @@ -;****************************************************************************** -;* x86 optimized Format Conversion Utils -;* Copyright (c) 2008 Loren Merritt -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
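The dispatch in ff_fft_init_x86 above only swaps function pointers inside FFTContext, so callers stay ISA-agnostic. A hedged usage sketch — FFTContext, ff_fft_init, ff_fft_end and the two members are the real FFmpeg names of this era, but the snippet itself is illustrative, not taken from the tree:

    #include "libavcodec/fft.h"

    static void run_fft_512(FFTComplex *data)
    {
        FFTContext ctx;
        if (ff_fft_init(&ctx, 9, 0) < 0)   /* nbits = 9 -> 512-point forward FFT */
            return;
        ctx.fft_permute(&ctx, data);       /* e.g. ff_fft_permute_sse after init_x86 */
        ctx.fft_calc(&ctx, data);          /* e.g. ff_fft_calc_avx when AVX is present and nbits >= 5 */
        ff_fft_end(&ctx);
    }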
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_TEXT - -%macro CVTPS2PI 2 -%if cpuflag(sse) - cvtps2pi %1, %2 -%elif cpuflag(3dnow) - pf2id %1, %2 -%endif -%endmacro - -;--------------------------------------------------------------------------------- -; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len); -;--------------------------------------------------------------------------------- -%macro INT32_TO_FLOAT_FMUL_SCALAR 1 -%if UNIX64 -cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len -%else -cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len -%endif -%if WIN64 - SWAP 0, 2 -%elif ARCH_X86_32 - movss m0, mulm -%endif - SPLATD m0 - shl lenq, 2 - add srcq, lenq - add dstq, lenq - neg lenq -.loop: -%if cpuflag(sse2) - cvtdq2ps m1, [srcq+lenq ] - cvtdq2ps m2, [srcq+lenq+16] -%else - cvtpi2ps m1, [srcq+lenq ] - cvtpi2ps m3, [srcq+lenq+ 8] - cvtpi2ps m2, [srcq+lenq+16] - cvtpi2ps m4, [srcq+lenq+24] - movlhps m1, m3 - movlhps m2, m4 -%endif - mulps m1, m0 - mulps m2, m0 - mova [dstq+lenq ], m1 - mova [dstq+lenq+16], m2 - add lenq, 32 - jl .loop - REP_RET -%endmacro - -INIT_XMM sse -INT32_TO_FLOAT_FMUL_SCALAR 5 -INIT_XMM sse2 -INT32_TO_FLOAT_FMUL_SCALAR 3 - - -;------------------------------------------------------------------------------ -; void ff_float_to_int16(int16_t *dst, const float *src, long len); -;------------------------------------------------------------------------------ -%macro FLOAT_TO_INT16 1 -cglobal float_to_int16, 3, 3, %1, dst, src, len - add lenq, lenq - lea srcq, [srcq+2*lenq] - add dstq, lenq - neg lenq -.loop: -%if cpuflag(sse2) - cvtps2dq m0, [srcq+2*lenq ] - cvtps2dq m1, [srcq+2*lenq+16] - packssdw m0, m1 - mova [dstq+lenq], m0 -%else - CVTPS2PI m0, [srcq+2*lenq ] - CVTPS2PI m1, [srcq+2*lenq+ 8] - CVTPS2PI m2, [srcq+2*lenq+16] - CVTPS2PI m3, [srcq+2*lenq+24] - packssdw m0, m1 - packssdw m2, m3 - mova [dstq+lenq ], m0 - mova [dstq+lenq+8], m2 -%endif - add lenq, 16 - js .loop -%if mmsize == 8 - emms -%endif - REP_RET -%endmacro - -INIT_XMM sse2 -FLOAT_TO_INT16 2 -INIT_MMX sse -FLOAT_TO_INT16 0 -INIT_MMX 3dnow -FLOAT_TO_INT16 0 - -;------------------------------------------------------------------------------ -; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step); -;------------------------------------------------------------------------------ -%macro FLOAT_TO_INT16_STEP 1 -cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2 - add lenq, lenq - lea srcq, [srcq+2*lenq] - lea step3q, [stepq*3] - neg lenq -.loop: -%if cpuflag(sse2) - cvtps2dq m0, [srcq+2*lenq ] - cvtps2dq m1, [srcq+2*lenq+16] - packssdw m0, m1 - movd v1d, m0 - psrldq m0, 4 - movd v2d, m0 - psrldq m0, 4 - mov [dstq], v1w - mov [dstq+stepq*4], v2w - shr v1d, 16 - shr v2d, 16 - mov [dstq+stepq*2], v1w - mov [dstq+step3q*2], v2w - lea dstq, [dstq+stepq*8] - movd v1d, m0 - psrldq m0, 4 - movd v2d, m0 - mov [dstq], v1w - mov [dstq+stepq*4], v2w - shr v1d, 16 - shr v2d, 16 - mov [dstq+stepq*2], v1w - mov [dstq+step3q*2], v2w - lea dstq, [dstq+stepq*8] -%else - CVTPS2PI m0, [srcq+2*lenq ] - CVTPS2PI m1, [srcq+2*lenq+ 8] - CVTPS2PI m2, [srcq+2*lenq+16] - CVTPS2PI m3, [srcq+2*lenq+24] - packssdw m0, m1 - packssdw m2, 
m3 - movd v1d, m0 - psrlq m0, 32 - movd v2d, m0 - mov [dstq], v1w - mov [dstq+stepq*4], v2w - shr v1d, 16 - shr v2d, 16 - mov [dstq+stepq*2], v1w - mov [dstq+step3q*2], v2w - lea dstq, [dstq+stepq*8] - movd v1d, m2 - psrlq m2, 32 - movd v2d, m2 - mov [dstq], v1w - mov [dstq+stepq*4], v2w - shr v1d, 16 - shr v2d, 16 - mov [dstq+stepq*2], v1w - mov [dstq+step3q*2], v2w - lea dstq, [dstq+stepq*8] -%endif - add lenq, 16 - js .loop -%if mmsize == 8 - emms -%endif - REP_RET -%endmacro - -INIT_XMM sse2 -FLOAT_TO_INT16_STEP 2 -INIT_MMX sse -FLOAT_TO_INT16_STEP 0 -INIT_MMX 3dnow -FLOAT_TO_INT16_STEP 0 - -;------------------------------------------------------------------------------- -; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len); -;------------------------------------------------------------------------------- -%macro FLOAT_TO_INT16_INTERLEAVE2 0 -cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len - lea lenq, [4*r2q] - mov src1q, [src0q+gprsize] - mov src0q, [src0q] - add dstq, lenq - add src0q, lenq - add src1q, lenq - neg lenq -.loop: -%if cpuflag(sse2) - cvtps2dq m0, [src0q+lenq] - cvtps2dq m1, [src1q+lenq] - packssdw m0, m1 - movhlps m1, m0 - punpcklwd m0, m1 - mova [dstq+lenq], m0 -%else - CVTPS2PI m0, [src0q+lenq ] - CVTPS2PI m1, [src0q+lenq+8] - CVTPS2PI m2, [src1q+lenq ] - CVTPS2PI m3, [src1q+lenq+8] - packssdw m0, m1 - packssdw m2, m3 - mova m1, m0 - punpcklwd m0, m2 - punpckhwd m1, m2 - mova [dstq+lenq ], m0 - mova [dstq+lenq+8], m1 -%endif - add lenq, 16 - js .loop -%if mmsize == 8 - emms -%endif - REP_RET -%endmacro - -INIT_MMX 3dnow -FLOAT_TO_INT16_INTERLEAVE2 -INIT_MMX sse -FLOAT_TO_INT16_INTERLEAVE2 -INIT_XMM sse2 -FLOAT_TO_INT16_INTERLEAVE2 - -%macro FLOAT_TO_INT16_INTERLEAVE6 0 -; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) -cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len -%if ARCH_X86_64 - mov lend, r2d -%else - %define lend dword r2m -%endif - mov src1q, [srcq+1*gprsize] - mov src2q, [srcq+2*gprsize] - mov src3q, [srcq+3*gprsize] - mov src4q, [srcq+4*gprsize] - mov src5q, [srcq+5*gprsize] - mov srcq, [srcq] - sub src1q, srcq - sub src2q, srcq - sub src3q, srcq - sub src4q, srcq - sub src5q, srcq -.loop: - CVTPS2PI mm0, [srcq] - CVTPS2PI mm1, [srcq+src1q] - CVTPS2PI mm2, [srcq+src2q] - CVTPS2PI mm3, [srcq+src3q] - CVTPS2PI mm4, [srcq+src4q] - CVTPS2PI mm5, [srcq+src5q] - packssdw mm0, mm3 - packssdw mm1, mm4 - packssdw mm2, mm5 - PSWAPD mm3, mm0 - punpcklwd mm0, mm1 - punpckhwd mm1, mm2 - punpcklwd mm2, mm3 - PSWAPD mm3, mm0 - punpckldq mm0, mm2 - punpckhdq mm2, mm1 - punpckldq mm1, mm3 - movq [dstq ], mm0 - movq [dstq+16], mm2 - movq [dstq+ 8], mm1 - add srcq, 8 - add dstq, 24 - sub lend, 2 - jg .loop - emms - RET -%endmacro ; FLOAT_TO_INT16_INTERLEAVE6 - -INIT_MMX sse -FLOAT_TO_INT16_INTERLEAVE6 -INIT_MMX 3dnow -FLOAT_TO_INT16_INTERLEAVE6 -INIT_MMX 3dnowext -FLOAT_TO_INT16_INTERLEAVE6 - -;----------------------------------------------------------------------------- -; void ff_float_interleave6(float *dst, const float **src, unsigned int len); -;----------------------------------------------------------------------------- - -%macro FLOAT_INTERLEAVE6 1 -cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len -%if ARCH_X86_64 - mov lend, r2d -%else - %define lend dword r2m -%endif - mov src1q, [srcq+1*gprsize] - mov src2q, [srcq+2*gprsize] - mov src3q, [srcq+3*gprsize] - mov src4q, [srcq+4*gprsize] - mov src5q, [srcq+5*gprsize] - mov 
srcq, [srcq] - sub src1q, srcq - sub src2q, srcq - sub src3q, srcq - sub src4q, srcq - sub src5q, srcq -.loop: -%if cpuflag(sse) - movaps m0, [srcq] - movaps m1, [srcq+src1q] - movaps m2, [srcq+src2q] - movaps m3, [srcq+src3q] - movaps m4, [srcq+src4q] - movaps m5, [srcq+src5q] - - SBUTTERFLYPS 0, 1, 6 - SBUTTERFLYPS 2, 3, 6 - SBUTTERFLYPS 4, 5, 6 - - movaps m6, m4 - shufps m4, m0, 0xe4 - movlhps m0, m2 - movhlps m6, m2 - movaps [dstq ], m0 - movaps [dstq+16], m4 - movaps [dstq+32], m6 - - movaps m6, m5 - shufps m5, m1, 0xe4 - movlhps m1, m3 - movhlps m6, m3 - movaps [dstq+48], m1 - movaps [dstq+64], m5 - movaps [dstq+80], m6 -%else ; mmx - movq m0, [srcq] - movq m1, [srcq+src1q] - movq m2, [srcq+src2q] - movq m3, [srcq+src3q] - movq m4, [srcq+src4q] - movq m5, [srcq+src5q] - - SBUTTERFLY dq, 0, 1, 6 - SBUTTERFLY dq, 2, 3, 6 - SBUTTERFLY dq, 4, 5, 6 - movq [dstq ], m0 - movq [dstq+ 8], m2 - movq [dstq+16], m4 - movq [dstq+24], m1 - movq [dstq+32], m3 - movq [dstq+40], m5 -%endif - add srcq, mmsize - add dstq, mmsize*6 - sub lend, mmsize/4 - jg .loop -%if mmsize == 8 - emms -%endif - REP_RET -%endmacro - -INIT_MMX mmx -FLOAT_INTERLEAVE6 0 -INIT_XMM sse -FLOAT_INTERLEAVE6 7 - -;----------------------------------------------------------------------------- -; void ff_float_interleave2(float *dst, const float **src, unsigned int len); -;----------------------------------------------------------------------------- - -%macro FLOAT_INTERLEAVE2 1 -cglobal float_interleave2, 3, 4, %1, dst, src, len, src1 - mov src1q, [srcq+gprsize] - mov srcq, [srcq ] - sub src1q, srcq -.loop: - mova m0, [srcq ] - mova m1, [srcq+src1q ] - mova m3, [srcq +mmsize] - mova m4, [srcq+src1q+mmsize] - - mova m2, m0 - PUNPCKLDQ m0, m1 - PUNPCKHDQ m2, m1 - - mova m1, m3 - PUNPCKLDQ m3, m4 - PUNPCKHDQ m1, m4 - - mova [dstq ], m0 - mova [dstq+1*mmsize], m2 - mova [dstq+2*mmsize], m3 - mova [dstq+3*mmsize], m1 - - add srcq, mmsize*2 - add dstq, mmsize*4 - sub lend, mmsize/2 - jg .loop -%if mmsize == 8 - emms -%endif - REP_RET -%endmacro - -INIT_MMX mmx -%define PUNPCKLDQ punpckldq -%define PUNPCKHDQ punpckhdq -FLOAT_INTERLEAVE2 0 -INIT_XMM sse -%define PUNPCKLDQ unpcklps -%define PUNPCKHDQ unpckhps -FLOAT_INTERLEAVE2 5 diff --git a/ffmpeg1/libavcodec/x86/fmtconvert_init.c b/ffmpeg1/libavcodec/x86/fmtconvert_init.c deleted file mode 100644 index 4a4c017..0000000 --- a/ffmpeg1/libavcodec/x86/fmtconvert_init.c +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Format Conversion Utils - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
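As a plain-C reference for the conversion kernels in fmtconvert.asm above: the SSE2 path (cvtps2dq followed by packssdw) rounds each float to the nearest integer and saturates to the int16 range, and the interleave variants only change the output ordering. A hedged scalar equivalent with illustrative helper names:

    #include <math.h>
    #include <stdint.h>

    static inline int16_t clip_int16_ref(long v)
    {
        if (v < -32768) return -32768;
        if (v >  32767) return  32767;
        return (int16_t)v;
    }

    /* roughly what ff_float_to_int16_sse2 computes, one sample at a time */
    static void float_to_int16_ref(int16_t *dst, const float *src, long len)
    {
        for (long i = 0; i < len; i++)
            dst[i] = clip_int16_ref(lrintf(src[i]));
    }

    /* the 2-channel interleave just zips two such conversions together */
    static void float_to_int16_interleave2_ref(int16_t *dst, const float **src, long len)
    {
        for (long i = 0; i < len; i++) {
            dst[2 * i    ] = clip_int16_ref(lrintf(src[0][i]));
            dst[2 * i + 1] = clip_int16_ref(lrintf(src[1][i]));
        }
    }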
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/fmtconvert.h" - -#if HAVE_YASM - -void ff_int32_to_float_fmul_scalar_sse (float *dst, const int *src, float mul, int len); -void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len); - -void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len); -void ff_float_to_int16_sse (int16_t *dst, const float *src, long len); -void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len); - -void ff_float_to_int16_step_3dnow(int16_t *dst, const float *src, long len, long step); -void ff_float_to_int16_step_sse (int16_t *dst, const float *src, long len, long step); -void ff_float_to_int16_step_sse2 (int16_t *dst, const float *src, long len, long step); - -void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len); -void ff_float_to_int16_interleave2_sse (int16_t *dst, const float **src, long len); -void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len); - -void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); -void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); -void ff_float_to_int16_interleave6_3dnowext(int16_t *dst, const float **src, int len); - -#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse - -#define FLOAT_TO_INT16_INTERLEAVE(cpu) \ -/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ -static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ - int c;\ - for(c=0; c<channels; c++){\ - ff_float_to_int16_step_##cpu(dst+c, src[c], len, channels);\ - }\ -}\ -\ -static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\ - if(channels==1)\ - ff_float_to_int16_##cpu(dst, src[0], len);\ - else if(channels==2){\ - ff_float_to_int16_interleave2_##cpu(dst, src, len);\ - }else if(channels==6){\ - ff_float_to_int16_interleave6_##cpu(dst, src, len);\ - }else\ - float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\ -} - -FLOAT_TO_INT16_INTERLEAVE(3dnow) -FLOAT_TO_INT16_INTERLEAVE(sse) -FLOAT_TO_INT16_INTERLEAVE(sse2) - -static void float_to_int16_interleave_3dnowext(int16_t *dst, const float **src, - long len, int channels) -{ - if(channels==6) - ff_float_to_int16_interleave6_3dnowext(dst, src, len); - else - float_to_int16_interleave_3dnow(dst, src, len, channels); -} - -void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len); -void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len); - -void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len); -void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len); - -static void float_interleave_mmx(float *dst, const float **src, - unsigned int len, int channels) -{ - if (channels == 2) { - ff_float_interleave2_mmx(dst, src, len); - } else if (channels == 6) - ff_float_interleave6_mmx(dst, src, len); - else - ff_float_interleave_c(dst, src, len, channels); -} - -static void 
float_interleave_sse(float *dst, const float **src, - unsigned int len, int channels) -{ - if (channels == 2) { - ff_float_interleave2_sse(dst, src, len); - } else if (channels == 6) - ff_float_interleave6_sse(dst, src, len); - else - ff_float_interleave_c(dst, src, len, channels); -} -#endif /* HAVE_YASM */ - -av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) -{ -#if HAVE_YASM - int mm_flags = av_get_cpu_flags(); - - if (EXTERNAL_MMX(mm_flags)) { - c->float_interleave = float_interleave_mmx; - - if (EXTERNAL_AMD3DNOW(mm_flags)) { - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->float_to_int16 = ff_float_to_int16_3dnow; - c->float_to_int16_interleave = float_to_int16_interleave_3dnow; - } - } - if (EXTERNAL_AMD3DNOWEXT(mm_flags)) { - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->float_to_int16_interleave = float_to_int16_interleave_3dnowext; - } - } - if (EXTERNAL_SSE(mm_flags)) { - c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse; - c->float_to_int16 = ff_float_to_int16_sse; - c->float_to_int16_interleave = float_to_int16_interleave_sse; - c->float_interleave = float_interleave_sse; - } - if (EXTERNAL_SSE2(mm_flags)) { - c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2; - c->float_to_int16 = ff_float_to_int16_sse2; - c->float_to_int16_interleave = float_to_int16_interleave_sse2; - } - } -#endif /* HAVE_YASM */ -} diff --git a/ffmpeg1/libavcodec/x86/fpelbase.asm b/ffmpeg1/libavcodec/x86/fpelbase.asm deleted file mode 100644 index a327206..0000000 --- a/ffmpeg1/libavcodec/x86/fpelbase.asm +++ /dev/null @@ -1,106 +0,0 @@ -;****************************************************************************** -;* MMX optimized DSP utils -;* Copyright (c) 2008 Loren Merritt -;* Copyright (c) 2003-2013 Michael Niedermayer -;* Copyright (c) 2013 Daniel Kang -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
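One detail worth noting in ff_fmt_convert_init_x86 above: the 3DNow! conversions are only installed when CODEC_FLAG_BITEXACT is not set, plausibly because pf2id truncates toward zero while the SSE/SSE2 paths (cvtps2pi/cvtps2dq) round to nearest under the default MXCSR mode, so the two families would not produce bit-identical int16 output. A one-line illustration of that rounding difference:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float x = 2.7f;
        /* truncation (pf2id-style) vs round-to-nearest (cvtps2dq-style) */
        printf("trunc=%d  nearest=%ld\n", (int)x, lrintf(x));  /* prints 2 and 3 */
        return 0;
    }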
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION .text - -INIT_MMX mmxext -; void pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h) -%macro PIXELS48 2 -%if %2 == 4 -%define OP movh -%else -%define OP mova -%endif -cglobal %1_pixels%2, 4,5 - movsxdifnidn r2, r2d - lea r4, [r2*3] -.loop: - OP m0, [r1] - OP m1, [r1+r2] - OP m2, [r1+r2*2] - OP m3, [r1+r4] - lea r1, [r1+r2*4] -%ifidn %1, avg - pavgb m0, [r0] - pavgb m1, [r0+r2] - pavgb m2, [r0+r2*2] - pavgb m3, [r0+r4] -%endif - OP [r0], m0 - OP [r0+r2], m1 - OP [r0+r2*2], m2 - OP [r0+r4], m3 - sub r3d, 4 - lea r0, [r0+r2*4] - jne .loop - RET -%endmacro - -PIXELS48 put, 4 -PIXELS48 avg, 4 -PIXELS48 put, 8 -PIXELS48 avg, 8 - - -INIT_XMM sse2 -; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -cglobal put_pixels16, 4,5,4 - lea r4, [r2*3] -.loop: - movu m0, [r1] - movu m1, [r1+r2] - movu m2, [r1+r2*2] - movu m3, [r1+r4] - lea r1, [r1+r2*4] - mova [r0], m0 - mova [r0+r2], m1 - mova [r0+r2*2], m2 - mova [r0+r4], m3 - sub r3d, 4 - lea r0, [r0+r2*4] - jnz .loop - REP_RET - -; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -cglobal avg_pixels16, 4,5,4 - lea r4, [r2*3] -.loop: - movu m0, [r1] - movu m1, [r1+r2] - movu m2, [r1+r2*2] - movu m3, [r1+r4] - lea r1, [r1+r2*4] - pavgb m0, [r0] - pavgb m1, [r0+r2] - pavgb m2, [r0+r2*2] - pavgb m3, [r0+r4] - mova [r0], m0 - mova [r0+r2], m1 - mova [r0+r2*2], m2 - mova [r0+r4], m3 - sub r3d, 4 - lea r0, [r0+r2*4] - jnz .loop - REP_RET diff --git a/ffmpeg1/libavcodec/x86/h263_loopfilter.asm b/ffmpeg1/libavcodec/x86/h263_loopfilter.asm deleted file mode 100644 index a21baf1..0000000 --- a/ffmpeg1/libavcodec/x86/h263_loopfilter.asm +++ /dev/null @@ -1,189 +0,0 @@ -;****************************************************************************** -;* MMX-optimized H.263 loop filter -;* Copyright (c) 2003-2013 Michael Niedermayer -;* Copyright (c) 2013 Daniel Kang -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
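The fpelbase.asm routines above are straight full-pel copy/average kernels; the only arithmetic is pavgb's rounded average. A hedged scalar reference with illustrative names (the real C fallbacks live elsewhere in libavcodec):

    #include <stddef.h>
    #include <stdint.h>

    static void put_pixels16_ref(uint8_t *block, const uint8_t *pixels,
                                 ptrdiff_t line_size, int h)
    {
        for (int i = 0; i < h; i++) {
            for (int j = 0; j < 16; j++)
                block[j] = pixels[j];
            block  += line_size;
            pixels += line_size;
        }
    }

    static void avg_pixels16_ref(uint8_t *block, const uint8_t *pixels,
                                 ptrdiff_t line_size, int h)
    {
        for (int i = 0; i < h; i++) {
            for (int j = 0; j < 16; j++)   /* pavgb: (a + b + 1) >> 1 */
                block[j] = (uint8_t)((block[j] + pixels[j] + 1) >> 1);
            block  += line_size;
            pixels += line_size;
        }
    }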
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA -cextern pb_FC -cextern h263_loop_filter_strength - -SECTION_TEXT - -%macro H263_LOOP_FILTER 5 - pxor m7, m7 - mova m0, [%1] - mova m1, [%1] - mova m2, [%4] - mova m3, [%4] - punpcklbw m0, m7 - punpckhbw m1, m7 - punpcklbw m2, m7 - punpckhbw m3, m7 - psubw m0, m2 - psubw m1, m3 - mova m2, [%2] - mova m3, [%2] - mova m4, [%3] - mova m5, [%3] - punpcklbw m2, m7 - punpckhbw m3, m7 - punpcklbw m4, m7 - punpckhbw m5, m7 - psubw m4, m2 - psubw m5, m3 - psllw m4, 2 - psllw m5, 2 - paddw m4, m0 - paddw m5, m1 - pxor m6, m6 - pcmpgtw m6, m4 - pcmpgtw m7, m5 - pxor m4, m6 - pxor m5, m7 - psubw m4, m6 - psubw m5, m7 - psrlw m4, 3 - psrlw m5, 3 - packuswb m4, m5 - packsswb m6, m7 - pxor m7, m7 - movd m2, %5 - punpcklbw m2, m2 - punpcklbw m2, m2 - punpcklbw m2, m2 - psubusb m2, m4 - mova m3, m2 - psubusb m3, m4 - psubb m2, m3 - mova m3, [%2] - mova m4, [%3] - pxor m3, m6 - pxor m4, m6 - paddusb m3, m2 - psubusb m4, m2 - pxor m3, m6 - pxor m4, m6 - paddusb m2, m2 - packsswb m0, m1 - pcmpgtb m7, m0 - pxor m0, m7 - psubb m0, m7 - mova m1, m0 - psubusb m0, m2 - psubb m1, m0 - pand m1, [pb_FC] - psrlw m1, 2 - pxor m1, m7 - psubb m1, m7 - mova m5, [%1] - mova m6, [%4] - psubb m5, m1 - paddb m6, m1 -%endmacro - -INIT_MMX mmx -; void h263_v_loop_filter(uint8_t *src, int stride, int qscale) -cglobal h263_v_loop_filter, 3,5 - movsxdifnidn r1, r1d - movsxdifnidn r2, r2d - - lea r4, [h263_loop_filter_strength] - movzx r3d, BYTE [r4+r2] - movsx r2, r3b - shl r2, 1 - - mov r3, r0 - sub r3, r1 - mov r4, r3 - sub r4, r1 - H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d - - mova [r3], m3 - mova [r0], m4 - mova [r4], m5 - mova [r0+r1], m6 - RET - -%macro TRANSPOSE4X4 2 - movd m0, [%1] - movd m1, [%1+r1] - movd m2, [%1+r1*2] - movd m3, [%1+r3] - punpcklbw m0, m1 - punpcklbw m2, m3 - mova m1, m0 - punpcklwd m0, m2 - punpckhwd m1, m2 - movd [%2+ 0], m0 - punpckhdq m0, m0 - movd [%2+ 8], m0 - movd [%2+16], m1 - punpckhdq m1, m1 - movd [%2+24], m1 -%endmacro - - -; void h263_h_loop_filter(uint8_t *src, int stride, int qscale) -INIT_MMX mmx -cglobal h263_h_loop_filter, 3,5,0,32 - movsxdifnidn r1, r1d - movsxdifnidn r2, r2d - - lea r4, [h263_loop_filter_strength] - movzx r3d, BYTE [r4+r2] - movsx r2, r3b - shl r2, 1 - - sub r0, 2 - lea r3, [r1*3] - - TRANSPOSE4X4 r0, rsp - lea r4, [r0+r1*4] - TRANSPOSE4X4 r4, rsp+4 - - H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d - - mova m1, m5 - mova m0, m4 - punpcklbw m5, m3 - punpcklbw m4, m6 - punpckhbw m1, m3 - punpckhbw m0, m6 - mova m3, m5 - mova m6, m1 - punpcklwd m5, m4 - punpcklwd m1, m0 - punpckhwd m3, m4 - punpckhwd m6, m0 - movd [r0], m5 - punpckhdq m5, m5 - movd [r0+r1*1], m5 - movd [r0+r1*2], m3 - punpckhdq m3, m3 - movd [r0+r3], m3 - movd [r4], m1 - punpckhdq m1, m1 - movd [r4+r1*1], m1 - movd [r4+r1*2], m6 - punpckhdq m6, m6 - movd [r4+r3], m6 - RET diff --git a/ffmpeg1/libavcodec/x86/h264_chromamc.asm b/ffmpeg1/libavcodec/x86/h264_chromamc.asm deleted file mode 100644 index 32681aa..0000000 --- a/ffmpeg1/libavcodec/x86/h264_chromamc.asm +++ /dev/null @@ -1,678 +0,0 @@ -;****************************************************************************** -;* MMX/SSSE3-optimized functions for H264 chroma MC -;* Copyright (c) 2005 
Zoltan Hidvegi <hzoli -a- hzoli -d- com>, -;* 2005-2008 Loren Merritt -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -rnd_rv40_2d_tbl: times 4 dw 0 - times 4 dw 16 - times 4 dw 32 - times 4 dw 16 - times 4 dw 32 - times 4 dw 28 - times 4 dw 32 - times 4 dw 28 - times 4 dw 0 - times 4 dw 32 - times 4 dw 16 - times 4 dw 32 - times 4 dw 32 - times 4 dw 28 - times 4 dw 32 - times 4 dw 28 -rnd_rv40_1d_tbl: times 4 dw 0 - times 4 dw 2 - times 4 dw 4 - times 4 dw 2 - times 4 dw 4 - times 4 dw 3 - times 4 dw 4 - times 4 dw 3 - times 4 dw 0 - times 4 dw 4 - times 4 dw 2 - times 4 dw 4 - times 4 dw 4 - times 4 dw 3 - times 4 dw 4 - times 4 dw 3 - -cextern pw_3 -cextern pw_4 -cextern pw_8 -pw_28: times 8 dw 28 -cextern pw_32 -cextern pw_64 - -SECTION .text - -%macro mv0_pixels_mc8 0 - lea r4, [r2*2 ] -.next4rows: - movq mm0, [r1 ] - movq mm1, [r1+r2] - add r1, r4 - CHROMAMC_AVG mm0, [r0 ] - CHROMAMC_AVG mm1, [r0+r2] - movq [r0 ], mm0 - movq [r0+r2], mm1 - add r0, r4 - movq mm0, [r1 ] - movq mm1, [r1+r2] - add r1, r4 - CHROMAMC_AVG mm0, [r0 ] - CHROMAMC_AVG mm1, [r0+r2] - movq [r0 ], mm0 - movq [r0+r2], mm1 - add r0, r4 - sub r3d, 4 - jne .next4rows -%endmacro - -%macro chroma_mc8_mmx_func 2-3 -%ifidn %2, rv40 -%ifdef PIC -%define rnd_1d_rv40 r8 -%define rnd_2d_rv40 r8 -%define extra_regs 2 -%else ; no-PIC -%define rnd_1d_rv40 rnd_rv40_1d_tbl -%define rnd_2d_rv40 rnd_rv40_2d_tbl -%define extra_regs 1 -%endif ; PIC -%else -%define extra_regs 0 -%endif ; rv40 -; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, -; int stride, int h, int mx, int my) -cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0 -%if ARCH_X86_64 - movsxd r2, r2d -%endif - mov r6d, r5d - or r6d, r4d - jne .at_least_one_non_zero - ; mx == 0 AND my == 0 - no filter needed - mv0_pixels_mc8 - REP_RET - -.at_least_one_non_zero: -%ifidn %2, rv40 -%if ARCH_X86_64 - mov r7, r5 - and r7, 6 ; &~1 for mx/my=[0,7] - lea r7, [r7*4+r4] - sar r7d, 1 -%define rnd_bias r7 -%define dest_reg r0 -%else ; x86-32 - mov r0, r5 - and r0, 6 ; &~1 for mx/my=[0,7] - lea r0, [r0*4+r4] - sar r0d, 1 -%define rnd_bias r0 -%define dest_reg r5 -%endif -%else ; vc1, h264 -%define rnd_bias 0 -%define dest_reg r0 -%endif - - test r5d, r5d - mov r6, 1 - je .my_is_zero - test r4d, r4d - mov r6, r2 ; dxy = x ? 
1 : stride - jne .both_non_zero -.my_is_zero: - ; mx == 0 XOR my == 0 - 1 dimensional filter only - or r4d, r5d ; x + y - -%ifidn %2, rv40 -%ifdef PIC - lea r8, [rnd_rv40_1d_tbl] -%endif -%if ARCH_X86_64 == 0 - mov r5, r0m -%endif -%endif - - movd m5, r4d - movq m4, [pw_8] - movq m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3 - punpcklwd m5, m5 - punpckldq m5, m5 ; mm5 = B = x - pxor m7, m7 - psubw m4, m5 ; mm4 = A = 8-x - -.next1drow: - movq m0, [r1 ] ; mm0 = src[0..7] - movq m2, [r1+r6] ; mm1 = src[1..8] - - movq m1, m0 - movq m3, m2 - punpcklbw m0, m7 - punpckhbw m1, m7 - punpcklbw m2, m7 - punpckhbw m3, m7 - pmullw m0, m4 ; [mm0,mm1] = A * src[0..7] - pmullw m1, m4 - pmullw m2, m5 ; [mm2,mm3] = B * src[1..8] - pmullw m3, m5 - - paddw m0, m6 - paddw m1, m6 - paddw m0, m2 - paddw m1, m3 - psrlw m0, 3 - psrlw m1, 3 - packuswb m0, m1 - CHROMAMC_AVG m0, [dest_reg] - movq [dest_reg], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 - - add dest_reg, r2 - add r1, r2 - dec r3d - jne .next1drow - REP_RET - -.both_non_zero: ; general case, bilinear - movd m4, r4d ; x - movd m6, r5d ; y -%ifidn %2, rv40 -%ifdef PIC - lea r8, [rnd_rv40_2d_tbl] -%endif -%if ARCH_X86_64 == 0 - mov r5, r0m -%endif -%endif - mov r6, rsp ; backup stack pointer - and rsp, ~(mmsize-1) ; align stack - sub rsp, 16 ; AA and DD - - punpcklwd m4, m4 - punpcklwd m6, m6 - punpckldq m4, m4 ; mm4 = x words - punpckldq m6, m6 ; mm6 = y words - movq m5, m4 - pmullw m4, m6 ; mm4 = x * y - psllw m5, 3 - psllw m6, 3 - movq m7, m5 - paddw m7, m6 - movq [rsp+8], m4 ; DD = x * y - psubw m5, m4 ; mm5 = B = 8x - xy - psubw m6, m4 ; mm6 = C = 8y - xy - paddw m4, [pw_64] - psubw m4, m7 ; mm4 = A = xy - (8x+8y) + 64 - pxor m7, m7 - movq [rsp ], m4 - - movq m0, [r1 ] ; mm0 = src[0..7] - movq m1, [r1+1] ; mm1 = src[1..8] -.next2drow: - add r1, r2 - - movq m2, m0 - movq m3, m1 - punpckhbw m0, m7 - punpcklbw m1, m7 - punpcklbw m2, m7 - punpckhbw m3, m7 - pmullw m0, [rsp] - pmullw m2, [rsp] - pmullw m1, m5 - pmullw m3, m5 - paddw m2, m1 ; mm2 = A * src[0..3] + B * src[1..4] - paddw m3, m0 ; mm3 = A * src[4..7] + B * src[5..8] - - movq m0, [r1] - movq m1, m0 - punpcklbw m0, m7 - punpckhbw m1, m7 - pmullw m0, m6 - pmullw m1, m6 - paddw m2, m0 - paddw m3, m1 ; [mm2,mm3] += C * src[0..7] - - movq m1, [r1+1] - movq m0, m1 - movq m4, m1 - punpcklbw m0, m7 - punpckhbw m4, m7 - pmullw m0, [rsp+8] - pmullw m4, [rsp+8] - paddw m2, m0 - paddw m3, m4 ; [mm2,mm3] += D * src[1..8] - movq m0, [r1] - - paddw m2, [rnd_2d_%2+rnd_bias*8] - paddw m3, [rnd_2d_%2+rnd_bias*8] - psrlw m2, 6 - psrlw m3, 6 - packuswb m2, m3 - CHROMAMC_AVG m2, [dest_reg] - movq [dest_reg], m2 ; dst[0..7] = ([mm2,mm3] + rnd) >> 6 - - add dest_reg, r2 - dec r3d - jne .next2drow - mov rsp, r6 ; restore stack pointer - RET -%endmacro - -%macro chroma_mc4_mmx_func 2 -%define extra_regs 0 -%ifidn %2, rv40 -%ifdef PIC -%define extra_regs 1 -%endif ; PIC -%endif ; rv40 -cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0 -%if ARCH_X86_64 - movsxd r2, r2d -%endif - pxor m7, m7 - movd m2, r4d ; x - movd m3, r5d ; y - movq m4, [pw_8] - movq m5, [pw_8] - punpcklwd m2, m2 - punpcklwd m3, m3 - punpcklwd m2, m2 - punpcklwd m3, m3 - psubw m4, m2 - psubw m5, m3 - -%ifidn %2, rv40 -%ifdef PIC - lea r6, [rnd_rv40_2d_tbl] -%define rnd_2d_rv40 r6 -%else -%define rnd_2d_rv40 rnd_rv40_2d_tbl -%endif - and r5, 6 ; &~1 for mx/my=[0,7] - lea r5, [r5*4+r4] - sar r5d, 1 -%define rnd_bias r5 -%else ; vc1, h264 -%define rnd_bias 0 -%endif - - movd m0, [r1 ] - movd m6, [r1+1] - add r1, r2 - punpcklbw m0, m7 - 
punpcklbw m6, m7 - pmullw m0, m4 - pmullw m6, m2 - paddw m6, m0 - -.next2rows: - movd m0, [r1 ] - movd m1, [r1+1] - add r1, r2 - punpcklbw m0, m7 - punpcklbw m1, m7 - pmullw m0, m4 - pmullw m1, m2 - paddw m1, m0 - movq m0, m1 - - pmullw m6, m5 - pmullw m1, m3 - paddw m6, [rnd_2d_%2+rnd_bias*8] - paddw m1, m6 - psrlw m1, 6 - packuswb m1, m1 - CHROMAMC_AVG4 m1, m6, [r0] - movd [r0], m1 - add r0, r2 - - movd m6, [r1 ] - movd m1, [r1+1] - add r1, r2 - punpcklbw m6, m7 - punpcklbw m1, m7 - pmullw m6, m4 - pmullw m1, m2 - paddw m1, m6 - movq m6, m1 - pmullw m0, m5 - pmullw m1, m3 - paddw m0, [rnd_2d_%2+rnd_bias*8] - paddw m1, m0 - psrlw m1, 6 - packuswb m1, m1 - CHROMAMC_AVG4 m1, m0, [r0] - movd [r0], m1 - add r0, r2 - sub r3d, 2 - jnz .next2rows - REP_RET -%endmacro - -%macro chroma_mc2_mmx_func 2 -cglobal %1_%2_chroma_mc2, 6, 7, 0 -%if ARCH_X86_64 - movsxd r2, r2d -%endif - - mov r6d, r4d - shl r4d, 16 - sub r4d, r6d - add r4d, 8 - imul r5d, r4d ; x*y<<16 | y*(8-x) - shl r4d, 3 - sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y) - - movd m5, r4d - movd m6, r5d - punpckldq m5, m5 ; mm5 = {A,B,A,B} - punpckldq m6, m6 ; mm6 = {C,D,C,D} - pxor m7, m7 - movd m2, [r1] - punpcklbw m2, m7 - pshufw m2, m2, 0x94 ; mm0 = src[0,1,1,2] - -.nextrow: - add r1, r2 - movq m1, m2 - pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2] - movd m0, [r1] - punpcklbw m0, m7 - pshufw m0, m0, 0x94 ; mm0 = src[0,1,1,2] - movq m2, m0 - pmaddwd m0, m6 - paddw m1, [rnd_2d_%2] - paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2] - psrlw m1, 6 - packssdw m1, m7 - packuswb m1, m7 - CHROMAMC_AVG4 m1, m3, [r0] - movd r5d, m1 - mov [r0], r5w - add r0, r2 - sub r3d, 1 - jnz .nextrow - REP_RET -%endmacro - -%define rnd_1d_h264 pw_4 -%define rnd_2d_h264 pw_32 -%define rnd_1d_vc1 pw_3 -%define rnd_2d_vc1 pw_28 - -%macro NOTHING 2-3 -%endmacro -%macro DIRECT_AVG 2 - PAVGB %1, %2 -%endmacro -%macro COPY_AVG 3 - movd %2, %3 - PAVGB %1, %2 -%endmacro - -INIT_MMX mmx -%define CHROMAMC_AVG NOTHING -%define CHROMAMC_AVG4 NOTHING -chroma_mc8_mmx_func put, h264, _rnd -chroma_mc8_mmx_func put, vc1, _nornd -chroma_mc8_mmx_func put, rv40 -chroma_mc4_mmx_func put, h264 -chroma_mc4_mmx_func put, rv40 - -INIT_MMX mmxext -chroma_mc2_mmx_func put, h264 - -%define CHROMAMC_AVG DIRECT_AVG -%define CHROMAMC_AVG4 COPY_AVG -chroma_mc8_mmx_func avg, h264, _rnd -chroma_mc8_mmx_func avg, vc1, _nornd -chroma_mc8_mmx_func avg, rv40 -chroma_mc4_mmx_func avg, h264 -chroma_mc4_mmx_func avg, rv40 -chroma_mc2_mmx_func avg, h264 - -INIT_MMX 3dnow -chroma_mc8_mmx_func avg, h264, _rnd -chroma_mc8_mmx_func avg, vc1, _nornd -chroma_mc8_mmx_func avg, rv40 -chroma_mc4_mmx_func avg, h264 -chroma_mc4_mmx_func avg, rv40 - -%macro chroma_mc8_ssse3_func 2-3 -cglobal %1_%2_chroma_mc8%3, 6, 7, 8 -%if ARCH_X86_64 - movsxd r2, r2d -%endif - mov r6d, r5d - or r6d, r4d - jne .at_least_one_non_zero - ; mx == 0 AND my == 0 - no filter needed - mv0_pixels_mc8 - REP_RET - -.at_least_one_non_zero: - test r5d, r5d - je .my_is_zero - test r4d, r4d - je .mx_is_zero - - ; general case, bilinear - mov r6d, r4d - shl r4d, 8 - sub r4, r6 - mov r6, 8 - add r4, 8 ; x*288+8 = x<<8 | (8-x) - sub r6d, r5d - imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x) - imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x) - - movd m7, r6d - movd m6, r4d - movdqa m5, [rnd_2d_%2] - movq m0, [r1 ] - movq m1, [r1+1] - pshuflw m7, m7, 0 - pshuflw m6, m6, 0 - punpcklbw m0, m1 - movlhps m7, m7 - movlhps m6, m6 - -.next2rows: - movq m1, [r1+r2*1 ] - movq m2, [r1+r2*1+1] - movq m3, [r1+r2*2 ] - movq m4, [r1+r2*2+1] - lea r1, 
[r1+r2*2] - punpcklbw m1, m2 - movdqa m2, m1 - punpcklbw m3, m4 - movdqa m4, m3 - pmaddubsw m0, m7 - pmaddubsw m1, m6 - pmaddubsw m2, m7 - pmaddubsw m3, m6 - paddw m0, m5 - paddw m2, m5 - paddw m1, m0 - paddw m3, m2 - psrlw m1, 6 - movdqa m0, m4 - psrlw m3, 6 -%ifidn %1, avg - movq m2, [r0 ] - movhps m2, [r0+r2] -%endif - packuswb m1, m3 - CHROMAMC_AVG m1, m2 - movq [r0 ], m1 - movhps [r0+r2], m1 - sub r3d, 2 - lea r0, [r0+r2*2] - jg .next2rows - REP_RET - -.my_is_zero: - mov r5d, r4d - shl r4d, 8 - add r4, 8 - sub r4, r5 ; 255*x+8 = x<<8 | (8-x) - movd m7, r4d - movdqa m6, [rnd_1d_%2] - pshuflw m7, m7, 0 - movlhps m7, m7 - -.next2xrows: - movq m0, [r1 ] - movq m1, [r1 +1] - movq m2, [r1+r2 ] - movq m3, [r1+r2+1] - punpcklbw m0, m1 - punpcklbw m2, m3 - pmaddubsw m0, m7 - pmaddubsw m2, m7 -%ifidn %1, avg - movq m4, [r0 ] - movhps m4, [r0+r2] -%endif - paddw m0, m6 - paddw m2, m6 - psrlw m0, 3 - psrlw m2, 3 - packuswb m0, m2 - CHROMAMC_AVG m0, m4 - movq [r0 ], m0 - movhps [r0+r2], m0 - sub r3d, 2 - lea r0, [r0+r2*2] - lea r1, [r1+r2*2] - jg .next2xrows - REP_RET - -.mx_is_zero: - mov r4d, r5d - shl r5d, 8 - add r5, 8 - sub r5, r4 ; 255*y+8 = y<<8 | (8-y) - movd m7, r5d - movdqa m6, [rnd_1d_%2] - pshuflw m7, m7, 0 - movlhps m7, m7 - -.next2yrows: - movq m0, [r1 ] - movq m1, [r1+r2 ] - movdqa m2, m1 - movq m3, [r1+r2*2] - lea r1, [r1+r2*2] - punpcklbw m0, m1 - punpcklbw m2, m3 - pmaddubsw m0, m7 - pmaddubsw m2, m7 -%ifidn %1, avg - movq m4, [r0 ] - movhps m4, [r0+r2] -%endif - paddw m0, m6 - paddw m2, m6 - psrlw m0, 3 - psrlw m2, 3 - packuswb m0, m2 - CHROMAMC_AVG m0, m4 - movq [r0 ], m0 - movhps [r0+r2], m0 - sub r3d, 2 - lea r0, [r0+r2*2] - jg .next2yrows - REP_RET -%endmacro - -%macro chroma_mc4_ssse3_func 2 -cglobal %1_%2_chroma_mc4, 6, 7, 0 -%if ARCH_X86_64 - movsxd r2, r2d -%endif - mov r6, r4 - shl r4d, 8 - sub r4d, r6d - mov r6, 8 - add r4d, 8 ; x*288+8 - sub r6d, r5d - imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x) - imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x) - - movd m7, r6d - movd m6, r4d - movq m5, [pw_32] - movd m0, [r1 ] - pshufw m7, m7, 0 - punpcklbw m0, [r1+1] - pshufw m6, m6, 0 - -.next2rows: - movd m1, [r1+r2*1 ] - movd m3, [r1+r2*2 ] - punpcklbw m1, [r1+r2*1+1] - punpcklbw m3, [r1+r2*2+1] - lea r1, [r1+r2*2] - movq m2, m1 - movq m4, m3 - pmaddubsw m0, m7 - pmaddubsw m1, m6 - pmaddubsw m2, m7 - pmaddubsw m3, m6 - paddw m0, m5 - paddw m2, m5 - paddw m1, m0 - paddw m3, m2 - psrlw m1, 6 - movq m0, m4 - psrlw m3, 6 - packuswb m1, m1 - packuswb m3, m3 - CHROMAMC_AVG m1, [r0 ] - CHROMAMC_AVG m3, [r0+r2] - movd [r0 ], m1 - movd [r0+r2], m3 - sub r3d, 2 - lea r0, [r0+r2*2] - jg .next2rows - REP_RET -%endmacro - -%define CHROMAMC_AVG NOTHING -INIT_XMM ssse3 -chroma_mc8_ssse3_func put, h264, _rnd -chroma_mc8_ssse3_func put, vc1, _nornd -INIT_MMX ssse3 -chroma_mc4_ssse3_func put, h264 - -%define CHROMAMC_AVG DIRECT_AVG -INIT_XMM ssse3 -chroma_mc8_ssse3_func avg, h264, _rnd -chroma_mc8_ssse3_func avg, vc1, _nornd -INIT_MMX ssse3 -chroma_mc4_ssse3_func avg, h264 diff --git a/ffmpeg1/libavcodec/x86/h264_chromamc_10bit.asm b/ffmpeg1/libavcodec/x86/h264_chromamc_10bit.asm deleted file mode 100644 index b850551..0000000 --- a/ffmpeg1/libavcodec/x86/h264_chromamc_10bit.asm +++ /dev/null @@ -1,271 +0,0 @@ -;***************************************************************************** -;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code -;***************************************************************************** -;* Copyright (C) 2005-2011 x264 project -;* -;* Authors: 
Daniel Kang <daniel.d.kang@gmail.com> -;* -;* This file is part of Libav. -;* -;* Libav is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -cextern pw_4 -cextern pw_8 -cextern pw_32 -cextern pw_64 - -SECTION .text - - -%macro MV0_PIXELS_MC8 0 - lea r4, [r2*3 ] - lea r5, [r2*4 ] -.next4rows: - movu m0, [r1 ] - movu m1, [r1+r2 ] - CHROMAMC_AVG m0, [r0 ] - CHROMAMC_AVG m1, [r0+r2 ] - mova [r0 ], m0 - mova [r0+r2 ], m1 - movu m0, [r1+r2*2] - movu m1, [r1+r4 ] - CHROMAMC_AVG m0, [r0+r2*2] - CHROMAMC_AVG m1, [r0+r4 ] - mova [r0+r2*2], m0 - mova [r0+r4 ], m1 - add r1, r5 - add r0, r5 - sub r3d, 4 - jne .next4rows -%endmacro - -;----------------------------------------------------------------------------- -; void put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h, int mx, int my) -;----------------------------------------------------------------------------- -%macro CHROMA_MC8 1 -; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, -; int stride, int h, int mx, int my) -cglobal %1_h264_chroma_mc8_10, 6,7,8 - movsxdifnidn r2, r2d - mov r6d, r5d - or r6d, r4d - jne .at_least_one_non_zero - ; mx == 0 AND my == 0 - no filter needed - MV0_PIXELS_MC8 - REP_RET - -.at_least_one_non_zero: - mov r6d, 2 - test r5d, r5d - je .x_interpolation - mov r6, r2 ; dxy = x ? 
1 : stride - test r4d, r4d - jne .xy_interpolation -.x_interpolation: - ; mx == 0 XOR my == 0 - 1 dimensional filter only - or r4d, r5d ; x + y - movd m5, r4d - mova m4, [pw_8] - mova m6, [pw_4] ; mm6 = rnd >> 3 - SPLATW m5, m5 ; mm5 = B = x - psubw m4, m5 ; mm4 = A = 8-x - -.next1drow: - movu m0, [r1 ] ; mm0 = src[0..7] - movu m2, [r1+r6] ; mm2 = src[1..8] - - pmullw m0, m4 ; mm0 = A * src[0..7] - pmullw m2, m5 ; mm2 = B * src[1..8] - - paddw m0, m6 - paddw m0, m2 - psrlw m0, 3 - CHROMAMC_AVG m0, [r0] - mova [r0], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 - - add r0, r2 - add r1, r2 - dec r3d - jne .next1drow - REP_RET - -.xy_interpolation: ; general case, bilinear - movd m4, r4m ; x - movd m6, r5m ; y - - SPLATW m4, m4 ; mm4 = x words - SPLATW m6, m6 ; mm6 = y words - psllw m5, m4, 3 ; mm5 = 8x - pmullw m4, m6 ; mm4 = x * y - psllw m6, 3 ; mm6 = 8y - paddw m1, m5, m6 ; mm7 = 8x+8y - mova m7, m4 ; DD = x * y - psubw m5, m4 ; mm5 = B = 8x - xy - psubw m6, m4 ; mm6 = C = 8y - xy - paddw m4, [pw_64] - psubw m4, m1 ; mm4 = A = xy - (8x+8y) + 64 - - movu m0, [r1 ] ; mm0 = src[0..7] - movu m1, [r1+2] ; mm1 = src[1..8] -.next2drow: - add r1, r2 - - pmullw m2, m0, m4 - pmullw m1, m5 - paddw m2, m1 ; mm2 = A * src[0..7] + B * src[1..8] - - movu m0, [r1] - movu m1, [r1+2] - pmullw m3, m0, m6 - paddw m2, m3 ; mm2 += C * src[0..7+strde] - pmullw m3, m1, m7 - paddw m2, m3 ; mm2 += D * src[1..8+strde] - - paddw m2, [pw_32] - psrlw m2, 6 - CHROMAMC_AVG m2, [r0] - mova [r0], m2 ; dst[0..7] = (mm2 + 32) >> 6 - - add r0, r2 - dec r3d - jne .next2drow - REP_RET -%endmacro - -;----------------------------------------------------------------------------- -; void put/avg_h264_chroma_mc4(pixel *dst, pixel *src, int stride, int h, int mx, int my) -;----------------------------------------------------------------------------- -;TODO: xmm mc4 -%macro MC4_OP 2 - movq %1, [r1 ] - movq m1, [r1+2] - add r1, r2 - pmullw %1, m4 - pmullw m1, m2 - paddw m1, %1 - mova %1, m1 - - pmullw %2, m5 - pmullw m1, m3 - paddw %2, [pw_32] - paddw m1, %2 - psrlw m1, 6 - CHROMAMC_AVG m1, %2, [r0] - movq [r0], m1 - add r0, r2 -%endmacro - -%macro CHROMA_MC4 1 -cglobal %1_h264_chroma_mc4_10, 6,6,7 - movsxdifnidn r2, r2d - movd m2, r4m ; x - movd m3, r5m ; y - mova m4, [pw_8] - mova m5, m4 - SPLATW m2, m2 - SPLATW m3, m3 - psubw m4, m2 - psubw m5, m3 - - movq m0, [r1 ] - movq m6, [r1+2] - add r1, r2 - pmullw m0, m4 - pmullw m6, m2 - paddw m6, m0 - -.next2rows: - MC4_OP m0, m6 - MC4_OP m6, m0 - sub r3d, 2 - jnz .next2rows - REP_RET -%endmacro - -;----------------------------------------------------------------------------- -; void put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h, int mx, int my) -;----------------------------------------------------------------------------- -%macro CHROMA_MC2 1 -cglobal %1_h264_chroma_mc2_10, 6,7 - movsxdifnidn r2, r2d - mov r6d, r4d - shl r4d, 16 - sub r4d, r6d - add r4d, 8 - imul r5d, r4d ; x*y<<16 | y*(8-x) - shl r4d, 3 - sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y) - - movd m5, r4d - movd m6, r5d - punpckldq m5, m5 ; mm5 = {A,B,A,B} - punpckldq m6, m6 ; mm6 = {C,D,C,D} - pxor m7, m7 - pshufw m2, [r1], 0x94 ; mm0 = src[0,1,1,2] - -.nextrow: - add r1, r2 - movq m1, m2 - pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2] - pshufw m0, [r1], 0x94 ; mm0 = src[0,1,1,2] - movq m2, m0 - pmaddwd m0, m6 - paddw m1, [pw_32] - paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2] - psrlw m1, 6 - packssdw m1, m7 - CHROMAMC_AVG m1, m3, [r0] - movd [r0], m1 - add r0, r2 - dec r3d - 
jnz .nextrow - REP_RET -%endmacro - -%macro NOTHING 2-3 -%endmacro -%macro AVG 2-3 -%if %0==3 - movq %2, %3 -%endif - pavgw %1, %2 -%endmacro - -%define CHROMAMC_AVG NOTHING -INIT_XMM sse2 -CHROMA_MC8 put -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -CHROMA_MC8 put -%endif -INIT_MMX mmxext -CHROMA_MC4 put -CHROMA_MC2 put - -%define CHROMAMC_AVG AVG -INIT_XMM sse2 -CHROMA_MC8 avg -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -CHROMA_MC8 avg -%endif -INIT_MMX mmxext -CHROMA_MC4 avg -CHROMA_MC2 avg diff --git a/ffmpeg1/libavcodec/x86/h264_deblock.asm b/ffmpeg1/libavcodec/x86/h264_deblock.asm deleted file mode 100644 index d58e16c..0000000 --- a/ffmpeg1/libavcodec/x86/h264_deblock.asm +++ /dev/null @@ -1,1083 +0,0 @@ -;***************************************************************************** -;* MMX/SSE2/AVX-optimized H.264 deblocking code -;***************************************************************************** -;* Copyright (C) 2005-2011 x264 project -;* -;* Authors: Loren Merritt <lorenm@u.washington.edu> -;* Jason Garrett-Glaser <darkshikari@gmail.com> -;* Oskar Arvidsson <oskar@irock.se> -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -pb_A1: times 16 db 0xA1 -pb_3_1: times 4 db 3, 1 - -SECTION .text - -cextern pb_0 -cextern pb_1 -cextern pb_3 - -; expands to [base],...,[base+7*stride] -%define PASS8ROWS(base, base3, stride, stride3) \ - [base], [base+stride], [base+stride*2], [base3], \ - [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] - -%define PASS8ROWS(base, base3, stride, stride3, offset) \ - PASS8ROWS(base+offset, base3+offset, stride, stride3) - -; in: 8 rows of 4 bytes in %4..%11 -; out: 4 rows of 8 bytes in m0..m3 -%macro TRANSPOSE4x8_LOAD 11 - movh m0, %4 - movh m2, %5 - movh m1, %6 - movh m3, %7 - punpckl%1 m0, m2 - punpckl%1 m1, m3 - mova m2, m0 - punpckl%2 m0, m1 - punpckh%2 m2, m1 - - movh m4, %8 - movh m6, %9 - movh m5, %10 - movh m7, %11 - punpckl%1 m4, m6 - punpckl%1 m5, m7 - mova m6, m4 - punpckl%2 m4, m5 - punpckh%2 m6, m5 - - punpckh%3 m1, m0, m4 - punpckh%3 m3, m2, m6 - punpckl%3 m0, m4 - punpckl%3 m2, m6 -%endmacro - -; in: 4 rows of 8 bytes in m0..m3 -; out: 8 rows of 4 bytes in %1..%8 -%macro TRANSPOSE8x4B_STORE 8 - punpckhdq m4, m0, m0 - punpckhdq m5, m1, m1 - punpckhdq m6, m2, m2 - - punpcklbw m0, m1 - punpcklbw m2, m3 - punpcklwd m1, m0, m2 - punpckhwd m0, m2 - movh %1, m1 - punpckhdq m1, m1 - movh %2, m1 - movh %3, m0 - punpckhdq m0, m0 - movh %4, m0 - - punpckhdq m3, m3 - punpcklbw m4, m5 - punpcklbw m6, m3 - punpcklwd m5, m4, m6 - punpckhwd m4, m6 - movh %5, m5 - punpckhdq m5, m5 - movh %6, m5 - movh %7, m4 - punpckhdq m4, m4 - movh %8, m4 -%endmacro - -%macro TRANSPOSE4x8B_LOAD 8 - 
TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8 -%endmacro - -%macro SBUTTERFLY3 4 - punpckh%1 %4, %2, %3 - punpckl%1 %2, %3 -%endmacro - -; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 -; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16] -%macro TRANSPOSE6x8_MEM 9 - RESET_MM_PERMUTATION - movq m0, %1 - movq m1, %2 - movq m2, %3 - movq m3, %4 - movq m4, %5 - movq m5, %6 - movq m6, %7 - SBUTTERFLY bw, 0, 1, 7 - SBUTTERFLY bw, 2, 3, 7 - SBUTTERFLY bw, 4, 5, 7 - movq [%9+0x10], m3 - SBUTTERFLY3 bw, m6, %8, m7 - SBUTTERFLY wd, 0, 2, 3 - SBUTTERFLY wd, 4, 6, 3 - punpckhdq m0, m4 - movq [%9+0x00], m0 - SBUTTERFLY3 wd, m1, [%9+0x10], m3 - SBUTTERFLY wd, 5, 7, 0 - SBUTTERFLY dq, 1, 5, 0 - SBUTTERFLY dq, 2, 6, 0 - punpckldq m3, m7 - movq [%9+0x10], m2 - movq [%9+0x20], m6 - movq [%9+0x30], m1 - movq [%9+0x40], m5 - movq [%9+0x50], m3 - RESET_MM_PERMUTATION -%endmacro - -; in: 8 rows of 8 in %1..%8 -; out: 8 rows of 8 in %9..%16 -%macro TRANSPOSE8x8_MEM 16 - RESET_MM_PERMUTATION - movq m0, %1 - movq m1, %2 - movq m2, %3 - movq m3, %4 - movq m4, %5 - movq m5, %6 - movq m6, %7 - SBUTTERFLY bw, 0, 1, 7 - SBUTTERFLY bw, 2, 3, 7 - SBUTTERFLY bw, 4, 5, 7 - SBUTTERFLY3 bw, m6, %8, m7 - movq %9, m5 - SBUTTERFLY wd, 0, 2, 5 - SBUTTERFLY wd, 4, 6, 5 - SBUTTERFLY wd, 1, 3, 5 - movq %11, m6 - movq m6, %9 - SBUTTERFLY wd, 6, 7, 5 - SBUTTERFLY dq, 0, 4, 5 - SBUTTERFLY dq, 1, 6, 5 - movq %9, m0 - movq %10, m4 - movq %13, m1 - movq %14, m6 - SBUTTERFLY3 dq, m2, %11, m0 - SBUTTERFLY dq, 3, 7, 4 - movq %11, m2 - movq %12, m0 - movq %15, m3 - movq %16, m7 - RESET_MM_PERMUTATION -%endmacro - -; out: %4 = |%1-%2|>%3 -; clobbers: %5 -%macro DIFF_GT 5 -%if avx_enabled == 0 - mova %5, %2 - mova %4, %1 - psubusb %5, %1 - psubusb %4, %2 -%else - psubusb %5, %2, %1 - psubusb %4, %1, %2 -%endif - por %4, %5 - psubusb %4, %3 -%endmacro - -; out: %4 = |%1-%2|>%3 -; clobbers: %5 -%macro DIFF_GT2 5 -%if ARCH_X86_64 - psubusb %5, %2, %1 - psubusb %4, %1, %2 -%else - mova %5, %2 - mova %4, %1 - psubusb %5, %1 - psubusb %4, %2 -%endif - psubusb %5, %3 - psubusb %4, %3 - pcmpeqb %4, %5 -%endmacro - -; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1 -; out: m5=beta-1, m7=mask, %3=alpha-1 -; clobbers: m4,m6 -%macro LOAD_MASK 2-3 - movd m4, %1 - movd m5, %2 - SPLATW m4, m4 - SPLATW m5, m5 - packuswb m4, m4 ; 16x alpha-1 - packuswb m5, m5 ; 16x beta-1 -%if %0>2 - mova %3, m4 -%endif - DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1 - DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1 - por m7, m4 - DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1 - por m7, m4 - pxor m6, m6 - pcmpeqb m7, m6 -%endmacro - -; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask) -; out: m1=p0' m2=q0' -; clobbers: m0,3-6 -%macro DEBLOCK_P0_Q0 0 - pcmpeqb m4, m4 - pxor m5, m1, m2 ; p0^q0 - pxor m3, m4 - pand m5, [pb_1] ; (p0^q0)&1 - pavgb m3, m0 ; (p1 - q1 + 256)>>1 - pxor m4, m1 - pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 - pavgb m4, m2 ; (q0 - p0 + 256)>>1 - pavgb m3, m5 - mova m6, [pb_A1] - paddusb m3, m4 ; d+128+33 - psubusb m6, m3 - psubusb m3, [pb_A1] - pminub m6, m7 - pminub m3, m7 - psubusb m1, m6 - psubusb m2, m3 - paddusb m1, m3 - paddusb m2, m6 -%endmacro - -; in: m1=p0 m2=q0 -; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp -; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) -; clobbers: q2, tmp, tc0 -%macro LUMA_Q1 6 - pavgb %6, m1, m2 - pavgb %2, %6 ; avg(p2,avg(p0,q0)) - pxor %6, %3 - pand %6, [pb_1] ; (p2^avg(p0,q0))&1 - psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 - psubusb %6, %1, %5 - paddusb %5, %1 - pmaxub %2, %6 - 
pminub %2, %5 - mova %4, %2 -%endmacro - -%if ARCH_X86_64 -;----------------------------------------------------------------------------- -; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -%macro DEBLOCK_LUMA 0 -cglobal deblock_v_luma_8, 5,5,10 - movd m8, [r4] ; tc0 - lea r4, [r1*3] - dec r2d ; alpha-1 - neg r4 - dec r3d ; beta-1 - add r4, r0 ; pix-3*stride - - mova m0, [r4+r1] ; p1 - mova m1, [r4+2*r1] ; p0 - mova m2, [r0] ; q0 - mova m3, [r0+r1] ; q1 - LOAD_MASK r2d, r3d - - punpcklbw m8, m8 - punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] - pcmpeqb m9, m9 - pcmpeqb m9, m8 - pandn m9, m7 - pand m8, m9 - - movdqa m3, [r4] ; p2 - DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 - pand m6, m9 - psubb m7, m8, m6 - pand m6, m8 - LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 - - movdqa m4, [r0+2*r1] ; q2 - DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 - pand m6, m9 - pand m8, m6 - psubb m7, m6 - mova m3, [r0+r1] - LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6 - - DEBLOCK_P0_Q0 - mova [r4+2*r1], m1 - mova [r0], m2 - RET - -;----------------------------------------------------------------------------- -; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -INIT_MMX cpuname -cglobal deblock_h_luma_8, 5,9 - movsxd r7, r1d - lea r8, [r7+r7*2] - lea r6, [r0-4] - lea r5, [r0-4+r8] -%if WIN64 - sub rsp, 0x98 - %define pix_tmp rsp+0x30 -%else - sub rsp, 0x68 - %define pix_tmp rsp -%endif - - ; transpose 6x16 -> tmp space - TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp - lea r6, [r6+r7*8] - lea r5, [r5+r7*8] - TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8 - - ; vertical filter - ; alpha, beta, tc0 are still in r2d, r3d, r4 - ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them - lea r0, [pix_tmp+0x30] - mov r1d, 0x10 -%if WIN64 - mov [rsp+0x20], r4 -%endif - call deblock_v_luma_8 - - ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) - add r6, 2 - add r5, 2 - movq m0, [pix_tmp+0x18] - movq m1, [pix_tmp+0x28] - movq m2, [pix_tmp+0x38] - movq m3, [pix_tmp+0x48] - TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8) - - shl r7, 3 - sub r6, r7 - sub r5, r7 - shr r7, 3 - movq m0, [pix_tmp+0x10] - movq m1, [pix_tmp+0x20] - movq m2, [pix_tmp+0x30] - movq m3, [pix_tmp+0x40] - TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8) - -%if WIN64 - add rsp, 0x98 -%else - add rsp, 0x68 -%endif - RET -%endmacro - -INIT_XMM sse2 -DEBLOCK_LUMA -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -DEBLOCK_LUMA -%endif - -%else - -%macro DEBLOCK_LUMA 2 -;----------------------------------------------------------------------------- -; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -cglobal deblock_%1_luma_8, 5,5,8,2*%2 - lea r4, [r1*3] - dec r2 ; alpha-1 - neg r4 - dec r3 ; beta-1 - add r4, r0 ; pix-3*stride - - mova m0, [r4+r1] ; p1 - mova m1, [r4+2*r1] ; p0 - mova m2, [r0] ; q0 - mova m3, [r0+r1] ; q1 - LOAD_MASK r2, r3 - - mov r3, r4mp - pcmpeqb m3, m3 - movd m4, [r3] ; tc0 - punpcklbw m4, m4 - punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] - mova [esp+%2], m4 ; tc - pcmpgtb m4, m3 - mova m3, [r4] ; p2 - pand m4, m7 - mova [esp], m4 ; mask - - DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 - pand m6, m4 - pand m4, 
[esp+%2] ; tc - psubb m7, m4, m6 - pand m6, m4 - LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 - - mova m4, [r0+2*r1] ; q2 - DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 - pand m6, [esp] ; mask - mova m5, [esp+%2] ; tc - psubb m7, m6 - pand m5, m6 - mova m3, [r0+r1] - LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6 - - DEBLOCK_P0_Q0 - mova [r4+2*r1], m1 - mova [r0], m2 - RET - -;----------------------------------------------------------------------------- -; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -INIT_MMX cpuname -cglobal deblock_h_luma_8, 0,5,8,0x60+HAVE_ALIGNED_STACK*12 - mov r0, r0mp - mov r3, r1m - lea r4, [r3*3] - sub r0, 4 - lea r1, [r0+r4] -%define pix_tmp esp+12*HAVE_ALIGNED_STACK - - ; transpose 6x16 -> tmp space - TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp - lea r0, [r0+r3*8] - lea r1, [r1+r3*8] - TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8 - - ; vertical filter - lea r0, [pix_tmp+0x30] - PUSH dword r4m - PUSH dword r3m - PUSH dword r2m - PUSH dword 16 - PUSH dword r0 - call deblock_%1_luma_8 -%ifidn %1, v8 - add dword [esp ], 8 ; pix_tmp+0x38 - add dword [esp+16], 2 ; tc0+2 - call deblock_%1_luma_8 -%endif - ADD esp, 20 - - ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) - mov r0, r0mp - sub r0, 2 - - movq m0, [pix_tmp+0x10] - movq m1, [pix_tmp+0x20] - lea r1, [r0+r4] - movq m2, [pix_tmp+0x30] - movq m3, [pix_tmp+0x40] - TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) - - lea r0, [r0+r3*8] - lea r1, [r1+r3*8] - movq m0, [pix_tmp+0x18] - movq m1, [pix_tmp+0x28] - movq m2, [pix_tmp+0x38] - movq m3, [pix_tmp+0x48] - TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) - - RET -%endmacro ; DEBLOCK_LUMA - -INIT_MMX mmxext -DEBLOCK_LUMA v8, 8 -INIT_XMM sse2 -DEBLOCK_LUMA v, 16 -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -DEBLOCK_LUMA v, 16 -%endif - -%endif ; ARCH - - - -%macro LUMA_INTRA_P012 4 ; p0..p3 in memory -%if ARCH_X86_64 - pavgb t0, p2, p1 - pavgb t1, p0, q0 -%else - mova t0, p2 - mova t1, p0 - pavgb t0, p1 - pavgb t1, q0 -%endif - pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2 - mova t5, t1 -%if ARCH_X86_64 - paddb t2, p2, p1 - paddb t3, p0, q0 -%else - mova t2, p2 - mova t3, p0 - paddb t2, p1 - paddb t3, q0 -%endif - paddb t2, t3 - mova t3, t2 - mova t4, t2 - psrlw t2, 1 - pavgb t2, mpb_0 - pxor t2, t0 - pand t2, mpb_1 - psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; - -%if ARCH_X86_64 - pavgb t1, p2, q1 - psubb t2, p2, q1 -%else - mova t1, p2 - mova t2, p2 - pavgb t1, q1 - psubb t2, q1 -%endif - paddb t3, t3 - psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 - pand t2, mpb_1 - psubb t1, t2 - pavgb t1, p1 - pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2 - psrlw t3, 2 - pavgb t3, mpb_0 - pxor t3, t1 - pand t3, mpb_1 - psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 - - pxor t3, p0, q1 - pavgb t2, p0, q1 - pand t3, mpb_1 - psubb t2, t3 - pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 - - pxor t1, t2 - pxor t2, p0 - pand t1, mask1p - pand t2, mask0 - pxor t1, t2 - pxor t1, p0 - mova %1, t1 ; store p0 - - mova t1, %4 ; p3 - paddb t2, t1, p2 - pavgb t1, p2 - pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4 - paddb t2, t2 - paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 - psrlw t2, 2 - pavgb t2, mpb_0 - pxor t2, t1 - pand t2, mpb_1 - psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8 - - pxor t0, p1 - pxor t1, p2 - pand t0, mask1p - pand t1, mask1p - pxor t0, p1 - pxor t1, p2 - mova %2, t0 ; store p1 - mova %3, t1 ; store p2 -%endmacro - -%macro 
LUMA_INTRA_SWAP_PQ 0 - %define q1 m0 - %define q0 m1 - %define p0 m2 - %define p1 m3 - %define p2 q2 - %define mask1p mask1q -%endmacro - -%macro DEBLOCK_LUMA_INTRA 1 - %define p1 m0 - %define p0 m1 - %define q0 m2 - %define q1 m3 - %define t0 m4 - %define t1 m5 - %define t2 m6 - %define t3 m7 -%if ARCH_X86_64 - %define p2 m8 - %define q2 m9 - %define t4 m10 - %define t5 m11 - %define mask0 m12 - %define mask1p m13 -%if WIN64 - %define mask1q [rsp] -%else - %define mask1q [rsp-24] -%endif - %define mpb_0 m14 - %define mpb_1 m15 -%else - %define spill(x) [esp+16*x] - %define p2 [r4+r1] - %define q2 [r0+2*r1] - %define t4 spill(0) - %define t5 spill(1) - %define mask0 spill(2) - %define mask1p spill(3) - %define mask1q spill(4) - %define mpb_0 [pb_0] - %define mpb_1 [pb_1] -%endif - -;----------------------------------------------------------------------------- -; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) -;----------------------------------------------------------------------------- -%if WIN64 -cglobal deblock_%1_luma_intra_8, 4,6,16,0x10 -%else -cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50 -%endif - lea r4, [r1*4] - lea r5, [r1*3] ; 3*stride - dec r2d ; alpha-1 - jl .end - neg r4 - dec r3d ; beta-1 - jl .end - add r4, r0 ; pix-4*stride - mova p1, [r4+2*r1] - mova p0, [r4+r5] - mova q0, [r0] - mova q1, [r0+r1] -%if ARCH_X86_64 - pxor mpb_0, mpb_0 - mova mpb_1, [pb_1] - LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 - SWAP 7, 12 ; m12=mask0 - pavgb t5, mpb_0 - pavgb t5, mpb_1 ; alpha/4+1 - movdqa p2, [r4+r1] - movdqa q2, [r0+2*r1] - DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1 - DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1 - DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1 - pand t0, mask0 - pand t4, t0 - pand t2, t0 - mova mask1q, t4 - mova mask1p, t2 -%else - LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 - mova m4, t5 - mova mask0, m7 - pavgb m4, [pb_0] - pavgb m4, [pb_1] ; alpha/4+1 - DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 - pand m6, mask0 - DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 - pand m4, m6 - mova mask1p, m4 - DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1 - pand m4, m6 - mova mask1q, m4 -%endif - LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4] - LUMA_INTRA_SWAP_PQ - LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5] -.end: - RET - -INIT_MMX cpuname -%if ARCH_X86_64 -;----------------------------------------------------------------------------- -; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) -;----------------------------------------------------------------------------- -cglobal deblock_h_luma_intra_8, 4,9 - movsxd r7, r1d - lea r8, [r7*3] - lea r6, [r0-4] - lea r5, [r0-4+r8] - sub rsp, 0x88 - %define pix_tmp rsp - - ; transpose 8x16 -> tmp space - TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) - lea r6, [r6+r7*8] - lea r5, [r5+r7*8] - TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) - - lea r0, [pix_tmp+0x40] - mov r1, 0x10 - call deblock_v_luma_intra_8 - - ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) - lea r5, [r6+r8] - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8) - shl r7, 3 - sub r6, r7 - sub r5, r7 - shr r7, 3 - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8) - add rsp, 0x88 - RET -%else -cglobal 
deblock_h_luma_intra_8, 2,4,8,0x80 - lea r3, [r1*3] - sub r0, 4 - lea r2, [r0+r3] - %define pix_tmp rsp - - ; transpose 8x16 -> tmp space - TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) - lea r0, [r0+r1*8] - lea r2, [r2+r1*8] - TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) - - lea r0, [pix_tmp+0x40] - PUSH dword r3m - PUSH dword r2m - PUSH dword 16 - PUSH r0 - call deblock_%1_luma_intra_8 -%ifidn %1, v8 - add dword [rsp], 8 ; pix_tmp+8 - call deblock_%1_luma_intra_8 -%endif - ADD esp, 16 - - mov r1, r1m - mov r0, r0mp - lea r3, [r1*3] - sub r0, 4 - lea r2, [r0+r3] - ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) - lea r0, [r0+r1*8] - lea r2, [r2+r1*8] - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) - RET -%endif ; ARCH_X86_64 -%endmacro ; DEBLOCK_LUMA_INTRA - -INIT_XMM sse2 -DEBLOCK_LUMA_INTRA v -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -DEBLOCK_LUMA_INTRA v -%endif -%if ARCH_X86_64 == 0 -INIT_MMX mmxext -DEBLOCK_LUMA_INTRA v8 -%endif - -INIT_MMX mmxext - -%macro CHROMA_V_START 0 - dec r2d ; alpha-1 - dec r3d ; beta-1 - mov t5, r0 - sub t5, r1 - sub t5, r1 -%endmacro - -%macro CHROMA_H_START 0 - dec r2d - dec r3d - sub r0, 2 - lea t6, [r1*3] - mov t5, r0 - add r0, t6 -%endmacro - -%define t5 r5 -%define t6 r6 - -;----------------------------------------------------------------------------- -; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -cglobal deblock_v_chroma_8, 5,6 - CHROMA_V_START - movq m0, [t5] - movq m1, [t5+r1] - movq m2, [r0] - movq m3, [r0+r1] - call ff_chroma_inter_body_mmxext - movq [t5+r1], m1 - movq [r0], m2 - RET - -;----------------------------------------------------------------------------- -; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -cglobal deblock_h_chroma_8, 5,7 -%if UNIX64 - %define buf0 [rsp-24] - %define buf1 [rsp-16] -%elif WIN64 - sub rsp, 16 - %define buf0 [rsp] - %define buf1 [rsp+8] -%else - %define buf0 r0m - %define buf1 r2m -%endif - CHROMA_H_START - TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) - movq buf0, m0 - movq buf1, m3 - LOAD_MASK r2d, r3d - movd m6, [r4] ; tc0 - punpcklbw m6, m6 - pand m7, m6 - DEBLOCK_P0_Q0 - movq m0, buf0 - movq m3, buf1 - TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) -%if WIN64 - add rsp, 16 -%endif - RET - -ALIGN 16 -ff_chroma_inter_body_mmxext: - LOAD_MASK r2d, r3d - movd m6, [r4] ; tc0 - punpcklbw m6, m6 - pand m7, m6 - DEBLOCK_P0_Q0 - ret - - - -; in: %1=p0 %2=p1 %3=q1 -; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 -%macro CHROMA_INTRA_P0 3 - movq m4, %1 - pxor m4, %3 - pand m4, [pb_1] ; m4 = (p0^q1)&1 - pavgb %1, %3 - psubusb %1, m4 - pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) -%endmacro - -%define t5 r4 -%define t6 r5 - -;----------------------------------------------------------------------------- -; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) -;----------------------------------------------------------------------------- -cglobal deblock_v_chroma_intra_8, 4,5 - CHROMA_V_START - movq m0, [t5] - movq m1, [t5+r1] - movq m2, [r0] - movq m3, [r0+r1] - call 
ff_chroma_intra_body_mmxext - movq [t5+r1], m1 - movq [r0], m2 - RET - -;----------------------------------------------------------------------------- -; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) -;----------------------------------------------------------------------------- -cglobal deblock_h_chroma_intra_8, 4,6 - CHROMA_H_START - TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) - call ff_chroma_intra_body_mmxext - TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) - RET - -ALIGN 16 -ff_chroma_intra_body_mmxext: - LOAD_MASK r2d, r3d - movq m5, m1 - movq m6, m2 - CHROMA_INTRA_P0 m1, m0, m3 - CHROMA_INTRA_P0 m2, m3, m0 - psubb m1, m5 - psubb m2, m6 - pand m1, m7 - pand m2, m7 - paddb m1, m5 - paddb m2, m6 - ret - -;----------------------------------------------------------------------------- -; void h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40], -; int8_t ref[2][40], int16_t mv[2][40][2], -; int bidir, int edges, int step, -; int mask_mv0, int mask_mv1, int field); -; -; bidir is 0 or 1 -; edges is 1 or 4 -; step is 1 or 2 -; mask_mv0 is 0 or 3 -; mask_mv1 is 0 or 1 -; field is 0 or 1 -;----------------------------------------------------------------------------- -%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv, - ; dir, d_idx, mask_dir, bidir -%define edgesd %1 -%define stepd %2 -%define mask_mvd %3 -%define dir %4 -%define d_idx %5 -%define mask_dir %6 -%define bidir %7 - xor b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step) -%%.b_idx_loop: -%if mask_dir == 0 - pxor m0, m0 -%endif - test b_idxd, dword mask_mvd - jnz %%.skip_loop_iter ; if (!(b_idx & mask_mv)) -%if bidir == 1 - movd m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] } - punpckldq m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] } - pshufw m0, [refq+b_idxq+12], 0x44 ; { ref0[b], ref0[b] } - pshufw m1, [refq+b_idxq+52], 0x44 ; { ref1[b], ref1[b] } - pshufw m3, m2, 0x4E ; { ref1[bn], ref0[bn] } - psubb m0, m2 ; { ref0[b] != ref0[bn], - ; ref0[b] != ref1[bn] } - psubb m1, m3 ; { ref1[b] != ref1[bn], - ; ref1[b] != ref0[bn] } - - por m0, m1 - mova m1, [mvq+b_idxq*4+(d_idx+12)*4] - mova m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize] - mova m3, m1 - mova m4, m2 - psubw m1, [mvq+b_idxq*4+12*4] - psubw m2, [mvq+b_idxq*4+12*4+mmsize] - psubw m3, [mvq+b_idxq*4+52*4] - psubw m4, [mvq+b_idxq*4+52*4+mmsize] - packsswb m1, m2 - packsswb m3, m4 - paddb m1, m6 - paddb m3, m6 - psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit - psubusb m3, m5 - packsswb m1, m3 - - por m0, m1 - mova m1, [mvq+b_idxq*4+(d_idx+52)*4] - mova m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize] - mova m3, m1 - mova m4, m2 - psubw m1, [mvq+b_idxq*4+12*4] - psubw m2, [mvq+b_idxq*4+12*4+mmsize] - psubw m3, [mvq+b_idxq*4+52*4] - psubw m4, [mvq+b_idxq*4+52*4+mmsize] - packsswb m1, m2 - packsswb m3, m4 - paddb m1, m6 - paddb m3, m6 - psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit - psubusb m3, m5 - packsswb m1, m3 - - pshufw m1, m1, 0x4E - por m0, m1 - pshufw m1, m0, 0x4E - pminub m0, m1 -%else ; bidir == 0 - movd m0, [refq+b_idxq+12] - psubb m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn] - - mova m1, [mvq+b_idxq*4+12*4] - mova m2, [mvq+b_idxq*4+12*4+mmsize] - psubw m1, [mvq+b_idxq*4+(d_idx+12)*4] - psubw m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize] - packsswb m1, m2 - paddb m1, m6 - psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit - packsswb m1, m1 - por m0, m1 -%endif ; bidir == 1/0 - -%%.skip_loop_iter: - movd m1, [nnzq+b_idxq+12] - por m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn] - - pminub m1, m7 - pminub m0, m7 - psllw 
m1, 1 - pxor m2, m2 - pmaxub m1, m0 - punpcklbw m1, m2 - movq [bsq+b_idxq+32*dir], m1 - - add b_idxd, dword stepd - cmp b_idxd, dword edgesd - jl %%.b_idx_loop -%endmacro - -INIT_MMX mmxext -cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \ - step, mask_mv0, mask_mv1, field -%define b_idxq bidirq -%define b_idxd bidird - cmp dword fieldm, 0 - mova m7, [pb_1] - mova m5, [pb_3] - je .nofield - mova m5, [pb_3_1] -.nofield: - mova m6, m5 - paddb m5, m5 - - shl dword stepd, 3 - shl dword edgesd, 3 -%if ARCH_X86_32 -%define mask_mv0d mask_mv0m -%define mask_mv1d mask_mv1m -%endif - shl dword mask_mv1d, 3 - shl dword mask_mv0d, 3 - - cmp dword bidird, 0 - jne .bidir - loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 0 - loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 0 - - mova m0, [bsq+mmsize*0] - mova m1, [bsq+mmsize*1] - mova m2, [bsq+mmsize*2] - mova m3, [bsq+mmsize*3] - TRANSPOSE4x4W 0, 1, 2, 3, 4 - mova [bsq+mmsize*0], m0 - mova [bsq+mmsize*1], m1 - mova [bsq+mmsize*2], m2 - mova [bsq+mmsize*3], m3 - RET - -.bidir: - loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 1 - loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 1 - - mova m0, [bsq+mmsize*0] - mova m1, [bsq+mmsize*1] - mova m2, [bsq+mmsize*2] - mova m3, [bsq+mmsize*3] - TRANSPOSE4x4W 0, 1, 2, 3, 4 - mova [bsq+mmsize*0], m0 - mova [bsq+mmsize*1], m1 - mova [bsq+mmsize*2], m2 - mova [bsq+mmsize*3], m3 - RET diff --git a/ffmpeg1/libavcodec/x86/h264_deblock_10bit.asm b/ffmpeg1/libavcodec/x86/h264_deblock_10bit.asm deleted file mode 100644 index d63ca02..0000000 --- a/ffmpeg1/libavcodec/x86/h264_deblock_10bit.asm +++ /dev/null @@ -1,923 +0,0 @@ -;***************************************************************************** -;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code -;***************************************************************************** -;* Copyright (C) 2005-2011 x264 project -;* -;* Authors: Oskar Arvidsson <oskar@irock.se> -;* Loren Merritt <lorenm@u.washington.edu> -;* Jason Garrett-Glaser <darkshikari@gmail.com> -;* -;* This file is part of Libav. -;* -;* Libav is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
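
[editorial reference sketch] Both the 8-bit deblocking file above and the 10-bit file beginning here build their normal (bS < 4) edge filter from the same LOAD_MASK / DEBLOCK_P0_Q0 pattern. The arithmetic those macros vectorize amounts to the following plain-C sketch; it is illustrative only (placeholder names, not FFmpeg API), and it omits the p2/q2-dependent tc adjustment and the p1/q1 updates performed by LUMA_Q1.

#include <stdint.h>
#include <stdlib.h>

static int clip3(int x, int lo, int hi) { return x < lo ? lo : x > hi ? hi : x; }

/* One edge position; xstride steps across the edge (1 for vertical edges,
 * the image stride for horizontal ones).  8-bit case shown; the 10-bit code
 * is the same arithmetic with a pw_pixel_max clamp instead of 255. */
static void deblock_p0_q0_c(uint8_t *pix, int xstride, int alpha, int beta, int tc0)
{
    int p1 = pix[-2 * xstride], p0 = pix[-1 * xstride];
    int q0 = pix[ 0 * xstride], q1 = pix[ 1 * xstride];

    /* LOAD_MASK: filter only where the edge looks like a blocking artefact */
    if (abs(p0 - q0) >= alpha || abs(p1 - p0) >= beta || abs(q1 - q0) >= beta)
        return;

    /* DEBLOCK_P0_Q0: delta = clip(((q0-p0)*4 + (p1-q1) + 4) >> 3, -tc0, tc0) */
    int delta = clip3(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc0, tc0);
    pix[-1 * xstride] = clip3(p0 + delta, 0, 255);  /* p0' */
    pix[ 0 * xstride] = clip3(q0 - delta, 0, 255);  /* q0' */
}
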
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -pw_pixel_max: times 8 dw ((1 << 10)-1) - -SECTION .text - -cextern pw_2 -cextern pw_3 -cextern pw_4 - -; out: %4 = |%1-%2|-%3 -; clobbers: %5 -%macro ABS_SUB 5 - psubusw %5, %2, %1 - psubusw %4, %1, %2 - por %4, %5 - psubw %4, %3 -%endmacro - -; out: %4 = |%1-%2|<%3 -%macro DIFF_LT 5 - psubusw %4, %2, %1 - psubusw %5, %1, %2 - por %5, %4 ; |%1-%2| - pxor %4, %4 - psubw %5, %3 ; |%1-%2|-%3 - pcmpgtw %4, %5 ; 0 > |%1-%2|-%3 -%endmacro - -%macro LOAD_AB 4 - movd %1, %3 - movd %2, %4 - SPLATW %1, %1 - SPLATW %2, %2 -%endmacro - -; in: %2=tc reg -; out: %1=splatted tc -%macro LOAD_TC 2 - movd %1, [%2] - punpcklbw %1, %1 -%if mmsize == 8 - pshufw %1, %1, 0 -%else - pshuflw %1, %1, 01010000b - pshufd %1, %1, 01010000b -%endif - psraw %1, 6 -%endmacro - -; in: %1=p1, %2=p0, %3=q0, %4=q1 -; %5=alpha, %6=beta, %7-%9=tmp -; out: %7=mask -%macro LOAD_MASK 9 - ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha - ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta - pand %8, %9 - ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta - pxor %7, %7 - pand %8, %9 - pcmpgtw %7, %8 -%endmacro - -; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp -; out: %1=p0', m2=q0' -%macro DEBLOCK_P0_Q0 7 - psubw %3, %4 - pxor %7, %7 - paddw %3, [pw_4] - psubw %7, %5 - psubw %6, %2, %1 - psllw %6, 2 - paddw %3, %6 - psraw %3, 3 - mova %6, [pw_pixel_max] - CLIPW %3, %7, %5 - pxor %7, %7 - paddw %1, %3 - psubw %2, %3 - CLIPW %1, %7, %6 - CLIPW %2, %7, %6 -%endmacro - -; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp -%macro LUMA_Q1 6 - pavgw %6, %3, %4 ; (p0+q0+1)>>1 - paddw %1, %6 - pxor %6, %6 - psraw %1, 1 - psubw %6, %5 - psubw %1, %2 - CLIPW %1, %6, %5 - paddw %1, %2 -%endmacro - -%macro LUMA_DEBLOCK_ONE 3 - DIFF_LT m5, %1, bm, m4, m6 - pxor m6, m6 - mova %3, m4 - pcmpgtw m6, tcm - pand m4, tcm - pandn m6, m7 - pand m4, m6 - LUMA_Q1 m5, %2, m1, m2, m4, m6 -%endmacro - -%macro LUMA_H_STORE 2 -%if mmsize == 8 - movq [r0-4], m0 - movq [r0+r1-4], m1 - movq [r0+r1*2-4], m2 - movq [r0+%2-4], m3 -%else - movq [r0-4], m0 - movhps [r0+r1-4], m0 - movq [r0+r1*2-4], m1 - movhps [%1-4], m1 - movq [%1+r1-4], m2 - movhps [%1+r1*2-4], m2 - movq [%1+%2-4], m3 - movhps [%1+r1*4-4], m3 -%endif -%endmacro - -%macro DEBLOCK_LUMA 0 -;----------------------------------------------------------------------------- -; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -cglobal deblock_v_luma_10, 5,5,8*(mmsize/16) - %assign pad 5*mmsize+12-(stack_offset&15) - %define tcm [rsp] - %define ms1 [rsp+mmsize] - %define ms2 [rsp+mmsize*2] - %define am [rsp+mmsize*3] - %define bm [rsp+mmsize*4] - SUB rsp, pad - shl r2d, 2 - shl r3d, 2 - LOAD_AB m4, m5, r2d, r3d - mov r3, 32/mmsize - mov r2, r0 - sub r0, r1 - mova am, m4 - sub r0, r1 - mova bm, m5 - sub r0, r1 -.loop: - mova m0, [r0+r1] - mova m1, [r0+r1*2] - mova m2, [r2] - mova m3, [r2+r1] - - LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 - LOAD_TC m6, r4 - mova tcm, m6 - - mova m5, [r0] - LUMA_DEBLOCK_ONE m1, m0, ms1 - mova [r0+r1], m5 - - mova m5, [r2+r1*2] - LUMA_DEBLOCK_ONE m2, m3, ms2 - mova [r2+r1], m5 - - pxor m5, m5 - mova m6, tcm - pcmpgtw m5, 
tcm - psubw m6, ms1 - pandn m5, m7 - psubw m6, ms2 - pand m5, m6 - DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6 - mova [r0+r1*2], m1 - mova [r2], m2 - - add r0, mmsize - add r2, mmsize - add r4, mmsize/8 - dec r3 - jg .loop - ADD rsp, pad - RET - -cglobal deblock_h_luma_10, 5,6,8*(mmsize/16) - %assign pad 7*mmsize+12-(stack_offset&15) - %define tcm [rsp] - %define ms1 [rsp+mmsize] - %define ms2 [rsp+mmsize*2] - %define p1m [rsp+mmsize*3] - %define p2m [rsp+mmsize*4] - %define am [rsp+mmsize*5] - %define bm [rsp+mmsize*6] - SUB rsp, pad - shl r2d, 2 - shl r3d, 2 - LOAD_AB m4, m5, r2d, r3d - mov r3, r1 - mova am, m4 - add r3, r1 - mov r5, 32/mmsize - mova bm, m5 - add r3, r1 -%if mmsize == 16 - mov r2, r0 - add r2, r3 -%endif -.loop: -%if mmsize == 8 - movq m2, [r0-8] ; y q2 q1 q0 - movq m7, [r0+0] - movq m5, [r0+r1-8] - movq m3, [r0+r1+0] - movq m0, [r0+r1*2-8] - movq m6, [r0+r1*2+0] - movq m1, [r0+r3-8] - TRANSPOSE4x4W 2, 5, 0, 1, 4 - SWAP 2, 7 - movq m7, [r0+r3] - TRANSPOSE4x4W 2, 3, 6, 7, 4 -%else - movu m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x - movu m0, [r0+r1-8] - movu m2, [r0+r1*2-8] - movu m3, [r2-8] - TRANSPOSE4x4W 5, 0, 2, 3, 6 - mova tcm, m3 - - movu m4, [r2+r1-8] - movu m1, [r2+r1*2-8] - movu m3, [r2+r3-8] - movu m7, [r2+r1*4-8] - TRANSPOSE4x4W 4, 1, 3, 7, 6 - - mova m6, tcm - punpcklqdq m6, m7 - punpckhqdq m5, m4 - SBUTTERFLY qdq, 0, 1, 7 - SBUTTERFLY qdq, 2, 3, 7 -%endif - - mova p2m, m6 - LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 - LOAD_TC m6, r4 - mova tcm, m6 - - LUMA_DEBLOCK_ONE m1, m0, ms1 - mova p1m, m5 - - mova m5, p2m - LUMA_DEBLOCK_ONE m2, m3, ms2 - mova p2m, m5 - - pxor m5, m5 - mova m6, tcm - pcmpgtw m5, tcm - psubw m6, ms1 - pandn m5, m7 - psubw m6, ms2 - pand m5, m6 - DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6 - mova m0, p1m - mova m3, p2m - TRANSPOSE4x4W 0, 1, 2, 3, 4 - LUMA_H_STORE r2, r3 - - add r4, mmsize/8 - lea r0, [r0+r1*(mmsize/2)] - lea r2, [r2+r1*(mmsize/2)] - dec r5 - jg .loop - ADD rsp, pad - RET -%endmacro - -%if ARCH_X86_64 -; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2 -; m12=alpha, m13=beta -; out: m0=p1', m3=q1', m1=p0', m2=q0' -; clobbers: m4, m5, m6, m7, m10, m11, m14 -%macro DEBLOCK_LUMA_INTER_SSE2 0 - LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6 - LOAD_TC m6, r4 - DIFF_LT m8, m1, m13, m10, m4 - DIFF_LT m9, m2, m13, m11, m4 - pand m6, m7 - - mova m14, m6 - pxor m4, m4 - pcmpgtw m6, m4 - pand m6, m14 - - mova m5, m10 - pand m5, m6 - LUMA_Q1 m8, m0, m1, m2, m5, m4 - - mova m5, m11 - pand m5, m6 - LUMA_Q1 m9, m3, m1, m2, m5, m4 - - pxor m4, m4 - psubw m6, m10 - pcmpgtw m4, m14 - pandn m4, m7 - psubw m6, m11 - pand m4, m6 - DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6 - - SWAP 0, 8 - SWAP 3, 9 -%endmacro - -%macro DEBLOCK_LUMA_64 0 -cglobal deblock_v_luma_10, 5,5,15 - %define p2 m8 - %define p1 m0 - %define p0 m1 - %define q0 m2 - %define q1 m3 - %define q2 m9 - %define mask0 m7 - %define mask1 m10 - %define mask2 m11 - shl r2d, 2 - shl r3d, 2 - LOAD_AB m12, m13, r2d, r3d - mov r2, r0 - sub r0, r1 - sub r0, r1 - sub r0, r1 - mov r3, 2 -.loop: - mova p2, [r0] - mova p1, [r0+r1] - mova p0, [r0+r1*2] - mova q0, [r2] - mova q1, [r2+r1] - mova q2, [r2+r1*2] - DEBLOCK_LUMA_INTER_SSE2 - mova [r0+r1], p1 - mova [r0+r1*2], p0 - mova [r2], q0 - mova [r2+r1], q1 - add r0, mmsize - add r2, mmsize - add r4, 2 - dec r3 - jg .loop - REP_RET - -cglobal deblock_h_luma_10, 5,7,15 - shl r2d, 2 - shl r3d, 2 - LOAD_AB m12, m13, r2d, r3d - mov r2, r1 - add r2, r1 - add r2, r1 - mov r5, r0 - add r5, r2 - mov r6, 2 -.loop: - movu m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x - movu m0, 
[r0+r1-8] - movu m2, [r0+r1*2-8] - movu m9, [r5-8] - movu m5, [r5+r1-8] - movu m1, [r5+r1*2-8] - movu m3, [r5+r2-8] - movu m7, [r5+r1*4-8] - - TRANSPOSE4x4W 8, 0, 2, 9, 10 - TRANSPOSE4x4W 5, 1, 3, 7, 10 - - punpckhqdq m8, m5 - SBUTTERFLY qdq, 0, 1, 10 - SBUTTERFLY qdq, 2, 3, 10 - punpcklqdq m9, m7 - - DEBLOCK_LUMA_INTER_SSE2 - - TRANSPOSE4x4W 0, 1, 2, 3, 4 - LUMA_H_STORE r5, r2 - add r4, 2 - lea r0, [r0+r1*8] - lea r5, [r5+r1*8] - dec r6 - jg .loop - REP_RET -%endmacro - -INIT_XMM sse2 -DEBLOCK_LUMA_64 -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -DEBLOCK_LUMA_64 -%endif -%endif - -%macro SWAPMOVA 2 -%ifid %1 - SWAP %1, %2 -%else - mova %1, %2 -%endif -%endmacro - -; in: t0-t2: tmp registers -; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0 -; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2' -%macro LUMA_INTRA_P012 12 ; p0..p3 in memory -%if ARCH_X86_64 - paddw t0, %3, %2 - mova t2, %4 - paddw t2, %3 -%else - mova t0, %3 - mova t2, %4 - paddw t0, %2 - paddw t2, %3 -%endif - paddw t0, %1 - paddw t2, t2 - paddw t0, %5 - paddw t2, %9 - paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2) - paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) - - psrlw t2, 3 - psrlw t1, t0, 2 - psubw t2, %3 - psubw t1, %2 - pand t2, %8 - pand t1, %8 - paddw t2, %3 - paddw t1, %2 - SWAPMOVA %11, t1 - - psubw t1, t0, %3 - paddw t0, t0 - psubw t1, %5 - psubw t0, %3 - paddw t1, %6 - paddw t1, %2 - paddw t0, %6 - psrlw t1, 2 ; (2*p1 + p0 + q1 + 2)/4 - psrlw t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3 - - pxor t0, t1 - pxor t1, %1 - pand t0, %8 - pand t1, %7 - pxor t0, t1 - pxor t0, %1 - SWAPMOVA %10, t0 - SWAPMOVA %12, t2 -%endmacro - -%macro LUMA_INTRA_INIT 1 - %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15) - %define t0 m4 - %define t1 m5 - %define t2 m6 - %define t3 m7 - %assign i 4 -%rep %1 - CAT_XDEFINE t, i, [rsp+mmsize*(i-4)] - %assign i i+1 -%endrep - SUB rsp, pad -%endmacro - -; in: %1-%3=tmp, %4=p2, %5=q2 -%macro LUMA_INTRA_INTER 5 - LOAD_AB t0, t1, r2d, r3d - mova %1, t0 - LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3 -%if ARCH_X86_64 - mova %2, t0 ; mask0 - psrlw t3, %1, 2 -%else - mova t3, %1 - mova %2, t0 ; mask0 - psrlw t3, 2 -%endif - paddw t3, [pw_2] ; alpha/4+2 - DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2 - pand t2, %2 - mova t3, %5 ; q2 - mova %1, t2 ; mask1 - DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta - pand t2, %1 - mova t3, %4 ; p2 - mova %3, t2 ; mask1q - DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta - pand t2, %1 - mova %1, t2 ; mask1p -%endmacro - -%macro LUMA_H_INTRA_LOAD 0 -%if mmsize == 8 - movu t0, [r0-8] - movu t1, [r0+r1-8] - movu m0, [r0+r1*2-8] - movu m1, [r0+r4-8] - TRANSPOSE4x4W 4, 5, 0, 1, 2 - mova t4, t0 ; p3 - mova t5, t1 ; p2 - - movu m2, [r0] - movu m3, [r0+r1] - movu t0, [r0+r1*2] - movu t1, [r0+r4] - TRANSPOSE4x4W 2, 3, 4, 5, 6 - mova t6, t0 ; q2 - mova t7, t1 ; q3 -%else - movu t0, [r0-8] - movu t1, [r0+r1-8] - movu m0, [r0+r1*2-8] - movu m1, [r0+r5-8] - movu m2, [r4-8] - movu m3, [r4+r1-8] - movu t2, [r4+r1*2-8] - movu t3, [r4+r5-8] - TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5 - mova t4, t0 ; p3 - mova t5, t1 ; p2 - mova t6, t2 ; q2 - mova t7, t3 ; q3 -%endif -%endmacro - -; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp -%macro LUMA_H_INTRA_STORE 9 -%if mmsize == 8 - TRANSPOSE4x4W %1, %2, %3, %4, %9 - movq [r0-8], m%1 - movq [r0+r1-8], m%2 - movq [r0+r1*2-8], m%3 - movq [r0+r4-8], m%4 - movq m%1, %8 - TRANSPOSE4x4W %5, %6, %7, %1, %9 - movq [r0], m%5 - movq [r0+r1], m%6 - movq [r0+r1*2], m%7 - movq [r0+r4], m%1 -%else - TRANSPOSE2x4x4W %1, %2, %3, %4, 
%9 - movq [r0-8], m%1 - movq [r0+r1-8], m%2 - movq [r0+r1*2-8], m%3 - movq [r0+r5-8], m%4 - movhps [r4-8], m%1 - movhps [r4+r1-8], m%2 - movhps [r4+r1*2-8], m%3 - movhps [r4+r5-8], m%4 -%ifnum %8 - SWAP %1, %8 -%else - mova m%1, %8 -%endif - TRANSPOSE2x4x4W %5, %6, %7, %1, %9 - movq [r0], m%5 - movq [r0+r1], m%6 - movq [r0+r1*2], m%7 - movq [r0+r5], m%1 - movhps [r4], m%5 - movhps [r4+r1], m%6 - movhps [r4+r1*2], m%7 - movhps [r4+r5], m%1 -%endif -%endmacro - -%if ARCH_X86_64 -;----------------------------------------------------------------------------- -; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) -;----------------------------------------------------------------------------- -%macro DEBLOCK_LUMA_INTRA_64 0 -cglobal deblock_v_luma_intra_10, 4,7,16 - %define t0 m1 - %define t1 m2 - %define t2 m4 - %define p2 m8 - %define p1 m9 - %define p0 m10 - %define q0 m11 - %define q1 m12 - %define q2 m13 - %define aa m5 - %define bb m14 - lea r4, [r1*4] - lea r5, [r1*3] ; 3*stride - neg r4 - add r4, r0 ; pix-4*stride - mov r6, 2 - mova m0, [pw_2] - shl r2d, 2 - shl r3d, 2 - LOAD_AB aa, bb, r2d, r3d -.loop: - mova p2, [r4+r1] - mova p1, [r4+2*r1] - mova p0, [r4+r5] - mova q0, [r0] - mova q1, [r0+r1] - mova q2, [r0+2*r1] - - LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1 - mova t2, aa - psrlw t2, 2 - paddw t2, m0 ; alpha/4+2 - DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2 - DIFF_LT p2, p0, bb, t1, t0 ; m7 = |p2-p0| < beta - DIFF_LT q2, q0, bb, m7, t0 ; t1 = |q2-q0| < beta - pand m6, m3 - pand m7, m6 - pand m6, t1 - LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1] - LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1] - add r0, mmsize - add r4, mmsize - dec r6 - jg .loop - REP_RET - -;----------------------------------------------------------------------------- -; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) -;----------------------------------------------------------------------------- -cglobal deblock_h_luma_intra_10, 4,7,16 - %define t0 m15 - %define t1 m14 - %define t2 m2 - %define q3 m5 - %define q2 m8 - %define q1 m9 - %define q0 m10 - %define p0 m11 - %define p1 m12 - %define p2 m13 - %define p3 m4 - %define spill [rsp] - %assign pad 24-(stack_offset&15) - SUB rsp, pad - lea r4, [r1*4] - lea r5, [r1*3] ; 3*stride - add r4, r0 ; pix+4*stride - mov r6, 2 - mova m0, [pw_2] - shl r2d, 2 - shl r3d, 2 -.loop: - movu q3, [r0-8] - movu q2, [r0+r1-8] - movu q1, [r0+r1*2-8] - movu q0, [r0+r5-8] - movu p0, [r4-8] - movu p1, [r4+r1-8] - movu p2, [r4+r1*2-8] - movu p3, [r4+r5-8] - TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1 - - LOAD_AB m1, m2, r2d, r3d - LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1 - psrlw m1, 2 - paddw m1, m0 ; alpha/4+2 - DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2 - DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta - DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta - pand m6, m3 - pand m7, m6 - pand m6, t1 - - mova spill, q3 - LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2 - LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2 - mova m7, spill - - LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14 - - lea r0, [r0+r1*8] - lea r4, [r4+r1*8] - dec r6 - jg .loop - ADD rsp, pad - RET -%endmacro - -INIT_XMM sse2 -DEBLOCK_LUMA_INTRA_64 -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -DEBLOCK_LUMA_INTRA_64 -%endif - -%endif - -%macro DEBLOCK_LUMA_INTRA 0 -;----------------------------------------------------------------------------- -; void 
deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) -;----------------------------------------------------------------------------- -cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16) - LUMA_INTRA_INIT 3 - lea r4, [r1*4] - lea r5, [r1*3] - neg r4 - add r4, r0 - mov r6, 32/mmsize - shl r2d, 2 - shl r3d, 2 -.loop: - mova m0, [r4+r1*2] ; p1 - mova m1, [r4+r5] ; p0 - mova m2, [r0] ; q0 - mova m3, [r0+r1] ; q1 - LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2] - LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1] - mova t3, [r0+r1*2] ; q2 - LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1] - add r0, mmsize - add r4, mmsize - dec r6 - jg .loop - ADD rsp, pad - RET - -;----------------------------------------------------------------------------- -; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) -;----------------------------------------------------------------------------- -cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16) - LUMA_INTRA_INIT 8 -%if mmsize == 8 - lea r4, [r1*3] - mov r5, 32/mmsize -%else - lea r4, [r1*4] - lea r5, [r1*3] ; 3*stride - add r4, r0 ; pix+4*stride - mov r6, 32/mmsize -%endif - shl r2d, 2 - shl r3d, 2 -.loop: - LUMA_H_INTRA_LOAD - LUMA_INTRA_INTER t8, t9, t10, t5, t6 - - LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11 - mova t3, t6 ; q2 - LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5 - - mova m2, t4 - mova m0, t11 - mova m1, t5 - mova m3, t8 - mova m6, t6 - - LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7 - - lea r0, [r0+r1*(mmsize/2)] -%if mmsize == 8 - dec r5 -%else - lea r4, [r4+r1*(mmsize/2)] - dec r6 -%endif - jg .loop - ADD rsp, pad - RET -%endmacro - -%if ARCH_X86_64 == 0 -INIT_MMX mmxext -DEBLOCK_LUMA -DEBLOCK_LUMA_INTRA -INIT_XMM sse2 -DEBLOCK_LUMA -DEBLOCK_LUMA_INTRA -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -DEBLOCK_LUMA -DEBLOCK_LUMA_INTRA -%endif -%endif - -; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp -; out: %1=p0', %2=q0' -%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7 - mova %6, [pw_2] - paddw %6, %3 - paddw %6, %4 - paddw %7, %6, %2 - paddw %6, %1 - paddw %6, %3 - paddw %7, %4 - psraw %6, 2 - psraw %7, 2 - psubw %6, %1 - psubw %7, %2 - pand %6, %5 - pand %7, %5 - paddw %1, %6 - paddw %2, %7 -%endmacro - -%macro CHROMA_V_LOAD 1 - mova m0, [r0] ; p1 - mova m1, [r0+r1] ; p0 - mova m2, [%1] ; q0 - mova m3, [%1+r1] ; q1 -%endmacro - -%macro CHROMA_V_STORE 0 - mova [r0+1*r1], m1 - mova [r0+2*r1], m2 -%endmacro - -%macro CHROMA_V_LOAD_TC 2 - movd %1, [%2] - punpcklbw %1, %1 - punpcklwd %1, %1 - psraw %1, 6 -%endmacro - -%macro DEBLOCK_CHROMA 0 -;----------------------------------------------------------------------------- -; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16) - mov r5, r0 - sub r0, r1 - sub r0, r1 - shl r2d, 2 - shl r3d, 2 -%if mmsize < 16 - mov r6, 16/mmsize -.loop: -%endif - CHROMA_V_LOAD r5 - LOAD_AB m4, m5, r2d, r3d - LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 - pxor m4, m4 - CHROMA_V_LOAD_TC m6, r4 - psubw m6, [pw_3] - pmaxsw m6, m4 - pand m7, m6 - DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 - CHROMA_V_STORE -%if mmsize < 16 - add r0, mmsize - add r5, mmsize - add r4, mmsize/4 - dec r6 - jg .loop - REP_RET -%else - RET -%endif - -;----------------------------------------------------------------------------- -; void 
deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta ) -;----------------------------------------------------------------------------- -cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16) - mov r4, r0 - sub r0, r1 - sub r0, r1 - shl r2d, 2 - shl r3d, 2 -%if mmsize < 16 - mov r5, 16/mmsize -.loop: -%endif - CHROMA_V_LOAD r4 - LOAD_AB m4, m5, r2d, r3d - LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 - CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 - CHROMA_V_STORE -%if mmsize < 16 - add r0, mmsize - add r4, mmsize - dec r5 - jg .loop - REP_RET -%else - RET -%endif -%endmacro - -%if ARCH_X86_64 == 0 -INIT_MMX mmxext -DEBLOCK_CHROMA -%endif -INIT_XMM sse2 -DEBLOCK_CHROMA -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -DEBLOCK_CHROMA -%endif diff --git a/ffmpeg1/libavcodec/x86/h264_i386.h b/ffmpeg1/libavcodec/x86/h264_i386.h deleted file mode 100644 index 0dc0a7c..0000000 --- a/ffmpeg1/libavcodec/x86/h264_i386.h +++ /dev/null @@ -1,204 +0,0 @@ -/* - * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder - * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * H.264 / AVC / MPEG4 part10 codec. 
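
[editorial reference sketch] The chroma intra filters above (CHROMA_INTRA_P0 in the 8-bit file, CHROMA_DEBLOCK_P0_Q0_INTRA in the 10-bit file just before this point) implement the same two-tap update of p0 and q0. A minimal C sketch, with placeholder names and with the alpha/beta mask check assumed to have already been done by LOAD_MASK:

#include <stdint.h>

/* Intra (bS == 4) chroma edge: only p0/q0 change, no tc0 involved. */
static void chroma_intra_p0_q0_c(uint16_t *pix, int xstride)
{
    int p1 = pix[-2 * xstride], p0 = pix[-1 * xstride];
    int q0 = pix[ 0 * xstride], q1 = pix[ 1 * xstride];

    pix[-1 * xstride] = (2 * p1 + p0 + q1 + 2) >> 2;  /* p0' */
    pix[ 0 * xstride] = (2 * q1 + q0 + p1 + 2) >> 2;  /* q0' */
}
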
- * non-MMX i386-specific optimizations for H.264 - * @author Michael Niedermayer <michaelni@gmx.at> - */ - -#ifndef AVCODEC_X86_H264_I386_H -#define AVCODEC_X86_H264_I386_H - -#include <stddef.h> - -#include "libavcodec/cabac.h" -#include "cabac.h" - -#if HAVE_INLINE_ASM - -//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet -//as that would make optimization work hard) -#if HAVE_7REGS -#define decode_significance decode_significance_x86 -static int decode_significance_x86(CABACContext *c, int max_coeff, - uint8_t *significant_coeff_ctx_base, - int *index, x86_reg last_off){ - void *end= significant_coeff_ctx_base + max_coeff - 1; - int minusstart= -(intptr_t)significant_coeff_ctx_base; - int minusindex= 4-(intptr_t)index; - int bit; - x86_reg coeff_count; - -#ifdef BROKEN_RELOCATIONS - void *tables; - - __asm__ volatile( - "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t" - : "=&r"(tables) - ); -#endif - - __asm__ volatile( - "3: \n\t" - - BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3", - "%5", "%q5", "%k0", "%b0", - "%c11(%6)", "%c12(%6)", - AV_STRINGIFY(H264_NORM_SHIFT_OFFSET), - AV_STRINGIFY(H264_LPS_RANGE_OFFSET), - AV_STRINGIFY(H264_MLPS_STATE_OFFSET), - "%13") - - "test $1, %4 \n\t" - " jz 4f \n\t" - "add %10, %1 \n\t" - - BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3", - "%5", "%q5", "%k0", "%b0", - "%c11(%6)", "%c12(%6)", - AV_STRINGIFY(H264_NORM_SHIFT_OFFSET), - AV_STRINGIFY(H264_LPS_RANGE_OFFSET), - AV_STRINGIFY(H264_MLPS_STATE_OFFSET), - "%13") - - "sub %10, %1 \n\t" - "mov %2, %0 \n\t" - "movl %7, %%ecx \n\t" - "add %1, %%"REG_c" \n\t" - "movl %%ecx, (%0) \n\t" - - "test $1, %4 \n\t" - " jnz 5f \n\t" - - "add"OPSIZE" $4, %2 \n\t" - - "4: \n\t" - "add $1, %1 \n\t" - "cmp %8, %1 \n\t" - " jb 3b \n\t" - "mov %2, %0 \n\t" - "movl %7, %%ecx \n\t" - "add %1, %%"REG_c" \n\t" - "movl %%ecx, (%0) \n\t" - "5: \n\t" - "add %9, %k0 \n\t" - "shr $2, %k0 \n\t" - : "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index), - "+&r"(c->low), "=&r"(bit), "+&r"(c->range) - : "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off), - "i"(offsetof(CABACContext, bytestream)), - "i"(offsetof(CABACContext, bytestream_end)) - TABLES_ARG - : "%"REG_c, "memory" - ); - return coeff_count; -} - -#define decode_significance_8x8 decode_significance_8x8_x86 -static int decode_significance_8x8_x86(CABACContext *c, - uint8_t *significant_coeff_ctx_base, - int *index, uint8_t *last_coeff_ctx_base, const uint8_t *sig_off){ - int minusindex= 4-(intptr_t)index; - int bit; - x86_reg coeff_count; - x86_reg last=0; - x86_reg state; - -#ifdef BROKEN_RELOCATIONS - void *tables; - - __asm__ volatile( - "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t" - : "=&r"(tables) - ); -#endif - - __asm__ volatile( - "mov %1, %6 \n\t" - "3: \n\t" - - "mov %10, %0 \n\t" - "movzbl (%0, %6), %k6 \n\t" - "add %9, %6 \n\t" - - BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3", - "%5", "%q5", "%k0", "%b0", - "%c12(%7)", "%c13(%7)", - AV_STRINGIFY(H264_NORM_SHIFT_OFFSET), - AV_STRINGIFY(H264_LPS_RANGE_OFFSET), - AV_STRINGIFY(H264_MLPS_STATE_OFFSET), - "%15") - - "mov %1, %k6 \n\t" - "test $1, %4 \n\t" - " jz 4f \n\t" - -#ifdef BROKEN_RELOCATIONS - "movzbl %c14(%15, %q6), %k6\n\t" -#else - "movzbl "MANGLE(ff_h264_cabac_tables)"+%c14(%k6), %k6\n\t" -#endif - "add %11, %6 \n\t" - - BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3", - "%5", "%q5", "%k0", "%b0", - "%c12(%7)", "%c13(%7)", - AV_STRINGIFY(H264_NORM_SHIFT_OFFSET), - AV_STRINGIFY(H264_LPS_RANGE_OFFSET), - 
AV_STRINGIFY(H264_MLPS_STATE_OFFSET), - "%15") - - "mov %2, %0 \n\t" - "mov %1, %k6 \n\t" - "movl %k6, (%0) \n\t" - - "test $1, %4 \n\t" - " jnz 5f \n\t" - - "add"OPSIZE" $4, %2 \n\t" - - "4: \n\t" - "addl $1, %k6 \n\t" - "mov %k6, %1 \n\t" - "cmpl $63, %k6 \n\t" - " jb 3b \n\t" - "mov %2, %0 \n\t" - "movl %k6, (%0) \n\t" - "5: \n\t" - "addl %8, %k0 \n\t" - "shr $2, %k0 \n\t" - : "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low), - "=&r"(bit), "+&r"(c->range), "=&r"(state) - : "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), - "m"(sig_off), "m"(last_coeff_ctx_base), - "i"(offsetof(CABACContext, bytestream)), - "i"(offsetof(CABACContext, bytestream_end)), - "i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG - : "%"REG_c, "memory" - ); - return coeff_count; -} -#endif /* HAVE_7REGS && !defined(BROKEN_RELOCATIONS) */ - -#endif /* HAVE_INLINE_ASM */ -#endif /* AVCODEC_X86_H264_I386_H */ diff --git a/ffmpeg1/libavcodec/x86/h264_idct.asm b/ffmpeg1/libavcodec/x86/h264_idct.asm deleted file mode 100644 index 7bb1653..0000000 --- a/ffmpeg1/libavcodec/x86/h264_idct.asm +++ /dev/null @@ -1,1073 +0,0 @@ -;***************************************************************************** -;* MMX/SSE2-optimized H.264 iDCT -;***************************************************************************** -;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt -;* Copyright (C) 2003-2008 x264 project -;* -;* Authors: Laurent Aimar <fenrir@via.ecp.fr> -;* Loren Merritt <lorenm@u.washington.edu> -;* Holger Lubitz <hal@duncan.ol.sub.de> -;* Min Chen <chenm001.163.com> -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
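
[editorial reference sketch] The inline-asm decode_significance loops in h264_i386.h above correspond roughly to the following C logic. This is a sketch only: get_cabac() (as provided by libavcodec/cabac_functions.h) stands in for the BRANCHLESS_GET_CABAC inline asm, and the 8x8 variant's sig_off context remapping is not shown.

#include "libavcodec/cabac.h"
#include "libavcodec/cabac_functions.h"

/* Significance-map scan for one residual block: decode significant_coeff_flag
 * per position and, on a hit, last_significant_coeff_flag.  Returns the number
 * of significant coefficients and writes their positions into index[]. */
static int decode_significance_c(CABACContext *c, int max_coeff,
                                 uint8_t *sig_ctx, uint8_t *last_ctx, int *index)
{
    int coeff_count = 0, i;

    for (i = 0; i < max_coeff - 1; i++) {
        if (get_cabac(c, sig_ctx + i)) {           /* significant_coeff_flag */
            index[coeff_count++] = i;
            if (get_cabac(c, last_ctx + i))        /* last_significant_coeff_flag */
                return coeff_count;
        }
    }
    index[coeff_count++] = i;   /* final position is significant by implication */
    return coeff_count;
}
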
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;***************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split -scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8 - db 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8 - db 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8 - db 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8 - db 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8 - db 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8 - db 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8 - db 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8 - db 4+11*8, 5+11*8, 4+12*8, 5+12*8 - db 6+11*8, 7+11*8, 6+12*8, 7+12*8 - db 4+13*8, 5+13*8, 4+14*8, 5+14*8 - db 6+13*8, 7+13*8, 6+14*8, 7+14*8 -%ifdef PIC -%define npicregs 1 -%define scan8 picregq -%else -%define npicregs 0 -%define scan8 scan8_mem -%endif - -cextern pw_32 -cextern pw_1 - -SECTION .text - -; %1=uint8_t *dst, %2=int16_t *block, %3=int stride -%macro IDCT4_ADD 3 - ; Load dct coeffs - movq m0, [%2] - movq m1, [%2+8] - movq m2, [%2+16] - movq m3, [%2+24] - - IDCT4_1D w, 0, 1, 2, 3, 4, 5 - mova m6, [pw_32] - TRANSPOSE4x4W 0, 1, 2, 3, 4 - paddw m0, m6 - IDCT4_1D w, 0, 1, 2, 3, 4, 5 - pxor m7, m7 - movq [%2+ 0], m7 - movq [%2+ 8], m7 - movq [%2+16], m7 - movq [%2+24], m7 - - STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3 - lea %1, [%1+%3*2] - STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3 -%endmacro - -INIT_MMX mmx -; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) -cglobal h264_idct_add_8, 3, 3, 0 - IDCT4_ADD r0, r1, r2 - RET - -%macro IDCT8_1D 2 - mova m0, m1 - psraw m1, 1 - mova m4, m5 - psraw m4, 1 - paddw m4, m5 - paddw m1, m0 - paddw m4, m7 - paddw m1, m5 - psubw m4, m0 - paddw m1, m3 - - psubw m0, m3 - psubw m5, m3 - psraw m3, 1 - paddw m0, m7 - psubw m5, m7 - psraw m7, 1 - psubw m0, m3 - psubw m5, m7 - - mova m7, m1 - psraw m1, 2 - mova m3, m4 - psraw m3, 2 - paddw m3, m0 - psraw m0, 2 - paddw m1, m5 - psraw m5, 2 - psubw m0, m4 - psubw m7, m5 - - mova m5, m6 - psraw m6, 1 - mova m4, m2 - psraw m4, 1 - paddw m6, m2 - psubw m4, m5 - - mova m2, %1 - mova m5, %2 - SUMSUB_BA w, 5, 2 - SUMSUB_BA w, 6, 5 - SUMSUB_BA w, 4, 2 - SUMSUB_BA w, 7, 6 - SUMSUB_BA w, 0, 4 - SUMSUB_BA w, 3, 2 - SUMSUB_BA w, 1, 5 - SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567 -%endmacro - -%macro IDCT8_1D_FULL 1 - mova m7, [%1+112] - mova m6, [%1+ 96] - mova m5, [%1+ 80] - mova m3, [%1+ 48] - mova m2, [%1+ 32] - mova m1, [%1+ 16] - IDCT8_1D [%1], [%1+ 64] -%endmacro - -; %1=int16_t *block, %2=int16_t *dstblock -%macro IDCT8_ADD_MMX_START 2 - IDCT8_1D_FULL %1 - mova [%1], m7 - TRANSPOSE4x4W 0, 1, 2, 3, 7 - mova m7, [%1] - mova [%2 ], m0 - mova [%2+16], m1 - mova [%2+32], m2 - mova [%2+48], m3 - TRANSPOSE4x4W 4, 5, 6, 7, 3 - mova [%2+ 8], m4 - mova [%2+24], m5 - mova [%2+40], m6 - mova [%2+56], m7 -%endmacro - -; %1=uint8_t *dst, %2=int16_t *block, %3=int stride -%macro IDCT8_ADD_MMX_END 3-4 - IDCT8_1D_FULL %2 - mova [%2 ], m5 - mova [%2+16], m6 - mova [%2+32], m7 - - pxor m7, m7 -%if %0 == 4 - movq [%4+ 0], m7 - movq [%4+ 8], m7 - movq [%4+ 16], m7 - movq [%4+ 24], m7 - movq [%4+ 32], m7 - movq [%4+ 40], m7 - movq [%4+ 48], m7 - movq [%4+ 56], m7 - movq [%4+ 64], m7 - movq [%4+ 72], m7 - movq [%4+ 80], m7 - movq [%4+ 88], m7 - movq [%4+ 96], m7 - movq [%4+104], m7 - movq [%4+112], m7 - movq [%4+120], m7 -%endif - STORE_DIFFx2 m0, m1, 
m5, m6, m7, 6, %1, %3 - lea %1, [%1+%3*2] - STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3 - mova m0, [%2 ] - mova m1, [%2+16] - mova m2, [%2+32] - lea %1, [%1+%3*2] - STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3 - lea %1, [%1+%3*2] - STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3 -%endmacro - -INIT_MMX mmx -; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) -cglobal h264_idct8_add_8, 3, 4, 0 - %assign pad 128+4-(stack_offset&7) - SUB rsp, pad - - add word [r1], 32 - IDCT8_ADD_MMX_START r1 , rsp - IDCT8_ADD_MMX_START r1+8, rsp+64 - lea r3, [r0+4] - IDCT8_ADD_MMX_END r0 , rsp, r2, r1 - IDCT8_ADD_MMX_END r3 , rsp+8, r2 - - ADD rsp, pad - RET - -; %1=uint8_t *dst, %2=int16_t *block, %3=int stride -%macro IDCT8_ADD_SSE 4 - IDCT8_1D_FULL %2 -%if ARCH_X86_64 - TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 -%else - TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16] -%endif - paddw m0, [pw_32] - -%if ARCH_X86_64 == 0 - mova [%2 ], m0 - mova [%2+16], m4 - IDCT8_1D [%2], [%2+ 16] - mova [%2 ], m6 - mova [%2+16], m7 -%else - SWAP 0, 8 - SWAP 4, 9 - IDCT8_1D m8, m9 - SWAP 6, 8 - SWAP 7, 9 -%endif - - pxor m7, m7 - lea %4, [%3*3] - STORE_DIFF m0, m6, m7, [%1 ] - STORE_DIFF m1, m6, m7, [%1+%3 ] - STORE_DIFF m2, m6, m7, [%1+%3*2] - STORE_DIFF m3, m6, m7, [%1+%4 ] -%if ARCH_X86_64 == 0 - mova m0, [%2 ] - mova m1, [%2+16] -%else - SWAP 0, 8 - SWAP 1, 9 -%endif - mova [%2+ 0], m7 - mova [%2+ 16], m7 - mova [%2+ 32], m7 - mova [%2+ 48], m7 - mova [%2+ 64], m7 - mova [%2+ 80], m7 - mova [%2+ 96], m7 - mova [%2+112], m7 - lea %1, [%1+%3*4] - STORE_DIFF m4, m6, m7, [%1 ] - STORE_DIFF m5, m6, m7, [%1+%3 ] - STORE_DIFF m0, m6, m7, [%1+%3*2] - STORE_DIFF m1, m6, m7, [%1+%4 ] -%endmacro - -INIT_XMM sse2 -; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride) -cglobal h264_idct8_add_8, 3, 4, 10 - IDCT8_ADD_SSE r0, r1, r2, r3 - RET - -%macro DC_ADD_MMXEXT_INIT 2 - add %1, 32 - sar %1, 6 - movd m0, %1d - lea %1, [%2*3] - pshufw m0, m0, 0 - pxor m1, m1 - psubw m1, m0 - packuswb m0, m0 - packuswb m1, m1 -%endmacro - -%macro DC_ADD_MMXEXT_OP 4 - %1 m2, [%2 ] - %1 m3, [%2+%3 ] - %1 m4, [%2+%3*2] - %1 m5, [%2+%4 ] - paddusb m2, m0 - paddusb m3, m0 - paddusb m4, m0 - paddusb m5, m0 - psubusb m2, m1 - psubusb m3, m1 - psubusb m4, m1 - psubusb m5, m1 - %1 [%2 ], m2 - %1 [%2+%3 ], m3 - %1 [%2+%3*2], m4 - %1 [%2+%4 ], m5 -%endmacro - -INIT_MMX mmxext -; ff_h264_idct_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride) -%if ARCH_X86_64 -cglobal h264_idct_dc_add_8, 3, 4, 0 - movsx r3, word [r1] - mov word [r1], 0 - DC_ADD_MMXEXT_INIT r3, r2 - DC_ADD_MMXEXT_OP movh, r0, r2, r3 - RET - -; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride) -cglobal h264_idct8_dc_add_8, 3, 4, 0 - movsx r3, word [r1] - mov word [r1], 0 - DC_ADD_MMXEXT_INIT r3, r2 - DC_ADD_MMXEXT_OP mova, r0, r2, r3 - lea r0, [r0+r2*4] - DC_ADD_MMXEXT_OP mova, r0, r2, r3 - RET -%else -cglobal h264_idct_dc_add_8, 2, 3, 0 - movsx r2, word [r1] - mov word [r1], 0 - mov r1, r2m - DC_ADD_MMXEXT_INIT r2, r1 - DC_ADD_MMXEXT_OP movh, r0, r1, r2 - RET - -; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride) -cglobal h264_idct8_dc_add_8, 2, 3, 0 - movsx r2, word [r1] - mov word [r1], 0 - mov r1, r2m - DC_ADD_MMXEXT_INIT r2, r1 - DC_ADD_MMXEXT_OP mova, r0, r1, r2 - lea r0, [r0+r1*4] - DC_ADD_MMXEXT_OP mova, r0, r1, r2 - RET -%endif - -INIT_MMX mmx -; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, 
block_offset, block, stride, nnzc, cntr, coeff, picreg - xor r5, r5 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - test r6, r6 - jz .skipblock - mov r6d, dword [r1+r5*4] - lea r6, [r0+r6] - IDCT4_ADD r6, r2, r3 -.skipblock: - inc r5 - add r2, 32 - cmp r5, 16 - jl .nextblock - REP_RET - -; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg - %assign pad 128+4-(stack_offset&7) - SUB rsp, pad - - xor r5, r5 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - test r6, r6 - jz .skipblock - mov r6d, dword [r1+r5*4] - add r6, r0 - add word [r2], 32 - IDCT8_ADD_MMX_START r2 , rsp - IDCT8_ADD_MMX_START r2+8, rsp+64 - IDCT8_ADD_MMX_END r6 , rsp, r3, r2 - mov r6d, dword [r1+r5*4] - lea r6, [r0+r6+4] - IDCT8_ADD_MMX_END r6 , rsp+8, r3 -.skipblock: - add r5, 4 - add r2, 128 - cmp r5, 16 - jl .nextblock - ADD rsp, pad - RET - -INIT_MMX mmxext -; ff_h264_idct_add16_mmxext(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg - xor r5, r5 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - test r6, r6 - jz .skipblock - cmp r6, 1 - jnz .no_dc - movsx r6, word [r2] - test r6, r6 - jz .no_dc - mov word [r2], 0 - DC_ADD_MMXEXT_INIT r6, r3 -%if ARCH_X86_64 == 0 -%define dst2q r1 -%define dst2d r1d -%endif - mov dst2d, dword [r1+r5*4] - lea dst2q, [r0+dst2q] - DC_ADD_MMXEXT_OP movh, dst2q, r3, r6 -%if ARCH_X86_64 == 0 - mov r1, r1m -%endif - inc r5 - add r2, 32 - cmp r5, 16 - jl .nextblock - REP_RET -.no_dc: - mov r6d, dword [r1+r5*4] - add r6, r0 - IDCT4_ADD r6, r2, r3 -.skipblock: - inc r5 - add r2, 32 - cmp r5, 16 - jl .nextblock - REP_RET - -INIT_MMX mmx -; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg - xor r5, r5 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - or r6w, word [r2] - test r6, r6 - jz .skipblock - mov r6d, dword [r1+r5*4] - add r6, r0 - IDCT4_ADD r6, r2, r3 -.skipblock: - inc r5 - add r2, 32 - cmp r5, 16 - jl .nextblock - REP_RET - -INIT_MMX mmxext -; ff_h264_idct_add16intra_mmxext(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6*8]) -cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg - xor r5, r5 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - test r6, r6 - jz .try_dc - mov r6d, dword [r1+r5*4] - lea r6, [r0+r6] - IDCT4_ADD r6, r2, r3 - inc r5 - add r2, 32 - cmp r5, 16 - jl .nextblock - REP_RET -.try_dc: - movsx r6, word [r2] - test r6, r6 - jz .skipblock - mov word [r2], 0 - DC_ADD_MMXEXT_INIT r6, r3 -%if ARCH_X86_64 == 0 -%define dst2q r1 -%define dst2d r1d -%endif - mov dst2d, dword [r1+r5*4] - add dst2q, r0 - DC_ADD_MMXEXT_OP movh, dst2q, r3, r6 -%if ARCH_X86_64 == 0 - mov r1, r1m -%endif -.skipblock: - inc r5 - add r2, 32 - cmp r5, 16 - jl 
.nextblock - REP_RET - -; ff_h264_idct8_add4_mmxext(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6*8]) -cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg - %assign pad 128+4-(stack_offset&7) - SUB rsp, pad - - xor r5, r5 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - test r6, r6 - jz .skipblock - cmp r6, 1 - jnz .no_dc - movsx r6, word [r2] - test r6, r6 - jz .no_dc - mov word [r2], 0 - DC_ADD_MMXEXT_INIT r6, r3 -%if ARCH_X86_64 == 0 -%define dst2q r1 -%define dst2d r1d -%endif - mov dst2d, dword [r1+r5*4] - lea dst2q, [r0+dst2q] - DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 - lea dst2q, [dst2q+r3*4] - DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 -%if ARCH_X86_64 == 0 - mov r1, r1m -%endif - add r5, 4 - add r2, 128 - cmp r5, 16 - jl .nextblock - - ADD rsp, pad - RET -.no_dc: - mov r6d, dword [r1+r5*4] - add r6, r0 - add word [r2], 32 - IDCT8_ADD_MMX_START r2 , rsp - IDCT8_ADD_MMX_START r2+8, rsp+64 - IDCT8_ADD_MMX_END r6 , rsp, r3, r2 - mov r6d, dword [r1+r5*4] - lea r6, [r0+r6+4] - IDCT8_ADD_MMX_END r6 , rsp+8, r3 -.skipblock: - add r5, 4 - add r2, 128 - cmp r5, 16 - jl .nextblock - - ADD rsp, pad - RET - -INIT_XMM sse2 -; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg - xor r5, r5 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - test r6, r6 - jz .skipblock - cmp r6, 1 - jnz .no_dc - movsx r6, word [r2] - test r6, r6 - jz .no_dc -INIT_MMX cpuname - mov word [r2], 0 - DC_ADD_MMXEXT_INIT r6, r3 -%if ARCH_X86_64 == 0 -%define dst2q r1 -%define dst2d r1d -%endif - mov dst2d, dword [r1+r5*4] - add dst2q, r0 - DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 - lea dst2q, [dst2q+r3*4] - DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 -%if ARCH_X86_64 == 0 - mov r1, r1m -%endif - add r5, 4 - add r2, 128 - cmp r5, 16 - jl .nextblock - REP_RET -.no_dc: -INIT_XMM cpuname - mov dst2d, dword [r1+r5*4] - add dst2q, r0 - IDCT8_ADD_SSE dst2q, r2, r3, r6 -%if ARCH_X86_64 == 0 - mov r1, r1m -%endif -.skipblock: - add r5, 4 - add r2, 128 - cmp r5, 16 - jl .nextblock - REP_RET - -INIT_MMX mmx -h264_idct_add8_mmx_plane: -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - or r6w, word [r2] - test r6, r6 - jz .skipblock -%if ARCH_X86_64 - mov r0d, dword [r1+r5*4] - add r0, [dst2q] -%else - mov r0, r1m ; XXX r1m here is actually r0m of the calling func - mov r0, [r0] - add r0, dword [r1+r5*4] -%endif - IDCT4_ADD r0, r2, r3 -.skipblock: - inc r5 - add r2, 32 - test r5, 3 - jnz .nextblock - rep ret - -; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, -; int16_t *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg - mov r5, 16 - add r2, 512 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -%if ARCH_X86_64 - mov dst2q, r0 -%endif - call h264_idct_add8_mmx_plane - mov r5, 32 - add r2, 384 -%if ARCH_X86_64 - add dst2q, gprsize -%else - add r0mp, gprsize -%endif - call h264_idct_add8_mmx_plane - RET - -h264_idct_add8_mmxext_plane: -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - test r6, r6 - jz .try_dc -%if ARCH_X86_64 - mov r0d, dword [r1+r5*4] - add r0, [dst2q] -%else - mov r0, 
r1m ; XXX r1m here is actually r0m of the calling func - mov r0, [r0] - add r0, dword [r1+r5*4] -%endif - IDCT4_ADD r0, r2, r3 - inc r5 - add r2, 32 - test r5, 3 - jnz .nextblock - rep ret -.try_dc: - movsx r6, word [r2] - test r6, r6 - jz .skipblock - mov word [r2], 0 - DC_ADD_MMXEXT_INIT r6, r3 -%if ARCH_X86_64 - mov r0d, dword [r1+r5*4] - add r0, [dst2q] -%else - mov r0, r1m ; XXX r1m here is actually r0m of the calling func - mov r0, [r0] - add r0, dword [r1+r5*4] -%endif - DC_ADD_MMXEXT_OP movh, r0, r3, r6 -.skipblock: - inc r5 - add r2, 32 - test r5, 3 - jnz .nextblock - rep ret - -INIT_MMX mmxext -; ff_h264_idct_add8_mmxext(uint8_t **dest, const int *block_offset, -; int16_t *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg - mov r5, 16 - add r2, 512 -%if ARCH_X86_64 - mov dst2q, r0 -%endif -%ifdef PIC - lea picregq, [scan8_mem] -%endif - call h264_idct_add8_mmxext_plane - mov r5, 32 - add r2, 384 -%if ARCH_X86_64 - add dst2q, gprsize -%else - add r0mp, gprsize -%endif - call h264_idct_add8_mmxext_plane - RET - -; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered -h264_idct_dc_add8_mmxext: - movd m0, [r2 ] ; 0 0 X D - mov word [r2+ 0], 0 - punpcklwd m0, [r2+32] ; x X d D - mov word [r2+32], 0 - paddsw m0, [pw_32] - psraw m0, 6 - punpcklwd m0, m0 ; d d D D - pxor m1, m1 ; 0 0 0 0 - psubw m1, m0 ; -d-d-D-D - packuswb m0, m1 ; -d-d-D-D d d D D - pshufw m1, m0, 0xFA ; -d-d-d-d-D-D-D-D - punpcklwd m0, m0 ; d d d d D D D D - lea r6, [r3*3] - DC_ADD_MMXEXT_OP movq, r0, r3, r6 - ret - -ALIGN 16 -INIT_XMM sse2 -; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride -h264_add8x4_idct_sse2: - movq m0, [r2+ 0] - movq m1, [r2+ 8] - movq m2, [r2+16] - movq m3, [r2+24] - movhps m0, [r2+32] - movhps m1, [r2+40] - movhps m2, [r2+48] - movhps m3, [r2+56] - IDCT4_1D w,0,1,2,3,4,5 - TRANSPOSE2x4x4W 0,1,2,3,4 - paddw m0, [pw_32] - IDCT4_1D w,0,1,2,3,4,5 - pxor m7, m7 - mova [r2+ 0], m7 - mova [r2+16], m7 - mova [r2+32], m7 - mova [r2+48], m7 - STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3 - lea r0, [r0+r3*2] - STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3 - ret - -%macro add16_sse2_cycle 2 - movzx r0, word [r4+%2] - test r0, r0 - jz .cycle%1end - mov r0d, dword [r1+%1*8] -%if ARCH_X86_64 - add r0, r5 -%else - add r0, r0m -%endif - call h264_add8x4_idct_sse2 -.cycle%1end: -%if %1 < 7 - add r2, 64 -%endif -%endmacro - -; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8 -%if ARCH_X86_64 - mov r5, r0 -%endif - ; unrolling of the loop leads to an average performance gain of - ; 20-25% - add16_sse2_cycle 0, 0xc - add16_sse2_cycle 1, 0x14 - add16_sse2_cycle 2, 0xe - add16_sse2_cycle 3, 0x16 - add16_sse2_cycle 4, 0x1c - add16_sse2_cycle 5, 0x24 - add16_sse2_cycle 6, 0x1e - add16_sse2_cycle 7, 0x26 - RET - -%macro add16intra_sse2_cycle 2 - movzx r0, word [r4+%2] - test r0, r0 - jz .try%1dc - mov r0d, dword [r1+%1*8] -%if ARCH_X86_64 - add r0, r7 -%else - add r0, r0m -%endif - call h264_add8x4_idct_sse2 - jmp .cycle%1end -.try%1dc: - movsx r0, word [r2 ] - or r0w, word [r2+32] - jz .cycle%1end - mov r0d, dword [r1+%1*8] -%if ARCH_X86_64 - add r0, r7 -%else - add r0, r0m -%endif - call h264_idct_dc_add8_mmxext -.cycle%1end: -%if %1 < 7 - add r2, 64 -%endif -%endmacro - -; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, -; int16_t *block, int 
stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8 -%if ARCH_X86_64 - mov r7, r0 -%endif - add16intra_sse2_cycle 0, 0xc - add16intra_sse2_cycle 1, 0x14 - add16intra_sse2_cycle 2, 0xe - add16intra_sse2_cycle 3, 0x16 - add16intra_sse2_cycle 4, 0x1c - add16intra_sse2_cycle 5, 0x24 - add16intra_sse2_cycle 6, 0x1e - add16intra_sse2_cycle 7, 0x26 - RET - -%macro add8_sse2_cycle 2 - movzx r0, word [r4+%2] - test r0, r0 - jz .try%1dc -%if ARCH_X86_64 - mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))] - add r0, [r7] -%else - mov r0, r0m - mov r0, [r0] - add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))] -%endif - call h264_add8x4_idct_sse2 - jmp .cycle%1end -.try%1dc: - movsx r0, word [r2 ] - or r0w, word [r2+32] - jz .cycle%1end -%if ARCH_X86_64 - mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))] - add r0, [r7] -%else - mov r0, r0m - mov r0, [r0] - add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))] -%endif - call h264_idct_dc_add8_mmxext -.cycle%1end: -%if %1 == 1 - add r2, 384+64 -%elif %1 < 3 - add r2, 64 -%endif -%endmacro - -; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, -; int16_t *block, int stride, const uint8_t nnzc[6*8]) -cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8 - add r2, 512 -%if ARCH_X86_64 - mov r7, r0 -%endif - add8_sse2_cycle 0, 0x34 - add8_sse2_cycle 1, 0x3c -%if ARCH_X86_64 - add r7, gprsize -%else - add r0mp, gprsize -%endif - add8_sse2_cycle 2, 0x5c - add8_sse2_cycle 3, 0x64 - RET - -;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul) - -%macro WALSH4_1D 5 - SUMSUB_BADC w, %4, %3, %2, %1, %5 - SUMSUB_BADC w, %4, %2, %3, %1, %5 - SWAP %1, %4, %3 -%endmacro - -%macro DEQUANT_MMX 3 - mova m7, [pw_1] - mova m4, %1 - punpcklwd %1, m7 - punpckhwd m4, m7 - mova m5, %2 - punpcklwd %2, m7 - punpckhwd m5, m7 - movd m7, t3d - punpckldq m7, m7 - pmaddwd %1, m7 - pmaddwd %2, m7 - pmaddwd m4, m7 - pmaddwd m5, m7 - psrad %1, %3 - psrad %2, %3 - psrad m4, %3 - psrad m5, %3 - packssdw %1, m4 - packssdw %2, m5 -%endmacro - -%macro STORE_WORDS 5-9 -%if cpuflag(sse) - movd t0d, %1 - psrldq %1, 4 - movd t1d, %1 - psrldq %1, 4 - mov [t2+%2*32], t0w - mov [t2+%4*32], t1w - shr t0d, 16 - shr t1d, 16 - mov [t2+%3*32], t0w - mov [t2+%5*32], t1w - movd t0d, %1 - psrldq %1, 4 - movd t1d, %1 - mov [t2+%6*32], t0w - mov [t2+%8*32], t1w - shr t0d, 16 - shr t1d, 16 - mov [t2+%7*32], t0w - mov [t2+%9*32], t1w -%else - movd t0d, %1 - psrlq %1, 32 - movd t1d, %1 - mov [t2+%2*32], t0w - mov [t2+%4*32], t1w - shr t0d, 16 - shr t1d, 16 - mov [t2+%3*32], t0w - mov [t2+%5*32], t1w -%endif -%endmacro - -%macro DEQUANT_STORE 1 -%if cpuflag(sse2) - movd xmm4, t3d - movq xmm5, [pw_1] - pshufd xmm4, xmm4, 0 - movq2dq xmm0, m0 - movq2dq xmm1, m1 - movq2dq xmm2, m2 - movq2dq xmm3, m3 - punpcklwd xmm0, xmm5 - punpcklwd xmm1, xmm5 - punpcklwd xmm2, xmm5 - punpcklwd xmm3, xmm5 - pmaddwd xmm0, xmm4 - pmaddwd xmm1, xmm4 - pmaddwd xmm2, xmm4 - pmaddwd xmm3, xmm4 - psrad xmm0, %1 - psrad xmm1, %1 - psrad xmm2, %1 - psrad xmm3, %1 - packssdw xmm0, xmm1 - packssdw xmm2, xmm3 - STORE_WORDS xmm0, 0, 1, 4, 5, 2, 3, 6, 7 - STORE_WORDS xmm2, 8, 9, 12, 13, 10, 11, 14, 15 -%else - DEQUANT_MMX m0, m1, %1 - STORE_WORDS m0, 0, 1, 4, 5 - STORE_WORDS m1, 2, 3, 6, 7 - - DEQUANT_MMX m2, m3, %1 - STORE_WORDS m2, 8, 9, 12, 13 - STORE_WORDS m3, 10, 11, 14, 15 -%endif -%endmacro - -%macro IDCT_DC_DEQUANT 1 -cglobal h264_luma_dc_dequant_idct, 3, 4, %1 - ; manually spill XMM registers for Win64 because - ; the code here is initialized with INIT_MMX - WIN64_SPILL_XMM %1 - movq m3, [r1+24] - 
movq m2, [r1+16] - movq m1, [r1+ 8] - movq m0, [r1+ 0] - WALSH4_1D 0,1,2,3,4 - TRANSPOSE4x4W 0,1,2,3,4 - WALSH4_1D 0,1,2,3,4 - -; shift, tmp, output, qmul -%if WIN64 - DECLARE_REG_TMP 0,3,1,2 - ; we can't avoid this, because r0 is the shift register (ecx) on win64 - xchg r0, t2 -%elif ARCH_X86_64 - DECLARE_REG_TMP 3,1,0,2 -%else - DECLARE_REG_TMP 1,3,0,2 -%endif - - cmp t3d, 32767 - jg .big_qmul - add t3d, 128 << 16 - DEQUANT_STORE 8 - RET -.big_qmul: - bsr t0d, t3d - add t3d, 128 << 16 - mov t1d, 7 - cmp t0d, t1d - cmovg t0d, t1d - inc t1d - shr t3d, t0b - sub t1d, t0d -%if cpuflag(sse2) - movd xmm6, t1d - DEQUANT_STORE xmm6 -%else - movd m6, t1d - DEQUANT_STORE m6 -%endif - RET -%endmacro - -INIT_MMX mmx -IDCT_DC_DEQUANT 0 -INIT_MMX sse2 -IDCT_DC_DEQUANT 7 diff --git a/ffmpeg1/libavcodec/x86/h264_idct_10bit.asm b/ffmpeg1/libavcodec/x86/h264_idct_10bit.asm deleted file mode 100644 index 88fdb84..0000000 --- a/ffmpeg1/libavcodec/x86/h264_idct_10bit.asm +++ /dev/null @@ -1,589 +0,0 @@ -;***************************************************************************** -;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code -;***************************************************************************** -;* Copyright (C) 2005-2011 x264 project -;* -;* Authors: Daniel Kang <daniel.d.kang@gmail.com> -;* -;* This file is part of Libav. -;* -;* Libav is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -pw_pixel_max: times 8 dw ((1 << 10)-1) -pd_32: times 4 dd 32 - -SECTION .text - -;----------------------------------------------------------------------------- -; void h264_idct_add(pixel *dst, dctcoef *block, int stride) -;----------------------------------------------------------------------------- -%macro STORE_DIFFx2 6 - psrad %1, 6 - psrad %2, 6 - packssdw %1, %2 - movq %3, [%5] - movhps %3, [%5+%6] - paddsw %1, %3 - CLIPW %1, %4, [pw_pixel_max] - movq [%5], %1 - movhps [%5+%6], %1 -%endmacro - -%macro STORE_DIFF16 5 - psrad %1, 6 - psrad %2, 6 - packssdw %1, %2 - paddsw %1, [%5] - CLIPW %1, %3, %4 - mova [%5], %1 -%endmacro - -;dst, in, stride -%macro IDCT4_ADD_10 3 - mova m0, [%2+ 0] - mova m1, [%2+16] - mova m2, [%2+32] - mova m3, [%2+48] - IDCT4_1D d,0,1,2,3,4,5 - TRANSPOSE4x4D 0,1,2,3,4 - paddd m0, [pd_32] - IDCT4_1D d,0,1,2,3,4,5 - pxor m5, m5 - mova [%2+ 0], m5 - mova [%2+16], m5 - mova [%2+32], m5 - mova [%2+48], m5 - STORE_DIFFx2 m0, m1, m4, m5, %1, %3 - lea %1, [%1+%3*2] - STORE_DIFFx2 m2, m3, m4, m5, %1, %3 -%endmacro - -%macro IDCT_ADD_10 0 -cglobal h264_idct_add_10, 3,3 - IDCT4_ADD_10 r0, r1, r2 - RET -%endmacro - -INIT_XMM sse2 -IDCT_ADD_10 -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -IDCT_ADD_10 -%endif - -;----------------------------------------------------------------------------- -; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) -;----------------------------------------------------------------------------- -;;;;;;; NO FATE SAMPLES TRIGGER THIS -%macro ADD4x4IDCT 0 -add4x4_idct %+ SUFFIX: - add r5, r0 - mova m0, [r2+ 0] - mova m1, [r2+16] - mova m2, [r2+32] - mova m3, [r2+48] - IDCT4_1D d,0,1,2,3,4,5 - TRANSPOSE4x4D 0,1,2,3,4 - paddd m0, [pd_32] - IDCT4_1D d,0,1,2,3,4,5 - pxor m5, m5 - mova [r2+ 0], m5 - mova [r2+16], m5 - mova [r2+32], m5 - mova [r2+48], m5 - STORE_DIFFx2 m0, m1, m4, m5, r5, r3 - lea r5, [r5+r3*2] - STORE_DIFFx2 m2, m3, m4, m5, r5, r3 - ret -%endmacro - -INIT_XMM sse2 -ALIGN 16 -ADD4x4IDCT -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -ALIGN 16 -ADD4x4IDCT -%endif - -%macro ADD16_OP 2 - cmp byte [r4+%2], 0 - jz .skipblock%1 - mov r5d, [r1+%1*4] - call add4x4_idct %+ SUFFIX -.skipblock%1: -%if %1<15 - add r2, 64 -%endif -%endmacro - -%macro IDCT_ADD16_10 0 -cglobal h264_idct_add16_10, 5,6 - ADD16_OP 0, 4+1*8 - ADD16_OP 1, 5+1*8 - ADD16_OP 2, 4+2*8 - ADD16_OP 3, 5+2*8 - ADD16_OP 4, 6+1*8 - ADD16_OP 5, 7+1*8 - ADD16_OP 6, 6+2*8 - ADD16_OP 7, 7+2*8 - ADD16_OP 8, 4+3*8 - ADD16_OP 9, 5+3*8 - ADD16_OP 10, 4+4*8 - ADD16_OP 11, 5+4*8 - ADD16_OP 12, 6+3*8 - ADD16_OP 13, 7+3*8 - ADD16_OP 14, 6+4*8 - ADD16_OP 15, 7+4*8 - REP_RET -%endmacro - -INIT_XMM sse2 -IDCT_ADD16_10 -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -IDCT_ADD16_10 -%endif - -;----------------------------------------------------------------------------- -; void h264_idct_dc_add(pixel *dst, dctcoef *block, int stride) -;----------------------------------------------------------------------------- -%macro IDCT_DC_ADD_OP_10 3 - pxor m5, m5 -%if avx_enabled - paddw m1, m0, [%1+0 ] - paddw m2, m0, [%1+%2 ] - paddw m3, m0, [%1+%2*2] - paddw m4, m0, [%1+%3 ] -%else - mova m1, [%1+0 ] - mova m2, [%1+%2 ] - mova m3, 
[%1+%2*2] - mova m4, [%1+%3 ] - paddw m1, m0 - paddw m2, m0 - paddw m3, m0 - paddw m4, m0 -%endif - CLIPW m1, m5, m6 - CLIPW m2, m5, m6 - CLIPW m3, m5, m6 - CLIPW m4, m5, m6 - mova [%1+0 ], m1 - mova [%1+%2 ], m2 - mova [%1+%2*2], m3 - mova [%1+%3 ], m4 -%endmacro - -INIT_MMX mmxext -cglobal h264_idct_dc_add_10,3,3 - movd m0, [r1] - mov dword [r1], 0 - paddd m0, [pd_32] - psrad m0, 6 - lea r1, [r2*3] - pshufw m0, m0, 0 - mova m6, [pw_pixel_max] - IDCT_DC_ADD_OP_10 r0, r2, r1 - RET - -;----------------------------------------------------------------------------- -; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride) -;----------------------------------------------------------------------------- -%macro IDCT8_DC_ADD 0 -cglobal h264_idct8_dc_add_10,3,4,7 - movd m0, [r1] - mov dword[r1], 0 - paddd m0, [pd_32] - psrad m0, 6 - lea r1, [r2*3] - SPLATW m0, m0, 0 - mova m6, [pw_pixel_max] - IDCT_DC_ADD_OP_10 r0, r2, r1 - lea r0, [r0+r2*4] - IDCT_DC_ADD_OP_10 r0, r2, r1 - RET -%endmacro - -INIT_XMM sse2 -IDCT8_DC_ADD -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -IDCT8_DC_ADD -%endif - -;----------------------------------------------------------------------------- -; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) -;----------------------------------------------------------------------------- -%macro AC 1 -.ac%1: - mov r5d, [r1+(%1+0)*4] - call add4x4_idct %+ SUFFIX - mov r5d, [r1+(%1+1)*4] - add r2, 64 - call add4x4_idct %+ SUFFIX - add r2, 64 - jmp .skipadd%1 -%endmacro - -%assign last_block 16 -%macro ADD16_OP_INTRA 2 - cmp word [r4+%2], 0 - jnz .ac%1 - mov r5d, [r2+ 0] - or r5d, [r2+64] - jz .skipblock%1 - mov r5d, [r1+(%1+0)*4] - call idct_dc_add %+ SUFFIX -.skipblock%1: -%if %1<last_block-2 - add r2, 128 -%endif -.skipadd%1: -%endmacro - -%macro IDCT_ADD16INTRA_10 0 -idct_dc_add %+ SUFFIX: - add r5, r0 - movq m0, [r2+ 0] - movhps m0, [r2+64] - mov dword [r2+ 0], 0 - mov dword [r2+64], 0 - paddd m0, [pd_32] - psrad m0, 6 - pshufhw m0, m0, 0 - pshuflw m0, m0, 0 - lea r6, [r3*3] - mova m6, [pw_pixel_max] - IDCT_DC_ADD_OP_10 r5, r3, r6 - ret - -cglobal h264_idct_add16intra_10,5,7,8 - ADD16_OP_INTRA 0, 4+1*8 - ADD16_OP_INTRA 2, 4+2*8 - ADD16_OP_INTRA 4, 6+1*8 - ADD16_OP_INTRA 6, 6+2*8 - ADD16_OP_INTRA 8, 4+3*8 - ADD16_OP_INTRA 10, 4+4*8 - ADD16_OP_INTRA 12, 6+3*8 - ADD16_OP_INTRA 14, 6+4*8 - REP_RET - AC 8 - AC 10 - AC 12 - AC 14 - AC 0 - AC 2 - AC 4 - AC 6 -%endmacro - -INIT_XMM sse2 -IDCT_ADD16INTRA_10 -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -IDCT_ADD16INTRA_10 -%endif - -%assign last_block 36 -;----------------------------------------------------------------------------- -; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) -;----------------------------------------------------------------------------- -%macro IDCT_ADD8 0 -cglobal h264_idct_add8_10,5,8,7 -%if ARCH_X86_64 - mov r7, r0 -%endif - add r2, 1024 - mov r0, [r0] - ADD16_OP_INTRA 16, 4+ 6*8 - ADD16_OP_INTRA 18, 4+ 7*8 - add r2, 1024-128*2 -%if ARCH_X86_64 - mov r0, [r7+gprsize] -%else - mov r0, r0m - mov r0, [r0+gprsize] -%endif - ADD16_OP_INTRA 32, 4+11*8 - ADD16_OP_INTRA 34, 4+12*8 - REP_RET - AC 16 - AC 18 - AC 32 - AC 34 - -%endmacro ; IDCT_ADD8 - -INIT_XMM sse2 -IDCT_ADD8 -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -IDCT_ADD8 -%endif - -;----------------------------------------------------------------------------- -; void h264_idct8_add(pixel *dst, dctcoef *block, int stride) 
-;----------------------------------------------------------------------------- -%macro IDCT8_1D 2 - SWAP 0, 1 - psrad m4, m5, 1 - psrad m1, m0, 1 - paddd m4, m5 - paddd m1, m0 - paddd m4, m7 - paddd m1, m5 - psubd m4, m0 - paddd m1, m3 - - psubd m0, m3 - psubd m5, m3 - paddd m0, m7 - psubd m5, m7 - psrad m3, 1 - psrad m7, 1 - psubd m0, m3 - psubd m5, m7 - - SWAP 1, 7 - psrad m1, m7, 2 - psrad m3, m4, 2 - paddd m3, m0 - psrad m0, 2 - paddd m1, m5 - psrad m5, 2 - psubd m0, m4 - psubd m7, m5 - - SWAP 5, 6 - psrad m4, m2, 1 - psrad m6, m5, 1 - psubd m4, m5 - paddd m6, m2 - - mova m2, %1 - mova m5, %2 - SUMSUB_BA d, 5, 2 - SUMSUB_BA d, 6, 5 - SUMSUB_BA d, 4, 2 - SUMSUB_BA d, 7, 6 - SUMSUB_BA d, 0, 4 - SUMSUB_BA d, 3, 2 - SUMSUB_BA d, 1, 5 - SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567 -%endmacro - -%macro IDCT8_1D_FULL 1 - mova m7, [%1+112*2] - mova m6, [%1+ 96*2] - mova m5, [%1+ 80*2] - mova m3, [%1+ 48*2] - mova m2, [%1+ 32*2] - mova m1, [%1+ 16*2] - IDCT8_1D [%1], [%1+ 64*2] -%endmacro - -; %1=int16_t *block, %2=int16_t *dstblock -%macro IDCT8_ADD_SSE_START 2 - IDCT8_1D_FULL %1 -%if ARCH_X86_64 - TRANSPOSE4x4D 0,1,2,3,8 - mova [%2 ], m0 - TRANSPOSE4x4D 4,5,6,7,8 - mova [%2+8*2], m4 -%else - mova [%1], m7 - TRANSPOSE4x4D 0,1,2,3,7 - mova m7, [%1] - mova [%2 ], m0 - mova [%2+16*2], m1 - mova [%2+32*2], m2 - mova [%2+48*2], m3 - TRANSPOSE4x4D 4,5,6,7,3 - mova [%2+ 8*2], m4 - mova [%2+24*2], m5 - mova [%2+40*2], m6 - mova [%2+56*2], m7 -%endif -%endmacro - -; %1=uint8_t *dst, %2=int16_t *block, %3=int stride -%macro IDCT8_ADD_SSE_END 3 - IDCT8_1D_FULL %2 - mova [%2 ], m6 - mova [%2+16*2], m7 - - pxor m7, m7 - STORE_DIFFx2 m0, m1, m6, m7, %1, %3 - lea %1, [%1+%3*2] - STORE_DIFFx2 m2, m3, m6, m7, %1, %3 - mova m0, [%2 ] - mova m1, [%2+16*2] - lea %1, [%1+%3*2] - STORE_DIFFx2 m4, m5, m6, m7, %1, %3 - lea %1, [%1+%3*2] - STORE_DIFFx2 m0, m1, m6, m7, %1, %3 -%endmacro - -%macro IDCT8_ADD 0 -cglobal h264_idct8_add_10, 3,4,16 -%if UNIX64 == 0 - %assign pad 16-gprsize-(stack_offset&15) - sub rsp, pad - call h264_idct8_add1_10 %+ SUFFIX - add rsp, pad - RET -%endif - -ALIGN 16 -; TODO: does not need to use stack -h264_idct8_add1_10 %+ SUFFIX: -%assign pad 256+16-gprsize - sub rsp, pad - add dword [r1], 32 - -%if ARCH_X86_64 - IDCT8_ADD_SSE_START r1, rsp - SWAP 1, 9 - SWAP 2, 10 - SWAP 3, 11 - SWAP 5, 13 - SWAP 6, 14 - SWAP 7, 15 - IDCT8_ADD_SSE_START r1+16, rsp+128 - PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7 - IDCT8_1D [rsp], [rsp+128] - SWAP 0, 8 - SWAP 1, 9 - SWAP 2, 10 - SWAP 3, 11 - SWAP 4, 12 - SWAP 5, 13 - SWAP 6, 14 - SWAP 7, 15 - IDCT8_1D [rsp+16], [rsp+144] - psrad m8, 6 - psrad m0, 6 - packssdw m8, m0 - paddsw m8, [r0] - pxor m0, m0 - mova [r1+ 0], m0 - mova [r1+ 16], m0 - mova [r1+ 32], m0 - mova [r1+ 48], m0 - mova [r1+ 64], m0 - mova [r1+ 80], m0 - mova [r1+ 96], m0 - mova [r1+112], m0 - mova [r1+128], m0 - mova [r1+144], m0 - mova [r1+160], m0 - mova [r1+176], m0 - mova [r1+192], m0 - mova [r1+208], m0 - mova [r1+224], m0 - mova [r1+240], m0 - CLIPW m8, m0, [pw_pixel_max] - mova [r0], m8 - mova m8, [pw_pixel_max] - STORE_DIFF16 m9, m1, m0, m8, r0+r2 - lea r0, [r0+r2*2] - STORE_DIFF16 m10, m2, m0, m8, r0 - STORE_DIFF16 m11, m3, m0, m8, r0+r2 - lea r0, [r0+r2*2] - STORE_DIFF16 m12, m4, m0, m8, r0 - STORE_DIFF16 m13, m5, m0, m8, r0+r2 - lea r0, [r0+r2*2] - STORE_DIFF16 m14, m6, m0, m8, r0 - STORE_DIFF16 m15, m7, m0, m8, r0+r2 -%else - IDCT8_ADD_SSE_START r1, rsp - IDCT8_ADD_SSE_START r1+16, rsp+128 - lea r3, [r0+8] - IDCT8_ADD_SSE_END r0, rsp, r2 - 
IDCT8_ADD_SSE_END r3, rsp+16, r2 - mova [r1+ 0], m7 - mova [r1+ 16], m7 - mova [r1+ 32], m7 - mova [r1+ 48], m7 - mova [r1+ 64], m7 - mova [r1+ 80], m7 - mova [r1+ 96], m7 - mova [r1+112], m7 - mova [r1+128], m7 - mova [r1+144], m7 - mova [r1+160], m7 - mova [r1+176], m7 - mova [r1+192], m7 - mova [r1+208], m7 - mova [r1+224], m7 - mova [r1+240], m7 -%endif ; ARCH_X86_64 - - add rsp, pad - ret -%endmacro - -INIT_XMM sse2 -IDCT8_ADD -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -IDCT8_ADD -%endif - -;----------------------------------------------------------------------------- -; h264_idct8_add4(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) -;----------------------------------------------------------------------------- -;;;;;;; NO FATE SAMPLES TRIGGER THIS -%macro IDCT8_ADD4_OP 2 - cmp byte [r4+%2], 0 - jz .skipblock%1 - mov r0d, [r6+%1*4] - add r0, r5 - call h264_idct8_add1_10 %+ SUFFIX -.skipblock%1: -%if %1<12 - add r1, 256 -%endif -%endmacro - -%macro IDCT8_ADD4 0 -cglobal h264_idct8_add4_10, 0,7,16 - %assign pad 16-gprsize-(stack_offset&15) - SUB rsp, pad - mov r5, r0mp - mov r6, r1mp - mov r1, r2mp - mov r2d, r3m - movifnidn r4, r4mp - IDCT8_ADD4_OP 0, 4+1*8 - IDCT8_ADD4_OP 4, 6+1*8 - IDCT8_ADD4_OP 8, 4+3*8 - IDCT8_ADD4_OP 12, 6+3*8 - ADD rsp, pad - RET -%endmacro ; IDCT8_ADD4 - -INIT_XMM sse2 -IDCT8_ADD4 -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -IDCT8_ADD4 -%endif diff --git a/ffmpeg1/libavcodec/x86/h264_intrapred.asm b/ffmpeg1/libavcodec/x86/h264_intrapred.asm deleted file mode 100644 index 5c0dff4..0000000 --- a/ffmpeg1/libavcodec/x86/h264_intrapred.asm +++ /dev/null @@ -1,2702 +0,0 @@ -;****************************************************************************** -;* H.264 intra prediction asm optimizations -;* Copyright (c) 2010 Jason Garrett-Glaser -;* Copyright (c) 2010 Holger Lubitz -;* Copyright (c) 2010 Loren Merritt -;* Copyright (c) 2010 Ronald S. Bultje -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -tm_shuf: times 8 db 0x03, 0x80 -pw_ff00: times 8 dw 0xff00 -plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1 - db 1, 2, 3, 4, 5, 6, 7, 8 -plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0 - db 1, 2, 3, 4, 0, 0, 0, 0 -pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7 -pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8 -pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1 -pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4 - -SECTION .text - -cextern pb_1 -cextern pb_3 -cextern pw_4 -cextern pw_5 -cextern pw_8 -cextern pw_16 -cextern pw_17 -cextern pw_32 - -;----------------------------------------------------------------------------- -; void pred16x16_vertical_8(uint8_t *src, int stride) -;----------------------------------------------------------------------------- - -INIT_MMX mmx -cglobal pred16x16_vertical_8, 2,3 - sub r0, r1 - mov r2, 8 - movq mm0, [r0+0] - movq mm1, [r0+8] -.loop: - movq [r0+r1*1+0], mm0 - movq [r0+r1*1+8], mm1 - movq [r0+r1*2+0], mm0 - movq [r0+r1*2+8], mm1 - lea r0, [r0+r1*2] - dec r2 - jg .loop - REP_RET - -INIT_XMM sse -cglobal pred16x16_vertical_8, 2,3 - sub r0, r1 - mov r2, 4 - movaps xmm0, [r0] -.loop: - movaps [r0+r1*1], xmm0 - movaps [r0+r1*2], xmm0 - lea r0, [r0+r1*2] - movaps [r0+r1*1], xmm0 - movaps [r0+r1*2], xmm0 - lea r0, [r0+r1*2] - dec r2 - jg .loop - REP_RET - -;----------------------------------------------------------------------------- -; void pred16x16_horizontal_8(uint8_t *src, int stride) -;----------------------------------------------------------------------------- - -%macro PRED16x16_H 0 -cglobal pred16x16_horizontal_8, 2,3 - mov r2, 8 -%if cpuflag(ssse3) - mova m2, [pb_3] -%endif -.loop: - movd m0, [r0+r1*0-4] - movd m1, [r0+r1*1-4] - -%if cpuflag(ssse3) - pshufb m0, m2 - pshufb m1, m2 -%else - punpcklbw m0, m0 - punpcklbw m1, m1 - SPLATW m0, m0, 3 - SPLATW m1, m1, 3 - mova [r0+r1*0+8], m0 - mova [r0+r1*1+8], m1 -%endif - - mova [r0+r1*0], m0 - mova [r0+r1*1], m1 - lea r0, [r0+r1*2] - dec r2 - jg .loop - REP_RET -%endmacro - -INIT_MMX mmx -PRED16x16_H -INIT_MMX mmxext -PRED16x16_H -INIT_XMM ssse3 -PRED16x16_H - -;----------------------------------------------------------------------------- -; void pred16x16_dc_8(uint8_t *src, int stride) -;----------------------------------------------------------------------------- - -%macro PRED16x16_DC 0 -cglobal pred16x16_dc_8, 2,7 - mov r4, r0 - sub r0, r1 - pxor mm0, mm0 - pxor mm1, mm1 - psadbw mm0, [r0+0] - psadbw mm1, [r0+8] - dec r0 - movzx r5d, byte [r0+r1*1] - paddw mm0, mm1 - movd r6d, mm0 - lea r0, [r0+r1*2] -%rep 7 - movzx r2d, byte [r0+r1*0] - movzx r3d, byte [r0+r1*1] - add r5d, r2d - add r6d, r3d - lea r0, [r0+r1*2] -%endrep - movzx r2d, byte [r0+r1*0] - add r5d, r6d - lea r2d, [r2+r5+16] - shr r2d, 5 -%if cpuflag(ssse3) - pxor m1, m1 -%endif - SPLATB_REG m0, r2, m1 - -%if mmsize==8 - mov r3d, 8 -.loop: - mova [r4+r1*0+0], m0 - mova [r4+r1*0+8], m0 - mova [r4+r1*1+0], m0 - mova [r4+r1*1+8], m0 -%else - mov r3d, 4 -.loop: - mova [r4+r1*0], m0 - mova [r4+r1*1], m0 - lea r4, [r4+r1*2] - mova [r4+r1*0], m0 - mova [r4+r1*1], m0 -%endif - lea r4, [r4+r1*2] - dec r3d - jg .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PRED16x16_DC -INIT_XMM sse2 -PRED16x16_DC -INIT_XMM ssse3 -PRED16x16_DC - 
-;----------------------------------------------------------------------------- -; void pred16x16_tm_vp8_8(uint8_t *src, int stride) -;----------------------------------------------------------------------------- - -%macro PRED16x16_TM 0 -cglobal pred16x16_tm_vp8_8, 2,5 - sub r0, r1 - pxor mm7, mm7 - movq mm0, [r0+0] - movq mm2, [r0+8] - movq mm1, mm0 - movq mm3, mm2 - punpcklbw mm0, mm7 - punpckhbw mm1, mm7 - punpcklbw mm2, mm7 - punpckhbw mm3, mm7 - movzx r3d, byte [r0-1] - mov r4d, 16 -.loop: - movzx r2d, byte [r0+r1-1] - sub r2d, r3d - movd mm4, r2d - SPLATW mm4, mm4, 0 - movq mm5, mm4 - movq mm6, mm4 - movq mm7, mm4 - paddw mm4, mm0 - paddw mm5, mm1 - paddw mm6, mm2 - paddw mm7, mm3 - packuswb mm4, mm5 - packuswb mm6, mm7 - movq [r0+r1+0], mm4 - movq [r0+r1+8], mm6 - add r0, r1 - dec r4d - jg .loop - REP_RET -%endmacro - -INIT_MMX mmx -PRED16x16_TM -INIT_MMX mmxext -PRED16x16_TM - -INIT_XMM sse2 -cglobal pred16x16_tm_vp8_8, 2,6,6 - sub r0, r1 - pxor xmm2, xmm2 - movdqa xmm0, [r0] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 - punpckhbw xmm1, xmm2 - movzx r4d, byte [r0-1] - mov r5d, 8 -.loop: - movzx r2d, byte [r0+r1*1-1] - movzx r3d, byte [r0+r1*2-1] - sub r2d, r4d - sub r3d, r4d - movd xmm2, r2d - movd xmm4, r3d - pshuflw xmm2, xmm2, 0 - pshuflw xmm4, xmm4, 0 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm4, xmm4 - movdqa xmm3, xmm2 - movdqa xmm5, xmm4 - paddw xmm2, xmm0 - paddw xmm3, xmm1 - paddw xmm4, xmm0 - paddw xmm5, xmm1 - packuswb xmm2, xmm3 - packuswb xmm4, xmm5 - movdqa [r0+r1*1], xmm2 - movdqa [r0+r1*2], xmm4 - lea r0, [r0+r1*2] - dec r5d - jg .loop - REP_RET - -;----------------------------------------------------------------------------- -; void pred16x16_plane_*_8(uint8_t *src, int stride) -;----------------------------------------------------------------------------- - -%macro H264_PRED16x16_PLANE 1 -cglobal pred16x16_plane_%1_8, 2,9,7 - mov r2, r1 ; +stride - neg r1 ; -stride - - movh m0, [r0+r1 -1] -%if mmsize == 8 - pxor m4, m4 - movh m1, [r0+r1 +3 ] - movh m2, [r0+r1 +8 ] - movh m3, [r0+r1 +12] - punpcklbw m0, m4 - punpcklbw m1, m4 - punpcklbw m2, m4 - punpcklbw m3, m4 - pmullw m0, [pw_m8tom1 ] - pmullw m1, [pw_m8tom1+8] - pmullw m2, [pw_1to8 ] - pmullw m3, [pw_1to8 +8] - paddw m0, m2 - paddw m1, m3 -%else ; mmsize == 16 -%if cpuflag(ssse3) - movhps m0, [r0+r1 +8] - pmaddubsw m0, [plane_shuf] ; H coefficients -%else ; sse2 - pxor m2, m2 - movh m1, [r0+r1 +8] - punpcklbw m0, m2 - punpcklbw m1, m2 - pmullw m0, [pw_m8tom1] - pmullw m1, [pw_1to8] - paddw m0, m1 -%endif - movhlps m1, m0 -%endif - paddw m0, m1 -%if cpuflag(mmxext) - PSHUFLW m1, m0, 0xE -%elif cpuflag(mmx) - mova m1, m0 - psrlq m1, 32 -%endif - paddw m0, m1 -%if cpuflag(mmxext) - PSHUFLW m1, m0, 0x1 -%elif cpuflag(mmx) - mova m1, m0 - psrlq m1, 16 -%endif - paddw m0, m1 ; sum of H coefficients - - lea r4, [r0+r2*8-1] - lea r3, [r0+r2*4-1] - add r4, r2 - -%if ARCH_X86_64 -%define e_reg r8 -%else -%define e_reg r0 -%endif - - movzx e_reg, byte [r3+r2*2 ] - movzx r5, byte [r4+r1 ] - sub r5, e_reg - - movzx e_reg, byte [r3+r2 ] - movzx r6, byte [r4 ] - sub r6, e_reg - lea r5, [r5+r6*2] - - movzx e_reg, byte [r3+r1 ] - movzx r6, byte [r4+r2*2 ] - sub r6, e_reg - lea r5, [r5+r6*4] - - movzx e_reg, byte [r3 ] -%if ARCH_X86_64 - movzx r7, byte [r4+r2 ] - sub r7, e_reg -%else - movzx r6, byte [r4+r2 ] - sub r6, e_reg - lea r5, [r5+r6*4] - sub r5, r6 -%endif - - lea e_reg, [r3+r1*4] - lea r3, [r4+r2*4] - - movzx r4, byte [e_reg+r2 ] - movzx r6, byte [r3 ] - sub r6, r4 -%if ARCH_X86_64 - lea r6, [r7+r6*2] - lea r5, [r5+r6*2] 
- add r5, r6 -%else - lea r5, [r5+r6*4] - lea r5, [r5+r6*2] -%endif - - movzx r4, byte [e_reg ] -%if ARCH_X86_64 - movzx r7, byte [r3 +r2 ] - sub r7, r4 - sub r5, r7 -%else - movzx r6, byte [r3 +r2 ] - sub r6, r4 - lea r5, [r5+r6*8] - sub r5, r6 -%endif - - movzx r4, byte [e_reg+r1 ] - movzx r6, byte [r3 +r2*2] - sub r6, r4 -%if ARCH_X86_64 - add r6, r7 -%endif - lea r5, [r5+r6*8] - - movzx r4, byte [e_reg+r2*2] - movzx r6, byte [r3 +r1 ] - sub r6, r4 - lea r5, [r5+r6*4] - add r5, r6 ; sum of V coefficients - -%if ARCH_X86_64 == 0 - mov r0, r0m -%endif - -%ifidn %1, h264 - lea r5, [r5*5+32] - sar r5, 6 -%elifidn %1, rv40 - lea r5, [r5*5] - sar r5, 6 -%elifidn %1, svq3 - test r5, r5 - lea r6, [r5+3] - cmovs r5, r6 - sar r5, 2 ; V/4 - lea r5, [r5*5] ; 5*(V/4) - test r5, r5 - lea r6, [r5+15] - cmovs r5, r6 - sar r5, 4 ; (5*(V/4))/16 -%endif - - movzx r4, byte [r0+r1 +15] - movzx r3, byte [r3+r2*2 ] - lea r3, [r3+r4+1] - shl r3, 4 - - movd r1d, m0 - movsx r1d, r1w -%ifnidn %1, svq3 -%ifidn %1, h264 - lea r1d, [r1d*5+32] -%else ; rv40 - lea r1d, [r1d*5] -%endif - sar r1d, 6 -%else ; svq3 - test r1d, r1d - lea r4d, [r1d+3] - cmovs r1d, r4d - sar r1d, 2 ; H/4 - lea r1d, [r1d*5] ; 5*(H/4) - test r1d, r1d - lea r4d, [r1d+15] - cmovs r1d, r4d - sar r1d, 4 ; (5*(H/4))/16 -%endif - movd m0, r1d - - add r1d, r5d - add r3d, r1d - shl r1d, 3 - sub r3d, r1d ; a - - movd m1, r5d - movd m3, r3d - SPLATW m0, m0, 0 ; H - SPLATW m1, m1, 0 ; V - SPLATW m3, m3, 0 ; a -%ifidn %1, svq3 - SWAP 0, 1 -%endif - mova m2, m0 -%if mmsize == 8 - mova m5, m0 -%endif - pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) -%if mmsize == 16 - psllw m2, 3 -%else - psllw m5, 3 - psllw m2, 2 - mova m6, m5 - paddw m6, m2 -%endif - paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H - paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H -%if mmsize == 8 - paddw m5, m0 ; a + {8,9,10,11}*H - paddw m6, m0 ; a + {12,13,14,15}*H -%endif - - mov r4, 8 -.loop: - mova m3, m0 ; b[0..7] - mova m4, m2 ; b[8..15] - psraw m3, 5 - psraw m4, 5 - packuswb m3, m4 - mova [r0], m3 -%if mmsize == 8 - mova m3, m5 ; b[8..11] - mova m4, m6 ; b[12..15] - psraw m3, 5 - psraw m4, 5 - packuswb m3, m4 - mova [r0+8], m3 -%endif - paddw m0, m1 - paddw m2, m1 -%if mmsize == 8 - paddw m5, m1 - paddw m6, m1 -%endif - - mova m3, m0 ; b[0..7] - mova m4, m2 ; b[8..15] - psraw m3, 5 - psraw m4, 5 - packuswb m3, m4 - mova [r0+r2], m3 -%if mmsize == 8 - mova m3, m5 ; b[8..11] - mova m4, m6 ; b[12..15] - psraw m3, 5 - psraw m4, 5 - packuswb m3, m4 - mova [r0+r2+8], m3 -%endif - paddw m0, m1 - paddw m2, m1 -%if mmsize == 8 - paddw m5, m1 - paddw m6, m1 -%endif - - lea r0, [r0+r2*2] - dec r4 - jg .loop - REP_RET -%endmacro - -INIT_MMX mmx -H264_PRED16x16_PLANE h264 -H264_PRED16x16_PLANE rv40 -H264_PRED16x16_PLANE svq3 -INIT_MMX mmxext -H264_PRED16x16_PLANE h264 -H264_PRED16x16_PLANE rv40 -H264_PRED16x16_PLANE svq3 -INIT_XMM sse2 -H264_PRED16x16_PLANE h264 -H264_PRED16x16_PLANE rv40 -H264_PRED16x16_PLANE svq3 -INIT_XMM ssse3 -H264_PRED16x16_PLANE h264 -H264_PRED16x16_PLANE rv40 -H264_PRED16x16_PLANE svq3 - -;----------------------------------------------------------------------------- -; void pred8x8_plane_8(uint8_t *src, int stride) -;----------------------------------------------------------------------------- - -%macro H264_PRED8x8_PLANE 0 -cglobal pred8x8_plane_8, 2,9,7 - mov r2, r1 ; +stride - neg r1 ; -stride - - movd m0, [r0+r1 -1] -%if mmsize == 8 - pxor m2, m2 - movh m1, [r0+r1 +4 ] - punpcklbw m0, m2 - punpcklbw m1, m2 - pmullw m0, [pw_m4to4] - pmullw m1, [pw_m4to4+8] -%else ; mmsize == 
16 -%if cpuflag(ssse3) - movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary - pmaddubsw m0, [plane8_shuf] ; H coefficients -%else ; sse2 - pxor m2, m2 - movd m1, [r0+r1 +4] - punpckldq m0, m1 - punpcklbw m0, m2 - pmullw m0, [pw_m4to4] -%endif - movhlps m1, m0 -%endif - paddw m0, m1 - -%if notcpuflag(ssse3) -%if cpuflag(mmxext) - PSHUFLW m1, m0, 0xE -%elif cpuflag(mmx) - mova m1, m0 - psrlq m1, 32 -%endif - paddw m0, m1 -%endif ; !ssse3 - -%if cpuflag(mmxext) - PSHUFLW m1, m0, 0x1 -%elif cpuflag(mmx) - mova m1, m0 - psrlq m1, 16 -%endif - paddw m0, m1 ; sum of H coefficients - - lea r4, [r0+r2*4-1] - lea r3, [r0 -1] - add r4, r2 - -%if ARCH_X86_64 -%define e_reg r8 -%else -%define e_reg r0 -%endif - - movzx e_reg, byte [r3+r2*2 ] - movzx r5, byte [r4+r1 ] - sub r5, e_reg - - movzx e_reg, byte [r3 ] -%if ARCH_X86_64 - movzx r7, byte [r4+r2 ] - sub r7, e_reg - sub r5, r7 -%else - movzx r6, byte [r4+r2 ] - sub r6, e_reg - lea r5, [r5+r6*4] - sub r5, r6 -%endif - - movzx e_reg, byte [r3+r1 ] - movzx r6, byte [r4+r2*2 ] - sub r6, e_reg -%if ARCH_X86_64 - add r6, r7 -%endif - lea r5, [r5+r6*4] - - movzx e_reg, byte [r3+r2 ] - movzx r6, byte [r4 ] - sub r6, e_reg - lea r6, [r5+r6*2] - - lea r5, [r6*9+16] - lea r5, [r5+r6*8] - sar r5, 5 - -%if ARCH_X86_64 == 0 - mov r0, r0m -%endif - - movzx r3, byte [r4+r2*2 ] - movzx r4, byte [r0+r1 +7] - lea r3, [r3+r4+1] - shl r3, 4 - movd r1d, m0 - movsx r1d, r1w - imul r1d, 17 - add r1d, 16 - sar r1d, 5 - movd m0, r1d - add r1d, r5d - sub r3d, r1d - add r1d, r1d - sub r3d, r1d ; a - - movd m1, r5d - movd m3, r3d - SPLATW m0, m0, 0 ; H - SPLATW m1, m1, 0 ; V - SPLATW m3, m3, 0 ; a -%if mmsize == 8 - mova m2, m0 -%endif - pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) - paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H -%if mmsize == 8 - psllw m2, 2 - paddw m2, m0 ; a + {4,5,6,7}*H -%endif - - mov r4, 4 -ALIGN 16 -.loop: -%if mmsize == 16 - mova m3, m0 ; b[0..7] - paddw m0, m1 - psraw m3, 5 - mova m4, m0 ; V+b[0..7] - paddw m0, m1 - psraw m4, 5 - packuswb m3, m4 - movh [r0], m3 - movhps [r0+r2], m3 -%else ; mmsize == 8 - mova m3, m0 ; b[0..3] - mova m4, m2 ; b[4..7] - paddw m0, m1 - paddw m2, m1 - psraw m3, 5 - psraw m4, 5 - mova m5, m0 ; V+b[0..3] - mova m6, m2 ; V+b[4..7] - paddw m0, m1 - paddw m2, m1 - psraw m5, 5 - psraw m6, 5 - packuswb m3, m4 - packuswb m5, m6 - mova [r0], m3 - mova [r0+r2], m5 -%endif - - lea r0, [r0+r2*2] - dec r4 - jg .loop - REP_RET -%endmacro - -INIT_MMX mmx -H264_PRED8x8_PLANE -INIT_MMX mmxext -H264_PRED8x8_PLANE -INIT_XMM sse2 -H264_PRED8x8_PLANE -INIT_XMM ssse3 -H264_PRED8x8_PLANE - -;----------------------------------------------------------------------------- -; void pred8x8_vertical_8(uint8_t *src, int stride) -;----------------------------------------------------------------------------- - -INIT_MMX mmx -cglobal pred8x8_vertical_8, 2,2 - sub r0, r1 - movq mm0, [r0] -%rep 3 - movq [r0+r1*1], mm0 - movq [r0+r1*2], mm0 - lea r0, [r0+r1*2] -%endrep - movq [r0+r1*1], mm0 - movq [r0+r1*2], mm0 - RET - -;----------------------------------------------------------------------------- -; void pred8x8_horizontal_8(uint8_t *src, int stride) -;----------------------------------------------------------------------------- - -%macro PRED8x8_H 0 -cglobal pred8x8_horizontal_8, 2,3 - mov r2, 4 -%if cpuflag(ssse3) - mova m2, [pb_3] -%endif -.loop: - SPLATB_LOAD m0, r0+r1*0-1, m2 - SPLATB_LOAD m1, r0+r1*1-1, m2 - mova [r0+r1*0], m0 - mova [r0+r1*1], m1 - lea r0, [r0+r1*2] - dec r2 - jg .loop - REP_RET -%endmacro - -INIT_MMX mmx -PRED8x8_H 
-INIT_MMX mmxext -PRED8x8_H -INIT_MMX ssse3 -PRED8x8_H - -;----------------------------------------------------------------------------- -; void pred8x8_top_dc_8_mmxext(uint8_t *src, int stride) -;----------------------------------------------------------------------------- -INIT_MMX mmxext -cglobal pred8x8_top_dc_8, 2,5 - sub r0, r1 - movq mm0, [r0] - pxor mm1, mm1 - pxor mm2, mm2 - lea r2, [r0+r1*2] - punpckhbw mm1, mm0 - punpcklbw mm0, mm2 - psadbw mm1, mm2 ; s1 - lea r3, [r2+r1*2] - psadbw mm0, mm2 ; s0 - psrlw mm1, 1 - psrlw mm0, 1 - pavgw mm1, mm2 - lea r4, [r3+r1*2] - pavgw mm0, mm2 - pshufw mm1, mm1, 0 - pshufw mm0, mm0, 0 ; dc0 (w) - packuswb mm0, mm1 ; dc0,dc1 (b) - movq [r0+r1*1], mm0 - movq [r0+r1*2], mm0 - lea r0, [r3+r1*2] - movq [r2+r1*1], mm0 - movq [r2+r1*2], mm0 - movq [r3+r1*1], mm0 - movq [r3+r1*2], mm0 - movq [r0+r1*1], mm0 - movq [r0+r1*2], mm0 - RET - -;----------------------------------------------------------------------------- -; void pred8x8_dc_8_mmxext(uint8_t *src, int stride) -;----------------------------------------------------------------------------- - -INIT_MMX mmxext -cglobal pred8x8_dc_8, 2,5 - sub r0, r1 - pxor m7, m7 - movd m0, [r0+0] - movd m1, [r0+4] - psadbw m0, m7 ; s0 - mov r4, r0 - psadbw m1, m7 ; s1 - - movzx r2d, byte [r0+r1*1-1] - movzx r3d, byte [r0+r1*2-1] - lea r0, [r0+r1*2] - add r2d, r3d - movzx r3d, byte [r0+r1*1-1] - add r2d, r3d - movzx r3d, byte [r0+r1*2-1] - add r2d, r3d - lea r0, [r0+r1*2] - movd m2, r2d ; s2 - movzx r2d, byte [r0+r1*1-1] - movzx r3d, byte [r0+r1*2-1] - lea r0, [r0+r1*2] - add r2d, r3d - movzx r3d, byte [r0+r1*1-1] - add r2d, r3d - movzx r3d, byte [r0+r1*2-1] - add r2d, r3d - movd m3, r2d ; s3 - - punpcklwd m0, m1 - mov r0, r4 - punpcklwd m2, m3 - punpckldq m0, m2 ; s0, s1, s2, s3 - pshufw m3, m0, 11110110b ; s2, s1, s3, s3 - lea r2, [r0+r1*2] - pshufw m0, m0, 01110100b ; s0, s1, s3, s1 - paddw m0, m3 - lea r3, [r2+r1*2] - psrlw m0, 2 - pavgw m0, m7 ; s0+s2, s1, s3, s1+s3 - lea r4, [r3+r1*2] - packuswb m0, m0 - punpcklbw m0, m0 - movq m1, m0 - punpcklbw m0, m0 - punpckhbw m1, m1 - movq [r0+r1*1], m0 - movq [r0+r1*2], m0 - movq [r2+r1*1], m0 - movq [r2+r1*2], m0 - movq [r3+r1*1], m1 - movq [r3+r1*2], m1 - movq [r4+r1*1], m1 - movq [r4+r1*2], m1 - RET - -;----------------------------------------------------------------------------- -; void pred8x8_dc_rv40_8(uint8_t *src, int stride) -;----------------------------------------------------------------------------- - -INIT_MMX mmxext -cglobal pred8x8_dc_rv40_8, 2,7 - mov r4, r0 - sub r0, r1 - pxor mm0, mm0 - psadbw mm0, [r0] - dec r0 - movzx r5d, byte [r0+r1*1] - movd r6d, mm0 - lea r0, [r0+r1*2] -%rep 3 - movzx r2d, byte [r0+r1*0] - movzx r3d, byte [r0+r1*1] - add r5d, r2d - add r6d, r3d - lea r0, [r0+r1*2] -%endrep - movzx r2d, byte [r0+r1*0] - add r5d, r6d - lea r2d, [r2+r5+8] - shr r2d, 4 - movd mm0, r2d - punpcklbw mm0, mm0 - pshufw mm0, mm0, 0 - mov r3d, 4 -.loop: - movq [r4+r1*0], mm0 - movq [r4+r1*1], mm0 - lea r4, [r4+r1*2] - dec r3d - jg .loop - REP_RET - -;----------------------------------------------------------------------------- -; void pred8x8_tm_vp8_8(uint8_t *src, int stride) -;----------------------------------------------------------------------------- - -%macro PRED8x8_TM 0 -cglobal pred8x8_tm_vp8_8, 2,6 - sub r0, r1 - pxor mm7, mm7 - movq mm0, [r0] - movq mm1, mm0 - punpcklbw mm0, mm7 - punpckhbw mm1, mm7 - movzx r4d, byte [r0-1] - mov r5d, 4 -.loop: - movzx r2d, byte [r0+r1*1-1] - movzx r3d, byte [r0+r1*2-1] - sub r2d, r4d - sub r3d, r4d - movd 
mm2, r2d - movd mm4, r3d - SPLATW mm2, mm2, 0 - SPLATW mm4, mm4, 0 - movq mm3, mm2 - movq mm5, mm4 - paddw mm2, mm0 - paddw mm3, mm1 - paddw mm4, mm0 - paddw mm5, mm1 - packuswb mm2, mm3 - packuswb mm4, mm5 - movq [r0+r1*1], mm2 - movq [r0+r1*2], mm4 - lea r0, [r0+r1*2] - dec r5d - jg .loop - REP_RET -%endmacro - -INIT_MMX mmx -PRED8x8_TM -INIT_MMX mmxext -PRED8x8_TM - -INIT_XMM sse2 -cglobal pred8x8_tm_vp8_8, 2,6,4 - sub r0, r1 - pxor xmm1, xmm1 - movq xmm0, [r0] - punpcklbw xmm0, xmm1 - movzx r4d, byte [r0-1] - mov r5d, 4 -.loop: - movzx r2d, byte [r0+r1*1-1] - movzx r3d, byte [r0+r1*2-1] - sub r2d, r4d - sub r3d, r4d - movd xmm2, r2d - movd xmm3, r3d - pshuflw xmm2, xmm2, 0 - pshuflw xmm3, xmm3, 0 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - paddw xmm2, xmm0 - paddw xmm3, xmm0 - packuswb xmm2, xmm3 - movq [r0+r1*1], xmm2 - movhps [r0+r1*2], xmm2 - lea r0, [r0+r1*2] - dec r5d - jg .loop - REP_RET - -INIT_XMM ssse3 -cglobal pred8x8_tm_vp8_8, 2,3,6 - sub r0, r1 - movdqa xmm4, [tm_shuf] - pxor xmm1, xmm1 - movq xmm0, [r0] - punpcklbw xmm0, xmm1 - movd xmm5, [r0-4] - pshufb xmm5, xmm4 - mov r2d, 4 -.loop: - movd xmm2, [r0+r1*1-4] - movd xmm3, [r0+r1*2-4] - pshufb xmm2, xmm4 - pshufb xmm3, xmm4 - psubw xmm2, xmm5 - psubw xmm3, xmm5 - paddw xmm2, xmm0 - paddw xmm3, xmm0 - packuswb xmm2, xmm3 - movq [r0+r1*1], xmm2 - movhps [r0+r1*2], xmm2 - lea r0, [r0+r1*2] - dec r2d - jg .loop - REP_RET - -; dest, left, right, src, tmp -; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 -%macro PRED4x4_LOWPASS 5 - mova %5, %2 - pavgb %2, %3 - pxor %3, %5 - mova %1, %4 - pand %3, [pb_1] - psubusb %2, %3 - pavgb %1, %2 -%endmacro - -;----------------------------------------------------------------------------- -; void pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright, int stride) -;----------------------------------------------------------------------------- -%macro PRED8x8L_TOP_DC 0 -cglobal pred8x8l_top_dc_8, 4,4 - sub r0, r3 - pxor mm7, mm7 - movq mm0, [r0-8] - movq mm3, [r0] - movq mm1, [r0+8] - movq mm2, mm3 - movq mm4, mm3 - PALIGNR mm2, mm0, 7, mm0 - PALIGNR mm1, mm4, 1, mm4 - test r1, r1 ; top_left - jz .fix_lt_2 - test r2, r2 ; top_right - jz .fix_tr_1 - jmp .body -.fix_lt_2: - movq mm5, mm3 - pxor mm5, mm2 - psllq mm5, 56 - psrlq mm5, 56 - pxor mm2, mm5 - test r2, r2 ; top_right - jnz .body -.fix_tr_1: - movq mm5, mm3 - pxor mm5, mm1 - psrlq mm5, 56 - psllq mm5, 56 - pxor mm1, mm5 -.body: - PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 - psadbw mm7, mm0 - paddw mm7, [pw_4] - psrlw mm7, 3 - pshufw mm7, mm7, 0 - packuswb mm7, mm7 -%rep 3 - movq [r0+r3*1], mm7 - movq [r0+r3*2], mm7 - lea r0, [r0+r3*2] -%endrep - movq [r0+r3*1], mm7 - movq [r0+r3*2], mm7 - RET -%endmacro - -INIT_MMX mmxext -PRED8x8L_TOP_DC -INIT_MMX ssse3 -PRED8x8L_TOP_DC - -;----------------------------------------------------------------------------- -;void pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright, int stride) -;----------------------------------------------------------------------------- - -%macro PRED8x8L_DC 0 -cglobal pred8x8l_dc_8, 4,5 - sub r0, r3 - lea r4, [r0+r3*2] - movq mm0, [r0+r3*1-8] - punpckhbw mm0, [r0+r3*0-8] - movq mm1, [r4+r3*1-8] - punpckhbw mm1, [r0+r3*2-8] - mov r4, r0 - punpckhwd mm1, mm0 - lea r0, [r0+r3*4] - movq mm2, [r0+r3*1-8] - punpckhbw mm2, [r0+r3*0-8] - lea r0, [r0+r3*2] - movq mm3, [r0+r3*1-8] - punpckhbw mm3, [r0+r3*0-8] - punpckhwd mm3, mm2 - punpckhdq mm3, mm1 - lea r0, [r0+r3*2] - movq mm0, [r0+r3*0-8] - movq mm1, [r4] - mov r0, r4 - movq mm4, mm3 - movq mm2, mm3 - 
PALIGNR mm4, mm0, 7, mm0 - PALIGNR mm1, mm2, 1, mm2 - test r1, r1 - jnz .do_left -.fix_lt_1: - movq mm5, mm3 - pxor mm5, mm4 - psrlq mm5, 56 - psllq mm5, 48 - pxor mm1, mm5 - jmp .do_left -.fix_lt_2: - movq mm5, mm3 - pxor mm5, mm2 - psllq mm5, 56 - psrlq mm5, 56 - pxor mm2, mm5 - test r2, r2 - jnz .body -.fix_tr_1: - movq mm5, mm3 - pxor mm5, mm1 - psrlq mm5, 56 - psllq mm5, 56 - pxor mm1, mm5 - jmp .body -.do_left: - movq mm0, mm4 - PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 - movq mm4, mm0 - movq mm7, mm2 - PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 - psllq mm1, 56 - PALIGNR mm7, mm1, 7, mm3 - movq mm0, [r0-8] - movq mm3, [r0] - movq mm1, [r0+8] - movq mm2, mm3 - movq mm4, mm3 - PALIGNR mm2, mm0, 7, mm0 - PALIGNR mm1, mm4, 1, mm4 - test r1, r1 - jz .fix_lt_2 - test r2, r2 - jz .fix_tr_1 -.body: - lea r1, [r0+r3*2] - PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 - pxor mm0, mm0 - pxor mm1, mm1 - lea r2, [r1+r3*2] - psadbw mm0, mm7 - psadbw mm1, mm6 - paddw mm0, [pw_8] - paddw mm0, mm1 - lea r4, [r2+r3*2] - psrlw mm0, 4 - pshufw mm0, mm0, 0 - packuswb mm0, mm0 - movq [r0+r3*1], mm0 - movq [r0+r3*2], mm0 - movq [r1+r3*1], mm0 - movq [r1+r3*2], mm0 - movq [r2+r3*1], mm0 - movq [r2+r3*2], mm0 - movq [r4+r3*1], mm0 - movq [r4+r3*2], mm0 - RET -%endmacro - -INIT_MMX mmxext -PRED8x8L_DC -INIT_MMX ssse3 -PRED8x8L_DC - -;----------------------------------------------------------------------------- -; void pred8x8l_horizontal_8(uint8_t *src, int has_topleft, int has_topright, int stride) -;----------------------------------------------------------------------------- - -%macro PRED8x8L_HORIZONTAL 0 -cglobal pred8x8l_horizontal_8, 4,4 - sub r0, r3 - lea r2, [r0+r3*2] - movq mm0, [r0+r3*1-8] - test r1, r1 - lea r1, [r0+r3] - cmovnz r1, r0 - punpckhbw mm0, [r1+r3*0-8] - movq mm1, [r2+r3*1-8] - punpckhbw mm1, [r0+r3*2-8] - mov r2, r0 - punpckhwd mm1, mm0 - lea r0, [r0+r3*4] - movq mm2, [r0+r3*1-8] - punpckhbw mm2, [r0+r3*0-8] - lea r0, [r0+r3*2] - movq mm3, [r0+r3*1-8] - punpckhbw mm3, [r0+r3*0-8] - punpckhwd mm3, mm2 - punpckhdq mm3, mm1 - lea r0, [r0+r3*2] - movq mm0, [r0+r3*0-8] - movq mm1, [r1+r3*0-8] - mov r0, r2 - movq mm4, mm3 - movq mm2, mm3 - PALIGNR mm4, mm0, 7, mm0 - PALIGNR mm1, mm2, 1, mm2 - movq mm0, mm4 - PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 - movq mm4, mm0 - movq mm7, mm2 - PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 - psllq mm1, 56 - PALIGNR mm7, mm1, 7, mm3 - movq mm3, mm7 - lea r1, [r0+r3*2] - movq mm7, mm3 - punpckhbw mm3, mm3 - punpcklbw mm7, mm7 - pshufw mm0, mm3, 0xff - pshufw mm1, mm3, 0xaa - lea r2, [r1+r3*2] - pshufw mm2, mm3, 0x55 - pshufw mm3, mm3, 0x00 - pshufw mm4, mm7, 0xff - pshufw mm5, mm7, 0xaa - pshufw mm6, mm7, 0x55 - pshufw mm7, mm7, 0x00 - movq [r0+r3*1], mm0 - movq [r0+r3*2], mm1 - movq [r1+r3*1], mm2 - movq [r1+r3*2], mm3 - movq [r2+r3*1], mm4 - movq [r2+r3*2], mm5 - lea r0, [r2+r3*2] - movq [r0+r3*1], mm6 - movq [r0+r3*2], mm7 - RET -%endmacro - -INIT_MMX mmxext -PRED8x8L_HORIZONTAL -INIT_MMX ssse3 -PRED8x8L_HORIZONTAL - -;----------------------------------------------------------------------------- -; void pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright, int stride) -;----------------------------------------------------------------------------- - -%macro PRED8x8L_VERTICAL 0 -cglobal pred8x8l_vertical_8, 4,4 - sub r0, r3 - movq mm0, [r0-8] - movq mm3, [r0] - movq mm1, [r0+8] - movq mm2, mm3 - movq mm4, mm3 - PALIGNR mm2, mm0, 7, mm0 - PALIGNR mm1, mm4, 1, mm4 - test r1, r1 ; top_left - jz .fix_lt_2 - test r2, r2 ; top_right - jz .fix_tr_1 - jmp .body 
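
pred8x8l_dc_8 above gathers the left column with a punpckhbw/punpckhwd/punpckhdq ladder, low-pass filters both edges, and then sums them with psadbw against zero. In scalar terms, assuming the filtered edge samples are already available (top[] and left[] are stand-in names for what the asm keeps in registers):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Rough scalar model of pred8x8l_dc_8: average the (already filtered)
 * 8 top and 8 left neighbours and fill the 8x8 block with the result. */
static void pred8x8l_dc_ref(uint8_t *src, ptrdiff_t stride,
                            const uint8_t top[8], const uint8_t left[8])
{
    unsigned sum = 8;                     /* rounding term, matches pw_8 */
    for (int i = 0; i < 8; i++)
        sum += top[i] + left[i];
    uint8_t dc = (uint8_t)(sum >> 4);
    for (int y = 0; y < 8; y++)
        memset(src + y * stride, dc, 8);
}
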
-.fix_lt_2: - movq mm5, mm3 - pxor mm5, mm2 - psllq mm5, 56 - psrlq mm5, 56 - pxor mm2, mm5 - test r2, r2 ; top_right - jnz .body -.fix_tr_1: - movq mm5, mm3 - pxor mm5, mm1 - psrlq mm5, 56 - psllq mm5, 56 - pxor mm1, mm5 -.body: - PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 -%rep 3 - movq [r0+r3*1], mm0 - movq [r0+r3*2], mm0 - lea r0, [r0+r3*2] -%endrep - movq [r0+r3*1], mm0 - movq [r0+r3*2], mm0 - RET -%endmacro - -INIT_MMX mmxext -PRED8x8L_VERTICAL -INIT_MMX ssse3 -PRED8x8L_VERTICAL - -;----------------------------------------------------------------------------- -;void pred8x8l_down_left_8(uint8_t *src, int has_topleft, int has_topright, int stride) -;----------------------------------------------------------------------------- - -INIT_MMX mmxext -cglobal pred8x8l_down_left_8, 4,5 - sub r0, r3 - movq mm0, [r0-8] - movq mm3, [r0] - movq mm1, [r0+8] - movq mm2, mm3 - movq mm4, mm3 - PALIGNR mm2, mm0, 7, mm0 - PALIGNR mm1, mm4, 1, mm4 - test r1, r1 - jz .fix_lt_2 - test r2, r2 - jz .fix_tr_1 - jmp .do_top -.fix_lt_2: - movq mm5, mm3 - pxor mm5, mm2 - psllq mm5, 56 - psrlq mm5, 56 - pxor mm2, mm5 - test r2, r2 - jnz .do_top -.fix_tr_1: - movq mm5, mm3 - pxor mm5, mm1 - psrlq mm5, 56 - psllq mm5, 56 - pxor mm1, mm5 - jmp .do_top -.fix_tr_2: - punpckhbw mm3, mm3 - pshufw mm1, mm3, 0xFF - jmp .do_topright -.do_top: - PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 - movq mm7, mm4 - test r2, r2 - jz .fix_tr_2 - movq mm0, [r0+8] - movq mm5, mm0 - movq mm2, mm0 - movq mm4, mm0 - psrlq mm5, 56 - PALIGNR mm2, mm3, 7, mm3 - PALIGNR mm5, mm4, 1, mm4 - PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 -.do_topright: - lea r1, [r0+r3*2] - movq mm6, mm1 - psrlq mm1, 56 - movq mm4, mm1 - lea r2, [r1+r3*2] - movq mm2, mm6 - PALIGNR mm2, mm7, 1, mm0 - movq mm3, mm6 - PALIGNR mm3, mm7, 7, mm0 - PALIGNR mm4, mm6, 1, mm0 - movq mm5, mm7 - movq mm1, mm7 - movq mm7, mm6 - lea r4, [r2+r3*2] - psllq mm1, 8 - PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6 - PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6 - movq [r4+r3*2], mm1 - movq mm2, mm0 - psllq mm1, 8 - psrlq mm2, 56 - psllq mm0, 8 - por mm1, mm2 - movq [r4+r3*1], mm1 - movq mm2, mm0 - psllq mm1, 8 - psrlq mm2, 56 - psllq mm0, 8 - por mm1, mm2 - movq [r2+r3*2], mm1 - movq mm2, mm0 - psllq mm1, 8 - psrlq mm2, 56 - psllq mm0, 8 - por mm1, mm2 - movq [r2+r3*1], mm1 - movq mm2, mm0 - psllq mm1, 8 - psrlq mm2, 56 - psllq mm0, 8 - por mm1, mm2 - movq [r1+r3*2], mm1 - movq mm2, mm0 - psllq mm1, 8 - psrlq mm2, 56 - psllq mm0, 8 - por mm1, mm2 - movq [r1+r3*1], mm1 - movq mm2, mm0 - psllq mm1, 8 - psrlq mm2, 56 - psllq mm0, 8 - por mm1, mm2 - movq [r0+r3*2], mm1 - psllq mm1, 8 - psrlq mm0, 56 - por mm1, mm0 - movq [r0+r3*1], mm1 - RET - -%macro PRED8x8L_DOWN_LEFT 0 -cglobal pred8x8l_down_left_8, 4,4 - sub r0, r3 - movq mm0, [r0-8] - movq mm3, [r0] - movq mm1, [r0+8] - movq mm2, mm3 - movq mm4, mm3 - PALIGNR mm2, mm0, 7, mm0 - PALIGNR mm1, mm4, 1, mm4 - test r1, r1 ; top_left - jz .fix_lt_2 - test r2, r2 ; top_right - jz .fix_tr_1 - jmp .do_top -.fix_lt_2: - movq mm5, mm3 - pxor mm5, mm2 - psllq mm5, 56 - psrlq mm5, 56 - pxor mm2, mm5 - test r2, r2 ; top_right - jnz .do_top -.fix_tr_1: - movq mm5, mm3 - pxor mm5, mm1 - psrlq mm5, 56 - psllq mm5, 56 - pxor mm1, mm5 - jmp .do_top -.fix_tr_2: - punpckhbw mm3, mm3 - pshufw mm1, mm3, 0xFF - jmp .do_topright -.do_top: - PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 - movq2dq xmm3, mm4 - test r2, r2 ; top_right - jz .fix_tr_2 - movq mm0, [r0+8] - movq mm5, mm0 - movq mm2, mm0 - movq mm4, mm0 - psrlq mm5, 56 - PALIGNR mm2, mm3, 7, mm3 - PALIGNR mm5, mm4, 1, mm4 - 
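
pred8x8l_vertical_8 above only has to produce one filtered row and broadcast it; the .fix_lt_2 / .fix_tr_1 paths patch the end samples when the top-left or top-right neighbour is unavailable. A scalar sketch, simplified by assuming both neighbours exist:

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Scalar model of pred8x8l_vertical_8: smooth the top row with the
 * 3-tap filter and copy it into all 8 rows of the block. */
static void pred8x8l_vertical_ref(uint8_t *src, ptrdiff_t stride)
{
    const uint8_t *t = src - stride;      /* row above the block */
    uint8_t row[8];
    for (int x = 0; x < 8; x++)
        row[x] = (t[x - 1] + 2 * t[x] + t[x + 1] + 2) >> 2;
    for (int y = 0; y < 8; y++)
        memcpy(src + y * stride, row, 8);
}
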
PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 -.do_topright: - movq2dq xmm4, mm1 - psrlq mm1, 56 - movq2dq xmm5, mm1 - lea r1, [r0+r3*2] - pslldq xmm4, 8 - por xmm3, xmm4 - movdqa xmm2, xmm3 - psrldq xmm2, 1 - pslldq xmm5, 15 - por xmm2, xmm5 - lea r2, [r1+r3*2] - movdqa xmm1, xmm3 - pslldq xmm1, 1 -INIT_XMM cpuname - PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4 - psrldq xmm0, 1 - movq [r0+r3*1], xmm0 - psrldq xmm0, 1 - movq [r0+r3*2], xmm0 - psrldq xmm0, 1 - lea r0, [r2+r3*2] - movq [r1+r3*1], xmm0 - psrldq xmm0, 1 - movq [r1+r3*2], xmm0 - psrldq xmm0, 1 - movq [r2+r3*1], xmm0 - psrldq xmm0, 1 - movq [r2+r3*2], xmm0 - psrldq xmm0, 1 - movq [r0+r3*1], xmm0 - psrldq xmm0, 1 - movq [r0+r3*2], xmm0 - RET -%endmacro - -INIT_MMX sse2 -PRED8x8L_DOWN_LEFT -INIT_MMX ssse3 -PRED8x8L_DOWN_LEFT - -;----------------------------------------------------------------------------- -;void pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride) -;----------------------------------------------------------------------------- - -INIT_MMX mmxext -cglobal pred8x8l_down_right_8, 4,5 - sub r0, r3 - lea r4, [r0+r3*2] - movq mm0, [r0+r3*1-8] - punpckhbw mm0, [r0+r3*0-8] - movq mm1, [r4+r3*1-8] - punpckhbw mm1, [r0+r3*2-8] - mov r4, r0 - punpckhwd mm1, mm0 - lea r0, [r0+r3*4] - movq mm2, [r0+r3*1-8] - punpckhbw mm2, [r0+r3*0-8] - lea r0, [r0+r3*2] - movq mm3, [r0+r3*1-8] - punpckhbw mm3, [r0+r3*0-8] - punpckhwd mm3, mm2 - punpckhdq mm3, mm1 - lea r0, [r0+r3*2] - movq mm0, [r0+r3*0-8] - movq mm1, [r4] - mov r0, r4 - movq mm4, mm3 - movq mm2, mm3 - PALIGNR mm4, mm0, 7, mm0 - PALIGNR mm1, mm2, 1, mm2 - test r1, r1 ; top_left - jz .fix_lt_1 -.do_left: - movq mm0, mm4 - PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 - movq mm4, mm0 - movq mm7, mm2 - movq mm6, mm2 - PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 - psllq mm1, 56 - PALIGNR mm7, mm1, 7, mm3 - movq mm0, [r0-8] - movq mm3, [r0] - movq mm1, [r0+8] - movq mm2, mm3 - movq mm4, mm3 - PALIGNR mm2, mm0, 7, mm0 - PALIGNR mm1, mm4, 1, mm4 - test r1, r1 ; top_left - jz .fix_lt_2 - test r2, r2 ; top_right - jz .fix_tr_1 -.do_top: - PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 - movq mm5, mm4 - jmp .body -.fix_lt_1: - movq mm5, mm3 - pxor mm5, mm4 - psrlq mm5, 56 - psllq mm5, 48 - pxor mm1, mm5 - jmp .do_left -.fix_lt_2: - movq mm5, mm3 - pxor mm5, mm2 - psllq mm5, 56 - psrlq mm5, 56 - pxor mm2, mm5 - test r2, r2 ; top_right - jnz .do_top -.fix_tr_1: - movq mm5, mm3 - pxor mm5, mm1 - psrlq mm5, 56 - psllq mm5, 56 - pxor mm1, mm5 - jmp .do_top -.body: - lea r1, [r0+r3*2] - movq mm1, mm7 - movq mm7, mm5 - movq mm5, mm6 - movq mm2, mm7 - lea r2, [r1+r3*2] - PALIGNR mm2, mm6, 1, mm0 - movq mm3, mm7 - PALIGNR mm3, mm6, 7, mm0 - movq mm4, mm7 - lea r4, [r2+r3*2] - psrlq mm4, 8 - PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6 - PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6 - movq [r4+r3*2], mm0 - movq mm2, mm1 - psrlq mm0, 8 - psllq mm2, 56 - psrlq mm1, 8 - por mm0, mm2 - movq [r4+r3*1], mm0 - movq mm2, mm1 - psrlq mm0, 8 - psllq mm2, 56 - psrlq mm1, 8 - por mm0, mm2 - movq [r2+r3*2], mm0 - movq mm2, mm1 - psrlq mm0, 8 - psllq mm2, 56 - psrlq mm1, 8 - por mm0, mm2 - movq [r2+r3*1], mm0 - movq mm2, mm1 - psrlq mm0, 8 - psllq mm2, 56 - psrlq mm1, 8 - por mm0, mm2 - movq [r1+r3*2], mm0 - movq mm2, mm1 - psrlq mm0, 8 - psllq mm2, 56 - psrlq mm1, 8 - por mm0, mm2 - movq [r1+r3*1], mm0 - movq mm2, mm1 - psrlq mm0, 8 - psllq mm2, 56 - psrlq mm1, 8 - por mm0, mm2 - movq [r0+r3*2], mm0 - psrlq mm0, 8 - psllq mm1, 56 - por mm0, mm1 - movq [r0+r3*1], mm0 - RET - -%macro PRED8x8L_DOWN_RIGHT 0 
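
The pred8x8l_down_left_8 implementations above (the MMX routine, then the SSE2/SSSE3 macro) walk a 16-sample filtered top plus top-right edge along the 45-degree diagonal; the .fix_tr_2 path replicates the last top sample when top-right is missing. Roughly, in scalar form (t[] stands for the filtered edge the asm keeps in registers):

#include <stdint.h>
#include <stddef.h>

/* Scalar model of the 8x8 diagonal down-left mode: each pixel is the
 * 3-tap filter of the filtered top/top-right samples t[0..15] taken
 * along the 45-degree diagonal, clamped at the last sample. */
static void pred8x8l_down_left_ref(uint8_t *src, ptrdiff_t stride,
                                   const uint8_t t[16])
{
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++) {
            int i = x + y;
            int r = i + 2 < 15 ? i + 2 : 15;   /* clamp for the corner pixel */
            src[y * stride + x] = (t[i] + 2 * t[i + 1] + t[r] + 2) >> 2;
        }
}
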
-cglobal pred8x8l_down_right_8, 4,5 - sub r0, r3 - lea r4, [r0+r3*2] - movq mm0, [r0+r3*1-8] - punpckhbw mm0, [r0+r3*0-8] - movq mm1, [r4+r3*1-8] - punpckhbw mm1, [r0+r3*2-8] - mov r4, r0 - punpckhwd mm1, mm0 - lea r0, [r0+r3*4] - movq mm2, [r0+r3*1-8] - punpckhbw mm2, [r0+r3*0-8] - lea r0, [r0+r3*2] - movq mm3, [r0+r3*1-8] - punpckhbw mm3, [r0+r3*0-8] - punpckhwd mm3, mm2 - punpckhdq mm3, mm1 - lea r0, [r0+r3*2] - movq mm0, [r0+r3*0-8] - movq mm1, [r4] - mov r0, r4 - movq mm4, mm3 - movq mm2, mm3 - PALIGNR mm4, mm0, 7, mm0 - PALIGNR mm1, mm2, 1, mm2 - test r1, r1 - jz .fix_lt_1 - jmp .do_left -.fix_lt_1: - movq mm5, mm3 - pxor mm5, mm4 - psrlq mm5, 56 - psllq mm5, 48 - pxor mm1, mm5 - jmp .do_left -.fix_lt_2: - movq mm5, mm3 - pxor mm5, mm2 - psllq mm5, 56 - psrlq mm5, 56 - pxor mm2, mm5 - test r2, r2 - jnz .do_top -.fix_tr_1: - movq mm5, mm3 - pxor mm5, mm1 - psrlq mm5, 56 - psllq mm5, 56 - pxor mm1, mm5 - jmp .do_top -.do_left: - movq mm0, mm4 - PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 - movq mm4, mm0 - movq mm7, mm2 - movq2dq xmm3, mm2 - PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 - psllq mm1, 56 - PALIGNR mm7, mm1, 7, mm3 - movq2dq xmm1, mm7 - movq mm0, [r0-8] - movq mm3, [r0] - movq mm1, [r0+8] - movq mm2, mm3 - movq mm4, mm3 - PALIGNR mm2, mm0, 7, mm0 - PALIGNR mm1, mm4, 1, mm4 - test r1, r1 - jz .fix_lt_2 - test r2, r2 - jz .fix_tr_1 -.do_top: - PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 - movq2dq xmm4, mm4 - lea r1, [r0+r3*2] - movdqa xmm0, xmm3 - pslldq xmm4, 8 - por xmm3, xmm4 - lea r2, [r1+r3*2] - pslldq xmm4, 1 - por xmm1, xmm4 - psrldq xmm0, 7 - pslldq xmm0, 15 - psrldq xmm0, 7 - por xmm1, xmm0 - lea r0, [r2+r3*2] - movdqa xmm2, xmm3 - psrldq xmm2, 1 -INIT_XMM cpuname - PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4 - movdqa xmm1, xmm0 - psrldq xmm1, 1 - movq [r0+r3*2], xmm0 - movq [r0+r3*1], xmm1 - psrldq xmm0, 2 - psrldq xmm1, 2 - movq [r2+r3*2], xmm0 - movq [r2+r3*1], xmm1 - psrldq xmm0, 2 - psrldq xmm1, 2 - movq [r1+r3*2], xmm0 - movq [r1+r3*1], xmm1 - psrldq xmm0, 2 - psrldq xmm1, 2 - movq [r4+r3*2], xmm0 - movq [r4+r3*1], xmm1 - RET -%endmacro - -INIT_MMX sse2 -PRED8x8L_DOWN_RIGHT -INIT_MMX ssse3 -PRED8x8L_DOWN_RIGHT - -;----------------------------------------------------------------------------- -; void pred8x8l_vertical_right_8(uint8_t *src, int has_topleft, int has_topright, int stride) -;----------------------------------------------------------------------------- - -INIT_MMX mmxext -cglobal pred8x8l_vertical_right_8, 4,5 - sub r0, r3 - lea r4, [r0+r3*2] - movq mm0, [r0+r3*1-8] - punpckhbw mm0, [r0+r3*0-8] - movq mm1, [r4+r3*1-8] - punpckhbw mm1, [r0+r3*2-8] - mov r4, r0 - punpckhwd mm1, mm0 - lea r0, [r0+r3*4] - movq mm2, [r0+r3*1-8] - punpckhbw mm2, [r0+r3*0-8] - lea r0, [r0+r3*2] - movq mm3, [r0+r3*1-8] - punpckhbw mm3, [r0+r3*0-8] - punpckhwd mm3, mm2 - punpckhdq mm3, mm1 - lea r0, [r0+r3*2] - movq mm0, [r0+r3*0-8] - movq mm1, [r4] - mov r0, r4 - movq mm4, mm3 - movq mm2, mm3 - PALIGNR mm4, mm0, 7, mm0 - PALIGNR mm1, mm2, 1, mm2 - test r1, r1 - jz .fix_lt_1 - jmp .do_left -.fix_lt_1: - movq mm5, mm3 - pxor mm5, mm4 - psrlq mm5, 56 - psllq mm5, 48 - pxor mm1, mm5 - jmp .do_left -.fix_lt_2: - movq mm5, mm3 - pxor mm5, mm2 - psllq mm5, 56 - psrlq mm5, 56 - pxor mm2, mm5 - test r2, r2 - jnz .do_top -.fix_tr_1: - movq mm5, mm3 - pxor mm5, mm1 - psrlq mm5, 56 - psllq mm5, 56 - pxor mm1, mm5 - jmp .do_top -.do_left: - movq mm0, mm4 - PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 - movq mm7, mm2 - movq mm0, [r0-8] - movq mm3, [r0] - movq mm1, [r0+8] - movq mm2, mm3 - movq mm4, mm3 - 
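
pred8x8l_down_right_8 needs all three edges: the filtered left column, the top-left corner and the filtered top row, and filters along the 135-degree diagonal. A scalar model of what the routines above compute (e[], left[], top[] and topleft are illustrative names, not from the source):

#include <stdint.h>
#include <stddef.h>

/* Scalar model of the 8x8 diagonal down-right mode: build a 17-sample
 * edge from the filtered left column, top-left corner and top row,
 * then apply the 3-tap filter along each 135-degree diagonal. */
static void pred8x8l_down_right_ref(uint8_t *src, ptrdiff_t stride,
                                    const uint8_t left[8],  /* left[0] = row 0 */
                                    uint8_t topleft,
                                    const uint8_t top[8])
{
    uint8_t e[17];
    for (int i = 0; i < 8; i++)
        e[i] = left[7 - i];               /* bottom-most left sample first */
    e[8] = topleft;
    for (int i = 0; i < 8; i++)
        e[9 + i] = top[i];

    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++) {
            int k = 8 + x - y;            /* centre tap on the diagonal */
            src[y * stride + x] = (e[k - 1] + 2 * e[k] + e[k + 1] + 2) >> 2;
        }
}
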
PALIGNR mm2, mm0, 7, mm0 - PALIGNR mm1, mm4, 1, mm4 - test r1, r1 - jz .fix_lt_2 - test r2, r2 - jz .fix_tr_1 -.do_top: - PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 - lea r1, [r0+r3*2] - movq mm2, mm6 - movq mm3, mm6 - PALIGNR mm3, mm7, 7, mm0 - PALIGNR mm6, mm7, 6, mm1 - movq mm4, mm3 - pavgb mm3, mm2 - lea r2, [r1+r3*2] - PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5 - movq [r0+r3*1], mm3 - movq [r0+r3*2], mm0 - movq mm5, mm0 - movq mm6, mm3 - movq mm1, mm7 - movq mm2, mm1 - psllq mm2, 8 - movq mm3, mm1 - psllq mm3, 16 - lea r4, [r2+r3*2] - PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4 - PALIGNR mm6, mm0, 7, mm2 - movq [r1+r3*1], mm6 - psllq mm0, 8 - PALIGNR mm5, mm0, 7, mm1 - movq [r1+r3*2], mm5 - psllq mm0, 8 - PALIGNR mm6, mm0, 7, mm2 - movq [r2+r3*1], mm6 - psllq mm0, 8 - PALIGNR mm5, mm0, 7, mm1 - movq [r2+r3*2], mm5 - psllq mm0, 8 - PALIGNR mm6, mm0, 7, mm2 - movq [r4+r3*1], mm6 - psllq mm0, 8 - PALIGNR mm5, mm0, 7, mm1 - movq [r4+r3*2], mm5 - RET - -%macro PRED8x8L_VERTICAL_RIGHT 0 -cglobal pred8x8l_vertical_right_8, 4,5,7 - ; manually spill XMM registers for Win64 because - ; the code here is initialized with INIT_MMX - WIN64_SPILL_XMM 7 - sub r0, r3 - lea r4, [r0+r3*2] - movq mm0, [r0+r3*1-8] - punpckhbw mm0, [r0+r3*0-8] - movq mm1, [r4+r3*1-8] - punpckhbw mm1, [r0+r3*2-8] - mov r4, r0 - punpckhwd mm1, mm0 - lea r0, [r0+r3*4] - movq mm2, [r0+r3*1-8] - punpckhbw mm2, [r0+r3*0-8] - lea r0, [r0+r3*2] - movq mm3, [r0+r3*1-8] - punpckhbw mm3, [r0+r3*0-8] - punpckhwd mm3, mm2 - punpckhdq mm3, mm1 - lea r0, [r0+r3*2] - movq mm0, [r0+r3*0-8] - movq mm1, [r4] - mov r0, r4 - movq mm4, mm3 - movq mm2, mm3 - PALIGNR mm4, mm0, 7, mm0 - PALIGNR mm1, mm2, 1, mm2 - test r1, r1 - jnz .do_left -.fix_lt_1: - movq mm5, mm3 - pxor mm5, mm4 - psrlq mm5, 56 - psllq mm5, 48 - pxor mm1, mm5 - jmp .do_left -.fix_lt_2: - movq mm5, mm3 - pxor mm5, mm2 - psllq mm5, 56 - psrlq mm5, 56 - pxor mm2, mm5 - test r2, r2 - jnz .do_top -.fix_tr_1: - movq mm5, mm3 - pxor mm5, mm1 - psrlq mm5, 56 - psllq mm5, 56 - pxor mm1, mm5 - jmp .do_top -.do_left: - movq mm0, mm4 - PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 - movq2dq xmm0, mm2 - movq mm0, [r0-8] - movq mm3, [r0] - movq mm1, [r0+8] - movq mm2, mm3 - movq mm4, mm3 - PALIGNR mm2, mm0, 7, mm0 - PALIGNR mm1, mm4, 1, mm4 - test r1, r1 - jz .fix_lt_2 - test r2, r2 - jz .fix_tr_1 -.do_top: - PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 - lea r1, [r0+r3*2] - movq2dq xmm4, mm6 - pslldq xmm4, 8 - por xmm0, xmm4 - movdqa xmm6, [pw_ff00] - movdqa xmm1, xmm0 - lea r2, [r1+r3*2] - movdqa xmm2, xmm0 - movdqa xmm3, xmm0 - pslldq xmm0, 1 - pslldq xmm1, 2 - pavgb xmm2, xmm0 -INIT_XMM cpuname - PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5 - pandn xmm6, xmm4 - movdqa xmm5, xmm4 - psrlw xmm4, 8 - packuswb xmm6, xmm4 - movhlps xmm4, xmm6 - movhps [r0+r3*2], xmm5 - movhps [r0+r3*1], xmm2 - psrldq xmm5, 4 - movss xmm5, xmm6 - psrldq xmm2, 4 - movss xmm2, xmm4 - lea r0, [r2+r3*2] - psrldq xmm5, 1 - psrldq xmm2, 1 - movq [r0+r3*2], xmm5 - movq [r0+r3*1], xmm2 - psrldq xmm5, 1 - psrldq xmm2, 1 - movq [r2+r3*2], xmm5 - movq [r2+r3*1], xmm2 - psrldq xmm5, 1 - psrldq xmm2, 1 - movq [r1+r3*2], xmm5 - movq [r1+r3*1], xmm2 - RET -%endmacro - -INIT_MMX sse2 -PRED8x8L_VERTICAL_RIGHT -INIT_MMX ssse3 -PRED8x8L_VERTICAL_RIGHT - -;----------------------------------------------------------------------------- -;void pred8x8l_vertical_left_8(uint8_t *src, int has_topleft, int has_topright, int stride) -;----------------------------------------------------------------------------- - -%macro PRED8x8L_VERTICAL_LEFT 0 -cglobal 
pred8x8l_vertical_left_8, 4,4 - sub r0, r3 - movq mm0, [r0-8] - movq mm3, [r0] - movq mm1, [r0+8] - movq mm2, mm3 - movq mm4, mm3 - PALIGNR mm2, mm0, 7, mm0 - PALIGNR mm1, mm4, 1, mm4 - test r1, r1 - jz .fix_lt_2 - test r2, r2 - jz .fix_tr_1 - jmp .do_top -.fix_lt_2: - movq mm5, mm3 - pxor mm5, mm2 - psllq mm5, 56 - psrlq mm5, 56 - pxor mm2, mm5 - test r2, r2 - jnz .do_top -.fix_tr_1: - movq mm5, mm3 - pxor mm5, mm1 - psrlq mm5, 56 - psllq mm5, 56 - pxor mm1, mm5 - jmp .do_top -.fix_tr_2: - punpckhbw mm3, mm3 - pshufw mm1, mm3, 0xFF - jmp .do_topright -.do_top: - PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 - movq2dq xmm4, mm4 - test r2, r2 - jz .fix_tr_2 - movq mm0, [r0+8] - movq mm5, mm0 - movq mm2, mm0 - movq mm4, mm0 - psrlq mm5, 56 - PALIGNR mm2, mm3, 7, mm3 - PALIGNR mm5, mm4, 1, mm4 - PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 -.do_topright: - movq2dq xmm3, mm1 - lea r1, [r0+r3*2] - pslldq xmm3, 8 - por xmm4, xmm3 - movdqa xmm2, xmm4 - movdqa xmm1, xmm4 - movdqa xmm3, xmm4 - psrldq xmm2, 1 - pslldq xmm1, 1 - pavgb xmm3, xmm2 - lea r2, [r1+r3*2] -INIT_XMM cpuname - PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5 - psrldq xmm0, 1 - movq [r0+r3*1], xmm3 - movq [r0+r3*2], xmm0 - lea r0, [r2+r3*2] - psrldq xmm3, 1 - psrldq xmm0, 1 - movq [r1+r3*1], xmm3 - movq [r1+r3*2], xmm0 - psrldq xmm3, 1 - psrldq xmm0, 1 - movq [r2+r3*1], xmm3 - movq [r2+r3*2], xmm0 - psrldq xmm3, 1 - psrldq xmm0, 1 - movq [r0+r3*1], xmm3 - movq [r0+r3*2], xmm0 - RET -%endmacro - -INIT_MMX sse2 -PRED8x8L_VERTICAL_LEFT -INIT_MMX ssse3 -PRED8x8L_VERTICAL_LEFT - -;----------------------------------------------------------------------------- -; void pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft, int has_topright, int stride) -;----------------------------------------------------------------------------- - -%macro PRED8x8L_HORIZONTAL_UP 0 -cglobal pred8x8l_horizontal_up_8, 4,4 - sub r0, r3 - lea r2, [r0+r3*2] - movq mm0, [r0+r3*1-8] - test r1, r1 - lea r1, [r0+r3] - cmovnz r1, r0 - punpckhbw mm0, [r1+r3*0-8] - movq mm1, [r2+r3*1-8] - punpckhbw mm1, [r0+r3*2-8] - mov r2, r0 - punpckhwd mm1, mm0 - lea r0, [r0+r3*4] - movq mm2, [r0+r3*1-8] - punpckhbw mm2, [r0+r3*0-8] - lea r0, [r0+r3*2] - movq mm3, [r0+r3*1-8] - punpckhbw mm3, [r0+r3*0-8] - punpckhwd mm3, mm2 - punpckhdq mm3, mm1 - lea r0, [r0+r3*2] - movq mm0, [r0+r3*0-8] - movq mm1, [r1+r3*0-8] - mov r0, r2 - movq mm4, mm3 - movq mm2, mm3 - PALIGNR mm4, mm0, 7, mm0 - PALIGNR mm1, mm2, 1, mm2 - movq mm0, mm4 - PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 - movq mm4, mm0 - movq mm7, mm2 - PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 - psllq mm1, 56 - PALIGNR mm7, mm1, 7, mm3 - lea r1, [r0+r3*2] - pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1 - psllq mm7, 56 ; l7 .. .. .. .. .. .. .. 
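
pred8x8l_vertical_left_8 above interleaves pavgb rows with low-pass rows while sliding the filtered top/top-right edge one sample per row pair. In scalar terms (t[] again stands for the filtered 16-sample edge):

#include <stdint.h>
#include <stddef.h>

/* Scalar model of the 8x8 vertical-left mode: even rows are 2-tap
 * averages, odd rows are 3-tap filtered, each row pair shifted left
 * by one sample relative to the previous pair. */
static void pred8x8l_vertical_left_ref(uint8_t *src, ptrdiff_t stride,
                                       const uint8_t t[16])
{
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++) {
            int i = x + (y >> 1);
            src[y * stride + x] = (y & 1)
                ? (t[i] + 2 * t[i + 1] + t[i + 2] + 2) >> 2
                : (t[i] + t[i + 1] + 1) >> 1;
        }
}
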
- movq mm2, mm0 - psllw mm0, 8 - psrlw mm2, 8 - por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0 - movq mm3, mm2 - movq mm4, mm2 - movq mm5, mm2 - psrlq mm2, 8 - psrlq mm3, 16 - lea r2, [r1+r3*2] - por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1 - punpckhbw mm7, mm7 - por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2 - pavgb mm4, mm2 - PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6 - movq mm5, mm4 - punpcklbw mm4, mm1 ; p4 p3 p2 p1 - punpckhbw mm5, mm1 ; p8 p7 p6 p5 - movq mm6, mm5 - movq mm7, mm5 - movq mm0, mm5 - PALIGNR mm5, mm4, 2, mm1 - pshufw mm1, mm6, 11111001b - PALIGNR mm6, mm4, 4, mm2 - pshufw mm2, mm7, 11111110b - PALIGNR mm7, mm4, 6, mm3 - pshufw mm3, mm0, 11111111b - movq [r0+r3*1], mm4 - movq [r0+r3*2], mm5 - lea r0, [r2+r3*2] - movq [r1+r3*1], mm6 - movq [r1+r3*2], mm7 - movq [r2+r3*1], mm0 - movq [r2+r3*2], mm1 - movq [r0+r3*1], mm2 - movq [r0+r3*2], mm3 - RET -%endmacro - -INIT_MMX mmxext -PRED8x8L_HORIZONTAL_UP -INIT_MMX ssse3 -PRED8x8L_HORIZONTAL_UP - -;----------------------------------------------------------------------------- -;void pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft, int has_topright, int stride) -;----------------------------------------------------------------------------- - -INIT_MMX mmxext -cglobal pred8x8l_horizontal_down_8, 4,5 - sub r0, r3 - lea r4, [r0+r3*2] - movq mm0, [r0+r3*1-8] - punpckhbw mm0, [r0+r3*0-8] - movq mm1, [r4+r3*1-8] - punpckhbw mm1, [r0+r3*2-8] - mov r4, r0 - punpckhwd mm1, mm0 - lea r0, [r0+r3*4] - movq mm2, [r0+r3*1-8] - punpckhbw mm2, [r0+r3*0-8] - lea r0, [r0+r3*2] - movq mm3, [r0+r3*1-8] - punpckhbw mm3, [r0+r3*0-8] - punpckhwd mm3, mm2 - punpckhdq mm3, mm1 - lea r0, [r0+r3*2] - movq mm0, [r0+r3*0-8] - movq mm1, [r4] - mov r0, r4 - movq mm4, mm3 - movq mm2, mm3 - PALIGNR mm4, mm0, 7, mm0 - PALIGNR mm1, mm2, 1, mm2 - test r1, r1 - jnz .do_left -.fix_lt_1: - movq mm5, mm3 - pxor mm5, mm4 - psrlq mm5, 56 - psllq mm5, 48 - pxor mm1, mm5 - jmp .do_left -.fix_lt_2: - movq mm5, mm3 - pxor mm5, mm2 - psllq mm5, 56 - psrlq mm5, 56 - pxor mm2, mm5 - test r2, r2 - jnz .do_top -.fix_tr_1: - movq mm5, mm3 - pxor mm5, mm1 - psrlq mm5, 56 - psllq mm5, 56 - pxor mm1, mm5 - jmp .do_top -.do_left: - movq mm0, mm4 - PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 - movq mm4, mm0 - movq mm7, mm2 - movq mm6, mm2 - PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 - psllq mm1, 56 - PALIGNR mm7, mm1, 7, mm3 - movq mm0, [r0-8] - movq mm3, [r0] - movq mm1, [r0+8] - movq mm2, mm3 - movq mm4, mm3 - PALIGNR mm2, mm0, 7, mm0 - PALIGNR mm1, mm4, 1, mm4 - test r1, r1 - jz .fix_lt_2 - test r2, r2 - jz .fix_tr_1 -.do_top: - PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 - movq mm5, mm4 - lea r1, [r0+r3*2] - psllq mm7, 56 - movq mm2, mm5 - movq mm3, mm6 - movq mm4, mm2 - PALIGNR mm2, mm6, 7, mm5 - PALIGNR mm6, mm7, 7, mm0 - lea r2, [r1+r3*2] - PALIGNR mm4, mm3, 1, mm7 - movq mm5, mm3 - pavgb mm3, mm6 - PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7 - movq mm4, mm2 - movq mm1, mm2 - lea r4, [r2+r3*2] - psrlq mm4, 16 - psrlq mm1, 8 - PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5 - movq mm7, mm3 - punpcklbw mm3, mm0 - punpckhbw mm7, mm0 - movq mm1, mm7 - movq mm0, mm7 - movq mm4, mm7 - movq [r4+r3*2], mm3 - PALIGNR mm7, mm3, 2, mm5 - movq [r4+r3*1], mm7 - PALIGNR mm1, mm3, 4, mm5 - movq [r2+r3*2], mm1 - PALIGNR mm0, mm3, 6, mm3 - movq [r2+r3*1], mm0 - movq mm2, mm6 - movq mm3, mm6 - movq [r1+r3*2], mm4 - PALIGNR mm6, mm4, 2, mm5 - movq [r1+r3*1], mm6 - PALIGNR mm2, mm4, 4, mm5 - movq [r0+r3*2], mm2 - PALIGNR mm3, mm4, 6, mm4 - movq [r0+r3*1], mm3 - RET - -%macro PRED8x8L_HORIZONTAL_DOWN 0 -cglobal 
pred8x8l_horizontal_down_8, 4,5 - sub r0, r3 - lea r4, [r0+r3*2] - movq mm0, [r0+r3*1-8] - punpckhbw mm0, [r0+r3*0-8] - movq mm1, [r4+r3*1-8] - punpckhbw mm1, [r0+r3*2-8] - mov r4, r0 - punpckhwd mm1, mm0 - lea r0, [r0+r3*4] - movq mm2, [r0+r3*1-8] - punpckhbw mm2, [r0+r3*0-8] - lea r0, [r0+r3*2] - movq mm3, [r0+r3*1-8] - punpckhbw mm3, [r0+r3*0-8] - punpckhwd mm3, mm2 - punpckhdq mm3, mm1 - lea r0, [r0+r3*2] - movq mm0, [r0+r3*0-8] - movq mm1, [r4] - mov r0, r4 - movq mm4, mm3 - movq mm2, mm3 - PALIGNR mm4, mm0, 7, mm0 - PALIGNR mm1, mm2, 1, mm2 - test r1, r1 - jnz .do_left -.fix_lt_1: - movq mm5, mm3 - pxor mm5, mm4 - psrlq mm5, 56 - psllq mm5, 48 - pxor mm1, mm5 - jmp .do_left -.fix_lt_2: - movq mm5, mm3 - pxor mm5, mm2 - psllq mm5, 56 - psrlq mm5, 56 - pxor mm2, mm5 - test r2, r2 - jnz .do_top -.fix_tr_1: - movq mm5, mm3 - pxor mm5, mm1 - psrlq mm5, 56 - psllq mm5, 56 - pxor mm1, mm5 - jmp .do_top -.fix_tr_2: - punpckhbw mm3, mm3 - pshufw mm1, mm3, 0xFF - jmp .do_topright -.do_left: - movq mm0, mm4 - PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 - movq2dq xmm0, mm2 - pslldq xmm0, 8 - movq mm4, mm0 - PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 - movq2dq xmm2, mm1 - pslldq xmm2, 15 - psrldq xmm2, 8 - por xmm0, xmm2 - movq mm0, [r0-8] - movq mm3, [r0] - movq mm1, [r0+8] - movq mm2, mm3 - movq mm4, mm3 - PALIGNR mm2, mm0, 7, mm0 - PALIGNR mm1, mm4, 1, mm4 - test r1, r1 - jz .fix_lt_2 - test r2, r2 - jz .fix_tr_1 -.do_top: - PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 - movq2dq xmm1, mm4 - test r2, r2 - jz .fix_tr_2 - movq mm0, [r0+8] - movq mm5, mm0 - movq mm2, mm0 - movq mm4, mm0 - psrlq mm5, 56 - PALIGNR mm2, mm3, 7, mm3 - PALIGNR mm5, mm4, 1, mm4 - PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 -.do_topright: - movq2dq xmm5, mm1 - pslldq xmm5, 8 - por xmm1, xmm5 -INIT_XMM cpuname - lea r2, [r4+r3*2] - movdqa xmm2, xmm1 - movdqa xmm3, xmm1 - PALIGNR xmm1, xmm0, 7, xmm4 - PALIGNR xmm2, xmm0, 9, xmm5 - lea r1, [r2+r3*2] - PALIGNR xmm3, xmm0, 8, xmm0 - movdqa xmm4, xmm1 - pavgb xmm4, xmm3 - lea r0, [r1+r3*2] - PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5 - punpcklbw xmm4, xmm0 - movhlps xmm0, xmm4 - movq [r0+r3*2], xmm4 - movq [r2+r3*2], xmm0 - psrldq xmm4, 2 - psrldq xmm0, 2 - movq [r0+r3*1], xmm4 - movq [r2+r3*1], xmm0 - psrldq xmm4, 2 - psrldq xmm0, 2 - movq [r1+r3*2], xmm4 - movq [r4+r3*2], xmm0 - psrldq xmm4, 2 - psrldq xmm0, 2 - movq [r1+r3*1], xmm4 - movq [r4+r3*1], xmm0 - RET -%endmacro - -INIT_MMX sse2 -PRED8x8L_HORIZONTAL_DOWN -INIT_MMX ssse3 -PRED8x8L_HORIZONTAL_DOWN - -;----------------------------------------------------------------------------- -; void pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) -;----------------------------------------------------------------------------- - -INIT_MMX mmxext -cglobal pred4x4_dc_8, 3,5 - pxor mm7, mm7 - mov r4, r0 - sub r0, r2 - movd mm0, [r0] - psadbw mm0, mm7 - movzx r1d, byte [r0+r2*1-1] - movd r3d, mm0 - add r3d, r1d - movzx r1d, byte [r0+r2*2-1] - lea r0, [r0+r2*2] - add r3d, r1d - movzx r1d, byte [r0+r2*1-1] - add r3d, r1d - movzx r1d, byte [r0+r2*2-1] - add r3d, r1d - add r3d, 4 - shr r3d, 3 - imul r3d, 0x01010101 - mov [r4+r2*0], r3d - mov [r0+r2*0], r3d - mov [r0+r2*1], r3d - mov [r0+r2*2], r3d - RET - -;----------------------------------------------------------------------------- -; void pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) -;----------------------------------------------------------------------------- - -%macro PRED4x4_TM 0 -cglobal pred4x4_tm_vp8_8, 3,6 - sub r0, r2 - pxor mm7, mm7 - movd 
mm0, [r0] - punpcklbw mm0, mm7 - movzx r4d, byte [r0-1] - mov r5d, 2 -.loop: - movzx r1d, byte [r0+r2*1-1] - movzx r3d, byte [r0+r2*2-1] - sub r1d, r4d - sub r3d, r4d - movd mm2, r1d - movd mm4, r3d -%if cpuflag(mmxext) - pshufw mm2, mm2, 0 - pshufw mm4, mm4, 0 -%else - punpcklwd mm2, mm2 - punpcklwd mm4, mm4 - punpckldq mm2, mm2 - punpckldq mm4, mm4 -%endif - paddw mm2, mm0 - paddw mm4, mm0 - packuswb mm2, mm2 - packuswb mm4, mm4 - movd [r0+r2*1], mm2 - movd [r0+r2*2], mm4 - lea r0, [r0+r2*2] - dec r5d - jg .loop - REP_RET -%endmacro - -INIT_MMX mmx -PRED4x4_TM -INIT_MMX mmxext -PRED4x4_TM - -INIT_XMM ssse3 -cglobal pred4x4_tm_vp8_8, 3,3 - sub r0, r2 - movq mm6, [tm_shuf] - pxor mm1, mm1 - movd mm0, [r0] - punpcklbw mm0, mm1 - movd mm7, [r0-4] - pshufb mm7, mm6 - lea r1, [r0+r2*2] - movd mm2, [r0+r2*1-4] - movd mm3, [r0+r2*2-4] - movd mm4, [r1+r2*1-4] - movd mm5, [r1+r2*2-4] - pshufb mm2, mm6 - pshufb mm3, mm6 - pshufb mm4, mm6 - pshufb mm5, mm6 - psubw mm2, mm7 - psubw mm3, mm7 - psubw mm4, mm7 - psubw mm5, mm7 - paddw mm2, mm0 - paddw mm3, mm0 - paddw mm4, mm0 - paddw mm5, mm0 - packuswb mm2, mm2 - packuswb mm3, mm3 - packuswb mm4, mm4 - packuswb mm5, mm5 - movd [r0+r2*1], mm2 - movd [r0+r2*2], mm3 - movd [r1+r2*1], mm4 - movd [r1+r2*2], mm5 - RET - -;----------------------------------------------------------------------------- -; void pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) -;----------------------------------------------------------------------------- - -INIT_MMX mmxext -cglobal pred4x4_vertical_vp8_8, 3,3 - sub r0, r2 - movd m1, [r0-1] - movd m0, [r0] - mova m2, m0 ;t0 t1 t2 t3 - punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7 - lea r1, [r0+r2*2] - psrlq m0, 8 ;t1 t2 t3 t4 - PRED4x4_LOWPASS m3, m1, m0, m2, m4 - movd [r0+r2*1], m3 - movd [r0+r2*2], m3 - movd [r1+r2*1], m3 - movd [r1+r2*2], m3 - RET - -;----------------------------------------------------------------------------- -; void pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) -;----------------------------------------------------------------------------- -INIT_MMX mmxext -cglobal pred4x4_down_left_8, 3,3 - sub r0, r2 - movq m1, [r0] - punpckldq m1, [r1] - movq m2, m1 - movq m3, m1 - psllq m1, 8 - pxor m2, m1 - psrlq m2, 8 - pxor m2, m3 - PRED4x4_LOWPASS m0, m1, m2, m3, m4 - lea r1, [r0+r2*2] - psrlq m0, 8 - movd [r0+r2*1], m0 - psrlq m0, 8 - movd [r0+r2*2], m0 - psrlq m0, 8 - movd [r1+r2*1], m0 - psrlq m0, 8 - movd [r1+r2*2], m0 - RET - -;----------------------------------------------------------------------------- -; void pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) -;----------------------------------------------------------------------------- - -INIT_MMX mmxext -cglobal pred4x4_vertical_left_8, 3,3 - sub r0, r2 - movq m1, [r0] - punpckldq m1, [r1] - movq m3, m1 - movq m2, m1 - psrlq m3, 8 - psrlq m2, 16 - movq m4, m3 - pavgb m4, m1 - PRED4x4_LOWPASS m0, m1, m2, m3, m5 - lea r1, [r0+r2*2] - movh [r0+r2*1], m4 - movh [r0+r2*2], m0 - psrlq m4, 8 - psrlq m0, 8 - movh [r1+r2*1], m4 - movh [r1+r2*2], m0 - RET - -;----------------------------------------------------------------------------- -; void pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) -;----------------------------------------------------------------------------- - -INIT_MMX mmxext -cglobal pred4x4_horizontal_up_8, 3,3 - sub r0, r2 - lea r1, [r0+r2*2] - movd m0, [r0+r2*1-4] - punpcklbw m0, [r0+r2*2-4] - movd m1, [r1+r2*1-4] - 
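
The *_tm_vp8 routines above (4x4 here, 8x8 near the top of this file) implement VP8's TrueMotion mode; the final packuswb is what performs the clip to 8 bits. A scalar sketch (names illustrative):

#include <stdint.h>
#include <stddef.h>

static inline uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Scalar model of VP8 TM prediction: every pixel is
 * top[x] + left[y] - topleft, clipped to [0,255].  The SIMD versions
 * do the same arithmetic in widened 16-bit lanes and let packuswb
 * perform the saturation. */
static void pred_tm_ref(uint8_t *src, ptrdiff_t stride, int size)
{
    const uint8_t *top = src - stride;
    int topleft = top[-1];
    for (int y = 0; y < size; y++)
        for (int x = 0; x < size; x++)
            src[y * stride + x] =
                clip_u8(top[x] + src[y * stride - 1] - topleft);
}

pred_tm_ref(src, stride, 4) models pred4x4_tm_vp8_8; size 8 models the pred8x8 variant.
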
punpcklbw m1, [r1+r2*2-4] - punpckhwd m0, m1 - movq m1, m0 - punpckhbw m1, m1 - pshufw m1, m1, 0xFF - punpckhdq m0, m1 - movq m2, m0 - movq m3, m0 - movq m7, m0 - psrlq m2, 16 - psrlq m3, 8 - pavgb m7, m3 - PRED4x4_LOWPASS m4, m0, m2, m3, m5 - punpcklbw m7, m4 - movd [r0+r2*1], m7 - psrlq m7, 16 - movd [r0+r2*2], m7 - psrlq m7, 16 - movd [r1+r2*1], m7 - movd [r1+r2*2], m1 - RET - -;----------------------------------------------------------------------------- -; void pred4x4_horizontal_down_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) -;----------------------------------------------------------------------------- - -INIT_MMX mmxext -cglobal pred4x4_horizontal_down_8, 3,3 - sub r0, r2 - lea r1, [r0+r2*2] - movh m0, [r0-4] ; lt .. - punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. .. - psllq m0, 8 ; t2 t1 t0 lt .. .. .. .. - movd m1, [r1+r2*2-4] ; l3 - punpcklbw m1, [r1+r2*1-4] ; l2 l3 - movd m2, [r0+r2*2-4] ; l1 - punpcklbw m2, [r0+r2*1-4] ; l0 l1 - punpckhwd m1, m2 ; l0 l1 l2 l3 - punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 - movq m0, m1 - movq m2, m1 - movq m5, m1 - psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1 - psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2 - pavgb m5, m2 - PRED4x4_LOWPASS m3, m1, m0, m2, m4 - punpcklbw m5, m3 - psrlq m3, 32 - PALIGNR m3, m5, 6, m4 - movh [r1+r2*2], m5 - psrlq m5, 16 - movh [r1+r2*1], m5 - psrlq m5, 16 - movh [r0+r2*2], m5 - movh [r0+r2*1], m3 - RET - -;----------------------------------------------------------------------------- -; void pred4x4_vertical_right_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) -;----------------------------------------------------------------------------- - -INIT_MMX mmxext -cglobal pred4x4_vertical_right_8, 3,3 - sub r0, r2 - lea r1, [r0+r2*2] - movh m0, [r0] ; ........t3t2t1t0 - movq m5, m0 - PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt - pavgb m5, m0 - PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0 - movq m1, m0 - PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1 - movq m2, m0 - PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2 - PRED4x4_LOWPASS m3, m1, m0, m2, m4 - movq m1, m3 - psrlq m3, 16 - psllq m1, 48 - movh [r0+r2*1], m5 - movh [r0+r2*2], m3 - PALIGNR m5, m1, 7, m2 - psllq m1, 8 - movh [r1+r2*1], m5 - PALIGNR m3, m1, 7, m1 - movh [r1+r2*2], m3 - RET - -;----------------------------------------------------------------------------- -; void pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) -;----------------------------------------------------------------------------- - -INIT_MMX mmxext -cglobal pred4x4_down_right_8, 3,3 - sub r0, r2 - lea r1, [r0+r2*2] - movq m1, [r1-8] - movq m2, [r0+r2*1-8] - punpckhbw m2, [r0-8] - movh m3, [r0] - punpckhwd m1, m2 - PALIGNR m3, m1, 5, m1 - movq m1, m3 - PALIGNR m3, [r1+r2*1-8], 7, m4 - movq m2, m3 - PALIGNR m3, [r1+r2*2-8], 7, m4 - PRED4x4_LOWPASS m0, m3, m1, m2, m4 - movh [r1+r2*2], m0 - psrlq m0, 8 - movh [r1+r2*1], m0 - psrlq m0, 8 - movh [r0+r2*2], m0 - psrlq m0, 8 - movh [r0+r2*1], m0 - RET diff --git a/ffmpeg1/libavcodec/x86/h264_intrapred_10bit.asm b/ffmpeg1/libavcodec/x86/h264_intrapred_10bit.asm deleted file mode 100644 index db2b25c..0000000 --- a/ffmpeg1/libavcodec/x86/h264_intrapred_10bit.asm +++ /dev/null @@ -1,1199 +0,0 @@ -;***************************************************************************** -;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code -;***************************************************************************** -;* Copyright (C) 2005-2011 x264 project -;* -;* Authors: Daniel Kang 
<daniel.d.kang@gmail.com> -;* -;* This file is part of Libav. -;* -;* Libav is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -cextern pw_16 -cextern pw_8 -cextern pw_4 -cextern pw_2 -cextern pw_1 - -pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 -pw_m3: times 8 dw -3 -pw_pixel_max: times 8 dw ((1 << 10)-1) -pw_512: times 8 dw 512 -pd_17: times 4 dd 17 -pd_16: times 4 dd 16 - -SECTION .text - -; dest, left, right, src -; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 -%macro PRED4x4_LOWPASS 4 - paddw %2, %3 - psrlw %2, 1 - pavgw %1, %4, %2 -%endmacro - -;----------------------------------------------------------------------------- -; void pred4x4_down_right(pixel *src, const pixel *topright, int stride) -;----------------------------------------------------------------------------- -%macro PRED4x4_DR 0 -cglobal pred4x4_down_right_10, 3, 3 - sub r0, r2 - lea r1, [r0+r2*2] - movhps m1, [r1-8] - movhps m2, [r0+r2*1-8] - movhps m4, [r0-8] - punpckhwd m2, m4 - movq m3, [r0] - punpckhdq m1, m2 - PALIGNR m3, m1, 10, m1 - movhps m4, [r1+r2*1-8] - PALIGNR m0, m3, m4, 14, m4 - movhps m4, [r1+r2*2-8] - PALIGNR m2, m0, m4, 14, m4 - PRED4x4_LOWPASS m0, m2, m3, m0 - movq [r1+r2*2], m0 - psrldq m0, 2 - movq [r1+r2*1], m0 - psrldq m0, 2 - movq [r0+r2*2], m0 - psrldq m0, 2 - movq [r0+r2*1], m0 - RET -%endmacro - -INIT_XMM sse2 -PRED4x4_DR -INIT_XMM ssse3 -PRED4x4_DR -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -PRED4x4_DR -%endif - -;----------------------------------------------------------------------------- -; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride) -;----------------------------------------------------------------------------- -%macro PRED4x4_VR 0 -cglobal pred4x4_vertical_right_10, 3, 3, 6 - sub r0, r2 - lea r1, [r0+r2*2] - movq m5, [r0] ; ........t3t2t1t0 - movhps m1, [r0-8] - PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt - pavgw m5, m0 - movhps m1, [r0+r2*1-8] - PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0 - movhps m2, [r0+r2*2-8] - PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1 - movhps m3, [r1+r2*1-8] - PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2 - PRED4x4_LOWPASS m1, m0, m2, m1 - pslldq m0, m1, 12 - psrldq m1, 4 - movq [r0+r2*1], m5 - movq [r0+r2*2], m1 - PALIGNR m5, m0, 14, m2 - pslldq m0, 2 - movq [r1+r2*1], m5 - PALIGNR m1, m0, 14, m0 - movq [r1+r2*2], m1 - RET -%endmacro - -INIT_XMM sse2 -PRED4x4_VR -INIT_XMM ssse3 -PRED4x4_VR -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -PRED4x4_VR -%endif - -;----------------------------------------------------------------------------- -; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride) -;----------------------------------------------------------------------------- -%macro PRED4x4_HD 0 -cglobal 
pred4x4_horizontal_down_10, 3, 3 - sub r0, r2 - lea r1, [r0+r2*2] - movq m0, [r0-8] ; lt .. - movhps m0, [r0] - pslldq m0, 2 ; t2 t1 t0 lt .. .. .. .. - movq m1, [r1+r2*2-8] ; l3 - movq m3, [r1+r2*1-8] - punpcklwd m1, m3 ; l2 l3 - movq m2, [r0+r2*2-8] ; l1 - movq m3, [r0+r2*1-8] - punpcklwd m2, m3 ; l0 l1 - punpckhdq m1, m2 ; l0 l1 l2 l3 - punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 - psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1 - psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2 - pavgw m5, m1, m3 - PRED4x4_LOWPASS m3, m1, m0, m3 - punpcklwd m5, m3 - psrldq m3, 8 - PALIGNR m3, m5, 12, m4 - movq [r1+r2*2], m5 - movhps [r0+r2*2], m5 - psrldq m5, 4 - movq [r1+r2*1], m5 - movq [r0+r2*1], m3 - RET -%endmacro - -INIT_XMM sse2 -PRED4x4_HD -INIT_XMM ssse3 -PRED4x4_HD -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -PRED4x4_HD -%endif - -;----------------------------------------------------------------------------- -; void pred4x4_dc(pixel *src, const pixel *topright, int stride) -;----------------------------------------------------------------------------- -%macro HADDD 2 ; sum junk -%if mmsize == 16 - movhlps %2, %1 - paddd %1, %2 - pshuflw %2, %1, 0xE - paddd %1, %2 -%else - pshufw %2, %1, 0xE - paddd %1, %2 -%endif -%endmacro - -%macro HADDW 2 - pmaddwd %1, [pw_1] - HADDD %1, %2 -%endmacro - -INIT_MMX mmxext -cglobal pred4x4_dc_10, 3, 3 - sub r0, r2 - lea r1, [r0+r2*2] - movq m2, [r0+r2*1-8] - paddw m2, [r0+r2*2-8] - paddw m2, [r1+r2*1-8] - paddw m2, [r1+r2*2-8] - psrlq m2, 48 - movq m0, [r0] - HADDW m0, m1 - paddw m0, [pw_4] - paddw m0, m2 - psrlw m0, 3 - SPLATW m0, m0, 0 - movq [r0+r2*1], m0 - movq [r0+r2*2], m0 - movq [r1+r2*1], m0 - movq [r1+r2*2], m0 - RET - -;----------------------------------------------------------------------------- -; void pred4x4_down_left(pixel *src, const pixel *topright, int stride) -;----------------------------------------------------------------------------- -%macro PRED4x4_DL 0 -cglobal pred4x4_down_left_10, 3, 3 - sub r0, r2 - movq m0, [r0] - movhps m0, [r1] - psrldq m2, m0, 2 - pslldq m3, m0, 2 - pshufhw m2, m2, 10100100b - PRED4x4_LOWPASS m0, m3, m2, m0 - lea r1, [r0+r2*2] - movhps [r1+r2*2], m0 - psrldq m0, 2 - movq [r0+r2*1], m0 - psrldq m0, 2 - movq [r0+r2*2], m0 - psrldq m0, 2 - movq [r1+r2*1], m0 - RET -%endmacro - -INIT_XMM sse2 -PRED4x4_DL -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -PRED4x4_DL -%endif - -;----------------------------------------------------------------------------- -; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride) -;----------------------------------------------------------------------------- -%macro PRED4x4_VL 0 -cglobal pred4x4_vertical_left_10, 3, 3 - sub r0, r2 - movu m1, [r0] - movhps m1, [r1] - psrldq m0, m1, 2 - psrldq m2, m1, 4 - pavgw m4, m0, m1 - PRED4x4_LOWPASS m0, m1, m2, m0 - lea r1, [r0+r2*2] - movq [r0+r2*1], m4 - movq [r0+r2*2], m0 - psrldq m4, 2 - psrldq m0, 2 - movq [r1+r2*1], m4 - movq [r1+r2*2], m0 - RET -%endmacro - -INIT_XMM sse2 -PRED4x4_VL -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -PRED4x4_VL -%endif - -;----------------------------------------------------------------------------- -; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride) -;----------------------------------------------------------------------------- -INIT_MMX mmxext -cglobal pred4x4_horizontal_up_10, 3, 3 - sub r0, r2 - lea r1, [r0+r2*2] - movq m0, [r0+r2*1-8] - punpckhwd m0, [r0+r2*2-8] - movq m1, [r1+r2*1-8] - punpckhwd m1, [r1+r2*2-8] - punpckhdq m0, m1 - pshufw m1, m1, 0xFF - movq [r1+r2*2], m1 - movd [r1+r2*1+4], m1 - 
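
The 10-bit PRED4x4_LOWPASS defined near the top of this file takes only four operands because 16-bit lanes leave headroom: paddw/psrlw form (l+r)>>1 and pavgw folds in the centre sample. A scalar sketch of the arithmetic it performs:

#include <stdint.h>

/* Scalar model of the 10-bit PRED4x4_LOWPASS:
 *   avg(src, (l + r) >> 1)  ==  (l + 2*src + r + 2) >> 2
 * for 10-bit inputs; the truncation in the first shift is compensated
 * by pavgw's +1 rounding. */
static uint16_t lowpass3_10(uint16_t l, uint16_t src, uint16_t r)
{
    uint16_t lr = (uint16_t)((l + r) >> 1);   /* paddw + psrlw */
    return (uint16_t)((src + lr + 1) >> 1);   /* pavgw rounds up */
}
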
pshufw m2, m0, 11111001b - movq m1, m2 - pavgw m2, m0 - - pshufw m5, m0, 11111110b - PRED4x4_LOWPASS m1, m0, m5, m1 - movq m6, m2 - punpcklwd m6, m1 - movq [r0+r2*1], m6 - psrlq m2, 16 - psrlq m1, 16 - punpcklwd m2, m1 - movq [r0+r2*2], m2 - psrlq m2, 32 - movd [r1+r2*1], m2 - RET - - - -;----------------------------------------------------------------------------- -; void pred8x8_vertical(pixel *src, int stride) -;----------------------------------------------------------------------------- -INIT_XMM sse2 -cglobal pred8x8_vertical_10, 2, 2 - sub r0, r1 - mova m0, [r0] -%rep 3 - mova [r0+r1*1], m0 - mova [r0+r1*2], m0 - lea r0, [r0+r1*2] -%endrep - mova [r0+r1*1], m0 - mova [r0+r1*2], m0 - RET - -;----------------------------------------------------------------------------- -; void pred8x8_horizontal(pixel *src, int stride) -;----------------------------------------------------------------------------- -INIT_XMM sse2 -cglobal pred8x8_horizontal_10, 2, 3 - mov r2d, 4 -.loop: - movq m0, [r0+r1*0-8] - movq m1, [r0+r1*1-8] - pshuflw m0, m0, 0xff - pshuflw m1, m1, 0xff - punpcklqdq m0, m0 - punpcklqdq m1, m1 - mova [r0+r1*0], m0 - mova [r0+r1*1], m1 - lea r0, [r0+r1*2] - dec r2d - jg .loop - REP_RET - -;----------------------------------------------------------------------------- -; void predict_8x8_dc(pixel *src, int stride) -;----------------------------------------------------------------------------- -%macro MOV8 2-3 -; sort of a hack, but it works -%if mmsize==8 - movq [%1+0], %2 - movq [%1+8], %3 -%else - movdqa [%1], %2 -%endif -%endmacro - -%macro PRED8x8_DC 1 -cglobal pred8x8_dc_10, 2, 6 - sub r0, r1 - pxor m4, m4 - movq m0, [r0+0] - movq m1, [r0+8] -%if mmsize==16 - punpcklwd m0, m1 - movhlps m1, m0 - paddw m0, m1 -%else - pshufw m2, m0, 00001110b - pshufw m3, m1, 00001110b - paddw m0, m2 - paddw m1, m3 - punpcklwd m0, m1 -%endif - %1 m2, m0, 00001110b - paddw m0, m2 - - lea r5, [r1*3] - lea r4, [r0+r1*4] - movzx r2d, word [r0+r1*1-2] - movzx r3d, word [r0+r1*2-2] - add r2d, r3d - movzx r3d, word [r0+r5*1-2] - add r2d, r3d - movzx r3d, word [r4-2] - add r2d, r3d - movd m2, r2d ; s2 - - movzx r2d, word [r4+r1*1-2] - movzx r3d, word [r4+r1*2-2] - add r2d, r3d - movzx r3d, word [r4+r5*1-2] - add r2d, r3d - movzx r3d, word [r4+r1*4-2] - add r2d, r3d - movd m3, r2d ; s3 - - punpcklwd m2, m3 - punpckldq m0, m2 ; s0, s1, s2, s3 - %1 m3, m0, 11110110b ; s2, s1, s3, s3 - %1 m0, m0, 01110100b ; s0, s1, s3, s1 - paddw m0, m3 - psrlw m0, 2 - pavgw m0, m4 ; s0+s2, s1, s3, s1+s3 -%if mmsize==16 - punpcklwd m0, m0 - pshufd m3, m0, 11111010b - punpckldq m0, m0 - SWAP 0,1 -%else - pshufw m1, m0, 0x00 - pshufw m2, m0, 0x55 - pshufw m3, m0, 0xaa - pshufw m4, m0, 0xff -%endif - MOV8 r0+r1*1, m1, m2 - MOV8 r0+r1*2, m1, m2 - MOV8 r0+r5*1, m1, m2 - MOV8 r0+r1*4, m1, m2 - MOV8 r4+r1*1, m3, m4 - MOV8 r4+r1*2, m3, m4 - MOV8 r4+r5*1, m3, m4 - MOV8 r4+r1*4, m3, m4 - RET -%endmacro - -INIT_MMX mmxext -PRED8x8_DC pshufw -INIT_XMM sse2 -PRED8x8_DC pshuflw - -;----------------------------------------------------------------------------- -; void pred8x8_top_dc(pixel *src, int stride) -;----------------------------------------------------------------------------- -INIT_XMM sse2 -cglobal pred8x8_top_dc_10, 2, 4 - sub r0, r1 - mova m0, [r0] - pshuflw m1, m0, 0x4e - pshufhw m1, m1, 0x4e - paddw m0, m1 - pshuflw m1, m0, 0xb1 - pshufhw m1, m1, 0xb1 - paddw m0, m1 - lea r2, [r1*3] - lea r3, [r0+r1*4] - paddw m0, [pw_2] - psrlw m0, 2 - mova [r0+r1*1], m0 - mova [r0+r1*2], m0 - mova [r0+r2*1], m0 - mova [r0+r1*4], m0 - mova 
[r3+r1*1], m0 - mova [r3+r1*2], m0 - mova [r3+r2*1], m0 - mova [r3+r1*4], m0 - RET - -;----------------------------------------------------------------------------- -; void pred8x8_plane(pixel *src, int stride) -;----------------------------------------------------------------------------- -INIT_XMM sse2 -cglobal pred8x8_plane_10, 2, 7, 7 - sub r0, r1 - lea r2, [r1*3] - lea r3, [r0+r1*4] - mova m2, [r0] - pmaddwd m2, [pw_m32101234] - HADDD m2, m1 - movd m0, [r0-4] - psrld m0, 14 - psubw m2, m0 ; H - movd m0, [r3+r1*4-4] - movd m1, [r0+12] - paddw m0, m1 - psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7]) - movzx r4d, word [r3+r1*1-2] ; src[4*stride-1] - movzx r5d, word [r0+r2*1-2] ; src[2*stride-1] - sub r4d, r5d - movzx r6d, word [r3+r1*2-2] ; src[5*stride-1] - movzx r5d, word [r0+r1*2-2] ; src[1*stride-1] - sub r6d, r5d - lea r4d, [r4+r6*2] - movzx r5d, word [r3+r2*1-2] ; src[6*stride-1] - movzx r6d, word [r0+r1*1-2] ; src[0*stride-1] - sub r5d, r6d - lea r5d, [r5*3] - add r4d, r5d - movzx r6d, word [r3+r1*4-2] ; src[7*stride-1] - movzx r5d, word [r0+r1*0-2] ; src[ -stride-1] - sub r6d, r5d - lea r4d, [r4+r6*4] - movd m3, r4d ; V - punpckldq m2, m3 - pmaddwd m2, [pd_17] - paddd m2, [pd_16] - psrad m2, 5 ; b, c - - mova m3, [pw_pixel_max] - pxor m1, m1 - SPLATW m0, m0, 1 - SPLATW m4, m2, 2 - SPLATW m2, m2, 0 - pmullw m2, [pw_m32101234] ; b - pmullw m5, m4, [pw_m3] ; c - paddw m5, [pw_16] - mov r2d, 8 - add r0, r1 -.loop: - paddsw m6, m2, m5 - paddsw m6, m0 - psraw m6, 5 - CLIPW m6, m1, m3 - mova [r0], m6 - paddw m5, m4 - add r0, r1 - dec r2d - jg .loop - REP_RET - - -;----------------------------------------------------------------------------- -; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride) -;----------------------------------------------------------------------------- -%macro PRED8x8L_128_DC 0 -cglobal pred8x8l_128_dc_10, 4, 4 - mova m0, [pw_512] ; (1<<(BIT_DEPTH-1)) - lea r1, [r3*3] - lea r2, [r0+r3*4] - MOV8 r0+r3*0, m0, m0 - MOV8 r0+r3*1, m0, m0 - MOV8 r0+r3*2, m0, m0 - MOV8 r0+r1*1, m0, m0 - MOV8 r2+r3*0, m0, m0 - MOV8 r2+r3*1, m0, m0 - MOV8 r2+r3*2, m0, m0 - MOV8 r2+r1*1, m0, m0 - RET -%endmacro - -INIT_MMX mmxext -PRED8x8L_128_DC -INIT_XMM sse2 -PRED8x8L_128_DC - -;----------------------------------------------------------------------------- -; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride) -;----------------------------------------------------------------------------- -%macro PRED8x8L_TOP_DC 0 -cglobal pred8x8l_top_dc_10, 4, 4, 6 - sub r0, r3 - mova m0, [r0] - shr r1d, 14 - shr r2d, 13 - neg r1 - pslldq m1, m0, 2 - psrldq m2, m0, 2 - pinsrw m1, [r0+r1], 0 - pinsrw m2, [r0+r2+14], 7 - lea r1, [r3*3] - lea r2, [r0+r3*4] - PRED4x4_LOWPASS m0, m2, m1, m0 - HADDW m0, m1 - paddw m0, [pw_4] - psrlw m0, 3 - SPLATW m0, m0, 0 - mova [r0+r3*1], m0 - mova [r0+r3*2], m0 - mova [r0+r1*1], m0 - mova [r0+r3*4], m0 - mova [r2+r3*1], m0 - mova [r2+r3*2], m0 - mova [r2+r1*1], m0 - mova [r2+r3*4], m0 - RET -%endmacro - -INIT_XMM sse2 -PRED8x8L_TOP_DC -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -PRED8x8L_TOP_DC -%endif - -;----------------------------------------------------------------------------- -;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride) -;----------------------------------------------------------------------------- -;TODO: see if scalar is faster -%macro PRED8x8L_DC 0 -cglobal pred8x8l_dc_10, 4, 6, 6 - sub r0, r3 - lea r4, [r0+r3*4] - lea r5, [r3*3] - mova m0, [r0+r3*2-16] - punpckhwd m0, 
[r0+r3*1-16] - mova m1, [r4+r3*0-16] - punpckhwd m1, [r0+r5*1-16] - punpckhdq m1, m0 - mova m2, [r4+r3*2-16] - punpckhwd m2, [r4+r3*1-16] - mova m3, [r4+r3*4-16] - punpckhwd m3, [r4+r5*1-16] - punpckhdq m3, m2 - punpckhqdq m3, m1 - mova m0, [r0] - shr r1d, 14 - shr r2d, 13 - neg r1 - pslldq m1, m0, 2 - psrldq m2, m0, 2 - pinsrw m1, [r0+r1], 0 - pinsrw m2, [r0+r2+14], 7 - not r1 - and r1, r3 - pslldq m4, m3, 2 - psrldq m5, m3, 2 - pshuflw m4, m4, 11100101b - pinsrw m5, [r0+r1-2], 7 - PRED4x4_LOWPASS m3, m4, m5, m3 - PRED4x4_LOWPASS m0, m2, m1, m0 - paddw m0, m3 - HADDW m0, m1 - paddw m0, [pw_8] - psrlw m0, 4 - SPLATW m0, m0 - mova [r0+r3*1], m0 - mova [r0+r3*2], m0 - mova [r0+r5*1], m0 - mova [r0+r3*4], m0 - mova [r4+r3*1], m0 - mova [r4+r3*2], m0 - mova [r4+r5*1], m0 - mova [r4+r3*4], m0 - RET -%endmacro - -INIT_XMM sse2 -PRED8x8L_DC -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -PRED8x8L_DC -%endif - -;----------------------------------------------------------------------------- -; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride) -;----------------------------------------------------------------------------- -%macro PRED8x8L_VERTICAL 0 -cglobal pred8x8l_vertical_10, 4, 4, 6 - sub r0, r3 - mova m0, [r0] - shr r1d, 14 - shr r2d, 13 - neg r1 - pslldq m1, m0, 2 - psrldq m2, m0, 2 - pinsrw m1, [r0+r1], 0 - pinsrw m2, [r0+r2+14], 7 - lea r1, [r3*3] - lea r2, [r0+r3*4] - PRED4x4_LOWPASS m0, m2, m1, m0 - mova [r0+r3*1], m0 - mova [r0+r3*2], m0 - mova [r0+r1*1], m0 - mova [r0+r3*4], m0 - mova [r2+r3*1], m0 - mova [r2+r3*2], m0 - mova [r2+r1*1], m0 - mova [r2+r3*4], m0 - RET -%endmacro - -INIT_XMM sse2 -PRED8x8L_VERTICAL -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -PRED8x8L_VERTICAL -%endif - -;----------------------------------------------------------------------------- -; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride) -;----------------------------------------------------------------------------- -%macro PRED8x8L_HORIZONTAL 0 -cglobal pred8x8l_horizontal_10, 4, 4, 5 - mova m0, [r0-16] - shr r1d, 14 - dec r1 - and r1, r3 - sub r1, r3 - punpckhwd m0, [r0+r1-16] - mova m1, [r0+r3*2-16] - punpckhwd m1, [r0+r3*1-16] - lea r2, [r0+r3*4] - lea r1, [r3*3] - punpckhdq m1, m0 - mova m2, [r2+r3*0-16] - punpckhwd m2, [r0+r1-16] - mova m3, [r2+r3*2-16] - punpckhwd m3, [r2+r3*1-16] - punpckhdq m3, m2 - punpckhqdq m3, m1 - PALIGNR m4, m3, [r2+r1-16], 14, m0 - pslldq m0, m4, 2 - pshuflw m0, m0, 11100101b - PRED4x4_LOWPASS m4, m3, m0, m4 - punpckhwd m3, m4, m4 - punpcklwd m4, m4 - pshufd m0, m3, 0xff - pshufd m1, m3, 0xaa - pshufd m2, m3, 0x55 - pshufd m3, m3, 0x00 - mova [r0+r3*0], m0 - mova [r0+r3*1], m1 - mova [r0+r3*2], m2 - mova [r0+r1*1], m3 - pshufd m0, m4, 0xff - pshufd m1, m4, 0xaa - pshufd m2, m4, 0x55 - pshufd m3, m4, 0x00 - mova [r2+r3*0], m0 - mova [r2+r3*1], m1 - mova [r2+r3*2], m2 - mova [r2+r1*1], m3 - RET -%endmacro - -INIT_XMM sse2 -PRED8x8L_HORIZONTAL -INIT_XMM ssse3 -PRED8x8L_HORIZONTAL -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -PRED8x8L_HORIZONTAL -%endif - -;----------------------------------------------------------------------------- -;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride) -;----------------------------------------------------------------------------- -%macro PRED8x8L_DOWN_LEFT 0 -cglobal pred8x8l_down_left_10, 4, 4, 7 - sub r0, r3 - mova m3, [r0] - shr r1d, 14 - neg r1 - shr r2d, 13 - pslldq m1, m3, 2 - psrldq m2, m3, 2 - pinsrw m1, [r0+r1], 0 - pinsrw m2, [r0+r2+14], 7 - PRED4x4_LOWPASS m6, m2, 
m1, m3 - jz .fix_tr ; flags from shr r2d - mova m1, [r0+16] - psrldq m5, m1, 2 - PALIGNR m2, m1, m3, 14, m3 - pshufhw m5, m5, 10100100b - PRED4x4_LOWPASS m1, m2, m5, m1 -.do_topright: - lea r1, [r3*3] - psrldq m5, m1, 14 - lea r2, [r0+r3*4] - PALIGNR m2, m1, m6, 2, m0 - PALIGNR m3, m1, m6, 14, m0 - PALIGNR m5, m1, 2, m0 - pslldq m4, m6, 2 - PRED4x4_LOWPASS m6, m4, m2, m6 - PRED4x4_LOWPASS m1, m3, m5, m1 - mova [r2+r3*4], m1 - PALIGNR m1, m6, 14, m2 - pslldq m6, 2 - mova [r2+r1*1], m1 - PALIGNR m1, m6, 14, m2 - pslldq m6, 2 - mova [r2+r3*2], m1 - PALIGNR m1, m6, 14, m2 - pslldq m6, 2 - mova [r2+r3*1], m1 - PALIGNR m1, m6, 14, m2 - pslldq m6, 2 - mova [r0+r3*4], m1 - PALIGNR m1, m6, 14, m2 - pslldq m6, 2 - mova [r0+r1*1], m1 - PALIGNR m1, m6, 14, m2 - pslldq m6, 2 - mova [r0+r3*2], m1 - PALIGNR m1, m6, 14, m6 - mova [r0+r3*1], m1 - RET -.fix_tr: - punpckhwd m3, m3 - pshufd m1, m3, 0xFF - jmp .do_topright -%endmacro - -INIT_XMM sse2 -PRED8x8L_DOWN_LEFT -INIT_XMM ssse3 -PRED8x8L_DOWN_LEFT -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -PRED8x8L_DOWN_LEFT -%endif - -;----------------------------------------------------------------------------- -;void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride) -;----------------------------------------------------------------------------- -%macro PRED8x8L_DOWN_RIGHT 0 -; standard forbids this when has_topleft is false -; no need to check -cglobal pred8x8l_down_right_10, 4, 5, 8 - sub r0, r3 - lea r4, [r0+r3*4] - lea r1, [r3*3] - mova m0, [r0+r3*1-16] - punpckhwd m0, [r0+r3*0-16] - mova m1, [r0+r1*1-16] - punpckhwd m1, [r0+r3*2-16] - punpckhdq m1, m0 - mova m2, [r4+r3*1-16] - punpckhwd m2, [r4+r3*0-16] - mova m3, [r4+r1*1-16] - punpckhwd m3, [r4+r3*2-16] - punpckhdq m3, m2 - punpckhqdq m3, m1 - mova m0, [r4+r3*4-16] - mova m1, [r0] - PALIGNR m4, m3, m0, 14, m0 - PALIGNR m1, m3, 2, m2 - pslldq m0, m4, 2 - pshuflw m0, m0, 11100101b - PRED4x4_LOWPASS m6, m1, m4, m3 - PRED4x4_LOWPASS m4, m3, m0, m4 - mova m3, [r0] - shr r2d, 13 - pslldq m1, m3, 2 - psrldq m2, m3, 2 - pinsrw m1, [r0-2], 0 - pinsrw m2, [r0+r2+14], 7 - PRED4x4_LOWPASS m3, m2, m1, m3 - PALIGNR m2, m3, m6, 2, m0 - PALIGNR m5, m3, m6, 14, m0 - psrldq m7, m3, 2 - PRED4x4_LOWPASS m6, m4, m2, m6 - PRED4x4_LOWPASS m3, m5, m7, m3 - mova [r4+r3*4], m6 - PALIGNR m3, m6, 14, m2 - pslldq m6, 2 - mova [r0+r3*1], m3 - PALIGNR m3, m6, 14, m2 - pslldq m6, 2 - mova [r0+r3*2], m3 - PALIGNR m3, m6, 14, m2 - pslldq m6, 2 - mova [r0+r1*1], m3 - PALIGNR m3, m6, 14, m2 - pslldq m6, 2 - mova [r0+r3*4], m3 - PALIGNR m3, m6, 14, m2 - pslldq m6, 2 - mova [r4+r3*1], m3 - PALIGNR m3, m6, 14, m2 - pslldq m6, 2 - mova [r4+r3*2], m3 - PALIGNR m3, m6, 14, m6 - mova [r4+r1*1], m3 - RET -%endmacro - -INIT_XMM sse2 -PRED8x8L_DOWN_RIGHT -INIT_XMM ssse3 -PRED8x8L_DOWN_RIGHT -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -PRED8x8L_DOWN_RIGHT -%endif - -;----------------------------------------------------------------------------- -; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride) -;----------------------------------------------------------------------------- -%macro PRED8x8L_VERTICAL_RIGHT 0 -; likewise with 8x8l_down_right -cglobal pred8x8l_vertical_right_10, 4, 5, 7 - sub r0, r3 - lea r4, [r0+r3*4] - lea r1, [r3*3] - mova m0, [r0+r3*1-16] - punpckhwd m0, [r0+r3*0-16] - mova m1, [r0+r1*1-16] - punpckhwd m1, [r0+r3*2-16] - punpckhdq m1, m0 - mova m2, [r4+r3*1-16] - punpckhwd m2, [r4+r3*0-16] - mova m3, [r4+r1*1-16] - punpckhwd m3, [r4+r3*2-16] - punpckhdq m3, m2 - punpckhqdq m3, m1 - 
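
pred8x8_plane_10, a little earlier in this file, computes the weighted H/V edge gradients with pmaddwd against pw_m32101234, derives b = (17*H + 16) >> 5 and c = (17*V + 16) >> 5, and clips each output row against pw_pixel_max. The same computation in scalar form (function name illustrative; the >> on signed values mirrors the asm's arithmetic psrad/psraw shifts):

#include <stdint.h>
#include <stddef.h>

/* Scalar model of the 10-bit 8x8 plane prediction: fit a plane through
 * the top and left edge samples and clip the result to the 10-bit range. */
static void pred8x8_plane_10_ref(uint16_t *src, ptrdiff_t stride)
{
    const uint16_t *top = src - stride;
    int H = 0, V = 0;
    for (int i = 1; i <= 4; i++) {
        H += i * (top[3 + i] - top[3 - i]);                       /* top[-1] is the corner */
        V += i * (src[(3 + i) * stride - 1] - src[(3 - i) * stride - 1]);
    }
    int a = 16 * (src[7 * stride - 1] + top[7]);
    int b = (17 * H + 16) >> 5;
    int c = (17 * V + 16) >> 5;

    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++) {
            int v = (a + b * (x - 3) + c * (y - 3) + 16) >> 5;
            src[y * stride + x] = v < 0 ? 0 : v > 1023 ? 1023 : (uint16_t)v;
        }
}
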
mova m0, [r4+r3*4-16] - mova m1, [r0] - PALIGNR m4, m3, m0, 14, m0 - PALIGNR m1, m3, 2, m2 - PRED4x4_LOWPASS m3, m1, m4, m3 - mova m2, [r0] - shr r2d, 13 - pslldq m1, m2, 2 - psrldq m5, m2, 2 - pinsrw m1, [r0-2], 0 - pinsrw m5, [r0+r2+14], 7 - PRED4x4_LOWPASS m2, m5, m1, m2 - PALIGNR m6, m2, m3, 12, m1 - PALIGNR m5, m2, m3, 14, m0 - PRED4x4_LOWPASS m0, m6, m2, m5 - pavgw m2, m5 - mova [r0+r3*2], m0 - mova [r0+r3*1], m2 - pslldq m6, m3, 4 - pslldq m1, m3, 2 - PRED4x4_LOWPASS m1, m3, m6, m1 - PALIGNR m2, m1, 14, m4 - mova [r0+r1*1], m2 - pslldq m1, 2 - PALIGNR m0, m1, 14, m3 - mova [r0+r3*4], m0 - pslldq m1, 2 - PALIGNR m2, m1, 14, m4 - mova [r4+r3*1], m2 - pslldq m1, 2 - PALIGNR m0, m1, 14, m3 - mova [r4+r3*2], m0 - pslldq m1, 2 - PALIGNR m2, m1, 14, m4 - mova [r4+r1*1], m2 - pslldq m1, 2 - PALIGNR m0, m1, 14, m1 - mova [r4+r3*4], m0 - RET -%endmacro - -INIT_XMM sse2 -PRED8x8L_VERTICAL_RIGHT -INIT_XMM ssse3 -PRED8x8L_VERTICAL_RIGHT -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -PRED8x8L_VERTICAL_RIGHT -%endif - -;----------------------------------------------------------------------------- -; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride) -;----------------------------------------------------------------------------- -%macro PRED8x8L_HORIZONTAL_UP 0 -cglobal pred8x8l_horizontal_up_10, 4, 4, 6 - mova m0, [r0+r3*0-16] - punpckhwd m0, [r0+r3*1-16] - shr r1d, 14 - dec r1 - and r1, r3 - sub r1, r3 - mova m4, [r0+r1*1-16] - lea r1, [r3*3] - lea r2, [r0+r3*4] - mova m1, [r0+r3*2-16] - punpckhwd m1, [r0+r1*1-16] - punpckhdq m0, m1 - mova m2, [r2+r3*0-16] - punpckhwd m2, [r2+r3*1-16] - mova m3, [r2+r3*2-16] - punpckhwd m3, [r2+r1*1-16] - punpckhdq m2, m3 - punpckhqdq m0, m2 - PALIGNR m1, m0, m4, 14, m4 - psrldq m2, m0, 2 - pshufhw m2, m2, 10100100b - PRED4x4_LOWPASS m0, m1, m2, m0 - psrldq m1, m0, 2 - psrldq m2, m0, 4 - pshufhw m1, m1, 10100100b - pshufhw m2, m2, 01010100b - pavgw m4, m0, m1 - PRED4x4_LOWPASS m1, m2, m0, m1 - punpckhwd m5, m4, m1 - punpcklwd m4, m1 - mova [r2+r3*0], m5 - mova [r0+r3*0], m4 - pshufd m0, m5, 11111001b - pshufd m1, m5, 11111110b - pshufd m2, m5, 11111111b - mova [r2+r3*1], m0 - mova [r2+r3*2], m1 - mova [r2+r1*1], m2 - PALIGNR m2, m5, m4, 4, m0 - PALIGNR m3, m5, m4, 8, m1 - PALIGNR m5, m5, m4, 12, m4 - mova [r0+r3*1], m2 - mova [r0+r3*2], m3 - mova [r0+r1*1], m5 - RET -%endmacro - -INIT_XMM sse2 -PRED8x8L_HORIZONTAL_UP -INIT_XMM ssse3 -PRED8x8L_HORIZONTAL_UP -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -PRED8x8L_HORIZONTAL_UP -%endif - - -;----------------------------------------------------------------------------- -; void pred16x16_vertical(pixel *src, int stride) -;----------------------------------------------------------------------------- -%macro MOV16 3-5 - mova [%1+ 0], %2 - mova [%1+mmsize], %3 -%if mmsize==8 - mova [%1+ 16], %4 - mova [%1+ 24], %5 -%endif -%endmacro - -%macro PRED16x16_VERTICAL 0 -cglobal pred16x16_vertical_10, 2, 3 - sub r0, r1 - mov r2d, 8 - mova m0, [r0+ 0] - mova m1, [r0+mmsize] -%if mmsize==8 - mova m2, [r0+16] - mova m3, [r0+24] -%endif -.loop: - MOV16 r0+r1*1, m0, m1, m2, m3 - MOV16 r0+r1*2, m0, m1, m2, m3 - lea r0, [r0+r1*2] - dec r2d - jg .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PRED16x16_VERTICAL -INIT_XMM sse2 -PRED16x16_VERTICAL - -;----------------------------------------------------------------------------- -; void pred16x16_horizontal(pixel *src, int stride) -;----------------------------------------------------------------------------- -%macro PRED16x16_HORIZONTAL 0 -cglobal 
pred16x16_horizontal_10, 2, 3 - mov r2d, 8 -.vloop: - movd m0, [r0+r1*0-4] - movd m1, [r0+r1*1-4] - SPLATW m0, m0, 1 - SPLATW m1, m1, 1 - MOV16 r0+r1*0, m0, m0, m0, m0 - MOV16 r0+r1*1, m1, m1, m1, m1 - lea r0, [r0+r1*2] - dec r2d - jg .vloop - REP_RET -%endmacro - -INIT_MMX mmxext -PRED16x16_HORIZONTAL -INIT_XMM sse2 -PRED16x16_HORIZONTAL - -;----------------------------------------------------------------------------- -; void pred16x16_dc(pixel *src, int stride) -;----------------------------------------------------------------------------- -%macro PRED16x16_DC 0 -cglobal pred16x16_dc_10, 2, 6 - mov r5, r0 - sub r0, r1 - mova m0, [r0+0] - paddw m0, [r0+mmsize] -%if mmsize==8 - paddw m0, [r0+16] - paddw m0, [r0+24] -%endif - HADDW m0, m2 - - lea r0, [r0+r1-2] - movzx r3d, word [r0] - movzx r4d, word [r0+r1] -%rep 7 - lea r0, [r0+r1*2] - movzx r2d, word [r0] - add r3d, r2d - movzx r2d, word [r0+r1] - add r4d, r2d -%endrep - lea r3d, [r3+r4+16] - - movd m1, r3d - paddw m0, m1 - psrlw m0, 5 - SPLATW m0, m0 - mov r3d, 8 -.loop: - MOV16 r5+r1*0, m0, m0, m0, m0 - MOV16 r5+r1*1, m0, m0, m0, m0 - lea r5, [r5+r1*2] - dec r3d - jg .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PRED16x16_DC -INIT_XMM sse2 -PRED16x16_DC - -;----------------------------------------------------------------------------- -; void pred16x16_top_dc(pixel *src, int stride) -;----------------------------------------------------------------------------- -%macro PRED16x16_TOP_DC 0 -cglobal pred16x16_top_dc_10, 2, 3 - sub r0, r1 - mova m0, [r0+0] - paddw m0, [r0+mmsize] -%if mmsize==8 - paddw m0, [r0+16] - paddw m0, [r0+24] -%endif - HADDW m0, m2 - - SPLATW m0, m0 - paddw m0, [pw_8] - psrlw m0, 4 - mov r2d, 8 -.loop: - MOV16 r0+r1*1, m0, m0, m0, m0 - MOV16 r0+r1*2, m0, m0, m0, m0 - lea r0, [r0+r1*2] - dec r2d - jg .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PRED16x16_TOP_DC -INIT_XMM sse2 -PRED16x16_TOP_DC - -;----------------------------------------------------------------------------- -; void pred16x16_left_dc(pixel *src, int stride) -;----------------------------------------------------------------------------- -%macro PRED16x16_LEFT_DC 0 -cglobal pred16x16_left_dc_10, 2, 6 - mov r5, r0 - - sub r0, 2 - movzx r3d, word [r0] - movzx r4d, word [r0+r1] -%rep 7 - lea r0, [r0+r1*2] - movzx r2d, word [r0] - add r3d, r2d - movzx r2d, word [r0+r1] - add r4d, r2d -%endrep - lea r3d, [r3+r4+8] - shr r3d, 4 - - movd m0, r3d - SPLATW m0, m0 - mov r3d, 8 -.loop: - MOV16 r5+r1*0, m0, m0, m0, m0 - MOV16 r5+r1*1, m0, m0, m0, m0 - lea r5, [r5+r1*2] - dec r3d - jg .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PRED16x16_LEFT_DC -INIT_XMM sse2 -PRED16x16_LEFT_DC - -;----------------------------------------------------------------------------- -; void pred16x16_128_dc(pixel *src, int stride) -;----------------------------------------------------------------------------- -%macro PRED16x16_128_DC 0 -cglobal pred16x16_128_dc_10, 2,3 - mova m0, [pw_512] - mov r2d, 8 -.loop: - MOV16 r0+r1*0, m0, m0, m0, m0 - MOV16 r0+r1*1, m0, m0, m0, m0 - lea r0, [r0+r1*2] - dec r2d - jg .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PRED16x16_128_DC -INIT_XMM sse2 -PRED16x16_128_DC diff --git a/ffmpeg1/libavcodec/x86/h264_intrapred_init.c b/ffmpeg1/libavcodec/x86/h264_intrapred_init.c deleted file mode 100644 index f5b5e3e..0000000 --- a/ffmpeg1/libavcodec/x86/h264_intrapred_init.c +++ /dev/null @@ -1,402 +0,0 @@ -/* - * Copyright (c) 2010 Jason Garrett-Glaser - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/avcodec.h" -#include "libavcodec/h264pred.h" - -#define PRED4x4(TYPE, DEPTH, OPT) \ -void ff_pred4x4_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \ - const uint8_t *topright, \ - ptrdiff_t stride); - -PRED4x4(dc, 10, mmxext) -PRED4x4(down_left, 10, sse2) -PRED4x4(down_left, 10, avx) -PRED4x4(down_right, 10, sse2) -PRED4x4(down_right, 10, ssse3) -PRED4x4(down_right, 10, avx) -PRED4x4(vertical_left, 10, sse2) -PRED4x4(vertical_left, 10, avx) -PRED4x4(vertical_right, 10, sse2) -PRED4x4(vertical_right, 10, ssse3) -PRED4x4(vertical_right, 10, avx) -PRED4x4(horizontal_up, 10, mmxext) -PRED4x4(horizontal_down, 10, sse2) -PRED4x4(horizontal_down, 10, ssse3) -PRED4x4(horizontal_down, 10, avx) - -#define PRED8x8(TYPE, DEPTH, OPT) \ -void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \ - ptrdiff_t stride); - -PRED8x8(dc, 10, mmxext) -PRED8x8(dc, 10, sse2) -PRED8x8(top_dc, 10, sse2) -PRED8x8(plane, 10, sse2) -PRED8x8(vertical, 10, sse2) -PRED8x8(horizontal, 10, sse2) - -#define PRED8x8L(TYPE, DEPTH, OPT)\ -void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \ - int has_topleft, \ - int has_topright, \ - ptrdiff_t stride); - -PRED8x8L(dc, 10, sse2) -PRED8x8L(dc, 10, avx) -PRED8x8L(128_dc, 10, mmxext) -PRED8x8L(128_dc, 10, sse2) -PRED8x8L(top_dc, 10, sse2) -PRED8x8L(top_dc, 10, avx) -PRED8x8L(vertical, 10, sse2) -PRED8x8L(vertical, 10, avx) -PRED8x8L(horizontal, 10, sse2) -PRED8x8L(horizontal, 10, ssse3) -PRED8x8L(horizontal, 10, avx) -PRED8x8L(down_left, 10, sse2) -PRED8x8L(down_left, 10, ssse3) -PRED8x8L(down_left, 10, avx) -PRED8x8L(down_right, 10, sse2) -PRED8x8L(down_right, 10, ssse3) -PRED8x8L(down_right, 10, avx) -PRED8x8L(vertical_right, 10, sse2) -PRED8x8L(vertical_right, 10, ssse3) -PRED8x8L(vertical_right, 10, avx) -PRED8x8L(horizontal_up, 10, sse2) -PRED8x8L(horizontal_up, 10, ssse3) -PRED8x8L(horizontal_up, 10, avx) - -#define PRED16x16(TYPE, DEPTH, OPT)\ -void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \ - ptrdiff_t stride); - -PRED16x16(dc, 10, mmxext) -PRED16x16(dc, 10, sse2) -PRED16x16(top_dc, 10, mmxext) -PRED16x16(top_dc, 10, sse2) -PRED16x16(128_dc, 10, mmxext) -PRED16x16(128_dc, 10, sse2) -PRED16x16(left_dc, 10, mmxext) -PRED16x16(left_dc, 10, sse2) -PRED16x16(vertical, 10, mmxext) -PRED16x16(vertical, 10, sse2) -PRED16x16(horizontal, 10, mmxext) -PRED16x16(horizontal, 10, sse2) - -/* 8-bit versions */ -PRED16x16(vertical, 8, mmx) -PRED16x16(vertical, 8, sse) -PRED16x16(horizontal, 8, mmx) -PRED16x16(horizontal, 8, mmxext) -PRED16x16(horizontal, 8, ssse3) -PRED16x16(dc, 8, mmxext) -PRED16x16(dc, 8, sse2) -PRED16x16(dc, 8, ssse3) -PRED16x16(plane_h264, 8, mmx) -PRED16x16(plane_h264, 8, 
mmxext) -PRED16x16(plane_h264, 8, sse2) -PRED16x16(plane_h264, 8, ssse3) -PRED16x16(plane_rv40, 8, mmx) -PRED16x16(plane_rv40, 8, mmxext) -PRED16x16(plane_rv40, 8, sse2) -PRED16x16(plane_rv40, 8, ssse3) -PRED16x16(plane_svq3, 8, mmx) -PRED16x16(plane_svq3, 8, mmxext) -PRED16x16(plane_svq3, 8, sse2) -PRED16x16(plane_svq3, 8, ssse3) -PRED16x16(tm_vp8, 8, mmx) -PRED16x16(tm_vp8, 8, mmxext) -PRED16x16(tm_vp8, 8, sse2) - -PRED8x8(top_dc, 8, mmxext) -PRED8x8(dc_rv40, 8, mmxext) -PRED8x8(dc, 8, mmxext) -PRED8x8(vertical, 8, mmx) -PRED8x8(horizontal, 8, mmx) -PRED8x8(horizontal, 8, mmxext) -PRED8x8(horizontal, 8, ssse3) -PRED8x8(plane, 8, mmx) -PRED8x8(plane, 8, mmxext) -PRED8x8(plane, 8, sse2) -PRED8x8(plane, 8, ssse3) -PRED8x8(tm_vp8, 8, mmx) -PRED8x8(tm_vp8, 8, mmxext) -PRED8x8(tm_vp8, 8, sse2) -PRED8x8(tm_vp8, 8, ssse3) - -PRED8x8L(top_dc, 8, mmxext) -PRED8x8L(top_dc, 8, ssse3) -PRED8x8L(dc, 8, mmxext) -PRED8x8L(dc, 8, ssse3) -PRED8x8L(horizontal, 8, mmxext) -PRED8x8L(horizontal, 8, ssse3) -PRED8x8L(vertical, 8, mmxext) -PRED8x8L(vertical, 8, ssse3) -PRED8x8L(down_left, 8, mmxext) -PRED8x8L(down_left, 8, sse2) -PRED8x8L(down_left, 8, ssse3) -PRED8x8L(down_right, 8, mmxext) -PRED8x8L(down_right, 8, sse2) -PRED8x8L(down_right, 8, ssse3) -PRED8x8L(vertical_right, 8, mmxext) -PRED8x8L(vertical_right, 8, sse2) -PRED8x8L(vertical_right, 8, ssse3) -PRED8x8L(vertical_left, 8, sse2) -PRED8x8L(vertical_left, 8, ssse3) -PRED8x8L(horizontal_up, 8, mmxext) -PRED8x8L(horizontal_up, 8, ssse3) -PRED8x8L(horizontal_down, 8, mmxext) -PRED8x8L(horizontal_down, 8, sse2) -PRED8x8L(horizontal_down, 8, ssse3) - -PRED4x4(dc, 8, mmxext) -PRED4x4(down_left, 8, mmxext) -PRED4x4(down_right, 8, mmxext) -PRED4x4(vertical_left, 8, mmxext) -PRED4x4(vertical_right, 8, mmxext) -PRED4x4(horizontal_up, 8, mmxext) -PRED4x4(horizontal_down, 8, mmxext) -PRED4x4(tm_vp8, 8, mmx) -PRED4x4(tm_vp8, 8, mmxext) -PRED4x4(tm_vp8, 8, ssse3) -PRED4x4(vertical_vp8, 8, mmxext) - -av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, - const int bit_depth, - const int chroma_format_idc) -{ - int mm_flags = av_get_cpu_flags(); - - if (bit_depth == 8) { - if (EXTERNAL_MMX(mm_flags)) { - h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_8_mmx; - h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmx; - if (chroma_format_idc == 1) { - h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_mmx; - h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmx; - } - if (codec_id == AV_CODEC_ID_VP8) { - h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmx; - h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmx; - h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmx; - } else { - if (chroma_format_idc == 1) - h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmx; - if (codec_id == AV_CODEC_ID_SVQ3) { - if (mm_flags & AV_CPU_FLAG_CMOV) - h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_mmx; - } else if (codec_id == AV_CODEC_ID_RV40) { - h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_mmx; - } else { - h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_mmx; - } - } - } - - if (EXTERNAL_MMXEXT(mm_flags)) { - h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmxext; - h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_mmxext; - if (chroma_format_idc == 1) - h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmxext; - h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_mmxext; - h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_mmxext; - h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_mmxext; - h->pred8x8l [VERT_PRED ] = 
ff_pred8x8l_vertical_8_mmxext; - h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_mmxext; - h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_mmxext; - h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_mmxext; - h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_mmxext; - h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_mmxext; - h->pred4x4 [DIAG_DOWN_RIGHT_PRED ] = ff_pred4x4_down_right_8_mmxext; - h->pred4x4 [VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_8_mmxext; - h->pred4x4 [HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_8_mmxext; - h->pred4x4 [DC_PRED ] = ff_pred4x4_dc_8_mmxext; - if (codec_id == AV_CODEC_ID_VP8 || codec_id == AV_CODEC_ID_H264) { - h->pred4x4 [DIAG_DOWN_LEFT_PRED] = ff_pred4x4_down_left_8_mmxext; - } - if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) { - h->pred4x4 [VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_8_mmxext; - } - if (codec_id != AV_CODEC_ID_RV40) { - h->pred4x4 [HOR_UP_PRED ] = ff_pred4x4_horizontal_up_8_mmxext; - } - if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) { - if (chroma_format_idc == 1) { - h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_8_mmxext; - h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_8_mmxext; - } - } - if (codec_id == AV_CODEC_ID_VP8) { - h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmxext; - h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_rv40_8_mmxext; - h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmxext; - h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmxext; - h->pred4x4 [VERT_PRED ] = ff_pred4x4_vertical_vp8_8_mmxext; - } else { - if (chroma_format_idc == 1) - h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmxext; - if (codec_id == AV_CODEC_ID_SVQ3) { - h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_svq3_8_mmxext; - } else if (codec_id == AV_CODEC_ID_RV40) { - h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_rv40_8_mmxext; - } else { - h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_h264_8_mmxext; - } - } - } - - if (EXTERNAL_SSE(mm_flags)) { - h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_sse; - } - - if (EXTERNAL_SSE2(mm_flags)) { - h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_sse2; - h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_sse2; - h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_sse2; - h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_sse2; - h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_sse2; - h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_sse2; - if (codec_id == AV_CODEC_ID_VP8) { - h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_sse2; - h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_sse2; - } else { - if (chroma_format_idc == 1) - h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_sse2; - if (codec_id == AV_CODEC_ID_SVQ3) { - h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_sse2; - } else if (codec_id == AV_CODEC_ID_RV40) { - h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_sse2; - } else { - h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_sse2; - } - } - } - - if (EXTERNAL_SSSE3(mm_flags)) { - h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_ssse3; - h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_ssse3; - if (chroma_format_idc == 1) - h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_ssse3; - h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_ssse3; - h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_ssse3; - h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_ssse3; - h->pred8x8l [VERT_PRED ] = 
ff_pred8x8l_vertical_8_ssse3; - h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_ssse3; - h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_ssse3; - h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_ssse3; - h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_ssse3; - h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_ssse3; - h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_ssse3; - if (codec_id == AV_CODEC_ID_VP8) { - h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_ssse3; - h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_ssse3; - } else { - if (chroma_format_idc == 1) - h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_ssse3; - if (codec_id == AV_CODEC_ID_SVQ3) { - h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_ssse3; - } else if (codec_id == AV_CODEC_ID_RV40) { - h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_ssse3; - } else { - h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_ssse3; - } - } - } - } else if (bit_depth == 10) { - if (EXTERNAL_MMXEXT(mm_flags)) { - h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext; - h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext; - - if (chroma_format_idc == 1) - h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext; - - h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext; - - h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_mmxext; - h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_mmxext; - h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_mmxext; - h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_mmxext; - h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_mmxext; - h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_mmxext; - } - if (EXTERNAL_SSE2(mm_flags)) { - h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2; - h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_sse2; - h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_sse2; - h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_sse2; - h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_sse2; - - if (chroma_format_idc == 1) { - h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_sse2; - h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2; - h->pred8x8[PLANE_PRED8x8 ] = ff_pred8x8_plane_10_sse2; - h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vertical_10_sse2; - h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_10_sse2; - } - - h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_sse2; - h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_sse2; - h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_sse2; - h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_sse2; - h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_sse2; - h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2; - h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2; - h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_sse2; - h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_sse2; - - h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_sse2; - h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_sse2; - h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_sse2; - h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_sse2; - h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_sse2; - h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_sse2; - } - if (EXTERNAL_SSSE3(mm_flags)) { - h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_ssse3; - h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3; - h->pred4x4[HOR_DOWN_PRED ] = 
ff_pred4x4_horizontal_down_10_ssse3; - - h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_ssse3; - h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3; - h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3; - h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_ssse3; - h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_ssse3; - } - if (EXTERNAL_AVX(mm_flags)) { - h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx; - h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx; - h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_avx; - h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_avx; - h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_avx; - - h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_avx; - h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_avx; - h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_avx; - h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_avx; - h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx; - h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx; - h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_avx; - h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_avx; - } - } -} diff --git a/ffmpeg1/libavcodec/x86/h264_qpel.c b/ffmpeg1/libavcodec/x86/h264_qpel.c deleted file mode 100644 index 96dec82..0000000 --- a/ffmpeg1/libavcodec/x86/h264_qpel.c +++ /dev/null @@ -1,644 +0,0 @@ -/* - * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt - * Copyright (c) 2011 Daniel Kang - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/h264qpel.h" -#include "libavcodec/mpegvideo.h" -#include "dsputil_mmx.h" - -#if HAVE_YASM -void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h); -void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h); -void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h); -static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels, - int line_size, int h) -{ - ff_put_pixels8_mmxext(block, pixels, line_size, h); - ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h); -} -static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels, - int line_size, int h) -{ - ff_avg_pixels8_mmxext(block, pixels, line_size, h); - ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h); -} -void ff_put_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, - int dstStride, int src1Stride, int h); -void ff_avg_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, - int dstStride, int src1Stride, int h); -void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, - int dstStride, int src1Stride, int h); -void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, - int dstStride, int src1Stride, int h); -void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, - int dstStride, int src1Stride, int h); -void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, - int dstStride, int src1Stride, int h); -void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, - int line_size, int h); -void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, - int line_size, int h); -#define ff_put_pixels8_l2_sse2 ff_put_pixels8_l2_mmxext -#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext -#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext -#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext - -#define DEF_QPEL(OPNAME)\ -void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\ -void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\ -void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\ -void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\ -void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\ -void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\ -void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\ -void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_op_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\ -void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\ -void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(uint8_t *src, int16_t *tmp, int srcStride);\ -void ff_ ## OPNAME ## 
_h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\ -void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_mmxext(uint8_t *src, int16_t *tmp, int srcStride, int size);\ -void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(uint8_t *src, int16_t *tmp, int srcStride, int size);\ -void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\ -void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\ -void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);\ -void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h); - -DEF_QPEL(avg) -DEF_QPEL(put) - -#define QPEL_H264(OPNAME, OP, MMX)\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - int w=3;\ - src -= 2*srcStride+2;\ - while(w--){\ - ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\ - tmp += 4;\ - src += 4;\ - }\ - tmp -= 3*4;\ - ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\ -}\ -\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - src -= 2*srcStride;\ - ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\ - src += 4;\ - dst += 4;\ - ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\ -}\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\ - int w = (size+8)>>2;\ - src -= 2*srcStride+2;\ - while(w--){\ - ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_op_mmxext(src, tmp, srcStride, size);\ - tmp += 4;\ - src += 4;\ - }\ -}\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ - int w = size>>4;\ - do{\ - ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\ - tmp += 8;\ - dst += 8;\ - }while(w--);\ -}\ -\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ - ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -}\ -\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ - src += 8*srcStride;\ - dst += 8*dstStride;\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ -}\ -\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t 
*src2, int dstStride, int src2Stride){\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ - src += 8*dstStride;\ - dst += 8*dstStride;\ - src2 += 8*src2Stride;\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ -}\ -\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ - ff_put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\ - ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ -}\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\ -}\ -\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\ -}\ -\ -static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ -{\ - ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\ - ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ -}\ - - -#if ARCH_X86_64 -#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ - -void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride); -void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride); - -#else // ARCH_X86_64 -#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ - src += 8*dstStride;\ - dst += 8*dstStride;\ - src2 += 8*src2Stride;\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ -} -#endif // ARCH_X86_64 - -#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ -QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ - src += 8*srcStride;\ - dst += 8*dstStride;\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ -}\ - -#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t 
*src, int dstStride, int srcStride){\ - ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ - ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -} - -static av_always_inline void ff_put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ - int w = (size+8)>>3; - src -= 2*srcStride+2; - while(w--){ - ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size); - tmp += 8; - src += 8; - } -} - -#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ - ff_put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ - ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ -}\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\ -}\ -static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\ -}\ - -#define ff_put_h264_qpel8_h_lowpass_l2_sse2 ff_put_h264_qpel8_h_lowpass_l2_mmxext -#define ff_avg_h264_qpel8_h_lowpass_l2_sse2 ff_avg_h264_qpel8_h_lowpass_l2_mmxext -#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext -#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext - -#define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2 -#define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2 -#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2 -#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2 - -#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext -#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext - -#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \ -H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\ -H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ -H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ -H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\ - -static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, - ptrdiff_t stride) -{ - ff_put_pixels16_sse2(dst, src, stride, 16); -} -static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, - ptrdiff_t stride) -{ - ff_avg_pixels16_sse2(dst, src, stride, 16); -} -#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext -#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext - -#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\ -}\ - -#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - ff_ 
## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\ -}\ - -#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\ -}\ - -#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ - ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ - ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\ - ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - ff_ ## OPNAME 
## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ -}\ - -#define H264_MC_4816(MMX)\ -H264_MC(put_, 4, MMX, 8)\ -H264_MC(put_, 8, MMX, 8)\ -H264_MC(put_, 16,MMX, 8)\ -H264_MC(avg_, 4, MMX, 8)\ -H264_MC(avg_, 8, MMX, 8)\ -H264_MC(avg_, 16,MMX, 8)\ - -#define H264_MC_816(QPEL, XMM)\ -QPEL(put_, 8, XMM, 16)\ -QPEL(put_, 16,XMM, 16)\ -QPEL(avg_, 8, XMM, 16)\ -QPEL(avg_, 16,XMM, 16)\ - -#undef PAVGB -#define PAVGB "pavgb" -QPEL_H264(put_, PUT_OP, mmxext) -QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext) -QPEL_H264_V_XMM(put_, PUT_OP, sse2) -QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2) -QPEL_H264_HV_XMM(put_, PUT_OP, sse2) -QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2) -QPEL_H264_H_XMM(put_, PUT_OP, ssse3) -QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3) -QPEL_H264_HV_XMM(put_, PUT_OP, ssse3) -QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3) -#undef PAVGB - -H264_MC_4816(mmxext) -H264_MC_816(H264_MC_V, sse2) -H264_MC_816(H264_MC_HV, sse2) -H264_MC_816(H264_MC_H, ssse3) -H264_MC_816(H264_MC_HV, ssse3) - - -//10bit -#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \ -void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \ - (uint8_t *dst, uint8_t *src, ptrdiff_t stride); - -#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \ - LUMA_MC_OP(put, 4, DEPTH, TYPE, OPT) \ - LUMA_MC_OP(avg, 4, DEPTH, TYPE, OPT) \ - LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \ - LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \ - LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \ - LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT) - -#define LUMA_MC_816(DEPTH, TYPE, OPT) \ - LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \ - LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \ - LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \ - LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT) - -LUMA_MC_ALL(10, mc00, mmxext) -LUMA_MC_ALL(10, mc10, mmxext) -LUMA_MC_ALL(10, mc20, mmxext) -LUMA_MC_ALL(10, mc30, mmxext) -LUMA_MC_ALL(10, mc01, mmxext) -LUMA_MC_ALL(10, mc11, mmxext) -LUMA_MC_ALL(10, mc21, mmxext) -LUMA_MC_ALL(10, mc31, mmxext) -LUMA_MC_ALL(10, mc02, mmxext) -LUMA_MC_ALL(10, mc12, mmxext) -LUMA_MC_ALL(10, 
mc22, mmxext) -LUMA_MC_ALL(10, mc32, mmxext) -LUMA_MC_ALL(10, mc03, mmxext) -LUMA_MC_ALL(10, mc13, mmxext) -LUMA_MC_ALL(10, mc23, mmxext) -LUMA_MC_ALL(10, mc33, mmxext) - -LUMA_MC_816(10, mc00, sse2) -LUMA_MC_816(10, mc10, sse2) -LUMA_MC_816(10, mc10, sse2_cache64) -LUMA_MC_816(10, mc10, ssse3_cache64) -LUMA_MC_816(10, mc20, sse2) -LUMA_MC_816(10, mc20, sse2_cache64) -LUMA_MC_816(10, mc20, ssse3_cache64) -LUMA_MC_816(10, mc30, sse2) -LUMA_MC_816(10, mc30, sse2_cache64) -LUMA_MC_816(10, mc30, ssse3_cache64) -LUMA_MC_816(10, mc01, sse2) -LUMA_MC_816(10, mc11, sse2) -LUMA_MC_816(10, mc21, sse2) -LUMA_MC_816(10, mc31, sse2) -LUMA_MC_816(10, mc02, sse2) -LUMA_MC_816(10, mc12, sse2) -LUMA_MC_816(10, mc22, sse2) -LUMA_MC_816(10, mc32, sse2) -LUMA_MC_816(10, mc03, sse2) -LUMA_MC_816(10, mc13, sse2) -LUMA_MC_816(10, mc23, sse2) -LUMA_MC_816(10, mc33, sse2) - -#define QPEL16_OPMC(OP, MC, MMX)\ -void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride){\ - ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\ - ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\ - src += 8*stride;\ - dst += 8*stride;\ - ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\ - ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\ -} - -#define QPEL16_OP(MC, MMX)\ -QPEL16_OPMC(put, MC, MMX)\ -QPEL16_OPMC(avg, MC, MMX) - -#define QPEL16(MMX)\ -QPEL16_OP(mc00, MMX)\ -QPEL16_OP(mc01, MMX)\ -QPEL16_OP(mc02, MMX)\ -QPEL16_OP(mc03, MMX)\ -QPEL16_OP(mc10, MMX)\ -QPEL16_OP(mc11, MMX)\ -QPEL16_OP(mc12, MMX)\ -QPEL16_OP(mc13, MMX)\ -QPEL16_OP(mc20, MMX)\ -QPEL16_OP(mc21, MMX)\ -QPEL16_OP(mc22, MMX)\ -QPEL16_OP(mc23, MMX)\ -QPEL16_OP(mc30, MMX)\ -QPEL16_OP(mc31, MMX)\ -QPEL16_OP(mc32, MMX)\ -QPEL16_OP(mc33, MMX) - -#if ARCH_X86_32 && HAVE_YASM && CONFIG_H264QPEL // ARCH_X86_64 implies SSE2+ -QPEL16(mmxext) -#endif - -#endif /* HAVE_YASM */ - -#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \ - do { \ - c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \ - } while (0) - -#define H264_QPEL_FUNCS(x, y, CPU) \ - do { \ - c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \ - c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \ - c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## 
y ## _ ## CPU; \ - c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \ - } while (0) - -#define H264_QPEL_FUNCS_10(x, y, CPU) \ - do { \ - c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \ - c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \ - c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \ - c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \ - } while (0) - -av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth) -{ -#if HAVE_YASM - int high_bit_depth = bit_depth > 8; - int mm_flags = av_get_cpu_flags(); - - if (EXTERNAL_MMXEXT(mm_flags)) { - if (!high_bit_depth) { - SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, ); - SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, ); - SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, ); - SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, ); - SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, ); - SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, ); - } else if (bit_depth == 10) { -#if ARCH_X86_32 - SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_); - SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_); - SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_); - SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_); -#endif - SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_); - SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_); - } - } - - if (EXTERNAL_SSE2(mm_flags)) { - if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW) && !high_bit_depth) { - // these functions are slower than mmx on AMD, but faster on Intel - H264_QPEL_FUNCS(0, 0, sse2); - } - - if (!high_bit_depth) { - H264_QPEL_FUNCS(0, 1, sse2); - H264_QPEL_FUNCS(0, 2, sse2); - H264_QPEL_FUNCS(0, 3, sse2); - H264_QPEL_FUNCS(1, 1, sse2); - H264_QPEL_FUNCS(1, 2, sse2); - H264_QPEL_FUNCS(1, 3, sse2); - H264_QPEL_FUNCS(2, 1, sse2); - H264_QPEL_FUNCS(2, 2, sse2); - H264_QPEL_FUNCS(2, 3, sse2); - H264_QPEL_FUNCS(3, 1, sse2); - H264_QPEL_FUNCS(3, 2, sse2); - H264_QPEL_FUNCS(3, 3, sse2); - } - - if (bit_depth == 10) { - SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_); - SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_); - SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_); - SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_); - H264_QPEL_FUNCS_10(1, 0, sse2_cache64); - H264_QPEL_FUNCS_10(2, 0, sse2_cache64); - H264_QPEL_FUNCS_10(3, 0, sse2_cache64); - } - } - - if (EXTERNAL_SSSE3(mm_flags)) { - if (!high_bit_depth) { - H264_QPEL_FUNCS(1, 0, ssse3); - H264_QPEL_FUNCS(1, 1, ssse3); - H264_QPEL_FUNCS(1, 2, ssse3); - H264_QPEL_FUNCS(1, 3, ssse3); - H264_QPEL_FUNCS(2, 0, ssse3); - H264_QPEL_FUNCS(2, 1, ssse3); - H264_QPEL_FUNCS(2, 2, ssse3); - H264_QPEL_FUNCS(2, 3, ssse3); - H264_QPEL_FUNCS(3, 0, ssse3); - H264_QPEL_FUNCS(3, 1, ssse3); - H264_QPEL_FUNCS(3, 2, ssse3); - H264_QPEL_FUNCS(3, 3, ssse3); - } - - if (bit_depth == 10) { - H264_QPEL_FUNCS_10(1, 0, ssse3_cache64); - H264_QPEL_FUNCS_10(2, 0, ssse3_cache64); - H264_QPEL_FUNCS_10(3, 0, ssse3_cache64); - } - } - - if (EXTERNAL_AVX(mm_flags)) { - /* AVX implies 64 byte cache lines without the need to avoid unaligned - * memory accesses that cross the boundary between two cache lines. - * TODO: Port X264_CPU_CACHELINE_32/64 detection from x264 to avoid - * having to treat SSE2 functions with such properties as AVX. 
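     * Editor's note (added for this listing, not in the original source):
     * this is also why, in the AVX branch just below, the 10-bit mc10/mc20/
     * mc30 table entries are pointed back at the plain sse2 versions,
     * replacing the *_cache64 variants installed in the SSSE3 branch above.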
*/ - if (bit_depth == 10) { - H264_QPEL_FUNCS_10(1, 0, sse2); - H264_QPEL_FUNCS_10(2, 0, sse2); - H264_QPEL_FUNCS_10(3, 0, sse2); - } - } -#endif -} diff --git a/ffmpeg1/libavcodec/x86/h264_qpel_10bit.asm b/ffmpeg1/libavcodec/x86/h264_qpel_10bit.asm deleted file mode 100644 index e14df84..0000000 --- a/ffmpeg1/libavcodec/x86/h264_qpel_10bit.asm +++ /dev/null @@ -1,884 +0,0 @@ -;***************************************************************************** -;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code -;***************************************************************************** -;* Copyright (C) 2011 x264 project -;* -;* Authors: Daniel Kang <daniel.d.kang@gmail.com> -;* -;* This file is part of Libav. -;* -;* Libav is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA 32 - -cextern pw_16 -cextern pw_1 -cextern pb_0 - -pw_pixel_max: times 8 dw ((1 << 10)-1) - -pad10: times 8 dw 10*1023 -pad20: times 8 dw 20*1023 -pad30: times 8 dw 30*1023 -depad: times 4 dd 32*20*1023 + 512 -depad2: times 8 dw 20*1023 + 16*1022 + 16 -unpad: times 8 dw 16*1022/32 ; needs to be mod 16 - -tap1: times 4 dw 1, -5 -tap2: times 4 dw 20, 20 -tap3: times 4 dw -5, 1 -pd_0f: times 4 dd 0xffff - -SECTION .text - - -%macro AVG_MOV 2 - pavgw %2, %1 - mova %1, %2 -%endmacro - -%macro ADDW 3 -%if mmsize == 8 - paddw %1, %2 -%else - movu %3, %2 - paddw %1, %3 -%endif -%endmacro - -%macro FILT_H 4 - paddw %1, %4 - psubw %1, %2 ; a-b - psraw %1, 2 ; (a-b)/4 - psubw %1, %2 ; (a-b)/4-b - paddw %1, %3 ; (a-b)/4-b+c - psraw %1, 2 ; ((a-b)/4-b+c)/4 - paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 -%endmacro - -%macro PRELOAD_V 0 - lea r3, [r2*3] - sub r1, r3 - movu m0, [r1+r2] - movu m1, [r1+r2*2] - add r1, r3 - movu m2, [r1] - movu m3, [r1+r2] - movu m4, [r1+r2*2] - add r1, r3 -%endmacro - -%macro FILT_V 8 - movu %6, [r1] - paddw %1, %6 - mova %7, %2 - paddw %7, %5 - mova %8, %3 - paddw %8, %4 - FILT_H %1, %7, %8, [pw_16] - psraw %1, 1 - CLIPW %1, [pb_0], [pw_pixel_max] -%endmacro - -%macro MC 1 -%define OP_MOV mova -INIT_MMX mmxext -%1 put, 4 -INIT_XMM sse2 -%1 put, 8 - -%define OP_MOV AVG_MOV -INIT_MMX mmxext -%1 avg, 4 -INIT_XMM sse2 -%1 avg, 8 -%endmacro - -%macro MCAxA_OP 7 -%if ARCH_X86_32 -cglobal %1_h264_qpel%4_%2_10, %5,%6,%7 - call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX - mov r0, r0m - mov r1, r1m - add r0, %3*2 - add r1, %3*2 - call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX - mov r0, r0m - mov r1, r1m - lea r0, [r0+r2*%3] - lea r1, [r1+r2*%3] - call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX - mov r0, r0m - mov r1, r1m - lea r0, [r0+r2*%3+%3*2] - lea r1, [r1+r2*%3+%3*2] - call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX - RET -%else ; ARCH_X86_64 -cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7 - mov r%6, r0 -%assign p1 %6+1 - mov r %+ p1, r1 - call 
stub_%1_h264_qpel%3_%2_10 %+ SUFFIX - lea r0, [r%6+%3*2] - lea r1, [r %+ p1+%3*2] - call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX - lea r0, [r%6+r2*%3] - lea r1, [r %+ p1+r2*%3] - call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX - lea r0, [r%6+r2*%3+%3*2] - lea r1, [r %+ p1+r2*%3+%3*2] -%if UNIX64 == 0 ; fall through to function - call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX - RET -%endif -%endif -%endmacro - -;cpu, put/avg, mc, 4/8, ... -%macro cglobal_mc 6 -%assign i %3*2 -%if ARCH_X86_32 || cpuflag(sse2) -MCAxA_OP %1, %2, %3, i, %4,%5,%6 -%endif - -cglobal %1_h264_qpel%3_%2_10, %4,%5,%6 -%if UNIX64 == 0 ; no prologue or epilogue for UNIX64 - call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX - RET -%endif - -stub_%1_h264_qpel%3_%2_10 %+ SUFFIX: -%endmacro - -;----------------------------------------------------------------------------- -; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride) -;----------------------------------------------------------------------------- -%macro COPY4 0 - movu m0, [r1 ] - OP_MOV [r0 ], m0 - movu m0, [r1+r2 ] - OP_MOV [r0+r2 ], m0 - movu m0, [r1+r2*2] - OP_MOV [r0+r2*2], m0 - movu m0, [r1+r3 ] - OP_MOV [r0+r3 ], m0 -%endmacro - -%macro MC00 1 -INIT_MMX mmxext -cglobal_mc %1, mc00, 4, 3,4,0 - lea r3, [r2*3] - COPY4 - ret - -INIT_XMM sse2 -cglobal %1_h264_qpel8_mc00_10, 3,4 - lea r3, [r2*3] - COPY4 - lea r0, [r0+r2*4] - lea r1, [r1+r2*4] - COPY4 - RET - -cglobal %1_h264_qpel16_mc00_10, 3,4 - mov r3d, 8 -.loop: - movu m0, [r1 ] - movu m1, [r1 +16] - OP_MOV [r0 ], m0 - OP_MOV [r0 +16], m1 - movu m0, [r1+r2 ] - movu m1, [r1+r2+16] - OP_MOV [r0+r2 ], m0 - OP_MOV [r0+r2+16], m1 - lea r0, [r0+r2*2] - lea r1, [r1+r2*2] - dec r3d - jg .loop - REP_RET -%endmacro - -%define OP_MOV mova -MC00 put - -%define OP_MOV AVG_MOV -MC00 avg - -;----------------------------------------------------------------------------- -; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride) -;----------------------------------------------------------------------------- -%macro MC_CACHE 1 -%define OP_MOV mova -INIT_MMX mmxext -%1 put, 4 -INIT_XMM sse2, cache64 -%1 put, 8 -INIT_XMM ssse3, cache64 -%1 put, 8 -INIT_XMM sse2 -%1 put, 8 - -%define OP_MOV AVG_MOV -INIT_MMX mmxext -%1 avg, 4 -INIT_XMM sse2, cache64 -%1 avg, 8 -INIT_XMM ssse3, cache64 -%1 avg, 8 -INIT_XMM sse2 -%1 avg, 8 -%endmacro - -%macro MC20 2 -cglobal_mc %1, mc20, %2, 3,4,9 - mov r3d, %2 - mova m1, [pw_pixel_max] -%if num_mmregs > 8 - mova m8, [pw_16] - %define p16 m8 -%else - %define p16 [pw_16] -%endif -.nextrow: -%if %0 == 4 - movu m2, [r1-4] - movu m3, [r1-2] - movu m4, [r1+0] - ADDW m2, [r1+6], m5 - ADDW m3, [r1+4], m5 - ADDW m4, [r1+2], m5 -%else ; movu is slow on these processors -%if mmsize==16 - movu m2, [r1-4] - movu m0, [r1+6] - mova m6, m0 - psrldq m0, 6 - - paddw m6, m2 - PALIGNR m3, m0, m2, 2, m5 - PALIGNR m7, m0, m2, 8, m5 - paddw m3, m7 - PALIGNR m4, m0, m2, 4, m5 - PALIGNR m7, m0, m2, 6, m5 - paddw m4, m7 - SWAP 2, 6 -%else - movu m2, [r1-4] - movu m6, [r1+4] - PALIGNR m3, m6, m2, 2, m5 - paddw m3, m6 - PALIGNR m4, m6, m2, 4, m5 - PALIGNR m7, m6, m2, 6, m5 - paddw m4, m7 - paddw m2, [r1+6] -%endif -%endif - - FILT_H m2, m3, m4, p16 - psraw m2, 1 - pxor m0, m0 - CLIPW m2, m0, m1 - OP_MOV [r0], m2 - add r0, r2 - add r1, r2 - dec r3d - jg .nextrow - rep ret -%endmacro - -MC_CACHE MC20 - -;----------------------------------------------------------------------------- -; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride) -;----------------------------------------------------------------------------- -%macro MC30 2 
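; Editor's note (commentary added for this listing, not present in the
; original file): mc30 reuses the mc10 path.  mc10 below computes the 6-tap
; horizontal half-pel value and averages it (pavgw) with the unfiltered
; sample at r4, which it sets to r1, i.e. the full-pel sample to the left;
; mc30 needs the average with the sample to the right instead, so it only
; sets r4 = r1 + 2 (one 16-bit pixel) and jumps into the mc10 stub at its
; .body label.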
-cglobal_mc %1, mc30, %2, 3,5,9 - lea r4, [r1+2] - jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body -%endmacro - -MC_CACHE MC30 - -;----------------------------------------------------------------------------- -; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride) -;----------------------------------------------------------------------------- -%macro MC10 2 -cglobal_mc %1, mc10, %2, 3,5,9 - mov r4, r1 -.body: - mov r3d, %2 - mova m1, [pw_pixel_max] -%if num_mmregs > 8 - mova m8, [pw_16] - %define p16 m8 -%else - %define p16 [pw_16] -%endif -.nextrow: -%if %0 == 4 - movu m2, [r1-4] - movu m3, [r1-2] - movu m4, [r1+0] - ADDW m2, [r1+6], m5 - ADDW m3, [r1+4], m5 - ADDW m4, [r1+2], m5 -%else ; movu is slow on these processors -%if mmsize==16 - movu m2, [r1-4] - movu m0, [r1+6] - mova m6, m0 - psrldq m0, 6 - - paddw m6, m2 - PALIGNR m3, m0, m2, 2, m5 - PALIGNR m7, m0, m2, 8, m5 - paddw m3, m7 - PALIGNR m4, m0, m2, 4, m5 - PALIGNR m7, m0, m2, 6, m5 - paddw m4, m7 - SWAP 2, 6 -%else - movu m2, [r1-4] - movu m6, [r1+4] - PALIGNR m3, m6, m2, 2, m5 - paddw m3, m6 - PALIGNR m4, m6, m2, 4, m5 - PALIGNR m7, m6, m2, 6, m5 - paddw m4, m7 - paddw m2, [r1+6] -%endif -%endif - - FILT_H m2, m3, m4, p16 - psraw m2, 1 - pxor m0, m0 - CLIPW m2, m0, m1 - movu m3, [r4] - pavgw m2, m3 - OP_MOV [r0], m2 - add r0, r2 - add r1, r2 - add r4, r2 - dec r3d - jg .nextrow - rep ret -%endmacro - -MC_CACHE MC10 - -;----------------------------------------------------------------------------- -; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride) -;----------------------------------------------------------------------------- -%macro V_FILT 10 -v_filt%9_%10_10 - add r4, r2 -.no_addr4: - FILT_V m0, m1, m2, m3, m4, m5, m6, m7 - add r1, r2 - add r0, r2 - ret -%endmacro - -INIT_MMX mmxext -RESET_MM_PERMUTATION -%assign i 0 -%rep 4 -V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i -SWAP 0,1,2,3,4,5 -%assign i i+1 -%endrep - -INIT_XMM sse2 -RESET_MM_PERMUTATION -%assign i 0 -%rep 6 -V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i -SWAP 0,1,2,3,4,5 -%assign i i+1 -%endrep - -%macro MC02 2 -cglobal_mc %1, mc02, %2, 3,4,8 - PRELOAD_V - - sub r0, r2 -%assign j 0 -%rep %2 - %assign i (j % 6) - call v_filt%2_ %+ i %+ _10.no_addr4 - OP_MOV [r0], m0 - SWAP 0,1,2,3,4,5 - %assign j j+1 -%endrep - ret -%endmacro - -MC MC02 - -;----------------------------------------------------------------------------- -; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride) -;----------------------------------------------------------------------------- -%macro MC01 2 -cglobal_mc %1, mc01, %2, 3,5,8 - mov r4, r1 -.body: - PRELOAD_V - - sub r4, r2 - sub r0, r2 -%assign j 0 -%rep %2 - %assign i (j % 6) - call v_filt%2_ %+ i %+ _10 - movu m7, [r4] - pavgw m0, m7 - OP_MOV [r0], m0 - SWAP 0,1,2,3,4,5 - %assign j j+1 -%endrep - ret -%endmacro - -MC MC01 - -;----------------------------------------------------------------------------- -; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride) -;----------------------------------------------------------------------------- -%macro MC03 2 -cglobal_mc %1, mc03, %2, 3,5,8 - lea r4, [r1+r2] - jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body -%endmacro - -MC MC03 - -;----------------------------------------------------------------------------- -; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride) -;----------------------------------------------------------------------------- -%macro H_FILT_AVG 2-3 -h_filt%1_%2_10: -;FILT_H with fewer registers and averaged with the FILT_V result -;m6,m7 are 
tmp registers, m0 is the FILT_V result, the rest are to be used next in the next iteration -;unfortunately I need three registers, so m5 will have to be re-read from memory - movu m5, [r4-4] - ADDW m5, [r4+6], m7 - movu m6, [r4-2] - ADDW m6, [r4+4], m7 - paddw m5, [pw_16] - psubw m5, m6 ; a-b - psraw m5, 2 ; (a-b)/4 - psubw m5, m6 ; (a-b)/4-b - movu m6, [r4+0] - ADDW m6, [r4+2], m7 - paddw m5, m6 ; (a-b)/4-b+c - psraw m5, 2 ; ((a-b)/4-b+c)/4 - paddw m5, m6 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 - psraw m5, 1 - CLIPW m5, [pb_0], [pw_pixel_max] -;avg FILT_V, FILT_H - pavgw m0, m5 -%if %0!=4 - movu m5, [r1+r5] -%endif - ret -%endmacro - -INIT_MMX mmxext -RESET_MM_PERMUTATION -%assign i 0 -%rep 3 -H_FILT_AVG 4, i -SWAP 0,1,2,3,4,5 -%assign i i+1 -%endrep -H_FILT_AVG 4, i, 0 - -INIT_XMM sse2 -RESET_MM_PERMUTATION -%assign i 0 -%rep 6 -%if i==1 -H_FILT_AVG 8, i, 0 -%else -H_FILT_AVG 8, i -%endif -SWAP 0,1,2,3,4,5 -%assign i i+1 -%endrep - -%macro MC11 2 -; this REALLY needs x86_64 -cglobal_mc %1, mc11, %2, 3,6,8 - mov r4, r1 -.body: - PRELOAD_V - - sub r0, r2 - sub r4, r2 - mov r5, r2 - neg r5 -%assign j 0 -%rep %2 - %assign i (j % 6) - call v_filt%2_ %+ i %+ _10 - call h_filt%2_ %+ i %+ _10 -%if %2==8 && i==1 - movu m5, [r1+r5] -%endif - OP_MOV [r0], m0 - SWAP 0,1,2,3,4,5 - %assign j j+1 -%endrep - ret -%endmacro - -MC MC11 - -;----------------------------------------------------------------------------- -; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride) -;----------------------------------------------------------------------------- -%macro MC31 2 -cglobal_mc %1, mc31, %2, 3,6,8 - mov r4, r1 - add r1, 2 - jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body -%endmacro - -MC MC31 - -;----------------------------------------------------------------------------- -; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride) -;----------------------------------------------------------------------------- -%macro MC13 2 -cglobal_mc %1, mc13, %2, 3,7,12 - lea r4, [r1+r2] - jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body -%endmacro - -MC MC13 - -;----------------------------------------------------------------------------- -; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride) -;----------------------------------------------------------------------------- -%macro MC33 2 -cglobal_mc %1, mc33, %2, 3,6,8 - lea r4, [r1+r2] - add r1, 2 - jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body -%endmacro - -MC MC33 - -;----------------------------------------------------------------------------- -; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride) -;----------------------------------------------------------------------------- -%macro FILT_H2 3 - psubw %1, %2 ; a-b - psubw %2, %3 ; b-c - psllw %2, 2 - psubw %1, %2 ; a-5*b+4*c - psllw %3, 4 - paddw %1, %3 ; a-5*b+20*c -%endmacro - -%macro FILT_VNRD 8 - movu %6, [r1] - paddw %1, %6 - mova %7, %2 - paddw %7, %5 - mova %8, %3 - paddw %8, %4 - FILT_H2 %1, %7, %8 -%endmacro - -%macro HV 1 -%if mmsize==16 -%define PAD 12 -%define COUNT 2 -%else -%define PAD 4 -%define COUNT 3 -%endif -put_hv%1_10: - neg r2 ; This actually saves instructions - lea r1, [r1+r2*2-mmsize+PAD] - lea r4, [rsp+PAD+gprsize] - mov r3d, COUNT -.v_loop: - movu m0, [r1] - sub r1, r2 - movu m1, [r1] - sub r1, r2 - movu m2, [r1] - sub r1, r2 - movu m3, [r1] - sub r1, r2 - movu m4, [r1] - sub r1, r2 -%assign i 0 -%rep %1-1 - FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7 - psubw m0, [pad20] - movu [r4+i*mmsize*3], m0 - sub r1, r2 - SWAP 0,1,2,3,4,5 -%assign i i+1 -%endrep - 
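The hv helpers here and the mc22/mc12/mc32/mc21/mc23 macros further down work in two passes: put_hv%1_10 runs the vertical six-tap filter into a 16-bit scratch buffer on the aligned stack, subtracting the pad20 constant (defined elsewhere in the file) so the unclipped intermediates stay inside the 16-bit lanes, and h%1_loop_op then applies the horizontal taps to that buffer with pmaddwd, adds depad back and shifts right by 10. The mc12/mc32 and mc21/mc23 variants average that centre result with a vertical-only or horizontal-only filtered value, respectively. A scalar sketch of the spec-level arithmetic the two passes compute for the centre (mc22) sample; the helper names are illustrative and the bias bookkeeping is left out:

    #include <stddef.h>
    #include <stdint.h>

    /* Unclipped vertical 6-tap intermediate for one sample position. */
    static int filt_v_10(const uint16_t *s, ptrdiff_t stride)
    {
        return (s[-2 * stride] + s[3 * stride])
             - 5 * (s[-1 * stride] + s[2 * stride])
             + 20 * (s[0] + s[stride]);
    }

    /* Centre half-pel sample: horizontal 6-tap over the vertical
     * intermediates, rounded, shifted by 10 and clipped to 10 bits. */
    static uint16_t qpel_hv_centre_10(const uint16_t *src, ptrdiff_t stride)
    {
        int tmp[6], i, v;
        for (i = 0; i < 6; i++)
            tmp[i] = filt_v_10(src + (i - 2), stride);
        v = (tmp[0] + tmp[5]) - 5 * (tmp[1] + tmp[4]) + 20 * (tmp[2] + tmp[3]);
        v = (v + 512) >> 10;
        return v < 0 ? 0 : v > 1023 ? 1023 : v;
    }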
FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7 - psubw m0, [pad20] - movu [r4+i*mmsize*3], m0 - add r4, mmsize - lea r1, [r1+r2*8+mmsize] -%if %1==8 - lea r1, [r1+r2*4] -%endif - dec r3d - jg .v_loop - neg r2 - ret -%endmacro - -INIT_MMX mmxext -HV 4 -INIT_XMM sse2 -HV 8 - -%macro H_LOOP 1 -%if num_mmregs > 8 - %define s1 m8 - %define s2 m9 - %define s3 m10 - %define d1 m11 -%else - %define s1 [tap1] - %define s2 [tap2] - %define s3 [tap3] - %define d1 [depad] -%endif -h%1_loop_op: - movu m1, [r1+mmsize-4] - movu m2, [r1+mmsize-2] - mova m3, [r1+mmsize+0] - movu m4, [r1+mmsize+2] - movu m5, [r1+mmsize+4] - movu m6, [r1+mmsize+6] -%if num_mmregs > 8 - pmaddwd m1, s1 - pmaddwd m2, s1 - pmaddwd m3, s2 - pmaddwd m4, s2 - pmaddwd m5, s3 - pmaddwd m6, s3 - paddd m1, d1 - paddd m2, d1 -%else - mova m0, s1 - pmaddwd m1, m0 - pmaddwd m2, m0 - mova m0, s2 - pmaddwd m3, m0 - pmaddwd m4, m0 - mova m0, s3 - pmaddwd m5, m0 - pmaddwd m6, m0 - mova m0, d1 - paddd m1, m0 - paddd m2, m0 -%endif - paddd m3, m5 - paddd m4, m6 - paddd m1, m3 - paddd m2, m4 - psrad m1, 10 - psrad m2, 10 - pslld m2, 16 - pand m1, [pd_0f] - por m1, m2 -%if num_mmregs <= 8 - pxor m0, m0 -%endif - CLIPW m1, m0, m7 - add r1, mmsize*3 - ret -%endmacro - -INIT_MMX mmxext -H_LOOP 4 -INIT_XMM sse2 -H_LOOP 8 - -%macro MC22 2 -cglobal_mc %1, mc22, %2, 3,7,12 -%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel) - mov r6, rsp ; backup stack pointer - and rsp, ~(mmsize-1) ; align stack - sub rsp, PAD - - call put_hv%2_10 - - mov r3d, %2 - mova m7, [pw_pixel_max] -%if num_mmregs > 8 - pxor m0, m0 - mova m8, [tap1] - mova m9, [tap2] - mova m10, [tap3] - mova m11, [depad] -%endif - mov r1, rsp -.h_loop: - call h%2_loop_op - - OP_MOV [r0], m1 - add r0, r2 - dec r3d - jg .h_loop - - mov rsp, r6 ; restore stack pointer - ret -%endmacro - -MC MC22 - -;----------------------------------------------------------------------------- -; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride) -;----------------------------------------------------------------------------- -%macro MC12 2 -cglobal_mc %1, mc12, %2, 3,7,12 -%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel) - mov r6, rsp ; backup stack pointer - and rsp, ~(mmsize-1) ; align stack - sub rsp, PAD - - call put_hv%2_10 - - xor r4d, r4d -.body: - mov r3d, %2 - pxor m0, m0 - mova m7, [pw_pixel_max] -%if num_mmregs > 8 - mova m8, [tap1] - mova m9, [tap2] - mova m10, [tap3] - mova m11, [depad] -%endif - mov r1, rsp -.h_loop: - call h%2_loop_op - - movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc - paddw m3, [depad2] - psrlw m3, 5 - psubw m3, [unpad] - CLIPW m3, m0, m7 - pavgw m1, m3 - - OP_MOV [r0], m1 - add r0, r2 - dec r3d - jg .h_loop - - mov rsp, r6 ; restore stack pointer - ret -%endmacro - -MC MC12 - -;----------------------------------------------------------------------------- -; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride) -;----------------------------------------------------------------------------- -%macro MC32 2 -cglobal_mc %1, mc32, %2, 3,7,12 -%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel) - mov r6, rsp ; backup stack pointer - and rsp, ~(mmsize-1) ; align stack - sub rsp, PAD - - call put_hv%2_10 - - mov r4d, 2 ; sizeof(pixel) - jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body -%endmacro - -MC MC32 - -;----------------------------------------------------------------------------- -; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride) -;----------------------------------------------------------------------------- -%macro H_NRD 1 -put_h%1_10: - add rsp, 
gprsize - mov r3d, %1 - xor r4d, r4d - mova m6, [pad20] -.nextrow: - movu m2, [r5-4] - movu m3, [r5-2] - movu m4, [r5+0] - ADDW m2, [r5+6], m5 - ADDW m3, [r5+4], m5 - ADDW m4, [r5+2], m5 - - FILT_H2 m2, m3, m4 - psubw m2, m6 - mova [rsp+r4], m2 - add r4d, mmsize*3 - add r5, r2 - dec r3d - jg .nextrow - sub rsp, gprsize - ret -%endmacro - -INIT_MMX mmxext -H_NRD 4 -INIT_XMM sse2 -H_NRD 8 - -%macro MC21 2 -cglobal_mc %1, mc21, %2, 3,7,12 - mov r5, r1 -.body: -%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel) - mov r6, rsp ; backup stack pointer - and rsp, ~(mmsize-1) ; align stack - - sub rsp, PAD - call put_h%2_10 - - sub rsp, PAD - call put_hv%2_10 - - mov r4d, PAD-mmsize ; H buffer - jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body -%endmacro - -MC MC21 - -;----------------------------------------------------------------------------- -; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride) -;----------------------------------------------------------------------------- -%macro MC23 2 -cglobal_mc %1, mc23, %2, 3,7,12 - lea r5, [r1+r2] - jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body -%endmacro - -MC MC23 diff --git a/ffmpeg1/libavcodec/x86/h264_qpel_8bit.asm b/ffmpeg1/libavcodec/x86/h264_qpel_8bit.asm deleted file mode 100644 index 2d287ba..0000000 --- a/ffmpeg1/libavcodec/x86/h264_qpel_8bit.asm +++ /dev/null @@ -1,862 +0,0 @@ -;***************************************************************************** -;* MMX/SSE2/SSSE3-optimized H.264 QPEL code -;***************************************************************************** -;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt -;* Copyright (C) 2012 Daniel Kang -;* -;* Authors: Daniel Kang <daniel.d.kang@gmail.com> -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA 32 - -cextern pw_16 -cextern pw_5 -cextern pb_0 - -SECTION .text - - -%macro op_avgh 3 - movh %3, %2 - pavgb %1, %3 - movh %2, %1 -%endmacro - -%macro op_avg 2-3 - pavgb %1, %2 - mova %2, %1 -%endmacro - -%macro op_puth 2-3 - movh %2, %1 -%endmacro - -%macro op_put 2-3 - mova %2, %1 -%endmacro - -%macro QPEL4_H_LOWPASS_OP 1 -cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d - pxor m7, m7 - mova m4, [pw_5] - mova m5, [pw_16] - mov r4d, 4 -.loop: - movh m1, [r1-1] - movh m2, [r1+0] - movh m3, [r1+1] - movh m0, [r1+2] - punpcklbw m1, m7 - punpcklbw m2, m7 - punpcklbw m3, m7 - punpcklbw m0, m7 - paddw m1, m0 - paddw m2, m3 - movh m0, [r1-2] - movh m3, [r1+3] - punpcklbw m0, m7 - punpcklbw m3, m7 - paddw m0, m3 - psllw m2, 2 - psubw m2, m1 - pmullw m2, m4 - paddw m0, m5 - paddw m0, m2 - psraw m0, 5 - packuswb m0, m0 - op_%1h m0, [r0], m6 - add r0, r2 - add r1, r3 - dec r4d - jg .loop - REP_RET -%endmacro - -INIT_MMX mmxext -QPEL4_H_LOWPASS_OP put -QPEL4_H_LOWPASS_OP avg - -%macro QPEL8_H_LOWPASS_OP 1 -cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d - mov r4d, 8 - pxor m7, m7 - mova m6, [pw_5] -.loop: - mova m0, [r1] - mova m2, [r1+1] - mova m1, m0 - mova m3, m2 - punpcklbw m0, m7 - punpckhbw m1, m7 - punpcklbw m2, m7 - punpckhbw m3, m7 - paddw m0, m2 - paddw m1, m3 - psllw m0, 2 - psllw m1, 2 - mova m2, [r1-1] - mova m4, [r1+2] - mova m3, m2 - mova m5, m4 - punpcklbw m2, m7 - punpckhbw m3, m7 - punpcklbw m4, m7 - punpckhbw m5, m7 - paddw m2, m4 - paddw m5, m3 - psubw m0, m2 - psubw m1, m5 - pmullw m0, m6 - pmullw m1, m6 - movd m2, [r1-2] - movd m5, [r1+7] - punpcklbw m2, m7 - punpcklbw m5, m7 - paddw m2, m3 - paddw m4, m5 - mova m5, [pw_16] - paddw m2, m5 - paddw m4, m5 - paddw m0, m2 - paddw m1, m4 - psraw m0, 5 - psraw m1, 5 - packuswb m0, m1 - op_%1 m0, [r0], m4 - add r0, r2 - add r1, r3 - dec r4d - jg .loop - REP_RET -%endmacro - -INIT_MMX mmxext -QPEL8_H_LOWPASS_OP put -QPEL8_H_LOWPASS_OP avg - -%macro QPEL8_H_LOWPASS_OP_XMM 1 -cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d - mov r4d, 8 - pxor m7, m7 - mova m6, [pw_5] -.loop: - movu m1, [r1-2] - mova m0, m1 - punpckhbw m1, m7 - punpcklbw m0, m7 - mova m2, m1 - mova m3, m1 - mova m4, m1 - mova m5, m1 - palignr m4, m0, 2 - palignr m3, m0, 4 - palignr m2, m0, 6 - palignr m1, m0, 8 - palignr m5, m0, 10 - paddw m0, m5 - paddw m2, m3 - paddw m1, m4 - psllw m2, 2 - psubw m2, m1 - paddw m0, [pw_16] - pmullw m2, m6 - paddw m2, m0 - psraw m2, 5 - packuswb m2, m2 - op_%1h m2, [r0], m4 - add r1, r3 - add r0, r2 - dec r4d - jne .loop - REP_RET -%endmacro - -INIT_XMM ssse3 -QPEL8_H_LOWPASS_OP_XMM put -QPEL8_H_LOWPASS_OP_XMM avg - - -%macro QPEL4_H_LOWPASS_L2_OP 1 -cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d - pxor m7, m7 - mova m4, [pw_5] - mova m5, [pw_16] - mov r5d, 4 -.loop: - movh m1, [r1-1] - movh m2, [r1+0] - movh m3, [r1+1] - movh m0, [r1+2] - punpcklbw m1, m7 - punpcklbw m2, m7 - punpcklbw m3, m7 - punpcklbw m0, 
m7 - paddw m1, m0 - paddw m2, m3 - movh m0, [r1-2] - movh m3, [r1+3] - punpcklbw m0, m7 - punpcklbw m3, m7 - paddw m0, m3 - psllw m2, 2 - psubw m2, m1 - pmullw m2, m4 - paddw m0, m5 - paddw m0, m2 - movh m3, [r2] - psraw m0, 5 - packuswb m0, m0 - pavgb m0, m3 - op_%1h m0, [r0], m6 - add r0, r3 - add r1, r3 - add r2, r4 - dec r5d - jg .loop - REP_RET -%endmacro - -INIT_MMX mmxext -QPEL4_H_LOWPASS_L2_OP put -QPEL4_H_LOWPASS_L2_OP avg - - -%macro QPEL8_H_LOWPASS_L2_OP 1 -cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d - mov r5d, 8 - pxor m7, m7 - mova m6, [pw_5] -.loop: - mova m0, [r1] - mova m2, [r1+1] - mova m1, m0 - mova m3, m2 - punpcklbw m0, m7 - punpckhbw m1, m7 - punpcklbw m2, m7 - punpckhbw m3, m7 - paddw m0, m2 - paddw m1, m3 - psllw m0, 2 - psllw m1, 2 - mova m2, [r1-1] - mova m4, [r1+2] - mova m3, m2 - mova m5, m4 - punpcklbw m2, m7 - punpckhbw m3, m7 - punpcklbw m4, m7 - punpckhbw m5, m7 - paddw m2, m4 - paddw m5, m3 - psubw m0, m2 - psubw m1, m5 - pmullw m0, m6 - pmullw m1, m6 - movd m2, [r1-2] - movd m5, [r1+7] - punpcklbw m2, m7 - punpcklbw m5, m7 - paddw m2, m3 - paddw m4, m5 - mova m5, [pw_16] - paddw m2, m5 - paddw m4, m5 - paddw m0, m2 - paddw m1, m4 - psraw m0, 5 - psraw m1, 5 - mova m4, [r2] - packuswb m0, m1 - pavgb m0, m4 - op_%1 m0, [r0], m4 - add r0, r3 - add r1, r3 - add r2, r4 - dec r5d - jg .loop - REP_RET -%endmacro - -INIT_MMX mmxext -QPEL8_H_LOWPASS_L2_OP put -QPEL8_H_LOWPASS_L2_OP avg - - -%macro QPEL8_H_LOWPASS_L2_OP_XMM 1 -cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d - mov r5d, 8 - pxor m7, m7 - mova m6, [pw_5] -.loop: - lddqu m1, [r1-2] - mova m0, m1 - punpckhbw m1, m7 - punpcklbw m0, m7 - mova m2, m1 - mova m3, m1 - mova m4, m1 - mova m5, m1 - palignr m4, m0, 2 - palignr m3, m0, 4 - palignr m2, m0, 6 - palignr m1, m0, 8 - palignr m5, m0, 10 - paddw m0, m5 - paddw m2, m3 - paddw m1, m4 - psllw m2, 2 - movh m3, [r2] - psubw m2, m1 - paddw m0, [pw_16] - pmullw m2, m6 - paddw m2, m0 - psraw m2, 5 - packuswb m2, m2 - pavgb m2, m3 - op_%1h m2, [r0], m4 - add r1, r3 - add r0, r3 - add r2, r4 - dec r5d - jg .loop - REP_RET -%endmacro - -INIT_XMM ssse3 -QPEL8_H_LOWPASS_L2_OP_XMM put -QPEL8_H_LOWPASS_L2_OP_XMM avg - - -; All functions that call this are required to have function arguments of -; dst, src, dstStride, srcStride -%macro FILT_V 1 - mova m6, m2 - movh m5, [r1] - paddw m6, m3 - psllw m6, 2 - psubw m6, m1 - psubw m6, m4 - punpcklbw m5, m7 - pmullw m6, [pw_5] - paddw m0, [pw_16] - add r1, r3 - paddw m0, m5 - paddw m6, m0 - psraw m6, 5 - packuswb m6, m6 - op_%1h m6, [r0], m0 ; 1 - add r0, r2 - SWAP 0, 1, 2, 3, 4, 5 -%endmacro - -%macro QPEL4_V_LOWPASS_OP 1 -cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d - sub r1, r3 - sub r1, r3 - pxor m7, m7 - movh m0, [r1] - movh m1, [r1+r3] - lea r1, [r1+2*r3] - movh m2, [r1] - movh m3, [r1+r3] - lea r1, [r1+2*r3] - movh m4, [r1] - add r1, r3 - punpcklbw m0, m7 - punpcklbw m1, m7 - punpcklbw m2, m7 - punpcklbw m3, m7 - punpcklbw m4, m7 - FILT_V %1 - FILT_V %1 - FILT_V %1 - FILT_V %1 - RET -%endmacro - -INIT_MMX mmxext -QPEL4_V_LOWPASS_OP put -QPEL4_V_LOWPASS_OP avg - - - -%macro QPEL8OR16_V_LOWPASS_OP 1 -%if cpuflag(sse2) -cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d - sub r1, r3 - sub r1, r3 -%else -cglobal 
%1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d -%endif - pxor m7, m7 - movh m0, [r1] - movh m1, [r1+r3] - lea r1, [r1+2*r3] - movh m2, [r1] - movh m3, [r1+r3] - lea r1, [r1+2*r3] - movh m4, [r1] - add r1, r3 - punpcklbw m0, m7 - punpcklbw m1, m7 - punpcklbw m2, m7 - punpcklbw m3, m7 - punpcklbw m4, m7 - FILT_V %1 - FILT_V %1 - FILT_V %1 - FILT_V %1 - FILT_V %1 - FILT_V %1 - FILT_V %1 - FILT_V %1 - cmp r4d, 16 - jne .end - FILT_V %1 - FILT_V %1 - FILT_V %1 - FILT_V %1 - FILT_V %1 - FILT_V %1 - FILT_V %1 - FILT_V %1 -.end: - REP_RET -%endmacro - -INIT_MMX mmxext -QPEL8OR16_V_LOWPASS_OP put -QPEL8OR16_V_LOWPASS_OP avg - -INIT_XMM sse2 -QPEL8OR16_V_LOWPASS_OP put -QPEL8OR16_V_LOWPASS_OP avg - - -; All functions that use this are required to have args: -; src, tmp, srcSize -%macro FILT_HV 1 ; offset - mova m6, m2 - movh m5, [r0] - paddw m6, m3 - psllw m6, 2 - paddw m0, [pw_16] - psubw m6, m1 - psubw m6, m4 - punpcklbw m5, m7 - pmullw m6, [pw_5] - paddw m0, m5 - add r0, r2 - paddw m6, m0 - mova [r1+%1], m6 - SWAP 0, 1, 2, 3, 4, 5 -%endmacro - -%macro QPEL4_HV1_LOWPASS_OP 1 -cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride - movsxdifnidn r2, r2d - pxor m7, m7 - movh m0, [r0] - movh m1, [r0+r2] - lea r0, [r0+2*r2] - movh m2, [r0] - movh m3, [r0+r2] - lea r0, [r0+2*r2] - movh m4, [r0] - add r0, r2 - punpcklbw m0, m7 - punpcklbw m1, m7 - punpcklbw m2, m7 - punpcklbw m3, m7 - punpcklbw m4, m7 - FILT_HV 0*24 - FILT_HV 1*24 - FILT_HV 2*24 - FILT_HV 3*24 - RET - -cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride - movsxdifnidn r2, r2d - mov r3d, 4 -.loop: - mova m0, [r0] - paddw m0, [r0+10] - mova m1, [r0+2] - paddw m1, [r0+8] - mova m2, [r0+4] - paddw m2, [r0+6] - psubw m0, m1 - psraw m0, 2 - psubw m0, m1 - paddsw m0, m2 - psraw m0, 2 - paddw m0, m2 - psraw m0, 6 - packuswb m0, m0 - op_%1h m0, [r1], m7 - add r0, 24 - add r1, r2 - dec r3d - jnz .loop - REP_RET -%endmacro - -INIT_MMX mmxext -QPEL4_HV1_LOWPASS_OP put -QPEL4_HV1_LOWPASS_OP avg - -%macro QPEL8OR16_HV1_LOWPASS_OP 1 -cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size - movsxdifnidn r2, r2d - pxor m7, m7 - movh m0, [r0] - movh m1, [r0+r2] - lea r0, [r0+2*r2] - movh m2, [r0] - movh m3, [r0+r2] - lea r0, [r0+2*r2] - movh m4, [r0] - add r0, r2 - punpcklbw m0, m7 - punpcklbw m1, m7 - punpcklbw m2, m7 - punpcklbw m3, m7 - punpcklbw m4, m7 - FILT_HV 0*48 - FILT_HV 1*48 - FILT_HV 2*48 - FILT_HV 3*48 - FILT_HV 4*48 - FILT_HV 5*48 - FILT_HV 6*48 - FILT_HV 7*48 - cmp r3d, 16 - jne .end - FILT_HV 8*48 - FILT_HV 9*48 - FILT_HV 10*48 - FILT_HV 11*48 - FILT_HV 12*48 - FILT_HV 13*48 - FILT_HV 14*48 - FILT_HV 15*48 -.end: - REP_RET -%endmacro - -INIT_MMX mmxext -QPEL8OR16_HV1_LOWPASS_OP put -QPEL8OR16_HV1_LOWPASS_OP avg - -INIT_XMM sse2 -QPEL8OR16_HV1_LOWPASS_OP put - - - -%macro QPEL8OR16_HV2_LOWPASS_OP 1 -; unused is to match ssse3 and mmxext args -cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h - movsxdifnidn r2, r2d -.loop: - mova m0, [r1] - mova m3, [r1+8] - mova m1, [r1+2] - mova m4, [r1+10] - paddw m0, m4 - paddw m1, m3 - paddw m3, [r1+18] - paddw m4, [r1+16] - mova m2, [r1+4] - mova m5, [r1+12] - paddw m2, [r1+6] - paddw m5, [r1+14] - psubw m0, m1 - psubw m3, m4 - psraw m0, 2 - psraw m3, 2 - psubw m0, m1 - psubw m3, m4 - paddsw m0, m2 - paddsw m3, m5 - psraw m0, 2 - psraw m3, 2 - paddw m0, m2 - paddw m3, m5 - psraw m0, 6 - psraw m3, 6 - packuswb m0, m3 - op_%1 m0, [r0], m7 - add r1, 48 - add r0, r2 - 
dec r4d - jne .loop - REP_RET -%endmacro - -INIT_MMX mmxext -QPEL8OR16_HV2_LOWPASS_OP put -QPEL8OR16_HV2_LOWPASS_OP avg - -%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1 -cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d - cmp r4d, 16 - je .op16 -.loop8: - mova m1, [r1+16] - mova m0, [r1] - mova m2, m1 - mova m3, m1 - mova m4, m1 - mova m5, m1 - palignr m5, m0, 10 - palignr m4, m0, 8 - palignr m3, m0, 6 - palignr m2, m0, 4 - palignr m1, m0, 2 - paddw m0, m5 - paddw m1, m4 - paddw m2, m3 - psubw m0, m1 - psraw m0, 2 - psubw m0, m1 - paddw m0, m2 - psraw m0, 2 - paddw m0, m2 - psraw m0, 6 - packuswb m0, m0 - op_%1h m0, [r0], m7 - add r1, 48 - add r0, r2 - dec r4d - jne .loop8 - jmp .done -.op16: - mova m4, [r1+32] - mova m5, [r1+16] - mova m7, [r1] - mova m3, m4 - mova m2, m4 - mova m1, m4 - mova m0, m4 - palignr m0, m5, 10 - palignr m1, m5, 8 - palignr m2, m5, 6 - palignr m3, m5, 4 - palignr m4, m5, 2 - paddw m0, m5 - paddw m1, m4 - paddw m2, m3 - mova m6, m5 - mova m4, m5 - mova m3, m5 - palignr m4, m7, 8 - palignr m6, m7, 2 - palignr m3, m7, 10 - paddw m4, m6 - mova m6, m5 - palignr m5, m7, 6 - palignr m6, m7, 4 - paddw m3, m7 - paddw m5, m6 - psubw m0, m1 - psubw m3, m4 - psraw m0, 2 - psraw m3, 2 - psubw m0, m1 - psubw m3, m4 - paddw m0, m2 - paddw m3, m5 - psraw m0, 2 - psraw m3, 2 - paddw m0, m2 - paddw m3, m5 - psraw m0, 6 - psraw m3, 6 - packuswb m3, m0 - op_%1 m3, [r0], m7 - add r1, 48 - add r0, r2 - dec r4d - jne .op16 -.done: - REP_RET -%endmacro - -INIT_XMM ssse3 -QPEL8OR16_HV2_LOWPASS_OP_XMM put -QPEL8OR16_HV2_LOWPASS_OP_XMM avg - - -%macro PIXELS4_L2_SHIFT5 1 -cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d - mova m0, [r1] - mova m1, [r1+24] - psraw m0, 5 - psraw m1, 5 - packuswb m0, m0 - packuswb m1, m1 - pavgb m0, [r2] - pavgb m1, [r2+r4] - op_%1h m0, [r0], m4 - op_%1h m1, [r0+r3], m5 - lea r2, [r2+r4*2] - lea r0, [r0+r3*2] - mova m0, [r1+48] - mova m1, [r1+72] - psraw m0, 5 - psraw m1, 5 - packuswb m0, m0 - packuswb m1, m1 - pavgb m0, [r2] - pavgb m1, [r2+r4] - op_%1h m0, [r0], m4 - op_%1h m1, [r0+r3], m5 - RET -%endmacro - -INIT_MMX mmxext -PIXELS4_L2_SHIFT5 put -PIXELS4_L2_SHIFT5 avg - - -%macro PIXELS8_L2_SHIFT5 1 -cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d -.loop: - mova m0, [r1] - mova m1, [r1+8] - mova m2, [r1+48] - mova m3, [r1+48+8] - psraw m0, 5 - psraw m1, 5 - psraw m2, 5 - psraw m3, 5 - packuswb m0, m1 - packuswb m2, m3 - pavgb m0, [r2] - pavgb m2, [r2+r4] - op_%1 m0, [r0], m4 - op_%1 m2, [r0+r3], m5 - lea r2, [r2+2*r4] - add r1, 48*2 - lea r0, [r0+2*r3] - sub r5d, 2 - jne .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PIXELS8_L2_SHIFT5 put -PIXELS8_L2_SHIFT5 avg - - -%if ARCH_X86_64 -%macro QPEL16_H_LOWPASS_L2_OP 1 -cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d - mov r5d, 16 - pxor m15, m15 - mova m14, [pw_5] - mova m13, [pw_16] -.loop: - lddqu m1, [r1+6] - lddqu m7, [r1-2] - mova m0, m1 - punpckhbw m1, m15 - punpcklbw m0, m15 - punpcklbw m7, m15 - mova m2, m1 - mova m6, m0 - mova m3, m1 - mova m8, m0 - mova m4, m1 - mova m9, m0 - mova m12, m0 - mova m11, m1 - palignr m11, m0, 10 - palignr m12, m7, 10 - palignr m4, m0, 2 - palignr m9, m7, 2 - palignr m3, m0, 4 - palignr m8, m7, 4 - palignr m2, m0, 6 - palignr m6, m7, 6 - paddw m11, m0 - palignr m1, m0, 
8 - palignr m0, m7, 8 - paddw m7, m12 - paddw m2, m3 - paddw m6, m8 - paddw m1, m4 - paddw m0, m9 - psllw m2, 2 - psllw m6, 2 - psubw m2, m1 - psubw m6, m0 - paddw m11, m13 - paddw m7, m13 - pmullw m2, m14 - pmullw m6, m14 - lddqu m3, [r2] - paddw m2, m11 - paddw m6, m7 - psraw m2, 5 - psraw m6, 5 - packuswb m6, m2 - pavgb m6, m3 - op_%1 m6, [r0], m11 - add r1, r3 - add r0, r3 - add r2, r4 - dec r5d - jg .loop - REP_RET -%endmacro - -INIT_XMM ssse3 -QPEL16_H_LOWPASS_L2_OP put -QPEL16_H_LOWPASS_L2_OP avg -%endif diff --git a/ffmpeg1/libavcodec/x86/h264_weight.asm b/ffmpeg1/libavcodec/x86/h264_weight.asm deleted file mode 100644 index 4759a06..0000000 --- a/ffmpeg1/libavcodec/x86/h264_weight.asm +++ /dev/null @@ -1,317 +0,0 @@ -;***************************************************************************** -;* SSE2-optimized weighted prediction code -;***************************************************************************** -;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt -;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com> -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION .text - -;----------------------------------------------------------------------------- -; biweight pred: -; -; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride, -; int height, int log2_denom, int weightd, -; int weights, int offset); -; and -; void h264_weight_16_sse2(uint8_t *dst, int stride, int height, -; int log2_denom, int weight, int offset); -;----------------------------------------------------------------------------- - -%macro WEIGHT_SETUP 0 - add r5, r5 - inc r5 - movd m3, r4d - movd m5, r5d - movd m6, r3d - pslld m5, m6 - psrld m5, 1 -%if mmsize == 16 - pshuflw m3, m3, 0 - pshuflw m5, m5, 0 - punpcklqdq m3, m3 - punpcklqdq m5, m5 -%else - pshufw m3, m3, 0 - pshufw m5, m5, 0 -%endif - pxor m7, m7 -%endmacro - -%macro WEIGHT_OP 2 - movh m0, [r0+%1] - movh m1, [r0+%2] - punpcklbw m0, m7 - punpcklbw m1, m7 - pmullw m0, m3 - pmullw m1, m3 - paddsw m0, m5 - paddsw m1, m5 - psraw m0, m6 - psraw m1, m6 - packuswb m0, m1 -%endmacro - -INIT_MMX mmxext -cglobal h264_weight_16, 6, 6, 0 - WEIGHT_SETUP -.nextrow: - WEIGHT_OP 0, 4 - mova [r0 ], m0 - WEIGHT_OP 8, 12 - mova [r0+8], m0 - add r0, r1 - dec r2d - jnz .nextrow - REP_RET - -%macro WEIGHT_FUNC_MM 2 -cglobal h264_weight_%1, 6, 6, %2 - WEIGHT_SETUP -.nextrow: - WEIGHT_OP 0, mmsize/2 - mova [r0], m0 - add r0, r1 - dec r2d - jnz .nextrow - REP_RET -%endmacro - -INIT_MMX mmxext -WEIGHT_FUNC_MM 8, 0 -INIT_XMM sse2 -WEIGHT_FUNC_MM 16, 8 - -%macro WEIGHT_FUNC_HALF_MM 2 -cglobal h264_weight_%1, 6, 6, %2 - WEIGHT_SETUP - sar r2d, 1 - lea r3, [r1*2] -.nextrow: - WEIGHT_OP 0, r1 - movh [r0], m0 
-%if mmsize == 16 - movhps [r0+r1], m0 -%else - psrlq m0, 32 - movh [r0+r1], m0 -%endif - add r0, r3 - dec r2d - jnz .nextrow - REP_RET -%endmacro - -INIT_MMX mmxext -WEIGHT_FUNC_HALF_MM 4, 0 -INIT_XMM sse2 -WEIGHT_FUNC_HALF_MM 8, 8 - -%macro BIWEIGHT_SETUP 0 -%if ARCH_X86_64 -%define off_regd r7d -%else -%define off_regd r3d -%endif - mov off_regd, r7m - add off_regd, 1 - or off_regd, 1 - add r4, 1 - cmp r5, 128 - jne .normal - sar r5, 1 - sar r6, 1 - sar off_regd, 1 - sub r4, 1 -.normal -%if cpuflag(ssse3) - movd m4, r5d - movd m0, r6d -%else - movd m3, r5d - movd m4, r6d -%endif - movd m5, off_regd - movd m6, r4d - pslld m5, m6 - psrld m5, 1 -%if cpuflag(ssse3) - punpcklbw m4, m0 - pshuflw m4, m4, 0 - pshuflw m5, m5, 0 - punpcklqdq m4, m4 - punpcklqdq m5, m5 - -%else -%if mmsize == 16 - pshuflw m3, m3, 0 - pshuflw m4, m4, 0 - pshuflw m5, m5, 0 - punpcklqdq m3, m3 - punpcklqdq m4, m4 - punpcklqdq m5, m5 -%else - pshufw m3, m3, 0 - pshufw m4, m4, 0 - pshufw m5, m5, 0 -%endif - pxor m7, m7 -%endif -%endmacro - -%macro BIWEIGHT_STEPA 3 - movh m%1, [r0+%3] - movh m%2, [r1+%3] - punpcklbw m%1, m7 - punpcklbw m%2, m7 - pmullw m%1, m3 - pmullw m%2, m4 - paddsw m%1, m%2 -%endmacro - -%macro BIWEIGHT_STEPB 0 - paddsw m0, m5 - paddsw m1, m5 - psraw m0, m6 - psraw m1, m6 - packuswb m0, m1 -%endmacro - -INIT_MMX mmxext -cglobal h264_biweight_16, 7, 8, 0 - BIWEIGHT_SETUP - movifnidn r3d, r3m -.nextrow: - BIWEIGHT_STEPA 0, 1, 0 - BIWEIGHT_STEPA 1, 2, 4 - BIWEIGHT_STEPB - mova [r0], m0 - BIWEIGHT_STEPA 0, 1, 8 - BIWEIGHT_STEPA 1, 2, 12 - BIWEIGHT_STEPB - mova [r0+8], m0 - add r0, r2 - add r1, r2 - dec r3d - jnz .nextrow - REP_RET - -%macro BIWEIGHT_FUNC_MM 2 -cglobal h264_biweight_%1, 7, 8, %2 - BIWEIGHT_SETUP - movifnidn r3d, r3m -.nextrow: - BIWEIGHT_STEPA 0, 1, 0 - BIWEIGHT_STEPA 1, 2, mmsize/2 - BIWEIGHT_STEPB - mova [r0], m0 - add r0, r2 - add r1, r2 - dec r3d - jnz .nextrow - REP_RET -%endmacro - -INIT_MMX mmxext -BIWEIGHT_FUNC_MM 8, 0 -INIT_XMM sse2 -BIWEIGHT_FUNC_MM 16, 8 - -%macro BIWEIGHT_FUNC_HALF_MM 2 -cglobal h264_biweight_%1, 7, 8, %2 - BIWEIGHT_SETUP - movifnidn r3d, r3m - sar r3, 1 - lea r4, [r2*2] -.nextrow: - BIWEIGHT_STEPA 0, 1, 0 - BIWEIGHT_STEPA 1, 2, r2 - BIWEIGHT_STEPB - movh [r0], m0 -%if mmsize == 16 - movhps [r0+r2], m0 -%else - psrlq m0, 32 - movh [r0+r2], m0 -%endif - add r0, r4 - add r1, r4 - dec r3d - jnz .nextrow - REP_RET -%endmacro - -INIT_MMX mmxext -BIWEIGHT_FUNC_HALF_MM 4, 0 -INIT_XMM sse2 -BIWEIGHT_FUNC_HALF_MM 8, 8 - -%macro BIWEIGHT_SSSE3_OP 0 - pmaddubsw m0, m4 - pmaddubsw m2, m4 - paddsw m0, m5 - paddsw m2, m5 - psraw m0, m6 - psraw m2, m6 - packuswb m0, m2 -%endmacro - -INIT_XMM ssse3 -cglobal h264_biweight_16, 7, 8, 8 - BIWEIGHT_SETUP - movifnidn r3d, r3m - -.nextrow: - movh m0, [r0] - movh m2, [r0+8] - movh m3, [r1+8] - punpcklbw m0, [r1] - punpcklbw m2, m3 - BIWEIGHT_SSSE3_OP - mova [r0], m0 - add r0, r2 - add r1, r2 - dec r3d - jnz .nextrow - REP_RET - -INIT_XMM ssse3 -cglobal h264_biweight_8, 7, 8, 8 - BIWEIGHT_SETUP - movifnidn r3d, r3m - sar r3, 1 - lea r4, [r2*2] - -.nextrow: - movh m0, [r0] - movh m1, [r1] - movh m2, [r0+r2] - movh m3, [r1+r2] - punpcklbw m0, m1 - punpcklbw m2, m3 - BIWEIGHT_SSSE3_OP - movh [r0], m0 - movhps [r0+r2], m0 - add r0, r4 - add r1, r4 - dec r3d - jnz .nextrow - REP_RET diff --git a/ffmpeg1/libavcodec/x86/h264_weight_10bit.asm b/ffmpeg1/libavcodec/x86/h264_weight_10bit.asm deleted file mode 100644 index 3b09e42..0000000 --- a/ffmpeg1/libavcodec/x86/h264_weight_10bit.asm +++ /dev/null @@ -1,282 +0,0 @@ 
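Both the 8-bit file that ends here and the 10-bit file that begins below implement H.264 explicit weighted prediction. The setup code folds the offset and the rounding term into one constant, WEIGHT_SETUP builds ((2*offset+1) << log2_denom) >> 1, so each row needs only a multiply, a saturating add and an arithmetic shift. A scalar sketch of the per-pixel formula for the 8-bit case (illustrative only; the 10-bit file applies the same formula with 32-bit intermediates via pmaddwd/psrad and clips against pw_pixel_max):

    #include <stdint.h>

    /* dst = clip(((dst * weight + 2^(log2_denom-1)) >> log2_denom) + offset) */
    static inline uint8_t weight_pixel_8(uint8_t pix, int log2_denom, int weight, int offset)
    {
        int v = log2_denom > 0
              ? ((pix * weight + (1 << (log2_denom - 1))) >> log2_denom) + offset
              : pix * weight + offset;
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

The biweight variants combine two sources the same way, weighting dst and src by weightd and weights and shifting by log2_denom + 1, with the offset folded into the rounding constant by BIWEIGHT_SETUP.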
-;***************************************************************************** -;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code -;***************************************************************************** -;* Copyright (C) 2005-2011 x264 project -;* -;* Authors: Daniel Kang <daniel.d.kang@gmail.com> -;* -;* This file is part of Libav. -;* -;* Libav is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA 32 - -pw_pixel_max: times 8 dw ((1 << 10)-1) -sq_1: dq 1 - dq 0 - -cextern pw_1 - -SECTION .text - -;----------------------------------------------------------------------------- -; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom, -; int weight, int offset); -;----------------------------------------------------------------------------- -%macro WEIGHT_PROLOGUE 0 -.prologue: - PROLOGUE 0,6,8 - movifnidn r0, r0mp - movifnidn r1d, r1m - movifnidn r2d, r2m - movifnidn r4d, r4m - movifnidn r5d, r5m -%endmacro - -%macro WEIGHT_SETUP 0 - mova m0, [pw_1] - movd m2, r3m - pslld m0, m2 ; 1<<log2_denom - SPLATW m0, m0 - shl r5, 19 ; *8, move to upper half of dword - lea r5, [r5+r4*2+0x10000] - movd m3, r5d ; weight<<1 | 1+(offset<<(3)) - pshufd m3, m3, 0 - mova m4, [pw_pixel_max] - paddw m2, [sq_1] ; log2_denom+1 -%if notcpuflag(sse4) - pxor m7, m7 -%endif -%endmacro - -%macro WEIGHT_OP 1-2 -%if %0==1 - mova m5, [r0+%1] - punpckhwd m6, m5, m0 - punpcklwd m5, m0 -%else - movq m5, [r0+%1] - movq m6, [r0+%2] - punpcklwd m5, m0 - punpcklwd m6, m0 -%endif - pmaddwd m5, m3 - pmaddwd m6, m3 - psrad m5, m2 - psrad m6, m2 -%if cpuflag(sse4) - packusdw m5, m6 - pminsw m5, m4 -%else - packssdw m5, m6 - CLIPW m5, m7, m4 -%endif -%endmacro - -%macro WEIGHT_FUNC_DBL 0 -cglobal h264_weight_16_10 - WEIGHT_PROLOGUE - WEIGHT_SETUP -.nextrow: - WEIGHT_OP 0 - mova [r0 ], m5 - WEIGHT_OP 16 - mova [r0+16], m5 - add r0, r1 - dec r2d - jnz .nextrow - REP_RET -%endmacro - -INIT_XMM sse2 -WEIGHT_FUNC_DBL -INIT_XMM sse4 -WEIGHT_FUNC_DBL - - -%macro WEIGHT_FUNC_MM 0 -cglobal h264_weight_8_10 - WEIGHT_PROLOGUE - WEIGHT_SETUP -.nextrow: - WEIGHT_OP 0 - mova [r0], m5 - add r0, r1 - dec r2d - jnz .nextrow - REP_RET -%endmacro - -INIT_XMM sse2 -WEIGHT_FUNC_MM -INIT_XMM sse4 -WEIGHT_FUNC_MM - - -%macro WEIGHT_FUNC_HALF_MM 0 -cglobal h264_weight_4_10 - WEIGHT_PROLOGUE - sar r2d, 1 - WEIGHT_SETUP - lea r3, [r1*2] -.nextrow: - WEIGHT_OP 0, r1 - movh [r0], m5 - movhps [r0+r1], m5 - add r0, r3 - dec r2d - jnz .nextrow - REP_RET -%endmacro - -INIT_XMM sse2 -WEIGHT_FUNC_HALF_MM -INIT_XMM sse4 -WEIGHT_FUNC_HALF_MM - - -;----------------------------------------------------------------------------- -; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height, -; int log2_denom, int weightd, int weights, 
int offset); -;----------------------------------------------------------------------------- -%if ARCH_X86_32 -DECLARE_REG_TMP 3 -%else -DECLARE_REG_TMP 7 -%endif - -%macro BIWEIGHT_PROLOGUE 0 -.prologue: - PROLOGUE 0,8,8 - movifnidn r0, r0mp - movifnidn r1, r1mp - movifnidn r2d, r2m - movifnidn r5d, r5m - movifnidn r6d, r6m - movifnidn t0d, r7m -%endmacro - -%macro BIWEIGHT_SETUP 0 - lea t0, [t0*4+1] ; (offset<<2)+1 - or t0, 1 - shl r6, 16 - or r5, r6 - movd m4, r5d ; weightd | weights - movd m5, t0d ; (offset+1)|1 - movd m6, r4m ; log2_denom - pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom - paddd m6, [sq_1] - pshufd m4, m4, 0 - pshufd m5, m5, 0 - mova m3, [pw_pixel_max] - movifnidn r3d, r3m -%if notcpuflag(sse4) - pxor m7, m7 -%endif -%endmacro - -%macro BIWEIGHT 1-2 -%if %0==1 - mova m0, [r0+%1] - mova m1, [r1+%1] - punpckhwd m2, m0, m1 - punpcklwd m0, m1 -%else - movq m0, [r0+%1] - movq m1, [r1+%1] - punpcklwd m0, m1 - movq m2, [r0+%2] - movq m1, [r1+%2] - punpcklwd m2, m1 -%endif - pmaddwd m0, m4 - pmaddwd m2, m4 - paddd m0, m5 - paddd m2, m5 - psrad m0, m6 - psrad m2, m6 -%if cpuflag(sse4) - packusdw m0, m2 - pminsw m0, m3 -%else - packssdw m0, m2 - CLIPW m0, m7, m3 -%endif -%endmacro - -%macro BIWEIGHT_FUNC_DBL 0 -cglobal h264_biweight_16_10 - BIWEIGHT_PROLOGUE - BIWEIGHT_SETUP -.nextrow: - BIWEIGHT 0 - mova [r0 ], m0 - BIWEIGHT 16 - mova [r0+16], m0 - add r0, r2 - add r1, r2 - dec r3d - jnz .nextrow - REP_RET -%endmacro - -INIT_XMM sse2 -BIWEIGHT_FUNC_DBL -INIT_XMM sse4 -BIWEIGHT_FUNC_DBL - -%macro BIWEIGHT_FUNC 0 -cglobal h264_biweight_8_10 - BIWEIGHT_PROLOGUE - BIWEIGHT_SETUP -.nextrow: - BIWEIGHT 0 - mova [r0], m0 - add r0, r2 - add r1, r2 - dec r3d - jnz .nextrow - REP_RET -%endmacro - -INIT_XMM sse2 -BIWEIGHT_FUNC -INIT_XMM sse4 -BIWEIGHT_FUNC - -%macro BIWEIGHT_FUNC_HALF 0 -cglobal h264_biweight_4_10 - BIWEIGHT_PROLOGUE - BIWEIGHT_SETUP - sar r3d, 1 - lea r4, [r2*2] -.nextrow: - BIWEIGHT 0, r2 - movh [r0 ], m0 - movhps [r0+r2], m0 - add r0, r4 - add r1, r4 - dec r3d - jnz .nextrow - REP_RET -%endmacro - -INIT_XMM sse2 -BIWEIGHT_FUNC_HALF -INIT_XMM sse4 -BIWEIGHT_FUNC_HALF diff --git a/ffmpeg1/libavcodec/x86/h264chroma_init.c b/ffmpeg1/libavcodec/x86/h264chroma_init.c deleted file mode 100644 index b5c078f..0000000 --- a/ffmpeg1/libavcodec/x86/h264chroma_init.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdint.h> - -#include "config.h" -#include "libavutil/cpu.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/h264chroma.h" - -void ff_put_h264_chroma_mc8_rnd_mmx (uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); -void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); -void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); - -void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); -void ff_avg_h264_chroma_mc4_mmxext (uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); -void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); - -void ff_put_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); -void ff_avg_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); - -void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); -void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); - -void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); -void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); - -#define CHROMA_MC(OP, NUM, DEPTH, OPT) \ -void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \ - (uint8_t *dst, uint8_t *src, \ - int stride, int h, int x, int y); - -CHROMA_MC(put, 2, 10, mmxext) -CHROMA_MC(avg, 2, 10, mmxext) -CHROMA_MC(put, 4, 10, mmxext) -CHROMA_MC(avg, 4, 10, mmxext) -CHROMA_MC(put, 8, 10, sse2) -CHROMA_MC(avg, 8, 10, sse2) -CHROMA_MC(put, 8, 10, avx) -CHROMA_MC(avg, 8, 10, avx) - -void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth) -{ -#if HAVE_YASM - int high_bit_depth = bit_depth > 8; - int mm_flags = av_get_cpu_flags(); - - if (EXTERNAL_MMX(mm_flags) && !high_bit_depth) { - c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx; - c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx; - } - - if (EXTERNAL_AMD3DNOW(mm_flags) && !high_bit_depth) { - c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow; - c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow; - } - - if (EXTERNAL_MMXEXT(mm_flags) && !high_bit_depth) { - c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext; - c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext; - c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext; - c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext; - } - - if (EXTERNAL_MMXEXT(mm_flags) && bit_depth > 8 && bit_depth <= 10) { - c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext; - c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext; - c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext; - c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext; - } - - if (EXTERNAL_SSE2(mm_flags) && bit_depth > 8 && bit_depth <= 10) { - c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2; - c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2; - } - - if (EXTERNAL_SSSE3(mm_flags) && !high_bit_depth) { - 
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3; - c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3; - c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3; - c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3; - } - - if (EXTERNAL_AVX(mm_flags) && bit_depth > 8 && bit_depth <= 10) { - // AVX implies !cache64. - // TODO: Port cache(32|64) detection from x264. - c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx; - c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx; - } -#endif -} diff --git a/ffmpeg1/libavcodec/x86/h264dsp_init.c b/ffmpeg1/libavcodec/x86/h264dsp_init.c deleted file mode 100644 index 11aae77..0000000 --- a/ffmpeg1/libavcodec/x86/h264dsp_init.c +++ /dev/null @@ -1,375 +0,0 @@ -/* - * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/h264dsp.h" -#include "dsputil_mmx.h" - -/***********************************/ -/* IDCT */ -#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \ -void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \ - int16_t *block, \ - int stride); - -IDCT_ADD_FUNC(, 8, mmx) -IDCT_ADD_FUNC(, 10, sse2) -IDCT_ADD_FUNC(_dc, 8, mmxext) -IDCT_ADD_FUNC(_dc, 10, mmxext) -IDCT_ADD_FUNC(8_dc, 8, mmxext) -IDCT_ADD_FUNC(8_dc, 10, sse2) -IDCT_ADD_FUNC(8, 8, mmx) -IDCT_ADD_FUNC(8, 8, sse2) -IDCT_ADD_FUNC(8, 10, sse2) -IDCT_ADD_FUNC(, 10, avx) -IDCT_ADD_FUNC(8_dc, 10, avx) -IDCT_ADD_FUNC(8, 10, avx) - - -#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \ -void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \ - (uint8_t *dst, const int *block_offset, \ - int16_t *block, int stride, const uint8_t nnzc[6 * 8]); - -IDCT_ADD_REP_FUNC(8, 4, 8, mmx) -IDCT_ADD_REP_FUNC(8, 4, 8, mmxext) -IDCT_ADD_REP_FUNC(8, 4, 8, sse2) -IDCT_ADD_REP_FUNC(8, 4, 10, sse2) -IDCT_ADD_REP_FUNC(8, 4, 10, avx) -IDCT_ADD_REP_FUNC(, 16, 8, mmx) -IDCT_ADD_REP_FUNC(, 16, 8, mmxext) -IDCT_ADD_REP_FUNC(, 16, 8, sse2) -IDCT_ADD_REP_FUNC(, 16, 10, sse2) -IDCT_ADD_REP_FUNC(, 16intra, 8, mmx) -IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext) -IDCT_ADD_REP_FUNC(, 16intra, 8, sse2) -IDCT_ADD_REP_FUNC(, 16intra, 10, sse2) -IDCT_ADD_REP_FUNC(, 16, 10, avx) -IDCT_ADD_REP_FUNC(, 16intra, 10, avx) - - -#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \ -void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \ - (uint8_t **dst, const int *block_offset, \ - int16_t *block, int stride, const uint8_t nnzc[6 * 8]); - -IDCT_ADD_REP_FUNC2(, 8, 8, mmx) -IDCT_ADD_REP_FUNC2(, 8, 8, mmxext) -IDCT_ADD_REP_FUNC2(, 8, 8, sse2) -IDCT_ADD_REP_FUNC2(, 8, 10, sse2) -IDCT_ADD_REP_FUNC2(, 8, 10, avx) - -void 
ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul); -void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul); - -/***********************************/ -/* deblocking */ - -void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4], uint8_t nnz[40], - int8_t ref[2][40], - int16_t mv[2][40][2], - int bidir, int edges, int step, - int mask_mv0, int mask_mv1, int field); - -#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \ -void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \ - int stride, \ - int alpha, \ - int beta, \ - int8_t *tc0); -#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \ -void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \ - int stride, \ - int alpha, \ - int beta); - -#define LF_FUNCS(type, depth) \ -LF_FUNC(h, chroma, depth, mmxext) \ -LF_IFUNC(h, chroma_intra, depth, mmxext) \ -LF_FUNC(v, chroma, depth, mmxext) \ -LF_IFUNC(v, chroma_intra, depth, mmxext) \ -LF_FUNC(h, luma, depth, mmxext) \ -LF_IFUNC(h, luma_intra, depth, mmxext) \ -LF_FUNC(h, luma, depth, sse2) \ -LF_IFUNC(h, luma_intra, depth, sse2) \ -LF_FUNC(v, luma, depth, sse2) \ -LF_IFUNC(v, luma_intra, depth, sse2) \ -LF_FUNC(h, chroma, depth, sse2) \ -LF_IFUNC(h, chroma_intra, depth, sse2) \ -LF_FUNC(v, chroma, depth, sse2) \ -LF_IFUNC(v, chroma_intra, depth, sse2) \ -LF_FUNC(h, luma, depth, avx) \ -LF_IFUNC(h, luma_intra, depth, avx) \ -LF_FUNC(v, luma, depth, avx) \ -LF_IFUNC(v, luma_intra, depth, avx) \ -LF_FUNC(h, chroma, depth, avx) \ -LF_IFUNC(h, chroma_intra, depth, avx) \ -LF_FUNC(v, chroma, depth, avx) \ -LF_IFUNC(v, chroma_intra, depth, avx) - -LF_FUNCS(uint8_t, 8) -LF_FUNCS(uint16_t, 10) - -#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL -LF_FUNC(v8, luma, 8, mmxext) -static void ff_deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0) -{ - if ((tc0[0] & tc0[1]) >= 0) - ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0); - if ((tc0[2] & tc0[3]) >= 0) - ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2); -} -LF_IFUNC(v8, luma_intra, 8, mmxext) -static void ff_deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride, - int alpha, int beta) -{ - ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta); - ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta); -} -#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */ - -LF_FUNC(v, luma, 10, mmxext) -LF_IFUNC(v, luma_intra, 10, mmxext) - -/***********************************/ -/* weighted prediction */ - -#define H264_WEIGHT(W, OPT) \ -void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, int stride, \ - int height, int log2_denom, \ - int weight, int offset); - -#define H264_BIWEIGHT(W, OPT) \ -void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \ - int stride, int height, \ - int log2_denom, int weightd, \ - int weights, int offset); - -#define H264_BIWEIGHT_MMX(W) \ - H264_WEIGHT(W, mmxext) \ - H264_BIWEIGHT(W, mmxext) - -#define H264_BIWEIGHT_MMX_SSE(W) \ - H264_BIWEIGHT_MMX(W) \ - H264_WEIGHT(W, sse2) \ - H264_BIWEIGHT(W, sse2) \ - H264_BIWEIGHT(W, ssse3) - -H264_BIWEIGHT_MMX_SSE(16) -H264_BIWEIGHT_MMX_SSE(8) -H264_BIWEIGHT_MMX(4) - -#define H264_WEIGHT_10(W, DEPTH, OPT) \ -void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \ - int stride, \ - int height, \ - int log2_denom, \ - int weight, \ - int offset); - -#define H264_BIWEIGHT_10(W, DEPTH, OPT) \ -void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \ - uint8_t *src, \ - int stride, \ - int height, \ - int 
log2_denom, \ - int weightd, \ - int weights, \ - int offset); - -#define H264_BIWEIGHT_10_SSE(W, DEPTH) \ - H264_WEIGHT_10(W, DEPTH, sse2) \ - H264_WEIGHT_10(W, DEPTH, sse4) \ - H264_BIWEIGHT_10(W, DEPTH, sse2) \ - H264_BIWEIGHT_10(W, DEPTH, sse4) - -H264_BIWEIGHT_10_SSE(16, 10) -H264_BIWEIGHT_10_SSE(8, 10) -H264_BIWEIGHT_10_SSE(4, 10) - -av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, - const int chroma_format_idc) -{ -#if HAVE_YASM - int mm_flags = av_get_cpu_flags(); - - if (chroma_format_idc == 1 && EXTERNAL_MMXEXT(mm_flags)) - c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmxext; - - if (bit_depth == 8) { - if (EXTERNAL_MMX(mm_flags)) { - c->h264_idct_dc_add = - c->h264_idct_add = ff_h264_idct_add_8_mmx; - c->h264_idct8_dc_add = - c->h264_idct8_add = ff_h264_idct8_add_8_mmx; - - c->h264_idct_add16 = ff_h264_idct_add16_8_mmx; - c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx; - if (chroma_format_idc == 1) - c->h264_idct_add8 = ff_h264_idct_add8_8_mmx; - c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx; - if (mm_flags & AV_CPU_FLAG_CMOV) - c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx; - - if (EXTERNAL_MMXEXT(mm_flags)) { - c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmxext; - c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext; - c->h264_idct_add16 = ff_h264_idct_add16_8_mmxext; - c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmxext; - if (chroma_format_idc == 1) - c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext; - c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext; - - c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_mmxext; - c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext; - if (chroma_format_idc == 1) { - c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmxext; - c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext; - } -#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL - c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_mmxext; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmxext; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext; -#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */ - c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext; - c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext; - c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext; - - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext; - c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext; - - if (EXTERNAL_SSE2(mm_flags)) { - c->h264_idct8_add = ff_h264_idct8_add_8_sse2; - - c->h264_idct_add16 = ff_h264_idct_add16_8_sse2; - c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2; - if (chroma_format_idc == 1) - c->h264_idct_add8 = ff_h264_idct_add8_8_sse2; - c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2; - c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2; - - c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2; - c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2; - - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2; - - c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2; - } - if (EXTERNAL_SSSE3(mm_flags)) { - c->biweight_h264_pixels_tab[0] = 
ff_h264_biweight_16_ssse3; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3; - } - if (EXTERNAL_AVX(mm_flags)) { - c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx; - } - } - } - } else if (bit_depth == 10) { - if (EXTERNAL_MMX(mm_flags)) { - if (EXTERNAL_MMXEXT(mm_flags)) { -#if ARCH_X86_32 - c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext; - c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext; - c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext; -#endif /* ARCH_X86_32 */ - c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext; - if (EXTERNAL_SSE2(mm_flags)) { - c->h264_idct_add = ff_h264_idct_add_10_sse2; - c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2; - - c->h264_idct_add16 = ff_h264_idct_add16_10_sse2; - if (chroma_format_idc == 1) - c->h264_idct_add8 = ff_h264_idct_add8_10_sse2; - c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2; -#if HAVE_ALIGNED_STACK - c->h264_idct8_add = ff_h264_idct8_add_10_sse2; - c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2; -#endif /* HAVE_ALIGNED_STACK */ - - c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2; - c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2; - c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2; - - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2; - c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2; - - c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_sse2; - c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2; -#if HAVE_ALIGNED_STACK - c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2; -#endif /* HAVE_ALIGNED_STACK */ - } - if (EXTERNAL_SSE4(mm_flags)) { - c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4; - c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4; - c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4; - - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4; - c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4; - } - if (EXTERNAL_AVX(mm_flags)) { - c->h264_idct_dc_add = - c->h264_idct_add = ff_h264_idct_add_10_avx; - c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx; - - c->h264_idct_add16 = ff_h264_idct_add16_10_avx; - if (chroma_format_idc == 1) - c->h264_idct_add8 = ff_h264_idct_add8_10_avx; - c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx; -#if HAVE_ALIGNED_STACK - c->h264_idct8_add = ff_h264_idct8_add_10_avx; - c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx; -#endif /* HAVE_ALIGNED_STACK */ - - c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_avx; - c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx; -#if HAVE_ALIGNED_STACK - c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx; - 
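The pattern running through ff_h264dsp_init_x86 (and ff_h264chroma_init_x86 above) is that the checks go from least to most capable instruction set, so each successful EXTERNAL_*() test overwrites the function pointers installed by the previous one, the fastest supported version wins, and the 8-bit and 10-bit paths install separate tables. A minimal sketch of that dispatch shape; the typedef and stand-in functions are placeholders, only av_get_cpu_flags() and the EXTERNAL_*() macros are taken from the code above:

    #include <stdint.h>
    #include "libavutil/cpu.h"
    #include "libavutil/x86/cpu.h"

    typedef void (*idct_fn)(uint8_t *dst, int16_t *block, int stride);

    static void idct_add_c(uint8_t *d, int16_t *b, int s)    { /* C fallback */ }
    static void idct_add_mmx(uint8_t *d, int16_t *b, int s)  { /* stand-in   */ }
    static void idct_add_sse2(uint8_t *d, int16_t *b, int s) { /* stand-in   */ }

    static void example_init_x86(idct_fn *fn)
    {
        int flags = av_get_cpu_flags();
        *fn = idct_add_c;
        if (EXTERNAL_MMX(flags))  *fn = idct_add_mmx;   /* baseline SIMD         */
        if (EXTERNAL_SSE2(flags)) *fn = idct_add_sse2;  /* later checks override */
    }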
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx; -#endif /* HAVE_ALIGNED_STACK */ - } - } - } - } -#endif -} diff --git a/ffmpeg1/libavcodec/x86/hpeldsp.asm b/ffmpeg1/libavcodec/x86/hpeldsp.asm deleted file mode 100644 index 1a572a3..0000000 --- a/ffmpeg1/libavcodec/x86/hpeldsp.asm +++ /dev/null @@ -1,461 +0,0 @@ -;****************************************************************************** -;* -;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org> -;* Copyright (c) Nick Kurshev <nickols_k@mail.ru> -;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at> -;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz> -;* Copyright (c) 2013 Daniel Kang -;* -;* MMX optimized hpel functions -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA -cextern pb_1 - -SECTION_TEXT - -; put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro PUT_PIXELS8_X2 0 -cglobal put_pixels8_x2, 4,5 - lea r4, [r2*2] -.loop: - mova m0, [r1] - mova m1, [r1+r2] - PAVGB m0, [r1+1] - PAVGB m1, [r1+r2+1] - mova [r0], m0 - mova [r0+r2], m1 - add r1, r4 - add r0, r4 - mova m0, [r1] - mova m1, [r1+r2] - PAVGB m0, [r1+1] - PAVGB m1, [r1+r2+1] - add r1, r4 - mova [r0], m0 - mova [r0+r2], m1 - add r0, r4 - sub r3d, 4 - jne .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PUT_PIXELS8_X2 -INIT_MMX 3dnow -PUT_PIXELS8_X2 - - -; put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro PUT_PIXELS_16 0 -cglobal put_pixels16_x2, 4,5 - lea r4, [r2*2] -.loop: - mova m0, [r1] - mova m1, [r1+r2] - mova m2, [r1+8] - mova m3, [r1+r2+8] - PAVGB m0, [r1+1] - PAVGB m1, [r1+r2+1] - PAVGB m2, [r1+9] - PAVGB m3, [r1+r2+9] - mova [r0], m0 - mova [r0+r2], m1 - mova [r0+8], m2 - mova [r0+r2+8], m3 - add r1, r4 - add r0, r4 - mova m0, [r1] - mova m1, [r1+r2] - mova m2, [r1+8] - mova m3, [r1+r2+8] - PAVGB m0, [r1+1] - PAVGB m1, [r1+r2+1] - PAVGB m2, [r1+9] - PAVGB m3, [r1+r2+9] - add r1, r4 - mova [r0], m0 - mova [r0+r2], m1 - mova [r0+8], m2 - mova [r0+r2+8], m3 - add r0, r4 - sub r3d, 4 - jne .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PUT_PIXELS_16 -INIT_MMX 3dnow -PUT_PIXELS_16 - - -; put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro PUT_NO_RND_PIXELS8_X2 0 -cglobal put_no_rnd_pixels8_x2, 4,5 - mova m6, [pb_1] - lea r4, [r2*2] -.loop: - mova m0, [r1] - mova m2, [r1+r2] - mova m1, [r1+1] - mova m3, [r1+r2+1] - add r1, r4 - psubusb m0, m6 - psubusb m2, m6 - PAVGB m0, m1 - PAVGB m2, m3 - mova [r0], m0 - mova [r0+r2], m2 - mova m0, [r1] - mova m1, [r1+1] - mova m2, [r1+r2] - mova m3, [r1+r2+1] - add r0, r4 - add r1, r4 - psubusb m0, m6 - psubusb m2, m6 - 
PAVGB m0, m1 - PAVGB m2, m3 - mova [r0], m0 - mova [r0+r2], m2 - add r0, r4 - sub r3d, 4 - jne .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PUT_NO_RND_PIXELS8_X2 -INIT_MMX 3dnow -PUT_NO_RND_PIXELS8_X2 - - -; put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro PUT_NO_RND_PIXELS8_X2_EXACT 0 -cglobal put_no_rnd_pixels8_x2_exact, 4,5 - lea r4, [r2*3] - pcmpeqb m6, m6 -.loop: - mova m0, [r1] - mova m2, [r1+r2] - mova m1, [r1+1] - mova m3, [r1+r2+1] - pxor m0, m6 - pxor m2, m6 - pxor m1, m6 - pxor m3, m6 - PAVGB m0, m1 - PAVGB m2, m3 - pxor m0, m6 - pxor m2, m6 - mova [r0], m0 - mova [r0+r2], m2 - mova m0, [r1+r2*2] - mova m1, [r1+r2*2+1] - mova m2, [r1+r4] - mova m3, [r1+r4+1] - pxor m0, m6 - pxor m1, m6 - pxor m2, m6 - pxor m3, m6 - PAVGB m0, m1 - PAVGB m2, m3 - pxor m0, m6 - pxor m2, m6 - mova [r0+r2*2], m0 - mova [r0+r4], m2 - lea r1, [r1+r2*4] - lea r0, [r0+r2*4] - sub r3d, 4 - jg .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PUT_NO_RND_PIXELS8_X2_EXACT -INIT_MMX 3dnow -PUT_NO_RND_PIXELS8_X2_EXACT - - -; put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro PUT_PIXELS8_Y2 0 -cglobal put_pixels8_y2, 4,5 - lea r4, [r2*2] - mova m0, [r1] - sub r0, r2 -.loop: - mova m1, [r1+r2] - mova m2, [r1+r4] - add r1, r4 - PAVGB m0, m1 - PAVGB m1, m2 - mova [r0+r2], m0 - mova [r0+r4], m1 - mova m1, [r1+r2] - mova m0, [r1+r4] - add r0, r4 - add r1, r4 - PAVGB m2, m1 - PAVGB m1, m0 - mova [r0+r2], m2 - mova [r0+r4], m1 - add r0, r4 - sub r3d, 4 - jne .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PUT_PIXELS8_Y2 -INIT_MMX 3dnow -PUT_PIXELS8_Y2 - - -; put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro PUT_NO_RND_PIXELS8_Y2 0 -cglobal put_no_rnd_pixels8_y2, 4,5 - mova m6, [pb_1] - lea r4, [r2+r2] - mova m0, [r1] - sub r0, r2 -.loop: - mova m1, [r1+r2] - mova m2, [r1+r4] - add r1, r4 - psubusb m1, m6 - PAVGB m0, m1 - PAVGB m1, m2 - mova [r0+r2], m0 - mova [r0+r4], m1 - mova m1, [r1+r2] - mova m0, [r1+r4] - add r0, r4 - add r1, r4 - psubusb m1, m6 - PAVGB m2, m1 - PAVGB m1, m0 - mova [r0+r2], m2 - mova [r0+r4], m1 - add r0, r4 - sub r3d, 4 - jne .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PUT_NO_RND_PIXELS8_Y2 -INIT_MMX 3dnow -PUT_NO_RND_PIXELS8_Y2 - - -; put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0 -cglobal put_no_rnd_pixels8_y2_exact, 4,5 - lea r4, [r2*3] - mova m0, [r1] - pcmpeqb m6, m6 - add r1, r2 - pxor m0, m6 -.loop: - mova m1, [r1] - mova m2, [r1+r2] - pxor m1, m6 - pxor m2, m6 - PAVGB m0, m1 - PAVGB m1, m2 - pxor m0, m6 - pxor m1, m6 - mova [r0], m0 - mova [r0+r2], m1 - mova m1, [r1+r2*2] - mova m0, [r1+r4] - pxor m1, m6 - pxor m0, m6 - PAVGB m2, m1 - PAVGB m1, m0 - pxor m2, m6 - pxor m1, m6 - mova [r0+r2*2], m2 - mova [r0+r4], m1 - lea r1, [r1+r2*4] - lea r0, [r0+r2*4] - sub r3d, 4 - jg .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PUT_NO_RND_PIXELS8_Y2_EXACT -INIT_MMX 3dnow -PUT_NO_RND_PIXELS8_Y2_EXACT - - -; avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro AVG_PIXELS8 0 -cglobal avg_pixels8, 4,5 - lea r4, [r2*2] -.loop: - mova m0, [r0] - mova m1, [r0+r2] - PAVGB m0, [r1] - PAVGB m1, [r1+r2] - mova [r0], m0 - mova [r0+r2], m1 - add r1, r4 - add r0, r4 - mova m0, [r0] - mova m1, [r0+r2] - PAVGB m0, [r1] - PAVGB m1, [r1+r2] - add r1, r4 - mova [r0], m0 - mova [r0+r2], m1 - add r0, r4 - sub r3d, 4 - jne .loop - REP_RET -%endmacro - -INIT_MMX 
3dnow -AVG_PIXELS8 - - -; avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro AVG_PIXELS8_X2 0 -cglobal avg_pixels8_x2, 4,5 - lea r4, [r2*2] -.loop: - mova m0, [r1] - mova m2, [r1+r2] - PAVGB m0, [r1+1] - PAVGB m2, [r1+r2+1] - PAVGB m0, [r0] - PAVGB m2, [r0+r2] - add r1, r4 - mova [r0], m0 - mova [r0+r2], m2 - mova m0, [r1] - mova m2, [r1+r2] - PAVGB m0, [r1+1] - PAVGB m2, [r1+r2+1] - add r0, r4 - add r1, r4 - PAVGB m0, [r0] - PAVGB m2, [r0+r2] - mova [r0], m0 - mova [r0+r2], m2 - add r0, r4 - sub r3d, 4 - jne .loop - REP_RET -%endmacro - -INIT_MMX mmxext -AVG_PIXELS8_X2 -INIT_MMX 3dnow -AVG_PIXELS8_X2 - - -; avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro AVG_PIXELS8_Y2 0 -cglobal avg_pixels8_y2, 4,5 - lea r4, [r2*2] - mova m0, [r1] - sub r0, r2 -.loop: - mova m1, [r1+r2] - mova m2, [r1+r4] - add r1, r4 - PAVGB m0, m1 - PAVGB m1, m2 - mova m3, [r0+r2] - mova m4, [r0+r4] - PAVGB m0, m3 - PAVGB m1, m4 - mova [r0+r2], m0 - mova [r0+r4], m1 - mova m1, [r1+r2] - mova m0, [r1+r4] - PAVGB m2, m1 - PAVGB m1, m0 - add r0, r4 - add r1, r4 - mova m3, [r0+r2] - mova m4, [r0+r4] - PAVGB m2, m3 - PAVGB m1, m4 - mova [r0+r2], m2 - mova [r0+r4], m1 - add r0, r4 - sub r3d, 4 - jne .loop - REP_RET -%endmacro - -INIT_MMX mmxext -AVG_PIXELS8_Y2 -INIT_MMX 3dnow -AVG_PIXELS8_Y2 - - -; avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro AVG_PIXELS8_XY2 0 -cglobal avg_pixels8_xy2, 4,5 - mova m6, [pb_1] - lea r4, [r2*2] - mova m0, [r1] - pavgb m0, [r1+1] -.loop: - mova m2, [r1+r4] - mova m1, [r1+r2] - psubusb m2, m6 - pavgb m1, [r1+r2+1] - pavgb m2, [r1+r4+1] - add r1, r4 - pavgb m0, m1 - pavgb m1, m2 - pavgb m0, [r0] - pavgb m1, [r0+r2] - mova [r0], m0 - mova [r0+r2], m1 - mova m1, [r1+r2] - mova m0, [r1+r4] - pavgb m1, [r1+r2+1] - pavgb m0, [r1+r4+1] - add r0, r4 - add r1, r4 - pavgb m2, m1 - pavgb m1, m0 - pavgb m2, [r0] - pavgb m1, [r0+r2] - mova [r0], m2 - mova [r0+r2], m1 - add r0, r4 - sub r3d, 4 - jne .loop - REP_RET -%endmacro - -INIT_MMX mmxext -AVG_PIXELS8_XY2 -INIT_MMX 3dnow -AVG_PIXELS8_XY2 diff --git a/ffmpeg1/libavcodec/x86/hpeldsp_avg_template.c b/ffmpeg1/libavcodec/x86/hpeldsp_avg_template.c deleted file mode 100644 index b9a8f83..0000000 --- a/ffmpeg1/libavcodec/x86/hpeldsp_avg_template.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * DSP utils : average functions are compiled twice for 3dnow/mmxext - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2002-2004 Michael Niedermayer - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> - * and improved by Zdenek Kabelac <kabi@users.sf.net> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -//FIXME the following could be optimized too ... 
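The template that follows is included twice by hpeldsp_init.c further down, once with DEF(x) expanding to x ## _3dnow and once to x ## _mmxext, so every 16-pixel wrapper simply runs the matching 8-pixel kernel on both halves of the block. A minimal, self-contained sketch of that pattern is shown here; the names (AVG_TEMPLATE, SUFFIX_*, avg_pixels8/16) are invented for illustration, and a scalar loop stands in for the hand-written MMX/3DNow! kernels:

#include <stdint.h>
#include <stddef.h>

/* Hypothetical single-file sketch of the DEF() suffix trick: one body,
 * two instantiations.  In FFmpeg the body lives in a separate template
 * file that is #included twice; here it is a macro so the sketch is
 * self-contained, and a scalar loop stands in for the MMX kernel. */
#define AVG_TEMPLATE(DEF)                                                  \
static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels,       \
                             ptrdiff_t line_size, int h)                  \
{                                                                          \
    for (int y = 0; y < h; y++) {                                          \
        for (int x = 0; x < 8; x++)                                        \
            block[x] = (block[x] + pixels[x] + 1) >> 1; /* rounded avg */  \
        block  += line_size;                                               \
        pixels += line_size;                                               \
    }                                                                      \
}                                                                          \
static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels,      \
                              ptrdiff_t line_size, int h)                  \
{                                                                          \
    /* 16 wide = the 8-wide kernel on both halves, as in the wrappers */   \
    DEF(avg_pixels8)(block,     pixels,     line_size, h);                 \
    DEF(avg_pixels8)(block + 8, pixels + 8, line_size, h);                 \
}

#define SUFFIX_3DNOW(x)  x ## _3dnow
#define SUFFIX_MMXEXT(x) x ## _mmxext
AVG_TEMPLATE(SUFFIX_3DNOW)   /* emits avg_pixels8_3dnow,  avg_pixels16_3dnow  */
AVG_TEMPLATE(SUFFIX_MMXEXT)  /* emits avg_pixels8_mmxext, avg_pixels16_mmxext */

The real per-CPU kernels are the assembly routines above; the sketch only illustrates the name-suffix plumbing that the deleted template file below relies on.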
-static void DEF(ff_put_no_rnd_pixels16_x2)(uint8_t *block, - const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - DEF(ff_put_no_rnd_pixels8_x2)(block, pixels, line_size, h); - DEF(ff_put_no_rnd_pixels8_x2)(block + 8, pixels + 8, line_size, h); -} - -static void DEF(ff_put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - DEF(ff_put_pixels8_y2)(block, pixels, line_size, h); - DEF(ff_put_pixels8_y2)(block + 8, pixels + 8, line_size, h); -} - -static void DEF(ff_put_no_rnd_pixels16_y2)(uint8_t *block, - const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - DEF(ff_put_no_rnd_pixels8_y2)(block, pixels, line_size, h); - DEF(ff_put_no_rnd_pixels8_y2)(block + 8, pixels + 8, line_size, h); -} - -static void DEF(ff_avg_pixels16)(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - DEF(ff_avg_pixels8)(block, pixels, line_size, h); - DEF(ff_avg_pixels8)(block + 8, pixels + 8, line_size, h); -} - -static void DEF(ff_avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - DEF(ff_avg_pixels8_x2)(block, pixels, line_size, h); - DEF(ff_avg_pixels8_x2)(block + 8, pixels + 8, line_size, h); -} - -static void DEF(ff_avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - DEF(ff_avg_pixels8_y2)(block, pixels, line_size, h); - DEF(ff_avg_pixels8_y2)(block + 8, pixels + 8, line_size, h); -} - -static void DEF(ff_avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - DEF(ff_avg_pixels8_xy2)(block, pixels, line_size, h); - DEF(ff_avg_pixels8_xy2)(block + 8, pixels + 8, line_size, h); -} diff --git a/ffmpeg1/libavcodec/x86/hpeldsp_init.c b/ffmpeg1/libavcodec/x86/hpeldsp_init.c deleted file mode 100644 index 4b877b8..0000000 --- a/ffmpeg1/libavcodec/x86/hpeldsp_init.c +++ /dev/null @@ -1,415 +0,0 @@ -/* - * MMX optimized DSP utils - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - */ - -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavcodec/hpeldsp.h" -#include "dsputil_mmx.h" - -//#undef NDEBUG -//#include <assert.h> - -#if HAVE_YASM -void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block, - const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block, - const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block, - const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block, - const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -#endif /* HAVE_YASM */ - - -#if HAVE_INLINE_ASM - -#define JUMPALIGN() __asm__ volatile (".p2align 3"::) -#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::) - -#define MOVQ_BFE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%"#regd", %%"#regd" \n\t" \ - "paddb %%"#regd", %%"#regd" \n\t" ::) - -#ifndef PIC -#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone)) -#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo)) -#else -// for shared library it's better to use this way for accessing constants -// pcmpeqd -> -1 -#define MOVQ_BONE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%"#regd", %%"#regd" \n\t" \ - "psrlw $15, %%"#regd" \n\t" \ - "packuswb %%"#regd", %%"#regd" \n\t" ::) - -#define MOVQ_WTWO(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%"#regd", %%"#regd" \n\t" \ - "psrlw $15, %%"#regd" \n\t" \ - 
"psllw $1, %%"#regd" \n\t"::) - -#endif - -// using regr as temporary and for the output result -// first argument is unmodifed and second is trashed -// regfe is supposed to contain 0xfefefefefefefefe -#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ - "movq "#rega", "#regr" \n\t" \ - "pand "#regb", "#regr" \n\t" \ - "pxor "#rega", "#regb" \n\t" \ - "pand "#regfe", "#regb" \n\t" \ - "psrlq $1, "#regb" \n\t" \ - "paddb "#regb", "#regr" \n\t" - -#define PAVGB_MMX(rega, regb, regr, regfe) \ - "movq "#rega", "#regr" \n\t" \ - "por "#regb", "#regr" \n\t" \ - "pxor "#rega", "#regb" \n\t" \ - "pand "#regfe", "#regb" \n\t" \ - "psrlq $1, "#regb" \n\t" \ - "psubb "#regb", "#regr" \n\t" - -// mm6 is supposed to contain 0xfefefefefefefefe -#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ - "movq "#rega", "#regr" \n\t" \ - "movq "#regc", "#regp" \n\t" \ - "pand "#regb", "#regr" \n\t" \ - "pand "#regd", "#regp" \n\t" \ - "pxor "#rega", "#regb" \n\t" \ - "pxor "#regc", "#regd" \n\t" \ - "pand %%mm6, "#regb" \n\t" \ - "pand %%mm6, "#regd" \n\t" \ - "psrlq $1, "#regb" \n\t" \ - "psrlq $1, "#regd" \n\t" \ - "paddb "#regb", "#regr" \n\t" \ - "paddb "#regd", "#regp" \n\t" - -#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ - "movq "#rega", "#regr" \n\t" \ - "movq "#regc", "#regp" \n\t" \ - "por "#regb", "#regr" \n\t" \ - "por "#regd", "#regp" \n\t" \ - "pxor "#rega", "#regb" \n\t" \ - "pxor "#regc", "#regd" \n\t" \ - "pand %%mm6, "#regb" \n\t" \ - "pand %%mm6, "#regd" \n\t" \ - "psrlq $1, "#regd" \n\t" \ - "psrlq $1, "#regb" \n\t" \ - "psubb "#regb", "#regr" \n\t" \ - "psubb "#regd", "#regp" \n\t" - -/***********************************/ -/* MMX no rounding */ -#define NO_RND 1 -#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx -#define SET_RND MOVQ_WONE -#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) -#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) -#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e) - -#include "hpeldsp_rnd_template.c" - -#undef DEF -#undef SET_RND -#undef PAVGBP -#undef PAVGB -#undef NO_RND -/***********************************/ -/* MMX rounding */ - -#define DEF(x, y) x ## _ ## y ## _mmx -#define SET_RND MOVQ_WTWO -#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) -#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) - -#include "hpeldsp_rnd_template.c" - -#undef DEF -#undef SET_RND -#undef PAVGBP -#undef PAVGB -#undef OP_AVG - -#endif /* HAVE_INLINE_ASM */ - - -#if HAVE_YASM -#define ff_put_pixels8_mmx ff_put_pixels8_mmxext - -/***********************************/ -/* 3Dnow specific */ - -#define DEF(x) x ## _3dnow - -#include "hpeldsp_avg_template.c" - -#undef DEF - -/***********************************/ -/* MMXEXT specific */ - -#define DEF(x) x ## _mmxext - -#include "hpeldsp_avg_template.c" - -#undef DEF - -#endif /* HAVE_YASM */ - - -#if HAVE_INLINE_ASM -#define put_no_rnd_pixels16_mmx put_pixels16_mmx -#define put_no_rnd_pixels8_mmx put_pixels8_mmx -#define put_pixels16_mmxext put_pixels16_mmx -#define put_pixels8_mmxext put_pixels8_mmx -#define put_pixels4_mmxext put_pixels4_mmx -#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx -#define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx - -static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - __asm__ volatile ( - "lea (%3, %3), %%"REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add 
%%"REG_a", %2 \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r"(pixels), "+r"(block) - : "r"((x86_reg)line_size) - : "%"REG_a, "memory" - ); -} - -static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - __asm__ volatile ( - "lea (%3, %3), %%"REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq 8(%1 ), %%mm4 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1, %3), %%mm5 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm4, 8(%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq 8(%1 ), %%mm4 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1, %3), %%mm5 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm4, 8(%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r"(pixels), "+r"(block) - : "r"((x86_reg)line_size) - : "%"REG_a, "memory" - ); -} -#endif /* HAVE_INLINE_ASM */ - -void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); - -#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ - do { \ - c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ - c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \ - c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ - c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \ - } while (0) - -static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int mm_flags) -{ -#if HAVE_INLINE_ASM - SET_HPEL_FUNCS(put, [0], 16, mmx); - SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx); - SET_HPEL_FUNCS(avg, [0], 16, mmx); - SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx); - SET_HPEL_FUNCS(put, [1], 8, mmx); - SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx); - SET_HPEL_FUNCS(avg, [1], 8, mmx); -#endif /* HAVE_INLINE_ASM */ -} - -static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int mm_flags) -{ -#if HAVE_YASM - c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext; - c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext; - - c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext; - c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext; - c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext; - - c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext; - c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext; - - c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext; - c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext; - c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext; - - if (!(flags & CODEC_FLAG_BITEXACT)) { - c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext; - c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext; - c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext; - c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext; - - c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext; - c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext; - } -#endif /* HAVE_YASM */ - -#if HAVE_MMXEXT_EXTERNAL - if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) { - c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext; - c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext; - 
} -#endif /* HAVE_MMXEXT_EXTERNAL */ -} - -static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int mm_flags) -{ -#if HAVE_YASM - c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow; - c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow; - - c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow; - c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow; - c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow; - - c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow; - c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow; - - c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow; - c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow; - c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow; - - if (!(flags & CODEC_FLAG_BITEXACT)){ - c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow; - c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow; - c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow; - c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow; - - c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow; - c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow; - } - - if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) { - c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow; - c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow; - } -#endif /* HAVE_YASM */ -} - -static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int mm_flags) -{ -#if HAVE_SSE2_EXTERNAL - if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) { - // these functions are slower than mmx on AMD, but faster on Intel - c->put_pixels_tab[0][0] = ff_put_pixels16_sse2; - c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2; - c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2; - } -#endif /* HAVE_SSE2_EXTERNAL */ -} - -void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags) -{ - int mm_flags = av_get_cpu_flags(); - - if (mm_flags & AV_CPU_FLAG_MMX) - hpeldsp_init_mmx(c, flags, mm_flags); - - if (mm_flags & AV_CPU_FLAG_MMXEXT) - hpeldsp_init_mmxext(c, flags, mm_flags); - - if (mm_flags & AV_CPU_FLAG_3DNOW) - hpeldsp_init_3dnow(c, flags, mm_flags); - - if (mm_flags & AV_CPU_FLAG_SSE2) - hpeldsp_init_sse2(c, flags, mm_flags); -} diff --git a/ffmpeg1/libavcodec/x86/hpeldsp_rnd_template.c b/ffmpeg1/libavcodec/x86/hpeldsp_rnd_template.c deleted file mode 100644 index 07de675..0000000 --- a/ffmpeg1/libavcodec/x86/hpeldsp_rnd_template.c +++ /dev/null @@ -1,428 +0,0 @@ -/* - * DSP utils mmx functions are compiled twice for rnd/no_rnd - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at> - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> - * and improved by Zdenek Kabelac <kabi@users.sf.net> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -// put_pixels -static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :REG_a, "memory"); -} - -static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "movq 8(%1), %%mm0 \n\t" - "movq 9(%1), %%mm1 \n\t" - "movq 8(%1, %3), %%mm2 \n\t" - "movq 9(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, 8(%2) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "movq 8(%1), %%mm0 \n\t" - "movq 9(%1), %%mm1 \n\t" - "movq 8(%1, %3), %%mm2 \n\t" - "movq 9(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, 8(%2) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :REG_a, "memory"); -} - -static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"),%%mm2 \n\t" - PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"),%%mm0 \n\t" - PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :REG_a, "memory"); -} - -static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_ZERO(mm7); - SET_RND(mm6); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm4 \n\t" - "movq %%mm0, %%mm1 \n\t" 
- "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "xor %%"REG_a", %%"REG_a" \n\t" - "add %3, %1 \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddusw %%mm2, %%mm0 \n\t" - "paddusw %%mm3, %%mm1 \n\t" - "paddusw %%mm6, %%mm4 \n\t" - "paddusw %%mm6, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "psrlw $2, %%mm4 \n\t" - "psrlw $2, %%mm5 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "movq %%mm4, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 - "movq 1(%1, %%"REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm3, %%mm5 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm6, %%mm1 \n\t" - "paddusw %%mm4, %%mm0 \n\t" - "paddusw %%mm5, %%mm1 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "movq %%mm0, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "subl $2, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels) - :"D"(block), "r"((x86_reg)line_size) - :REG_a, "memory"); -} - -// avg_pixels -#ifndef NO_RND -// in case more speed is needed - unroling would certainly help -static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %0, %%mm0 \n\t" - "movq %1, %%mm1 \n\t" - OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, %0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } - while (--h); -} -#endif // NO_RND - -static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %0, %%mm0 \n\t" - "movq %1, %%mm1 \n\t" - OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, %0 \n\t" - "movq 8%0, %%mm0 \n\t" - "movq 8%1, %%mm1 \n\t" - OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, 8%0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } - while (--h); -} - -#ifndef NO_RND -static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %1, %%mm0 \n\t" - "movq 1%1, %%mm1 \n\t" - "movq %0, %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, %0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } while (--h); -} -#endif // NO_RND - -static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %1, %%mm0 \n\t" - "movq 1%1, %%mm1 \n\t" - "movq %0, %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, %0 \n\t" - "movq 8%1, %%mm0 \n\t" - "movq 9%1, %%mm1 \n\t" - "movq 8%0, %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - OP_AVG(%%mm3, 
%%mm2, %%mm0, %%mm6) - "movq %%mm0, 8%0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } while (--h); -} - -static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm2 \n\t" - PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) - "movq (%2), %%mm3 \n\t" - OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6) - "movq (%2, %3), %%mm3 \n\t" - OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6) - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) - "movq (%2), %%mm3 \n\t" - OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6) - "movq (%2, %3), %%mm3 \n\t" - OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6) - "movq %%mm2, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :REG_a, "memory"); -} - -// this routine is 'slightly' suboptimal but mostly unused -static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_ZERO(mm7); - SET_RND(mm6); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm4 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "xor %%"REG_a", %%"REG_a" \n\t" - "add %3, %1 \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddusw %%mm2, %%mm0 \n\t" - "paddusw %%mm3, %%mm1 \n\t" - "paddusw %%mm6, %%mm4 \n\t" - "paddusw %%mm6, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "psrlw $2, %%mm4 \n\t" - "psrlw $2, %%mm5 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "pcmpeqd %%mm2, %%mm2 \n\t" - "paddb %%mm2, %%mm2 \n\t" - OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2) - "movq %%mm5, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 - "movq 1(%1, %%"REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm3, %%mm5 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm6, %%mm1 \n\t" - "paddusw %%mm4, %%mm0 \n\t" - "paddusw %%mm5, %%mm1 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "pcmpeqd %%mm2, %%mm2 \n\t" - "paddb %%mm2, %%mm2 \n\t" - OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2) - "movq %%mm1, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "subl $2, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels) - :"D"(block), "r"((x86_reg)line_size) - :REG_a, "memory"); -} - -//FIXME optimize -static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, 
ptrdiff_t line_size, int h){ - DEF(put, pixels8_y2)(block , pixels , line_size, h); - DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); -} - -static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){ - DEF(put, pixels8_xy2)(block , pixels , line_size, h); - DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h); -} - -static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){ - DEF(avg, pixels8_y2)(block , pixels , line_size, h); - DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h); -} - -static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){ - DEF(avg, pixels8_xy2)(block , pixels , line_size, h); - DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h); -} diff --git a/ffmpeg1/libavcodec/x86/idct_mmx_xvid.c b/ffmpeg1/libavcodec/x86/idct_mmx_xvid.c deleted file mode 100644 index 5e9f405..0000000 --- a/ffmpeg1/libavcodec/x86/idct_mmx_xvid.c +++ /dev/null @@ -1,558 +0,0 @@ -/* - * XVID MPEG-4 VIDEO CODEC - * - MMX and XMM forward discrete cosine transform - - * - * Copyright(C) 2001 Peter Ross <pross@xvid.org> - * - * Originally provided by Intel at AP-922 - * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm - * (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm) - * but in a limited edition. - * New macro implements a column part for precise iDCT - * The routine precision now satisfies IEEE standard 1180-1990. - * - * Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru> - * Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org> - * - * http://www.elecard.com/peter/idct.html - * http://www.linuxvideo.org/mpeg2dec/ - * - * These examples contain code fragments for first stage iDCT 8x8 - * (for rows) and first stage DCT 8x8 (for columns) - * - * conversion to gcc syntax by Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with FFmpeg; if not, write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <inttypes.h> - -#include "config.h" -#include "libavcodec/avcodec.h" -#include "libavutil/mem.h" -#include "dsputil_mmx.h" -#include "idct_xvid.h" - -#if HAVE_INLINE_ASM - -//============================================================================= -// Macros and other preprocessor constants -//============================================================================= - -#define BITS_INV_ACC 5 // 4 or 5 for IEEE -#define SHIFT_INV_ROW (16 - BITS_INV_ACC) //11 -#define SHIFT_INV_COL (1 + BITS_INV_ACC) //6 -#define RND_INV_ROW (1024 * (6 - BITS_INV_ACC)) -#define RND_INV_COL (16 * (BITS_INV_ACC - 3)) -#define RND_INV_CORR (RND_INV_COL - 1) - -#define BITS_FRW_ACC 3 // 2 or 3 for accuracy -#define SHIFT_FRW_COL BITS_FRW_ACC -#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17) -#define RND_FRW_ROW (262144*(BITS_FRW_ACC - 1)) - - -//----------------------------------------------------------------------------- -// Various memory constants (trigonometric values or rounding values) -//----------------------------------------------------------------------------- - - -DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4*4] = { - 13036,13036,13036,13036, // tg * (2<<16) + 0.5 - 27146,27146,27146,27146, // tg * (2<<16) + 0.5 - -21746,-21746,-21746,-21746, // tg * (2<<16) + 0.5 - 23170,23170,23170,23170}; // cos * (2<<15) + 0.5 - -DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2*8] = { - 65536,65536, - 3597,3597, - 2260,2260, - 1203,1203, - 0,0, - 120,120, - 512,512, - 512,512}; - -//----------------------------------------------------------------------------- -// -// The first stage iDCT 8x8 - inverse DCTs of rows -// -//----------------------------------------------------------------------------- -// The 8-point inverse DCT direct algorithm -//----------------------------------------------------------------------------- -// -// static const short w[32] = { -// FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16), -// FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16), -// FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16), -// FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16), -// FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16), -// FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16), -// FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16), -// FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) }; -// -// #define DCT_8_INV_ROW(x, y) -// { -// int a0, a1, a2, a3, b0, b1, b2, b3; -// -// a0 =x[0]*w[0]+x[2]*w[1]+x[4]*w[2]+x[6]*w[3]; -// a1 =x[0]*w[4]+x[2]*w[5]+x[4]*w[6]+x[6]*w[7]; -// a2 = x[0] * w[ 8] + x[2] * w[ 9] + x[4] * w[10] + x[6] * w[11]; -// a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15]; -// b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19]; -// b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23]; -// b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27]; -// b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31]; -// -// y[0] = SHIFT_ROUND ( a0 + b0 ); -// y[1] = SHIFT_ROUND ( a1 + b1 ); -// y[2] = SHIFT_ROUND ( a2 + b2 ); -// y[3] = SHIFT_ROUND ( a3 + b3 ); -// y[4] = SHIFT_ROUND ( a3 - b3 ); -// y[5] = SHIFT_ROUND ( a2 - b2 ); -// y[6] = SHIFT_ROUND ( a1 - b1 ); -// y[7] = SHIFT_ROUND ( a0 - b0 ); -// } -// 
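Rewritten as plain C, the row pass in the pseudocode above is four even-index dot products (a0..a3), four odd-index dot products (b0..b3), and a rounded butterfly. A minimal scalar reference follows, assuming w is one 32-entry row table laid out as in the w[32] of the pseudocode (the tab_i_04_* tables below hold the same constants reordered for pmaddwd) and rnd/shift play the role of RND_INV_ROW and SHIFT_INV_ROW (11); the helper name is invented:

#include <stdint.h>

/* Scalar reference of the row pass quoted above (helper name and table
 * layout are illustrative, not part of the deleted file). */
static void idct8_row_ref(const int16_t x[8], int16_t y[8],
                          const int16_t w[32], int rnd, int shift)
{
    for (int i = 0; i < 4; i++) {
        int a = x[0] * w[4 * i + 0] + x[2] * w[4 * i + 1] +            /* even inputs */
                x[4] * w[4 * i + 2] + x[6] * w[4 * i + 3];
        int b = x[1] * w[16 + 4 * i + 0] + x[3] * w[16 + 4 * i + 1] +  /* odd inputs */
                x[5] * w[16 + 4 * i + 2] + x[7] * w[16 + 4 * i + 3];

        y[i]     = (int16_t)((a + b + rnd) >> shift);   /* y0..y3 */
        y[7 - i] = (int16_t)((a - b + rnd) >> shift);   /* y7..y4 */
    }
}

The MMX macro below computes the same eight dot products with pmaddwd, adds the rounder to the even accumulators before the butterfly (arithmetically the same as rounding both outputs), shifts right by 11 and packs the results back to 16 bits.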
-//----------------------------------------------------------------------------- -// -// In this implementation the outputs of the iDCT-1D are multiplied -// for rows 0,4 - by cos_4_16, -// for rows 1,7 - by cos_1_16, -// for rows 2,6 - by cos_2_16, -// for rows 3,5 - by cos_3_16 -// and are shifted to the left for better accuracy -// -// For the constants used, -// FIX(float_const) = (short) (float_const * (1<<15) + 0.5) -// -//----------------------------------------------------------------------------- - -//----------------------------------------------------------------------------- -// Tables for mmx processors -//----------------------------------------------------------------------------- - -// Table for rows 0,4 - constants are multiplied by cos_4_16 -DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmx)[32*4] = { - 16384,16384,16384,-16384, // movq-> w06 w04 w02 w00 - 21407,8867,8867,-21407, // w07 w05 w03 w01 - 16384,-16384,16384,16384, // w14 w12 w10 w08 - -8867,21407,-21407,-8867, // w15 w13 w11 w09 - 22725,12873,19266,-22725, // w22 w20 w18 w16 - 19266,4520,-4520,-12873, // w23 w21 w19 w17 - 12873,4520,4520,19266, // w30 w28 w26 w24 - -22725,19266,-12873,-22725, // w31 w29 w27 w25 -// Table for rows 1,7 - constants are multiplied by cos_1_16 - 22725,22725,22725,-22725, // movq-> w06 w04 w02 w00 - 29692,12299,12299,-29692, // w07 w05 w03 w01 - 22725,-22725,22725,22725, // w14 w12 w10 w08 - -12299,29692,-29692,-12299, // w15 w13 w11 w09 - 31521,17855,26722,-31521, // w22 w20 w18 w16 - 26722,6270,-6270,-17855, // w23 w21 w19 w17 - 17855,6270,6270,26722, // w30 w28 w26 w24 - -31521,26722,-17855,-31521, // w31 w29 w27 w25 -// Table for rows 2,6 - constants are multiplied by cos_2_16 - 21407,21407,21407,-21407, // movq-> w06 w04 w02 w00 - 27969,11585,11585,-27969, // w07 w05 w03 w01 - 21407,-21407,21407,21407, // w14 w12 w10 w08 - -11585,27969,-27969,-11585, // w15 w13 w11 w09 - 29692,16819,25172,-29692, // w22 w20 w18 w16 - 25172,5906,-5906,-16819, // w23 w21 w19 w17 - 16819,5906,5906,25172, // w30 w28 w26 w24 - -29692,25172,-16819,-29692, // w31 w29 w27 w25 -// Table for rows 3,5 - constants are multiplied by cos_3_16 - 19266,19266,19266,-19266, // movq-> w06 w04 w02 w00 - 25172,10426,10426,-25172, // w07 w05 w03 w01 - 19266,-19266,19266,19266, // w14 w12 w10 w08 - -10426,25172,-25172,-10426, // w15 w13 w11 w09 - 26722,15137,22654,-26722, // w22 w20 w18 w16 - 22654,5315,-5315,-15137, // w23 w21 w19 w17 - 15137,5315,5315,22654, // w30 w28 w26 w24 - -26722,22654,-15137,-26722, // w31 w29 w27 w25 -}; -//----------------------------------------------------------------------------- -// Tables for xmm processors -//----------------------------------------------------------------------------- - -// %3 for rows 0,4 - constants are multiplied by cos_4_16 -DECLARE_ALIGNED(8, static const int16_t, tab_i_04_xmm)[32*4] = { - 16384,21407,16384,8867, // movq-> w05 w04 w01 w00 - 16384,8867,-16384,-21407, // w07 w06 w03 w02 - 16384,-8867,16384,-21407, // w13 w12 w09 w08 - -16384,21407,16384,-8867, // w15 w14 w11 w10 - 22725,19266,19266,-4520, // w21 w20 w17 w16 - 12873,4520,-22725,-12873, // w23 w22 w19 w18 - 12873,-22725,4520,-12873, // w29 w28 w25 w24 - 4520,19266,19266,-22725, // w31 w30 w27 w26 -// %3 for rows 1,7 - constants are multiplied by cos_1_16 - 22725,29692,22725,12299, // movq-> w05 w04 w01 w00 - 22725,12299,-22725,-29692, // w07 w06 w03 w02 - 22725,-12299,22725,-29692, // w13 w12 w09 w08 - -22725,29692,22725,-12299, // w15 w14 w11 w10 - 31521,26722,26722,-6270, // w21 w20 w17 w16 - 
17855,6270,-31521,-17855, // w23 w22 w19 w18 - 17855,-31521,6270,-17855, // w29 w28 w25 w24 - 6270,26722,26722,-31521, // w31 w30 w27 w26 -// %3 for rows 2,6 - constants are multiplied by cos_2_16 - 21407,27969,21407,11585, // movq-> w05 w04 w01 w00 - 21407,11585,-21407,-27969, // w07 w06 w03 w02 - 21407,-11585,21407,-27969, // w13 w12 w09 w08 - -21407,27969,21407,-11585, // w15 w14 w11 w10 - 29692,25172,25172,-5906, // w21 w20 w17 w16 - 16819,5906,-29692,-16819, // w23 w22 w19 w18 - 16819,-29692,5906,-16819, // w29 w28 w25 w24 - 5906,25172,25172,-29692, // w31 w30 w27 w26 -// %3 for rows 3,5 - constants are multiplied by cos_3_16 - 19266,25172,19266,10426, // movq-> w05 w04 w01 w00 - 19266,10426,-19266,-25172, // w07 w06 w03 w02 - 19266,-10426,19266,-25172, // w13 w12 w09 w08 - -19266,25172,19266,-10426, // w15 w14 w11 w10 - 26722,22654,22654,-5315, // w21 w20 w17 w16 - 15137,5315,-26722,-15137, // w23 w22 w19 w18 - 15137,-26722,5315,-15137, // w29 w28 w25 w24 - 5315,22654,22654,-26722, // w31 w30 w27 w26 -}; -//============================================================================= -// Helper macros for the code -//============================================================================= - -//----------------------------------------------------------------------------- -// DCT_8_INV_ROW_MMX( INP, OUT, TABLE, ROUNDER -//----------------------------------------------------------------------------- - -#define DCT_8_INV_ROW_MMX(A1,A2,A3,A4)\ - "movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\ - "movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\ - "movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\ - "movq " #A3 ",%%mm3 \n\t"/* 3 ; w06 w04 w02 w00*/\ - "punpcklwd %%mm1,%%mm0 \n\t"/* x5 x1 x4 x0*/\ - "movq %%mm0,%%mm5 \n\t"/* 5 ; x5 x1 x4 x0*/\ - "punpckldq %%mm0,%%mm0 \n\t"/* x4 x0 x4 x0*/\ - "movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w05 w03 w01*/\ - "punpckhwd %%mm1,%%mm2 \n\t"/* 1 ; x7 x3 x6 x2*/\ - "pmaddwd %%mm0,%%mm3 \n\t"/* x4*w06+x0*w04 x4*w02+x0*w00*/\ - "movq %%mm2,%%mm6 \n\t"/* 6 ; x7 x3 x6 x2*/\ - "movq 32+" #A3 ",%%mm1 \n\t"/* 1 ; w22 w20 w18 w16*/\ - "punpckldq %%mm2,%%mm2 \n\t"/* x6 x2 x6 x2*/\ - "pmaddwd %%mm2,%%mm4 \n\t"/* x6*w07+x2*w05 x6*w03+x2*w01*/\ - "punpckhdq %%mm5,%%mm5 \n\t"/* x5 x1 x5 x1*/\ - "pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x4*w14+x0*w12 x4*w10+x0*w08*/\ - "punpckhdq %%mm6,%%mm6 \n\t"/* x7 x3 x7 x3*/\ - "movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w21 w19 w17*/\ - "pmaddwd %%mm5,%%mm1 \n\t"/* x5*w22+x1*w20 x5*w18+x1*w16*/\ - "paddd " #A4 ",%%mm3 \n\t"/* +%4*/\ - "pmaddwd %%mm6,%%mm7 \n\t"/* x7*w23+x3*w21 x7*w19+x3*w17*/\ - "pmaddwd 24+" #A3 ",%%mm2 \n\t"/* x6*w15+x2*w13 x6*w11+x2*w09*/\ - "paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\ - "pmaddwd 48+" #A3 ",%%mm5 \n\t"/* x5*w30+x1*w28 x5*w26+x1*w24*/\ - "movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\ - "pmaddwd 56+" #A3 ",%%mm6 \n\t"/* x7*w31+x3*w29 x7*w27+x3*w25*/\ - "paddd %%mm7,%%mm1 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\ - "paddd " #A4 ",%%mm0 \n\t"/* +%4*/\ - "psubd %%mm1,%%mm3 \n\t"/* a1-b1 a0-b0*/\ - "psrad $11,%%mm3 \n\t"/* y6=a1-b1 y7=a0-b0*/\ - "paddd %%mm4,%%mm1 \n\t"/* 4 ; a1+b1 a0+b0*/\ - "paddd %%mm2,%%mm0 \n\t"/* 2 ; a3=sum(even3) a2=sum(even2)*/\ - "psrad $11,%%mm1 \n\t"/* y1=a1+b1 y0=a0+b0*/\ - "paddd %%mm6,%%mm5 \n\t"/* 6 ; b3=sum(odd3) b2=sum(odd2)*/\ - "movq %%mm0,%%mm4 \n\t"/* 4 ; a3 a2*/\ - "paddd %%mm5,%%mm0 \n\t"/* a3+b3 a2+b2*/\ - "psubd %%mm5,%%mm4 \n\t"/* 5 ; a3-b3 a2-b2*/\ - "psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\ - "psrad $11,%%mm4 \n\t"/* y4=a3-b3 y5=a2-b2*/\ - "packssdw %%mm0,%%mm1 
\n\t"/* 0 ; y3 y2 y1 y0*/\ - "packssdw %%mm3,%%mm4 \n\t"/* 3 ; y6 y7 y4 y5*/\ - "movq %%mm4,%%mm7 \n\t"/* 7 ; y6 y7 y4 y5*/\ - "psrld $16,%%mm4 \n\t"/* 0 y6 0 y4*/\ - "pslld $16,%%mm7 \n\t"/* y7 0 y5 0*/\ - "movq %%mm1," #A2 " \n\t"/* 1 ; save y3 y2 y1 y0*/\ - "por %%mm4,%%mm7 \n\t"/* 4 ; y7 y6 y5 y4*/\ - "movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\ - - -//----------------------------------------------------------------------------- -// DCT_8_INV_ROW_XMM( INP, OUT, TABLE, ROUNDER -//----------------------------------------------------------------------------- - -#define DCT_8_INV_ROW_XMM(A1,A2,A3,A4)\ - "movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\ - "movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\ - "movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\ - "movq " #A3 ",%%mm3 \n\t"/* 3 ; w05 w04 w01 w00*/\ - "pshufw $0x88,%%mm0,%%mm0 \n\t"/* x2 x0 x2 x0*/\ - "movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w06 w03 w02*/\ - "movq %%mm1,%%mm5 \n\t"/* 5 ; x7 x6 x5 x4*/\ - "pmaddwd %%mm0,%%mm3 \n\t"/* x2*w05+x0*w04 x2*w01+x0*w00*/\ - "movq 32+" #A3 ",%%mm6 \n\t"/* 6 ; w21 w20 w17 w16*/\ - "pshufw $0x88,%%mm1,%%mm1 \n\t"/* x6 x4 x6 x4*/\ - "pmaddwd %%mm1,%%mm4 \n\t"/* x6*w07+x4*w06 x6*w03+x4*w02*/\ - "movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w22 w19 w18*/\ - "pshufw $0xdd,%%mm2,%%mm2 \n\t"/* x3 x1 x3 x1*/\ - "pmaddwd %%mm2,%%mm6 \n\t"/* x3*w21+x1*w20 x3*w17+x1*w16*/\ - "pshufw $0xdd,%%mm5,%%mm5 \n\t"/* x7 x5 x7 x5*/\ - "pmaddwd %%mm5,%%mm7 \n\t"/* x7*w23+x5*w22 x7*w19+x5*w18*/\ - "paddd " #A4 ",%%mm3 \n\t"/* +%4*/\ - "pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x2*w13+x0*w12 x2*w09+x0*w08*/\ - "paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\ - "pmaddwd 24+" #A3 ",%%mm1 \n\t"/* x6*w15+x4*w14 x6*w11+x4*w10*/\ - "movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\ - "pmaddwd 48+" #A3 ",%%mm2 \n\t"/* x3*w29+x1*w28 x3*w25+x1*w24*/\ - "paddd %%mm7,%%mm6 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\ - "pmaddwd 56+" #A3 ",%%mm5 \n\t"/* x7*w31+x5*w30 x7*w27+x5*w26*/\ - "paddd %%mm6,%%mm3 \n\t"/* a1+b1 a0+b0*/\ - "paddd " #A4 ",%%mm0 \n\t"/* +%4*/\ - "psrad $11,%%mm3 \n\t"/* y1=a1+b1 y0=a0+b0*/\ - "paddd %%mm1,%%mm0 \n\t"/* 1 ; a3=sum(even3) a2=sum(even2)*/\ - "psubd %%mm6,%%mm4 \n\t"/* 6 ; a1-b1 a0-b0*/\ - "movq %%mm0,%%mm7 \n\t"/* 7 ; a3 a2*/\ - "paddd %%mm5,%%mm2 \n\t"/* 5 ; b3=sum(odd3) b2=sum(odd2)*/\ - "paddd %%mm2,%%mm0 \n\t"/* a3+b3 a2+b2*/\ - "psrad $11,%%mm4 \n\t"/* y6=a1-b1 y7=a0-b0*/\ - "psubd %%mm2,%%mm7 \n\t"/* 2 ; a3-b3 a2-b2*/\ - "psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\ - "psrad $11,%%mm7 \n\t"/* y4=a3-b3 y5=a2-b2*/\ - "packssdw %%mm0,%%mm3 \n\t"/* 0 ; y3 y2 y1 y0*/\ - "packssdw %%mm4,%%mm7 \n\t"/* 4 ; y6 y7 y4 y5*/\ - "movq %%mm3, " #A2 " \n\t"/* 3 ; save y3 y2 y1 y0*/\ - "pshufw $0xb1,%%mm7,%%mm7 \n\t"/* y7 y6 y5 y4*/\ - "movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\ - - -//----------------------------------------------------------------------------- -// -// The first stage DCT 8x8 - forward DCTs of columns -// -// The %2puts are multiplied -// for rows 0,4 - on cos_4_16, -// for rows 1,7 - on cos_1_16, -// for rows 2,6 - on cos_2_16, -// for rows 3,5 - on cos_3_16 -// and are shifted to the left for rise of accuracy -// -//----------------------------------------------------------------------------- -// -// The 8-point scaled forward DCT algorithm (26a8m) -// -//----------------------------------------------------------------------------- -// -// #define DCT_8_FRW_COL(x, y) -//{ -// short t0, t1, t2, t3, t4, t5, t6, t7; -// short tp03, tm03, tp12, tm12, tp65, tm65; -// short tp465, tm465, tp765, tm765; 
-// -// t0 = LEFT_SHIFT ( x[0] + x[7] ); -// t1 = LEFT_SHIFT ( x[1] + x[6] ); -// t2 = LEFT_SHIFT ( x[2] + x[5] ); -// t3 = LEFT_SHIFT ( x[3] + x[4] ); -// t4 = LEFT_SHIFT ( x[3] - x[4] ); -// t5 = LEFT_SHIFT ( x[2] - x[5] ); -// t6 = LEFT_SHIFT ( x[1] - x[6] ); -// t7 = LEFT_SHIFT ( x[0] - x[7] ); -// -// tp03 = t0 + t3; -// tm03 = t0 - t3; -// tp12 = t1 + t2; -// tm12 = t1 - t2; -// -// y[0] = tp03 + tp12; -// y[4] = tp03 - tp12; -// -// y[2] = tm03 + tm12 * tg_2_16; -// y[6] = tm03 * tg_2_16 - tm12; -// -// tp65 =(t6 +t5 )*cos_4_16; -// tm65 =(t6 -t5 )*cos_4_16; -// -// tp765 = t7 + tp65; -// tm765 = t7 - tp65; -// tp465 = t4 + tm65; -// tm465 = t4 - tm65; -// -// y[1] = tp765 + tp465 * tg_1_16; -// y[7] = tp765 * tg_1_16 - tp465; -// y[5] = tm765 * tg_3_16 + tm465; -// y[3] = tm765 - tm465 * tg_3_16; -//} -// -//----------------------------------------------------------------------------- - -//----------------------------------------------------------------------------- -// DCT_8_INV_COL_4 INP,OUT -//----------------------------------------------------------------------------- - -#define DCT_8_INV_COL(A1,A2)\ - "movq 2*8(%3),%%mm0\n\t"\ - "movq 16*3+" #A1 ",%%mm3\n\t"\ - "movq %%mm0,%%mm1 \n\t"/* tg_3_16*/\ - "movq 16*5+" #A1 ",%%mm5\n\t"\ - "pmulhw %%mm3,%%mm0 \n\t"/* x3*(tg_3_16-1)*/\ - "movq (%3),%%mm4\n\t"\ - "pmulhw %%mm5,%%mm1 \n\t"/* x5*(tg_3_16-1)*/\ - "movq 16*7+" #A1 ",%%mm7\n\t"\ - "movq %%mm4,%%mm2 \n\t"/* tg_1_16*/\ - "movq 16*1+" #A1 ",%%mm6\n\t"\ - "pmulhw %%mm7,%%mm4 \n\t"/* x7*tg_1_16*/\ - "paddsw %%mm3,%%mm0 \n\t"/* x3*tg_3_16*/\ - "pmulhw %%mm6,%%mm2 \n\t"/* x1*tg_1_16*/\ - "paddsw %%mm3,%%mm1 \n\t"/* x3+x5*(tg_3_16-1)*/\ - "psubsw %%mm5,%%mm0 \n\t"/* x3*tg_3_16-x5 = tm35*/\ - "movq 3*8(%3),%%mm3\n\t"\ - "paddsw %%mm5,%%mm1 \n\t"/* x3+x5*tg_3_16 = tp35*/\ - "paddsw %%mm6,%%mm4 \n\t"/* x1+tg_1_16*x7 = tp17*/\ - "psubsw %%mm7,%%mm2 \n\t"/* x1*tg_1_16-x7 = tm17*/\ - "movq %%mm4,%%mm5 \n\t"/* tp17*/\ - "movq %%mm2,%%mm6 \n\t"/* tm17*/\ - "paddsw %%mm1,%%mm5 \n\t"/* tp17+tp35 = b0*/\ - "psubsw %%mm0,%%mm6 \n\t"/* tm17-tm35 = b3*/\ - "psubsw %%mm1,%%mm4 \n\t"/* tp17-tp35 = t1*/\ - "paddsw %%mm0,%%mm2 \n\t"/* tm17+tm35 = t2*/\ - "movq 1*8(%3),%%mm7\n\t"\ - "movq %%mm4,%%mm1 \n\t"/* t1*/\ - "movq %%mm5,3*16 +" #A2 "\n\t"/* save b0*/\ - "paddsw %%mm2,%%mm1 \n\t"/* t1+t2*/\ - "movq %%mm6,5*16 +" #A2 "\n\t"/* save b3*/\ - "psubsw %%mm2,%%mm4 \n\t"/* t1-t2*/\ - "movq 2*16+" #A1 ",%%mm5\n\t"\ - "movq %%mm7,%%mm0 \n\t"/* tg_2_16*/\ - "movq 6*16+" #A1 ",%%mm6\n\t"\ - "pmulhw %%mm5,%%mm0 \n\t"/* x2*tg_2_16*/\ - "pmulhw %%mm6,%%mm7 \n\t"/* x6*tg_2_16*/\ - "pmulhw %%mm3,%%mm1 \n\t"/* ocos_4_16*(t1+t2) = b1/2*/\ - "movq 0*16+" #A1 ",%%mm2\n\t"\ - "pmulhw %%mm3,%%mm4 \n\t"/* ocos_4_16*(t1-t2) = b2/2*/\ - "psubsw %%mm6,%%mm0 \n\t"/* t2*tg_2_16-x6 = tm26*/\ - "movq %%mm2,%%mm3 \n\t"/* x0*/\ - "movq 4*16+" #A1 ",%%mm6\n\t"\ - "paddsw %%mm5,%%mm7 \n\t"/* x2+x6*tg_2_16 = tp26*/\ - "paddsw %%mm6,%%mm2 \n\t"/* x0+x4 = tp04*/\ - "psubsw %%mm6,%%mm3 \n\t"/* x0-x4 = tm04*/\ - "movq %%mm2,%%mm5 \n\t"/* tp04*/\ - "movq %%mm3,%%mm6 \n\t"/* tm04*/\ - "psubsw %%mm7,%%mm2 \n\t"/* tp04-tp26 = a3*/\ - "paddsw %%mm0,%%mm3 \n\t"/* tm04+tm26 = a1*/\ - "paddsw %%mm1,%%mm1 \n\t"/* b1*/\ - "paddsw %%mm4,%%mm4 \n\t"/* b2*/\ - "paddsw %%mm7,%%mm5 \n\t"/* tp04+tp26 = a0*/\ - "psubsw %%mm0,%%mm6 \n\t"/* tm04-tm26 = a2*/\ - "movq %%mm3,%%mm7 \n\t"/* a1*/\ - "movq %%mm6,%%mm0 \n\t"/* a2*/\ - "paddsw %%mm1,%%mm3 \n\t"/* a1+b1*/\ - "paddsw %%mm4,%%mm6 \n\t"/* a2+b2*/\ - "psraw $6,%%mm3 \n\t"/* dst1*/\ - "psubsw 
%%mm1,%%mm7 \n\t"/* a1-b1*/\ - "psraw $6,%%mm6 \n\t"/* dst2*/\ - "psubsw %%mm4,%%mm0 \n\t"/* a2-b2*/\ - "movq 3*16+" #A2 ",%%mm1 \n\t"/* load b0*/\ - "psraw $6,%%mm7 \n\t"/* dst6*/\ - "movq %%mm5,%%mm4 \n\t"/* a0*/\ - "psraw $6,%%mm0 \n\t"/* dst5*/\ - "movq %%mm3,1*16+" #A2 "\n\t"\ - "paddsw %%mm1,%%mm5 \n\t"/* a0+b0*/\ - "movq %%mm6,2*16+" #A2 "\n\t"\ - "psubsw %%mm1,%%mm4 \n\t"/* a0-b0*/\ - "movq 5*16+" #A2 ",%%mm3 \n\t"/* load b3*/\ - "psraw $6,%%mm5 \n\t"/* dst0*/\ - "movq %%mm2,%%mm6 \n\t"/* a3*/\ - "psraw $6,%%mm4 \n\t"/* dst7*/\ - "movq %%mm0,5*16+" #A2 "\n\t"\ - "paddsw %%mm3,%%mm2 \n\t"/* a3+b3*/\ - "movq %%mm7,6*16+" #A2 "\n\t"\ - "psubsw %%mm3,%%mm6 \n\t"/* a3-b3*/\ - "movq %%mm5,0*16+" #A2 "\n\t"\ - "psraw $6,%%mm2 \n\t"/* dst3*/\ - "movq %%mm4,7*16+" #A2 "\n\t"\ - "psraw $6,%%mm6 \n\t"/* dst4*/\ - "movq %%mm2,3*16+" #A2 "\n\t"\ - "movq %%mm6,4*16+" #A2 "\n\t" - -//============================================================================= -// Code -//============================================================================= - -//----------------------------------------------------------------------------- -// void idct_mmx(uint16_t block[64]); -//----------------------------------------------------------------------------- - - -void ff_idct_xvid_mmx(short *block){ -__asm__ volatile( - //# Process each row - DCT_8_INV_ROW_MMX(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1)) - DCT_8_INV_ROW_MMX(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1)) - DCT_8_INV_ROW_MMX(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1)) - DCT_8_INV_ROW_MMX(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1)) - DCT_8_INV_ROW_MMX(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1)) - DCT_8_INV_ROW_MMX(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1)) - DCT_8_INV_ROW_MMX(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1)) - DCT_8_INV_ROW_MMX(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1)) - - //# Process the columns (4 at a time) - DCT_8_INV_COL(0(%0), 0(%0)) - DCT_8_INV_COL(8(%0), 8(%0)) - :: "r"(block), "r"(rounder_0), "r"(tab_i_04_mmx), "r"(tg_1_16)); -} - -//----------------------------------------------------------------------------- -// void idct_xmm(uint16_t block[64]); -//----------------------------------------------------------------------------- - - -void ff_idct_xvid_mmxext(short *block) -{ -__asm__ volatile( - //# Process each row - DCT_8_INV_ROW_XMM(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1)) - DCT_8_INV_ROW_XMM(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1)) - DCT_8_INV_ROW_XMM(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1)) - DCT_8_INV_ROW_XMM(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1)) - DCT_8_INV_ROW_XMM(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1)) - DCT_8_INV_ROW_XMM(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1)) - DCT_8_INV_ROW_XMM(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1)) - DCT_8_INV_ROW_XMM(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1)) - - //# Process the columns (4 at a time) - DCT_8_INV_COL(0(%0), 0(%0)) - DCT_8_INV_COL(8(%0), 8(%0)) - :: "r"(block), "r"(rounder_0), "r"(tab_i_04_xmm), "r"(tg_1_16)); -} - -void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, int16_t *block) -{ - ff_idct_xvid_mmx(block); - ff_put_pixels_clamped_mmx(block, dest, line_size); -} - -void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, int16_t *block) -{ - ff_idct_xvid_mmx(block); - ff_add_pixels_clamped_mmx(block, dest, line_size); -} - -void ff_idct_xvid_mmxext_put(uint8_t *dest, int line_size, int16_t *block) -{ - ff_idct_xvid_mmxext(block); - ff_put_pixels_clamped_mmx(block, dest, line_size); -} - -void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, int16_t *block) -{ - ff_idct_xvid_mmxext(block); - 
ff_add_pixels_clamped_mmx(block, dest, line_size); -} - -#endif /* HAVE_INLINE_ASM */ diff --git a/ffmpeg1/libavcodec/x86/idct_sse2_xvid.c b/ffmpeg1/libavcodec/x86/idct_sse2_xvid.c deleted file mode 100644 index b51466c..0000000 --- a/ffmpeg1/libavcodec/x86/idct_sse2_xvid.c +++ /dev/null @@ -1,407 +0,0 @@ -/* - * XVID MPEG-4 VIDEO CODEC - * - SSE2 inverse discrete cosine transform - - * - * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net> - * - * Conversion to gcc syntax with modifications - * by Alexander Strange <astrange@ithinksw.com> - * - * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid. - * - * This file is part of FFmpeg. - * - * Vertical pass is an implementation of the scheme: - * Loeffler C., Ligtenberg A., and Moschytz C.S.: - * Practical Fast 1D DCT Algorithm with Eleven Multiplications, - * Proc. ICASSP 1989, 988-991. - * - * Horizontal pass is a double 4x4 vector/matrix multiplication, - * (see also Intel's Application Note 922: - * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm - * Copyright (C) 1999 Intel Corporation) - * - * More details at http://skal.planet-d.net/coding/dct.html - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with FFmpeg; if not, write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/mem.h" -#include "libavutil/x86/asm.h" -#include "idct_xvid.h" -#include "dsputil_mmx.h" - -#if HAVE_INLINE_ASM - -/** - * @file - * @brief SSE2 idct compatible with xvidmmx - */ - -#define X8(x) x,x,x,x,x,x,x,x - -#define ROW_SHIFT 11 -#define COL_SHIFT 6 - -DECLARE_ASM_CONST(16, int16_t, tan1)[] = {X8(13036)}; // tan( pi/16) -DECLARE_ASM_CONST(16, int16_t, tan2)[] = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1 -DECLARE_ASM_CONST(16, int16_t, tan3)[] = {X8(43790)}; // tan(3pi/16)-1 -DECLARE_ASM_CONST(16, int16_t, sqrt2)[]= {X8(23170)}; // 0.5/sqrt(2) -DECLARE_ASM_CONST(8, uint8_t, m127)[] = {X8(127)}; - -DECLARE_ASM_CONST(16, int16_t, iTab1)[] = { - 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d, - 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61, - 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7, - 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b -}; - -DECLARE_ASM_CONST(16, int16_t, iTab2)[] = { - 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5, - 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04, - 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41, - 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df -}; - -DECLARE_ASM_CONST(16, int16_t, iTab3)[] = { - 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf, - 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf, - 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d, - 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04 -}; - -DECLARE_ASM_CONST(16, int16_t, iTab4)[] = { - 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 
0x4b42, 0xd746, - 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac, - 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df, - 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e -}; - -DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = { - 65536, 65536, 65536, 65536, - 3597, 3597, 3597, 3597, - 2260, 2260, 2260, 2260, - 1203, 1203, 1203, 1203, - 120, 120, 120, 120, - 512, 512, 512, 512 -}; - -// Temporary storage before the column pass -#define ROW1 "%%xmm6" -#define ROW3 "%%xmm4" -#define ROW5 "%%xmm5" -#define ROW7 "%%xmm7" - -#define CLEAR_ODD(r) "pxor "r","r" \n\t" -#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t" - -#if ARCH_X86_64 - -# define ROW0 "%%xmm8" -# define REG0 ROW0 -# define ROW2 "%%xmm9" -# define REG2 ROW2 -# define ROW4 "%%xmm10" -# define REG4 ROW4 -# define ROW6 "%%xmm11" -# define REG6 ROW6 -# define CLEAR_EVEN(r) CLEAR_ODD(r) -# define PUT_EVEN(dst) PUT_ODD(dst) -# define XMMS "%%xmm12" -# define MOV_32_ONLY "#" -# define SREG2 REG2 -# define TAN3 "%%xmm13" -# define TAN1 "%%xmm14" - -#else - -# define ROW0 "(%0)" -# define REG0 "%%xmm4" -# define ROW2 "2*16(%0)" -# define REG2 "%%xmm4" -# define ROW4 "4*16(%0)" -# define REG4 "%%xmm6" -# define ROW6 "6*16(%0)" -# define REG6 "%%xmm6" -# define CLEAR_EVEN(r) -# define PUT_EVEN(dst) \ - "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \ - "movdqa %%xmm2, "dst" \n\t" -# define XMMS "%%xmm2" -# define MOV_32_ONLY "movdqa " -# define SREG2 "%%xmm7" -# define TAN3 "%%xmm0" -# define TAN1 "%%xmm2" - -#endif - -#define ROUND(x) "paddd "MANGLE(x) - -#define JZ(reg, to) \ - "testl "reg","reg" \n\t" \ - "jz "to" \n\t" - -#define JNZ(reg, to) \ - "testl "reg","reg" \n\t" \ - "jnz "to" \n\t" - -#define TEST_ONE_ROW(src, reg, clear) \ - clear \ - "movq "src", %%mm1 \n\t" \ - "por 8+"src", %%mm1 \n\t" \ - "paddusb %%mm0, %%mm1 \n\t" \ - "pmovmskb %%mm1, "reg" \n\t" - -#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \ - clear1 \ - clear2 \ - "movq "row1", %%mm1 \n\t" \ - "por 8+"row1", %%mm1 \n\t" \ - "movq "row2", %%mm2 \n\t" \ - "por 8+"row2", %%mm2 \n\t" \ - "paddusb %%mm0, %%mm1 \n\t" \ - "paddusb %%mm0, %%mm2 \n\t" \ - "pmovmskb %%mm1, "reg1" \n\t" \ - "pmovmskb %%mm2, "reg2" \n\t" - -///IDCT pass on rows. -#define iMTX_MULT(src, table, rounder, put) \ - "movdqa "src", %%xmm3 \n\t" \ - "movdqa %%xmm3, %%xmm0 \n\t" \ - "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \ - "punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \ - "pmaddwd "table", %%xmm0 \n\t" \ - "pmaddwd 16+"table", %%xmm1 \n\t" \ - "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \ - "punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \ - "pmaddwd 32+"table", %%xmm2 \n\t" \ - "pmaddwd 48+"table", %%xmm3 \n\t" \ - "paddd %%xmm1, %%xmm0 \n\t" \ - "paddd %%xmm3, %%xmm2 \n\t" \ - rounder", %%xmm0 \n\t" \ - "movdqa %%xmm2, %%xmm3 \n\t" \ - "paddd %%xmm0, %%xmm2 \n\t" \ - "psubd %%xmm3, %%xmm0 \n\t" \ - "psrad $11, %%xmm2 \n\t" \ - "psrad $11, %%xmm0 \n\t" \ - "packssdw %%xmm0, %%xmm2 \n\t" \ - put \ - "1: \n\t" - -#define iLLM_HEAD \ - "movdqa "MANGLE(tan3)", "TAN3" \n\t" \ - "movdqa "MANGLE(tan1)", "TAN1" \n\t" \ - -///IDCT pass on columns. 
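The iMTX_MULT row pass above and the iLLM_PASS column pass defined next together implement a fixed-point separable 8x8 inverse DCT: the row pass is a matrix multiply whose results keep ROW_SHIFT = 11 fractional bits, and the column pass is a Loeffler-style butterfly network with a final COL_SHIFT = 6 downshift, as the file's header comment describes. A plain floating-point sketch of the same separable transform, illustrative only and not taken from the file:

#include <math.h>

/* Illustrative scalar reference (assumes the standard MPEG/JPEG 8x8 IDCT
 * definition): row pass, then column pass, each a 1-D IDCT.  The SSE2 code
 * computes a fixed-point version of this, not this exact routine. */
static void idct8x8_float_ref(double blk[8][8])
{
    static const double pi = 3.14159265358979323846;
    double tmp[8][8];

    for (int y = 0; y < 8; y++)          /* row pass */
        for (int x = 0; x < 8; x++) {
            double s = 0.0;
            for (int u = 0; u < 8; u++) {
                double cu = u ? 1.0 : 1.0 / sqrt(2.0);
                s += cu * blk[y][u] * cos((2 * x + 1) * u * pi / 16.0);
            }
            tmp[y][x] = s / 2.0;
        }

    for (int x = 0; x < 8; x++)          /* column pass */
        for (int y = 0; y < 8; y++) {
            double s = 0.0;
            for (int v = 0; v < 8; v++) {
                double cv = v ? 1.0 : 1.0 / sqrt(2.0);
                s += cv * tmp[v][x] * cos((2 * y + 1) * v * pi / 16.0);
            }
            blk[y][x] = s / 2.0;
        }
}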
-#define iLLM_PASS(dct) \ - "movdqa "TAN3", %%xmm1 \n\t" \ - "movdqa "TAN1", %%xmm3 \n\t" \ - "pmulhw %%xmm4, "TAN3" \n\t" \ - "pmulhw %%xmm5, %%xmm1 \n\t" \ - "paddsw %%xmm4, "TAN3" \n\t" \ - "paddsw %%xmm5, %%xmm1 \n\t" \ - "psubsw %%xmm5, "TAN3" \n\t" \ - "paddsw %%xmm4, %%xmm1 \n\t" \ - "pmulhw %%xmm7, %%xmm3 \n\t" \ - "pmulhw %%xmm6, "TAN1" \n\t" \ - "paddsw %%xmm6, %%xmm3 \n\t" \ - "psubsw %%xmm7, "TAN1" \n\t" \ - "movdqa %%xmm3, %%xmm7 \n\t" \ - "movdqa "TAN1", %%xmm6 \n\t" \ - "psubsw %%xmm1, %%xmm3 \n\t" \ - "psubsw "TAN3", "TAN1" \n\t" \ - "paddsw %%xmm7, %%xmm1 \n\t" \ - "paddsw %%xmm6, "TAN3" \n\t" \ - "movdqa %%xmm3, %%xmm6 \n\t" \ - "psubsw "TAN3", %%xmm3 \n\t" \ - "paddsw %%xmm6, "TAN3" \n\t" \ - "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \ - "pmulhw %%xmm4, %%xmm3 \n\t" \ - "pmulhw %%xmm4, "TAN3" \n\t" \ - "paddsw "TAN3", "TAN3" \n\t" \ - "paddsw %%xmm3, %%xmm3 \n\t" \ - "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \ - MOV_32_ONLY ROW2", "REG2" \n\t" \ - MOV_32_ONLY ROW6", "REG6" \n\t" \ - "movdqa %%xmm7, %%xmm5 \n\t" \ - "pmulhw "REG6", %%xmm7 \n\t" \ - "pmulhw "REG2", %%xmm5 \n\t" \ - "paddsw "REG2", %%xmm7 \n\t" \ - "psubsw "REG6", %%xmm5 \n\t" \ - MOV_32_ONLY ROW0", "REG0" \n\t" \ - MOV_32_ONLY ROW4", "REG4" \n\t" \ - MOV_32_ONLY" "TAN1", (%0) \n\t" \ - "movdqa "REG0", "XMMS" \n\t" \ - "psubsw "REG4", "REG0" \n\t" \ - "paddsw "XMMS", "REG4" \n\t" \ - "movdqa "REG4", "XMMS" \n\t" \ - "psubsw %%xmm7, "REG4" \n\t" \ - "paddsw "XMMS", %%xmm7 \n\t" \ - "movdqa "REG0", "XMMS" \n\t" \ - "psubsw %%xmm5, "REG0" \n\t" \ - "paddsw "XMMS", %%xmm5 \n\t" \ - "movdqa %%xmm5, "XMMS" \n\t" \ - "psubsw "TAN3", %%xmm5 \n\t" \ - "paddsw "XMMS", "TAN3" \n\t" \ - "movdqa "REG0", "XMMS" \n\t" \ - "psubsw %%xmm3, "REG0" \n\t" \ - "paddsw "XMMS", %%xmm3 \n\t" \ - MOV_32_ONLY" (%0), "TAN1" \n\t" \ - "psraw $6, %%xmm5 \n\t" \ - "psraw $6, "REG0" \n\t" \ - "psraw $6, "TAN3" \n\t" \ - "psraw $6, %%xmm3 \n\t" \ - "movdqa "TAN3", 1*16("dct") \n\t" \ - "movdqa %%xmm3, 2*16("dct") \n\t" \ - "movdqa "REG0", 5*16("dct") \n\t" \ - "movdqa %%xmm5, 6*16("dct") \n\t" \ - "movdqa %%xmm7, %%xmm0 \n\t" \ - "movdqa "REG4", %%xmm4 \n\t" \ - "psubsw %%xmm1, %%xmm7 \n\t" \ - "psubsw "TAN1", "REG4" \n\t" \ - "paddsw %%xmm0, %%xmm1 \n\t" \ - "paddsw %%xmm4, "TAN1" \n\t" \ - "psraw $6, %%xmm1 \n\t" \ - "psraw $6, %%xmm7 \n\t" \ - "psraw $6, "TAN1" \n\t" \ - "psraw $6, "REG4" \n\t" \ - "movdqa %%xmm1, ("dct") \n\t" \ - "movdqa "TAN1", 3*16("dct") \n\t" \ - "movdqa "REG4", 4*16("dct") \n\t" \ - "movdqa %%xmm7, 7*16("dct") \n\t" - -///IDCT pass on columns, assuming rows 4-7 are zero. 
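The sparse variant defined next exists because quantised blocks very often carry only low-frequency coefficients: ff_idct_xvid_sse2() below tests each input row with por/paddusb/pmovmskb and, when rows 4-7 are all zero, runs iLLM_PASS_SPARSE so the column pass can skip their contribution. A minimal sketch of that zero-row test, illustrative only and not code from the file:

#include <stdint.h>

/* Illustrative equivalent of the TEST_ONE_ROW idea: OR the eight
 * coefficients of a row together and check whether anything is set.
 * The real macro does this 8 bytes at a time in MMX registers. */
static int row_is_nonzero(const int16_t row[8])
{
    int acc = 0;
    for (int i = 0; i < 8; i++)
        acc |= row[i];
    return acc != 0;
}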
-#define iLLM_PASS_SPARSE(dct) \ - "pmulhw %%xmm4, "TAN3" \n\t" \ - "paddsw %%xmm4, "TAN3" \n\t" \ - "movdqa %%xmm6, %%xmm3 \n\t" \ - "pmulhw %%xmm6, "TAN1" \n\t" \ - "movdqa %%xmm4, %%xmm1 \n\t" \ - "psubsw %%xmm1, %%xmm3 \n\t" \ - "paddsw %%xmm6, %%xmm1 \n\t" \ - "movdqa "TAN1", %%xmm6 \n\t" \ - "psubsw "TAN3", "TAN1" \n\t" \ - "paddsw %%xmm6, "TAN3" \n\t" \ - "movdqa %%xmm3, %%xmm6 \n\t" \ - "psubsw "TAN3", %%xmm3 \n\t" \ - "paddsw %%xmm6, "TAN3" \n\t" \ - "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \ - "pmulhw %%xmm4, %%xmm3 \n\t" \ - "pmulhw %%xmm4, "TAN3" \n\t" \ - "paddsw "TAN3", "TAN3" \n\t" \ - "paddsw %%xmm3, %%xmm3 \n\t" \ - "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \ - MOV_32_ONLY ROW2", "SREG2" \n\t" \ - "pmulhw "SREG2", %%xmm5 \n\t" \ - MOV_32_ONLY ROW0", "REG0" \n\t" \ - "movdqa "REG0", %%xmm6 \n\t" \ - "psubsw "SREG2", %%xmm6 \n\t" \ - "paddsw "REG0", "SREG2" \n\t" \ - MOV_32_ONLY" "TAN1", (%0) \n\t" \ - "movdqa "REG0", "XMMS" \n\t" \ - "psubsw %%xmm5, "REG0" \n\t" \ - "paddsw "XMMS", %%xmm5 \n\t" \ - "movdqa %%xmm5, "XMMS" \n\t" \ - "psubsw "TAN3", %%xmm5 \n\t" \ - "paddsw "XMMS", "TAN3" \n\t" \ - "movdqa "REG0", "XMMS" \n\t" \ - "psubsw %%xmm3, "REG0" \n\t" \ - "paddsw "XMMS", %%xmm3 \n\t" \ - MOV_32_ONLY" (%0), "TAN1" \n\t" \ - "psraw $6, %%xmm5 \n\t" \ - "psraw $6, "REG0" \n\t" \ - "psraw $6, "TAN3" \n\t" \ - "psraw $6, %%xmm3 \n\t" \ - "movdqa "TAN3", 1*16("dct") \n\t" \ - "movdqa %%xmm3, 2*16("dct") \n\t" \ - "movdqa "REG0", 5*16("dct") \n\t" \ - "movdqa %%xmm5, 6*16("dct") \n\t" \ - "movdqa "SREG2", %%xmm0 \n\t" \ - "movdqa %%xmm6, %%xmm4 \n\t" \ - "psubsw %%xmm1, "SREG2" \n\t" \ - "psubsw "TAN1", %%xmm6 \n\t" \ - "paddsw %%xmm0, %%xmm1 \n\t" \ - "paddsw %%xmm4, "TAN1" \n\t" \ - "psraw $6, %%xmm1 \n\t" \ - "psraw $6, "SREG2" \n\t" \ - "psraw $6, "TAN1" \n\t" \ - "psraw $6, %%xmm6 \n\t" \ - "movdqa %%xmm1, ("dct") \n\t" \ - "movdqa "TAN1", 3*16("dct") \n\t" \ - "movdqa %%xmm6, 4*16("dct") \n\t" \ - "movdqa "SREG2", 7*16("dct") \n\t" - -inline void ff_idct_xvid_sse2(short *block) -{ - __asm__ volatile( - "movq "MANGLE(m127)", %%mm0 \n\t" - iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0)) - iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1)) - iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2)) - - TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4)) - JZ("%%eax", "1f") - iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3)) - - TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6)) - TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7)) - iLLM_HEAD - ".p2align 4 \n\t" - JNZ("%%ecx", "2f") - JNZ("%%eax", "3f") - JNZ("%%edx", "4f") - JNZ("%%esi", "5f") - iLLM_PASS_SPARSE("%0") - "jmp 6f \n\t" - "2: \n\t" - iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4)) - "3: \n\t" - iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5)) - JZ("%%edx", "1f") - "4: \n\t" - iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6)) - JZ("%%esi", "1f") - "5: \n\t" - iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7)) -#if ARCH_X86_32 - iLLM_HEAD -#endif - iLLM_PASS("%0") - "6: \n\t" - : "+r"(block) - : - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , - "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" ,) -#if ARCH_X86_64 - XMM_CLOBBERS("%xmm8" , "%xmm9" , "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14",) -#endif - 
"%eax", "%ecx", "%edx", "%esi", "memory" - ); -} - -void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block) -{ - ff_idct_xvid_sse2(block); - ff_put_pixels_clamped_mmx(block, dest, line_size); -} - -void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block) -{ - ff_idct_xvid_sse2(block); - ff_add_pixels_clamped_mmx(block, dest, line_size); -} - -#endif /* HAVE_INLINE_ASM */ diff --git a/ffmpeg1/libavcodec/x86/idct_xvid.h b/ffmpeg1/libavcodec/x86/idct_xvid.h deleted file mode 100644 index 7a2847b..0000000 --- a/ffmpeg1/libavcodec/x86/idct_xvid.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * XVID MPEG-4 VIDEO CODEC - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * header for Xvid IDCT functions - */ - -#ifndef AVCODEC_X86_IDCT_XVID_H -#define AVCODEC_X86_IDCT_XVID_H - -#include <stdint.h> - -void ff_idct_xvid_mmx(short *block); -void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, int16_t *block); -void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, int16_t *block); - -void ff_idct_xvid_mmxext(short *block); -void ff_idct_xvid_mmxext_put(uint8_t *dest, int line_size, int16_t *block); -void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, int16_t *block); - -void ff_idct_xvid_sse2(short *block); -void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block); -void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block); - -#endif /* AVCODEC_X86_IDCT_XVID_H */ diff --git a/ffmpeg1/libavcodec/x86/imdct36.asm b/ffmpeg1/libavcodec/x86/imdct36.asm deleted file mode 100644 index d311fbe..0000000 --- a/ffmpeg1/libavcodec/x86/imdct36.asm +++ /dev/null @@ -1,724 +0,0 @@ -;****************************************************************************** -;* 36 point SSE-optimized IMDCT transform -;* Copyright (c) 2011 Vitor Sessak -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -align 16 -ps_mask: dd 0, ~0, ~0, ~0 -ps_mask2: dd 0, ~0, 0, ~0 -ps_mask3: dd 0, 0, 0, ~0 -ps_mask4: dd 0, ~0, 0, 0 - -ps_val1: dd -0.5, -0.5, -0.8660254038, -0.8660254038 -ps_val2: dd 1.0, 1.0, 0.8660254038, 0.8660254038 -ps_val3: dd 0.1736481777, 0.1736481777, 0.3420201433, 0.3420201433 -ps_val4: dd -0.7660444431, -0.7660444431, 0.8660254038, 0.8660254038 -ps_val5: dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530 -ps_val6: dd 0.5, 0.5, -0.6427876097, -0.6427876097 -ps_val7: dd 1.0, 1.0, -0.6427876097, -0.6427876097 - -ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 -ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000 - -ps_cosh: dd 1.0, 0.50190991877167369479, 1.0, 5.73685662283492756461 - dd 1.0, 0.51763809020504152469, 1.0, 1.93185165257813657349 - dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896 - dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991 - dd 1.0, 0.70710678118654752439, 0.0, 0.0 - -ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461 - dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349 - dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896 - dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991 - dd 1.0, 0.70710678118654752439, 0.0, 0.0 - -costabs: times 4 dd 0.98480773 - times 4 dd 0.93969262 - times 4 dd 0.86602539 - times 4 dd -0.76604444 - times 4 dd -0.64278764 - times 4 dd 0.50000000 - times 4 dd -0.50000000 - times 4 dd -0.34202015 - times 4 dd -0.17364818 - times 4 dd 0.50190992 - times 4 dd 0.51763808 - times 4 dd 0.55168896 - times 4 dd 0.61038726 - times 4 dd 0.70710677 - times 4 dd 0.87172341 - times 4 dd 1.18310082 - times 4 dd 1.93185163 - times 4 dd 5.73685646 - -%define SBLIMIT 32 -SECTION_TEXT - -%macro PSHUFD 3 -%if cpuflag(sse2) && notcpuflag(avx) - pshufd %1, %2, %3 -%else - shufps %1, %2, %2, %3 -%endif -%endmacro - -; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4} -; output %1={x3,x4,y1,y2} -%macro BUILDINVHIGHLOW 3 -%if cpuflag(avx) - shufps %1, %2, %3, 0x4e -%else - movlhps %1, %3 - movhlps %1, %2 -%endif -%endmacro - -; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4} -; output %1={x4,y1,y2,y3} -%macro ROTLEFT 3 -%if cpuflag(ssse3) - palignr %1, %3, %2, 12 -%else - BUILDINVHIGHLOW %1, %2, %3 - shufps %1, %1, %3, 0x99 -%endif -%endmacro - -%macro INVERTHL 2 -%if cpuflag(sse2) - PSHUFD %1, %2, 0x4e -%else - movhlps %1, %2 - movlhps %1, %2 -%endif -%endmacro - -%macro BUTTERF 3 - INVERTHL %2, %1 - xorps %1, [ps_p1p1m1m1] - addps %1, %2 -%if cpuflag(sse3) - mulps %1, %1, [ps_cosh_sse3 + %3] - PSHUFD %2, %1, 0xb1 - addsubps %1, %1, %2 -%else - mulps %1, [ps_cosh + %3] - PSHUFD %2, %1, 0xb1 - xorps %1, [ps_p1m1p1m1] - addps %1, %2 -%endif -%endmacro - -%macro STORE 4 - movhlps %2, %1 - movss [%3 ], %1 - movss [%3 + 2*%4], %2 - shufps %1, %1, 0xb1 - movss [%3 + %4], %1 - movhlps %2, %1 - movss [%3 + 3*%4], %2 -%endmacro - -%macro LOAD 4 - movlps %1, [%3 ] - movhps %1, [%3 + %4] - movlps %2, [%3 + 2*%4] - movhps %2, [%3 + 3*%4] - shufps %1, %2, 0x88 -%endmacro - -%macro LOADA64 2 -%if cpuflag(avx) - movu %1, [%2] -%else - movlps %1, [%2] - movhps %1, [%2 + 8] -%endif -%endmacro - -%macro DEFINE_IMDCT 0 -cglobal imdct36_float, 4,4,9, out, 
buf, in, win - - ; for(i=17;i>=1;i--) in[i] += in[i-1]; - LOADA64 m0, inq - LOADA64 m1, inq + 16 - - ROTLEFT m5, m0, m1 - - PSHUFD m6, m0, 0x93 - andps m6, m6, [ps_mask] - addps m0, m0, m6 - - LOADA64 m2, inq + 32 - - ROTLEFT m7, m1, m2 - - addps m1, m1, m5 - LOADA64 m3, inq + 48 - - ROTLEFT m5, m2, m3 - - xorps m4, m4, m4 - movlps m4, [inq+64] - BUILDINVHIGHLOW m6, m3, m4 - shufps m6, m6, m4, 0xa9 - - addps m4, m4, m6 - addps m2, m2, m7 - addps m3, m3, m5 - - ; for(i=17;i>=3;i-=2) in[i] += in[i-2]; - movlhps m5, m5, m0 - andps m5, m5, [ps_mask3] - - BUILDINVHIGHLOW m7, m0, m1 - andps m7, m7, [ps_mask2] - - addps m0, m0, m5 - - BUILDINVHIGHLOW m6, m1, m2 - andps m6, m6, [ps_mask2] - - addps m1, m1, m7 - - BUILDINVHIGHLOW m7, m2, m3 - andps m7, m7, [ps_mask2] - - addps m2, m2, m6 - - movhlps m6, m6, m3 - andps m6, m6, [ps_mask4] - - addps m3, m3, m7 - addps m4, m4, m6 - - ; Populate tmp[] - movlhps m6, m1, m5 ; zero out high values - subps m6, m6, m4 - - subps m5, m0, m3 - -%if ARCH_X86_64 - SWAP m5, m8 -%endif - - mulps m7, m2, [ps_val1] - -%if ARCH_X86_64 - mulps m5, m8, [ps_val2] -%else - mulps m5, m5, [ps_val2] -%endif - addps m7, m7, m5 - - mulps m5, m6, [ps_val1] - subps m7, m7, m5 - -%if ARCH_X86_64 - SWAP m5, m8 -%else - subps m5, m0, m3 -%endif - - subps m5, m5, m6 - addps m5, m5, m2 - - shufps m6, m4, m3, 0xe4 - subps m6, m6, m2 - mulps m6, m6, [ps_val3] - - addps m4, m4, m1 - mulps m4, m4, [ps_val4] - - shufps m1, m1, m0, 0xe4 - addps m1, m1, m2 - mulps m1, m1, [ps_val5] - - mulps m3, m3, [ps_val6] - mulps m0, m0, [ps_val7] - addps m0, m0, m3 - - xorps m2, m1, [ps_p1p1m1m1] - subps m2, m2, m4 - addps m2, m2, m0 - - addps m3, m4, m0 - subps m3, m3, m6 - xorps m3, m3, [ps_p1p1m1m1] - - shufps m0, m0, m4, 0xe4 - subps m0, m0, m1 - addps m0, m0, m6 - - BUILDINVHIGHLOW m4, m2, m3 - shufps m3, m3, m2, 0x4e - - ; we have tmp = {SwAPLH(m0), SwAPLH(m7), m3, m4, m5} - - BUTTERF m0, m1, 0 - BUTTERF m7, m2, 16 - BUTTERF m3, m6, 32 - BUTTERF m4, m1, 48 - - mulps m5, m5, [ps_cosh + 64] - PSHUFD m1, m5, 0xe1 - xorps m5, m5, [ps_p1m1p1m1] - addps m5, m5, m1 - - ; permutates: - ; m0 0 1 2 3 => 2 6 10 14 m1 - ; m7 4 5 6 7 => 3 7 11 15 m2 - ; m3 8 9 10 11 => 17 13 9 5 m3 - ; m4 12 13 14 15 => 16 12 8 4 m5 - ; m5 16 17 xx xx => 0 1 xx xx m0 - - unpckhps m1, m0, m7 - unpckhps m6, m3, m4 - movhlps m2, m6, m1 - movlhps m1, m1, m6 - - unpcklps m5, m5, m4 - unpcklps m3, m3, m7 - movhlps m4, m3, m5 - movlhps m5, m5, m3 - SWAP m4, m3 - ; permutation done - - PSHUFD m6, m2, 0xb1 - movss m4, [bufq + 4*68] - movss m7, [bufq + 4*64] - unpcklps m7, m7, m4 - mulps m6, m6, [winq + 16*4] - addps m6, m6, m7 - movss [outq + 64*SBLIMIT], m6 - shufps m6, m6, m6, 0xb1 - movss [outq + 68*SBLIMIT], m6 - - mulps m6, m3, [winq + 4*4] - LOAD m4, m7, bufq + 4*16, 16 - addps m6, m6, m4 - STORE m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT - - shufps m4, m0, m3, 0xb5 - mulps m4, m4, [winq + 8*4] - LOAD m7, m6, bufq + 4*32, 16 - addps m4, m4, m7 - STORE m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT - - shufps m3, m3, m2, 0xb1 - mulps m3, m3, [winq + 12*4] - LOAD m7, m6, bufq + 4*48, 16 - addps m3, m3, m7 - STORE m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT - - mulps m2, m2, [winq] - LOAD m6, m7, bufq, 16 - addps m2, m2, m6 - STORE m2, m7, outq, 4*SBLIMIT - - mulps m4, m1, [winq + 20*4] - STORE m4, m7, bufq, 16 - - mulps m3, m5, [winq + 24*4] - STORE m3, m7, bufq + 4*16, 16 - - shufps m0, m0, m5, 0xb0 - mulps m0, m0, [winq + 28*4] - STORE m0, m7, bufq + 4*32, 16 - - shufps m5, m5, m1, 0xb1 - mulps m5, m5, [winq + 32*4] - STORE m5, m7, bufq + 4*48, 16 - - 
shufps m1, m1, m1, 0xb1 - mulps m1, m1, [winq + 36*4] - movss [bufq + 4*64], m1 - shufps m1, m1, 0xb1 - movss [bufq + 4*68], m1 - RET -%endmacro - -INIT_XMM sse -DEFINE_IMDCT - -INIT_XMM sse2 -DEFINE_IMDCT - -INIT_XMM sse3 -DEFINE_IMDCT - -INIT_XMM ssse3 -DEFINE_IMDCT - -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -DEFINE_IMDCT -%endif - -INIT_XMM sse - -%if ARCH_X86_64 -%define SPILL SWAP -%define UNSPILL SWAP -%define SPILLED(x) m %+ x -%else -%define SPILLED(x) [tmpq+(x-8)*16 + 32*4] -%macro SPILL 2 ; xmm#, mempos - movaps SPILLED(%2), m%1 -%endmacro -%macro UNSPILL 2 - movaps m%1, SPILLED(%2) -%endmacro -%endif - -%macro DEFINE_FOUR_IMDCT 0 -cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp - movlps m0, [inq+64] - movhps m0, [inq+64 + 72] - movlps m3, [inq+64 + 2*72] - movhps m3, [inq+64 + 3*72] - - shufps m5, m0, m3, 0xdd - shufps m0, m0, m3, 0x88 - - mova m1, [inq+48] - movu m6, [inq+48 + 72] - mova m7, [inq+48 + 2*72] - movu m3, [inq+48 + 3*72] - - TRANSPOSE4x4PS 1, 6, 7, 3, 4 - - addps m4, m6, m7 - mova [tmpq+4*28], m4 - - addps m7, m3 - addps m6, m1 - addps m3, m0 - addps m0, m5 - addps m0, m7 - addps m7, m6 - mova [tmpq+4*12], m7 - SPILL 3, 12 - - mova m4, [inq+32] - movu m5, [inq+32 + 72] - mova m2, [inq+32 + 2*72] - movu m7, [inq+32 + 3*72] - - TRANSPOSE4x4PS 4, 5, 2, 7, 3 - - addps m1, m7 - SPILL 1, 11 - - addps m3, m5, m2 - SPILL 3, 13 - - addps m7, m2 - addps m5, m4 - addps m6, m7 - mova [tmpq], m6 - addps m7, m5 - mova [tmpq+4*16], m7 - - mova m2, [inq+16] - movu m7, [inq+16 + 72] - mova m1, [inq+16 + 2*72] - movu m6, [inq+16 + 3*72] - - TRANSPOSE4x4PS 2, 7, 1, 6, 3 - - addps m4, m6 - addps m6, m1 - addps m1, m7 - addps m7, m2 - addps m5, m6 - SPILL 5, 15 - addps m6, m7 - mulps m6, [costabs + 16*2] - mova [tmpq+4*8], m6 - SPILL 1, 10 - SPILL 0, 14 - - mova m1, [inq] - movu m6, [inq + 72] - mova m3, [inq + 2*72] - movu m5, [inq + 3*72] - - TRANSPOSE4x4PS 1, 6, 3, 5, 0 - - addps m2, m5 - addps m5, m3 - addps m7, m5 - addps m3, m6 - addps m6, m1 - SPILL 7, 8 - addps m5, m6 - SPILL 6, 9 - addps m6, m4, SPILLED(12) - subps m6, m2 - UNSPILL 7, 11 - SPILL 5, 11 - subps m5, m1, m7 - mulps m7, [costabs + 16*5] - addps m7, m1 - mulps m0, m6, [costabs + 16*6] - addps m0, m5 - mova [tmpq+4*24], m0 - addps m6, m5 - mova [tmpq+4*4], m6 - addps m6, m4, m2 - mulps m6, [costabs + 16*1] - subps m4, SPILLED(12) - mulps m4, [costabs + 16*8] - addps m2, SPILLED(12) - mulps m2, [costabs + 16*3] - subps m5, m7, m6 - subps m5, m2 - addps m6, m7 - addps m6, m4 - addps m7, m2 - subps m7, m4 - mova [tmpq+4*20], m7 - mova m2, [tmpq+4*28] - mova [tmpq+4*28], m5 - UNSPILL 7, 13 - subps m5, m7, m2 - mulps m5, [costabs + 16*7] - UNSPILL 1, 10 - mulps m1, [costabs + 16*2] - addps m4, m3, m2 - mulps m4, [costabs + 16*4] - addps m2, m7 - addps m7, m3 - mulps m7, [costabs] - subps m3, m2 - mulps m3, [costabs + 16*2] - addps m2, m7, m5 - addps m2, m1 - SPILL 2, 10 - addps m7, m4 - subps m7, m1 - SPILL 7, 12 - subps m5, m4 - subps m5, m1 - UNSPILL 0, 14 - SPILL 5, 13 - addps m1, m0, SPILLED(15) - subps m1, SPILLED(8) - mova m4, [costabs + 16*5] - mulps m4, [tmpq] - UNSPILL 2, 9 - addps m4, m2 - subps m2, [tmpq] - mulps m5, m1, [costabs + 16*6] - addps m5, m2 - SPILL 5, 9 - addps m2, m1 - SPILL 2, 14 - UNSPILL 5, 15 - subps m7, m5, m0 - addps m5, SPILLED(8) - mulps m5, [costabs + 16*1] - mulps m7, [costabs + 16*8] - addps m0, SPILLED(8) - mulps m0, [costabs + 16*3] - subps m2, m4, m5 - subps m2, m0 - SPILL 2, 15 - addps m5, m4 - addps m5, m7 - addps m4, m0 - subps m4, m7 - SPILL 4, 8 - mova m7, [tmpq+4*16] - 
mova m2, [tmpq+4*12] - addps m0, m7, m2 - subps m0, SPILLED(11) - mulps m0, [costabs + 16*2] - addps m4, m7, SPILLED(11) - mulps m4, [costabs] - subps m7, m2 - mulps m7, [costabs + 16*7] - addps m2, SPILLED(11) - mulps m2, [costabs + 16*4] - addps m1, m7, [tmpq+4*8] - addps m1, m4 - addps m4, m2 - subps m4, [tmpq+4*8] - SPILL 4, 11 - subps m7, m2 - subps m7, [tmpq+4*8] - addps m4, m6, SPILLED(10) - subps m6, SPILLED(10) - addps m2, m5, m1 - mulps m2, [costabs + 16*9] - subps m5, m1 - mulps m5, [costabs + 16*17] - subps m1, m4, m2 - addps m4, m2 - mulps m2, m1, [winq+4*36] - addps m2, [bufq+4*36] - mova [outq+1152], m2 - mulps m1, [winq+4*32] - addps m1, [bufq+4*32] - mova [outq+1024], m1 - mulps m1, m4, [winq+4*116] - mova [bufq+4*36], m1 - mulps m4, [winq+4*112] - mova [bufq+4*32], m4 - addps m2, m6, m5 - subps m6, m5 - mulps m1, m6, [winq+4*68] - addps m1, [bufq+4*68] - mova [outq+2176], m1 - mulps m6, [winq] - addps m6, [bufq] - mova [outq], m6 - mulps m1, m2, [winq+4*148] - mova [bufq+4*68], m1 - mulps m2, [winq+4*80] - mova [bufq], m2 - addps m5, m3, [tmpq+4*24] - mova m2, [tmpq+4*24] - subps m2, m3 - mova m1, SPILLED(9) - subps m1, m0 - mulps m1, [costabs + 16*10] - addps m0, SPILLED(9) - mulps m0, [costabs + 16*16] - addps m6, m5, m1 - subps m5, m1 - mulps m3, m5, [winq+4*40] - addps m3, [bufq+4*40] - mova [outq+1280], m3 - mulps m5, [winq+4*28] - addps m5, [bufq+4*28] - mova [outq+896], m5 - mulps m1, m6, [winq+4*120] - mova [bufq+4*40], m1 - mulps m6, [winq+4*108] - mova [bufq+4*28], m6 - addps m1, m2, m0 - subps m2, m0 - mulps m5, m2, [winq+4*64] - addps m5, [bufq+4*64] - mova [outq+2048], m5 - mulps m2, [winq+4*4] - addps m2, [bufq+4*4] - mova [outq+128], m2 - mulps m0, m1, [winq+4*144] - mova [bufq+4*64], m0 - mulps m1, [winq+4*84] - mova [bufq+4*4], m1 - mova m1, [tmpq+4*28] - mova m5, m1 - addps m1, SPILLED(13) - subps m5, SPILLED(13) - UNSPILL 3, 15 - addps m2, m7, m3 - mulps m2, [costabs + 16*11] - subps m3, m7 - mulps m3, [costabs + 16*15] - addps m0, m2, m1 - subps m1, m2 - SWAP m0, m2 - mulps m6, m1, [winq+4*44] - addps m6, [bufq+4*44] - mova [outq+1408], m6 - mulps m1, [winq+4*24] - addps m1, [bufq+4*24] - mova [outq+768], m1 - mulps m0, m2, [winq+4*124] - mova [bufq+4*44], m0 - mulps m2, [winq+4*104] - mova [bufq+4*24], m2 - addps m0, m5, m3 - subps m5, m3 - mulps m1, m5, [winq+4*60] - addps m1, [bufq+4*60] - mova [outq+1920], m1 - mulps m5, [winq+4*8] - addps m5, [bufq+4*8] - mova [outq+256], m5 - mulps m1, m0, [winq+4*140] - mova [bufq+4*60], m1 - mulps m0, [winq+4*88] - mova [bufq+4*8], m0 - mova m1, [tmpq+4*20] - addps m1, SPILLED(12) - mova m2, [tmpq+4*20] - subps m2, SPILLED(12) - UNSPILL 7, 8 - subps m0, m7, SPILLED(11) - addps m7, SPILLED(11) - mulps m4, m7, [costabs + 16*12] - mulps m0, [costabs + 16*14] - addps m5, m1, m4 - subps m1, m4 - mulps m7, m1, [winq+4*48] - addps m7, [bufq+4*48] - mova [outq+1536], m7 - mulps m1, [winq+4*20] - addps m1, [bufq+4*20] - mova [outq+640], m1 - mulps m1, m5, [winq+4*128] - mova [bufq+4*48], m1 - mulps m5, [winq+4*100] - mova [bufq+4*20], m5 - addps m6, m2, m0 - subps m2, m0 - mulps m1, m2, [winq+4*56] - addps m1, [bufq+4*56] - mova [outq+1792], m1 - mulps m2, [winq+4*12] - addps m2, [bufq+4*12] - mova [outq+384], m2 - mulps m0, m6, [winq+4*136] - mova [bufq+4*56], m0 - mulps m6, [winq+4*92] - mova [bufq+4*12], m6 - UNSPILL 0, 14 - mulps m0, [costabs + 16*13] - mova m3, [tmpq+4*4] - addps m2, m0, m3 - subps m3, m0 - mulps m0, m3, [winq+4*52] - addps m0, [bufq+4*52] - mova [outq+1664], m0 - mulps m3, [winq+4*16] - addps m3, 
[bufq+4*16] - mova [outq+512], m3 - mulps m0, m2, [winq+4*132] - mova [bufq+4*52], m0 - mulps m2, [winq+4*96] - mova [bufq+4*16], m2 - RET -%endmacro - -INIT_XMM sse -DEFINE_FOUR_IMDCT - -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -DEFINE_FOUR_IMDCT -%endif diff --git a/ffmpeg1/libavcodec/x86/lpc.c b/ffmpeg1/libavcodec/x86/lpc.c deleted file mode 100644 index 1962212..0000000 --- a/ffmpeg1/libavcodec/x86/lpc.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * MMX optimized LPC DSP utils - * Copyright (c) 2007 Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/x86/asm.h" -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavcodec/lpc.h" - -#if HAVE_SSE2_INLINE - -static void lpc_apply_welch_window_sse2(const int32_t *data, int len, - double *w_data) -{ - double c = 2.0 / (len-1.0); - int n2 = len>>1; - x86_reg i = -n2*sizeof(int32_t); - x86_reg j = n2*sizeof(int32_t); - __asm__ volatile( - "movsd %4, %%xmm7 \n\t" - "movapd "MANGLE(ff_pd_1)", %%xmm6 \n\t" - "movapd "MANGLE(ff_pd_2)", %%xmm5 \n\t" - "movlhps %%xmm7, %%xmm7 \n\t" - "subpd %%xmm5, %%xmm7 \n\t" - "addsd %%xmm6, %%xmm7 \n\t" - "test $1, %5 \n\t" - "jz 2f \n\t" -#define WELCH(MOVPD, offset)\ - "1: \n\t"\ - "movapd %%xmm7, %%xmm1 \n\t"\ - "mulpd %%xmm1, %%xmm1 \n\t"\ - "movapd %%xmm6, %%xmm0 \n\t"\ - "subpd %%xmm1, %%xmm0 \n\t"\ - "pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\ - "cvtpi2pd (%3,%0), %%xmm2 \n\t"\ - "cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\ - "mulpd %%xmm0, %%xmm2 \n\t"\ - "mulpd %%xmm1, %%xmm3 \n\t"\ - "movapd %%xmm2, (%2,%0,2) \n\t"\ - MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\ - "subpd %%xmm5, %%xmm7 \n\t"\ - "sub $8, %1 \n\t"\ - "add $8, %0 \n\t"\ - "jl 1b \n\t"\ - - WELCH("movupd", -1) - "jmp 3f \n\t" - "2: \n\t" - WELCH("movapd", -2) - "3: \n\t" - :"+&r"(i), "+&r"(j) - :"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len) - XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm5", "%xmm6", "%xmm7") - ); -#undef WELCH -} - -static void lpc_compute_autocorr_sse2(const double *data, int len, int lag, - double *autoc) -{ - int j; - - if((x86_reg)data & 15) - data++; - - for(j=0; j<lag; j+=2){ - x86_reg i = -len*sizeof(double); - if(j == lag-2) { - __asm__ volatile( - "movsd "MANGLE(ff_pd_1)", %%xmm0 \n\t" - "movsd "MANGLE(ff_pd_1)", %%xmm1 \n\t" - "movsd "MANGLE(ff_pd_1)", %%xmm2 \n\t" - "1: \n\t" - "movapd (%2,%0), %%xmm3 \n\t" - "movupd -8(%3,%0), %%xmm4 \n\t" - "movapd (%3,%0), %%xmm5 \n\t" - "mulpd %%xmm3, %%xmm4 \n\t" - "mulpd %%xmm3, %%xmm5 \n\t" - "mulpd -16(%3,%0), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm1 \n\t" - "addpd %%xmm5, %%xmm0 \n\t" - "addpd %%xmm3, %%xmm2 \n\t" - "add $16, %0 \n\t" - "jl 1b \n\t" - "movhlps %%xmm0, %%xmm3 \n\t" - "movhlps %%xmm1, %%xmm4 \n\t" - "movhlps %%xmm2, %%xmm5 \n\t" - "addsd %%xmm3, %%xmm0 \n\t" - "addsd %%xmm4, %%xmm1 \n\t" - 
"addsd %%xmm5, %%xmm2 \n\t" - "movsd %%xmm0, (%1) \n\t" - "movsd %%xmm1, 8(%1) \n\t" - "movsd %%xmm2, 16(%1) \n\t" - :"+&r"(i) - :"r"(autoc+j), "r"(data+len), "r"(data+len-j) - :"memory" - ); - } else { - __asm__ volatile( - "movsd "MANGLE(ff_pd_1)", %%xmm0 \n\t" - "movsd "MANGLE(ff_pd_1)", %%xmm1 \n\t" - "1: \n\t" - "movapd (%3,%0), %%xmm3 \n\t" - "movupd -8(%4,%0), %%xmm4 \n\t" - "mulpd %%xmm3, %%xmm4 \n\t" - "mulpd (%4,%0), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm1 \n\t" - "addpd %%xmm3, %%xmm0 \n\t" - "add $16, %0 \n\t" - "jl 1b \n\t" - "movhlps %%xmm0, %%xmm3 \n\t" - "movhlps %%xmm1, %%xmm4 \n\t" - "addsd %%xmm3, %%xmm0 \n\t" - "addsd %%xmm4, %%xmm1 \n\t" - "movsd %%xmm0, %1 \n\t" - "movsd %%xmm1, %2 \n\t" - :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]) - :"r"(data+len), "r"(data+len-j) - ); - } - } -} - -#endif /* HAVE_SSE2_INLINE */ - -av_cold void ff_lpc_init_x86(LPCContext *c) -{ -#if HAVE_SSE2_INLINE - int mm_flags = av_get_cpu_flags(); - - if (mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) { - c->lpc_apply_welch_window = lpc_apply_welch_window_sse2; - c->lpc_compute_autocorr = lpc_compute_autocorr_sse2; - } -#endif /* HAVE_SSE2_INLINE */ -} diff --git a/ffmpeg1/libavcodec/x86/mathops.h b/ffmpeg1/libavcodec/x86/mathops.h deleted file mode 100644 index 79e29e6..0000000 --- a/ffmpeg1/libavcodec/x86/mathops.h +++ /dev/null @@ -1,130 +0,0 @@ -/* - * simple math operations - * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_MATHOPS_H -#define AVCODEC_X86_MATHOPS_H - -#include "config.h" -#include "libavutil/common.h" - -#if HAVE_INLINE_ASM - -#if ARCH_X86_32 - -#define MULL MULL -static av_always_inline av_const int MULL(int a, int b, unsigned shift) -{ - int rt, dummy; - __asm__ ( - "imull %3 \n\t" - "shrdl %4, %%edx, %%eax \n\t" - :"=a"(rt), "=d"(dummy) - :"a"(a), "rm"(b), "ci"((uint8_t)shift) - ); - return rt; -} - -#define MULH MULH -static av_always_inline av_const int MULH(int a, int b) -{ - int rt, dummy; - __asm__ ( - "imull %3" - :"=d"(rt), "=a"(dummy) - :"a"(a), "rm"(b) - ); - return rt; -} - -#define MUL64 MUL64 -static av_always_inline av_const int64_t MUL64(int a, int b) -{ - int64_t rt; - __asm__ ( - "imull %2" - :"=A"(rt) - :"a"(a), "rm"(b) - ); - return rt; -} - -#endif /* ARCH_X86_32 */ - -#if HAVE_CMOV -/* median of 3 */ -#define mid_pred mid_pred -static inline av_const int mid_pred(int a, int b, int c) -{ - int i=b; - __asm__ volatile( - "cmp %2, %1 \n\t" - "cmovg %1, %0 \n\t" - "cmovg %2, %1 \n\t" - "cmp %3, %1 \n\t" - "cmovl %3, %1 \n\t" - "cmp %1, %0 \n\t" - "cmovg %1, %0 \n\t" - :"+&r"(i), "+&r"(a) - :"r"(b), "r"(c) - ); - return i; -} -#endif - -#if HAVE_CMOV -#define COPY3_IF_LT(x, y, a, b, c, d)\ -__asm__ volatile(\ - "cmpl %0, %3 \n\t"\ - "cmovl %3, %0 \n\t"\ - "cmovl %4, %1 \n\t"\ - "cmovl %5, %2 \n\t"\ - : "+&r" (x), "+&r" (a), "+r" (c)\ - : "r" (y), "r" (b), "r" (d)\ -); -#endif - -#define MASK_ABS(mask, level) \ - __asm__ ("cltd \n\t" \ - "xorl %1, %0 \n\t" \ - "subl %1, %0 \n\t" \ - : "+a"(level), "=&d"(mask)) - -// avoid +32 for shift optimization (gcc should do that ...) -#define NEG_SSR32 NEG_SSR32 -static inline int32_t NEG_SSR32( int32_t a, int8_t s){ - __asm__ ("sarl %1, %0\n\t" - : "+r" (a) - : "ic" ((uint8_t)(-s)) - ); - return a; -} - -#define NEG_USR32 NEG_USR32 -static inline uint32_t NEG_USR32(uint32_t a, int8_t s){ - __asm__ ("shrl %1, %0\n\t" - : "+r" (a) - : "ic" ((uint8_t)(-s)) - ); - return a; -} - -#endif /* HAVE_INLINE_ASM */ -#endif /* AVCODEC_X86_MATHOPS_H */ diff --git a/ffmpeg1/libavcodec/x86/mlpdsp.c b/ffmpeg1/libavcodec/x86/mlpdsp.c deleted file mode 100644 index 81cab5a..0000000 --- a/ffmpeg1/libavcodec/x86/mlpdsp.c +++ /dev/null @@ -1,182 +0,0 @@ -/* - * MLP DSP functions x86-optimized - * Copyright (c) 2009 Ramiro Polla - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/x86/asm.h" -#include "libavcodec/mlpdsp.h" -#include "libavcodec/mlp.h" - -#if HAVE_7REGS && HAVE_INLINE_ASM - -extern char ff_mlp_firorder_8; -extern char ff_mlp_firorder_7; -extern char ff_mlp_firorder_6; -extern char ff_mlp_firorder_5; -extern char ff_mlp_firorder_4; -extern char ff_mlp_firorder_3; -extern char ff_mlp_firorder_2; -extern char ff_mlp_firorder_1; -extern char ff_mlp_firorder_0; - -extern char ff_mlp_iirorder_4; -extern char ff_mlp_iirorder_3; -extern char ff_mlp_iirorder_2; -extern char ff_mlp_iirorder_1; -extern char ff_mlp_iirorder_0; - -static const void *firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1, - &ff_mlp_firorder_2, &ff_mlp_firorder_3, - &ff_mlp_firorder_4, &ff_mlp_firorder_5, - &ff_mlp_firorder_6, &ff_mlp_firorder_7, - &ff_mlp_firorder_8 }; -static const void *iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1, - &ff_mlp_iirorder_2, &ff_mlp_iirorder_3, - &ff_mlp_iirorder_4 }; - -#if ARCH_X86_64 - -#define MLPMUL(label, offset, offs, offc) \ - LABEL_MANGLE(label)": \n\t" \ - "movslq "offset"+"offs"(%0), %%rax\n\t" \ - "movslq "offset"+"offc"(%1), %%rdx\n\t" \ - "imul %%rdx, %%rax\n\t" \ - "add %%rax, %%rsi\n\t" - -#define FIRMULREG(label, offset, firc)\ - LABEL_MANGLE(label)": \n\t" \ - "movslq "#offset"(%0), %%rax\n\t" \ - "imul %"#firc", %%rax\n\t" \ - "add %%rax, %%rsi\n\t" - -#define CLEAR_ACCUM \ - "xor %%rsi, %%rsi\n\t" - -#define SHIFT_ACCUM \ - "shr %%cl, %%rsi\n\t" - -#define ACCUM "%%rdx" -#define RESULT "%%rsi" -#define RESULT32 "%%esi" - -#else /* if ARCH_X86_32 */ - -#define MLPMUL(label, offset, offs, offc) \ - LABEL_MANGLE(label)": \n\t" \ - "mov "offset"+"offs"(%0), %%eax\n\t" \ - "imull "offset"+"offc"(%1) \n\t" \ - "add %%eax , %%esi\n\t" \ - "adc %%edx , %%ecx\n\t" - -#define FIRMULREG(label, offset, firc) \ - MLPMUL(label, #offset, "0", "0") - -#define CLEAR_ACCUM \ - "xor %%esi, %%esi\n\t" \ - "xor %%ecx, %%ecx\n\t" - -#define SHIFT_ACCUM \ - "mov %%ecx, %%edx\n\t" \ - "mov %%esi, %%eax\n\t" \ - "movzbl %7 , %%ecx\n\t" \ - "shrd %%cl, %%edx, %%eax\n\t" \ - -#define ACCUM "%%edx" -#define RESULT "%%eax" -#define RESULT32 "%%eax" - -#endif /* !ARCH_X86_64 */ - -#define BINC AV_STRINGIFY(4* MAX_CHANNELS) -#define IOFFS AV_STRINGIFY(4*(MAX_FIR_ORDER + MAX_BLOCKSIZE)) -#define IOFFC AV_STRINGIFY(4* MAX_FIR_ORDER) - -#define FIRMUL(label, offset) MLPMUL(label, #offset, "0", "0") -#define IIRMUL(label, offset) MLPMUL(label, #offset, IOFFS, IOFFC) - -static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff, - int firorder, int iirorder, - unsigned int filter_shift, int32_t mask, - int blocksize, int32_t *sample_buffer) -{ - const void *firjump = firtable[firorder]; - const void *iirjump = iirtable[iirorder]; - - blocksize = -blocksize; - - __asm__ volatile( - "1: \n\t" - CLEAR_ACCUM - "jmp *%5 \n\t" - FIRMUL (ff_mlp_firorder_8, 0x1c ) - FIRMUL (ff_mlp_firorder_7, 0x18 ) - FIRMUL (ff_mlp_firorder_6, 0x14 ) - FIRMUL (ff_mlp_firorder_5, 0x10 ) - FIRMUL (ff_mlp_firorder_4, 0x0c ) - FIRMULREG(ff_mlp_firorder_3, 0x08,10) - FIRMULREG(ff_mlp_firorder_2, 0x04, 9) - FIRMULREG(ff_mlp_firorder_1, 0x00, 8) - LABEL_MANGLE(ff_mlp_firorder_0)":\n\t" - "jmp *%6 \n\t" - IIRMUL (ff_mlp_iirorder_4, 0x0c ) - IIRMUL (ff_mlp_iirorder_3, 0x08 ) - IIRMUL 
(ff_mlp_iirorder_2, 0x04 ) - IIRMUL (ff_mlp_iirorder_1, 0x00 ) - LABEL_MANGLE(ff_mlp_iirorder_0)":\n\t" - SHIFT_ACCUM - "mov "RESULT" ,"ACCUM" \n\t" - "add (%2) ,"RESULT" \n\t" - "and %4 ,"RESULT" \n\t" - "sub $4 , %0 \n\t" - "mov "RESULT32", (%0) \n\t" - "mov "RESULT32", (%2) \n\t" - "add $"BINC" , %2 \n\t" - "sub "ACCUM" ,"RESULT" \n\t" - "mov "RESULT32","IOFFS"(%0) \n\t" - "incl %3 \n\t" - "js 1b \n\t" - : /* 0*/"+r"(state), - /* 1*/"+r"(coeff), - /* 2*/"+r"(sample_buffer), -#if ARCH_X86_64 - /* 3*/"+r"(blocksize) - : /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump), - /* 6*/"r"(iirjump) , /* 7*/"c"(filter_shift) - , /* 8*/"r"((int64_t)coeff[0]) - , /* 9*/"r"((int64_t)coeff[1]) - , /*10*/"r"((int64_t)coeff[2]) - : "rax", "rdx", "rsi" -#else /* ARCH_X86_32 */ - /* 3*/"+m"(blocksize) - : /* 4*/"m"( mask), /* 5*/"m"(firjump), - /* 6*/"m"(iirjump) , /* 7*/"m"(filter_shift) - : "eax", "edx", "esi", "ecx" -#endif /* !ARCH_X86_64 */ - ); -} - -#endif /* HAVE_7REGS && HAVE_INLINE_ASM */ - -av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c) -{ -#if HAVE_7REGS && HAVE_INLINE_ASM - c->mlp_filter_channel = mlp_filter_channel_x86; -#endif -} diff --git a/ffmpeg1/libavcodec/x86/motion_est.c b/ffmpeg1/libavcodec/x86/motion_est.c deleted file mode 100644 index 3ffb002..0000000 --- a/ffmpeg1/libavcodec/x86/motion_est.c +++ /dev/null @@ -1,473 +0,0 @@ -/* - * MMX optimized motion estimation - * Copyright (c) 2001 Fabrice Bellard - * Copyright (c) 2002-2004 Michael Niedermayer - * - * mostly by Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/avassert.h" -#include "libavutil/mem.h" -#include "libavutil/x86/asm.h" -#include "dsputil_mmx.h" - -#if HAVE_INLINE_ASM - -DECLARE_ASM_CONST(8, uint64_t, round_tab)[3]={ -0x0000000000000000ULL, -0x0001000100010001ULL, -0x0002000200020002ULL, -}; - -DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL; - -static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - x86_reg len= -(x86_reg)stride*h; - __asm__ volatile( - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq (%2, %%"REG_a"), %%mm2 \n\t" - "movq (%2, %%"REG_a"), %%mm4 \n\t" - "add %3, %%"REG_a" \n\t" - "psubusb %%mm0, %%mm2 \n\t" - "psubusb %%mm4, %%mm0 \n\t" - "movq (%1, %%"REG_a"), %%mm1 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "movq (%2, %%"REG_a"), %%mm5 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm5, %%mm1 \n\t" - "por %%mm2, %%mm0 \n\t" - "por %%mm1, %%mm3 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm3, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm2 \n\t" - "paddw %%mm1, %%mm0 \n\t" - "paddw %%mm3, %%mm2 \n\t" - "paddw %%mm2, %%mm0 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "add %3, %%"REG_a" \n\t" - " js 1b \n\t" - : "+a" (len) - : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride) - ); -} - -static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2, - int stride, int h) -{ - __asm__ volatile( - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "psadbw (%2), %%mm0 \n\t" - "psadbw (%2, %3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" ((x86_reg)stride) - ); -} - -static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) -{ - int ret; - __asm__ volatile( - "pxor %%xmm2, %%xmm2 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movdqu (%1), %%xmm0 \n\t" - "movdqu (%1, %4), %%xmm1 \n\t" - "psadbw (%2), %%xmm0 \n\t" - "psadbw (%2, %4), %%xmm1 \n\t" - "paddw %%xmm0, %%xmm2 \n\t" - "paddw %%xmm1, %%xmm2 \n\t" - "lea (%1,%4,2), %1 \n\t" - "lea (%2,%4,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - "movhlps %%xmm2, %%xmm0 \n\t" - "paddw %%xmm0, %%xmm2 \n\t" - "movd %%xmm2, %3 \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2), "=r"(ret) - : "r" ((x86_reg)stride) - ); - return ret; -} - -static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2, - int stride, int h) -{ - __asm__ volatile( - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "pavgb 1(%1), %%mm0 \n\t" - "pavgb 1(%1, %3), %%mm1 \n\t" - "psadbw (%2), %%mm0 \n\t" - "psadbw (%2, %3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" ((x86_reg)stride) - ); -} - -static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2, - int stride, int h) -{ - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "add %3, %1 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "pavgb %%mm1, %%mm0 \n\t" - "pavgb %%mm2, 
%%mm1 \n\t" - "psadbw (%2), %%mm0 \n\t" - "psadbw (%2, %3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "movq %%mm2, %%mm0 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" ((x86_reg)stride) - ); -} - -static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2, - int stride, int h) -{ - __asm__ volatile( - "movq "MANGLE(bone)", %%mm5 \n\t" - "movq (%1), %%mm0 \n\t" - "pavgb 1(%1), %%mm0 \n\t" - "add %3, %1 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1), %%mm1 \n\t" - "movq (%1,%3), %%mm2 \n\t" - "pavgb 1(%1), %%mm1 \n\t" - "pavgb 1(%1,%3), %%mm2 \n\t" - "psubusb %%mm5, %%mm1 \n\t" - "pavgb %%mm1, %%mm0 \n\t" - "pavgb %%mm2, %%mm1 \n\t" - "psadbw (%2), %%mm0 \n\t" - "psadbw (%2,%3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "movq %%mm2, %%mm0 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" ((x86_reg)stride) - ); -} - -static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) -{ - x86_reg len= -(x86_reg)stride*h; - __asm__ volatile( - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq (%2, %%"REG_a"), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm2 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "paddw %%mm2, %%mm3 \n\t" - "movq (%3, %%"REG_a"), %%mm4 \n\t" - "movq (%3, %%"REG_a"), %%mm2 \n\t" - "paddw %%mm5, %%mm1 \n\t" - "paddw %%mm5, %%mm3 \n\t" - "psrlw $1, %%mm1 \n\t" - "psrlw $1, %%mm3 \n\t" - "packuswb %%mm3, %%mm1 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm2, %%mm1 \n\t" - "por %%mm4, %%mm1 \n\t" - "movq %%mm1, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "paddw %%mm1, %%mm0 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "add %4, %%"REG_a" \n\t" - " js 1b \n\t" - : "+a" (len) - : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride) - ); -} - -static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - x86_reg len= -(x86_reg)stride*h; - __asm__ volatile( - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddw %%mm2, %%mm0 \n\t" - "paddw %%mm3, %%mm1 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%2, %%"REG_a"), %%mm2 \n\t" - "movq 1(%2, %%"REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddw %%mm4, %%mm2 \n\t" - "paddw %%mm5, %%mm3 \n\t" - "movq 16+"MANGLE(round_tab)", %%mm5 \n\t" - "paddw %%mm2, %%mm0 \n\t" - "paddw %%mm3, %%mm1 \n\t" - "paddw %%mm5, %%mm0 \n\t" - "paddw %%mm5, %%mm1 \n\t" - "movq (%3, %%"REG_a"), %%mm4 \n\t" - "movq (%3, %%"REG_a"), %%mm5 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "psubusb %%mm0, %%mm4 \n\t" - "psubusb %%mm5, %%mm0 \n\t" - "por %%mm4, %%mm0 \n\t" - "movq %%mm0, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm4 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm4, %%mm6 \n\t" - "movq %%mm2, %%mm0 
\n\t" - "movq %%mm3, %%mm1 \n\t" - "add %4, %%"REG_a" \n\t" - " js 1b \n\t" - : "+a" (len) - : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride) - ); -} - -static inline int sum_mmx(void) -{ - int ret; - __asm__ volatile( - "movq %%mm6, %%mm0 \n\t" - "psrlq $32, %%mm6 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "movq %%mm6, %%mm0 \n\t" - "psrlq $16, %%mm6 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "movd %%mm6, %0 \n\t" - : "=r" (ret) - ); - return ret&0xFFFF; -} - -static inline int sum_mmxext(void) -{ - int ret; - __asm__ volatile( - "movd %%mm6, %0 \n\t" - : "=r" (ret) - ); - return ret; -} - -static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - sad8_2_mmx(blk1, blk1+1, blk2, stride, h); -} -static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - sad8_2_mmx(blk1, blk1+stride, blk2, stride, h); -} - - -#define PIX_SAD(suf)\ -static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - av_assert2(h==8);\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t":);\ -\ - sad8_1_ ## suf(blk1, blk2, stride, 8);\ -\ - return sum_ ## suf();\ -}\ -static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - av_assert2(h==8);\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "movq %0, %%mm5 \n\t"\ - :: "m"(round_tab[1]) \ - );\ -\ - sad8_x2a_ ## suf(blk1, blk2, stride, 8);\ -\ - return sum_ ## suf();\ -}\ -\ -static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - av_assert2(h==8);\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "movq %0, %%mm5 \n\t"\ - :: "m"(round_tab[1]) \ - );\ -\ - sad8_y2a_ ## suf(blk1, blk2, stride, 8);\ -\ - return sum_ ## suf();\ -}\ -\ -static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - av_assert2(h==8);\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - ::);\ -\ - sad8_4_ ## suf(blk1, blk2, stride, 8);\ -\ - return sum_ ## suf();\ -}\ -\ -static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t":);\ -\ - sad8_1_ ## suf(blk1 , blk2 , stride, h);\ - sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\ -\ - return sum_ ## suf();\ -}\ -static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "movq %0, %%mm5 \n\t"\ - :: "m"(round_tab[1]) \ - );\ -\ - sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\ - sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\ -\ - return sum_ ## suf();\ -}\ -static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "movq %0, %%mm5 \n\t"\ - :: "m"(round_tab[1]) \ - );\ -\ - sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\ - sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\ -\ - return sum_ ## suf();\ -}\ -static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - ::);\ -\ - sad8_4_ ## suf(blk1 , blk2 , stride, h);\ - sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\ -\ - return sum_ ## suf();\ -}\ - -PIX_SAD(mmx) -PIX_SAD(mmxext) - -#endif /* HAVE_INLINE_ASM */ - -av_cold void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx) -{ -#if 
HAVE_INLINE_ASM - int mm_flags = av_get_cpu_flags(); - - if (mm_flags & AV_CPU_FLAG_MMX) { - c->pix_abs[0][0] = sad16_mmx; - c->pix_abs[0][1] = sad16_x2_mmx; - c->pix_abs[0][2] = sad16_y2_mmx; - c->pix_abs[0][3] = sad16_xy2_mmx; - c->pix_abs[1][0] = sad8_mmx; - c->pix_abs[1][1] = sad8_x2_mmx; - c->pix_abs[1][2] = sad8_y2_mmx; - c->pix_abs[1][3] = sad8_xy2_mmx; - - c->sad[0]= sad16_mmx; - c->sad[1]= sad8_mmx; - } - if (mm_flags & AV_CPU_FLAG_MMXEXT) { - c->pix_abs[0][0] = sad16_mmxext; - c->pix_abs[1][0] = sad8_mmxext; - - c->sad[0] = sad16_mmxext; - c->sad[1] = sad8_mmxext; - - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->pix_abs[0][1] = sad16_x2_mmxext; - c->pix_abs[0][2] = sad16_y2_mmxext; - c->pix_abs[0][3] = sad16_xy2_mmxext; - c->pix_abs[1][1] = sad8_x2_mmxext; - c->pix_abs[1][2] = sad8_y2_mmxext; - c->pix_abs[1][3] = sad8_xy2_mmxext; - } - } - if ((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW) && avctx->codec_id != AV_CODEC_ID_SNOW) { - c->sad[0]= sad16_sse2; - } -#endif /* HAVE_INLINE_ASM */ -} diff --git a/ffmpeg1/libavcodec/x86/mpeg4qpel.asm b/ffmpeg1/libavcodec/x86/mpeg4qpel.asm deleted file mode 100644 index ca52375..0000000 --- a/ffmpeg1/libavcodec/x86/mpeg4qpel.asm +++ /dev/null @@ -1,560 +0,0 @@ -;****************************************************************************** -;* mpeg4 qpel -;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> -;* Copyright (c) 2008 Loren Merritt -;* Copyright (c) 2013 Daniel Kang -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
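The pix_abs/sad table filled in by ff_dsputil_init_pix_mmx() above boils down to a sum of absolute differences over an 8x8 or 16x16 block, with the _x2/_y2/_xy2 variants averaging the source block with its right, lower, or diagonal neighbour (half-pel positions) before comparing. A scalar sketch, illustrative only and not taken from the file:

#include <stdint.h>
#include <stdlib.h>

/* Illustrative only: what the sad8 and sad16 routines compute, without the
 * MMX/SSE2 psadbw accumulation tricks or half-pel interpolation. */
static int sad_ref(const uint8_t *blk2, const uint8_t *blk1,
                   int stride, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++)
        for (int x = 0; x < w; x++)
            sum += abs(blk1[y * stride + x] - blk2[y * stride + x]);
    return sum;
}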
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA -cextern pb_1 -cextern pw_3 -cextern pw_15 -cextern pw_16 -cextern pw_20 - - -SECTION_TEXT - -; put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -%macro PUT_NO_RND_PIXELS8_L2 0 -cglobal put_no_rnd_pixels8_l2, 6,6 - movsxdifnidn r4, r4d - movsxdifnidn r3, r3d - pcmpeqb m6, m6 - test r5d, 1 - je .loop - mova m0, [r1] - mova m1, [r2] - add r1, r4 - add r2, 8 - pxor m0, m6 - pxor m1, m6 - PAVGB m0, m1 - pxor m0, m6 - mova [r0], m0 - add r0, r3 - dec r5d -.loop: - mova m0, [r1] - add r1, r4 - mova m1, [r1] - add r1, r4 - mova m2, [r2] - mova m3, [r2+8] - pxor m0, m6 - pxor m1, m6 - pxor m2, m6 - pxor m3, m6 - PAVGB m0, m2 - PAVGB m1, m3 - pxor m0, m6 - pxor m1, m6 - mova [r0], m0 - add r0, r3 - mova [r0], m1 - add r0, r3 - mova m0, [r1] - add r1, r4 - mova m1, [r1] - add r1, r4 - mova m2, [r2+16] - mova m3, [r2+24] - pxor m0, m6 - pxor m1, m6 - pxor m2, m6 - pxor m3, m6 - PAVGB m0, m2 - PAVGB m1, m3 - pxor m0, m6 - pxor m1, m6 - mova [r0], m0 - add r0, r3 - mova [r0], m1 - add r0, r3 - add r2, 32 - sub r5d, 4 - jne .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PUT_NO_RND_PIXELS8_L2 - - -; put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -%macro PUT_NO_RND_PIXELS16_l2 0 -cglobal put_no_rnd_pixels16_l2, 6,6 - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d - pcmpeqb m6, m6 - test r5d, 1 - je .loop - mova m0, [r1] - mova m1, [r1+8] - mova m2, [r2] - mova m3, [r2+8] - pxor m0, m6 - pxor m1, m6 - pxor m2, m6 - pxor m3, m6 - PAVGB m0, m2 - PAVGB m1, m3 - pxor m0, m6 - pxor m1, m6 - add r1, r4 - add r2, 16 - mova [r0], m0 - mova [r0+8], m1 - add r0, r3 - dec r5d -.loop: - mova m0, [r1] - mova m1, [r1+8] - add r1, r4 - mova m2, [r2] - mova m3, [r2+8] - pxor m0, m6 - pxor m1, m6 - pxor m2, m6 - pxor m3, m6 - PAVGB m0, m2 - PAVGB m1, m3 - pxor m0, m6 - pxor m1, m6 - mova [r0], m0 - mova [r0+8], m1 - add r0, r3 - mova m0, [r1] - mova m1, [r1+8] - add r1, r4 - mova m2, [r2+16] - mova m3, [r2+24] - pxor m0, m6 - pxor m1, m6 - pxor m2, m6 - pxor m3, m6 - PAVGB m0, m2 - PAVGB m1, m3 - pxor m0, m6 - pxor m1, m6 - mova [r0], m0 - mova [r0+8], m1 - add r0, r3 - add r2, 32 - sub r5d, 2 - jne .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PUT_NO_RND_PIXELS16_l2 -INIT_MMX 3dnow -PUT_NO_RND_PIXELS16_l2 - -%macro MPEG4_QPEL16_H_LOWPASS 1 -cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16 - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d - pxor m7, m7 -.loop: - mova m0, [r1] - mova m1, m0 - mova m2, m0 - punpcklbw m0, m7 - punpckhbw m1, m7 - pshufw m5, m0, 0x90 - pshufw m6, m0, 0x41 - mova m3, m2 - mova m4, m2 - psllq m2, 8 - psllq m3, 16 - psllq m4, 24 - punpckhbw m2, m7 - punpckhbw m3, m7 - punpckhbw m4, m7 - paddw m5, m3 - paddw m6, m2 - paddw m5, m5 - psubw m6, m5 - pshufw m5, m0, 6 - pmullw m6, [pw_3] - paddw m0, m4 - paddw m5, m1 - pmullw m0, [pw_20] - psubw m0, m5 - paddw m6, [PW_ROUND] - paddw m0, m6 - psraw m0, 5 - mova [rsp+8], m0 - mova m0, [r1+5] - mova m5, m0 - mova m6, m0 - psrlq m0, 8 - psrlq m5, 16 - punpcklbw m0, m7 - punpcklbw m5, m7 - paddw m2, m0 - paddw m3, m5 - paddw m2, m2 - psubw m3, m2 - mova m2, m6 - psrlq m6, 24 - punpcklbw m2, m7 - 
punpcklbw m6, m7 - pmullw m3, [pw_3] - paddw m1, m2 - paddw m4, m6 - pmullw m1, [pw_20] - psubw m3, m4 - paddw m1, [PW_ROUND] - paddw m3, m1 - psraw m3, 5 - mova m1, [rsp+8] - packuswb m1, m3 - OP_MOV [r0], m1, m4 - mova m1, [r1+9] - mova m4, m1 - mova m3, m1 - psrlq m1, 8 - psrlq m4, 16 - punpcklbw m1, m7 - punpcklbw m4, m7 - paddw m5, m1 - paddw m0, m4 - paddw m5, m5 - psubw m0, m5 - mova m5, m3 - psrlq m3, 24 - pmullw m0, [pw_3] - punpcklbw m3, m7 - paddw m2, m3 - psubw m0, m2 - mova m2, m5 - punpcklbw m2, m7 - punpckhbw m5, m7 - paddw m6, m2 - pmullw m6, [pw_20] - paddw m0, [PW_ROUND] - paddw m0, m6 - psraw m0, 5 - paddw m3, m5 - pshufw m6, m5, 0xf9 - paddw m6, m4 - pshufw m4, m5, 0xbe - pshufw m5, m5, 0x6f - paddw m4, m1 - paddw m5, m2 - paddw m6, m6 - psubw m4, m6 - pmullw m3, [pw_20] - pmullw m4, [pw_3] - psubw m3, m5 - paddw m4, [PW_ROUND] - paddw m4, m3 - psraw m4, 5 - packuswb m0, m4 - OP_MOV [r0+8], m0, m4 - add r1, r3 - add r0, r2 - dec r4d - jne .loop - REP_RET -%endmacro - -%macro PUT_OP 2-3 - mova %1, %2 -%endmacro - -%macro AVG_OP 2-3 - mova %3, %1 - pavgb %2, %3 - mova %1, %2 -%endmacro - -INIT_MMX mmxext -%define PW_ROUND pw_16 -%define OP_MOV PUT_OP -MPEG4_QPEL16_H_LOWPASS put -%define PW_ROUND pw_16 -%define OP_MOV AVG_OP -MPEG4_QPEL16_H_LOWPASS avg -%define PW_ROUND pw_15 -%define OP_MOV PUT_OP -MPEG4_QPEL16_H_LOWPASS put_no_rnd - - - -%macro MPEG4_QPEL8_H_LOWPASS 1 -cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8 - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d - pxor m7, m7 -.loop: - mova m0, [r1] - mova m1, m0 - mova m2, m0 - punpcklbw m0, m7 - punpckhbw m1, m7 - pshufw m5, m0, 0x90 - pshufw m6, m0, 0x41 - mova m3, m2 - mova m4, m2 - psllq m2, 8 - psllq m3, 16 - psllq m4, 24 - punpckhbw m2, m7 - punpckhbw m3, m7 - punpckhbw m4, m7 - paddw m5, m3 - paddw m6, m2 - paddw m5, m5 - psubw m6, m5 - pshufw m5, m0, 0x6 - pmullw m6, [pw_3] - paddw m0, m4 - paddw m5, m1 - pmullw m0, [pw_20] - psubw m0, m5 - paddw m6, [PW_ROUND] - paddw m0, m6 - psraw m0, 5 - movh m5, [r1+5] - punpcklbw m5, m7 - pshufw m6, m5, 0xf9 - paddw m1, m5 - paddw m2, m6 - pshufw m6, m5, 0xbe - pshufw m5, m5, 0x6f - paddw m3, m6 - paddw m4, m5 - paddw m2, m2 - psubw m3, m2 - pmullw m1, [pw_20] - pmullw m3, [pw_3] - psubw m3, m4 - paddw m1, [PW_ROUND] - paddw m3, m1 - psraw m3, 5 - packuswb m0, m3 - OP_MOV [r0], m0, m4 - add r1, r3 - add r0, r2 - dec r4d - jne .loop - REP_RET -%endmacro - -INIT_MMX mmxext -%define PW_ROUND pw_16 -%define OP_MOV PUT_OP -MPEG4_QPEL8_H_LOWPASS put -%define PW_ROUND pw_16 -%define OP_MOV AVG_OP -MPEG4_QPEL8_H_LOWPASS avg -%define PW_ROUND pw_15 -%define OP_MOV PUT_OP -MPEG4_QPEL8_H_LOWPASS put_no_rnd - - - -%macro QPEL_V_LOW 5 - paddw m0, m1 - mova m4, [pw_20] - pmullw m4, m0 - mova m0, %4 - mova m5, %1 - paddw m5, m0 - psubw m4, m5 - mova m5, %2 - mova m6, %3 - paddw m5, m3 - paddw m6, m2 - paddw m6, m6 - psubw m5, m6 - pmullw m5, [pw_3] - paddw m4, [PW_ROUND] - paddw m5, m4 - psraw m5, 5 - packuswb m5, m5 - OP_MOV %5, m5, m7 - SWAP 0,1,2,3 -%endmacro - -%macro MPEG4_QPEL16_V_LOWPASS 1 -cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544 - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d - - mov r4d, 17 - mov r5, rsp - pxor m7, m7 -.looph: - mova m0, [r1] - mova m1, [r1] - mova m2, [r1+8] - mova m3, [r1+8] - punpcklbw m0, m7 - punpckhbw m1, m7 - punpcklbw m2, m7 - punpckhbw m3, m7 - mova [r5], m0 - mova [r5+0x88], m1 - mova [r5+0x110], m2 - mova [r5+0x198], m3 - add r5, 8 - add r1, r3 - dec r4d - jne .looph - - - ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 14*dstStride - mov r4d, 4 - mov r1, 4 - 
neg r2 - lea r1, [r1+r2*8] - lea r1, [r1+r2*4] - lea r1, [r1+r2*2] - neg r2 - mov r5, rsp -.loopv: - pxor m7, m7 - mova m0, [r5+ 0x0] - mova m1, [r5+ 0x8] - mova m2, [r5+0x10] - mova m3, [r5+0x18] - QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0] - QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2] - lea r0, [r0+r2*2] - QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0] - QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2] - lea r0, [r0+r2*2] - QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0] - QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2] - lea r0, [r0+r2*2] - QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0] - QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2] - lea r0, [r0+r2*2] - QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0] - QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2] - lea r0, [r0+r2*2] - QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0] - QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2] - lea r0, [r0+r2*2] - QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0] - QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2] - lea r0, [r0+r2*2] - QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0] - QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2] - - add r5, 0x88 - add r0, r1 - dec r4d - jne .loopv - REP_RET -%endmacro - -%macro PUT_OPH 2-3 - movh %1, %2 -%endmacro - -%macro AVG_OPH 2-3 - movh %3, %1 - pavgb %2, %3 - movh %1, %2 -%endmacro - -INIT_MMX mmxext -%define PW_ROUND pw_16 -%define OP_MOV PUT_OPH -MPEG4_QPEL16_V_LOWPASS put -%define PW_ROUND pw_16 -%define OP_MOV AVG_OPH -MPEG4_QPEL16_V_LOWPASS avg -%define PW_ROUND pw_15 -%define OP_MOV PUT_OPH -MPEG4_QPEL16_V_LOWPASS put_no_rnd - - - -%macro MPEG4_QPEL8_V_LOWPASS 1 -cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288 - movsxdifnidn r2, r2d - movsxdifnidn r3, r3d - - mov r4d, 9 - mov r5, rsp - pxor m7, m7 -.looph: - mova m0, [r1] - mova m1, [r1] - punpcklbw m0, m7 - punpckhbw m1, m7 - mova [r5], m0 - mova [r5+0x48], m1 - add r5, 8 - add r1, r3 - dec r4d - jne .looph - - - ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 6*dstStride - mov r4d, 2 - mov r1, 4 - neg r2 - lea r1, [r1+r2*4] - lea r1, [r1+r2*2] - neg r2 - mov r5, rsp -.loopv: - pxor m7, m7 - mova m0, [r5+ 0x0] - mova m1, [r5+ 0x8] - mova m2, [r5+0x10] - mova m3, [r5+0x18] - QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0] - QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2] - lea r0, [r0+r2*2] - QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0] - QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2] - lea r0, [r0+r2*2] - QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0] - QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2] - lea r0, [r0+r2*2] - QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0] - QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2] - - add r5, 0x48 - add r0, r1 - dec r4d - jne .loopv - REP_RET -%endmacro - -INIT_MMX mmxext -%define PW_ROUND pw_16 -%define OP_MOV PUT_OPH -MPEG4_QPEL8_V_LOWPASS put -%define PW_ROUND pw_16 -%define OP_MOV AVG_OPH -MPEG4_QPEL8_V_LOWPASS avg -%define PW_ROUND pw_15 -%define OP_MOV PUT_OPH -MPEG4_QPEL8_V_LOWPASS put_no_rnd diff --git a/ffmpeg1/libavcodec/x86/mpegaudiodec.c b/ffmpeg1/libavcodec/x86/mpegaudiodec.c deleted file mode 100644 index 287d8ff..0000000 --- a/ffmpeg1/libavcodec/x86/mpegaudiodec.c +++ /dev/null @@ -1,273 +0,0 @@ -/* - * MMX optimized MP3 decoding functions 
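The *_mpeg4_qpel{8,16}_{h,v}_lowpass macros deleted above implement the MPEG-4 quarter-pel half-sample filter; the pw_20/pw_3 constants together with the pw_16/pw_15 rounding selection correspond to the 8-tap kernel (-1, 3, -6, 20, 20, -6, 3, -1) followed by a right shift of 5. A plain-C sketch of one horizontal row under that assumption; interior samples only, since the real code reflects the source at the block edges, and the no_rnd variants use a rounding constant of 15.

#include <stdint.h>

static inline uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* rnd is 16 for the rounding variants and 15 for put_no_rnd, matching
 * the PW_ROUND defines above. */
static void qpel_h_lowpass_row(uint8_t *dst, const uint8_t *src, int w, int rnd)
{
    for (int x = 0; x < w; x++) {
        int v = 20 * (src[x]     + src[x + 1])
              -  6 * (src[x - 1] + src[x + 2])
              +  3 * (src[x - 2] + src[x + 3])
              -      (src[x - 3] + src[x + 4]);
        dst[x] = clip_u8((v + rnd) >> 5);
    }
}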
- * Copyright (c) 2010 Vitor Sessak - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/internal.h" -#include "libavutil/x86/asm.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/mpegaudiodsp.h" - -#define DECL(CPU)\ -static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\ -void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win); - -DECL(sse) -DECL(sse2) -DECL(sse3) -DECL(ssse3) -DECL(avx) - -void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win, - float *tmpbuf); -void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win, - float *tmpbuf); - -DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40]; - -#if HAVE_SSE2_INLINE - -#define MACS(rt, ra, rb) rt+=(ra)*(rb) -#define MLSS(rt, ra, rb) rt-=(ra)*(rb) - -#define SUM8(op, sum, w, p) \ -{ \ - op(sum, (w)[0 * 64], (p)[0 * 64]); \ - op(sum, (w)[1 * 64], (p)[1 * 64]); \ - op(sum, (w)[2 * 64], (p)[2 * 64]); \ - op(sum, (w)[3 * 64], (p)[3 * 64]); \ - op(sum, (w)[4 * 64], (p)[4 * 64]); \ - op(sum, (w)[5 * 64], (p)[5 * 64]); \ - op(sum, (w)[6 * 64], (p)[6 * 64]); \ - op(sum, (w)[7 * 64], (p)[7 * 64]); \ -} - -static void apply_window(const float *buf, const float *win1, - const float *win2, float *sum1, float *sum2, int len) -{ - x86_reg count = - 4*len; - const float *win1a = win1+len; - const float *win2a = win2+len; - const float *bufa = buf+len; - float *sum1a = sum1+len; - float *sum2a = sum2+len; - - -#define MULT(a, b) \ - "movaps " #a "(%1,%0), %%xmm1 \n\t" \ - "movaps " #a "(%3,%0), %%xmm2 \n\t" \ - "mulps %%xmm2, %%xmm1 \n\t" \ - "subps %%xmm1, %%xmm0 \n\t" \ - "mulps " #b "(%2,%0), %%xmm2 \n\t" \ - "subps %%xmm2, %%xmm4 \n\t" \ - - __asm__ volatile( - "1: \n\t" - "xorps %%xmm0, %%xmm0 \n\t" - "xorps %%xmm4, %%xmm4 \n\t" - - MULT( 0, 0) - MULT( 256, 64) - MULT( 512, 128) - MULT( 768, 192) - MULT(1024, 256) - MULT(1280, 320) - MULT(1536, 384) - MULT(1792, 448) - - "movaps %%xmm0, (%4,%0) \n\t" - "movaps %%xmm4, (%5,%0) \n\t" - "add $16, %0 \n\t" - "jl 1b \n\t" - :"+&r"(count) - :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a) - ); - -#undef MULT -} - -static void apply_window_mp3(float *in, float *win, int *unused, float *out, - int incr) -{ - LOCAL_ALIGNED_16(float, suma, [17]); - LOCAL_ALIGNED_16(float, sumb, [17]); - LOCAL_ALIGNED_16(float, sumc, [17]); - LOCAL_ALIGNED_16(float, sumd, [17]); - - float sum; - - /* copy to avoid wrap */ - __asm__ volatile( - "movaps 0(%0), %%xmm0 \n\t" \ - "movaps 16(%0), %%xmm1 \n\t" \ - "movaps 32(%0), %%xmm2 \n\t" \ - "movaps 48(%0), %%xmm3 \n\t" \ - "movaps %%xmm0, 0(%1) \n\t" \ - "movaps %%xmm1, 16(%1) \n\t" \ - "movaps %%xmm2, 32(%1) \n\t" \ - "movaps 
%%xmm3, 48(%1) \n\t" \ - "movaps 64(%0), %%xmm0 \n\t" \ - "movaps 80(%0), %%xmm1 \n\t" \ - "movaps 96(%0), %%xmm2 \n\t" \ - "movaps 112(%0), %%xmm3 \n\t" \ - "movaps %%xmm0, 64(%1) \n\t" \ - "movaps %%xmm1, 80(%1) \n\t" \ - "movaps %%xmm2, 96(%1) \n\t" \ - "movaps %%xmm3, 112(%1) \n\t" - ::"r"(in), "r"(in+512) - :"memory" - ); - - apply_window(in + 16, win , win + 512, suma, sumc, 16); - apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16); - - SUM8(MACS, suma[0], win + 32, in + 48); - - sumc[ 0] = 0; - sumb[16] = 0; - sumd[16] = 0; - -#define SUMS(suma, sumb, sumc, sumd, out1, out2) \ - "movups " #sumd "(%4), %%xmm0 \n\t" \ - "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \ - "subps " #suma "(%1), %%xmm0 \n\t" \ - "movaps %%xmm0," #out1 "(%0) \n\t" \ -\ - "movups " #sumc "(%3), %%xmm0 \n\t" \ - "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \ - "addps " #sumb "(%2), %%xmm0 \n\t" \ - "movaps %%xmm0," #out2 "(%0) \n\t" - - if (incr == 1) { - __asm__ volatile( - SUMS( 0, 48, 4, 52, 0, 112) - SUMS(16, 32, 20, 36, 16, 96) - SUMS(32, 16, 36, 20, 32, 80) - SUMS(48, 0, 52, 4, 48, 64) - - :"+&r"(out) - :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0]) - :"memory" - ); - out += 16*incr; - } else { - int j; - float *out2 = out + 32 * incr; - out[0 ] = -suma[ 0]; - out += incr; - out2 -= incr; - for(j=1;j<16;j++) { - *out = -suma[ j] + sumd[16-j]; - *out2 = sumb[16-j] + sumc[ j]; - out += incr; - out2 -= incr; - } - } - - sum = 0; - SUM8(MLSS, sum, win + 16 + 32, in + 32); - *out = sum; -} - -#endif /* HAVE_SSE2_INLINE */ - -#if HAVE_YASM -#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \ -static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \ - int count, int switch_point, int block_type) \ -{ \ - int align_end = count - (count & 3); \ - int j; \ - for (j = 0; j < align_end; j+= 4) { \ - LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \ - float *win = mdct_win_sse[switch_point && j < 4][block_type]; \ - /* apply window & overlap with previous buffer */ \ - \ - /* select window */ \ - ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \ - in += 4*18; \ - buf += 4*18; \ - out += 4; \ - } \ - for (; j < count; j++) { \ - /* apply window & overlap with previous buffer */ \ - \ - /* select window */ \ - int win_idx = (switch_point && j < 2) ? 
0 : block_type; \ - float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \ - \ - ff_imdct36_float_ ## CPU1(out, buf, in, win); \ - \ - in += 18; \ - buf++; \ - out++; \ - } \ -} - -#if HAVE_SSE -DECL_IMDCT_BLOCKS(sse,sse) -DECL_IMDCT_BLOCKS(sse2,sse) -DECL_IMDCT_BLOCKS(sse3,sse) -DECL_IMDCT_BLOCKS(ssse3,sse) -#endif -#if HAVE_AVX_EXTERNAL -DECL_IMDCT_BLOCKS(avx,avx) -#endif -#endif /* HAVE_YASM */ - -av_cold void ff_mpadsp_init_x86(MPADSPContext *s) -{ - int mm_flags = av_get_cpu_flags(); - - int i, j; - for (j = 0; j < 4; j++) { - for (i = 0; i < 40; i ++) { - mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i]; - mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i]; - mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i]; - mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; - mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i]; - mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i]; - mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i]; - mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; - } - } - -#if HAVE_SSE2_INLINE - if (mm_flags & AV_CPU_FLAG_SSE2) { - s->apply_window_float = apply_window_mp3; - } -#endif /* HAVE_SSE2_INLINE */ - -#if HAVE_YASM - if (EXTERNAL_AVX(mm_flags)) { - s->imdct36_blocks_float = imdct36_blocks_avx; - } else if (EXTERNAL_SSSE3(mm_flags)) { - s->imdct36_blocks_float = imdct36_blocks_ssse3; - } else if (EXTERNAL_SSE3(mm_flags)) { - s->imdct36_blocks_float = imdct36_blocks_sse3; - } else if (EXTERNAL_SSE2(mm_flags)) { - s->imdct36_blocks_float = imdct36_blocks_sse2; - } else if (EXTERNAL_SSE(mm_flags)) { - s->imdct36_blocks_float = imdct36_blocks_sse; - } -#endif /* HAVE_YASM */ -} diff --git a/ffmpeg1/libavcodec/x86/mpegvideo.c b/ffmpeg1/libavcodec/x86/mpegvideo.c deleted file mode 100644 index 903ad62..0000000 --- a/ffmpeg1/libavcodec/x86/mpegvideo.c +++ /dev/null @@ -1,600 +0,0 @@ -/* - * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru> - * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
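ff_mpadsp_init_x86() above fills the MPADSPContext function pointers at run time, picking the most capable of the compiled imdct36 kernels that the host CPU supports. The following is a self-contained sketch of that dispatch pattern; the flag bits, struct and kernel names are invented stand-ins, whereas the real code uses av_get_cpu_flags() and the EXTERNAL_*() macros from libavutil/x86/cpu.h.

#include <stdio.h>

enum { CPU_SSE2 = 1 << 0, CPU_SSSE3 = 1 << 1, CPU_AVX = 1 << 2 };

static void imdct36_c_stub(void)    { puts("C fallback"); }
static void imdct36_sse2_stub(void) { puts("SSE2 path"); }
static void imdct36_avx_stub(void)  { puts("AVX path"); }

typedef struct { void (*imdct36_blocks)(void); } DSPLike;

/* Check the most capable extension first so exactly one branch wins. */
static void init_x86_stub(DSPLike *s, int flags)
{
    s->imdct36_blocks = imdct36_c_stub;
    if (flags & CPU_AVX)
        s->imdct36_blocks = imdct36_avx_stub;
    else if (flags & CPU_SSE2)
        s->imdct36_blocks = imdct36_sse2_stub;
}

int main(void)
{
    DSPLike s;
    init_x86_stub(&s, CPU_SSE2);
    s.imdct36_blocks();             /* prints "SSE2 path" */
    return 0;
}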
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavcodec/avcodec.h" -#include "libavcodec/mpegvideo.h" -#include "dsputil_mmx.h" - -#if HAVE_INLINE_ASM - -static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, - int16_t *block, int n, int qscale) -{ - x86_reg level, qmul, qadd, nCoeffs; - - qmul = qscale << 1; - - av_assert2(s->block_last_index[n]>=0 || s->h263_aic); - - if (!s->h263_aic) { - if (n < 4) - level = block[0] * s->y_dc_scale; - else - level = block[0] * s->c_dc_scale; - qadd = (qscale - 1) | 1; - }else{ - qadd = 0; - level= block[0]; - } - if(s->ac_pred) - nCoeffs=63; - else - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; - -__asm__ volatile( - "movd %1, %%mm6 \n\t" //qmul - "packssdw %%mm6, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "movd %2, %%mm5 \n\t" //qadd - "pxor %%mm7, %%mm7 \n\t" - "packssdw %%mm5, %%mm5 \n\t" - "packssdw %%mm5, %%mm5 \n\t" - "psubw %%mm5, %%mm7 \n\t" - "pxor %%mm4, %%mm4 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%0, %3), %%mm0 \n\t" - "movq 8(%0, %3), %%mm1 \n\t" - - "pmullw %%mm6, %%mm0 \n\t" - "pmullw %%mm6, %%mm1 \n\t" - - "movq (%0, %3), %%mm2 \n\t" - "movq 8(%0, %3), %%mm3 \n\t" - - "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 - "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 - - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - - "paddw %%mm7, %%mm0 \n\t" - "paddw %%mm7, %%mm1 \n\t" - - "pxor %%mm0, %%mm2 \n\t" - "pxor %%mm1, %%mm3 \n\t" - - "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 - "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 - - "pandn %%mm2, %%mm0 \n\t" - "pandn %%mm3, %%mm1 \n\t" - - "movq %%mm0, (%0, %3) \n\t" - "movq %%mm1, 8(%0, %3) \n\t" - - "add $16, %3 \n\t" - "jng 1b \n\t" - ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) - : "memory" - ); - block[0]= level; -} - - -static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, - int16_t *block, int n, int qscale) -{ - x86_reg qmul, qadd, nCoeffs; - - qmul = qscale << 1; - qadd = (qscale - 1) | 1; - - assert(s->block_last_index[n]>=0 || s->h263_aic); - - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; - -__asm__ volatile( - "movd %1, %%mm6 \n\t" //qmul - "packssdw %%mm6, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "movd %2, %%mm5 \n\t" //qadd - "pxor %%mm7, %%mm7 \n\t" - "packssdw %%mm5, %%mm5 \n\t" - "packssdw %%mm5, %%mm5 \n\t" - "psubw %%mm5, %%mm7 \n\t" - "pxor %%mm4, %%mm4 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%0, %3), %%mm0 \n\t" - "movq 8(%0, %3), %%mm1 \n\t" - - "pmullw %%mm6, %%mm0 \n\t" - "pmullw %%mm6, %%mm1 \n\t" - - "movq (%0, %3), %%mm2 \n\t" - "movq 8(%0, %3), %%mm3 \n\t" - - "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 - "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 - - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - - "paddw %%mm7, %%mm0 \n\t" - "paddw %%mm7, %%mm1 \n\t" - - "pxor %%mm0, %%mm2 \n\t" - "pxor %%mm1, %%mm3 \n\t" - - "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 - "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? 
-1 : 0 - - "pandn %%mm2, %%mm0 \n\t" - "pandn %%mm3, %%mm1 \n\t" - - "movq %%mm0, (%0, %3) \n\t" - "movq %%mm1, 8(%0, %3) \n\t" - - "add $16, %3 \n\t" - "jng 1b \n\t" - ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) - : "memory" - ); -} - - -/* - We can suppose that result of two multiplications can't be greater than 0xFFFF - i.e. is 16-bit, so we use here only PMULLW instruction and can avoid - a complex multiplication. -===================================================== - Full formula for multiplication of 2 integer numbers - which are represent as high:low words: - input: value1 = high1:low1 - value2 = high2:low2 - output: value3 = value1*value2 - value3=high3:low3 (on overflow: modulus 2^32 wrap-around) - this mean that for 0x123456 * 0x123456 correct result is 0x766cb0ce4 - but this algorithm will compute only 0x66cb0ce4 - this limited by 16-bit size of operands - --------------------------------- - tlow1 = high1*low2 - tlow2 = high2*low1 - tlow1 = tlow1 + tlow2 - high3:low3 = low1*low2 - high3 += tlow1 -*/ -static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, - int16_t *block, int n, int qscale) -{ - x86_reg nCoeffs; - const uint16_t *quant_matrix; - int block0; - - av_assert2(s->block_last_index[n]>=0); - - nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; - - if (n < 4) - block0 = block[0] * s->y_dc_scale; - else - block0 = block[0] * s->c_dc_scale; - /* XXX: only mpeg1 */ - quant_matrix = s->intra_matrix; -__asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $15, %%mm7 \n\t" - "movd %2, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "mov %3, %%"REG_a" \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%0, %%"REG_a"), %%mm0 \n\t" - "movq 8(%0, %%"REG_a"), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm4 \n\t" - "movq 8(%1, %%"REG_a"), %%mm5 \n\t" - "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] - "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] - "pxor %%mm2, %%mm2 \n\t" - "pxor %%mm3, %%mm3 \n\t" - "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 - "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) - "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) - "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q - "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q - "pxor %%mm4, %%mm4 \n\t" - "pxor %%mm5, %%mm5 \n\t" // FIXME slow - "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 - "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 
-1 : 0 - "psraw $3, %%mm0 \n\t" - "psraw $3, %%mm1 \n\t" - "psubw %%mm7, %%mm0 \n\t" - "psubw %%mm7, %%mm1 \n\t" - "por %%mm7, %%mm0 \n\t" - "por %%mm7, %%mm1 \n\t" - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" - "psubw %%mm3, %%mm1 \n\t" - "pandn %%mm0, %%mm4 \n\t" - "pandn %%mm1, %%mm5 \n\t" - "movq %%mm4, (%0, %%"REG_a") \n\t" - "movq %%mm5, 8(%0, %%"REG_a") \n\t" - - "add $16, %%"REG_a" \n\t" - "js 1b \n\t" - ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) - : "%"REG_a, "memory" - ); - block[0]= block0; -} - -static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, - int16_t *block, int n, int qscale) -{ - x86_reg nCoeffs; - const uint16_t *quant_matrix; - - av_assert2(s->block_last_index[n]>=0); - - nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; - - quant_matrix = s->inter_matrix; -__asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $15, %%mm7 \n\t" - "movd %2, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "mov %3, %%"REG_a" \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%0, %%"REG_a"), %%mm0 \n\t" - "movq 8(%0, %%"REG_a"), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm4 \n\t" - "movq 8(%1, %%"REG_a"), %%mm5 \n\t" - "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] - "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] - "pxor %%mm2, %%mm2 \n\t" - "pxor %%mm3, %%mm3 \n\t" - "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 - "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) - "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) - "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 - "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 - "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 - "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 - "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q - "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q - "pxor %%mm4, %%mm4 \n\t" - "pxor %%mm5, %%mm5 \n\t" // FIXME slow - "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 - "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 
-1 : 0 - "psraw $4, %%mm0 \n\t" - "psraw $4, %%mm1 \n\t" - "psubw %%mm7, %%mm0 \n\t" - "psubw %%mm7, %%mm1 \n\t" - "por %%mm7, %%mm0 \n\t" - "por %%mm7, %%mm1 \n\t" - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" - "psubw %%mm3, %%mm1 \n\t" - "pandn %%mm0, %%mm4 \n\t" - "pandn %%mm1, %%mm5 \n\t" - "movq %%mm4, (%0, %%"REG_a") \n\t" - "movq %%mm5, 8(%0, %%"REG_a") \n\t" - - "add $16, %%"REG_a" \n\t" - "js 1b \n\t" - ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) - : "%"REG_a, "memory" - ); -} - -static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, - int16_t *block, int n, int qscale) -{ - x86_reg nCoeffs; - const uint16_t *quant_matrix; - int block0; - - av_assert2(s->block_last_index[n]>=0); - - if(s->alternate_scan) nCoeffs= 63; //FIXME - else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; - - if (n < 4) - block0 = block[0] * s->y_dc_scale; - else - block0 = block[0] * s->c_dc_scale; - quant_matrix = s->intra_matrix; -__asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $15, %%mm7 \n\t" - "movd %2, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "mov %3, %%"REG_a" \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%0, %%"REG_a"), %%mm0 \n\t" - "movq 8(%0, %%"REG_a"), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm4 \n\t" - "movq 8(%1, %%"REG_a"), %%mm5 \n\t" - "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] - "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] - "pxor %%mm2, %%mm2 \n\t" - "pxor %%mm3, %%mm3 \n\t" - "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 - "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) - "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) - "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q - "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q - "pxor %%mm4, %%mm4 \n\t" - "pxor %%mm5, %%mm5 \n\t" // FIXME slow - "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 - "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 
-1 : 0 - "psraw $3, %%mm0 \n\t" - "psraw $3, %%mm1 \n\t" - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" - "psubw %%mm3, %%mm1 \n\t" - "pandn %%mm0, %%mm4 \n\t" - "pandn %%mm1, %%mm5 \n\t" - "movq %%mm4, (%0, %%"REG_a") \n\t" - "movq %%mm5, 8(%0, %%"REG_a") \n\t" - - "add $16, %%"REG_a" \n\t" - "jng 1b \n\t" - ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) - : "%"REG_a, "memory" - ); - block[0]= block0; - //Note, we do not do mismatch control for intra as errors cannot accumulate -} - -static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, - int16_t *block, int n, int qscale) -{ - x86_reg nCoeffs; - const uint16_t *quant_matrix; - - av_assert2(s->block_last_index[n]>=0); - - if(s->alternate_scan) nCoeffs= 63; //FIXME - else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; - - quant_matrix = s->inter_matrix; -__asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlq $48, %%mm7 \n\t" - "movd %2, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "mov %3, %%"REG_a" \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%0, %%"REG_a"), %%mm0 \n\t" - "movq 8(%0, %%"REG_a"), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm4 \n\t" - "movq 8(%1, %%"REG_a"), %%mm5 \n\t" - "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] - "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] - "pxor %%mm2, %%mm2 \n\t" - "pxor %%mm3, %%mm3 \n\t" - "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 - "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) - "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) - "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 - "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 - "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q - "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q - "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q - "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q - "pxor %%mm4, %%mm4 \n\t" - "pxor %%mm5, %%mm5 \n\t" // FIXME slow - "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 - "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 
-1 : 0 - "psrlw $4, %%mm0 \n\t" - "psrlw $4, %%mm1 \n\t" - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" - "psubw %%mm3, %%mm1 \n\t" - "pandn %%mm0, %%mm4 \n\t" - "pandn %%mm1, %%mm5 \n\t" - "pxor %%mm4, %%mm7 \n\t" - "pxor %%mm5, %%mm7 \n\t" - "movq %%mm4, (%0, %%"REG_a") \n\t" - "movq %%mm5, 8(%0, %%"REG_a") \n\t" - - "add $16, %%"REG_a" \n\t" - "jng 1b \n\t" - "movd 124(%0, %3), %%mm0 \n\t" - "movq %%mm7, %%mm6 \n\t" - "psrlq $32, %%mm7 \n\t" - "pxor %%mm6, %%mm7 \n\t" - "movq %%mm7, %%mm6 \n\t" - "psrlq $16, %%mm7 \n\t" - "pxor %%mm6, %%mm7 \n\t" - "pslld $31, %%mm7 \n\t" - "psrlq $15, %%mm7 \n\t" - "pxor %%mm7, %%mm0 \n\t" - "movd %%mm0, 124(%0, %3) \n\t" - - ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs) - : "%"REG_a, "memory" - ); -} - -static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){ - const int intra= s->mb_intra; - int *sum= s->dct_error_sum[intra]; - uint16_t *offset= s->dct_offset[intra]; - - s->dct_count[intra]++; - - __asm__ volatile( - "pxor %%mm7, %%mm7 \n\t" - "1: \n\t" - "pxor %%mm0, %%mm0 \n\t" - "pxor %%mm1, %%mm1 \n\t" - "movq (%0), %%mm2 \n\t" - "movq 8(%0), %%mm3 \n\t" - "pcmpgtw %%mm2, %%mm0 \n\t" - "pcmpgtw %%mm3, %%mm1 \n\t" - "pxor %%mm0, %%mm2 \n\t" - "pxor %%mm1, %%mm3 \n\t" - "psubw %%mm0, %%mm2 \n\t" - "psubw %%mm1, %%mm3 \n\t" - "movq %%mm2, %%mm4 \n\t" - "movq %%mm3, %%mm5 \n\t" - "psubusw (%2), %%mm2 \n\t" - "psubusw 8(%2), %%mm3 \n\t" - "pxor %%mm0, %%mm2 \n\t" - "pxor %%mm1, %%mm3 \n\t" - "psubw %%mm0, %%mm2 \n\t" - "psubw %%mm1, %%mm3 \n\t" - "movq %%mm2, (%0) \n\t" - "movq %%mm3, 8(%0) \n\t" - "movq %%mm4, %%mm2 \n\t" - "movq %%mm5, %%mm3 \n\t" - "punpcklwd %%mm7, %%mm4 \n\t" - "punpckhwd %%mm7, %%mm2 \n\t" - "punpcklwd %%mm7, %%mm5 \n\t" - "punpckhwd %%mm7, %%mm3 \n\t" - "paddd (%1), %%mm4 \n\t" - "paddd 8(%1), %%mm2 \n\t" - "paddd 16(%1), %%mm5 \n\t" - "paddd 24(%1), %%mm3 \n\t" - "movq %%mm4, (%1) \n\t" - "movq %%mm2, 8(%1) \n\t" - "movq %%mm5, 16(%1) \n\t" - "movq %%mm3, 24(%1) \n\t" - "add $16, %0 \n\t" - "add $32, %1 \n\t" - "add $16, %2 \n\t" - "cmp %3, %0 \n\t" - " jb 1b \n\t" - : "+r" (block), "+r" (sum), "+r" (offset) - : "r"(block+64) - ); -} - -static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){ - const int intra= s->mb_intra; - int *sum= s->dct_error_sum[intra]; - uint16_t *offset= s->dct_offset[intra]; - - s->dct_count[intra]++; - - __asm__ volatile( - "pxor %%xmm7, %%xmm7 \n\t" - "1: \n\t" - "pxor %%xmm0, %%xmm0 \n\t" - "pxor %%xmm1, %%xmm1 \n\t" - "movdqa (%0), %%xmm2 \n\t" - "movdqa 16(%0), %%xmm3 \n\t" - "pcmpgtw %%xmm2, %%xmm0 \n\t" - "pcmpgtw %%xmm3, %%xmm1 \n\t" - "pxor %%xmm0, %%xmm2 \n\t" - "pxor %%xmm1, %%xmm3 \n\t" - "psubw %%xmm0, %%xmm2 \n\t" - "psubw %%xmm1, %%xmm3 \n\t" - "movdqa %%xmm2, %%xmm4 \n\t" - "movdqa %%xmm3, %%xmm5 \n\t" - "psubusw (%2), %%xmm2 \n\t" - "psubusw 16(%2), %%xmm3 \n\t" - "pxor %%xmm0, %%xmm2 \n\t" - "pxor %%xmm1, %%xmm3 \n\t" - "psubw %%xmm0, %%xmm2 \n\t" - "psubw %%xmm1, %%xmm3 \n\t" - "movdqa %%xmm2, (%0) \n\t" - "movdqa %%xmm3, 16(%0) \n\t" - "movdqa %%xmm4, %%xmm6 \n\t" - "movdqa %%xmm5, %%xmm0 \n\t" - "punpcklwd %%xmm7, %%xmm4 \n\t" - "punpckhwd %%xmm7, %%xmm6 \n\t" - "punpcklwd %%xmm7, %%xmm5 \n\t" - "punpckhwd %%xmm7, %%xmm0 \n\t" - "paddd (%1), %%xmm4 \n\t" - "paddd 16(%1), %%xmm6 \n\t" - "paddd 32(%1), %%xmm5 \n\t" - "paddd 48(%1), %%xmm0 \n\t" - "movdqa %%xmm4, (%1) \n\t" - "movdqa %%xmm6, 16(%1) \n\t" - "movdqa %%xmm5, 32(%1) \n\t" - "movdqa %%xmm0, 48(%1) \n\t" - "add $32, %0 \n\t" - "add $64, 
%1 \n\t" - "add $32, %2 \n\t" - "cmp %3, %0 \n\t" - " jb 1b \n\t" - : "+r" (block), "+r" (sum), "+r" (offset) - : "r"(block+64) - XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7") - ); -} - -#endif /* HAVE_INLINE_ASM */ - -av_cold void ff_MPV_common_init_x86(MpegEncContext *s) -{ -#if HAVE_INLINE_ASM - int mm_flags = av_get_cpu_flags(); - - if (mm_flags & AV_CPU_FLAG_MMX) { - s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; - s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; - s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; - s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; - if(!(s->flags & CODEC_FLAG_BITEXACT)) - s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; - s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; - - if (mm_flags & AV_CPU_FLAG_SSE2) { - s->denoise_dct= denoise_dct_sse2; - } else { - s->denoise_dct= denoise_dct_mmx; - } - } -#endif /* HAVE_INLINE_ASM */ -} diff --git a/ffmpeg1/libavcodec/x86/mpegvideoenc.c b/ffmpeg1/libavcodec/x86/mpegvideoenc.c deleted file mode 100644 index 6219667..0000000 --- a/ffmpeg1/libavcodec/x86/mpegvideoenc.c +++ /dev/null @@ -1,107 +0,0 @@ -/* - * The simplest mpeg encoder (well, it was the simplest!) - * Copyright (c) 2000,2001 Fabrice Bellard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
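The dct_unquantize_h263_{intra,inter}_mmx routines deleted from mpegvideo.c above vectorise the standard H.263 inverse quantisation: every nonzero coefficient is scaled by qmul = 2*qscale and offset away from zero by qadd = (qscale - 1) | 1, with the sign restored and zero coefficients left untouched (the intra path additionally rescales the DC term by the luma/chroma DC scale). A scalar model of the inter case, taking the last-nonzero index directly instead of an MpegEncContext:

#include <stdint.h>

static void dct_unquantize_h263_inter_scalar(int16_t *block, int qscale, int nCoeffs)
{
    int qmul = qscale << 1;
    int qadd = (qscale - 1) | 1;

    for (int i = 0; i <= nCoeffs; i++) {
        int level = block[i];
        if (level) {
            if (level < 0)
                level = level * qmul - qadd;
            else
                level = level * qmul + qadd;
            block[i] = (int16_t)level;
        }
    }
}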
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/avcodec.h" -#include "libavcodec/dct.h" -#include "libavcodec/mpegvideo.h" -#include "dsputil_mmx.h" - -extern uint16_t ff_inv_zigzag_direct16[64]; - -#if HAVE_MMX_INLINE -#define COMPILE_TEMPLATE_MMXEXT 0 -#define COMPILE_TEMPLATE_SSE2 0 -#define COMPILE_TEMPLATE_SSSE3 0 -#define RENAME(a) a ## _MMX -#define RENAMEl(a) a ## _mmx -#include "mpegvideoenc_template.c" -#endif /* HAVE_MMX_INLINE */ - -#if HAVE_MMXEXT_INLINE -#undef COMPILE_TEMPLATE_SSSE3 -#undef COMPILE_TEMPLATE_SSE2 -#undef COMPILE_TEMPLATE_MMXEXT -#define COMPILE_TEMPLATE_MMXEXT 1 -#define COMPILE_TEMPLATE_SSE2 0 -#define COMPILE_TEMPLATE_SSSE3 0 -#undef RENAME -#undef RENAMEl -#define RENAME(a) a ## _MMXEXT -#define RENAMEl(a) a ## _mmxext -#include "mpegvideoenc_template.c" -#endif /* HAVE_MMXEXT_INLINE */ - -#if HAVE_SSE2_INLINE -#undef COMPILE_TEMPLATE_MMXEXT -#undef COMPILE_TEMPLATE_SSE2 -#undef COMPILE_TEMPLATE_SSSE3 -#define COMPILE_TEMPLATE_MMXEXT 0 -#define COMPILE_TEMPLATE_SSE2 1 -#define COMPILE_TEMPLATE_SSSE3 0 -#undef RENAME -#undef RENAMEl -#define RENAME(a) a ## _SSE2 -#define RENAMEl(a) a ## _sse2 -#include "mpegvideoenc_template.c" -#endif /* HAVE_SSE2_INLINE */ - -#if HAVE_SSSE3_INLINE -#undef COMPILE_TEMPLATE_MMXEXT -#undef COMPILE_TEMPLATE_SSE2 -#undef COMPILE_TEMPLATE_SSSE3 -#define COMPILE_TEMPLATE_MMXEXT 0 -#define COMPILE_TEMPLATE_SSE2 1 -#define COMPILE_TEMPLATE_SSSE3 1 -#undef RENAME -#undef RENAMEl -#define RENAME(a) a ## _SSSE3 -#define RENAMEl(a) a ## _sse2 -#include "mpegvideoenc_template.c" -#endif /* HAVE_SSSE3_INLINE */ - -av_cold void ff_dct_encode_init_x86(MpegEncContext *s) -{ - int mm_flags = av_get_cpu_flags(); - const int dct_algo = s->avctx->dct_algo; - - if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) { -#if HAVE_MMX_INLINE - if (INLINE_MMX(mm_flags)) - s->dct_quantize = dct_quantize_MMX; -#endif -#if HAVE_MMXEXT_INLINE - if (INLINE_MMXEXT(mm_flags)) - s->dct_quantize = dct_quantize_MMXEXT; -#endif -#if HAVE_SSE2_INLINE - if (INLINE_SSE2(mm_flags)) - s->dct_quantize = dct_quantize_SSE2; -#endif -#if HAVE_SSSE3_INLINE - if (INLINE_SSSE3(mm_flags)) - s->dct_quantize = dct_quantize_SSSE3; -#endif - } -} diff --git a/ffmpeg1/libavcodec/x86/mpegvideoenc_template.c b/ffmpeg1/libavcodec/x86/mpegvideoenc_template.c deleted file mode 100644 index 1e0505e..0000000 --- a/ffmpeg1/libavcodec/x86/mpegvideoenc_template.c +++ /dev/null @@ -1,364 +0,0 @@ -/* - * MPEG video MMX templates - * - * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
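mpegvideoenc.c above generates four quantiser variants from a single source by #including mpegvideoenc_template.c (whose body follows) repeatedly with different RENAME/COMPILE_TEMPLATE_* settings. Here is a minimal, hypothetical two-file illustration of that multiple-inclusion pattern; the file and function names are invented, and a real template would also vary its body on the per-variant flags.

/* sum_template.c : only ever #included, never compiled on its own */
static int RENAME(sum_bytes)(const unsigned char *p, int n)
{
    int s = 0;
    for (int i = 0; i < n; i++)
        s += p[i];              /* per-variant code would differ here */
    return s;
}

/* sum.c : instantiates the template twice under different names */
#include <stdio.h>

#define RENAME(a) a ## _c
#include "sum_template.c"
#undef RENAME

#define RENAME(a) a ## _fast
#include "sum_template.c"
#undef RENAME

int main(void)
{
    unsigned char d[4] = { 1, 2, 3, 4 };
    printf("%d %d\n", sum_bytes_c(d, 4), sum_bytes_fast(d, 4));
    return 0;
}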
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#undef MMREG_WIDTH -#undef MM -#undef MOVQ -#undef SPREADW -#undef PMAXW -#undef PMAX -#undef SAVE_SIGN -#undef RESTORE_SIGN - -#if COMPILE_TEMPLATE_SSE2 -#define MMREG_WIDTH "16" -#define MM "%%xmm" -#define MOVQ "movdqa" -#define SPREADW(a) \ - "pshuflw $0, "a", "a" \n\t"\ - "punpcklwd "a", "a" \n\t" -#define PMAXW(a,b) "pmaxsw "a", "b" \n\t" -#define PMAX(a,b) \ - "movhlps "a", "b" \n\t"\ - PMAXW(b, a)\ - "pshuflw $0x0E, "a", "b" \n\t"\ - PMAXW(b, a)\ - "pshuflw $0x01, "a", "b" \n\t"\ - PMAXW(b, a) -#else -#define MMREG_WIDTH "8" -#define MM "%%mm" -#define MOVQ "movq" -#if COMPILE_TEMPLATE_MMXEXT -#define SPREADW(a) "pshufw $0, "a", "a" \n\t" -#define PMAXW(a,b) "pmaxsw "a", "b" \n\t" -#define PMAX(a,b) \ - "pshufw $0x0E, "a", "b" \n\t"\ - PMAXW(b, a)\ - "pshufw $0x01, "a", "b" \n\t"\ - PMAXW(b, a) -#else -#define SPREADW(a) \ - "punpcklwd "a", "a" \n\t"\ - "punpcklwd "a", "a" \n\t" -#define PMAXW(a,b) \ - "psubusw "a", "b" \n\t"\ - "paddw "a", "b" \n\t" -#define PMAX(a,b) \ - "movq "a", "b" \n\t"\ - "psrlq $32, "a" \n\t"\ - PMAXW(b, a)\ - "movq "a", "b" \n\t"\ - "psrlq $16, "a" \n\t"\ - PMAXW(b, a) - -#endif -#endif - -#if COMPILE_TEMPLATE_SSSE3 -#define SAVE_SIGN(a,b) \ - "movdqa "b", "a" \n\t"\ - "pabsw "b", "b" \n\t" -#define RESTORE_SIGN(a,b) \ - "psignw "a", "b" \n\t" -#else -#define SAVE_SIGN(a,b) \ - "pxor "a", "a" \n\t"\ - "pcmpgtw "b", "a" \n\t" /* block[i] <= 0 ? 0xFF : 0x00 */\ - "pxor "a", "b" \n\t"\ - "psubw "a", "b" \n\t" /* ABS(block[i]) */ -#define RESTORE_SIGN(a,b) \ - "pxor "a", "b" \n\t"\ - "psubw "a", "b" \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) -#endif - -static int RENAME(dct_quantize)(MpegEncContext *s, - int16_t *block, int n, - int qscale, int *overflow) -{ - x86_reg last_non_zero_p1; - int level=0, q; //=0 is because gcc says uninitialized ... - const uint16_t *qmat, *bias; - LOCAL_ALIGNED_16(int16_t, temp_block, [64]); - - av_assert2((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly? - - //s->fdct (block); - RENAMEl(ff_fdct) (block); //cannot be anything else ... 
- - if(s->dct_error_sum) - s->denoise_dct(s, block); - - if (s->mb_intra) { - int dummy; - if (n < 4){ - q = s->y_dc_scale; - bias = s->q_intra_matrix16[qscale][1]; - qmat = s->q_intra_matrix16[qscale][0]; - }else{ - q = s->c_dc_scale; - bias = s->q_chroma_intra_matrix16[qscale][1]; - qmat = s->q_chroma_intra_matrix16[qscale][0]; - } - /* note: block[0] is assumed to be positive */ - if (!s->h263_aic) { - __asm__ volatile ( - "mul %%ecx \n\t" - : "=d" (level), "=a"(dummy) - : "a" ((block[0]>>2) + q), "c" (ff_inverse[q<<1]) - ); - } else - /* For AIC we skip quant/dequant of INTRADC */ - level = (block[0] + 4)>>3; - - block[0]=0; //avoid fake overflow -// temp_block[0] = (block[0] + (q >> 1)) / q; - last_non_zero_p1 = 1; - } else { - last_non_zero_p1 = 0; - bias = s->q_inter_matrix16[qscale][1]; - qmat = s->q_inter_matrix16[qscale][0]; - } - - if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){ - - __asm__ volatile( - "movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1 - SPREADW(MM"3") - "pxor "MM"7, "MM"7 \n\t" // 0 - "pxor "MM"4, "MM"4 \n\t" // 0 - MOVQ" (%2), "MM"5 \n\t" // qmat[0] - "pxor "MM"6, "MM"6 \n\t" - "psubw (%3), "MM"6 \n\t" // -bias[0] - "mov $-128, %%"REG_a" \n\t" - ".p2align 4 \n\t" - "1: \n\t" - MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i] - SAVE_SIGN(MM"1", MM"0") // ABS(block[i]) - "psubusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0] - "pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16 - "por "MM"0, "MM"4 \n\t" - RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) - MOVQ" "MM"0, (%5, %%"REG_a") \n\t" - "pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00 - MOVQ" (%4, %%"REG_a"), "MM"1 \n\t" - MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0 - "pandn "MM"1, "MM"0 \n\t" - PMAXW(MM"0", MM"3") - "add $"MMREG_WIDTH", %%"REG_a" \n\t" - " js 1b \n\t" - PMAX(MM"3", MM"0") - "movd "MM"3, %%"REG_a" \n\t" - "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 - : "+a" (last_non_zero_p1) - : "r" (block+64), "r" (qmat), "r" (bias), - "r" (ff_inv_zigzag_direct16+64), "r" (temp_block+64) - XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7") - ); - }else{ // FMT_H263 - __asm__ volatile( - "movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1 - SPREADW(MM"3") - "pxor "MM"7, "MM"7 \n\t" // 0 - "pxor "MM"4, "MM"4 \n\t" // 0 - "mov $-128, %%"REG_a" \n\t" - ".p2align 4 \n\t" - "1: \n\t" - MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i] - SAVE_SIGN(MM"1", MM"0") // ABS(block[i]) - MOVQ" (%3, %%"REG_a"), "MM"6 \n\t" // bias[0] - "paddusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0] - MOVQ" (%2, %%"REG_a"), "MM"5 \n\t" // qmat[i] - "pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16 - "por "MM"0, "MM"4 \n\t" - RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) - MOVQ" "MM"0, (%5, %%"REG_a") \n\t" - "pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 
0xFF : 0x00 - MOVQ" (%4, %%"REG_a"), "MM"1 \n\t" - MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0 - "pandn "MM"1, "MM"0 \n\t" - PMAXW(MM"0", MM"3") - "add $"MMREG_WIDTH", %%"REG_a" \n\t" - " js 1b \n\t" - PMAX(MM"3", MM"0") - "movd "MM"3, %%"REG_a" \n\t" - "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 - : "+a" (last_non_zero_p1) - : "r" (block+64), "r" (qmat+64), "r" (bias+64), - "r" (ff_inv_zigzag_direct16+64), "r" (temp_block+64) - XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7") - ); - } - __asm__ volatile( - "movd %1, "MM"1 \n\t" // max_qcoeff - SPREADW(MM"1") - "psubusw "MM"1, "MM"4 \n\t" - "packuswb "MM"4, "MM"4 \n\t" -#if COMPILE_TEMPLATE_SSE2 - "packuswb "MM"4, "MM"4 \n\t" -#endif - "movd "MM"4, %0 \n\t" // *overflow - : "=g" (*overflow) - : "g" (s->max_qcoeff) - ); - - if(s->mb_intra) block[0]= level; - else block[0]= temp_block[0]; - - if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){ - if(last_non_zero_p1 <= 1) goto end; - block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08]; - block[0x20] = temp_block[0x10]; - if(last_non_zero_p1 <= 4) goto end; - block[0x18] = temp_block[0x09]; block[0x04] = temp_block[0x02]; - block[0x09] = temp_block[0x03]; - if(last_non_zero_p1 <= 7) goto end; - block[0x14] = temp_block[0x0A]; block[0x28] = temp_block[0x11]; - block[0x12] = temp_block[0x18]; block[0x02] = temp_block[0x20]; - if(last_non_zero_p1 <= 11) goto end; - block[0x1A] = temp_block[0x19]; block[0x24] = temp_block[0x12]; - block[0x19] = temp_block[0x0B]; block[0x01] = temp_block[0x04]; - block[0x0C] = temp_block[0x05]; - if(last_non_zero_p1 <= 16) goto end; - block[0x11] = temp_block[0x0C]; block[0x29] = temp_block[0x13]; - block[0x16] = temp_block[0x1A]; block[0x0A] = temp_block[0x21]; - block[0x30] = temp_block[0x28]; block[0x22] = temp_block[0x30]; - block[0x38] = temp_block[0x29]; block[0x06] = temp_block[0x22]; - if(last_non_zero_p1 <= 24) goto end; - block[0x1B] = temp_block[0x1B]; block[0x21] = temp_block[0x14]; - block[0x1C] = temp_block[0x0D]; block[0x05] = temp_block[0x06]; - block[0x0D] = temp_block[0x07]; block[0x15] = temp_block[0x0E]; - block[0x2C] = temp_block[0x15]; block[0x13] = temp_block[0x1C]; - if(last_non_zero_p1 <= 32) goto end; - block[0x0B] = temp_block[0x23]; block[0x34] = temp_block[0x2A]; - block[0x2A] = temp_block[0x31]; block[0x32] = temp_block[0x38]; - block[0x3A] = temp_block[0x39]; block[0x26] = temp_block[0x32]; - block[0x39] = temp_block[0x2B]; block[0x03] = temp_block[0x24]; - if(last_non_zero_p1 <= 40) goto end; - block[0x1E] = temp_block[0x1D]; block[0x25] = temp_block[0x16]; - block[0x1D] = temp_block[0x0F]; block[0x2D] = temp_block[0x17]; - block[0x17] = temp_block[0x1E]; block[0x0E] = temp_block[0x25]; - block[0x31] = temp_block[0x2C]; block[0x2B] = temp_block[0x33]; - if(last_non_zero_p1 <= 48) goto end; - block[0x36] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B]; - block[0x23] = temp_block[0x34]; block[0x3C] = temp_block[0x2D]; - block[0x07] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; - block[0x0F] = temp_block[0x27]; block[0x35] = temp_block[0x2E]; - if(last_non_zero_p1 <= 56) goto end; - block[0x2E] = temp_block[0x35]; block[0x33] = temp_block[0x3C]; - block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36]; - block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37]; - block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; - }else if(s->dsp.idct_permutation_type == FF_LIBMPEG2_IDCT_PERM){ - if(last_non_zero_p1 <= 1) goto end; - block[0x04] = 
temp_block[0x01]; - block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10]; - if(last_non_zero_p1 <= 4) goto end; - block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02]; - block[0x05] = temp_block[0x03]; - if(last_non_zero_p1 <= 7) goto end; - block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11]; - block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20]; - if(last_non_zero_p1 <= 11) goto end; - block[0x1C] = temp_block[0x19]; - block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B]; - block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05]; - if(last_non_zero_p1 <= 16) goto end; - block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13]; - block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21]; - block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30]; - block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22]; - if(last_non_zero_p1 <= 24) goto end; - block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14]; - block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06]; - block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E]; - block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C]; - if(last_non_zero_p1 <= 32) goto end; - block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A]; - block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38]; - block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32]; - block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24]; - if(last_non_zero_p1 <= 40) goto end; - block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16]; - block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17]; - block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25]; - block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33]; - if(last_non_zero_p1 <= 48) goto end; - block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B]; - block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D]; - block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; - block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E]; - if(last_non_zero_p1 <= 56) goto end; - block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C]; - block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36]; - block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37]; - block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; - }else{ - if(last_non_zero_p1 <= 1) goto end; - block[0x01] = temp_block[0x01]; - block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10]; - if(last_non_zero_p1 <= 4) goto end; - block[0x09] = temp_block[0x09]; block[0x02] = temp_block[0x02]; - block[0x03] = temp_block[0x03]; - if(last_non_zero_p1 <= 7) goto end; - block[0x0A] = temp_block[0x0A]; block[0x11] = temp_block[0x11]; - block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20]; - if(last_non_zero_p1 <= 11) goto end; - block[0x19] = temp_block[0x19]; - block[0x12] = temp_block[0x12]; block[0x0B] = temp_block[0x0B]; - block[0x04] = temp_block[0x04]; block[0x05] = temp_block[0x05]; - if(last_non_zero_p1 <= 16) goto end; - block[0x0C] = temp_block[0x0C]; block[0x13] = temp_block[0x13]; - block[0x1A] = temp_block[0x1A]; block[0x21] = temp_block[0x21]; - block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30]; - block[0x29] = temp_block[0x29]; block[0x22] = temp_block[0x22]; - if(last_non_zero_p1 <= 24) goto end; - block[0x1B] = temp_block[0x1B]; block[0x14] = temp_block[0x14]; - block[0x0D] = temp_block[0x0D]; block[0x06] = temp_block[0x06]; - 
block[0x07] = temp_block[0x07]; block[0x0E] = temp_block[0x0E]; - block[0x15] = temp_block[0x15]; block[0x1C] = temp_block[0x1C]; - if(last_non_zero_p1 <= 32) goto end; - block[0x23] = temp_block[0x23]; block[0x2A] = temp_block[0x2A]; - block[0x31] = temp_block[0x31]; block[0x38] = temp_block[0x38]; - block[0x39] = temp_block[0x39]; block[0x32] = temp_block[0x32]; - block[0x2B] = temp_block[0x2B]; block[0x24] = temp_block[0x24]; - if(last_non_zero_p1 <= 40) goto end; - block[0x1D] = temp_block[0x1D]; block[0x16] = temp_block[0x16]; - block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17]; - block[0x1E] = temp_block[0x1E]; block[0x25] = temp_block[0x25]; - block[0x2C] = temp_block[0x2C]; block[0x33] = temp_block[0x33]; - if(last_non_zero_p1 <= 48) goto end; - block[0x3A] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B]; - block[0x34] = temp_block[0x34]; block[0x2D] = temp_block[0x2D]; - block[0x26] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; - block[0x27] = temp_block[0x27]; block[0x2E] = temp_block[0x2E]; - if(last_non_zero_p1 <= 56) goto end; - block[0x35] = temp_block[0x35]; block[0x3C] = temp_block[0x3C]; - block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36]; - block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37]; - block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; - } - end: - return last_non_zero_p1 - 1; -} diff --git a/ffmpeg1/libavcodec/x86/pngdsp.asm b/ffmpeg1/libavcodec/x86/pngdsp.asm deleted file mode 100644 index c05f3da..0000000 --- a/ffmpeg1/libavcodec/x86/pngdsp.asm +++ /dev/null @@ -1,173 +0,0 @@ -;****************************************************************************** -;* x86 optimizations for PNG decoding -;* -;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu> -;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> -;* -;* This file is part of Libav. -;* -;* Libav is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software -;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -cextern pw_255 - -SECTION_TEXT - -; %1 = nr. 
of xmm registers used -%macro ADD_BYTES_FN 1 -cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i -%if ARCH_X86_64 - movsxd waq, wad -%endif - xor iq, iq - - ; vector loop - mov wq, waq - and waq, ~(mmsize*2-1) - jmp .end_v -.loop_v: - mova m0, [src1q+iq] - mova m1, [src1q+iq+mmsize] - paddb m0, [src2q+iq] - paddb m1, [src2q+iq+mmsize] - mova [dstq+iq ], m0 - mova [dstq+iq+mmsize], m1 - add iq, mmsize*2 -.end_v: - cmp iq, waq - jl .loop_v - -%if mmsize == 16 - ; vector loop - mov waq, wq - and waq, ~7 - jmp .end_l -.loop_l: - movq mm0, [src1q+iq] - paddb mm0, [src2q+iq] - movq [dstq+iq ], mm0 - add iq, 8 -.end_l: - cmp iq, waq - jl .loop_l -%endif - - ; scalar loop for leftover - jmp .end_s -.loop_s: - mov wab, [src1q+iq] - add wab, [src2q+iq] - mov [dstq+iq], wab - inc iq -.end_s: - cmp iq, wq - jl .loop_s - REP_RET -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmx -ADD_BYTES_FN 0 -%endif - -INIT_XMM sse2 -ADD_BYTES_FN 2 - -%macro ADD_PAETH_PRED_FN 1 -cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr -%if ARCH_X86_64 - movsxd bppq, bppd - movsxd wq, wd -%endif - lea endq, [dstq+wq-(mmsize/2-1)] - sub topq, dstq - sub srcq, dstq - sub dstq, bppq - pxor m7, m7 - - PUSH dstq - lea cntrq, [bppq-1] - shr cntrq, 2 + mmsize/16 -.bpp_loop: - lea dstq, [dstq+cntrq*(mmsize/2)] - movh m0, [dstq] - movh m1, [topq+dstq] - punpcklbw m0, m7 - punpcklbw m1, m7 - add dstq, bppq -.loop: - mova m2, m1 - movh m1, [topq+dstq] - mova m3, m2 - punpcklbw m1, m7 - mova m4, m2 - psubw m3, m1 - psubw m4, m0 - mova m5, m3 - paddw m5, m4 -%if cpuflag(ssse3) - pabsw m3, m3 - pabsw m4, m4 - pabsw m5, m5 -%else ; !cpuflag(ssse3) - psubw m7, m5 - pmaxsw m5, m7 - pxor m6, m6 - pxor m7, m7 - psubw m6, m3 - psubw m7, m4 - pmaxsw m3, m6 - pmaxsw m4, m7 - pxor m7, m7 -%endif ; cpuflag(ssse3) - mova m6, m4 - pminsw m6, m5 - pcmpgtw m3, m6 - pcmpgtw m4, m5 - mova m6, m4 - pand m4, m3 - pandn m6, m3 - pandn m3, m0 - movh m0, [srcq+dstq] - pand m6, m1 - pand m2, m4 - punpcklbw m0, m7 - paddw m0, m6 - paddw m3, m2 - paddw m0, m3 - pand m0, [pw_255] - mova m3, m0 - packuswb m3, m3 - movh [dstq], m3 - add dstq, bppq - cmp dstq, endq - jle .loop - - mov dstq, [rsp] - dec cntrq - jge .bpp_loop - POP dstq - RET -%endmacro - -INIT_MMX mmxext -ADD_PAETH_PRED_FN 0 - -INIT_MMX ssse3 -ADD_PAETH_PRED_FN 0 diff --git a/ffmpeg1/libavcodec/x86/pngdsp_init.c b/ffmpeg1/libavcodec/x86/pngdsp_init.c deleted file mode 100644 index 4c54ed3..0000000 --- a/ffmpeg1/libavcodec/x86/pngdsp_init.c +++ /dev/null @@ -1,50 +0,0 @@ -/* - * x86 PNG optimizations. - * Copyright (c) 2008 Loren Merrit <lorenm@u.washington.edu> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
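The ADD_PAETH_PRED_FN macro deleted above (pngdsp.asm) vectorises reconstruction of PNG rows written with the Paeth filter. The predictor itself is fixed by the PNG specification; below is a scalar sketch of the predictor and of the per-row add that ff_add_png_paeth_prediction_* performs, simplified in that the first bpp bytes are guarded here while the real decoder handles them before calling the DSP function.

#include <stdint.h>
#include <stdlib.h>

/* a = left, b = above, c = above-left */
static int paeth(int a, int b, int c)
{
    int p  = a + b - c;
    int pa = abs(p - a), pb = abs(p - b), pc = abs(p - c);
    if (pa <= pb && pa <= pc)
        return a;
    return pb <= pc ? b : c;
}

static void add_paeth_row(uint8_t *dst, const uint8_t *src,
                          const uint8_t *top, int w, int bpp)
{
    for (int i = 0; i < w; i++) {
        int a = i >= bpp ? dst[i - bpp] : 0;
        int b = top[i];
        int c = i >= bpp ? top[i - bpp] : 0;
        dst[i] = (uint8_t)(src[i] + paeth(a, b, c));   /* wraps mod 256 */
    }
}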
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/common.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/pngdsp.h" - -void ff_add_png_paeth_prediction_mmxext(uint8_t *dst, uint8_t *src, - uint8_t *top, int w, int bpp); -void ff_add_png_paeth_prediction_ssse3(uint8_t *dst, uint8_t *src, - uint8_t *top, int w, int bpp); -void ff_add_bytes_l2_mmx (uint8_t *dst, uint8_t *src1, - uint8_t *src2, int w); -void ff_add_bytes_l2_sse2(uint8_t *dst, uint8_t *src1, - uint8_t *src2, int w); - -av_cold void ff_pngdsp_init_x86(PNGDSPContext *dsp) -{ - int flags = av_get_cpu_flags(); - -#if ARCH_X86_32 - if (EXTERNAL_MMX(flags)) - dsp->add_bytes_l2 = ff_add_bytes_l2_mmx; -#endif - if (EXTERNAL_MMXEXT(flags)) - dsp->add_paeth_prediction = ff_add_png_paeth_prediction_mmxext; - if (EXTERNAL_SSE2(flags)) - dsp->add_bytes_l2 = ff_add_bytes_l2_sse2; - if (EXTERNAL_SSSE3(flags)) - dsp->add_paeth_prediction = ff_add_png_paeth_prediction_ssse3; -} diff --git a/ffmpeg1/libavcodec/x86/proresdsp.asm b/ffmpeg1/libavcodec/x86/proresdsp.asm deleted file mode 100644 index aedacc2..0000000 --- a/ffmpeg1/libavcodec/x86/proresdsp.asm +++ /dev/null @@ -1,326 +0,0 @@ -;****************************************************************************** -;* x86-SIMD-optimized IDCT for prores -;* this is identical to "simple" IDCT written by Michael Niedermayer -;* except for the clip range -;* -;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
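ff_pngdsp_init_x86() above only swaps in faster versions of two byte-wise loops; the ADD_BYTES_FN kernel it selects is, in scalar terms, nothing more than a modular per-byte sum of two buffers (a sketch, not the decoder's actual C fallback):

    #include <stdint.h>

    /* dst[i] = src1[i] + src2[i] mod 256 -- the paddb vector loop, the
     * 8-byte MMX tail and the final byte loop above all compute this. */
    static void add_bytes_l2_c(uint8_t *dst, const uint8_t *src1,
                               const uint8_t *src2, int w)
    {
        for (int i = 0; i < w; i++)
            dst[i] = src1[i] + src2[i];
    }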
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1 -%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1 -%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2 -%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1 -%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1 -%define W6sh2 8867 ; W6 = 35468 = 8867<<2 -%define W7sh2 4520 ; W7 = 18081 = 4520<<2 + 1 - -%if ARCH_X86_64 - -SECTION_RODATA - -w4_plus_w2: times 4 dw W4sh2, +W2sh2 -w4_min_w2: times 4 dw W4sh2, -W2sh2 -w4_plus_w6: times 4 dw W4sh2, +W6sh2 -w4_min_w6: times 4 dw W4sh2, -W6sh2 -w1_plus_w3: times 4 dw W1sh2, +W3sh2 -w3_min_w1: times 4 dw W3sh2, -W1sh2 -w7_plus_w3: times 4 dw W7sh2, +W3sh2 -w3_min_w7: times 4 dw W3sh2, -W7sh2 -w1_plus_w5: times 4 dw W1sh2, +W5sh2 -w5_min_w1: times 4 dw W5sh2, -W1sh2 -w5_plus_w7: times 4 dw W5sh2, +W7sh2 -w7_min_w5: times 4 dw W7sh2, -W5sh2 -pw_88: times 8 dw 0x2008 - -cextern pw_1 -cextern pw_4 -cextern pw_512 -cextern pw_1019 - -section .text align=16 - -; interleave data while maintaining source -; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave -%macro SBUTTERFLY3 5 - punpckl%1 m%2, m%4, m%5 - punpckh%1 m%3, m%4, m%5 -%endmacro - -; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift -; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6 -; %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3) -%macro SUMSUB_SHPK 7 - psubd %3, %1, %5 ; { a0 - b0 }[0-3] - psubd %4, %2, %6 ; { a0 - b0 }[4-7] - paddd %1, %5 ; { a0 + b0 }[0-3] - paddd %2, %6 ; { a0 + b0 }[4-7] - psrad %1, %7 - psrad %2, %7 - psrad %3, %7 - psrad %4, %7 - packssdw %1, %2 ; row[0] - packssdw %3, %4 ; row[7] -%endmacro - -; %1 = row or col (for rounding variable) -; %2 = number of bits to shift at the end -%macro IDCT_1D 2 - ; a0 = (W4 * row[0]) + (1 << (15 - 1)); - ; a1 = a0; - ; a2 = a0; - ; a3 = a0; - ; a0 += W2 * row[2]; - ; a1 += W6 * row[2]; - ; a2 -= W6 * row[2]; - ; a3 -= W2 * row[2]; -%ifidn %1, col - paddw m10,[pw_88] -%endif -%ifidn %1, row - paddw m10,[pw_1] -%endif - SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7] - pmaddwd m2, m0, [w4_plus_w6] - pmaddwd m3, m1, [w4_plus_w6] - pmaddwd m4, m0, [w4_min_w6] - pmaddwd m5, m1, [w4_min_w6] - pmaddwd m6, m0, [w4_min_w2] - pmaddwd m7, m1, [w4_min_w2] - pmaddwd m0, [w4_plus_w2] - pmaddwd m1, [w4_plus_w2] - - ; a0: -1*row[0]-1*row[2] - ; a1: -1*row[0] - ; a2: -1*row[0] - ; a3: -1*row[0]+1*row[2] - - ; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4] - ; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6] - ; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6] - ; a3 += W4*row[4] - W6*row[6]; i.e. 
-1*row[4] - SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7] - pmaddwd m10, m8, [w4_plus_w6] - pmaddwd m11, m9, [w4_plus_w6] - paddd m0, m10 ; a0[0-3] - paddd m1, m11 ; a0[4-7] - pmaddwd m10, m8, [w4_min_w6] - pmaddwd m11, m9, [w4_min_w6] - paddd m6, m10 ; a3[0-3] - paddd m7, m11 ; a3[4-7] - pmaddwd m10, m8, [w4_min_w2] - pmaddwd m11, m9, [w4_min_w2] - pmaddwd m8, [w4_plus_w2] - pmaddwd m9, [w4_plus_w2] - psubd m4, m10 ; a2[0-3] intermediate - psubd m5, m11 ; a2[4-7] intermediate - psubd m2, m8 ; a1[0-3] intermediate - psubd m3, m9 ; a1[4-7] intermediate - - ; load/store - mova [r2+ 0], m0 - mova [r2+ 32], m2 - mova [r2+ 64], m4 - mova [r2+ 96], m6 - mova m10,[r2+ 16] ; { row[1] }[0-7] - mova m8, [r2+ 48] ; { row[3] }[0-7] - mova m13,[r2+ 80] ; { row[5] }[0-7] - mova m14,[r2+112] ; { row[7] }[0-7] - mova [r2+ 16], m1 - mova [r2+ 48], m3 - mova [r2+ 80], m5 - mova [r2+112], m7 -%ifidn %1, row - pmullw m10,[r3+ 16] - pmullw m8, [r3+ 48] - pmullw m13,[r3+ 80] - pmullw m14,[r3+112] -%endif - - ; b0 = MUL(W1, row[1]); - ; MAC(b0, W3, row[3]); - ; b1 = MUL(W3, row[1]); - ; MAC(b1, -W7, row[3]); - ; b2 = MUL(W5, row[1]); - ; MAC(b2, -W1, row[3]); - ; b3 = MUL(W7, row[1]); - ; MAC(b3, -W5, row[3]); - SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7] - pmaddwd m2, m0, [w3_min_w7] - pmaddwd m3, m1, [w3_min_w7] - pmaddwd m4, m0, [w5_min_w1] - pmaddwd m5, m1, [w5_min_w1] - pmaddwd m6, m0, [w7_min_w5] - pmaddwd m7, m1, [w7_min_w5] - pmaddwd m0, [w1_plus_w3] - pmaddwd m1, [w1_plus_w3] - - ; b0: +1*row[1]+2*row[3] - ; b1: +2*row[1]-1*row[3] - ; b2: -1*row[1]-1*row[3] - ; b3: +1*row[1]+1*row[3] - - ; MAC(b0, W5, row[5]); - ; MAC(b0, W7, row[7]); - ; MAC(b1, -W1, row[5]); - ; MAC(b1, -W5, row[7]); - ; MAC(b2, W7, row[5]); - ; MAC(b2, W3, row[7]); - ; MAC(b3, W3, row[5]); - ; MAC(b3, -W1, row[7]); - SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7] - - ; b0: -1*row[5]+1*row[7] - ; b1: -1*row[5]+1*row[7] - ; b2: +1*row[5]+2*row[7] - ; b3: +2*row[5]-1*row[7] - - pmaddwd m10, m8, [w1_plus_w5] - pmaddwd m11, m9, [w1_plus_w5] - pmaddwd m12, m8, [w5_plus_w7] - pmaddwd m13, m9, [w5_plus_w7] - psubd m2, m10 ; b1[0-3] - psubd m3, m11 ; b1[4-7] - paddd m0, m12 ; b0[0-3] - paddd m1, m13 ; b0[4-7] - pmaddwd m12, m8, [w7_plus_w3] - pmaddwd m13, m9, [w7_plus_w3] - pmaddwd m8, [w3_min_w1] - pmaddwd m9, [w3_min_w1] - paddd m4, m12 ; b2[0-3] - paddd m5, m13 ; b2[4-7] - paddd m6, m8 ; b3[0-3] - paddd m7, m9 ; b3[4-7] - - ; row[0] = (a0 + b0) >> 15; - ; row[7] = (a0 - b0) >> 15; - ; row[1] = (a1 + b1) >> 15; - ; row[6] = (a1 - b1) >> 15; - ; row[2] = (a2 + b2) >> 15; - ; row[5] = (a2 - b2) >> 15; - ; row[3] = (a3 + b3) >> 15; - ; row[4] = (a3 - b3) >> 15; - mova m8, [r2+ 0] ; a0[0-3] - mova m9, [r2+16] ; a0[4-7] - SUMSUB_SHPK m8, m9, m10, m11, m0, m1, %2 - mova m0, [r2+32] ; a1[0-3] - mova m1, [r2+48] ; a1[4-7] - SUMSUB_SHPK m0, m1, m9, m11, m2, m3, %2 - mova m1, [r2+64] ; a2[0-3] - mova m2, [r2+80] ; a2[4-7] - SUMSUB_SHPK m1, m2, m11, m3, m4, m5, %2 - mova m2, [r2+96] ; a3[0-3] - mova m3, [r2+112] ; a3[4-7] - SUMSUB_SHPK m2, m3, m4, m5, m6, m7, %2 -%endmacro - -; void prores_idct_put_10_<opt>(uint8_t *pixels, int stride, -; int16_t *block, const int16_t *qmat); -%macro idct_put_fn 1 -cglobal prores_idct_put_10, 4, 4, %1 - movsxd r1, r1d - pxor m15, m15 ; zero - - ; for (i = 0; i < 8; i++) - ; idctRowCondDC(block + i*8); - mova m10,[r2+ 0] ; { row[0] }[0-7] - mova m8, [r2+32] ; { row[2] }[0-7] - mova m13,[r2+64] ; { row[4] }[0-7] - mova m12,[r2+96] ; { row[6] }[0-7] - - pmullw m10,[r3+ 0] - 
pmullw m8, [r3+32] - pmullw m13,[r3+64] - pmullw m12,[r3+96] - - IDCT_1D row, 15 - - ; transpose for second part of IDCT - TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3 - mova [r2+ 16], m0 - mova [r2+ 48], m2 - mova [r2+ 80], m11 - mova [r2+112], m10 - SWAP 8, 10 - SWAP 1, 8 - SWAP 4, 13 - SWAP 9, 12 - - ; for (i = 0; i < 8; i++) - ; idctSparseColAdd(dest + i, line_size, block + i); - IDCT_1D col, 18 - - ; clip/store - mova m3, [pw_4] - mova m5, [pw_1019] - pmaxsw m8, m3 - pmaxsw m0, m3 - pmaxsw m1, m3 - pmaxsw m2, m3 - pmaxsw m4, m3 - pmaxsw m11, m3 - pmaxsw m9, m3 - pmaxsw m10, m3 - pminsw m8, m5 - pminsw m0, m5 - pminsw m1, m5 - pminsw m2, m5 - pminsw m4, m5 - pminsw m11, m5 - pminsw m9, m5 - pminsw m10, m5 - - lea r2, [r1*3] - mova [r0 ], m8 - mova [r0+r1 ], m0 - mova [r0+r1*2], m1 - mova [r0+r2 ], m2 - lea r0, [r0+r1*4] - mova [r0 ], m4 - mova [r0+r1 ], m11 - mova [r0+r1*2], m9 - mova [r0+r2 ], m10 - RET -%endmacro - -%macro SIGNEXTEND 2-3 -%if cpuflag(sse4) ; dstlow, dsthigh - movhlps %2, %1 - pmovsxwd %1, %1 - pmovsxwd %2, %2 -%elif cpuflag(sse2) ; dstlow, dsthigh, tmp - pxor %3, %3 - pcmpgtw %3, %1 - mova %2, %1 - punpcklwd %1, %3 - punpckhwd %2, %3 -%endif -%endmacro - -INIT_XMM sse2 -idct_put_fn 16 -INIT_XMM sse4 -idct_put_fn 16 -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -idct_put_fn 16 -%endif - -%endif diff --git a/ffmpeg1/libavcodec/x86/proresdsp_init.c b/ffmpeg1/libavcodec/x86/proresdsp_init.c deleted file mode 100644 index 91ff257..0000000 --- a/ffmpeg1/libavcodec/x86/proresdsp_init.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Apple ProRes compatible decoder - * - * Copyright (c) 2010-2011 Maxim Poliakovski - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
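The pmaxsw/pminsw block above clamps every reconstructed sample to the 10-bit range the ProRes decoder uses, 4..1019 (pw_4/pw_1019), rather than the full 0..1023; a scalar equivalent of that clip (helper name made up for illustration):

    #include <stdint.h>

    /* Clamp one IDCT output sample to the nominal 10-bit ProRes range. */
    static inline uint16_t prores_clip10(int v)
    {
        if (v <    4) return 4;
        if (v > 1019) return 1019;
        return (uint16_t)v;
    }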
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/x86/cpu.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/proresdsp.h" - -void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize, - int16_t *block, const int16_t *qmat); -void ff_prores_idct_put_10_sse4(uint16_t *dst, int linesize, - int16_t *block, const int16_t *qmat); -void ff_prores_idct_put_10_avx (uint16_t *dst, int linesize, - int16_t *block, const int16_t *qmat); - -void ff_proresdsp_x86_init(ProresDSPContext *dsp, AVCodecContext *avctx) -{ -#if ARCH_X86_64 - int flags = av_get_cpu_flags(); - - if(avctx->flags & CODEC_FLAG_BITEXACT) - return; - - if (EXTERNAL_SSE2(flags)) { - dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; - dsp->idct_put = ff_prores_idct_put_10_sse2; - } - - if (EXTERNAL_SSE4(flags)) { - dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; - dsp->idct_put = ff_prores_idct_put_10_sse4; - } - - if (EXTERNAL_AVX(flags)) { - dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; - dsp->idct_put = ff_prores_idct_put_10_avx; - } -#endif /* ARCH_X86_64 */ -} diff --git a/ffmpeg1/libavcodec/x86/qpelbase.asm b/ffmpeg1/libavcodec/x86/qpelbase.asm deleted file mode 100644 index c2ffb86..0000000 --- a/ffmpeg1/libavcodec/x86/qpelbase.asm +++ /dev/null @@ -1,176 +0,0 @@ -;****************************************************************************** -;* MMX optimized DSP utils -;* Copyright (c) 2008 Loren Merritt -;* Copyright (c) 2003-2013 Michael Niedermayer -;* Copyright (c) 2013 Daniel Kang -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION .text - -%macro op_avgh 3 - movh %3, %2 - pavgb %1, %3 - movh %2, %1 -%endmacro - -%macro op_avg 2 - pavgb %1, %2 - mova %2, %1 -%endmacro - -%macro op_puth 2-3 - movh %2, %1 -%endmacro - -%macro op_put 2 - mova %2, %1 -%endmacro - -; void pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -%macro PIXELS4_L2 1 -%define OP op_%1h -cglobal %1_pixels4_l2, 6,6 - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d - test r5d, 1 - je .loop - movd m0, [r1] - movd m1, [r2] - add r1, r4 - add r2, 4 - pavgb m0, m1 - OP m0, [r0], m3 - add r0, r3 - dec r5d -.loop: - mova m0, [r1] - mova m1, [r1+r4] - lea r1, [r1+2*r4] - pavgb m0, [r2] - pavgb m1, [r2+4] - OP m0, [r0], m3 - OP m1, [r0+r3], m3 - lea r0, [r0+2*r3] - mova m0, [r1] - mova m1, [r1+r4] - lea r1, [r1+2*r4] - pavgb m0, [r2+8] - pavgb m1, [r2+12] - OP m0, [r0], m3 - OP m1, [r0+r3], m3 - lea r0, [r0+2*r3] - add r2, 16 - sub r5d, 4 - jne .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PIXELS4_L2 put -PIXELS4_L2 avg - -; void pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -%macro PIXELS8_L2 1 -%define OP op_%1 -cglobal %1_pixels8_l2, 6,6 - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d - test r5d, 1 - je .loop - mova m0, [r1] - mova m1, [r2] - add r1, r4 - add r2, 8 - pavgb m0, m1 - OP m0, [r0] - add r0, r3 - dec r5d -.loop: - mova m0, [r1] - mova m1, [r1+r4] - lea r1, [r1+2*r4] - pavgb m0, [r2] - pavgb m1, [r2+8] - OP m0, [r0] - OP m1, [r0+r3] - lea r0, [r0+2*r3] - mova m0, [r1] - mova m1, [r1+r4] - lea r1, [r1+2*r4] - pavgb m0, [r2+16] - pavgb m1, [r2+24] - OP m0, [r0] - OP m1, [r0+r3] - lea r0, [r0+2*r3] - add r2, 32 - sub r5d, 4 - jne .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PIXELS8_L2 put -PIXELS8_L2 avg - -; void pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -%macro PIXELS16_L2 1 -%define OP op_%1 -cglobal %1_pixels16_l2, 6,6 - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d - test r5d, 1 - je .loop - mova m0, [r1] - mova m1, [r1+8] - pavgb m0, [r2] - pavgb m1, [r2+8] - add r1, r4 - add r2, 16 - OP m0, [r0] - OP m1, [r0+8] - add r0, r3 - dec r5d -.loop: - mova m0, [r1] - mova m1, [r1+8] - add r1, r4 - pavgb m0, [r2] - pavgb m1, [r2+8] - OP m0, [r0] - OP m1, [r0+8] - add r0, r3 - mova m0, [r1] - mova m1, [r1+8] - add r1, r4 - pavgb m0, [r2+16] - pavgb m1, [r2+24] - OP m0, [r0] - OP m1, [r0+8] - add r0, r3 - add r2, 32 - sub r5d, 2 - jne .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PIXELS16_L2 put -PIXELS16_L2 avg diff --git a/ffmpeg1/libavcodec/x86/rv34dsp.asm b/ffmpeg1/libavcodec/x86/rv34dsp.asm deleted file mode 100644 index 4d9c35b..0000000 --- a/ffmpeg1/libavcodec/x86/rv34dsp.asm +++ /dev/null @@ -1,196 +0,0 @@ -;****************************************************************************** -;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders -;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> -;* -;* This file is part of Libav. 
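The PIXELS4/8/16_L2 macros above lean entirely on pavgb, i.e. a per-byte average rounded upwards; a scalar sketch of the put variant (width is a parameter here, whereas the asm fixes it per function):

    #include <stdint.h>
    #include <stddef.h>

    /* dst = (src1 + src2 + 1) >> 1 per byte; src2 is read packed row after
     * row, matching the 'add r2, 4/8/16' pointer walk in the macros above. */
    static void put_pixels_l2_c(uint8_t *dst, const uint8_t *src1,
                                const uint8_t *src2, ptrdiff_t dst_stride,
                                ptrdiff_t src1_stride, int w, int h)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++)
                dst[x] = (src1[x] + src2[x] + 1) >> 1;
            dst  += dst_stride;
            src1 += src1_stride;
            src2 += w;
        }
    }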
-;* -;* Libav is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA -pw_row_coeffs: times 4 dw 13 - times 4 dw 17 - times 4 dw 7 -pd_512: times 2 dd 0x200 -pw_col_coeffs: dw 13, 13, 13, -13 - dw 17, 7, 7, -17 - dw 13, -13, 13, 13 - dw -7, 17, -17, -7 - -SECTION .text - -%macro IDCT_DC_NOROUND 1 - imul %1, 13*13*3 - sar %1, 11 -%endmacro - -%macro IDCT_DC_ROUND 1 - imul %1, 13*13 - add %1, 0x200 - sar %1, 10 -%endmacro - -%macro rv34_idct 1 -cglobal rv34_idct_%1, 1, 2, 0 - movsx r1, word [r0] - IDCT_DC r1 - movd m0, r1d - pshufw m0, m0, 0 - movq [r0+ 0], m0 - movq [r0+ 8], m0 - movq [r0+16], m0 - movq [r0+24], m0 - REP_RET -%endmacro - -INIT_MMX mmxext -%define IDCT_DC IDCT_DC_ROUND -rv34_idct dc -%define IDCT_DC IDCT_DC_NOROUND -rv34_idct dc_noround - -; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc); -INIT_MMX mmx -cglobal rv34_idct_dc_add, 3, 3 - ; calculate DC - IDCT_DC_ROUND r2 - pxor m1, m1 - movd m0, r2d - psubw m1, m0 - packuswb m0, m0 - packuswb m1, m1 - punpcklbw m0, m0 - punpcklbw m1, m1 - punpcklwd m0, m0 - punpcklwd m1, m1 - - ; add DC - lea r2, [r0+r1*2] - movh m2, [r0] - movh m3, [r0+r1] - movh m4, [r2] - movh m5, [r2+r1] - paddusb m2, m0 - paddusb m3, m0 - paddusb m4, m0 - paddusb m5, m0 - psubusb m2, m1 - psubusb m3, m1 - psubusb m4, m1 - psubusb m5, m1 - movh [r0], m2 - movh [r0+r1], m3 - movh [r2], m4 - movh [r2+r1], m5 - RET - -; Load coeffs and perform row transform -; Output: coeffs in mm[0467], rounder in mm5 -%macro ROW_TRANSFORM 1 - pxor mm7, mm7 - mova mm0, [%1+ 0*8] - mova mm1, [%1+ 1*8] - mova mm2, [%1+ 2*8] - mova mm3, [%1+ 3*8] - mova [%1+ 0*8], mm7 - mova [%1+ 1*8], mm7 - mova [%1+ 2*8], mm7 - mova [%1+ 3*8], mm7 - mova mm4, mm0 - mova mm6, [pw_row_coeffs+ 0] - paddsw mm0, mm2 ; b0 + b2 - psubsw mm4, mm2 ; b0 - b2 - pmullw mm0, mm6 ; *13 = z0 - pmullw mm4, mm6 ; *13 = z1 - mova mm5, mm1 - pmullw mm1, [pw_row_coeffs+ 8] ; b1*17 - pmullw mm5, [pw_row_coeffs+16] ; b1* 7 - mova mm7, mm3 - pmullw mm3, [pw_row_coeffs+ 8] ; b3*17 - pmullw mm7, [pw_row_coeffs+16] ; b3* 7 - paddsw mm1, mm7 ; z3 = b1*17 + b3* 7 - psubsw mm5, mm3 ; z2 = b1* 7 - b3*17 - mova mm7, mm0 - mova mm6, mm4 - paddsw mm0, mm1 ; z0 + z3 - psubsw mm7, mm1 ; z0 - z3 - paddsw mm4, mm5 ; z1 + z2 - psubsw mm6, mm5 ; z1 - z2 - mova mm5, [pd_512] ; 0x200 -%endmacro - -; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block); -%macro COL_TRANSFORM 4 - pshufw mm3, %2, 0xDD ; col. 1,3,1,3 - pshufw %2, %2, 0x88 ; col. 
0,2,0,2 - pmaddwd %2, %3 ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1 - pmaddwd mm3, %4 ; 17*c1+ 7*c3 | 7*c1-17*c3 = z3 | z2 - paddd %2, mm5 - pshufw mm1, %2, 01001110b ; z1 | z0 - pshufw mm2, mm3, 01001110b ; z2 | z3 - paddd %2, mm3 ; z0+z3 | z1+z2 - psubd mm1, mm2 ; z1-z2 | z0-z3 - movd mm3, %1 - psrad %2, 10 - pxor mm2, mm2 - psrad mm1, 10 - punpcklbw mm3, mm2 - packssdw %2, mm1 - paddw %2, mm3 - packuswb %2, %2 - movd %1, %2 -%endmacro -INIT_MMX mmxext -cglobal rv34_idct_add, 3,3,0, d, s, b - ROW_TRANSFORM bq - COL_TRANSFORM [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8] - mova mm0, [pw_col_coeffs+ 0] - COL_TRANSFORM [dq+sq], mm4, mm0, [pw_col_coeffs+ 8] - mova mm4, [pw_col_coeffs+ 8] - lea dq, [dq + 2*sq] - COL_TRANSFORM [dq], mm6, mm0, mm4 - COL_TRANSFORM [dq+sq], mm7, mm0, mm4 - ret - -; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc); -INIT_XMM sse4 -cglobal rv34_idct_dc_add, 3, 3, 6 - ; load data - IDCT_DC_ROUND r2 - pxor m1, m1 - - ; calculate DC - movd m0, r2d - lea r2, [r0+r1*2] - movd m2, [r0] - movd m3, [r0+r1] - pshuflw m0, m0, 0 - movd m4, [r2] - movd m5, [r2+r1] - punpcklqdq m0, m0 - punpckldq m2, m3 - punpckldq m4, m5 - punpcklbw m2, m1 - punpcklbw m4, m1 - paddw m2, m0 - paddw m4, m0 - packuswb m2, m4 - movd [r0], m2 - pextrd [r0+r1], m2, 1 - pextrd [r2], m2, 2 - pextrd [r2+r1], m2, 3 - RET diff --git a/ffmpeg1/libavcodec/x86/rv34dsp_init.c b/ffmpeg1/libavcodec/x86/rv34dsp_init.c deleted file mode 100644 index a2dea74..0000000 --- a/ffmpeg1/libavcodec/x86/rv34dsp_init.c +++ /dev/null @@ -1,45 +0,0 @@ -/* - * RV30/40 MMX/SSE2 optimizations - * Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
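The RV30/40 4x4 transform handled above is small enough to write out in full; a scalar sketch of the butterfly that ROW_TRANSFORM/COL_TRANSFORM implement, plus the two DC-only shortcuts of IDCT_DC_ROUND and IDCT_DC_NOROUND:

    /* One 4-point RV34 butterfly (rows and columns share the structure). */
    static void rv34_butterfly(const int b[4], int out[4])
    {
        int z0 = 13 * (b[0] + b[2]);
        int z1 = 13 * (b[0] - b[2]);
        int z2 =  7 * b[1] - 17 * b[3];
        int z3 = 17 * b[1] +  7 * b[3];

        out[0] = z0 + z3;
        out[1] = z1 + z2;
        out[2] = z1 - z2;
        out[3] = z0 - z3;
    }

    /* DC-only block: every output delta collapses to a single multiply. */
    static int rv34_dc_round(int dc)   { return (13 * 13 * dc + 0x200) >> 10; }
    static int rv34_dc_noround(int dc) { return (13 * 13 * 3 * dc) >> 11; }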
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/rv34dsp.h" - -void ff_rv34_idct_dc_mmxext(int16_t *block); -void ff_rv34_idct_dc_noround_mmxext(int16_t *block); -void ff_rv34_idct_dc_add_mmx(uint8_t *dst, ptrdiff_t stride, int dc); -void ff_rv34_idct_dc_add_sse4(uint8_t *dst, ptrdiff_t stride, int dc); -void ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block); - -av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c) -{ - int mm_flags = av_get_cpu_flags(); - - if (EXTERNAL_MMX(mm_flags)) - c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx; - if (EXTERNAL_MMXEXT(mm_flags)) { - c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmxext; - c->rv34_idct_add = ff_rv34_idct_add_mmxext; - } - if (EXTERNAL_SSE4(mm_flags)) - c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4; -} diff --git a/ffmpeg1/libavcodec/x86/rv40dsp.asm b/ffmpeg1/libavcodec/x86/rv40dsp.asm deleted file mode 100644 index 7ec72be..0000000 --- a/ffmpeg1/libavcodec/x86/rv40dsp.asm +++ /dev/null @@ -1,505 +0,0 @@ -;****************************************************************************** -;* MMX/SSE2-optimized functions for the RV40 decoder -;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> -;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> -;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> -;* -;* This file is part of Libav. -;* -;* Libav is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -align 16 -pw_1024: times 8 dw 1 << (16 - 6) ; pw_1024 - -sixtap_filter_hb_m: times 8 db 1, -5 - times 8 db 52, 20 - ; multiplied by 2 to have the same shift - times 8 db 2, -10 - times 8 db 40, 40 - ; back to normal - times 8 db 1, -5 - times 8 db 20, 52 - -sixtap_filter_v_m: times 8 dw 1 - times 8 dw -5 - times 8 dw 52 - times 8 dw 20 - ; multiplied by 2 to have the same shift - times 8 dw 2 - times 8 dw -10 - times 8 dw 40 - times 8 dw 40 - ; back to normal - times 8 dw 1 - times 8 dw -5 - times 8 dw 20 - times 8 dw 52 - -%ifdef PIC -%define sixtap_filter_hw picregq -%define sixtap_filter_hb picregq -%define sixtap_filter_v picregq -%define npicregs 1 -%else -%define sixtap_filter_hw sixtap_filter_hw_m -%define sixtap_filter_hb sixtap_filter_hb_m -%define sixtap_filter_v sixtap_filter_v_m -%define npicregs 0 -%endif - -filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11 - -cextern pw_32 -cextern pw_16 -cextern pw_512 - -SECTION .text - -;----------------------------------------------------------------------------- -; subpel MC functions: -; -; void [put|rv40]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride, -; uint8_t *src, int srcstride, -; int len, int m); -;---------------------------------------------------------------------- -%macro LOAD 2 -%if WIN64 - movsxd %1q, %1d -%endif -%ifdef PIC - add %1q, picregq -%else - add %1q, %2 -%endif -%endmacro - -%macro STORE 3 -%ifidn %3, avg - movh %2, [dstq] -%endif - packuswb %1, %1 -%ifidn %3, avg -%if cpuflag(3dnow) - pavgusb %1, %2 -%else - pavgb %1, %2 -%endif -%endif - movh [dstq], %1 -%endmacro - -%macro FILTER_V 1 -cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg -%ifdef PIC - lea picregq, [sixtap_filter_v_m] -%endif - pxor m7, m7 - LOAD my, sixtap_filter_v - - ; read 5 lines - sub srcq, srcstrideq - sub srcq, srcstrideq - movh m0, [srcq] - movh m1, [srcq+srcstrideq] - movh m2, [srcq+srcstrideq*2] - lea srcq, [srcq+srcstrideq*2] - add srcq, srcstrideq - movh m3, [srcq] - movh m4, [srcq+srcstrideq] - punpcklbw m0, m7 - punpcklbw m1, m7 - punpcklbw m2, m7 - punpcklbw m3, m7 - punpcklbw m4, m7 - -%ifdef m8 - mova m8, [myq+ 0] - mova m9, [myq+16] - mova m10, [myq+32] - mova m11, [myq+48] -%define COEFF05 m8 -%define COEFF14 m9 -%define COEFF2 m10 -%define COEFF3 m11 -%else -%define COEFF05 [myq+ 0] -%define COEFF14 [myq+16] -%define COEFF2 [myq+32] -%define COEFF3 [myq+48] -%endif -.nextrow: - mova m6, m1 - movh m5, [srcq+2*srcstrideq] ; read new row - paddw m6, m4 - punpcklbw m5, m7 - pmullw m6, COEFF14 - paddw m0, m5 - pmullw m0, COEFF05 - paddw m6, m0 - mova m0, m1 - paddw m6, [pw_32] - mova m1, m2 - pmullw m2, COEFF2 - paddw m6, m2 - mova m2, m3 - pmullw m3, COEFF3 - paddw m6, m3 - - ; round/clip/store - mova m3, m4 - psraw m6, 6 - mova m4, m5 - STORE m6, m5, %1 - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - REP_RET -%endmacro - -%macro FILTER_H 1 -cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, 
height, mx, picreg -%ifdef PIC - lea picregq, [sixtap_filter_v_m] -%endif - pxor m7, m7 - LOAD mx, sixtap_filter_v - mova m6, [pw_32] -%ifdef m8 - mova m8, [mxq+ 0] - mova m9, [mxq+16] - mova m10, [mxq+32] - mova m11, [mxq+48] -%define COEFF05 m8 -%define COEFF14 m9 -%define COEFF2 m10 -%define COEFF3 m11 -%else -%define COEFF05 [mxq+ 0] -%define COEFF14 [mxq+16] -%define COEFF2 [mxq+32] -%define COEFF3 [mxq+48] -%endif -.nextrow: - movq m0, [srcq-2] - movq m5, [srcq+3] - movq m1, [srcq-1] - movq m4, [srcq+2] - punpcklbw m0, m7 - punpcklbw m5, m7 - punpcklbw m1, m7 - punpcklbw m4, m7 - movq m2, [srcq-0] - movq m3, [srcq+1] - paddw m0, m5 - paddw m1, m4 - punpcklbw m2, m7 - punpcklbw m3, m7 - pmullw m0, COEFF05 - pmullw m1, COEFF14 - pmullw m2, COEFF2 - pmullw m3, COEFF3 - paddw m0, m6 - paddw m1, m2 - paddw m0, m3 - paddw m0, m1 - psraw m0, 6 - STORE m0, m1, %1 - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - REP_RET -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmx -FILTER_V put -FILTER_H put - -INIT_MMX mmxext -FILTER_V avg -FILTER_H avg - -INIT_MMX 3dnow -FILTER_V avg -FILTER_H avg -%endif - -INIT_XMM sse2 -FILTER_H put -FILTER_H avg -FILTER_V put -FILTER_V avg - -%macro FILTER_SSSE3 1 -cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg -%ifdef PIC - lea picregq, [sixtap_filter_hb_m] -%endif - - ; read 5 lines - sub srcq, srcstrideq - LOAD my, sixtap_filter_hb - sub srcq, srcstrideq - movh m0, [srcq] - movh m1, [srcq+srcstrideq] - movh m2, [srcq+srcstrideq*2] - lea srcq, [srcq+srcstrideq*2] - add srcq, srcstrideq - mova m5, [myq] - movh m3, [srcq] - movh m4, [srcq+srcstrideq] - lea srcq, [srcq+2*srcstrideq] - -.nextrow: - mova m6, m2 - punpcklbw m0, m1 - punpcklbw m6, m3 - pmaddubsw m0, m5 - pmaddubsw m6, [myq+16] - movh m7, [srcq] ; read new row - paddw m6, m0 - mova m0, m1 - mova m1, m2 - mova m2, m3 - mova m3, m4 - mova m4, m7 - punpcklbw m7, m3 - pmaddubsw m7, m5 - paddw m6, m7 - pmulhrsw m6, [pw_512] - STORE m6, m7, %1 - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - REP_RET - -cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg -%ifdef PIC - lea picregq, [sixtap_filter_hb_m] -%endif - mova m3, [filter_h6_shuf2] - mova m4, [filter_h6_shuf3] - LOAD mx, sixtap_filter_hb - mova m5, [mxq] ; set up 6tap filter in bytes - mova m6, [mxq+16] - mova m7, [filter_h6_shuf1] - -.nextrow: - movu m0, [srcq-2] - mova m1, m0 - mova m2, m0 - pshufb m0, m7 - pshufb m1, m3 - pshufb m2, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m6 - pmaddubsw m2, m5 - paddw m0, m1 - paddw m0, m2 - pmulhrsw m0, [pw_512] - STORE m0, m1, %1 - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - REP_RET -%endmacro - -INIT_XMM ssse3 -FILTER_SSSE3 put -FILTER_SSSE3 avg - -; %1=5bits weights?, %2=dst %3=src1 %4=src3 %5=stride if sse2 -%macro RV40_WCORE 4-5 - movh m4, [%3 + r6 + 0] - movh m5, [%4 + r6 + 0] -%if %0 == 4 -%define OFFSET r6 + mmsize / 2 -%else - ; 8x8 block and sse2, stride was provided -%define OFFSET r6 - add r6, r5 -%endif - movh m6, [%3 + OFFSET] - movh m7, [%4 + OFFSET] - -%if %1 == 0 - ; 14bits weights - punpcklbw m4, m0 - punpcklbw m5, m0 - punpcklbw m6, m0 - punpcklbw m7, m0 - - psllw m4, 7 - psllw m5, 7 - psllw m6, 7 - psllw m7, 7 - pmulhw m4, m3 - pmulhw m5, m2 - pmulhw m6, m3 - pmulhw m7, m2 - - paddw m4, m5 - paddw m6, m7 -%else - ; 5bits weights -%if 
cpuflag(ssse3) - punpcklbw m4, m5 - punpcklbw m6, m7 - - pmaddubsw m4, m3 - pmaddubsw m6, m3 -%else - punpcklbw m4, m0 - punpcklbw m5, m0 - punpcklbw m6, m0 - punpcklbw m7, m0 - - pmullw m4, m3 - pmullw m5, m2 - pmullw m6, m3 - pmullw m7, m2 - paddw m4, m5 - paddw m6, m7 -%endif - -%endif - - ; bias and shift down -%if cpuflag(ssse3) - pmulhrsw m4, m1 - pmulhrsw m6, m1 -%else - paddw m4, m1 - paddw m6, m1 - psrlw m4, 5 - psrlw m6, 5 -%endif - - packuswb m4, m6 -%if %0 == 5 - ; Only called for 8x8 blocks and sse2 - sub r6, r5 - movh [%2 + r6], m4 - add r6, r5 - movhps [%2 + r6], m4 -%else - mova [%2 + r6], m4 -%endif -%endmacro - - -%macro MAIN_LOOP 2 -%if mmsize == 8 - RV40_WCORE %2, r0, r1, r2 -%if %1 == 16 - RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8 -%endif - - ; Prepare for next loop - add r6, r5 -%else -%ifidn %1, 8 - RV40_WCORE %2, r0, r1, r2, r5 - ; Prepare 2 next lines - add r6, r5 -%else - RV40_WCORE %2, r0, r1, r2 - ; Prepare single next line - add r6, r5 -%endif -%endif - -%endmacro - -; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride) -; %1=size %2=num of xmm regs -; The weights are FP0.14 notation of fractions depending on pts. -; For timebases without rounding error (i.e. PAL), the fractions -; can be simplified, and several operations can be avoided. -; Therefore, we check here whether they are multiples of 2^9 for -; those simplifications to occur. -%macro RV40_WEIGHT 3 -cglobal rv40_weight_func_%1_%2, 6, 7, 8 -%if cpuflag(ssse3) - mova m1, [pw_1024] -%else - mova m1, [pw_16] -%endif - pxor m0, m0 - ; Set loop counter and increments - mov r6, r5 - shl r6, %3 - add r0, r6 - add r1, r6 - add r2, r6 - neg r6 - - movd m2, r3d - movd m3, r4d -%ifidn %1,rnd -%define RND 0 - SPLATW m2, m2 -%else -%define RND 1 -%if cpuflag(ssse3) - punpcklbw m3, m2 -%else - SPLATW m2, m2 -%endif -%endif - SPLATW m3, m3 - -.loop: - MAIN_LOOP %2, RND - jnz .loop - REP_RET -%endmacro - -INIT_MMX mmxext -RV40_WEIGHT rnd, 8, 3 -RV40_WEIGHT rnd, 16, 4 -RV40_WEIGHT nornd, 8, 3 -RV40_WEIGHT nornd, 16, 4 - -INIT_XMM sse2 -RV40_WEIGHT rnd, 8, 3 -RV40_WEIGHT rnd, 16, 4 -RV40_WEIGHT nornd, 8, 3 -RV40_WEIGHT nornd, 16, 4 - -INIT_XMM ssse3 -RV40_WEIGHT rnd, 8, 3 -RV40_WEIGHT rnd, 16, 4 -RV40_WEIGHT nornd, 8, 3 -RV40_WEIGHT nornd, 16, 4 diff --git a/ffmpeg1/libavcodec/x86/rv40dsp_init.c b/ffmpeg1/libavcodec/x86/rv40dsp_init.c deleted file mode 100644 index 2f97518..0000000 --- a/ffmpeg1/libavcodec/x86/rv40dsp_init.c +++ /dev/null @@ -1,243 +0,0 @@ -/* - * RV40 decoder motion compensation functions x86-optimised - * Copyright (c) 2008 Konstantin Shishkov - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * RV40 decoder motion compensation functions x86-optimised - * 2,0 and 0,2 have h264 equivalents. 
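The RV40_WEIGHT comment above notes that the two prediction weights arrive as FP0.14 fractions and that the cheaper 5-bit ("nornd") path is only valid when both are multiples of 1 << 9; the operation itself is a weighted mean of two predictions, roughly as follows (the exact rounding of the real kernels is not reproduced here):

    #include <stdint.h>

    /* Weighted bi-prediction with Q14 weights w1, w2 (w1 + w2 ~= 1 << 14). */
    static void rv40_weight_c(uint8_t *dst, const uint8_t *src1,
                              const uint8_t *src2, int w1, int w2,
                              int w, int h, int stride)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                int v = (src1[x] * w1 + src2[x] * w2 + (1 << 13)) >> 14;
                dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
            }
            dst += stride; src1 += stride; src2 += stride;
        }
    }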
- * 3,3 is bugged in the rv40 format and maps to _xy2 version - */ - -#include "libavcodec/rv34dsp.h" -#include "libavutil/attributes.h" -#include "libavutil/mem.h" -#include "libavutil/x86/cpu.h" -#include "dsputil_mmx.h" - -#if HAVE_YASM -void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); -void ff_avg_rv40_chroma_mc8_mmxext(uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); -void ff_avg_rv40_chroma_mc8_3dnow(uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); - -void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); -void ff_avg_rv40_chroma_mc4_mmxext(uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); -void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); - -#define DECLARE_WEIGHT(opt) \ -void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \ - int w1, int w2, ptrdiff_t stride); \ -void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \ - int w1, int w2, ptrdiff_t stride); \ -void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \ - int w1, int w2, ptrdiff_t stride); \ -void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \ - int w1, int w2, ptrdiff_t stride); -DECLARE_WEIGHT(mmxext) -DECLARE_WEIGHT(sse2) -DECLARE_WEIGHT(ssse3) - -/** @{ */ -/** - * Define one qpel function. - * LOOPSIZE must be already set to the number of pixels processed per - * iteration in the inner loop of the called functions. - * COFF(x) must be already defined so as to provide the offset into any - * array of coeffs used by the called function for the qpel position x. - */ -#define QPEL_FUNC_DECL(OP, SIZE, PH, PV, OPT) \ -static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst, \ - uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - int i; \ - if (PH && PV) { \ - DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)]; \ - uint8_t *tmpptr = tmp + SIZE * 2; \ - src -= stride * 2; \ - \ - for (i = 0; i < SIZE; i += LOOPSIZE) \ - ff_put_rv40_qpel_h ##OPT(tmp + i, SIZE, src + i, stride, \ - SIZE + 5, HCOFF(PH)); \ - for (i = 0; i < SIZE; i += LOOPSIZE) \ - ff_ ##OP ##rv40_qpel_v ##OPT(dst + i, stride, tmpptr + i, \ - SIZE, SIZE, VCOFF(PV)); \ - } else if (PV) { \ - for (i = 0; i < SIZE; i += LOOPSIZE) \ - ff_ ##OP ##rv40_qpel_v ## OPT(dst + i, stride, src + i, \ - stride, SIZE, VCOFF(PV)); \ - } else { \ - for (i = 0; i < SIZE; i += LOOPSIZE) \ - ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i, \ - stride, SIZE, HCOFF(PH)); \ - } \ -}; - -/** Declare functions for sizes 8 and 16 and given operations - * and qpel position. 
*/ -#define QPEL_FUNCS_DECL(OP, PH, PV, OPT) \ - QPEL_FUNC_DECL(OP, 8, PH, PV, OPT) \ - QPEL_FUNC_DECL(OP, 16, PH, PV, OPT) - -/** Declare all functions for all sizes and qpel positions */ -#define QPEL_MC_DECL(OP, OPT) \ -void ff_ ##OP ##rv40_qpel_h ##OPT(uint8_t *dst, ptrdiff_t dstStride, \ - const uint8_t *src, \ - ptrdiff_t srcStride, \ - int len, int m); \ -void ff_ ##OP ##rv40_qpel_v ##OPT(uint8_t *dst, ptrdiff_t dstStride, \ - const uint8_t *src, \ - ptrdiff_t srcStride, \ - int len, int m); \ -QPEL_FUNCS_DECL(OP, 0, 1, OPT) \ -QPEL_FUNCS_DECL(OP, 0, 3, OPT) \ -QPEL_FUNCS_DECL(OP, 1, 0, OPT) \ -QPEL_FUNCS_DECL(OP, 1, 1, OPT) \ -QPEL_FUNCS_DECL(OP, 1, 2, OPT) \ -QPEL_FUNCS_DECL(OP, 1, 3, OPT) \ -QPEL_FUNCS_DECL(OP, 2, 1, OPT) \ -QPEL_FUNCS_DECL(OP, 2, 2, OPT) \ -QPEL_FUNCS_DECL(OP, 2, 3, OPT) \ -QPEL_FUNCS_DECL(OP, 3, 0, OPT) \ -QPEL_FUNCS_DECL(OP, 3, 1, OPT) \ -QPEL_FUNCS_DECL(OP, 3, 2, OPT) -/** @} */ - -#define LOOPSIZE 8 -#define HCOFF(x) (32 * (x - 1)) -#define VCOFF(x) (32 * (x - 1)) -QPEL_MC_DECL(put_, _ssse3) -QPEL_MC_DECL(avg_, _ssse3) - -#undef LOOPSIZE -#undef HCOFF -#undef VCOFF -#define LOOPSIZE 8 -#define HCOFF(x) (64 * (x - 1)) -#define VCOFF(x) (64 * (x - 1)) -QPEL_MC_DECL(put_, _sse2) -QPEL_MC_DECL(avg_, _sse2) - -#if ARCH_X86_32 -#undef LOOPSIZE -#undef HCOFF -#undef VCOFF -#define LOOPSIZE 4 -#define HCOFF(x) (64 * (x - 1)) -#define VCOFF(x) (64 * (x - 1)) - -QPEL_MC_DECL(put_, _mmx) - -#define ff_put_rv40_qpel_h_mmxext ff_put_rv40_qpel_h_mmx -#define ff_put_rv40_qpel_v_mmxext ff_put_rv40_qpel_v_mmx -QPEL_MC_DECL(avg_, _mmxext) - -#define ff_put_rv40_qpel_h_3dnow ff_put_rv40_qpel_h_mmx -#define ff_put_rv40_qpel_v_3dnow ff_put_rv40_qpel_v_mmx -QPEL_MC_DECL(avg_, _3dnow) -#endif - -/** @{ */ -/** Set one function */ -#define QPEL_FUNC_SET(OP, SIZE, PH, PV, OPT) \ - c-> OP ## pixels_tab[2 - SIZE / 8][4 * PV + PH] = OP ## rv40_qpel ##SIZE ## _mc ##PH ##PV ##OPT; - -/** Set functions put and avg for sizes 8 and 16 and a given qpel position */ -#define QPEL_FUNCS_SET(OP, PH, PV, OPT) \ - QPEL_FUNC_SET(OP, 8, PH, PV, OPT) \ - QPEL_FUNC_SET(OP, 16, PH, PV, OPT) - -/** Set all functions for all sizes and qpel positions */ -#define QPEL_MC_SET(OP, OPT) \ -QPEL_FUNCS_SET (OP, 0, 1, OPT) \ -QPEL_FUNCS_SET (OP, 0, 3, OPT) \ -QPEL_FUNCS_SET (OP, 1, 0, OPT) \ -QPEL_FUNCS_SET (OP, 1, 1, OPT) \ -QPEL_FUNCS_SET (OP, 1, 2, OPT) \ -QPEL_FUNCS_SET (OP, 1, 3, OPT) \ -QPEL_FUNCS_SET (OP, 2, 1, OPT) \ -QPEL_FUNCS_SET (OP, 2, 2, OPT) \ -QPEL_FUNCS_SET (OP, 2, 3, OPT) \ -QPEL_FUNCS_SET (OP, 3, 0, OPT) \ -QPEL_FUNCS_SET (OP, 3, 1, OPT) \ -QPEL_FUNCS_SET (OP, 3, 2, OPT) -/** @} */ - -#endif /* HAVE_YASM */ - -av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c) -{ -#if HAVE_YASM - int mm_flags = av_get_cpu_flags(); - - if (EXTERNAL_MMX(mm_flags)) { - c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx; - c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx; -#if HAVE_MMX_INLINE - c->put_pixels_tab[0][15] = ff_put_rv40_qpel16_mc33_mmx; - c->put_pixels_tab[1][15] = ff_put_rv40_qpel8_mc33_mmx; - c->avg_pixels_tab[0][15] = ff_avg_rv40_qpel16_mc33_mmx; - c->avg_pixels_tab[1][15] = ff_avg_rv40_qpel8_mc33_mmx; -#endif /* HAVE_MMX_INLINE */ -#if ARCH_X86_32 - QPEL_MC_SET(put_, _mmx) -#endif - } - if (EXTERNAL_MMXEXT(mm_flags)) { - c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmxext; - c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmxext; - c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext; - c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmxext; - 
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmxext; - c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmxext; -#if ARCH_X86_32 - QPEL_MC_SET(avg_, _mmxext) -#endif - } else if (EXTERNAL_AMD3DNOW(mm_flags)) { - c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow; - c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow; -#if ARCH_X86_32 - QPEL_MC_SET(avg_, _3dnow) -#endif - } - if (EXTERNAL_SSE2(mm_flags)) { - c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2; - c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2; - c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2; - c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2; - QPEL_MC_SET(put_, _sse2) - QPEL_MC_SET(avg_, _sse2) - } - if (EXTERNAL_SSSE3(mm_flags)) { - c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3; - c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3; - c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3; - c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3; - QPEL_MC_SET(put_, _ssse3) - QPEL_MC_SET(avg_, _ssse3) - } -#endif /* HAVE_YASM */ -} diff --git a/ffmpeg1/libavcodec/x86/sbrdsp.asm b/ffmpeg1/libavcodec/x86/sbrdsp.asm deleted file mode 100644 index 1b7f3a8..0000000 --- a/ffmpeg1/libavcodec/x86/sbrdsp.asm +++ /dev/null @@ -1,222 +0,0 @@ -;****************************************************************************** -;* AAC Spectral Band Replication decoding functions -;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> -;* -;* This file is part of Libav. -;* -;* Libav is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
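Further up, QPEL_FUNC_DECL builds each 2-D subpel position from the 1-D kernels registered above: when both a horizontal and a vertical phase are needed, it filters horizontally into a temporary that is 5 rows taller than the block (the 6-tap vertical filter reads 2 rows above and 3 below), then filters that temporary vertically into dst. A sketch of that composition, ignoring the LOOPSIZE column chunking the real macro does (the function-pointer names are placeholders):

    #include <stdint.h>
    #include <stddef.h>

    #define BSIZE 16

    typedef void (*qpel_1d_fn)(uint8_t *dst, ptrdiff_t dst_stride,
                               const uint8_t *src, ptrdiff_t src_stride,
                               int len, int coeff_offset);

    static void rv40_qpel_hv(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
                             qpel_1d_fn filter_h, qpel_1d_fn filter_v,
                             int h_coeffs, int v_coeffs)
    {
        uint8_t tmp[BSIZE * (BSIZE + 5)];

        /* horizontal pass over BSIZE + 5 source rows, starting 2 rows up */
        filter_h(tmp, BSIZE, src - 2 * stride, stride, BSIZE + 5, h_coeffs);
        /* vertical pass reads the temporary starting at its third row */
        filter_v(dst, stride, tmp + 2 * BSIZE, BSIZE, BSIZE, v_coeffs);
    }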
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA -; mask equivalent for multiply by -1.0 1.0 -ps_mask times 2 dd 1<<31, 0 -ps_neg times 4 dd 1<<31 - -SECTION_TEXT - -INIT_XMM sse -cglobal sbr_sum_square, 2, 3, 6 - mov r2, r1 - xorps m0, m0 - xorps m1, m1 - sar r2, 3 - jz .prepare -.loop: - movu m2, [r0 + 0] - movu m3, [r0 + 16] - movu m4, [r0 + 32] - movu m5, [r0 + 48] - mulps m2, m2 - mulps m3, m3 - mulps m4, m4 - mulps m5, m5 - addps m0, m2 - addps m1, m3 - addps m0, m4 - addps m1, m5 - add r0, 64 - dec r2 - jnz .loop -.prepare: - and r1, 7 - sar r1, 1 - jz .end -; len is a multiple of 2, thus there are at least 4 elements to process -.endloop: - movu m2, [r0] - add r0, 16 - mulps m2, m2 - dec r1 - addps m0, m2 - jnz .endloop -.end: - addps m0, m1 - movhlps m2, m0 - addps m0, m2 - movss m1, m0 - shufps m0, m0, 1 - addss m0, m1 -%if ARCH_X86_64 == 0 - movss r0m, m0 - fld dword r0m -%endif - RET - -%define STEP 40*4*2 -cglobal sbr_hf_g_filt, 5, 6, 5 - lea r1, [r1 + 8*r4] ; offset by ixh elements into X_high - mov r5, r3 - and r3, 0xFC - lea r2, [r2 + r3*4] - lea r0, [r0 + r3*8] - neg r3 - jz .loop1 -.loop4: - movlps m0, [r2 + 4*r3 + 0] - movlps m1, [r2 + 4*r3 + 8] - movlps m2, [r1 + 0*STEP] - movlps m3, [r1 + 2*STEP] - movhps m2, [r1 + 1*STEP] - movhps m3, [r1 + 3*STEP] - unpcklps m0, m0 - unpcklps m1, m1 - mulps m0, m2 - mulps m1, m3 - movu [r0 + 8*r3 + 0], m0 - movu [r0 + 8*r3 + 16], m1 - add r1, 4*STEP - add r3, 4 - jnz .loop4 - and r5, 3 ; number of single element loops - jz .end -.loop1: ; element 0 and 1 can be computed at the same time - movss m0, [r2] - movlps m2, [r1] - unpcklps m0, m0 - mulps m2, m0 - movlps [r0], m2 - add r0, 8 - add r2, 4 - add r1, STEP - dec r5 - jnz .loop1 -.end: - RET - -; static void sbr_hf_gen_c(float (*X_high)[2], const float (*X_low)[2], -; const float alpha0[2], const float alpha1[2], -; float bw, int start, int end) -; -cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E - ; load alpha factors -%define bw m0 -%if ARCH_X86_64 == 0 || WIN64 - movss bw, BWm -%endif - movlps m2, [alpha1q] - movlps m1, [alpha0q] - shufps bw, bw, 0 - mulps m2, bw ; (a1[0] a1[1])*bw - mulps m1, bw ; (a0[0] a0[1])*bw = (a2 a3) - mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1) - mova m3, m1 - mova m4, m2 - - ; Set pointers -%if ARCH_X86_64 == 0 || WIN64 - ; start and end 6th and 7th args on stack - mov r2d, Sm - mov r3d, Em -%define start r2q -%define end r3q -%else -; BW does not actually occupy a register, so shift by 1 -%define start BWq -%define end Sq -%endif - sub start, end ; neg num of loops - lea X_highq, [X_highq + end*2*4] - lea X_lowq, [X_lowq + end*2*4 - 2*2*4] - shl start, 3 ; offset from num loops - - mova m0, [X_lowq + start] - shufps m3, m3, q1111 - shufps m4, m4, q1111 - xorps m3, [ps_mask] - shufps m1, m1, q0000 - shufps m2, m2, q0000 - xorps m4, [ps_mask] -.loop2: - movu m7, [X_lowq + start + 8] ; BbCc - mova m6, m0 - mova m5, m7 - shufps m0, m0, q2301 ; aAbB - shufps m7, m7, q2301 ; bBcC - mulps m0, m4 - mulps m7, m3 - mulps m6, m2 - mulps m5, m1 - addps m7, m0 - mova m0, [X_lowq + start +16] ; CcDd - addps m7, m0 - addps m6, m5 - addps m7, m6 - mova [X_highq + start], m7 - add start, 16 - jnz .loop2 - RET - -cglobal sbr_sum64x5, 1,2,4,z - 
lea r1q, [zq+ 256] -.loop: - mova m0, [zq+ 0] - mova m2, [zq+ 16] - mova m1, [zq+ 256] - mova m3, [zq+ 272] - addps m0, [zq+ 512] - addps m2, [zq+ 528] - addps m1, [zq+ 768] - addps m3, [zq+ 784] - addps m0, [zq+1024] - addps m2, [zq+1040] - addps m0, m1 - addps m2, m3 - mova [zq], m0 - mova [zq+16], m2 - add zq, 32 - cmp zq, r1q - jne .loop - REP_RET - -INIT_XMM sse -cglobal sbr_qmf_post_shuffle, 2,3,4,W,z - lea r2q, [zq + (64-4)*4] - mova m3, [ps_neg] -.loop: - mova m1, [zq] - xorps m0, m3, [r2q] - shufps m0, m0, m0, q0123 - unpcklps m2, m0, m1 - unpckhps m0, m0, m1 - mova [Wq + 0], m2 - mova [Wq + 16], m0 - add Wq, 32 - sub r2q, 16 - add zq, 16 - cmp zq, r2q - jl .loop - REP_RET diff --git a/ffmpeg1/libavcodec/x86/sbrdsp_init.c b/ffmpeg1/libavcodec/x86/sbrdsp_init.c deleted file mode 100644 index 27fade1..0000000 --- a/ffmpeg1/libavcodec/x86/sbrdsp_init.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * AAC Spectral Band Replication decoding functions - * Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/sbrdsp.h" - -float ff_sbr_sum_square_sse(float (*x)[2], int n); -void ff_sbr_sum64x5_sse(float *z); -void ff_sbr_hf_g_filt_sse(float (*Y)[2], const float (*X_high)[40][2], - const float *g_filt, int m_max, intptr_t ixh); -void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2], - const float alpha0[2], const float alpha1[2], - float bw, int start, int end); -void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z); - -av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s) -{ - int mm_flags = av_get_cpu_flags(); - - if (EXTERNAL_SSE(mm_flags)) { - s->sum_square = ff_sbr_sum_square_sse; - s->sum64x5 = ff_sbr_sum64x5_sse; - s->hf_g_filt = ff_sbr_hf_g_filt_sse; - s->hf_gen = ff_sbr_hf_gen_sse; - s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse; - } -} diff --git a/ffmpeg1/libavcodec/x86/simple_idct.c b/ffmpeg1/libavcodec/x86/simple_idct.c deleted file mode 100644 index f27d2b9..0000000 --- a/ffmpeg1/libavcodec/x86/simple_idct.c +++ /dev/null @@ -1,1167 +0,0 @@ -/* - * Simple IDCT MMX - * - * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
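ff_sbr_sum_square_sse above is an unrolled reduction: it squares 16 floats per iteration into two accumulators, mops up the remainder four at a time, and finishes with a horizontal add. The scalar operation it replaces is simply the summed squared magnitude of n complex samples:

    /* Sum of re^2 + im^2 over n complex (re, im) pairs. */
    static float sbr_sum_square_c(float (*x)[2], int n)
    {
        float sum = 0.0f;

        for (int i = 0; i < n; i++)
            sum += x[i][0] * x[i][0] + x[i][1] * x[i][1];
        return sum;
    }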
See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "libavcodec/simple_idct.h" -#include "libavutil/mem.h" -#include "dsputil_mmx.h" - -#if HAVE_INLINE_ASM - -/* -23170.475006 -22725.260826 -21406.727617 -19265.545870 -16384.000000 -12872.826198 -8866.956905 -4520.335430 -*/ -#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 -#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - -#define ROW_SHIFT 11 -#define COL_SHIFT 20 // 6 - -DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL; -DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL; - -DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= { - 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, -// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0, -// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16), - 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, - // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :) -// 0, 0, 0, 0, -// 0, 0, 0, 0, - - C4, C4, C4, C4, - C4, -C4, C4, -C4, - - C2, C6, C2, C6, - C6, -C2, C6, -C2, - - C1, C3, C1, C3, - C5, C7, C5, C7, - - C3, -C7, C3, -C7, --C1, -C5, -C1, -C5, - - C5, -C1, C5, -C1, - C7, C3, C7, C3, - - C7, -C5, C7, -C5, - C3, -C1, C3, -C1 -}; - -static inline void idct(int16_t *block) -{ - DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; - int16_t * const temp= (int16_t*)align_tmp; - - __asm__ volatile( -#if 0 //Alternative, simpler variant - -#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - #rounder ", %%mm4 \n\t"\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ - "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - #rounder ", %%mm0 \n\t"\ - "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ - "paddd %%mm0, %%mm0 \n\t" \ - "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ - "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ - "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - 
"psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm1 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ - "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ - "movq %%mm7, " #dst " \n\t"\ - "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "movq %%mm2, 24+" #dst " \n\t"\ - "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ - "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ - "movq %%mm2, 8+" #dst " \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ - "movq %%mm4, 16+" #dst " \n\t"\ - -#define COL_IDCT(src0, src4, src1, src5, dst, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ - "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ - "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ - "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ - "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm0 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "movd %%mm7, " #dst " \n\t"\ - "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "movd %%mm0, 16+" #dst " \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "movd %%mm2, 96+" 
#dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "movd %%mm4, 112+" #dst " \n\t"\ - "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ - "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm5 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "movd %%mm2, 32+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ - "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ - "movd %%mm6, 48+" #dst " \n\t"\ - "movd %%mm4, 64+" #dst " \n\t"\ - "movd %%mm5, 80+" #dst " \n\t"\ - - -#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq "MANGLE(wm1010)", %%mm4 \n\t"\ - "pand %%mm0, %%mm4 \n\t"\ - "por %%mm1, %%mm4 \n\t"\ - "por %%mm2, %%mm4 \n\t"\ - "por %%mm3, %%mm4 \n\t"\ - "packssdw %%mm4,%%mm4 \n\t"\ - "movd %%mm4, %%eax \n\t"\ - "orl %%eax, %%eax \n\t"\ - "jz 1f \n\t"\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - #rounder ", %%mm4 \n\t"\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ - "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - #rounder ", %%mm0 \n\t"\ - "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ - "paddd %%mm0, %%mm0 \n\t" \ - "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ - "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ - "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm1 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ - "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 
*/\ - "movq %%mm7, " #dst " \n\t"\ - "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "movq %%mm2, 24+" #dst " \n\t"\ - "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ - "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ - "movq %%mm2, 8+" #dst " \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ - "movq %%mm4, 16+" #dst " \n\t"\ - "jmp 2f \n\t"\ - "1: \n\t"\ - "pslld $16, %%mm0 \n\t"\ - "#paddd "MANGLE(d40000)", %%mm0 \n\t"\ - "psrad $13, %%mm0 \n\t"\ - "packssdw %%mm0, %%mm0 \n\t"\ - "movq %%mm0, " #dst " \n\t"\ - "movq %%mm0, 8+" #dst " \n\t"\ - "movq %%mm0, 16+" #dst " \n\t"\ - "movq %%mm0, 24+" #dst " \n\t"\ - "2: \n\t" - - -//IDCT( src0, src4, src1, src5, dst, rounder, shift) -ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) -/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11) -ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11) -ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/ - -DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11) -DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11) -DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11) - - -//IDCT( src0, src4, src1, src5, dst, shift) -COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) -COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) -COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) -COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) - -#else - -#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq "MANGLE(wm1010)", %%mm4 \n\t"\ - "pand %%mm0, %%mm4 \n\t"\ - "por %%mm1, %%mm4 \n\t"\ - "por %%mm2, %%mm4 \n\t"\ - "por %%mm3, %%mm4 \n\t"\ - "packssdw %%mm4,%%mm4 \n\t"\ - "movd %%mm4, %%eax \n\t"\ - "orl %%eax, %%eax \n\t"\ - "jz 1f \n\t"\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - #rounder ", %%mm4 \n\t"\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ - "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* 
C7R7+C5R5 C7r7+C5r5 */\ - #rounder ", %%mm0 \n\t"\ - "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ - "paddd %%mm0, %%mm0 \n\t" \ - "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ - "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ - "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm1 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ - "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ - "movq %%mm7, " #dst " \n\t"\ - "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "movq %%mm2, 24+" #dst " \n\t"\ - "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ - "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ - "movq %%mm2, 8+" #dst " \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ - "movq %%mm4, 16+" #dst " \n\t"\ - "jmp 2f \n\t"\ - "1: \n\t"\ - "pslld $16, %%mm0 \n\t"\ - "paddd "MANGLE(d40000)", %%mm0 \n\t"\ - "psrad $13, %%mm0 \n\t"\ - "packssdw %%mm0, %%mm0 \n\t"\ - "movq %%mm0, " #dst " \n\t"\ - "movq %%mm0, 8+" #dst " \n\t"\ - "movq %%mm0, 16+" #dst " \n\t"\ - "movq %%mm0, 24+" #dst " \n\t"\ - "2: \n\t" - -#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq %%mm0, %%mm4 \n\t"\ - "por %%mm1, %%mm4 \n\t"\ - "por %%mm2, %%mm4 \n\t"\ - "por %%mm3, %%mm4 \n\t"\ - "packssdw %%mm4,%%mm4 \n\t"\ - "movd %%mm4, %%eax \n\t"\ - "orl %%eax, %%eax \n\t"\ - "jz " #bt " \n\t"\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - #rounder ", %%mm4 \n\t"\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 
a3 */\ - "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - #rounder ", %%mm0 \n\t"\ - "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ - "paddd %%mm0, %%mm0 \n\t" \ - "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ - "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ - "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm1 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ - "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ - "movq %%mm7, " #dst " \n\t"\ - "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "movq %%mm2, 24+" #dst " \n\t"\ - "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ - "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ - "movq %%mm2, 8+" #dst " \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ - "movq %%mm4, 16+" #dst " \n\t"\ - -#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - #rounder ", %%mm4 \n\t"\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ - "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - #rounder ", %%mm0 \n\t"\ - "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ - "paddd %%mm0, %%mm0 \n\t" \ - "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ - "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ - "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* 
-C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm1 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ - "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ - "movq %%mm7, " #dst " \n\t"\ - "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "movq %%mm2, 24+" #dst " \n\t"\ - "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ - "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ - "movq %%mm2, 8+" #dst " \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ - "movq %%mm4, 16+" #dst " \n\t"\ - -//IDCT( src0, src4, src1, src5, dst, rounder, shift) -DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) -Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f) -Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f) -Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) - -#undef IDCT -#define IDCT(src0, src4, src1, src5, dst, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ - "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ - "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ - "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ - "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - 
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm0 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "movd %%mm7, " #dst " \n\t"\ - "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "movd %%mm0, 16+" #dst " \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "movd %%mm2, 96+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "movd %%mm4, 112+" #dst " \n\t"\ - "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ - "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm5 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "movd %%mm2, 32+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ - "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ - "movd %%mm6, 48+" #dst " \n\t"\ - "movd %%mm4, 64+" #dst " \n\t"\ - "movd %%mm5, 80+" #dst " \n\t" - - -//IDCT( src0, src4, src1, src5, dst, shift) -IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) -IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) -IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) - "jmp 9f \n\t" - - "# .p2align 4 \n\t"\ - "4: \n\t" -Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f) -Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) - -#undef IDCT -#define IDCT(src0, src4, src1, src5, dst, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ - "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ - "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ - "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" 
/* 2A0 2a0 */\ - "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "psrad $" #shift ", %%mm1 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm0 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ - "movd %%mm1, " #dst " \n\t"\ - "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "movd %%mm0, 16+" #dst " \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "movd %%mm2, 96+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "movd %%mm4, 112+" #dst " \n\t"\ - "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm5 \n\t"\ - "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm1 \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "movd %%mm2, 32+" #dst " \n\t"\ - "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ - "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ - "movd %%mm6, 48+" #dst " \n\t"\ - "movd %%mm1, 64+" #dst " \n\t"\ - "movd %%mm5, 80+" #dst " \n\t" - -//IDCT( src0, src4, src1, src5, dst, shift) -IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) -IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) -IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) - "jmp 9f \n\t" - - "# .p2align 4 \n\t"\ - "6: \n\t" -Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f) - -#undef IDCT -#define IDCT(src0, src4, src1, src5, dst, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "psrad $" #shift ", %%mm1 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm0 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ - "movd %%mm1, " #dst " \n\t"\ - "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "movd %%mm0, 16+" #dst " \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "movd %%mm2, 96+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "movd %%mm4, 112+" #dst " \n\t"\ - "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 
104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm5 \n\t"\ - "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm1 \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "movd %%mm2, 32+" #dst " \n\t"\ - "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ - "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ - "movd %%mm6, 48+" #dst " \n\t"\ - "movd %%mm1, 64+" #dst " \n\t"\ - "movd %%mm5, 80+" #dst " \n\t" - - -//IDCT( src0, src4, src1, src5, dst, shift) -IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) -IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) -IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) - "jmp 9f \n\t" - - "# .p2align 4 \n\t"\ - "2: \n\t" -Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f) - -#undef IDCT -#define IDCT(src0, src4, src1, src5, dst, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ - "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm0 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "movd %%mm7, " #dst " \n\t"\ - "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "movd %%mm0, 16+" #dst " \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "movd %%mm2, 96+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "movd %%mm4, 112+" #dst " \n\t"\ - "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ - "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm5 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd 
%%mm0, %%mm3 \n\t" /* B3 b3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "movd %%mm2, 32+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ - "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ - "movd %%mm6, 48+" #dst " \n\t"\ - "movd %%mm4, 64+" #dst " \n\t"\ - "movd %%mm5, 80+" #dst " \n\t" - -//IDCT( src0, src4, src1, src5, dst, shift) -IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) -IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) -IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) - "jmp 9f \n\t" - - "# .p2align 4 \n\t"\ - "3: \n\t" -#undef IDCT -#define IDCT(src0, src4, src1, src5, dst, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 64(%2), %%mm3 \n\t"\ - "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\ - "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm0 \n\t"\ - "psrad $" #shift ", %%mm1 \n\t"\ - "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "movd %%mm7, " #dst " \n\t"\ - "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "movd %%mm0, 16+" #dst " \n\t"\ - "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\ - "movd %%mm1, 96+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "movd %%mm4, 112+" #dst " \n\t"\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\ - "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm1 \n\t"\ - "psrad $" #shift ", %%mm5 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\ - "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "movd %%mm1, 32+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ - "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ - "movd %%mm6, 48+" #dst " \n\t"\ - "movd %%mm4, 64+" #dst " \n\t"\ - "movd %%mm5, 80+" #dst " \n\t" - - -//IDCT( src0, src4, src1, src5, dst, shift) -IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) -IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) -IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) - "jmp 9f \n\t" - - "# .p2align 4 \n\t"\ - "5: \n\t" -#undef IDCT -#define IDCT(src0, src4, src1, src5, dst, shift) \ - "movq " #src0 ", 
%%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ - "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ - "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ - "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ - "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\ - "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\ - "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\ - "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\ - "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\ - "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\ - "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\ - "psrad $" #shift ", %%mm4 \n\t"\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm3 \n\t"\ - "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\ - "movq %%mm4, " #dst " \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\ - "movq %%mm0, 16+" #dst " \n\t"\ - "movq %%mm0, 96+" #dst " \n\t"\ - "movq %%mm4, 112+" #dst " \n\t"\ - "psrad $" #shift ", %%mm5 \n\t"\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\ - "movq %%mm5, 32+" #dst " \n\t"\ - "psrad $" #shift ", %%mm1 \n\t"\ - "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "movq %%mm6, 48+" #dst " \n\t"\ - "movq %%mm6, 64+" #dst " \n\t"\ - "movq %%mm5, 80+" #dst " \n\t" - - -//IDCT( src0, src4, src1, src5, dst, shift) -IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) -//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) -//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) - "jmp 9f \n\t" - - - "# .p2align 4 \n\t"\ - "1: \n\t" -#undef IDCT -#define IDCT(src0, src4, src1, src5, dst, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ - "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ - "psubd %%mm1, %%mm5 
\n\t" /* A2 a2 */\ - "movq 64(%2), %%mm1 \n\t"\ - "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\ - "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm0 \n\t"\ - "psrad $" #shift ", %%mm3 \n\t"\ - "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "movd %%mm7, " #dst " \n\t"\ - "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "movd %%mm0, 16+" #dst " \n\t"\ - "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\ - "movd %%mm3, 96+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "movd %%mm4, 112+" #dst " \n\t"\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\ - "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm3 \n\t"\ - "psrad $" #shift ", %%mm5 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\ - "movd %%mm3, 32+" #dst " \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "movd %%mm6, 48+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ - "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ - "movd %%mm4, 64+" #dst " \n\t"\ - "movd %%mm5, 80+" #dst " \n\t" - - -//IDCT( src0, src4, src1, src5, dst, shift) -IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) -IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) -IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) - "jmp 9f \n\t" - - - "# .p2align 4 \n\t" - "7: \n\t" -#undef IDCT -#define IDCT(src0, src4, src1, src5, dst, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "psrad $" #shift ", %%mm4 \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ - "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ - "psrad $" #shift ", %%mm1 \n\t"\ - "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\ - "movq %%mm4, " #dst " \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\ - "movq %%mm0, 16+" #dst " \n\t"\ - "movq %%mm0, 96+" #dst " \n\t"\ - "movq %%mm4, 112+" #dst " \n\t"\ - "movq %%mm0, 32+" #dst " \n\t"\ - "movq %%mm4, 48+" #dst " \n\t"\ - "movq %%mm4, 64+" #dst " \n\t"\ - "movq %%mm0, 80+" #dst " \n\t" - -//IDCT( src0, src4, src1, src5, dst, shift) -IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) -//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) -//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) - - -#endif - -/* -Input - 00 40 04 44 20 60 24 64 - 10 30 14 34 50 70 54 74 - 01 41 03 43 21 61 23 
63 - 11 31 13 33 51 71 53 73 - 02 42 06 46 22 62 26 66 - 12 32 16 36 52 72 56 76 - 05 45 07 47 25 65 27 67 - 15 35 17 37 55 75 57 77 - -Temp - 00 04 10 14 20 24 30 34 - 40 44 50 54 60 64 70 74 - 01 03 11 13 21 23 31 33 - 41 43 51 53 61 63 71 73 - 02 06 12 16 22 26 32 36 - 42 46 52 56 62 66 72 76 - 05 07 15 17 25 27 35 37 - 45 47 55 57 65 67 75 77 -*/ - -"9: \n\t" - :: "r" (block), "r" (temp), "r" (coeffs) - : "%eax" - ); -} - -void ff_simple_idct_mmx(int16_t *block) -{ - idct(block); -} - -//FIXME merge add/put into the idct - -void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block) -{ - idct(block); - ff_put_pixels_clamped_mmx(block, dest, line_size); -} -void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block) -{ - idct(block); - ff_add_pixels_clamped_mmx(block, dest, line_size); -} - -#endif /* HAVE_INLINE_ASM */ diff --git a/ffmpeg1/libavcodec/x86/snowdsp.c b/ffmpeg1/libavcodec/x86/snowdsp.c deleted file mode 100644 index 5505ee8..0000000 --- a/ffmpeg1/libavcodec/x86/snowdsp.c +++ /dev/null @@ -1,902 +0,0 @@ -/* - * MMX and SSE2 optimized snow DSP utils - * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavcodec/avcodec.h" -#include "libavcodec/snow.h" -#include "libavcodec/snow_dwt.h" -#include "dsputil_mmx.h" - -#if HAVE_INLINE_ASM - -static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){ - const int w2= (width+1)>>1; - const int w_l= (width>>1); - const int w_r= w2 - 1; - int i; - - { // Lift 0 - IDWTELEM * const ref = b + w2 - 1; - IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice - // (the first time erroneously), we allow the SSE2 code to run an extra pass. - // The savings in code and time are well worth having to store this value and - // calculate b[0] correctly afterwards. 
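For reference, the SSE2 loop that follows (like the MMX variant and the scalar *_lead_out tail handling) vectorizes the first 9/7 integer lifting step, sixteen samples per iteration. A minimal scalar sketch of that update is given below; it assumes the IDWTELEM type and the W_DM/W_DO/W_DS lifting constants from the already-included libavcodec/snow_dwt.h, the helper name is illustrative only, and it ignores the b[0] boundary case that the function handles separately after the loop.

/* Scalar form of "Lift 0": each low-band sample b[i] is corrected by its two
 * neighbouring high-band samples ref[i] and ref[i+1], matching the formula
 * used by the scalar lead-in/lead-out code elsewhere in this file. */
static void snow_lift0_scalar(IDWTELEM *b, const IDWTELEM *ref, int w_l)
{
    int i;
    for (i = 0; i < w_l; i++)
        b[i] -= (W_DM * (ref[i] + ref[i + 1]) + W_DO) >> W_DS;
}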
- - i = 0; - __asm__ volatile( - "pcmpeqd %%xmm7, %%xmm7 \n\t" - "pcmpeqd %%xmm3, %%xmm3 \n\t" - "psllw $1, %%xmm3 \n\t" - "paddw %%xmm7, %%xmm3 \n\t" - "psllw $13, %%xmm3 \n\t" - ::); - for(; i<w_l-15; i+=16){ - __asm__ volatile( - "movdqu (%1), %%xmm1 \n\t" - "movdqu 16(%1), %%xmm5 \n\t" - "movdqu 2(%1), %%xmm2 \n\t" - "movdqu 18(%1), %%xmm6 \n\t" - "paddw %%xmm1, %%xmm2 \n\t" - "paddw %%xmm5, %%xmm6 \n\t" - "paddw %%xmm7, %%xmm2 \n\t" - "paddw %%xmm7, %%xmm6 \n\t" - "pmulhw %%xmm3, %%xmm2 \n\t" - "pmulhw %%xmm3, %%xmm6 \n\t" - "paddw (%0), %%xmm2 \n\t" - "paddw 16(%0), %%xmm6 \n\t" - "movdqa %%xmm2, (%0) \n\t" - "movdqa %%xmm6, 16(%0) \n\t" - :: "r"(&b[i]), "r"(&ref[i]) - : "memory" - ); - } - snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); - b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); - } - - { // Lift 1 - IDWTELEM * const dst = b+w2; - - i = 0; - for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){ - dst[i] = dst[i] - (b[i] + b[i + 1]); - } - for(; i<w_r-15; i+=16){ - __asm__ volatile( - "movdqu (%1), %%xmm1 \n\t" - "movdqu 16(%1), %%xmm5 \n\t" - "movdqu 2(%1), %%xmm2 \n\t" - "movdqu 18(%1), %%xmm6 \n\t" - "paddw %%xmm1, %%xmm2 \n\t" - "paddw %%xmm5, %%xmm6 \n\t" - "movdqa (%0), %%xmm0 \n\t" - "movdqa 16(%0), %%xmm4 \n\t" - "psubw %%xmm2, %%xmm0 \n\t" - "psubw %%xmm6, %%xmm4 \n\t" - "movdqa %%xmm0, (%0) \n\t" - "movdqa %%xmm4, 16(%0) \n\t" - :: "r"(&dst[i]), "r"(&b[i]) - : "memory" - ); - } - snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); - } - - { // Lift 2 - IDWTELEM * const ref = b+w2 - 1; - IDWTELEM b_0 = b[0]; - - i = 0; - __asm__ volatile( - "psllw $15, %%xmm7 \n\t" - "pcmpeqw %%xmm6, %%xmm6 \n\t" - "psrlw $13, %%xmm6 \n\t" - "paddw %%xmm7, %%xmm6 \n\t" - ::); - for(; i<w_l-15; i+=16){ - __asm__ volatile( - "movdqu (%1), %%xmm0 \n\t" - "movdqu 16(%1), %%xmm4 \n\t" - "movdqu 2(%1), %%xmm1 \n\t" - "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts - "paddw %%xmm6, %%xmm0 \n\t" - "paddw %%xmm6, %%xmm4 \n\t" - "paddw %%xmm7, %%xmm1 \n\t" - "paddw %%xmm7, %%xmm5 \n\t" - "pavgw %%xmm1, %%xmm0 \n\t" - "pavgw %%xmm5, %%xmm4 \n\t" - "psubw %%xmm7, %%xmm0 \n\t" - "psubw %%xmm7, %%xmm4 \n\t" - "psraw $1, %%xmm0 \n\t" - "psraw $1, %%xmm4 \n\t" - "movdqa (%0), %%xmm1 \n\t" - "movdqa 16(%0), %%xmm5 \n\t" - "paddw %%xmm1, %%xmm0 \n\t" - "paddw %%xmm5, %%xmm4 \n\t" - "psraw $2, %%xmm0 \n\t" - "psraw $2, %%xmm4 \n\t" - "paddw %%xmm1, %%xmm0 \n\t" - "paddw %%xmm5, %%xmm4 \n\t" - "movdqa %%xmm0, (%0) \n\t" - "movdqa %%xmm4, 16(%0) \n\t" - :: "r"(&b[i]), "r"(&ref[i]) - : "memory" - ); - } - snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); - b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS); - } - - { // Lift 3 - IDWTELEM * const src = b+w2; - - i = 0; - for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){ - temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS); - } - for(; i<w_r-7; i+=8){ - __asm__ volatile( - "movdqu 2(%1), %%xmm2 \n\t" - "movdqu 18(%1), %%xmm6 \n\t" - "paddw (%1), %%xmm2 \n\t" - "paddw 16(%1), %%xmm6 \n\t" - "movdqu (%0), %%xmm0 \n\t" - "movdqu 16(%0), %%xmm4 \n\t" - "paddw %%xmm2, %%xmm0 \n\t" - "paddw %%xmm6, %%xmm4 \n\t" - "psraw $1, %%xmm2 \n\t" - "psraw $1, %%xmm6 \n\t" - "paddw %%xmm0, %%xmm2 \n\t" - "paddw %%xmm4, %%xmm6 \n\t" - "movdqa %%xmm2, (%2) \n\t" - "movdqa %%xmm6, 16(%2) \n\t" - :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) - : "memory" - ); - } - snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); - } - - { - 
snow_interleave_line_header(&i, width, b, temp); - - for (; (i & 0x3E) != 0x3E; i-=2){ - b[i+1] = temp[i>>1]; - b[i] = b[i>>1]; - } - for (i-=62; i>=0; i-=64){ - __asm__ volatile( - "movdqa (%1), %%xmm0 \n\t" - "movdqa 16(%1), %%xmm2 \n\t" - "movdqa 32(%1), %%xmm4 \n\t" - "movdqa 48(%1), %%xmm6 \n\t" - "movdqa (%1), %%xmm1 \n\t" - "movdqa 16(%1), %%xmm3 \n\t" - "movdqa 32(%1), %%xmm5 \n\t" - "movdqa 48(%1), %%xmm7 \n\t" - "punpcklwd (%2), %%xmm0 \n\t" - "punpcklwd 16(%2), %%xmm2 \n\t" - "punpcklwd 32(%2), %%xmm4 \n\t" - "punpcklwd 48(%2), %%xmm6 \n\t" - "movdqa %%xmm0, (%0) \n\t" - "movdqa %%xmm2, 32(%0) \n\t" - "movdqa %%xmm4, 64(%0) \n\t" - "movdqa %%xmm6, 96(%0) \n\t" - "punpckhwd (%2), %%xmm1 \n\t" - "punpckhwd 16(%2), %%xmm3 \n\t" - "punpckhwd 32(%2), %%xmm5 \n\t" - "punpckhwd 48(%2), %%xmm7 \n\t" - "movdqa %%xmm1, 16(%0) \n\t" - "movdqa %%xmm3, 48(%0) \n\t" - "movdqa %%xmm5, 80(%0) \n\t" - "movdqa %%xmm7, 112(%0) \n\t" - :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1]) - : "memory" - ); - } - } -} - -static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){ - const int w2= (width+1)>>1; - const int w_l= (width>>1); - const int w_r= w2 - 1; - int i; - - { // Lift 0 - IDWTELEM * const ref = b + w2 - 1; - - i = 1; - b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); - __asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" - "pcmpeqw %%mm3, %%mm3 \n\t" - "psllw $1, %%mm3 \n\t" - "paddw %%mm7, %%mm3 \n\t" - "psllw $13, %%mm3 \n\t" - ::); - for(; i<w_l-7; i+=8){ - __asm__ volatile( - "movq (%1), %%mm2 \n\t" - "movq 8(%1), %%mm6 \n\t" - "paddw 2(%1), %%mm2 \n\t" - "paddw 10(%1), %%mm6 \n\t" - "paddw %%mm7, %%mm2 \n\t" - "paddw %%mm7, %%mm6 \n\t" - "pmulhw %%mm3, %%mm2 \n\t" - "pmulhw %%mm3, %%mm6 \n\t" - "paddw (%0), %%mm2 \n\t" - "paddw 8(%0), %%mm6 \n\t" - "movq %%mm2, (%0) \n\t" - "movq %%mm6, 8(%0) \n\t" - :: "r"(&b[i]), "r"(&ref[i]) - : "memory" - ); - } - snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); - } - - { // Lift 1 - IDWTELEM * const dst = b+w2; - - i = 0; - for(; i<w_r-7; i+=8){ - __asm__ volatile( - "movq (%1), %%mm2 \n\t" - "movq 8(%1), %%mm6 \n\t" - "paddw 2(%1), %%mm2 \n\t" - "paddw 10(%1), %%mm6 \n\t" - "movq (%0), %%mm0 \n\t" - "movq 8(%0), %%mm4 \n\t" - "psubw %%mm2, %%mm0 \n\t" - "psubw %%mm6, %%mm4 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm4, 8(%0) \n\t" - :: "r"(&dst[i]), "r"(&b[i]) - : "memory" - ); - } - snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); - } - - { // Lift 2 - IDWTELEM * const ref = b+w2 - 1; - - i = 1; - b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS); - __asm__ volatile( - "psllw $15, %%mm7 \n\t" - "pcmpeqw %%mm6, %%mm6 \n\t" - "psrlw $13, %%mm6 \n\t" - "paddw %%mm7, %%mm6 \n\t" - ::); - for(; i<w_l-7; i+=8){ - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm4 \n\t" - "movq 2(%1), %%mm1 \n\t" - "movq 10(%1), %%mm5 \n\t" - "paddw %%mm6, %%mm0 \n\t" - "paddw %%mm6, %%mm4 \n\t" - "paddw %%mm7, %%mm1 \n\t" - "paddw %%mm7, %%mm5 \n\t" - "pavgw %%mm1, %%mm0 \n\t" - "pavgw %%mm5, %%mm4 \n\t" - "psubw %%mm7, %%mm0 \n\t" - "psubw %%mm7, %%mm4 \n\t" - "psraw $1, %%mm0 \n\t" - "psraw $1, %%mm4 \n\t" - "movq (%0), %%mm1 \n\t" - "movq 8(%0), %%mm5 \n\t" - "paddw %%mm1, %%mm0 \n\t" - "paddw %%mm5, %%mm4 \n\t" - "psraw $2, %%mm0 \n\t" - "psraw $2, %%mm4 \n\t" - "paddw %%mm1, %%mm0 \n\t" - "paddw %%mm5, %%mm4 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm4, 8(%0) \n\t" - :: "r"(&b[i]), "r"(&ref[i]) - : "memory" - ); - } - 
snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); - } - - { // Lift 3 - IDWTELEM * const src = b+w2; - i = 0; - - for(; i<w_r-7; i+=8){ - __asm__ volatile( - "movq 2(%1), %%mm2 \n\t" - "movq 10(%1), %%mm6 \n\t" - "paddw (%1), %%mm2 \n\t" - "paddw 8(%1), %%mm6 \n\t" - "movq (%0), %%mm0 \n\t" - "movq 8(%0), %%mm4 \n\t" - "paddw %%mm2, %%mm0 \n\t" - "paddw %%mm6, %%mm4 \n\t" - "psraw $1, %%mm2 \n\t" - "psraw $1, %%mm6 \n\t" - "paddw %%mm0, %%mm2 \n\t" - "paddw %%mm4, %%mm6 \n\t" - "movq %%mm2, (%2) \n\t" - "movq %%mm6, 8(%2) \n\t" - :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) - : "memory" - ); - } - snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); - } - - { - snow_interleave_line_header(&i, width, b, temp); - - for (; (i & 0x1E) != 0x1E; i-=2){ - b[i+1] = temp[i>>1]; - b[i] = b[i>>1]; - } - for (i-=30; i>=0; i-=32){ - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm2 \n\t" - "movq 16(%1), %%mm4 \n\t" - "movq 24(%1), %%mm6 \n\t" - "movq (%1), %%mm1 \n\t" - "movq 8(%1), %%mm3 \n\t" - "movq 16(%1), %%mm5 \n\t" - "movq 24(%1), %%mm7 \n\t" - "punpcklwd (%2), %%mm0 \n\t" - "punpcklwd 8(%2), %%mm2 \n\t" - "punpcklwd 16(%2), %%mm4 \n\t" - "punpcklwd 24(%2), %%mm6 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm2, 16(%0) \n\t" - "movq %%mm4, 32(%0) \n\t" - "movq %%mm6, 48(%0) \n\t" - "punpckhwd (%2), %%mm1 \n\t" - "punpckhwd 8(%2), %%mm3 \n\t" - "punpckhwd 16(%2), %%mm5 \n\t" - "punpckhwd 24(%2), %%mm7 \n\t" - "movq %%mm1, 8(%0) \n\t" - "movq %%mm3, 24(%0) \n\t" - "movq %%mm5, 40(%0) \n\t" - "movq %%mm7, 56(%0) \n\t" - :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1]) - : "memory" - ); - } - } -} - -#if HAVE_7REGS -#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\ - ""op" ("r",%%"REG_d"), %%"t0" \n\t"\ - ""op" 16("r",%%"REG_d"), %%"t1" \n\t"\ - ""op" 32("r",%%"REG_d"), %%"t2" \n\t"\ - ""op" 48("r",%%"REG_d"), %%"t3" \n\t" - -#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\ - snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3) - -#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\ - snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3) - -#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\ - "psubw %%"s0", %%"t0" \n\t"\ - "psubw %%"s1", %%"t1" \n\t"\ - "psubw %%"s2", %%"t2" \n\t"\ - "psubw %%"s3", %%"t3" \n\t" - -#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\ - "movdqa %%"s0", ("w",%%"REG_d") \n\t"\ - "movdqa %%"s1", 16("w",%%"REG_d") \n\t"\ - "movdqa %%"s2", 32("w",%%"REG_d") \n\t"\ - "movdqa %%"s3", 48("w",%%"REG_d") \n\t" - -#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\ - "psraw $"n", %%"t0" \n\t"\ - "psraw $"n", %%"t1" \n\t"\ - "psraw $"n", %%"t2" \n\t"\ - "psraw $"n", %%"t3" \n\t" - -#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\ - "paddw %%"s0", %%"t0" \n\t"\ - "paddw %%"s1", %%"t1" \n\t"\ - "paddw %%"s2", %%"t2" \n\t"\ - "paddw %%"s3", %%"t3" \n\t" - -#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\ - "pmulhw %%"s0", %%"t0" \n\t"\ - "pmulhw %%"s1", %%"t1" \n\t"\ - "pmulhw %%"s2", %%"t2" \n\t"\ - "pmulhw %%"s3", %%"t3" \n\t" - -#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\ - "movdqa %%"s0", %%"t0" \n\t"\ - "movdqa %%"s1", %%"t1" \n\t"\ - "movdqa %%"s2", %%"t2" \n\t"\ - "movdqa %%"s3", %%"t3" \n\t" - -static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ - x86_reg i = width; - - while(i & 0x1F) - { - i--; - b4[i] -= (W_DM*(b3[i] + 
b5[i])+W_DO)>>W_DS; - b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; - b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; - b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; - } - i+=i; - - __asm__ volatile ( - "jmp 2f \n\t" - "1: \n\t" - snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6") - - - "pcmpeqw %%xmm0, %%xmm0 \n\t" - "pcmpeqw %%xmm2, %%xmm2 \n\t" - "paddw %%xmm2, %%xmm2 \n\t" - "paddw %%xmm0, %%xmm2 \n\t" - "psllw $13, %%xmm2 \n\t" - snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7") - snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7") - snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7") - snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7") - snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7") - snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6") - - "pcmpeqw %%xmm7, %%xmm7 \n\t" - "pcmpeqw %%xmm5, %%xmm5 \n\t" - "psllw $15, %%xmm7 \n\t" - "psrlw $13, %%xmm5 \n\t" - "paddw %%xmm7, %%xmm5 \n\t" - snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6") - "movq (%2,%%"REG_d"), %%xmm1 \n\t" - "movq 8(%2,%%"REG_d"), %%xmm3 \n\t" - "paddw %%xmm7, %%xmm1 \n\t" - "paddw %%xmm7, %%xmm3 \n\t" - "pavgw %%xmm1, %%xmm0 \n\t" - "pavgw %%xmm3, %%xmm2 \n\t" - "movq 16(%2,%%"REG_d"), %%xmm1 \n\t" - "movq 24(%2,%%"REG_d"), %%xmm3 \n\t" - "paddw %%xmm7, %%xmm1 \n\t" - "paddw %%xmm7, %%xmm3 \n\t" - "pavgw %%xmm1, %%xmm4 \n\t" - "pavgw %%xmm3, %%xmm6 \n\t" - snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") - - snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") - snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6") - - "2: \n\t" - "sub $64, %%"REG_d" \n\t" - "jge 1b \n\t" - :"+d"(i) - :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); -} - -#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\ - ""op" ("r",%%"REG_d"), %%"t0" \n\t"\ - ""op" 8("r",%%"REG_d"), %%"t1" \n\t"\ - ""op" 16("r",%%"REG_d"), %%"t2" \n\t"\ - ""op" 24("r",%%"REG_d"), %%"t3" \n\t" - -#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\ - snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3) - -#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\ - snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3) - -#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\ - "movq %%"s0", ("w",%%"REG_d") \n\t"\ - "movq %%"s1", 8("w",%%"REG_d") \n\t"\ - "movq %%"s2", 16("w",%%"REG_d") \n\t"\ - "movq %%"s3", 24("w",%%"REG_d") \n\t" - -#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\ - "movq %%"s0", %%"t0" \n\t"\ - "movq %%"s1", %%"t1" \n\t"\ - 
"movq %%"s2", %%"t2" \n\t"\ - "movq %%"s3", %%"t3" \n\t" - - -static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ - x86_reg i = width; - while(i & 15) - { - i--; - b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; - b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; - b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; - b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; - } - i+=i; - __asm__ volatile( - "jmp 2f \n\t" - "1: \n\t" - - snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7") - snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7") - "pcmpeqw %%mm0, %%mm0 \n\t" - "pcmpeqw %%mm2, %%mm2 \n\t" - "paddw %%mm2, %%mm2 \n\t" - "paddw %%mm0, %%mm2 \n\t" - "psllw $13, %%mm2 \n\t" - snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7") - snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7") - snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7") - snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7") - snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7") - snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6") - "pcmpeqw %%mm7, %%mm7 \n\t" - "pcmpeqw %%mm5, %%mm5 \n\t" - "psllw $15, %%mm7 \n\t" - "psrlw $13, %%mm5 \n\t" - "paddw %%mm7, %%mm5 \n\t" - snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6") - "movq (%2,%%"REG_d"), %%mm1 \n\t" - "movq 8(%2,%%"REG_d"), %%mm3 \n\t" - "paddw %%mm7, %%mm1 \n\t" - "paddw %%mm7, %%mm3 \n\t" - "pavgw %%mm1, %%mm0 \n\t" - "pavgw %%mm3, %%mm2 \n\t" - "movq 16(%2,%%"REG_d"), %%mm1 \n\t" - "movq 24(%2,%%"REG_d"), %%mm3 \n\t" - "paddw %%mm7, %%mm1 \n\t" - "paddw %%mm7, %%mm3 \n\t" - "pavgw %%mm1, %%mm4 \n\t" - "pavgw %%mm3, %%mm6 \n\t" - snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6") - snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") - - snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") - snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") - snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6") - - "2: \n\t" - "sub $32, %%"REG_d" \n\t" - "jge 1b \n\t" - :"+d"(i) - :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); -} -#endif //HAVE_7REGS - -#define snow_inner_add_yblock_sse2_header \ - IDWTELEM * * dst_array = sb->line + src_y;\ - x86_reg tmp;\ - __asm__ volatile(\ - "mov %7, %%"REG_c" \n\t"\ - "mov %6, %2 \n\t"\ - "mov %4, %%"REG_S" \n\t"\ - "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\ - "pcmpeqd %%xmm3, %%xmm3 \n\t"\ - "psllw $15, %%xmm3 \n\t"\ - "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\ - "1: \n\t"\ - "mov %1, %%"REG_D" \n\t"\ - "mov (%%"REG_D"), %%"REG_D" \n\t"\ - "add %3, %%"REG_D" \n\t" - -#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\ - "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ - "movq (%%"REG_d"), %%"out_reg1" \n\t"\ - "movq (%%"REG_d", %%"REG_c"), 
%%"out_reg2" \n\t"\ - "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ - "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ - "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ - "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "punpcklbw %%xmm7, %%xmm4 \n\t"\ - "pmullw %%xmm0, %%"out_reg1" \n\t"\ - "pmullw %%xmm4, %%"out_reg2" \n\t" - -#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\ - "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ - "movq (%%"REG_d"), %%"out_reg1" \n\t"\ - "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\ - "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ - "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ - "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ - "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "punpcklbw %%xmm7, %%xmm4 \n\t"\ - "pmullw %%xmm0, %%"out_reg1" \n\t"\ - "pmullw %%xmm4, %%"out_reg2" \n\t" - -#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \ - snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\ - "paddusw %%xmm2, %%xmm1 \n\t"\ - "paddusw %%xmm6, %%xmm5 \n\t" - -#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \ - snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\ - "paddusw %%xmm2, %%xmm1 \n\t"\ - "paddusw %%xmm6, %%xmm5 \n\t" - -#define snow_inner_add_yblock_sse2_end_common1\ - "add $32, %%"REG_S" \n\t"\ - "add %%"REG_c", %0 \n\t"\ - "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ - "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ - "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ - "add %%"REG_c", (%%"REG_a") \n\t" - -#define snow_inner_add_yblock_sse2_end_common2\ - "jnz 1b \n\t"\ - :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ - :\ - "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\ - "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); - -#define snow_inner_add_yblock_sse2_end_8\ - "sal $1, %%"REG_c" \n\t"\ - "add"OPSIZE" $"PTR_SIZE"*2, %1 \n\t"\ - snow_inner_add_yblock_sse2_end_common1\ - "sar $1, %%"REG_c" \n\t"\ - "sub $2, %2 \n\t"\ - snow_inner_add_yblock_sse2_end_common2 - -#define snow_inner_add_yblock_sse2_end_16\ - "add"OPSIZE" $"PTR_SIZE"*1, %1 \n\t"\ - snow_inner_add_yblock_sse2_end_common1\ - "dec %2 \n\t"\ - snow_inner_add_yblock_sse2_end_common2 - -static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, - int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ -snow_inner_add_yblock_sse2_header -snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0") -snow_inner_add_yblock_sse2_accum_8("2", "8") -snow_inner_add_yblock_sse2_accum_8("1", "128") -snow_inner_add_yblock_sse2_accum_8("0", "136") - - "mov %0, %%"REG_d" \n\t" - "movdqa (%%"REG_D"), %%xmm0 \n\t" - "movdqa %%xmm1, %%xmm2 \n\t" - - "punpckhwd %%xmm7, %%xmm1 \n\t" - "punpcklwd %%xmm7, %%xmm2 \n\t" - "paddd %%xmm2, %%xmm0 \n\t" - "movdqa 16(%%"REG_D"), %%xmm2 \n\t" - "paddd %%xmm1, %%xmm2 \n\t" - "paddd %%xmm3, %%xmm0 \n\t" - "paddd %%xmm3, %%xmm2 \n\t" - - "mov %1, %%"REG_D" \n\t" - "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t" - "add %3, %%"REG_D" \n\t" - - "movdqa (%%"REG_D"), %%xmm4 \n\t" - "movdqa %%xmm5, %%xmm6 \n\t" - "punpckhwd %%xmm7, %%xmm5 \n\t" - "punpcklwd %%xmm7, %%xmm6 \n\t" - "paddd %%xmm6, %%xmm4 \n\t" - "movdqa 16(%%"REG_D"), %%xmm6 \n\t" - "paddd %%xmm5, %%xmm6 \n\t" - "paddd %%xmm3, %%xmm4 \n\t" - "paddd %%xmm3, %%xmm6 \n\t" - - "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. 
*/ - "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */ - "packssdw %%xmm2, %%xmm0 \n\t" - "packuswb %%xmm7, %%xmm0 \n\t" - "movq %%xmm0, (%%"REG_d") \n\t" - - "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */ - "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */ - "packssdw %%xmm6, %%xmm4 \n\t" - "packuswb %%xmm7, %%xmm4 \n\t" - "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t" -snow_inner_add_yblock_sse2_end_8 -} - -static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, - int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ -snow_inner_add_yblock_sse2_header -snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0") -snow_inner_add_yblock_sse2_accum_16("2", "16") -snow_inner_add_yblock_sse2_accum_16("1", "512") -snow_inner_add_yblock_sse2_accum_16("0", "528") - - "mov %0, %%"REG_d" \n\t" - "psrlw $4, %%xmm1 \n\t" - "psrlw $4, %%xmm5 \n\t" - "paddw (%%"REG_D"), %%xmm1 \n\t" - "paddw 16(%%"REG_D"), %%xmm5 \n\t" - "paddw %%xmm3, %%xmm1 \n\t" - "paddw %%xmm3, %%xmm5 \n\t" - "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */ - "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */ - "packuswb %%xmm5, %%xmm1 \n\t" - - "movdqu %%xmm1, (%%"REG_d") \n\t" - -snow_inner_add_yblock_sse2_end_16 -} - -#define snow_inner_add_yblock_mmx_header \ - IDWTELEM * * dst_array = sb->line + src_y;\ - x86_reg tmp;\ - __asm__ volatile(\ - "mov %7, %%"REG_c" \n\t"\ - "mov %6, %2 \n\t"\ - "mov %4, %%"REG_S" \n\t"\ - "pxor %%mm7, %%mm7 \n\t" /* 0 */\ - "pcmpeqd %%mm3, %%mm3 \n\t"\ - "psllw $15, %%mm3 \n\t"\ - "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\ - "1: \n\t"\ - "mov %1, %%"REG_D" \n\t"\ - "mov (%%"REG_D"), %%"REG_D" \n\t"\ - "add %3, %%"REG_D" \n\t" - -#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\ - "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ - "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\ - "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\ - "punpcklbw %%mm7, %%"out_reg1" \n\t"\ - "punpcklbw %%mm7, %%"out_reg2" \n\t"\ - "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\ - "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - "pmullw %%mm0, %%"out_reg1" \n\t"\ - "pmullw %%mm4, %%"out_reg2" \n\t" - -#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \ - snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\ - "paddusw %%mm2, %%mm1 \n\t"\ - "paddusw %%mm6, %%mm5 \n\t" - -#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\ - "mov %0, %%"REG_d" \n\t"\ - "psrlw $4, %%mm1 \n\t"\ - "psrlw $4, %%mm5 \n\t"\ - "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\ - "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "paddw %%mm3, %%mm5 \n\t"\ - "psraw $4, %%mm1 \n\t"\ - "psraw $4, %%mm5 \n\t"\ - "packuswb %%mm5, %%mm1 \n\t"\ - "movq %%mm1, "write_offset"(%%"REG_d") \n\t" - -#define snow_inner_add_yblock_mmx_end(s_step)\ - "add $"s_step", %%"REG_S" \n\t"\ - "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ - "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ - "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ - "add %%"REG_c", (%%"REG_a") \n\t"\ - "add"OPSIZE " $"PTR_SIZE"*1, %1 \n\t"\ - "add %%"REG_c", %0 \n\t"\ - "dec %2 \n\t"\ - "jnz 1b \n\t"\ - :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ - :\ - "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\ - "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); - -static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, 
const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, - int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ -snow_inner_add_yblock_mmx_header -snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") -snow_inner_add_yblock_mmx_accum("2", "8", "0") -snow_inner_add_yblock_mmx_accum("1", "128", "0") -snow_inner_add_yblock_mmx_accum("0", "136", "0") -snow_inner_add_yblock_mmx_mix("0", "0") -snow_inner_add_yblock_mmx_end("16") -} - -static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, - int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ -snow_inner_add_yblock_mmx_header -snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") -snow_inner_add_yblock_mmx_accum("2", "16", "0") -snow_inner_add_yblock_mmx_accum("1", "512", "0") -snow_inner_add_yblock_mmx_accum("0", "528", "0") -snow_inner_add_yblock_mmx_mix("0", "0") - -snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8") -snow_inner_add_yblock_mmx_accum("2", "24", "8") -snow_inner_add_yblock_mmx_accum("1", "520", "8") -snow_inner_add_yblock_mmx_accum("0", "536", "8") -snow_inner_add_yblock_mmx_mix("16", "8") -snow_inner_add_yblock_mmx_end("32") -} - -static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, - int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ - - if (b_w == 16) - inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); - else if (b_w == 8 && obmc_stride == 16) { - if (!(b_h & 1)) - inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); - else - inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); - } else - ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); -} - -static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, - int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ - if (b_w == 16) - inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); - else if (b_w == 8 && obmc_stride == 16) - inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); - else - ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); -} - -#endif /* HAVE_INLINE_ASM */ - -void ff_dwt_init_x86(SnowDWTContext *c) -{ -#if HAVE_INLINE_ASM - int mm_flags = av_get_cpu_flags(); - - if (mm_flags & AV_CPU_FLAG_MMX) { - if(mm_flags & AV_CPU_FLAG_SSE2 & 0){ - c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2; -#if HAVE_7REGS - c->vertical_compose97i = ff_snow_vertical_compose97i_sse2; -#endif - c->inner_add_yblock = ff_snow_inner_add_yblock_sse2; - } - else{ - if (mm_flags & AV_CPU_FLAG_MMXEXT) { - c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx; -#if HAVE_7REGS - c->vertical_compose97i = ff_snow_vertical_compose97i_mmx; -#endif - } - c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; - } - } -#endif /* HAVE_INLINE_ASM */ -} diff --git a/ffmpeg1/libavcodec/x86/v210-init.c b/ffmpeg1/libavcodec/x86/v210-init.c deleted file mode 100644 index 02c5eaa..0000000 --- 
a/ffmpeg1/libavcodec/x86/v210-init.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/cpu.h" -#include "libavcodec/v210dec.h" - -extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); -extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); - -extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); -extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); - -av_cold void v210_x86_init(V210DecContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - -#if HAVE_YASM - if (s->aligned_input) { - if (cpu_flags & AV_CPU_FLAG_SSSE3) - s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3; - - if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX) - s->unpack_frame = ff_v210_planar_unpack_aligned_avx; - } - else { - if (cpu_flags & AV_CPU_FLAG_SSSE3) - s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3; - - if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX) - s->unpack_frame = ff_v210_planar_unpack_unaligned_avx; - } -#endif -} diff --git a/ffmpeg1/libavcodec/x86/v210.asm b/ffmpeg1/libavcodec/x86/v210.asm deleted file mode 100644 index 5473126..0000000 --- a/ffmpeg1/libavcodec/x86/v210.asm +++ /dev/null @@ -1,88 +0,0 @@ -;****************************************************************************** -;* V210 SIMD unpack -;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu> -;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com> -;* -;* This file is part of Libav. -;* -;* Libav is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -v210_mask: times 4 dd 0x3ff -v210_mult: dw 64,4,64,4,64,4,64,4 -v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1 -v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1 - -SECTION .text - -%macro v210_planar_unpack 2 - -; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width) -cglobal v210_planar_unpack_%1_%2, 5, 5, 7 - movsxdifnidn r4, r4d - lea r1, [r1+2*r4] - add r2, r4 - add r3, r4 - neg r4 - - mova m3, [v210_mult] - mova m4, [v210_mask] - mova m5, [v210_luma_shuf] - mova m6, [v210_chroma_shuf] -.loop -%ifidn %1, unaligned - movu m0, [r0] -%else - mova m0, [r0] -%endif - - pmullw m1, m0, m3 - psrld m0, 10 - psrlw m1, 6 ; u0 v0 y1 y2 v1 u2 y4 y5 - pand m0, m4 ; y0 __ u1 __ y3 __ v2 __ - - shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __ - pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __ - movu [r1+2*r4], m2 - - shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __ - pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __ - movq [r2+r4], m1 - movhps [r3+r4], m1 - - add r0, mmsize - add r4, 6 - jl .loop - - REP_RET -%endmacro - -INIT_XMM -v210_planar_unpack unaligned, ssse3 -%if HAVE_AVX_EXTERNAL -INIT_AVX -v210_planar_unpack unaligned, avx -%endif - -INIT_XMM -v210_planar_unpack aligned, ssse3 -%if HAVE_AVX_EXTERNAL -INIT_AVX -v210_planar_unpack aligned, avx -%endif diff --git a/ffmpeg1/libavcodec/x86/vc1dsp.asm b/ffmpeg1/libavcodec/x86/vc1dsp.asm deleted file mode 100644 index 546688c..0000000 --- a/ffmpeg1/libavcodec/x86/vc1dsp.asm +++ /dev/null @@ -1,317 +0,0 @@ -;****************************************************************************** -;* VC1 deblocking optimizations -;* Copyright (c) 2009 David Conrad -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -cextern pw_4 -cextern pw_5 - -section .text - -; dst_low, dst_high (src), zero -; zero-extends one vector from 8 to 16 bits -%macro UNPACK_8TO16 4 - mova m%2, m%3 - punpckh%1 m%3, m%4 - punpckl%1 m%2, m%4 -%endmacro - -%macro STORE_4_WORDS 6 -%if cpuflag(sse4) - pextrw %1, %5, %6+0 - pextrw %2, %5, %6+1 - pextrw %3, %5, %6+2 - pextrw %4, %5, %6+3 -%else - movd %6d, %5 -%if mmsize==16 - psrldq %5, 4 -%else - psrlq %5, 32 -%endif - mov %1, %6w - shr %6, 16 - mov %2, %6w - movd %6d, %5 - mov %3, %6w - shr %6, 16 - mov %4, %6w -%endif -%endmacro - -; in: p1 p0 q0 q1, clobbers p0 -; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3 -%macro VC1_LOOP_FILTER_A0 4 - psubw %1, %4 - psubw %2, %3 - paddw %1, %1 - pmullw %2, [pw_5] - psubw %1, %2 - paddw %1, [pw_4] - psraw %1, 3 -%endmacro - -; in: p0 q0 a0 a1 a2 -; m0 m1 m7 m6 m5 -; %1: size -; out: m0=p0' m1=q0' -%macro VC1_FILTER 1 - PABSW m4, m7 - PABSW m3, m6 - PABSW m2, m5 - mova m6, m4 - pminsw m3, m2 - pcmpgtw m6, m3 ; if (a2 < a0 || a1 < a0) - psubw m3, m4 - pmullw m3, [pw_5] ; 5*(a3 - a0) - PABSW m2, m3 - psraw m2, 3 ; abs(d/8) - pxor m7, m3 ; d_sign ^= a0_sign - - pxor m5, m5 - movd m3, r2d -%if %1 > 4 - punpcklbw m3, m3 -%endif - punpcklbw m3, m5 - pcmpgtw m3, m4 ; if (a0 < pq) - pand m6, m3 - - mova m3, m0 - psubw m3, m1 - PABSW m4, m3 - psraw m4, 1 - pxor m3, m7 ; d_sign ^ clip_sign - psraw m3, 15 - pminsw m2, m4 ; min(d, clip) - pcmpgtw m4, m5 - pand m6, m4 ; filt3 (C return value) - -; each set of 4 pixels is not filtered if the 3rd is not -%if mmsize==16 - pshuflw m4, m6, 0xaa -%if %1 > 4 - pshufhw m4, m4, 0xaa -%endif -%else - pshufw m4, m6, 0xaa -%endif - pandn m3, m4 - pand m2, m6 - pand m3, m2 ; d final - - psraw m7, 15 - pxor m3, m7 - psubw m3, m7 - psubw m0, m3 - paddw m1, m3 - packuswb m0, m0 - packuswb m1, m1 -%endmacro - -; 1st param: size of filter -; 2nd param: mov suffix equivalent to the filter size -%macro VC1_V_LOOP_FILTER 2 - pxor m5, m5 - mov%2 m6, [r4] - mov%2 m4, [r4+r1] - mov%2 m7, [r4+2*r1] - mov%2 m0, [r4+r3] - punpcklbw m6, m5 - punpcklbw m4, m5 - punpcklbw m7, m5 - punpcklbw m0, m5 - - VC1_LOOP_FILTER_A0 m6, m4, m7, m0 - mov%2 m1, [r0] - mov%2 m2, [r0+r1] - punpcklbw m1, m5 - punpcklbw m2, m5 - mova m4, m0 - VC1_LOOP_FILTER_A0 m7, m4, m1, m2 - mov%2 m3, [r0+2*r1] - mov%2 m4, [r0+r3] - punpcklbw m3, m5 - punpcklbw m4, m5 - mova m5, m1 - VC1_LOOP_FILTER_A0 m5, m2, m3, m4 - - VC1_FILTER %1 - mov%2 [r4+r3], m0 - mov%2 [r0], m1 -%endmacro - -; 1st param: size of filter -; NOTE: UNPACK_8TO16 this number of 8 bit numbers are in half a register -; 2nd (optional) param: temp register to use for storing words -%macro VC1_H_LOOP_FILTER 1-2 -%if %1 == 4 - movq m0, [r0 -4] - movq m1, [r0+ r1-4] - movq m2, [r0+2*r1-4] - movq m3, [r0+ r3-4] - TRANSPOSE4x4B 0, 1, 2, 3, 4 -%else - movq m0, [r0 -4] - movq m4, [r0+ r1-4] - movq m1, [r0+2*r1-4] - movq m5, [r0+ r3-4] - movq m2, [r4 -4] - movq m6, [r4+ r1-4] - movq m3, [r4+2*r1-4] - movq m7, [r4+ r3-4] - punpcklbw m0, m4 - punpcklbw m1, m5 - punpcklbw m2, m6 - punpcklbw m3, m7 - TRANSPOSE4x4W 0, 1, 2, 3, 4 -%endif - pxor m5, m5 - - UNPACK_8TO16 bw, 6, 0, 5 - UNPACK_8TO16 bw, 7, 1, 5 - VC1_LOOP_FILTER_A0 m6, m0, m7, m1 - UNPACK_8TO16 bw, 4, 2, 5 - 
mova m0, m1 ; m0 = p0 - VC1_LOOP_FILTER_A0 m7, m1, m4, m2 - UNPACK_8TO16 bw, 1, 3, 5 - mova m5, m4 - VC1_LOOP_FILTER_A0 m5, m2, m1, m3 - SWAP 1, 4 ; m1 = q0 - - VC1_FILTER %1 - punpcklbw m0, m1 -%if %0 > 1 - STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2 -%if %1 > 4 - psrldq m0, 4 - STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2 -%endif -%else - STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0 - STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4 -%endif -%endmacro - - -%macro START_V_FILTER 0 - mov r4, r0 - lea r3, [4*r1] - sub r4, r3 - lea r3, [r1+2*r1] - imul r2, 0x01010101 -%endmacro - -%macro START_H_FILTER 1 - lea r3, [r1+2*r1] -%if %1 > 4 - lea r4, [r0+4*r1] -%endif - imul r2, 0x01010101 -%endmacro - -%macro VC1_LF 0 -cglobal vc1_v_loop_filter_internal - VC1_V_LOOP_FILTER 4, d - ret - -cglobal vc1_h_loop_filter_internal - VC1_H_LOOP_FILTER 4, r4 - ret - -; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq) -cglobal vc1_v_loop_filter4, 3,5,0 - START_V_FILTER - call vc1_v_loop_filter_internal - RET - -; void ff_vc1_h_loop_filter4_mmxext(uint8_t *src, int stride, int pq) -cglobal vc1_h_loop_filter4, 3,5,0 - START_H_FILTER 4 - call vc1_h_loop_filter_internal - RET - -; void ff_vc1_v_loop_filter8_mmxext(uint8_t *src, int stride, int pq) -cglobal vc1_v_loop_filter8, 3,5,0 - START_V_FILTER - call vc1_v_loop_filter_internal - add r4, 4 - add r0, 4 - call vc1_v_loop_filter_internal - RET - -; void ff_vc1_h_loop_filter8_mmxext(uint8_t *src, int stride, int pq) -cglobal vc1_h_loop_filter8, 3,5,0 - START_H_FILTER 4 - call vc1_h_loop_filter_internal - lea r0, [r0+4*r1] - call vc1_h_loop_filter_internal - RET -%endmacro - -INIT_MMX mmxext -VC1_LF - -INIT_XMM sse2 -; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq) -cglobal vc1_v_loop_filter8, 3,5,8 - START_V_FILTER - VC1_V_LOOP_FILTER 8, q - RET - -; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq) -cglobal vc1_h_loop_filter8, 3,6,8 - START_H_FILTER 8 - VC1_H_LOOP_FILTER 8, r5 - RET - -INIT_MMX ssse3 -; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq) -cglobal vc1_v_loop_filter4, 3,5,0 - START_V_FILTER - VC1_V_LOOP_FILTER 4, d - RET - -; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq) -cglobal vc1_h_loop_filter4, 3,5,0 - START_H_FILTER 4 - VC1_H_LOOP_FILTER 4, r4 - RET - -INIT_XMM ssse3 -; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq) -cglobal vc1_v_loop_filter8, 3,5,8 - START_V_FILTER - VC1_V_LOOP_FILTER 8, q - RET - -; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq) -cglobal vc1_h_loop_filter8, 3,6,8 - START_H_FILTER 8 - VC1_H_LOOP_FILTER 8, r5 - RET - -INIT_XMM sse4 -; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq) -cglobal vc1_h_loop_filter8, 3,5,8 - START_H_FILTER 8 - VC1_H_LOOP_FILTER 8 - RET diff --git a/ffmpeg1/libavcodec/x86/vc1dsp.h b/ffmpeg1/libavcodec/x86/vc1dsp.h deleted file mode 100644 index fdd4de1..0000000 --- a/ffmpeg1/libavcodec/x86/vc1dsp.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * VC-1 and WMV3 decoder - X86 DSP init functions - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_VC1DSP_H -#define AVCODEC_X86_VC1DSP_H - -#include "libavcodec/vc1dsp.h" - -void ff_vc1dsp_init_mmx(VC1DSPContext *dsp); -void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp); - -#endif /* AVCODEC_X86_VC1DSP_H */ diff --git a/ffmpeg1/libavcodec/x86/vc1dsp_init.c b/ffmpeg1/libavcodec/x86/vc1dsp_init.c deleted file mode 100644 index 228f4dc..0000000 --- a/ffmpeg1/libavcodec/x86/vc1dsp_init.c +++ /dev/null @@ -1,132 +0,0 @@ -/* - * VC-1 and WMV3 - DSP functions MMX-optimized - * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr> - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -#include "libavutil/cpu.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/vc1dsp.h" -#include "dsputil_mmx.h" -#include "vc1dsp.h" -#include "config.h" - -#define LOOP_FILTER(EXT) \ -void ff_vc1_v_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \ -void ff_vc1_h_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \ -void ff_vc1_v_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \ -void ff_vc1_h_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \ -\ -static void vc1_v_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \ -{ \ - ff_vc1_v_loop_filter8_ ## EXT(src, stride, pq); \ - ff_vc1_v_loop_filter8_ ## EXT(src+8, stride, pq); \ -} \ -\ -static void vc1_h_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \ -{ \ - ff_vc1_h_loop_filter8_ ## EXT(src, stride, pq); \ - ff_vc1_h_loop_filter8_ ## EXT(src+8*stride, stride, pq); \ -} - -#if HAVE_YASM -LOOP_FILTER(mmxext) -LOOP_FILTER(sse2) -LOOP_FILTER(ssse3) - -void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq); - -static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq) -{ - ff_vc1_h_loop_filter8_sse4(src, stride, pq); - ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq); -} - -static void avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride, int rnd) -{ - ff_avg_pixels8_mmxext(dst, src, stride, 8); -} -#endif /* HAVE_YASM */ - -void ff_put_vc1_chroma_mc8_nornd_mmx (uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); -void ff_avg_vc1_chroma_mc8_nornd_mmxext(uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); -void ff_avg_vc1_chroma_mc8_nornd_3dnow(uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); -void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); -void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src, - int stride, int h, int x, int y); - - -av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp) -{ - int mm_flags = av_get_cpu_flags(); - - if (INLINE_MMX(mm_flags)) - ff_vc1dsp_init_mmx(dsp); - - if (INLINE_MMXEXT(mm_flags)) - ff_vc1dsp_init_mmxext(dsp); - -#define ASSIGN_LF(EXT) \ - dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_ ## EXT; \ - dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_ ## EXT; \ - dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_ ## EXT; \ - dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_ ## EXT; \ - dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_ ## EXT; \ - dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_ ## EXT - -#if HAVE_YASM - if (mm_flags & AV_CPU_FLAG_MMX) { - dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx; - } - - if (mm_flags & AV_CPU_FLAG_MMXEXT) { - ASSIGN_LF(mmxext); - dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext; - - dsp->avg_vc1_mspel_pixels_tab[0] = avg_vc1_mspel_mc00_mmxext; - } else if (mm_flags & AV_CPU_FLAG_3DNOW) { - dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_3dnow; - } - - if (mm_flags & AV_CPU_FLAG_SSE2) { - dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2; - dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse2; - dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2; - dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2; - } - if (mm_flags & AV_CPU_FLAG_SSSE3) { - ASSIGN_LF(ssse3); - dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_ssse3; - dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_ssse3; - } - if (mm_flags & AV_CPU_FLAG_SSE4) { - dsp->vc1_h_loop_filter8 = 
ff_vc1_h_loop_filter8_sse4; - dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse4; - } -#endif /* HAVE_YASM */ -} diff --git a/ffmpeg1/libavcodec/x86/vc1dsp_mmx.c b/ffmpeg1/libavcodec/x86/vc1dsp_mmx.c deleted file mode 100644 index df0385f..0000000 --- a/ffmpeg1/libavcodec/x86/vc1dsp_mmx.c +++ /dev/null @@ -1,750 +0,0 @@ -/* - * VC-1 and WMV3 - DSP functions MMX-optimized - * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr> - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "libavutil/cpu.h" -#include "libavutil/mem.h" -#include "libavutil/x86/asm.h" -#include "libavutil/x86/cpu.h" -#include "dsputil_mmx.h" -#include "libavcodec/vc1dsp.h" -#include "vc1dsp.h" - -#if HAVE_INLINE_ASM - -#define OP_PUT(S,D) -#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t" - -/** Add rounder from mm7 to mm3 and pack result at destination */ -#define NORMALIZE_MMX(SHIFT) \ - "paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \ - "paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \ - "psraw "SHIFT", %%mm3 \n\t" \ - "psraw "SHIFT", %%mm4 \n\t" - -#define TRANSFER_DO_PACK(OP) \ - "packuswb %%mm4, %%mm3 \n\t" \ - OP((%2), %%mm3) \ - "movq %%mm3, (%2) \n\t" - -#define TRANSFER_DONT_PACK(OP) \ - OP(0(%2), %%mm3) \ - OP(8(%2), %%mm4) \ - "movq %%mm3, 0(%2) \n\t" \ - "movq %%mm4, 8(%2) \n\t" - -/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */ -#define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t" -#define DONT_UNPACK(reg) - -/** Compute the rounder 32-r or 8-r and unpacks it to mm7 */ -#define LOAD_ROUNDER_MMX(ROUND) \ - "movd "ROUND", %%mm7 \n\t" \ - "punpcklwd %%mm7, %%mm7 \n\t" \ - "punpckldq %%mm7, %%mm7 \n\t" - -#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \ - "paddw %%mm"#R2", %%mm"#R1" \n\t" \ - "movd (%0,%3), %%mm"#R0" \n\t" \ - "pmullw %%mm6, %%mm"#R1" \n\t" \ - "punpcklbw %%mm0, %%mm"#R0" \n\t" \ - "movd (%0,%2), %%mm"#R3" \n\t" \ - "psubw %%mm"#R0", %%mm"#R1" \n\t" \ - "punpcklbw %%mm0, %%mm"#R3" \n\t" \ - "paddw %%mm7, %%mm"#R1" \n\t" \ - "psubw %%mm"#R3", %%mm"#R1" \n\t" \ - "psraw %4, %%mm"#R1" \n\t" \ - "movq %%mm"#R1", "#OFF"(%1) \n\t" \ - "add %2, %0 \n\t" - -/** Sacrifying mm6 allows to pipeline loads from src */ -static void vc1_put_ver_16b_shift2_mmx(int16_t *dst, - const uint8_t *src, x86_reg stride, - int rnd, int64_t shift) -{ - __asm__ volatile( - "mov $3, %%"REG_c" \n\t" - LOAD_ROUNDER_MMX("%5") - "movq "MANGLE(ff_pw_9)", %%mm6 \n\t" - "1: \n\t" - "movd (%0), %%mm2 \n\t" - "add %2, %0 \n\t" - "movd (%0), %%mm3 \n\t" - "punpcklbw %%mm0, %%mm2 \n\t" 
- "punpcklbw %%mm0, %%mm3 \n\t" - SHIFT2_LINE( 0, 1, 2, 3, 4) - SHIFT2_LINE( 24, 2, 3, 4, 1) - SHIFT2_LINE( 48, 3, 4, 1, 2) - SHIFT2_LINE( 72, 4, 1, 2, 3) - SHIFT2_LINE( 96, 1, 2, 3, 4) - SHIFT2_LINE(120, 2, 3, 4, 1) - SHIFT2_LINE(144, 3, 4, 1, 2) - SHIFT2_LINE(168, 4, 1, 2, 3) - "sub %6, %0 \n\t" - "add $8, %1 \n\t" - "dec %%"REG_c" \n\t" - "jnz 1b \n\t" - : "+r"(src), "+r"(dst) - : "r"(stride), "r"(-2*stride), - "m"(shift), "m"(rnd), "r"(9*stride-4) - : "%"REG_c, "memory" - ); -} - -/** - * Data is already unpacked, so some operations can directly be made from - * memory. - */ -#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\ -static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\ - const int16_t *src, int rnd)\ -{\ - int h = 8;\ -\ - src -= 1;\ - rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\ - __asm__ volatile(\ - LOAD_ROUNDER_MMX("%4")\ - "movq "MANGLE(ff_pw_128)", %%mm6\n\t"\ - "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\ - "1: \n\t"\ - "movq 2*0+0(%1), %%mm1 \n\t"\ - "movq 2*0+8(%1), %%mm2 \n\t"\ - "movq 2*1+0(%1), %%mm3 \n\t"\ - "movq 2*1+8(%1), %%mm4 \n\t"\ - "paddw 2*3+0(%1), %%mm1 \n\t"\ - "paddw 2*3+8(%1), %%mm2 \n\t"\ - "paddw 2*2+0(%1), %%mm3 \n\t"\ - "paddw 2*2+8(%1), %%mm4 \n\t"\ - "pmullw %%mm5, %%mm3 \n\t"\ - "pmullw %%mm5, %%mm4 \n\t"\ - "psubw %%mm1, %%mm3 \n\t"\ - "psubw %%mm2, %%mm4 \n\t"\ - NORMALIZE_MMX("$7")\ - /* Remove bias */\ - "paddw %%mm6, %%mm3 \n\t"\ - "paddw %%mm6, %%mm4 \n\t"\ - TRANSFER_DO_PACK(OP)\ - "add $24, %1 \n\t"\ - "add %3, %2 \n\t"\ - "decl %0 \n\t"\ - "jnz 1b \n\t"\ - : "+r"(h), "+r" (src), "+r" (dst)\ - : "r"(stride), "m"(rnd)\ - : "memory"\ - );\ -} - -VC1_HOR_16b_SHIFT2(OP_PUT, put_) -VC1_HOR_16b_SHIFT2(OP_AVG, avg_) - - -/** - * Purely vertical or horizontal 1/2 shift interpolation. - * Sacrify mm6 for *9 factor. - */ -#define VC1_SHIFT2(OP, OPNAME)\ -static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\ - x86_reg stride, int rnd, x86_reg offset)\ -{\ - rnd = 8-rnd;\ - __asm__ volatile(\ - "mov $8, %%"REG_c" \n\t"\ - LOAD_ROUNDER_MMX("%5")\ - "movq "MANGLE(ff_pw_9)", %%mm6\n\t"\ - "1: \n\t"\ - "movd 0(%0 ), %%mm3 \n\t"\ - "movd 4(%0 ), %%mm4 \n\t"\ - "movd 0(%0,%2), %%mm1 \n\t"\ - "movd 4(%0,%2), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm0, %%mm3 \n\t"\ - "punpcklbw %%mm0, %%mm4 \n\t"\ - "punpcklbw %%mm0, %%mm1 \n\t"\ - "punpcklbw %%mm0, %%mm2 \n\t"\ - "paddw %%mm1, %%mm3 \n\t"\ - "paddw %%mm2, %%mm4 \n\t"\ - "movd 0(%0,%3), %%mm1 \n\t"\ - "movd 4(%0,%3), %%mm2 \n\t"\ - "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\ - "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\ - "punpcklbw %%mm0, %%mm1 \n\t"\ - "punpcklbw %%mm0, %%mm2 \n\t"\ - "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\ - "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\ - "movd 0(%0,%2), %%mm1 \n\t"\ - "movd 4(%0,%2), %%mm2 \n\t"\ - "punpcklbw %%mm0, %%mm1 \n\t"\ - "punpcklbw %%mm0, %%mm2 \n\t"\ - "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\ - "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\ - NORMALIZE_MMX("$4")\ - "packuswb %%mm4, %%mm3 \n\t"\ - OP((%1), %%mm3)\ - "movq %%mm3, (%1) \n\t"\ - "add %6, %0 \n\t"\ - "add %4, %1 \n\t"\ - "dec %%"REG_c" \n\t"\ - "jnz 1b \n\t"\ - : "+r"(src), "+r"(dst)\ - : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\ - "g"(stride-offset)\ - : "%"REG_c, "memory"\ - );\ -} - -VC1_SHIFT2(OP_PUT, put_) -VC1_SHIFT2(OP_AVG, avg_) - -/** - * Core of the 1/4 and 3/4 shift bicubic interpolation. - * - * @param UNPACK Macro unpacking arguments from 8 to 16bits (can be empty). - * @param MOVQ "movd 1" or "movq 2", if data read is already unpacked. 
- * @param A1 Address of 1st tap (beware of unpacked/packed). - * @param A2 Address of 2nd tap - * @param A3 Address of 3rd tap - * @param A4 Address of 4th tap - */ -#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \ - MOVQ "*0+"A1", %%mm1 \n\t" \ - MOVQ "*4+"A1", %%mm2 \n\t" \ - UNPACK("%%mm1") \ - UNPACK("%%mm2") \ - "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \ - "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \ - MOVQ "*0+"A2", %%mm3 \n\t" \ - MOVQ "*4+"A2", %%mm4 \n\t" \ - UNPACK("%%mm3") \ - UNPACK("%%mm4") \ - "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \ - "pmullw %%mm6, %%mm4 \n\t" /* *18 */ \ - "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \ - "psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \ - MOVQ "*0+"A4", %%mm1 \n\t" \ - MOVQ "*4+"A4", %%mm2 \n\t" \ - UNPACK("%%mm1") \ - UNPACK("%%mm2") \ - "psllw $2, %%mm1 \n\t" /* 4* */ \ - "psllw $2, %%mm2 \n\t" /* 4* */ \ - "psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \ - "psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \ - MOVQ "*0+"A3", %%mm1 \n\t" \ - MOVQ "*4+"A3", %%mm2 \n\t" \ - UNPACK("%%mm1") \ - UNPACK("%%mm2") \ - "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \ - "pmullw %%mm5, %%mm2 \n\t" /* *53 */ \ - "paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \ - "paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */ - -/** - * Macro to build the vertical 16bits version of vc1_put_shift[13]. - * Here, offset=src_stride. Parameters passed A1 to A4 must use - * %3 (src_stride) and %4 (3*src_stride). - * - * @param NAME Either 1 or 3 - * @see MSPEL_FILTER13_CORE for information on A1->A4 - */ -#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \ -static void \ -vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \ - x86_reg src_stride, \ - int rnd, int64_t shift) \ -{ \ - int h = 8; \ - src -= src_stride; \ - __asm__ volatile( \ - LOAD_ROUNDER_MMX("%5") \ - "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \ - "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \ - ".p2align 3 \n\t" \ - "1: \n\t" \ - MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \ - NORMALIZE_MMX("%6") \ - TRANSFER_DONT_PACK(OP_PUT) \ - /* Last 3 (in fact 4) bytes on the line */ \ - "movd 8+"A1", %%mm1 \n\t" \ - DO_UNPACK("%%mm1") \ - "movq %%mm1, %%mm3 \n\t" \ - "paddw %%mm1, %%mm1 \n\t" \ - "paddw %%mm3, %%mm1 \n\t" /* 3* */ \ - "movd 8+"A2", %%mm3 \n\t" \ - DO_UNPACK("%%mm3") \ - "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \ - "psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \ - "movd 8+"A3", %%mm1 \n\t" \ - DO_UNPACK("%%mm1") \ - "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \ - "paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \ - "movd 8+"A4", %%mm1 \n\t" \ - DO_UNPACK("%%mm1") \ - "psllw $2, %%mm1 \n\t" /* 4* */ \ - "psubw %%mm1, %%mm3 \n\t" \ - "paddw %%mm7, %%mm3 \n\t" \ - "psraw %6, %%mm3 \n\t" \ - "movq %%mm3, 16(%2) \n\t" \ - "add %3, %1 \n\t" \ - "add $24, %2 \n\t" \ - "decl %0 \n\t" \ - "jnz 1b \n\t" \ - : "+r"(h), "+r" (src), "+r" (dst) \ - : "r"(src_stride), "r"(3*src_stride), \ - "m"(rnd), "m"(shift) \ - : "memory" \ - ); \ -} - -/** - * Macro to build the horizontal 16bits version of vc1_put_shift[13]. - * Here, offset=16bits, so parameters passed A1 to A4 should be simple. 
- * - * @param NAME Either 1 or 3 - * @see MSPEL_FILTER13_CORE for information on A1->A4 - */ -#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \ -static void \ -OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \ - const int16_t *src, int rnd) \ -{ \ - int h = 8; \ - src -= 1; \ - rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \ - __asm__ volatile( \ - LOAD_ROUNDER_MMX("%4") \ - "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \ - "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \ - ".p2align 3 \n\t" \ - "1: \n\t" \ - MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \ - NORMALIZE_MMX("$7") \ - /* Remove bias */ \ - "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \ - "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \ - TRANSFER_DO_PACK(OP) \ - "add $24, %1 \n\t" \ - "add %3, %2 \n\t" \ - "decl %0 \n\t" \ - "jnz 1b \n\t" \ - : "+r"(h), "+r" (src), "+r" (dst) \ - : "r"(stride), "m"(rnd) \ - : "memory" \ - ); \ -} - -/** - * Macro to build the 8bits, any direction, version of vc1_put_shift[13]. - * Here, offset=src_stride. Parameters passed A1 to A4 must use - * %3 (offset) and %4 (3*offset). - * - * @param NAME Either 1 or 3 - * @see MSPEL_FILTER13_CORE for information on A1->A4 - */ -#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \ -static void \ -OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \ - x86_reg stride, int rnd, x86_reg offset) \ -{ \ - int h = 8; \ - src -= offset; \ - rnd = 32-rnd; \ - __asm__ volatile ( \ - LOAD_ROUNDER_MMX("%6") \ - "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \ - "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \ - ".p2align 3 \n\t" \ - "1: \n\t" \ - MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \ - NORMALIZE_MMX("$6") \ - TRANSFER_DO_PACK(OP) \ - "add %5, %1 \n\t" \ - "add %5, %2 \n\t" \ - "decl %0 \n\t" \ - "jnz 1b \n\t" \ - : "+r"(h), "+r" (src), "+r" (dst) \ - : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \ - : "memory" \ - ); \ -} - -/** 1/4 shift bicubic interpolation */ -MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_) -MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_) -MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )") -MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_) -MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_) - -/** 3/4 shift bicubic interpolation */ -MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_) -MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_) -MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )") -MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_) -MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_) - -typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift); -typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd); -typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset); - -/** - * Interpolate fractional pel values by applying proper vertical then - * horizontal filter. - * - * @param dst Destination buffer for interpolated pels. - * @param src Source buffer. - * @param stride Stride for both src and dst buffers. 
- * @param hmode Horizontal filter (expressed in quarter pixels shift). - * @param hmode Vertical filter. - * @param rnd Rounding bias. - */ -#define VC1_MSPEL_MC(OP)\ -static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\ - int hmode, int vmode, int rnd)\ -{\ - static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\ - { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\ - static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\ - { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\ - static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\ - { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\ -\ - __asm__ volatile(\ - "pxor %%mm0, %%mm0 \n\t"\ - ::: "memory"\ - );\ -\ - if (vmode) { /* Vertical filter to apply */\ - if (hmode) { /* Horizontal filter to apply, output to tmp */\ - static const int shift_value[] = { 0, 5, 1, 5 };\ - int shift = (shift_value[hmode]+shift_value[vmode])>>1;\ - int r;\ - DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\ -\ - r = (1<<(shift-1)) + rnd-1;\ - vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\ -\ - vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\ - return;\ - }\ - else { /* No horizontal filter, output 8 lines to dst */\ - vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\ - return;\ - }\ - }\ -\ - /* Horizontal mode with no vertical mode */\ - vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\ -} - -VC1_MSPEL_MC(put_) -VC1_MSPEL_MC(avg_) - -/** Macro to ease bicubic filter interpolation functions declarations */ -#define DECLARE_FUNCTION(a, b) \ -static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, \ - const uint8_t *src, \ - ptrdiff_t stride, \ - int rnd) \ -{ \ - put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ -}\ -static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \ - const uint8_t *src, \ - ptrdiff_t stride, \ - int rnd) \ -{ \ - avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ -} - -DECLARE_FUNCTION(0, 1) -DECLARE_FUNCTION(0, 2) -DECLARE_FUNCTION(0, 3) - -DECLARE_FUNCTION(1, 0) -DECLARE_FUNCTION(1, 1) -DECLARE_FUNCTION(1, 2) -DECLARE_FUNCTION(1, 3) - -DECLARE_FUNCTION(2, 0) -DECLARE_FUNCTION(2, 1) -DECLARE_FUNCTION(2, 2) -DECLARE_FUNCTION(2, 3) - -DECLARE_FUNCTION(3, 0) -DECLARE_FUNCTION(3, 1) -DECLARE_FUNCTION(3, 2) -DECLARE_FUNCTION(3, 3) - -static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize, - int16_t *block) -{ - int dc = block[0]; - dc = (17 * dc + 4) >> 3; - dc = (17 * dc + 64) >> 7; - __asm__ volatile( - "movd %0, %%mm0 \n\t" - "pshufw $0, %%mm0, %%mm0 \n\t" - "pxor %%mm1, %%mm1 \n\t" - "psubw %%mm0, %%mm1 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - ::"r"(dc) - ); - __asm__ volatile( - "movd %0, %%mm2 \n\t" - "movd %1, %%mm3 \n\t" - "movd %2, %%mm4 \n\t" - "movd %3, %%mm5 \n\t" - "paddusb %%mm0, %%mm2 \n\t" - "paddusb %%mm0, %%mm3 \n\t" - "paddusb %%mm0, %%mm4 \n\t" - "paddusb %%mm0, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "movd %%mm2, %0 \n\t" - "movd %%mm3, %1 \n\t" - "movd %%mm4, %2 \n\t" - "movd %%mm5, %3 \n\t" - :"+m"(*(uint32_t*)(dest+0*linesize)), - "+m"(*(uint32_t*)(dest+1*linesize)), - "+m"(*(uint32_t*)(dest+2*linesize)), - "+m"(*(uint32_t*)(dest+3*linesize)) - ); -} - -static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize, - int16_t *block) -{ - 
int dc = block[0]; - dc = (17 * dc + 4) >> 3; - dc = (12 * dc + 64) >> 7; - __asm__ volatile( - "movd %0, %%mm0 \n\t" - "pshufw $0, %%mm0, %%mm0 \n\t" - "pxor %%mm1, %%mm1 \n\t" - "psubw %%mm0, %%mm1 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - ::"r"(dc) - ); - __asm__ volatile( - "movd %0, %%mm2 \n\t" - "movd %1, %%mm3 \n\t" - "movd %2, %%mm4 \n\t" - "movd %3, %%mm5 \n\t" - "paddusb %%mm0, %%mm2 \n\t" - "paddusb %%mm0, %%mm3 \n\t" - "paddusb %%mm0, %%mm4 \n\t" - "paddusb %%mm0, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "movd %%mm2, %0 \n\t" - "movd %%mm3, %1 \n\t" - "movd %%mm4, %2 \n\t" - "movd %%mm5, %3 \n\t" - :"+m"(*(uint32_t*)(dest+0*linesize)), - "+m"(*(uint32_t*)(dest+1*linesize)), - "+m"(*(uint32_t*)(dest+2*linesize)), - "+m"(*(uint32_t*)(dest+3*linesize)) - ); - dest += 4*linesize; - __asm__ volatile( - "movd %0, %%mm2 \n\t" - "movd %1, %%mm3 \n\t" - "movd %2, %%mm4 \n\t" - "movd %3, %%mm5 \n\t" - "paddusb %%mm0, %%mm2 \n\t" - "paddusb %%mm0, %%mm3 \n\t" - "paddusb %%mm0, %%mm4 \n\t" - "paddusb %%mm0, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "movd %%mm2, %0 \n\t" - "movd %%mm3, %1 \n\t" - "movd %%mm4, %2 \n\t" - "movd %%mm5, %3 \n\t" - :"+m"(*(uint32_t*)(dest+0*linesize)), - "+m"(*(uint32_t*)(dest+1*linesize)), - "+m"(*(uint32_t*)(dest+2*linesize)), - "+m"(*(uint32_t*)(dest+3*linesize)) - ); -} - -static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize, - int16_t *block) -{ - int dc = block[0]; - dc = ( 3 * dc + 1) >> 1; - dc = (17 * dc + 64) >> 7; - __asm__ volatile( - "movd %0, %%mm0 \n\t" - "pshufw $0, %%mm0, %%mm0 \n\t" - "pxor %%mm1, %%mm1 \n\t" - "psubw %%mm0, %%mm1 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - ::"r"(dc) - ); - __asm__ volatile( - "movq %0, %%mm2 \n\t" - "movq %1, %%mm3 \n\t" - "movq %2, %%mm4 \n\t" - "movq %3, %%mm5 \n\t" - "paddusb %%mm0, %%mm2 \n\t" - "paddusb %%mm0, %%mm3 \n\t" - "paddusb %%mm0, %%mm4 \n\t" - "paddusb %%mm0, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "movq %%mm2, %0 \n\t" - "movq %%mm3, %1 \n\t" - "movq %%mm4, %2 \n\t" - "movq %%mm5, %3 \n\t" - :"+m"(*(uint32_t*)(dest+0*linesize)), - "+m"(*(uint32_t*)(dest+1*linesize)), - "+m"(*(uint32_t*)(dest+2*linesize)), - "+m"(*(uint32_t*)(dest+3*linesize)) - ); -} - -static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize, - int16_t *block) -{ - int dc = block[0]; - dc = (3 * dc + 1) >> 1; - dc = (3 * dc + 16) >> 5; - __asm__ volatile( - "movd %0, %%mm0 \n\t" - "pshufw $0, %%mm0, %%mm0 \n\t" - "pxor %%mm1, %%mm1 \n\t" - "psubw %%mm0, %%mm1 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - ::"r"(dc) - ); - __asm__ volatile( - "movq %0, %%mm2 \n\t" - "movq %1, %%mm3 \n\t" - "movq %2, %%mm4 \n\t" - "movq %3, %%mm5 \n\t" - "paddusb %%mm0, %%mm2 \n\t" - "paddusb %%mm0, %%mm3 \n\t" - "paddusb %%mm0, %%mm4 \n\t" - "paddusb %%mm0, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "movq %%mm2, %0 \n\t" - "movq %%mm3, %1 \n\t" - "movq %%mm4, %2 \n\t" - "movq %%mm5, %3 \n\t" - :"+m"(*(uint32_t*)(dest+0*linesize)), - "+m"(*(uint32_t*)(dest+1*linesize)), - "+m"(*(uint32_t*)(dest+2*linesize)), - "+m"(*(uint32_t*)(dest+3*linesize)) - ); - dest += 4*linesize; - __asm__ 
volatile( - "movq %0, %%mm2 \n\t" - "movq %1, %%mm3 \n\t" - "movq %2, %%mm4 \n\t" - "movq %3, %%mm5 \n\t" - "paddusb %%mm0, %%mm2 \n\t" - "paddusb %%mm0, %%mm3 \n\t" - "paddusb %%mm0, %%mm4 \n\t" - "paddusb %%mm0, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "movq %%mm2, %0 \n\t" - "movq %%mm3, %1 \n\t" - "movq %%mm4, %2 \n\t" - "movq %%mm5, %3 \n\t" - :"+m"(*(uint32_t*)(dest+0*linesize)), - "+m"(*(uint32_t*)(dest+1*linesize)), - "+m"(*(uint32_t*)(dest+2*linesize)), - "+m"(*(uint32_t*)(dest+3*linesize)) - ); -} - -av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp) -{ - dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx; - dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx; - dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx; - dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx; - - dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx; - dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx; - dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx; - dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx; - - dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx; - dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx; - dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx; - dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx; - - dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx; - dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx; - dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx; - dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx; -} - -av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp) -{ - dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext; - dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext; - - dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext; - dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext; - - dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext; - dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext; - dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext; - - dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext; - dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext; - dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext; - - dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext; - dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext; - dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext; - dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext; -} -#endif /* HAVE_INLINE_ASM */ diff --git a/ffmpeg1/libavcodec/x86/videodsp.asm b/ffmpeg1/libavcodec/x86/videodsp.asm deleted file mode 100644 index 0eb4721..0000000 --- a/ffmpeg1/libavcodec/x86/videodsp.asm +++ /dev/null @@ -1,612 +0,0 @@ -;****************************************************************************** -;* Core video DSP functions -;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> -;* -;* This file is part of FFmpeg. 
-;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION .text - -; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize, -; x86_reg start_y, x86_reg end_y, x86_reg block_h, -; x86_reg start_x, x86_reg end_x, x86_reg block_w); -; -; The actual function itself is below. It basically wraps a very simple -; w = end_x - start_x -; if (w) { -; if (w > 22) { -; jump to the slow loop functions -; } else { -; jump to the fast loop functions -; } -; } -; -; ... and then the same for left/right extend also. See below for loop -; function implementations. Fast are fixed-width, slow is variable-width - -%macro EMU_EDGE_FUNC 0 -%if ARCH_X86_64 -%define w_reg r7 -cglobal emu_edge_core, 6, 9, 1 - mov r8, r5 ; save block_h -%else -%define w_reg r6 -cglobal emu_edge_core, 2, 7, 0 - mov r4, r4m ; end_y - mov r5, r5m ; block_h -%endif - - ; start with vertical extend (top/bottom) and body pixel copy - mov w_reg, r7m - sub w_reg, r6m ; w = start_x - end_x - sub r5, r4 -%if ARCH_X86_64 - sub r4, r3 -%else - sub r4, dword r3m -%endif - cmp w_reg, 22 - jg .slow_v_extend_loop -%if ARCH_X86_32 - mov r2, r2m ; linesize -%endif - sal w_reg, 7 ; w * 128 -%ifdef PIC - lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)] - add w_reg, rax -%else - lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg] -%endif - call w_reg ; fast top extend, body copy and bottom extend -.v_extend_end: - - ; horizontal extend (left/right) - mov w_reg, r6m ; start_x - sub r0, w_reg -%if ARCH_X86_64 - mov r3, r0 ; backup of buf+block_h*linesize - mov r5, r8 -%else - mov r0m, r0 ; backup of buf+block_h*linesize - mov r5, r5m -%endif - test w_reg, w_reg - jz .right_extend - cmp w_reg, 22 - jg .slow_left_extend_loop - mov r1, w_reg - dec w_reg - ; FIXME we can do a if size == 1 here if that makes any speed difference, test me - sar w_reg, 1 - sal w_reg, 6 - ; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs - ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h -%ifdef PIC - lea rax, [.emuedge_extend_left_2] - add w_reg, rax -%else - lea w_reg, [.emuedge_extend_left_2+w_reg] -%endif - call w_reg - - ; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w -.right_extend: -%if ARCH_X86_32 - mov r0, r0m - mov r5, r5m -%endif - mov w_reg, r7m ; end_x - mov r1, r8m ; block_w - mov r4, r1 - sub r1, w_reg - jz .h_extend_end ; if (end_x == block_w) goto h_extend_end - cmp r1, 22 - jg .slow_right_extend_loop - dec r1 - ; FIXME we can do a if size == 1 here if that makes any speed difference, test me - sar r1, 1 - sal r1, 6 -%ifdef PIC - lea rax, [.emuedge_extend_right_2] - add r1, rax -%else - lea r1, [.emuedge_extend_right_2+r1] 
-%endif - call r1 -.h_extend_end: - RET - -%if ARCH_X86_64 -%define vall al -%define valh ah -%define valw ax -%define valw2 r7w -%define valw3 r3w -%if WIN64 -%define valw4 r7w -%else ; unix64 -%define valw4 r3w -%endif -%define vald eax -%else -%define vall bl -%define valh bh -%define valw bx -%define valw2 r6w -%define valw3 valw2 -%define valw4 valw3 -%define vald ebx -%define stack_offset 0x14 -%endif - -%endmacro - -; macro to read/write a horizontal number of pixels (%2) to/from registers -; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels -; - if (%2 & 15 == 8) fills the last 8 bytes into rax -; - else if (%2 & 8) fills 8 bytes into mm0 -; - if (%2 & 7 == 4) fills the last 4 bytes into rax -; - else if (%2 & 4) fills 4 bytes into mm0-1 -; - if (%2 & 3 == 3) fills 2 bytes into r7/r3, and 1 into eax -; (note that we're using r3 for body/bottom because it's a shorter -; opcode, and then the loop fits in 128 bytes) -; - else fills remaining bytes into rax -; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels -; - if (%2 & 7 == 4) fills 4 bytes into ebx -; - else if (%2 & 4) fills 4 bytes into mm0-7 -; - if (%2 & 3 == 3) fills 2 bytes into r6, and 1 into ebx -; - else fills remaining bytes into ebx -; writing data out is in the same way -%macro READ_NUM_BYTES 2 -%assign %%src_off 0 ; offset in source buffer -%assign %%smidx 0 ; mmx register idx -%assign %%sxidx 0 ; xmm register idx - -%if cpuflag(sse) -%rep %2/16 - movups xmm %+ %%sxidx, [r1+%%src_off] -%assign %%src_off %%src_off+16 -%assign %%sxidx %%sxidx+1 -%endrep ; %2/16 -%endif - -%if ARCH_X86_64 -%if (%2-%%src_off) == 8 - mov rax, [r1+%%src_off] -%assign %%src_off %%src_off+8 -%endif ; (%2-%%src_off) == 8 -%endif ; x86-64 - -%rep (%2-%%src_off)/8 - movq mm %+ %%smidx, [r1+%%src_off] -%assign %%src_off %%src_off+8 -%assign %%smidx %%smidx+1 -%endrep ; (%2-%%dst_off)/8 - -%if (%2-%%src_off) == 4 - mov vald, [r1+%%src_off] -%elif (%2-%%src_off) & 4 - movd mm %+ %%smidx, [r1+%%src_off] -%assign %%src_off %%src_off+4 -%endif ; (%2-%%src_off) ==/& 4 - -%if (%2-%%src_off) == 1 - mov vall, [r1+%%src_off] -%elif (%2-%%src_off) == 2 - mov valw, [r1+%%src_off] -%elif (%2-%%src_off) == 3 -%ifidn %1, top - mov valw2, [r1+%%src_off] -%elifidn %1, body - mov valw3, [r1+%%src_off] -%elifidn %1, bottom - mov valw4, [r1+%%src_off] -%endif ; %1 ==/!= top - mov vall, [r1+%%src_off+2] -%endif ; (%2-%%src_off) == 1/2/3 -%endmacro ; READ_NUM_BYTES - -%macro WRITE_NUM_BYTES 2 -%assign %%dst_off 0 ; offset in destination buffer -%assign %%dmidx 0 ; mmx register idx -%assign %%dxidx 0 ; xmm register idx - -%if cpuflag(sse) -%rep %2/16 - movups [r0+%%dst_off], xmm %+ %%dxidx -%assign %%dst_off %%dst_off+16 -%assign %%dxidx %%dxidx+1 -%endrep ; %2/16 -%endif - -%if ARCH_X86_64 -%if (%2-%%dst_off) == 8 - mov [r0+%%dst_off], rax -%assign %%dst_off %%dst_off+8 -%endif ; (%2-%%dst_off) == 8 -%endif ; x86-64 - -%rep (%2-%%dst_off)/8 - movq [r0+%%dst_off], mm %+ %%dmidx -%assign %%dst_off %%dst_off+8 -%assign %%dmidx %%dmidx+1 -%endrep ; (%2-%%dst_off)/8 - -%if (%2-%%dst_off) == 4 - mov [r0+%%dst_off], vald -%elif (%2-%%dst_off) & 4 - movd [r0+%%dst_off], mm %+ %%dmidx -%assign %%dst_off %%dst_off+4 -%endif ; (%2-%%dst_off) ==/& 4 - -%if (%2-%%dst_off) == 1 - mov [r0+%%dst_off], vall -%elif (%2-%%dst_off) == 2 - mov [r0+%%dst_off], valw -%elif (%2-%%dst_off) == 3 -%ifidn %1, top - mov [r0+%%dst_off], valw2 -%elifidn %1, body - mov [r0+%%dst_off], valw3 -%elifidn %1, bottom - mov [r0+%%dst_off], valw4 -%endif ; %1 ==/!= top - mov 
[r0+%%dst_off+2], vall -%endif ; (%2-%%dst_off) == 1/2/3 -%endmacro ; WRITE_NUM_BYTES - -; vertical top/bottom extend and body copy fast loops -; these are function pointers to set-width line copy functions, i.e. -; they read a fixed number of pixels into set registers, and write -; those out into the destination buffer -; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h -; r6(eax/64)/r3(ebx/32)=val_reg -%macro VERTICAL_EXTEND 0 -%assign %%n 1 -%rep 22 -ALIGN 128 -.emuedge_v_extend_ %+ %%n: - ; extend pixels above body -%if ARCH_X86_64 - test r3 , r3 ; if (!start_y) - jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body -%else ; ARCH_X86_32 - cmp dword r3m, 0 - je .emuedge_copy_body_ %+ %%n %+ _loop -%endif ; ARCH_X86_64/32 - READ_NUM_BYTES top, %%n ; read bytes -.emuedge_extend_top_ %+ %%n %+ _loop: ; do { - WRITE_NUM_BYTES top, %%n ; write bytes - add r0 , r2 ; dst += linesize -%if ARCH_X86_64 - dec r3d -%else ; ARCH_X86_32 - dec dword r3m -%endif ; ARCH_X86_64/32 - jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y) - - ; copy body pixels -.emuedge_copy_body_ %+ %%n %+ _loop: ; do { - READ_NUM_BYTES body, %%n ; read bytes - WRITE_NUM_BYTES body, %%n ; write bytes - add r0 , r2 ; dst += linesize - add r1 , r2 ; src += linesize - dec r4d - jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y) - - ; copy bottom pixels - test r5 , r5 ; if (!block_h) - jz .emuedge_v_extend_end_ %+ %%n ; goto end - sub r1 , r2 ; src -= linesize - READ_NUM_BYTES bottom, %%n ; read bytes -.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do { - WRITE_NUM_BYTES bottom, %%n ; write bytes - add r0 , r2 ; dst += linesize - dec r5d - jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h) - -.emuedge_v_extend_end_ %+ %%n: -%if ARCH_X86_64 - ret -%else ; ARCH_X86_32 - rep ret -%endif ; ARCH_X86_64/32 -%assign %%n %%n+1 -%endrep -%endmacro VERTICAL_EXTEND - -; left/right (horizontal) fast extend functions -; these are essentially identical to the vertical extend ones above, -; just left/right separated because number of pixels to extend is -; obviously not the same on both sides. 
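Taken together, the fixed-width fast paths above and the variable-width slow paths further below implement plain edge replication. A rough scalar equivalent of what ff_emu_edge_core computes — a sketch only, with simplified conventions (here buf points at the top-left of the destination block and src at the first valid source pixel, whereas the real entry point receives pointers already offset by its C caller; the helper name is illustrative, not part of FFmpeg):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Sketch of the edge replication done by ff_emu_edge_core (simplified
     * conventions: buf = top-left of the block, src = first valid pixel). */
    static void emu_edge_core_ref(uint8_t *buf, const uint8_t *src,
                                  ptrdiff_t linesize,
                                  int start_y, int end_y, int block_h,
                                  int start_x, int end_x, int block_w)
    {
        int w = end_x - start_x;

        /* vertical: replicate the top row, copy the body, replicate the bottom row */
        for (int y = 0; y < start_y; y++)
            memcpy(buf + y * linesize + start_x, src, w);
        for (int y = start_y; y < end_y; y++)
            memcpy(buf + y * linesize + start_x, src + (y - start_y) * linesize, w);
        for (int y = end_y; y < block_h; y++)
            memcpy(buf + y * linesize + start_x, src + (end_y - 1 - start_y) * linesize, w);

        /* horizontal: splat the outermost valid column into the left/right margins */
        for (int y = 0; y < block_h; y++) {
            uint8_t *row = buf + y * linesize;
            memset(row, row[start_x], start_x);
            memset(row + end_x, row[end_x - 1], block_w - end_x);
        }
    }

The assembly splits exactly this work into the fixed-width fast loops (one call target per width, reached through the computed jump above) and the generic slow loops taken once the region is wider than 22 bytes.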
-; for reading, pixels are placed in eax (x86-64) or ebx (x86-64) in the -; lowest two bytes of the register (so val*0x0101), and are splatted -; into each byte of mm0 as well if n_pixels >= 8 - -%macro READ_V_PIXEL 2 - mov vall, %2 - mov valh, vall -%if %1 >= 8 - movd mm0, vald -%if cpuflag(mmxext) - pshufw mm0, mm0, 0 -%else ; mmx - punpcklwd mm0, mm0 - punpckldq mm0, mm0 -%endif ; sse -%endif ; %1 >= 8 -%endmacro - -%macro WRITE_V_PIXEL 2 -%assign %%dst_off 0 -%rep %1/8 - movq [%2+%%dst_off], mm0 -%assign %%dst_off %%dst_off+8 -%endrep -%if %1 & 4 -%if %1 >= 8 - movd [%2+%%dst_off], mm0 -%else ; %1 < 8 - mov [%2+%%dst_off] , valw - mov [%2+%%dst_off+2], valw -%endif ; %1 >=/< 8 -%assign %%dst_off %%dst_off+4 -%endif ; %1 & 4 -%if %1&2 - mov [%2+%%dst_off], valw -%endif ; %1 & 2 -%endmacro - -; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val -%macro LEFT_EXTEND 0 -%assign %%n 2 -%rep 11 -ALIGN 64 -.emuedge_extend_left_ %+ %%n: ; do { - sub r0, r2 ; dst -= linesize - READ_V_PIXEL %%n, [r0+r1] ; read pixels - WRITE_V_PIXEL %%n, r0 ; write pixels - dec r5 - jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h) -%if ARCH_X86_64 - ret -%else ; ARCH_X86_32 - rep ret -%endif ; ARCH_X86_64/32 -%assign %%n %%n+2 -%endrep -%endmacro ; LEFT_EXTEND - -; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val -%macro RIGHT_EXTEND 0 -%assign %%n 2 -%rep 11 -ALIGN 64 -.emuedge_extend_right_ %+ %%n: ; do { -%if ARCH_X86_64 - sub r3, r2 ; dst -= linesize - READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels - WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels - dec r8 -%else ; ARCH_X86_32 - sub r0, r2 ; dst -= linesize - READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels - WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels - dec r5 -%endif ; ARCH_X86_64/32 - jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h) -%if ARCH_X86_64 - ret -%else ; ARCH_X86_32 - rep ret -%endif ; ARCH_X86_64/32 -%assign %%n %%n+2 -%endrep - -%if ARCH_X86_32 -%define stack_offset 0x10 -%endif -%endmacro ; RIGHT_EXTEND - -; below follow the "slow" copy/extend functions, these act on a non-fixed -; width specified in a register, and run a loop to copy the full amount -; of bytes. They are optimized for copying of large amounts of pixels per -; line, so they unconditionally splat data into mm registers to copy 8 -; bytes per loop iteration. It could be considered to use xmm for x86-64 -; also, but I haven't optimized this as much (i.e. 
FIXME) -%macro V_COPY_NPX 4-5 -%if %0 == 4 - test w_reg, %4 - jz .%1_skip_%4_px -%else ; %0 == 5 -.%1_%4_px_loop: -%endif - %3 %2, [r1+cnt_reg] - %3 [r0+cnt_reg], %2 - add cnt_reg, %4 -%if %0 == 5 - sub w_reg, %4 - test w_reg, %5 - jnz .%1_%4_px_loop -%endif -.%1_skip_%4_px: -%endmacro - -%macro V_COPY_ROW 2 -%ifidn %1, bottom - sub r1, linesize -%endif -.%1_copy_loop: - xor cnt_reg, cnt_reg -%if notcpuflag(sse) -%define linesize r2m - V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8 -%else ; sse - V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0 -%if ARCH_X86_64 -%define linesize r2 - V_COPY_NPX %1, rax , mov, 8 -%else ; ARCH_X86_32 -%define linesize r2m - V_COPY_NPX %1, mm0, movq, 8 -%endif ; ARCH_X86_64/32 -%endif ; sse - V_COPY_NPX %1, vald, mov, 4 - V_COPY_NPX %1, valw, mov, 2 - V_COPY_NPX %1, vall, mov, 1 - mov w_reg, cnt_reg -%ifidn %1, body - add r1, linesize -%endif - add r0, linesize - dec %2 - jnz .%1_copy_loop -%endmacro - -%macro SLOW_V_EXTEND 0 -.slow_v_extend_loop: -; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h -; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x -%if ARCH_X86_64 - push r8 ; save old value of block_h - test r3, r3 -%define cnt_reg r8 - jz .do_body_copy ; if (!start_y) goto do_body_copy - V_COPY_ROW top, r3 -%else - cmp dword r3m, 0 -%define cnt_reg r2 - je .do_body_copy ; if (!start_y) goto do_body_copy - V_COPY_ROW top, dword r3m -%endif - -.do_body_copy: - V_COPY_ROW body, r4 - -%if ARCH_X86_64 - pop r8 ; restore old value of block_h -%define cnt_reg r3 -%endif - test r5, r5 -%if ARCH_X86_64 - jz .v_extend_end -%else - jz .skip_bottom_extend -%endif - V_COPY_ROW bottom, r5 -%if ARCH_X86_32 -.skip_bottom_extend: - mov r2, r2m -%endif - jmp .v_extend_end -%endmacro - -%macro SLOW_LEFT_EXTEND 0 -.slow_left_extend_loop: -; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x - mov r4, 8 - sub r0, linesize - READ_V_PIXEL 8, [r0+w_reg] -.left_extend_8px_loop: - movq [r0+r4-8], mm0 - add r4, 8 - cmp r4, w_reg - jle .left_extend_8px_loop - sub r4, 8 - cmp r4, w_reg - jge .left_extend_loop_end -.left_extend_2px_loop: - mov [r0+r4], valw - add r4, 2 - cmp r4, w_reg - jl .left_extend_2px_loop -.left_extend_loop_end: - dec r5 - jnz .slow_left_extend_loop -%if ARCH_X86_32 - mov r2, r2m -%endif - jmp .right_extend -%endmacro - -%macro SLOW_RIGHT_EXTEND 0 -.slow_right_extend_loop: -; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h, -; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr -%if ARCH_X86_64 -%define buf_reg r3 -%define bh_reg r8 -%else -%define buf_reg r0 -%define bh_reg r5 -%endif - lea r1, [r4-8] - sub buf_reg, linesize - READ_V_PIXEL 8, [buf_reg+w_reg-1] -.right_extend_8px_loop: - movq [buf_reg+r1], mm0 - sub r1, 8 - cmp r1, w_reg - jge .right_extend_8px_loop - add r1, 8 - cmp r1, w_reg - je .right_extend_loop_end -.right_extend_2px_loop: - sub r1, 2 - mov [buf_reg+r1], valw - cmp r1, w_reg - jg .right_extend_2px_loop -.right_extend_loop_end: - dec bh_reg - jnz .slow_right_extend_loop - jmp .h_extend_end -%endmacro - -%macro emu_edge 1 -INIT_XMM %1 -EMU_EDGE_FUNC -VERTICAL_EXTEND -LEFT_EXTEND -RIGHT_EXTEND -SLOW_V_EXTEND -SLOW_LEFT_EXTEND -SLOW_RIGHT_EXTEND -%endmacro - -emu_edge sse -%if ARCH_X86_32 -emu_edge mmx -%endif - -%macro PREFETCH_FN 1 -cglobal prefetch, 3, 3, 0, buf, stride, h -.loop: - %1 [bufq] - add bufq, strideq - dec hd - jg .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PREFETCH_FN prefetcht0 -%if ARCH_X86_32 -INIT_MMX 3dnow 
-PREFETCH_FN prefetch -%endif diff --git a/ffmpeg1/libavcodec/x86/videodsp_init.c b/ffmpeg1/libavcodec/x86/videodsp_init.c deleted file mode 100644 index 902450e..0000000 --- a/ffmpeg1/libavcodec/x86/videodsp_init.c +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (C) 2002-2012 Michael Niedermayer - * Copyright (C) 2012 Ronald S. Bultje - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/attributes.h" -#include "libavutil/avassert.h" -#include "libavutil/common.h" -#include "libavutil/cpu.h" -#include "libavutil/mem.h" -#include "libavutil/x86/asm.h" -#include "libavcodec/videodsp.h" - -#if HAVE_YASM -typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src, - x86_reg linesize, x86_reg start_y, - x86_reg end_y, x86_reg block_h, - x86_reg start_x, x86_reg end_x, - x86_reg block_w); -extern emu_edge_core_func ff_emu_edge_core_mmx; -extern emu_edge_core_func ff_emu_edge_core_sse; - -static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src, - ptrdiff_t linesize_arg, - int block_w, int block_h, - int src_x, int src_y, - int w, int h, - emu_edge_core_func *core_fn) -{ - int start_y, start_x, end_y, end_x, src_y_add = 0; - int linesize = linesize_arg; - - if(!w || !h) - return; - - if (src_y >= h) { - src -= src_y*linesize; - src_y_add = h - 1; - src_y = h - 1; - } else if (src_y <= -block_h) { - src -= src_y*linesize; - src_y_add = 1 - block_h; - src_y = 1 - block_h; - } - if (src_x >= w) { - src += w - 1 - src_x; - src_x = w - 1; - } else if (src_x <= -block_w) { - src += 1 - block_w - src_x; - src_x = 1 - block_w; - } - - start_y = FFMAX(0, -src_y); - start_x = FFMAX(0, -src_x); - end_y = FFMIN(block_h, h-src_y); - end_x = FFMIN(block_w, w-src_x); - av_assert2(start_x < end_x && block_w > 0); - av_assert2(start_y < end_y && block_h > 0); - - // fill in the to-be-copied part plus all above/below - src += (src_y_add + start_y) * linesize + start_x; - buf += start_x; - core_fn(buf, src, linesize, start_y, end_y, - block_h, start_x, end_x, block_w); -} - -#if ARCH_X86_32 -static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, - ptrdiff_t linesize, - int block_w, int block_h, - int src_x, int src_y, int w, int h) -{ - emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y, - w, h, &ff_emu_edge_core_mmx); -} -#endif - -static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, - ptrdiff_t linesize, - int block_w, int block_h, - int src_x, int src_y, int w, int h) -{ - emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y, - w, h, &ff_emu_edge_core_sse); -} -#endif /* HAVE_YASM */ - -void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h); -void ff_prefetch_3dnow(uint8_t *buf, ptrdiff_t stride, int h); - -av_cold void 
ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc) -{ -#if HAVE_YASM - int mm_flags = av_get_cpu_flags(); - -#if ARCH_X86_32 - if (bpc <= 8 && mm_flags & AV_CPU_FLAG_MMX) { - ctx->emulated_edge_mc = emulated_edge_mc_mmx; - } - if (mm_flags & AV_CPU_FLAG_3DNOW) { - ctx->prefetch = ff_prefetch_3dnow; - } -#endif /* ARCH_X86_32 */ - if (mm_flags & AV_CPU_FLAG_MMXEXT) { - ctx->prefetch = ff_prefetch_mmxext; - } - if (bpc <= 8 && mm_flags & AV_CPU_FLAG_SSE) { - ctx->emulated_edge_mc = emulated_edge_mc_sse; - } -#endif /* HAVE_YASM */ -} diff --git a/ffmpeg1/libavcodec/x86/vorbisdsp.asm b/ffmpeg1/libavcodec/x86/vorbisdsp.asm deleted file mode 100644 index b25d838..0000000 --- a/ffmpeg1/libavcodec/x86/vorbisdsp.asm +++ /dev/null @@ -1,83 +0,0 @@ -;****************************************************************************** -;* Vorbis x86 optimizations -;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu> -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -pdw_80000000: times 4 dd 0x80000000 - -SECTION .text - -%if ARCH_X86_32 -INIT_MMX 3dnow -cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size - pxor m7, m7 - lea magq, [magq+block_sizeq*4] - lea angq, [angq+block_sizeq*4] - neg block_sizeq -.loop: - mova m0, [magq+block_sizeq*4] - mova m1, [angq+block_sizeq*4] - mova m2, m0 - mova m3, m1 - pfcmpge m2, m7 ; m <= 0.0 - pfcmpge m3, m7 ; a <= 0.0 - pslld m2, 31 ; keep only the sign bit - pxor m1, m2 - mova m4, m3 - pand m3, m1 - pandn m4, m1 - pfadd m3, m0 ; a = m + ((a < 0) & (a ^ sign(m))) - pfsub m0, m4 ; m = m + ((a > 0) & (a ^ sign(m))) - mova [angq+block_sizeq*4], m3 - mova [magq+block_sizeq*4], m0 - add block_sizeq, 2 - jl .loop - femms - RET -%endif - -INIT_XMM sse -cglobal vorbis_inverse_coupling, 3, 4, 6, mag, ang, block_size, cntr - mova m5, [pdw_80000000] - xor cntrq, cntrq -align 16 -.loop: - mova m0, [magq+cntrq*4] - mova m1, [angq+cntrq*4] - xorps m2, m2 - xorps m3, m3 - cmpleps m2, m0 ; m <= 0.0 - cmpleps m3, m1 ; a <= 0.0 - andps m2, m5 ; keep only the sign bit - xorps m1, m2 - mova m4, m3 - andps m3, m1 - andnps m4, m1 - addps m3, m0 ; a = m + ((a < 0) & (a ^ sign(m))) - subps m0, m4 ; m = m + ((a > 0) & (a ^ sign(m))) - mova [angq+cntrq*4], m3 - mova [magq+cntrq*4], m0 - add cntrq, 4 - cmp cntrq, block_sizeq - jl .loop - RET diff --git a/ffmpeg1/libavcodec/x86/vorbisdsp_init.c b/ffmpeg1/libavcodec/x86/vorbisdsp_init.c deleted file mode 100644 index 08a2c09..0000000 --- a/ffmpeg1/libavcodec/x86/vorbisdsp_init.c +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu> - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavcodec/vorbisdsp.h" - -void ff_vorbis_inverse_coupling_3dnow(float *mag, float *ang, - intptr_t blocksize); -void ff_vorbis_inverse_coupling_sse(float *mag, float *ang, - intptr_t blocksize); - -av_cold void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp) -{ -#if HAVE_YASM - int mm_flags = av_get_cpu_flags(); - -#if ARCH_X86_32 - if (mm_flags & AV_CPU_FLAG_3DNOW) - dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_3dnow; -#endif /* ARCH_X86_32 */ - if (mm_flags & AV_CPU_FLAG_SSE) - dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_sse; -#endif /* HAVE_YASM */ -} diff --git a/ffmpeg1/libavcodec/x86/vp3dsp.asm b/ffmpeg1/libavcodec/x86/vp3dsp.asm deleted file mode 100644 index a47b8f2..0000000 --- a/ffmpeg1/libavcodec/x86/vp3dsp.asm +++ /dev/null @@ -1,709 +0,0 @@ -;****************************************************************************** -;* MMX/SSE2-optimized functions for the VP3 decoder -;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -; MMX-optimized functions cribbed from the original VP3 source code. 
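The vp3_idct_data table that follows holds the seven Theora/VP3 IDCT cosine constants, round(65536 * cos(k*pi/16)) for k = 1..7. Because pmulhw performs a signed 16x16 multiply and keeps the high half, the entries for k <= 5 (which are >= 0x8000) effectively multiply by cos(k*pi/16) - 1 rather than cos(k*pi/16); that is why the IDCT macros below follow those multiplies with a paddw of the original coefficient, and why comments such as "r4 = c3*i3 - i3" appear. A minimal sketch of the per-lane arithmetic (helper names are illustrative, not part of FFmpeg):

    #include <stdint.h>

    /* High half of a signed 16x16 multiply -- what one lane of pmulhw computes. */
    static int16_t pmulhw_lane(int16_t a, int16_t c)
    {
        return (int16_t)(((int32_t)a * c) >> 16);
    }

    /* C(k) = round(65536 * cos(k*pi/16)).  For k <= 5 this is >= 0x8000, so as a
     * signed 16-bit value it equals C(k) - 65536 and the product comes out as
     * roughly a*cos(k*pi/16) - a; adding a back restores a*cos(k*pi/16). */
    static int16_t mul_cos_wrapped(int16_t a, int16_t c_k)
    {
        int16_t t = pmulhw_lane(a, c_k);   /* a*cos(k*pi/16) - a (truncated) */
        return (int16_t)(t + a);           /* the paddw in BeginIDCT / VP3_1D_IDCT_SSE2 */
    }

For k = 6 and 7 the constants fit below 0x8000, so those products need no correcting paddw, which matches the asymmetry visible in the macros.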
- -SECTION_RODATA - -vp3_idct_data: times 8 dw 64277 - times 8 dw 60547 - times 8 dw 54491 - times 8 dw 46341 - times 8 dw 36410 - times 8 dw 25080 - times 8 dw 12785 - -pb_7: times 8 db 7 -pb_1F: times 8 db 0x1f -pb_81: times 8 db 0x81 - -cextern pb_1 -cextern pb_3 -cextern pb_80 - -cextern pw_8 - -SECTION .text - -; this is off by one or two for some cases when filter_limit is greater than 63 -; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1 -; out: p1 in mm4, p2 in mm3 -%macro VP3_LOOP_FILTER 0 - movq m7, m6 - pand m6, [pb_7] ; p0&7 - psrlw m7, 3 - pand m7, [pb_1F] ; p0>>3 - movq m3, m2 ; p2 - pxor m2, m4 - pand m2, [pb_1] ; (p2^p1)&1 - movq m5, m2 - paddb m2, m2 - paddb m2, m5 ; 3*(p2^p1)&1 - paddb m2, m6 ; extra bits lost in shifts - pcmpeqb m0, m0 - pxor m1, m0 ; 255 - p3 - pavgb m1, m2 ; (256 - p3 + extrabits) >> 1 - pxor m0, m4 ; 255 - p1 - pavgb m0, m3 ; (256 + p2-p1) >> 1 - paddb m1, [pb_3] - pavgb m1, m0 ; 128+2+( p2-p1 - p3) >> 2 - pavgb m1, m0 ; 128+1+(3*(p2-p1) - p3) >> 3 - paddusb m7, m1 ; d+128+1 - movq m6, [pb_81] - psubusb m6, m7 - psubusb m7, [pb_81] - - movq m5, [r2+516] ; flim - pminub m6, m5 - pminub m7, m5 - movq m0, m6 - movq m1, m7 - paddb m6, m6 - paddb m7, m7 - pminub m6, m5 - pminub m7, m5 - psubb m6, m0 - psubb m7, m1 - paddusb m4, m7 - psubusb m4, m6 - psubusb m3, m7 - paddusb m3, m6 -%endmacro - -%macro STORE_4_WORDS 1 - movd r2d, %1 - mov [r0 -1], r2w - psrlq %1, 32 - shr r2, 16 - mov [r0+r1 -1], r2w - movd r2d, %1 - mov [r0+r1*2-1], r2w - shr r2, 16 - mov [r0+r3 -1], r2w -%endmacro - -INIT_MMX mmxext -cglobal vp3_v_loop_filter, 3, 4 -%if ARCH_X86_64 - movsxd r1, r1d -%endif - mov r3, r1 - neg r1 - movq m6, [r0+r1*2] - movq m4, [r0+r1 ] - movq m2, [r0 ] - movq m1, [r0+r3 ] - - VP3_LOOP_FILTER - - movq [r0+r1], m4 - movq [r0 ], m3 - RET - -cglobal vp3_h_loop_filter, 3, 4 -%if ARCH_X86_64 - movsxd r1, r1d -%endif - lea r3, [r1*3] - - movd m6, [r0 -2] - movd m4, [r0+r1 -2] - movd m2, [r0+r1*2-2] - movd m1, [r0+r3 -2] - lea r0, [r0+r1*4 ] - punpcklbw m6, [r0 -2] - punpcklbw m4, [r0+r1 -2] - punpcklbw m2, [r0+r1*2-2] - punpcklbw m1, [r0+r3 -2] - sub r0, r3 - sub r0, r1 - - TRANSPOSE4x4B 6, 4, 2, 1, 0 - VP3_LOOP_FILTER - SBUTTERFLY bw, 4, 3, 5 - - STORE_4_WORDS m4 - lea r0, [r0+r1*4 ] - STORE_4_WORDS m3 - RET - -; from original comments: The Macro does IDct on 4 1-D Dcts -%macro BeginIDCT 0 - movq m2, I(3) - movq m6, C(3) - movq m4, m2 - movq m7, J(5) - pmulhw m4, m6 ; r4 = c3*i3 - i3 - movq m1, C(5) - pmulhw m6, m7 ; r6 = c3*i5 - i5 - movq m5, m1 - pmulhw m1, m2 ; r1 = c5*i3 - i3 - movq m3, I(1) - pmulhw m5, m7 ; r5 = c5*i5 - i5 - movq m0, C(1) - paddw m4, m2 ; r4 = c3*i3 - paddw m6, m7 ; r6 = c3*i5 - paddw m2, m1 ; r2 = c5*i3 - movq m1, J(7) - paddw m7, m5 ; r7 = c5*i5 - movq m5, m0 ; r5 = c1 - pmulhw m0, m3 ; r0 = c1*i1 - i1 - paddsw m4, m7 ; r4 = C = c3*i3 + c5*i5 - pmulhw m5, m1 ; r5 = c1*i7 - i7 - movq m7, C(7) - psubsw m6, m2 ; r6 = D = c3*i5 - c5*i3 - paddw m0, m3 ; r0 = c1*i1 - pmulhw m3, m7 ; r3 = c7*i1 - movq m2, I(2) - pmulhw m7, m1 ; r7 = c7*i7 - paddw m5, m1 ; r5 = c1*i7 - movq m1, m2 ; r1 = i2 - pmulhw m2, C(2) ; r2 = c2*i2 - i2 - psubsw m3, m5 ; r3 = B = c7*i1 - c1*i7 - movq m5, J(6) - paddsw m0, m7 ; r0 = A = c1*i1 + c7*i7 - movq m7, m5 ; r7 = i6 - psubsw m0, m4 ; r0 = A - C - pmulhw m5, C(2) ; r5 = c2*i6 - i6 - paddw m2, m1 ; r2 = c2*i2 - pmulhw m1, C(6) ; r1 = c6*i2 - paddsw m4, m4 ; r4 = C + C - paddsw m4, m0 ; r4 = C. 
= A + C - psubsw m3, m6 ; r3 = B - D - paddw m5, m7 ; r5 = c2*i6 - paddsw m6, m6 ; r6 = D + D - pmulhw m7, C(6) ; r7 = c6*i6 - paddsw m6, m3 ; r6 = D. = B + D - movq I(1), m4 ; save C. at I(1) - psubsw m1, m5 ; r1 = H = c6*i2 - c2*i6 - movq m4, C(4) - movq m5, m3 ; r5 = B - D - pmulhw m3, m4 ; r3 = (c4 - 1) * (B - D) - paddsw m7, m2 ; r3 = (c4 - 1) * (B - D) - movq I(2), m6 ; save D. at I(2) - movq m2, m0 ; r2 = A - C - movq m6, I(0) - pmulhw m0, m4 ; r0 = (c4 - 1) * (A - C) - paddw m5, m3 ; r5 = B. = c4 * (B - D) - movq m3, J(4) - psubsw m5, m1 ; r5 = B.. = B. - H - paddw m2, m0 ; r0 = A. = c4 * (A - C) - psubsw m6, m3 ; r6 = i0 - i4 - movq m0, m6 - pmulhw m6, m4 ; r6 = (c4 - 1) * (i0 - i4) - paddsw m3, m3 ; r3 = i4 + i4 - paddsw m1, m1 ; r1 = H + H - paddsw m3, m0 ; r3 = i0 + i4 - paddsw m1, m5 ; r1 = H. = B + H - pmulhw m4, m3 ; r4 = (c4 - 1) * (i0 + i4) - paddsw m6, m0 ; r6 = F = c4 * (i0 - i4) - psubsw m6, m2 ; r6 = F. = F - A. - paddsw m2, m2 ; r2 = A. + A. - movq m0, I(1) ; r0 = C. - paddsw m2, m6 ; r2 = A.. = F + A. - paddw m4, m3 ; r4 = E = c4 * (i0 + i4) - psubsw m2, m1 ; r2 = R2 = A.. - H. -%endmacro - -; RowIDCT gets ready to transpose -%macro RowIDCT 0 - BeginIDCT - movq m3, I(2) ; r3 = D. - psubsw m4, m7 ; r4 = E. = E - G - paddsw m1, m1 ; r1 = H. + H. - paddsw m7, m7 ; r7 = G + G - paddsw m1, m2 ; r1 = R1 = A.. + H. - paddsw m7, m4 ; r1 = R1 = A.. + H. - psubsw m4, m3 ; r4 = R4 = E. - D. - paddsw m3, m3 - psubsw m6, m5 ; r6 = R6 = F. - B.. - paddsw m5, m5 - paddsw m3, m4 ; r3 = R3 = E. + D. - paddsw m5, m6 ; r5 = R5 = F. + B.. - psubsw m7, m0 ; r7 = R7 = G. - C. - paddsw m0, m0 - movq I(1), m1 ; save R1 - paddsw m0, m7 ; r0 = R0 = G. + C. -%endmacro - -; Column IDCT normalizes and stores final results -%macro ColumnIDCT 0 - BeginIDCT - paddsw m2, OC_8 ; adjust R2 (and R1) for shift - paddsw m1, m1 ; r1 = H. + H. - paddsw m1, m2 ; r1 = R1 = A.. + H. - psraw m2, 4 ; r2 = NR2 - psubsw m4, m7 ; r4 = E. = E - G - psraw m1, 4 ; r1 = NR2 - movq m3, I(2) ; r3 = D. - paddsw m7, m7 ; r7 = G + G - movq I(2), m2 ; store NR2 at I2 - paddsw m7, m4 ; r7 = G. = E + G - movq I(1), m1 ; store NR1 at I1 - psubsw m4, m3 ; r4 = R4 = E. - D. - paddsw m4, OC_8 ; adjust R4 (and R3) for shift - paddsw m3, m3 ; r3 = D. + D. - paddsw m3, m4 ; r3 = R3 = E. + D. - psraw m4, 4 ; r4 = NR4 - psubsw m6, m5 ; r6 = R6 = F. - B.. - psraw m3, 4 ; r3 = NR3 - paddsw m6, OC_8 ; adjust R6 (and R5) for shift - paddsw m5, m5 ; r5 = B.. + B.. - paddsw m5, m6 ; r5 = R5 = F. + B.. - psraw m6, 4 ; r6 = NR6 - movq J(4), m4 ; store NR4 at J4 - psraw m5, 4 ; r5 = NR5 - movq I(3), m3 ; store NR3 at I3 - psubsw m7, m0 ; r7 = R7 = G. - C. - paddsw m7, OC_8 ; adjust R7 (and R0) for shift - paddsw m0, m0 ; r0 = C. + C. - paddsw m0, m7 ; r0 = R0 = G. + C. - psraw m7, 4 ; r7 = NR7 - movq J(6), m6 ; store NR6 at J6 - psraw m0, 4 ; r0 = NR0 - movq J(5), m5 ; store NR5 at J5 - movq J(7), m7 ; store NR7 at J7 - movq I(0), m0 ; store NR0 at I0 -%endmacro - -; Following macro does two 4x4 transposes in place. -; -; At entry (we assume): -; -; r0 = a3 a2 a1 a0 -; I(1) = b3 b2 b1 b0 -; r2 = c3 c2 c1 c0 -; r3 = d3 d2 d1 d0 -; -; r4 = e3 e2 e1 e0 -; r5 = f3 f2 f1 f0 -; r6 = g3 g2 g1 g0 -; r7 = h3 h2 h1 h0 -; -; At exit, we have: -; -; I(0) = d0 c0 b0 a0 -; I(1) = d1 c1 b1 a1 -; I(2) = d2 c2 b2 a2 -; I(3) = d3 c3 b3 a3 -; -; J(4) = h0 g0 f0 e0 -; J(5) = h1 g1 f1 e1 -; J(6) = h2 g2 f2 e2 -; J(7) = h3 g3 f3 e3 -; -; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. -; J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. 
-; -; Since r1 is free at entry, we calculate the Js first. -%macro Transpose 0 - movq m1, m4 ; r1 = e3 e2 e1 e0 - punpcklwd m4, m5 ; r4 = f1 e1 f0 e0 - movq I(0), m0 ; save a3 a2 a1 a0 - punpckhwd m1, m5 ; r1 = f3 e3 f2 e2 - movq m0, m6 ; r0 = g3 g2 g1 g0 - punpcklwd m6, m7 ; r6 = h1 g1 h0 g0 - movq m5, m4 ; r5 = f1 e1 f0 e0 - punpckldq m4, m6 ; r4 = h0 g0 f0 e0 = R4 - punpckhdq m5, m6 ; r5 = h1 g1 f1 e1 = R5 - movq m6, m1 ; r6 = f3 e3 f2 e2 - movq J(4), m4 - punpckhwd m0, m7 ; r0 = h3 g3 h2 g2 - movq J(5), m5 - punpckhdq m6, m0 ; r6 = h3 g3 f3 e3 = R7 - movq m4, I(0) ; r4 = a3 a2 a1 a0 - punpckldq m1, m0 ; r1 = h2 g2 f2 e2 = R6 - movq m5, I(1) ; r5 = b3 b2 b1 b0 - movq m0, m4 ; r0 = a3 a2 a1 a0 - movq J(7), m6 - punpcklwd m0, m5 ; r0 = b1 a1 b0 a0 - movq J(6), m1 - punpckhwd m4, m5 ; r4 = b3 a3 b2 a2 - movq m5, m2 ; r5 = c3 c2 c1 c0 - punpcklwd m2, m3 ; r2 = d1 c1 d0 c0 - movq m1, m0 ; r1 = b1 a1 b0 a0 - punpckldq m0, m2 ; r0 = d0 c0 b0 a0 = R0 - punpckhdq m1, m2 ; r1 = d1 c1 b1 a1 = R1 - movq m2, m4 ; r2 = b3 a3 b2 a2 - movq I(0), m0 - punpckhwd m5, m3 ; r5 = d3 c3 d2 c2 - movq I(1), m1 - punpckhdq m4, m5 ; r4 = d3 c3 b3 a3 = R3 - punpckldq m2, m5 ; r2 = d2 c2 b2 a2 = R2 - movq I(3), m4 - movq I(2), m2 -%endmacro - -%macro VP3_1D_IDCT_SSE2 0 - movdqa m2, I(3) ; xmm2 = i3 - movdqa m6, C(3) ; xmm6 = c3 - movdqa m4, m2 ; xmm4 = i3 - movdqa m7, I(5) ; xmm7 = i5 - pmulhw m4, m6 ; xmm4 = c3 * i3 - i3 - movdqa m1, C(5) ; xmm1 = c5 - pmulhw m6, m7 ; xmm6 = c3 * i5 - i5 - movdqa m5, m1 ; xmm5 = c5 - pmulhw m1, m2 ; xmm1 = c5 * i3 - i3 - movdqa m3, I(1) ; xmm3 = i1 - pmulhw m5, m7 ; xmm5 = c5 * i5 - i5 - movdqa m0, C(1) ; xmm0 = c1 - paddw m4, m2 ; xmm4 = c3 * i3 - paddw m6, m7 ; xmm6 = c3 * i5 - paddw m2, m1 ; xmm2 = c5 * i3 - movdqa m1, I(7) ; xmm1 = i7 - paddw m7, m5 ; xmm7 = c5 * i5 - movdqa m5, m0 ; xmm5 = c1 - pmulhw m0, m3 ; xmm0 = c1 * i1 - i1 - paddsw m4, m7 ; xmm4 = c3 * i3 + c5 * i5 = C - pmulhw m5, m1 ; xmm5 = c1 * i7 - i7 - movdqa m7, C(7) ; xmm7 = c7 - psubsw m6, m2 ; xmm6 = c3 * i5 - c5 * i3 = D - paddw m0, m3 ; xmm0 = c1 * i1 - pmulhw m3, m7 ; xmm3 = c7 * i1 - movdqa m2, I(2) ; xmm2 = i2 - pmulhw m7, m1 ; xmm7 = c7 * i7 - paddw m5, m1 ; xmm5 = c1 * i7 - movdqa m1, m2 ; xmm1 = i2 - pmulhw m2, C(2) ; xmm2 = i2 * c2 -i2 - psubsw m3, m5 ; xmm3 = c7 * i1 - c1 * i7 = B - movdqa m5, I(6) ; xmm5 = i6 - paddsw m0, m7 ; xmm0 = c1 * i1 + c7 * i7 = A - movdqa m7, m5 ; xmm7 = i6 - psubsw m0, m4 ; xmm0 = A - C - pmulhw m5, C(2) ; xmm5 = c2 * i6 - i6 - paddw m2, m1 ; xmm2 = i2 * c2 - pmulhw m1, C(6) ; xmm1 = c6 * i2 - paddsw m4, m4 ; xmm4 = C + C - paddsw m4, m0 ; xmm4 = A + C = C. - psubsw m3, m6 ; xmm3 = B - D - paddw m5, m7 ; xmm5 = c2 * i6 - paddsw m6, m6 ; xmm6 = D + D - pmulhw m7, C(6) ; xmm7 = c6 * i6 - paddsw m6, m3 ; xmm6 = B + D = D. - movdqa I(1), m4 ; Save C. at I(1) - psubsw m1, m5 ; xmm1 = c6 * i2 - c2 * i6 = H - movdqa m4, C(4) ; xmm4 = C4 - movdqa m5, m3 ; xmm5 = B - D - pmulhw m3, m4 ; xmm3 = ( c4 -1 ) * ( B - D ) - paddsw m7, m2 ; xmm7 = c2 * i2 + c6 * i6 = G - movdqa I(2), m6 ; save D. at I(2) - movdqa m2, m0 ; xmm2 = A - C - movdqa m6, I(0) ; xmm6 = i0 - pmulhw m0, m4 ; xmm0 = ( c4 - 1 ) * ( A - C ) = A. - paddw m5, m3 ; xmm5 = c4 * ( B - D ) = B. - movdqa m3, I(4) ; xmm3 = i4 - psubsw m5, m1 ; xmm5 = B. - H = B.. - paddw m2, m0 ; xmm2 = c4 * ( A - C) = A. 
- psubsw m6, m3 ; xmm6 = i0 - i4 - movdqa m0, m6 ; xmm0 = i0 - i4 - pmulhw m6, m4 ; xmm6 = (c4 - 1) * (i0 - i4) = F - paddsw m3, m3 ; xmm3 = i4 + i4 - paddsw m1, m1 ; xmm1 = H + H - paddsw m3, m0 ; xmm3 = i0 + i4 - paddsw m1, m5 ; xmm1 = B. + H = H. - pmulhw m4, m3 ; xmm4 = ( c4 - 1 ) * ( i0 + i4 ) - paddw m6, m0 ; xmm6 = c4 * ( i0 - i4 ) - psubsw m6, m2 ; xmm6 = F - A. = F. - paddsw m2, m2 ; xmm2 = A. + A. - movdqa m0, I(1) ; Load C. from I(1) - paddsw m2, m6 ; xmm2 = F + A. = A.. - paddw m4, m3 ; xmm4 = c4 * ( i0 + i4 ) = 3 - psubsw m2, m1 ; xmm2 = A.. - H. = R2 - ADD(m2) ; Adjust R2 and R1 before shifting - paddsw m1, m1 ; xmm1 = H. + H. - paddsw m1, m2 ; xmm1 = A.. + H. = R1 - SHIFT(m2) ; xmm2 = op2 - psubsw m4, m7 ; xmm4 = E - G = E. - SHIFT(m1) ; xmm1 = op1 - movdqa m3, I(2) ; Load D. from I(2) - paddsw m7, m7 ; xmm7 = G + G - paddsw m7, m4 ; xmm7 = E + G = G. - psubsw m4, m3 ; xmm4 = E. - D. = R4 - ADD(m4) ; Adjust R4 and R3 before shifting - paddsw m3, m3 ; xmm3 = D. + D. - paddsw m3, m4 ; xmm3 = E. + D. = R3 - SHIFT(m4) ; xmm4 = op4 - psubsw m6, m5 ; xmm6 = F. - B..= R6 - SHIFT(m3) ; xmm3 = op3 - ADD(m6) ; Adjust R6 and R5 before shifting - paddsw m5, m5 ; xmm5 = B.. + B.. - paddsw m5, m6 ; xmm5 = F. + B.. = R5 - SHIFT(m6) ; xmm6 = op6 - SHIFT(m5) ; xmm5 = op5 - psubsw m7, m0 ; xmm7 = G. - C. = R7 - ADD(m7) ; Adjust R7 and R0 before shifting - paddsw m0, m0 ; xmm0 = C. + C. - paddsw m0, m7 ; xmm0 = G. + C. - SHIFT(m7) ; xmm7 = op7 - SHIFT(m0) ; xmm0 = op0 -%endmacro - -%macro PUT_BLOCK 8 - movdqa O(0), m%1 - movdqa O(1), m%2 - movdqa O(2), m%3 - movdqa O(3), m%4 - movdqa O(4), m%5 - movdqa O(5), m%6 - movdqa O(6), m%7 - movdqa O(7), m%8 -%endmacro - -%macro VP3_IDCT 1 -%if mmsize == 16 -%define I(x) [%1+16*x] -%define O(x) [%1+16*x] -%define C(x) [vp3_idct_data+16*(x-1)] -%define SHIFT(x) -%define ADD(x) - VP3_1D_IDCT_SSE2 -%if ARCH_X86_64 - TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 -%else - TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16] -%endif - PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7 - -%define SHIFT(x) psraw x, 4 -%define ADD(x) paddsw x, [pw_8] - VP3_1D_IDCT_SSE2 - PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7 -%else ; mmsize == 8 - ; eax = quantized input - ; ebx = dequantizer matrix - ; ecx = IDCT constants - ; M(I) = ecx + MaskOffset(0) + I * 8 - ; C(I) = ecx + CosineOffset(32) + (I-1) * 8 - ; edx = output - ; r0..r7 = mm0..mm7 -%define OC_8 [pw_8] -%define C(x) [vp3_idct_data+16*(x-1)] - - ; at this point, function has completed dequantization + dezigzag + - ; partial transposition; now do the idct itself -%define I(x) [%1+16*x] -%define J(x) [%1+16*x] - RowIDCT - Transpose - -%define I(x) [%1+16*x+8] -%define J(x) [%1+16*x+8] - RowIDCT - Transpose - -%define I(x) [%1+16* x] -%define J(x) [%1+16*(x-4)+8] - ColumnIDCT - -%define I(x) [%1+16* x +64] -%define J(x) [%1+16*(x-4)+72] - ColumnIDCT -%endif ; mmsize == 16/8 -%endmacro - -%macro vp3_idct_funcs 0 -cglobal vp3_idct_put, 3, 4, 9 - VP3_IDCT r2 - - movsxdifnidn r1, r1d - mova m4, [pb_80] - lea r3, [r1*3] -%assign %%i 0 -%rep 16/mmsize - mova m0, [r2+mmsize*0+%%i] - mova m1, [r2+mmsize*2+%%i] - mova m2, [r2+mmsize*4+%%i] - mova m3, [r2+mmsize*6+%%i] -%if mmsize == 8 - packsswb m0, [r2+mmsize*8+%%i] - packsswb m1, [r2+mmsize*10+%%i] - packsswb m2, [r2+mmsize*12+%%i] - packsswb m3, [r2+mmsize*14+%%i] -%else - packsswb m0, [r2+mmsize*1+%%i] - packsswb m1, [r2+mmsize*3+%%i] - packsswb m2, [r2+mmsize*5+%%i] - packsswb m3, [r2+mmsize*7+%%i] -%endif - paddb m0, m4 - paddb m1, m4 - paddb m2, m4 - paddb m3, m4 - movq [r0 ], m0 -%if mmsize == 8 - 
movq [r0+r1 ], m1 - movq [r0+r1*2], m2 - movq [r0+r3 ], m3 -%else - movhps [r0+r1 ], m0 - movq [r0+r1*2], m1 - movhps [r0+r3 ], m1 -%endif -%if %%i == 0 - lea r0, [r0+r1*4] -%endif -%if mmsize == 16 - movq [r0 ], m2 - movhps [r0+r1 ], m2 - movq [r0+r1*2], m3 - movhps [r0+r3 ], m3 -%endif -%assign %%i %%i+8 -%endrep - - pxor m0, m0 -%assign %%offset 0 -%rep 128/mmsize - mova [r2+%%offset], m0 -%assign %%offset %%offset+mmsize -%endrep - RET - -cglobal vp3_idct_add, 3, 4, 9 - VP3_IDCT r2 - - movsxdifnidn r1, r1d - lea r3, [r1*3] - pxor m4, m4 -%if mmsize == 16 -%assign %%i 0 -%rep 2 - movq m0, [r0] - movq m1, [r0+r1] - movq m2, [r0+r1*2] - movq m3, [r0+r3] - punpcklbw m0, m4 - punpcklbw m1, m4 - punpcklbw m2, m4 - punpcklbw m3, m4 - paddsw m0, [r2+ 0+%%i] - paddsw m1, [r2+16+%%i] - paddsw m2, [r2+32+%%i] - paddsw m3, [r2+48+%%i] - packuswb m0, m1 - packuswb m2, m3 - movq [r0 ], m0 - movhps [r0+r1 ], m0 - movq [r0+r1*2], m2 - movhps [r0+r3 ], m2 -%if %%i == 0 - lea r0, [r0+r1*4] -%endif -%assign %%i %%i+64 -%endrep -%else -%assign %%i 0 -%rep 2 - movq m0, [r0] - movq m1, [r0+r1] - movq m2, [r0+r1*2] - movq m3, [r0+r3] - movq m5, m0 - movq m6, m1 - movq m7, m2 - punpcklbw m0, m4 - punpcklbw m1, m4 - punpcklbw m2, m4 - punpckhbw m5, m4 - punpckhbw m6, m4 - punpckhbw m7, m4 - paddsw m0, [r2+ 0+%%i] - paddsw m1, [r2+16+%%i] - paddsw m2, [r2+32+%%i] - paddsw m5, [r2+64+%%i] - paddsw m6, [r2+80+%%i] - paddsw m7, [r2+96+%%i] - packuswb m0, m5 - movq m5, m3 - punpcklbw m3, m4 - punpckhbw m5, m4 - packuswb m1, m6 - paddsw m3, [r2+48+%%i] - paddsw m5, [r2+112+%%i] - packuswb m2, m7 - packuswb m3, m5 - movq [r0 ], m0 - movq [r0+r1 ], m1 - movq [r0+r1*2], m2 - movq [r0+r3 ], m3 -%if %%i == 0 - lea r0, [r0+r1*4] -%endif -%assign %%i %%i+8 -%endrep -%endif -%assign %%i 0 -%rep 128/mmsize - mova [r2+%%i], m4 -%assign %%i %%i+mmsize -%endrep - RET -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmx -vp3_idct_funcs -%endif - -INIT_XMM sse2 -vp3_idct_funcs - -%macro DC_ADD 0 - movq m2, [r0 ] - movq m3, [r0+r1 ] - paddusb m2, m0 - movq m4, [r0+r1*2] - paddusb m3, m0 - movq m5, [r0+r2 ] - paddusb m4, m0 - paddusb m5, m0 - psubusb m2, m1 - psubusb m3, m1 - movq [r0 ], m2 - psubusb m4, m1 - movq [r0+r1 ], m3 - psubusb m5, m1 - movq [r0+r1*2], m4 - movq [r0+r2 ], m5 -%endmacro - -INIT_MMX mmxext -cglobal vp3_idct_dc_add, 3, 4 -%if ARCH_X86_64 - movsxd r1, r1d -%endif - movsx r3, word [r2] - mov word [r2], 0 - lea r2, [r1*3] - add r3, 15 - sar r3, 5 - movd m0, r3d - pshufw m0, m0, 0x0 - pxor m1, m1 - psubw m1, m0 - packuswb m0, m0 - packuswb m1, m1 - DC_ADD - lea r0, [r0+r1*4] - DC_ADD - RET diff --git a/ffmpeg1/libavcodec/x86/vp3dsp_init.c b/ffmpeg1/libavcodec/x86/vp3dsp_init.c deleted file mode 100644 index 252b40a..0000000 --- a/ffmpeg1/libavcodec/x86/vp3dsp_init.c +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2009 David Conrad <lessen42@gmail.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdint.h> - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/x86/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavcodec/avcodec.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/vp3dsp.h" -#include "config.h" - -void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block); -void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block); - -void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, int16_t *block); -void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, int16_t *block); - -void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, int line_size, - int16_t *block); - -void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride, - int *bounding_values); -void ff_vp3_h_loop_filter_mmxext(uint8_t *src, int stride, - int *bounding_values); - -#if HAVE_INLINE_ASM - -#define MOVQ_BFE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%"#regd", %%"#regd" \n\t" \ - "paddb %%"#regd", %%"#regd" \n\t" ::) - -#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ - "movq "#rega", "#regr" \n\t" \ - "movq "#regc", "#regp" \n\t" \ - "pand "#regb", "#regr" \n\t" \ - "pand "#regd", "#regp" \n\t" \ - "pxor "#rega", "#regb" \n\t" \ - "pxor "#regc", "#regd" \n\t" \ - "pand %%mm6, "#regb" \n\t" \ - "pand %%mm6, "#regd" \n\t" \ - "psrlq $1, "#regb" \n\t" \ - "psrlq $1, "#regd" \n\t" \ - "paddb "#regb", "#regr" \n\t" \ - "paddb "#regd", "#regp" \n\t" - -static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, ptrdiff_t stride, int h) -{ -// START_TIMER - MOVQ_BFE(mm6); - __asm__ volatile( - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "movq (%1,%4), %%mm2 \n\t" - "movq (%2,%4), %%mm3 \n\t" - PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%3) \n\t" - "movq %%mm5, (%3,%4) \n\t" - - "movq (%1,%4,2), %%mm0 \n\t" - "movq (%2,%4,2), %%mm1 \n\t" - "movq (%1,%5), %%mm2 \n\t" - "movq (%2,%5), %%mm3 \n\t" - "lea (%1,%4,4), %1 \n\t" - "lea (%2,%4,4), %2 \n\t" - PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%3,%4,2) \n\t" - "movq %%mm5, (%3,%5) \n\t" - "lea (%3,%4,4), %3 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+r"(h), "+r"(a), "+r"(b), "+r"(dst) - :"r"((x86_reg)stride), "r"((x86_reg)3L*stride) - :"memory"); -// STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx") -} -#endif /* HAVE_INLINE_ASM */ - -av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags) -{ - int cpuflags = av_get_cpu_flags(); - -#if HAVE_INLINE_ASM - c->put_no_rnd_pixels_l2 = put_vp_no_rnd_pixels8_l2_mmx; -#endif /* HAVE_INLINE_ASM */ - -#if ARCH_X86_32 - if (EXTERNAL_MMX(cpuflags)) { - c->idct_put = ff_vp3_idct_put_mmx; - c->idct_add = ff_vp3_idct_add_mmx; - } -#endif - - if (EXTERNAL_MMXEXT(cpuflags)) { - c->idct_dc_add = ff_vp3_idct_dc_add_mmxext; - - if (!(flags & CODEC_FLAG_BITEXACT)) { - c->v_loop_filter = ff_vp3_v_loop_filter_mmxext; - c->h_loop_filter = ff_vp3_h_loop_filter_mmxext; - } - } - - if (EXTERNAL_SSE2(cpuflags)) { - c->idct_put = ff_vp3_idct_put_sse2; - c->idct_add = ff_vp3_idct_add_sse2; - } -} diff --git a/ffmpeg1/libavcodec/x86/vp56_arith.h b/ffmpeg1/libavcodec/x86/vp56_arith.h deleted file mode 100644 index e71dbf8..0000000 --- a/ffmpeg1/libavcodec/x86/vp56_arith.h +++ /dev/null @@ -1,54 +0,0 @@ -/** - * VP5 and VP6 
compatible video decoder (arith decoder) - * - * Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org> - * Copyright (C) 2010 Eli Friedman - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_VP56_ARITH_H -#define AVCODEC_X86_VP56_ARITH_H - -#if HAVE_INLINE_ASM && HAVE_FAST_CMOV -#define vp56_rac_get_prob vp56_rac_get_prob -static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob) -{ - unsigned int code_word = vp56_rac_renorm(c); - unsigned int high = c->high; - unsigned int low = 1 + (((high - 1) * prob) >> 8); - unsigned int low_shift = low << 16; - int bit = 0; - - __asm__( - "subl %4, %1 \n\t" - "subl %3, %2 \n\t" - "leal (%2, %3), %3 \n\t" - "setae %b0 \n\t" - "cmovb %4, %1 \n\t" - "cmovb %3, %2 \n\t" - : "+q"(bit), "+r"(high), "+r"(code_word), "+r"(low_shift) - : "r"(low) - ); - - c->high = high; - c->code_word = code_word; - return bit; -} -#endif - -#endif /* AVCODEC_X86_VP56_ARITH_H */ diff --git a/ffmpeg1/libavcodec/x86/vp56dsp.asm b/ffmpeg1/libavcodec/x86/vp56dsp.asm deleted file mode 100644 index 3d874ea..0000000 --- a/ffmpeg1/libavcodec/x86/vp56dsp.asm +++ /dev/null @@ -1,170 +0,0 @@ -;****************************************************************************** -;* MMX/SSE2-optimized functions for the VP6 decoder -;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com> -;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com> -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -cextern pw_64 - -SECTION .text - -%macro DIAG4 6 -%if mmsize == 8 - movq m0, [%1+%2] - movq m1, [%1+%3] - movq m3, m0 - movq m4, m1 - punpcklbw m0, m7 - punpcklbw m1, m7 - punpckhbw m3, m7 - punpckhbw m4, m7 - pmullw m0, [rsp+8*11] ; src[x-8 ] * biweight [0] - pmullw m1, [rsp+8*12] ; src[x ] * biweight [1] - pmullw m3, [rsp+8*11] ; src[x-8 ] * biweight [0] - pmullw m4, [rsp+8*12] ; src[x ] * biweight [1] - paddw m0, m1 - paddw m3, m4 - movq m1, [%1+%4] - movq m2, [%1+%5] - movq m4, m1 - movq m5, m2 - punpcklbw m1, m7 - punpcklbw m2, m7 - punpckhbw m4, m7 - punpckhbw m5, m7 - pmullw m1, [rsp+8*13] ; src[x+8 ] * biweight [2] - pmullw m2, [rsp+8*14] ; src[x+16] * biweight [3] - pmullw m4, [rsp+8*13] ; src[x+8 ] * biweight [2] - pmullw m5, [rsp+8*14] ; src[x+16] * biweight [3] - paddw m1, m2 - paddw m4, m5 - paddsw m0, m1 - paddsw m3, m4 - paddsw m0, m6 ; Add 64 - paddsw m3, m6 ; Add 64 - psraw m0, 7 - psraw m3, 7 - packuswb m0, m3 - movq [%6], m0 -%else ; mmsize == 16 - movq m0, [%1+%2] - movq m1, [%1+%3] - punpcklbw m0, m7 - punpcklbw m1, m7 - pmullw m0, m4 ; src[x-8 ] * biweight [0] - pmullw m1, m5 ; src[x ] * biweight [1] - paddw m0, m1 - movq m1, [%1+%4] - movq m2, [%1+%5] - punpcklbw m1, m7 - punpcklbw m2, m7 - pmullw m1, m6 ; src[x+8 ] * biweight [2] - pmullw m2, m3 ; src[x+16] * biweight [3] - paddw m1, m2 - paddsw m0, m1 - paddsw m0, [pw_64] ; Add 64 - psraw m0, 7 - packuswb m0, m0 - movq [%6], m0 -%endif ; mmsize == 8/16 -%endmacro - -%macro SPLAT4REGS 0 -%if mmsize == 8 - movq m5, m3 - punpcklwd m3, m3 - movq m4, m3 - punpckldq m3, m3 - punpckhdq m4, m4 - punpckhwd m5, m5 - movq m2, m5 - punpckhdq m2, m2 - punpckldq m5, m5 - movq [rsp+8*11], m3 - movq [rsp+8*12], m4 - movq [rsp+8*13], m5 - movq [rsp+8*14], m2 -%else ; mmsize == 16 - pshuflw m4, m3, 0x0 - pshuflw m5, m3, 0x55 - pshuflw m6, m3, 0xAA - pshuflw m3, m3, 0xFF - punpcklqdq m4, m4 - punpcklqdq m5, m5 - punpcklqdq m6, m6 - punpcklqdq m3, m3 -%endif ; mmsize == 8/16 -%endmacro - -%macro vp6_filter_diag4 0 -; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, int stride, -; const int16_t h_weight[4], const int16_t v_weights[4]) -cglobal vp6_filter_diag4, 5, 7, 8 - mov r5, rsp ; backup stack pointer - and rsp, ~(mmsize-1) ; align stack -%if mmsize == 16 - sub rsp, 8*11 -%else - sub rsp, 8*15 - movq m6, [pw_64] -%endif -%if ARCH_X86_64 - movsxd r2, r2d -%endif - - sub r1, r2 - - pxor m7, m7 - movq m3, [r3] - SPLAT4REGS - - mov r3, rsp - mov r6, 11 -.nextrow: - DIAG4 r1, -1, 0, 1, 2, r3 - add r3, 8 - add r1, r2 - dec r6 - jnz .nextrow - - movq m3, [r4] - SPLAT4REGS - - lea r3, [rsp+8] - mov r6, 8 -.nextcol: - DIAG4 r3, -8, 0, 8, 16, r0 - add r3, 8 - add r0, r2 - dec r6 - jnz .nextcol - - mov rsp, r5 ; restore stack pointer - RET -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmx -vp6_filter_diag4 -%endif - -INIT_XMM sse2 -vp6_filter_diag4 diff --git a/ffmpeg1/libavcodec/x86/vp56dsp_init.c b/ffmpeg1/libavcodec/x86/vp56dsp_init.c deleted file mode 100644 index defc63b..0000000 --- a/ffmpeg1/libavcodec/x86/vp56dsp_init.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * VP6 MMX/SSE2 optimizations - * Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com> - * Copyright (C) 2009 Zuxy Meng 
<zuxy.meng@gmail.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/vp56dsp.h" - -void ff_vp6_filter_diag4_mmx(uint8_t *dst, uint8_t *src, int stride, - const int16_t *h_weights,const int16_t *v_weights); -void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, int stride, - const int16_t *h_weights,const int16_t *v_weights); - -av_cold void ff_vp56dsp_init_x86(VP56DSPContext* c, enum AVCodecID codec) -{ - int mm_flags = av_get_cpu_flags(); - - if (CONFIG_VP6_DECODER && codec == AV_CODEC_ID_VP6) { -#if ARCH_X86_32 - if (EXTERNAL_MMX(mm_flags)) { - c->vp6_filter_diag4 = ff_vp6_filter_diag4_mmx; - } -#endif - - if (EXTERNAL_SSE2(mm_flags)) { - c->vp6_filter_diag4 = ff_vp6_filter_diag4_sse2; - } - } -} diff --git a/ffmpeg1/libavcodec/x86/vp8dsp.asm b/ffmpeg1/libavcodec/x86/vp8dsp.asm deleted file mode 100644 index ca07333..0000000 --- a/ffmpeg1/libavcodec/x86/vp8dsp.asm +++ /dev/null @@ -1,2780 +0,0 @@ -;****************************************************************************** -;* VP8 MMXEXT optimizations -;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> -;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -fourtap_filter_hw_m: times 4 dw -6, 123 - times 4 dw 12, -1 - times 4 dw -9, 93 - times 4 dw 50, -6 - times 4 dw -6, 50 - times 4 dw 93, -9 - times 4 dw -1, 12 - times 4 dw 123, -6 - -sixtap_filter_hw_m: times 4 dw 2, -11 - times 4 dw 108, 36 - times 4 dw -8, 1 - times 4 dw 3, -16 - times 4 dw 77, 77 - times 4 dw -16, 3 - times 4 dw 1, -8 - times 4 dw 36, 108 - times 4 dw -11, 2 - -fourtap_filter_hb_m: times 8 db -6, 123 - times 8 db 12, -1 - times 8 db -9, 93 - times 8 db 50, -6 - times 8 db -6, 50 - times 8 db 93, -9 - times 8 db -1, 12 - times 8 db 123, -6 - -sixtap_filter_hb_m: times 8 db 2, 1 - times 8 db -11, 108 - times 8 db 36, -8 - times 8 db 3, 3 - times 8 db -16, 77 - times 8 db 77, -16 - times 8 db 1, 2 - times 8 db -8, 36 - times 8 db 108, -11 - -fourtap_filter_v_m: times 8 dw -6 - times 8 dw 123 - times 8 dw 12 - times 8 dw -1 - times 8 dw -9 - times 8 dw 93 - times 8 dw 50 - times 8 dw -6 - times 8 dw -6 - times 8 dw 50 - times 8 dw 93 - times 8 dw -9 - times 8 dw -1 - times 8 dw 12 - times 8 dw 123 - times 8 dw -6 - -sixtap_filter_v_m: times 8 dw 2 - times 8 dw -11 - times 8 dw 108 - times 8 dw 36 - times 8 dw -8 - times 8 dw 1 - times 8 dw 3 - times 8 dw -16 - times 8 dw 77 - times 8 dw 77 - times 8 dw -16 - times 8 dw 3 - times 8 dw 1 - times 8 dw -8 - times 8 dw 36 - times 8 dw 108 - times 8 dw -11 - times 8 dw 2 - -bilinear_filter_vw_m: times 8 dw 1 - times 8 dw 2 - times 8 dw 3 - times 8 dw 4 - times 8 dw 5 - times 8 dw 6 - times 8 dw 7 - -bilinear_filter_vb_m: times 8 db 7, 1 - times 8 db 6, 2 - times 8 db 5, 3 - times 8 db 4, 4 - times 8 db 3, 5 - times 8 db 2, 6 - times 8 db 1, 7 - -%ifdef PIC -%define fourtap_filter_hw picregq -%define sixtap_filter_hw picregq -%define fourtap_filter_hb picregq -%define sixtap_filter_hb picregq -%define fourtap_filter_v picregq -%define sixtap_filter_v picregq -%define bilinear_filter_vw picregq -%define bilinear_filter_vb picregq -%define npicregs 1 -%else -%define fourtap_filter_hw fourtap_filter_hw_m -%define sixtap_filter_hw sixtap_filter_hw_m -%define fourtap_filter_hb fourtap_filter_hb_m -%define sixtap_filter_hb sixtap_filter_hb_m -%define fourtap_filter_v fourtap_filter_v_m -%define sixtap_filter_v sixtap_filter_v_m -%define bilinear_filter_vw bilinear_filter_vw_m -%define bilinear_filter_vb bilinear_filter_vb_m -%define npicregs 0 -%endif - -filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 - -filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 -filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 -filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 - -pw_27: times 8 dw 27 -pw_63: times 8 dw 63 -pw_256: times 8 dw 256 -pw_20091: times 4 dw 20091 -pw_17734: times 4 dw 17734 - -pb_4: times 16 db 4 -pb_F8: times 16 db 0xF8 -pb_FE: times 16 db 0xFE -pb_27_63: times 8 db 27, 63 -pb_18_63: times 8 db 18, 63 -pb_9_63: times 8 db 9, 63 - -cextern pb_1 -cextern pw_3 -cextern pb_3 -cextern pw_4 -cextern pw_9 -cextern pw_18 -cextern pw_64 -cextern pb_80 - -SECTION .text - 
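The fourtap_filter_* and sixtap_filter_* tables above hold the VP8 subpel interpolation coefficients (each 4- or 6-tap row sums to 128), duplicated and interleaved in several layouts so the same values can feed pmaddwd, pmaddubsw or plain pmullw depending on the instruction set; bilinear_filter_v* holds the eighth-pel bilinear weight pairs. A scalar sketch of the 6-tap horizontal case these tables drive, with filter being one coefficient row such as {2, -11, 108, 36, -8, 1} (helper names are illustrative, not part of FFmpeg):

    #include <stdint.h>

    static uint8_t clip_uint8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* One row of VP8 6-tap horizontal subpel filtering; the rounding and shift
     * match the paddsw pw_64 / psraw 7 sequence in the integer SSE2 path. */
    static void epel_h6_row_ref(uint8_t *dst, const uint8_t *src, int width,
                                const int8_t filter[6])
    {
        for (int x = 0; x < width; x++) {
            int sum = 0;
            for (int i = 0; i < 6; i++)
                sum += filter[i] * src[x + i - 2];
            dst[x] = clip_uint8((sum + 64) >> 7);
        }
    }

The SSSE3 functions below reach the same result with pmaddubsw followed by pmulhrsw against pw_256, which folds the +64 rounding and the >>7 shift into a single instruction.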
-;----------------------------------------------------------------------------- -; subpel MC functions: -; -; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, -; uint8_t *src, int srcstride, -; int height, int mx, int my); -;----------------------------------------------------------------------------- - -%macro FILTER_SSSE3 1 -cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg - lea mxd, [mxq*3] - mova m3, [filter_h6_shuf2] - mova m4, [filter_h6_shuf3] -%ifdef PIC - lea picregq, [sixtap_filter_hb_m] -%endif - mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes - mova m6, [sixtap_filter_hb+mxq*8-32] - mova m7, [sixtap_filter_hb+mxq*8-16] - -.nextrow: - movu m0, [srcq-2] - mova m1, m0 - mova m2, m0 -%if mmsize == 8 -; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the -; shuffle with a memory operand - punpcklbw m0, [srcq+3] -%else - pshufb m0, [filter_h6_shuf1] -%endif - pshufb m1, m3 - pshufb m2, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m6 - pmaddubsw m2, m7 - paddsw m0, m1 - paddsw m0, m2 - pmulhrsw m0, [pw_256] - packuswb m0, m0 - movh [dstq], m0 ; store - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - REP_RET - -cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg - shl mxd, 4 - mova m2, [pw_256] - mova m3, [filter_h2_shuf] - mova m4, [filter_h4_shuf] -%ifdef PIC - lea picregq, [fourtap_filter_hb_m] -%endif - mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes - mova m6, [fourtap_filter_hb+mxq] - -.nextrow: - movu m0, [srcq-1] - mova m1, m0 - pshufb m0, m3 - pshufb m1, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m6 - paddsw m0, m1 - pmulhrsw m0, m2 - packuswb m0, m0 - movh [dstq], m0 ; store - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - REP_RET - -cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my - shl myd, 4 -%ifdef PIC - lea picregq, [fourtap_filter_hb_m] -%endif - mova m5, [fourtap_filter_hb+myq-16] - mova m6, [fourtap_filter_hb+myq] - mova m7, [pw_256] - - ; read 3 lines - sub srcq, srcstrideq - movh m0, [srcq] - movh m1, [srcq+ srcstrideq] - movh m2, [srcq+2*srcstrideq] - add srcq, srcstrideq - -.nextrow: - movh m3, [srcq+2*srcstrideq] ; read new row - mova m4, m0 - mova m0, m1 - punpcklbw m4, m1 - mova m1, m2 - punpcklbw m2, m3 - pmaddubsw m4, m5 - pmaddubsw m2, m6 - paddsw m4, m2 - mova m2, m3 - pmulhrsw m4, m7 - packuswb m4, m4 - movh [dstq], m4 - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - REP_RET - -cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my - lea myd, [myq*3] -%ifdef PIC - lea picregq, [sixtap_filter_hb_m] -%endif - lea myq, [sixtap_filter_hb+myq*8] - - ; read 5 lines - sub srcq, srcstrideq - sub srcq, srcstrideq - movh m0, [srcq] - movh m1, [srcq+srcstrideq] - movh m2, [srcq+srcstrideq*2] - lea srcq, [srcq+srcstrideq*2] - add srcq, srcstrideq - movh m3, [srcq] - movh m4, [srcq+srcstrideq] - -.nextrow: - movh m5, [srcq+2*srcstrideq] ; read new row - mova m6, m0 - punpcklbw m6, m5 - mova m0, m1 - punpcklbw m1, m2 - mova m7, m3 - punpcklbw m7, m4 - pmaddubsw m6, [myq-48] - pmaddubsw m1, [myq-32] - pmaddubsw m7, [myq-16] - paddsw m6, m1 - paddsw m6, m7 - mova m1, m2 - mova m2, m3 - pmulhrsw m6, [pw_256] - mova m3, m4 - packuswb m6, m6 - mova m4, m5 - movh 
[dstq], m6 - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - REP_RET -%endmacro - -INIT_MMX ssse3 -FILTER_SSSE3 4 -INIT_XMM ssse3 -FILTER_SSSE3 8 - -; 4x4 block, H-only 4-tap filter -INIT_MMX mmxext -cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg - shl mxd, 4 -%ifdef PIC - lea picregq, [fourtap_filter_hw_m] -%endif - movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words - movq mm5, [fourtap_filter_hw+mxq] - movq mm7, [pw_64] - pxor mm6, mm6 - -.nextrow: - movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels - - ; first set of 2 pixels - movq mm2, mm1 ; byte ABCD.. - punpcklbw mm1, mm6 ; byte->word ABCD - pshufw mm0, mm2, 9 ; byte CDEF.. - punpcklbw mm0, mm6 ; byte->word CDEF - pshufw mm3, mm1, 0x94 ; word ABBC - pshufw mm1, mm0, 0x94 ; word CDDE - pmaddwd mm3, mm4 ; multiply 2px with F0/F1 - movq mm0, mm1 ; backup for second set of pixels - pmaddwd mm1, mm5 ; multiply 2px with F2/F3 - paddd mm3, mm1 ; finish 1st 2px - - ; second set of 2 pixels, use backup of above - punpckhbw mm2, mm6 ; byte->word EFGH - pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 - pshufw mm1, mm2, 0x94 ; word EFFG - pmaddwd mm1, mm5 ; multiply 2px with F2/F3 - paddd mm0, mm1 ; finish 2nd 2px - - ; merge two sets of 2 pixels into one set of 4, round/clip/store - packssdw mm3, mm0 ; merge dword->word (4px) - paddsw mm3, mm7 ; rounding - psraw mm3, 7 - packuswb mm3, mm6 ; clip and word->bytes - movd [dstq], mm3 ; store - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - REP_RET - -; 4x4 block, H-only 6-tap filter -INIT_MMX mmxext -cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg - lea mxd, [mxq*3] -%ifdef PIC - lea picregq, [sixtap_filter_hw_m] -%endif - movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 4tap filter in words - movq mm5, [sixtap_filter_hw+mxq*8-32] - movq mm6, [sixtap_filter_hw+mxq*8-16] - movq mm7, [pw_64] - pxor mm3, mm3 - -.nextrow: - movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels - - ; first set of 2 pixels - movq mm2, mm1 ; byte ABCD.. - punpcklbw mm1, mm3 ; byte->word ABCD - pshufw mm0, mm2, 0x9 ; byte CDEF.. 
- punpckhbw mm2, mm3 ; byte->word EFGH - punpcklbw mm0, mm3 ; byte->word CDEF - pshufw mm1, mm1, 0x94 ; word ABBC - pshufw mm2, mm2, 0x94 ; word EFFG - pmaddwd mm1, mm4 ; multiply 2px with F0/F1 - pshufw mm3, mm0, 0x94 ; word CDDE - movq mm0, mm3 ; backup for second set of pixels - pmaddwd mm3, mm5 ; multiply 2px with F2/F3 - paddd mm1, mm3 ; add to 1st 2px cache - movq mm3, mm2 ; backup for second set of pixels - pmaddwd mm2, mm6 ; multiply 2px with F4/F5 - paddd mm1, mm2 ; finish 1st 2px - - ; second set of 2 pixels, use backup of above - movd mm2, [srcq+3] ; byte FGHI (prevent overreads) - pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 - pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 - paddd mm0, mm3 ; add to 2nd 2px cache - pxor mm3, mm3 - punpcklbw mm2, mm3 ; byte->word FGHI - pshufw mm2, mm2, 0xE9 ; word GHHI - pmaddwd mm2, mm6 ; multiply 2px with F4/F5 - paddd mm0, mm2 ; finish 2nd 2px - - ; merge two sets of 2 pixels into one set of 4, round/clip/store - packssdw mm1, mm0 ; merge dword->word (4px) - paddsw mm1, mm7 ; rounding - psraw mm1, 7 - packuswb mm1, mm3 ; clip and word->bytes - movd [dstq], mm1 ; store - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - REP_RET - -INIT_XMM sse2 -cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg - shl mxd, 5 -%ifdef PIC - lea picregq, [fourtap_filter_v_m] -%endif - lea mxq, [fourtap_filter_v+mxq-32] - pxor m7, m7 - mova m4, [pw_64] - mova m5, [mxq+ 0] - mova m6, [mxq+16] -%ifdef m8 - mova m8, [mxq+32] - mova m9, [mxq+48] -%endif -.nextrow: - movq m0, [srcq-1] - movq m1, [srcq-0] - movq m2, [srcq+1] - movq m3, [srcq+2] - punpcklbw m0, m7 - punpcklbw m1, m7 - punpcklbw m2, m7 - punpcklbw m3, m7 - pmullw m0, m5 - pmullw m1, m6 -%ifdef m8 - pmullw m2, m8 - pmullw m3, m9 -%else - pmullw m2, [mxq+32] - pmullw m3, [mxq+48] -%endif - paddsw m0, m1 - paddsw m2, m3 - paddsw m0, m2 - paddsw m0, m4 - psraw m0, 7 - packuswb m0, m7 - movh [dstq], m0 ; store - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - REP_RET - -INIT_XMM sse2 -cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg - lea mxd, [mxq*3] - shl mxd, 4 -%ifdef PIC - lea picregq, [sixtap_filter_v_m] -%endif - lea mxq, [sixtap_filter_v+mxq-96] - pxor m7, m7 - mova m6, [pw_64] -%ifdef m8 - mova m8, [mxq+ 0] - mova m9, [mxq+16] - mova m10, [mxq+32] - mova m11, [mxq+48] - mova m12, [mxq+64] - mova m13, [mxq+80] -%endif -.nextrow: - movq m0, [srcq-2] - movq m1, [srcq-1] - movq m2, [srcq-0] - movq m3, [srcq+1] - movq m4, [srcq+2] - movq m5, [srcq+3] - punpcklbw m0, m7 - punpcklbw m1, m7 - punpcklbw m2, m7 - punpcklbw m3, m7 - punpcklbw m4, m7 - punpcklbw m5, m7 -%ifdef m8 - pmullw m0, m8 - pmullw m1, m9 - pmullw m2, m10 - pmullw m3, m11 - pmullw m4, m12 - pmullw m5, m13 -%else - pmullw m0, [mxq+ 0] - pmullw m1, [mxq+16] - pmullw m2, [mxq+32] - pmullw m3, [mxq+48] - pmullw m4, [mxq+64] - pmullw m5, [mxq+80] -%endif - paddsw m1, m4 - paddsw m0, m5 - paddsw m1, m2 - paddsw m0, m3 - paddsw m0, m1 - paddsw m0, m6 - psraw m0, 7 - packuswb m0, m7 - movh [dstq], m0 ; store - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - REP_RET - -%macro FILTER_V 1 -; 4x4 block, V-only 4-tap filter -cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my - shl myd, 5 -%ifdef PIC - lea picregq, 
[fourtap_filter_v_m] -%endif - lea myq, [fourtap_filter_v+myq-32] - mova m6, [pw_64] - pxor m7, m7 - mova m5, [myq+48] - - ; read 3 lines - sub srcq, srcstrideq - movh m0, [srcq] - movh m1, [srcq+ srcstrideq] - movh m2, [srcq+2*srcstrideq] - add srcq, srcstrideq - punpcklbw m0, m7 - punpcklbw m1, m7 - punpcklbw m2, m7 - -.nextrow: - ; first calculate negative taps (to prevent losing positive overflows) - movh m4, [srcq+2*srcstrideq] ; read new row - punpcklbw m4, m7 - mova m3, m4 - pmullw m0, [myq+0] - pmullw m4, m5 - paddsw m4, m0 - - ; then calculate positive taps - mova m0, m1 - pmullw m1, [myq+16] - paddsw m4, m1 - mova m1, m2 - pmullw m2, [myq+32] - paddsw m4, m2 - mova m2, m3 - - ; round/clip/store - paddsw m4, m6 - psraw m4, 7 - packuswb m4, m7 - movh [dstq], m4 - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - REP_RET - - -; 4x4 block, V-only 6-tap filter -cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my - shl myd, 4 - lea myq, [myq*3] -%ifdef PIC - lea picregq, [sixtap_filter_v_m] -%endif - lea myq, [sixtap_filter_v+myq-96] - pxor m7, m7 - - ; read 5 lines - sub srcq, srcstrideq - sub srcq, srcstrideq - movh m0, [srcq] - movh m1, [srcq+srcstrideq] - movh m2, [srcq+srcstrideq*2] - lea srcq, [srcq+srcstrideq*2] - add srcq, srcstrideq - movh m3, [srcq] - movh m4, [srcq+srcstrideq] - punpcklbw m0, m7 - punpcklbw m1, m7 - punpcklbw m2, m7 - punpcklbw m3, m7 - punpcklbw m4, m7 - -.nextrow: - ; first calculate negative taps (to prevent losing positive overflows) - mova m5, m1 - pmullw m5, [myq+16] - mova m6, m4 - pmullw m6, [myq+64] - paddsw m6, m5 - - ; then calculate positive taps - movh m5, [srcq+2*srcstrideq] ; read new row - punpcklbw m5, m7 - pmullw m0, [myq+0] - paddsw m6, m0 - mova m0, m1 - mova m1, m2 - pmullw m2, [myq+32] - paddsw m6, m2 - mova m2, m3 - pmullw m3, [myq+48] - paddsw m6, m3 - mova m3, m4 - mova m4, m5 - pmullw m5, [myq+80] - paddsw m6, m5 - - ; round/clip/store - paddsw m6, [pw_64] - psraw m6, 7 - packuswb m6, m7 - movh [dstq], m6 - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - REP_RET -%endmacro - -INIT_MMX mmxext -FILTER_V 4 -INIT_XMM sse2 -FILTER_V 8 - -%macro FILTER_BILINEAR 1 -cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my - shl myd, 4 -%ifdef PIC - lea picregq, [bilinear_filter_vw_m] -%endif - pxor m6, m6 - mova m5, [bilinear_filter_vw+myq-1*16] - neg myq - mova m4, [bilinear_filter_vw+myq+7*16] -.nextrow: - movh m0, [srcq+srcstrideq*0] - movh m1, [srcq+srcstrideq*1] - movh m3, [srcq+srcstrideq*2] - punpcklbw m0, m6 - punpcklbw m1, m6 - punpcklbw m3, m6 - mova m2, m1 - pmullw m0, m4 - pmullw m1, m5 - pmullw m2, m4 - pmullw m3, m5 - paddsw m0, m1 - paddsw m2, m3 - psraw m0, 2 - psraw m2, 2 - pavgw m0, m6 - pavgw m2, m6 -%if mmsize == 8 - packuswb m0, m0 - packuswb m2, m2 - movh [dstq+dststrideq*0], m0 - movh [dstq+dststrideq*1], m2 -%else - packuswb m0, m2 - movh [dstq+dststrideq*0], m0 - movhps [dstq+dststrideq*1], m0 -%endif - - lea dstq, [dstq+dststrideq*2] - lea srcq, [srcq+srcstrideq*2] - sub heightd, 2 - jg .nextrow - REP_RET - -cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg - shl mxd, 4 -%ifdef PIC - lea picregq, [bilinear_filter_vw_m] -%endif - pxor m6, m6 - mova m5, [bilinear_filter_vw+mxq-1*16] - neg mxq - mova m4, [bilinear_filter_vw+mxq+7*16] -.nextrow: - movh m0, [srcq+srcstrideq*0+0] - movh m1, 
[srcq+srcstrideq*0+1] - movh m2, [srcq+srcstrideq*1+0] - movh m3, [srcq+srcstrideq*1+1] - punpcklbw m0, m6 - punpcklbw m1, m6 - punpcklbw m2, m6 - punpcklbw m3, m6 - pmullw m0, m4 - pmullw m1, m5 - pmullw m2, m4 - pmullw m3, m5 - paddsw m0, m1 - paddsw m2, m3 - psraw m0, 2 - psraw m2, 2 - pavgw m0, m6 - pavgw m2, m6 -%if mmsize == 8 - packuswb m0, m0 - packuswb m2, m2 - movh [dstq+dststrideq*0], m0 - movh [dstq+dststrideq*1], m2 -%else - packuswb m0, m2 - movh [dstq+dststrideq*0], m0 - movhps [dstq+dststrideq*1], m0 -%endif - - lea dstq, [dstq+dststrideq*2] - lea srcq, [srcq+srcstrideq*2] - sub heightd, 2 - jg .nextrow - REP_RET -%endmacro - -INIT_MMX mmxext -FILTER_BILINEAR 4 -INIT_XMM sse2 -FILTER_BILINEAR 8 - -%macro FILTER_BILINEAR_SSSE3 1 -cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my - shl myd, 4 -%ifdef PIC - lea picregq, [bilinear_filter_vb_m] -%endif - pxor m4, m4 - mova m3, [bilinear_filter_vb+myq-16] -.nextrow: - movh m0, [srcq+srcstrideq*0] - movh m1, [srcq+srcstrideq*1] - movh m2, [srcq+srcstrideq*2] - punpcklbw m0, m1 - punpcklbw m1, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - psraw m0, 2 - psraw m1, 2 - pavgw m0, m4 - pavgw m1, m4 -%if mmsize==8 - packuswb m0, m0 - packuswb m1, m1 - movh [dstq+dststrideq*0], m0 - movh [dstq+dststrideq*1], m1 -%else - packuswb m0, m1 - movh [dstq+dststrideq*0], m0 - movhps [dstq+dststrideq*1], m0 -%endif - - lea dstq, [dstq+dststrideq*2] - lea srcq, [srcq+srcstrideq*2] - sub heightd, 2 - jg .nextrow - REP_RET - -cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg - shl mxd, 4 -%ifdef PIC - lea picregq, [bilinear_filter_vb_m] -%endif - pxor m4, m4 - mova m2, [filter_h2_shuf] - mova m3, [bilinear_filter_vb+mxq-16] -.nextrow: - movu m0, [srcq+srcstrideq*0] - movu m1, [srcq+srcstrideq*1] - pshufb m0, m2 - pshufb m1, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - psraw m0, 2 - psraw m1, 2 - pavgw m0, m4 - pavgw m1, m4 -%if mmsize==8 - packuswb m0, m0 - packuswb m1, m1 - movh [dstq+dststrideq*0], m0 - movh [dstq+dststrideq*1], m1 -%else - packuswb m0, m1 - movh [dstq+dststrideq*0], m0 - movhps [dstq+dststrideq*1], m0 -%endif - - lea dstq, [dstq+dststrideq*2] - lea srcq, [srcq+srcstrideq*2] - sub heightd, 2 - jg .nextrow - REP_RET -%endmacro - -INIT_MMX ssse3 -FILTER_BILINEAR_SSSE3 4 -INIT_XMM ssse3 -FILTER_BILINEAR_SSSE3 8 - -INIT_MMX mmx -cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height -.nextrow: - movq mm0, [srcq+srcstrideq*0] - movq mm1, [srcq+srcstrideq*1] - lea srcq, [srcq+srcstrideq*2] - movq [dstq+dststrideq*0], mm0 - movq [dstq+dststrideq*1], mm1 - lea dstq, [dstq+dststrideq*2] - sub heightd, 2 - jg .nextrow - REP_RET - -%if ARCH_X86_32 -INIT_MMX mmx -cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height -.nextrow: - movq mm0, [srcq+srcstrideq*0+0] - movq mm1, [srcq+srcstrideq*0+8] - movq mm2, [srcq+srcstrideq*1+0] - movq mm3, [srcq+srcstrideq*1+8] - lea srcq, [srcq+srcstrideq*2] - movq [dstq+dststrideq*0+0], mm0 - movq [dstq+dststrideq*0+8], mm1 - movq [dstq+dststrideq*1+0], mm2 - movq [dstq+dststrideq*1+8], mm3 - lea dstq, [dstq+dststrideq*2] - sub heightd, 2 - jg .nextrow - REP_RET -%endif - -INIT_XMM sse -cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height -.nextrow: - movups xmm0, [srcq+srcstrideq*0] - movups xmm1, [srcq+srcstrideq*1] - lea srcq, [srcq+srcstrideq*2] - movaps [dstq+dststrideq*0], xmm0 - movaps [dstq+dststrideq*1], xmm1 - lea dstq, [dstq+dststrideq*2] - sub heightd, 2 
- jg .nextrow - REP_RET - -;----------------------------------------------------------------------------- -; void vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], int stride); -;----------------------------------------------------------------------------- - -%macro ADD_DC 4 - %4 m2, [dst1q+%3] - %4 m3, [dst1q+strideq+%3] - %4 m4, [dst2q+%3] - %4 m5, [dst2q+strideq+%3] - paddusb m2, %1 - paddusb m3, %1 - paddusb m4, %1 - paddusb m5, %1 - psubusb m2, %2 - psubusb m3, %2 - psubusb m4, %2 - psubusb m5, %2 - %4 [dst1q+%3], m2 - %4 [dst1q+strideq+%3], m3 - %4 [dst2q+%3], m4 - %4 [dst2q+strideq+%3], m5 -%endmacro - -INIT_MMX mmx -cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride - ; load data - movd m0, [blockq] - - ; calculate DC - paddw m0, [pw_4] - pxor m1, m1 - psraw m0, 3 - movd [blockq], m1 - psubw m1, m0 - packuswb m0, m0 - packuswb m1, m1 - punpcklbw m0, m0 - punpcklbw m1, m1 - punpcklwd m0, m0 - punpcklwd m1, m1 - - ; add DC - DEFINE_ARGS dst1, dst2, stride - lea dst2q, [dst1q+strideq*2] - ADD_DC m0, m1, 0, movh - RET - -INIT_XMM sse4 -cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride - ; load data - movd m0, [blockq] - pxor m1, m1 - - ; calculate DC - paddw m0, [pw_4] - movd [blockq], m1 - DEFINE_ARGS dst1, dst2, stride - lea dst2q, [dst1q+strideq*2] - movd m2, [dst1q] - movd m3, [dst1q+strideq] - movd m4, [dst2q] - movd m5, [dst2q+strideq] - psraw m0, 3 - pshuflw m0, m0, 0 - punpcklqdq m0, m0 - punpckldq m2, m3 - punpckldq m4, m5 - punpcklbw m2, m1 - punpcklbw m4, m1 - paddw m2, m0 - paddw m4, m0 - packuswb m2, m4 - movd [dst1q], m2 - pextrd [dst1q+strideq], m2, 1 - pextrd [dst2q], m2, 2 - pextrd [dst2q+strideq], m2, 3 - RET - -;----------------------------------------------------------------------------- -; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], int stride); -;----------------------------------------------------------------------------- - -%if ARCH_X86_32 -INIT_MMX mmx -cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride - ; load data - movd m0, [blockq+32*0] ; A - movd m1, [blockq+32*2] ; C - punpcklwd m0, [blockq+32*1] ; A B - punpcklwd m1, [blockq+32*3] ; C D - punpckldq m0, m1 ; A B C D - pxor m6, m6 - - ; calculate DC - paddw m0, [pw_4] - movd [blockq+32*0], m6 - movd [blockq+32*1], m6 - movd [blockq+32*2], m6 - movd [blockq+32*3], m6 - psraw m0, 3 - psubw m6, m0 - packuswb m0, m0 - packuswb m6, m6 - punpcklbw m0, m0 ; AABBCCDD - punpcklbw m6, m6 ; AABBCCDD - movq m1, m0 - movq m7, m6 - punpcklbw m0, m0 ; AAAABBBB - punpckhbw m1, m1 ; CCCCDDDD - punpcklbw m6, m6 ; AAAABBBB - punpckhbw m7, m7 ; CCCCDDDD - - ; add DC - DEFINE_ARGS dst1, dst2, stride - lea dst2q, [dst1q+strideq*2] - ADD_DC m0, m6, 0, mova - ADD_DC m1, m7, 8, mova - RET -%endif - -INIT_XMM sse2 -cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride - ; load data - movd m0, [blockq+32*0] ; A - movd m1, [blockq+32*2] ; C - punpcklwd m0, [blockq+32*1] ; A B - punpcklwd m1, [blockq+32*3] ; C D - punpckldq m0, m1 ; A B C D - pxor m1, m1 - - ; calculate DC - paddw m0, [pw_4] - movd [blockq+32*0], m1 - movd [blockq+32*1], m1 - movd [blockq+32*2], m1 - movd [blockq+32*3], m1 - psraw m0, 3 - psubw m1, m0 - packuswb m0, m0 - packuswb m1, m1 - punpcklbw m0, m0 - punpcklbw m1, m1 - punpcklbw m0, m0 - punpcklbw m1, m1 - - ; add DC - DEFINE_ARGS dst1, dst2, stride - lea dst2q, [dst1q+strideq*2] - ADD_DC m0, m1, 0, mova - RET - -;----------------------------------------------------------------------------- -; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], int stride); 
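For reference, the DC-only IDCT add routines above compute dc = (block[0] + 4) >> 3, clear the stored coefficient, and add dc to every pixel of the 4x4 block with clamping to 0..255; the asm splits dc into a +dc/-dc byte pair so paddusb/psubusb can do the saturation. A scalar C sketch of that operation (function and helper names are illustrative, not part of the deleted file); the 4y/4uv variants repeat the same step for four blocks:

#include <stddef.h>
#include <stdint.h>

static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }  /* illustrative helper */

/* One 4x4 block; vp8_idct_dc_add4y/4uv apply this per coefficient block. */
static void vp8_idct_dc_add_sketch(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
{
    int dc = (block[0] + 4) >> 3;                 /* paddw pw_4; psraw 3 */
    block[0] = 0;                                 /* coefficient is consumed */
    for (int y = 0; y < 4; y++, dst += stride)
        for (int x = 0; x < 4; x++)
            dst[x] = clip_u8(dst[x] + dc);
}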
-;----------------------------------------------------------------------------- - -INIT_MMX mmx -cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride - ; load data - movd m0, [blockq+32*0] ; A - movd m1, [blockq+32*2] ; C - punpcklwd m0, [blockq+32*1] ; A B - punpcklwd m1, [blockq+32*3] ; C D - punpckldq m0, m1 ; A B C D - pxor m6, m6 - - ; calculate DC - paddw m0, [pw_4] - movd [blockq+32*0], m6 - movd [blockq+32*1], m6 - movd [blockq+32*2], m6 - movd [blockq+32*3], m6 - psraw m0, 3 - psubw m6, m0 - packuswb m0, m0 - packuswb m6, m6 - punpcklbw m0, m0 ; AABBCCDD - punpcklbw m6, m6 ; AABBCCDD - movq m1, m0 - movq m7, m6 - punpcklbw m0, m0 ; AAAABBBB - punpckhbw m1, m1 ; CCCCDDDD - punpcklbw m6, m6 ; AAAABBBB - punpckhbw m7, m7 ; CCCCDDDD - - ; add DC - DEFINE_ARGS dst1, dst2, stride - lea dst2q, [dst1q+strideq*2] - ADD_DC m0, m6, 0, mova - lea dst1q, [dst1q+strideq*4] - lea dst2q, [dst2q+strideq*4] - ADD_DC m1, m7, 0, mova - RET - -;----------------------------------------------------------------------------- -; void vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], int stride); -;----------------------------------------------------------------------------- - -; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) -; this macro assumes that m6/m7 have words for 20091/17734 loaded -%macro VP8_MULTIPLY_SUMSUB 4 - mova %3, %1 - mova %4, %2 - pmulhw %3, m6 ;20091(1) - pmulhw %4, m6 ;20091(2) - paddw %3, %1 - paddw %4, %2 - paddw %1, %1 - paddw %2, %2 - pmulhw %1, m7 ;35468(1) - pmulhw %2, m7 ;35468(2) - psubw %1, %4 - paddw %2, %3 -%endmacro - -; calculate x0=%1+%3; x1=%1-%3 -; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4) -; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3) -; %5/%6 are temporary registers -; we assume m6/m7 have constant words 20091/17734 loaded in them -%macro VP8_IDCT_TRANSFORM4x4_1D 6 - SUMSUB_BA w, %3, %1, %5 ;t0, t1 - VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3 - SUMSUB_BA w, %4, %3, %5 ;tmp0, tmp3 - SUMSUB_BA w, %2, %1, %5 ;tmp1, tmp2 - SWAP %4, %1 - SWAP %4, %3 -%endmacro - -%macro VP8_IDCT_ADD 0 -cglobal vp8_idct_add, 3, 3, 0, dst, block, stride - ; load block data - movq m0, [blockq+ 0] - movq m1, [blockq+ 8] - movq m2, [blockq+16] - movq m3, [blockq+24] - movq m6, [pw_20091] - movq m7, [pw_17734] -%if cpuflag(sse) - xorps xmm0, xmm0 - movaps [blockq+ 0], xmm0 - movaps [blockq+16], xmm0 -%else - pxor m4, m4 - movq [blockq+ 0], m4 - movq [blockq+ 8], m4 - movq [blockq+16], m4 - movq [blockq+24], m4 -%endif - - ; actual IDCT - VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 - TRANSPOSE4x4W 0, 1, 2, 3, 4 - paddw m0, [pw_4] - VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 - TRANSPOSE4x4W 0, 1, 2, 3, 4 - - ; store - pxor m4, m4 - DEFINE_ARGS dst1, dst2, stride - lea dst2q, [dst1q+2*strideq] - STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq - STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq - - RET -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmx -VP8_IDCT_ADD -%endif -INIT_MMX sse -VP8_IDCT_ADD - -;----------------------------------------------------------------------------- -; void vp8_luma_dc_wht_mmxext(int16_t block[4][4][16], int16_t dc[16]) -;----------------------------------------------------------------------------- - -%macro SCATTER_WHT 3 - movd dc1d, m%1 - movd dc2d, m%2 - mov [blockq+2*16*(0+%3)], dc1w - mov [blockq+2*16*(1+%3)], dc2w - shr dc1d, 16 - shr dc2d, 16 - psrlq m%1, 32 - psrlq m%2, 32 - mov [blockq+2*16*(4+%3)], dc1w - mov [blockq+2*16*(5+%3)], dc2w - movd dc1d, m%1 - movd dc2d, m%2 - mov 
[blockq+2*16*(8+%3)], dc1w - mov [blockq+2*16*(9+%3)], dc2w - shr dc1d, 16 - shr dc2d, 16 - mov [blockq+2*16*(12+%3)], dc1w - mov [blockq+2*16*(13+%3)], dc2w -%endmacro - -%macro HADAMARD4_1D 4 - SUMSUB_BADC w, %2, %1, %4, %3 - SUMSUB_BADC w, %4, %2, %3, %1 - SWAP %1, %4, %3 -%endmacro - -%macro VP8_DC_WHT 0 -cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2 - movq m0, [dc1q] - movq m1, [dc1q+8] - movq m2, [dc1q+16] - movq m3, [dc1q+24] -%if cpuflag(sse) - xorps xmm0, xmm0 - movaps [dc1q+ 0], xmm0 - movaps [dc1q+16], xmm0 -%else - pxor m4, m4 - movq [dc1q+ 0], m4 - movq [dc1q+ 8], m4 - movq [dc1q+16], m4 - movq [dc1q+24], m4 -%endif - HADAMARD4_1D 0, 1, 2, 3 - TRANSPOSE4x4W 0, 1, 2, 3, 4 - paddw m0, [pw_3] - HADAMARD4_1D 0, 1, 2, 3 - psraw m0, 3 - psraw m1, 3 - psraw m2, 3 - psraw m3, 3 - SCATTER_WHT 0, 1, 0 - SCATTER_WHT 2, 3, 2 - RET -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmx -VP8_DC_WHT -%endif -INIT_MMX sse -VP8_DC_WHT - -;----------------------------------------------------------------------------- -; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); -;----------------------------------------------------------------------------- - -; macro called with 7 mm register indexes as argument, and 4 regular registers -; -; first 4 mm registers will carry the transposed pixel data -; the other three are scratchspace (one would be sufficient, but this allows -; for more spreading/pipelining and thus faster execution on OOE CPUs) -; -; first two regular registers are buf+4*stride and buf+5*stride -; third is -stride, fourth is +stride -%macro READ_8x4_INTERLEAVED 11 - ; interleave 8 (A-H) rows of 4 pixels each - movd m%1, [%8+%10*4] ; A0-3 - movd m%5, [%9+%10*4] ; B0-3 - movd m%2, [%8+%10*2] ; C0-3 - movd m%6, [%8+%10] ; D0-3 - movd m%3, [%8] ; E0-3 - movd m%7, [%9] ; F0-3 - movd m%4, [%9+%11] ; G0-3 - punpcklbw m%1, m%5 ; A/B interleaved - movd m%5, [%9+%11*2] ; H0-3 - punpcklbw m%2, m%6 ; C/D interleaved - punpcklbw m%3, m%7 ; E/F interleaved - punpcklbw m%4, m%5 ; G/H interleaved -%endmacro - -; macro called with 7 mm register indexes as argument, and 5 regular registers -; first 11 mean the same as READ_8x4_TRANSPOSED above -; fifth regular register is scratchspace to reach the bottom 8 rows, it -; will be set to second regular register + 8*stride at the end -%macro READ_16x4_INTERLEAVED 12 - ; transpose 16 (A-P) rows of 4 pixels each - lea %12, [r0+8*r2] - - ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M - movd m%1, [%8+%10*4] ; A0-3 - movd m%3, [%12+%10*4] ; I0-3 - movd m%2, [%8+%10*2] ; C0-3 - movd m%4, [%12+%10*2] ; K0-3 - movd m%6, [%8+%10] ; D0-3 - movd m%5, [%12+%10] ; L0-3 - movd m%7, [%12] ; M0-3 - add %12, %11 - punpcklbw m%1, m%3 ; A/I - movd m%3, [%8] ; E0-3 - punpcklbw m%2, m%4 ; C/K - punpcklbw m%6, m%5 ; D/L - punpcklbw m%3, m%7 ; E/M - punpcklbw m%2, m%6 ; C/D/K/L interleaved - - ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P - movd m%5, [%9+%10*4] ; B0-3 - movd m%4, [%12+%10*4] ; J0-3 - movd m%7, [%9] ; F0-3 - movd m%6, [%12] ; N0-3 - punpcklbw m%5, m%4 ; B/J - punpcklbw m%7, m%6 ; F/N - punpcklbw m%1, m%5 ; A/B/I/J interleaved - punpcklbw m%3, m%7 ; E/F/M/N interleaved - movd m%4, [%9+%11] ; G0-3 - movd m%6, [%12+%11] ; O0-3 - movd m%5, [%9+%11*2] ; H0-3 - movd m%7, [%12+%11*2] ; P0-3 - punpcklbw m%4, m%6 ; G/O - punpcklbw m%5, m%7 ; H/P - punpcklbw m%4, m%5 ; G/H/O/P interleaved -%endmacro - -; write 4 mm registers of 2 dwords each -; first four arguments are mm register indexes containing source 
data -; last four are registers containing buf+4*stride, buf+5*stride, -; -stride and +stride -%macro WRITE_4x2D 8 - ; write out (2 dwords per register) - movd [%5+%7*4], m%1 - movd [%5+%7*2], m%2 - movd [%5], m%3 - movd [%6+%8], m%4 - punpckhdq m%1, m%1 - punpckhdq m%2, m%2 - punpckhdq m%3, m%3 - punpckhdq m%4, m%4 - movd [%6+%7*4], m%1 - movd [%5+%7], m%2 - movd [%6], m%3 - movd [%6+%8*2], m%4 -%endmacro - -; write 4 xmm registers of 4 dwords each -; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular -; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride -; we add 1*stride to the third regular registry in the process -; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the -; same memory region), or 8 if they cover two separate buffers (third one points to -; a different memory region than the first two), allowing for more optimal code for -; the 16-width case -%macro WRITE_4x4D 10 - ; write out (4 dwords per register), start with dwords zero - movd [%5+%8*4], m%1 - movd [%5], m%2 - movd [%7+%8*4], m%3 - movd [%7], m%4 - - ; store dwords 1 - psrldq m%1, 4 - psrldq m%2, 4 - psrldq m%3, 4 - psrldq m%4, 4 - movd [%6+%8*4], m%1 - movd [%6], m%2 -%if %10 == 16 - movd [%6+%9*4], m%3 -%endif - movd [%7+%9], m%4 - - ; write dwords 2 - psrldq m%1, 4 - psrldq m%2, 4 -%if %10 == 8 - movd [%5+%8*2], m%1 - movd %5d, m%3 -%endif - psrldq m%3, 4 - psrldq m%4, 4 -%if %10 == 16 - movd [%5+%8*2], m%1 -%endif - movd [%6+%9], m%2 - movd [%7+%8*2], m%3 - movd [%7+%9*2], m%4 - add %7, %9 - - ; store dwords 3 - psrldq m%1, 4 - psrldq m%2, 4 - psrldq m%3, 4 - psrldq m%4, 4 -%if %10 == 8 - mov [%7+%8*4], %5d - movd [%6+%8*2], m%1 -%else - movd [%5+%8], m%1 -%endif - movd [%6+%9*2], m%2 - movd [%7+%8*2], m%3 - movd [%7+%9*2], m%4 -%endmacro - -; write 4 or 8 words in the mmx/xmm registers as 8 lines -; 1 and 2 are the registers to write, this can be the same (for SSE2) -; for pre-SSE4: -; 3 is a general-purpose register that we will clobber -; for SSE4: -; 3 is a pointer to the destination's 5th line -; 4 is a pointer to the destination's 4th line -; 5/6 is -stride and +stride -%macro WRITE_2x4W 6 - movd %3d, %1 - punpckhdq %1, %1 - mov [%4+%5*4], %3w - shr %3, 16 - add %4, %6 - mov [%4+%5*4], %3w - - movd %3d, %1 - add %4, %5 - mov [%4+%5*2], %3w - shr %3, 16 - mov [%4+%5 ], %3w - - movd %3d, %2 - punpckhdq %2, %2 - mov [%4 ], %3w - shr %3, 16 - mov [%4+%6 ], %3w - - movd %3d, %2 - add %4, %6 - mov [%4+%6 ], %3w - shr %3, 16 - mov [%4+%6*2], %3w - add %4, %5 -%endmacro - -%macro WRITE_8W 5 -%if cpuflag(sse4) - pextrw [%3+%4*4], %1, 0 - pextrw [%2+%4*4], %1, 1 - pextrw [%3+%4*2], %1, 2 - pextrw [%3+%4 ], %1, 3 - pextrw [%3 ], %1, 4 - pextrw [%2 ], %1, 5 - pextrw [%2+%5 ], %1, 6 - pextrw [%2+%5*2], %1, 7 -%else - movd %2d, %1 - psrldq %1, 4 - mov [%3+%4*4], %2w - shr %2, 16 - add %3, %5 - mov [%3+%4*4], %2w - - movd %2d, %1 - psrldq %1, 4 - add %3, %4 - mov [%3+%4*2], %2w - shr %2, 16 - mov [%3+%4 ], %2w - - movd %2d, %1 - psrldq %1, 4 - mov [%3 ], %2w - shr %2, 16 - mov [%3+%5 ], %2w - - movd %2d, %1 - add %3, %5 - mov [%3+%5 ], %2w - shr %2, 16 - mov [%3+%5*2], %2w -%endif -%endmacro - -%macro SIMPLE_LOOPFILTER 2 -cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr -%if mmsize == 8 ; mmx/mmxext - mov cntrq, 2 -%endif -%if cpuflag(ssse3) - pxor m0, m0 -%endif - SPLATB_REG m7, flim, m0 ; splat "flim" into register - - ; set up indexes to address 4 rows -%if mmsize == 8 - DEFINE_ARGS dst1, mstride, stride, cntr, 
dst2 -%else - DEFINE_ARGS dst1, mstride, stride, dst3, dst2 -%endif - mov strideq, mstrideq - neg mstrideq -%ifidn %1, h - lea dst1q, [dst1q+4*strideq-2] -%endif - -%if mmsize == 8 ; mmx / mmxext -.next8px: -%endif -%ifidn %1, v - ; read 4 half/full rows of pixels - mova m0, [dst1q+mstrideq*2] ; p1 - mova m1, [dst1q+mstrideq] ; p0 - mova m2, [dst1q] ; q0 - mova m3, [dst1q+ strideq] ; q1 -%else ; h - lea dst2q, [dst1q+ strideq] - -%if mmsize == 8 ; mmx/mmxext - READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq -%else ; sse2 - READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q -%endif - TRANSPOSE4x4W 0, 1, 2, 3, 4 -%endif - - ; simple_limit - mova m5, m2 ; m5=backup of q0 - mova m6, m1 ; m6=backup of p0 - psubusb m1, m2 ; p0-q0 - psubusb m2, m6 ; q0-p0 - por m1, m2 ; FFABS(p0-q0) - paddusb m1, m1 ; m1=FFABS(p0-q0)*2 - - mova m4, m3 - mova m2, m0 - psubusb m3, m0 ; q1-p1 - psubusb m0, m4 ; p1-q1 - por m3, m0 ; FFABS(p1-q1) - mova m0, [pb_80] - pxor m2, m0 - pxor m4, m0 - psubsb m2, m4 ; m2=p1-q1 (signed) backup for below - pand m3, [pb_FE] - psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed - paddusb m3, m1 - psubusb m3, m7 - pxor m1, m1 - pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0) - - ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask) - mova m4, m5 - pxor m5, m0 - pxor m0, m6 - psubsb m5, m0 ; q0-p0 (signed) - paddsb m2, m5 - paddsb m2, m5 - paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0) - pand m2, m3 ; apply filter mask (m3) - - mova m3, [pb_F8] - mova m1, m2 - paddsb m2, [pb_4] ; f1<<3=a+4 - paddsb m1, [pb_3] ; f2<<3=a+3 - pand m2, m3 - pand m1, m3 ; cache f2<<3 - - pxor m0, m0 - pxor m3, m3 - pcmpgtb m0, m2 ; which values are <0? - psubb m3, m2 ; -f1<<3 - psrlq m2, 3 ; +f1 - psrlq m3, 3 ; -f1 - pand m3, m0 - pandn m0, m2 - psubusb m4, m0 - paddusb m4, m3 ; q0-f1 - - pxor m0, m0 - pxor m3, m3 - pcmpgtb m0, m1 ; which values are <0? 
- psubb m3, m1 ; -f2<<3 - psrlq m1, 3 ; +f2 - psrlq m3, 3 ; -f2 - pand m3, m0 - pandn m0, m1 - paddusb m6, m0 - psubusb m6, m3 ; p0+f2 - - ; store -%ifidn %1, v - mova [dst1q], m4 - mova [dst1q+mstrideq], m6 -%else ; h - inc dst1q - SBUTTERFLY bw, 6, 4, 0 - -%if mmsize == 16 ; sse2 -%if cpuflag(sse4) - inc dst2q -%endif - WRITE_8W m6, dst2q, dst1q, mstrideq, strideq - lea dst2q, [dst3q+mstrideq+1] -%if cpuflag(sse4) - inc dst3q -%endif - WRITE_8W m4, dst3q, dst2q, mstrideq, strideq -%else ; mmx/mmxext - WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq -%endif -%endif - -%if mmsize == 8 ; mmx/mmxext - ; next 8 pixels -%ifidn %1, v - add dst1q, 8 ; advance 8 cols = pixels -%else ; h - lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines -%endif - dec cntrq - jg .next8px - REP_RET -%else ; sse2 - RET -%endif -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmx -SIMPLE_LOOPFILTER v, 4 -SIMPLE_LOOPFILTER h, 5 -INIT_MMX mmxext -SIMPLE_LOOPFILTER v, 4 -SIMPLE_LOOPFILTER h, 5 -%endif - -INIT_XMM sse2 -SIMPLE_LOOPFILTER v, 3 -SIMPLE_LOOPFILTER h, 5 -INIT_XMM ssse3 -SIMPLE_LOOPFILTER v, 3 -SIMPLE_LOOPFILTER h, 5 -INIT_XMM sse4 -SIMPLE_LOOPFILTER h, 5 - -;----------------------------------------------------------------------------- -; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, -; int flimE, int flimI, int hev_thr); -;----------------------------------------------------------------------------- - -%macro INNER_LOOPFILTER 2 -%define stack_size 0 -%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr -%ifidn %1, v ; [3]=hev() result -%define stack_size mmsize * -4 -%else ; h ; extra storage space for transposes -%define stack_size mmsize * -5 -%endif -%endif - -%if %2 == 8 ; chroma -cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr -%else ; luma -cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr -%endif - -%if cpuflag(ssse3) - pxor m7, m7 -%endif - -%ifndef m8 - ; splat function arguments - SPLATB_REG m0, flimEq, m7 ; E - SPLATB_REG m1, flimIq, m7 ; I - SPLATB_REG m2, hevthrq, m7 ; hev_thresh - -%define m_flimE [rsp] -%define m_flimI [rsp+mmsize] -%define m_hevthr [rsp+mmsize*2] -%define m_maskres [rsp+mmsize*3] -%define m_p0backup [rsp+mmsize*3] -%define m_q0backup [rsp+mmsize*4] - - mova m_flimE, m0 - mova m_flimI, m1 - mova m_hevthr, m2 -%else -%define m_flimE m9 -%define m_flimI m10 -%define m_hevthr m11 -%define m_maskres m12 -%define m_p0backup m12 -%define m_q0backup m8 - - ; splat function arguments - SPLATB_REG m_flimE, flimEq, m7 ; E - SPLATB_REG m_flimI, flimIq, m7 ; I - SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh -%endif - -%if %2 == 8 ; chroma - DEFINE_ARGS dst1, dst8, mstride, stride, dst2 -%elif mmsize == 8 - DEFINE_ARGS dst1, mstride, stride, dst2, cntr - mov cntrq, 2 -%else - DEFINE_ARGS dst1, mstride, stride, dst2, dst8 -%endif - mov strideq, mstrideq - neg mstrideq -%ifidn %1, h - lea dst1q, [dst1q+strideq*4-4] -%if %2 == 8 ; chroma - lea dst8q, [dst8q+strideq*4-4] -%endif -%endif - -%if mmsize == 8 -.next8px: -%endif - ; read - lea dst2q, [dst1q+strideq] -%ifidn %1, v -%if %2 == 8 && mmsize == 16 -%define movrow movh -%else -%define movrow mova -%endif - movrow m0, [dst1q+mstrideq*4] ; p3 - movrow m1, [dst2q+mstrideq*4] ; p2 - movrow m2, [dst1q+mstrideq*2] ; p1 - movrow m5, [dst2q] ; q1 - movrow m6, [dst2q+ strideq*1] ; q2 - movrow m7, [dst2q+ strideq*2] ; q3 -%if mmsize == 16 && %2 == 8 - movhps m0, [dst8q+mstrideq*4] - movhps m2, [dst8q+mstrideq*2] - add 
dst8q, strideq - movhps m1, [dst8q+mstrideq*4] - movhps m5, [dst8q] - movhps m6, [dst8q+ strideq ] - movhps m7, [dst8q+ strideq*2] - add dst8q, mstrideq -%endif -%elif mmsize == 8 ; mmx/mmxext (h) - ; read 8 rows of 8px each - movu m0, [dst1q+mstrideq*4] - movu m1, [dst2q+mstrideq*4] - movu m2, [dst1q+mstrideq*2] - movu m3, [dst1q+mstrideq ] - movu m4, [dst1q] - movu m5, [dst2q] - movu m6, [dst2q+ strideq ] - - ; 8x8 transpose - TRANSPOSE4x4B 0, 1, 2, 3, 7 - mova m_q0backup, m1 - movu m7, [dst2q+ strideq*2] - TRANSPOSE4x4B 4, 5, 6, 7, 1 - SBUTTERFLY dq, 0, 4, 1 ; p3/p2 - SBUTTERFLY dq, 2, 6, 1 ; q0/q1 - SBUTTERFLY dq, 3, 7, 1 ; q2/q3 - mova m1, m_q0backup - mova m_q0backup, m2 ; store q0 - SBUTTERFLY dq, 1, 5, 2 ; p1/p0 - mova m_p0backup, m5 ; store p0 - SWAP 1, 4 - SWAP 2, 4 - SWAP 6, 3 - SWAP 5, 3 -%else ; sse2 (h) -%if %2 == 16 - lea dst8q, [dst1q+ strideq*8] -%endif - - ; read 16 rows of 8px each, interleave - movh m0, [dst1q+mstrideq*4] - movh m1, [dst8q+mstrideq*4] - movh m2, [dst1q+mstrideq*2] - movh m5, [dst8q+mstrideq*2] - movh m3, [dst1q+mstrideq ] - movh m6, [dst8q+mstrideq ] - movh m4, [dst1q] - movh m7, [dst8q] - punpcklbw m0, m1 ; A/I - punpcklbw m2, m5 ; C/K - punpcklbw m3, m6 ; D/L - punpcklbw m4, m7 ; E/M - - add dst8q, strideq - movh m1, [dst2q+mstrideq*4] - movh m6, [dst8q+mstrideq*4] - movh m5, [dst2q] - movh m7, [dst8q] - punpcklbw m1, m6 ; B/J - punpcklbw m5, m7 ; F/N - movh m6, [dst2q+ strideq ] - movh m7, [dst8q+ strideq ] - punpcklbw m6, m7 ; G/O - - ; 8x16 transpose - TRANSPOSE4x4B 0, 1, 2, 3, 7 -%ifdef m8 - SWAP 1, 8 -%else - mova m_q0backup, m1 -%endif - movh m7, [dst2q+ strideq*2] - movh m1, [dst8q+ strideq*2] - punpcklbw m7, m1 ; H/P - TRANSPOSE4x4B 4, 5, 6, 7, 1 - SBUTTERFLY dq, 0, 4, 1 ; p3/p2 - SBUTTERFLY dq, 2, 6, 1 ; q0/q1 - SBUTTERFLY dq, 3, 7, 1 ; q2/q3 -%ifdef m8 - SWAP 1, 8 - SWAP 2, 8 -%else - mova m1, m_q0backup - mova m_q0backup, m2 ; store q0 -%endif - SBUTTERFLY dq, 1, 5, 2 ; p1/p0 -%ifdef m12 - SWAP 5, 12 -%else - mova m_p0backup, m5 ; store p0 -%endif - SWAP 1, 4 - SWAP 2, 4 - SWAP 6, 3 - SWAP 5, 3 -%endif - - ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 - mova m4, m1 - SWAP 4, 1 - psubusb m4, m0 ; p2-p3 - psubusb m0, m1 ; p3-p2 - por m0, m4 ; abs(p3-p2) - - mova m4, m2 - SWAP 4, 2 - psubusb m4, m1 ; p1-p2 - psubusb m1, m2 ; p2-p1 - por m1, m4 ; abs(p2-p1) - - mova m4, m6 - SWAP 4, 6 - psubusb m4, m7 ; q2-q3 - psubusb m7, m6 ; q3-q2 - por m7, m4 ; abs(q3-q2) - - mova m4, m5 - SWAP 4, 5 - psubusb m4, m6 ; q1-q2 - psubusb m6, m5 ; q2-q1 - por m6, m4 ; abs(q2-q1) - -%if notcpuflag(mmxext) - mova m4, m_flimI - pxor m3, m3 - psubusb m0, m4 - psubusb m1, m4 - psubusb m7, m4 - psubusb m6, m4 - pcmpeqb m0, m3 ; abs(p3-p2) <= I - pcmpeqb m1, m3 ; abs(p2-p1) <= I - pcmpeqb m7, m3 ; abs(q3-q2) <= I - pcmpeqb m6, m3 ; abs(q2-q1) <= I - pand m0, m1 - pand m7, m6 - pand m0, m7 -%else ; mmxext/sse2 - pmaxub m0, m1 - pmaxub m6, m7 - pmaxub m0, m6 -%endif - - ; normal_limit and high_edge_variance for p1-p0, q1-q0 - SWAP 7, 3 ; now m7 is zero -%ifidn %1, v - movrow m3, [dst1q+mstrideq ] ; p0 -%if mmsize == 16 && %2 == 8 - movhps m3, [dst8q+mstrideq ] -%endif -%elifdef m12 - SWAP 3, 12 -%else - mova m3, m_p0backup -%endif - - mova m1, m2 - SWAP 1, 2 - mova m6, m3 - SWAP 3, 6 - psubusb m1, m3 ; p1-p0 - psubusb m6, m2 ; p0-p1 - por m1, m6 ; abs(p1-p0) -%if notcpuflag(mmxext) - mova m6, m1 - psubusb m1, m4 - psubusb m6, m_hevthr - pcmpeqb m1, m7 ; abs(p1-p0) <= I - pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh - pand m0, m1 - mova m_maskres, m6 -%else ; mmxext/sse2 
- pmaxub m0, m1 ; max_I - SWAP 1, 4 ; max_hev_thresh -%endif - - SWAP 6, 4 ; now m6 is I -%ifidn %1, v - movrow m4, [dst1q] ; q0 -%if mmsize == 16 && %2 == 8 - movhps m4, [dst8q] -%endif -%elifdef m8 - SWAP 4, 8 -%else - mova m4, m_q0backup -%endif - mova m1, m4 - SWAP 1, 4 - mova m7, m5 - SWAP 7, 5 - psubusb m1, m5 ; q0-q1 - psubusb m7, m4 ; q1-q0 - por m1, m7 ; abs(q1-q0) -%if notcpuflag(mmxext) - mova m7, m1 - psubusb m1, m6 - psubusb m7, m_hevthr - pxor m6, m6 - pcmpeqb m1, m6 ; abs(q1-q0) <= I - pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh - mova m6, m_maskres - pand m0, m1 ; abs([pq][321]-[pq][210]) <= I - pand m6, m7 -%else ; mmxext/sse2 - pxor m7, m7 - pmaxub m0, m1 - pmaxub m6, m1 - psubusb m0, m_flimI - psubusb m6, m_hevthr - pcmpeqb m0, m7 ; max(abs(..)) <= I - pcmpeqb m6, m7 ; !(max(abs..) > thresh) -%endif -%ifdef m12 - SWAP 6, 12 -%else - mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) -%endif - - ; simple_limit - mova m1, m3 - SWAP 1, 3 - mova m6, m4 ; keep copies of p0/q0 around for later use - SWAP 6, 4 - psubusb m1, m4 ; p0-q0 - psubusb m6, m3 ; q0-p0 - por m1, m6 ; abs(q0-p0) - paddusb m1, m1 ; m1=2*abs(q0-p0) - - mova m7, m2 - SWAP 7, 2 - mova m6, m5 - SWAP 6, 5 - psubusb m7, m5 ; p1-q1 - psubusb m6, m2 ; q1-p1 - por m7, m6 ; abs(q1-p1) - pxor m6, m6 - pand m7, [pb_FE] - psrlq m7, 1 ; abs(q1-p1)/2 - paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 - psubusb m7, m_flimE - pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E - pand m0, m7 ; normal_limit result - - ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask -%ifdef m8 ; x86-64 && sse2 - mova m8, [pb_80] -%define m_pb_80 m8 -%else ; x86-32 or mmx/mmxext -%define m_pb_80 [pb_80] -%endif - mova m1, m4 - mova m7, m3 - pxor m1, m_pb_80 - pxor m7, m_pb_80 - psubsb m1, m7 ; (signed) q0-p0 - mova m6, m2 - mova m7, m5 - pxor m6, m_pb_80 - pxor m7, m_pb_80 - psubsb m6, m7 ; (signed) p1-q1 - mova m7, m_maskres - pandn m7, m6 - paddsb m7, m1 - paddsb m7, m1 - paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1) - - pand m7, m0 - mova m1, [pb_F8] - mova m6, m7 - paddsb m7, [pb_3] - paddsb m6, [pb_4] - pand m7, m1 - pand m6, m1 - - pxor m1, m1 - pxor m0, m0 - pcmpgtb m1, m7 - psubb m0, m7 - psrlq m7, 3 ; +f2 - psrlq m0, 3 ; -f2 - pand m0, m1 - pandn m1, m7 - psubusb m3, m0 - paddusb m3, m1 ; p0+f2 - - pxor m1, m1 - pxor m0, m0 - pcmpgtb m0, m6 - psubb m1, m6 - psrlq m6, 3 ; +f1 - psrlq m1, 3 ; -f1 - pand m1, m0 - pandn m0, m6 - psubusb m4, m0 - paddusb m4, m1 ; q0-f1 - -%ifdef m12 - SWAP 6, 12 -%else - mova m6, m_maskres -%endif -%if notcpuflag(mmxext) - mova m7, [pb_1] -%else ; mmxext/sse2 - pxor m7, m7 -%endif - pand m0, m6 - pand m1, m6 -%if notcpuflag(mmxext) - paddusb m0, m7 - pand m1, [pb_FE] - pandn m7, m0 - psrlq m1, 1 - psrlq m7, 1 - SWAP 0, 7 -%else ; mmxext/sse2 - psubusb m1, [pb_1] - pavgb m0, m7 ; a - pavgb m1, m7 ; -a -%endif - psubusb m5, m0 - psubusb m2, m1 - paddusb m5, m1 ; q1-a - paddusb m2, m0 ; p1+a - - ; store -%ifidn %1, v - movrow [dst1q+mstrideq*2], m2 - movrow [dst1q+mstrideq ], m3 - movrow [dst1q], m4 - movrow [dst1q+ strideq ], m5 -%if mmsize == 16 && %2 == 8 - movhps [dst8q+mstrideq*2], m2 - movhps [dst8q+mstrideq ], m3 - movhps [dst8q], m4 - movhps [dst8q+ strideq ], m5 -%endif -%else ; h - add dst1q, 2 - add dst2q, 2 - - ; 4x8/16 transpose - TRANSPOSE4x4B 2, 3, 4, 5, 6 - -%if mmsize == 8 ; mmx/mmxext (h) - WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq -%else ; sse2 (h) - lea dst8q, [dst8q+mstrideq +2] - WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2 -%endif -%endif 
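Schematically, the inner filter computed and stored above amounts to the following per-column update once the normal_limit mask passes; the simple loop filter earlier performs the same core update, but always includes the p1-q1 term and never touches p1/q1. A scalar C sketch of the arithmetic, doing signed math on value-128 just as the pb_80 xor does (names are illustrative, and the asm's saturating byte adds are folded into single clamps here):

#include <stddef.h>
#include <stdint.h>

static int clamp_s8(int v) { return v < -128 ? -128 : v > 127 ? 127 : v; }  /* illustrative helper */

/* Inner-filter update for one pixel column p1 p0 | q0 q1 across the edge. */
static void vp8_inner_filter_sketch(uint8_t *p, ptrdiff_t stride, int hev)
{
    int p1 = p[-2 * stride] - 128, p0 = p[-1 * stride] - 128;
    int q0 = p[ 0 * stride] - 128, q1 = p[ 1 * stride] - 128;
    int a  = clamp_s8((hev ? clamp_s8(p1 - q1) : 0) + 3 * (q0 - p0));
    int f1 = clamp_s8(a + 4) >> 3;                /* pb_4 path */
    int f2 = clamp_s8(a + 3) >> 3;                /* pb_3 path */

    p[-1 * stride] = clamp_s8(p0 + f2) + 128;     /* p0 += f2 */
    p[ 0 * stride] = clamp_s8(q0 - f1) + 128;     /* q0 -= f1 */
    if (!hev) {                                   /* p1/q1 adjusted only without high edge variance */
        int a2 = (f1 + 1) >> 1;
        p[-2 * stride] = clamp_s8(p1 + a2) + 128;
        p[ 1 * stride] = clamp_s8(q1 - a2) + 128;
    }
}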
- -%if mmsize == 8 -%if %2 == 8 ; chroma -%ifidn %1, h - sub dst1q, 2 -%endif - cmp dst1q, dst8q - mov dst1q, dst8q - jnz .next8px -%else -%ifidn %1, h - lea dst1q, [dst1q+ strideq*8-2] -%else ; v - add dst1q, 8 -%endif - dec cntrq - jg .next8px -%endif - REP_RET -%else ; mmsize == 16 - RET -%endif -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmx -INNER_LOOPFILTER v, 16 -INNER_LOOPFILTER h, 16 -INNER_LOOPFILTER v, 8 -INNER_LOOPFILTER h, 8 - -INIT_MMX mmxext -INNER_LOOPFILTER v, 16 -INNER_LOOPFILTER h, 16 -INNER_LOOPFILTER v, 8 -INNER_LOOPFILTER h, 8 -%endif - -INIT_XMM sse2 -INNER_LOOPFILTER v, 16 -INNER_LOOPFILTER h, 16 -INNER_LOOPFILTER v, 8 -INNER_LOOPFILTER h, 8 - -INIT_XMM ssse3 -INNER_LOOPFILTER v, 16 -INNER_LOOPFILTER h, 16 -INNER_LOOPFILTER v, 8 -INNER_LOOPFILTER h, 8 - -;----------------------------------------------------------------------------- -; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, -; int flimE, int flimI, int hev_thr); -;----------------------------------------------------------------------------- - -%macro MBEDGE_LOOPFILTER 2 -%define stack_size 0 -%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr -%if mmsize == 16 ; [3]=hev() result - ; [4]=filter tmp result - ; [5]/[6] = p2/q2 backup - ; [7]=lim_res sign result -%define stack_size mmsize * -7 -%else ; 8 ; extra storage space for transposes -%define stack_size mmsize * -8 -%endif -%endif - -%if %2 == 8 ; chroma -cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr -%else ; luma -cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr -%endif - -%if cpuflag(ssse3) - pxor m7, m7 -%endif - -%ifndef m8 - ; splat function arguments - SPLATB_REG m0, flimEq, m7 ; E - SPLATB_REG m1, flimIq, m7 ; I - SPLATB_REG m2, hevthrq, m7 ; hev_thresh - -%define m_flimE [rsp] -%define m_flimI [rsp+mmsize] -%define m_hevthr [rsp+mmsize*2] -%define m_maskres [rsp+mmsize*3] -%define m_limres [rsp+mmsize*4] -%define m_p0backup [rsp+mmsize*3] -%define m_q0backup [rsp+mmsize*4] -%define m_p2backup [rsp+mmsize*5] -%define m_q2backup [rsp+mmsize*6] -%if mmsize == 16 -%define m_limsign [rsp] -%else -%define m_limsign [rsp+mmsize*7] -%endif - - mova m_flimE, m0 - mova m_flimI, m1 - mova m_hevthr, m2 -%else ; sse2 on x86-64 -%define m_flimE m9 -%define m_flimI m10 -%define m_hevthr m11 -%define m_maskres m12 -%define m_limres m8 -%define m_p0backup m12 -%define m_q0backup m8 -%define m_p2backup m13 -%define m_q2backup m14 -%define m_limsign m9 - - ; splat function arguments - SPLATB_REG m_flimE, flimEq, m7 ; E - SPLATB_REG m_flimI, flimIq, m7 ; I - SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh -%endif - -%if %2 == 8 ; chroma - DEFINE_ARGS dst1, dst8, mstride, stride, dst2 -%elif mmsize == 8 - DEFINE_ARGS dst1, mstride, stride, dst2, cntr - mov cntrq, 2 -%else - DEFINE_ARGS dst1, mstride, stride, dst2, dst8 -%endif - mov strideq, mstrideq - neg mstrideq -%ifidn %1, h - lea dst1q, [dst1q+strideq*4-4] -%if %2 == 8 ; chroma - lea dst8q, [dst8q+strideq*4-4] -%endif -%endif - -%if mmsize == 8 -.next8px: -%endif - ; read - lea dst2q, [dst1q+ strideq ] -%ifidn %1, v -%if %2 == 8 && mmsize == 16 -%define movrow movh -%else -%define movrow mova -%endif - movrow m0, [dst1q+mstrideq*4] ; p3 - movrow m1, [dst2q+mstrideq*4] ; p2 - movrow m2, [dst1q+mstrideq*2] ; p1 - movrow m5, [dst2q] ; q1 - movrow m6, [dst2q+ strideq ] ; q2 - movrow m7, [dst2q+ strideq*2] ; q3 -%if mmsize == 16 && %2 == 8 - movhps m0, [dst8q+mstrideq*4] - movhps m2, 
[dst8q+mstrideq*2] - add dst8q, strideq - movhps m1, [dst8q+mstrideq*4] - movhps m5, [dst8q] - movhps m6, [dst8q+ strideq ] - movhps m7, [dst8q+ strideq*2] - add dst8q, mstrideq -%endif -%elif mmsize == 8 ; mmx/mmxext (h) - ; read 8 rows of 8px each - movu m0, [dst1q+mstrideq*4] - movu m1, [dst2q+mstrideq*4] - movu m2, [dst1q+mstrideq*2] - movu m3, [dst1q+mstrideq ] - movu m4, [dst1q] - movu m5, [dst2q] - movu m6, [dst2q+ strideq ] - - ; 8x8 transpose - TRANSPOSE4x4B 0, 1, 2, 3, 7 - mova m_q0backup, m1 - movu m7, [dst2q+ strideq*2] - TRANSPOSE4x4B 4, 5, 6, 7, 1 - SBUTTERFLY dq, 0, 4, 1 ; p3/p2 - SBUTTERFLY dq, 2, 6, 1 ; q0/q1 - SBUTTERFLY dq, 3, 7, 1 ; q2/q3 - mova m1, m_q0backup - mova m_q0backup, m2 ; store q0 - SBUTTERFLY dq, 1, 5, 2 ; p1/p0 - mova m_p0backup, m5 ; store p0 - SWAP 1, 4 - SWAP 2, 4 - SWAP 6, 3 - SWAP 5, 3 -%else ; sse2 (h) -%if %2 == 16 - lea dst8q, [dst1q+ strideq*8 ] -%endif - - ; read 16 rows of 8px each, interleave - movh m0, [dst1q+mstrideq*4] - movh m1, [dst8q+mstrideq*4] - movh m2, [dst1q+mstrideq*2] - movh m5, [dst8q+mstrideq*2] - movh m3, [dst1q+mstrideq ] - movh m6, [dst8q+mstrideq ] - movh m4, [dst1q] - movh m7, [dst8q] - punpcklbw m0, m1 ; A/I - punpcklbw m2, m5 ; C/K - punpcklbw m3, m6 ; D/L - punpcklbw m4, m7 ; E/M - - add dst8q, strideq - movh m1, [dst2q+mstrideq*4] - movh m6, [dst8q+mstrideq*4] - movh m5, [dst2q] - movh m7, [dst8q] - punpcklbw m1, m6 ; B/J - punpcklbw m5, m7 ; F/N - movh m6, [dst2q+ strideq ] - movh m7, [dst8q+ strideq ] - punpcklbw m6, m7 ; G/O - - ; 8x16 transpose - TRANSPOSE4x4B 0, 1, 2, 3, 7 -%ifdef m8 - SWAP 1, 8 -%else - mova m_q0backup, m1 -%endif - movh m7, [dst2q+ strideq*2] - movh m1, [dst8q+ strideq*2] - punpcklbw m7, m1 ; H/P - TRANSPOSE4x4B 4, 5, 6, 7, 1 - SBUTTERFLY dq, 0, 4, 1 ; p3/p2 - SBUTTERFLY dq, 2, 6, 1 ; q0/q1 - SBUTTERFLY dq, 3, 7, 1 ; q2/q3 -%ifdef m8 - SWAP 1, 8 - SWAP 2, 8 -%else - mova m1, m_q0backup - mova m_q0backup, m2 ; store q0 -%endif - SBUTTERFLY dq, 1, 5, 2 ; p1/p0 -%ifdef m12 - SWAP 5, 12 -%else - mova m_p0backup, m5 ; store p0 -%endif - SWAP 1, 4 - SWAP 2, 4 - SWAP 6, 3 - SWAP 5, 3 -%endif - - ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 - mova m4, m1 - SWAP 4, 1 - psubusb m4, m0 ; p2-p3 - psubusb m0, m1 ; p3-p2 - por m0, m4 ; abs(p3-p2) - - mova m4, m2 - SWAP 4, 2 - psubusb m4, m1 ; p1-p2 - mova m_p2backup, m1 - psubusb m1, m2 ; p2-p1 - por m1, m4 ; abs(p2-p1) - - mova m4, m6 - SWAP 4, 6 - psubusb m4, m7 ; q2-q3 - psubusb m7, m6 ; q3-q2 - por m7, m4 ; abs(q3-q2) - - mova m4, m5 - SWAP 4, 5 - psubusb m4, m6 ; q1-q2 - mova m_q2backup, m6 - psubusb m6, m5 ; q2-q1 - por m6, m4 ; abs(q2-q1) - -%if notcpuflag(mmxext) - mova m4, m_flimI - pxor m3, m3 - psubusb m0, m4 - psubusb m1, m4 - psubusb m7, m4 - psubusb m6, m4 - pcmpeqb m0, m3 ; abs(p3-p2) <= I - pcmpeqb m1, m3 ; abs(p2-p1) <= I - pcmpeqb m7, m3 ; abs(q3-q2) <= I - pcmpeqb m6, m3 ; abs(q2-q1) <= I - pand m0, m1 - pand m7, m6 - pand m0, m7 -%else ; mmxext/sse2 - pmaxub m0, m1 - pmaxub m6, m7 - pmaxub m0, m6 -%endif - - ; normal_limit and high_edge_variance for p1-p0, q1-q0 - SWAP 7, 3 ; now m7 is zero -%ifidn %1, v - movrow m3, [dst1q+mstrideq ] ; p0 -%if mmsize == 16 && %2 == 8 - movhps m3, [dst8q+mstrideq ] -%endif -%elifdef m12 - SWAP 3, 12 -%else - mova m3, m_p0backup -%endif - - mova m1, m2 - SWAP 1, 2 - mova m6, m3 - SWAP 3, 6 - psubusb m1, m3 ; p1-p0 - psubusb m6, m2 ; p0-p1 - por m1, m6 ; abs(p1-p0) -%if notcpuflag(mmxext) - mova m6, m1 - psubusb m1, m4 - psubusb m6, m_hevthr - pcmpeqb m1, m7 ; abs(p1-p0) <= I - pcmpeqb m6, m7 ; abs(p1-p0) 
<= hev_thresh - pand m0, m1 - mova m_maskres, m6 -%else ; mmxext/sse2 - pmaxub m0, m1 ; max_I - SWAP 1, 4 ; max_hev_thresh -%endif - - SWAP 6, 4 ; now m6 is I -%ifidn %1, v - movrow m4, [dst1q] ; q0 -%if mmsize == 16 && %2 == 8 - movhps m4, [dst8q] -%endif -%elifdef m8 - SWAP 4, 8 -%else - mova m4, m_q0backup -%endif - mova m1, m4 - SWAP 1, 4 - mova m7, m5 - SWAP 7, 5 - psubusb m1, m5 ; q0-q1 - psubusb m7, m4 ; q1-q0 - por m1, m7 ; abs(q1-q0) -%if notcpuflag(mmxext) - mova m7, m1 - psubusb m1, m6 - psubusb m7, m_hevthr - pxor m6, m6 - pcmpeqb m1, m6 ; abs(q1-q0) <= I - pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh - mova m6, m_maskres - pand m0, m1 ; abs([pq][321]-[pq][210]) <= I - pand m6, m7 -%else ; mmxext/sse2 - pxor m7, m7 - pmaxub m0, m1 - pmaxub m6, m1 - psubusb m0, m_flimI - psubusb m6, m_hevthr - pcmpeqb m0, m7 ; max(abs(..)) <= I - pcmpeqb m6, m7 ; !(max(abs..) > thresh) -%endif -%ifdef m12 - SWAP 6, 12 -%else - mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) -%endif - - ; simple_limit - mova m1, m3 - SWAP 1, 3 - mova m6, m4 ; keep copies of p0/q0 around for later use - SWAP 6, 4 - psubusb m1, m4 ; p0-q0 - psubusb m6, m3 ; q0-p0 - por m1, m6 ; abs(q0-p0) - paddusb m1, m1 ; m1=2*abs(q0-p0) - - mova m7, m2 - SWAP 7, 2 - mova m6, m5 - SWAP 6, 5 - psubusb m7, m5 ; p1-q1 - psubusb m6, m2 ; q1-p1 - por m7, m6 ; abs(q1-p1) - pxor m6, m6 - pand m7, [pb_FE] - psrlq m7, 1 ; abs(q1-p1)/2 - paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 - psubusb m7, m_flimE - pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E - pand m0, m7 ; normal_limit result - - ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask -%ifdef m8 ; x86-64 && sse2 - mova m8, [pb_80] -%define m_pb_80 m8 -%else ; x86-32 or mmx/mmxext -%define m_pb_80 [pb_80] -%endif - mova m1, m4 - mova m7, m3 - pxor m1, m_pb_80 - pxor m7, m_pb_80 - psubsb m1, m7 ; (signed) q0-p0 - mova m6, m2 - mova m7, m5 - pxor m6, m_pb_80 - pxor m7, m_pb_80 - psubsb m6, m7 ; (signed) p1-q1 - mova m7, m_maskres - paddsb m6, m1 - paddsb m6, m1 - paddsb m6, m1 - pand m6, m0 -%ifdef m8 - mova m_limres, m6 ; 3*(qp-p0)+(p1-q1) masked for filter_mbedge - pand m_limres, m7 -%else - mova m0, m6 - pand m0, m7 - mova m_limres, m0 -%endif - pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common - - mova m1, [pb_F8] - mova m6, m7 - paddsb m7, [pb_3] - paddsb m6, [pb_4] - pand m7, m1 - pand m6, m1 - - pxor m1, m1 - pxor m0, m0 - pcmpgtb m1, m7 - psubb m0, m7 - psrlq m7, 3 ; +f2 - psrlq m0, 3 ; -f2 - pand m0, m1 - pandn m1, m7 - psubusb m3, m0 - paddusb m3, m1 ; p0+f2 - - pxor m1, m1 - pxor m0, m0 - pcmpgtb m0, m6 - psubb m1, m6 - psrlq m6, 3 ; +f1 - psrlq m1, 3 ; -f1 - pand m1, m0 - pandn m0, m6 - psubusb m4, m0 - paddusb m4, m1 ; q0-f1 - - ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w) -%if cpuflag(ssse3) - mova m7, [pb_1] -%else - mova m7, [pw_63] -%endif -%ifdef m8 - SWAP 1, 8 -%else - mova m1, m_limres -%endif - pxor m0, m0 - mova m6, m1 - pcmpgtb m0, m1 ; which are negative -%if cpuflag(ssse3) - punpcklbw m6, m7 ; interleave with "1" for rounding - punpckhbw m1, m7 -%else - punpcklbw m6, m0 ; signed byte->word - punpckhbw m1, m0 -%endif - mova m_limsign, m0 -%if cpuflag(ssse3) - mova m7, [pb_27_63] -%ifndef m8 - mova m_limres, m1 -%endif -%ifdef m10 - SWAP 0, 10 ; don't lose lim_sign copy -%endif - mova m0, m7 - pmaddubsw m7, m6 - SWAP 6, 7 - pmaddubsw m0, m1 - SWAP 1, 0 -%ifdef m10 - SWAP 0, 10 -%else - mova m0, m_limsign -%endif -%else - mova m_maskres, m6 ; backup for later in filter - mova m_limres, m1 - pmullw m6, [pw_27] - pmullw m1, 
[pw_27] - paddw m6, m7 - paddw m1, m7 -%endif - psraw m6, 7 - psraw m1, 7 - packsswb m6, m1 ; a0 - pxor m1, m1 - psubb m1, m6 - pand m1, m0 ; -a0 - pandn m0, m6 ; +a0 -%if cpuflag(ssse3) - mova m6, [pb_18_63] ; pipelining -%endif - psubusb m3, m1 - paddusb m4, m1 - paddusb m3, m0 ; p0+a0 - psubusb m4, m0 ; q0-a0 - -%if cpuflag(ssse3) - SWAP 6, 7 -%ifdef m10 - SWAP 1, 10 -%else - mova m1, m_limres -%endif - mova m0, m7 - pmaddubsw m7, m6 - SWAP 6, 7 - pmaddubsw m0, m1 - SWAP 1, 0 -%ifdef m10 - SWAP 0, 10 -%endif - mova m0, m_limsign -%else - mova m6, m_maskres - mova m1, m_limres - pmullw m6, [pw_18] - pmullw m1, [pw_18] - paddw m6, m7 - paddw m1, m7 -%endif - mova m0, m_limsign - psraw m6, 7 - psraw m1, 7 - packsswb m6, m1 ; a1 - pxor m1, m1 - psubb m1, m6 - pand m1, m0 ; -a1 - pandn m0, m6 ; +a1 -%if cpuflag(ssse3) - mova m6, [pb_9_63] -%endif - psubusb m2, m1 - paddusb m5, m1 - paddusb m2, m0 ; p1+a1 - psubusb m5, m0 ; q1-a1 - -%if cpuflag(ssse3) - SWAP 6, 7 -%ifdef m10 - SWAP 1, 10 -%else - mova m1, m_limres -%endif - mova m0, m7 - pmaddubsw m7, m6 - SWAP 6, 7 - pmaddubsw m0, m1 - SWAP 1, 0 -%else -%ifdef m8 - SWAP 6, 12 - SWAP 1, 8 -%else - mova m6, m_maskres - mova m1, m_limres -%endif - pmullw m6, [pw_9] - pmullw m1, [pw_9] - paddw m6, m7 - paddw m1, m7 -%endif -%ifdef m9 - SWAP 7, 9 -%else - mova m7, m_limsign -%endif - psraw m6, 7 - psraw m1, 7 - packsswb m6, m1 ; a1 - pxor m0, m0 - psubb m0, m6 - pand m0, m7 ; -a1 - pandn m7, m6 ; +a1 -%ifdef m8 - SWAP 1, 13 - SWAP 6, 14 -%else - mova m1, m_p2backup - mova m6, m_q2backup -%endif - psubusb m1, m0 - paddusb m6, m0 - paddusb m1, m7 ; p1+a1 - psubusb m6, m7 ; q1-a1 - - ; store -%ifidn %1, v - movrow [dst2q+mstrideq*4], m1 - movrow [dst1q+mstrideq*2], m2 - movrow [dst1q+mstrideq ], m3 - movrow [dst1q], m4 - movrow [dst2q], m5 - movrow [dst2q+ strideq ], m6 -%if mmsize == 16 && %2 == 8 - add dst8q, mstrideq - movhps [dst8q+mstrideq*2], m1 - movhps [dst8q+mstrideq ], m2 - movhps [dst8q], m3 - add dst8q, strideq - movhps [dst8q], m4 - movhps [dst8q+ strideq ], m5 - movhps [dst8q+ strideq*2], m6 -%endif -%else ; h - inc dst1q - inc dst2q - - ; 4x8/16 transpose - TRANSPOSE4x4B 1, 2, 3, 4, 0 - SBUTTERFLY bw, 5, 6, 0 - -%if mmsize == 8 ; mmx/mmxext (h) - WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq - add dst1q, 4 - WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq -%else ; sse2 (h) - lea dst8q, [dst8q+mstrideq+1] - WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2 - lea dst1q, [dst2q+mstrideq+4] - lea dst8q, [dst8q+mstrideq+4] -%if cpuflag(sse4) - add dst2q, 4 -%endif - WRITE_8W m5, dst2q, dst1q, mstrideq, strideq -%if cpuflag(sse4) - lea dst2q, [dst8q+ strideq ] -%endif - WRITE_8W m6, dst2q, dst8q, mstrideq, strideq -%endif -%endif - -%if mmsize == 8 -%if %2 == 8 ; chroma -%ifidn %1, h - sub dst1q, 5 -%endif - cmp dst1q, dst8q - mov dst1q, dst8q - jnz .next8px -%else -%ifidn %1, h - lea dst1q, [dst1q+ strideq*8-5] -%else ; v - add dst1q, 8 -%endif - dec cntrq - jg .next8px -%endif - REP_RET -%else ; mmsize == 16 - RET -%endif -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmx -MBEDGE_LOOPFILTER v, 16 -MBEDGE_LOOPFILTER h, 16 -MBEDGE_LOOPFILTER v, 8 -MBEDGE_LOOPFILTER h, 8 - -INIT_MMX mmxext -MBEDGE_LOOPFILTER v, 16 -MBEDGE_LOOPFILTER h, 16 -MBEDGE_LOOPFILTER v, 8 -MBEDGE_LOOPFILTER h, 8 -%endif - -INIT_XMM sse2 -MBEDGE_LOOPFILTER v, 16 -MBEDGE_LOOPFILTER h, 16 -MBEDGE_LOOPFILTER v, 8 -MBEDGE_LOOPFILTER h, 8 - -INIT_XMM ssse3 -MBEDGE_LOOPFILTER v, 16 -MBEDGE_LOOPFILTER h, 16 -MBEDGE_LOOPFILTER v, 8 -MBEDGE_LOOPFILTER h, 8 
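Where there is no high edge variance, the macroblock-edge filter replaces the f1/f2 update with 27/18/9-weighted taps on the three pixels each side of the edge (the pw_27/pw_18/pw_9 and pb_*_63 constants above); with high edge variance it falls back to the plain p0/q0 adjustment. A scalar C sketch of the weighted-tap arithmetic for the !hev case (names are illustrative):

#include <stddef.h>
#include <stdint.h>

static int clamp_s8(int v) { return v < -128 ? -128 : v > 127 ? 127 : v; }  /* illustrative helper */

/* Macroblock-edge taps for one pixel column p2 p1 p0 | q0 q1 q2 (!hev case). */
static void vp8_mbedge_filter_sketch(uint8_t *p, ptrdiff_t stride)
{
    int p2 = p[-3 * stride] - 128, p1 = p[-2 * stride] - 128, p0 = p[-1 * stride] - 128;
    int q0 = p[ 0 * stride] - 128, q1 = p[ 1 * stride] - 128, q2 = p[ 2 * stride] - 128;
    int w  = clamp_s8(clamp_s8(p1 - q1) + 3 * (q0 - p0));   /* lim_res in the asm */
    int a0 = (27 * w + 63) >> 7;                             /* pw_27 / pb_27_63 */
    int a1 = (18 * w + 63) >> 7;                             /* pw_18 / pb_18_63 */
    int a2 = ( 9 * w + 63) >> 7;                             /* pw_9  / pb_9_63  */

    p[-1 * stride] = clamp_s8(p0 + a0) + 128;
    p[ 0 * stride] = clamp_s8(q0 - a0) + 128;
    p[-2 * stride] = clamp_s8(p1 + a1) + 128;
    p[ 1 * stride] = clamp_s8(q1 - a1) + 128;
    p[-3 * stride] = clamp_s8(p2 + a2) + 128;
    p[ 2 * stride] = clamp_s8(q2 - a2) + 128;
}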
- -INIT_XMM sse4 -MBEDGE_LOOPFILTER h, 16 -MBEDGE_LOOPFILTER h, 8 diff --git a/ffmpeg1/libavcodec/x86/vp8dsp_init.c b/ffmpeg1/libavcodec/x86/vp8dsp_init.c deleted file mode 100644 index 09e2d91..0000000 --- a/ffmpeg1/libavcodec/x86/vp8dsp_init.c +++ /dev/null @@ -1,442 +0,0 @@ -/* - * VP8 DSP functions x86-optimized - * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> - * Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/cpu.h" -#include "libavutil/mem.h" -#include "libavutil/x86/asm.h" -#include "libavcodec/vp8dsp.h" - -#if HAVE_YASM - -/* - * MC functions - */ -extern void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); - -extern void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel8_h6_sse2 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel8_v4_sse2 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel8_v6_sse2 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); - -extern void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void 
ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); - -extern void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); - -extern void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); - - -extern void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); - -#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \ -static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \ - uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \ - ptrdiff_t srcstride, int height, int mx, int my) \ -{ \ - ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ - dst, dststride, src, srcstride, height, mx, my); \ - ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ - dst + 8, dststride, src + 8, srcstride, height, mx, my); \ -} -#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \ -static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ - uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \ - ptrdiff_t srcstride, int height, int mx, int my) \ -{ \ - ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \ - dst, dststride, src, srcstride, height, mx, my); \ - ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \ - dst + 4, dststride, src + 4, srcstride, height, mx, my); \ -} - -#if ARCH_X86_32 -TAP_W8 (mmxext, epel, h4) -TAP_W8 (mmxext, epel, h6) -TAP_W16(mmxext, epel, h6) -TAP_W8 (mmxext, epel, v4) -TAP_W8 (mmxext, epel, v6) -TAP_W16(mmxext, epel, v6) -TAP_W8 (mmxext, bilinear, h) -TAP_W16(mmxext, bilinear, h) -TAP_W8 (mmxext, bilinear, v) -TAP_W16(mmxext, bilinear, v) -#endif - -TAP_W16(sse2, epel, h6) -TAP_W16(sse2, epel, v6) -TAP_W16(sse2, bilinear, h) -TAP_W16(sse2, bilinear, v) - -TAP_W16(ssse3, epel, h6) -TAP_W16(ssse3, epel, v6) -TAP_W16(ssse3, bilinear, h) -TAP_W16(ssse3, bilinear, v) - -#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \ -static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \ - uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \ - ptrdiff_t srcstride, int height, int mx, int my) \ -{ \ - DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \ - uint8_t *tmpptr = tmp + SIZE * 
(TAPNUMY / 2 - 1); \ - src -= srcstride * (TAPNUMY / 2 - 1); \ - ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \ - tmp, SIZE, src, srcstride, height + TAPNUMY - 1, mx, my); \ - ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \ - dst, dststride, tmpptr, SIZE, height, mx, my); \ -} - -#if ARCH_X86_32 -#define HVTAPMMX(x, y) \ -HVTAP(mmxext, 8, x, y, 4, 8) \ -HVTAP(mmxext, 8, x, y, 8, 16) - -HVTAP(mmxext, 8, 6, 6, 16, 16) -#else -#define HVTAPMMX(x, y) \ -HVTAP(mmxext, 8, x, y, 4, 8) -#endif - -HVTAPMMX(4, 4) -HVTAPMMX(4, 6) -HVTAPMMX(6, 4) -HVTAPMMX(6, 6) - -#define HVTAPSSE2(x, y, w) \ -HVTAP(sse2, 16, x, y, w, 16) \ -HVTAP(ssse3, 16, x, y, w, 16) - -HVTAPSSE2(4, 4, 8) -HVTAPSSE2(4, 6, 8) -HVTAPSSE2(6, 4, 8) -HVTAPSSE2(6, 6, 8) -HVTAPSSE2(6, 6, 16) - -HVTAP(ssse3, 16, 4, 4, 4, 8) -HVTAP(ssse3, 16, 4, 6, 4, 8) -HVTAP(ssse3, 16, 6, 4, 4, 8) -HVTAP(ssse3, 16, 6, 6, 4, 8) - -#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \ -static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \ - uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \ - ptrdiff_t srcstride, int height, int mx, int my) \ -{ \ - DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \ - ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \ - tmp, SIZE, src, srcstride, height + 1, mx, my); \ - ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \ - dst, dststride, tmp, SIZE, height, mx, my); \ -} - -HVBILIN(mmxext, 8, 4, 8) -#if ARCH_X86_32 -HVBILIN(mmxext, 8, 8, 16) -HVBILIN(mmxext, 8, 16, 16) -#endif -HVBILIN(sse2, 8, 8, 16) -HVBILIN(sse2, 8, 16, 16) -HVBILIN(ssse3, 8, 4, 8) -HVBILIN(ssse3, 8, 8, 16) -HVBILIN(ssse3, 8, 16, 16) - -extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16], - ptrdiff_t stride); -extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16], - ptrdiff_t stride); -extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16], - ptrdiff_t stride); -extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16], - ptrdiff_t stride); -extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16], - ptrdiff_t stride); -extern void ff_vp8_luma_dc_wht_mmx(int16_t block[4][4][16], int16_t dc[16]); -extern void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]); -extern void ff_vp8_idct_add_mmx(uint8_t *dst, int16_t block[16], - ptrdiff_t stride); -extern void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16], - ptrdiff_t stride); - -#define DECLARE_LOOP_FILTER(NAME)\ -extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \ - ptrdiff_t stride, \ - int flim);\ -extern void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \ - ptrdiff_t stride, \ - int flim);\ -extern void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \ - ptrdiff_t stride,\ - int e, int i, int hvt);\ -extern void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, \ - ptrdiff_t stride,\ - int e, int i, int hvt);\ -extern void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \ - uint8_t *dstV,\ - ptrdiff_t s, \ - int e, int i, int hvt);\ -extern void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \ - uint8_t *dstV,\ - ptrdiff_t s, \ - int e, int i, int hvt);\ -extern void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \ - ptrdiff_t stride,\ - int e, int i, int hvt);\ -extern void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \ - ptrdiff_t stride,\ - int e, int i, int hvt);\ -extern void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \ - uint8_t *dstV,\ - ptrdiff_t s, \ - int e, int i, int hvt);\ -extern void 
-
-extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16],
-                                   ptrdiff_t stride);
-extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16],
-                                    ptrdiff_t stride);
-extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16],
-                                     ptrdiff_t stride);
-extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16],
-                                      ptrdiff_t stride);
-extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16],
-                                      ptrdiff_t stride);
-extern void ff_vp8_luma_dc_wht_mmx(int16_t block[4][4][16], int16_t dc[16]);
-extern void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]);
-extern void ff_vp8_idct_add_mmx(uint8_t *dst, int16_t block[16],
-                                ptrdiff_t stride);
-extern void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16],
-                                ptrdiff_t stride);
-
-#define DECLARE_LOOP_FILTER(NAME) \
-extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \
-                                                 ptrdiff_t stride, \
-                                                 int flim); \
-extern void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \
-                                                 ptrdiff_t stride, \
-                                                 int flim); \
-extern void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
-                                                    ptrdiff_t stride, \
-                                                    int e, int i, int hvt); \
-extern void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
-                                                    ptrdiff_t stride, \
-                                                    int e, int i, int hvt); \
-extern void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
-                                                    uint8_t *dstV, \
-                                                    ptrdiff_t s, \
-                                                    int e, int i, int hvt); \
-extern void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
-                                                    uint8_t *dstV, \
-                                                    ptrdiff_t s, \
-                                                    int e, int i, int hvt); \
-extern void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
-                                                    ptrdiff_t stride, \
-                                                    int e, int i, int hvt); \
-extern void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
-                                                    ptrdiff_t stride, \
-                                                    int e, int i, int hvt); \
-extern void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
-                                                    uint8_t *dstV, \
-                                                    ptrdiff_t s, \
-                                                    int e, int i, int hvt); \
-extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
-                                                    uint8_t *dstV, \
-                                                    ptrdiff_t s, \
-                                                    int e, int i, int hvt);
-
-DECLARE_LOOP_FILTER(mmx)
-DECLARE_LOOP_FILTER(mmxext)
-DECLARE_LOOP_FILTER(sse2)
-DECLARE_LOOP_FILTER(ssse3)
-DECLARE_LOOP_FILTER(sse4)
-
-#endif /* HAVE_YASM */
-
-#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
-    c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
-    c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
-    c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT
-
-#define VP8_MC_FUNC(IDX, SIZE, OPT) \
-    c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
-    c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
-    c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
-    c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
-    c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
-    VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)
-
-#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
-    c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
-    c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
-    c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
-    c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
-    c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
-    c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
-    c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
-    c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT
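The VP8_LUMA_MC_FUNC/VP8_MC_FUNC/VP8_BILINEAR_MC_FUNC macros above fill a three-dimensional function-pointer table indexed by [block-size index][vertical filter class][horizontal filter class], where class 0 is a copy, 1 a 4-tap filter and 2 a 6-tap filter. A rough standalone sketch of that table-filling pattern, with stub functions in place of the assembly (MCContext and the stub names are illustrative, not the real VP8DSPContext layout):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef void (*mc_fn)(uint8_t *dst, ptrdiff_t dststride,
                          const uint8_t *src, ptrdiff_t srcstride,
                          int height, int mx, int my);

    typedef struct MCContext {
        /* [size: 0=16x16, 1=8x8, 2=4x4][v class][h class] */
        mc_fn put_epel_tab[3][3][3];
    } MCContext;

    #define STUB(name)                                                 \
        static void name(uint8_t *dst, ptrdiff_t dststride,            \
                         const uint8_t *src, ptrdiff_t srcstride,      \
                         int height, int mx, int my)                   \
        {                                                              \
            (void)dst; (void)dststride; (void)src; (void)srcstride;    \
            (void)height; (void)mx; (void)my;                          \
            puts(#name);                                               \
        }

    STUB(put_pixels8_c)
    STUB(put_epel8_h4_c)
    STUB(put_epel8_v6_c)
    STUB(put_epel8_h4v6_c)

    /* Mirrors the shape of VP8_MC_FUNC(IDX, SIZE, OPT): each entry names
     * the horizontal/vertical filter combination it serves. */
    #define MC_FUNC(ctx, IDX)                                    \
        do {                                                     \
            (ctx)->put_epel_tab[IDX][0][0] = put_pixels8_c;      \
            (ctx)->put_epel_tab[IDX][0][1] = put_epel8_h4_c;     \
            (ctx)->put_epel_tab[IDX][2][0] = put_epel8_v6_c;     \
            (ctx)->put_epel_tab[IDX][2][1] = put_epel8_h4v6_c;   \
        } while (0)

    int main(void)
    {
        MCContext c = { 0 };
        MC_FUNC(&c, 1);                                     /* fill the 8x8 row */
        c.put_epel_tab[1][2][1](NULL, 0, NULL, 0, 8, 1, 2); /* 4-tap h + 6-tap v */
        return 0;
    }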
-
-
-av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
-{
-#if HAVE_YASM
-    int mm_flags = av_get_cpu_flags();
-
-    if (mm_flags & AV_CPU_FLAG_MMX) {
-        c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
-        c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
-#if ARCH_X86_32
-        c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
-        c->vp8_idct_add = ff_vp8_idct_add_mmx;
-        c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
-        c->put_vp8_epel_pixels_tab[0][0][0] =
-        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
-#endif
-        c->put_vp8_epel_pixels_tab[1][0][0] =
-        c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
-
-#if ARCH_X86_32
-        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
-        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
-
-        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
-        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
-        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
-        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;
-
-        c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx;
-        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx;
-        c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx;
-        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx;
-#endif
-    }
-
-    /* note that 4-tap width=16 functions are missing because w=16
-     * is only used for luma, and luma is always a copy or sixtap. */
-    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
-        VP8_MC_FUNC(2, 4, mmxext);
-        VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
-#if ARCH_X86_32
-        VP8_LUMA_MC_FUNC(0, 16, mmxext);
-        VP8_MC_FUNC(1, 8, mmxext);
-        VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
-        VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
-
-        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
-        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
-
-        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
-        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
-        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
-        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;
-
-        c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
-        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
-        c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
-        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
-#endif
-    }
-
-    if (mm_flags & AV_CPU_FLAG_SSE) {
-        c->vp8_idct_add = ff_vp8_idct_add_sse;
-        c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
-        c->put_vp8_epel_pixels_tab[0][0][0] =
-        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
-    }
-
-    if (mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) {
-        VP8_LUMA_MC_FUNC(0, 16, sse2);
-        VP8_MC_FUNC(1, 8, sse2);
-        VP8_BILINEAR_MC_FUNC(0, 16, sse2);
-        VP8_BILINEAR_MC_FUNC(1, 8, sse2);
-
-        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
-
-        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
-        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
-
-        c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
-        c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
-    }
-
-    if (mm_flags & AV_CPU_FLAG_SSE2) {
-        c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
-
-        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
-
-        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
-        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
-
-        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
-        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
-    }
-
-    if (mm_flags & AV_CPU_FLAG_SSSE3) {
-        VP8_LUMA_MC_FUNC(0, 16, ssse3);
-        VP8_MC_FUNC(1, 8, ssse3);
-        VP8_MC_FUNC(2, 4, ssse3);
-        VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
-        VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
-        VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
-
-        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
-        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
-
-        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
-        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
-        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
-        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
-
-        c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
-        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
-        c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
-        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
-    }
-
-    if (mm_flags & AV_CPU_FLAG_SSE4) {
-        c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
-
-        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
-        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
-        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
-    }
-#endif /* HAVE_YASM */
-}
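The init function above follows the usual libavcodec dispatch pattern: query the CPU feature flags once, then let each successively more capable instruction-set block overwrite the pointers installed by the previous one, so every entry ends up holding the best routine the host supports. A self-contained sketch of that pattern, with made-up flag constants and a detect_cpu_flags() stub standing in for libavutil's av_get_cpu_flags()/AV_CPU_FLAG_*:

    #include <stdio.h>

    /* Illustrative flag values, not libavutil's. */
    #define FLAG_MMX   0x01
    #define FLAG_SSE2  0x02
    #define FLAG_SSSE3 0x04

    typedef struct DSPContext { void (*idct_dc_add)(void); } DSPContext;

    static void idct_dc_add_c(void)     { puts("C");     }
    static void idct_dc_add_mmx(void)   { puts("MMX");   }
    static void idct_dc_add_sse2(void)  { puts("SSE2");  }
    static void idct_dc_add_ssse3(void) { puts("SSSE3"); }

    static int detect_cpu_flags(void)
    {
        /* Pretend the host supports everything up to SSE2. */
        return FLAG_MMX | FLAG_SSE2;
    }

    static void dsp_init(DSPContext *c)
    {
        int flags = detect_cpu_flags();

        c->idct_dc_add = idct_dc_add_c;   /* portable baseline */
        /* Each better ISA level overwrites the previous assignment. */
        if (flags & FLAG_MMX)   c->idct_dc_add = idct_dc_add_mmx;
        if (flags & FLAG_SSE2)  c->idct_dc_add = idct_dc_add_sse2;
        if (flags & FLAG_SSSE3) c->idct_dc_add = idct_dc_add_ssse3;
    }

    int main(void)
    {
        DSPContext c;
        dsp_init(&c);
        c.idct_dc_add();   /* prints "SSE2" for the flags simulated above */
        return 0;
    }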
diff --git a/ffmpeg1/libavcodec/x86/w64xmmtest.c b/ffmpeg1/libavcodec/x86/w64xmmtest.c
deleted file mode 100644
index f6e3de9..0000000
--- a/ffmpeg1/libavcodec/x86/w64xmmtest.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * check XMM registers for clobbers on Win64
- * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/avcodec.h"
-#include "libavutil/x86/w64xmmtest.h"
-
-wrap(avcodec_open2(AVCodecContext *avctx,
-                   AVCodec *codec,
-                   AVDictionary **options))
-{
-    testxmmclobbers(avcodec_open2, avctx, codec, options);
-}
-
-wrap(avcodec_decode_audio4(AVCodecContext *avctx,
-                           AVFrame *frame,
-                           int *got_frame_ptr,
-                           AVPacket *avpkt))
-{
-    testxmmclobbers(avcodec_decode_audio4, avctx, frame,
-                    got_frame_ptr, avpkt);
-}
-
-wrap(avcodec_decode_video2(AVCodecContext *avctx,
-                           AVFrame *picture,
-                           int *got_picture_ptr,
-                           AVPacket *avpkt))
-{
-    testxmmclobbers(avcodec_decode_video2, avctx, picture,
-                    got_picture_ptr, avpkt);
-}
-
-wrap(avcodec_decode_subtitle2(AVCodecContext *avctx,
-                              AVSubtitle *sub,
-                              int *got_sub_ptr,
-                              AVPacket *avpkt))
-{
-    testxmmclobbers(avcodec_decode_subtitle2, avctx, sub,
-                    got_sub_ptr, avpkt);
-}
-
-wrap(avcodec_encode_audio2(AVCodecContext *avctx,
-                           AVPacket *avpkt,
-                           const AVFrame *frame,
-                           int *got_packet_ptr))
-{
-    testxmmclobbers(avcodec_encode_audio2, avctx, avpkt, frame,
-                    got_packet_ptr);
-}
-
-wrap(avcodec_encode_video(AVCodecContext *avctx,
-                          uint8_t *buf, int buf_size,
-                          const AVFrame *pict))
-{
-    testxmmclobbers(avcodec_encode_video, avctx, buf, buf_size, pict);
-}
-
-wrap(avcodec_encode_subtitle(AVCodecContext *avctx,
-                             uint8_t *buf, int buf_size,
-                             const AVSubtitle *sub))
-{
-    testxmmclobbers(avcodec_encode_subtitle, avctx, buf, buf_size, sub);
-}
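w64xmmtest.c above wraps the public libavcodec entry points so that each call can be checked for clobbering of the XMM registers the Win64 calling convention requires callees to preserve; the actual register capture lives in the included libavutil/x86/w64xmmtest.h. A much-simplified sketch of the wrap-and-compare idea, with an ordinary variable standing in for the callee-saved register state (the real test reads XMM6-XMM15 via inline assembly, which is omitted here):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Simulated "callee-saved" state. */
    static uint64_t saved_state[4];

    static void capture_state(uint64_t out[4])
    {
        memcpy(out, saved_state, sizeof saved_state);
    }

    /* Snapshot the state, run the call, snapshot again, and fail loudly
     * if the callee changed anything it was supposed to preserve. */
    #define WRAP_AND_CHECK(call)                                      \
        do {                                                          \
            uint64_t before[4], after[4];                             \
            capture_state(before);                                    \
            call;                                                     \
            capture_state(after);                                     \
            assert(!memcmp(before, after, sizeof before) &&           \
                   "callee-saved state clobbered by " #call);         \
        } while (0)

    static void well_behaved(void) { /* leaves saved_state alone */ }
    static void misbehaving(void)  { saved_state[2] ^= 1; /* simulated clobber */ }

    int main(void)
    {
        (void)misbehaving;                 /* wrapping it would trip the assert */
        WRAP_AND_CHECK(well_behaved());    /* passes */
        printf("no clobbers detected\n");
        return 0;
    }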
