Diffstat (limited to 'ffmpeg/libavcodec/x86')
74 files changed, 2027 insertions, 5790 deletions
diff --git a/ffmpeg/libavcodec/x86/Makefile b/ffmpeg/libavcodec/x86/Makefile index 38ef867..2d2d5a0 100644 --- a/ffmpeg/libavcodec/x86/Makefile +++ b/ffmpeg/libavcodec/x86/Makefile @@ -1,11 +1,18 @@ -OBJS += x86/fmtconvert_init.o \ - x86/constants.o +OBJS += x86/constants.o \ + x86/fmtconvert_init.o \ OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o +OBJS-$(CONFIG_DCT) += x86/dct_init.o OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o +OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_init.o \ + x86/dsputil_x86.o +OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o \ + x86/fdct.o \ + x86/motion_est.o OBJS-$(CONFIG_FFT) += x86/fft_init.o +OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o @@ -13,7 +20,7 @@ OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o OBJS-$(CONFIG_LPC) += x86/lpc.o OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o -OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodec.o +OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o @@ -28,33 +35,41 @@ OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_init.o OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o -OBJS-$(CONFIG_VP5_DECODER) += x86/vp56dsp_init.o -OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp_init.o +OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp_init.o +OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o +OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o \ - x86/fdct.o \ + x86/fpel_mmx.o \ x86/idct_mmx_xvid.o \ x86/idct_sse2_xvid.o \ - x86/simple_idct.o \ - -MMX-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o \ - x86/motion_est.o + x86/rnd_mmx.o \ + x86/simple_idct.o MMX-OBJS-$(CONFIG_DIRAC_DECODER) += x86/dirac_dwt.o +MMX-OBJS-$(CONFIG_HPELDSP) += x86/fpel_mmx.o \ + x86/hpeldsp_mmx.o \ + x86/rnd_mmx.o MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o +YASM-OBJS += x86/deinterlace.o \ + x86/fmtconvert.o \ + YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o YASM-OBJS-$(CONFIG_DCT) += x86/dct32.o YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_mmx.o x86/diracdsp_yasm.o\ x86/dwt_yasm.o +YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \ + x86/fpel.o \ + x86/mpeg4qpel.o \ + x86/qpel.o YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o YASM-OBJS-$(CONFIG_FFT) += x86/fft.o -YASM-OBJS-$(CONFIG_H263_DECODER) += x86/h263_loopfilter.o -YASM-OBJS-$(CONFIG_H263_ENCODER) += x86/h263_loopfilter.o +YASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \ x86/h264_chromamc_10bit.o YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \ @@ -67,10 +82,10 @@ YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \ x86/h264_intrapred_10bit.o YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \ x86/h264_qpel_10bit.o \ - x86/qpelbase.o \ - x86/fpelbase.o -YASM-OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp.o \ - x86/fpelbase.o + x86/fpel.o \ + x86/qpel.o +YASM-OBJS-$(CONFIG_HPELDSP) += 
x86/fpel.o \ + x86/hpeldsp.o YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o @@ -83,13 +98,9 @@ YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o -YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp.o -YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o - -YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \ - x86/mpeg4qpel.o \ - x86/qpelbase.o \ - x86/fpelbase.o - -YASM-OBJS += x86/deinterlace.o \ - x86/fmtconvert.o +YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o +YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o \ + x86/vp8dsp_loopfilter.o +YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9itxfm.o \ + x86/vp9mc.o +YASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o diff --git a/ffmpeg/libavcodec/x86/ac3dsp.asm b/ffmpeg/libavcodec/x86/ac3dsp.asm index 98fb446..89a64f5 100644 --- a/ffmpeg/libavcodec/x86/ac3dsp.asm +++ b/ffmpeg/libavcodec/x86/ac3dsp.asm @@ -379,42 +379,6 @@ cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum %endif %endmacro -%if HAVE_AMD3DNOW_EXTERNAL -INIT_MMX 3dnow -cglobal ac3_extract_exponents, 3, 3, 0, exp, coef, len - add expq, lenq - lea coefq, [coefq+4*lenq] - neg lenq - movq m3, [pd_1] - movq m4, [pd_151] -.loop: - movq m0, [coefq+4*lenq ] - movq m1, [coefq+4*lenq+8] - PABSD m0, m2 - PABSD m1, m2 - pslld m0, 1 - por m0, m3 - pi2fd m2, m0 - psrld m2, 23 - movq m0, m4 - psubd m0, m2 - pslld m1, 1 - por m1, m3 - pi2fd m2, m1 - psrld m2, 23 - movq m1, m4 - psubd m1, m2 - packssdw m0, m0 - packuswb m0, m0 - packssdw m1, m1 - packuswb m1, m1 - punpcklwd m0, m1 - movd [expq+lenq], m0 - add lenq, 4 - jl .loop - REP_RET -%endif - %macro AC3_EXTRACT_EXPONENTS 0 cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len add expq, lenq diff --git a/ffmpeg/libavcodec/x86/ac3dsp_init.c b/ffmpeg/libavcodec/x86/ac3dsp_init.c index e2a190e..5819d00 100644 --- a/ffmpeg/libavcodec/x86/ac3dsp_init.c +++ b/ffmpeg/libavcodec/x86/ac3dsp_init.c @@ -22,34 +22,47 @@ #include "libavutil/mem.h" #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" -#include "dsputil_mmx.h" +#include "dsputil_x86.h" #include "libavcodec/ac3.h" #include "libavcodec/ac3dsp.h" -extern void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int nb_coefs); -extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs); -extern void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs); +void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int nb_coefs); +void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs); +void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs); -extern int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len); -extern int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len); -extern int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len); -extern int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len); +int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len); +int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len); +int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len); +int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len); -extern void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift); -extern void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int 
len, unsigned int shift); +void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift); +void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift); -extern void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift); -extern void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift); +void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift); +void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift); -extern void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len); -extern void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len); -extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len); +void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len); +void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len); +void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len); -extern int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]); +int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]); -extern void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs); -extern void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs); -extern void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs); +void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs); +void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs); +void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs); + +void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input, + const int16_t *window, unsigned int len); +void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input, + const int16_t *window, unsigned int len); +void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input, + const int16_t *window, unsigned int len); +void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input, + const int16_t *window, unsigned int len); +void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input, + const int16_t *window, unsigned int len); +void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input, + const int16_t *window, unsigned int len); #if ARCH_X86_32 && defined(__INTEL_COMPILER) # undef HAVE_7REGS @@ -185,47 +198,59 @@ static void ac3_downmix_sse(float **samples, float (*matrix)[2], av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact) { - int mm_flags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_MMX(mm_flags)) { + if (EXTERNAL_MMX(cpu_flags)) { c->ac3_exponent_min = ff_ac3_exponent_min_mmx; c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx; c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx; c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx; } - if (EXTERNAL_AMD3DNOW(mm_flags)) { - c->extract_exponents = ff_ac3_extract_exponents_3dnow; + if (EXTERNAL_AMD3DNOW(cpu_flags)) { if (!bit_exact) { c->float_to_fixed24 = ff_float_to_fixed24_3dnow; } } - if (EXTERNAL_MMXEXT(mm_flags)) { + if (EXTERNAL_MMXEXT(cpu_flags)) { c->ac3_exponent_min = ff_ac3_exponent_min_mmxext; c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext; + if (bit_exact) { + c->apply_window_int16 = ff_apply_window_int16_mmxext; + } else { + c->apply_window_int16 = ff_apply_window_int16_round_mmxext; + } } - if (EXTERNAL_SSE(mm_flags)) 
{ + if (EXTERNAL_SSE(cpu_flags)) { c->float_to_fixed24 = ff_float_to_fixed24_sse; } - if (EXTERNAL_SSE2(mm_flags)) { + if (EXTERNAL_SSE2(cpu_flags)) { c->ac3_exponent_min = ff_ac3_exponent_min_sse2; c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2; c->float_to_fixed24 = ff_float_to_fixed24_sse2; c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2; c->extract_exponents = ff_ac3_extract_exponents_sse2; - if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) { + if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) { c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2; c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2; } + if (bit_exact) { + c->apply_window_int16 = ff_apply_window_int16_sse2; + } else if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) { + c->apply_window_int16 = ff_apply_window_int16_round_sse2; + } } - if (EXTERNAL_SSSE3(mm_flags)) { + if (EXTERNAL_SSSE3(cpu_flags)) { c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3; - if (!(mm_flags & AV_CPU_FLAG_ATOM)) { + if (cpu_flags & AV_CPU_FLAG_ATOM) { + c->apply_window_int16 = ff_apply_window_int16_ssse3_atom; + } else { c->extract_exponents = ff_ac3_extract_exponents_ssse3; + c->apply_window_int16 = ff_apply_window_int16_ssse3; } } #if HAVE_SSE_INLINE && HAVE_7REGS - if (INLINE_SSE(mm_flags)) { + if (INLINE_SSE(cpu_flags)) { c->downmix = ac3_downmix_sse; } #endif diff --git a/ffmpeg/libavcodec/x86/cabac.h b/ffmpeg/libavcodec/x86/cabac.h index 2c9f77e..558d287 100644 --- a/ffmpeg/libavcodec/x86/cabac.h +++ b/ffmpeg/libavcodec/x86/cabac.h @@ -1,20 +1,20 @@ /* * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -27,8 +27,27 @@ #include "libavutil/internal.h" #include "config.h" +#if (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\ + || ( !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1) +# define BROKEN_COMPILER 1 +#else +# define BROKEN_COMPILER 0 +#endif + #if HAVE_INLINE_ASM +#ifndef UNCHECKED_BITSTREAM_READER +#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER +#endif + +#if UNCHECKED_BITSTREAM_READER +#define END_CHECK(end) "" +#else +#define END_CHECK(end) \ + "cmp "end" , %%"REG_c" \n\t"\ + "jge 1f \n\t" +#endif + #ifdef BROKEN_RELOCATIONS #define TABLES_ARG , "r"(tables) @@ -73,7 +92,9 @@ "test "lowword" , "lowword" \n\t"\ "jnz 2f \n\t"\ "mov "byte" , %%"REG_c" \n\t"\ + END_CHECK(end)\ "add"OPSIZE" $2 , "byte" \n\t"\ + "1: \n\t"\ "movzwl (%%"REG_c") , "tmp" \n\t"\ "lea -1("low") , %%ecx \n\t"\ "xor "low" , %%ecx \n\t"\ @@ -132,7 +153,9 @@ "test "lowword" , "lowword" \n\t"\ " jnz 2f \n\t"\ "mov "byte" , %%"REG_c" \n\t"\ + END_CHECK(end)\ "add"OPSIZE" $2 , "byte" \n\t"\ + "1: \n\t"\ "movzwl (%%"REG_c") , "tmp" \n\t"\ "lea -1("low") , %%ecx \n\t"\ "xor "low" , %%ecx \n\t"\ @@ -149,9 +172,7 @@ #endif /* BROKEN_RELOCATIONS */ - -#if HAVE_7REGS && !(defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\ - && !( !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1) +#if HAVE_7REGS && !BROKEN_COMPILER #define get_cabac_inline get_cabac_inline_x86 static av_always_inline int get_cabac_inline_x86(CABACContext *c, uint8_t *const state) @@ -186,6 +207,7 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c, } #endif /* HAVE_7REGS */ +#if !BROKEN_COMPILER #define get_cabac_bypass_sign get_cabac_bypass_sign_x86 static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val) { @@ -208,9 +230,16 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val) "movzwl (%1), %%edx \n\t" "bswap %%edx \n\t" "shrl $15, %%edx \n\t" +#if UNCHECKED_BITSTREAM_READER "add $2, %1 \n\t" "addl %%edx, %%eax \n\t" "mov %1, %c4(%2) \n\t" +#else + "addl %%edx, %%eax \n\t" + "cmp %c5(%2), %1 \n\t" + "jge 1f \n\t" + "add"OPSIZE" $2, %c4(%2) \n\t" +#endif "1: \n\t" "movl %%eax, %c3(%2) \n\t" @@ -225,5 +254,46 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val) return val; } +#define get_cabac_bypass get_cabac_bypass_x86 +static av_always_inline int get_cabac_bypass_x86(CABACContext *c) +{ + x86_reg tmp; + int res; + __asm__ volatile( + "movl %c6(%2), %k1 \n\t" + "movl %c3(%2), %%eax \n\t" + "shl $17, %k1 \n\t" + "add %%eax, %%eax \n\t" + "sub %k1, %%eax \n\t" + "cltd \n\t" + "and %%edx, %k1 \n\t" + "add %k1, %%eax \n\t" + "inc %%edx \n\t" + "test %%ax, %%ax \n\t" + "jnz 1f \n\t" + "mov %c4(%2), %1 \n\t" + "subl $0xFFFF, %%eax \n\t" + "movzwl (%1), %%ecx \n\t" + "bswap %%ecx \n\t" + "shrl $15, %%ecx \n\t" + "addl %%ecx, %%eax \n\t" + "cmp %c5(%2), %1 \n\t" + "jge 1f \n\t" + "add"OPSIZE" $2, %c4(%2) \n\t" + "1: \n\t" + "movl %%eax, %c3(%2) \n\t" + + : "=&d"(res), "=&r"(tmp) + : "r"(c), + "i"(offsetof(CABACContext, low)), + 
"i"(offsetof(CABACContext, bytestream)), + "i"(offsetof(CABACContext, bytestream_end)), + "i"(offsetof(CABACContext, range)) + : "%eax", "%ecx", "memory" + ); + return res; +} +#endif /* !BROKEN_COMPILER */ + #endif /* HAVE_INLINE_ASM */ #endif /* AVCODEC_X86_CABAC_H */ diff --git a/ffmpeg/libavcodec/x86/cavsdsp.c b/ffmpeg/libavcodec/x86/cavsdsp.c index deeb5cf..aaa09d1 100644 --- a/ffmpeg/libavcodec/x86/cavsdsp.c +++ b/ffmpeg/libavcodec/x86/cavsdsp.c @@ -28,10 +28,11 @@ #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" #include "libavcodec/cavsdsp.h" -#include "dsputil_mmx.h" +#include "constants.h" +#include "dsputil_x86.h" #include "config.h" -#if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) +#if HAVE_MMX_INLINE /* in/out: mma=mma+mmb, mmb=mmb-mma */ #define SUMSUB_BA( a, b ) \ @@ -122,6 +123,17 @@ static inline void cavs_idct8_1d(int16_t *block, uint64_t bias) ); } +#define SBUTTERFLY(a,b,t,n,m)\ + "mov" #m " " #a ", " #t " \n\t" /* abcd */\ + "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ + "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\ + +#define TRANSPOSE4(a,b,c,d,t)\ + SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\ + SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\ + SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\ + SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */ + static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) { int i; @@ -187,6 +199,10 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) ff_add_pixels_clamped_mmx(b2, dst, stride); } +#endif /* HAVE_MMX_INLINE */ + +#if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) + /***************************************************************************** * * motion compensation @@ -409,22 +425,22 @@ static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstS }\ #define CAVS_MC(OPNAME, SIZE, MMX) \ -static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ +static void OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ {\ OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\ }\ \ -static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ +static void OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ {\ OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\ }\ \ -static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ +static void OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ {\ OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\ }\ \ -static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ +static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ {\ OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\ }\ @@ -441,6 +457,50 @@ static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, ui #endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */ +#if HAVE_MMX_INLINE +static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src, + ptrdiff_t stride) +{ + ff_put_pixels8_mmx(dst, src, stride, 8); +} + +static void avg_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src, + ptrdiff_t stride) +{ + ff_avg_pixels8_mmx(dst, src, stride, 8); +} + +static void put_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src, + ptrdiff_t 
stride) +{ + ff_put_pixels16_mmx(dst, src, stride, 16); +} + +static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src, + ptrdiff_t stride) +{ + ff_avg_pixels16_mmx(dst, src, stride, 16); +} + +static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c, + AVCodecContext *avctx) +{ + c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx; + c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx; + c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx; + c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx; + + c->cavs_idct8_add = cavs_idct8_add_mmx; + c->idct_perm = FF_TRANSPOSE_IDCT_PERM; +} +#endif /* HAVE_MMX_INLINE */ + +#define DSPFUNC(PFX, IDX, NUM, EXT) \ + c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \ + c->PFX ## _cavs_qpel_pixels_tab[IDX][ 4] = PFX ## _cavs_qpel ## NUM ## _mc01_ ## EXT; \ + c->PFX ## _cavs_qpel_pixels_tab[IDX][ 8] = PFX ## _cavs_qpel ## NUM ## _mc02_ ## EXT; \ + c->PFX ## _cavs_qpel_pixels_tab[IDX][12] = PFX ## _cavs_qpel ## NUM ## _mc03_ ## EXT; \ + #if HAVE_MMXEXT_INLINE QPEL_CAVS(put_, PUT_OP, mmxext) QPEL_CAVS(avg_, AVG_MMXEXT_OP, mmxext) @@ -450,23 +510,13 @@ CAVS_MC(put_, 16, mmxext) CAVS_MC(avg_, 8, mmxext) CAVS_MC(avg_, 16, mmxext) -static av_cold void ff_cavsdsp_init_mmxext(CAVSDSPContext *c, - AVCodecContext *avctx) +static av_cold void cavsdsp_init_mmxext(CAVSDSPContext *c, + AVCodecContext *avctx) { -#define dspfunc(PFX, IDX, NUM) \ - c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmxext; \ - c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_mmxext; \ - c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_mmxext; \ - c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_mmxext; \ - c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_mmxext; \ - - dspfunc(put_cavs_qpel, 0, 16); - dspfunc(put_cavs_qpel, 1, 8); - dspfunc(avg_cavs_qpel, 0, 16); - dspfunc(avg_cavs_qpel, 1, 8); -#undef dspfunc - c->cavs_idct8_add = cavs_idct8_add_mmx; - c->idct_perm = FF_TRANSPOSE_IDCT_PERM; + DSPFUNC(put, 0, 16, mmxext); + DSPFUNC(put, 1, 8, mmxext); + DSPFUNC(avg, 0, 16, mmxext); + DSPFUNC(avg, 1, 8, mmxext); } #endif /* HAVE_MMXEXT_INLINE */ @@ -479,34 +529,30 @@ CAVS_MC(put_, 16,3dnow) CAVS_MC(avg_, 8, 3dnow) CAVS_MC(avg_, 16,3dnow) -static av_cold void ff_cavsdsp_init_3dnow(CAVSDSPContext *c, - AVCodecContext *avctx) +static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c, + AVCodecContext *avctx) { -#define dspfunc(PFX, IDX, NUM) \ - c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmxext; \ - c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \ - c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \ - c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_3dnow; \ - c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_3dnow; \ - - dspfunc(put_cavs_qpel, 0, 16); - dspfunc(put_cavs_qpel, 1, 8); - dspfunc(avg_cavs_qpel, 0, 16); - dspfunc(avg_cavs_qpel, 1, 8); -#undef dspfunc - c->cavs_idct8_add = cavs_idct8_add_mmx; - c->idct_perm = FF_TRANSPOSE_IDCT_PERM; + DSPFUNC(put, 0, 16, 3dnow); + DSPFUNC(put, 1, 8, 3dnow); + DSPFUNC(avg, 0, 16, 3dnow); + DSPFUNC(avg, 1, 8, 3dnow); } #endif /* HAVE_AMD3DNOW_INLINE */ av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx) { - int mm_flags = av_get_cpu_flags(); +#if HAVE_MMX_INLINE + int cpu_flags = av_get_cpu_flags(); -#if HAVE_MMXEXT_INLINE - if (mm_flags & AV_CPU_FLAG_MMXEXT) ff_cavsdsp_init_mmxext(c, avctx); -#endif /* HAVE_MMXEXT_INLINE */ + 
if (INLINE_MMX(cpu_flags)) + cavsdsp_init_mmx(c, avctx); +#endif /* HAVE_MMX_INLINE */ #if HAVE_AMD3DNOW_INLINE - if (mm_flags & AV_CPU_FLAG_3DNOW) ff_cavsdsp_init_3dnow(c, avctx); + if (INLINE_AMD3DNOW(cpu_flags)) + cavsdsp_init_3dnow(c, avctx); #endif /* HAVE_AMD3DNOW_INLINE */ +#if HAVE_MMXEXT_INLINE + if (INLINE_MMXEXT(cpu_flags)) + cavsdsp_init_mmxext(c, avctx); +#endif /* HAVE_MMXEXT_INLINE */ } diff --git a/ffmpeg/libavcodec/x86/constants.c b/ffmpeg/libavcodec/x86/constants.c index 821d73f..3bba80b 100644 --- a/ffmpeg/libavcodec/x86/constants.c +++ b/ffmpeg/libavcodec/x86/constants.c @@ -20,6 +20,9 @@ #include "libavutil/mem.h" #include "libavutil/x86/asm.h" // for xmm_reg +#include "constants.h" + +DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL }; @@ -28,12 +31,23 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x000 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL }; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL }; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL }; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL }; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL }; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL; diff --git a/ffmpeg/libavcodec/x86/dirac_dwt.c b/ffmpeg/libavcodec/x86/dirac_dwt.c index fbb25a4..04c514f 100644 --- a/ffmpeg/libavcodec/x86/dirac_dwt.c +++ b/ffmpeg/libavcodec/x86/dirac_dwt.c @@ -21,7 +21,7 @@ */ #include "libavutil/x86/asm.h" -#include "dsputil_mmx.h" +#include "dsputil_x86.h" #include "dirac_dwt.h" #define COMPOSE_VERTICAL(ext, align) \ diff --git a/ffmpeg/libavcodec/x86/diracdsp_mmx.c b/ffmpeg/libavcodec/x86/diracdsp_mmx.c index cb6465f..a28bb82 100644 --- a/ffmpeg/libavcodec/x86/diracdsp_mmx.c +++ 
b/ffmpeg/libavcodec/x86/diracdsp_mmx.c @@ -18,7 +18,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "dsputil_mmx.h" +#include "dsputil_x86.h" #include "diracdsp_mmx.h" void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); diff --git a/ffmpeg/libavcodec/x86/dnxhdenc.c b/ffmpeg/libavcodec/x86/dnxhdenc.c index 349fbb0..c7e776a 100644 --- a/ffmpeg/libavcodec/x86/dnxhdenc.c +++ b/ffmpeg/libavcodec/x86/dnxhdenc.c @@ -23,6 +23,7 @@ #include "libavutil/attributes.h" #include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" #include "libavcodec/dnxhdenc.h" #if HAVE_SSE2_INLINE @@ -58,7 +59,7 @@ static void get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels, int l av_cold void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx) { #if HAVE_SSE2_INLINE - if (av_get_cpu_flags() & AV_CPU_FLAG_SSE2) { + if (INLINE_SSE2(av_get_cpu_flags())) { if (ctx->cid_table->bit_depth == 8) ctx->get_pixels_8x4_sym = get_pixels_8x4_sym_sse2; } diff --git a/ffmpeg/libavcodec/x86/dsputil.asm b/ffmpeg/libavcodec/x86/dsputil.asm index 9970c02..77069e2 100644 --- a/ffmpeg/libavcodec/x86/dsputil.asm +++ b/ffmpeg/libavcodec/x86/dsputil.asm @@ -554,8 +554,8 @@ VECTOR_CLIP_INT32 6, 1, 0, 0 %if cpuflag(ssse3) pshufb m0, m2 pshufb m1, m2 - mova [r0 + 0], m0 - mova [r0 + 16], m1 + mov%1 [r0 + 0], m0 + mov%1 [r0 + 16], m1 %else pshuflw m0, m0, 10110001b pshuflw m1, m1, 10110001b @@ -569,8 +569,8 @@ VECTOR_CLIP_INT32 6, 1, 0, 0 psrlw m3, 8 por m2, m0 por m3, m1 - mova [r0 + 0], m2 - mova [r0 + 16], m3 + mov%1 [r0 + 0], m2 + mov%1 [r0 + 16], m3 %endif add r0, 32 add r1, 32 @@ -583,7 +583,7 @@ VECTOR_CLIP_INT32 6, 1, 0, 0 mov%1 m0, [r1] %if cpuflag(ssse3) pshufb m0, m2 - mova [r0], m0 + mov%1 [r0], m0 %else pshuflw m0, m0, 10110001b pshufhw m0, m0, 10110001b @@ -591,7 +591,7 @@ VECTOR_CLIP_INT32 6, 1, 0, 0 psllw m0, 8 psrlw m2, 8 por m2, m0 - mova [r0], m2 + mov%1 [r0], m2 %endif add r1, 16 add r0, 16 @@ -607,6 +607,7 @@ cglobal bswap32_buf, 3,4,3 cglobal bswap32_buf, 3,4,5 mov r3, r1 %endif + or r3, r0 and r3, 15 jz .start_align BSWAP_LOOPS u diff --git a/ffmpeg/libavcodec/x86/dsputil_mmx.c b/ffmpeg/libavcodec/x86/dsputil_mmx.c index fe59d22..df8cfdb 100644 --- a/ffmpeg/libavcodec/x86/dsputil_mmx.c +++ b/ffmpeg/libavcodec/x86/dsputil_mmx.c @@ -22,195 +22,17 @@ * MMX optimization by Nick Kurshev <nickols_k@mail.ru> */ -#include "libavutil/attributes.h" +#include "config.h" +#include "libavutil/avassert.h" #include "libavutil/cpu.h" #include "libavutil/x86/asm.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/h264dsp.h" -#include "libavcodec/mpegvideo.h" -#include "libavcodec/simple_idct.h" #include "libavcodec/videodsp.h" -#include "dsputil_mmx.h" -#include "idct_xvid.h" +#include "constants.h" +#include "dsputil_x86.h" #include "diracdsp_mmx.h" -//#undef NDEBUG -//#include <assert.h> - -/* pixel operations */ -DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; - -DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL }; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL; 
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL }; - -DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL; - -DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 }; -DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; - - -#if HAVE_YASM -void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, - int dstStride, int src1Stride, int h); -void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, - uint8_t *src2, int dstStride, - int src1Stride, int h); -void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, - int dstStride, int src1Stride, int h); -void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, - int dstStride, int src1Stride, int h); -void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, - int dstStride, int src1Stride, int h); -void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, - int dstStride, int src1Stride, int h); -void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); - -void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - ff_put_pixels8_mmxext(block, pixels, line_size, h); - ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h); -} - -void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride, int h); -void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride, int h); -void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride, - int h); -void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride, int h); -void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride, int h); -void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride, - int h); -void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride); -void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride); -void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride); -void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride); -void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride); -void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, - int dstStride, int srcStride); -#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext -#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext -#endif /* HAVE_YASM */ - - #if HAVE_INLINE_ASM -#define JUMPALIGN() __asm__ volatile (".p2align 3"::) -#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::) - -#define MOVQ_BFE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%"#regd", %%"#regd" \n\t" \ - "paddb %%"#regd", %%"#regd" \n\t" ::) - -#ifndef PIC -#define 
MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone)) -#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo)) -#else -// for shared library it's better to use this way for accessing constants -// pcmpeqd -> -1 -#define MOVQ_BONE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%"#regd", %%"#regd" \n\t" \ - "psrlw $15, %%"#regd" \n\t" \ - "packuswb %%"#regd", %%"#regd" \n\t" ::) - -#define MOVQ_WTWO(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%"#regd", %%"#regd" \n\t" \ - "psrlw $15, %%"#regd" \n\t" \ - "psllw $1, %%"#regd" \n\t"::) - -#endif - -// using regr as temporary and for the output result -// first argument is unmodifed and second is trashed -// regfe is supposed to contain 0xfefefefefefefefe -#define PAVGB_MMX(rega, regb, regr, regfe) \ - "movq "#rega", "#regr" \n\t" \ - "por "#regb", "#regr" \n\t" \ - "pxor "#rega", "#regb" \n\t" \ - "pand "#regfe", "#regb" \n\t" \ - "psrlq $1, "#regb" \n\t" \ - "psubb "#regb", "#regr" \n\t" - -// mm6 is supposed to contain 0xfefefefefefefefe -#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ - "movq "#rega", "#regr" \n\t" \ - "movq "#regc", "#regp" \n\t" \ - "por "#regb", "#regr" \n\t" \ - "por "#regd", "#regp" \n\t" \ - "pxor "#rega", "#regb" \n\t" \ - "pxor "#regc", "#regd" \n\t" \ - "pand %%mm6, "#regb" \n\t" \ - "pand %%mm6, "#regd" \n\t" \ - "psrlq $1, "#regd" \n\t" \ - "psrlq $1, "#regb" \n\t" \ - "psubb "#regb", "#regr" \n\t" \ - "psubb "#regd", "#regp" \n\t" - -/***********************************/ -/* MMX rounding */ - -#define DEF(x, y) x ## _ ## y ## _mmx -#define SET_RND MOVQ_WTWO -#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) -#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) -#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e) - -#include "dsputil_rnd_template.c" - -#undef DEF -#undef SET_RND -#undef PAVGBP -#undef PAVGB -#undef OP_AVG - -#endif /* HAVE_INLINE_ASM */ - - -#if HAVE_YASM - -/***********************************/ -/* MMXEXT specific */ - -//FIXME the following could be optimized too ... 
-static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels, - int line_size, int h) -{ - ff_avg_pixels8_mmxext(block, pixels, line_size, h); - ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h); -} - -#endif /* HAVE_YASM */ - - -#if HAVE_INLINE_ASM -/***********************************/ -/* standard MMX */ - void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size) { @@ -345,70 +167,8 @@ void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, } while (--i); } -static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - __asm__ volatile ( - "lea (%3, %3), %%"REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r"(pixels), "+r"(block) - : "r"((x86_reg)line_size) - : "%"REG_a, "memory" - ); -} - -static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - __asm__ volatile ( - "lea (%3, %3), %%"REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq 8(%1 ), %%mm4 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1, %3), %%mm5 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm4, 8(%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq 8(%1 ), %%mm4 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1, %3), %%mm5 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm4, 8(%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r"(pixels), "+r"(block) - : "r"((x86_reg)line_size) - : "%"REG_a, "memory" - ); -} - #define CLEAR_BLOCKS(name, n) \ -static void name(int16_t *blocks) \ +void name(int16_t *blocks) \ { \ __asm__ volatile ( \ "pxor %%mm7, %%mm7 \n\t" \ @@ -425,10 +185,10 @@ static void name(int16_t *blocks) \ : "%"REG_a \ ); \ } -CLEAR_BLOCKS(clear_blocks_mmx, 6) -CLEAR_BLOCKS(clear_block_mmx, 1) +CLEAR_BLOCKS(ff_clear_blocks_mmx, 6) +CLEAR_BLOCKS(ff_clear_block_mmx, 1) -static void clear_block_sse(int16_t *block) +void ff_clear_block_sse(int16_t *block) { __asm__ volatile ( "xorps %%xmm0, %%xmm0 \n" @@ -445,7 +205,7 @@ static void clear_block_sse(int16_t *block) ); } -static void clear_blocks_sse(int16_t *blocks) +void ff_clear_blocks_sse(int16_t *blocks) { __asm__ volatile ( "xorps %%xmm0, %%xmm0 \n" @@ -467,7 +227,7 @@ static void clear_blocks_sse(int16_t *blocks) ); } -static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w) +void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w) { x86_reg i = 0; __asm__ volatile ( @@ -492,53 +252,10 @@ static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w) dst[i + 0] += src[i + 0]; } -#if HAVE_7REGS -static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, - const uint8_t *diff, int w, - int *left, int *left_top) -{ - x86_reg w2 = -w; - x86_reg x; - int l = *left & 0xff; - int tl = *left_top & 0xff; - int t; - __asm__ volatile ( - "mov %7, %3 \n" - "1: \n" - "movzbl (%3, %4), %2 \n" - "mov %2, %k3 \n" - "sub %b1, %b3 \n" - "add %b0, %b3 \n" - "mov %2, %1 \n" - "cmp %0, %2 \n" - 
"cmovg %0, %2 \n" - "cmovg %1, %0 \n" - "cmp %k3, %0 \n" - "cmovg %k3, %0 \n" - "mov %7, %3 \n" - "cmp %2, %0 \n" - "cmovl %2, %0 \n" - "add (%6, %4), %b0 \n" - "mov %b0, (%5, %4) \n" - "inc %4 \n" - "jl 1b \n" - : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2) - : "r"(dst + w), "r"(diff + w), "rm"(top + w) - ); - *left = l; - *left_top = tl; -} -#endif -#endif /* HAVE_INLINE_ASM */ - -void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale); -void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale); - -#if HAVE_INLINE_ASM /* Draw the edges of width 'w' of an image of size width, height * this MMX version can only handle w == 8 || w == 16. */ -static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, - int w, int h, int sides) +void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, + int w, int h, int sides) { uint8_t *ptr, *last_line; int i; @@ -649,417 +366,11 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, } } } -#endif /* HAVE_INLINE_ASM */ - - -#if HAVE_YASM -#define QPEL_OP(OPNAME, ROUNDER, RND, MMX) \ -static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t temp[8]; \ - uint8_t * const half = (uint8_t*)temp; \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \ - stride, 8); \ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \ - stride, stride, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \ - stride, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t temp[8]; \ - uint8_t * const half = (uint8_t*)temp; \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \ - stride, 8); \ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \ - stride, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t temp[8]; \ - uint8_t * const half = (uint8_t*)temp; \ - ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \ - 8, stride); \ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \ - stride, stride, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \ - stride, stride); \ -} \ - \ -static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t temp[8]; \ - uint8_t * const half = (uint8_t*)temp; \ - ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \ - 8, stride); \ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\ - stride, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[8 + 9]; \ - uint8_t * const halfH = ((uint8_t*)half) + 64; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \ - stride, 9); \ - ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \ - stride, 8, 8); \ -} \ - \ 
-static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[8 + 9]; \ - uint8_t * const halfH = ((uint8_t*)half) + 64; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ - stride, 9); \ - ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \ - stride, 8, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[8 + 9]; \ - uint8_t * const halfH = ((uint8_t*)half) + 64; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \ - stride, 9); \ - ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \ - stride, 8, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[8 + 9]; \ - uint8_t * const halfH = ((uint8_t*)half) + 64; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ - stride, 9); \ - ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \ - stride, 8, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[8 + 9]; \ - uint8_t * const halfH = ((uint8_t*)half) + 64; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \ - stride, 8, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[8 + 9]; \ - uint8_t * const halfH = ((uint8_t*)half) + 64; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \ - stride, 8, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[8 + 9]; \ - uint8_t * const halfH = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \ - 8, stride, 9); \ - ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \ - stride, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[8 + 9]; \ - uint8_t * const halfH = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ - stride, 9); \ - ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \ - stride, 8); \ -} \ - \ -static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[9]; \ - uint8_t * 
const halfH = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \ - stride, 8); \ -} \ - \ -static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t temp[32]; \ - uint8_t * const half = (uint8_t*)temp; \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \ - stride, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \ - stride, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \ - stride, stride, 16);\ -} \ - \ -static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t temp[32]; \ - uint8_t * const half = (uint8_t*)temp; \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \ - stride, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \ - stride, stride, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t temp[32]; \ - uint8_t * const half = (uint8_t*)temp; \ - ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \ - stride); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \ - stride, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \ - stride, stride); \ -} \ - \ -static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t temp[32]; \ - uint8_t * const half = (uint8_t*)temp; \ - ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \ - stride); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \ - stride, stride, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[16 * 2 + 17 * 2]; \ - uint8_t * const halfH = ((uint8_t*)half) + 256; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \ - stride, 17); \ - ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ - 16, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \ - stride, 16, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[16 * 2 + 17 * 2]; \ - uint8_t * const halfH = ((uint8_t*)half) + 256; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \ - stride, 17); \ - ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ - 16, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \ - stride, 16, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[16 * 2 + 17 * 2]; \ - uint8_t * const halfH = ((uint8_t*)half) + 256; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ 
## MMX(halfH, src, 16, \ - stride, 17); \ - ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \ - stride, 17); \ - ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ - 16, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \ - stride, 16, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[16 * 2 + 17 * 2]; \ - uint8_t * const halfH = ((uint8_t*)half) + 256; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \ - stride, 17); \ - ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ - 16, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \ - stride, 16, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[16 * 2 + 17 * 2]; \ - uint8_t * const halfH = ((uint8_t*)half) + 256; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ - 16, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \ - stride, 16, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[16 * 2 + 17 * 2]; \ - uint8_t * const halfH = ((uint8_t*)half) + 256; \ - uint8_t * const halfHV = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ - 16, 16); \ - ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \ - stride, 16, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[17 * 2]; \ - uint8_t * const halfH = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \ - stride, 17); \ - ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \ - stride, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[17 * 2]; \ - uint8_t * const halfH = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \ - stride, 17); \ - ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \ - stride, 16); \ -} \ - \ -static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \ - ptrdiff_t stride) \ -{ \ - uint64_t half[17 * 2]; \ - uint8_t * const halfH = ((uint8_t*)half); \ - ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \ - stride, 16); \ -} - -QPEL_OP(put_, ff_pw_16, _, mmxext) -QPEL_OP(avg_, ff_pw_16, _, mmxext) -QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, mmxext) -#endif /* HAVE_YASM */ - - -#if HAVE_INLINE_ASM -void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride) -{ - put_pixels8_xy2_mmx(dst, src, stride, 8); -} -void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride) -{ - put_pixels16_xy2_mmx(dst, src, stride, 16); -} -void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, 
uint8_t *src, ptrdiff_t stride) -{ - avg_pixels8_xy2_mmx(dst, src, stride, 8); -} -void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride) -{ - avg_pixels16_xy2_mmx(dst, src, stride, 16); -} typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src, - ptrdiff_t linesize, int block_w, int block_h, + ptrdiff_t dst_stride, + ptrdiff_t src_linesize, + int block_w, int block_h, int src_x, int src_y, int w, int h); static av_always_inline void gmc(uint8_t *dst, uint8_t *src, @@ -1107,7 +418,7 @@ static av_always_inline void gmc(uint8_t *dst, uint8_t *src, src += ix + iy * stride; if (need_emu) { - emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height); + emu_edge_fn(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height); src = edge_buf; } @@ -1191,28 +502,28 @@ static av_always_inline void gmc(uint8_t *dst, uint8_t *src, #if CONFIG_VIDEODSP #if HAVE_YASM #if ARCH_X86_32 -static void gmc_mmx(uint8_t *dst, uint8_t *src, - int stride, int h, int ox, int oy, - int dxx, int dxy, int dyx, int dyy, - int shift, int r, int width, int height) +void ff_gmc_mmx(uint8_t *dst, uint8_t *src, + int stride, int h, int ox, int oy, + int dxx, int dxy, int dyx, int dyy, + int shift, int r, int width, int height) { gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height, &ff_emulated_edge_mc_8); } #endif -static void gmc_sse(uint8_t *dst, uint8_t *src, - int stride, int h, int ox, int oy, - int dxx, int dxy, int dyx, int dyy, - int shift, int r, int width, int height) +void ff_gmc_sse(uint8_t *dst, uint8_t *src, + int stride, int h, int ox, int oy, + int dxx, int dxy, int dyx, int dyy, + int shift, int r, int width, int height) { gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height, &ff_emulated_edge_mc_8); } #else -static void gmc_mmx(uint8_t *dst, uint8_t *src, - int stride, int h, int ox, int oy, - int dxx, int dxy, int dyx, int dyy, - int shift, int r, int width, int height) +void ff_gmc_mmx(uint8_t *dst, uint8_t *src, + int stride, int h, int ox, int oy, + int dxx, int dxy, int dyx, int dyy, + int shift, int r, int width, int height) { gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height, &ff_emulated_edge_mc_8); @@ -1220,43 +531,6 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src, #endif #endif -#endif /* HAVE_INLINE_ASM */ - -void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); - -#if HAVE_INLINE_ASM - -/* CAVS-specific */ -void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride) -{ - put_pixels8_mmx(dst, src, stride, 8); -} - -void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride) -{ - avg_pixels8_mmx(dst, src, stride, 8); -} - -void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride) -{ - put_pixels16_mmx(dst, src, stride, 16); -} - -void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride) -{ - avg_pixels16_mmx(dst, src, stride, 16); -} - -/* VC-1-specific */ -void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride, int rnd) -{ - put_pixels8_mmx(dst, src, stride, 8); -} - #if CONFIG_DIRAC_DECODER #define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\ void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\ @@ -1284,8 +558,9 @@ void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t 
*dst, const uint8_t *src[ } #if HAVE_MMX_INLINE -DIRAC_PIXOP(put, put, mmx) -DIRAC_PIXOP(avg, avg, mmx) +PIXELS16(static, ff_avg, , , _mmxext) +DIRAC_PIXOP(put, ff_put, mmx) +DIRAC_PIXOP(avg, ff_avg, mmx) #endif #if HAVE_YASM @@ -1326,8 +601,8 @@ void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, #endif #endif -static void vector_clipf_sse(float *dst, const float *src, - float min, float max, int len) +void ff_vector_clipf_sse(float *dst, const float *src, + float min, float max, int len) { x86_reg i = (len - 16) * 4; __asm__ volatile ( @@ -1361,276 +636,3 @@ static void vector_clipf_sse(float *dst, const float *src, } #endif /* HAVE_INLINE_ASM */ - -int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2, - int order); -int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, - int order); -int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2, - const int16_t *v3, - int order, int mul); -int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, - const int16_t *v3, - int order, int mul); -int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, - const int16_t *v3, - int order, int mul); - -void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input, - const int16_t *window, unsigned int len); -void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input, - const int16_t *window, unsigned int len); -void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input, - const int16_t *window, unsigned int len); -void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input, - const int16_t *window, unsigned int len); -void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input, - const int16_t *window, unsigned int len); -void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input, - const int16_t *window, unsigned int len); - -void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w); -void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w); - -void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, - const uint8_t *diff, int w, - int *left, int *left_top); -int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, - int w, int left); -int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, - int w, int left); - -void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src, - int32_t min, int32_t max, unsigned int len); -void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src, - int32_t min, int32_t max, unsigned int len); -void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, - int32_t min, int32_t max, unsigned int len); -void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src, - int32_t min, int32_t max, unsigned int len); - -#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \ - do { \ - c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## 
_mc31_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \ - } while (0) - -static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, - int mm_flags) -{ - const int high_bit_depth = avctx->bits_per_raw_sample > 8; - -#if HAVE_INLINE_ASM - c->put_pixels_clamped = ff_put_pixels_clamped_mmx; - c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx; - c->add_pixels_clamped = ff_add_pixels_clamped_mmx; - - if (!high_bit_depth) { - c->clear_block = clear_block_mmx; - c->clear_blocks = clear_blocks_mmx; - c->draw_edges = draw_edges_mmx; - } - -#if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM) - c->gmc = gmc_mmx; -#endif - - c->add_bytes = add_bytes_mmx; -#endif /* HAVE_INLINE_ASM */ - -#if HAVE_YASM - if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { - c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx; - c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx; - } - - c->vector_clip_int32 = ff_vector_clip_int32_mmx; -#endif - -} - -static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx, - int mm_flags) -{ - -#if HAVE_YASM - SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, ); - SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, ); - - SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, ); - SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, ); - SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, ); - SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, ); -#endif /* HAVE_YASM */ - -#if HAVE_MMXEXT_EXTERNAL - /* slower than cmov version on AMD */ - if (!(mm_flags & AV_CPU_FLAG_3DNOW)) - c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext; - - c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext; - c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext; - - if (avctx->flags & CODEC_FLAG_BITEXACT) { - c->apply_window_int16 = ff_apply_window_int16_mmxext; - } else { - c->apply_window_int16 = ff_apply_window_int16_round_mmxext; - } -#endif /* HAVE_MMXEXT_EXTERNAL */ -} - -static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, - int mm_flags) -{ - const int high_bit_depth = avctx->bits_per_raw_sample > 8; - -#if HAVE_INLINE_ASM - if (!high_bit_depth) { - if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) { - /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */ - c->clear_block = clear_block_sse; - c->clear_blocks = clear_blocks_sse; - } - } - - c->vector_clipf = vector_clipf_sse; -#endif /* HAVE_INLINE_ASM */ - -#if HAVE_YASM -#if HAVE_INLINE_ASM && CONFIG_VIDEODSP - c->gmc = gmc_sse; -#endif -#endif /* HAVE_YASM */ -} - -static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, - int mm_flags) -{ - const int bit_depth = avctx->bits_per_raw_sample; - const int high_bit_depth = bit_depth > 8; - -#if HAVE_SSE2_INLINE - if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) { - c->idct_put = ff_idct_xvid_sse2_put; - c->idct_add = ff_idct_xvid_sse2_add; - c->idct = ff_idct_xvid_sse2; - c->idct_permutation_type = FF_SSE2_IDCT_PERM; - } -#endif /* 
HAVE_SSE2_INLINE */ - -#if HAVE_SSE2_EXTERNAL - c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; - c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; - if (mm_flags & AV_CPU_FLAG_ATOM) { - c->vector_clip_int32 = ff_vector_clip_int32_int_sse2; - } else { - c->vector_clip_int32 = ff_vector_clip_int32_sse2; - } - if (avctx->flags & CODEC_FLAG_BITEXACT) { - c->apply_window_int16 = ff_apply_window_int16_sse2; - } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) { - c->apply_window_int16 = ff_apply_window_int16_round_sse2; - } - c->bswap_buf = ff_bswap32_buf_sse2; -#endif /* HAVE_SSE2_EXTERNAL */ -} - -static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx, - int mm_flags) -{ -#if HAVE_SSSE3_EXTERNAL - c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3; - if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe - c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4; - - if (mm_flags & AV_CPU_FLAG_ATOM) - c->apply_window_int16 = ff_apply_window_int16_ssse3_atom; - else - c->apply_window_int16 = ff_apply_window_int16_ssse3; - if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit - c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3; - c->bswap_buf = ff_bswap32_buf_ssse3; -#endif /* HAVE_SSSE3_EXTERNAL */ -} - -static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx, - int mm_flags) -{ -#if HAVE_SSE4_EXTERNAL - c->vector_clip_int32 = ff_vector_clip_int32_sse4; -#endif /* HAVE_SSE4_EXTERNAL */ -} - -av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx) -{ - int mm_flags = av_get_cpu_flags(); - -#if HAVE_7REGS && HAVE_INLINE_ASM - if (mm_flags & AV_CPU_FLAG_CMOV) - c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov; -#endif - - if (mm_flags & AV_CPU_FLAG_MMX) { -#if HAVE_INLINE_ASM - const int idct_algo = avctx->idct_algo; - - if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) { - if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) { - c->idct_put = ff_simple_idct_put_mmx; - c->idct_add = ff_simple_idct_add_mmx; - c->idct = ff_simple_idct_mmx; - c->idct_permutation_type = FF_SIMPLE_IDCT_PERM; - } else if (idct_algo == FF_IDCT_XVIDMMX) { - if (mm_flags & AV_CPU_FLAG_SSE2) { - c->idct_put = ff_idct_xvid_sse2_put; - c->idct_add = ff_idct_xvid_sse2_add; - c->idct = ff_idct_xvid_sse2; - c->idct_permutation_type = FF_SSE2_IDCT_PERM; - } else if (mm_flags & AV_CPU_FLAG_MMXEXT) { - c->idct_put = ff_idct_xvid_mmxext_put; - c->idct_add = ff_idct_xvid_mmxext_add; - c->idct = ff_idct_xvid_mmxext; - } else { - c->idct_put = ff_idct_xvid_mmx_put; - c->idct_add = ff_idct_xvid_mmx_add; - c->idct = ff_idct_xvid_mmx; - } - } - } -#endif /* HAVE_INLINE_ASM */ - - dsputil_init_mmx(c, avctx, mm_flags); - } - - if (mm_flags & AV_CPU_FLAG_MMXEXT) - dsputil_init_mmxext(c, avctx, mm_flags); - - if (mm_flags & AV_CPU_FLAG_SSE) - dsputil_init_sse(c, avctx, mm_flags); - - if (mm_flags & AV_CPU_FLAG_SSE2) - dsputil_init_sse2(c, avctx, mm_flags); - - if (mm_flags & AV_CPU_FLAG_SSSE3) - dsputil_init_ssse3(c, avctx, mm_flags); - - if (mm_flags & AV_CPU_FLAG_SSE4) - dsputil_init_sse4(c, avctx, mm_flags); - - if (CONFIG_ENCODERS) - ff_dsputilenc_init_mmx(c, avctx); -} diff --git a/ffmpeg/libavcodec/x86/dsputil_mmx.h b/ffmpeg/libavcodec/x86/dsputil_mmx.h deleted file mode 100644 index 28b0078..0000000 --- a/ffmpeg/libavcodec/x86/dsputil_mmx.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * MMX optimized DSP utils - * Copyright (c) 2007 Aurelien Jacobs 
<aurel@gnuage.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_DSPUTIL_MMX_H -#define AVCODEC_X86_DSPUTIL_MMX_H - -#include <stddef.h> -#include <stdint.h> - -#include "libavcodec/dsputil.h" -#include "libavutil/x86/asm.h" - -extern const uint64_t ff_bone; -extern const uint64_t ff_wtwo; - -extern const xmm_reg ff_pw_3; -extern const xmm_reg ff_pw_4; -extern const xmm_reg ff_pw_5; -extern const xmm_reg ff_pw_8; -extern const uint64_t ff_pw_15; -extern const xmm_reg ff_pw_16; -extern const xmm_reg ff_pw_18; -extern const uint64_t ff_pw_20; -extern const xmm_reg ff_pw_32; -extern const uint64_t ff_pw_42; -extern const uint64_t ff_pw_53; -extern const xmm_reg ff_pw_64; -extern const uint64_t ff_pw_96; -extern const uint64_t ff_pw_128; -extern const uint64_t ff_pw_255; - -extern const xmm_reg ff_pb_1; -extern const xmm_reg ff_pb_3; -extern const uint64_t ff_pb_3F; -extern const xmm_reg ff_pb_F8; -extern const uint64_t ff_pb_FC; - -extern const double ff_pd_1[2]; -extern const double ff_pd_2[2]; - -#define SBUTTERFLY(a,b,t,n,m)\ - "mov" #m " " #a ", " #t " \n\t" /* abcd */\ - "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ - "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\ - -#define TRANSPOSE4(a,b,c,d,t)\ - SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\ - SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\ - SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\ - SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */ - -#define MOVQ_WONE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ - "psrlw $15, %%" #regd ::) - -void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx); -void ff_dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx); - -void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size); -void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size); -void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size); - -void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); - -void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride); -void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride); -void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride); -void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride); - -void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd); - -void ff_put_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, ptrdiff_t stride); -void ff_put_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, ptrdiff_t stride); -void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, ptrdiff_t stride); -void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, 
ptrdiff_t stride); - -void ff_mmx_idct(int16_t *block); -void ff_mmxext_idct(int16_t *block); - - -void ff_deinterlace_line_mmx(uint8_t *dst, - const uint8_t *lum_m4, const uint8_t *lum_m3, - const uint8_t *lum_m2, const uint8_t *lum_m1, - const uint8_t *lum, - int size); - -void ff_deinterlace_line_inplace_mmx(const uint8_t *lum_m4, - const uint8_t *lum_m3, - const uint8_t *lum_m2, - const uint8_t *lum_m1, - const uint8_t *lum, int size); - -#endif /* AVCODEC_X86_DSPUTIL_MMX_H */ diff --git a/ffmpeg/libavcodec/x86/dsputil_qns_template.c b/ffmpeg/libavcodec/x86/dsputil_qns_template.c index 77a41b9..bde6b0a 100644 --- a/ffmpeg/libavcodec/x86/dsputil_qns_template.c +++ b/ffmpeg/libavcodec/x86/dsputil_qns_template.c @@ -28,7 +28,7 @@ static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[ { x86_reg i=0; - assert(FFABS(scale) < MAX_ABS); + av_assert2(FFABS(scale) < MAX_ABS); scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; SET_RND(mm6); diff --git a/ffmpeg/libavcodec/x86/dsputil_rnd_template.c b/ffmpeg/libavcodec/x86/dsputil_rnd_template.c deleted file mode 100644 index 1a89b77..0000000 --- a/ffmpeg/libavcodec/x86/dsputil_rnd_template.c +++ /dev/null @@ -1,221 +0,0 @@ -/* - * DSP utils mmx functions are compiled twice for rnd/no_rnd - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at> - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> - * and improved by Zdenek Kabelac <kabi@users.sf.net> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -// put_pixels -static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_ZERO(mm7); - SET_RND(mm6); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm4 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "xor %%"REG_a", %%"REG_a" \n\t" - "add %3, %1 \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddusw %%mm2, %%mm0 \n\t" - "paddusw %%mm3, %%mm1 \n\t" - "paddusw %%mm6, %%mm4 \n\t" - "paddusw %%mm6, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "psrlw $2, %%mm4 \n\t" - "psrlw $2, %%mm5 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "movq %%mm4, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 - "movq 1(%1, %%"REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm3, %%mm5 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm6, %%mm1 \n\t" - "paddusw %%mm4, %%mm0 \n\t" - "paddusw %%mm5, %%mm1 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "movq %%mm0, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "subl $2, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels) - :"D"(block), "r"((x86_reg)line_size) - :REG_a, "memory"); -} - -// in case more speed is needed - unroling would certainly help -static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %0, %%mm0 \n\t" - "movq %1, %%mm1 \n\t" - OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, %0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } - while (--h); -} - -static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %0, %%mm0 \n\t" - "movq %1, %%mm1 \n\t" - OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, %0 \n\t" - "movq 8%0, %%mm0 \n\t" - "movq 8%1, %%mm1 \n\t" - OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, 8%0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } - while (--h); -} - -// this routine is 'slightly' suboptimal but mostly unused -static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_ZERO(mm7); - SET_RND(mm6); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm4 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - 
"punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "xor %%"REG_a", %%"REG_a" \n\t" - "add %3, %1 \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddusw %%mm2, %%mm0 \n\t" - "paddusw %%mm3, %%mm1 \n\t" - "paddusw %%mm6, %%mm4 \n\t" - "paddusw %%mm6, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "psrlw $2, %%mm4 \n\t" - "psrlw $2, %%mm5 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "pcmpeqd %%mm2, %%mm2 \n\t" - "paddb %%mm2, %%mm2 \n\t" - OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2) - "movq %%mm5, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 - "movq 1(%1, %%"REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm3, %%mm5 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm6, %%mm1 \n\t" - "paddusw %%mm4, %%mm0 \n\t" - "paddusw %%mm5, %%mm1 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "pcmpeqd %%mm2, %%mm2 \n\t" - "paddb %%mm2, %%mm2 \n\t" - OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2) - "movq %%mm1, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "subl $2, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels) - :"D"(block), "r"((x86_reg)line_size) - :REG_a, "memory"); -} - -//FIXME optimize -static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){ - DEF(put, pixels8_xy2)(block , pixels , line_size, h); - DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h); -} - -static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){ - DEF(avg, pixels8_xy2)(block , pixels , line_size, h); - DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h); -} diff --git a/ffmpeg/libavcodec/x86/dsputilenc_mmx.c b/ffmpeg/libavcodec/x86/dsputilenc_mmx.c index a3f268e..5de8ade 100644 --- a/ffmpeg/libavcodec/x86/dsputilenc_mmx.c +++ b/ffmpeg/libavcodec/x86/dsputilenc_mmx.c @@ -3,6 +3,8 @@ * Copyright (c) 2000, 2001 Fabrice Bellard * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> * + * MMX optimization by Nick Kurshev <nickols_k@mail.ru> + * * This file is part of FFmpeg. 
* * FFmpeg is free software; you can redistribute it and/or @@ -18,8 +20,6 @@ * You should have received a copy of the GNU Lesser General Public * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> */ #include "libavutil/attributes.h" @@ -30,7 +30,7 @@ #include "libavcodec/dsputil.h" #include "libavcodec/mpegvideo.h" #include "libavcodec/mathops.h" -#include "dsputil_mmx.h" +#include "dsputil_x86.h" void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size); void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size); @@ -946,11 +946,13 @@ hadamard_func(ssse3) av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx) { - int mm_flags = av_get_cpu_flags(); - int bit_depth = avctx->bits_per_raw_sample; + int cpu_flags = av_get_cpu_flags(); + const int dct_algo = avctx->dct_algo; #if HAVE_YASM - if (EXTERNAL_MMX(mm_flags)) { + int bit_depth = avctx->bits_per_raw_sample; + + if (EXTERNAL_MMX(cpu_flags)) { if (bit_depth <= 8) c->get_pixels = ff_get_pixels_mmx; c->diff_pixels = ff_diff_pixels_mmx; @@ -958,25 +960,16 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx) c->pix_norm1 = ff_pix_norm1_mmx; } - if (EXTERNAL_SSE2(mm_flags)) + if (EXTERNAL_SSE2(cpu_flags)) if (bit_depth <= 8) c->get_pixels = ff_get_pixels_sse2; #endif /* HAVE_YASM */ #if HAVE_INLINE_ASM - if (mm_flags & AV_CPU_FLAG_MMX) { - const int dct_algo = avctx->dct_algo; + if (INLINE_MMX(cpu_flags)) { if (avctx->bits_per_raw_sample <= 8 && - (dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX)) { - if(mm_flags & AV_CPU_FLAG_SSE2){ - c->fdct = ff_fdct_sse2; - } else if (mm_flags & AV_CPU_FLAG_MMXEXT) { - c->fdct = ff_fdct_mmxext; - }else{ - c->fdct = ff_fdct_mmx; - } - } - + (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX)) + c->fdct = ff_fdct_mmx; c->diff_bytes= diff_bytes_mmx; c->sum_abs_dctelem= sum_abs_dctelem_mmx; @@ -997,63 +990,71 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx) c->add_8x8basis= add_8x8basis_mmx; c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx; + } - if (mm_flags & AV_CPU_FLAG_MMXEXT) { - c->sum_abs_dctelem = sum_abs_dctelem_mmxext; - c->vsad[4] = vsad_intra16_mmxext; + if (INLINE_AMD3DNOW(cpu_flags)) { + if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { + c->try_8x8basis = try_8x8basis_3dnow; + } + c->add_8x8basis = add_8x8basis_3dnow; + } - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->vsad[0] = vsad16_mmxext; - } + if (INLINE_MMXEXT(cpu_flags)) { + if (avctx->bits_per_raw_sample <= 8 && + (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX)) + c->fdct = ff_fdct_mmxext; - c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_mmxext; - } + c->sum_abs_dctelem = sum_abs_dctelem_mmxext; + c->vsad[4] = vsad_intra16_mmxext; - if(mm_flags & AV_CPU_FLAG_SSE2){ - c->sum_abs_dctelem= sum_abs_dctelem_sse2; + if (!(avctx->flags & CODEC_FLAG_BITEXACT)){ + c->vsad[0] = vsad16_mmxext; } -#if HAVE_SSSE3_INLINE - if(mm_flags & AV_CPU_FLAG_SSSE3){ - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->try_8x8basis= try_8x8basis_ssse3; - } - c->add_8x8basis= add_8x8basis_ssse3; - c->sum_abs_dctelem= sum_abs_dctelem_ssse3; - } -#endif + c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_mmxext; + } + + if (INLINE_SSE2(cpu_flags)) { + if (avctx->bits_per_raw_sample <= 8 && + (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX)) + c->fdct = ff_fdct_sse2; + + c->sum_abs_dctelem= 
sum_abs_dctelem_sse2; + } - if(mm_flags & AV_CPU_FLAG_3DNOW){ - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->try_8x8basis= try_8x8basis_3dnow; - } - c->add_8x8basis= add_8x8basis_3dnow; +#if HAVE_SSSE3_INLINE + if (INLINE_SSSE3(cpu_flags)) { + if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { + c->try_8x8basis = try_8x8basis_ssse3; } + c->add_8x8basis = add_8x8basis_ssse3; + c->sum_abs_dctelem = sum_abs_dctelem_ssse3; } +#endif #endif /* HAVE_INLINE_ASM */ - if (EXTERNAL_MMX(mm_flags)) { + if (EXTERNAL_MMX(cpu_flags)) { c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx; c->hadamard8_diff[1] = ff_hadamard8_diff_mmx; + } - if (EXTERNAL_MMXEXT(mm_flags)) { - c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext; - c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext; - } + if (EXTERNAL_MMXEXT(cpu_flags)) { + c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext; + c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext; + } - if (EXTERNAL_SSE2(mm_flags)) { - c->sse[0] = ff_sse16_sse2; + if (EXTERNAL_SSE2(cpu_flags)) { + c->sse[0] = ff_sse16_sse2; #if HAVE_ALIGNED_STACK - c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2; - c->hadamard8_diff[1] = ff_hadamard8_diff_sse2; + c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2; + c->hadamard8_diff[1] = ff_hadamard8_diff_sse2; #endif - } + } - if (EXTERNAL_SSSE3(mm_flags) && HAVE_ALIGNED_STACK) { - c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3; - c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3; - } + if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) { + c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3; + c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3; } ff_dsputil_init_pix_mmx(c, avctx); diff --git a/ffmpeg/libavcodec/x86/fdct.c b/ffmpeg/libavcodec/x86/fdct.c index d35245d..11a13bb 100644 --- a/ffmpeg/libavcodec/x86/fdct.c +++ b/ffmpeg/libavcodec/x86/fdct.c @@ -34,7 +34,7 @@ #include "libavutil/x86/asm.h" #include "libavcodec/dct.h" -#if HAVE_INLINE_ASM +#if HAVE_MMX_INLINE ////////////////////////////////////////////////////////////////////// // @@ -556,6 +556,10 @@ void ff_fdct_mmx(int16_t *block) } } +#endif /* HAVE_MMX_INLINE */ + +#if HAVE_MMXEXT_INLINE + void ff_fdct_mmxext(int16_t *block) { DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; @@ -574,6 +578,10 @@ void ff_fdct_mmxext(int16_t *block) } } +#endif /* HAVE_MMXEXT_INLINE */ + +#if HAVE_SSE2_INLINE + void ff_fdct_sse2(int16_t *block) { DECLARE_ALIGNED(16, int64_t, align_tmp)[16]; @@ -583,4 +591,4 @@ void ff_fdct_sse2(int16_t *block) fdct_row_sse2(block1, block); } -#endif /* HAVE_INLINE_ASM */ +#endif /* HAVE_SSE2_INLINE */ diff --git a/ffmpeg/libavcodec/x86/fft.asm b/ffmpeg/libavcodec/x86/fft.asm index 5071741..cae404c 100644 --- a/ffmpeg/libavcodec/x86/fft.asm +++ b/ffmpeg/libavcodec/x86/fft.asm @@ -36,7 +36,7 @@ %define pointer resd %endif -SECTION_RODATA +SECTION_RODATA 32 struc FFTContext .nbits: resd 1 @@ -57,7 +57,6 @@ endstruc %define M_COS_PI_1_8 0.923879532511287 %define M_COS_PI_3_8 0.38268343236509 -align 32 ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8 ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8 @@ -672,13 +671,13 @@ cglobal imdct_calc, 3,5,3 push r1 push r0 %else - sub rsp, 8 + sub rsp, 8+32*WIN64 ; allocate win64 shadow space %endif call r4 %if ARCH_X86_32 add esp, 12 %else - add rsp, 8 + add rsp, 8+32*WIN64 %endif POP r1 POP r3 diff --git a/ffmpeg/libavcodec/x86/fft.h b/ffmpeg/libavcodec/x86/fft.h index 3f8b21d..398091e 100644 --- a/ffmpeg/libavcodec/x86/fft.h +++ 
b/ffmpeg/libavcodec/x86/fft.h @@ -34,8 +34,5 @@ void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *i void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_dct32_float_sse(FFTSample *out, const FFTSample *in); -void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in); -void ff_dct32_float_avx(FFTSample *out, const FFTSample *in); #endif /* AVCODEC_X86_FFT_H */ diff --git a/ffmpeg/libavcodec/x86/fft_init.c b/ffmpeg/libavcodec/x86/fft_init.c index bfa7947..5682230 100644 --- a/ffmpeg/libavcodec/x86/fft_init.c +++ b/ffmpeg/libavcodec/x86/fft_init.c @@ -16,29 +16,31 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "config.h" +#include "libavutil/attributes.h" #include "libavutil/cpu.h" #include "libavutil/x86/cpu.h" -#include "libavcodec/dct.h" #include "fft.h" av_cold void ff_fft_init_x86(FFTContext *s) { - int has_vectors = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); + #if ARCH_X86_32 - if (EXTERNAL_AMD3DNOW(has_vectors)) { + if (EXTERNAL_AMD3DNOW(cpu_flags)) { /* 3DNow! for K6-2/3 */ s->imdct_calc = ff_imdct_calc_3dnow; s->imdct_half = ff_imdct_half_3dnow; s->fft_calc = ff_fft_calc_3dnow; } - if (EXTERNAL_AMD3DNOWEXT(has_vectors)) { + if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) { /* 3DNowEx for K7 */ s->imdct_calc = ff_imdct_calc_3dnowext; s->imdct_half = ff_imdct_half_3dnowext; s->fft_calc = ff_fft_calc_3dnowext; } #endif - if (EXTERNAL_SSE(has_vectors)) { + if (EXTERNAL_SSE(cpu_flags)) { /* SSE for P3/P4/K8 */ s->imdct_calc = ff_imdct_calc_sse; s->imdct_half = ff_imdct_half_sse; @@ -46,23 +48,10 @@ av_cold void ff_fft_init_x86(FFTContext *s) s->fft_calc = ff_fft_calc_sse; s->fft_permutation = FF_FFT_PERM_SWAP_LSBS; } - if (EXTERNAL_AVX(has_vectors) && s->nbits >= 5) { + if (EXTERNAL_AVX(cpu_flags) && s->nbits >= 5) { /* AVX for SB */ s->imdct_half = ff_imdct_half_avx; s->fft_calc = ff_fft_calc_avx; s->fft_permutation = FF_FFT_PERM_AVX; } } - -#if CONFIG_DCT -av_cold void ff_dct_init_x86(DCTContext *s) -{ - int has_vectors = av_get_cpu_flags(); - if (EXTERNAL_SSE(has_vectors)) - s->dct32 = ff_dct32_float_sse; - if (EXTERNAL_SSE2(has_vectors)) - s->dct32 = ff_dct32_float_sse2; - if (EXTERNAL_AVX(has_vectors)) - s->dct32 = ff_dct32_float_avx; -} -#endif diff --git a/ffmpeg/libavcodec/x86/fmtconvert.asm b/ffmpeg/libavcodec/x86/fmtconvert.asm index 1bd13fc..60078e2 100644 --- a/ffmpeg/libavcodec/x86/fmtconvert.asm +++ b/ffmpeg/libavcodec/x86/fmtconvert.asm @@ -32,7 +32,7 @@ SECTION_TEXT %endmacro ;--------------------------------------------------------------------------------- -; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len); +; void int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul, int len); ;--------------------------------------------------------------------------------- %macro INT32_TO_FLOAT_FMUL_SCALAR 1 %if UNIX64 diff --git a/ffmpeg/libavcodec/x86/fmtconvert_init.c b/ffmpeg/libavcodec/x86/fmtconvert_init.c index 4a4c017..d300dfd 100644 --- a/ffmpeg/libavcodec/x86/fmtconvert_init.c +++ b/ffmpeg/libavcodec/x86/fmtconvert_init.c @@ -3,6 +3,8 @@ * Copyright (c) 2000, 2001 Fabrice Bellard * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> * + * MMX optimization by Nick Kurshev <nickols_k@mail.ru> + * * This file is part of FFmpeg. 
* * FFmpeg is free software; you can redistribute it and/or @@ -18,8 +20,6 @@ * You should have received a copy of the GNU Lesser General Public * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> */ #include "libavutil/attributes.h" @@ -30,8 +30,8 @@ #if HAVE_YASM -void ff_int32_to_float_fmul_scalar_sse (float *dst, const int *src, float mul, int len); -void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len); +void ff_int32_to_float_fmul_scalar_sse (float *dst, const int32_t *src, float mul, int len); +void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int32_t *src, float mul, int len); void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len); void ff_float_to_int16_sse (int16_t *dst, const float *src, long len); @@ -116,33 +116,32 @@ static void float_interleave_sse(float *dst, const float **src, av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) { #if HAVE_YASM - int mm_flags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_MMX(mm_flags)) { + if (EXTERNAL_MMX(cpu_flags)) { c->float_interleave = float_interleave_mmx; - - if (EXTERNAL_AMD3DNOW(mm_flags)) { - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->float_to_int16 = ff_float_to_int16_3dnow; - c->float_to_int16_interleave = float_to_int16_interleave_3dnow; - } - } - if (EXTERNAL_AMD3DNOWEXT(mm_flags)) { - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->float_to_int16_interleave = float_to_int16_interleave_3dnowext; - } - } - if (EXTERNAL_SSE(mm_flags)) { - c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse; - c->float_to_int16 = ff_float_to_int16_sse; - c->float_to_int16_interleave = float_to_int16_interleave_sse; - c->float_interleave = float_interleave_sse; + } + if (EXTERNAL_AMD3DNOW(cpu_flags)) { + if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { + c->float_to_int16 = ff_float_to_int16_3dnow; + c->float_to_int16_interleave = float_to_int16_interleave_3dnow; } - if (EXTERNAL_SSE2(mm_flags)) { - c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2; - c->float_to_int16 = ff_float_to_int16_sse2; - c->float_to_int16_interleave = float_to_int16_interleave_sse2; + } + if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) { + if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { + c->float_to_int16_interleave = float_to_int16_interleave_3dnowext; } } + if (EXTERNAL_SSE(cpu_flags)) { + c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse; + c->float_to_int16 = ff_float_to_int16_sse; + c->float_to_int16_interleave = float_to_int16_interleave_sse; + c->float_interleave = float_interleave_sse; + } + if (EXTERNAL_SSE2(cpu_flags)) { + c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2; + c->float_to_int16 = ff_float_to_int16_sse2; + c->float_to_int16_interleave = float_to_int16_interleave_sse2; + } #endif /* HAVE_YASM */ } diff --git a/ffmpeg/libavcodec/x86/fpelbase.asm b/ffmpeg/libavcodec/x86/fpelbase.asm deleted file mode 100644 index a327206..0000000 --- a/ffmpeg/libavcodec/x86/fpelbase.asm +++ /dev/null @@ -1,106 +0,0 @@ -;****************************************************************************** -;* MMX optimized DSP utils -;* Copyright (c) 2008 Loren Merritt -;* Copyright (c) 2003-2013 Michael Niedermayer -;* Copyright (c) 2013 Daniel Kang -;* -;* This file is part of FFmpeg. 
-;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION .text - -INIT_MMX mmxext -; void pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h) -%macro PIXELS48 2 -%if %2 == 4 -%define OP movh -%else -%define OP mova -%endif -cglobal %1_pixels%2, 4,5 - movsxdifnidn r2, r2d - lea r4, [r2*3] -.loop: - OP m0, [r1] - OP m1, [r1+r2] - OP m2, [r1+r2*2] - OP m3, [r1+r4] - lea r1, [r1+r2*4] -%ifidn %1, avg - pavgb m0, [r0] - pavgb m1, [r0+r2] - pavgb m2, [r0+r2*2] - pavgb m3, [r0+r4] -%endif - OP [r0], m0 - OP [r0+r2], m1 - OP [r0+r2*2], m2 - OP [r0+r4], m3 - sub r3d, 4 - lea r0, [r0+r2*4] - jne .loop - RET -%endmacro - -PIXELS48 put, 4 -PIXELS48 avg, 4 -PIXELS48 put, 8 -PIXELS48 avg, 8 - - -INIT_XMM sse2 -; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -cglobal put_pixels16, 4,5,4 - lea r4, [r2*3] -.loop: - movu m0, [r1] - movu m1, [r1+r2] - movu m2, [r1+r2*2] - movu m3, [r1+r4] - lea r1, [r1+r2*4] - mova [r0], m0 - mova [r0+r2], m1 - mova [r0+r2*2], m2 - mova [r0+r4], m3 - sub r3d, 4 - lea r0, [r0+r2*4] - jnz .loop - REP_RET - -; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -cglobal avg_pixels16, 4,5,4 - lea r4, [r2*3] -.loop: - movu m0, [r1] - movu m1, [r1+r2] - movu m2, [r1+r2*2] - movu m3, [r1+r4] - lea r1, [r1+r2*4] - pavgb m0, [r0] - pavgb m1, [r0+r2] - pavgb m2, [r0+r2*2] - pavgb m3, [r0+r4] - mova [r0], m0 - mova [r0+r2], m1 - mova [r0+r2*2], m2 - mova [r0+r4], m3 - sub r3d, 4 - lea r0, [r0+r2*4] - jnz .loop - REP_RET diff --git a/ffmpeg/libavcodec/x86/h264_chromamc_10bit.asm b/ffmpeg/libavcodec/x86/h264_chromamc_10bit.asm index b850551..beb7c0f 100644 --- a/ffmpeg/libavcodec/x86/h264_chromamc_10bit.asm +++ b/ffmpeg/libavcodec/x86/h264_chromamc_10bit.asm @@ -5,20 +5,20 @@ ;* ;* Authors: Daniel Kang <daniel.d.kang@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/ffmpeg/libavcodec/x86/h264_deblock.asm b/ffmpeg/libavcodec/x86/h264_deblock.asm index d58e16c..1317783 100644 --- a/ffmpeg/libavcodec/x86/h264_deblock.asm +++ b/ffmpeg/libavcodec/x86/h264_deblock.asm @@ -331,16 +331,14 @@ cglobal deblock_v_luma_8, 5,5,10 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX cpuname -cglobal deblock_h_luma_8, 5,9 +cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64 movsxd r7, r1d lea r8, [r7+r7*2] lea r6, [r0-4] lea r5, [r0-4+r8] %if WIN64 - sub rsp, 0x98 - %define pix_tmp rsp+0x30 + %define pix_tmp rsp+0x30 ; shadow space + r4 %else - sub rsp, 0x68 %define pix_tmp rsp %endif @@ -379,11 +377,6 @@ cglobal deblock_h_luma_8, 5,9 movq m3, [pix_tmp+0x40] TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8) -%if WIN64 - add rsp, 0x98 -%else - add rsp, 0x68 -%endif RET %endmacro @@ -708,13 +701,16 @@ INIT_MMX cpuname ;----------------------------------------------------------------------------- ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_luma_intra_8, 4,9 +cglobal deblock_h_luma_intra_8, 4,9,0,0x80 movsxd r7, r1d lea r8, [r7*3] lea r6, [r0-4] lea r5, [r0-4+r8] - sub rsp, 0x88 +%if WIN64 + %define pix_tmp rsp+0x20 ; shadow space +%else %define pix_tmp rsp +%endif ; transpose 8x16 -> tmp space TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) @@ -734,7 +730,6 @@ cglobal deblock_h_luma_intra_8, 4,9 sub r5, r7 shr r7, 3 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8) - add rsp, 0x88 RET %else cglobal deblock_h_luma_intra_8, 2,4,8,0x80 diff --git a/ffmpeg/libavcodec/x86/h264_deblock_10bit.asm b/ffmpeg/libavcodec/x86/h264_deblock_10bit.asm index d63ca02..fdaf510 100644 --- a/ffmpeg/libavcodec/x86/h264_deblock_10bit.asm +++ b/ffmpeg/libavcodec/x86/h264_deblock_10bit.asm @@ -7,20 +7,20 @@ ;* Loren Merritt <lorenm@u.washington.edu> ;* Jason Garrett-Glaser <darkshikari@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/ffmpeg/libavcodec/x86/h264_idct.asm b/ffmpeg/libavcodec/x86/h264_idct.asm index 7bb1653..9af98a9 100644 --- a/ffmpeg/libavcodec/x86/h264_idct.asm +++ b/ffmpeg/libavcodec/x86/h264_idct.asm @@ -30,7 +30,6 @@ SECTION_RODATA -; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8 db 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8 db 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8 @@ -81,7 +80,7 @@ SECTION .text %endmacro INIT_MMX mmx -; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) +; ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride) cglobal h264_idct_add_8, 3, 3, 0 IDCT4_ADD r0, r1, r2 RET @@ -203,7 +202,7 @@ cglobal h264_idct_add_8, 3, 3, 0 %endmacro INIT_MMX mmx -; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) +; ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride) cglobal h264_idct8_add_8, 3, 4, 0 %assign pad 128+4-(stack_offset&7) SUB rsp, pad @@ -271,7 +270,7 @@ cglobal h264_idct8_add_8, 3, 4, 0 %endmacro INIT_XMM sse2 -; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride) +; ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride) cglobal h264_idct8_add_8, 3, 4, 10 IDCT8_ADD_SSE r0, r1, r2, r3 RET @@ -308,37 +307,38 @@ cglobal h264_idct8_add_8, 3, 4, 10 %endmacro INIT_MMX mmxext -; ff_h264_idct_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride) +; ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride) %if ARCH_X86_64 cglobal h264_idct_dc_add_8, 3, 4, 0 movsx r3, word [r1] - mov word [r1], 0 + mov dword [r1], 0 DC_ADD_MMXEXT_INIT r3, r2 DC_ADD_MMXEXT_OP movh, r0, r2, r3 RET -; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride) +; ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride) cglobal h264_idct8_dc_add_8, 3, 4, 0 movsx r3, word [r1] - mov word [r1], 0 + mov dword [r1], 0 DC_ADD_MMXEXT_INIT r3, r2 DC_ADD_MMXEXT_OP mova, r0, r2, r3 lea r0, [r0+r2*4] DC_ADD_MMXEXT_OP mova, r0, r2, r3 RET %else +; ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride) cglobal h264_idct_dc_add_8, 2, 3, 0 movsx r2, word [r1] - mov word [r1], 0 + mov dword [r1], 0 mov r1, r2m DC_ADD_MMXEXT_INIT r2, r1 DC_ADD_MMXEXT_OP movh, r0, r1, r2 RET -; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride) +; ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride) cglobal h264_idct8_dc_add_8, 2, 3, 0 movsx r2, word [r1] - mov word [r1], 0 + mov dword [r1], 0 mov r1, r2m DC_ADD_MMXEXT_INIT r2, r1 DC_ADD_MMXEXT_OP mova, r0, r1, r2 @@ -348,8 +348,9 @@ cglobal h264_idct8_dc_add_8, 2, 3, 0 %endif INIT_MMX mmx -; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, const uint8_t nnzc[6*8]) +; ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset, +; int16_t *block, int stride, +; const uint8_t nnzc[6 * 8]) cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg xor r5, r5 %ifdef PIC @@ -370,8 +371,9 @@ cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, jl .nextblock 
REP_RET -; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, const uint8_t nnzc[6*8]) +; ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset, +; int16_t *block, int stride, +; const uint8_t nnzc[6 * 8]) cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg %assign pad 128+4-(stack_offset&7) SUB rsp, pad @@ -403,8 +405,9 @@ cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, RET INIT_MMX mmxext -; ff_h264_idct_add16_mmxext(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, const uint8_t nnzc[6*8]) +; ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset, +; int16_t *block, int stride, +; const uint8_t nnzc[6 * 8]) cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg xor r5, r5 %ifdef PIC @@ -449,8 +452,9 @@ cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride REP_RET INIT_MMX mmx -; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, const uint8_t nnzc[6*8]) +; ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset, +; int16_t *block, int stride, +; const uint8_t nnzc[6 * 8]) cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg xor r5, r5 %ifdef PIC @@ -473,9 +477,9 @@ cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, st REP_RET INIT_MMX mmxext -; ff_h264_idct_add16intra_mmxext(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6*8]) +; ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset, +; int16_t *block, int stride, +; const uint8_t nnzc[6 * 8]) cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg xor r5, r5 %ifdef PIC @@ -517,9 +521,9 @@ cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, s jl .nextblock REP_RET -; ff_h264_idct8_add4_mmxext(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6*8]) +; ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset, +; int16_t *block, int stride, +; const uint8_t nnzc[6 * 8]) cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg %assign pad 128+4-(stack_offset&7) SUB rsp, pad @@ -579,8 +583,9 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride RET INIT_XMM sse2 -; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, const uint8_t nnzc[6*8]) +; ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset, +; int16_t *block, int stride, +; const uint8_t nnzc[6 * 8]) cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg xor r5, r5 %ifdef PIC @@ -655,8 +660,8 @@ h264_idct_add8_mmx_plane: jnz .nextblock rep ret -; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, -; int16_t *block, int stride, const uint8_t nnzc[6*8]) +; ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset, +; int16_t *block, int stride, const uint8_t nnzc[6 * 8]) cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg mov r5, 16 add r2, 512 @@ -720,8 +725,9 @@ h264_idct_add8_mmxext_plane: rep ret INIT_MMX mmxext -; 
ff_h264_idct_add8_mmxext(uint8_t **dest, const int *block_offset, -; int16_t *block, int stride, const uint8_t nnzc[6*8]) +; ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset, +; int16_t *block, int stride, +; const uint8_t nnzc[6 * 8]) cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg mov r5, 16 add r2, 512 @@ -803,8 +809,9 @@ h264_add8x4_idct_sse2: %endif %endmacro -; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, const uint8_t nnzc[6*8]) +; ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset, +; int16_t *block, int stride, +; const uint8_t nnzc[6 * 8]) cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8 %if ARCH_X86_64 mov r5, r0 @@ -850,8 +857,9 @@ cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8 %endif %endmacro -; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, const uint8_t nnzc[6*8]) +; ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset, +; int16_t *block, int stride, +; const uint8_t nnzc[6 * 8]) cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8 %if ARCH_X86_64 mov r7, r0 @@ -901,8 +909,9 @@ cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8 %endif %endmacro -; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, -; int16_t *block, int stride, const uint8_t nnzc[6*8]) +; ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset, +; int16_t *block, int stride, +; const uint8_t nnzc[6 * 8]) cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8 add r2, 512 %if ARCH_X86_64 diff --git a/ffmpeg/libavcodec/x86/h264_idct_10bit.asm b/ffmpeg/libavcodec/x86/h264_idct_10bit.asm index 88fdb84..df21288 100644 --- a/ffmpeg/libavcodec/x86/h264_idct_10bit.asm +++ b/ffmpeg/libavcodec/x86/h264_idct_10bit.asm @@ -5,20 +5,20 @@ ;* ;* Authors: Daniel Kang <daniel.d.kang@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/ffmpeg/libavcodec/x86/h264_intrapred.asm b/ffmpeg/libavcodec/x86/h264_intrapred.asm index 5c0dff4..3064ec5 100644 --- a/ffmpeg/libavcodec/x86/h264_intrapred.asm +++ b/ffmpeg/libavcodec/x86/h264_intrapred.asm @@ -2486,10 +2486,7 @@ cglobal pred4x4_tm_vp8_8, 3,3 pshufb mm3, mm6 pshufb mm4, mm6 pshufb mm5, mm6 - psubw mm2, mm7 - psubw mm3, mm7 - psubw mm4, mm7 - psubw mm5, mm7 + psubw mm0, mm7 paddw mm2, mm0 paddw mm3, mm0 paddw mm4, mm0 diff --git a/ffmpeg/libavcodec/x86/h264_intrapred_10bit.asm b/ffmpeg/libavcodec/x86/h264_intrapred_10bit.asm index db2b25c..54eaee5 100644 --- a/ffmpeg/libavcodec/x86/h264_intrapred_10bit.asm +++ b/ffmpeg/libavcodec/x86/h264_intrapred_10bit.asm @@ -5,20 +5,20 @@ ;* ;* Authors: Daniel Kang <daniel.d.kang@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/ffmpeg/libavcodec/x86/h264_intrapred_init.c b/ffmpeg/libavcodec/x86/h264_intrapred_init.c index f5b5e3e..ad2984b 100644 --- a/ffmpeg/libavcodec/x86/h264_intrapred_init.c +++ b/ffmpeg/libavcodec/x86/h264_intrapred_init.c @@ -185,10 +185,10 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth, const int chroma_format_idc) { - int mm_flags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); if (bit_depth == 8) { - if (EXTERNAL_MMX(mm_flags)) { + if (EXTERNAL_MMX(cpu_flags)) { h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_8_mmx; h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmx; if (chroma_format_idc == 1) { @@ -203,7 +203,7 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, if (chroma_format_idc == 1) h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmx; if (codec_id == AV_CODEC_ID_SVQ3) { - if (mm_flags & AV_CPU_FLAG_CMOV) + if (cpu_flags & AV_CPU_FLAG_CMOV) h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_mmx; } else if (codec_id == AV_CODEC_ID_RV40) { h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_mmx; @@ -213,7 +213,7 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, } } - if (EXTERNAL_MMXEXT(mm_flags)) { + if (EXTERNAL_MMXEXT(cpu_flags)) { h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmxext; h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_mmxext; if (chroma_format_idc == 1) @@ -265,11 +265,11 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, } } - if (EXTERNAL_SSE(mm_flags)) { + if (EXTERNAL_SSE(cpu_flags)) { h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_sse; } - if (EXTERNAL_SSE2(mm_flags)) { + if (EXTERNAL_SSE2(cpu_flags)) { h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_sse2; h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_sse2; h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_sse2; @@ -292,7 +292,7 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, } } - if (EXTERNAL_SSSE3(mm_flags)) { + if (EXTERNAL_SSSE3(cpu_flags)) { h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_ssse3; h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_ssse3; if (chroma_format_idc == 1) @@ -323,7 +323,7 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, } } } else if (bit_depth == 10) { - if (EXTERNAL_MMXEXT(mm_flags)) { + if (EXTERNAL_MMXEXT(cpu_flags)) { h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext; h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext; @@ -339,7 +339,7 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_mmxext; h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_mmxext; } - if (EXTERNAL_SSE2(mm_flags)) { + if (EXTERNAL_SSE2(cpu_flags)) { h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2; h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_sse2; h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_sse2; @@ -371,7 +371,7 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_sse2; h->pred16x16[HOR_PRED8x8 ] = 
ff_pred16x16_horizontal_10_sse2; } - if (EXTERNAL_SSSE3(mm_flags)) { + if (EXTERNAL_SSSE3(cpu_flags)) { h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_ssse3; h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3; h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_ssse3; @@ -382,7 +382,7 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_ssse3; h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_ssse3; } - if (EXTERNAL_AVX(mm_flags)) { + if (EXTERNAL_AVX(cpu_flags)) { h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx; h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx; h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_avx; diff --git a/ffmpeg/libavcodec/x86/h264_qpel.c b/ffmpeg/libavcodec/x86/h264_qpel.c index 96dec82..fd6068f 100644 --- a/ffmpeg/libavcodec/x86/h264_qpel.c +++ b/ffmpeg/libavcodec/x86/h264_qpel.c @@ -25,24 +25,13 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/h264qpel.h" #include "libavcodec/mpegvideo.h" -#include "dsputil_mmx.h" +#include "dsputil_x86.h" #if HAVE_YASM -void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h); -void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h); -void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h); -static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels, - int line_size, int h) -{ - ff_put_pixels8_mmxext(block, pixels, line_size, h); - ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h); -} -static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels, - int line_size, int h) -{ - ff_avg_pixels8_mmxext(block, pixels, line_size, h); - ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h); -} +void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); void ff_put_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h); void ff_avg_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, @@ -55,15 +44,14 @@ void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h); void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h); -void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, - int line_size, int h); -void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, - int line_size, int h); #define ff_put_pixels8_l2_sse2 ff_put_pixels8_l2_mmxext #define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext #define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext #define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext +PIXELS16(static, ff_avg, , , _mmxext) +PIXELS16(static, ff_put, , , _mmxext) + #define DEF_QPEL(OPNAME)\ void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\ void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\ @@ -209,7 +197,12 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ } -static av_always_inline void ff_put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int 
srcStride, int size){ +static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, + uint8_t *src, + int tmpStride, + int srcStride, + int size) +{ int w = (size+8)>>3; src -= 2*srcStride+2; while(w--){ @@ -221,7 +214,7 @@ static av_always_inline void ff_put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp #define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ - ff_put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ + put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ }\ static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ @@ -345,7 +338,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t * DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ + av_assert2(((int)temp & 7) == 0);\ ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ }\ @@ -355,7 +348,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t * DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ + av_assert2(((int)temp & 7) == 0);\ ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ }\ @@ -365,7 +358,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t * DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ + av_assert2(((int)temp & 7) == 0);\ ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ }\ @@ -375,7 +368,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t * DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ + av_assert2(((int)temp & 7) == 0);\ ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ }\ @@ -394,8 +387,6 @@ QPEL(put_, 16,XMM, 16)\ QPEL(avg_, 8, XMM, 16)\ QPEL(avg_, 16,XMM, 16)\ -#undef PAVGB -#define PAVGB "pavgb" QPEL_H264(put_, PUT_OP, mmxext) QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext) QPEL_H264_V_XMM(put_, PUT_OP, sse2) @@ -406,7 +397,6 @@ QPEL_H264_H_XMM(put_, PUT_OP, ssse3) QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3) QPEL_H264_HV_XMM(put_, PUT_OP, ssse3) QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3) -#undef PAVGB H264_MC_4816(mmxext) H264_MC_816(H264_MC_V, sse2) @@ -552,9 +542,9 @@ 
av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth) { #if HAVE_YASM int high_bit_depth = bit_depth > 8; - int mm_flags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_MMXEXT(mm_flags)) { + if (EXTERNAL_MMXEXT(cpu_flags)) { if (!high_bit_depth) { SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, ); SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, ); @@ -574,8 +564,8 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth) } } - if (EXTERNAL_SSE2(mm_flags)) { - if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW) && !high_bit_depth) { + if (EXTERNAL_SSE2(cpu_flags)) { + if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && !high_bit_depth) { // these functions are slower than mmx on AMD, but faster on Intel H264_QPEL_FUNCS(0, 0, sse2); } @@ -606,7 +596,7 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth) } } - if (EXTERNAL_SSSE3(mm_flags)) { + if (EXTERNAL_SSSE3(cpu_flags)) { if (!high_bit_depth) { H264_QPEL_FUNCS(1, 0, ssse3); H264_QPEL_FUNCS(1, 1, ssse3); @@ -629,7 +619,7 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth) } } - if (EXTERNAL_AVX(mm_flags)) { + if (EXTERNAL_AVX(cpu_flags)) { /* AVX implies 64 byte cache lines without the need to avoid unaligned * memory accesses that cross the boundary between two cache lines. * TODO: Port X264_CPU_CACHELINE_32/64 detection from x264 to avoid diff --git a/ffmpeg/libavcodec/x86/h264_qpel_10bit.asm b/ffmpeg/libavcodec/x86/h264_qpel_10bit.asm index e14df84..4561871 100644 --- a/ffmpeg/libavcodec/x86/h264_qpel_10bit.asm +++ b/ffmpeg/libavcodec/x86/h264_qpel_10bit.asm @@ -5,20 +5,20 @@ ;* ;* Authors: Daniel Kang <daniel.d.kang@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/ffmpeg/libavcodec/x86/h264_weight_10bit.asm b/ffmpeg/libavcodec/x86/h264_weight_10bit.asm index 3b09e42..b7845fd 100644 --- a/ffmpeg/libavcodec/x86/h264_weight_10bit.asm +++ b/ffmpeg/libavcodec/x86/h264_weight_10bit.asm @@ -5,20 +5,20 @@ ;* ;* Authors: Daniel Kang <daniel.d.kang@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. 
;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/ffmpeg/libavcodec/x86/h264chroma_init.c b/ffmpeg/libavcodec/x86/h264chroma_init.c index b5c078f..3d8d5b0 100644 --- a/ffmpeg/libavcodec/x86/h264chroma_init.c +++ b/ffmpeg/libavcodec/x86/h264chroma_init.c @@ -19,6 +19,7 @@ #include <stdint.h> #include "config.h" +#include "libavutil/attributes.h" #include "libavutil/cpu.h" #include "libavutil/x86/cpu.h" #include "libavcodec/h264chroma.h" @@ -66,49 +67,49 @@ CHROMA_MC(avg, 8, 10, sse2) CHROMA_MC(put, 8, 10, avx) CHROMA_MC(avg, 8, 10, avx) -void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth) +av_cold void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth) { #if HAVE_YASM int high_bit_depth = bit_depth > 8; - int mm_flags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_MMX(mm_flags) && !high_bit_depth) { + if (EXTERNAL_MMX(cpu_flags) && !high_bit_depth) { c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx; c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx; } - if (EXTERNAL_AMD3DNOW(mm_flags) && !high_bit_depth) { + if (EXTERNAL_AMD3DNOW(cpu_flags) && !high_bit_depth) { c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow; c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow; } - if (EXTERNAL_MMXEXT(mm_flags) && !high_bit_depth) { + if (EXTERNAL_MMXEXT(cpu_flags) && !high_bit_depth) { c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext; c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext; c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext; c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext; } - if (EXTERNAL_MMXEXT(mm_flags) && bit_depth > 8 && bit_depth <= 10) { + if (EXTERNAL_MMXEXT(cpu_flags) && bit_depth > 8 && bit_depth <= 10) { c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext; c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext; c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext; c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext; } - if (EXTERNAL_SSE2(mm_flags) && bit_depth > 8 && bit_depth <= 10) { + if (EXTERNAL_SSE2(cpu_flags) && bit_depth > 8 && bit_depth <= 10) { c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2; c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2; } - if (EXTERNAL_SSSE3(mm_flags) && !high_bit_depth) { + if (EXTERNAL_SSSE3(cpu_flags) && !high_bit_depth) { c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3; c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3; c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3; c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3; } - if (EXTERNAL_AVX(mm_flags) && bit_depth > 8 && bit_depth <= 10) { + if (EXTERNAL_AVX(cpu_flags) && bit_depth > 8 && bit_depth <= 10) { // AVX implies !cache64. 
// TODO: Port cache(32|64) detection from x264. c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx; diff --git a/ffmpeg/libavcodec/x86/h264dsp_init.c b/ffmpeg/libavcodec/x86/h264dsp_init.c index 11aae77..30801c4 100644 --- a/ffmpeg/libavcodec/x86/h264dsp_init.c +++ b/ffmpeg/libavcodec/x86/h264dsp_init.c @@ -23,7 +23,7 @@ #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" #include "libavcodec/h264dsp.h" -#include "dsputil_mmx.h" +#include "dsputil_x86.h" /***********************************/ /* IDCT */ @@ -132,8 +132,8 @@ LF_FUNCS(uint16_t, 10) #if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL LF_FUNC(v8, luma, 8, mmxext) -static void ff_deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0) +static void deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0) { if ((tc0[0] & tc0[1]) >= 0) ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0); @@ -141,8 +141,8 @@ static void ff_deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha, ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2); } LF_IFUNC(v8, luma_intra, 8, mmxext) -static void ff_deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride, - int alpha, int beta) +static void deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride, + int alpha, int beta) { ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta); ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta); @@ -212,13 +212,13 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) { #if HAVE_YASM - int mm_flags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); - if (chroma_format_idc == 1 && EXTERNAL_MMXEXT(mm_flags)) + if (chroma_format_idc == 1 && EXTERNAL_MMXEXT(cpu_flags)) c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmxext; if (bit_depth == 8) { - if (EXTERNAL_MMX(mm_flags)) { + if (EXTERNAL_MMX(cpu_flags)) { c->h264_idct_dc_add = c->h264_idct_add = ff_h264_idct_add_8_mmx; c->h264_idct8_dc_add = @@ -229,146 +229,142 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, if (chroma_format_idc == 1) c->h264_idct_add8 = ff_h264_idct_add8_8_mmx; c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx; - if (mm_flags & AV_CPU_FLAG_CMOV) + if (cpu_flags & AV_CPU_FLAG_CMOV) c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx; - - if (EXTERNAL_MMXEXT(mm_flags)) { - c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmxext; - c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext; - c->h264_idct_add16 = ff_h264_idct_add16_8_mmxext; - c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmxext; - if (chroma_format_idc == 1) - c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext; - c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext; - - c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_mmxext; - c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext; - if (chroma_format_idc == 1) { - c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmxext; - c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext; - } + } + if (EXTERNAL_MMXEXT(cpu_flags)) { + c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmxext; + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext; + c->h264_idct_add16 = ff_h264_idct_add16_8_mmxext; + c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmxext; + if (chroma_format_idc == 1) + c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext; + c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext; + + c->h264_v_loop_filter_chroma = 
ff_deblock_v_chroma_8_mmxext; + c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext; + if (chroma_format_idc == 1) { + c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmxext; + c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext; + } #if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL - c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_mmxext; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmxext; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext; + c->h264_v_loop_filter_luma = deblock_v_luma_8_mmxext; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmxext; + c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext; #endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */ - c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext; - c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext; - c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext; - - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext; - c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext; - - if (EXTERNAL_SSE2(mm_flags)) { - c->h264_idct8_add = ff_h264_idct8_add_8_sse2; - - c->h264_idct_add16 = ff_h264_idct_add16_8_sse2; - c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2; - if (chroma_format_idc == 1) - c->h264_idct_add8 = ff_h264_idct_add8_8_sse2; - c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2; - c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2; - - c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2; - c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2; - - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2; - - c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2; - } - if (EXTERNAL_SSSE3(mm_flags)) { - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3; - } - if (EXTERNAL_AVX(mm_flags)) { - c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx; - } - } + c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext; + c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext; + c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext; + + c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext; + c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext; + c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext; + } + if (EXTERNAL_SSE2(cpu_flags)) { + c->h264_idct8_add = ff_h264_idct8_add_8_sse2; + + c->h264_idct_add16 = ff_h264_idct_add16_8_sse2; + c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2; + if (chroma_format_idc == 1) + c->h264_idct_add8 = ff_h264_idct_add8_8_sse2; + c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2; + c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2; + + c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2; + c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2; + + c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2; + c->biweight_h264_pixels_tab[1] = 
ff_h264_biweight_8_sse2; + + c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2; + } + if (EXTERNAL_SSSE3(cpu_flags)) { + c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3; + c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3; + } + if (EXTERNAL_AVX(cpu_flags)) { + c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx; } } else if (bit_depth == 10) { - if (EXTERNAL_MMX(mm_flags)) { - if (EXTERNAL_MMXEXT(mm_flags)) { + if (EXTERNAL_MMXEXT(cpu_flags)) { #if ARCH_X86_32 - c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext; - c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext; - c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext; + c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext; + c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext; + c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext; #endif /* ARCH_X86_32 */ - c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext; - if (EXTERNAL_SSE2(mm_flags)) { - c->h264_idct_add = ff_h264_idct_add_10_sse2; - c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2; - - c->h264_idct_add16 = ff_h264_idct_add16_10_sse2; - if (chroma_format_idc == 1) - c->h264_idct_add8 = ff_h264_idct_add8_10_sse2; - c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2; + c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext; + } + if (EXTERNAL_SSE2(cpu_flags)) { + c->h264_idct_add = ff_h264_idct_add_10_sse2; + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2; + + c->h264_idct_add16 = ff_h264_idct_add16_10_sse2; + if (chroma_format_idc == 1) + c->h264_idct_add8 = ff_h264_idct_add8_10_sse2; + c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2; #if HAVE_ALIGNED_STACK - c->h264_idct8_add = ff_h264_idct8_add_10_sse2; - c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2; + c->h264_idct8_add = ff_h264_idct8_add_10_sse2; + c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2; #endif /* HAVE_ALIGNED_STACK */ - c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2; - c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2; - c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2; + c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2; + c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2; + c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2; - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2; - c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2; + c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2; + c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2; + c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2; - c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_sse2; - 
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2; + c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_sse2; + c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2; #if HAVE_ALIGNED_STACK - c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2; + c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2; #endif /* HAVE_ALIGNED_STACK */ - } - if (EXTERNAL_SSE4(mm_flags)) { - c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4; - c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4; - c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4; - - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4; - c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4; - } - if (EXTERNAL_AVX(mm_flags)) { - c->h264_idct_dc_add = - c->h264_idct_add = ff_h264_idct_add_10_avx; - c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx; - - c->h264_idct_add16 = ff_h264_idct_add16_10_avx; - if (chroma_format_idc == 1) - c->h264_idct_add8 = ff_h264_idct_add8_10_avx; - c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx; + } + if (EXTERNAL_SSE4(cpu_flags)) { + c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4; + c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4; + c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4; + + c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4; + c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4; + c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4; + } + if (EXTERNAL_AVX(cpu_flags)) { + c->h264_idct_dc_add = + c->h264_idct_add = ff_h264_idct_add_10_avx; + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx; + + c->h264_idct_add16 = ff_h264_idct_add16_10_avx; + if (chroma_format_idc == 1) + c->h264_idct_add8 = ff_h264_idct_add8_10_avx; + c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx; #if HAVE_ALIGNED_STACK - c->h264_idct8_add = ff_h264_idct8_add_10_avx; - c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx; + c->h264_idct8_add = ff_h264_idct8_add_10_avx; + c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx; #endif /* HAVE_ALIGNED_STACK */ - c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_avx; - c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx; + c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_avx; + c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx; #if HAVE_ALIGNED_STACK - c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx; + c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx; #endif /* HAVE_ALIGNED_STACK */ - } - } } } #endif diff --git a/ffmpeg/libavcodec/x86/hpeldsp.asm b/ffmpeg/libavcodec/x86/hpeldsp.asm index 1a572a3..4eaba6e 100644 --- a/ffmpeg/libavcodec/x86/hpeldsp.asm +++ 
b/ffmpeg/libavcodec/x86/hpeldsp.asm @@ -423,30 +423,30 @@ cglobal avg_pixels8_xy2, 4,5 mova m6, [pb_1] lea r4, [r2*2] mova m0, [r1] - pavgb m0, [r1+1] + PAVGB m0, [r1+1] .loop: mova m2, [r1+r4] mova m1, [r1+r2] psubusb m2, m6 - pavgb m1, [r1+r2+1] - pavgb m2, [r1+r4+1] + PAVGB m1, [r1+r2+1] + PAVGB m2, [r1+r4+1] add r1, r4 - pavgb m0, m1 - pavgb m1, m2 - pavgb m0, [r0] - pavgb m1, [r0+r2] + PAVGB m0, m1 + PAVGB m1, m2 + PAVGB m0, [r0] + PAVGB m1, [r0+r2] mova [r0], m0 mova [r0+r2], m1 mova m1, [r1+r2] mova m0, [r1+r4] - pavgb m1, [r1+r2+1] - pavgb m0, [r1+r4+1] + PAVGB m1, [r1+r2+1] + PAVGB m0, [r1+r4+1] add r0, r4 add r1, r4 - pavgb m2, m1 - pavgb m1, m0 - pavgb m2, [r0] - pavgb m1, [r0+r2] + PAVGB m2, m1 + PAVGB m1, m0 + PAVGB m2, [r0] + PAVGB m1, [r0+r2] mova [r0], m2 mova [r0+r2], m1 add r0, r4 diff --git a/ffmpeg/libavcodec/x86/hpeldsp_avg_template.c b/ffmpeg/libavcodec/x86/hpeldsp_avg_template.c deleted file mode 100644 index b9a8f83..0000000 --- a/ffmpeg/libavcodec/x86/hpeldsp_avg_template.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * DSP utils : average functions are compiled twice for 3dnow/mmxext - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2002-2004 Michael Niedermayer - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> - * and improved by Zdenek Kabelac <kabi@users.sf.net> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -//FIXME the following could be optimized too ... 
-static void DEF(ff_put_no_rnd_pixels16_x2)(uint8_t *block, - const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - DEF(ff_put_no_rnd_pixels8_x2)(block, pixels, line_size, h); - DEF(ff_put_no_rnd_pixels8_x2)(block + 8, pixels + 8, line_size, h); -} - -static void DEF(ff_put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - DEF(ff_put_pixels8_y2)(block, pixels, line_size, h); - DEF(ff_put_pixels8_y2)(block + 8, pixels + 8, line_size, h); -} - -static void DEF(ff_put_no_rnd_pixels16_y2)(uint8_t *block, - const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - DEF(ff_put_no_rnd_pixels8_y2)(block, pixels, line_size, h); - DEF(ff_put_no_rnd_pixels8_y2)(block + 8, pixels + 8, line_size, h); -} - -static void DEF(ff_avg_pixels16)(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - DEF(ff_avg_pixels8)(block, pixels, line_size, h); - DEF(ff_avg_pixels8)(block + 8, pixels + 8, line_size, h); -} - -static void DEF(ff_avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - DEF(ff_avg_pixels8_x2)(block, pixels, line_size, h); - DEF(ff_avg_pixels8_x2)(block + 8, pixels + 8, line_size, h); -} - -static void DEF(ff_avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - DEF(ff_avg_pixels8_y2)(block, pixels, line_size, h); - DEF(ff_avg_pixels8_y2)(block + 8, pixels + 8, line_size, h); -} - -static void DEF(ff_avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - DEF(ff_avg_pixels8_xy2)(block, pixels, line_size, h); - DEF(ff_avg_pixels8_xy2)(block + 8, pixels + 8, line_size, h); -} diff --git a/ffmpeg/libavcodec/x86/hpeldsp_init.c b/ffmpeg/libavcodec/x86/hpeldsp_init.c index 4b877b8..8ecf909 100644 --- a/ffmpeg/libavcodec/x86/hpeldsp_init.c +++ b/ffmpeg/libavcodec/x86/hpeldsp_init.c @@ -24,13 +24,10 @@ #include "libavutil/cpu.h" #include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" #include "libavcodec/hpeldsp.h" -#include "dsputil_mmx.h" +#include "dsputil_x86.h" -//#undef NDEBUG -//#include <assert.h> - -#if HAVE_YASM void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, @@ -77,103 +74,45 @@ void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -#endif /* HAVE_YASM */ +#define avg_pixels8_mmx ff_avg_pixels8_mmx +#define avg_pixels8_x2_mmx ff_avg_pixels8_x2_mmx +#define avg_pixels16_mmx ff_avg_pixels16_mmx +#define avg_pixels8_xy2_mmx ff_avg_pixels8_xy2_mmx +#define avg_pixels16_xy2_mmx ff_avg_pixels16_xy2_mmx +#define put_pixels8_mmx ff_put_pixels8_mmx +#define put_pixels16_mmx ff_put_pixels16_mmx +#define put_pixels8_xy2_mmx ff_put_pixels8_xy2_mmx +#define put_pixels16_xy2_mmx ff_put_pixels16_xy2_mmx +#define avg_no_rnd_pixels16_mmx ff_avg_pixels16_mmx +#define put_no_rnd_pixels8_mmx ff_put_pixels8_mmx +#define put_no_rnd_pixels16_mmx ff_put_pixels16_mmx #if HAVE_INLINE_ASM -#define JUMPALIGN() __asm__ volatile (".p2align 3"::) -#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::) - -#define MOVQ_BFE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%"#regd", %%"#regd" \n\t" \ - "paddb %%"#regd", %%"#regd" \n\t" ::) - -#ifndef PIC -#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone)) -#define MOVQ_WTWO(regd) __asm__ volatile ("movq 
%0, %%"#regd" \n\t" :: "m"(ff_wtwo)) -#else -// for shared library it's better to use this way for accessing constants -// pcmpeqd -> -1 -#define MOVQ_BONE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%"#regd", %%"#regd" \n\t" \ - "psrlw $15, %%"#regd" \n\t" \ - "packuswb %%"#regd", %%"#regd" \n\t" ::) - -#define MOVQ_WTWO(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%"#regd", %%"#regd" \n\t" \ - "psrlw $15, %%"#regd" \n\t" \ - "psllw $1, %%"#regd" \n\t"::) - -#endif - -// using regr as temporary and for the output result -// first argument is unmodifed and second is trashed -// regfe is supposed to contain 0xfefefefefefefefe -#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ - "movq "#rega", "#regr" \n\t" \ - "pand "#regb", "#regr" \n\t" \ - "pxor "#rega", "#regb" \n\t" \ - "pand "#regfe", "#regb" \n\t" \ - "psrlq $1, "#regb" \n\t" \ - "paddb "#regb", "#regr" \n\t" - -#define PAVGB_MMX(rega, regb, regr, regfe) \ - "movq "#rega", "#regr" \n\t" \ - "por "#regb", "#regr" \n\t" \ - "pxor "#rega", "#regb" \n\t" \ - "pand "#regfe", "#regb" \n\t" \ - "psrlq $1, "#regb" \n\t" \ - "psubb "#regb", "#regr" \n\t" - -// mm6 is supposed to contain 0xfefefefefefefefe -#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ - "movq "#rega", "#regr" \n\t" \ - "movq "#regc", "#regp" \n\t" \ - "pand "#regb", "#regr" \n\t" \ - "pand "#regd", "#regp" \n\t" \ - "pxor "#rega", "#regb" \n\t" \ - "pxor "#regc", "#regd" \n\t" \ - "pand %%mm6, "#regb" \n\t" \ - "pand %%mm6, "#regd" \n\t" \ - "psrlq $1, "#regb" \n\t" \ - "psrlq $1, "#regd" \n\t" \ - "paddb "#regb", "#regr" \n\t" \ - "paddb "#regd", "#regp" \n\t" - -#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ - "movq "#rega", "#regr" \n\t" \ - "movq "#regc", "#regp" \n\t" \ - "por "#regb", "#regr" \n\t" \ - "por "#regd", "#regp" \n\t" \ - "pxor "#rega", "#regb" \n\t" \ - "pxor "#regc", "#regd" \n\t" \ - "pand %%mm6, "#regb" \n\t" \ - "pand %%mm6, "#regd" \n\t" \ - "psrlq $1, "#regd" \n\t" \ - "psrlq $1, "#regb" \n\t" \ - "psubb "#regb", "#regr" \n\t" \ - "psubb "#regd", "#regp" \n\t" - /***********************************/ /* MMX no rounding */ -#define NO_RND 1 #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx #define SET_RND MOVQ_WONE #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) -#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e) +#define STATIC static +#include "rnd_template.c" #include "hpeldsp_rnd_template.c" #undef DEF #undef SET_RND #undef PAVGBP #undef PAVGB -#undef NO_RND +#undef STATIC + +PIXELS16(static, avg_no_rnd, , _y2, _mmx) +PIXELS16(static, put_no_rnd, , _y2, _mmx) + +PIXELS16(static, avg_no_rnd, , _xy2, _mmx) +PIXELS16(static, put_no_rnd, , _xy2, _mmx) + /***********************************/ /* MMX rounding */ @@ -188,112 +127,29 @@ void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels, #undef SET_RND #undef PAVGBP #undef PAVGB -#undef OP_AVG + +PIXELS16(static, avg, , _y2, _mmx) +PIXELS16(static, put, , _y2, _mmx) #endif /* HAVE_INLINE_ASM */ #if HAVE_YASM -#define ff_put_pixels8_mmx ff_put_pixels8_mmxext - -/***********************************/ -/* 3Dnow specific */ -#define DEF(x) x ## _3dnow +#define HPELDSP_AVG_PIXELS16(CPUEXT) \ + PIXELS16(static, put_no_rnd, ff_, _x2, CPUEXT) \ + PIXELS16(static, put, ff_, _y2, CPUEXT) \ + PIXELS16(static, put_no_rnd, ff_, _y2, CPUEXT) \ + PIXELS16(static, avg, ff_, , CPUEXT) \ + PIXELS16(static, avg, ff_, _x2, CPUEXT) \ + PIXELS16(static, avg, ff_, _y2, CPUEXT) \ + PIXELS16(static, avg, ff_, _xy2, CPUEXT) 
-#include "hpeldsp_avg_template.c" - -#undef DEF - -/***********************************/ -/* MMXEXT specific */ - -#define DEF(x) x ## _mmxext - -#include "hpeldsp_avg_template.c" - -#undef DEF +HPELDSP_AVG_PIXELS16(_3dnow) +HPELDSP_AVG_PIXELS16(_mmxext) #endif /* HAVE_YASM */ - -#if HAVE_INLINE_ASM -#define put_no_rnd_pixels16_mmx put_pixels16_mmx -#define put_no_rnd_pixels8_mmx put_pixels8_mmx -#define put_pixels16_mmxext put_pixels16_mmx -#define put_pixels8_mmxext put_pixels8_mmx -#define put_pixels4_mmxext put_pixels4_mmx -#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx -#define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx - -static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - __asm__ volatile ( - "lea (%3, %3), %%"REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r"(pixels), "+r"(block) - : "r"((x86_reg)line_size) - : "%"REG_a, "memory" - ); -} - -static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - __asm__ volatile ( - "lea (%3, %3), %%"REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq 8(%1 ), %%mm4 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1, %3), %%mm5 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm4, 8(%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq 8(%1 ), %%mm4 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1, %3), %%mm5 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm4, 8(%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r"(pixels), "+r"(block) - : "r"((x86_reg)line_size) - : "%"REG_a, "memory" - ); -} -#endif /* HAVE_INLINE_ASM */ - -void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); - #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ do { \ c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ @@ -302,9 +158,9 @@ void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \ } while (0) -static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int mm_flags) +static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int cpu_flags) { -#if HAVE_INLINE_ASM +#if HAVE_MMX_INLINE SET_HPEL_FUNCS(put, [0], 16, mmx); SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx); SET_HPEL_FUNCS(avg, [0], 16, mmx); @@ -312,18 +168,18 @@ static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int mm_flags) SET_HPEL_FUNCS(put, [1], 8, mmx); SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx); SET_HPEL_FUNCS(avg, [1], 8, mmx); -#endif /* HAVE_INLINE_ASM */ +#endif /* HAVE_MMX_INLINE */ } -static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int mm_flags) +static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags) { -#if HAVE_YASM +#if HAVE_MMXEXT_EXTERNAL c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext; - 
c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext; + c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext; - c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext; - c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext; - c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext; + c->avg_pixels_tab[0][0] = avg_pixels16_mmxext; + c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext; + c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext; c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext; c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext; @@ -333,17 +189,15 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int mm_flags) c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext; if (!(flags & CODEC_FLAG_BITEXACT)) { - c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext; - c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext; + c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext; + c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext; c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext; c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext; - c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext; + c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext; c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext; } -#endif /* HAVE_YASM */ -#if HAVE_MMXEXT_EXTERNAL if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) { c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext; c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext; @@ -351,15 +205,15 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int mm_flags) #endif /* HAVE_MMXEXT_EXTERNAL */ } -static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int mm_flags) +static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags) { -#if HAVE_YASM +#if HAVE_AMD3DNOW_EXTERNAL c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow; - c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow; + c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; - c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow; - c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow; - c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow; + c->avg_pixels_tab[0][0] = avg_pixels16_3dnow; + c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; + c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow; c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow; @@ -369,12 +223,12 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int mm_flags) c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow; if (!(flags & CODEC_FLAG_BITEXACT)){ - c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow; - c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow; + c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; + c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow; c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow; c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow; - c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow; + c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow; } @@ -382,13 +236,13 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int mm_flags) c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow; c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow; } -#endif /* HAVE_YASM */ +#endif /* HAVE_AMD3DNOW_EXTERNAL */ } -static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int mm_flags) 
+static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags) { #if HAVE_SSE2_EXTERNAL - if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) { + if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) { // these functions are slower than mmx on AMD, but faster on Intel c->put_pixels_tab[0][0] = ff_put_pixels16_sse2; c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2; @@ -399,17 +253,17 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int mm_flags) void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags) { - int mm_flags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); - if (mm_flags & AV_CPU_FLAG_MMX) - hpeldsp_init_mmx(c, flags, mm_flags); + if (INLINE_MMX(cpu_flags)) + hpeldsp_init_mmx(c, flags, cpu_flags); - if (mm_flags & AV_CPU_FLAG_MMXEXT) - hpeldsp_init_mmxext(c, flags, mm_flags); + if (EXTERNAL_AMD3DNOW(cpu_flags)) + hpeldsp_init_3dnow(c, flags, cpu_flags); - if (mm_flags & AV_CPU_FLAG_3DNOW) - hpeldsp_init_3dnow(c, flags, mm_flags); + if (EXTERNAL_MMXEXT(cpu_flags)) + hpeldsp_init_mmxext(c, flags, cpu_flags); - if (mm_flags & AV_CPU_FLAG_SSE2) - hpeldsp_init_sse2(c, flags, mm_flags); + if (EXTERNAL_SSE2(cpu_flags)) + hpeldsp_init_sse2(c, flags, cpu_flags); } diff --git a/ffmpeg/libavcodec/x86/hpeldsp_rnd_template.c b/ffmpeg/libavcodec/x86/hpeldsp_rnd_template.c index 07de675..94e06d8 100644 --- a/ffmpeg/libavcodec/x86/hpeldsp_rnd_template.c +++ b/ffmpeg/libavcodec/x86/hpeldsp_rnd_template.c @@ -132,140 +132,6 @@ static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_ :REG_a, "memory"); } -static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_ZERO(mm7); - SET_RND(mm6); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm4 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "xor %%"REG_a", %%"REG_a" \n\t" - "add %3, %1 \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddusw %%mm2, %%mm0 \n\t" - "paddusw %%mm3, %%mm1 \n\t" - "paddusw %%mm6, %%mm4 \n\t" - "paddusw %%mm6, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "psrlw $2, %%mm4 \n\t" - "psrlw $2, %%mm5 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "movq %%mm4, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 - "movq 1(%1, %%"REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm3, %%mm5 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm6, %%mm1 \n\t" - "paddusw %%mm4, %%mm0 \n\t" - "paddusw %%mm5, %%mm1 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "movq %%mm0, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "subl $2, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels) - :"D"(block), "r"((x86_reg)line_size) - :REG_a, "memory"); -} - -// avg_pixels -#ifndef NO_RND -// in case more speed is needed - unroling would certainly help 
-static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %0, %%mm0 \n\t" - "movq %1, %%mm1 \n\t" - OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, %0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } - while (--h); -} -#endif // NO_RND - -static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %0, %%mm0 \n\t" - "movq %1, %%mm1 \n\t" - OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, %0 \n\t" - "movq 8%0, %%mm0 \n\t" - "movq 8%1, %%mm1 \n\t" - OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, 8%0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } - while (--h); -} - -#ifndef NO_RND -static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %1, %%mm0 \n\t" - "movq 1%1, %%mm1 \n\t" - "movq %0, %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, %0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } while (--h); -} -#endif // NO_RND - static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) { MOVQ_BFE(mm6); @@ -276,13 +142,13 @@ static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff "movq 1%1, %%mm1 \n\t" "movq %0, %%mm3 \n\t" PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) + PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) "movq %%mm0, %0 \n\t" "movq 8%1, %%mm0 \n\t" "movq 9%1, %%mm1 \n\t" "movq 8%0, %%mm3 \n\t" PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) + PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) "movq %%mm0, 8%0 \n\t" :"+m"(*block) :"m"(*pixels) @@ -304,9 +170,9 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_ "movq (%1, %%"REG_a"), %%mm2 \n\t" PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) "movq (%2), %%mm3 \n\t" - OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6) + PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6) "movq (%2, %3), %%mm3 \n\t" - OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6) + PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6) "movq %%mm0, (%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "add %%"REG_a", %1 \n\t" @@ -316,9 +182,9 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_ "movq (%1, %%"REG_a"), %%mm0 \n\t" PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) "movq (%2), %%mm3 \n\t" - OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6) + PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6) "movq (%2, %3), %%mm3 \n\t" - OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6) + PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6) "movq %%mm2, (%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "add %%"REG_a", %1 \n\t" @@ -330,99 +196,3 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_ :"r"((x86_reg)line_size) :REG_a, "memory"); } - -// this routine is 'slightly' suboptimal but mostly unused -static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_ZERO(mm7); - SET_RND(mm6); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm4 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm1 
\n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "xor %%"REG_a", %%"REG_a" \n\t" - "add %3, %1 \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddusw %%mm2, %%mm0 \n\t" - "paddusw %%mm3, %%mm1 \n\t" - "paddusw %%mm6, %%mm4 \n\t" - "paddusw %%mm6, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "psrlw $2, %%mm4 \n\t" - "psrlw $2, %%mm5 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "pcmpeqd %%mm2, %%mm2 \n\t" - "paddb %%mm2, %%mm2 \n\t" - OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2) - "movq %%mm5, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 - "movq 1(%1, %%"REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm3, %%mm5 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm6, %%mm1 \n\t" - "paddusw %%mm4, %%mm0 \n\t" - "paddusw %%mm5, %%mm1 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "pcmpeqd %%mm2, %%mm2 \n\t" - "paddb %%mm2, %%mm2 \n\t" - OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2) - "movq %%mm1, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "subl $2, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels) - :"D"(block), "r"((x86_reg)line_size) - :REG_a, "memory"); -} - -//FIXME optimize -static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){ - DEF(put, pixels8_y2)(block , pixels , line_size, h); - DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); -} - -static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){ - DEF(put, pixels8_xy2)(block , pixels , line_size, h); - DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h); -} - -static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){ - DEF(avg, pixels8_y2)(block , pixels , line_size, h); - DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h); -} - -static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){ - DEF(avg, pixels8_xy2)(block , pixels , line_size, h); - DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h); -} diff --git a/ffmpeg/libavcodec/x86/idct_mmx_xvid.c b/ffmpeg/libavcodec/x86/idct_mmx_xvid.c index 5e9f405..4cd6de1 100644 --- a/ffmpeg/libavcodec/x86/idct_mmx_xvid.c +++ b/ffmpeg/libavcodec/x86/idct_mmx_xvid.c @@ -44,10 +44,10 @@ #include "config.h" #include "libavcodec/avcodec.h" #include "libavutil/mem.h" -#include "dsputil_mmx.h" +#include "dsputil_x86.h" #include "idct_xvid.h" -#if HAVE_INLINE_ASM +#if HAVE_MMX_INLINE //============================================================================= // Macros and other preprocessor constants @@ -507,6 +507,22 @@ __asm__ volatile( :: "r"(block), "r"(rounder_0), "r"(tab_i_04_mmx), "r"(tg_1_16)); } +void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, int16_t *block) +{ + ff_idct_xvid_mmx(block); + ff_put_pixels_clamped_mmx(block, dest, line_size); +} + +void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, int16_t *block) +{ 
+ ff_idct_xvid_mmx(block); + ff_add_pixels_clamped_mmx(block, dest, line_size); +} + +#endif /* HAVE_MMX_INLINE */ + +#if HAVE_MMXEXT_INLINE + //----------------------------------------------------------------------------- // void idct_xmm(uint16_t block[64]); //----------------------------------------------------------------------------- @@ -531,18 +547,6 @@ __asm__ volatile( :: "r"(block), "r"(rounder_0), "r"(tab_i_04_xmm), "r"(tg_1_16)); } -void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, int16_t *block) -{ - ff_idct_xvid_mmx(block); - ff_put_pixels_clamped_mmx(block, dest, line_size); -} - -void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, int16_t *block) -{ - ff_idct_xvid_mmx(block); - ff_add_pixels_clamped_mmx(block, dest, line_size); -} - void ff_idct_xvid_mmxext_put(uint8_t *dest, int line_size, int16_t *block) { ff_idct_xvid_mmxext(block); @@ -555,4 +559,4 @@ void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, int16_t *block) ff_add_pixels_clamped_mmx(block, dest, line_size); } -#endif /* HAVE_INLINE_ASM */ +#endif /* HAVE_MMXEXT_INLINE */ diff --git a/ffmpeg/libavcodec/x86/idct_sse2_xvid.c b/ffmpeg/libavcodec/x86/idct_sse2_xvid.c index b51466c..af4790c 100644 --- a/ffmpeg/libavcodec/x86/idct_sse2_xvid.c +++ b/ffmpeg/libavcodec/x86/idct_sse2_xvid.c @@ -41,9 +41,9 @@ #include "libavutil/mem.h" #include "libavutil/x86/asm.h" #include "idct_xvid.h" -#include "dsputil_mmx.h" +#include "dsputil_x86.h" -#if HAVE_INLINE_ASM +#if HAVE_SSE2_INLINE /** * @file @@ -404,4 +404,4 @@ void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block) ff_add_pixels_clamped_mmx(block, dest, line_size); } -#endif /* HAVE_INLINE_ASM */ +#endif /* HAVE_SSE2_INLINE */ diff --git a/ffmpeg/libavcodec/x86/lpc.c b/ffmpeg/libavcodec/x86/lpc.c index 1962212..8a74755 100644 --- a/ffmpeg/libavcodec/x86/lpc.c +++ b/ffmpeg/libavcodec/x86/lpc.c @@ -19,11 +19,16 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "libavutil/x86/asm.h" #include "libavutil/attributes.h" #include "libavutil/cpu.h" +#include "libavutil/mem.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" #include "libavcodec/lpc.h" +DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 }; +DECLARE_ASM_CONST(16, double, pd_2)[2] = { 2.0, 2.0 }; + #if HAVE_SSE2_INLINE static void lpc_apply_welch_window_sse2(const int32_t *data, int len, @@ -35,8 +40,8 @@ static void lpc_apply_welch_window_sse2(const int32_t *data, int len, x86_reg j = n2*sizeof(int32_t); __asm__ volatile( "movsd %4, %%xmm7 \n\t" - "movapd "MANGLE(ff_pd_1)", %%xmm6 \n\t" - "movapd "MANGLE(ff_pd_2)", %%xmm5 \n\t" + "movapd "MANGLE(pd_1)", %%xmm6 \n\t" + "movapd "MANGLE(pd_2)", %%xmm5 \n\t" "movlhps %%xmm7, %%xmm7 \n\t" "subpd %%xmm5, %%xmm7 \n\t" "addsd %%xmm6, %%xmm7 \n\t" @@ -85,9 +90,9 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag, x86_reg i = -len*sizeof(double); if(j == lag-2) { __asm__ volatile( - "movsd "MANGLE(ff_pd_1)", %%xmm0 \n\t" - "movsd "MANGLE(ff_pd_1)", %%xmm1 \n\t" - "movsd "MANGLE(ff_pd_1)", %%xmm2 \n\t" + "movsd "MANGLE(pd_1)", %%xmm0 \n\t" + "movsd "MANGLE(pd_1)", %%xmm1 \n\t" + "movsd "MANGLE(pd_1)", %%xmm2 \n\t" "1: \n\t" "movapd (%2,%0), %%xmm3 \n\t" "movupd -8(%3,%0), %%xmm4 \n\t" @@ -115,8 +120,8 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag, ); } else { __asm__ volatile( - "movsd "MANGLE(ff_pd_1)", %%xmm0 \n\t" - "movsd "MANGLE(ff_pd_1)", %%xmm1 \n\t" + "movsd "MANGLE(pd_1)", %%xmm0 \n\t" + "movsd "MANGLE(pd_1)", 
%%xmm1 \n\t" "1: \n\t" "movapd (%3,%0), %%xmm3 \n\t" "movupd -8(%4,%0), %%xmm4 \n\t" @@ -144,9 +149,9 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag, av_cold void ff_lpc_init_x86(LPCContext *c) { #if HAVE_SSE2_INLINE - int mm_flags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); - if (mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) { + if (HAVE_SSE2_INLINE && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) { c->lpc_apply_welch_window = lpc_apply_welch_window_sse2; c->lpc_compute_autocorr = lpc_compute_autocorr_sse2; } diff --git a/ffmpeg/libavcodec/x86/mathops.h b/ffmpeg/libavcodec/x86/mathops.h index 79e29e6..9c48afe 100644 --- a/ffmpeg/libavcodec/x86/mathops.h +++ b/ffmpeg/libavcodec/x86/mathops.h @@ -68,13 +68,13 @@ static av_always_inline av_const int64_t MUL64(int a, int b) #endif /* ARCH_X86_32 */ -#if HAVE_CMOV +#if HAVE_I686 /* median of 3 */ #define mid_pred mid_pred static inline av_const int mid_pred(int a, int b, int c) { int i=b; - __asm__ volatile( + __asm__ ( "cmp %2, %1 \n\t" "cmovg %1, %0 \n\t" "cmovg %2, %1 \n\t" @@ -87,9 +87,7 @@ static inline av_const int mid_pred(int a, int b, int c) ); return i; } -#endif -#if HAVE_CMOV #define COPY3_IF_LT(x, y, a, b, c, d)\ __asm__ volatile(\ "cmpl %0, %3 \n\t"\ @@ -99,7 +97,7 @@ __asm__ volatile(\ : "+&r" (x), "+&r" (a), "+r" (c)\ : "r" (y), "r" (b), "r" (d)\ ); -#endif +#endif /* HAVE_I686 */ #define MASK_ABS(mask, level) \ __asm__ ("cltd \n\t" \ diff --git a/ffmpeg/libavcodec/x86/mlpdsp.c b/ffmpeg/libavcodec/x86/mlpdsp.c index 81cab5a..94849b7 100644 --- a/ffmpeg/libavcodec/x86/mlpdsp.c +++ b/ffmpeg/libavcodec/x86/mlpdsp.c @@ -20,7 +20,9 @@ */ #include "libavutil/attributes.h" +#include "libavutil/cpu.h" #include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" #include "libavcodec/mlpdsp.h" #include "libavcodec/mlp.h" @@ -177,6 +179,8 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff, av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c) { #if HAVE_7REGS && HAVE_INLINE_ASM - c->mlp_filter_channel = mlp_filter_channel_x86; + int cpu_flags = av_get_cpu_flags(); + if (INLINE_MMX(cpu_flags)) + c->mlp_filter_channel = mlp_filter_channel_x86; #endif } diff --git a/ffmpeg/libavcodec/x86/motion_est.c b/ffmpeg/libavcodec/x86/motion_est.c index 3ffb002..5f5d93e 100644 --- a/ffmpeg/libavcodec/x86/motion_est.c +++ b/ffmpeg/libavcodec/x86/motion_est.c @@ -26,7 +26,8 @@ #include "libavutil/avassert.h" #include "libavutil/mem.h" #include "libavutil/x86/asm.h" -#include "dsputil_mmx.h" +#include "libavutil/x86/cpu.h" +#include "dsputil_x86.h" #if HAVE_INLINE_ASM @@ -435,9 +436,9 @@ PIX_SAD(mmxext) av_cold void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx) { #if HAVE_INLINE_ASM - int mm_flags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); - if (mm_flags & AV_CPU_FLAG_MMX) { + if (INLINE_MMX(cpu_flags)) { c->pix_abs[0][0] = sad16_mmx; c->pix_abs[0][1] = sad16_x2_mmx; c->pix_abs[0][2] = sad16_y2_mmx; @@ -450,7 +451,7 @@ av_cold void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx) c->sad[0]= sad16_mmx; c->sad[1]= sad8_mmx; } - if (mm_flags & AV_CPU_FLAG_MMXEXT) { + if (INLINE_MMXEXT(cpu_flags)) { c->pix_abs[0][0] = sad16_mmxext; c->pix_abs[1][0] = sad8_mmxext; @@ -466,7 +467,7 @@ av_cold void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx) c->pix_abs[1][3] = sad8_xy2_mmxext; } } - if ((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW) && avctx->codec_id != AV_CODEC_ID_SNOW) { + if 
(INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW) && avctx->codec_id != AV_CODEC_ID_SNOW) { c->sad[0]= sad16_sse2; } #endif /* HAVE_INLINE_ASM */ diff --git a/ffmpeg/libavcodec/x86/mpegaudiodec.c b/ffmpeg/libavcodec/x86/mpegaudiodec.c deleted file mode 100644 index 287d8ff..0000000 --- a/ffmpeg/libavcodec/x86/mpegaudiodec.c +++ /dev/null @@ -1,273 +0,0 @@ -/* - * MMX optimized MP3 decoding functions - * Copyright (c) 2010 Vitor Sessak - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/internal.h" -#include "libavutil/x86/asm.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/mpegaudiodsp.h" - -#define DECL(CPU)\ -static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\ -void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win); - -DECL(sse) -DECL(sse2) -DECL(sse3) -DECL(ssse3) -DECL(avx) - -void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win, - float *tmpbuf); -void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win, - float *tmpbuf); - -DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40]; - -#if HAVE_SSE2_INLINE - -#define MACS(rt, ra, rb) rt+=(ra)*(rb) -#define MLSS(rt, ra, rb) rt-=(ra)*(rb) - -#define SUM8(op, sum, w, p) \ -{ \ - op(sum, (w)[0 * 64], (p)[0 * 64]); \ - op(sum, (w)[1 * 64], (p)[1 * 64]); \ - op(sum, (w)[2 * 64], (p)[2 * 64]); \ - op(sum, (w)[3 * 64], (p)[3 * 64]); \ - op(sum, (w)[4 * 64], (p)[4 * 64]); \ - op(sum, (w)[5 * 64], (p)[5 * 64]); \ - op(sum, (w)[6 * 64], (p)[6 * 64]); \ - op(sum, (w)[7 * 64], (p)[7 * 64]); \ -} - -static void apply_window(const float *buf, const float *win1, - const float *win2, float *sum1, float *sum2, int len) -{ - x86_reg count = - 4*len; - const float *win1a = win1+len; - const float *win2a = win2+len; - const float *bufa = buf+len; - float *sum1a = sum1+len; - float *sum2a = sum2+len; - - -#define MULT(a, b) \ - "movaps " #a "(%1,%0), %%xmm1 \n\t" \ - "movaps " #a "(%3,%0), %%xmm2 \n\t" \ - "mulps %%xmm2, %%xmm1 \n\t" \ - "subps %%xmm1, %%xmm0 \n\t" \ - "mulps " #b "(%2,%0), %%xmm2 \n\t" \ - "subps %%xmm2, %%xmm4 \n\t" \ - - __asm__ volatile( - "1: \n\t" - "xorps %%xmm0, %%xmm0 \n\t" - "xorps %%xmm4, %%xmm4 \n\t" - - MULT( 0, 0) - MULT( 256, 64) - MULT( 512, 128) - MULT( 768, 192) - MULT(1024, 256) - MULT(1280, 320) - MULT(1536, 384) - MULT(1792, 448) - - "movaps %%xmm0, (%4,%0) \n\t" - "movaps %%xmm4, (%5,%0) \n\t" - "add $16, %0 \n\t" - "jl 1b \n\t" - :"+&r"(count) - :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a) - ); - -#undef MULT -} - -static void apply_window_mp3(float *in, float *win, int *unused, float *out, - int incr) -{ - LOCAL_ALIGNED_16(float, suma, [17]); - 
LOCAL_ALIGNED_16(float, sumb, [17]); - LOCAL_ALIGNED_16(float, sumc, [17]); - LOCAL_ALIGNED_16(float, sumd, [17]); - - float sum; - - /* copy to avoid wrap */ - __asm__ volatile( - "movaps 0(%0), %%xmm0 \n\t" \ - "movaps 16(%0), %%xmm1 \n\t" \ - "movaps 32(%0), %%xmm2 \n\t" \ - "movaps 48(%0), %%xmm3 \n\t" \ - "movaps %%xmm0, 0(%1) \n\t" \ - "movaps %%xmm1, 16(%1) \n\t" \ - "movaps %%xmm2, 32(%1) \n\t" \ - "movaps %%xmm3, 48(%1) \n\t" \ - "movaps 64(%0), %%xmm0 \n\t" \ - "movaps 80(%0), %%xmm1 \n\t" \ - "movaps 96(%0), %%xmm2 \n\t" \ - "movaps 112(%0), %%xmm3 \n\t" \ - "movaps %%xmm0, 64(%1) \n\t" \ - "movaps %%xmm1, 80(%1) \n\t" \ - "movaps %%xmm2, 96(%1) \n\t" \ - "movaps %%xmm3, 112(%1) \n\t" - ::"r"(in), "r"(in+512) - :"memory" - ); - - apply_window(in + 16, win , win + 512, suma, sumc, 16); - apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16); - - SUM8(MACS, suma[0], win + 32, in + 48); - - sumc[ 0] = 0; - sumb[16] = 0; - sumd[16] = 0; - -#define SUMS(suma, sumb, sumc, sumd, out1, out2) \ - "movups " #sumd "(%4), %%xmm0 \n\t" \ - "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \ - "subps " #suma "(%1), %%xmm0 \n\t" \ - "movaps %%xmm0," #out1 "(%0) \n\t" \ -\ - "movups " #sumc "(%3), %%xmm0 \n\t" \ - "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \ - "addps " #sumb "(%2), %%xmm0 \n\t" \ - "movaps %%xmm0," #out2 "(%0) \n\t" - - if (incr == 1) { - __asm__ volatile( - SUMS( 0, 48, 4, 52, 0, 112) - SUMS(16, 32, 20, 36, 16, 96) - SUMS(32, 16, 36, 20, 32, 80) - SUMS(48, 0, 52, 4, 48, 64) - - :"+&r"(out) - :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0]) - :"memory" - ); - out += 16*incr; - } else { - int j; - float *out2 = out + 32 * incr; - out[0 ] = -suma[ 0]; - out += incr; - out2 -= incr; - for(j=1;j<16;j++) { - *out = -suma[ j] + sumd[16-j]; - *out2 = sumb[16-j] + sumc[ j]; - out += incr; - out2 -= incr; - } - } - - sum = 0; - SUM8(MLSS, sum, win + 16 + 32, in + 32); - *out = sum; -} - -#endif /* HAVE_SSE2_INLINE */ - -#if HAVE_YASM -#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \ -static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \ - int count, int switch_point, int block_type) \ -{ \ - int align_end = count - (count & 3); \ - int j; \ - for (j = 0; j < align_end; j+= 4) { \ - LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \ - float *win = mdct_win_sse[switch_point && j < 4][block_type]; \ - /* apply window & overlap with previous buffer */ \ - \ - /* select window */ \ - ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \ - in += 4*18; \ - buf += 4*18; \ - out += 4; \ - } \ - for (; j < count; j++) { \ - /* apply window & overlap with previous buffer */ \ - \ - /* select window */ \ - int win_idx = (switch_point && j < 2) ? 
0 : block_type; \ - float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \ - \ - ff_imdct36_float_ ## CPU1(out, buf, in, win); \ - \ - in += 18; \ - buf++; \ - out++; \ - } \ -} - -#if HAVE_SSE -DECL_IMDCT_BLOCKS(sse,sse) -DECL_IMDCT_BLOCKS(sse2,sse) -DECL_IMDCT_BLOCKS(sse3,sse) -DECL_IMDCT_BLOCKS(ssse3,sse) -#endif -#if HAVE_AVX_EXTERNAL -DECL_IMDCT_BLOCKS(avx,avx) -#endif -#endif /* HAVE_YASM */ - -av_cold void ff_mpadsp_init_x86(MPADSPContext *s) -{ - int mm_flags = av_get_cpu_flags(); - - int i, j; - for (j = 0; j < 4; j++) { - for (i = 0; i < 40; i ++) { - mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i]; - mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i]; - mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i]; - mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; - mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i]; - mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i]; - mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i]; - mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; - } - } - -#if HAVE_SSE2_INLINE - if (mm_flags & AV_CPU_FLAG_SSE2) { - s->apply_window_float = apply_window_mp3; - } -#endif /* HAVE_SSE2_INLINE */ - -#if HAVE_YASM - if (EXTERNAL_AVX(mm_flags)) { - s->imdct36_blocks_float = imdct36_blocks_avx; - } else if (EXTERNAL_SSSE3(mm_flags)) { - s->imdct36_blocks_float = imdct36_blocks_ssse3; - } else if (EXTERNAL_SSE3(mm_flags)) { - s->imdct36_blocks_float = imdct36_blocks_sse3; - } else if (EXTERNAL_SSE2(mm_flags)) { - s->imdct36_blocks_float = imdct36_blocks_sse2; - } else if (EXTERNAL_SSE(mm_flags)) { - s->imdct36_blocks_float = imdct36_blocks_sse; - } -#endif /* HAVE_YASM */ -} diff --git a/ffmpeg/libavcodec/x86/mpegvideo.c b/ffmpeg/libavcodec/x86/mpegvideo.c index 903ad62..b2ce680 100644 --- a/ffmpeg/libavcodec/x86/mpegvideo.c +++ b/ffmpeg/libavcodec/x86/mpegvideo.c @@ -22,11 +22,12 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" #include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" #include "libavcodec/avcodec.h" #include "libavcodec/mpegvideo.h" -#include "dsputil_mmx.h" +#include "dsputil_x86.h" -#if HAVE_INLINE_ASM +#if HAVE_MMX_INLINE static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, int16_t *block, int n, int qscale) @@ -111,7 +112,7 @@ static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, qmul = qscale << 1; qadd = (qscale - 1) | 1; - assert(s->block_last_index[n]>=0 || s->h263_aic); + av_assert2(s->block_last_index[n]>=0 || s->h263_aic); nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; @@ -164,28 +165,6 @@ __asm__ volatile( ); } - -/* - We can suppose that result of two multiplications can't be greater than 0xFFFF - i.e. is 16-bit, so we use here only PMULLW instruction and can avoid - a complex multiplication. 
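
The removed comment around this point spells out how a 32x32 multiply decomposes into 16-bit half products, and why PMULLW alone is enough when the individual products are known to fit in 16 bits. A small standalone C sketch of that decomposition (not part of the patch, added here only for reference):

#include <stdint.h>
#include <stdio.h>

/* Rebuild the low 32 bits of a*b from 16-bit halves, following the
 * comment's tlow1/tlow2/high3:low3 recipe: the high1*high2 term is
 * shifted entirely out of the 32-bit result, so only three 16x16
 * products remain. */
static uint32_t mul32_via_16bit_halves(uint32_t a, uint32_t b)
{
    uint32_t al = a & 0xFFFF, ah = a >> 16;   /* a = high1:low1 */
    uint32_t bl = b & 0xFFFF, bh = b >> 16;   /* b = high2:low2 */
    uint32_t tlow = ah * bl + al * bh;        /* tlow1 + tlow2   */
    return al * bl + (tlow << 16);            /* wraps mod 2^32  */
}

int main(void)
{
    uint32_t a = 0x123456, b = 0x123456;
    /* Both sides wrap modulo 2^32, which is exactly the truncation the
     * comment describes. */
    printf("%#x == %#x\n", (unsigned)mul32_via_16bit_halves(a, b),
           (unsigned)(a * b));
    return 0;
}
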
-===================================================== - Full formula for multiplication of 2 integer numbers - which are represent as high:low words: - input: value1 = high1:low1 - value2 = high2:low2 - output: value3 = value1*value2 - value3=high3:low3 (on overflow: modulus 2^32 wrap-around) - this mean that for 0x123456 * 0x123456 correct result is 0x766cb0ce4 - but this algorithm will compute only 0x66cb0ce4 - this limited by 16-bit size of operands - --------------------------------- - tlow1 = high1*low2 - tlow2 = high2*low1 - tlow1 = tlow1 + tlow2 - high3:low3 = low1*low2 - high3 += tlow1 -*/ static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, int16_t *block, int n, int qscale) { @@ -464,124 +443,14 @@ __asm__ volatile( ); } -static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){ - const int intra= s->mb_intra; - int *sum= s->dct_error_sum[intra]; - uint16_t *offset= s->dct_offset[intra]; - - s->dct_count[intra]++; - - __asm__ volatile( - "pxor %%mm7, %%mm7 \n\t" - "1: \n\t" - "pxor %%mm0, %%mm0 \n\t" - "pxor %%mm1, %%mm1 \n\t" - "movq (%0), %%mm2 \n\t" - "movq 8(%0), %%mm3 \n\t" - "pcmpgtw %%mm2, %%mm0 \n\t" - "pcmpgtw %%mm3, %%mm1 \n\t" - "pxor %%mm0, %%mm2 \n\t" - "pxor %%mm1, %%mm3 \n\t" - "psubw %%mm0, %%mm2 \n\t" - "psubw %%mm1, %%mm3 \n\t" - "movq %%mm2, %%mm4 \n\t" - "movq %%mm3, %%mm5 \n\t" - "psubusw (%2), %%mm2 \n\t" - "psubusw 8(%2), %%mm3 \n\t" - "pxor %%mm0, %%mm2 \n\t" - "pxor %%mm1, %%mm3 \n\t" - "psubw %%mm0, %%mm2 \n\t" - "psubw %%mm1, %%mm3 \n\t" - "movq %%mm2, (%0) \n\t" - "movq %%mm3, 8(%0) \n\t" - "movq %%mm4, %%mm2 \n\t" - "movq %%mm5, %%mm3 \n\t" - "punpcklwd %%mm7, %%mm4 \n\t" - "punpckhwd %%mm7, %%mm2 \n\t" - "punpcklwd %%mm7, %%mm5 \n\t" - "punpckhwd %%mm7, %%mm3 \n\t" - "paddd (%1), %%mm4 \n\t" - "paddd 8(%1), %%mm2 \n\t" - "paddd 16(%1), %%mm5 \n\t" - "paddd 24(%1), %%mm3 \n\t" - "movq %%mm4, (%1) \n\t" - "movq %%mm2, 8(%1) \n\t" - "movq %%mm5, 16(%1) \n\t" - "movq %%mm3, 24(%1) \n\t" - "add $16, %0 \n\t" - "add $32, %1 \n\t" - "add $16, %2 \n\t" - "cmp %3, %0 \n\t" - " jb 1b \n\t" - : "+r" (block), "+r" (sum), "+r" (offset) - : "r"(block+64) - ); -} - -static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){ - const int intra= s->mb_intra; - int *sum= s->dct_error_sum[intra]; - uint16_t *offset= s->dct_offset[intra]; - - s->dct_count[intra]++; - - __asm__ volatile( - "pxor %%xmm7, %%xmm7 \n\t" - "1: \n\t" - "pxor %%xmm0, %%xmm0 \n\t" - "pxor %%xmm1, %%xmm1 \n\t" - "movdqa (%0), %%xmm2 \n\t" - "movdqa 16(%0), %%xmm3 \n\t" - "pcmpgtw %%xmm2, %%xmm0 \n\t" - "pcmpgtw %%xmm3, %%xmm1 \n\t" - "pxor %%xmm0, %%xmm2 \n\t" - "pxor %%xmm1, %%xmm3 \n\t" - "psubw %%xmm0, %%xmm2 \n\t" - "psubw %%xmm1, %%xmm3 \n\t" - "movdqa %%xmm2, %%xmm4 \n\t" - "movdqa %%xmm3, %%xmm5 \n\t" - "psubusw (%2), %%xmm2 \n\t" - "psubusw 16(%2), %%xmm3 \n\t" - "pxor %%xmm0, %%xmm2 \n\t" - "pxor %%xmm1, %%xmm3 \n\t" - "psubw %%xmm0, %%xmm2 \n\t" - "psubw %%xmm1, %%xmm3 \n\t" - "movdqa %%xmm2, (%0) \n\t" - "movdqa %%xmm3, 16(%0) \n\t" - "movdqa %%xmm4, %%xmm6 \n\t" - "movdqa %%xmm5, %%xmm0 \n\t" - "punpcklwd %%xmm7, %%xmm4 \n\t" - "punpckhwd %%xmm7, %%xmm6 \n\t" - "punpcklwd %%xmm7, %%xmm5 \n\t" - "punpckhwd %%xmm7, %%xmm0 \n\t" - "paddd (%1), %%xmm4 \n\t" - "paddd 16(%1), %%xmm6 \n\t" - "paddd 32(%1), %%xmm5 \n\t" - "paddd 48(%1), %%xmm0 \n\t" - "movdqa %%xmm4, (%1) \n\t" - "movdqa %%xmm6, 16(%1) \n\t" - "movdqa %%xmm5, 32(%1) \n\t" - "movdqa %%xmm0, 48(%1) \n\t" - "add $32, %0 \n\t" - "add $64, %1 \n\t" - "add $32, %2 \n\t" - "cmp %3, %0 \n\t" - " jb 1b \n\t" - : 
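
The denoise_dct_mmx/denoise_dct_sse2 kernels deleted here reappear unchanged in mpegvideoenc.c further down in this patch. Per DCT coefficient they implement roughly the following scalar update (a hedged C rendering, with the relevant context fields passed in directly; not the patch's own code):

#include <stdint.h>

/* Rough scalar equivalent of denoise_dct_mmx/_sse2: accumulate |coeff|
 * into the running error sum, then shrink the coefficient towards zero
 * by the per-position offset (saturating at zero, as psubusw does),
 * keeping its sign. */
void denoise_dct_scalar(int16_t block[64], int error_sum[64],
                        const uint16_t offset[64])
{
    for (int i = 0; i < 64; i++) {
        int level = block[i];
        int mag   = level < 0 ? -level : level;

        error_sum[i] += mag;          /* the paddd accumulation of |level| */
        mag -= offset[i];             /* psubusw: never goes below zero    */
        if (mag < 0)
            mag = 0;
        block[i] = level < 0 ? -mag : mag;
    }
}
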
"+r" (block), "+r" (sum), "+r" (offset) - : "r"(block+64) - XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7") - ); -} - -#endif /* HAVE_INLINE_ASM */ +#endif /* HAVE_MMX_INLINE */ av_cold void ff_MPV_common_init_x86(MpegEncContext *s) { -#if HAVE_INLINE_ASM - int mm_flags = av_get_cpu_flags(); +#if HAVE_MMX_INLINE + int cpu_flags = av_get_cpu_flags(); - if (mm_flags & AV_CPU_FLAG_MMX) { + if (INLINE_MMX(cpu_flags)) { s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; @@ -589,12 +458,6 @@ av_cold void ff_MPV_common_init_x86(MpegEncContext *s) if(!(s->flags & CODEC_FLAG_BITEXACT)) s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; - - if (mm_flags & AV_CPU_FLAG_SSE2) { - s->denoise_dct= denoise_dct_sse2; - } else { - s->denoise_dct= denoise_dct_mmx; - } } -#endif /* HAVE_INLINE_ASM */ +#endif /* HAVE_MMX_INLINE */ } diff --git a/ffmpeg/libavcodec/x86/mpegvideoenc.c b/ffmpeg/libavcodec/x86/mpegvideoenc.c index 6219667..7dd9959 100644 --- a/ffmpeg/libavcodec/x86/mpegvideoenc.c +++ b/ffmpeg/libavcodec/x86/mpegvideoenc.c @@ -26,9 +26,10 @@ #include "libavcodec/avcodec.h" #include "libavcodec/dct.h" #include "libavcodec/mpegvideo.h" -#include "dsputil_mmx.h" +#include "dsputil_x86.h" -extern uint16_t ff_inv_zigzag_direct16[64]; +/* not permutated inverse zigzag_direct + 1 for MMX quantizer */ +DECLARE_ALIGNED(16, static uint16_t, inv_zigzag_direct16)[64]; #if HAVE_MMX_INLINE #define COMPILE_TEMPLATE_MMXEXT 0 @@ -81,26 +82,146 @@ extern uint16_t ff_inv_zigzag_direct16[64]; #include "mpegvideoenc_template.c" #endif /* HAVE_SSSE3_INLINE */ +#if HAVE_INLINE_ASM +static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){ + const int intra= s->mb_intra; + int *sum= s->dct_error_sum[intra]; + uint16_t *offset= s->dct_offset[intra]; + + s->dct_count[intra]++; + + __asm__ volatile( + "pxor %%mm7, %%mm7 \n\t" + "1: \n\t" + "pxor %%mm0, %%mm0 \n\t" + "pxor %%mm1, %%mm1 \n\t" + "movq (%0), %%mm2 \n\t" + "movq 8(%0), %%mm3 \n\t" + "pcmpgtw %%mm2, %%mm0 \n\t" + "pcmpgtw %%mm3, %%mm1 \n\t" + "pxor %%mm0, %%mm2 \n\t" + "pxor %%mm1, %%mm3 \n\t" + "psubw %%mm0, %%mm2 \n\t" + "psubw %%mm1, %%mm3 \n\t" + "movq %%mm2, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "psubusw (%2), %%mm2 \n\t" + "psubusw 8(%2), %%mm3 \n\t" + "pxor %%mm0, %%mm2 \n\t" + "pxor %%mm1, %%mm3 \n\t" + "psubw %%mm0, %%mm2 \n\t" + "psubw %%mm1, %%mm3 \n\t" + "movq %%mm2, (%0) \n\t" + "movq %%mm3, 8(%0) \n\t" + "movq %%mm4, %%mm2 \n\t" + "movq %%mm5, %%mm3 \n\t" + "punpcklwd %%mm7, %%mm4 \n\t" + "punpckhwd %%mm7, %%mm2 \n\t" + "punpcklwd %%mm7, %%mm5 \n\t" + "punpckhwd %%mm7, %%mm3 \n\t" + "paddd (%1), %%mm4 \n\t" + "paddd 8(%1), %%mm2 \n\t" + "paddd 16(%1), %%mm5 \n\t" + "paddd 24(%1), %%mm3 \n\t" + "movq %%mm4, (%1) \n\t" + "movq %%mm2, 8(%1) \n\t" + "movq %%mm5, 16(%1) \n\t" + "movq %%mm3, 24(%1) \n\t" + "add $16, %0 \n\t" + "add $32, %1 \n\t" + "add $16, %2 \n\t" + "cmp %3, %0 \n\t" + " jb 1b \n\t" + : "+r" (block), "+r" (sum), "+r" (offset) + : "r"(block+64) + ); +} + +static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){ + const int intra= s->mb_intra; + int *sum= s->dct_error_sum[intra]; + uint16_t *offset= s->dct_offset[intra]; + + s->dct_count[intra]++; + + __asm__ volatile( + "pxor %%xmm7, %%xmm7 \n\t" + "1: \n\t" + "pxor %%xmm0, %%xmm0 \n\t" + "pxor %%xmm1, %%xmm1 \n\t" + 
"movdqa (%0), %%xmm2 \n\t" + "movdqa 16(%0), %%xmm3 \n\t" + "pcmpgtw %%xmm2, %%xmm0 \n\t" + "pcmpgtw %%xmm3, %%xmm1 \n\t" + "pxor %%xmm0, %%xmm2 \n\t" + "pxor %%xmm1, %%xmm3 \n\t" + "psubw %%xmm0, %%xmm2 \n\t" + "psubw %%xmm1, %%xmm3 \n\t" + "movdqa %%xmm2, %%xmm4 \n\t" + "movdqa %%xmm3, %%xmm5 \n\t" + "psubusw (%2), %%xmm2 \n\t" + "psubusw 16(%2), %%xmm3 \n\t" + "pxor %%xmm0, %%xmm2 \n\t" + "pxor %%xmm1, %%xmm3 \n\t" + "psubw %%xmm0, %%xmm2 \n\t" + "psubw %%xmm1, %%xmm3 \n\t" + "movdqa %%xmm2, (%0) \n\t" + "movdqa %%xmm3, 16(%0) \n\t" + "movdqa %%xmm4, %%xmm6 \n\t" + "movdqa %%xmm5, %%xmm0 \n\t" + "punpcklwd %%xmm7, %%xmm4 \n\t" + "punpckhwd %%xmm7, %%xmm6 \n\t" + "punpcklwd %%xmm7, %%xmm5 \n\t" + "punpckhwd %%xmm7, %%xmm0 \n\t" + "paddd (%1), %%xmm4 \n\t" + "paddd 16(%1), %%xmm6 \n\t" + "paddd 32(%1), %%xmm5 \n\t" + "paddd 48(%1), %%xmm0 \n\t" + "movdqa %%xmm4, (%1) \n\t" + "movdqa %%xmm6, 16(%1) \n\t" + "movdqa %%xmm5, 32(%1) \n\t" + "movdqa %%xmm0, 48(%1) \n\t" + "add $32, %0 \n\t" + "add $64, %1 \n\t" + "add $32, %2 \n\t" + "cmp %3, %0 \n\t" + " jb 1b \n\t" + : "+r" (block), "+r" (sum), "+r" (offset) + : "r"(block+64) + XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7") + ); +} +#endif /* HAVE_INLINE_ASM */ + av_cold void ff_dct_encode_init_x86(MpegEncContext *s) { - int mm_flags = av_get_cpu_flags(); const int dct_algo = s->avctx->dct_algo; + int i; + + for (i = 0; i < 64; i++) + inv_zigzag_direct16[ff_zigzag_direct[i]] = i + 1; if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) { #if HAVE_MMX_INLINE - if (INLINE_MMX(mm_flags)) + int cpu_flags = av_get_cpu_flags(); + if (INLINE_MMX(cpu_flags)) { s->dct_quantize = dct_quantize_MMX; + s->denoise_dct = denoise_dct_mmx; + } #endif #if HAVE_MMXEXT_INLINE - if (INLINE_MMXEXT(mm_flags)) + if (INLINE_MMXEXT(cpu_flags)) s->dct_quantize = dct_quantize_MMXEXT; #endif #if HAVE_SSE2_INLINE - if (INLINE_SSE2(mm_flags)) + if (INLINE_SSE2(cpu_flags)) { s->dct_quantize = dct_quantize_SSE2; + s->denoise_dct = denoise_dct_sse2; + } #endif #if HAVE_SSSE3_INLINE - if (INLINE_SSSE3(mm_flags)) + if (INLINE_SSSE3(cpu_flags)) s->dct_quantize = dct_quantize_SSSE3; #endif } diff --git a/ffmpeg/libavcodec/x86/mpegvideoenc_template.c b/ffmpeg/libavcodec/x86/mpegvideoenc_template.c index 1e0505e..0defc40 100644 --- a/ffmpeg/libavcodec/x86/mpegvideoenc_template.c +++ b/ffmpeg/libavcodec/x86/mpegvideoenc_template.c @@ -171,7 +171,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 : "+a" (last_non_zero_p1) : "r" (block+64), "r" (qmat), "r" (bias), - "r" (ff_inv_zigzag_direct16+64), "r" (temp_block+64) + "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64) XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7") ); @@ -205,7 +205,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 : "+a" (last_non_zero_p1) : "r" (block+64), "r" (qmat+64), "r" (bias+64), - "r" (ff_inv_zigzag_direct16+64), "r" (temp_block+64) + "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64) XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7") ); diff --git a/ffmpeg/libavcodec/x86/pngdsp.asm b/ffmpeg/libavcodec/x86/pngdsp.asm index c05f3da..8e23ccf 100644 --- a/ffmpeg/libavcodec/x86/pngdsp.asm +++ b/ffmpeg/libavcodec/x86/pngdsp.asm @@ -4,20 +4,20 @@ ;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu> ;* Copyright (c) 2012 Ronald S. 
Bultje <rsbultje@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/ffmpeg/libavcodec/x86/pngdsp_init.c b/ffmpeg/libavcodec/x86/pngdsp_init.c index 4c54ed3..7dca62c 100644 --- a/ffmpeg/libavcodec/x86/pngdsp_init.c +++ b/ffmpeg/libavcodec/x86/pngdsp_init.c @@ -35,16 +35,16 @@ void ff_add_bytes_l2_sse2(uint8_t *dst, uint8_t *src1, av_cold void ff_pngdsp_init_x86(PNGDSPContext *dsp) { - int flags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); #if ARCH_X86_32 - if (EXTERNAL_MMX(flags)) + if (EXTERNAL_MMX(cpu_flags)) dsp->add_bytes_l2 = ff_add_bytes_l2_mmx; #endif - if (EXTERNAL_MMXEXT(flags)) + if (EXTERNAL_MMXEXT(cpu_flags)) dsp->add_paeth_prediction = ff_add_png_paeth_prediction_mmxext; - if (EXTERNAL_SSE2(flags)) + if (EXTERNAL_SSE2(cpu_flags)) dsp->add_bytes_l2 = ff_add_bytes_l2_sse2; - if (EXTERNAL_SSSE3(flags)) + if (EXTERNAL_SSSE3(cpu_flags)) dsp->add_paeth_prediction = ff_add_png_paeth_prediction_ssse3; } diff --git a/ffmpeg/libavcodec/x86/proresdsp_init.c b/ffmpeg/libavcodec/x86/proresdsp_init.c index 91ff257..0273d61 100644 --- a/ffmpeg/libavcodec/x86/proresdsp_init.c +++ b/ffmpeg/libavcodec/x86/proresdsp_init.c @@ -20,6 +20,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "libavutil/attributes.h" #include "libavutil/x86/cpu.h" #include "libavcodec/dsputil.h" #include "libavcodec/proresdsp.h" @@ -31,25 +32,25 @@ void ff_prores_idct_put_10_sse4(uint16_t *dst, int linesize, void ff_prores_idct_put_10_avx (uint16_t *dst, int linesize, int16_t *block, const int16_t *qmat); -void ff_proresdsp_x86_init(ProresDSPContext *dsp, AVCodecContext *avctx) +av_cold void ff_proresdsp_x86_init(ProresDSPContext *dsp, AVCodecContext *avctx) { #if ARCH_X86_64 - int flags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); if(avctx->flags & CODEC_FLAG_BITEXACT) return; - if (EXTERNAL_SSE2(flags)) { + if (EXTERNAL_SSE2(cpu_flags)) { dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; dsp->idct_put = ff_prores_idct_put_10_sse2; } - if (EXTERNAL_SSE4(flags)) { + if (EXTERNAL_SSE4(cpu_flags)) { dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; dsp->idct_put = ff_prores_idct_put_10_sse4; } - if (EXTERNAL_AVX(flags)) { + if (EXTERNAL_AVX(cpu_flags)) { dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; dsp->idct_put = ff_prores_idct_put_10_avx; } diff --git a/ffmpeg/libavcodec/x86/qpelbase.asm b/ffmpeg/libavcodec/x86/qpelbase.asm deleted file mode 100644 index c2ffb86..0000000 --- a/ffmpeg/libavcodec/x86/qpelbase.asm +++ 
/dev/null @@ -1,176 +0,0 @@ -;****************************************************************************** -;* MMX optimized DSP utils -;* Copyright (c) 2008 Loren Merritt -;* Copyright (c) 2003-2013 Michael Niedermayer -;* Copyright (c) 2013 Daniel Kang -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION .text - -%macro op_avgh 3 - movh %3, %2 - pavgb %1, %3 - movh %2, %1 -%endmacro - -%macro op_avg 2 - pavgb %1, %2 - mova %2, %1 -%endmacro - -%macro op_puth 2-3 - movh %2, %1 -%endmacro - -%macro op_put 2 - mova %2, %1 -%endmacro - -; void pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -%macro PIXELS4_L2 1 -%define OP op_%1h -cglobal %1_pixels4_l2, 6,6 - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d - test r5d, 1 - je .loop - movd m0, [r1] - movd m1, [r2] - add r1, r4 - add r2, 4 - pavgb m0, m1 - OP m0, [r0], m3 - add r0, r3 - dec r5d -.loop: - mova m0, [r1] - mova m1, [r1+r4] - lea r1, [r1+2*r4] - pavgb m0, [r2] - pavgb m1, [r2+4] - OP m0, [r0], m3 - OP m1, [r0+r3], m3 - lea r0, [r0+2*r3] - mova m0, [r1] - mova m1, [r1+r4] - lea r1, [r1+2*r4] - pavgb m0, [r2+8] - pavgb m1, [r2+12] - OP m0, [r0], m3 - OP m1, [r0+r3], m3 - lea r0, [r0+2*r3] - add r2, 16 - sub r5d, 4 - jne .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PIXELS4_L2 put -PIXELS4_L2 avg - -; void pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -%macro PIXELS8_L2 1 -%define OP op_%1 -cglobal %1_pixels8_l2, 6,6 - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d - test r5d, 1 - je .loop - mova m0, [r1] - mova m1, [r2] - add r1, r4 - add r2, 8 - pavgb m0, m1 - OP m0, [r0] - add r0, r3 - dec r5d -.loop: - mova m0, [r1] - mova m1, [r1+r4] - lea r1, [r1+2*r4] - pavgb m0, [r2] - pavgb m1, [r2+8] - OP m0, [r0] - OP m1, [r0+r3] - lea r0, [r0+2*r3] - mova m0, [r1] - mova m1, [r1+r4] - lea r1, [r1+2*r4] - pavgb m0, [r2+16] - pavgb m1, [r2+24] - OP m0, [r0] - OP m1, [r0+r3] - lea r0, [r0+2*r3] - add r2, 32 - sub r5d, 4 - jne .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PIXELS8_L2 put -PIXELS8_L2 avg - -; void pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -%macro PIXELS16_L2 1 -%define OP op_%1 -cglobal %1_pixels16_l2, 6,6 - movsxdifnidn r3, r3d - movsxdifnidn r4, r4d - test r5d, 1 - je .loop - mova m0, [r1] - mova m1, [r1+8] - pavgb m0, [r2] - pavgb m1, [r2+8] - add r1, r4 - add r2, 16 - OP m0, [r0] - OP m1, [r0+8] - add r0, r3 - dec r5d -.loop: - mova m0, [r1] - mova m1, [r1+8] - add r1, r4 - pavgb m0, [r2] - pavgb m1, [r2+8] - OP m0, [r0] - OP m1, [r0+8] - add r0, r3 - mova m0, [r1] - mova m1, [r1+8] - add r1, r4 - pavgb m0, [r2+16] - pavgb m1, 
[r2+24] - OP m0, [r0] - OP m1, [r0+8] - add r0, r3 - add r2, 32 - sub r5d, 2 - jne .loop - REP_RET -%endmacro - -INIT_MMX mmxext -PIXELS16_L2 put -PIXELS16_L2 avg diff --git a/ffmpeg/libavcodec/x86/rv34dsp.asm b/ffmpeg/libavcodec/x86/rv34dsp.asm index 4d9c35b..7732d65 100644 --- a/ffmpeg/libavcodec/x86/rv34dsp.asm +++ b/ffmpeg/libavcodec/x86/rv34dsp.asm @@ -2,20 +2,20 @@ ;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/ffmpeg/libavcodec/x86/rv34dsp_init.c b/ffmpeg/libavcodec/x86/rv34dsp_init.c index a2dea74..027efe9 100644 --- a/ffmpeg/libavcodec/x86/rv34dsp_init.c +++ b/ffmpeg/libavcodec/x86/rv34dsp_init.c @@ -2,20 +2,20 @@ * RV30/40 MMX/SSE2 optimizations * Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
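
Like the other *_init.c hunks in this patch, the rv34dsp/rv40dsp init functions below mostly rename mm_flags to cpu_flags and route the checks through the EXTERNAL_*()/INLINE_*() helpers, which fold the configure-time HAVE_* results into the run-time CPU flags. A minimal sketch of that dispatch pattern, using hypothetical function names and only the public av_get_cpu_flags() API:

#include <stdint.h>
#include "libavutil/cpu.h"

/* Hypothetical DSP context and implementations; only the dispatch shape
 * mirrors the *_init_x86() functions touched by this patch. */
typedef struct MyDSPContext {
    void (*idct_add)(uint8_t *dst, int stride, int16_t *block);
} MyDSPContext;

void my_idct_add_c   (uint8_t *dst, int stride, int16_t *block);
void my_idct_add_mmx (uint8_t *dst, int stride, int16_t *block);
void my_idct_add_sse2(uint8_t *dst, int stride, int16_t *block);

void my_dsp_init_x86(MyDSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();   /* queried once, as in the patch */

    c->idct_add = my_idct_add_c;
    /* Later, stronger instruction sets overwrite earlier assignments. */
    if (cpu_flags & AV_CPU_FLAG_MMX)
        c->idct_add = my_idct_add_mmx;
    if (cpu_flags & AV_CPU_FLAG_SSE2)
        c->idct_add = my_idct_add_sse2;
}
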
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -32,14 +32,14 @@ void ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block); av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c) { - int mm_flags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_MMX(mm_flags)) + if (EXTERNAL_MMX(cpu_flags)) c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx; - if (EXTERNAL_MMXEXT(mm_flags)) { + if (EXTERNAL_MMXEXT(cpu_flags)) { c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmxext; c->rv34_idct_add = ff_rv34_idct_add_mmxext; } - if (EXTERNAL_SSE4(mm_flags)) + if (EXTERNAL_SSE4(cpu_flags)) c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4; } diff --git a/ffmpeg/libavcodec/x86/rv40dsp.asm b/ffmpeg/libavcodec/x86/rv40dsp.asm index 7ec72be..792a54f 100644 --- a/ffmpeg/libavcodec/x86/rv40dsp.asm +++ b/ffmpeg/libavcodec/x86/rv40dsp.asm @@ -4,20 +4,20 @@ ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
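
A bit further down, rv40dsp_init.c gains thin put/avg_rv40_qpel*_mc33_mmx wrappers that simply forward to the generic ff_*_pixels*_xy2_mmx half-pel helpers. For orientation, the scalar operation behind those xy2 helpers is a rounded average of the 2x2 source neighbourhood (a sketch under a hypothetical name; the no-rnd variants use a rounding constant of 1 instead of 2):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of an 8-wide "xy2" half-pel put: each destination pixel
 * is the rounded average of the four neighbouring source pixels.  The
 * MMX helpers called by the new mc33 wrappers compute the same thing
 * with packed unsigned arithmetic. */
void put_pixels8_xy2_scalar(uint8_t *dst, const uint8_t *src,
                            ptrdiff_t stride, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++)
            dst[x] = (src[x] + src[x + 1] +
                      src[x + stride] + src[x + stride + 1] + 2) >> 2;
        dst += stride;
        src += stride;
    }
}
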
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -98,11 +98,7 @@ SECTION .text %endif packuswb %1, %1 %ifidn %3, avg -%if cpuflag(3dnow) - pavgusb %1, %2 -%else - pavgb %1, %2 -%endif + PAVGB %1, %2 %endif movh [dstq], %1 %endmacro diff --git a/ffmpeg/libavcodec/x86/rv40dsp_init.c b/ffmpeg/libavcodec/x86/rv40dsp_init.c index 2f97518..75ba8ba 100644 --- a/ffmpeg/libavcodec/x86/rv40dsp_init.c +++ b/ffmpeg/libavcodec/x86/rv40dsp_init.c @@ -30,7 +30,7 @@ #include "libavutil/attributes.h" #include "libavutil/mem.h" #include "libavutil/x86/cpu.h" -#include "dsputil_mmx.h" +#include "dsputil_x86.h" #if HAVE_YASM void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src, @@ -188,25 +188,58 @@ QPEL_FUNCS_SET (OP, 3, 2, OPT) #endif /* HAVE_YASM */ +#if HAVE_MMX_INLINE +static void put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, + ptrdiff_t stride) +{ + ff_put_pixels8_xy2_mmx(dst, src, stride, 8); +} +static void put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, + ptrdiff_t stride) +{ + ff_put_pixels16_xy2_mmx(dst, src, stride, 16); +} +static void avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, + ptrdiff_t stride) +{ + ff_avg_pixels8_xy2_mmx(dst, src, stride, 8); +} +static void avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, + ptrdiff_t stride) +{ + ff_avg_pixels16_xy2_mmx(dst, src, stride, 16); +} +#endif /* HAVE_MMX_INLINE */ + av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c) { -#if HAVE_YASM - int mm_flags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_MMX(mm_flags)) { - c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx; - c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx; #if HAVE_MMX_INLINE - c->put_pixels_tab[0][15] = ff_put_rv40_qpel16_mc33_mmx; - c->put_pixels_tab[1][15] = ff_put_rv40_qpel8_mc33_mmx; - c->avg_pixels_tab[0][15] = ff_avg_rv40_qpel16_mc33_mmx; - c->avg_pixels_tab[1][15] = ff_avg_rv40_qpel8_mc33_mmx; + if (INLINE_MMX(cpu_flags)) { + c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_mmx; + c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_mmx; + c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_mmx; + c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_mmx; + } #endif /* HAVE_MMX_INLINE */ + +#if HAVE_YASM + if (EXTERNAL_MMX(cpu_flags)) { + c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx; + c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx; #if ARCH_X86_32 QPEL_MC_SET(put_, _mmx) #endif } - if (EXTERNAL_MMXEXT(mm_flags)) { + if (EXTERNAL_AMD3DNOW(cpu_flags)) { + c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow; + c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow; +#if ARCH_X86_32 + QPEL_MC_SET(avg_, _3dnow) +#endif + } + if (EXTERNAL_MMXEXT(cpu_flags)) { c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmxext; c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmxext; c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext; @@ -216,14 +249,8 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c) #if ARCH_X86_32 QPEL_MC_SET(avg_, _mmxext) #endif - } else if (EXTERNAL_AMD3DNOW(mm_flags)) { - c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow; - c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow; -#if ARCH_X86_32 - QPEL_MC_SET(avg_, _3dnow) -#endif } - 
if (EXTERNAL_SSE2(mm_flags)) { + if (EXTERNAL_SSE2(cpu_flags)) { c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2; c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2; c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2; @@ -231,7 +258,7 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c) QPEL_MC_SET(put_, _sse2) QPEL_MC_SET(avg_, _sse2) } - if (EXTERNAL_SSSE3(mm_flags)) { + if (EXTERNAL_SSSE3(cpu_flags)) { c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3; c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3; c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3; diff --git a/ffmpeg/libavcodec/x86/sbrdsp.asm b/ffmpeg/libavcodec/x86/sbrdsp.asm index 1b7f3a8..adc13c4 100644 --- a/ffmpeg/libavcodec/x86/sbrdsp.asm +++ b/ffmpeg/libavcodec/x86/sbrdsp.asm @@ -2,20 +2,20 @@ ;* AAC Spectral Band Replication decoding functions ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -24,7 +24,14 @@ SECTION_RODATA ; mask equivalent for multiply by -1.0 1.0 ps_mask times 2 dd 1<<31, 0 +ps_mask2 times 2 dd 0, 1<<31 ps_neg times 4 dd 1<<31 +ps_noise0 times 2 dd 1.0, 0.0, +ps_noise2 times 2 dd -1.0, 0.0 +ps_noise13 dd 0.0, 1.0, 0.0, -1.0 + dd 0.0, -1.0, 0.0, 1.0 + dd 0.0, 1.0, 0.0, -1.0 +cextern sbr_noise_table SECTION_TEXT @@ -220,3 +227,199 @@ cglobal sbr_qmf_post_shuffle, 2,3,4,W,z cmp zq, r2q jl .loop REP_RET + +INIT_XMM sse +cglobal sbr_neg_odd_64, 1,2,4,z + lea r1q, [zq+256] +.loop: + mova m0, [zq+ 0] + mova m1, [zq+16] + mova m2, [zq+32] + mova m3, [zq+48] + xorps m0, [ps_mask2] + xorps m1, [ps_mask2] + xorps m2, [ps_mask2] + xorps m3, [ps_mask2] + mova [zq+ 0], m0 + mova [zq+16], m1 + mova [zq+32], m2 + mova [zq+48], m3 + add zq, 64 + cmp zq, r1q + jne .loop + REP_RET + +; sbr_qmf_deint_bfly(float *v, const float *src0, const float *src1) +%macro SBR_QMF_DEINT_BFLY 0 +cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c + mov cq, 64*4-2*mmsize + lea vrevq, [vq + 64*4] +.loop: + mova m0, [src0q+cq] + mova m1, [src1q] + mova m4, [src0q+cq+mmsize] + mova m5, [src1q+mmsize] +%if cpuflag(sse2) + pshufd m2, m0, q0123 + pshufd m3, m1, q0123 + pshufd m6, m4, q0123 + pshufd m7, m5, q0123 +%else + shufps m2, m0, m0, q0123 + shufps m3, m1, m1, q0123 + shufps m6, m4, m4, q0123 + shufps m7, m5, m5, q0123 +%endif + addps m5, m2 + subps m0, m7 + addps m1, m6 + subps m4, m3 + mova [vrevq], m1 + mova [vrevq+mmsize], m5 + mova [vq+cq], m0 + mova [vq+cq+mmsize], m4 
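
The new ff_sbr_neg_odd_64_sse added just above works by XORing the sign bit of every second float with the ps_mask2 constant; a plain-C equivalent (hypothetical name, not the patch's code) is simply:

/* Flip the sign of every odd-indexed float in a 64-entry array, as the
 * SSE routine does with xorps against ps_mask2 = {0, 1<<31, 0, 1<<31}. */
void sbr_neg_odd_64_scalar(float *z)
{
    for (int i = 1; i < 64; i += 2)
        z[i] = -z[i];
}
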
+ add src1q, 2*mmsize + add vrevq, 2*mmsize + sub cq, 2*mmsize + jge .loop + REP_RET +%endmacro + +INIT_XMM sse +SBR_QMF_DEINT_BFLY + +INIT_XMM sse2 +SBR_QMF_DEINT_BFLY + +INIT_XMM sse2 +cglobal sbr_qmf_pre_shuffle, 1,4,6,z +%define OFFSET (32*4-2*mmsize) + mov r3q, OFFSET + lea r1q, [zq + (32+1)*4] + lea r2q, [zq + 64*4] + mova m5, [ps_neg] +.loop: + movu m0, [r1q] + movu m2, [r1q + mmsize] + movu m1, [zq + r3q + 4 + mmsize] + movu m3, [zq + r3q + 4] + + pxor m2, m5 + pxor m0, m5 + pshufd m2, m2, q0123 + pshufd m0, m0, q0123 + SBUTTERFLY dq, 2, 3, 4 + SBUTTERFLY dq, 0, 1, 4 + mova [r2q + 2*r3q + 0*mmsize], m2 + mova [r2q + 2*r3q + 1*mmsize], m3 + mova [r2q + 2*r3q + 2*mmsize], m0 + mova [r2q + 2*r3q + 3*mmsize], m1 + add r1q, 2*mmsize + sub r3q, 2*mmsize + jge .loop + movq m2, [zq] + movq [r2q], m2 + REP_RET + +%ifdef PIC +%define NREGS 1 +%if UNIX64 +%define NOISE_TABLE r6q ; r5q is m_max +%else +%define NOISE_TABLE r5q +%endif +%else +%define NREGS 0 +%define NOISE_TABLE sbr_noise_table +%endif + +%macro LOAD_NST 1 +%ifdef PIC + lea NOISE_TABLE, [%1] + mova m0, [kxq + NOISE_TABLE] +%else + mova m0, [kxq + %1] +%endif +%endmacro + +INIT_XMM sse2 +; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m, +; const float *q_filt, int noise, +; int kx, int m_max) +cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max + mova m0, [ps_noise0] + jmp apply_noise_main + +; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m, +; const float *q_filt, int noise, +; int kx, int m_max) +cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max + and kxq, 1 + shl kxq, 4 + LOAD_NST ps_noise13 + jmp apply_noise_main + +; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m, +; const float *q_filt, int noise, +; int kx, int m_max) +cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max + mova m0, [ps_noise2] + jmp apply_noise_main + +; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m, +; const float *q_filt, int noise, +; int kx, int m_max) +cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max + and kxq, 1 + shl kxq, 4 + LOAD_NST ps_noise13+16 + +apply_noise_main: +%if ARCH_X86_64 == 0 || WIN64 + mov kxd, m_maxm +%define count kxq +%else +%define count m_maxq +%endif + dec noiseq + shl count, 2 +%ifdef PIC + lea NOISE_TABLE, [sbr_noise_table] +%endif + lea Yq, [Yq + 2*count] + add s_mq, count + add q_filtq, count + shl noiseq, 3 + pxor m5, m5 + neg count +.loop: + mova m1, [q_filtq + count] + movu m3, [noiseq + NOISE_TABLE + 1*mmsize] + movu m4, [noiseq + NOISE_TABLE + 2*mmsize] + add noiseq, 2*mmsize + and noiseq, 0x1ff<<3 + punpckhdq m2, m1, m1 + punpckldq m1, m1 + mulps m1, m3 ; m2 = q_filt[m] * ff_sbr_noise_table[noise] + mulps m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise] + mova m3, [s_mq + count] + ; TODO: replace by a vpermd in AVX2 + punpckhdq m4, m3, m3 + punpckldq m3, m3 + pcmpeqd m6, m3, m5 ; m6 == 0 + pcmpeqd m7, m4, m5 ; m7 == 0 + mulps m3, m0 ; s_m[m] * phi_sign + mulps m4, m0 ; s_m[m] * phi_sign + pand m1, m6 + pand m2, m7 + movu m6, [Yq + 2*count] + movu m7, [Yq + 2*count + mmsize] + addps m3, m1 + addps m4, m2 + addps m6, m3 + addps m7, m4 + movu [Yq + 2*count], m6 + movu [Yq + 2*count + mmsize], m7 + add count, mmsize + jl .loop + RET diff --git a/ffmpeg/libavcodec/x86/sbrdsp_init.c b/ffmpeg/libavcodec/x86/sbrdsp_init.c index 27fade1..2b912d0 100644 --- a/ffmpeg/libavcodec/x86/sbrdsp_init.c +++ b/ffmpeg/libavcodec/x86/sbrdsp_init.c @@ -2,20 +2,20 @@ * AAC Spectral Band Replication decoding 
functions * Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -32,17 +32,45 @@ void ff_sbr_hf_g_filt_sse(float (*Y)[2], const float (*X_high)[40][2], void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2], const float alpha0[2], const float alpha1[2], float bw, int start, int end); +void ff_sbr_neg_odd_64_sse(float *z); void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z); +void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1); +void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1); +void ff_sbr_qmf_pre_shuffle_sse2(float *z); + +void ff_sbr_hf_apply_noise_0_sse2(float (*Y)[2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); +void ff_sbr_hf_apply_noise_1_sse2(float (*Y)[2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); +void ff_sbr_hf_apply_noise_2_sse2(float (*Y)[2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); +void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s) { - int mm_flags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_SSE(mm_flags)) { + if (EXTERNAL_SSE(cpu_flags)) { + s->neg_odd_64 = ff_sbr_neg_odd_64_sse; s->sum_square = ff_sbr_sum_square_sse; s->sum64x5 = ff_sbr_sum64x5_sse; s->hf_g_filt = ff_sbr_hf_g_filt_sse; s->hf_gen = ff_sbr_hf_gen_sse; s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse; + s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse; + } + + if (EXTERNAL_SSE2(cpu_flags)) { + s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse2; + s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_sse2; + s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse2; + s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse2; + s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2; + s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2; } } diff --git a/ffmpeg/libavcodec/x86/simple_idct.c b/ffmpeg/libavcodec/x86/simple_idct.c index f27d2b9..c666b1a 100644 --- a/ffmpeg/libavcodec/x86/simple_idct.c +++ b/ffmpeg/libavcodec/x86/simple_idct.c @@ -21,7 +21,7 @@ */ #include "libavcodec/simple_idct.h" #include "libavutil/mem.h" -#include "dsputil_mmx.h" +#include "dsputil_x86.h" #if HAVE_INLINE_ASM @@ -80,7 +80,7 @@ DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= { static inline void idct(int16_t *block) { - DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; + LOCAL_ALIGNED_8(int64_t, align_tmp, [16]); int16_t * const temp= (int16_t*)align_tmp; __asm__ 
volatile( diff --git a/ffmpeg/libavcodec/x86/snowdsp.c b/ffmpeg/libavcodec/x86/snowdsp.c index 5505ee8..735e790 100644 --- a/ffmpeg/libavcodec/x86/snowdsp.c +++ b/ffmpeg/libavcodec/x86/snowdsp.c @@ -24,7 +24,7 @@ #include "libavcodec/avcodec.h" #include "libavcodec/snow.h" #include "libavcodec/snow_dwt.h" -#include "dsputil_mmx.h" +#include "dsputil_x86.h" #if HAVE_INLINE_ASM diff --git a/ffmpeg/libavcodec/x86/v210.asm b/ffmpeg/libavcodec/x86/v210.asm index 5473126..6554a43 100644 --- a/ffmpeg/libavcodec/x86/v210.asm +++ b/ffmpeg/libavcodec/x86/v210.asm @@ -3,20 +3,20 @@ ;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu> ;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/ffmpeg/libavcodec/x86/vc1dsp_init.c b/ffmpeg/libavcodec/x86/vc1dsp_init.c index 228f4dc..9f18131 100644 --- a/ffmpeg/libavcodec/x86/vc1dsp_init.c +++ b/ffmpeg/libavcodec/x86/vc1dsp_init.c @@ -27,7 +27,7 @@ #include "libavutil/cpu.h" #include "libavutil/x86/cpu.h" #include "libavcodec/vc1dsp.h" -#include "dsputil_mmx.h" +#include "dsputil_x86.h" #include "vc1dsp.h" #include "config.h" @@ -83,12 +83,12 @@ void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src, av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp) { - int mm_flags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); - if (INLINE_MMX(mm_flags)) + if (INLINE_MMX(cpu_flags)) ff_vc1dsp_init_mmx(dsp); - if (INLINE_MMXEXT(mm_flags)) + if (INLINE_MMXEXT(cpu_flags)) ff_vc1dsp_init_mmxext(dsp); #define ASSIGN_LF(EXT) \ @@ -100,31 +100,30 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp) dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_ ## EXT #if HAVE_YASM - if (mm_flags & AV_CPU_FLAG_MMX) { + if (EXTERNAL_MMX(cpu_flags)) { dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx; } - - if (mm_flags & AV_CPU_FLAG_MMXEXT) { + if (EXTERNAL_AMD3DNOW(cpu_flags)) { + dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_3dnow; + } + if (EXTERNAL_MMXEXT(cpu_flags)) { ASSIGN_LF(mmxext); dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext; dsp->avg_vc1_mspel_pixels_tab[0] = avg_vc1_mspel_mc00_mmxext; - } else if (mm_flags & AV_CPU_FLAG_3DNOW) { - dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_3dnow; } - - if (mm_flags & AV_CPU_FLAG_SSE2) { + if (EXTERNAL_SSE2(cpu_flags)) { dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2; dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse2; dsp->vc1_v_loop_filter16 = 
vc1_v_loop_filter16_sse2; dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2; } - if (mm_flags & AV_CPU_FLAG_SSSE3) { + if (EXTERNAL_SSSE3(cpu_flags)) { ASSIGN_LF(ssse3); dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_ssse3; dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_ssse3; } - if (mm_flags & AV_CPU_FLAG_SSE4) { + if (EXTERNAL_SSE4(cpu_flags)) { dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse4; dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse4; } diff --git a/ffmpeg/libavcodec/x86/vc1dsp_mmx.c b/ffmpeg/libavcodec/x86/vc1dsp_mmx.c index df0385f..5ceacd3 100644 --- a/ffmpeg/libavcodec/x86/vc1dsp_mmx.c +++ b/ffmpeg/libavcodec/x86/vc1dsp_mmx.c @@ -28,8 +28,9 @@ #include "libavutil/mem.h" #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" -#include "dsputil_mmx.h" #include "libavcodec/vc1dsp.h" +#include "constants.h" +#include "dsputil_x86.h" #include "vc1dsp.h" #if HAVE_INLINE_ASM @@ -698,53 +699,59 @@ static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize, ); } +static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride, int rnd) +{ + ff_put_pixels8_mmx(dst, src, stride, 8); +} + av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp) { - dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx; - dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx; - dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx; - dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx; - - dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx; - dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx; - dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx; - dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx; - - dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx; - dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx; - dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx; - dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx; - - dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx; - dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx; - dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx; - dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx; + dsp->put_vc1_mspel_pixels_tab[ 0] = put_vc1_mspel_mc00_mmx; + dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx; + dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx; + dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx; + + dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx; + dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx; + dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx; + dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx; + + dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx; + dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx; + dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx; + dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx; + + dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx; + dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx; + dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx; + dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx; } av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp) { - dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext; - dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext; - - dsp->avg_vc1_mspel_pixels_tab[ 1] = 
avg_vc1_mspel_mc10_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext; - dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext; - - dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext; - dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext; - dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext; - - dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext; - dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext; - dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext; - - dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext; - dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext; - dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext; - dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext; + dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext; + dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext; + dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext; + + dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmxext; + dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext; + dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext; + dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext; + + dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmxext; + dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext; + dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext; + dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext; + + dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext; + dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext; + dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext; + dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext; + + dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext; + dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext; + dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext; + dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext; } #endif /* HAVE_INLINE_ASM */ diff --git a/ffmpeg/libavcodec/x86/videodsp.asm b/ffmpeg/libavcodec/x86/videodsp.asm index 0eb4721..1ac0257 100644 --- a/ffmpeg/libavcodec/x86/videodsp.asm +++ b/ffmpeg/libavcodec/x86/videodsp.asm @@ -23,577 +23,409 @@ SECTION .text -; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize, -; x86_reg start_y, x86_reg end_y, x86_reg block_h, -; x86_reg start_x, x86_reg end_x, x86_reg block_w); -; -; The actual function itself is below. It basically wraps a very simple -; w = end_x - start_x -; if (w) { -; if (w > 22) { -; jump to the slow loop functions -; } else { -; jump to the fast loop functions -; } -; } -; -; ... and then the same for left/right extend also. See below for loop -; function implementations. Fast are fixed-width, slow is variable-width - -%macro EMU_EDGE_FUNC 0 -%if ARCH_X86_64 -%define w_reg r7 -cglobal emu_edge_core, 6, 9, 1 - mov r8, r5 ; save block_h -%else -%define w_reg r6 -cglobal emu_edge_core, 2, 7, 0 - mov r4, r4m ; end_y - mov r5, r5m ; block_h -%endif +; slow vertical extension loop function. 
Works with variable-width, and +; does per-line reading/writing of source data + +%macro V_COPY_ROW 2 ; type (top/body/bottom), h +.%1_y_loop: ; do { + mov wq, r7mp ; initialize w (r7mp = wmp) +.%1_x_loop: ; do { + movu m0, [srcq+wq] ; m0 = read($mmsize) + movu [dstq+wq], m0 ; write(m0, $mmsize) + add wq, mmsize ; w -= $mmsize + cmp wq, -mmsize ; } while (w > $mmsize); + jl .%1_x_loop + movu m0, [srcq-mmsize] ; m0 = read($mmsize) + movu [dstq-mmsize], m0 ; write(m0, $mmsize) +%ifidn %1, body ; if ($type == body) { + add srcq, src_strideq ; src += src_stride +%endif ; } + add dstq, dst_strideq ; dst += dst_stride + dec %2 ; } while (--$h); + jnz .%1_y_loop +%endmacro - ; start with vertical extend (top/bottom) and body pixel copy - mov w_reg, r7m - sub w_reg, r6m ; w = start_x - end_x - sub r5, r4 +%macro vvar_fn 0 +; .----. <- zero +; | | <- top is copied from first line in body of source +; |----| <- start_y +; | | <- body is copied verbatim (line-by-line) from source +; |----| <- end_y +; | | <- bottom is copied from last line in body of source +; '----' <- bh %if ARCH_X86_64 - sub r4, r3 -%else - sub r4, dword r3m +cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \ + start_y, end_y, bh, w +%else ; x86-32 +cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w +%define src_strideq r3mp +%define dst_strideq r1mp + mov srcq, r2mp + mov start_yq, r4mp + mov end_yq, r5mp + mov bhq, r6mp %endif - cmp w_reg, 22 - jg .slow_v_extend_loop + sub bhq, end_yq ; bh -= end_q + sub end_yq, start_yq ; end_q -= start_q + add srcq, r7mp ; (r7mp = wmp) + add dstq, r7mp ; (r7mp = wmp) + neg r7mp ; (r7mp = wmp) + test start_yq, start_yq ; if (start_q) { + jz .body + V_COPY_ROW top, start_yq ; v_copy_row(top, start_yq) +.body: ; } + V_COPY_ROW body, end_yq ; v_copy_row(body, end_yq) + test bhq, bhq ; if (bh) { + jz .end + sub srcq, src_strideq ; src -= src_stride + V_COPY_ROW bottom, bhq ; v_copy_row(bottom, bh) +.end: ; } + RET +%endmacro + %if ARCH_X86_32 - mov r2, r2m ; linesize -%endif - sal w_reg, 7 ; w * 128 -%ifdef PIC - lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)] - add w_reg, rax -%else - lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg] +INIT_MMX mmx +vvar_fn %endif - call w_reg ; fast top extend, body copy and bottom extend -.v_extend_end: - ; horizontal extend (left/right) - mov w_reg, r6m ; start_x - sub r0, w_reg -%if ARCH_X86_64 - mov r3, r0 ; backup of buf+block_h*linesize - mov r5, r8 -%else - mov r0m, r0 ; backup of buf+block_h*linesize - mov r5, r5m -%endif - test w_reg, w_reg - jz .right_extend - cmp w_reg, 22 - jg .slow_left_extend_loop - mov r1, w_reg - dec w_reg - ; FIXME we can do a if size == 1 here if that makes any speed difference, test me - sar w_reg, 1 - sal w_reg, 6 - ; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs - ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h -%ifdef PIC - lea rax, [.emuedge_extend_left_2] - add w_reg, rax -%else - lea w_reg, [.emuedge_extend_left_2+w_reg] -%endif - call w_reg +INIT_XMM sse +vvar_fn + +%macro hvar_fn 0 +cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w + lea dstq, [dstq+n_wordsq*2] + neg n_wordsq + lea start_xq, [start_xq+n_wordsq*2] +.y_loop: ; do { + ; FIXME also write a ssse3 version using pshufb + movzx wd, byte [dstq+start_xq] ; w = read(1) + imul wd, 0x01010101 ; w *= 0x01010101 + movd m0, wd + mov wq, n_wordsq ; initialize w +%if cpuflag(sse2) + pshufd m0, m0, q0000 ; splat +%else ; mmx + 
punpckldq m0, m0 ; splat +%endif ; mmx/sse +.x_loop: ; do { + movu [dstq+wq*2], m0 ; write($reg, $mmsize) + add wq, mmsize/2 ; w -= $mmsize/2 + cmp wq, -mmsize/2 ; } while (w > $mmsize/2) + jl .x_loop + movu [dstq-mmsize], m0 ; write($reg, $mmsize) + add dstq, dst_strideq ; dst += dst_stride + dec hq ; } while (h--) + jnz .y_loop + RET +%endmacro - ; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w -.right_extend: %if ARCH_X86_32 - mov r0, r0m - mov r5, r5m -%endif - mov w_reg, r7m ; end_x - mov r1, r8m ; block_w - mov r4, r1 - sub r1, w_reg - jz .h_extend_end ; if (end_x == block_w) goto h_extend_end - cmp r1, 22 - jg .slow_right_extend_loop - dec r1 - ; FIXME we can do a if size == 1 here if that makes any speed difference, test me - sar r1, 1 - sal r1, 6 -%ifdef PIC - lea rax, [.emuedge_extend_right_2] - add r1, rax -%else - lea r1, [.emuedge_extend_right_2+r1] +INIT_MMX mmx +hvar_fn %endif - call r1 -.h_extend_end: - RET -%if ARCH_X86_64 -%define vall al -%define valh ah -%define valw ax -%define valw2 r7w -%define valw3 r3w -%if WIN64 -%define valw4 r7w -%else ; unix64 -%define valw4 r3w -%endif -%define vald eax -%else -%define vall bl -%define valh bh -%define valw bx -%define valw2 r6w -%define valw3 valw2 -%define valw4 valw3 -%define vald ebx -%define stack_offset 0x14 -%endif - -%endmacro +INIT_XMM sse2 +hvar_fn ; macro to read/write a horizontal number of pixels (%2) to/from registers -; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels -; - if (%2 & 15 == 8) fills the last 8 bytes into rax -; - else if (%2 & 8) fills 8 bytes into mm0 -; - if (%2 & 7 == 4) fills the last 4 bytes into rax -; - else if (%2 & 4) fills 4 bytes into mm0-1 -; - if (%2 & 3 == 3) fills 2 bytes into r7/r3, and 1 into eax -; (note that we're using r3 for body/bottom because it's a shorter -; opcode, and then the loop fits in 128 bytes) -; - else fills remaining bytes into rax -; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels -; - if (%2 & 7 == 4) fills 4 bytes into ebx -; - else if (%2 & 4) fills 4 bytes into mm0-7 -; - if (%2 & 3 == 3) fills 2 bytes into r6, and 1 into ebx -; - else fills remaining bytes into ebx +; on sse, - fills xmm0-15 for consecutive sets of 16 pixels +; - if (%2 & 8) fills 8 bytes into xmm$next +; - if (%2 & 4) fills 4 bytes into xmm$next +; - if (%2 & 3) fills 1, 2 or 4 bytes in eax +; on mmx, - fills mm0-7 for consecutive sets of 8 pixels +; - if (%2 & 4) fills 4 bytes into mm$next +; - if (%2 & 3) fills 1, 2 or 4 bytes in eax ; writing data out is in the same way %macro READ_NUM_BYTES 2 -%assign %%src_off 0 ; offset in source buffer -%assign %%smidx 0 ; mmx register idx -%assign %%sxidx 0 ; xmm register idx - -%if cpuflag(sse) -%rep %2/16 - movups xmm %+ %%sxidx, [r1+%%src_off] -%assign %%src_off %%src_off+16 -%assign %%sxidx %%sxidx+1 -%endrep ; %2/16 +%assign %%off 0 ; offset in source buffer +%assign %%mmx_idx 0 ; mmx register index +%assign %%xmm_idx 0 ; xmm register index + +%rep %2/mmsize +%if mmsize == 16 + movu xmm %+ %%xmm_idx, [srcq+%%off] +%assign %%xmm_idx %%xmm_idx+1 +%else ; mmx + movu mm %+ %%mmx_idx, [srcq+%%off] +%assign %%mmx_idx %%mmx_idx+1 +%endif +%assign %%off %%off+mmsize +%endrep ; %2/mmsize + +%if mmsize == 16 +%if (%2-%%off) >= 8 +%if %2 > 16 && (%2-%%off) > 8 + movu xmm %+ %%xmm_idx, [srcq+%2-16] +%assign %%xmm_idx %%xmm_idx+1 +%assign %%off %2 +%else + movq mm %+ %%mmx_idx, [srcq+%%off] +%assign %%mmx_idx %%mmx_idx+1 +%assign %%off %%off+8 +%endif +%endif ; (%2-%%off) >= 8 %endif -%if 
ARCH_X86_64 -%if (%2-%%src_off) == 8 - mov rax, [r1+%%src_off] -%assign %%src_off %%src_off+8 -%endif ; (%2-%%src_off) == 8 -%endif ; x86-64 - -%rep (%2-%%src_off)/8 - movq mm %+ %%smidx, [r1+%%src_off] -%assign %%src_off %%src_off+8 -%assign %%smidx %%smidx+1 -%endrep ; (%2-%%dst_off)/8 - -%if (%2-%%src_off) == 4 - mov vald, [r1+%%src_off] -%elif (%2-%%src_off) & 4 - movd mm %+ %%smidx, [r1+%%src_off] -%assign %%src_off %%src_off+4 -%endif ; (%2-%%src_off) ==/& 4 - -%if (%2-%%src_off) == 1 - mov vall, [r1+%%src_off] -%elif (%2-%%src_off) == 2 - mov valw, [r1+%%src_off] -%elif (%2-%%src_off) == 3 -%ifidn %1, top - mov valw2, [r1+%%src_off] +%if (%2-%%off) >= 4 +%if %2 > 8 && (%2-%%off) > 4 + movq mm %+ %%mmx_idx, [srcq+%2-8] +%assign %%off %2 +%else + movd mm %+ %%mmx_idx, [srcq+%%off] +%assign %%off %%off+4 +%endif +%assign %%mmx_idx %%mmx_idx+1 +%endif ; (%2-%%off) >= 4 + +%if (%2-%%off) >= 1 +%if %2 >= 4 + movd mm %+ %%mmx_idx, [srcq+%2-4] +%elif (%2-%%off) == 1 + mov valb, [srcq+%2-1] +%elif (%2-%%off) == 2 + mov valw, [srcq+%2-2] %elifidn %1, body - mov valw3, [r1+%%src_off] -%elifidn %1, bottom - mov valw4, [r1+%%src_off] -%endif ; %1 ==/!= top - mov vall, [r1+%%src_off+2] -%endif ; (%2-%%src_off) == 1/2/3 + mov vald, [srcq+%2-3] +%else + movd mm %+ %%mmx_idx, [srcq+%2-3] +%endif +%endif ; (%2-%%off) >= 1 %endmacro ; READ_NUM_BYTES %macro WRITE_NUM_BYTES 2 -%assign %%dst_off 0 ; offset in destination buffer -%assign %%dmidx 0 ; mmx register idx -%assign %%dxidx 0 ; xmm register idx - -%if cpuflag(sse) -%rep %2/16 - movups [r0+%%dst_off], xmm %+ %%dxidx -%assign %%dst_off %%dst_off+16 -%assign %%dxidx %%dxidx+1 -%endrep ; %2/16 +%assign %%off 0 ; offset in destination buffer +%assign %%mmx_idx 0 ; mmx register index +%assign %%xmm_idx 0 ; xmm register index + +%rep %2/mmsize +%if mmsize == 16 + movu [dstq+%%off], xmm %+ %%xmm_idx +%assign %%xmm_idx %%xmm_idx+1 +%else ; mmx + movu [dstq+%%off], mm %+ %%mmx_idx +%assign %%mmx_idx %%mmx_idx+1 +%endif +%assign %%off %%off+mmsize +%endrep ; %2/mmsize + +%if mmsize == 16 +%if (%2-%%off) >= 8 +%if %2 > 16 && (%2-%%off) > 8 + movu [dstq+%2-16], xmm %+ %%xmm_idx +%assign %%xmm_idx %%xmm_idx+1 +%assign %%off %2 +%else + movq [dstq+%%off], mm %+ %%mmx_idx +%assign %%mmx_idx %%mmx_idx+1 +%assign %%off %%off+8 +%endif +%endif ; (%2-%%off) >= 8 %endif -%if ARCH_X86_64 -%if (%2-%%dst_off) == 8 - mov [r0+%%dst_off], rax -%assign %%dst_off %%dst_off+8 -%endif ; (%2-%%dst_off) == 8 -%endif ; x86-64 - -%rep (%2-%%dst_off)/8 - movq [r0+%%dst_off], mm %+ %%dmidx -%assign %%dst_off %%dst_off+8 -%assign %%dmidx %%dmidx+1 -%endrep ; (%2-%%dst_off)/8 - -%if (%2-%%dst_off) == 4 - mov [r0+%%dst_off], vald -%elif (%2-%%dst_off) & 4 - movd [r0+%%dst_off], mm %+ %%dmidx -%assign %%dst_off %%dst_off+4 -%endif ; (%2-%%dst_off) ==/& 4 - -%if (%2-%%dst_off) == 1 - mov [r0+%%dst_off], vall -%elif (%2-%%dst_off) == 2 - mov [r0+%%dst_off], valw -%elif (%2-%%dst_off) == 3 -%ifidn %1, top - mov [r0+%%dst_off], valw2 +%if (%2-%%off) >= 4 +%if %2 > 8 && (%2-%%off) > 4 + movq [dstq+%2-8], mm %+ %%mmx_idx +%assign %%off %2 +%else + movd [dstq+%%off], mm %+ %%mmx_idx +%assign %%off %%off+4 +%endif +%assign %%mmx_idx %%mmx_idx+1 +%endif ; (%2-%%off) >= 4 + +%if (%2-%%off) >= 1 +%if %2 >= 4 + movd [dstq+%2-4], mm %+ %%mmx_idx +%elif (%2-%%off) == 1 + mov [dstq+%2-1], valb +%elif (%2-%%off) == 2 + mov [dstq+%2-2], valw %elifidn %1, body - mov [r0+%%dst_off], valw3 -%elifidn %1, bottom - mov [r0+%%dst_off], valw4 -%endif ; %1 ==/!= top - mov [r0+%%dst_off+2], vall -%endif ; 
(%2-%%dst_off) == 1/2/3 + mov [dstq+%2-3], valw + shr vald, 16 + mov [dstq+%2-1], valb +%else + movd vald, mm %+ %%mmx_idx + mov [dstq+%2-3], valw + shr vald, 16 + mov [dstq+%2-1], valb +%endif +%endif ; (%2-%%off) >= 1 %endmacro ; WRITE_NUM_BYTES ; vertical top/bottom extend and body copy fast loops ; these are function pointers to set-width line copy functions, i.e. ; they read a fixed number of pixels into set registers, and write ; those out into the destination buffer -; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h -; r6(eax/64)/r3(ebx/32)=val_reg -%macro VERTICAL_EXTEND 0 -%assign %%n 1 -%rep 22 -ALIGN 128 -.emuedge_v_extend_ %+ %%n: - ; extend pixels above body +%macro VERTICAL_EXTEND 2 +%assign %%n %1 +%rep 1+%2-%1 +%if %%n <= 3 %if ARCH_X86_64 - test r3 , r3 ; if (!start_y) - jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body -%else ; ARCH_X86_32 - cmp dword r3m, 0 - je .emuedge_copy_body_ %+ %%n %+ _loop -%endif ; ARCH_X86_64/32 - READ_NUM_BYTES top, %%n ; read bytes -.emuedge_extend_top_ %+ %%n %+ _loop: ; do { - WRITE_NUM_BYTES top, %%n ; write bytes - add r0 , r2 ; dst += linesize +cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \ + start_y, end_y, val, bh + mov bhq, r6mp ; r6mp = bhmp +%else ; x86-32 +cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh + mov dstq, r0mp + mov srcq, r2mp + mov start_yq, r4mp + mov end_yq, r5mp + mov bhq, r6mp +%define dst_strideq r1mp +%define src_strideq r3mp +%endif ; x86-64/32 +%else %if ARCH_X86_64 - dec r3d -%else ; ARCH_X86_32 - dec dword r3m -%endif ; ARCH_X86_64/32 - jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y) +cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \ + start_y, end_y, bh +%else ; x86-32 +cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh + mov srcq, r2mp + mov start_yq, r4mp + mov end_yq, r5mp + mov bhq, r6mp +%define dst_strideq r1mp +%define src_strideq r3mp +%endif ; x86-64/32 +%endif + ; FIXME move this to c wrapper? 
+ sub bhq, end_yq ; bh -= end_y + sub end_yq, start_yq ; end_y -= start_y + + ; extend pixels above body + test start_yq, start_yq ; if (start_y) { + jz .body_loop + READ_NUM_BYTES top, %%n ; $variable_regs = read($n) +.top_loop: ; do { + WRITE_NUM_BYTES top, %%n ; write($variable_regs, $n) + add dstq, dst_strideq ; dst += linesize + dec start_yq ; } while (--start_y) + jnz .top_loop ; } ; copy body pixels -.emuedge_copy_body_ %+ %%n %+ _loop: ; do { - READ_NUM_BYTES body, %%n ; read bytes - WRITE_NUM_BYTES body, %%n ; write bytes - add r0 , r2 ; dst += linesize - add r1 , r2 ; src += linesize - dec r4d - jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y) +.body_loop: ; do { + READ_NUM_BYTES body, %%n ; $variable_regs = read($n) + WRITE_NUM_BYTES body, %%n ; write($variable_regs, $n) + add dstq, dst_strideq ; dst += dst_stride + add srcq, src_strideq ; src += src_stride + dec end_yq ; } while (--end_y) + jnz .body_loop ; copy bottom pixels - test r5 , r5 ; if (!block_h) - jz .emuedge_v_extend_end_ %+ %%n ; goto end - sub r1 , r2 ; src -= linesize - READ_NUM_BYTES bottom, %%n ; read bytes -.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do { - WRITE_NUM_BYTES bottom, %%n ; write bytes - add r0 , r2 ; dst += linesize - dec r5d - jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h) - -.emuedge_v_extend_end_ %+ %%n: -%if ARCH_X86_64 - ret -%else ; ARCH_X86_32 - rep ret -%endif ; ARCH_X86_64/32 + test bhq, bhq ; if (block_h) { + jz .end + sub srcq, src_strideq ; src -= linesize + READ_NUM_BYTES bottom, %%n ; $variable_regs = read($n) +.bottom_loop: ; do { + WRITE_NUM_BYTES bottom, %%n ; write($variable_regs, $n) + add dstq, dst_strideq ; dst += linesize + dec bhq ; } while (--bh) + jnz .bottom_loop ; } + +.end: + RET %assign %%n %%n+1 -%endrep -%endmacro VERTICAL_EXTEND +%endrep ; 1+%2-%1 +%endmacro ; VERTICAL_EXTEND + +INIT_MMX mmx +VERTICAL_EXTEND 1, 15 +%if ARCH_X86_32 +VERTICAL_EXTEND 16, 22 +%endif + +INIT_XMM sse +VERTICAL_EXTEND 16, 22 ; left/right (horizontal) fast extend functions ; these are essentially identical to the vertical extend ones above, ; just left/right separated because number of pixels to extend is ; obviously not the same on both sides. 
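
As a reference for the fixed- and variable-width kernels above, the whole edge-extension scheme reduces to a few lines of scalar C: copy the available body rows, repeat the first/last available row into the missing rows above/below, then repeat each row's edge pixels into the left/right margins (which the hfix/hvar kernels do 8 or 16 bytes at a time via a byte splat). The sketch below is illustrative only; the helper name and the convention that src already points at the first available pixel of the block (row start_y, column start_x) are assumptions for the sketch, not the FFmpeg API.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Illustrative scalar equivalent of the vfix/vvar + hfix/hvar kernels. */
    static void edge_extend_sketch(uint8_t *dst, ptrdiff_t dst_stride,
                                   const uint8_t *src, ptrdiff_t src_stride,
                                   int start_x, int end_x, int block_w,
                                   int start_y, int end_y, int block_h)
    {
        int x, y, body_w = end_x - start_x, body_h = end_y - start_y;

        /* vertical pass: body rows verbatim, top/bottom repeat the nearest
         * available row (cf. the top/body/bottom loops above) */
        for (y = 0; y < block_h; y++) {
            int sy = y < start_y ? 0 : y >= end_y ? body_h - 1 : y - start_y;
            memcpy(dst + y * dst_stride + start_x, src + sy * src_stride, body_w);
        }

        /* horizontal pass: repeat the edge pixel of each completed row */
        for (y = 0; y < block_h; y++) {
            uint8_t *row = dst + y * dst_stride;
            for (x = 0; x < start_x; x++)
                row[x] = row[start_x];
            for (x = end_x; x < block_w; x++)
                row[x] = row[end_x - 1];
        }
    }
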
-; for reading, pixels are placed in eax (x86-64) or ebx (x86-64) in the -; lowest two bytes of the register (so val*0x0101), and are splatted -; into each byte of mm0 as well if n_pixels >= 8 %macro READ_V_PIXEL 2 - mov vall, %2 - mov valh, vall + movzx vald, byte %2 + imul vald, 0x01010101 %if %1 >= 8 - movd mm0, vald -%if cpuflag(mmxext) - pshufw mm0, mm0, 0 -%else ; mmx - punpcklwd mm0, mm0 - punpckldq mm0, mm0 -%endif ; sse -%endif ; %1 >= 8 -%endmacro + movd m0, vald +%if mmsize == 16 + pshufd m0, m0, q0000 +%else + punpckldq m0, m0 +%endif ; mmsize == 16 +%endif ; %1 > 16 +%endmacro ; READ_V_PIXEL %macro WRITE_V_PIXEL 2 -%assign %%dst_off 0 -%rep %1/8 - movq [%2+%%dst_off], mm0 -%assign %%dst_off %%dst_off+8 -%endrep -%if %1 & 4 -%if %1 >= 8 - movd [%2+%%dst_off], mm0 -%else ; %1 < 8 - mov [%2+%%dst_off] , valw - mov [%2+%%dst_off+2], valw -%endif ; %1 >=/< 8 -%assign %%dst_off %%dst_off+4 -%endif ; %1 & 4 -%if %1&2 - mov [%2+%%dst_off], valw -%endif ; %1 & 2 -%endmacro +%assign %%off 0 -; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val -%macro LEFT_EXTEND 0 -%assign %%n 2 -%rep 11 -ALIGN 64 -.emuedge_extend_left_ %+ %%n: ; do { - sub r0, r2 ; dst -= linesize - READ_V_PIXEL %%n, [r0+r1] ; read pixels - WRITE_V_PIXEL %%n, r0 ; write pixels - dec r5 - jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h) -%if ARCH_X86_64 - ret -%else ; ARCH_X86_32 - rep ret -%endif ; ARCH_X86_64/32 -%assign %%n %%n+2 -%endrep -%endmacro ; LEFT_EXTEND - -; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val -%macro RIGHT_EXTEND 0 -%assign %%n 2 -%rep 11 -ALIGN 64 -.emuedge_extend_right_ %+ %%n: ; do { -%if ARCH_X86_64 - sub r3, r2 ; dst -= linesize - READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels - WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels - dec r8 -%else ; ARCH_X86_32 - sub r0, r2 ; dst -= linesize - READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels - WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels - dec r5 -%endif ; ARCH_X86_64/32 - jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h) -%if ARCH_X86_64 - ret -%else ; ARCH_X86_32 - rep ret -%endif ; ARCH_X86_64/32 -%assign %%n %%n+2 -%endrep - -%if ARCH_X86_32 -%define stack_offset 0x10 -%endif -%endmacro ; RIGHT_EXTEND - -; below follow the "slow" copy/extend functions, these act on a non-fixed -; width specified in a register, and run a loop to copy the full amount -; of bytes. They are optimized for copying of large amounts of pixels per -; line, so they unconditionally splat data into mm registers to copy 8 -; bytes per loop iteration. It could be considered to use xmm for x86-64 -; also, but I haven't optimized this as much (i.e. 
FIXME) -%macro V_COPY_NPX 4-5 -%if %0 == 4 - test w_reg, %4 - jz .%1_skip_%4_px -%else ; %0 == 5 -.%1_%4_px_loop: -%endif - %3 %2, [r1+cnt_reg] - %3 [r0+cnt_reg], %2 - add cnt_reg, %4 -%if %0 == 5 - sub w_reg, %4 - test w_reg, %5 - jnz .%1_%4_px_loop -%endif -.%1_skip_%4_px: -%endmacro +%if %1 >= 8 -%macro V_COPY_ROW 2 -%ifidn %1, bottom - sub r1, linesize -%endif -.%1_copy_loop: - xor cnt_reg, cnt_reg -%if notcpuflag(sse) -%define linesize r2m - V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8 -%else ; sse - V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0 -%if ARCH_X86_64 -%define linesize r2 - V_COPY_NPX %1, rax , mov, 8 -%else ; ARCH_X86_32 -%define linesize r2m - V_COPY_NPX %1, mm0, movq, 8 -%endif ; ARCH_X86_64/32 -%endif ; sse - V_COPY_NPX %1, vald, mov, 4 - V_COPY_NPX %1, valw, mov, 2 - V_COPY_NPX %1, vall, mov, 1 - mov w_reg, cnt_reg -%ifidn %1, body - add r1, linesize -%endif - add r0, linesize - dec %2 - jnz .%1_copy_loop -%endmacro +%rep %1/mmsize + movu [%2+%%off], m0 +%assign %%off %%off+mmsize +%endrep ; %1/mmsize -%macro SLOW_V_EXTEND 0 -.slow_v_extend_loop: -; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h -; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x -%if ARCH_X86_64 - push r8 ; save old value of block_h - test r3, r3 -%define cnt_reg r8 - jz .do_body_copy ; if (!start_y) goto do_body_copy - V_COPY_ROW top, r3 +%if mmsize == 16 +%if %1-%%off >= 8 +%if %1 > 16 && %1-%%off > 8 + movu [%2+%1-16], m0 +%assign %%off %1 %else - cmp dword r3m, 0 -%define cnt_reg r2 - je .do_body_copy ; if (!start_y) goto do_body_copy - V_COPY_ROW top, dword r3m + movq [%2+%%off], m0 +%assign %%off %%off+8 %endif +%endif ; %1-%%off >= 8 +%endif ; mmsize == 16 -.do_body_copy: - V_COPY_ROW body, r4 - -%if ARCH_X86_64 - pop r8 ; restore old value of block_h -%define cnt_reg r3 -%endif - test r5, r5 -%if ARCH_X86_64 - jz .v_extend_end +%if %1-%%off >= 4 +%if %1 > 8 && %1-%%off > 4 + movq [%2+%1-8], m0 +%assign %%off %1 %else - jz .skip_bottom_extend -%endif - V_COPY_ROW bottom, r5 -%if ARCH_X86_32 -.skip_bottom_extend: - mov r2, r2m + movd [%2+%%off], m0 +%assign %%off %%off+4 %endif - jmp .v_extend_end -%endmacro +%endif ; %1-%%off >= 4 -%macro SLOW_LEFT_EXTEND 0 -.slow_left_extend_loop: -; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x - mov r4, 8 - sub r0, linesize - READ_V_PIXEL 8, [r0+w_reg] -.left_extend_8px_loop: - movq [r0+r4-8], mm0 - add r4, 8 - cmp r4, w_reg - jle .left_extend_8px_loop - sub r4, 8 - cmp r4, w_reg - jge .left_extend_loop_end -.left_extend_2px_loop: - mov [r0+r4], valw - add r4, 2 - cmp r4, w_reg - jl .left_extend_2px_loop -.left_extend_loop_end: - dec r5 - jnz .slow_left_extend_loop -%if ARCH_X86_32 - mov r2, r2m -%endif - jmp .right_extend -%endmacro +%else ; %1 < 8 -%macro SLOW_RIGHT_EXTEND 0 -.slow_right_extend_loop: -; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h, -; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr -%if ARCH_X86_64 -%define buf_reg r3 -%define bh_reg r8 -%else -%define buf_reg r0 -%define bh_reg r5 -%endif - lea r1, [r4-8] - sub buf_reg, linesize - READ_V_PIXEL 8, [buf_reg+w_reg-1] -.right_extend_8px_loop: - movq [buf_reg+r1], mm0 - sub r1, 8 - cmp r1, w_reg - jge .right_extend_8px_loop - add r1, 8 - cmp r1, w_reg - je .right_extend_loop_end -.right_extend_2px_loop: - sub r1, 2 - mov [buf_reg+r1], valw - cmp r1, w_reg - jg .right_extend_2px_loop -.right_extend_loop_end: - dec bh_reg - jnz .slow_right_extend_loop - jmp 
.h_extend_end -%endmacro +%rep %1/4 + mov [%2+%%off], vald +%assign %%off %%off+4 +%endrep ; %1/4 -%macro emu_edge 1 -INIT_XMM %1 -EMU_EDGE_FUNC -VERTICAL_EXTEND -LEFT_EXTEND -RIGHT_EXTEND -SLOW_V_EXTEND -SLOW_LEFT_EXTEND -SLOW_RIGHT_EXTEND -%endmacro +%endif ; %1 >=/< 8 + +%if %1-%%off == 2 + mov [%2+%%off], valw +%endif ; (%1-%%off)/2 +%endmacro ; WRITE_V_PIXEL + +%macro H_EXTEND 2 +%assign %%n %1 +%rep 1+(%2-%1)/2 +cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val +.loop_y: ; do { + READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = read($n) + WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n) + add dstq, dst_strideq ; dst += dst_stride + dec bhq ; } while (--bh) + jnz .loop_y + RET +%assign %%n %%n+2 +%endrep ; 1+(%2-%1)/2 +%endmacro ; H_EXTEND -emu_edge sse +INIT_MMX mmx +H_EXTEND 2, 14 %if ARCH_X86_32 -emu_edge mmx +H_EXTEND 16, 22 %endif +INIT_XMM sse2 +H_EXTEND 16, 22 + %macro PREFETCH_FN 1 cglobal prefetch, 3, 3, 0, buf, stride, h .loop: diff --git a/ffmpeg/libavcodec/x86/videodsp_init.c b/ffmpeg/libavcodec/x86/videodsp_init.c index 902450e..2013a93 100644 --- a/ffmpeg/libavcodec/x86/videodsp_init.c +++ b/ffmpeg/libavcodec/x86/videodsp_init.c @@ -26,36 +26,131 @@ #include "libavutil/cpu.h" #include "libavutil/mem.h" #include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" #include "libavcodec/videodsp.h" #if HAVE_YASM -typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src, - x86_reg linesize, x86_reg start_y, - x86_reg end_y, x86_reg block_h, - x86_reg start_x, x86_reg end_x, - x86_reg block_w); -extern emu_edge_core_func ff_emu_edge_core_mmx; -extern emu_edge_core_func ff_emu_edge_core_sse; - -static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src, - ptrdiff_t linesize_arg, - int block_w, int block_h, - int src_x, int src_y, - int w, int h, - emu_edge_core_func *core_fn) +typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride, + const uint8_t *src, x86_reg src_stride, + x86_reg start_y, x86_reg end_y, x86_reg bh); +typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride, + const uint8_t *src, x86_reg src_stride, + x86_reg start_y, x86_reg end_y, x86_reg bh, + x86_reg w); + +extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix16_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix17_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix18_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix19_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx; +#if ARCH_X86_32 +static emu_edge_vfix_func *vfixtbl_mmx[22] = { + &ff_emu_edge_vfix1_mmx, &ff_emu_edge_vfix2_mmx, &ff_emu_edge_vfix3_mmx, + &ff_emu_edge_vfix4_mmx, &ff_emu_edge_vfix5_mmx, &ff_emu_edge_vfix6_mmx, + 
&ff_emu_edge_vfix7_mmx, &ff_emu_edge_vfix8_mmx, &ff_emu_edge_vfix9_mmx, + &ff_emu_edge_vfix10_mmx, &ff_emu_edge_vfix11_mmx, &ff_emu_edge_vfix12_mmx, + &ff_emu_edge_vfix13_mmx, &ff_emu_edge_vfix14_mmx, &ff_emu_edge_vfix15_mmx, + &ff_emu_edge_vfix16_mmx, &ff_emu_edge_vfix17_mmx, &ff_emu_edge_vfix18_mmx, + &ff_emu_edge_vfix19_mmx, &ff_emu_edge_vfix20_mmx, &ff_emu_edge_vfix21_mmx, + &ff_emu_edge_vfix22_mmx +}; +#endif +extern emu_edge_vvar_func ff_emu_edge_vvar_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix16_sse; +extern emu_edge_vfix_func ff_emu_edge_vfix17_sse; +extern emu_edge_vfix_func ff_emu_edge_vfix18_sse; +extern emu_edge_vfix_func ff_emu_edge_vfix19_sse; +extern emu_edge_vfix_func ff_emu_edge_vfix20_sse; +extern emu_edge_vfix_func ff_emu_edge_vfix21_sse; +extern emu_edge_vfix_func ff_emu_edge_vfix22_sse; +static emu_edge_vfix_func *vfixtbl_sse[22] = { + ff_emu_edge_vfix1_mmx, ff_emu_edge_vfix2_mmx, ff_emu_edge_vfix3_mmx, + ff_emu_edge_vfix4_mmx, ff_emu_edge_vfix5_mmx, ff_emu_edge_vfix6_mmx, + ff_emu_edge_vfix7_mmx, ff_emu_edge_vfix8_mmx, ff_emu_edge_vfix9_mmx, + ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx, + ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx, + ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse, + ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse, + ff_emu_edge_vfix22_sse +}; +extern emu_edge_vvar_func ff_emu_edge_vvar_sse; + +typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride, + x86_reg start_x, x86_reg bh); +typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride, + x86_reg start_x, x86_reg n_words, x86_reg bh); + +extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix16_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx; +#if ARCH_X86_32 +static emu_edge_hfix_func *hfixtbl_mmx[11] = { + ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, + ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx, + ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_mmx, ff_emu_edge_hfix18_mmx, + ff_emu_edge_hfix20_mmx, ff_emu_edge_hfix22_mmx +}; +#endif +extern emu_edge_hvar_func ff_emu_edge_hvar_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2; +extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2; +extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2; +extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2; +static emu_edge_hfix_func *hfixtbl_sse2[11] = { + ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, + ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx, + ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2, + ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2 +}; +extern emu_edge_hvar_func ff_emu_edge_hvar_sse2; + +static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride, + ptrdiff_t src_stride, + x86_reg block_w, x86_reg block_h, + x86_reg src_x, x86_reg src_y, + x86_reg w, x86_reg h, + emu_edge_vfix_func **vfix_tbl, + emu_edge_vvar_func *v_extend_var, + emu_edge_hfix_func **hfix_tbl, + 
emu_edge_hvar_func *h_extend_var) { - int start_y, start_x, end_y, end_x, src_y_add = 0; - int linesize = linesize_arg; + x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p; - if(!w || !h) + if (!w || !h) return; if (src_y >= h) { - src -= src_y*linesize; + src -= src_y*src_stride; src_y_add = h - 1; src_y = h - 1; } else if (src_y <= -block_h) { - src -= src_y*linesize; + src -= src_y*src_stride; src_y_add = 1 - block_h; src_y = 1 - block_h; } @@ -75,30 +170,72 @@ static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src, av_assert2(start_y < end_y && block_h > 0); // fill in the to-be-copied part plus all above/below - src += (src_y_add + start_y) * linesize + start_x; - buf += start_x; - core_fn(buf, src, linesize, start_y, end_y, - block_h, start_x, end_x, block_w); + src += (src_y_add + start_y) * src_stride + start_x; + w = end_x - start_x; + if (w <= 22) { + vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride, + start_y, end_y, block_h); + } else { + v_extend_var(dst + start_x, dst_stride, src, src_stride, + start_y, end_y, block_h, w); + } + + // fill left + if (start_x) { + if (start_x <= 22) { + hfix_tbl[(start_x - 1) >> 1](dst, dst_stride, start_x, block_h); + } else { + h_extend_var(dst, dst_stride, + start_x, (start_x + 1) >> 1, block_h); + } + } + + // fill right + p = block_w - end_x; + if (p) { + if (p <= 22) { + hfix_tbl[(p - 1) >> 1](dst + end_x - (p & 1), dst_stride, + -!(p & 1), block_h); + } else { + h_extend_var(dst + end_x - (p & 1), dst_stride, + -!(p & 1), (p + 1) >> 1, block_h); + } + } } #if ARCH_X86_32 static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, - ptrdiff_t linesize, + ptrdiff_t buf_stride, + ptrdiff_t src_stride, int block_w, int block_h, int src_x, int src_y, int w, int h) { - emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y, - w, h, &ff_emu_edge_core_mmx); + emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, + src_x, src_y, w, h, vfixtbl_mmx, &ff_emu_edge_vvar_mmx, + hfixtbl_mmx, &ff_emu_edge_hvar_mmx); } -#endif static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, - ptrdiff_t linesize, + ptrdiff_t buf_stride, + ptrdiff_t src_stride, int block_w, int block_h, int src_x, int src_y, int w, int h) { - emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y, - w, h, &ff_emu_edge_core_sse); + emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, + src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, + hfixtbl_mmx, &ff_emu_edge_hvar_mmx); +} +#endif + +static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src, + ptrdiff_t buf_stride, + ptrdiff_t src_stride, + int block_w, int block_h, + int src_x, int src_y, int w, int h) +{ + emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, + src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, + hfixtbl_sse2, &ff_emu_edge_hvar_sse2); } #endif /* HAVE_YASM */ @@ -108,21 +245,26 @@ void ff_prefetch_3dnow(uint8_t *buf, ptrdiff_t stride, int h); av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc) { #if HAVE_YASM - int mm_flags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); #if ARCH_X86_32 - if (bpc <= 8 && mm_flags & AV_CPU_FLAG_MMX) { + if (EXTERNAL_MMX(cpu_flags) && bpc <= 8) { ctx->emulated_edge_mc = emulated_edge_mc_mmx; } - if (mm_flags & AV_CPU_FLAG_3DNOW) { + if (EXTERNAL_AMD3DNOW(cpu_flags)) { ctx->prefetch = ff_prefetch_3dnow; } #endif /* ARCH_X86_32 */ - if (mm_flags & AV_CPU_FLAG_MMXEXT) { + if 
(EXTERNAL_MMXEXT(cpu_flags)) { ctx->prefetch = ff_prefetch_mmxext; } - if (bpc <= 8 && mm_flags & AV_CPU_FLAG_SSE) { +#if ARCH_X86_32 + if (EXTERNAL_SSE(cpu_flags) && bpc <= 8) { ctx->emulated_edge_mc = emulated_edge_mc_sse; } +#endif /* ARCH_X86_32 */ + if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) { + ctx->emulated_edge_mc = emulated_edge_mc_sse2; + } #endif /* HAVE_YASM */ } diff --git a/ffmpeg/libavcodec/x86/vorbisdsp_init.c b/ffmpeg/libavcodec/x86/vorbisdsp_init.c index 08a2c09..284a528 100644 --- a/ffmpeg/libavcodec/x86/vorbisdsp_init.c +++ b/ffmpeg/libavcodec/x86/vorbisdsp_init.c @@ -21,6 +21,7 @@ #include "config.h" #include "libavutil/attributes.h" #include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" #include "libavcodec/vorbisdsp.h" void ff_vorbis_inverse_coupling_3dnow(float *mag, float *ang, @@ -31,13 +32,13 @@ void ff_vorbis_inverse_coupling_sse(float *mag, float *ang, av_cold void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp) { #if HAVE_YASM - int mm_flags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); #if ARCH_X86_32 - if (mm_flags & AV_CPU_FLAG_3DNOW) + if (EXTERNAL_AMD3DNOW(cpu_flags)) dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_3dnow; #endif /* ARCH_X86_32 */ - if (mm_flags & AV_CPU_FLAG_SSE) + if (EXTERNAL_SSE(cpu_flags)) dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_sse; #endif /* HAVE_YASM */ } diff --git a/ffmpeg/libavcodec/x86/vp3dsp.asm b/ffmpeg/libavcodec/x86/vp3dsp.asm index a47b8f2..24496ae 100644 --- a/ffmpeg/libavcodec/x86/vp3dsp.asm +++ b/ffmpeg/libavcodec/x86/vp3dsp.asm @@ -33,7 +33,7 @@ vp3_idct_data: times 8 dw 64277 times 8 dw 25080 times 8 dw 12785 -pb_7: times 8 db 7 +pb_7: times 8 db 0x07 pb_1F: times 8 db 0x1f pb_81: times 8 db 0x81 diff --git a/ffmpeg/libavcodec/x86/vp3dsp_init.c b/ffmpeg/libavcodec/x86/vp3dsp_init.c index 252b40a..1f02a6f 100644 --- a/ffmpeg/libavcodec/x86/vp3dsp_init.c +++ b/ffmpeg/libavcodec/x86/vp3dsp_init.c @@ -43,7 +43,7 @@ void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride, void ff_vp3_h_loop_filter_mmxext(uint8_t *src, int stride, int *bounding_values); -#if HAVE_INLINE_ASM +#if HAVE_MMX_INLINE #define MOVQ_BFE(regd) \ __asm__ volatile ( \ @@ -95,24 +95,24 @@ static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const u :"memory"); // STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx") } -#endif /* HAVE_INLINE_ASM */ +#endif /* HAVE_MMX_INLINE */ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags) { - int cpuflags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); -#if HAVE_INLINE_ASM +#if HAVE_MMX_INLINE c->put_no_rnd_pixels_l2 = put_vp_no_rnd_pixels8_l2_mmx; -#endif /* HAVE_INLINE_ASM */ +#endif /* HAVE_MMX_INLINE */ #if ARCH_X86_32 - if (EXTERNAL_MMX(cpuflags)) { + if (EXTERNAL_MMX(cpu_flags)) { c->idct_put = ff_vp3_idct_put_mmx; c->idct_add = ff_vp3_idct_add_mmx; } #endif - if (EXTERNAL_MMXEXT(cpuflags)) { + if (EXTERNAL_MMXEXT(cpu_flags)) { c->idct_dc_add = ff_vp3_idct_dc_add_mmxext; if (!(flags & CODEC_FLAG_BITEXACT)) { @@ -121,7 +121,7 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags) } } - if (EXTERNAL_SSE2(cpuflags)) { + if (EXTERNAL_SSE2(cpu_flags)) { c->idct_put = ff_vp3_idct_put_sse2; c->idct_add = ff_vp3_idct_add_sse2; } diff --git a/ffmpeg/libavcodec/x86/vp56dsp.asm b/ffmpeg/libavcodec/x86/vp56dsp.asm deleted file mode 100644 index 3d874ea..0000000 --- a/ffmpeg/libavcodec/x86/vp56dsp.asm +++ /dev/null @@ -1,170 +0,0 @@ -;****************************************************************************** -;* MMX/SSE2-optimized 
functions for the VP6 decoder -;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com> -;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com> -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -cextern pw_64 - -SECTION .text - -%macro DIAG4 6 -%if mmsize == 8 - movq m0, [%1+%2] - movq m1, [%1+%3] - movq m3, m0 - movq m4, m1 - punpcklbw m0, m7 - punpcklbw m1, m7 - punpckhbw m3, m7 - punpckhbw m4, m7 - pmullw m0, [rsp+8*11] ; src[x-8 ] * biweight [0] - pmullw m1, [rsp+8*12] ; src[x ] * biweight [1] - pmullw m3, [rsp+8*11] ; src[x-8 ] * biweight [0] - pmullw m4, [rsp+8*12] ; src[x ] * biweight [1] - paddw m0, m1 - paddw m3, m4 - movq m1, [%1+%4] - movq m2, [%1+%5] - movq m4, m1 - movq m5, m2 - punpcklbw m1, m7 - punpcklbw m2, m7 - punpckhbw m4, m7 - punpckhbw m5, m7 - pmullw m1, [rsp+8*13] ; src[x+8 ] * biweight [2] - pmullw m2, [rsp+8*14] ; src[x+16] * biweight [3] - pmullw m4, [rsp+8*13] ; src[x+8 ] * biweight [2] - pmullw m5, [rsp+8*14] ; src[x+16] * biweight [3] - paddw m1, m2 - paddw m4, m5 - paddsw m0, m1 - paddsw m3, m4 - paddsw m0, m6 ; Add 64 - paddsw m3, m6 ; Add 64 - psraw m0, 7 - psraw m3, 7 - packuswb m0, m3 - movq [%6], m0 -%else ; mmsize == 16 - movq m0, [%1+%2] - movq m1, [%1+%3] - punpcklbw m0, m7 - punpcklbw m1, m7 - pmullw m0, m4 ; src[x-8 ] * biweight [0] - pmullw m1, m5 ; src[x ] * biweight [1] - paddw m0, m1 - movq m1, [%1+%4] - movq m2, [%1+%5] - punpcklbw m1, m7 - punpcklbw m2, m7 - pmullw m1, m6 ; src[x+8 ] * biweight [2] - pmullw m2, m3 ; src[x+16] * biweight [3] - paddw m1, m2 - paddsw m0, m1 - paddsw m0, [pw_64] ; Add 64 - psraw m0, 7 - packuswb m0, m0 - movq [%6], m0 -%endif ; mmsize == 8/16 -%endmacro - -%macro SPLAT4REGS 0 -%if mmsize == 8 - movq m5, m3 - punpcklwd m3, m3 - movq m4, m3 - punpckldq m3, m3 - punpckhdq m4, m4 - punpckhwd m5, m5 - movq m2, m5 - punpckhdq m2, m2 - punpckldq m5, m5 - movq [rsp+8*11], m3 - movq [rsp+8*12], m4 - movq [rsp+8*13], m5 - movq [rsp+8*14], m2 -%else ; mmsize == 16 - pshuflw m4, m3, 0x0 - pshuflw m5, m3, 0x55 - pshuflw m6, m3, 0xAA - pshuflw m3, m3, 0xFF - punpcklqdq m4, m4 - punpcklqdq m5, m5 - punpcklqdq m6, m6 - punpcklqdq m3, m3 -%endif ; mmsize == 8/16 -%endmacro - -%macro vp6_filter_diag4 0 -; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, int stride, -; const int16_t h_weight[4], const int16_t v_weights[4]) -cglobal vp6_filter_diag4, 5, 7, 8 - mov r5, rsp ; backup stack pointer - and rsp, ~(mmsize-1) ; align stack -%if mmsize == 16 - sub rsp, 8*11 -%else - sub rsp, 8*15 - movq m6, [pw_64] -%endif -%if ARCH_X86_64 - movsxd r2, r2d -%endif - - sub r1, r2 - - pxor m7, m7 - movq m3, [r3] - SPLAT4REGS - - mov r3, rsp - mov r6, 11 -.nextrow: - DIAG4 r1, -1, 0, 1, 2, r3 - add 
r3, 8 - add r1, r2 - dec r6 - jnz .nextrow - - movq m3, [r4] - SPLAT4REGS - - lea r3, [rsp+8] - mov r6, 8 -.nextcol: - DIAG4 r3, -8, 0, 8, 16, r0 - add r3, 8 - add r0, r2 - dec r6 - jnz .nextcol - - mov rsp, r5 ; restore stack pointer - RET -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmx -vp6_filter_diag4 -%endif - -INIT_XMM sse2 -vp6_filter_diag4 diff --git a/ffmpeg/libavcodec/x86/vp56dsp_init.c b/ffmpeg/libavcodec/x86/vp56dsp_init.c deleted file mode 100644 index defc63b..0000000 --- a/ffmpeg/libavcodec/x86/vp56dsp_init.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * VP6 MMX/SSE2 optimizations - * Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com> - * Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/vp56dsp.h" - -void ff_vp6_filter_diag4_mmx(uint8_t *dst, uint8_t *src, int stride, - const int16_t *h_weights,const int16_t *v_weights); -void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, int stride, - const int16_t *h_weights,const int16_t *v_weights); - -av_cold void ff_vp56dsp_init_x86(VP56DSPContext* c, enum AVCodecID codec) -{ - int mm_flags = av_get_cpu_flags(); - - if (CONFIG_VP6_DECODER && codec == AV_CODEC_ID_VP6) { -#if ARCH_X86_32 - if (EXTERNAL_MMX(mm_flags)) { - c->vp6_filter_diag4 = ff_vp6_filter_diag4_mmx; - } -#endif - - if (EXTERNAL_SSE2(mm_flags)) { - c->vp6_filter_diag4 = ff_vp6_filter_diag4_sse2; - } - } -} diff --git a/ffmpeg/libavcodec/x86/vp8dsp.asm b/ffmpeg/libavcodec/x86/vp8dsp.asm index ca07333..85c7e99 100644 --- a/ffmpeg/libavcodec/x86/vp8dsp.asm +++ b/ffmpeg/libavcodec/x86/vp8dsp.asm @@ -143,27 +143,13 @@ filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 -pw_27: times 8 dw 27 -pw_63: times 8 dw 63 pw_256: times 8 dw 256 pw_20091: times 4 dw 20091 pw_17734: times 4 dw 17734 -pb_4: times 16 db 4 -pb_F8: times 16 db 0xF8 -pb_FE: times 16 db 0xFE -pb_27_63: times 8 db 27, 63 -pb_18_63: times 8 db 18, 63 -pb_9_63: times 8 db 9, 63 - -cextern pb_1 cextern pw_3 -cextern pb_3 cextern pw_4 -cextern pw_9 -cextern pw_18 cextern pw_64 -cextern pb_80 SECTION .text @@ -1237,1544 +1223,3 @@ VP8_DC_WHT %endif INIT_MMX sse VP8_DC_WHT - -;----------------------------------------------------------------------------- -; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); -;----------------------------------------------------------------------------- - -; macro called with 7 mm register indexes as argument, and 4 regular registers -; -; first 4 mm registers will carry the transposed pixel data -; the other three are 
scratchspace (one would be sufficient, but this allows -; for more spreading/pipelining and thus faster execution on OOE CPUs) -; -; first two regular registers are buf+4*stride and buf+5*stride -; third is -stride, fourth is +stride -%macro READ_8x4_INTERLEAVED 11 - ; interleave 8 (A-H) rows of 4 pixels each - movd m%1, [%8+%10*4] ; A0-3 - movd m%5, [%9+%10*4] ; B0-3 - movd m%2, [%8+%10*2] ; C0-3 - movd m%6, [%8+%10] ; D0-3 - movd m%3, [%8] ; E0-3 - movd m%7, [%9] ; F0-3 - movd m%4, [%9+%11] ; G0-3 - punpcklbw m%1, m%5 ; A/B interleaved - movd m%5, [%9+%11*2] ; H0-3 - punpcklbw m%2, m%6 ; C/D interleaved - punpcklbw m%3, m%7 ; E/F interleaved - punpcklbw m%4, m%5 ; G/H interleaved -%endmacro - -; macro called with 7 mm register indexes as argument, and 5 regular registers -; first 11 mean the same as READ_8x4_TRANSPOSED above -; fifth regular register is scratchspace to reach the bottom 8 rows, it -; will be set to second regular register + 8*stride at the end -%macro READ_16x4_INTERLEAVED 12 - ; transpose 16 (A-P) rows of 4 pixels each - lea %12, [r0+8*r2] - - ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M - movd m%1, [%8+%10*4] ; A0-3 - movd m%3, [%12+%10*4] ; I0-3 - movd m%2, [%8+%10*2] ; C0-3 - movd m%4, [%12+%10*2] ; K0-3 - movd m%6, [%8+%10] ; D0-3 - movd m%5, [%12+%10] ; L0-3 - movd m%7, [%12] ; M0-3 - add %12, %11 - punpcklbw m%1, m%3 ; A/I - movd m%3, [%8] ; E0-3 - punpcklbw m%2, m%4 ; C/K - punpcklbw m%6, m%5 ; D/L - punpcklbw m%3, m%7 ; E/M - punpcklbw m%2, m%6 ; C/D/K/L interleaved - - ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P - movd m%5, [%9+%10*4] ; B0-3 - movd m%4, [%12+%10*4] ; J0-3 - movd m%7, [%9] ; F0-3 - movd m%6, [%12] ; N0-3 - punpcklbw m%5, m%4 ; B/J - punpcklbw m%7, m%6 ; F/N - punpcklbw m%1, m%5 ; A/B/I/J interleaved - punpcklbw m%3, m%7 ; E/F/M/N interleaved - movd m%4, [%9+%11] ; G0-3 - movd m%6, [%12+%11] ; O0-3 - movd m%5, [%9+%11*2] ; H0-3 - movd m%7, [%12+%11*2] ; P0-3 - punpcklbw m%4, m%6 ; G/O - punpcklbw m%5, m%7 ; H/P - punpcklbw m%4, m%5 ; G/H/O/P interleaved -%endmacro - -; write 4 mm registers of 2 dwords each -; first four arguments are mm register indexes containing source data -; last four are registers containing buf+4*stride, buf+5*stride, -; -stride and +stride -%macro WRITE_4x2D 8 - ; write out (2 dwords per register) - movd [%5+%7*4], m%1 - movd [%5+%7*2], m%2 - movd [%5], m%3 - movd [%6+%8], m%4 - punpckhdq m%1, m%1 - punpckhdq m%2, m%2 - punpckhdq m%3, m%3 - punpckhdq m%4, m%4 - movd [%6+%7*4], m%1 - movd [%5+%7], m%2 - movd [%6], m%3 - movd [%6+%8*2], m%4 -%endmacro - -; write 4 xmm registers of 4 dwords each -; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular -; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride -; we add 1*stride to the third regular registry in the process -; the 10th argument is 16 if it's a Y filter (i.e. 
all regular registers cover the -; same memory region), or 8 if they cover two separate buffers (third one points to -; a different memory region than the first two), allowing for more optimal code for -; the 16-width case -%macro WRITE_4x4D 10 - ; write out (4 dwords per register), start with dwords zero - movd [%5+%8*4], m%1 - movd [%5], m%2 - movd [%7+%8*4], m%3 - movd [%7], m%4 - - ; store dwords 1 - psrldq m%1, 4 - psrldq m%2, 4 - psrldq m%3, 4 - psrldq m%4, 4 - movd [%6+%8*4], m%1 - movd [%6], m%2 -%if %10 == 16 - movd [%6+%9*4], m%3 -%endif - movd [%7+%9], m%4 - - ; write dwords 2 - psrldq m%1, 4 - psrldq m%2, 4 -%if %10 == 8 - movd [%5+%8*2], m%1 - movd %5d, m%3 -%endif - psrldq m%3, 4 - psrldq m%4, 4 -%if %10 == 16 - movd [%5+%8*2], m%1 -%endif - movd [%6+%9], m%2 - movd [%7+%8*2], m%3 - movd [%7+%9*2], m%4 - add %7, %9 - - ; store dwords 3 - psrldq m%1, 4 - psrldq m%2, 4 - psrldq m%3, 4 - psrldq m%4, 4 -%if %10 == 8 - mov [%7+%8*4], %5d - movd [%6+%8*2], m%1 -%else - movd [%5+%8], m%1 -%endif - movd [%6+%9*2], m%2 - movd [%7+%8*2], m%3 - movd [%7+%9*2], m%4 -%endmacro - -; write 4 or 8 words in the mmx/xmm registers as 8 lines -; 1 and 2 are the registers to write, this can be the same (for SSE2) -; for pre-SSE4: -; 3 is a general-purpose register that we will clobber -; for SSE4: -; 3 is a pointer to the destination's 5th line -; 4 is a pointer to the destination's 4th line -; 5/6 is -stride and +stride -%macro WRITE_2x4W 6 - movd %3d, %1 - punpckhdq %1, %1 - mov [%4+%5*4], %3w - shr %3, 16 - add %4, %6 - mov [%4+%5*4], %3w - - movd %3d, %1 - add %4, %5 - mov [%4+%5*2], %3w - shr %3, 16 - mov [%4+%5 ], %3w - - movd %3d, %2 - punpckhdq %2, %2 - mov [%4 ], %3w - shr %3, 16 - mov [%4+%6 ], %3w - - movd %3d, %2 - add %4, %6 - mov [%4+%6 ], %3w - shr %3, 16 - mov [%4+%6*2], %3w - add %4, %5 -%endmacro - -%macro WRITE_8W 5 -%if cpuflag(sse4) - pextrw [%3+%4*4], %1, 0 - pextrw [%2+%4*4], %1, 1 - pextrw [%3+%4*2], %1, 2 - pextrw [%3+%4 ], %1, 3 - pextrw [%3 ], %1, 4 - pextrw [%2 ], %1, 5 - pextrw [%2+%5 ], %1, 6 - pextrw [%2+%5*2], %1, 7 -%else - movd %2d, %1 - psrldq %1, 4 - mov [%3+%4*4], %2w - shr %2, 16 - add %3, %5 - mov [%3+%4*4], %2w - - movd %2d, %1 - psrldq %1, 4 - add %3, %4 - mov [%3+%4*2], %2w - shr %2, 16 - mov [%3+%4 ], %2w - - movd %2d, %1 - psrldq %1, 4 - mov [%3 ], %2w - shr %2, 16 - mov [%3+%5 ], %2w - - movd %2d, %1 - add %3, %5 - mov [%3+%5 ], %2w - shr %2, 16 - mov [%3+%5*2], %2w -%endif -%endmacro - -%macro SIMPLE_LOOPFILTER 2 -cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr -%if mmsize == 8 ; mmx/mmxext - mov cntrq, 2 -%endif -%if cpuflag(ssse3) - pxor m0, m0 -%endif - SPLATB_REG m7, flim, m0 ; splat "flim" into register - - ; set up indexes to address 4 rows -%if mmsize == 8 - DEFINE_ARGS dst1, mstride, stride, cntr, dst2 -%else - DEFINE_ARGS dst1, mstride, stride, dst3, dst2 -%endif - mov strideq, mstrideq - neg mstrideq -%ifidn %1, h - lea dst1q, [dst1q+4*strideq-2] -%endif - -%if mmsize == 8 ; mmx / mmxext -.next8px: -%endif -%ifidn %1, v - ; read 4 half/full rows of pixels - mova m0, [dst1q+mstrideq*2] ; p1 - mova m1, [dst1q+mstrideq] ; p0 - mova m2, [dst1q] ; q0 - mova m3, [dst1q+ strideq] ; q1 -%else ; h - lea dst2q, [dst1q+ strideq] - -%if mmsize == 8 ; mmx/mmxext - READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq -%else ; sse2 - READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q -%endif - TRANSPOSE4x4W 0, 1, 2, 3, 4 -%endif - - ; simple_limit - mova m5, m2 ; m5=backup of q0 - 
mova m6, m1 ; m6=backup of p0 - psubusb m1, m2 ; p0-q0 - psubusb m2, m6 ; q0-p0 - por m1, m2 ; FFABS(p0-q0) - paddusb m1, m1 ; m1=FFABS(p0-q0)*2 - - mova m4, m3 - mova m2, m0 - psubusb m3, m0 ; q1-p1 - psubusb m0, m4 ; p1-q1 - por m3, m0 ; FFABS(p1-q1) - mova m0, [pb_80] - pxor m2, m0 - pxor m4, m0 - psubsb m2, m4 ; m2=p1-q1 (signed) backup for below - pand m3, [pb_FE] - psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed - paddusb m3, m1 - psubusb m3, m7 - pxor m1, m1 - pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0) - - ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask) - mova m4, m5 - pxor m5, m0 - pxor m0, m6 - psubsb m5, m0 ; q0-p0 (signed) - paddsb m2, m5 - paddsb m2, m5 - paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0) - pand m2, m3 ; apply filter mask (m3) - - mova m3, [pb_F8] - mova m1, m2 - paddsb m2, [pb_4] ; f1<<3=a+4 - paddsb m1, [pb_3] ; f2<<3=a+3 - pand m2, m3 - pand m1, m3 ; cache f2<<3 - - pxor m0, m0 - pxor m3, m3 - pcmpgtb m0, m2 ; which values are <0? - psubb m3, m2 ; -f1<<3 - psrlq m2, 3 ; +f1 - psrlq m3, 3 ; -f1 - pand m3, m0 - pandn m0, m2 - psubusb m4, m0 - paddusb m4, m3 ; q0-f1 - - pxor m0, m0 - pxor m3, m3 - pcmpgtb m0, m1 ; which values are <0? - psubb m3, m1 ; -f2<<3 - psrlq m1, 3 ; +f2 - psrlq m3, 3 ; -f2 - pand m3, m0 - pandn m0, m1 - paddusb m6, m0 - psubusb m6, m3 ; p0+f2 - - ; store -%ifidn %1, v - mova [dst1q], m4 - mova [dst1q+mstrideq], m6 -%else ; h - inc dst1q - SBUTTERFLY bw, 6, 4, 0 - -%if mmsize == 16 ; sse2 -%if cpuflag(sse4) - inc dst2q -%endif - WRITE_8W m6, dst2q, dst1q, mstrideq, strideq - lea dst2q, [dst3q+mstrideq+1] -%if cpuflag(sse4) - inc dst3q -%endif - WRITE_8W m4, dst3q, dst2q, mstrideq, strideq -%else ; mmx/mmxext - WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq -%endif -%endif - -%if mmsize == 8 ; mmx/mmxext - ; next 8 pixels -%ifidn %1, v - add dst1q, 8 ; advance 8 cols = pixels -%else ; h - lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines -%endif - dec cntrq - jg .next8px - REP_RET -%else ; sse2 - RET -%endif -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmx -SIMPLE_LOOPFILTER v, 4 -SIMPLE_LOOPFILTER h, 5 -INIT_MMX mmxext -SIMPLE_LOOPFILTER v, 4 -SIMPLE_LOOPFILTER h, 5 -%endif - -INIT_XMM sse2 -SIMPLE_LOOPFILTER v, 3 -SIMPLE_LOOPFILTER h, 5 -INIT_XMM ssse3 -SIMPLE_LOOPFILTER v, 3 -SIMPLE_LOOPFILTER h, 5 -INIT_XMM sse4 -SIMPLE_LOOPFILTER h, 5 - -;----------------------------------------------------------------------------- -; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, -; int flimE, int flimI, int hev_thr); -;----------------------------------------------------------------------------- - -%macro INNER_LOOPFILTER 2 -%define stack_size 0 -%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr -%ifidn %1, v ; [3]=hev() result -%define stack_size mmsize * -4 -%else ; h ; extra storage space for transposes -%define stack_size mmsize * -5 -%endif -%endif - -%if %2 == 8 ; chroma -cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr -%else ; luma -cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr -%endif - -%if cpuflag(ssse3) - pxor m7, m7 -%endif - -%ifndef m8 - ; splat function arguments - SPLATB_REG m0, flimEq, m7 ; E - SPLATB_REG m1, flimIq, m7 ; I - SPLATB_REG m2, hevthrq, m7 ; hev_thresh - -%define m_flimE [rsp] -%define m_flimI [rsp+mmsize] -%define m_hevthr [rsp+mmsize*2] -%define m_maskres [rsp+mmsize*3] -%define m_p0backup [rsp+mmsize*3] -%define m_q0backup [rsp+mmsize*4] - - 
mova m_flimE, m0 - mova m_flimI, m1 - mova m_hevthr, m2 -%else -%define m_flimE m9 -%define m_flimI m10 -%define m_hevthr m11 -%define m_maskres m12 -%define m_p0backup m12 -%define m_q0backup m8 - - ; splat function arguments - SPLATB_REG m_flimE, flimEq, m7 ; E - SPLATB_REG m_flimI, flimIq, m7 ; I - SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh -%endif - -%if %2 == 8 ; chroma - DEFINE_ARGS dst1, dst8, mstride, stride, dst2 -%elif mmsize == 8 - DEFINE_ARGS dst1, mstride, stride, dst2, cntr - mov cntrq, 2 -%else - DEFINE_ARGS dst1, mstride, stride, dst2, dst8 -%endif - mov strideq, mstrideq - neg mstrideq -%ifidn %1, h - lea dst1q, [dst1q+strideq*4-4] -%if %2 == 8 ; chroma - lea dst8q, [dst8q+strideq*4-4] -%endif -%endif - -%if mmsize == 8 -.next8px: -%endif - ; read - lea dst2q, [dst1q+strideq] -%ifidn %1, v -%if %2 == 8 && mmsize == 16 -%define movrow movh -%else -%define movrow mova -%endif - movrow m0, [dst1q+mstrideq*4] ; p3 - movrow m1, [dst2q+mstrideq*4] ; p2 - movrow m2, [dst1q+mstrideq*2] ; p1 - movrow m5, [dst2q] ; q1 - movrow m6, [dst2q+ strideq*1] ; q2 - movrow m7, [dst2q+ strideq*2] ; q3 -%if mmsize == 16 && %2 == 8 - movhps m0, [dst8q+mstrideq*4] - movhps m2, [dst8q+mstrideq*2] - add dst8q, strideq - movhps m1, [dst8q+mstrideq*4] - movhps m5, [dst8q] - movhps m6, [dst8q+ strideq ] - movhps m7, [dst8q+ strideq*2] - add dst8q, mstrideq -%endif -%elif mmsize == 8 ; mmx/mmxext (h) - ; read 8 rows of 8px each - movu m0, [dst1q+mstrideq*4] - movu m1, [dst2q+mstrideq*4] - movu m2, [dst1q+mstrideq*2] - movu m3, [dst1q+mstrideq ] - movu m4, [dst1q] - movu m5, [dst2q] - movu m6, [dst2q+ strideq ] - - ; 8x8 transpose - TRANSPOSE4x4B 0, 1, 2, 3, 7 - mova m_q0backup, m1 - movu m7, [dst2q+ strideq*2] - TRANSPOSE4x4B 4, 5, 6, 7, 1 - SBUTTERFLY dq, 0, 4, 1 ; p3/p2 - SBUTTERFLY dq, 2, 6, 1 ; q0/q1 - SBUTTERFLY dq, 3, 7, 1 ; q2/q3 - mova m1, m_q0backup - mova m_q0backup, m2 ; store q0 - SBUTTERFLY dq, 1, 5, 2 ; p1/p0 - mova m_p0backup, m5 ; store p0 - SWAP 1, 4 - SWAP 2, 4 - SWAP 6, 3 - SWAP 5, 3 -%else ; sse2 (h) -%if %2 == 16 - lea dst8q, [dst1q+ strideq*8] -%endif - - ; read 16 rows of 8px each, interleave - movh m0, [dst1q+mstrideq*4] - movh m1, [dst8q+mstrideq*4] - movh m2, [dst1q+mstrideq*2] - movh m5, [dst8q+mstrideq*2] - movh m3, [dst1q+mstrideq ] - movh m6, [dst8q+mstrideq ] - movh m4, [dst1q] - movh m7, [dst8q] - punpcklbw m0, m1 ; A/I - punpcklbw m2, m5 ; C/K - punpcklbw m3, m6 ; D/L - punpcklbw m4, m7 ; E/M - - add dst8q, strideq - movh m1, [dst2q+mstrideq*4] - movh m6, [dst8q+mstrideq*4] - movh m5, [dst2q] - movh m7, [dst8q] - punpcklbw m1, m6 ; B/J - punpcklbw m5, m7 ; F/N - movh m6, [dst2q+ strideq ] - movh m7, [dst8q+ strideq ] - punpcklbw m6, m7 ; G/O - - ; 8x16 transpose - TRANSPOSE4x4B 0, 1, 2, 3, 7 -%ifdef m8 - SWAP 1, 8 -%else - mova m_q0backup, m1 -%endif - movh m7, [dst2q+ strideq*2] - movh m1, [dst8q+ strideq*2] - punpcklbw m7, m1 ; H/P - TRANSPOSE4x4B 4, 5, 6, 7, 1 - SBUTTERFLY dq, 0, 4, 1 ; p3/p2 - SBUTTERFLY dq, 2, 6, 1 ; q0/q1 - SBUTTERFLY dq, 3, 7, 1 ; q2/q3 -%ifdef m8 - SWAP 1, 8 - SWAP 2, 8 -%else - mova m1, m_q0backup - mova m_q0backup, m2 ; store q0 -%endif - SBUTTERFLY dq, 1, 5, 2 ; p1/p0 -%ifdef m12 - SWAP 5, 12 -%else - mova m_p0backup, m5 ; store p0 -%endif - SWAP 1, 4 - SWAP 2, 4 - SWAP 6, 3 - SWAP 5, 3 -%endif - - ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 - mova m4, m1 - SWAP 4, 1 - psubusb m4, m0 ; p2-p3 - psubusb m0, m1 ; p3-p2 - por m0, m4 ; abs(p3-p2) - - mova m4, m2 - SWAP 4, 2 - psubusb m4, m1 ; p1-p2 - psubusb m1, m2 ; p2-p1 - por m1, m4 ; 
abs(p2-p1) - - mova m4, m6 - SWAP 4, 6 - psubusb m4, m7 ; q2-q3 - psubusb m7, m6 ; q3-q2 - por m7, m4 ; abs(q3-q2) - - mova m4, m5 - SWAP 4, 5 - psubusb m4, m6 ; q1-q2 - psubusb m6, m5 ; q2-q1 - por m6, m4 ; abs(q2-q1) - -%if notcpuflag(mmxext) - mova m4, m_flimI - pxor m3, m3 - psubusb m0, m4 - psubusb m1, m4 - psubusb m7, m4 - psubusb m6, m4 - pcmpeqb m0, m3 ; abs(p3-p2) <= I - pcmpeqb m1, m3 ; abs(p2-p1) <= I - pcmpeqb m7, m3 ; abs(q3-q2) <= I - pcmpeqb m6, m3 ; abs(q2-q1) <= I - pand m0, m1 - pand m7, m6 - pand m0, m7 -%else ; mmxext/sse2 - pmaxub m0, m1 - pmaxub m6, m7 - pmaxub m0, m6 -%endif - - ; normal_limit and high_edge_variance for p1-p0, q1-q0 - SWAP 7, 3 ; now m7 is zero -%ifidn %1, v - movrow m3, [dst1q+mstrideq ] ; p0 -%if mmsize == 16 && %2 == 8 - movhps m3, [dst8q+mstrideq ] -%endif -%elifdef m12 - SWAP 3, 12 -%else - mova m3, m_p0backup -%endif - - mova m1, m2 - SWAP 1, 2 - mova m6, m3 - SWAP 3, 6 - psubusb m1, m3 ; p1-p0 - psubusb m6, m2 ; p0-p1 - por m1, m6 ; abs(p1-p0) -%if notcpuflag(mmxext) - mova m6, m1 - psubusb m1, m4 - psubusb m6, m_hevthr - pcmpeqb m1, m7 ; abs(p1-p0) <= I - pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh - pand m0, m1 - mova m_maskres, m6 -%else ; mmxext/sse2 - pmaxub m0, m1 ; max_I - SWAP 1, 4 ; max_hev_thresh -%endif - - SWAP 6, 4 ; now m6 is I -%ifidn %1, v - movrow m4, [dst1q] ; q0 -%if mmsize == 16 && %2 == 8 - movhps m4, [dst8q] -%endif -%elifdef m8 - SWAP 4, 8 -%else - mova m4, m_q0backup -%endif - mova m1, m4 - SWAP 1, 4 - mova m7, m5 - SWAP 7, 5 - psubusb m1, m5 ; q0-q1 - psubusb m7, m4 ; q1-q0 - por m1, m7 ; abs(q1-q0) -%if notcpuflag(mmxext) - mova m7, m1 - psubusb m1, m6 - psubusb m7, m_hevthr - pxor m6, m6 - pcmpeqb m1, m6 ; abs(q1-q0) <= I - pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh - mova m6, m_maskres - pand m0, m1 ; abs([pq][321]-[pq][210]) <= I - pand m6, m7 -%else ; mmxext/sse2 - pxor m7, m7 - pmaxub m0, m1 - pmaxub m6, m1 - psubusb m0, m_flimI - psubusb m6, m_hevthr - pcmpeqb m0, m7 ; max(abs(..)) <= I - pcmpeqb m6, m7 ; !(max(abs..) 
> thresh) -%endif -%ifdef m12 - SWAP 6, 12 -%else - mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) -%endif - - ; simple_limit - mova m1, m3 - SWAP 1, 3 - mova m6, m4 ; keep copies of p0/q0 around for later use - SWAP 6, 4 - psubusb m1, m4 ; p0-q0 - psubusb m6, m3 ; q0-p0 - por m1, m6 ; abs(q0-p0) - paddusb m1, m1 ; m1=2*abs(q0-p0) - - mova m7, m2 - SWAP 7, 2 - mova m6, m5 - SWAP 6, 5 - psubusb m7, m5 ; p1-q1 - psubusb m6, m2 ; q1-p1 - por m7, m6 ; abs(q1-p1) - pxor m6, m6 - pand m7, [pb_FE] - psrlq m7, 1 ; abs(q1-p1)/2 - paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 - psubusb m7, m_flimE - pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E - pand m0, m7 ; normal_limit result - - ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask -%ifdef m8 ; x86-64 && sse2 - mova m8, [pb_80] -%define m_pb_80 m8 -%else ; x86-32 or mmx/mmxext -%define m_pb_80 [pb_80] -%endif - mova m1, m4 - mova m7, m3 - pxor m1, m_pb_80 - pxor m7, m_pb_80 - psubsb m1, m7 ; (signed) q0-p0 - mova m6, m2 - mova m7, m5 - pxor m6, m_pb_80 - pxor m7, m_pb_80 - psubsb m6, m7 ; (signed) p1-q1 - mova m7, m_maskres - pandn m7, m6 - paddsb m7, m1 - paddsb m7, m1 - paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1) - - pand m7, m0 - mova m1, [pb_F8] - mova m6, m7 - paddsb m7, [pb_3] - paddsb m6, [pb_4] - pand m7, m1 - pand m6, m1 - - pxor m1, m1 - pxor m0, m0 - pcmpgtb m1, m7 - psubb m0, m7 - psrlq m7, 3 ; +f2 - psrlq m0, 3 ; -f2 - pand m0, m1 - pandn m1, m7 - psubusb m3, m0 - paddusb m3, m1 ; p0+f2 - - pxor m1, m1 - pxor m0, m0 - pcmpgtb m0, m6 - psubb m1, m6 - psrlq m6, 3 ; +f1 - psrlq m1, 3 ; -f1 - pand m1, m0 - pandn m0, m6 - psubusb m4, m0 - paddusb m4, m1 ; q0-f1 - -%ifdef m12 - SWAP 6, 12 -%else - mova m6, m_maskres -%endif -%if notcpuflag(mmxext) - mova m7, [pb_1] -%else ; mmxext/sse2 - pxor m7, m7 -%endif - pand m0, m6 - pand m1, m6 -%if notcpuflag(mmxext) - paddusb m0, m7 - pand m1, [pb_FE] - pandn m7, m0 - psrlq m1, 1 - psrlq m7, 1 - SWAP 0, 7 -%else ; mmxext/sse2 - psubusb m1, [pb_1] - pavgb m0, m7 ; a - pavgb m1, m7 ; -a -%endif - psubusb m5, m0 - psubusb m2, m1 - paddusb m5, m1 ; q1-a - paddusb m2, m0 ; p1+a - - ; store -%ifidn %1, v - movrow [dst1q+mstrideq*2], m2 - movrow [dst1q+mstrideq ], m3 - movrow [dst1q], m4 - movrow [dst1q+ strideq ], m5 -%if mmsize == 16 && %2 == 8 - movhps [dst8q+mstrideq*2], m2 - movhps [dst8q+mstrideq ], m3 - movhps [dst8q], m4 - movhps [dst8q+ strideq ], m5 -%endif -%else ; h - add dst1q, 2 - add dst2q, 2 - - ; 4x8/16 transpose - TRANSPOSE4x4B 2, 3, 4, 5, 6 - -%if mmsize == 8 ; mmx/mmxext (h) - WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq -%else ; sse2 (h) - lea dst8q, [dst8q+mstrideq +2] - WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2 -%endif -%endif - -%if mmsize == 8 -%if %2 == 8 ; chroma -%ifidn %1, h - sub dst1q, 2 -%endif - cmp dst1q, dst8q - mov dst1q, dst8q - jnz .next8px -%else -%ifidn %1, h - lea dst1q, [dst1q+ strideq*8-2] -%else ; v - add dst1q, 8 -%endif - dec cntrq - jg .next8px -%endif - REP_RET -%else ; mmsize == 16 - RET -%endif -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmx -INNER_LOOPFILTER v, 16 -INNER_LOOPFILTER h, 16 -INNER_LOOPFILTER v, 8 -INNER_LOOPFILTER h, 8 - -INIT_MMX mmxext -INNER_LOOPFILTER v, 16 -INNER_LOOPFILTER h, 16 -INNER_LOOPFILTER v, 8 -INNER_LOOPFILTER h, 8 -%endif - -INIT_XMM sse2 -INNER_LOOPFILTER v, 16 -INNER_LOOPFILTER h, 16 -INNER_LOOPFILTER v, 8 -INNER_LOOPFILTER h, 8 - -INIT_XMM ssse3 -INNER_LOOPFILTER v, 16 -INNER_LOOPFILTER h, 16 -INNER_LOOPFILTER v, 8 -INNER_LOOPFILTER h, 8 - 
-;----------------------------------------------------------------------------- -; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, -; int flimE, int flimI, int hev_thr); -;----------------------------------------------------------------------------- - -%macro MBEDGE_LOOPFILTER 2 -%define stack_size 0 -%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr -%if mmsize == 16 ; [3]=hev() result - ; [4]=filter tmp result - ; [5]/[6] = p2/q2 backup - ; [7]=lim_res sign result -%define stack_size mmsize * -7 -%else ; 8 ; extra storage space for transposes -%define stack_size mmsize * -8 -%endif -%endif - -%if %2 == 8 ; chroma -cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr -%else ; luma -cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr -%endif - -%if cpuflag(ssse3) - pxor m7, m7 -%endif - -%ifndef m8 - ; splat function arguments - SPLATB_REG m0, flimEq, m7 ; E - SPLATB_REG m1, flimIq, m7 ; I - SPLATB_REG m2, hevthrq, m7 ; hev_thresh - -%define m_flimE [rsp] -%define m_flimI [rsp+mmsize] -%define m_hevthr [rsp+mmsize*2] -%define m_maskres [rsp+mmsize*3] -%define m_limres [rsp+mmsize*4] -%define m_p0backup [rsp+mmsize*3] -%define m_q0backup [rsp+mmsize*4] -%define m_p2backup [rsp+mmsize*5] -%define m_q2backup [rsp+mmsize*6] -%if mmsize == 16 -%define m_limsign [rsp] -%else -%define m_limsign [rsp+mmsize*7] -%endif - - mova m_flimE, m0 - mova m_flimI, m1 - mova m_hevthr, m2 -%else ; sse2 on x86-64 -%define m_flimE m9 -%define m_flimI m10 -%define m_hevthr m11 -%define m_maskres m12 -%define m_limres m8 -%define m_p0backup m12 -%define m_q0backup m8 -%define m_p2backup m13 -%define m_q2backup m14 -%define m_limsign m9 - - ; splat function arguments - SPLATB_REG m_flimE, flimEq, m7 ; E - SPLATB_REG m_flimI, flimIq, m7 ; I - SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh -%endif - -%if %2 == 8 ; chroma - DEFINE_ARGS dst1, dst8, mstride, stride, dst2 -%elif mmsize == 8 - DEFINE_ARGS dst1, mstride, stride, dst2, cntr - mov cntrq, 2 -%else - DEFINE_ARGS dst1, mstride, stride, dst2, dst8 -%endif - mov strideq, mstrideq - neg mstrideq -%ifidn %1, h - lea dst1q, [dst1q+strideq*4-4] -%if %2 == 8 ; chroma - lea dst8q, [dst8q+strideq*4-4] -%endif -%endif - -%if mmsize == 8 -.next8px: -%endif - ; read - lea dst2q, [dst1q+ strideq ] -%ifidn %1, v -%if %2 == 8 && mmsize == 16 -%define movrow movh -%else -%define movrow mova -%endif - movrow m0, [dst1q+mstrideq*4] ; p3 - movrow m1, [dst2q+mstrideq*4] ; p2 - movrow m2, [dst1q+mstrideq*2] ; p1 - movrow m5, [dst2q] ; q1 - movrow m6, [dst2q+ strideq ] ; q2 - movrow m7, [dst2q+ strideq*2] ; q3 -%if mmsize == 16 && %2 == 8 - movhps m0, [dst8q+mstrideq*4] - movhps m2, [dst8q+mstrideq*2] - add dst8q, strideq - movhps m1, [dst8q+mstrideq*4] - movhps m5, [dst8q] - movhps m6, [dst8q+ strideq ] - movhps m7, [dst8q+ strideq*2] - add dst8q, mstrideq -%endif -%elif mmsize == 8 ; mmx/mmxext (h) - ; read 8 rows of 8px each - movu m0, [dst1q+mstrideq*4] - movu m1, [dst2q+mstrideq*4] - movu m2, [dst1q+mstrideq*2] - movu m3, [dst1q+mstrideq ] - movu m4, [dst1q] - movu m5, [dst2q] - movu m6, [dst2q+ strideq ] - - ; 8x8 transpose - TRANSPOSE4x4B 0, 1, 2, 3, 7 - mova m_q0backup, m1 - movu m7, [dst2q+ strideq*2] - TRANSPOSE4x4B 4, 5, 6, 7, 1 - SBUTTERFLY dq, 0, 4, 1 ; p3/p2 - SBUTTERFLY dq, 2, 6, 1 ; q0/q1 - SBUTTERFLY dq, 3, 7, 1 ; q2/q3 - mova m1, m_q0backup - mova m_q0backup, m2 ; store q0 - SBUTTERFLY dq, 1, 5, 2 ; p1/p0 - mova m_p0backup, m5 ; store p0 
- SWAP 1, 4 - SWAP 2, 4 - SWAP 6, 3 - SWAP 5, 3 -%else ; sse2 (h) -%if %2 == 16 - lea dst8q, [dst1q+ strideq*8 ] -%endif - - ; read 16 rows of 8px each, interleave - movh m0, [dst1q+mstrideq*4] - movh m1, [dst8q+mstrideq*4] - movh m2, [dst1q+mstrideq*2] - movh m5, [dst8q+mstrideq*2] - movh m3, [dst1q+mstrideq ] - movh m6, [dst8q+mstrideq ] - movh m4, [dst1q] - movh m7, [dst8q] - punpcklbw m0, m1 ; A/I - punpcklbw m2, m5 ; C/K - punpcklbw m3, m6 ; D/L - punpcklbw m4, m7 ; E/M - - add dst8q, strideq - movh m1, [dst2q+mstrideq*4] - movh m6, [dst8q+mstrideq*4] - movh m5, [dst2q] - movh m7, [dst8q] - punpcklbw m1, m6 ; B/J - punpcklbw m5, m7 ; F/N - movh m6, [dst2q+ strideq ] - movh m7, [dst8q+ strideq ] - punpcklbw m6, m7 ; G/O - - ; 8x16 transpose - TRANSPOSE4x4B 0, 1, 2, 3, 7 -%ifdef m8 - SWAP 1, 8 -%else - mova m_q0backup, m1 -%endif - movh m7, [dst2q+ strideq*2] - movh m1, [dst8q+ strideq*2] - punpcklbw m7, m1 ; H/P - TRANSPOSE4x4B 4, 5, 6, 7, 1 - SBUTTERFLY dq, 0, 4, 1 ; p3/p2 - SBUTTERFLY dq, 2, 6, 1 ; q0/q1 - SBUTTERFLY dq, 3, 7, 1 ; q2/q3 -%ifdef m8 - SWAP 1, 8 - SWAP 2, 8 -%else - mova m1, m_q0backup - mova m_q0backup, m2 ; store q0 -%endif - SBUTTERFLY dq, 1, 5, 2 ; p1/p0 -%ifdef m12 - SWAP 5, 12 -%else - mova m_p0backup, m5 ; store p0 -%endif - SWAP 1, 4 - SWAP 2, 4 - SWAP 6, 3 - SWAP 5, 3 -%endif - - ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 - mova m4, m1 - SWAP 4, 1 - psubusb m4, m0 ; p2-p3 - psubusb m0, m1 ; p3-p2 - por m0, m4 ; abs(p3-p2) - - mova m4, m2 - SWAP 4, 2 - psubusb m4, m1 ; p1-p2 - mova m_p2backup, m1 - psubusb m1, m2 ; p2-p1 - por m1, m4 ; abs(p2-p1) - - mova m4, m6 - SWAP 4, 6 - psubusb m4, m7 ; q2-q3 - psubusb m7, m6 ; q3-q2 - por m7, m4 ; abs(q3-q2) - - mova m4, m5 - SWAP 4, 5 - psubusb m4, m6 ; q1-q2 - mova m_q2backup, m6 - psubusb m6, m5 ; q2-q1 - por m6, m4 ; abs(q2-q1) - -%if notcpuflag(mmxext) - mova m4, m_flimI - pxor m3, m3 - psubusb m0, m4 - psubusb m1, m4 - psubusb m7, m4 - psubusb m6, m4 - pcmpeqb m0, m3 ; abs(p3-p2) <= I - pcmpeqb m1, m3 ; abs(p2-p1) <= I - pcmpeqb m7, m3 ; abs(q3-q2) <= I - pcmpeqb m6, m3 ; abs(q2-q1) <= I - pand m0, m1 - pand m7, m6 - pand m0, m7 -%else ; mmxext/sse2 - pmaxub m0, m1 - pmaxub m6, m7 - pmaxub m0, m6 -%endif - - ; normal_limit and high_edge_variance for p1-p0, q1-q0 - SWAP 7, 3 ; now m7 is zero -%ifidn %1, v - movrow m3, [dst1q+mstrideq ] ; p0 -%if mmsize == 16 && %2 == 8 - movhps m3, [dst8q+mstrideq ] -%endif -%elifdef m12 - SWAP 3, 12 -%else - mova m3, m_p0backup -%endif - - mova m1, m2 - SWAP 1, 2 - mova m6, m3 - SWAP 3, 6 - psubusb m1, m3 ; p1-p0 - psubusb m6, m2 ; p0-p1 - por m1, m6 ; abs(p1-p0) -%if notcpuflag(mmxext) - mova m6, m1 - psubusb m1, m4 - psubusb m6, m_hevthr - pcmpeqb m1, m7 ; abs(p1-p0) <= I - pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh - pand m0, m1 - mova m_maskres, m6 -%else ; mmxext/sse2 - pmaxub m0, m1 ; max_I - SWAP 1, 4 ; max_hev_thresh -%endif - - SWAP 6, 4 ; now m6 is I -%ifidn %1, v - movrow m4, [dst1q] ; q0 -%if mmsize == 16 && %2 == 8 - movhps m4, [dst8q] -%endif -%elifdef m8 - SWAP 4, 8 -%else - mova m4, m_q0backup -%endif - mova m1, m4 - SWAP 1, 4 - mova m7, m5 - SWAP 7, 5 - psubusb m1, m5 ; q0-q1 - psubusb m7, m4 ; q1-q0 - por m1, m7 ; abs(q1-q0) -%if notcpuflag(mmxext) - mova m7, m1 - psubusb m1, m6 - psubusb m7, m_hevthr - pxor m6, m6 - pcmpeqb m1, m6 ; abs(q1-q0) <= I - pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh - mova m6, m_maskres - pand m0, m1 ; abs([pq][321]-[pq][210]) <= I - pand m6, m7 -%else ; mmxext/sse2 - pxor m7, m7 - pmaxub m0, m1 - pmaxub m6, m1 - psubusb m0, m_flimI 
- psubusb m6, m_hevthr - pcmpeqb m0, m7 ; max(abs(..)) <= I - pcmpeqb m6, m7 ; !(max(abs..) > thresh) -%endif -%ifdef m12 - SWAP 6, 12 -%else - mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) -%endif - - ; simple_limit - mova m1, m3 - SWAP 1, 3 - mova m6, m4 ; keep copies of p0/q0 around for later use - SWAP 6, 4 - psubusb m1, m4 ; p0-q0 - psubusb m6, m3 ; q0-p0 - por m1, m6 ; abs(q0-p0) - paddusb m1, m1 ; m1=2*abs(q0-p0) - - mova m7, m2 - SWAP 7, 2 - mova m6, m5 - SWAP 6, 5 - psubusb m7, m5 ; p1-q1 - psubusb m6, m2 ; q1-p1 - por m7, m6 ; abs(q1-p1) - pxor m6, m6 - pand m7, [pb_FE] - psrlq m7, 1 ; abs(q1-p1)/2 - paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 - psubusb m7, m_flimE - pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E - pand m0, m7 ; normal_limit result - - ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask -%ifdef m8 ; x86-64 && sse2 - mova m8, [pb_80] -%define m_pb_80 m8 -%else ; x86-32 or mmx/mmxext -%define m_pb_80 [pb_80] -%endif - mova m1, m4 - mova m7, m3 - pxor m1, m_pb_80 - pxor m7, m_pb_80 - psubsb m1, m7 ; (signed) q0-p0 - mova m6, m2 - mova m7, m5 - pxor m6, m_pb_80 - pxor m7, m_pb_80 - psubsb m6, m7 ; (signed) p1-q1 - mova m7, m_maskres - paddsb m6, m1 - paddsb m6, m1 - paddsb m6, m1 - pand m6, m0 -%ifdef m8 - mova m_limres, m6 ; 3*(qp-p0)+(p1-q1) masked for filter_mbedge - pand m_limres, m7 -%else - mova m0, m6 - pand m0, m7 - mova m_limres, m0 -%endif - pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common - - mova m1, [pb_F8] - mova m6, m7 - paddsb m7, [pb_3] - paddsb m6, [pb_4] - pand m7, m1 - pand m6, m1 - - pxor m1, m1 - pxor m0, m0 - pcmpgtb m1, m7 - psubb m0, m7 - psrlq m7, 3 ; +f2 - psrlq m0, 3 ; -f2 - pand m0, m1 - pandn m1, m7 - psubusb m3, m0 - paddusb m3, m1 ; p0+f2 - - pxor m1, m1 - pxor m0, m0 - pcmpgtb m0, m6 - psubb m1, m6 - psrlq m6, 3 ; +f1 - psrlq m1, 3 ; -f1 - pand m1, m0 - pandn m0, m6 - psubusb m4, m0 - paddusb m4, m1 ; q0-f1 - - ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w) -%if cpuflag(ssse3) - mova m7, [pb_1] -%else - mova m7, [pw_63] -%endif -%ifdef m8 - SWAP 1, 8 -%else - mova m1, m_limres -%endif - pxor m0, m0 - mova m6, m1 - pcmpgtb m0, m1 ; which are negative -%if cpuflag(ssse3) - punpcklbw m6, m7 ; interleave with "1" for rounding - punpckhbw m1, m7 -%else - punpcklbw m6, m0 ; signed byte->word - punpckhbw m1, m0 -%endif - mova m_limsign, m0 -%if cpuflag(ssse3) - mova m7, [pb_27_63] -%ifndef m8 - mova m_limres, m1 -%endif -%ifdef m10 - SWAP 0, 10 ; don't lose lim_sign copy -%endif - mova m0, m7 - pmaddubsw m7, m6 - SWAP 6, 7 - pmaddubsw m0, m1 - SWAP 1, 0 -%ifdef m10 - SWAP 0, 10 -%else - mova m0, m_limsign -%endif -%else - mova m_maskres, m6 ; backup for later in filter - mova m_limres, m1 - pmullw m6, [pw_27] - pmullw m1, [pw_27] - paddw m6, m7 - paddw m1, m7 -%endif - psraw m6, 7 - psraw m1, 7 - packsswb m6, m1 ; a0 - pxor m1, m1 - psubb m1, m6 - pand m1, m0 ; -a0 - pandn m0, m6 ; +a0 -%if cpuflag(ssse3) - mova m6, [pb_18_63] ; pipelining -%endif - psubusb m3, m1 - paddusb m4, m1 - paddusb m3, m0 ; p0+a0 - psubusb m4, m0 ; q0-a0 - -%if cpuflag(ssse3) - SWAP 6, 7 -%ifdef m10 - SWAP 1, 10 -%else - mova m1, m_limres -%endif - mova m0, m7 - pmaddubsw m7, m6 - SWAP 6, 7 - pmaddubsw m0, m1 - SWAP 1, 0 -%ifdef m10 - SWAP 0, 10 -%endif - mova m0, m_limsign -%else - mova m6, m_maskres - mova m1, m_limres - pmullw m6, [pw_18] - pmullw m1, [pw_18] - paddw m6, m7 - paddw m1, m7 -%endif - mova m0, m_limsign - psraw m6, 7 - psraw m1, 7 - packsswb m6, m1 ; a1 - pxor m1, m1 - psubb m1, m6 - pand m1, m0 ; -a1 - 
pandn m0, m6 ; +a1 -%if cpuflag(ssse3) - mova m6, [pb_9_63] -%endif - psubusb m2, m1 - paddusb m5, m1 - paddusb m2, m0 ; p1+a1 - psubusb m5, m0 ; q1-a1 - -%if cpuflag(ssse3) - SWAP 6, 7 -%ifdef m10 - SWAP 1, 10 -%else - mova m1, m_limres -%endif - mova m0, m7 - pmaddubsw m7, m6 - SWAP 6, 7 - pmaddubsw m0, m1 - SWAP 1, 0 -%else -%ifdef m8 - SWAP 6, 12 - SWAP 1, 8 -%else - mova m6, m_maskres - mova m1, m_limres -%endif - pmullw m6, [pw_9] - pmullw m1, [pw_9] - paddw m6, m7 - paddw m1, m7 -%endif -%ifdef m9 - SWAP 7, 9 -%else - mova m7, m_limsign -%endif - psraw m6, 7 - psraw m1, 7 - packsswb m6, m1 ; a1 - pxor m0, m0 - psubb m0, m6 - pand m0, m7 ; -a1 - pandn m7, m6 ; +a1 -%ifdef m8 - SWAP 1, 13 - SWAP 6, 14 -%else - mova m1, m_p2backup - mova m6, m_q2backup -%endif - psubusb m1, m0 - paddusb m6, m0 - paddusb m1, m7 ; p1+a1 - psubusb m6, m7 ; q1-a1 - - ; store -%ifidn %1, v - movrow [dst2q+mstrideq*4], m1 - movrow [dst1q+mstrideq*2], m2 - movrow [dst1q+mstrideq ], m3 - movrow [dst1q], m4 - movrow [dst2q], m5 - movrow [dst2q+ strideq ], m6 -%if mmsize == 16 && %2 == 8 - add dst8q, mstrideq - movhps [dst8q+mstrideq*2], m1 - movhps [dst8q+mstrideq ], m2 - movhps [dst8q], m3 - add dst8q, strideq - movhps [dst8q], m4 - movhps [dst8q+ strideq ], m5 - movhps [dst8q+ strideq*2], m6 -%endif -%else ; h - inc dst1q - inc dst2q - - ; 4x8/16 transpose - TRANSPOSE4x4B 1, 2, 3, 4, 0 - SBUTTERFLY bw, 5, 6, 0 - -%if mmsize == 8 ; mmx/mmxext (h) - WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq - add dst1q, 4 - WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq -%else ; sse2 (h) - lea dst8q, [dst8q+mstrideq+1] - WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2 - lea dst1q, [dst2q+mstrideq+4] - lea dst8q, [dst8q+mstrideq+4] -%if cpuflag(sse4) - add dst2q, 4 -%endif - WRITE_8W m5, dst2q, dst1q, mstrideq, strideq -%if cpuflag(sse4) - lea dst2q, [dst8q+ strideq ] -%endif - WRITE_8W m6, dst2q, dst8q, mstrideq, strideq -%endif -%endif - -%if mmsize == 8 -%if %2 == 8 ; chroma -%ifidn %1, h - sub dst1q, 5 -%endif - cmp dst1q, dst8q - mov dst1q, dst8q - jnz .next8px -%else -%ifidn %1, h - lea dst1q, [dst1q+ strideq*8-5] -%else ; v - add dst1q, 8 -%endif - dec cntrq - jg .next8px -%endif - REP_RET -%else ; mmsize == 16 - RET -%endif -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmx -MBEDGE_LOOPFILTER v, 16 -MBEDGE_LOOPFILTER h, 16 -MBEDGE_LOOPFILTER v, 8 -MBEDGE_LOOPFILTER h, 8 - -INIT_MMX mmxext -MBEDGE_LOOPFILTER v, 16 -MBEDGE_LOOPFILTER h, 16 -MBEDGE_LOOPFILTER v, 8 -MBEDGE_LOOPFILTER h, 8 -%endif - -INIT_XMM sse2 -MBEDGE_LOOPFILTER v, 16 -MBEDGE_LOOPFILTER h, 16 -MBEDGE_LOOPFILTER v, 8 -MBEDGE_LOOPFILTER h, 8 - -INIT_XMM ssse3 -MBEDGE_LOOPFILTER v, 16 -MBEDGE_LOOPFILTER h, 16 -MBEDGE_LOOPFILTER v, 8 -MBEDGE_LOOPFILTER h, 8 - -INIT_XMM sse4 -MBEDGE_LOOPFILTER h, 16 -MBEDGE_LOOPFILTER h, 8 diff --git a/ffmpeg/libavcodec/x86/vp8dsp_init.c b/ffmpeg/libavcodec/x86/vp8dsp_init.c index 09e2d91..dca00f5 100644 --- a/ffmpeg/libavcodec/x86/vp8dsp_init.c +++ b/ffmpeg/libavcodec/x86/vp8dsp_init.c @@ -23,6 +23,7 @@ #include "libavutil/cpu.h" #include "libavutil/mem.h" #include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" #include "libavcodec/vp8dsp.h" #if HAVE_YASM @@ -30,93 +31,93 @@ /* * MC functions */ -extern void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void 
ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); - -extern void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel8_h6_sse2 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel8_v4_sse2 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel8_v6_sse2 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); - -extern void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); - -extern void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); - -extern void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); - - -extern void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -extern void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride, - uint8_t *src, ptrdiff_t srcstride, - 
int height, int mx, int my); +void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); + +void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_h6_sse2 (uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_v4_sse2 (uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_v6_sse2 (uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); + +void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); + +void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); + +void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); + + +void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t 
srcstride, + int height, int mx, int my); +void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride, + uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); #define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \ static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \ @@ -230,58 +231,56 @@ HVBILIN(ssse3, 8, 4, 8) HVBILIN(ssse3, 8, 8, 16) HVBILIN(ssse3, 8, 16, 16) -extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16], - ptrdiff_t stride); -extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16], - ptrdiff_t stride); -extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16], - ptrdiff_t stride); -extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16], - ptrdiff_t stride); -extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16], - ptrdiff_t stride); -extern void ff_vp8_luma_dc_wht_mmx(int16_t block[4][4][16], int16_t dc[16]); -extern void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]); -extern void ff_vp8_idct_add_mmx(uint8_t *dst, int16_t block[16], - ptrdiff_t stride); -extern void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16], - ptrdiff_t stride); - -#define DECLARE_LOOP_FILTER(NAME)\ -extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \ - ptrdiff_t stride, \ - int flim);\ -extern void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \ - ptrdiff_t stride, \ - int flim);\ -extern void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \ - ptrdiff_t stride,\ - int e, int i, int hvt);\ -extern void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, \ - ptrdiff_t stride,\ - int e, int i, int hvt);\ -extern void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \ - uint8_t *dstV,\ - ptrdiff_t s, \ - int e, int i, int hvt);\ -extern void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \ - uint8_t *dstV,\ - ptrdiff_t s, \ - int e, int i, int hvt);\ -extern void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \ - ptrdiff_t stride,\ - int e, int i, int hvt);\ -extern void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \ - ptrdiff_t stride,\ - int e, int i, int hvt);\ -extern void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \ - uint8_t *dstV,\ - ptrdiff_t s, \ - int e, int i, int hvt);\ -extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \ - uint8_t *dstV,\ - ptrdiff_t s, \ - int e, int i, int hvt); +void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16], + ptrdiff_t stride); +void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16], + ptrdiff_t stride); +void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16], + ptrdiff_t stride); +void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16], + ptrdiff_t stride); +void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16], + ptrdiff_t stride); +void ff_vp8_luma_dc_wht_mmx(int16_t block[4][4][16], int16_t dc[16]); +void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]); +void ff_vp8_idct_add_mmx(uint8_t *dst, int16_t block[16], ptrdiff_t stride); +void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16], ptrdiff_t stride); + +#define DECLARE_LOOP_FILTER(NAME) \ +void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \ + ptrdiff_t stride, \ + int flim); \ +void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \ + ptrdiff_t stride, \ + int flim); \ +void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \ + ptrdiff_t stride, \ + int e, int i, int hvt); \ +void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t 
*dst,            \
+                                             ptrdiff_t stride,       \
+                                             int e, int i, int hvt); \
+void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU,           \
+                                             uint8_t *dstV,          \
+                                             ptrdiff_t s,            \
+                                             int e, int i, int hvt); \
+void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU,           \
+                                             uint8_t *dstV,          \
+                                             ptrdiff_t s,            \
+                                             int e, int i, int hvt); \
+void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst,            \
+                                             ptrdiff_t stride,       \
+                                             int e, int i, int hvt); \
+void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst,            \
+                                             ptrdiff_t stride,       \
+                                             int e, int i, int hvt); \
+void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU,           \
+                                             uint8_t *dstV,          \
+                                             ptrdiff_t s,            \
+                                             int e, int i, int hvt); \
+void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU,           \
+                                             uint8_t *dstV,          \
+                                             ptrdiff_t s,            \
+                                             int e, int i, int hvt);
 
 DECLARE_LOOP_FILTER(mmx)
 DECLARE_LOOP_FILTER(mmxext)
@@ -318,9 +317,9 @@ DECLARE_LOOP_FILTER(sse4)
 av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
 {
 #if HAVE_YASM
-    int mm_flags = av_get_cpu_flags();
+    int cpu_flags = av_get_cpu_flags();
 
-    if (mm_flags & AV_CPU_FLAG_MMX) {
+    if (EXTERNAL_MMX(cpu_flags)) {
         c->vp8_idct_dc_add    = ff_vp8_idct_dc_add_mmx;
         c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
 #if ARCH_X86_32
@@ -351,7 +350,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
 
     /* note that 4-tap width=16 functions are missing because w=16
      * is only used for luma, and luma is always a copy or sixtap. */
-    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
         VP8_MC_FUNC(2, 4, mmxext);
         VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
 #if ARCH_X86_32
@@ -375,14 +374,14 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
 #endif
     }
 
-    if (mm_flags & AV_CPU_FLAG_SSE) {
+    if (EXTERNAL_SSE(cpu_flags)) {
         c->vp8_idct_add    = ff_vp8_idct_add_sse;
         c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
         c->put_vp8_epel_pixels_tab[0][0][0]     =
         c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
     }
 
-    if (mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) {
+    if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
         VP8_LUMA_MC_FUNC(0, 16, sse2);
         VP8_MC_FUNC(1, 8, sse2);
         VP8_BILINEAR_MC_FUNC(0, 16, sse2);
@@ -397,7 +396,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
         c->vp8_v_loop_filter8uv     = ff_vp8_v_loop_filter8uv_mbedge_sse2;
     }
 
-    if (mm_flags & AV_CPU_FLAG_SSE2) {
+    if (EXTERNAL_SSE2(cpu_flags)) {
         c->vp8_idct_dc_add4y        = ff_vp8_idct_dc_add4y_sse2;
 
         c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
@@ -409,7 +408,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
         c->vp8_h_loop_filter8uv     = ff_vp8_h_loop_filter8uv_mbedge_sse2;
     }
 
-    if (mm_flags & AV_CPU_FLAG_SSSE3) {
+    if (EXTERNAL_SSSE3(cpu_flags)) {
         VP8_LUMA_MC_FUNC(0, 16, ssse3);
         VP8_MC_FUNC(1, 8, ssse3);
         VP8_MC_FUNC(2, 4, ssse3);
@@ -431,7 +430,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
         c->vp8_h_loop_filter8uv     = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
     }
 
-    if (mm_flags & AV_CPU_FLAG_SSE4) {
+    if (EXTERNAL_SSE4(cpu_flags)) {
         c->vp8_idct_dc_add          = ff_vp8_idct_dc_add_sse4;
 
         c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
diff --git a/ffmpeg/libavcodec/x86/w64xmmtest.c b/ffmpeg/libavcodec/x86/w64xmmtest.c
index f6e3de9..25e833f 100644
--- a/ffmpeg/libavcodec/x86/w64xmmtest.c
+++ b/ffmpeg/libavcodec/x86/w64xmmtest.c
@@ -2,20 +2,20 @@
  * check XMM registers for clobbers on Win64
  * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
 *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 
@@ -78,3 +78,9 @@ wrap(avcodec_encode_subtitle(AVCodecContext *avctx,
 {
     testxmmclobbers(avcodec_encode_subtitle, avctx, buf, buf_size, sub);
 }
+
+wrap(avcodec_encode_video2(AVCodecContext *avctx, AVPacket *avpkt,
+                           const AVFrame *frame, int *got_packet_ptr))
+{
+    testxmmclobbers(avcodec_encode_video2, avctx, avpkt, frame, got_packet_ptr);
+}
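Note on the vp8dsp_init.c hunks above: the initializer stops testing raw AV_CPU_FLAG_* bits directly and instead uses the EXTERNAL_MMX()/EXTERNAL_SSE2()/... helpers from libavutil/x86/cpu.h, which also account for whether external (yasm) assembly was built in. What follows is a minimal, self-contained sketch of that run-time dispatch pattern, not FFmpeg code; the DEMO_* flag values, the DemoDSPContext struct, and the three stub functions are hypothetical stand-ins used only for illustration.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical CPU-feature bits; the real ones live in libavutil/cpu.h. */
#define DEMO_CPU_FLAG_MMX  (1 << 0)
#define DEMO_CPU_FLAG_SSE2 (1 << 1)

/* One hypothetical entry point, analogous to c->vp8_idct_dc_add. */
typedef void (*demo_idct_dc_add_fn)(uint8_t *dst, int16_t block[16], ptrdiff_t stride);

typedef struct DemoDSPContext {
    demo_idct_dc_add_fn vp8_idct_dc_add;
} DemoDSPContext;

/* Portable C fallback plus stand-ins for the optimized versions. */
static void idct_dc_add_c(uint8_t *dst, int16_t block[16], ptrdiff_t stride)    { puts("C");    }
static void idct_dc_add_mmx(uint8_t *dst, int16_t block[16], ptrdiff_t stride)  { puts("MMX");  }
static void idct_dc_add_sse2(uint8_t *dst, int16_t block[16], ptrdiff_t stride) { puts("SSE2"); }

/* Mirrors the shape of ff_vp8dsp_init_x86(): start from the C default,
 * then overwrite the pointer with progressively better versions as the
 * reported CPU flags allow, so the last matching branch wins. */
static void demo_dsp_init(DemoDSPContext *c, int cpu_flags)
{
    c->vp8_idct_dc_add = idct_dc_add_c;
    if (cpu_flags & DEMO_CPU_FLAG_MMX)
        c->vp8_idct_dc_add = idct_dc_add_mmx;
    if (cpu_flags & DEMO_CPU_FLAG_SSE2)
        c->vp8_idct_dc_add = idct_dc_add_sse2;
}

int main(void)
{
    DemoDSPContext c;
    /* In FFmpeg the flag word would come from av_get_cpu_flags(). */
    demo_dsp_init(&c, DEMO_CPU_FLAG_MMX | DEMO_CPU_FLAG_SSE2);
    c.vp8_idct_dc_add(NULL, NULL, 0); /* picks the SSE2 stub here */
    return 0;
}

In the real init code, each block is additionally guarded by the EXTERNAL_*() macros so the assembly symbols are only referenced when they were actually assembled into the build.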
