Diffstat (limited to 'ffmpeg/libavcodec/x86')
-rw-r--r--  ffmpeg/libavcodec/x86/Makefile | 95
-rw-r--r--  ffmpeg/libavcodec/x86/ac3dsp.asm | 457
-rw-r--r--  ffmpeg/libavcodec/x86/ac3dsp_init.c | 232
-rw-r--r--  ffmpeg/libavcodec/x86/cabac.h | 229
-rw-r--r--  ffmpeg/libavcodec/x86/cavsdsp.c | 512
-rw-r--r--  ffmpeg/libavcodec/x86/constants.c | 39
-rw-r--r--  ffmpeg/libavcodec/x86/dct32.asm | 490
-rw-r--r--  ffmpeg/libavcodec/x86/deinterlace.asm | 82
-rw-r--r--  ffmpeg/libavcodec/x86/dirac_dwt.c | 202
-rw-r--r--  ffmpeg/libavcodec/x86/dirac_dwt.h | 30
-rw-r--r--  ffmpeg/libavcodec/x86/diracdsp_mmx.c | 104
-rw-r--r--  ffmpeg/libavcodec/x86/diracdsp_mmx.h | 47
-rw-r--r--  ffmpeg/libavcodec/x86/diracdsp_yasm.asm | 264
-rw-r--r--  ffmpeg/libavcodec/x86/dnxhdenc.c | 66
-rw-r--r--  ffmpeg/libavcodec/x86/dsputil.asm | 652
-rw-r--r--  ffmpeg/libavcodec/x86/dsputil_mmx.c | 1636
-rw-r--r--  ffmpeg/libavcodec/x86/dsputil_mmx.h | 113
-rw-r--r--  ffmpeg/libavcodec/x86/dsputil_qns_template.c | 101
-rw-r--r--  ffmpeg/libavcodec/x86/dsputil_rnd_template.c | 221
-rw-r--r--  ffmpeg/libavcodec/x86/dsputilenc.asm | 487
-rw-r--r--  ffmpeg/libavcodec/x86/dsputilenc_mmx.c | 1060
-rw-r--r--  ffmpeg/libavcodec/x86/dwt_yasm.asm | 306
-rw-r--r--  ffmpeg/libavcodec/x86/fdct.c | 586
-rw-r--r--  ffmpeg/libavcodec/x86/fft.asm | 1093
-rw-r--r--  ffmpeg/libavcodec/x86/fft.h | 41
-rw-r--r--  ffmpeg/libavcodec/x86/fft_init.c | 68
-rw-r--r--  ffmpeg/libavcodec/x86/fmtconvert.asm | 429
-rw-r--r--  ffmpeg/libavcodec/x86/fmtconvert_init.c | 148
-rw-r--r--  ffmpeg/libavcodec/x86/fpelbase.asm | 106
-rw-r--r--  ffmpeg/libavcodec/x86/h263_loopfilter.asm | 189
-rw-r--r--  ffmpeg/libavcodec/x86/h264_chromamc.asm | 678
-rw-r--r--  ffmpeg/libavcodec/x86/h264_chromamc_10bit.asm | 271
-rw-r--r--  ffmpeg/libavcodec/x86/h264_deblock.asm | 1083
-rw-r--r--  ffmpeg/libavcodec/x86/h264_deblock_10bit.asm | 923
-rw-r--r--  ffmpeg/libavcodec/x86/h264_i386.h | 204
-rw-r--r--  ffmpeg/libavcodec/x86/h264_idct.asm | 1073
-rw-r--r--  ffmpeg/libavcodec/x86/h264_idct_10bit.asm | 589
-rw-r--r--  ffmpeg/libavcodec/x86/h264_intrapred.asm | 2702
-rw-r--r--  ffmpeg/libavcodec/x86/h264_intrapred_10bit.asm | 1199
-rw-r--r--  ffmpeg/libavcodec/x86/h264_intrapred_init.c | 402
-rw-r--r--  ffmpeg/libavcodec/x86/h264_qpel.c | 644
-rw-r--r--  ffmpeg/libavcodec/x86/h264_qpel_10bit.asm | 884
-rw-r--r--  ffmpeg/libavcodec/x86/h264_qpel_8bit.asm | 862
-rw-r--r--  ffmpeg/libavcodec/x86/h264_weight.asm | 317
-rw-r--r--  ffmpeg/libavcodec/x86/h264_weight_10bit.asm | 282
-rw-r--r--  ffmpeg/libavcodec/x86/h264chroma_init.c | 118
-rw-r--r--  ffmpeg/libavcodec/x86/h264dsp_init.c | 375
-rw-r--r--  ffmpeg/libavcodec/x86/hpeldsp.asm | 461
-rw-r--r--  ffmpeg/libavcodec/x86/hpeldsp_avg_template.c | 77
-rw-r--r--  ffmpeg/libavcodec/x86/hpeldsp_init.c | 415
-rw-r--r--  ffmpeg/libavcodec/x86/hpeldsp_rnd_template.c | 428
-rw-r--r--  ffmpeg/libavcodec/x86/idct_mmx_xvid.c | 558
-rw-r--r--  ffmpeg/libavcodec/x86/idct_sse2_xvid.c | 407
-rw-r--r--  ffmpeg/libavcodec/x86/idct_xvid.h | 43
-rw-r--r--  ffmpeg/libavcodec/x86/imdct36.asm | 724
-rw-r--r--  ffmpeg/libavcodec/x86/lpc.c | 154
-rw-r--r--  ffmpeg/libavcodec/x86/mathops.h | 130
-rw-r--r--  ffmpeg/libavcodec/x86/mlpdsp.c | 182
-rw-r--r--  ffmpeg/libavcodec/x86/motion_est.c | 473
-rw-r--r--  ffmpeg/libavcodec/x86/mpeg4qpel.asm | 560
-rw-r--r--  ffmpeg/libavcodec/x86/mpegaudiodec.c | 273
-rw-r--r--  ffmpeg/libavcodec/x86/mpegvideo.c | 600
-rw-r--r--  ffmpeg/libavcodec/x86/mpegvideoenc.c | 107
-rw-r--r--  ffmpeg/libavcodec/x86/mpegvideoenc_template.c | 364
-rw-r--r--  ffmpeg/libavcodec/x86/pngdsp.asm | 173
-rw-r--r--  ffmpeg/libavcodec/x86/pngdsp_init.c | 50
-rw-r--r--  ffmpeg/libavcodec/x86/proresdsp.asm | 326
-rw-r--r--  ffmpeg/libavcodec/x86/proresdsp_init.c | 57
-rw-r--r--  ffmpeg/libavcodec/x86/qpelbase.asm | 176
-rw-r--r--  ffmpeg/libavcodec/x86/rv34dsp.asm | 196
-rw-r--r--  ffmpeg/libavcodec/x86/rv34dsp_init.c | 45
-rw-r--r--  ffmpeg/libavcodec/x86/rv40dsp.asm | 505
-rw-r--r--  ffmpeg/libavcodec/x86/rv40dsp_init.c | 243
-rw-r--r--  ffmpeg/libavcodec/x86/sbrdsp.asm | 222
-rw-r--r--  ffmpeg/libavcodec/x86/sbrdsp_init.c | 48
-rw-r--r--  ffmpeg/libavcodec/x86/simple_idct.c | 1167
-rw-r--r--  ffmpeg/libavcodec/x86/snowdsp.c | 902
-rw-r--r--  ffmpeg/libavcodec/x86/v210-init.c | 48
-rw-r--r--  ffmpeg/libavcodec/x86/v210.asm | 88
-rw-r--r--  ffmpeg/libavcodec/x86/vc1dsp.asm | 317
-rw-r--r--  ffmpeg/libavcodec/x86/vc1dsp.h | 29
-rw-r--r--  ffmpeg/libavcodec/x86/vc1dsp_init.c | 132
-rw-r--r--  ffmpeg/libavcodec/x86/vc1dsp_mmx.c | 750
-rw-r--r--  ffmpeg/libavcodec/x86/videodsp.asm | 612
-rw-r--r--  ffmpeg/libavcodec/x86/videodsp_init.c | 128
-rw-r--r--  ffmpeg/libavcodec/x86/vorbisdsp.asm | 83
-rw-r--r--  ffmpeg/libavcodec/x86/vorbisdsp_init.c | 43
-rw-r--r--  ffmpeg/libavcodec/x86/vp3dsp.asm | 709
-rw-r--r--  ffmpeg/libavcodec/x86/vp3dsp_init.c | 128
-rw-r--r--  ffmpeg/libavcodec/x86/vp56_arith.h | 54
-rw-r--r--  ffmpeg/libavcodec/x86/vp56dsp.asm | 170
-rw-r--r--  ffmpeg/libavcodec/x86/vp56dsp_init.c | 48
-rw-r--r--  ffmpeg/libavcodec/x86/vp8dsp.asm | 2780
-rw-r--r--  ffmpeg/libavcodec/x86/vp8dsp_init.c | 442
-rw-r--r--  ffmpeg/libavcodec/x86/w64xmmtest.c | 80
95 files changed, 39398 insertions, 0 deletions
diff --git a/ffmpeg/libavcodec/x86/Makefile b/ffmpeg/libavcodec/x86/Makefile
new file mode 100644
index 0000000..38ef867
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/Makefile
@@ -0,0 +1,95 @@
+OBJS += x86/fmtconvert_init.o \
+ x86/constants.o
+
+OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o
+OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o
+OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
+OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
+OBJS-$(CONFIG_FFT) += x86/fft_init.o
+OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
+OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
+OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o
+OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o
+OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o
+OBJS-$(CONFIG_LPC) += x86/lpc.o
+OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
+OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodec.o
+OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o
+OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o
+OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
+OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
+OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o
+OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp_init.o
+OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp_init.o \
+ x86/rv40dsp_init.o
+OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
+OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
+OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_init.o
+OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o
+OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o
+OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
+OBJS-$(CONFIG_VP5_DECODER) += x86/vp56dsp_init.o
+OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp_init.o
+OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp_init.o
+OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
+
+MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o \
+ x86/fdct.o \
+ x86/idct_mmx_xvid.o \
+ x86/idct_sse2_xvid.o \
+ x86/simple_idct.o \
+
+MMX-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o \
+ x86/motion_est.o
+MMX-OBJS-$(CONFIG_DIRAC_DECODER) += x86/dirac_dwt.o
+MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o
+MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o
+MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o
+
+YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
+YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o
+YASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
+YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_mmx.o x86/diracdsp_yasm.o\
+ x86/dwt_yasm.o
+YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o
+YASM-OBJS-$(CONFIG_FFT) += x86/fft.o
+YASM-OBJS-$(CONFIG_H263_DECODER) += x86/h263_loopfilter.o
+YASM-OBJS-$(CONFIG_H263_ENCODER) += x86/h263_loopfilter.o
+YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
+ x86/h264_chromamc_10bit.o
+YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \
+ x86/h264_deblock_10bit.o \
+ x86/h264_idct.o \
+ x86/h264_idct_10bit.o \
+ x86/h264_weight.o \
+ x86/h264_weight_10bit.o
+YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \
+ x86/h264_intrapred_10bit.o
+YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
+ x86/h264_qpel_10bit.o \
+ x86/qpelbase.o \
+ x86/fpelbase.o
+YASM-OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp.o \
+ x86/fpelbase.o
+YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
+YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
+YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
+YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
+YASM-OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp.o
+YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp.o \
+ x86/rv40dsp.o
+YASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
+YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o
+YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
+YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
+YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
+YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp.o
+YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o
+
+YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \
+ x86/mpeg4qpel.o \
+ x86/qpelbase.o \
+ x86/fpelbase.o
+
+YASM-OBJS += x86/deinterlace.o \
+ x86/fmtconvert.o
diff --git a/ffmpeg/libavcodec/x86/ac3dsp.asm b/ffmpeg/libavcodec/x86/ac3dsp.asm
new file mode 100644
index 0000000..98fb446
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/ac3dsp.asm
@@ -0,0 +1,457 @@
+;*****************************************************************************
+;* x86-optimized AC-3 DSP utils
+;* Copyright (c) 2011 Justin Ruggles
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+; 16777216.0f - used in ff_float_to_fixed24()
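+; (16777216 = 2^24: multiplying each float sample by it rescales the value to
+;  24-bit fixed point before the conversion to integer)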
+pf_1_24: times 4 dd 0x4B800000
+
+; used in ff_ac3_compute_mantissa_size()
+cextern ac3_bap_bits
+pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
+pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
+
+; used in ff_ac3_extract_exponents()
+pd_1: times 4 dd 1
+pd_151: times 4 dd 151
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
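+;
+; For each coefficient, the exponent in the first block is lowered to the
+; minimum over that block and the following num_reuse_blocks blocks
+; (consecutive blocks are 256 exponents apart, hence the shift by 8 below).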
+;-----------------------------------------------------------------------------
+
+%macro AC3_EXPONENT_MIN 0
+cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
+ shl reuse_blksq, 8
+ jz .end
+ LOOP_ALIGN
+.nextexp:
+ mov offsetq, reuse_blksq
+ mova m0, [expq+offsetq]
+ sub offsetq, 256
+ LOOP_ALIGN
+.nextblk:
+ PMINUB m0, [expq+offsetq], m1
+ sub offsetq, 256
+ jae .nextblk
+ mova [expq], m0
+ add expq, mmsize
+ sub expnq, mmsize
+ jg .nextexp
+.end:
+ REP_RET
+%endmacro
+
+%define LOOP_ALIGN
+INIT_MMX mmx
+AC3_EXPONENT_MIN
+%if HAVE_MMXEXT_EXTERNAL
+%define LOOP_ALIGN ALIGN 16
+INIT_MMX mmxext
+AC3_EXPONENT_MIN
+%endif
+%if HAVE_SSE2_EXTERNAL
+INIT_XMM sse2
+AC3_EXPONENT_MIN
+%endif
+%undef LOOP_ALIGN
+
+;-----------------------------------------------------------------------------
+; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
+;
+; This function uses 2 different methods to calculate a valid result.
+; 1) logical 'or' of abs of each element
+; This is used for ssse3 because of the pabsw instruction.
+; It is also used for mmx because of the lack of min/max instructions.
+; 2) calculate min/max for the array, then or(abs(min),abs(max))
+; This is used for mmxext and sse2 because they have pminsw/pmaxsw.
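+;
+; Example: for src = {-16384, 300}, method 1 computes or(16384, 300) = 0x412C
+; and method 2 computes or(abs(-16384), abs(300)) = 0x412C as well; the two
+; methods can differ in the low bits for other inputs, but the position of
+; the most significant set bit, which is all the caller needs, is the same.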
+;-----------------------------------------------------------------------------
+
+; logical 'or' of 4 or 8 words in an mmx or xmm register into the low word
+%macro OR_WORDS_HORIZ 2 ; src, tmp
+%if cpuflag(sse2)
+ movhlps %2, %1
+ por %1, %2
+ pshuflw %2, %1, q0032
+ por %1, %2
+ pshuflw %2, %1, q0001
+ por %1, %2
+%elif cpuflag(mmxext)
+ pshufw %2, %1, q0032
+ por %1, %2
+ pshufw %2, %1, q0001
+ por %1, %2
+%else ; mmx
+ movq %2, %1
+ psrlq %2, 32
+ por %1, %2
+ movq %2, %1
+ psrlq %2, 16
+ por %1, %2
+%endif
+%endmacro
+
+%macro AC3_MAX_MSB_ABS_INT16 1
+cglobal ac3_max_msb_abs_int16, 2,2,5, src, len
+ pxor m2, m2
+ pxor m3, m3
+.loop:
+%ifidn %1, min_max
+ mova m0, [srcq]
+ mova m1, [srcq+mmsize]
+ pminsw m2, m0
+ pminsw m2, m1
+ pmaxsw m3, m0
+ pmaxsw m3, m1
+%else ; or_abs
+%if notcpuflag(ssse3)
+ mova m0, [srcq]
+ mova m1, [srcq+mmsize]
+ ABS2 m0, m1, m3, m4
+%else ; ssse3
+ ; using memory args is faster for ssse3
+ pabsw m0, [srcq]
+ pabsw m1, [srcq+mmsize]
+%endif
+ por m2, m0
+ por m2, m1
+%endif
+ add srcq, mmsize*2
+ sub lend, mmsize
+ ja .loop
+%ifidn %1, min_max
+ ABS2 m2, m3, m0, m1
+ por m2, m3
+%endif
+ OR_WORDS_HORIZ m2, m0
+ movd eax, m2
+ and eax, 0xFFFF
+ RET
+%endmacro
+
+INIT_MMX mmx
+AC3_MAX_MSB_ABS_INT16 or_abs
+INIT_MMX mmxext
+AC3_MAX_MSB_ABS_INT16 min_max
+INIT_XMM sse2
+AC3_MAX_MSB_ABS_INT16 min_max
+INIT_XMM ssse3
+AC3_MAX_MSB_ABS_INT16 or_abs
+
+;-----------------------------------------------------------------------------
+; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
+;-----------------------------------------------------------------------------
+
+%macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction
+cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift
+ movd m0, shiftd
+.loop:
+ mova m1, [srcq ]
+ mova m2, [srcq+mmsize ]
+ mova m3, [srcq+mmsize*2]
+ mova m4, [srcq+mmsize*3]
+ %3 m1, m0
+ %3 m2, m0
+ %3 m3, m0
+ %3 m4, m0
+ mova [srcq ], m1
+ mova [srcq+mmsize ], m2
+ mova [srcq+mmsize*2], m3
+ mova [srcq+mmsize*3], m4
+ add srcq, mmsize*4
+ sub lend, mmsize*32/%2
+ ja .loop
+.end:
+ REP_RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmx
+AC3_SHIFT l, 16, psllw
+INIT_XMM sse2
+AC3_SHIFT l, 16, psllw
+
+;-----------------------------------------------------------------------------
+; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmx
+AC3_SHIFT r, 32, psrad
+INIT_XMM sse2
+AC3_SHIFT r, 32, psrad
+
+;-----------------------------------------------------------------------------
+; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
+;-----------------------------------------------------------------------------
+
+; The 3DNow! version is not bit-identical because pf2id uses truncation rather
+; than round-to-nearest.
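+; (e.g. a scaled value of 2.7 is stored as 2 by pf2id but as 3 by the SSE/SSE2
+; conversions, so individual output samples may differ by one LSB)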
+INIT_MMX 3dnow
+cglobal float_to_fixed24, 3, 3, 0, dst, src, len
+ movq m0, [pf_1_24]
+.loop:
+ movq m1, [srcq ]
+ movq m2, [srcq+8 ]
+ movq m3, [srcq+16]
+ movq m4, [srcq+24]
+ pfmul m1, m0
+ pfmul m2, m0
+ pfmul m3, m0
+ pfmul m4, m0
+ pf2id m1, m1
+ pf2id m2, m2
+ pf2id m3, m3
+ pf2id m4, m4
+ movq [dstq ], m1
+ movq [dstq+8 ], m2
+ movq [dstq+16], m3
+ movq [dstq+24], m4
+ add srcq, 32
+ add dstq, 32
+ sub lend, 8
+ ja .loop
+ femms
+ RET
+
+INIT_XMM sse
+cglobal float_to_fixed24, 3, 3, 3, dst, src, len
+ movaps m0, [pf_1_24]
+.loop:
+ movaps m1, [srcq ]
+ movaps m2, [srcq+16]
+ mulps m1, m0
+ mulps m2, m0
+ cvtps2pi mm0, m1
+ movhlps m1, m1
+ cvtps2pi mm1, m1
+ cvtps2pi mm2, m2
+ movhlps m2, m2
+ cvtps2pi mm3, m2
+ movq [dstq ], mm0
+ movq [dstq+ 8], mm1
+ movq [dstq+16], mm2
+ movq [dstq+24], mm3
+ add srcq, 32
+ add dstq, 32
+ sub lend, 8
+ ja .loop
+ emms
+ RET
+
+INIT_XMM sse2
+cglobal float_to_fixed24, 3, 3, 9, dst, src, len
+ movaps m0, [pf_1_24]
+.loop:
+ movaps m1, [srcq ]
+ movaps m2, [srcq+16 ]
+ movaps m3, [srcq+32 ]
+ movaps m4, [srcq+48 ]
+%ifdef m8
+ movaps m5, [srcq+64 ]
+ movaps m6, [srcq+80 ]
+ movaps m7, [srcq+96 ]
+ movaps m8, [srcq+112]
+%endif
+ mulps m1, m0
+ mulps m2, m0
+ mulps m3, m0
+ mulps m4, m0
+%ifdef m8
+ mulps m5, m0
+ mulps m6, m0
+ mulps m7, m0
+ mulps m8, m0
+%endif
+ cvtps2dq m1, m1
+ cvtps2dq m2, m2
+ cvtps2dq m3, m3
+ cvtps2dq m4, m4
+%ifdef m8
+ cvtps2dq m5, m5
+ cvtps2dq m6, m6
+ cvtps2dq m7, m7
+ cvtps2dq m8, m8
+%endif
+ movdqa [dstq ], m1
+ movdqa [dstq+16 ], m2
+ movdqa [dstq+32 ], m3
+ movdqa [dstq+48 ], m4
+%ifdef m8
+ movdqa [dstq+64 ], m5
+ movdqa [dstq+80 ], m6
+ movdqa [dstq+96 ], m7
+ movdqa [dstq+112], m8
+ add srcq, 128
+ add dstq, 128
+ sub lenq, 32
+%else
+ add srcq, 64
+ add dstq, 64
+ sub lenq, 16
+%endif
+ ja .loop
+ REP_RET
+
+;------------------------------------------------------------------------------
+; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
+;------------------------------------------------------------------------------
+
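+; horizontal add: sum the four dwords of the first argument, leaving the
+; result in its low dword; the second argument is used as a temporary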
+%macro PHADDD4 2 ; xmm src, xmm tmp
+ movhlps %2, %1
+ paddd %1, %2
+ pshufd %2, %1, 0x1
+ paddd %1, %2
+%endmacro
+
+INIT_XMM sse2
+cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
+ movdqa m0, [mant_cntq ]
+ movdqa m1, [mant_cntq+ 1*16]
+ paddw m0, [mant_cntq+ 2*16]
+ paddw m1, [mant_cntq+ 3*16]
+ paddw m0, [mant_cntq+ 4*16]
+ paddw m1, [mant_cntq+ 5*16]
+ paddw m0, [mant_cntq+ 6*16]
+ paddw m1, [mant_cntq+ 7*16]
+ paddw m0, [mant_cntq+ 8*16]
+ paddw m1, [mant_cntq+ 9*16]
+ paddw m0, [mant_cntq+10*16]
+ paddw m1, [mant_cntq+11*16]
+ pmaddwd m0, [ac3_bap_bits ]
+ pmaddwd m1, [ac3_bap_bits+16]
+ paddd m0, m1
+ PHADDD4 m0, m1
+ movd sumd, m0
+ movdqa m3, [pw_bap_mul1]
+ movhpd m0, [mant_cntq +2]
+ movlpd m0, [mant_cntq+1*32+2]
+ movhpd m1, [mant_cntq+2*32+2]
+ movlpd m1, [mant_cntq+3*32+2]
+ movhpd m2, [mant_cntq+4*32+2]
+ movlpd m2, [mant_cntq+5*32+2]
+ pmulhuw m0, m3
+ pmulhuw m1, m3
+ pmulhuw m2, m3
+ paddusw m0, m1
+ paddusw m0, m2
+ pmaddwd m0, [pw_bap_mul2]
+ PHADDD4 m0, m1
+ movd eax, m0
+ add eax, sumd
+ RET
+
+;------------------------------------------------------------------------------
+; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
+;------------------------------------------------------------------------------
+
+%macro PABSD 1-2 ; src/dst, unused
+%if cpuflag(ssse3)
+ pabsd %1, %1
+%else ; src/dst, tmp
+ pxor %2, %2
+ pcmpgtd %2, %1
+ pxor %1, %2
+ psubd %1, %2
+%endif
+%endmacro
+
+%if HAVE_AMD3DNOW_EXTERNAL
+INIT_MMX 3dnow
+cglobal ac3_extract_exponents, 3, 3, 0, exp, coef, len
+ add expq, lenq
+ lea coefq, [coefq+4*lenq]
+ neg lenq
+ movq m3, [pd_1]
+ movq m4, [pd_151]
+.loop:
+ movq m0, [coefq+4*lenq ]
+ movq m1, [coefq+4*lenq+8]
+ PABSD m0, m2
+ PABSD m1, m2
+ pslld m0, 1
+ por m0, m3
+ pi2fd m2, m0
+ psrld m2, 23
+ movq m0, m4
+ psubd m0, m2
+ pslld m1, 1
+ por m1, m3
+ pi2fd m2, m1
+ psrld m2, 23
+ movq m1, m4
+ psubd m1, m2
+ packssdw m0, m0
+ packuswb m0, m0
+ packssdw m1, m1
+ packuswb m1, m1
+ punpcklwd m0, m1
+ movd [expq+lenq], m0
+ add lenq, 4
+ jl .loop
+ REP_RET
+%endif
+
+%macro AC3_EXTRACT_EXPONENTS 0
+cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
+ add expq, lenq
+ lea coefq, [coefq+4*lenq]
+ neg lenq
+ mova m2, [pd_1]
+ mova m3, [pd_151]
+.loop:
+ ; move 4 32-bit coefs to xmm0
+ mova m0, [coefq+4*lenq]
+ ; absolute value
+ PABSD m0, m1
+ ; convert to float and extract exponents
+ pslld m0, 1
+ por m0, m2
+ cvtdq2ps m1, m0
+ psrld m1, 23
+ mova m0, m3
+ psubd m0, m1
+ ; move the lowest byte in each of 4 dwords to the low dword
+ ; NOTE: We cannot just extract the low bytes with pshufb because the dword
+ ; result for 16777215 is -1 due to float inaccuracy. Using packuswb
+ ; clips this to 0, which is the correct exponent.
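+ ; (2 * 16777215 + 1 = 2^25 - 1 rounds up to 2^25 when converted to
+ ; float, so the computed exponent is 151 - 152 = -1 instead of 0.)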
+ packssdw m0, m0
+ packuswb m0, m0
+ movd [expq+lenq], m0
+
+ add lenq, 4
+ jl .loop
+ REP_RET
+%endmacro
+
+%if HAVE_SSE2_EXTERNAL
+INIT_XMM sse2
+AC3_EXTRACT_EXPONENTS
+%endif
+%if HAVE_SSSE3_EXTERNAL
+INIT_XMM ssse3
+AC3_EXTRACT_EXPONENTS
+%endif
diff --git a/ffmpeg/libavcodec/x86/ac3dsp_init.c b/ffmpeg/libavcodec/x86/ac3dsp_init.c
new file mode 100644
index 0000000..e2a190e
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/ac3dsp_init.c
@@ -0,0 +1,232 @@
+/*
+ * x86-optimized AC-3 DSP utils
+ * Copyright (c) 2011 Justin Ruggles
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "dsputil_mmx.h"
+#include "libavcodec/ac3.h"
+#include "libavcodec/ac3dsp.h"
+
+extern void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
+extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
+extern void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
+
+extern int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len);
+extern int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
+extern int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len);
+extern int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len);
+
+extern void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
+extern void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);
+
+extern void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
+extern void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);
+
+extern void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
+extern void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len);
+extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);
+
+extern int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
+
+extern void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs);
+extern void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
+extern void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);
+
+#if ARCH_X86_32 && defined(__INTEL_COMPILER)
+# undef HAVE_7REGS
+# define HAVE_7REGS 0
+#endif
+
+#if HAVE_SSE_INLINE && HAVE_7REGS
+
+#define IF1(x) x
+#define IF0(x)
+
+#define MIX5(mono, stereo) \
+ __asm__ volatile ( \
+ "movss 0(%1), %%xmm5 \n" \
+ "movss 8(%1), %%xmm6 \n" \
+ "movss 24(%1), %%xmm7 \n" \
+ "shufps $0, %%xmm5, %%xmm5 \n" \
+ "shufps $0, %%xmm6, %%xmm6 \n" \
+ "shufps $0, %%xmm7, %%xmm7 \n" \
+ "1: \n" \
+ "movaps (%0, %2), %%xmm0 \n" \
+ "movaps (%0, %3), %%xmm1 \n" \
+ "movaps (%0, %4), %%xmm2 \n" \
+ "movaps (%0, %5), %%xmm3 \n" \
+ "movaps (%0, %6), %%xmm4 \n" \
+ "mulps %%xmm5, %%xmm0 \n" \
+ "mulps %%xmm6, %%xmm1 \n" \
+ "mulps %%xmm5, %%xmm2 \n" \
+ "mulps %%xmm7, %%xmm3 \n" \
+ "mulps %%xmm7, %%xmm4 \n" \
+ stereo("addps %%xmm1, %%xmm0 \n") \
+ "addps %%xmm1, %%xmm2 \n" \
+ "addps %%xmm3, %%xmm0 \n" \
+ "addps %%xmm4, %%xmm2 \n" \
+ mono("addps %%xmm2, %%xmm0 \n") \
+ "movaps %%xmm0, (%0, %2) \n" \
+ stereo("movaps %%xmm2, (%0, %3) \n") \
+ "add $16, %0 \n" \
+ "jl 1b \n" \
+ : "+&r"(i) \
+ : "r"(matrix), \
+ "r"(samples[0] + len), \
+ "r"(samples[1] + len), \
+ "r"(samples[2] + len), \
+ "r"(samples[3] + len), \
+ "r"(samples[4] + len) \
+ : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",) \
+ "memory" \
+ );
+
+#define MIX_MISC(stereo) \
+ __asm__ volatile ( \
+ "mov %5, %2 \n" \
+ "1: \n" \
+ "mov -%c7(%6, %2, %c8), %3 \n" \
+ "movaps (%3, %0), %%xmm0 \n" \
+ stereo("movaps %%xmm0, %%xmm1 \n") \
+ "mulps %%xmm4, %%xmm0 \n" \
+ stereo("mulps %%xmm5, %%xmm1 \n") \
+ "2: \n" \
+ "mov (%6, %2, %c8), %1 \n" \
+ "movaps (%1, %0), %%xmm2 \n" \
+ stereo("movaps %%xmm2, %%xmm3 \n") \
+ "mulps (%4, %2, 8), %%xmm2 \n" \
+ stereo("mulps 16(%4, %2, 8), %%xmm3 \n") \
+ "addps %%xmm2, %%xmm0 \n" \
+ stereo("addps %%xmm3, %%xmm1 \n") \
+ "add $4, %2 \n" \
+ "jl 2b \n" \
+ "mov %5, %2 \n" \
+ stereo("mov (%6, %2, %c8), %1 \n") \
+ "movaps %%xmm0, (%3, %0) \n" \
+ stereo("movaps %%xmm1, (%1, %0) \n") \
+ "add $16, %0 \n" \
+ "jl 1b \n" \
+ : "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m) \
+ : "r"(matrix_simd + in_ch), \
+ "g"((intptr_t) - 4 * (in_ch - 1)), \
+ "r"(samp + in_ch), \
+ "i"(sizeof(float *)), "i"(sizeof(float *)/4) \
+ : "memory" \
+ );
+
+static void ac3_downmix_sse(float **samples, float (*matrix)[2],
+ int out_ch, int in_ch, int len)
+{
+ int (*matrix_cmp)[2] = (int(*)[2])matrix;
+ intptr_t i, j, k, m;
+
+ i = -len * sizeof(float);
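+ /* fast path for a 5->2 downmix whose matrix has the usual pattern of zero
+ and shared coefficients checked below */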
+ if (in_ch == 5 && out_ch == 2 &&
+ !(matrix_cmp[0][1] | matrix_cmp[2][0] |
+ matrix_cmp[3][1] | matrix_cmp[4][0] |
+ (matrix_cmp[1][0] ^ matrix_cmp[1][1]) |
+ (matrix_cmp[0][0] ^ matrix_cmp[2][1]))) {
+ MIX5(IF0, IF1);
+ } else if (in_ch == 5 && out_ch == 1 &&
+ matrix_cmp[0][0] == matrix_cmp[2][0] &&
+ matrix_cmp[3][0] == matrix_cmp[4][0]) {
+ MIX5(IF1, IF0);
+ } else {
+ DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
+ float *samp[AC3_MAX_CHANNELS];
+
+ for (j = 0; j < in_ch; j++)
+ samp[j] = samples[j] + len;
+
+ j = 2 * in_ch * sizeof(float);
+ __asm__ volatile (
+ "1: \n"
+ "sub $8, %0 \n"
+ "movss (%2, %0), %%xmm4 \n"
+ "movss 4(%2, %0), %%xmm5 \n"
+ "shufps $0, %%xmm4, %%xmm4 \n"
+ "shufps $0, %%xmm5, %%xmm5 \n"
+ "movaps %%xmm4, (%1, %0, 4) \n"
+ "movaps %%xmm5, 16(%1, %0, 4) \n"
+ "jg 1b \n"
+ : "+&r"(j)
+ : "r"(matrix_simd), "r"(matrix)
+ : "memory"
+ );
+ if (out_ch == 2) {
+ MIX_MISC(IF1);
+ } else {
+ MIX_MISC(IF0);
+ }
+ }
+}
+
+#endif /* HAVE_SSE_INLINE && HAVE_7REGS */
+
+av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
+{
+ int mm_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(mm_flags)) {
+ c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
+ c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
+ c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
+ c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
+ }
+ if (EXTERNAL_AMD3DNOW(mm_flags)) {
+ c->extract_exponents = ff_ac3_extract_exponents_3dnow;
+ if (!bit_exact) {
+ c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
+ }
+ }
+ if (EXTERNAL_MMXEXT(mm_flags)) {
+ c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
+ c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
+ }
+ if (EXTERNAL_SSE(mm_flags)) {
+ c->float_to_fixed24 = ff_float_to_fixed24_sse;
+ }
+ if (EXTERNAL_SSE2(mm_flags)) {
+ c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
+ c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
+ c->float_to_fixed24 = ff_float_to_fixed24_sse2;
+ c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
+ c->extract_exponents = ff_ac3_extract_exponents_sse2;
+ if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
+ c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
+ c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
+ }
+ }
+ if (EXTERNAL_SSSE3(mm_flags)) {
+ c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
+ if (!(mm_flags & AV_CPU_FLAG_ATOM)) {
+ c->extract_exponents = ff_ac3_extract_exponents_ssse3;
+ }
+ }
+
+#if HAVE_SSE_INLINE && HAVE_7REGS
+ if (INLINE_SSE(mm_flags)) {
+ c->downmix = ac3_downmix_sse;
+ }
+#endif
+}
diff --git a/ffmpeg/libavcodec/x86/cabac.h b/ffmpeg/libavcodec/x86/cabac.h
new file mode 100644
index 0000000..2c9f77e
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/cabac.h
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_CABAC_H
+#define AVCODEC_X86_CABAC_H
+
+#include "libavcodec/cabac.h"
+#include "libavutil/attributes.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/internal.h"
+#include "config.h"
+
+#if HAVE_INLINE_ASM
+
+#ifdef BROKEN_RELOCATIONS
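+/* the inline asm below cannot reference the global table symbol directly in
+ * its addressing, so the table base is loaded into a register once and passed
+ * in as the "tables" operand */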
+#define TABLES_ARG , "r"(tables)
+
+#if HAVE_FAST_CMOV
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
+ "cmp "low" , "tmp" \n\t"\
+ "cmova %%ecx , "range" \n\t"\
+ "sbb %%rcx , %%rcx \n\t"\
+ "and %%ecx , "tmp" \n\t"\
+ "xor %%rcx , "retq" \n\t"\
+ "sub "tmp" , "low" \n\t"
+#else /* HAVE_FAST_CMOV */
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
+/* P4 Prescott has crappy cmov,sbb,64bit shift so avoid them */ \
+ "sub "low" , "tmp" \n\t"\
+ "sar $31 , "tmp" \n\t"\
+ "sub %%ecx , "range" \n\t"\
+ "and "tmp" , "range" \n\t"\
+ "add %%ecx , "range" \n\t"\
+ "shl $17 , %%ecx \n\t"\
+ "and "tmp" , %%ecx \n\t"\
+ "sub %%ecx , "low" \n\t"\
+ "xor "tmp" , "ret" \n\t"\
+ "movslq "ret" , "retq" \n\t"
+#endif /* HAVE_FAST_CMOV */
+
+#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
+ "movzbl "statep" , "ret" \n\t"\
+ "mov "range" , "tmp" \n\t"\
+ "and $0xC0 , "range" \n\t"\
+ "lea ("ret", "range", 2), %%ecx \n\t"\
+ "movzbl "lps_off"("tables", %%rcx), "range" \n\t"\
+ "sub "range" , "tmp" \n\t"\
+ "mov "tmp" , %%ecx \n\t"\
+ "shl $17 , "tmp" \n\t"\
+ BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
+ "movzbl "norm_off"("tables", "rangeq"), %%ecx \n\t"\
+ "shl %%cl , "range" \n\t"\
+ "movzbl "mlps_off"+128("tables", "retq"), "tmp" \n\t"\
+ "shl %%cl , "low" \n\t"\
+ "mov "tmpbyte" , "statep" \n\t"\
+ "test "lowword" , "lowword" \n\t"\
+ "jnz 2f \n\t"\
+ "mov "byte" , %%"REG_c" \n\t"\
+ "add"OPSIZE" $2 , "byte" \n\t"\
+ "movzwl (%%"REG_c") , "tmp" \n\t"\
+ "lea -1("low") , %%ecx \n\t"\
+ "xor "low" , %%ecx \n\t"\
+ "shr $15 , %%ecx \n\t"\
+ "bswap "tmp" \n\t"\
+ "shr $15 , "tmp" \n\t"\
+ "movzbl "norm_off"("tables", %%rcx), %%ecx \n\t"\
+ "sub $0xFFFF , "tmp" \n\t"\
+ "neg %%ecx \n\t"\
+ "add $7 , %%ecx \n\t"\
+ "shl %%cl , "tmp" \n\t"\
+ "add "tmp" , "low" \n\t"\
+ "2: \n\t"
+
+#else /* BROKEN_RELOCATIONS */
+#define TABLES_ARG
+#define RIP_ARG
+
+#if HAVE_FAST_CMOV
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
+ "mov "tmp" , %%ecx \n\t"\
+ "shl $17 , "tmp" \n\t"\
+ "cmp "low" , "tmp" \n\t"\
+ "cmova %%ecx , "range" \n\t"\
+ "sbb %%ecx , %%ecx \n\t"\
+ "and %%ecx , "tmp" \n\t"\
+ "xor %%ecx , "ret" \n\t"\
+ "sub "tmp" , "low" \n\t"
+#else /* HAVE_FAST_CMOV */
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
+ "mov "tmp" , %%ecx \n\t"\
+ "shl $17 , "tmp" \n\t"\
+ "sub "low" , "tmp" \n\t"\
+ "sar $31 , "tmp" \n\t" /*lps_mask*/\
+ "sub %%ecx , "range" \n\t" /*RangeLPS - range*/\
+ "and "tmp" , "range" \n\t" /*(RangeLPS - range)&lps_mask*/\
+ "add %%ecx , "range" \n\t" /*new range*/\
+ "shl $17 , %%ecx \n\t"\
+ "and "tmp" , %%ecx \n\t"\
+ "sub %%ecx , "low" \n\t"\
+ "xor "tmp" , "ret" \n\t"
+#endif /* HAVE_FAST_CMOV */
+
+#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
+ "movzbl "statep" , "ret" \n\t"\
+ "mov "range" , "tmp" \n\t"\
+ "and $0xC0 , "range" \n\t"\
+ "movzbl "MANGLE(ff_h264_cabac_tables)"+"lps_off"("ret", "range", 2), "range" \n\t"\
+ "sub "range" , "tmp" \n\t"\
+ BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp) \
+ "movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"("range"), %%ecx \n\t"\
+ "shl %%cl , "range" \n\t"\
+ "movzbl "MANGLE(ff_h264_cabac_tables)"+"mlps_off"+128("ret"), "tmp" \n\t"\
+ "shl %%cl , "low" \n\t"\
+ "mov "tmpbyte" , "statep" \n\t"\
+ "test "lowword" , "lowword" \n\t"\
+ " jnz 2f \n\t"\
+ "mov "byte" , %%"REG_c" \n\t"\
+ "add"OPSIZE" $2 , "byte" \n\t"\
+ "movzwl (%%"REG_c") , "tmp" \n\t"\
+ "lea -1("low") , %%ecx \n\t"\
+ "xor "low" , %%ecx \n\t"\
+ "shr $15 , %%ecx \n\t"\
+ "bswap "tmp" \n\t"\
+ "shr $15 , "tmp" \n\t"\
+ "movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"(%%ecx), %%ecx \n\t"\
+ "sub $0xFFFF , "tmp" \n\t"\
+ "neg %%ecx \n\t"\
+ "add $7 , %%ecx \n\t"\
+ "shl %%cl , "tmp" \n\t"\
+ "add "tmp" , "low" \n\t"\
+ "2: \n\t"
+
+#endif /* BROKEN_RELOCATIONS */
+
+
+#if HAVE_7REGS && !(defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
+ && !( !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)
+#define get_cabac_inline get_cabac_inline_x86
+static av_always_inline int get_cabac_inline_x86(CABACContext *c,
+ uint8_t *const state)
+{
+ int bit, tmp;
+#ifdef BROKEN_RELOCATIONS
+ void *tables;
+
+ __asm__ volatile(
+ "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
+ : "=&r"(tables)
+ );
+#endif
+
+ __asm__ volatile(
+ BRANCHLESS_GET_CABAC("%0", "%q0", "(%4)", "%1", "%w1",
+ "%2", "%q2", "%3", "%b3",
+ "%c6(%5)", "%c7(%5)",
+ AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
+ AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
+ AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
+ "%8")
+ : "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp)
+ : "r"(state), "r"(c),
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end))
+ TABLES_ARG
+ ,"1"(c->low), "2"(c->range)
+ : "%"REG_c, "memory"
+ );
+ return bit & 1;
+}
+#endif /* HAVE_7REGS */
+
+#define get_cabac_bypass_sign get_cabac_bypass_sign_x86
+static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
+{
+ x86_reg tmp;
+ __asm__ volatile(
+ "movl %c6(%2), %k1 \n\t"
+ "movl %c3(%2), %%eax \n\t"
+ "shl $17, %k1 \n\t"
+ "add %%eax, %%eax \n\t"
+ "sub %k1, %%eax \n\t"
+ "cltd \n\t"
+ "and %%edx, %k1 \n\t"
+ "add %k1, %%eax \n\t"
+ "xor %%edx, %%ecx \n\t"
+ "sub %%edx, %%ecx \n\t"
+ "test %%ax, %%ax \n\t"
+ "jnz 1f \n\t"
+ "mov %c4(%2), %1 \n\t"
+ "subl $0xFFFF, %%eax \n\t"
+ "movzwl (%1), %%edx \n\t"
+ "bswap %%edx \n\t"
+ "shrl $15, %%edx \n\t"
+ "add $2, %1 \n\t"
+ "addl %%edx, %%eax \n\t"
+ "mov %1, %c4(%2) \n\t"
+ "1: \n\t"
+ "movl %%eax, %c3(%2) \n\t"
+
+ : "+c"(val), "=&r"(tmp)
+ : "r"(c),
+ "i"(offsetof(CABACContext, low)),
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end)),
+ "i"(offsetof(CABACContext, range))
+ : "%eax", "%edx", "memory"
+ );
+ return val;
+}
+
+#endif /* HAVE_INLINE_ASM */
+#endif /* AVCODEC_X86_CABAC_H */
diff --git a/ffmpeg/libavcodec/x86/cavsdsp.c b/ffmpeg/libavcodec/x86/cavsdsp.c
new file mode 100644
index 0000000..deeb5cf
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/cavsdsp.c
@@ -0,0 +1,512 @@
+/*
+ * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
+ * Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
+ *
+ * MMX-optimized DSP functions, based on H.264 optimizations by
+ * Michael Niedermayer and Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/common.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/cavsdsp.h"
+#include "dsputil_mmx.h"
+#include "config.h"
+
+#if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE)
+
+/* in/out: mma=mma+mmb, mmb=mmb-mma */
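+/* (e.g. a = 3, b = 5 gives a = 8, b = 2: the subtraction uses the original values) */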
+#define SUMSUB_BA( a, b ) \
+ "paddw "#b", "#a" \n\t"\
+ "paddw "#b", "#b" \n\t"\
+ "psubw "#a", "#b" \n\t"
+
+/*****************************************************************************
+ *
+ * inverse transform
+ *
+ ****************************************************************************/
+
+static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
+{
+ __asm__ volatile(
+ "movq 112(%0), %%mm4 \n\t" /* mm4 = src7 */
+ "movq 16(%0), %%mm5 \n\t" /* mm5 = src1 */
+ "movq 80(%0), %%mm2 \n\t" /* mm2 = src5 */
+ "movq 48(%0), %%mm7 \n\t" /* mm7 = src3 */
+ "movq %%mm4, %%mm0 \n\t"
+ "movq %%mm5, %%mm3 \n\t"
+ "movq %%mm2, %%mm6 \n\t"
+ "movq %%mm7, %%mm1 \n\t"
+
+ "paddw %%mm4, %%mm4 \n\t" /* mm4 = 2*src7 */
+ "paddw %%mm3, %%mm3 \n\t" /* mm3 = 2*src1 */
+ "paddw %%mm6, %%mm6 \n\t" /* mm6 = 2*src5 */
+ "paddw %%mm1, %%mm1 \n\t" /* mm1 = 2*src3 */
+ "paddw %%mm4, %%mm0 \n\t" /* mm0 = 3*src7 */
+ "paddw %%mm3, %%mm5 \n\t" /* mm5 = 3*src1 */
+ "paddw %%mm6, %%mm2 \n\t" /* mm2 = 3*src5 */
+ "paddw %%mm1, %%mm7 \n\t" /* mm7 = 3*src3 */
+ "psubw %%mm4, %%mm5 \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */
+ "paddw %%mm6, %%mm7 \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */
+ "psubw %%mm2, %%mm1 \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */
+ "paddw %%mm0, %%mm3 \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */
+
+ "movq %%mm5, %%mm4 \n\t"
+ "movq %%mm7, %%mm6 \n\t"
+ "movq %%mm3, %%mm0 \n\t"
+ "movq %%mm1, %%mm2 \n\t"
+ SUMSUB_BA( %%mm7, %%mm5 ) /* mm7 = a0 + a1 mm5 = a0 - a1 */
+ "paddw %%mm3, %%mm7 \n\t" /* mm7 = a0 + a1 + a3 */
+ "paddw %%mm1, %%mm5 \n\t" /* mm5 = a0 - a1 + a2 */
+ "paddw %%mm7, %%mm7 \n\t"
+ "paddw %%mm5, %%mm5 \n\t"
+ "paddw %%mm6, %%mm7 \n\t" /* mm7 = b4 */
+ "paddw %%mm4, %%mm5 \n\t" /* mm5 = b5 */
+
+ SUMSUB_BA( %%mm1, %%mm3 ) /* mm1 = a3 + a2 mm3 = a3 - a2 */
+ "psubw %%mm1, %%mm4 \n\t" /* mm4 = a0 - a2 - a3 */
+ "movq %%mm4, %%mm1 \n\t" /* mm1 = a0 - a2 - a3 */
+ "psubw %%mm6, %%mm3 \n\t" /* mm3 = a3 - a2 - a1 */
+ "paddw %%mm1, %%mm1 \n\t"
+ "paddw %%mm3, %%mm3 \n\t"
+ "psubw %%mm2, %%mm1 \n\t" /* mm1 = b7 */
+ "paddw %%mm0, %%mm3 \n\t" /* mm3 = b6 */
+
+ "movq 32(%0), %%mm2 \n\t" /* mm2 = src2 */
+ "movq 96(%0), %%mm6 \n\t" /* mm6 = src6 */
+ "movq %%mm2, %%mm4 \n\t"
+ "movq %%mm6, %%mm0 \n\t"
+ "psllw $2, %%mm4 \n\t" /* mm4 = 4*src2 */
+ "psllw $2, %%mm6 \n\t" /* mm6 = 4*src6 */
+ "paddw %%mm4, %%mm2 \n\t" /* mm2 = 5*src2 */
+ "paddw %%mm6, %%mm0 \n\t" /* mm0 = 5*src6 */
+ "paddw %%mm2, %%mm2 \n\t"
+ "paddw %%mm0, %%mm0 \n\t"
+ "psubw %%mm0, %%mm4 \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */
+ "paddw %%mm2, %%mm6 \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */
+
+ "movq (%0), %%mm2 \n\t" /* mm2 = src0 */
+ "movq 64(%0), %%mm0 \n\t" /* mm0 = src4 */
+ SUMSUB_BA( %%mm0, %%mm2 ) /* mm0 = src0+src4 mm2 = src0-src4 */
+ "psllw $3, %%mm0 \n\t"
+ "psllw $3, %%mm2 \n\t"
+ "paddw %1, %%mm0 \n\t" /* add rounding bias */
+ "paddw %1, %%mm2 \n\t" /* add rounding bias */
+
+ SUMSUB_BA( %%mm6, %%mm0 ) /* mm6 = a4 + a6 mm0 = a4 - a6 */
+ SUMSUB_BA( %%mm4, %%mm2 ) /* mm4 = a5 + a7 mm2 = a5 - a7 */
+ SUMSUB_BA( %%mm7, %%mm6 ) /* mm7 = dst0 mm6 = dst7 */
+ SUMSUB_BA( %%mm5, %%mm4 ) /* mm5 = dst1 mm4 = dst6 */
+ SUMSUB_BA( %%mm3, %%mm2 ) /* mm3 = dst2 mm2 = dst5 */
+ SUMSUB_BA( %%mm1, %%mm0 ) /* mm1 = dst3 mm0 = dst4 */
+ :: "r"(block), "m"(bias)
+ );
+}
+
+static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
+{
+ int i;
+ DECLARE_ALIGNED(8, int16_t, b2)[64];
+
+ for(i=0; i<2; i++){
+ DECLARE_ALIGNED(8, uint64_t, tmp);
+
+ cavs_idct8_1d(block+4*i, ff_pw_4.a);
+
+ __asm__ volatile(
+ "psraw $3, %%mm7 \n\t"
+ "psraw $3, %%mm6 \n\t"
+ "psraw $3, %%mm5 \n\t"
+ "psraw $3, %%mm4 \n\t"
+ "psraw $3, %%mm3 \n\t"
+ "psraw $3, %%mm2 \n\t"
+ "psraw $3, %%mm1 \n\t"
+ "psraw $3, %%mm0 \n\t"
+ "movq %%mm7, %0 \n\t"
+ TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
+ "movq %%mm0, 8(%1) \n\t"
+ "movq %%mm6, 24(%1) \n\t"
+ "movq %%mm7, 40(%1) \n\t"
+ "movq %%mm4, 56(%1) \n\t"
+ "movq %0, %%mm7 \n\t"
+ TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
+ "movq %%mm7, (%1) \n\t"
+ "movq %%mm1, 16(%1) \n\t"
+ "movq %%mm0, 32(%1) \n\t"
+ "movq %%mm3, 48(%1) \n\t"
+ : "=m"(tmp)
+ : "r"(b2+32*i)
+ : "memory"
+ );
+ }
+
+ for(i=0; i<2; i++){
+ cavs_idct8_1d(b2+4*i, ff_pw_64.a);
+
+ __asm__ volatile(
+ "psraw $7, %%mm7 \n\t"
+ "psraw $7, %%mm6 \n\t"
+ "psraw $7, %%mm5 \n\t"
+ "psraw $7, %%mm4 \n\t"
+ "psraw $7, %%mm3 \n\t"
+ "psraw $7, %%mm2 \n\t"
+ "psraw $7, %%mm1 \n\t"
+ "psraw $7, %%mm0 \n\t"
+ "movq %%mm7, (%0) \n\t"
+ "movq %%mm5, 16(%0) \n\t"
+ "movq %%mm3, 32(%0) \n\t"
+ "movq %%mm1, 48(%0) \n\t"
+ "movq %%mm0, 64(%0) \n\t"
+ "movq %%mm2, 80(%0) \n\t"
+ "movq %%mm4, 96(%0) \n\t"
+ "movq %%mm6, 112(%0) \n\t"
+ :: "r"(b2+4*i)
+ : "memory"
+ );
+ }
+
+ ff_add_pixels_clamped_mmx(b2, dst, stride);
+}
+
+/*****************************************************************************
+ *
+ * motion compensation
+ *
+ ****************************************************************************/
+
+/* vertical filter [-1 -2 96 42 -7 0] */
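+/* i.e. each output pixel is clip_uint8((-A - 2*B + 96*C + 42*D - 7*E + 64) >> 7);
+ the sixth tap is zero, F is only preloaded for the next row */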
+#define QPEL_CAVSV1(A,B,C,D,E,F,OP,MUL2) \
+ "movd (%0), "#F" \n\t"\
+ "movq "#C", %%mm6 \n\t"\
+ "pmullw %5, %%mm6 \n\t"\
+ "movq "#D", %%mm7 \n\t"\
+ "pmullw "MANGLE(MUL2)", %%mm7\n\t"\
+ "psllw $3, "#E" \n\t"\
+ "psubw "#E", %%mm6 \n\t"\
+ "psraw $3, "#E" \n\t"\
+ "paddw %%mm7, %%mm6 \n\t"\
+ "paddw "#E", %%mm6 \n\t"\
+ "paddw "#B", "#B" \n\t"\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, "#F" \n\t"\
+ "psubw "#B", %%mm6 \n\t"\
+ "psraw $1, "#B" \n\t"\
+ "psubw "#A", %%mm6 \n\t"\
+ "paddw %4, %%mm6 \n\t"\
+ "psraw $7, %%mm6 \n\t"\
+ "packuswb %%mm6, %%mm6 \n\t"\
+ OP(%%mm6, (%1), A, d) \
+ "add %3, %1 \n\t"
+
+/* vertical filter [ 0 -1 5 5 -1 0] */
+#define QPEL_CAVSV2(A,B,C,D,E,F,OP,MUL2) \
+ "movd (%0), "#F" \n\t"\
+ "movq "#C", %%mm6 \n\t"\
+ "paddw "#D", %%mm6 \n\t"\
+ "pmullw %5, %%mm6 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, "#F" \n\t"\
+ "psubw "#B", %%mm6 \n\t"\
+ "psubw "#E", %%mm6 \n\t"\
+ "paddw %4, %%mm6 \n\t"\
+ "psraw $3, %%mm6 \n\t"\
+ "packuswb %%mm6, %%mm6 \n\t"\
+ OP(%%mm6, (%1), A, d) \
+ "add %3, %1 \n\t"
+
+/* vertical filter [ 0 -7 42 96 -2 -1] */
+#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \
+ "movd (%0), "#F" \n\t"\
+ "movq "#C", %%mm6 \n\t"\
+ "pmullw "MANGLE(MUL2)", %%mm6\n\t"\
+ "movq "#D", %%mm7 \n\t"\
+ "pmullw %5, %%mm7 \n\t"\
+ "psllw $3, "#B" \n\t"\
+ "psubw "#B", %%mm6 \n\t"\
+ "psraw $3, "#B" \n\t"\
+ "paddw %%mm7, %%mm6 \n\t"\
+ "paddw "#B", %%mm6 \n\t"\
+ "paddw "#E", "#E" \n\t"\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, "#F" \n\t"\
+ "psubw "#E", %%mm6 \n\t"\
+ "psraw $1, "#E" \n\t"\
+ "psubw "#F", %%mm6 \n\t"\
+ "paddw %4, %%mm6 \n\t"\
+ "psraw $7, %%mm6 \n\t"\
+ "packuswb %%mm6, %%mm6 \n\t"\
+ OP(%%mm6, (%1), A, d) \
+ "add %3, %1 \n\t"
+
+
+#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
+ int w= 2;\
+ src -= 2*srcStride;\
+ \
+ while(w--){\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movd (%0), %%mm0 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm1 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm3 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm4 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
+ VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
+ VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
+ VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
+ VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
+ VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
+ VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
+ VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
+ : "memory"\
+ );\
+ if(h==16){\
+ __asm__ volatile(\
+ VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
+ VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
+ VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
+ VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
+ VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
+ VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
+ VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
+ VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
+ : "memory"\
+ );\
+ }\
+ src += 4-(h+5)*srcStride;\
+ dst += 4-h*dstStride;\
+ }
+
+#define QPEL_CAVS(OPNAME, OP, MMX)\
+static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ int h=8;\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movq %5, %%mm6 \n\t"\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 1(%0), %%mm2 \n\t"\
+ "movq %%mm0, %%mm1 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpckhbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "paddw %%mm3, %%mm1 \n\t"\
+ "pmullw %%mm6, %%mm0 \n\t"\
+ "pmullw %%mm6, %%mm1 \n\t"\
+ "movq -1(%0), %%mm2 \n\t"\
+ "movq 2(%0), %%mm4 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "movq %%mm4, %%mm5 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ "punpckhbw %%mm7, %%mm5 \n\t"\
+ "paddw %%mm4, %%mm2 \n\t"\
+ "paddw %%mm3, %%mm5 \n\t"\
+ "psubw %%mm2, %%mm0 \n\t"\
+ "psubw %%mm5, %%mm1 \n\t"\
+ "movq %6, %%mm5 \n\t"\
+ "paddw %%mm5, %%mm0 \n\t"\
+ "paddw %%mm5, %%mm1 \n\t"\
+ "psraw $3, %%mm0 \n\t"\
+ "psraw $3, %%mm1 \n\t"\
+ "packuswb %%mm1, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm5, q) \
+ "add %3, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(src), "+c"(dst), "+m"(h)\
+ : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
+ : "memory"\
+ );\
+}\
+\
+static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
+}\
+\
+static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5) \
+}\
+\
+static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
+}\
+\
+static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\
+}\
+static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\
+ OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}\
+\
+static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\
+}\
+static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\
+ OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}\
+\
+static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\
+}\
+static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\
+ OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}\
+\
+static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
+ OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+ src += 8*srcStride;\
+ dst += 8*dstStride;\
+ OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
+ OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+}\
+
+#define CAVS_MC(OPNAME, SIZE, MMX) \
+static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
+}\
+
+#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
+#define AVG_3DNOW_OP(a,b,temp, size) \
+"mov" #size " " #b ", " #temp " \n\t"\
+"pavgusb " #temp ", " #a " \n\t"\
+"mov" #size " " #a ", " #b " \n\t"
+#define AVG_MMXEXT_OP(a, b, temp, size) \
+"mov" #size " " #b ", " #temp " \n\t"\
+"pavgb " #temp ", " #a " \n\t"\
+"mov" #size " " #a ", " #b " \n\t"
+
+#endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */
+
+#if HAVE_MMXEXT_INLINE
+QPEL_CAVS(put_, PUT_OP, mmxext)
+QPEL_CAVS(avg_, AVG_MMXEXT_OP, mmxext)
+
+CAVS_MC(put_, 8, mmxext)
+CAVS_MC(put_, 16, mmxext)
+CAVS_MC(avg_, 8, mmxext)
+CAVS_MC(avg_, 16, mmxext)
+
+static av_cold void ff_cavsdsp_init_mmxext(CAVSDSPContext *c,
+ AVCodecContext *avctx)
+{
+#define dspfunc(PFX, IDX, NUM) \
+ c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmxext; \
+ c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_mmxext; \
+ c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_mmxext; \
+ c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_mmxext; \
+ c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_mmxext; \
+
+ dspfunc(put_cavs_qpel, 0, 16);
+ dspfunc(put_cavs_qpel, 1, 8);
+ dspfunc(avg_cavs_qpel, 0, 16);
+ dspfunc(avg_cavs_qpel, 1, 8);
+#undef dspfunc
+ c->cavs_idct8_add = cavs_idct8_add_mmx;
+ c->idct_perm = FF_TRANSPOSE_IDCT_PERM;
+}
+#endif /* HAVE_MMXEXT_INLINE */
+
+#if HAVE_AMD3DNOW_INLINE
+QPEL_CAVS(put_, PUT_OP, 3dnow)
+QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)
+
+CAVS_MC(put_, 8, 3dnow)
+CAVS_MC(put_, 16,3dnow)
+CAVS_MC(avg_, 8, 3dnow)
+CAVS_MC(avg_, 16,3dnow)
+
+static av_cold void ff_cavsdsp_init_3dnow(CAVSDSPContext *c,
+ AVCodecContext *avctx)
+{
+#define dspfunc(PFX, IDX, NUM) \
+ c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmxext; \
+ c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \
+ c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \
+ c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_3dnow; \
+ c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_3dnow; \
+
+ dspfunc(put_cavs_qpel, 0, 16);
+ dspfunc(put_cavs_qpel, 1, 8);
+ dspfunc(avg_cavs_qpel, 0, 16);
+ dspfunc(avg_cavs_qpel, 1, 8);
+#undef dspfunc
+ c->cavs_idct8_add = cavs_idct8_add_mmx;
+ c->idct_perm = FF_TRANSPOSE_IDCT_PERM;
+}
+#endif /* HAVE_AMD3DNOW_INLINE */
+
+av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
+{
+ int mm_flags = av_get_cpu_flags();
+
+#if HAVE_MMXEXT_INLINE
+ if (mm_flags & AV_CPU_FLAG_MMXEXT) ff_cavsdsp_init_mmxext(c, avctx);
+#endif /* HAVE_MMXEXT_INLINE */
+#if HAVE_AMD3DNOW_INLINE
+ if (mm_flags & AV_CPU_FLAG_3DNOW) ff_cavsdsp_init_3dnow(c, avctx);
+#endif /* HAVE_AMD3DNOW_INLINE */
+}
diff --git a/ffmpeg/libavcodec/x86/constants.c b/ffmpeg/libavcodec/x86/constants.c
new file mode 100644
index 0000000..821d73f
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/constants.c
@@ -0,0 +1,39 @@
+/*
+ * MMX/SSE constants used across x86 dsp optimizations.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h" // for xmm_reg
+
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
+
+DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
+DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
+DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
+DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
diff --git a/ffmpeg/libavcodec/x86/dct32.asm b/ffmpeg/libavcodec/x86/dct32.asm
new file mode 100644
index 0000000..6fd5ba3
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/dct32.asm
@@ -0,0 +1,490 @@
+;******************************************************************************
+;* 32 point SSE-optimized DCT transform
+;* Copyright (c) 2010 Vitor Sessak
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+align 32
+ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043
+ dd 0.553104, 0.582935, 0.622504, 0.674808
+ dd -10.190008, -3.407609, -2.057781, -1.484165
+ dd -1.169440, -0.972568, -0.839350, -0.744536
+ dd 0.502419, 0.522499, 0.566944, 0.646822
+ dd 0.788155, 1.060678, 1.722447, 5.101149
+ dd 0.509796, 0.601345, 0.899976, 2.562916
+ dd 0.509796, 0.601345, 0.899976, 2.562916
+ dd 1.000000, 1.000000, 1.306563, 0.541196
+ dd 1.000000, 1.000000, 1.306563, 0.541196
+ dd 1.000000, 0.707107, 1.000000, -0.707107
+ dd 1.000000, 0.707107, 1.000000, -0.707107
+ dd 0.707107, 0.707107, 0.707107, 0.707107
+
+align 32
+ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
+
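+; BUTTERFLY: %2 = %1 + %2 and %1 = (old %1 - old %2) * %3, using %4 as scratch.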
+%macro BUTTERFLY 4
+ subps %4, %1, %2
+ addps %2, %2, %1
+ mulps %1, %4, %3
+%endmacro
+
+%macro BUTTERFLY0 5
+%if cpuflag(sse2) && notcpuflag(avx)
+ pshufd %4, %1, %5
+ xorps %1, %2
+ addps %1, %4
+ mulps %1, %3
+%else
+ shufps %4, %1, %1, %5
+ xorps %1, %1, %2
+ addps %4, %4, %1
+ mulps %1, %4, %3
+%endif
+%endmacro
+
+%macro BUTTERFLY2 4
+ BUTTERFLY0 %1, %2, %3, %4, 0x1b
+%endmacro
+
+%macro BUTTERFLY3 4
+ BUTTERFLY0 %1, %2, %3, %4, 0xb1
+%endmacro
+
+%macro BUTTERFLY3V 5
+ movaps m%5, m%1
+ addps m%1, m%2
+ subps m%5, m%2
+ SWAP %2, %5
+ mulps m%2, [ps_cos_vec+192]
+ movaps m%5, m%3
+ addps m%3, m%4
+ subps m%4, m%5
+ mulps m%4, [ps_cos_vec+192]
+%endmacro
+
+%macro PASS6_AND_PERMUTE 0
+ mov tmpd, [outq+4]
+ movss m7, [outq+72]
+ addss m7, [outq+76]
+ movss m3, [outq+56]
+ addss m3, [outq+60]
+ addss m4, m3
+ movss m2, [outq+52]
+ addss m2, m3
+ movss m3, [outq+104]
+ addss m3, [outq+108]
+ addss m1, m3
+ addss m5, m4
+ movss [outq+ 16], m1
+ movss m1, [outq+100]
+ addss m1, m3
+ movss m3, [outq+40]
+ movss [outq+ 48], m1
+ addss m3, [outq+44]
+ movss m1, [outq+100]
+ addss m4, m3
+ addss m3, m2
+ addss m1, [outq+108]
+ movss [outq+ 40], m3
+ addss m2, [outq+36]
+ movss m3, [outq+8]
+ movss [outq+ 56], m2
+ addss m3, [outq+12]
+ movss [outq+ 32], m3
+ movss m3, [outq+80]
+ movss [outq+ 8], m5
+ movss [outq+ 80], m1
+ movss m2, [outq+52]
+ movss m5, [outq+120]
+ addss m5, [outq+124]
+ movss m1, [outq+64]
+ addss m2, [outq+60]
+ addss m0, m5
+ addss m5, [outq+116]
+ mov [outq+64], tmpd
+ addss m6, m0
+ addss m1, m6
+ mov tmpd, [outq+12]
+ mov [outq+ 96], tmpd
+ movss [outq+ 4], m1
+ movss m1, [outq+24]
+ movss [outq+ 24], m4
+ movss m4, [outq+88]
+ addss m4, [outq+92]
+ addss m3, m4
+ addss m4, [outq+84]
+ mov tmpd, [outq+108]
+ addss m1, [outq+28]
+ addss m0, m1
+ addss m1, m5
+ addss m6, m3
+ addss m3, m0
+ addss m0, m7
+ addss m5, [outq+20]
+ addss m7, m1
+ movss [outq+ 12], m6
+ mov [outq+112], tmpd
+ movss m6, [outq+28]
+ movss [outq+ 28], m0
+ movss m0, [outq+36]
+ movss [outq+ 36], m7
+ addss m1, m4
+ movss m7, [outq+116]
+ addss m0, m2
+ addss m7, [outq+124]
+ movss [outq+ 72], m0
+ movss m0, [outq+44]
+ addss m2, m0
+ movss [outq+ 44], m1
+ movss [outq+ 88], m2
+ addss m0, [outq+60]
+ mov tmpd, [outq+60]
+ mov [outq+120], tmpd
+ movss [outq+104], m0
+ addss m4, m5
+ addss m5, [outq+68]
+ movss [outq+52], m4
+ movss [outq+60], m5
+ movss m4, [outq+68]
+ movss m5, [outq+20]
+ movss [outq+ 20], m3
+ addss m5, m7
+ addss m7, m6
+ addss m4, m5
+ movss m2, [outq+84]
+ addss m2, [outq+92]
+ addss m5, m2
+ movss [outq+ 68], m4
+ addss m2, m7
+ movss m4, [outq+76]
+ movss [outq+ 84], m2
+ movss [outq+ 76], m5
+ addss m7, m4
+ addss m6, [outq+124]
+ addss m4, m6
+ addss m6, [outq+92]
+ movss [outq+100], m4
+ movss [outq+108], m6
+ movss m6, [outq+92]
+ movss [outq+92], m7
+ addss m6, [outq+124]
+ movss [outq+116], m6
+%endmacro
+
+INIT_YMM avx
+SECTION_TEXT
+%if HAVE_AVX_EXTERNAL
+; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
+cglobal dct32_float, 2,3,8, out, in, tmp
+ ; pass 1
+ vmovaps m4, [inq+0]
+ vinsertf128 m5, m5, [inq+96], 1
+ vinsertf128 m5, m5, [inq+112], 0
+ vshufps m5, m5, m5, 0x1b
+ BUTTERFLY m4, m5, [ps_cos_vec], m6
+
+ vmovaps m2, [inq+64]
+ vinsertf128 m6, m6, [inq+32], 1
+ vinsertf128 m6, m6, [inq+48], 0
+ vshufps m6, m6, m6, 0x1b
+ BUTTERFLY m2, m6, [ps_cos_vec+32], m0
+
+ ; pass 2
+
+ BUTTERFLY m5, m6, [ps_cos_vec+64], m0
+ BUTTERFLY m4, m2, [ps_cos_vec+64], m7
+
+
+ ; pass 3
+ vperm2f128 m3, m6, m4, 0x31
+ vperm2f128 m1, m6, m4, 0x20
+ vshufps m3, m3, m3, 0x1b
+
+ BUTTERFLY m1, m3, [ps_cos_vec+96], m6
+
+
+ vperm2f128 m4, m5, m2, 0x20
+ vperm2f128 m5, m5, m2, 0x31
+ vshufps m5, m5, m5, 0x1b
+
+ BUTTERFLY m4, m5, [ps_cos_vec+96], m6
+
+ ; pass 4
+ vmovaps m6, [ps_p1p1m1m1+0]
+ vmovaps m2, [ps_cos_vec+128]
+
+ BUTTERFLY2 m5, m6, m2, m7
+ BUTTERFLY2 m4, m6, m2, m7
+ BUTTERFLY2 m1, m6, m2, m7
+ BUTTERFLY2 m3, m6, m2, m7
+
+
+ ; pass 5
+ vshufps m6, m6, m6, 0xcc
+ vmovaps m2, [ps_cos_vec+160]
+
+ BUTTERFLY3 m5, m6, m2, m7
+ BUTTERFLY3 m4, m6, m2, m7
+ BUTTERFLY3 m1, m6, m2, m7
+ BUTTERFLY3 m3, m6, m2, m7
+
+ vperm2f128 m6, m3, m3, 0x31
+ vmovaps [outq], m3
+
+ vextractf128 [outq+64], m5, 1
+ vextractf128 [outq+32], m5, 0
+
+ vextractf128 [outq+80], m4, 1
+ vextractf128 [outq+48], m4, 0
+
+ vperm2f128 m0, m1, m1, 0x31
+ vmovaps [outq+96], m1
+
+ vzeroupper
+
+ ; pass 6, no SIMD...
+INIT_XMM
+ PASS6_AND_PERMUTE
+ RET
+%endif
+
+%if ARCH_X86_64
+%define SPILL SWAP
+%define UNSPILL SWAP
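+; With 16 XMM registers available, spilling is just a register swap; the 32-bit
+; path further down stores to the output buffer instead.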
+
+%macro PASS5 0
+ nop ; FIXME code alignment
+ SWAP 5, 8
+ SWAP 4, 12
+ SWAP 6, 14
+ SWAP 7, 13
+ SWAP 0, 15
+ PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
+ TRANSPOSE4x4PS 8, 9, 10, 11, 0
+ BUTTERFLY3V 8, 9, 10, 11, 0
+ addps m10, m11
+ TRANSPOSE4x4PS 12, 13, 14, 15, 0
+ BUTTERFLY3V 12, 13, 14, 15, 0
+ addps m14, m15
+ addps m12, m14
+ addps m14, m13
+ addps m13, m15
+%endmacro
+
+%macro PASS6 0
+ SWAP 9, 12
+ SWAP 11, 14
+ movss [outq+0x00], m8
+ pshuflw m0, m8, 0xe
+ movss [outq+0x10], m9
+ pshuflw m1, m9, 0xe
+ movss [outq+0x20], m10
+ pshuflw m2, m10, 0xe
+ movss [outq+0x30], m11
+ pshuflw m3, m11, 0xe
+ movss [outq+0x40], m12
+ pshuflw m4, m12, 0xe
+ movss [outq+0x50], m13
+ pshuflw m5, m13, 0xe
+ movss [outq+0x60], m14
+ pshuflw m6, m14, 0xe
+ movaps [outq+0x70], m15
+ pshuflw m7, m15, 0xe
+ addss m0, m1
+ addss m1, m2
+ movss [outq+0x08], m0
+ addss m2, m3
+ movss [outq+0x18], m1
+ addss m3, m4
+ movss [outq+0x28], m2
+ addss m4, m5
+ movss [outq+0x38], m3
+ addss m5, m6
+ movss [outq+0x48], m4
+ addss m6, m7
+ movss [outq+0x58], m5
+ movss [outq+0x68], m6
+ movss [outq+0x78], m7
+
+ PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
+ movhlps m0, m1
+ pshufd m1, m1, 3
+ SWAP 0, 2, 4, 6, 8, 10, 12, 14
+ SWAP 1, 3, 5, 7, 9, 11, 13, 15
+%rep 7
+ movhlps m0, m1
+ pshufd m1, m1, 3
+ addss m15, m1
+ SWAP 0, 2, 4, 6, 8, 10, 12, 14
+ SWAP 1, 3, 5, 7, 9, 11, 13, 15
+%endrep
+%assign i 4
+%rep 15
+ addss m0, m1
+ movss [outq+i], m0
+ SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ %assign i i+8
+%endrep
+%endmacro
+
+%else ; ARCH_X86_32
+%macro SPILL 2 ; xmm#, mempos
+ movaps [outq+(%2-8)*16], m%1
+%endmacro
+%macro UNSPILL 2
+ movaps m%1, [outq+(%2-8)*16]
+%endmacro
+
+%define PASS6 PASS6_AND_PERMUTE
+%macro PASS5 0
+ movaps m2, [ps_cos_vec+160]
+ shufps m3, m3, 0xcc
+
+ BUTTERFLY3 m5, m3, m2, m1
+ SPILL 5, 8
+
+ UNSPILL 1, 9
+ BUTTERFLY3 m1, m3, m2, m5
+ SPILL 1, 14
+
+ BUTTERFLY3 m4, m3, m2, m5
+ SPILL 4, 12
+
+ BUTTERFLY3 m7, m3, m2, m5
+ SPILL 7, 13
+
+ UNSPILL 5, 10
+ BUTTERFLY3 m5, m3, m2, m7
+ SPILL 5, 10
+
+ UNSPILL 4, 11
+ BUTTERFLY3 m4, m3, m2, m7
+ SPILL 4, 11
+
+ BUTTERFLY3 m6, m3, m2, m7
+ SPILL 6, 9
+
+ BUTTERFLY3 m0, m3, m2, m7
+ SPILL 0, 15
+%endmacro
+%endif
+
+
+; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
+%macro DCT32_FUNC 0
+cglobal dct32_float, 2, 3, 16, out, in, tmp
+ ; pass 1
+
+ movaps m0, [inq+0]
+ LOAD_INV m1, [inq+112]
+ BUTTERFLY m0, m1, [ps_cos_vec], m3
+
+ movaps m7, [inq+64]
+ LOAD_INV m4, [inq+48]
+ BUTTERFLY m7, m4, [ps_cos_vec+32], m3
+
+ ; pass 2
+ movaps m2, [ps_cos_vec+64]
+ BUTTERFLY m1, m4, m2, m3
+ SPILL 1, 11
+ SPILL 4, 8
+
+ ; pass 1
+ movaps m1, [inq+16]
+ LOAD_INV m6, [inq+96]
+ BUTTERFLY m1, m6, [ps_cos_vec+16], m3
+
+ movaps m4, [inq+80]
+ LOAD_INV m5, [inq+32]
+ BUTTERFLY m4, m5, [ps_cos_vec+48], m3
+
+ ; pass 2
+ BUTTERFLY m0, m7, m2, m3
+
+ movaps m2, [ps_cos_vec+80]
+ BUTTERFLY m6, m5, m2, m3
+
+ BUTTERFLY m1, m4, m2, m3
+
+ ; pass 3
+ movaps m2, [ps_cos_vec+96]
+ shufps m1, m1, 0x1b
+ BUTTERFLY m0, m1, m2, m3
+ SPILL 0, 15
+ SPILL 1, 14
+
+ UNSPILL 0, 8
+ shufps m5, m5, 0x1b
+ BUTTERFLY m0, m5, m2, m3
+
+ UNSPILL 1, 11
+ shufps m6, m6, 0x1b
+ BUTTERFLY m1, m6, m2, m3
+ SPILL 1, 11
+
+ shufps m4, m4, 0x1b
+ BUTTERFLY m7, m4, m2, m3
+
+ ; pass 4
+ movaps m3, [ps_p1p1m1m1+0]
+ movaps m2, [ps_cos_vec+128]
+
+ BUTTERFLY2 m5, m3, m2, m1
+
+ BUTTERFLY2 m0, m3, m2, m1
+ SPILL 0, 9
+
+ BUTTERFLY2 m6, m3, m2, m1
+ SPILL 6, 10
+
+ UNSPILL 0, 11
+ BUTTERFLY2 m0, m3, m2, m1
+ SPILL 0, 11
+
+ BUTTERFLY2 m4, m3, m2, m1
+
+ BUTTERFLY2 m7, m3, m2, m1
+
+ UNSPILL 6, 14
+ BUTTERFLY2 m6, m3, m2, m1
+
+ UNSPILL 0, 15
+ BUTTERFLY2 m0, m3, m2, m1
+
+ PASS5
+ PASS6
+ RET
+%endmacro
+
+%macro LOAD_INV 2
+%if cpuflag(sse2)
+ pshufd %1, %2, 0x1b
+%elif cpuflag(sse)
+ movaps %1, %2
+ shufps %1, %1, 0x1b
+%endif
+%endmacro
+
+INIT_XMM sse
+DCT32_FUNC
+INIT_XMM sse2
+DCT32_FUNC
diff --git a/ffmpeg/libavcodec/x86/deinterlace.asm b/ffmpeg/libavcodec/x86/deinterlace.asm
new file mode 100644
index 0000000..3812dbe
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/deinterlace.asm
@@ -0,0 +1,82 @@
+;******************************************************************************
+;* MMX optimized deinterlacing functions
+;* Copyright (c) 2010 Vitor Sessak
+;* Copyright (c) 2002 Michael Niedermayer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pw_4
+
+SECTION .text
+
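+; Each output pixel is (4*(lum_m3+lum_m1) + 2*lum_m2 - (lum_m4+lum) + 4) >> 3,
+; with psubusw clamping the intermediate at 0 and packuswb clamping the result
+; to 255; four pixels are processed per iteration.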
+%macro DEINTERLACE 1
+%ifidn %1, inplace
+;void ff_deinterlace_line_inplace_mmx(const uint8_t *lum_m4, const uint8_t *lum_m3, const uint8_t *lum_m2, const uint8_t *lum_m1, const uint8_t *lum, int size)
+cglobal deinterlace_line_inplace_mmx, 6,6,7, lum_m4, lum_m3, lum_m2, lum_m1, lum, size
+%else
+;void ff_deinterlace_line_mmx(uint8_t *dst, const uint8_t *lum_m4, const uint8_t *lum_m3, const uint8_t *lum_m2, const uint8_t *lum_m1, const uint8_t *lum, int size)
+cglobal deinterlace_line_mmx, 7,7,7, dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size
+%endif
+ pxor mm7, mm7
+ movq mm6, [pw_4]
+.nextrow:
+ movd mm0, [lum_m4q]
+ movd mm1, [lum_m3q]
+ movd mm2, [lum_m2q]
+%ifidn %1, inplace
+ movd [lum_m4q], mm2
+%endif
+ movd mm3, [lum_m1q]
+ movd mm4, [lumq]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpcklbw mm2, mm7
+ punpcklbw mm3, mm7
+ punpcklbw mm4, mm7
+ paddw mm1, mm3
+ psllw mm2, 1
+ paddw mm0, mm4
+ psllw mm1, 2
+ paddw mm2, mm6
+ paddw mm1, mm2
+ psubusw mm1, mm0
+ psrlw mm1, 3
+ packuswb mm1, mm7
+%ifidn %1, inplace
+ movd [lum_m2q], mm1
+%else
+ movd [dstq], mm1
+ add dstq, 4
+%endif
+ add lum_m4q, 4
+ add lum_m3q, 4
+ add lum_m2q, 4
+ add lum_m1q, 4
+ add lumq, 4
+ sub sized, 4
+ jg .nextrow
+ REP_RET
+%endmacro
+
+DEINTERLACE ""
+
+DEINTERLACE inplace
diff --git a/ffmpeg/libavcodec/x86/dirac_dwt.c b/ffmpeg/libavcodec/x86/dirac_dwt.c
new file mode 100644
index 0000000..fbb25a4
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/dirac_dwt.c
@@ -0,0 +1,202 @@
+/*
+ * MMX optimized discrete wavelet transform
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/asm.h"
+#include "dsputil_mmx.h"
+#include "dirac_dwt.h"
+
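+/* The yasm kernels only handle the SIMD-aligned part of each row; the wrappers
+ * below finish the remaining elements with the scalar COMPOSE_* macros. */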
+#define COMPOSE_VERTICAL(ext, align) \
+void ff_vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
+void ff_vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
+void ff_vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
+void ff_vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
+void ff_vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width); \
+void ff_horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
+void ff_horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
+\
+static void vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+\
+ for(i=width_align; i<width; i++) \
+ b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
+\
+ ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+\
+ for(i=width_align; i<width; i++) \
+ b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
+\
+ ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
+ IDWTELEM *b3, IDWTELEM *b4, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+\
+ for(i=width_align; i<width; i++) \
+ b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+ ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+\
+static void vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
+ IDWTELEM *b3, IDWTELEM *b4, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+\
+ for(i=width_align; i<width; i++) \
+ b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+ ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+static void vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+\
+ for(i=width_align; i<width; i++) { \
+ b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
+ b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
+ } \
+\
+ ff_vertical_compose_haar##ext(b0, b1, width_align); \
+} \
+static void horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
+{\
+ int w2= w>>1;\
+ int x= w2 - (w2&(align-1));\
+ ff_horizontal_compose_haar0i##ext(b, tmp, w);\
+\
+ for (; x < w2; x++) {\
+ b[2*x ] = tmp[x];\
+ b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
+ }\
+}\
+static void horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
+{\
+ int w2= w>>1;\
+ int x= w2 - (w2&(align-1));\
+ ff_horizontal_compose_haar1i##ext(b, tmp, w);\
+\
+ for (; x < w2; x++) {\
+ b[2*x ] = (tmp[x] + 1)>>1;\
+ b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
+ }\
+}\
+\
+
+#if HAVE_YASM
+#if !ARCH_X86_64
+COMPOSE_VERTICAL(_mmx, 4)
+#endif
+COMPOSE_VERTICAL(_sse2, 8)
+
+
+void ff_horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w);
+
+static void horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w)
+{
+ int w2= w>>1;
+ int x= w2 - (w2&7);
+ ff_horizontal_compose_dd97i_ssse3(b, tmp, w);
+
+ for (; x < w2; x++) {
+ b[2*x ] = (tmp[x] + 1)>>1;
+ b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
+ }
+}
+#endif
+
+void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type)
+{
+#if HAVE_YASM
+ int mm_flags = av_get_cpu_flags();
+
+#if !ARCH_X86_64
+ if (!(mm_flags & AV_CPU_FLAG_MMX))
+ return;
+
+ switch (type) {
+ case DWT_DIRAC_DD9_7:
+ d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+ d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+ break;
+ case DWT_DIRAC_LEGALL5_3:
+ d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+ d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
+ break;
+ case DWT_DIRAC_DD13_7:
+ d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
+ d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+ break;
+ case DWT_DIRAC_HAAR0:
+ d->vertical_compose = (void*)vertical_compose_haar_mmx;
+ d->horizontal_compose = horizontal_compose_haar0i_mmx;
+ break;
+ case DWT_DIRAC_HAAR1:
+ d->vertical_compose = (void*)vertical_compose_haar_mmx;
+ d->horizontal_compose = horizontal_compose_haar1i_mmx;
+ break;
+ }
+#endif
+
+ if (!(mm_flags & AV_CPU_FLAG_SSE2))
+ return;
+
+ switch (type) {
+ case DWT_DIRAC_DD9_7:
+ d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+ d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+ break;
+ case DWT_DIRAC_LEGALL5_3:
+ d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+ d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
+ break;
+ case DWT_DIRAC_DD13_7:
+ d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
+ d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+ break;
+ case DWT_DIRAC_HAAR0:
+ d->vertical_compose = (void*)vertical_compose_haar_sse2;
+ d->horizontal_compose = horizontal_compose_haar0i_sse2;
+ break;
+ case DWT_DIRAC_HAAR1:
+ d->vertical_compose = (void*)vertical_compose_haar_sse2;
+ d->horizontal_compose = horizontal_compose_haar1i_sse2;
+ break;
+ }
+
+ if (!(mm_flags & AV_CPU_FLAG_SSSE3))
+ return;
+
+ switch (type) {
+ case DWT_DIRAC_DD9_7:
+ d->horizontal_compose = horizontal_compose_dd97i_ssse3;
+ break;
+ }
+#endif // HAVE_YASM
+}
diff --git a/ffmpeg/libavcodec/x86/dirac_dwt.h b/ffmpeg/libavcodec/x86/dirac_dwt.h
new file mode 100644
index 0000000..126b290
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/dirac_dwt.h
@@ -0,0 +1,30 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_DIRAC_DWT_H
+#define AVCODEC_X86_DIRAC_DWT_H
+
+#include "libavcodec/dirac_dwt.h"
+
+void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
+void ff_horizontal_compose_haar1i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
+void ff_horizontal_compose_haar0i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
+
+void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type);
+
+#endif
diff --git a/ffmpeg/libavcodec/x86/diracdsp_mmx.c b/ffmpeg/libavcodec/x86/diracdsp_mmx.c
new file mode 100644
index 0000000..cb6465f
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/diracdsp_mmx.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "dsputil_mmx.h"
+#include "diracdsp_mmx.h"
+
+void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+
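+/* dirac_hpel_filter computes the three half-pel planes per row: dsth from the
+ * horizontal filter on src, dstv from the vertical filter on src, and dstc by
+ * running the horizontal filter over the vertical output. */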
+#define HPEL_FILTER(MMSIZE, EXT) \
+ void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int); \
+ void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int); \
+ \
+ static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
+ const uint8_t *src, int stride, int width, int height) \
+ { \
+ while( height-- ) \
+ { \
+ ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
+ ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width); \
+ ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width); \
+ \
+ dsth += stride; \
+ dstv += stride; \
+ dstc += stride; \
+ src += stride; \
+ } \
+ }
+
+#if !ARCH_X86_64
+HPEL_FILTER(8, mmx)
+#endif
+HPEL_FILTER(16, sse2)
+
+#define PIXFUNC(PFX, IDX, EXT) \
+ /*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/ \
+ c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
+ c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT
+
+void ff_diracdsp_init_mmx(DiracDSPContext* c)
+{
+ int mm_flags = av_get_cpu_flags();
+
+ if (!(mm_flags & AV_CPU_FLAG_MMX))
+ return;
+
+#if HAVE_YASM
+ c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
+#if !ARCH_X86_64
+ c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx;
+ c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx;
+ c->dirac_hpel_filter = dirac_hpel_filter_mmx;
+ c->add_rect_clamped = ff_add_rect_clamped_mmx;
+ c->put_signed_rect_clamped = ff_put_signed_rect_clamped_mmx;
+#endif
+#endif
+
+#if HAVE_MMX_INLINE
+ PIXFUNC(put, 0, mmx);
+ PIXFUNC(avg, 0, mmx);
+#endif
+
+#if HAVE_MMXEXT_INLINE
+ if (mm_flags & AV_CPU_FLAG_MMX2) {
+ PIXFUNC(avg, 0, mmxext);
+ }
+#endif
+
+ if (mm_flags & AV_CPU_FLAG_SSE2) {
+#if HAVE_YASM
+ c->dirac_hpel_filter = dirac_hpel_filter_sse2;
+ c->add_rect_clamped = ff_add_rect_clamped_sse2;
+ c->put_signed_rect_clamped = ff_put_signed_rect_clamped_sse2;
+
+ c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
+ c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
+#endif
+#if HAVE_SSE2_INLINE
+ c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2;
+ c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2;
+ c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
+ c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
+#endif
+ }
+}
diff --git a/ffmpeg/libavcodec/x86/diracdsp_mmx.h b/ffmpeg/libavcodec/x86/diracdsp_mmx.h
new file mode 100644
index 0000000..8985854
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/diracdsp_mmx.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_DIRACDSP_H
+#define AVCODEC_X86_DIRACDSP_H
+
+#include "libavcodec/diracdsp.h"
+
+void ff_diracdsp_init_mmx(DiracDSPContext* c);
+
+DECL_DIRAC_PIXOP(put, mmx);
+DECL_DIRAC_PIXOP(avg, mmx);
+DECL_DIRAC_PIXOP(avg, mmxext);
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+
+void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+
+void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+#endif
diff --git a/ffmpeg/libavcodec/x86/diracdsp_yasm.asm b/ffmpeg/libavcodec/x86/diracdsp_yasm.asm
new file mode 100644
index 0000000..3e9765b
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/diracdsp_yasm.asm
@@ -0,0 +1,264 @@
+;******************************************************************************
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_3: times 8 dw 3
+pw_7: times 8 dw 7
+pw_16: times 8 dw 16
+pw_32: times 8 dw 32
+pb_128: times 16 db 128
+
+section .text
+
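+; UNPACK_ADD: %1/%2 receive the low/high 16-bit sums of the byte vectors at %3
+; and %4 (m7 must be zero); %5/%6 select aligned (a) or unaligned (u) loads.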
+%macro UNPACK_ADD 6
+ mov%5 %1, %3
+ mov%6 m5, %4
+ mova m4, %1
+ mova %2, m5
+ punpcklbw %1, m7
+ punpcklbw m5, m7
+ punpckhbw m4, m7
+ punpckhbw %2, m7
+ paddw %1, m5
+ paddw %2, m4
+%endmacro
+
+%macro HPEL_FILTER 1
+; dirac_hpel_filter_v_%1(uint8_t *dst, uint8_t *src, int stride, int width);
+cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
+ mov src0q, srcq
+ lea stridex3q, [3*strideq]
+ sub src0q, stridex3q
+ pxor m7, m7
+.loop:
+ ; 7*(src[0] + src[1])
+ UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
+ pmullw m0, [pw_7]
+ pmullw m1, [pw_7]
+
+ ; 3*( ... + src[-2] + src[3])
+ UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
+ paddw m0, m2
+ paddw m1, m3
+ pmullw m0, [pw_3]
+ pmullw m1, [pw_3]
+
+ ; ... - 7*(src[-1] + src[2])
+ UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
+ pmullw m2, [pw_7]
+ pmullw m3, [pw_7]
+ psubw m0, m2
+ psubw m1, m3
+
+ ; ... - (src[-3] + src[4])
+ UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
+ psubw m0, m2
+ psubw m1, m3
+
+ paddw m0, [pw_16]
+ paddw m1, [pw_16]
+ psraw m0, 5
+ psraw m1, 5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, mmsize
+ add srcq, mmsize
+ add src0q, mmsize
+ sub widthd, mmsize
+ jg .loop
+ RET
+
+; dirac_hpel_filter_h_%1(uint8_t *dst, uint8_t *src, int width);
+cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
+ dec widthd
+ pxor m7, m7
+ and widthd, ~(mmsize-1)
+.loop:
+ ; 7*(src[0] + src[1])
+ UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
+ pmullw m0, [pw_7]
+ pmullw m1, [pw_7]
+
+ ; 3*( ... + src[-2] + src[3])
+ UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
+ paddw m0, m2
+ paddw m1, m3
+ pmullw m0, [pw_3]
+ pmullw m1, [pw_3]
+
+ ; ... - 7*(src[-1] + src[2])
+ UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
+ pmullw m2, [pw_7]
+ pmullw m3, [pw_7]
+ psubw m0, m2
+ psubw m1, m3
+
+ ; ... - (src[-3] + src[4])
+ UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
+ psubw m0, m2
+ psubw m1, m3
+
+ paddw m0, [pw_16]
+ paddw m1, [pw_16]
+ psraw m0, 5
+ psraw m1, 5
+ packuswb m0, m1
+ mova [dstq + widthq], m0
+ sub widthd, mmsize
+ jge .loop
+ RET
+%endmacro
+
+%macro PUT_RECT 1
+; void put_signed_rect_clamped(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height)
+cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
+ mova m0, [pb_128]
+ add wd, (mmsize-1)
+ and wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+ movsxd dst_strideq, dst_strided
+ movsxd src_strideq, src_strided
+ mov r7d, r5m
+ mov r8d, wd
+ %define wspill r8d
+ %define hd r7d
+%else
+ mov r4m, wd
+ %define wspill r4m
+ %define hd r5mp
+%endif
+
+.loopy:
+ lea src2q, [srcq+src_strideq*2]
+ lea dst2q, [dstq+dst_strideq]
+.loopx:
+ sub wd, mmsize
+ mova m1, [srcq +2*wq]
+ mova m2, [src2q+2*wq]
+ packsswb m1, [srcq +2*wq+mmsize]
+ packsswb m2, [src2q+2*wq+mmsize]
+ paddb m1, m0
+ paddb m2, m0
+ mova [dstq +wq], m1
+ mova [dst2q+wq], m2
+ jg .loopx
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*2]
+ sub hd, 2
+ mov wd, wspill
+ jg .loopy
+ RET
+%endm
+
+%macro ADD_RECT 1
+; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
+cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
+ mova m0, [pw_32]
+ add wd, (mmsize-1)
+ and wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+ movsxd strideq, strided
+ movsxd idwt_strideq, idwt_strided
+ mov r8d, wd
+ %define wspill r8d
+%else
+ mov r5m, wd
+ %define wspill r5m
+%endif
+
+.loop:
+ sub wd, mmsize
+ movu m1, [srcq +2*wq] ; FIXME: ensure alignment
+ paddw m1, m0
+ psraw m1, 6
+ movu m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
+ paddw m2, m0
+ psraw m2, 6
+ paddw m1, [idwtq+2*wq]
+ paddw m2, [idwtq+2*wq+mmsize]
+ packuswb m1, m2
+ mova [dstq +wq], m1
+ jg .loop
+
+ lea srcq, [srcq + 2*strideq]
+ add dstq, strideq
+ lea idwtq, [idwtq+ 2*idwt_strideq]
+ sub hd, 1
+ mov wd, wspill
+ jg .loop
+ RET
+%endm
+
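+; dst[i] += src[i] * obmc_weight[i] with bytes widened to words; the weight rows
+; are 32 bytes apart regardless of block width (hence the fixed add obmcq, 32).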
+%macro ADD_OBMC 2
+; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
+cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
+ pxor m4, m4
+.loop:
+%assign i 0
+%rep %1 / mmsize
+ mova m0, [srcq+i]
+ mova m1, m0
+ punpcklbw m0, m4
+ punpckhbw m1, m4
+ mova m2, [obmcq+i]
+ mova m3, m2
+ punpcklbw m2, m4
+ punpckhbw m3, m4
+ pmullw m0, m2
+ pmullw m1, m3
+ movu m2, [dstq+2*i]
+ movu m3, [dstq+2*i+mmsize]
+ paddw m0, m2
+ paddw m1, m3
+ movu [dstq+2*i], m0
+ movu [dstq+2*i+mmsize], m1
+%assign i i+mmsize
+%endrep
+ lea srcq, [srcq+strideq]
+ lea dstq, [dstq+2*strideq]
+ add obmcq, 32
+ sub yblend, 1
+ jg .loop
+ RET
+%endm
+
+INIT_MMX
+%if ARCH_X86_64 == 0
+PUT_RECT mmx
+ADD_RECT mmx
+
+HPEL_FILTER mmx
+ADD_OBMC 32, mmx
+ADD_OBMC 16, mmx
+%endif
+ADD_OBMC 8, mmx
+
+INIT_XMM
+PUT_RECT sse2
+ADD_RECT sse2
+
+HPEL_FILTER sse2
+ADD_OBMC 32, sse2
+ADD_OBMC 16, sse2
diff --git a/ffmpeg/libavcodec/x86/dnxhdenc.c b/ffmpeg/libavcodec/x86/dnxhdenc.c
new file mode 100644
index 0000000..349fbb0
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/dnxhdenc.c
@@ -0,0 +1,66 @@
+/*
+ * VC3/DNxHD SIMD functions
+ * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
+ *
+ * VC-3 encoder funded by the British Broadcasting Corporation
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/asm.h"
+#include "libavcodec/dnxhdenc.h"
+
+#if HAVE_SSE2_INLINE
+
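+/* Loads four rows of 8 pixels, zero-extends them to 16 bits and stores them
+ * twice, the second time in reverse row order, yielding a vertically mirrored
+ * 8x4 block. */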
+static void get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels, int line_size)
+{
+ __asm__ volatile(
+ "pxor %%xmm5, %%xmm5 \n\t"
+ "movq (%0), %%xmm0 \n\t"
+ "add %2, %0 \n\t"
+ "movq (%0), %%xmm1 \n\t"
+ "movq (%0, %2), %%xmm2 \n\t"
+ "movq (%0, %2,2), %%xmm3 \n\t"
+ "punpcklbw %%xmm5, %%xmm0 \n\t"
+ "punpcklbw %%xmm5, %%xmm1 \n\t"
+ "punpcklbw %%xmm5, %%xmm2 \n\t"
+ "punpcklbw %%xmm5, %%xmm3 \n\t"
+ "movdqa %%xmm0, (%1) \n\t"
+ "movdqa %%xmm1, 16(%1) \n\t"
+ "movdqa %%xmm2, 32(%1) \n\t"
+ "movdqa %%xmm3, 48(%1) \n\t"
+ "movdqa %%xmm3 , 64(%1) \n\t"
+ "movdqa %%xmm2 , 80(%1) \n\t"
+ "movdqa %%xmm1 , 96(%1) \n\t"
+ "movdqa %%xmm0, 112(%1) \n\t"
+ : "+r" (pixels)
+ : "r" (block), "r" ((x86_reg)line_size)
+ );
+}
+
+#endif /* HAVE_SSE2_INLINE */
+
+av_cold void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx)
+{
+#if HAVE_SSE2_INLINE
+ if (av_get_cpu_flags() & AV_CPU_FLAG_SSE2) {
+ if (ctx->cid_table->bit_depth == 8)
+ ctx->get_pixels_8x4_sym = get_pixels_8x4_sym_sse2;
+ }
+#endif /* HAVE_SSE2_INLINE */
+}
diff --git a/ffmpeg/libavcodec/x86/dsputil.asm b/ffmpeg/libavcodec/x86/dsputil.asm
new file mode 100644
index 0000000..9970c02
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/dsputil.asm
@@ -0,0 +1,652 @@
+;******************************************************************************
+;* MMX optimized DSP utils
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pb_f: times 16 db 15
+pb_zzzzzzzz77777777: times 8 db -1
+pb_7: times 8 db 7
+pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
+pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
+pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
+pd_16384: times 4 dd 16384
+pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+SECTION_TEXT
+
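+; Both routines index the vectors with a negative offset that counts up to zero,
+; accumulating two pmaddwd partial sums per iteration and reducing them
+; horizontally once the loop ends.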
+%macro SCALARPRODUCT 0
+; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
+cglobal scalarproduct_int16, 3,3,3, v1, v2, order
+ shl orderq, 1
+ add v1q, orderq
+ add v2q, orderq
+ neg orderq
+ pxor m2, m2
+.loop:
+ movu m0, [v1q + orderq]
+ movu m1, [v1q + orderq + mmsize]
+ pmaddwd m0, [v2q + orderq]
+ pmaddwd m1, [v2q + orderq + mmsize]
+ paddd m2, m0
+ paddd m2, m1
+ add orderq, mmsize*2
+ jl .loop
+%if mmsize == 16
+ movhlps m0, m2
+ paddd m2, m0
+ pshuflw m0, m2, 0x4e
+%else
+ pshufw m0, m2, 0x4e
+%endif
+ paddd m2, m0
+ movd eax, m2
+ RET
+
+; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
+ shl orderq, 1
+ movd m7, mulm
+%if mmsize == 16
+ pshuflw m7, m7, 0
+ punpcklqdq m7, m7
+%else
+ pshufw m7, m7, 0
+%endif
+ pxor m6, m6
+ add v1q, orderq
+ add v2q, orderq
+ add v3q, orderq
+ neg orderq
+.loop:
+ movu m0, [v2q + orderq]
+ movu m1, [v2q + orderq + mmsize]
+ mova m4, [v1q + orderq]
+ mova m5, [v1q + orderq + mmsize]
+ movu m2, [v3q + orderq]
+ movu m3, [v3q + orderq + mmsize]
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ pmullw m2, m7
+ pmullw m3, m7
+ paddd m6, m0
+ paddd m6, m1
+ paddw m2, m4
+ paddw m3, m5
+ mova [v1q + orderq], m2
+ mova [v1q + orderq + mmsize], m3
+ add orderq, mmsize*2
+ jl .loop
+%if mmsize == 16
+ movhlps m0, m6
+ paddd m6, m0
+ pshuflw m0, m6, 0x4e
+%else
+ pshufw m0, m6, 0x4e
+%endif
+ paddd m6, m0
+ movd eax, m6
+ RET
+%endmacro
+
+INIT_MMX mmxext
+SCALARPRODUCT
+INIT_XMM sse2
+SCALARPRODUCT
+
+%macro SCALARPRODUCT_LOOP 1
+align 16
+.loop%1:
+ sub orderq, mmsize*2
+%if %1
+ mova m1, m4
+ mova m4, [v2q + orderq]
+ mova m0, [v2q + orderq + mmsize]
+ palignr m1, m0, %1
+ palignr m0, m4, %1
+ mova m3, m5
+ mova m5, [v3q + orderq]
+ mova m2, [v3q + orderq + mmsize]
+ palignr m3, m2, %1
+ palignr m2, m5, %1
+%else
+ mova m0, [v2q + orderq]
+ mova m1, [v2q + orderq + mmsize]
+ mova m2, [v3q + orderq]
+ mova m3, [v3q + orderq + mmsize]
+%endif
+ %define t0 [v1q + orderq]
+ %define t1 [v1q + orderq + mmsize]
+%if ARCH_X86_64
+ mova m8, t0
+ mova m9, t1
+ %define t0 m8
+ %define t1 m9
+%endif
+ pmaddwd m0, t0
+ pmaddwd m1, t1
+ pmullw m2, m7
+ pmullw m3, m7
+ paddw m2, t0
+ paddw m3, t1
+ paddd m6, m0
+ paddd m6, m1
+ mova [v1q + orderq], m2
+ mova [v1q + orderq + mmsize], m3
+ jg .loop%1
+%if %1
+ jmp .end
+%endif
+%endmacro
+
+; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+INIT_XMM ssse3
+cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
+ shl orderq, 1
+ movd m7, mulm
+ pshuflw m7, m7, 0
+ punpcklqdq m7, m7
+ pxor m6, m6
+ mov r4d, v2d
+ and r4d, 15
+ and v2q, ~15
+ and v3q, ~15
+ mova m4, [v2q + orderq]
+ mova m5, [v3q + orderq]
+ ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
+ cmp r4d, 0
+ je .loop0
+ cmp r4d, 2
+ je .loop2
+ cmp r4d, 4
+ je .loop4
+ cmp r4d, 6
+ je .loop6
+ cmp r4d, 8
+ je .loop8
+ cmp r4d, 10
+ je .loop10
+ cmp r4d, 12
+ je .loop12
+SCALARPRODUCT_LOOP 14
+SCALARPRODUCT_LOOP 12
+SCALARPRODUCT_LOOP 10
+SCALARPRODUCT_LOOP 8
+SCALARPRODUCT_LOOP 6
+SCALARPRODUCT_LOOP 4
+SCALARPRODUCT_LOOP 2
+SCALARPRODUCT_LOOP 0
+.end:
+ movhlps m0, m6
+ paddd m6, m0
+ pshuflw m0, m6, 0x4e
+ paddd m6, m0
+ movd eax, m6
+ RET
+
+
+;-----------------------------------------------------------------------------
+; void ff_apply_window_int16(int16_t *output, const int16_t *input,
+; const int16_t *window, unsigned int len)
+;-----------------------------------------------------------------------------
+
+%macro REVERSE_WORDS 1-2
+%if cpuflag(ssse3) && notcpuflag(atom)
+ pshufb %1, %2
+%elif cpuflag(sse2)
+ pshuflw %1, %1, 0x1B
+ pshufhw %1, %1, 0x1B
+ pshufd %1, %1, 0x4E
+%elif cpuflag(mmxext)
+ pshufw %1, %1, 0x1B
+%endif
+%endmacro
+
+%macro MUL16FIXED 3
+%if cpuflag(ssse3) ; dst, src, unused
+; dst = ((dst * src) + (1<<14)) >> 15
+ pmulhrsw %1, %2
+%elif cpuflag(mmxext) ; dst, src, temp
+; dst = (dst * src) >> 15
+; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
+; in from the pmullw result.
+ mova %3, %1
+ pmulhw %1, %2
+ pmullw %3, %2
+ psrlw %3, 15
+ psllw %1, 1
+ por %1, %3
+%endif
+%endmacro
+
+%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
+%if %1
+cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
+%else
+cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
+%endif
+ lea offset2q, [offsetq-mmsize]
+%if cpuflag(ssse3) && notcpuflag(atom)
+ mova m5, [pb_revwords]
+ ALIGN 16
+%elif %1
+ mova m5, [pd_16384]
+%endif
+.loop:
+%if cpuflag(ssse3)
+ ; This version does the 16x16->16 multiplication in-place without expanding
+ ; to 32-bit. The ssse3 version is bit-identical.
+ mova m0, [windowq+offset2q]
+ mova m1, [ inputq+offset2q]
+ pmulhrsw m1, m0
+ REVERSE_WORDS m0, m5
+ pmulhrsw m0, [ inputq+offsetq ]
+ mova [outputq+offset2q], m1
+ mova [outputq+offsetq ], m0
+%elif %1
+ ; This version expands 16-bit to 32-bit, multiplies by the window,
+ ; adds 16384 for rounding, right shifts 15, then repacks back to words to
+ ; save to the output. The window is reversed for the second half.
+ mova m3, [windowq+offset2q]
+ mova m4, [ inputq+offset2q]
+ pxor m0, m0
+ punpcklwd m0, m3
+ punpcklwd m1, m4
+ pmaddwd m0, m1
+ paddd m0, m5
+ psrad m0, 15
+ pxor m2, m2
+ punpckhwd m2, m3
+ punpckhwd m1, m4
+ pmaddwd m2, m1
+ paddd m2, m5
+ psrad m2, 15
+ packssdw m0, m2
+ mova [outputq+offset2q], m0
+ REVERSE_WORDS m3
+ mova m4, [ inputq+offsetq]
+ pxor m0, m0
+ punpcklwd m0, m3
+ punpcklwd m1, m4
+ pmaddwd m0, m1
+ paddd m0, m5
+ psrad m0, 15
+ pxor m2, m2
+ punpckhwd m2, m3
+ punpckhwd m1, m4
+ pmaddwd m2, m1
+ paddd m2, m5
+ psrad m2, 15
+ packssdw m0, m2
+ mova [outputq+offsetq], m0
+%else
+ ; This version does the 16x16->16 multiplication in-place without expanding
+ ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
+ ; therefore are not bit-identical to the C version.
+ mova m0, [windowq+offset2q]
+ mova m1, [ inputq+offset2q]
+ mova m2, [ inputq+offsetq ]
+ MUL16FIXED m1, m0, m3
+ REVERSE_WORDS m0
+ MUL16FIXED m2, m0, m3
+ mova [outputq+offset2q], m1
+ mova [outputq+offsetq ], m2
+%endif
+ add offsetd, mmsize
+ sub offset2d, mmsize
+ jae .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+APPLY_WINDOW_INT16 0
+INIT_XMM sse2
+APPLY_WINDOW_INT16 0
+
+INIT_MMX mmxext
+APPLY_WINDOW_INT16 1
+INIT_XMM sse2
+APPLY_WINDOW_INT16 1
+INIT_XMM ssse3
+APPLY_WINDOW_INT16 1
+INIT_XMM ssse3, atom
+APPLY_WINDOW_INT16 1
+
+
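+; Reconstructs each byte as median(left, top, left + top - topleft) + diff; the
+; serial dependency on the previous output is handled by the %rep 8 unroll.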
+; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
+INIT_MMX mmxext
+cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
+ movq mm0, [topq]
+ movq mm2, mm0
+ movd mm4, [left_topq]
+ psllq mm2, 8
+ movq mm1, mm0
+ por mm4, mm2
+ movd mm3, [leftq]
+ psubb mm0, mm4 ; t-tl
+ add dstq, wq
+ add topq, wq
+ add diffq, wq
+ neg wq
+ jmp .skip
+.loop:
+ movq mm4, [topq+wq]
+ movq mm0, mm4
+ psllq mm4, 8
+ por mm4, mm1
+ movq mm1, mm0 ; t
+ psubb mm0, mm4 ; t-tl
+.skip:
+ movq mm2, [diffq+wq]
+%assign i 0
+%rep 8
+ movq mm4, mm0
+ paddb mm4, mm3 ; t-tl+l
+ movq mm5, mm3
+ pmaxub mm3, mm1
+ pminub mm5, mm1
+ pminub mm3, mm4
+ pmaxub mm3, mm5 ; median
+ paddb mm3, mm2 ; +residual
+%if i==0
+ movq mm7, mm3
+ psllq mm7, 56
+%else
+ movq mm6, mm3
+ psrlq mm7, 8
+ psllq mm6, 56
+ por mm7, mm6
+%endif
+%if i<7
+ psrlq mm0, 8
+ psrlq mm1, 8
+ psrlq mm2, 8
+%endif
+%assign i i+1
+%endrep
+ movq [dstq+wq], mm7
+ add wq, 8
+ jl .loop
+ movzx r2d, byte [dstq-1]
+ mov [leftq], r2d
+ movzx r2d, byte [topq-1]
+ mov [left_topq], r2d
+ RET
+
+
+%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
+ add srcq, wq
+ add dstq, wq
+ neg wq
+%%.loop:
+%if %2
+ mova m1, [srcq+wq]
+%else
+ movu m1, [srcq+wq]
+%endif
+ mova m2, m1
+ psllw m1, 8
+ paddb m1, m2
+ mova m2, m1
+ pshufb m1, m3
+ paddb m1, m2
+ pshufb m0, m5
+ mova m2, m1
+ pshufb m1, m4
+ paddb m1, m2
+%if mmsize == 16
+ mova m2, m1
+ pshufb m1, m6
+ paddb m1, m2
+%endif
+ paddb m0, m1
+%if %1
+ mova [dstq+wq], m0
+%else
+ movq [dstq+wq], m0
+ movhps [dstq+wq+8], m0
+%endif
+ add wq, mmsize
+ jl %%.loop
+ mov eax, mmsize-1
+ sub eax, wd
+ movd m1, eax
+ pshufb m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
+INIT_MMX ssse3
+cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
+.skip_prologue:
+ mova m5, [pb_7]
+ mova m4, [pb_zzzz3333zzzzbbbb]
+ mova m3, [pb_zz11zz55zz99zzdd]
+ movd m0, leftm
+ psllq m0, 56
+ ADD_HFYU_LEFT_LOOP 1, 1
+
+INIT_XMM sse4
+cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
+ mova m5, [pb_f]
+ mova m6, [pb_zzzzzzzz77777777]
+ mova m4, [pb_zzzz3333zzzzbbbb]
+ mova m3, [pb_zz11zz55zz99zzdd]
+ movd m0, leftm
+ pslldq m0, 15
+ test srcq, 15
+ jnz .src_unaligned
+ test dstq, 15
+ jnz .dst_unaligned
+ ADD_HFYU_LEFT_LOOP 1, 1
+.dst_unaligned:
+ ADD_HFYU_LEFT_LOOP 0, 1
+.src_unaligned:
+ ADD_HFYU_LEFT_LOOP 0, 0
+
+;-----------------------------------------------------------------------------
+; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
+; int32_t max, unsigned int len)
+;-----------------------------------------------------------------------------
+
+; %1 = number of xmm registers used
+; %2 = number of inline load/process/store loops per asm loop
+; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
+; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
+; %5 = suffix
+%macro VECTOR_CLIP_INT32 4-5
+cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
+%if %4
+ cvtsi2ss m4, minm
+ cvtsi2ss m5, maxm
+%else
+ movd m4, minm
+ movd m5, maxm
+%endif
+ SPLATD m4
+ SPLATD m5
+.loop:
+%assign %%i 1
+%rep %2
+ mova m0, [srcq+mmsize*0*%%i]
+ mova m1, [srcq+mmsize*1*%%i]
+ mova m2, [srcq+mmsize*2*%%i]
+ mova m3, [srcq+mmsize*3*%%i]
+%if %3
+ mova m7, [srcq+mmsize*4*%%i]
+ mova m8, [srcq+mmsize*5*%%i]
+ mova m9, [srcq+mmsize*6*%%i]
+ mova m10, [srcq+mmsize*7*%%i]
+%endif
+ CLIPD m0, m4, m5, m6
+ CLIPD m1, m4, m5, m6
+ CLIPD m2, m4, m5, m6
+ CLIPD m3, m4, m5, m6
+%if %3
+ CLIPD m7, m4, m5, m6
+ CLIPD m8, m4, m5, m6
+ CLIPD m9, m4, m5, m6
+ CLIPD m10, m4, m5, m6
+%endif
+ mova [dstq+mmsize*0*%%i], m0
+ mova [dstq+mmsize*1*%%i], m1
+ mova [dstq+mmsize*2*%%i], m2
+ mova [dstq+mmsize*3*%%i], m3
+%if %3
+ mova [dstq+mmsize*4*%%i], m7
+ mova [dstq+mmsize*5*%%i], m8
+ mova [dstq+mmsize*6*%%i], m9
+ mova [dstq+mmsize*7*%%i], m10
+%endif
+%assign %%i %%i+1
+%endrep
+ add srcq, mmsize*4*(%2+%3)
+ add dstq, mmsize*4*(%2+%3)
+ sub lend, mmsize*(%2+%3)
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+%define CLIPD CLIPD_MMX
+VECTOR_CLIP_INT32 0, 1, 0, 0
+INIT_XMM sse2
+VECTOR_CLIP_INT32 6, 1, 0, 0, _int
+%define CLIPD CLIPD_SSE2
+VECTOR_CLIP_INT32 6, 2, 0, 1
+INIT_XMM sse4
+%define CLIPD CLIPD_SSE41
+%ifdef m8
+VECTOR_CLIP_INT32 11, 1, 1, 0
+%else
+VECTOR_CLIP_INT32 6, 1, 0, 0
+%endif
+
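+; Without SSSE3 pshufb, the 32-bit byte swap is built from a word swap
+; (pshuflw/pshufhw with 10110001b) followed by a byte swap within each word
+; (psllw/psrlw/por).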
+; %1 = aligned/unaligned
+%macro BSWAP_LOOPS 1
+ mov r3, r2
+ sar r2, 3
+ jz .left4_%1
+.loop8_%1:
+ mov%1 m0, [r1 + 0]
+ mov%1 m1, [r1 + 16]
+%if cpuflag(ssse3)
+ pshufb m0, m2
+ pshufb m1, m2
+ mova [r0 + 0], m0
+ mova [r0 + 16], m1
+%else
+ pshuflw m0, m0, 10110001b
+ pshuflw m1, m1, 10110001b
+ pshufhw m0, m0, 10110001b
+ pshufhw m1, m1, 10110001b
+ mova m2, m0
+ mova m3, m1
+ psllw m0, 8
+ psllw m1, 8
+ psrlw m2, 8
+ psrlw m3, 8
+ por m2, m0
+ por m3, m1
+ mova [r0 + 0], m2
+ mova [r0 + 16], m3
+%endif
+ add r0, 32
+ add r1, 32
+ dec r2
+ jnz .loop8_%1
+.left4_%1:
+ mov r2, r3
+ and r3, 4
+ jz .left
+ mov%1 m0, [r1]
+%if cpuflag(ssse3)
+ pshufb m0, m2
+ mova [r0], m0
+%else
+ pshuflw m0, m0, 10110001b
+ pshufhw m0, m0, 10110001b
+ mova m2, m0
+ psllw m0, 8
+ psrlw m2, 8
+ por m2, m0
+ mova [r0], m2
+%endif
+ add r1, 16
+ add r0, 16
+%endmacro
+
+; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
+%macro BSWAP32_BUF 0
+%if cpuflag(ssse3)
+cglobal bswap32_buf, 3,4,3
+ mov r3, r1
+ mova m2, [pb_bswap32]
+%else
+cglobal bswap32_buf, 3,4,5
+ mov r3, r1
+%endif
+ and r3, 15
+ jz .start_align
+ BSWAP_LOOPS u
+ jmp .left
+.start_align:
+ BSWAP_LOOPS a
+.left:
+%if cpuflag(ssse3)
+ mov r3, r2
+ and r2, 2
+ jz .left1
+ movq m0, [r1]
+ pshufb m0, m2
+ movq [r0], m0
+ add r1, 8
+ add r0, 8
+.left1:
+ and r3, 1
+ jz .end
+ mov r2d, [r1]
+ bswap r2d
+ mov [r0], r2d
+%else
+ and r2, 3
+ jz .end
+.loop2:
+ mov r3d, [r1]
+ bswap r3d
+ mov [r0], r3d
+ add r1, 4
+ add r0, 4
+ dec r2
+ jnz .loop2
+%endif
+.end:
+ RET
+%endmacro
+
+INIT_XMM sse2
+BSWAP32_BUF
+
+INIT_XMM ssse3
+BSWAP32_BUF
diff --git a/ffmpeg/libavcodec/x86/dsputil_mmx.c b/ffmpeg/libavcodec/x86/dsputil_mmx.c
new file mode 100644
index 0000000..fe59d22
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/dsputil_mmx.c
@@ -0,0 +1,1636 @@
+/*
+ * MMX optimized DSP utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/h264dsp.h"
+#include "libavcodec/mpegvideo.h"
+#include "libavcodec/simple_idct.h"
+#include "libavcodec/videodsp.h"
+#include "dsputil_mmx.h"
+#include "idct_xvid.h"
+#include "diracdsp_mmx.h"
+
+//#undef NDEBUG
+//#include <assert.h>
+
+/* pixel operations */
+DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
+DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
+
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
+
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
+
+DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
+DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
+
+
+#if HAVE_YASM
+void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+ int dstStride, int src1Stride, int h);
+void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
+ uint8_t *src2, int dstStride,
+ int src1Stride, int h);
+void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+ int dstStride, int src1Stride, int h);
+void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+ int dstStride, int src1Stride, int h);
+void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+ int dstStride, int src1Stride, int h);
+void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+ int dstStride, int src1Stride, int h);
+void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+
+void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h)
+{
+ ff_put_pixels8_mmxext(block, pixels, line_size, h);
+ ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
+}
+
+void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride, int h);
+void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride, int h);
+void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride,
+ int h);
+void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride, int h);
+void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride, int h);
+void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride,
+ int h);
+void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride);
+void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride);
+void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride);
+void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride);
+void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride);
+void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride);
+#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
+#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
+#endif /* HAVE_YASM */
+
+
+#if HAVE_INLINE_ASM
+
+#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
+#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
+
+#define MOVQ_BFE(regd) \
+ __asm__ volatile ( \
+ "pcmpeqd %%"#regd", %%"#regd" \n\t" \
+ "paddb %%"#regd", %%"#regd" \n\t" ::)
+
+#ifndef PIC
+#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
+#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
+#else
+// for shared library it's better to use this way for accessing constants
+// pcmpeqd -> -1
+#define MOVQ_BONE(regd) \
+ __asm__ volatile ( \
+ "pcmpeqd %%"#regd", %%"#regd" \n\t" \
+ "psrlw $15, %%"#regd" \n\t" \
+ "packuswb %%"#regd", %%"#regd" \n\t" ::)
+
+#define MOVQ_WTWO(regd) \
+ __asm__ volatile ( \
+ "pcmpeqd %%"#regd", %%"#regd" \n\t" \
+ "psrlw $15, %%"#regd" \n\t" \
+ "psllw $1, %%"#regd" \n\t"::)
+
+#endif
+
+// using regr as temporary and for the output result
+// first argument is unmodified and second is trashed
+// regfe is supposed to contain 0xfefefefefefefefe
+#define PAVGB_MMX(rega, regb, regr, regfe) \
+ "movq "#rega", "#regr" \n\t" \
+ "por "#regb", "#regr" \n\t" \
+ "pxor "#rega", "#regb" \n\t" \
+ "pand "#regfe", "#regb" \n\t" \
+ "psrlq $1, "#regb" \n\t" \
+ "psubb "#regb", "#regr" \n\t"
+
+// mm6 is supposed to contain 0xfefefefefefefefe
+#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
+ "movq "#rega", "#regr" \n\t" \
+ "movq "#regc", "#regp" \n\t" \
+ "por "#regb", "#regr" \n\t" \
+ "por "#regd", "#regp" \n\t" \
+ "pxor "#rega", "#regb" \n\t" \
+ "pxor "#regc", "#regd" \n\t" \
+ "pand %%mm6, "#regb" \n\t" \
+ "pand %%mm6, "#regd" \n\t" \
+ "psrlq $1, "#regd" \n\t" \
+ "psrlq $1, "#regb" \n\t" \
+ "psubb "#regb", "#regr" \n\t" \
+ "psubb "#regd", "#regp" \n\t"
+
+/***********************************/
+/* MMX rounding */
+
+#define DEF(x, y) x ## _ ## y ## _mmx
+#define SET_RND MOVQ_WTWO
+#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
+#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
+#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
+
+#include "dsputil_rnd_template.c"
+
+#undef DEF
+#undef SET_RND
+#undef PAVGBP
+#undef PAVGB
+#undef OP_AVG
+
+#endif /* HAVE_INLINE_ASM */
+
+
+#if HAVE_YASM
+
+/***********************************/
+/* MMXEXT specific */
+
+//FIXME the following could be optimized too ...
+static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h)
+{
+ ff_avg_pixels8_mmxext(block, pixels, line_size, h);
+ ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
+}
+
+#endif /* HAVE_YASM */
+
+
+#if HAVE_INLINE_ASM
+/***********************************/
+/* standard MMX */
+
+void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+ int line_size)
+{
+ const int16_t *p;
+ uint8_t *pix;
+
+ /* read the pixels */
+ p = block;
+ pix = pixels;
+ /* unrolled loop */
+ __asm__ volatile (
+ "movq (%3), %%mm0 \n\t"
+ "movq 8(%3), %%mm1 \n\t"
+ "movq 16(%3), %%mm2 \n\t"
+ "movq 24(%3), %%mm3 \n\t"
+ "movq 32(%3), %%mm4 \n\t"
+ "movq 40(%3), %%mm5 \n\t"
+ "movq 48(%3), %%mm6 \n\t"
+ "movq 56(%3), %%mm7 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ "packuswb %%mm5, %%mm4 \n\t"
+ "packuswb %%mm7, %%mm6 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm2, (%0, %1) \n\t"
+ "movq %%mm4, (%0, %1, 2) \n\t"
+ "movq %%mm6, (%0, %2) \n\t"
+ :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
+ "r"(p)
+ : "memory");
+ pix += line_size * 4;
+ p += 32;
+
+    // If this were an exact copy of the code above, the compiler would
+    // generate some very strange code, thus the "r" constraints are used.
+ __asm__ volatile (
+ "movq (%3), %%mm0 \n\t"
+ "movq 8(%3), %%mm1 \n\t"
+ "movq 16(%3), %%mm2 \n\t"
+ "movq 24(%3), %%mm3 \n\t"
+ "movq 32(%3), %%mm4 \n\t"
+ "movq 40(%3), %%mm5 \n\t"
+ "movq 48(%3), %%mm6 \n\t"
+ "movq 56(%3), %%mm7 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ "packuswb %%mm5, %%mm4 \n\t"
+ "packuswb %%mm7, %%mm6 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm2, (%0, %1) \n\t"
+ "movq %%mm4, (%0, %1, 2) \n\t"
+ "movq %%mm6, (%0, %2) \n\t"
+ :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
+ : "memory");
+}
+
+#define put_signed_pixels_clamped_mmx_half(off) \
+ "movq "#off"(%2), %%mm1 \n\t" \
+ "movq 16 + "#off"(%2), %%mm2 \n\t" \
+ "movq 32 + "#off"(%2), %%mm3 \n\t" \
+ "movq 48 + "#off"(%2), %%mm4 \n\t" \
+ "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
+ "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
+ "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
+ "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
+ "paddb %%mm0, %%mm1 \n\t" \
+ "paddb %%mm0, %%mm2 \n\t" \
+ "paddb %%mm0, %%mm3 \n\t" \
+ "paddb %%mm0, %%mm4 \n\t" \
+ "movq %%mm1, (%0) \n\t" \
+ "movq %%mm2, (%0, %3) \n\t" \
+ "movq %%mm3, (%0, %3, 2) \n\t" \
+ "movq %%mm4, (%0, %1) \n\t"
+
+void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+ int line_size)
+{
+ x86_reg line_skip = line_size;
+ x86_reg line_skip3;
+
+ __asm__ volatile (
+ "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
+ "lea (%3, %3, 2), %1 \n\t"
+ put_signed_pixels_clamped_mmx_half(0)
+ "lea (%0, %3, 4), %0 \n\t"
+ put_signed_pixels_clamped_mmx_half(64)
+ : "+&r"(pixels), "=&r"(line_skip3)
+ : "r"(block), "r"(line_skip)
+ : "memory");
+}
+
+void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+ int line_size)
+{
+ const int16_t *p;
+ uint8_t *pix;
+ int i;
+
+ /* read the pixels */
+ p = block;
+ pix = pixels;
+ MOVQ_ZERO(mm7);
+ i = 4;
+ do {
+ __asm__ volatile (
+ "movq (%2), %%mm0 \n\t"
+ "movq 8(%2), %%mm1 \n\t"
+ "movq 16(%2), %%mm2 \n\t"
+ "movq 24(%2), %%mm3 \n\t"
+ "movq %0, %%mm4 \n\t"
+ "movq %1, %%mm6 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddsw %%mm4, %%mm0 \n\t"
+ "paddsw %%mm5, %%mm1 \n\t"
+ "movq %%mm6, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm6 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddsw %%mm6, %%mm2 \n\t"
+ "paddsw %%mm5, %%mm3 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ "movq %%mm0, %0 \n\t"
+ "movq %%mm2, %1 \n\t"
+ : "+m"(*pix), "+m"(*(pix + line_size))
+ : "r"(p)
+ : "memory");
+ pix += line_size * 2;
+ p += 16;
+ } while (--i);
+}
+
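+/* put_pixels{8,16}_mmx copy an 8- or 16-byte-wide block; each loop iteration
+ * handles four rows, so h must be a multiple of 4. */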
+static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h)
+{
+ __asm__ volatile (
+ "lea (%3, %3), %%"REG_a" \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1 ), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1 ), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ : "+g"(h), "+r"(pixels), "+r"(block)
+ : "r"((x86_reg)line_size)
+ : "%"REG_a, "memory"
+ );
+}
+
+static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h)
+{
+ __asm__ volatile (
+ "lea (%3, %3), %%"REG_a" \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1 ), %%mm0 \n\t"
+ "movq 8(%1 ), %%mm4 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq 8(%1, %3), %%mm5 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm4, 8(%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "movq %%mm5, 8(%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1 ), %%mm0 \n\t"
+ "movq 8(%1 ), %%mm4 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq 8(%1, %3), %%mm5 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm4, 8(%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "movq %%mm5, 8(%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ : "+g"(h), "+r"(pixels), "+r"(block)
+ : "r"((x86_reg)line_size)
+ : "%"REG_a, "memory"
+ );
+}
+
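+/* Zero n consecutive 64-coefficient int16_t blocks (128 bytes each) with MMX stores. */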
+#define CLEAR_BLOCKS(name, n) \
+static void name(int16_t *blocks) \
+{ \
+ __asm__ volatile ( \
+ "pxor %%mm7, %%mm7 \n\t" \
+ "mov %1, %%"REG_a" \n\t" \
+ "1: \n\t" \
+ "movq %%mm7, (%0, %%"REG_a") \n\t" \
+ "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
+ "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
+ "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
+ "add $32, %%"REG_a" \n\t" \
+ "js 1b \n\t" \
+ :: "r"(((uint8_t *)blocks) + 128 * n), \
+ "i"(-128 * n) \
+ : "%"REG_a \
+ ); \
+}
+CLEAR_BLOCKS(clear_blocks_mmx, 6)
+CLEAR_BLOCKS(clear_block_mmx, 1)
+
+static void clear_block_sse(int16_t *block)
+{
+ __asm__ volatile (
+ "xorps %%xmm0, %%xmm0 \n"
+ "movaps %%xmm0, (%0) \n"
+ "movaps %%xmm0, 16(%0) \n"
+ "movaps %%xmm0, 32(%0) \n"
+ "movaps %%xmm0, 48(%0) \n"
+ "movaps %%xmm0, 64(%0) \n"
+ "movaps %%xmm0, 80(%0) \n"
+ "movaps %%xmm0, 96(%0) \n"
+ "movaps %%xmm0, 112(%0) \n"
+ :: "r"(block)
+ : "memory"
+ );
+}
+
+static void clear_blocks_sse(int16_t *blocks)
+{
+ __asm__ volatile (
+ "xorps %%xmm0, %%xmm0 \n"
+ "mov %1, %%"REG_a" \n"
+ "1: \n"
+ "movaps %%xmm0, (%0, %%"REG_a") \n"
+ "movaps %%xmm0, 16(%0, %%"REG_a") \n"
+ "movaps %%xmm0, 32(%0, %%"REG_a") \n"
+ "movaps %%xmm0, 48(%0, %%"REG_a") \n"
+ "movaps %%xmm0, 64(%0, %%"REG_a") \n"
+ "movaps %%xmm0, 80(%0, %%"REG_a") \n"
+ "movaps %%xmm0, 96(%0, %%"REG_a") \n"
+ "movaps %%xmm0, 112(%0, %%"REG_a") \n"
+ "add $128, %%"REG_a" \n"
+ "js 1b \n"
+ :: "r"(((uint8_t *)blocks) + 128 * 6),
+ "i"(-128 * 6)
+ : "%"REG_a
+ );
+}
+
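+/* dst[i] += src[i] for i in [0, w): the MMX loop adds 16 bytes per iteration,
+ * the scalar tail handles the remaining bytes. */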
+static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
+{
+ x86_reg i = 0;
+ __asm__ volatile (
+ "jmp 2f \n\t"
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq (%2, %0), %%mm1 \n\t"
+ "paddb %%mm0, %%mm1 \n\t"
+ "movq %%mm1, (%2, %0) \n\t"
+ "movq 8(%1, %0), %%mm0 \n\t"
+ "movq 8(%2, %0), %%mm1 \n\t"
+ "paddb %%mm0, %%mm1 \n\t"
+ "movq %%mm1, 8(%2, %0) \n\t"
+ "add $16, %0 \n\t"
+ "2: \n\t"
+ "cmp %3, %0 \n\t"
+ "js 1b \n\t"
+ : "+r"(i)
+ : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
+ );
+ for ( ; i < w; i++)
+ dst[i + 0] += src[i + 0];
+}
+
+#if HAVE_7REGS
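+/* HuffYUV median prediction implemented with cmov instead of branches:
+ * dst[i] = diff[i] + median(left, top[i], left + top[i] - topleft),
+ * where left/topleft track the previous output and top pixels. */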
+static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
+ const uint8_t *diff, int w,
+ int *left, int *left_top)
+{
+ x86_reg w2 = -w;
+ x86_reg x;
+ int l = *left & 0xff;
+ int tl = *left_top & 0xff;
+ int t;
+ __asm__ volatile (
+ "mov %7, %3 \n"
+ "1: \n"
+ "movzbl (%3, %4), %2 \n"
+ "mov %2, %k3 \n"
+ "sub %b1, %b3 \n"
+ "add %b0, %b3 \n"
+ "mov %2, %1 \n"
+ "cmp %0, %2 \n"
+ "cmovg %0, %2 \n"
+ "cmovg %1, %0 \n"
+ "cmp %k3, %0 \n"
+ "cmovg %k3, %0 \n"
+ "mov %7, %3 \n"
+ "cmp %2, %0 \n"
+ "cmovl %2, %0 \n"
+ "add (%6, %4), %b0 \n"
+ "mov %b0, (%5, %4) \n"
+ "inc %4 \n"
+ "jl 1b \n"
+ : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
+ : "r"(dst + w), "r"(diff + w), "rm"(top + w)
+ );
+ *left = l;
+ *left_top = tl;
+}
+#endif
+#endif /* HAVE_INLINE_ASM */
+
+void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
+void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
+
+#if HAVE_INLINE_ASM
+/* Draw the edges of width 'w' of an image of size width x height.
+ * This MMX version can only handle w == 4, w == 8 or w == 16. */
+static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
+ int w, int h, int sides)
+{
+ uint8_t *ptr, *last_line;
+ int i;
+
+ last_line = buf + (height - 1) * wrap;
+ /* left and right */
+ ptr = buf;
+ if (w == 8) {
+ __asm__ volatile (
+ "1: \n\t"
+ "movd (%0), %%mm0 \n\t"
+ "punpcklbw %%mm0, %%mm0 \n\t"
+ "punpcklwd %%mm0, %%mm0 \n\t"
+ "punpckldq %%mm0, %%mm0 \n\t"
+ "movq %%mm0, -8(%0) \n\t"
+ "movq -8(%0, %2), %%mm1 \n\t"
+ "punpckhbw %%mm1, %%mm1 \n\t"
+ "punpckhwd %%mm1, %%mm1 \n\t"
+ "punpckhdq %%mm1, %%mm1 \n\t"
+ "movq %%mm1, (%0, %2) \n\t"
+ "add %1, %0 \n\t"
+ "cmp %3, %0 \n\t"
+ "jb 1b \n\t"
+ : "+r"(ptr)
+ : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
+ );
+    } else if (w == 16) {
+ __asm__ volatile (
+ "1: \n\t"
+ "movd (%0), %%mm0 \n\t"
+ "punpcklbw %%mm0, %%mm0 \n\t"
+ "punpcklwd %%mm0, %%mm0 \n\t"
+ "punpckldq %%mm0, %%mm0 \n\t"
+ "movq %%mm0, -8(%0) \n\t"
+ "movq %%mm0, -16(%0) \n\t"
+ "movq -8(%0, %2), %%mm1 \n\t"
+ "punpckhbw %%mm1, %%mm1 \n\t"
+ "punpckhwd %%mm1, %%mm1 \n\t"
+ "punpckhdq %%mm1, %%mm1 \n\t"
+ "movq %%mm1, (%0, %2) \n\t"
+ "movq %%mm1, 8(%0, %2) \n\t"
+ "add %1, %0 \n\t"
+ "cmp %3, %0 \n\t"
+ "jb 1b \n\t"
+ : "+r"(ptr)
+ : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
+ );
+ } else {
+ av_assert1(w == 4);
+ __asm__ volatile (
+ "1: \n\t"
+ "movd (%0), %%mm0 \n\t"
+ "punpcklbw %%mm0, %%mm0 \n\t"
+ "punpcklwd %%mm0, %%mm0 \n\t"
+ "movd %%mm0, -4(%0) \n\t"
+ "movd -4(%0, %2), %%mm1 \n\t"
+ "punpcklbw %%mm1, %%mm1 \n\t"
+ "punpckhwd %%mm1, %%mm1 \n\t"
+ "punpckhdq %%mm1, %%mm1 \n\t"
+ "movd %%mm1, (%0, %2) \n\t"
+ "add %1, %0 \n\t"
+ "cmp %3, %0 \n\t"
+ "jb 1b \n\t"
+ : "+r"(ptr)
+ : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
+ );
+ }
+
+ /* top and bottom (and hopefully also the corners) */
+ if (sides & EDGE_TOP) {
+ for (i = 0; i < h; i += 4) {
+ ptr = buf - (i + 1) * wrap - w;
+ __asm__ volatile (
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm0, (%0, %2) \n\t"
+ "movq %%mm0, (%0, %2, 2) \n\t"
+ "movq %%mm0, (%0, %3) \n\t"
+ "add $8, %0 \n\t"
+ "cmp %4, %0 \n\t"
+ "jb 1b \n\t"
+ : "+r"(ptr)
+ : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
+ "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
+ );
+ }
+ }
+
+ if (sides & EDGE_BOTTOM) {
+ for (i = 0; i < h; i += 4) {
+ ptr = last_line + (i + 1) * wrap - w;
+ __asm__ volatile (
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm0, (%0, %2) \n\t"
+ "movq %%mm0, (%0, %2, 2) \n\t"
+ "movq %%mm0, (%0, %3) \n\t"
+ "add $8, %0 \n\t"
+ "cmp %4, %0 \n\t"
+ "jb 1b \n\t"
+ : "+r"(ptr)
+ : "r"((x86_reg)last_line - (x86_reg)ptr - w),
+ "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
+ "r"(ptr + width + 2 * w)
+ );
+ }
+ }
+}
+#endif /* HAVE_INLINE_ASM */
+
+
+#if HAVE_YASM
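+/* Generate the 16 quarter-pel motion-compensation functions (mc00..mc33) for
+ * 8x8 and 16x16 blocks from the pixel ops and the MPEG-4 lowpass filters. */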
+#define QPEL_OP(OPNAME, ROUNDER, RND, MMX) \
+static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
+} \
+ \
+static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t temp[8]; \
+ uint8_t * const half = (uint8_t*)temp; \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
+ stride, 8); \
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
+ stride, stride, 8); \
+} \
+ \
+static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
+ stride, 8); \
+} \
+ \
+static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t temp[8]; \
+ uint8_t * const half = (uint8_t*)temp; \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
+ stride, 8); \
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
+ stride, 8); \
+} \
+ \
+static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t temp[8]; \
+ uint8_t * const half = (uint8_t*)temp; \
+ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
+ 8, stride); \
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
+ stride, stride, 8); \
+} \
+ \
+static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
+ stride, stride); \
+} \
+ \
+static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t temp[8]; \
+ uint8_t * const half = (uint8_t*)temp; \
+ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
+ 8, stride); \
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
+ stride, 8); \
+} \
+ \
+static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t half[8 + 9]; \
+ uint8_t * const halfH = ((uint8_t*)half) + 64; \
+ uint8_t * const halfHV = ((uint8_t*)half); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
+ stride, 9); \
+ ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
+ stride, 9); \
+ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
+ stride, 8, 8); \
+} \
+ \
+static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t half[8 + 9]; \
+ uint8_t * const halfH = ((uint8_t*)half) + 64; \
+ uint8_t * const halfHV = ((uint8_t*)half); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
+ stride, 9); \
+ ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
+ stride, 9); \
+ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
+ stride, 8, 8); \
+} \
+ \
+static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t half[8 + 9]; \
+ uint8_t * const halfH = ((uint8_t*)half) + 64; \
+ uint8_t * const halfHV = ((uint8_t*)half); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
+ stride, 9); \
+ ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
+ stride, 9); \
+ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
+ stride, 8, 8); \
+} \
+ \
+static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t half[8 + 9]; \
+ uint8_t * const halfH = ((uint8_t*)half) + 64; \
+ uint8_t * const halfHV = ((uint8_t*)half); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
+ stride, 9); \
+ ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
+ stride, 9); \
+ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
+ stride, 8, 8); \
+} \
+ \
+static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t half[8 + 9]; \
+ uint8_t * const halfH = ((uint8_t*)half) + 64; \
+ uint8_t * const halfHV = ((uint8_t*)half); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
+ stride, 9); \
+ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
+ stride, 8, 8); \
+} \
+ \
+static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t half[8 + 9]; \
+ uint8_t * const halfH = ((uint8_t*)half) + 64; \
+ uint8_t * const halfHV = ((uint8_t*)half); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
+ stride, 9); \
+ ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
+ stride, 8, 8); \
+} \
+ \
+static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t half[8 + 9]; \
+ uint8_t * const halfH = ((uint8_t*)half); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
+ stride, 9); \
+ ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
+ 8, stride, 9); \
+ ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
+ stride, 8); \
+} \
+ \
+static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t half[8 + 9]; \
+ uint8_t * const halfH = ((uint8_t*)half); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
+ stride, 9); \
+ ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
+ stride, 9); \
+ ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
+ stride, 8); \
+} \
+ \
+static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t half[9]; \
+ uint8_t * const halfH = ((uint8_t*)half); \
+ ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
+ stride, 9); \
+ ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
+ stride, 8); \
+} \
+ \
+static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
+} \
+ \
+static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t temp[32]; \
+ uint8_t * const half = (uint8_t*)temp; \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
+ stride, 16); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
+ stride, 16); \
+} \
+ \
+static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
+ stride, stride, 16);\
+} \
+ \
+static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t temp[32]; \
+ uint8_t * const half = (uint8_t*)temp; \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
+ stride, 16); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
+ stride, stride, 16); \
+} \
+ \
+static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t temp[32]; \
+ uint8_t * const half = (uint8_t*)temp; \
+ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
+ stride); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
+ stride, 16); \
+} \
+ \
+static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
+ stride, stride); \
+} \
+ \
+static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t temp[32]; \
+ uint8_t * const half = (uint8_t*)temp; \
+ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
+ stride); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
+ stride, stride, 16); \
+} \
+ \
+static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t half[16 * 2 + 17 * 2]; \
+ uint8_t * const halfH = ((uint8_t*)half) + 256; \
+ uint8_t * const halfHV = ((uint8_t*)half); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
+ stride, 17); \
+ ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
+ stride, 17); \
+ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
+ 16, 16); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
+ stride, 16, 16); \
+} \
+ \
+static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t half[16 * 2 + 17 * 2]; \
+ uint8_t * const halfH = ((uint8_t*)half) + 256; \
+ uint8_t * const halfHV = ((uint8_t*)half); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
+ stride, 17); \
+ ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
+ stride, 17); \
+ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
+ 16, 16); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
+ stride, 16, 16); \
+} \
+ \
+static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t half[16 * 2 + 17 * 2]; \
+ uint8_t * const halfH = ((uint8_t*)half) + 256; \
+ uint8_t * const halfHV = ((uint8_t*)half); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
+ stride, 17); \
+ ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
+ stride, 17); \
+ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
+ 16, 16); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
+ stride, 16, 16); \
+} \
+ \
+static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t half[16 * 2 + 17 * 2]; \
+ uint8_t * const halfH = ((uint8_t*)half) + 256; \
+ uint8_t * const halfHV = ((uint8_t*)half); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
+ stride, 17); \
+ ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
+ stride, 17); \
+ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
+ 16, 16); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
+ stride, 16, 16); \
+} \
+ \
+static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t half[16 * 2 + 17 * 2]; \
+ uint8_t * const halfH = ((uint8_t*)half) + 256; \
+ uint8_t * const halfHV = ((uint8_t*)half); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
+ stride, 17); \
+ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
+ 16, 16); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
+ stride, 16, 16); \
+} \
+ \
+static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t half[16 * 2 + 17 * 2]; \
+ uint8_t * const halfH = ((uint8_t*)half) + 256; \
+ uint8_t * const halfHV = ((uint8_t*)half); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
+ stride, 17); \
+ ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
+ 16, 16); \
+ ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
+ stride, 16, 16); \
+} \
+ \
+static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t half[17 * 2]; \
+ uint8_t * const halfH = ((uint8_t*)half); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
+ stride, 17); \
+ ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
+ stride, 17); \
+ ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
+ stride, 16); \
+} \
+ \
+static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t half[17 * 2]; \
+ uint8_t * const halfH = ((uint8_t*)half); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
+ stride, 17); \
+ ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
+ stride, 17); \
+ ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
+ stride, 16); \
+} \
+ \
+static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ uint64_t half[17 * 2]; \
+ uint8_t * const halfH = ((uint8_t*)half); \
+ ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
+ stride, 17); \
+ ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
+ stride, 16); \
+}
+
+QPEL_OP(put_, ff_pw_16, _, mmxext)
+QPEL_OP(avg_, ff_pw_16, _, mmxext)
+QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, mmxext)
+#endif /* HAVE_YASM */
+
+
+#if HAVE_INLINE_ASM
+void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
+{
+ put_pixels8_xy2_mmx(dst, src, stride, 8);
+}
+void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
+{
+ put_pixels16_xy2_mmx(dst, src, stride, 16);
+}
+void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
+{
+ avg_pixels8_xy2_mmx(dst, src, stride, 8);
+}
+void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
+{
+ avg_pixels16_xy2_mmx(dst, src, stride, 16);
+}
+
+typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t linesize, int block_w, int block_h,
+ int src_x, int src_y, int w, int h);
+
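+/* Global motion compensation: bilinear interpolation of an 8-pixel-wide block
+ * under the affine motion parameters (ox, oy, dxx, dxy, dyx, dyy); cases the
+ * MMX code cannot handle fall back to ff_gmc_c(). */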
+static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
+ int stride, int h, int ox, int oy,
+ int dxx, int dxy, int dyx, int dyy,
+ int shift, int r, int width, int height,
+ emulated_edge_mc_func *emu_edge_fn)
+{
+ const int w = 8;
+ const int ix = ox >> (16 + shift);
+ const int iy = oy >> (16 + shift);
+ const int oxs = ox >> 4;
+ const int oys = oy >> 4;
+ const int dxxs = dxx >> 4;
+ const int dxys = dxy >> 4;
+ const int dyxs = dyx >> 4;
+ const int dyys = dyy >> 4;
+ const uint16_t r4[4] = { r, r, r, r };
+ const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
+ const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
+ const uint64_t shift2 = 2 * shift;
+#define MAX_STRIDE 4096U
+#define MAX_H 8U
+ uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
+ int x, y;
+
+ const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
+ const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
+ const int dxh = dxy * (h - 1);
+ const int dyw = dyx * (w - 1);
+ int need_emu = (unsigned)ix >= width - w ||
+ (unsigned)iy >= height - h;
+
+ if ( // non-constant fullpel offset (3% of blocks)
+ ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
+ (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
+ // uses more than 16 bits of subpel mv (only at huge resolution)
+ || (dxx | dxy | dyx | dyy) & 15
+ || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
+ // FIXME could still use mmx for some of the rows
+ ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
+ shift, r, width, height);
+ return;
+ }
+
+ src += ix + iy * stride;
+ if (need_emu) {
+ emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
+ src = edge_buf;
+ }
+
+ __asm__ volatile (
+ "movd %0, %%mm6 \n\t"
+ "pxor %%mm7, %%mm7 \n\t"
+ "punpcklwd %%mm6, %%mm6 \n\t"
+ "punpcklwd %%mm6, %%mm6 \n\t"
+ :: "r"(1<<shift)
+ );
+
+ for (x = 0; x < w; x += 4) {
+ uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
+ oxs - dxys + dxxs * (x + 1),
+ oxs - dxys + dxxs * (x + 2),
+ oxs - dxys + dxxs * (x + 3) };
+ uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
+ oys - dyys + dyxs * (x + 1),
+ oys - dyys + dyxs * (x + 2),
+ oys - dyys + dyxs * (x + 3) };
+
+ for (y = 0; y < h; y++) {
+ __asm__ volatile (
+ "movq %0, %%mm4 \n\t"
+ "movq %1, %%mm5 \n\t"
+ "paddw %2, %%mm4 \n\t"
+ "paddw %3, %%mm5 \n\t"
+ "movq %%mm4, %0 \n\t"
+ "movq %%mm5, %1 \n\t"
+ "psrlw $12, %%mm4 \n\t"
+ "psrlw $12, %%mm5 \n\t"
+ : "+m"(*dx4), "+m"(*dy4)
+ : "m"(*dxy4), "m"(*dyy4)
+ );
+
+ __asm__ volatile (
+ "movq %%mm6, %%mm2 \n\t"
+ "movq %%mm6, %%mm1 \n\t"
+ "psubw %%mm4, %%mm2 \n\t"
+ "psubw %%mm5, %%mm1 \n\t"
+ "movq %%mm2, %%mm0 \n\t"
+ "movq %%mm4, %%mm3 \n\t"
+ "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
+ "pmullw %%mm5, %%mm3 \n\t" // dx * dy
+ "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
+ "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
+
+ "movd %4, %%mm5 \n\t"
+ "movd %3, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
+ "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
+
+ "movd %2, %%mm5 \n\t"
+ "movd %1, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
+ "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
+ "paddw %5, %%mm1 \n\t"
+ "paddw %%mm3, %%mm2 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm2, %%mm0 \n\t"
+
+ "psrlw %6, %%mm0 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "movd %%mm0, %0 \n\t"
+
+ : "=m"(dst[x + y * stride])
+ : "m"(src[0]), "m"(src[1]),
+ "m"(src[stride]), "m"(src[stride + 1]),
+ "m"(*r4), "m"(shift2)
+ );
+ src += stride;
+ }
+ src += 4 - h * stride;
+ }
+}
+
+#if CONFIG_VIDEODSP
+#if HAVE_YASM
+#if ARCH_X86_32
+static void gmc_mmx(uint8_t *dst, uint8_t *src,
+ int stride, int h, int ox, int oy,
+ int dxx, int dxy, int dyx, int dyy,
+ int shift, int r, int width, int height)
+{
+ gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
+ width, height, &ff_emulated_edge_mc_8);
+}
+#endif
+static void gmc_sse(uint8_t *dst, uint8_t *src,
+ int stride, int h, int ox, int oy,
+ int dxx, int dxy, int dyx, int dyy,
+ int shift, int r, int width, int height)
+{
+ gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
+ width, height, &ff_emulated_edge_mc_8);
+}
+#else
+static void gmc_mmx(uint8_t *dst, uint8_t *src,
+ int stride, int h, int ox, int oy,
+ int dxx, int dxy, int dyx, int dyy,
+ int shift, int r, int width, int height)
+{
+ gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
+ width, height, &ff_emulated_edge_mc_8);
+}
+#endif
+#endif
+
+#endif /* HAVE_INLINE_ASM */
+
+void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+
+#if HAVE_INLINE_ASM
+
+/* CAVS-specific */
+void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
+{
+ put_pixels8_mmx(dst, src, stride, 8);
+}
+
+void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
+{
+ avg_pixels8_mmx(dst, src, stride, 8);
+}
+
+void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
+{
+ put_pixels16_mmx(dst, src, stride, 16);
+}
+
+void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
+{
+ avg_pixels16_mmx(dst, src, stride, 16);
+}
+
+/* VC-1-specific */
+void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride, int rnd)
+{
+ put_pixels8_mmx(dst, src, stride, 8);
+}
+
+#if CONFIG_DIRAC_DECODER
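+/* Dirac motion-compensation wrappers: use the x86 pixel ops when h is a
+ * multiple of 4 and fall back to the C versions otherwise. */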
+#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
+void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+ if (h&3)\
+ ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
+ else\
+ OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+ if (h&3)\
+ ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
+ else\
+ OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+ if (h&3) {\
+ ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
+ } else {\
+ OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
+ OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
+ }\
+}
+
+#if HAVE_MMX_INLINE
+DIRAC_PIXOP(put, put, mmx)
+DIRAC_PIXOP(avg, avg, mmx)
+#endif
+
+#if HAVE_YASM
+DIRAC_PIXOP(avg, ff_avg, mmxext)
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+ if (h&3)
+ ff_put_dirac_pixels16_c(dst, src, stride, h);
+ else
+ ff_put_pixels16_sse2(dst, src[0], stride, h);
+}
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+ if (h&3)
+ ff_avg_dirac_pixels16_c(dst, src, stride, h);
+ else
+ ff_avg_pixels16_sse2(dst, src[0], stride, h);
+}
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+ if (h&3) {
+ ff_put_dirac_pixels32_c(dst, src, stride, h);
+ } else {
+ ff_put_pixels16_sse2(dst , src[0] , stride, h);
+ ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
+ }
+}
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+ if (h&3) {
+ ff_avg_dirac_pixels32_c(dst, src, stride, h);
+ } else {
+ ff_avg_pixels16_sse2(dst , src[0] , stride, h);
+ ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
+ }
+}
+#endif
+#endif
+
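+/* Clamp each float in src to [min, max] and store to dst, 16 floats per
+ * iteration working backwards from the end; relies on 16-byte alignment
+ * and len being a multiple of 16. */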
+static void vector_clipf_sse(float *dst, const float *src,
+ float min, float max, int len)
+{
+ x86_reg i = (len - 16) * 4;
+ __asm__ volatile (
+ "movss %3, %%xmm4 \n\t"
+ "movss %4, %%xmm5 \n\t"
+ "shufps $0, %%xmm4, %%xmm4 \n\t"
+ "shufps $0, %%xmm5, %%xmm5 \n\t"
+ "1: \n\t"
+ "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
+ "movaps 16(%2, %0), %%xmm1 \n\t"
+ "movaps 32(%2, %0), %%xmm2 \n\t"
+ "movaps 48(%2, %0), %%xmm3 \n\t"
+ "maxps %%xmm4, %%xmm0 \n\t"
+ "maxps %%xmm4, %%xmm1 \n\t"
+ "maxps %%xmm4, %%xmm2 \n\t"
+ "maxps %%xmm4, %%xmm3 \n\t"
+ "minps %%xmm5, %%xmm0 \n\t"
+ "minps %%xmm5, %%xmm1 \n\t"
+ "minps %%xmm5, %%xmm2 \n\t"
+ "minps %%xmm5, %%xmm3 \n\t"
+ "movaps %%xmm0, (%1, %0) \n\t"
+ "movaps %%xmm1, 16(%1, %0) \n\t"
+ "movaps %%xmm2, 32(%1, %0) \n\t"
+ "movaps %%xmm3, 48(%1, %0) \n\t"
+ "sub $64, %0 \n\t"
+ "jge 1b \n\t"
+ : "+&r"(i)
+ : "r"(dst), "r"(src), "m"(min), "m"(max)
+ : "memory"
+ );
+}
+
+#endif /* HAVE_INLINE_ASM */
+
+int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
+ int order);
+int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
+ int order);
+int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
+ const int16_t *v3,
+ int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
+ const int16_t *v3,
+ int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
+ const int16_t *v3,
+ int order, int mul);
+
+void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
+ const int16_t *window, unsigned int len);
+void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
+ const int16_t *window, unsigned int len);
+void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
+ const int16_t *window, unsigned int len);
+void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
+ const int16_t *window, unsigned int len);
+void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
+ const int16_t *window, unsigned int len);
+void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
+ const int16_t *window, unsigned int len);
+
+void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
+void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
+
+void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
+ const uint8_t *diff, int w,
+ int *left, int *left_top);
+int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
+ int w, int left);
+int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
+ int w, int left);
+
+void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
+ int32_t min, int32_t max, unsigned int len);
+void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
+ int32_t min, int32_t max, unsigned int len);
+void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
+ int32_t min, int32_t max, unsigned int len);
+void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
+ int32_t min, int32_t max, unsigned int len);
+
+#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
+ do { \
+ c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
+ } while (0)
+
+static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
+ int mm_flags)
+{
+ const int high_bit_depth = avctx->bits_per_raw_sample > 8;
+
+#if HAVE_INLINE_ASM
+ c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
+ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
+ c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
+
+ if (!high_bit_depth) {
+ c->clear_block = clear_block_mmx;
+ c->clear_blocks = clear_blocks_mmx;
+ c->draw_edges = draw_edges_mmx;
+ }
+
+#if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
+ c->gmc = gmc_mmx;
+#endif
+
+ c->add_bytes = add_bytes_mmx;
+#endif /* HAVE_INLINE_ASM */
+
+#if HAVE_YASM
+ if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
+ c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
+ c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
+ }
+
+ c->vector_clip_int32 = ff_vector_clip_int32_mmx;
+#endif
+
+}
+
+static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
+ int mm_flags)
+{
+
+#if HAVE_YASM
+ SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
+ SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
+
+ SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
+ SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
+ SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
+ SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
+#endif /* HAVE_YASM */
+
+#if HAVE_MMXEXT_EXTERNAL
+ /* slower than cmov version on AMD */
+ if (!(mm_flags & AV_CPU_FLAG_3DNOW))
+ c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
+
+ c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
+
+ if (avctx->flags & CODEC_FLAG_BITEXACT) {
+ c->apply_window_int16 = ff_apply_window_int16_mmxext;
+ } else {
+ c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
+ }
+#endif /* HAVE_MMXEXT_EXTERNAL */
+}
+
+static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
+ int mm_flags)
+{
+ const int high_bit_depth = avctx->bits_per_raw_sample > 8;
+
+#if HAVE_INLINE_ASM
+ if (!high_bit_depth) {
+ if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
+ /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
+ c->clear_block = clear_block_sse;
+ c->clear_blocks = clear_blocks_sse;
+ }
+ }
+
+ c->vector_clipf = vector_clipf_sse;
+#endif /* HAVE_INLINE_ASM */
+
+#if HAVE_YASM
+#if HAVE_INLINE_ASM && CONFIG_VIDEODSP
+ c->gmc = gmc_sse;
+#endif
+#endif /* HAVE_YASM */
+}
+
+static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
+ int mm_flags)
+{
+ const int bit_depth = avctx->bits_per_raw_sample;
+ const int high_bit_depth = bit_depth > 8;
+
+#if HAVE_SSE2_INLINE
+ if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
+ c->idct_put = ff_idct_xvid_sse2_put;
+ c->idct_add = ff_idct_xvid_sse2_add;
+ c->idct = ff_idct_xvid_sse2;
+ c->idct_permutation_type = FF_SSE2_IDCT_PERM;
+ }
+#endif /* HAVE_SSE2_INLINE */
+
+#if HAVE_SSE2_EXTERNAL
+ c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
+ if (mm_flags & AV_CPU_FLAG_ATOM) {
+ c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
+ } else {
+ c->vector_clip_int32 = ff_vector_clip_int32_sse2;
+ }
+ if (avctx->flags & CODEC_FLAG_BITEXACT) {
+ c->apply_window_int16 = ff_apply_window_int16_sse2;
+ } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
+ c->apply_window_int16 = ff_apply_window_int16_round_sse2;
+ }
+ c->bswap_buf = ff_bswap32_buf_sse2;
+#endif /* HAVE_SSE2_EXTERNAL */
+}
+
+static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
+ int mm_flags)
+{
+#if HAVE_SSSE3_EXTERNAL
+ c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
+ if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
+ c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
+
+ if (mm_flags & AV_CPU_FLAG_ATOM)
+ c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
+ else
+ c->apply_window_int16 = ff_apply_window_int16_ssse3;
+ if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+ c->bswap_buf = ff_bswap32_buf_ssse3;
+#endif /* HAVE_SSSE3_EXTERNAL */
+}
+
+static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
+ int mm_flags)
+{
+#if HAVE_SSE4_EXTERNAL
+ c->vector_clip_int32 = ff_vector_clip_int32_sse4;
+#endif /* HAVE_SSE4_EXTERNAL */
+}
+
+av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
+{
+ int mm_flags = av_get_cpu_flags();
+
+#if HAVE_7REGS && HAVE_INLINE_ASM
+ if (mm_flags & AV_CPU_FLAG_CMOV)
+ c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
+#endif
+
+ if (mm_flags & AV_CPU_FLAG_MMX) {
+#if HAVE_INLINE_ASM
+ const int idct_algo = avctx->idct_algo;
+
+ if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
+ if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
+ c->idct_put = ff_simple_idct_put_mmx;
+ c->idct_add = ff_simple_idct_add_mmx;
+ c->idct = ff_simple_idct_mmx;
+ c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
+ } else if (idct_algo == FF_IDCT_XVIDMMX) {
+ if (mm_flags & AV_CPU_FLAG_SSE2) {
+ c->idct_put = ff_idct_xvid_sse2_put;
+ c->idct_add = ff_idct_xvid_sse2_add;
+ c->idct = ff_idct_xvid_sse2;
+ c->idct_permutation_type = FF_SSE2_IDCT_PERM;
+ } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
+ c->idct_put = ff_idct_xvid_mmxext_put;
+ c->idct_add = ff_idct_xvid_mmxext_add;
+ c->idct = ff_idct_xvid_mmxext;
+ } else {
+ c->idct_put = ff_idct_xvid_mmx_put;
+ c->idct_add = ff_idct_xvid_mmx_add;
+ c->idct = ff_idct_xvid_mmx;
+ }
+ }
+ }
+#endif /* HAVE_INLINE_ASM */
+
+ dsputil_init_mmx(c, avctx, mm_flags);
+ }
+
+ if (mm_flags & AV_CPU_FLAG_MMXEXT)
+ dsputil_init_mmxext(c, avctx, mm_flags);
+
+ if (mm_flags & AV_CPU_FLAG_SSE)
+ dsputil_init_sse(c, avctx, mm_flags);
+
+ if (mm_flags & AV_CPU_FLAG_SSE2)
+ dsputil_init_sse2(c, avctx, mm_flags);
+
+ if (mm_flags & AV_CPU_FLAG_SSSE3)
+ dsputil_init_ssse3(c, avctx, mm_flags);
+
+ if (mm_flags & AV_CPU_FLAG_SSE4)
+ dsputil_init_sse4(c, avctx, mm_flags);
+
+ if (CONFIG_ENCODERS)
+ ff_dsputilenc_init_mmx(c, avctx);
+}
diff --git a/ffmpeg/libavcodec/x86/dsputil_mmx.h b/ffmpeg/libavcodec/x86/dsputil_mmx.h
new file mode 100644
index 0000000..28b0078
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/dsputil_mmx.h
@@ -0,0 +1,113 @@
+/*
+ * MMX optimized DSP utils
+ * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_DSPUTIL_MMX_H
+#define AVCODEC_X86_DSPUTIL_MMX_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "libavcodec/dsputil.h"
+#include "libavutil/x86/asm.h"
+
+extern const uint64_t ff_bone;
+extern const uint64_t ff_wtwo;
+
+extern const xmm_reg ff_pw_3;
+extern const xmm_reg ff_pw_4;
+extern const xmm_reg ff_pw_5;
+extern const xmm_reg ff_pw_8;
+extern const uint64_t ff_pw_15;
+extern const xmm_reg ff_pw_16;
+extern const xmm_reg ff_pw_18;
+extern const uint64_t ff_pw_20;
+extern const xmm_reg ff_pw_32;
+extern const uint64_t ff_pw_42;
+extern const uint64_t ff_pw_53;
+extern const xmm_reg ff_pw_64;
+extern const uint64_t ff_pw_96;
+extern const uint64_t ff_pw_128;
+extern const uint64_t ff_pw_255;
+
+extern const xmm_reg ff_pb_1;
+extern const xmm_reg ff_pb_3;
+extern const uint64_t ff_pb_3F;
+extern const xmm_reg ff_pb_F8;
+extern const uint64_t ff_pb_FC;
+
+extern const double ff_pd_1[2];
+extern const double ff_pd_2[2];
+
+#define SBUTTERFLY(a,b,t,n,m)\
+ "mov" #m " " #a ", " #t " \n\t" /* abcd */\
+ "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
+ "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
+
+#define TRANSPOSE4(a,b,c,d,t)\
+ SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
+ SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
+ SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
+ SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
+
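+/* MOVQ_WONE fills regd with 0x0001000100010001 without a memory load. */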
+#define MOVQ_WONE(regd) \
+ __asm__ volatile ( \
+ "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
+ "psrlw $15, %%" #regd ::)
+
+void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx);
+void ff_dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx);
+
+void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size);
+void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size);
+void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size);
+
+void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+
+void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+
+void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd);
+
+void ff_put_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, ptrdiff_t stride);
+void ff_put_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, ptrdiff_t stride);
+void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, ptrdiff_t stride);
+void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, ptrdiff_t stride);
+
+void ff_mmx_idct(int16_t *block);
+void ff_mmxext_idct(int16_t *block);
+
+
+void ff_deinterlace_line_mmx(uint8_t *dst,
+ const uint8_t *lum_m4, const uint8_t *lum_m3,
+ const uint8_t *lum_m2, const uint8_t *lum_m1,
+ const uint8_t *lum,
+ int size);
+
+void ff_deinterlace_line_inplace_mmx(const uint8_t *lum_m4,
+ const uint8_t *lum_m3,
+ const uint8_t *lum_m2,
+ const uint8_t *lum_m1,
+ const uint8_t *lum, int size);
+
+#endif /* AVCODEC_X86_DSPUTIL_MMX_H */
diff --git a/ffmpeg/libavcodec/x86/dsputil_qns_template.c b/ffmpeg/libavcodec/x86/dsputil_qns_template.c
new file mode 100644
index 0000000..77a41b9
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/dsputil_qns_template.c
@@ -0,0 +1,101 @@
+/*
+ * DSP utils : QNS functions are compiled 3 times for mmx/3dnow/ssse3
+ * Copyright (c) 2004 Michael Niedermayer
+ *
+ * MMX optimization by Michael Niedermayer <michaelni@gmx.at>
+ * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0))
+
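+/* Estimate the weighted squared error obtained by adding basis[] scaled by
+ * 'scale' to the residual rem[]. */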
+static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale)
+{
+ x86_reg i=0;
+
+ assert(FFABS(scale) < MAX_ABS);
+ scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
+
+ SET_RND(mm6);
+ __asm__ volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "movd %4, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq 8(%1, %0), %%mm1 \n\t"
+ PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
+ "paddw (%2, %0), %%mm0 \n\t"
+ "paddw 8(%2, %0), %%mm1 \n\t"
+ "psraw $6, %%mm0 \n\t"
+ "psraw $6, %%mm1 \n\t"
+ "pmullw (%3, %0), %%mm0 \n\t"
+ "pmullw 8(%3, %0), %%mm1 \n\t"
+ "pmaddwd %%mm0, %%mm0 \n\t"
+ "pmaddwd %%mm1, %%mm1 \n\t"
+ "paddd %%mm1, %%mm0 \n\t"
+ "psrld $4, %%mm0 \n\t"
+ "paddd %%mm0, %%mm7 \n\t"
+ "add $16, %0 \n\t"
+ "cmp $128, %0 \n\t" //FIXME optimize & bench
+ " jb 1b \n\t"
+ PHADDD(%%mm7, %%mm6)
+ "psrld $2, %%mm7 \n\t"
+ "movd %%mm7, %0 \n\t"
+
+ : "+r" (i)
+ : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
+ );
+ return i;
+}
+
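+/* Add basis[] scaled by 'scale' into rem[]; falls back to the C loop when
+ * |scale| is too large for the 16-bit fixed-point path. */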
+static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale)
+{
+ x86_reg i=0;
+
+ if(FFABS(scale) < MAX_ABS){
+ scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
+ SET_RND(mm6);
+ __asm__ volatile(
+ "movd %3, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq 8(%1, %0), %%mm1 \n\t"
+ PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
+ "paddw (%2, %0), %%mm0 \n\t"
+ "paddw 8(%2, %0), %%mm1 \n\t"
+ "movq %%mm0, (%2, %0) \n\t"
+ "movq %%mm1, 8(%2, %0) \n\t"
+ "add $16, %0 \n\t"
+ "cmp $128, %0 \n\t" // FIXME optimize & bench
+ " jb 1b \n\t"
+
+ : "+r" (i)
+ : "r"(basis), "r"(rem), "g"(scale)
+ );
+ }else{
+ for(i=0; i<8*8; i++){
+ rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
+ }
+ }
+}
diff --git a/ffmpeg/libavcodec/x86/dsputil_rnd_template.c b/ffmpeg/libavcodec/x86/dsputil_rnd_template.c
new file mode 100644
index 0000000..1a89b77
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/dsputil_rnd_template.c
@@ -0,0 +1,221 @@
+/*
+ * DSP utils mmx functions are compiled twice for rnd/no_rnd
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
+ * and improved by Zdenek Kabelac <kabi@users.sf.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+// put_pixels
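+// the *_xy2 variants interpolate at the half-pel position in both directions:
+// each output byte is the average of four neighbouring source bytes, with
+// SET_RND supplying the +2 (rounding) or +1 (no-rounding) bias before the >> 2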
+static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ MOVQ_ZERO(mm7);
+ SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm4 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "xor %%"REG_a", %%"REG_a" \n\t"
+ "add %3, %1 \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "paddusw %%mm2, %%mm0 \n\t"
+ "paddusw %%mm3, %%mm1 \n\t"
+ "paddusw %%mm6, %%mm4 \n\t"
+ "paddusw %%mm6, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "psrlw $2, %%mm4 \n\t"
+ "psrlw $2, %%mm5 \n\t"
+ "packuswb %%mm5, %%mm4 \n\t"
+ "movq %%mm4, (%2, %%"REG_a") \n\t"
+ "add %3, %%"REG_a" \n\t"
+
+ "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
+ "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm2, %%mm4 \n\t"
+ "paddusw %%mm3, %%mm5 \n\t"
+ "paddusw %%mm6, %%mm0 \n\t"
+ "paddusw %%mm6, %%mm1 \n\t"
+ "paddusw %%mm4, %%mm0 \n\t"
+ "paddusw %%mm5, %%mm1 \n\t"
+ "psrlw $2, %%mm0 \n\t"
+ "psrlw $2, %%mm1 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "movq %%mm0, (%2, %%"REG_a") \n\t"
+ "add %3, %%"REG_a" \n\t"
+
+ "subl $2, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels)
+ :"D"(block), "r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
+
+// in case more speed is needed - unrolling would certainly help
+static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movq %0, %%mm0 \n\t"
+ "movq %1, %%mm1 \n\t"
+ OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
+ "movq %%mm2, %0 \n\t"
+ :"+m"(*block)
+ :"m"(*pixels)
+ :"memory");
+ pixels += line_size;
+ block += line_size;
+ }
+ while (--h);
+}
+
+static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movq %0, %%mm0 \n\t"
+ "movq %1, %%mm1 \n\t"
+ OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
+ "movq %%mm2, %0 \n\t"
+ "movq 8%0, %%mm0 \n\t"
+ "movq 8%1, %%mm1 \n\t"
+ OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
+ "movq %%mm2, 8%0 \n\t"
+ :"+m"(*block)
+ :"m"(*pixels)
+ :"memory");
+ pixels += line_size;
+ block += line_size;
+ }
+ while (--h);
+}
+
+// this routine is 'slightly' suboptimal but mostly unused
+static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ MOVQ_ZERO(mm7);
+ SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm4 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "xor %%"REG_a", %%"REG_a" \n\t"
+ "add %3, %1 \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "paddusw %%mm2, %%mm0 \n\t"
+ "paddusw %%mm3, %%mm1 \n\t"
+ "paddusw %%mm6, %%mm4 \n\t"
+ "paddusw %%mm6, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "psrlw $2, %%mm4 \n\t"
+ "psrlw $2, %%mm5 \n\t"
+ "movq (%2, %%"REG_a"), %%mm3 \n\t"
+ "packuswb %%mm5, %%mm4 \n\t"
+ "pcmpeqd %%mm2, %%mm2 \n\t"
+ "paddb %%mm2, %%mm2 \n\t"
+ OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2)
+ "movq %%mm5, (%2, %%"REG_a") \n\t"
+ "add %3, %%"REG_a" \n\t"
+
+ "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
+ "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm2, %%mm4 \n\t"
+ "paddusw %%mm3, %%mm5 \n\t"
+ "paddusw %%mm6, %%mm0 \n\t"
+ "paddusw %%mm6, %%mm1 \n\t"
+ "paddusw %%mm4, %%mm0 \n\t"
+ "paddusw %%mm5, %%mm1 \n\t"
+ "psrlw $2, %%mm0 \n\t"
+ "psrlw $2, %%mm1 \n\t"
+ "movq (%2, %%"REG_a"), %%mm3 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "pcmpeqd %%mm2, %%mm2 \n\t"
+ "paddb %%mm2, %%mm2 \n\t"
+ OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2)
+ "movq %%mm1, (%2, %%"REG_a") \n\t"
+ "add %3, %%"REG_a" \n\t"
+
+ "subl $2, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels)
+ :"D"(block), "r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
+
+//FIXME optimize
+static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
+ DEF(put, pixels8_xy2)(block , pixels , line_size, h);
+ DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
+}
+
+static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
+ DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
+ DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
+}
diff --git a/ffmpeg/libavcodec/x86/dsputilenc.asm b/ffmpeg/libavcodec/x86/dsputilenc.asm
new file mode 100644
index 0000000..1839bee
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/dsputilenc.asm
@@ -0,0 +1,487 @@
+;*****************************************************************************
+;* MMX optimized DSP utils
+;*****************************************************************************
+;* Copyright (c) 2000, 2001 Fabrice Bellard
+;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*****************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro DIFF_PIXELS_1 4
+ movh %1, %3
+ movh %2, %4
+ punpcklbw %2, %1
+ punpcklbw %1, %1
+ psubw %1, %2
+%endmacro
+
+; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
+; %6=temporary storage location
+; this macro requires mmsize bytes of aligned stack space at %6 (except on SSE + x86-64)
+%macro DIFF_PIXELS_8 6
+ DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3]
+ DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3]
+ DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
+ add %1, %5
+ add %2, %5
+ DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3]
+ DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3]
+ DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
+ DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3]
+%ifdef m8
+ DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
+%else
+ mova [%6], m0
+ DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
+ mova m0, [%6]
+%endif
+ sub %1, %5
+ sub %2, %5
+%endmacro
+
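+; For reference (sketch): the HADAMARD8 macro below runs three butterfly stages
+; over m0..m7. Each SUMSUB_BADC pass rewrites two register pairs (a, b) as
+; their sum and difference; with pair distances 1, 2 and 4 this is an 8-point
+; Hadamard transform, so every output word is a +/-1 combination of all eight
+; inputs. It is applied once per dimension, with a transpose in between, in
+; the hadamard8x8_diff routines further down.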
+%macro HADAMARD8 0
+ SUMSUB_BADC w, 0, 1, 2, 3
+ SUMSUB_BADC w, 4, 5, 6, 7
+ SUMSUB_BADC w, 0, 2, 1, 3
+ SUMSUB_BADC w, 4, 6, 5, 7
+ SUMSUB_BADC w, 0, 4, 1, 5
+ SUMSUB_BADC w, 2, 6, 3, 7
+%endmacro
+
+%macro ABS1_SUM 3
+ ABS1 %1, %2
+ paddusw %3, %1
+%endmacro
+
+%macro ABS2_SUM 6
+ ABS2 %1, %2, %3, %4
+ paddusw %5, %1
+ paddusw %6, %2
+%endmacro
+
+%macro ABS_SUM_8x8_64 1
+ ABS2 m0, m1, m8, m9
+ ABS2_SUM m2, m3, m8, m9, m0, m1
+ ABS2_SUM m4, m5, m8, m9, m0, m1
+ ABS2_SUM m6, m7, m8, m9, m0, m1
+ paddusw m0, m1
+%endmacro
+
+%macro ABS_SUM_8x8_32 1
+ mova [%1], m7
+ ABS1 m0, m7
+ ABS1 m1, m7
+ ABS1_SUM m2, m7, m0
+ ABS1_SUM m3, m7, m1
+ ABS1_SUM m4, m7, m0
+ ABS1_SUM m5, m7, m1
+ ABS1_SUM m6, m7, m0
+ mova m2, [%1]
+ ABS1_SUM m2, m7, m1
+ paddusw m0, m1
+%endmacro
+
+; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
+; about 100k on extreme inputs. But that is very unlikely to occur in natural video,
+; and it is even less likely that no alternative mv/mode with a lower cost is available.
+%macro HSUM 3
+%if cpuflag(sse2)
+ movhlps %2, %1
+ paddusw %1, %2
+ pshuflw %2, %1, 0xE
+ paddusw %1, %2
+ pshuflw %2, %1, 0x1
+ paddusw %1, %2
+ movd %3, %1
+%elif cpuflag(mmxext)
+ pshufw %2, %1, 0xE
+ paddusw %1, %2
+ pshufw %2, %1, 0x1
+ paddusw %1, %2
+ movd %3, %1
+%elif cpuflag(mmx)
+ mova %2, %1
+ psrlq %1, 32
+ paddusw %1, %2
+ mova %2, %1
+ psrlq %1, 16
+ paddusw %1, %2
+ movd %3, %1
+%endif
+%endmacro
+
+%macro STORE4 5
+ mova [%1+mmsize*0], %2
+ mova [%1+mmsize*1], %3
+ mova [%1+mmsize*2], %4
+ mova [%1+mmsize*3], %5
+%endmacro
+
+%macro LOAD4 5
+ mova %2, [%1+mmsize*0]
+ mova %3, [%1+mmsize*1]
+ mova %4, [%1+mmsize*2]
+ mova %5, [%1+mmsize*3]
+%endmacro
+
+%macro hadamard8_16_wrapper 2
+cglobal hadamard8_diff, 4, 4, %1
+%ifndef m8
+ %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
+ SUB rsp, pad
+%endif
+ call hadamard8x8_diff %+ SUFFIX
+%ifndef m8
+ ADD rsp, pad
+%endif
+ RET
+
+cglobal hadamard8_diff16, 5, 6, %1
+%ifndef m8
+ %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
+ SUB rsp, pad
+%endif
+
+ call hadamard8x8_diff %+ SUFFIX
+ mov r5d, eax
+
+ add r1, 8
+ add r2, 8
+ call hadamard8x8_diff %+ SUFFIX
+ add r5d, eax
+
+ cmp r4d, 16
+ jne .done
+
+ lea r1, [r1+r3*8-8]
+ lea r2, [r2+r3*8-8]
+ call hadamard8x8_diff %+ SUFFIX
+ add r5d, eax
+
+ add r1, 8
+ add r2, 8
+ call hadamard8x8_diff %+ SUFFIX
+ add r5d, eax
+
+.done:
+ mov eax, r5d
+%ifndef m8
+ ADD rsp, pad
+%endif
+ RET
+%endmacro
+
+%macro HADAMARD8_DIFF 0-1
+%if cpuflag(sse2)
+hadamard8x8_diff %+ SUFFIX:
+ lea r0, [r3*3]
+ DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize
+ HADAMARD8
+%if ARCH_X86_64
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+%else
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
+%endif
+ HADAMARD8
+ ABS_SUM_8x8 rsp+gprsize
+ HSUM m0, m1, eax
+ and eax, 0xFFFF
+ ret
+
+hadamard8_16_wrapper %1, 3
+%elif cpuflag(mmx)
+ALIGN 16
+; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
+; int stride, int h)
+; r0 = void *s = unused, int h = unused (always 8)
+; note how r1, r2 and r3 are not clobbered in this function, so the 16x16
+; variant can simply call this four times (2x2); that is also why we access
+; rsp+gprsize everywhere, which is the rsp of the calling function
+hadamard8x8_diff %+ SUFFIX:
+ lea r0, [r3*3]
+
+ ; first 4x8 pixels
+ DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60
+ HADAMARD8
+ mova [rsp+gprsize+0x60], m7
+ TRANSPOSE4x4W 0, 1, 2, 3, 7
+ STORE4 rsp+gprsize, m0, m1, m2, m3
+ mova m7, [rsp+gprsize+0x60]
+ TRANSPOSE4x4W 4, 5, 6, 7, 0
+ STORE4 rsp+gprsize+0x40, m4, m5, m6, m7
+
+ ; second 4x8 pixels
+ DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60
+ HADAMARD8
+ mova [rsp+gprsize+0x60], m7
+ TRANSPOSE4x4W 0, 1, 2, 3, 7
+ STORE4 rsp+gprsize+0x20, m0, m1, m2, m3
+ mova m7, [rsp+gprsize+0x60]
+ TRANSPOSE4x4W 4, 5, 6, 7, 0
+
+ LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3
+ HADAMARD8
+ ABS_SUM_8x8_32 rsp+gprsize+0x60
+ mova [rsp+gprsize+0x60], m0
+
+ LOAD4 rsp+gprsize , m0, m1, m2, m3
+ LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7
+ HADAMARD8
+ ABS_SUM_8x8_32 rsp+gprsize
+ paddusw m0, [rsp+gprsize+0x60]
+
+ HSUM m0, m1, eax
+ and rax, 0xFFFF
+ ret
+
+hadamard8_16_wrapper 0, 14
+%endif
+%endmacro
+
+INIT_MMX mmx
+HADAMARD8_DIFF
+
+INIT_MMX mmxext
+HADAMARD8_DIFF
+
+INIT_XMM sse2
+%if ARCH_X86_64
+%define ABS_SUM_8x8 ABS_SUM_8x8_64
+%else
+%define ABS_SUM_8x8 ABS_SUM_8x8_32
+%endif
+HADAMARD8_DIFF 10
+
+INIT_XMM ssse3
+%define ABS_SUM_8x8 ABS_SUM_8x8_64
+HADAMARD8_DIFF 9
+
+INIT_XMM sse2
+; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
+cglobal sse16, 5, 5, 8
+ shr r4d, 1
+ pxor m0, m0 ; mm0 = 0
+ pxor m7, m7 ; mm7 holds the sum
+
+.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
+ movu m1, [r1 ] ; mm1 = pix1[0][0-15]
+ movu m2, [r2 ] ; mm2 = pix2[0][0-15]
+ movu m3, [r1+r3] ; mm3 = pix1[1][0-15]
+ movu m4, [r2+r3] ; mm4 = pix2[1][0-15]
+
+ ; todo: mm1-mm2, mm3-mm4
+ ; algo: subtract mm1 from mm2 with saturation and vice versa
+ ; OR the result to get the absolute difference
+ mova m5, m1
+ mova m6, m3
+ psubusb m1, m2
+ psubusb m3, m4
+ psubusb m2, m5
+ psubusb m4, m6
+
+ por m2, m1
+ por m4, m3
+
+ ; now convert to 16-bit vectors so we can square them
+ mova m1, m2
+ mova m3, m4
+
+ punpckhbw m2, m0
+ punpckhbw m4, m0
+    punpcklbw m1, m0 ; mm1 now spread over (mm1,mm2)
+    punpcklbw m3, m0 ; mm4 now spread over (mm3,mm4)
+
+ pmaddwd m2, m2
+ pmaddwd m4, m4
+ pmaddwd m1, m1
+ pmaddwd m3, m3
+
+ lea r1, [r1+r3*2] ; pix1 += 2*line_size
+ lea r2, [r2+r3*2] ; pix2 += 2*line_size
+
+ paddd m1, m2
+ paddd m3, m4
+ paddd m7, m1
+ paddd m7, m3
+
+ dec r4
+ jnz .next2lines
+
+ mova m1, m7
+ psrldq m7, 8 ; shift hi qword to lo
+ paddd m7, m1
+ mova m1, m7
+ psrldq m7, 4 ; shift hi dword to lo
+ paddd m7, m1
+ movd eax, m7 ; return value
+ RET
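+
+; For reference, a scalar C sketch of what the loop above computes, using the
+; parameter names from the prototype comment (sum of squared differences over
+; a 16-pixel-wide block):
+;     int sum = 0;
+;     for (int y = 0; y < h; y++)
+;         for (int x = 0; x < 16; x++) {
+;             int d = pix1[y * line_size + x] - pix2[y * line_size + x];
+;             sum += d * d;
+;         }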
+
+INIT_MMX mmx
+; get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
+cglobal get_pixels, 3,4
+ movsxdifnidn r2, r2d
+ add r0, 128
+ mov r3, -128
+ pxor m7, m7
+.loop:
+ mova m0, [r1]
+ mova m2, [r1+r2]
+ mova m1, m0
+ mova m3, m2
+ punpcklbw m0, m7
+ punpckhbw m1, m7
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ mova [r0+r3+ 0], m0
+ mova [r0+r3+ 8], m1
+ mova [r0+r3+16], m2
+ mova [r0+r3+24], m3
+ lea r1, [r1+r2*2]
+ add r3, 32
+ js .loop
+ REP_RET
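+
+; For reference, a scalar sketch of the conversion above: an 8x8 block of
+; unsigned bytes widened to signed 16-bit words (names as in the prototype
+; comment):
+;     for (int i = 0; i < 8; i++)
+;         for (int j = 0; j < 8; j++)
+;             block[i * 8 + j] = pixels[i * line_size + j];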
+
+INIT_XMM sse2
+cglobal get_pixels, 3, 4
+ movsxdifnidn r2, r2d
+ lea r3, [r2*3]
+ pxor m4, m4
+ movh m0, [r1]
+ movh m1, [r1+r2]
+ movh m2, [r1+r2*2]
+ movh m3, [r1+r3]
+ lea r1, [r1+r2*4]
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+ mova [r0], m0
+ mova [r0+0x10], m1
+ mova [r0+0x20], m2
+ mova [r0+0x30], m3
+ movh m0, [r1]
+ movh m1, [r1+r2*1]
+ movh m2, [r1+r2*2]
+ movh m3, [r1+r3]
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+ mova [r0+0x40], m0
+ mova [r0+0x50], m1
+ mova [r0+0x60], m2
+ mova [r0+0x70], m3
+ RET
+
+INIT_MMX mmx
+; diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride)
+cglobal diff_pixels, 4,5
+ movsxdifnidn r3, r3d
+ pxor m7, m7
+ add r0, 128
+ mov r4, -128
+.loop:
+ mova m0, [r1]
+ mova m2, [r2]
+ mova m1, m0
+ mova m3, m2
+ punpcklbw m0, m7
+ punpckhbw m1, m7
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ psubw m0, m2
+ psubw m1, m3
+ mova [r0+r4+0], m0
+ mova [r0+r4+8], m1
+ add r1, r3
+ add r2, r3
+ add r4, 16
+ jne .loop
+ REP_RET
+
+INIT_MMX mmx
+; pix_sum16_mmx(uint8_t * pix, int line_size)
+cglobal pix_sum16, 2, 3
+ movsxdifnidn r1, r1d
+ mov r2, r1
+ neg r2
+ shl r2, 4
+ sub r0, r2
+ pxor m7, m7
+ pxor m6, m6
+.loop:
+ mova m0, [r0+r2+0]
+ mova m1, [r0+r2+0]
+ mova m2, [r0+r2+8]
+ mova m3, [r0+r2+8]
+ punpcklbw m0, m7
+ punpckhbw m1, m7
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ paddw m1, m0
+ paddw m3, m2
+ paddw m3, m1
+ paddw m6, m3
+ add r2, r1
+ js .loop
+ mova m5, m6
+ psrlq m6, 32
+ paddw m6, m5
+ mova m5, m6
+ psrlq m6, 16
+ paddw m6, m5
+ movd eax, m6
+ and eax, 0xffff
+ RET
+
+INIT_MMX mmx
+; pix_norm1_mmx(uint8_t *pix, int line_size)
+cglobal pix_norm1, 2, 4
+ movsxdifnidn r1, r1d
+ mov r2, 16
+ pxor m0, m0
+ pxor m7, m7
+.loop:
+ mova m2, [r0+0]
+ mova m3, [r0+8]
+ mova m1, m2
+ punpckhbw m1, m0
+ punpcklbw m2, m0
+ mova m4, m3
+ punpckhbw m3, m0
+ punpcklbw m4, m0
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ pmaddwd m4, m4
+ paddd m2, m1
+ paddd m4, m3
+ paddd m7, m2
+ add r0, r1
+ paddd m7, m4
+ dec r2
+ jne .loop
+ mova m1, m7
+ psrlq m7, 32
+ paddd m1, m7
+ movd eax, m1
+ RET
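+
+; For reference, pix_norm1 above is the sum of squared pixel values over a
+; 16x16 block; a scalar sketch with the names from the prototype comment:
+;     int sum = 0;
+;     for (int i = 0; i < 16; i++)
+;         for (int j = 0; j < 16; j++)
+;             sum += pix[i * line_size + j] * pix[i * line_size + j];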
+
diff --git a/ffmpeg/libavcodec/x86/dsputilenc_mmx.c b/ffmpeg/libavcodec/x86/dsputilenc_mmx.c
new file mode 100644
index 0000000..a3f268e
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/dsputilenc_mmx.c
@@ -0,0 +1,1060 @@
+/*
+ * MMX optimized DSP utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/dct.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mpegvideo.h"
+#include "libavcodec/mathops.h"
+#include "dsputil_mmx.h"
+
+void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
+void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
+void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride);
+int ff_pix_sum16_mmx(uint8_t * pix, int line_size);
+int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
+
+#if HAVE_INLINE_ASM
+
+static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+ int tmp;
+ __asm__ volatile (
+ "movl %4,%%ecx\n"
+ "shr $1,%%ecx\n"
+ "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
+ "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
+ "1:\n"
+ "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
+ "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
+ "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
+ "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
+
+ /* todo: mm1-mm2, mm3-mm4 */
+ /* algo: subtract mm1 from mm2 with saturation and vice versa */
+ /* OR the results to get absolute difference */
+ "movq %%mm1,%%mm5\n"
+ "movq %%mm3,%%mm6\n"
+ "psubusb %%mm2,%%mm1\n"
+ "psubusb %%mm4,%%mm3\n"
+ "psubusb %%mm5,%%mm2\n"
+ "psubusb %%mm6,%%mm4\n"
+
+ "por %%mm1,%%mm2\n"
+ "por %%mm3,%%mm4\n"
+
+ /* now convert to 16-bit vectors so we can square them */
+ "movq %%mm2,%%mm1\n"
+ "movq %%mm4,%%mm3\n"
+
+ "punpckhbw %%mm0,%%mm2\n"
+ "punpckhbw %%mm0,%%mm4\n"
+ "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
+ "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
+
+ "pmaddwd %%mm2,%%mm2\n"
+ "pmaddwd %%mm4,%%mm4\n"
+ "pmaddwd %%mm1,%%mm1\n"
+ "pmaddwd %%mm3,%%mm3\n"
+
+ "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
+ "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
+
+ "paddd %%mm2,%%mm1\n"
+ "paddd %%mm4,%%mm3\n"
+ "paddd %%mm1,%%mm7\n"
+ "paddd %%mm3,%%mm7\n"
+
+ "decl %%ecx\n"
+ "jnz 1b\n"
+
+ "movq %%mm7,%%mm1\n"
+ "psrlq $32, %%mm7\n" /* shift hi dword to lo */
+ "paddd %%mm7,%%mm1\n"
+ "movd %%mm1,%2\n"
+ : "+r" (pix1), "+r" (pix2), "=r"(tmp)
+ : "r" ((x86_reg)line_size) , "m" (h)
+ : "%ecx");
+ return tmp;
+}
+
+static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+ int tmp;
+ __asm__ volatile (
+ "movl %4,%%ecx\n"
+ "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
+ "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
+ "1:\n"
+ "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
+ "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
+ "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
+ "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
+
+ /* todo: mm1-mm2, mm3-mm4 */
+ /* algo: subtract mm1 from mm2 with saturation and vice versa */
+ /* OR the results to get absolute difference */
+ "movq %%mm1,%%mm5\n"
+ "movq %%mm3,%%mm6\n"
+ "psubusb %%mm2,%%mm1\n"
+ "psubusb %%mm4,%%mm3\n"
+ "psubusb %%mm5,%%mm2\n"
+ "psubusb %%mm6,%%mm4\n"
+
+ "por %%mm1,%%mm2\n"
+ "por %%mm3,%%mm4\n"
+
+ /* now convert to 16-bit vectors so we can square them */
+ "movq %%mm2,%%mm1\n"
+ "movq %%mm4,%%mm3\n"
+
+ "punpckhbw %%mm0,%%mm2\n"
+ "punpckhbw %%mm0,%%mm4\n"
+ "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
+ "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
+
+ "pmaddwd %%mm2,%%mm2\n"
+ "pmaddwd %%mm4,%%mm4\n"
+ "pmaddwd %%mm1,%%mm1\n"
+ "pmaddwd %%mm3,%%mm3\n"
+
+ "add %3,%0\n"
+ "add %3,%1\n"
+
+ "paddd %%mm2,%%mm1\n"
+ "paddd %%mm4,%%mm3\n"
+ "paddd %%mm1,%%mm7\n"
+ "paddd %%mm3,%%mm7\n"
+
+ "decl %%ecx\n"
+ "jnz 1b\n"
+
+ "movq %%mm7,%%mm1\n"
+ "psrlq $32, %%mm7\n" /* shift hi dword to lo */
+ "paddd %%mm7,%%mm1\n"
+ "movd %%mm1,%2\n"
+ : "+r" (pix1), "+r" (pix2), "=r"(tmp)
+ : "r" ((x86_reg)line_size) , "m" (h)
+ : "%ecx");
+ return tmp;
+}
+
+static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
+ int tmp;
+ __asm__ volatile (
+ "movl %3,%%ecx\n"
+ "pxor %%mm7,%%mm7\n"
+ "pxor %%mm6,%%mm6\n"
+
+ "movq (%0),%%mm0\n"
+ "movq %%mm0, %%mm1\n"
+ "psllq $8, %%mm0\n"
+ "psrlq $8, %%mm1\n"
+ "psrlq $8, %%mm0\n"
+ "movq %%mm0, %%mm2\n"
+ "movq %%mm1, %%mm3\n"
+ "punpcklbw %%mm7,%%mm0\n"
+ "punpcklbw %%mm7,%%mm1\n"
+ "punpckhbw %%mm7,%%mm2\n"
+ "punpckhbw %%mm7,%%mm3\n"
+ "psubw %%mm1, %%mm0\n"
+ "psubw %%mm3, %%mm2\n"
+
+ "add %2,%0\n"
+
+ "movq (%0),%%mm4\n"
+ "movq %%mm4, %%mm1\n"
+ "psllq $8, %%mm4\n"
+ "psrlq $8, %%mm1\n"
+ "psrlq $8, %%mm4\n"
+ "movq %%mm4, %%mm5\n"
+ "movq %%mm1, %%mm3\n"
+ "punpcklbw %%mm7,%%mm4\n"
+ "punpcklbw %%mm7,%%mm1\n"
+ "punpckhbw %%mm7,%%mm5\n"
+ "punpckhbw %%mm7,%%mm3\n"
+ "psubw %%mm1, %%mm4\n"
+ "psubw %%mm3, %%mm5\n"
+ "psubw %%mm4, %%mm0\n"
+ "psubw %%mm5, %%mm2\n"
+ "pxor %%mm3, %%mm3\n"
+ "pxor %%mm1, %%mm1\n"
+ "pcmpgtw %%mm0, %%mm3\n\t"
+ "pcmpgtw %%mm2, %%mm1\n\t"
+ "pxor %%mm3, %%mm0\n"
+ "pxor %%mm1, %%mm2\n"
+ "psubw %%mm3, %%mm0\n"
+ "psubw %%mm1, %%mm2\n"
+ "paddw %%mm0, %%mm2\n"
+ "paddw %%mm2, %%mm6\n"
+
+ "add %2,%0\n"
+ "1:\n"
+
+ "movq (%0),%%mm0\n"
+ "movq %%mm0, %%mm1\n"
+ "psllq $8, %%mm0\n"
+ "psrlq $8, %%mm1\n"
+ "psrlq $8, %%mm0\n"
+ "movq %%mm0, %%mm2\n"
+ "movq %%mm1, %%mm3\n"
+ "punpcklbw %%mm7,%%mm0\n"
+ "punpcklbw %%mm7,%%mm1\n"
+ "punpckhbw %%mm7,%%mm2\n"
+ "punpckhbw %%mm7,%%mm3\n"
+ "psubw %%mm1, %%mm0\n"
+ "psubw %%mm3, %%mm2\n"
+ "psubw %%mm0, %%mm4\n"
+ "psubw %%mm2, %%mm5\n"
+ "pxor %%mm3, %%mm3\n"
+ "pxor %%mm1, %%mm1\n"
+ "pcmpgtw %%mm4, %%mm3\n\t"
+ "pcmpgtw %%mm5, %%mm1\n\t"
+ "pxor %%mm3, %%mm4\n"
+ "pxor %%mm1, %%mm5\n"
+ "psubw %%mm3, %%mm4\n"
+ "psubw %%mm1, %%mm5\n"
+ "paddw %%mm4, %%mm5\n"
+ "paddw %%mm5, %%mm6\n"
+
+ "add %2,%0\n"
+
+ "movq (%0),%%mm4\n"
+ "movq %%mm4, %%mm1\n"
+ "psllq $8, %%mm4\n"
+ "psrlq $8, %%mm1\n"
+ "psrlq $8, %%mm4\n"
+ "movq %%mm4, %%mm5\n"
+ "movq %%mm1, %%mm3\n"
+ "punpcklbw %%mm7,%%mm4\n"
+ "punpcklbw %%mm7,%%mm1\n"
+ "punpckhbw %%mm7,%%mm5\n"
+ "punpckhbw %%mm7,%%mm3\n"
+ "psubw %%mm1, %%mm4\n"
+ "psubw %%mm3, %%mm5\n"
+ "psubw %%mm4, %%mm0\n"
+ "psubw %%mm5, %%mm2\n"
+ "pxor %%mm3, %%mm3\n"
+ "pxor %%mm1, %%mm1\n"
+ "pcmpgtw %%mm0, %%mm3\n\t"
+ "pcmpgtw %%mm2, %%mm1\n\t"
+ "pxor %%mm3, %%mm0\n"
+ "pxor %%mm1, %%mm2\n"
+ "psubw %%mm3, %%mm0\n"
+ "psubw %%mm1, %%mm2\n"
+ "paddw %%mm0, %%mm2\n"
+ "paddw %%mm2, %%mm6\n"
+
+ "add %2,%0\n"
+ "subl $2, %%ecx\n"
+ " jnz 1b\n"
+
+ "movq %%mm6, %%mm0\n"
+ "punpcklwd %%mm7,%%mm0\n"
+ "punpckhwd %%mm7,%%mm6\n"
+ "paddd %%mm0, %%mm6\n"
+
+ "movq %%mm6,%%mm0\n"
+ "psrlq $32, %%mm6\n"
+ "paddd %%mm6,%%mm0\n"
+ "movd %%mm0,%1\n"
+ : "+r" (pix1), "=r"(tmp)
+ : "r" ((x86_reg)line_size) , "g" (h-2)
+ : "%ecx");
+ return tmp;
+}
+
+static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
+ int tmp;
+ uint8_t * pix= pix1;
+ __asm__ volatile (
+ "movl %3,%%ecx\n"
+ "pxor %%mm7,%%mm7\n"
+ "pxor %%mm6,%%mm6\n"
+
+ "movq (%0),%%mm0\n"
+ "movq 1(%0),%%mm1\n"
+ "movq %%mm0, %%mm2\n"
+ "movq %%mm1, %%mm3\n"
+ "punpcklbw %%mm7,%%mm0\n"
+ "punpcklbw %%mm7,%%mm1\n"
+ "punpckhbw %%mm7,%%mm2\n"
+ "punpckhbw %%mm7,%%mm3\n"
+ "psubw %%mm1, %%mm0\n"
+ "psubw %%mm3, %%mm2\n"
+
+ "add %2,%0\n"
+
+ "movq (%0),%%mm4\n"
+ "movq 1(%0),%%mm1\n"
+ "movq %%mm4, %%mm5\n"
+ "movq %%mm1, %%mm3\n"
+ "punpcklbw %%mm7,%%mm4\n"
+ "punpcklbw %%mm7,%%mm1\n"
+ "punpckhbw %%mm7,%%mm5\n"
+ "punpckhbw %%mm7,%%mm3\n"
+ "psubw %%mm1, %%mm4\n"
+ "psubw %%mm3, %%mm5\n"
+ "psubw %%mm4, %%mm0\n"
+ "psubw %%mm5, %%mm2\n"
+ "pxor %%mm3, %%mm3\n"
+ "pxor %%mm1, %%mm1\n"
+ "pcmpgtw %%mm0, %%mm3\n\t"
+ "pcmpgtw %%mm2, %%mm1\n\t"
+ "pxor %%mm3, %%mm0\n"
+ "pxor %%mm1, %%mm2\n"
+ "psubw %%mm3, %%mm0\n"
+ "psubw %%mm1, %%mm2\n"
+ "paddw %%mm0, %%mm2\n"
+ "paddw %%mm2, %%mm6\n"
+
+ "add %2,%0\n"
+ "1:\n"
+
+ "movq (%0),%%mm0\n"
+ "movq 1(%0),%%mm1\n"
+ "movq %%mm0, %%mm2\n"
+ "movq %%mm1, %%mm3\n"
+ "punpcklbw %%mm7,%%mm0\n"
+ "punpcklbw %%mm7,%%mm1\n"
+ "punpckhbw %%mm7,%%mm2\n"
+ "punpckhbw %%mm7,%%mm3\n"
+ "psubw %%mm1, %%mm0\n"
+ "psubw %%mm3, %%mm2\n"
+ "psubw %%mm0, %%mm4\n"
+ "psubw %%mm2, %%mm5\n"
+ "pxor %%mm3, %%mm3\n"
+ "pxor %%mm1, %%mm1\n"
+ "pcmpgtw %%mm4, %%mm3\n\t"
+ "pcmpgtw %%mm5, %%mm1\n\t"
+ "pxor %%mm3, %%mm4\n"
+ "pxor %%mm1, %%mm5\n"
+ "psubw %%mm3, %%mm4\n"
+ "psubw %%mm1, %%mm5\n"
+ "paddw %%mm4, %%mm5\n"
+ "paddw %%mm5, %%mm6\n"
+
+ "add %2,%0\n"
+
+ "movq (%0),%%mm4\n"
+ "movq 1(%0),%%mm1\n"
+ "movq %%mm4, %%mm5\n"
+ "movq %%mm1, %%mm3\n"
+ "punpcklbw %%mm7,%%mm4\n"
+ "punpcklbw %%mm7,%%mm1\n"
+ "punpckhbw %%mm7,%%mm5\n"
+ "punpckhbw %%mm7,%%mm3\n"
+ "psubw %%mm1, %%mm4\n"
+ "psubw %%mm3, %%mm5\n"
+ "psubw %%mm4, %%mm0\n"
+ "psubw %%mm5, %%mm2\n"
+ "pxor %%mm3, %%mm3\n"
+ "pxor %%mm1, %%mm1\n"
+ "pcmpgtw %%mm0, %%mm3\n\t"
+ "pcmpgtw %%mm2, %%mm1\n\t"
+ "pxor %%mm3, %%mm0\n"
+ "pxor %%mm1, %%mm2\n"
+ "psubw %%mm3, %%mm0\n"
+ "psubw %%mm1, %%mm2\n"
+ "paddw %%mm0, %%mm2\n"
+ "paddw %%mm2, %%mm6\n"
+
+ "add %2,%0\n"
+ "subl $2, %%ecx\n"
+ " jnz 1b\n"
+
+ "movq %%mm6, %%mm0\n"
+ "punpcklwd %%mm7,%%mm0\n"
+ "punpckhwd %%mm7,%%mm6\n"
+ "paddd %%mm0, %%mm6\n"
+
+ "movq %%mm6,%%mm0\n"
+ "psrlq $32, %%mm6\n"
+ "paddd %%mm6,%%mm0\n"
+ "movd %%mm0,%1\n"
+ : "+r" (pix1), "=r"(tmp)
+ : "r" ((x86_reg)line_size) , "g" (h-2)
+ : "%ecx");
+ return tmp + hf_noise8_mmx(pix+8, line_size, h);
+}
+
+static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+ MpegEncContext *c = p;
+ int score1, score2;
+
+ if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
+ else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
+ score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
+
+ if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
+ else return score1 + FFABS(score2)*8;
+}
+
+static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+ MpegEncContext *c = p;
+ int score1= sse8_mmx(c, pix1, pix2, line_size, h);
+ int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
+
+ if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
+ else return score1 + FFABS(score2)*8;
+}
+
+static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
+ int tmp;
+
+ av_assert2( (((int)pix) & 7) == 0);
+ av_assert2((line_size &7) ==0);
+
+#define SUM(in0, in1, out0, out1) \
+ "movq (%0), %%mm2\n"\
+ "movq 8(%0), %%mm3\n"\
+ "add %2,%0\n"\
+ "movq %%mm2, " #out0 "\n"\
+ "movq %%mm3, " #out1 "\n"\
+ "psubusb " #in0 ", %%mm2\n"\
+ "psubusb " #in1 ", %%mm3\n"\
+ "psubusb " #out0 ", " #in0 "\n"\
+ "psubusb " #out1 ", " #in1 "\n"\
+ "por %%mm2, " #in0 "\n"\
+ "por %%mm3, " #in1 "\n"\
+ "movq " #in0 ", %%mm2\n"\
+ "movq " #in1 ", %%mm3\n"\
+ "punpcklbw %%mm7, " #in0 "\n"\
+ "punpcklbw %%mm7, " #in1 "\n"\
+ "punpckhbw %%mm7, %%mm2\n"\
+ "punpckhbw %%mm7, %%mm3\n"\
+ "paddw " #in1 ", " #in0 "\n"\
+ "paddw %%mm3, %%mm2\n"\
+ "paddw %%mm2, " #in0 "\n"\
+ "paddw " #in0 ", %%mm6\n"
+
+
+ __asm__ volatile (
+ "movl %3,%%ecx\n"
+ "pxor %%mm6,%%mm6\n"
+ "pxor %%mm7,%%mm7\n"
+ "movq (%0),%%mm0\n"
+ "movq 8(%0),%%mm1\n"
+ "add %2,%0\n"
+ "jmp 2f\n"
+ "1:\n"
+
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+ "2:\n"
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+ "subl $2, %%ecx\n"
+ "jnz 1b\n"
+
+ "movq %%mm6,%%mm0\n"
+ "psrlq $32, %%mm6\n"
+ "paddw %%mm6,%%mm0\n"
+ "movq %%mm0,%%mm6\n"
+ "psrlq $16, %%mm0\n"
+ "paddw %%mm6,%%mm0\n"
+ "movd %%mm0,%1\n"
+ : "+r" (pix), "=r"(tmp)
+ : "r" ((x86_reg)line_size) , "m" (h)
+ : "%ecx");
+ return tmp & 0xFFFF;
+}
+#undef SUM
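+
+/* For reference, a scalar sketch of vsad_intra16 above: the sum of absolute
+ * differences between each line and the line above it, over a 16-pixel-wide
+ * block (names as in the function signature):
+ *
+ *     int score = 0;
+ *     for (int y = 1; y < h; y++)
+ *         for (int x = 0; x < 16; x++)
+ *             score += abs(pix[y * line_size + x] - pix[(y - 1) * line_size + x]);
+ */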
+
+static int vsad_intra16_mmxext(void *v, uint8_t *pix, uint8_t *dummy,
+ int line_size, int h)
+{
+ int tmp;
+
+ av_assert2( (((int)pix) & 7) == 0);
+ av_assert2((line_size &7) ==0);
+
+#define SUM(in0, in1, out0, out1) \
+ "movq (%0), " #out0 "\n"\
+ "movq 8(%0), " #out1 "\n"\
+ "add %2,%0\n"\
+ "psadbw " #out0 ", " #in0 "\n"\
+ "psadbw " #out1 ", " #in1 "\n"\
+ "paddw " #in1 ", " #in0 "\n"\
+ "paddw " #in0 ", %%mm6\n"
+
+ __asm__ volatile (
+ "movl %3,%%ecx\n"
+ "pxor %%mm6,%%mm6\n"
+ "pxor %%mm7,%%mm7\n"
+ "movq (%0),%%mm0\n"
+ "movq 8(%0),%%mm1\n"
+ "add %2,%0\n"
+ "jmp 2f\n"
+ "1:\n"
+
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+ "2:\n"
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+ "subl $2, %%ecx\n"
+ "jnz 1b\n"
+
+ "movd %%mm6,%1\n"
+ : "+r" (pix), "=r"(tmp)
+ : "r" ((x86_reg)line_size) , "m" (h)
+ : "%ecx");
+ return tmp;
+}
+#undef SUM
+
+static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+ int tmp;
+
+ av_assert2( (((int)pix1) & 7) == 0);
+ av_assert2( (((int)pix2) & 7) == 0);
+ av_assert2((line_size &7) ==0);
+
+#define SUM(in0, in1, out0, out1) \
+ "movq (%0),%%mm2\n"\
+ "movq (%1)," #out0 "\n"\
+ "movq 8(%0),%%mm3\n"\
+ "movq 8(%1)," #out1 "\n"\
+ "add %3,%0\n"\
+ "add %3,%1\n"\
+ "psubb " #out0 ", %%mm2\n"\
+ "psubb " #out1 ", %%mm3\n"\
+ "pxor %%mm7, %%mm2\n"\
+ "pxor %%mm7, %%mm3\n"\
+ "movq %%mm2, " #out0 "\n"\
+ "movq %%mm3, " #out1 "\n"\
+ "psubusb " #in0 ", %%mm2\n"\
+ "psubusb " #in1 ", %%mm3\n"\
+ "psubusb " #out0 ", " #in0 "\n"\
+ "psubusb " #out1 ", " #in1 "\n"\
+ "por %%mm2, " #in0 "\n"\
+ "por %%mm3, " #in1 "\n"\
+ "movq " #in0 ", %%mm2\n"\
+ "movq " #in1 ", %%mm3\n"\
+ "punpcklbw %%mm7, " #in0 "\n"\
+ "punpcklbw %%mm7, " #in1 "\n"\
+ "punpckhbw %%mm7, %%mm2\n"\
+ "punpckhbw %%mm7, %%mm3\n"\
+ "paddw " #in1 ", " #in0 "\n"\
+ "paddw %%mm3, %%mm2\n"\
+ "paddw %%mm2, " #in0 "\n"\
+ "paddw " #in0 ", %%mm6\n"
+
+
+ __asm__ volatile (
+ "movl %4,%%ecx\n"
+ "pxor %%mm6,%%mm6\n"
+ "pcmpeqw %%mm7,%%mm7\n"
+ "psllw $15, %%mm7\n"
+ "packsswb %%mm7, %%mm7\n"
+ "movq (%0),%%mm0\n"
+ "movq (%1),%%mm2\n"
+ "movq 8(%0),%%mm1\n"
+ "movq 8(%1),%%mm3\n"
+ "add %3,%0\n"
+ "add %3,%1\n"
+ "psubb %%mm2, %%mm0\n"
+ "psubb %%mm3, %%mm1\n"
+ "pxor %%mm7, %%mm0\n"
+ "pxor %%mm7, %%mm1\n"
+ "jmp 2f\n"
+ "1:\n"
+
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+ "2:\n"
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+ "subl $2, %%ecx\n"
+ "jnz 1b\n"
+
+ "movq %%mm6,%%mm0\n"
+ "psrlq $32, %%mm6\n"
+ "paddw %%mm6,%%mm0\n"
+ "movq %%mm0,%%mm6\n"
+ "psrlq $16, %%mm0\n"
+ "paddw %%mm6,%%mm0\n"
+ "movd %%mm0,%2\n"
+ : "+r" (pix1), "+r" (pix2), "=r"(tmp)
+ : "r" ((x86_reg)line_size) , "m" (h)
+ : "%ecx");
+ return tmp & 0x7FFF;
+}
+#undef SUM
+
+static int vsad16_mmxext(void *v, uint8_t *pix1, uint8_t *pix2,
+ int line_size, int h)
+{
+ int tmp;
+
+ av_assert2( (((int)pix1) & 7) == 0);
+ av_assert2( (((int)pix2) & 7) == 0);
+ av_assert2((line_size &7) ==0);
+
+#define SUM(in0, in1, out0, out1) \
+ "movq (%0)," #out0 "\n"\
+ "movq (%1),%%mm2\n"\
+ "movq 8(%0)," #out1 "\n"\
+ "movq 8(%1),%%mm3\n"\
+ "add %3,%0\n"\
+ "add %3,%1\n"\
+ "psubb %%mm2, " #out0 "\n"\
+ "psubb %%mm3, " #out1 "\n"\
+ "pxor %%mm7, " #out0 "\n"\
+ "pxor %%mm7, " #out1 "\n"\
+ "psadbw " #out0 ", " #in0 "\n"\
+ "psadbw " #out1 ", " #in1 "\n"\
+ "paddw " #in1 ", " #in0 "\n"\
+ "paddw " #in0 ", %%mm6\n"
+
+ __asm__ volatile (
+ "movl %4,%%ecx\n"
+ "pxor %%mm6,%%mm6\n"
+ "pcmpeqw %%mm7,%%mm7\n"
+ "psllw $15, %%mm7\n"
+ "packsswb %%mm7, %%mm7\n"
+ "movq (%0),%%mm0\n"
+ "movq (%1),%%mm2\n"
+ "movq 8(%0),%%mm1\n"
+ "movq 8(%1),%%mm3\n"
+ "add %3,%0\n"
+ "add %3,%1\n"
+ "psubb %%mm2, %%mm0\n"
+ "psubb %%mm3, %%mm1\n"
+ "pxor %%mm7, %%mm0\n"
+ "pxor %%mm7, %%mm1\n"
+ "jmp 2f\n"
+ "1:\n"
+
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+ "2:\n"
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+ "subl $2, %%ecx\n"
+ "jnz 1b\n"
+
+ "movd %%mm6,%2\n"
+ : "+r" (pix1), "+r" (pix2), "=r"(tmp)
+ : "r" ((x86_reg)line_size) , "m" (h)
+ : "%ecx");
+ return tmp;
+}
+#undef SUM
+
+static void diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
+ x86_reg i=0;
+ if(w>=16)
+ __asm__ volatile(
+ "1: \n\t"
+ "movq (%2, %0), %%mm0 \n\t"
+ "movq (%1, %0), %%mm1 \n\t"
+ "psubb %%mm0, %%mm1 \n\t"
+ "movq %%mm1, (%3, %0) \n\t"
+ "movq 8(%2, %0), %%mm0 \n\t"
+ "movq 8(%1, %0), %%mm1 \n\t"
+ "psubb %%mm0, %%mm1 \n\t"
+ "movq %%mm1, 8(%3, %0) \n\t"
+ "add $16, %0 \n\t"
+ "cmp %4, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (i)
+ : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
+ );
+ for(; i<w; i++)
+ dst[i+0] = src1[i+0]-src2[i+0];
+}
+
+static void sub_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *src1,
+ const uint8_t *src2, int w,
+ int *left, int *left_top)
+{
+ x86_reg i=0;
+ uint8_t l, lt;
+
+ __asm__ volatile(
+ "movq (%1, %0), %%mm0 \n\t" // LT
+ "psllq $8, %%mm0 \n\t"
+ "1: \n\t"
+ "movq (%1, %0), %%mm1 \n\t" // T
+ "movq -1(%2, %0), %%mm2 \n\t" // L
+ "movq (%2, %0), %%mm3 \n\t" // X
+ "movq %%mm2, %%mm4 \n\t" // L
+ "psubb %%mm0, %%mm2 \n\t"
+ "paddb %%mm1, %%mm2 \n\t" // L + T - LT
+ "movq %%mm4, %%mm5 \n\t" // L
+ "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
+ "pminub %%mm5, %%mm1 \n\t" // min(T, L)
+ "pminub %%mm2, %%mm4 \n\t"
+ "pmaxub %%mm1, %%mm4 \n\t"
+ "psubb %%mm4, %%mm3 \n\t" // dst - pred
+ "movq %%mm3, (%3, %0) \n\t"
+ "add $8, %0 \n\t"
+ "movq -1(%1, %0), %%mm0 \n\t" // LT
+ "cmp %4, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (i)
+ : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
+ );
+
+ l= *left;
+ lt= *left_top;
+
+ dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
+
+ *left_top= src1[w-1];
+ *left = src2[w-1];
+}
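+
+/* For reference, a scalar sketch of the prediction subtracted above, matching
+ * the mid_pred() fixup used for element 0: each output byte is the input
+ * minus the median of left, top and left+top-topleft (wrapping byte
+ * arithmetic, as in the asm):
+ *
+ *     for (int i = 1; i < w; i++) {
+ *         int l = src2[i - 1], t = src1[i], lt = src1[i - 1];
+ *         dst[i] = src2[i] - mid_pred(l, t, (l + t - lt) & 0xFF);
+ *     }
+ */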
+
+#define MMABS_MMX(a,z)\
+ "pxor " #z ", " #z " \n\t"\
+ "pcmpgtw " #a ", " #z " \n\t"\
+ "pxor " #z ", " #a " \n\t"\
+ "psubw " #z ", " #a " \n\t"
+
+#define MMABS_MMXEXT(a, z) \
+ "pxor " #z ", " #z " \n\t"\
+ "psubw " #a ", " #z " \n\t"\
+ "pmaxsw " #z ", " #a " \n\t"
+
+#define MMABS_SSSE3(a,z)\
+ "pabsw " #a ", " #a " \n\t"
+
+#define MMABS_SUM(a,z, sum)\
+ MMABS(a,z)\
+ "paddusw " #a ", " #sum " \n\t"
+
+/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
+ * about 100k on extreme inputs. But that is very unlikely to occur in natural video,
+ * and it is even less likely that no alternative mv/mode with a lower cost is available. */
+#define HSUM_MMX(a, t, dst)\
+ "movq "#a", "#t" \n\t"\
+ "psrlq $32, "#a" \n\t"\
+ "paddusw "#t", "#a" \n\t"\
+ "movq "#a", "#t" \n\t"\
+ "psrlq $16, "#a" \n\t"\
+ "paddusw "#t", "#a" \n\t"\
+ "movd "#a", "#dst" \n\t"\
+
+#define HSUM_MMXEXT(a, t, dst) \
+ "pshufw $0x0E, "#a", "#t" \n\t"\
+ "paddusw "#t", "#a" \n\t"\
+ "pshufw $0x01, "#a", "#t" \n\t"\
+ "paddusw "#t", "#a" \n\t"\
+ "movd "#a", "#dst" \n\t"\
+
+#define HSUM_SSE2(a, t, dst)\
+ "movhlps "#a", "#t" \n\t"\
+ "paddusw "#t", "#a" \n\t"\
+ "pshuflw $0x0E, "#a", "#t" \n\t"\
+ "paddusw "#t", "#a" \n\t"\
+ "pshuflw $0x01, "#a", "#t" \n\t"\
+ "paddusw "#t", "#a" \n\t"\
+ "movd "#a", "#dst" \n\t"\
+
+#define DCT_SAD4(m,mm,o)\
+ "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
+ "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
+ "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
+ "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
+ MMABS_SUM(mm##2, mm##6, mm##0)\
+ MMABS_SUM(mm##3, mm##7, mm##1)\
+ MMABS_SUM(mm##4, mm##6, mm##0)\
+ MMABS_SUM(mm##5, mm##7, mm##1)\
+
+#define DCT_SAD_MMX\
+ "pxor %%mm0, %%mm0 \n\t"\
+ "pxor %%mm1, %%mm1 \n\t"\
+ DCT_SAD4(q, %%mm, 0)\
+ DCT_SAD4(q, %%mm, 8)\
+ DCT_SAD4(q, %%mm, 64)\
+ DCT_SAD4(q, %%mm, 72)\
+ "paddusw %%mm1, %%mm0 \n\t"\
+ HSUM(%%mm0, %%mm1, %0)
+
+#define DCT_SAD_SSE2\
+ "pxor %%xmm0, %%xmm0 \n\t"\
+ "pxor %%xmm1, %%xmm1 \n\t"\
+ DCT_SAD4(dqa, %%xmm, 0)\
+ DCT_SAD4(dqa, %%xmm, 64)\
+ "paddusw %%xmm1, %%xmm0 \n\t"\
+ HSUM(%%xmm0, %%xmm1, %0)
+
+#define DCT_SAD_FUNC(cpu) \
+static int sum_abs_dctelem_##cpu(int16_t *block){\
+ int sum;\
+ __asm__ volatile(\
+ DCT_SAD\
+ :"=r"(sum)\
+ :"r"(block)\
+ );\
+ return sum&0xFFFF;\
+}
+
+#define DCT_SAD DCT_SAD_MMX
+#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
+#define MMABS(a,z) MMABS_MMX(a,z)
+DCT_SAD_FUNC(mmx)
+#undef MMABS
+#undef HSUM
+
+#define HSUM(a,t,dst) HSUM_MMXEXT(a,t,dst)
+#define MMABS(a,z) MMABS_MMXEXT(a,z)
+DCT_SAD_FUNC(mmxext)
+#undef HSUM
+#undef DCT_SAD
+
+#define DCT_SAD DCT_SAD_SSE2
+#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
+DCT_SAD_FUNC(sse2)
+#undef MMABS
+
+#if HAVE_SSSE3_INLINE
+#define MMABS(a,z) MMABS_SSSE3(a,z)
+DCT_SAD_FUNC(ssse3)
+#undef MMABS
+#endif
+#undef HSUM
+#undef DCT_SAD
+
+static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
+ int sum;
+ x86_reg i=size;
+ __asm__ volatile(
+ "pxor %%mm4, %%mm4 \n"
+ "1: \n"
+ "sub $8, %0 \n"
+ "movq (%2,%0), %%mm2 \n"
+ "movq (%3,%0,2), %%mm0 \n"
+ "movq 8(%3,%0,2), %%mm1 \n"
+ "punpckhbw %%mm2, %%mm3 \n"
+ "punpcklbw %%mm2, %%mm2 \n"
+ "psraw $8, %%mm3 \n"
+ "psraw $8, %%mm2 \n"
+ "psubw %%mm3, %%mm1 \n"
+ "psubw %%mm2, %%mm0 \n"
+ "pmaddwd %%mm1, %%mm1 \n"
+ "pmaddwd %%mm0, %%mm0 \n"
+ "paddd %%mm1, %%mm4 \n"
+ "paddd %%mm0, %%mm4 \n"
+ "jg 1b \n"
+ "movq %%mm4, %%mm3 \n"
+ "psrlq $32, %%mm3 \n"
+ "paddd %%mm3, %%mm4 \n"
+ "movd %%mm4, %1 \n"
+ :"+r"(i), "=r"(sum)
+ :"r"(pix1), "r"(pix2)
+ );
+ return sum;
+}
+
+#define PHADDD(a, t)\
+ "movq "#a", "#t" \n\t"\
+ "psrlq $32, "#a" \n\t"\
+ "paddd "#t", "#a" \n\t"
+/*
+ pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
+ pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
+ pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
+ */
+#define PMULHRW(x, y, s, o)\
+ "pmulhw " #s ", "#x " \n\t"\
+ "pmulhw " #s ", "#y " \n\t"\
+ "paddw " #o ", "#x " \n\t"\
+ "paddw " #o ", "#y " \n\t"\
+ "psraw $1, "#x " \n\t"\
+ "psraw $1, "#y " \n\t"
+#define DEF(x) x ## _mmx
+#define SET_RND MOVQ_WONE
+#define SCALE_OFFSET 1
+
+#include "dsputil_qns_template.c"
+
+#undef DEF
+#undef SET_RND
+#undef SCALE_OFFSET
+#undef PMULHRW
+
+#define DEF(x) x ## _3dnow
+#define SET_RND(x)
+#define SCALE_OFFSET 0
+#define PMULHRW(x, y, s, o)\
+ "pmulhrw " #s ", "#x " \n\t"\
+ "pmulhrw " #s ", "#y " \n\t"
+
+#include "dsputil_qns_template.c"
+
+#undef DEF
+#undef SET_RND
+#undef SCALE_OFFSET
+#undef PMULHRW
+
+#if HAVE_SSSE3_INLINE
+#undef PHADDD
+#define DEF(x) x ## _ssse3
+#define SET_RND(x)
+#define SCALE_OFFSET -1
+#define PHADDD(a, t)\
+ "pshufw $0x0E, "#a", "#t" \n\t"\
+ "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
+#define PMULHRW(x, y, s, o)\
+ "pmulhrsw " #s ", "#x " \n\t"\
+ "pmulhrsw " #s ", "#y " \n\t"
+
+#include "dsputil_qns_template.c"
+
+#undef DEF
+#undef SET_RND
+#undef SCALE_OFFSET
+#undef PMULHRW
+#undef PHADDD
+#endif /* HAVE_SSSE3_INLINE */
+
+#endif /* HAVE_INLINE_ASM */
+
+int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h);
+
+#define hadamard_func(cpu) \
+int ff_hadamard8_diff_##cpu (void *s, uint8_t *src1, uint8_t *src2, \
+ int stride, int h); \
+int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
+ int stride, int h);
+
+hadamard_func(mmx)
+hadamard_func(mmxext)
+hadamard_func(sse2)
+hadamard_func(ssse3)
+
+av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx)
+{
+ int mm_flags = av_get_cpu_flags();
+ int bit_depth = avctx->bits_per_raw_sample;
+
+#if HAVE_YASM
+ if (EXTERNAL_MMX(mm_flags)) {
+ if (bit_depth <= 8)
+ c->get_pixels = ff_get_pixels_mmx;
+ c->diff_pixels = ff_diff_pixels_mmx;
+ c->pix_sum = ff_pix_sum16_mmx;
+
+ c->pix_norm1 = ff_pix_norm1_mmx;
+ }
+ if (EXTERNAL_SSE2(mm_flags))
+ if (bit_depth <= 8)
+ c->get_pixels = ff_get_pixels_sse2;
+#endif /* HAVE_YASM */
+
+#if HAVE_INLINE_ASM
+ if (mm_flags & AV_CPU_FLAG_MMX) {
+ const int dct_algo = avctx->dct_algo;
+ if (avctx->bits_per_raw_sample <= 8 &&
+ (dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX)) {
+ if(mm_flags & AV_CPU_FLAG_SSE2){
+ c->fdct = ff_fdct_sse2;
+ } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
+ c->fdct = ff_fdct_mmxext;
+ }else{
+ c->fdct = ff_fdct_mmx;
+ }
+ }
+
+
+ c->diff_bytes= diff_bytes_mmx;
+ c->sum_abs_dctelem= sum_abs_dctelem_mmx;
+
+ c->sse[0] = sse16_mmx;
+ c->sse[1] = sse8_mmx;
+ c->vsad[4]= vsad_intra16_mmx;
+
+ c->nsse[0] = nsse16_mmx;
+ c->nsse[1] = nsse8_mmx;
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->vsad[0] = vsad16_mmx;
+ }
+
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->try_8x8basis= try_8x8basis_mmx;
+ }
+ c->add_8x8basis= add_8x8basis_mmx;
+
+ c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
+
+ if (mm_flags & AV_CPU_FLAG_MMXEXT) {
+ c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
+ c->vsad[4] = vsad_intra16_mmxext;
+
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->vsad[0] = vsad16_mmxext;
+ }
+
+ c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_mmxext;
+ }
+
+ if(mm_flags & AV_CPU_FLAG_SSE2){
+ c->sum_abs_dctelem= sum_abs_dctelem_sse2;
+ }
+
+#if HAVE_SSSE3_INLINE
+ if(mm_flags & AV_CPU_FLAG_SSSE3){
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->try_8x8basis= try_8x8basis_ssse3;
+ }
+ c->add_8x8basis= add_8x8basis_ssse3;
+ c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
+ }
+#endif
+
+ if(mm_flags & AV_CPU_FLAG_3DNOW){
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->try_8x8basis= try_8x8basis_3dnow;
+ }
+ c->add_8x8basis= add_8x8basis_3dnow;
+ }
+ }
+#endif /* HAVE_INLINE_ASM */
+
+ if (EXTERNAL_MMX(mm_flags)) {
+ c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
+ c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
+
+ if (EXTERNAL_MMXEXT(mm_flags)) {
+ c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
+ c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
+ }
+
+ if (EXTERNAL_SSE2(mm_flags)) {
+ c->sse[0] = ff_sse16_sse2;
+
+#if HAVE_ALIGNED_STACK
+ c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
+ c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
+#endif
+ }
+
+ if (EXTERNAL_SSSE3(mm_flags) && HAVE_ALIGNED_STACK) {
+ c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
+ c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
+ }
+ }
+
+ ff_dsputil_init_pix_mmx(c, avctx);
+}
diff --git a/ffmpeg/libavcodec/x86/dwt_yasm.asm b/ffmpeg/libavcodec/x86/dwt_yasm.asm
new file mode 100644
index 0000000..5253abc
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/dwt_yasm.asm
@@ -0,0 +1,306 @@
+;******************************************************************************
+;* MMX optimized discrete wavelet transform
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+pw_2: times 8 dw 2
+pw_8: times 8 dw 8
+pw_16: times 8 dw 16
+pw_1991: times 4 dw 9,-1
+
+section .text
+
+; %1 -= (%2 + %3 + 2) >> 2    (%4 is pw_2)
+%macro COMPOSE_53iL0 4
+ paddw %2, %3
+ paddw %2, %4
+ psraw %2, 2
+ psubw %1, %2
+%endm
+
+; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
+; if %4 is supplied, %1 is loaded unaligned from there
+; m2: clobbered m3: pw_8 m4: pw_1991
+%macro COMPOSE_DD97iH0 3-4
+ paddw m0, %3
+ paddw m1, %2
+ psubw m0, m3
+ mova m2, m1
+ punpcklwd m1, m0
+ punpckhwd m2, m0
+ pmaddwd m1, m4
+ pmaddwd m2, m4
+%if %0 > 3
+ movu %1, %4
+%endif
+ psrad m1, 4
+ psrad m2, 4
+ packssdw m1, m2
+ paddw m1, %1
+%endm
+
+%macro COMPOSE_VERTICAL 1
+; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; int width)
+cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
+ mova m2, [pw_2]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m1, [b0q+2*widthq]
+ mova m0, [b1q+2*widthq]
+ COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
+ mova [b1q+2*widthq], m0
+ jg .loop
+ REP_RET
+
+; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; int width)
+cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
+ mova m1, [pw_1]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m0, [b0q+2*widthq]
+ paddw m0, [b2q+2*widthq]
+ paddw m0, m1
+ psraw m0, 1
+ paddw m0, [b1q+2*widthq]
+ mova [b1q+2*widthq], m0
+ jg .loop
+ REP_RET
+
+; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
+ mova m3, [pw_8]
+ mova m4, [pw_1991]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m0, [b0q+2*widthq]
+ mova m1, [b1q+2*widthq]
+ COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
+ mova [b2q+2*widthq], m1
+ jg .loop
+ REP_RET
+
+; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
+ mova m3, [pw_16]
+ mova m4, [pw_1991]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m0, [b0q+2*widthq]
+ mova m1, [b1q+2*widthq]
+ mova m5, [b2q+2*widthq]
+ paddw m0, [b4q+2*widthq]
+ paddw m1, [b3q+2*widthq]
+ psubw m0, m3
+ mova m2, m1
+ punpcklwd m1, m0
+ punpckhwd m2, m0
+ pmaddwd m1, m4
+ pmaddwd m2, m4
+ psrad m1, 5
+ psrad m2, 5
+ packssdw m1, m2
+ psubw m5, m1
+ mova [b2q+2*widthq], m5
+ jg .loop
+ REP_RET
+
+; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
+cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
+ mova m3, [pw_1]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m1, [b1q+2*widthq]
+ mova m0, [b0q+2*widthq]
+ mova m2, m1
+ paddw m1, m3
+ psraw m1, 1
+ psubw m0, m1
+ mova [b0q+2*widthq], m0
+ paddw m2, m0
+ mova [b1q+2*widthq], m2
+ jg .loop
+ REP_RET
+%endmacro
+
+; extend the left and right edges of the tmp array by %1 and %2 respectively
+%macro EDGE_EXTENSION 3
+ mov %3, [tmpq]
+%assign %%i 1
+%rep %1
+ mov [tmpq-2*%%i], %3
+ %assign %%i %%i+1
+%endrep
+ mov %3, [tmpq+2*w2q-2]
+%assign %%i 0
+%rep %2
+ mov [tmpq+2*w2q+2*%%i], %3
+ %assign %%i %%i+1
+%endrep
+%endmacro
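+
+; For reference (sketch): in scalar terms, with l and r the left/right
+; extension counts (%1 and %2) and w2 the half width in IDWTELEMs, the macro
+; above does
+;     for (i = 1; i <= l; i++) tmp[-i]     = tmp[0];
+;     for (i = 0; i <  r; i++) tmp[w2 + i] = tmp[w2 - 1];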
+
+
+%macro HAAR_HORIZONTAL 2
+; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
+ mov w2d, wd
+ xor xq, xq
+ shr w2d, 1
+ lea b_w2q, [bq+wq]
+ mova m3, [pw_1]
+.lowpass_loop:
+ movu m1, [b_w2q + 2*xq]
+ mova m0, [bq + 2*xq]
+ paddw m1, m3
+ psraw m1, 1
+ psubw m0, m1
+ mova [tmpq + 2*xq], m0
+ add xq, mmsize/2
+ cmp xq, w2q
+ jl .lowpass_loop
+
+ xor xq, xq
+ and w2q, ~(mmsize/2 - 1)
+ cmp w2q, mmsize/2
+ jl .end
+
+.highpass_loop:
+ movu m1, [b_w2q + 2*xq]
+ mova m0, [tmpq + 2*xq]
+ paddw m1, m0
+
+ ; shift and interleave
+%if %2 == 1
+ paddw m0, m3
+ paddw m1, m3
+ psraw m0, 1
+ psraw m1, 1
+%endif
+ mova m2, m0
+ punpcklwd m0, m1
+ punpckhwd m2, m1
+ mova [bq+4*xq], m0
+ mova [bq+4*xq+mmsize], m2
+
+ add xq, mmsize/2
+ cmp xq, w2q
+ jl .highpass_loop
+.end:
+ REP_RET
+%endmacro
+
+
+INIT_XMM
+; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
+ mov w2d, wd
+ xor xd, xd
+ shr w2d, 1
+ lea b_w2q, [bq+wq]
+ movu m4, [bq+wq]
+ mova m7, [pw_2]
+ pslldq m4, 14
+.lowpass_loop:
+ movu m1, [b_w2q + 2*xq]
+ mova m0, [bq + 2*xq]
+ mova m2, m1
+ palignr m1, m4, 14
+ mova m4, m2
+ COMPOSE_53iL0 m0, m1, m2, m7
+ mova [tmpq + 2*xq], m0
+ add xd, mmsize/2
+ cmp xd, w2d
+ jl .lowpass_loop
+
+ EDGE_EXTENSION 1, 2, xw
+ ; leave the last up to 7 (sse) or 3 (mmx) values for C
+ xor xd, xd
+ and w2d, ~(mmsize/2 - 1)
+ cmp w2d, mmsize/2
+ jl .end
+
+ mova m7, [tmpq-mmsize]
+ mova m0, [tmpq]
+ mova m5, [pw_1]
+ mova m3, [pw_8]
+ mova m4, [pw_1991]
+.highpass_loop:
+ mova m6, m0
+ palignr m0, m7, 14
+ mova m7, [tmpq + 2*xq + 16]
+ mova m1, m7
+ mova m2, m7
+ palignr m1, m6, 2
+ palignr m2, m6, 4
+ COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
+ mova m0, m7
+ mova m7, m6
+
+ ; shift and interleave
+ paddw m6, m5
+ paddw m1, m5
+ psraw m6, 1
+ psraw m1, 1
+ mova m2, m6
+ punpcklwd m6, m1
+ punpckhwd m2, m1
+ mova [bq+4*xq], m6
+ mova [bq+4*xq+mmsize], m2
+
+ add xd, mmsize/2
+ cmp xd, w2d
+ jl .highpass_loop
+.end:
+ REP_RET
+
+
+%if ARCH_X86_64 == 0
+INIT_MMX
+COMPOSE_VERTICAL mmx
+HAAR_HORIZONTAL mmx, 0
+HAAR_HORIZONTAL mmx, 1
+%endif
+
+INIT_XMM
+COMPOSE_VERTICAL sse2
+HAAR_HORIZONTAL sse2, 0
+HAAR_HORIZONTAL sse2, 1
diff --git a/ffmpeg/libavcodec/x86/fdct.c b/ffmpeg/libavcodec/x86/fdct.c
new file mode 100644
index 0000000..d35245d
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/fdct.c
@@ -0,0 +1,586 @@
+/*
+ * MMX optimized forward DCT
+ * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
+ * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
+ *
+ * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
+ *
+ * Intel Application Note AP-922 - fast, precise implementation of DCT
+ * http://developer.intel.com/vtune/cbts/appnotes.htm
+ *
+ * Also of inspiration:
+ * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
+ * Skal's fdct at http://skal.planet-d.net/coding/dct.html
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "libavutil/x86/asm.h"
+#include "libavcodec/dct.h"
+
+#if HAVE_INLINE_ASM
+
+//////////////////////////////////////////////////////////////////////
+//
+// constants for the forward DCT
+// -----------------------------
+//
+// Be sure to check that your compiler is aligning all constants to QWORD
+// (8-byte) memory boundaries! Otherwise the unaligned memory access will
+// severely stall MMX execution.
+//
+//////////////////////////////////////////////////////////////////////
+
+#define BITS_FRW_ACC 3 // 2 or 3 for accuracy
+#define SHIFT_FRW_COL BITS_FRW_ACC
+#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)
+#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
+//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))
+
+#define X8(x) x,x,x,x,x,x,x,x
+
+//concatenated table, for forward DCT transformation
+DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = {
+ X8(13036), // tan(1 * pi/16) * (1 << 16)
+ X8(27146), // tan(2 * pi/16) * (1 << 16)
+ X8(-21746) // (tan(3 * pi/16) - 1) * (1 << 16)
+};
+
+DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = {
+ X8(23170) // cos(pi/4) * (1 << 15)
+};
+
+DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
+
+DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };
+
+static const struct
+{
+ DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
+} fdct_r_row_sse2 =
+{{
+ RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
+}};
+//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
+
+DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table
+ 16384, 16384, 22725, 19266,
+ 16384, 16384, 12873, 4520,
+ 21407, 8867, 19266, -4520,
+ -8867, -21407, -22725, -12873,
+ 16384, -16384, 12873, -22725,
+ -16384, 16384, 4520, 19266,
+ 8867, -21407, 4520, -12873,
+ 21407, -8867, 19266, -22725,
+
+ 22725, 22725, 31521, 26722,
+ 22725, 22725, 17855, 6270,
+ 29692, 12299, 26722, -6270,
+ -12299, -29692, -31521, -17855,
+ 22725, -22725, 17855, -31521,
+ -22725, 22725, 6270, 26722,
+ 12299, -29692, 6270, -17855,
+ 29692, -12299, 26722, -31521,
+
+ 21407, 21407, 29692, 25172,
+ 21407, 21407, 16819, 5906,
+ 27969, 11585, 25172, -5906,
+ -11585, -27969, -29692, -16819,
+ 21407, -21407, 16819, -29692,
+ -21407, 21407, 5906, 25172,
+ 11585, -27969, 5906, -16819,
+ 27969, -11585, 25172, -29692,
+
+ 19266, 19266, 26722, 22654,
+ 19266, 19266, 15137, 5315,
+ 25172, 10426, 22654, -5315,
+ -10426, -25172, -26722, -15137,
+ 19266, -19266, 15137, -26722,
+ -19266, 19266, 5315, 22654,
+ 10426, -25172, 5315, -15137,
+ 25172, -10426, 22654, -26722,
+
+ 16384, 16384, 22725, 19266,
+ 16384, 16384, 12873, 4520,
+ 21407, 8867, 19266, -4520,
+ -8867, -21407, -22725, -12873,
+ 16384, -16384, 12873, -22725,
+ -16384, 16384, 4520, 19266,
+ 8867, -21407, 4520, -12873,
+ 21407, -8867, 19266, -22725,
+
+ 19266, 19266, 26722, 22654,
+ 19266, 19266, 15137, 5315,
+ 25172, 10426, 22654, -5315,
+ -10426, -25172, -26722, -15137,
+ 19266, -19266, 15137, -26722,
+ -19266, 19266, 5315, 22654,
+ 10426, -25172, 5315, -15137,
+ 25172, -10426, 22654, -26722,
+
+ 21407, 21407, 29692, 25172,
+ 21407, 21407, 16819, 5906,
+ 27969, 11585, 25172, -5906,
+ -11585, -27969, -29692, -16819,
+ 21407, -21407, 16819, -29692,
+ -21407, 21407, 5906, 25172,
+ 11585, -27969, 5906, -16819,
+ 27969, -11585, 25172, -29692,
+
+ 22725, 22725, 31521, 26722,
+ 22725, 22725, 17855, 6270,
+ 29692, 12299, 26722, -6270,
+ -12299, -29692, -31521, -17855,
+ 22725, -22725, 17855, -31521,
+ -22725, 22725, 6270, 26722,
+ 12299, -29692, 6270, -17855,
+ 29692, -12299, 26722, -31521,
+};
+
+static const struct
+{
+ DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
+} tab_frw_01234567_sse2 =
+{{
+//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table
+#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
+ C4, C4, C5, C7, C2, C6, C3, -C7, \
+ -C4, C4, C7, C3, C6, -C2, C7, -C5, \
+ C4, -C4, C5, -C1, C2, -C6, C3, -C1,
+// c1..c7 * cos(pi/4) * 2^15
+#define C1 22725
+#define C2 21407
+#define C3 19266
+#define C4 16384
+#define C5 12873
+#define C6 8867
+#define C7 4520
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 31521
+#define C2 29692
+#define C3 26722
+#define C4 22725
+#define C5 17855
+#define C6 12299
+#define C7 6270
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 29692
+#define C2 27969
+#define C3 25172
+#define C4 21407
+#define C5 16819
+#define C6 11585
+#define C7 5906
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 26722
+#define C2 25172
+#define C3 22654
+#define C4 19266
+#define C5 15137
+#define C6 10426
+#define C7 5315
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 22725
+#define C2 21407
+#define C3 19266
+#define C4 16384
+#define C5 12873
+#define C6 8867
+#define C7 4520
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 26722
+#define C2 25172
+#define C3 22654
+#define C4 19266
+#define C5 15137
+#define C6 10426
+#define C7 5315
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 29692
+#define C2 27969
+#define C3 25172
+#define C4 21407
+#define C5 16819
+#define C6 11585
+#define C7 5906
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 31521
+#define C2 29692
+#define C3 26722
+#define C4 22725
+#define C5 17855
+#define C6 12299
+#define C7 6270
+TABLE_SSE2
+}};
+
+#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
+
+#define FDCT_COL(cpu, mm, mov)\
+static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
+{\
+ __asm__ volatile (\
+ #mov" 16(%0), %%"#mm"0 \n\t" \
+ #mov" 96(%0), %%"#mm"1 \n\t" \
+ #mov" %%"#mm"0, %%"#mm"2 \n\t" \
+ #mov" 32(%0), %%"#mm"3 \n\t" \
+ "paddsw %%"#mm"1, %%"#mm"0 \n\t" \
+ #mov" 80(%0), %%"#mm"4 \n\t" \
+ "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
+ #mov" (%0), %%"#mm"5 \n\t" \
+ "paddsw %%"#mm"3, %%"#mm"4 \n\t" \
+ "paddsw 112(%0), %%"#mm"5 \n\t" \
+ "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
+ #mov" %%"#mm"0, %%"#mm"6 \n\t" \
+ "psubsw %%"#mm"1, %%"#mm"2 \n\t" \
+ #mov" 16(%1), %%"#mm"1 \n\t" \
+ "psubsw %%"#mm"4, %%"#mm"0 \n\t" \
+ #mov" 48(%0), %%"#mm"7 \n\t" \
+ "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \
+ "paddsw 64(%0), %%"#mm"7 \n\t" \
+ "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
+ "paddsw %%"#mm"4, %%"#mm"6 \n\t" \
+ "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
+ #mov" %%"#mm"5, %%"#mm"4 \n\t" \
+ "psubsw %%"#mm"7, %%"#mm"5 \n\t" \
+ "paddsw %%"#mm"5, %%"#mm"1 \n\t" \
+ "paddsw %%"#mm"7, %%"#mm"4 \n\t" \
+ "por (%2), %%"#mm"1 \n\t" \
+ "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
+ "pmulhw 16(%1), %%"#mm"5 \n\t" \
+ #mov" %%"#mm"4, %%"#mm"7 \n\t" \
+ "psubsw 80(%0), %%"#mm"3 \n\t" \
+ "psubsw %%"#mm"6, %%"#mm"4 \n\t" \
+ #mov" %%"#mm"1, 32(%3) \n\t" \
+ "paddsw %%"#mm"6, %%"#mm"7 \n\t" \
+ #mov" 48(%0), %%"#mm"1 \n\t" \
+ "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
+ "psubsw 64(%0), %%"#mm"1 \n\t" \
+ #mov" %%"#mm"2, %%"#mm"6 \n\t" \
+ #mov" %%"#mm"4, 64(%3) \n\t" \
+ "paddsw %%"#mm"3, %%"#mm"2 \n\t" \
+ "pmulhw (%4), %%"#mm"2 \n\t" \
+ "psubsw %%"#mm"3, %%"#mm"6 \n\t" \
+ "pmulhw (%4), %%"#mm"6 \n\t" \
+ "psubsw %%"#mm"0, %%"#mm"5 \n\t" \
+ "por (%2), %%"#mm"5 \n\t" \
+ "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
+ "por (%2), %%"#mm"2 \n\t" \
+ #mov" %%"#mm"1, %%"#mm"4 \n\t" \
+ #mov" (%0), %%"#mm"3 \n\t" \
+ "paddsw %%"#mm"6, %%"#mm"1 \n\t" \
+ "psubsw 112(%0), %%"#mm"3 \n\t" \
+ "psubsw %%"#mm"6, %%"#mm"4 \n\t" \
+ #mov" (%1), %%"#mm"0 \n\t" \
+ "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
+ #mov" 32(%1), %%"#mm"6 \n\t" \
+ "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \
+ #mov" %%"#mm"7, (%3) \n\t" \
+ "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \
+ #mov" %%"#mm"5, 96(%3) \n\t" \
+ #mov" %%"#mm"3, %%"#mm"7 \n\t" \
+ #mov" 32(%1), %%"#mm"5 \n\t" \
+ "psubsw %%"#mm"2, %%"#mm"7 \n\t" \
+ "paddsw %%"#mm"2, %%"#mm"3 \n\t" \
+ "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \
+ "paddsw %%"#mm"3, %%"#mm"0 \n\t" \
+ "paddsw %%"#mm"4, %%"#mm"6 \n\t" \
+ "pmulhw (%1), %%"#mm"3 \n\t" \
+ "por (%2), %%"#mm"0 \n\t" \
+ "paddsw %%"#mm"7, %%"#mm"5 \n\t" \
+ "psubsw %%"#mm"6, %%"#mm"7 \n\t" \
+ #mov" %%"#mm"0, 16(%3) \n\t" \
+ "paddsw %%"#mm"4, %%"#mm"5 \n\t" \
+ #mov" %%"#mm"7, 48(%3) \n\t" \
+ "psubsw %%"#mm"1, %%"#mm"3 \n\t" \
+ #mov" %%"#mm"5, 80(%3) \n\t" \
+ #mov" %%"#mm"3, 112(%3) \n\t" \
+ : \
+ : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
+ "r" (out + offset), "r" (ocos_4_16)); \
+}
+
+FDCT_COL(mmx, mm, movq)
+FDCT_COL(sse2, xmm, movdqa)
+
+static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
+{
+ __asm__ volatile(
+#define FDCT_ROW_SSE2_H1(i,t) \
+ "movq " #i "(%0), %%xmm2 \n\t" \
+ "movq " #i "+8(%0), %%xmm0 \n\t" \
+ "movdqa " #t "+32(%1), %%xmm3 \n\t" \
+ "movdqa " #t "+48(%1), %%xmm7 \n\t" \
+ "movdqa " #t "(%1), %%xmm4 \n\t" \
+ "movdqa " #t "+16(%1), %%xmm5 \n\t"
+
+#define FDCT_ROW_SSE2_H2(i,t) \
+ "movq " #i "(%0), %%xmm2 \n\t" \
+ "movq " #i "+8(%0), %%xmm0 \n\t" \
+ "movdqa " #t "+32(%1), %%xmm3 \n\t" \
+ "movdqa " #t "+48(%1), %%xmm7 \n\t"
+
+#define FDCT_ROW_SSE2(i) \
+ "movq %%xmm2, %%xmm1 \n\t" \
+ "pshuflw $27, %%xmm0, %%xmm0 \n\t" \
+ "paddsw %%xmm0, %%xmm1 \n\t" \
+ "psubsw %%xmm0, %%xmm2 \n\t" \
+ "punpckldq %%xmm2, %%xmm1 \n\t" \
+ "pshufd $78, %%xmm1, %%xmm2 \n\t" \
+ "pmaddwd %%xmm2, %%xmm3 \n\t" \
+ "pmaddwd %%xmm1, %%xmm7 \n\t" \
+ "pmaddwd %%xmm5, %%xmm2 \n\t" \
+ "pmaddwd %%xmm4, %%xmm1 \n\t" \
+ "paddd %%xmm7, %%xmm3 \n\t" \
+ "paddd %%xmm2, %%xmm1 \n\t" \
+ "paddd %%xmm6, %%xmm3 \n\t" \
+ "paddd %%xmm6, %%xmm1 \n\t" \
+ "psrad %3, %%xmm3 \n\t" \
+ "psrad %3, %%xmm1 \n\t" \
+ "packssdw %%xmm3, %%xmm1 \n\t" \
+ "movdqa %%xmm1, " #i "(%4) \n\t"
+
+ "movdqa (%2), %%xmm6 \n\t"
+ FDCT_ROW_SSE2_H1(0,0)
+ FDCT_ROW_SSE2(0)
+ FDCT_ROW_SSE2_H2(64,0)
+ FDCT_ROW_SSE2(64)
+
+ FDCT_ROW_SSE2_H1(16,64)
+ FDCT_ROW_SSE2(16)
+ FDCT_ROW_SSE2_H2(112,64)
+ FDCT_ROW_SSE2(112)
+
+ FDCT_ROW_SSE2_H1(32,128)
+ FDCT_ROW_SSE2(32)
+ FDCT_ROW_SSE2_H2(96,128)
+ FDCT_ROW_SSE2(96)
+
+ FDCT_ROW_SSE2_H1(48,192)
+ FDCT_ROW_SSE2(48)
+ FDCT_ROW_SSE2_H2(80,192)
+ FDCT_ROW_SSE2(80)
+ :
+ : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2),
+ "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
+ XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7")
+ );
+}
+
+static av_always_inline void fdct_row_mmxext(const int16_t *in, int16_t *out,
+ const int16_t *table)
+{
+ __asm__ volatile (
+ "pshufw $0x1B, 8(%0), %%mm5 \n\t"
+ "movq (%0), %%mm0 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "paddsw %%mm5, %%mm0 \n\t"
+ "psubsw %%mm5, %%mm1 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "punpckldq %%mm1, %%mm0 \n\t"
+ "punpckhdq %%mm1, %%mm2 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "movq 8(%1), %%mm3 \n\t"
+ "movq 16(%1), %%mm4 \n\t"
+ "movq 24(%1), %%mm5 \n\t"
+ "movq 32(%1), %%mm6 \n\t"
+ "movq 40(%1), %%mm7 \n\t"
+ "pmaddwd %%mm0, %%mm1 \n\t"
+ "pmaddwd %%mm2, %%mm3 \n\t"
+ "pmaddwd %%mm0, %%mm4 \n\t"
+ "pmaddwd %%mm2, %%mm5 \n\t"
+ "pmaddwd %%mm0, %%mm6 \n\t"
+ "pmaddwd %%mm2, %%mm7 \n\t"
+ "pmaddwd 48(%1), %%mm0 \n\t"
+ "pmaddwd 56(%1), %%mm2 \n\t"
+ "paddd %%mm1, %%mm3 \n\t"
+ "paddd %%mm4, %%mm5 \n\t"
+ "paddd %%mm6, %%mm7 \n\t"
+ "paddd %%mm0, %%mm2 \n\t"
+ "movq (%2), %%mm0 \n\t"
+ "paddd %%mm0, %%mm3 \n\t"
+ "paddd %%mm0, %%mm5 \n\t"
+ "paddd %%mm0, %%mm7 \n\t"
+ "paddd %%mm0, %%mm2 \n\t"
+ "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
+ "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
+ "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
+ "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
+ "packssdw %%mm5, %%mm3 \n\t"
+ "packssdw %%mm2, %%mm7 \n\t"
+ "movq %%mm3, (%3) \n\t"
+ "movq %%mm7, 8(%3) \n\t"
+ :
+ : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
+}
+
+static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
+{
+ //FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
+ __asm__ volatile(
+ "movd 12(%0), %%mm1 \n\t"
+ "punpcklwd 8(%0), %%mm1 \n\t"
+ "movq %%mm1, %%mm2 \n\t"
+ "psrlq $0x20, %%mm1 \n\t"
+ "movq 0(%0), %%mm0 \n\t"
+ "punpcklwd %%mm2, %%mm1 \n\t"
+ "movq %%mm0, %%mm5 \n\t"
+ "paddsw %%mm1, %%mm0 \n\t"
+ "psubsw %%mm1, %%mm5 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "punpckldq %%mm5, %%mm0 \n\t"
+ "punpckhdq %%mm5, %%mm2 \n\t"
+ "movq 0(%1), %%mm1 \n\t"
+ "movq 8(%1), %%mm3 \n\t"
+ "movq 16(%1), %%mm4 \n\t"
+ "movq 24(%1), %%mm5 \n\t"
+ "movq 32(%1), %%mm6 \n\t"
+ "movq 40(%1), %%mm7 \n\t"
+ "pmaddwd %%mm0, %%mm1 \n\t"
+ "pmaddwd %%mm2, %%mm3 \n\t"
+ "pmaddwd %%mm0, %%mm4 \n\t"
+ "pmaddwd %%mm2, %%mm5 \n\t"
+ "pmaddwd %%mm0, %%mm6 \n\t"
+ "pmaddwd %%mm2, %%mm7 \n\t"
+ "pmaddwd 48(%1), %%mm0 \n\t"
+ "pmaddwd 56(%1), %%mm2 \n\t"
+ "paddd %%mm1, %%mm3 \n\t"
+ "paddd %%mm4, %%mm5 \n\t"
+ "paddd %%mm6, %%mm7 \n\t"
+ "paddd %%mm0, %%mm2 \n\t"
+ "movq (%2), %%mm0 \n\t"
+ "paddd %%mm0, %%mm3 \n\t"
+ "paddd %%mm0, %%mm5 \n\t"
+ "paddd %%mm0, %%mm7 \n\t"
+ "paddd %%mm0, %%mm2 \n\t"
+ "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
+ "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
+ "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
+ "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
+ "packssdw %%mm5, %%mm3 \n\t"
+ "packssdw %%mm2, %%mm7 \n\t"
+ "movq %%mm3, 0(%3) \n\t"
+ "movq %%mm7, 8(%3) \n\t"
+ :
+ : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
+}
+
+void ff_fdct_mmx(int16_t *block)
+{
+ DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
+ int16_t * block1= (int16_t*)align_tmp;
+ const int16_t *table= tab_frw_01234567;
+ int i;
+
+ fdct_col_mmx(block, block1, 0);
+ fdct_col_mmx(block, block1, 4);
+
+ for(i=8;i>0;i--) {
+ fdct_row_mmx(block1, block, table);
+ block1 += 8;
+ table += 32;
+ block += 8;
+ }
+}
+
+void ff_fdct_mmxext(int16_t *block)
+{
+ DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
+ int16_t *block1= (int16_t*)align_tmp;
+ const int16_t *table= tab_frw_01234567;
+ int i;
+
+ fdct_col_mmx(block, block1, 0);
+ fdct_col_mmx(block, block1, 4);
+
+ for(i=8;i>0;i--) {
+ fdct_row_mmxext(block1, block, table);
+ block1 += 8;
+ table += 32;
+ block += 8;
+ }
+}
+
+void ff_fdct_sse2(int16_t *block)
+{
+ DECLARE_ALIGNED(16, int64_t, align_tmp)[16];
+ int16_t * const block1= (int16_t*)align_tmp;
+
+ fdct_col_sse2(block, block1, 0);
+ fdct_row_sse2(block1, block);
+}
+
+#endif /* HAVE_INLINE_ASM */
diff --git a/ffmpeg/libavcodec/x86/fft.asm b/ffmpeg/libavcodec/x86/fft.asm
new file mode 100644
index 0000000..5071741
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/fft.asm
@@ -0,0 +1,1093 @@
+;******************************************************************************
+;* FFT transform with SSE/3DNow optimizations
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2011 Vitor Sessak
+;*
+;* This algorithm (though not any of the implementation details) is
+;* based on libdjbfft by D. J. Bernstein.
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+; These functions are not individually interchangeable with the C versions.
+; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
+; in blocks as convenient to the vector size.
+; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
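+;
+; The layout described above can be pictured with this rough C sketch for the
+; SSE case (vector width 4); the helper is purely illustrative and not part of
+; FFmpeg's API:
+;
+;   static void interleaved_to_blocks4(float *dst, const FFTComplex *src, int n)
+;   {
+;       for (int b = 0; b < n / 4; b++)          /* one block of 4 complex values */
+;           for (int k = 0; k < 4; k++) {
+;               dst[8 * b     + k] = src[4 * b + k].re;  /* 4x real      */
+;               dst[8 * b + 4 + k] = src[4 * b + k].im;  /* 4x imaginary */
+;           }
+;   }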
+
+%include "libavutil/x86/x86util.asm"
+
+%if ARCH_X86_64
+%define pointer resq
+%else
+%define pointer resd
+%endif
+
+SECTION_RODATA
+
+struc FFTContext
+ .nbits: resd 1
+ .reverse: resd 1
+ .revtab: pointer 1
+ .tmpbuf: pointer 1
+ .mdctsize: resd 1
+ .mdctbits: resd 1
+ .tcos: pointer 1
+ .tsin: pointer 1
+ .fftperm: pointer 1
+ .fftcalc: pointer 1
+ .imdctcalc:pointer 1
+ .imdcthalf:pointer 1
+endstruc
+
+%define M_SQRT1_2 0.70710678118654752440
+%define M_COS_PI_1_8 0.923879532511287
+%define M_COS_PI_3_8 0.38268343236509
+
+align 32
+ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
+ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
+
+ps_root2: times 8 dd M_SQRT1_2
+ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
+ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
+
+perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
+perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
+ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
+ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
+ps_m1m1m1m1: times 4 dd 1<<31
+ps_m1p1: dd 1<<31, 0
+
+%assign i 16
+%rep 13
+cextern cos_ %+ i
+%assign i i<<1
+%endrep
+
+%if ARCH_X86_64
+ %define pointer dq
+%else
+ %define pointer dd
+%endif
+
+%macro IF0 1+
+%endmacro
+%macro IF1 1+
+ %1
+%endmacro
+
+SECTION_TEXT
+
+%macro T2_3DNOW 4 ; z0, z1, mem0, mem1
+ mova %1, %3
+ mova %2, %1
+ pfadd %1, %4
+ pfsub %2, %4
+%endmacro
+
+%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
+ mova %5, %3
+ pfsub %3, %4
+ pfadd %5, %4 ; {t6,t5}
+ pxor %3, [ps_m1p1] ; {t8,t7}
+ mova %6, %1
+ movd [r0+12], %3
+ punpckhdq %3, [r0+8]
+ pfadd %1, %5 ; {r0,i0}
+ pfsub %6, %5 ; {r2,i2}
+ mova %4, %2
+ pfadd %2, %3 ; {r1,i1}
+ pfsub %4, %3 ; {r3,i3}
+ SWAP %3, %6
+%endmacro
+
+; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
+; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
+; %3, %4, %5 tmp
+; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
+; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
+%macro T8_AVX 5
+ vsubps %5, %1, %2 ; v = %1 - %2
+ vaddps %3, %1, %2 ; w = %1 + %2
+ vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
+ vpermilps %2, %2, [perm1]
+ vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
+ vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
+ vsubps %4, %5, %1 ; s = r - q
+ vaddps %1, %5, %1 ; u = r + q
+ vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
+ vshufps %5, %4, %1, 0xbb
+ vshufps %3, %4, %1, 0xee
+ vperm2f128 %3, %3, %5, 0x13
+ vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
+ vshufps %2, %1, %4, 0xdd
+ vshufps %1, %1, %4, 0x88
+ vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
+ vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
+ vsubps %5, %1, %3
+ vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
+ vsubps %2, %4, %1 ; %2 = v - w
+ vaddps %1, %4, %1 ; %1 = v + w
+%endmacro
+
+; In SSE mode do one fft4 transform
+; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
+; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
+;
+; In AVX mode do two fft4 transforms
+; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
+; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
+%macro T4_SSE 3
+ subps %3, %1, %2 ; {t3,t4,-t8,t7}
+ addps %1, %1, %2 ; {t1,t2,t6,t5}
+ xorps %3, %3, [ps_p1p1m1p1]
+ shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
+ shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
+ subps %3, %1, %2 ; {r2,i2,r3,i3}
+ addps %1, %1, %2 ; {r0,i0,r1,i1}
+ shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
+ shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
+%endmacro
+
+; In SSE mode do one FFT8
+; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
+; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
+;
+; In AVX mode do two FFT8
+; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
+; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
+; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
+; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
+%macro T8_SSE 6
+ addps %6, %3, %4 ; {t1,t2,t3,t4}
+ subps %3, %3, %4 ; {r5,i5,r7,i7}
+ shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
+ mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
+ mulps %4, %4, [ps_root2]
+ addps %3, %3, %4 ; {t8,t7,ta,t9}
+ shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
+ shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
+ subps %3, %6, %4 ; {t6,t5,tc,tb}
+ addps %6, %6, %4 ; {t1,t2,t9,ta}
+ shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
+ shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
+ subps %3, %1, %6 ; {r4,r5,r6,r7}
+ addps %1, %1, %6 ; {r0,r1,r2,r3}
+ subps %4, %2, %5 ; {i4,i5,i6,i7}
+ addps %2, %2, %5 ; {i0,i1,i2,i3}
+%endmacro
+
+; scheduled for cpu-bound sizes
+%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
+IF%1 mova m4, Z(4)
+IF%1 mova m5, Z(5)
+ mova m0, %2 ; wre
+ mova m1, %3 ; wim
+ mulps m2, m4, m0 ; r2*wre
+IF%1 mova m6, Z2(6)
+ mulps m3, m5, m1 ; i2*wim
+IF%1 mova m7, Z2(7)
+ mulps m4, m4, m1 ; r2*wim
+ mulps m5, m5, m0 ; i2*wre
+ addps m2, m2, m3 ; r2*wre + i2*wim
+ mulps m3, m1, m7 ; i3*wim
+ subps m5, m5, m4 ; i2*wre - r2*wim
+ mulps m1, m1, m6 ; r3*wim
+ mulps m4, m0, m6 ; r3*wre
+ mulps m0, m0, m7 ; i3*wre
+ subps m4, m4, m3 ; r3*wre - i3*wim
+ mova m3, Z(0)
+ addps m0, m0, m1 ; i3*wre + r3*wim
+ subps m1, m4, m2 ; t3
+ addps m4, m4, m2 ; t5
+ subps m3, m3, m4 ; r2
+ addps m4, m4, Z(0) ; r0
+ mova m6, Z(2)
+ mova Z(4), m3
+ mova Z(0), m4
+ subps m3, m5, m0 ; t4
+ subps m4, m6, m3 ; r3
+ addps m3, m3, m6 ; r1
+ mova Z2(6), m4
+ mova Z(2), m3
+ mova m2, Z(3)
+ addps m3, m5, m0 ; t6
+ subps m2, m2, m1 ; i3
+ mova m7, Z(1)
+ addps m1, m1, Z(3) ; i1
+ mova Z2(7), m2
+ mova Z(3), m1
+ subps m4, m7, m3 ; i2
+ addps m3, m3, m7 ; i0
+ mova Z(5), m4
+ mova Z(1), m3
+%endmacro
+
+; scheduled to avoid store->load aliasing
+%macro PASS_BIG 1 ; (!interleave)
+ mova m4, Z(4) ; r2
+ mova m5, Z(5) ; i2
+ mova m0, [wq] ; wre
+ mova m1, [wq+o1q] ; wim
+ mulps m2, m4, m0 ; r2*wre
+ mova m6, Z2(6) ; r3
+ mulps m3, m5, m1 ; i2*wim
+ mova m7, Z2(7) ; i3
+ mulps m4, m4, m1 ; r2*wim
+ mulps m5, m5, m0 ; i2*wre
+ addps m2, m2, m3 ; r2*wre + i2*wim
+ mulps m3, m1, m7 ; i3*wim
+ mulps m1, m1, m6 ; r3*wim
+ subps m5, m5, m4 ; i2*wre - r2*wim
+ mulps m4, m0, m6 ; r3*wre
+ mulps m0, m0, m7 ; i3*wre
+ subps m4, m4, m3 ; r3*wre - i3*wim
+ mova m3, Z(0)
+ addps m0, m0, m1 ; i3*wre + r3*wim
+ subps m1, m4, m2 ; t3
+ addps m4, m4, m2 ; t5
+ subps m3, m3, m4 ; r2
+ addps m4, m4, Z(0) ; r0
+ mova m6, Z(2)
+ mova Z(4), m3
+ mova Z(0), m4
+ subps m3, m5, m0 ; t4
+ subps m4, m6, m3 ; r3
+ addps m3, m3, m6 ; r1
+IF%1 mova Z2(6), m4
+IF%1 mova Z(2), m3
+ mova m2, Z(3)
+ addps m5, m5, m0 ; t6
+ subps m2, m2, m1 ; i3
+ mova m7, Z(1)
+ addps m1, m1, Z(3) ; i1
+IF%1 mova Z2(7), m2
+IF%1 mova Z(3), m1
+ subps m6, m7, m5 ; i2
+ addps m5, m5, m7 ; i0
+IF%1 mova Z(5), m6
+IF%1 mova Z(1), m5
+%if %1==0
+ INTERL m1, m3, m7, Z, 2
+ INTERL m2, m4, m0, Z2, 6
+
+ mova m1, Z(0)
+ mova m2, Z(4)
+
+ INTERL m5, m1, m3, Z, 0
+ INTERL m6, m2, m7, Z, 4
+%endif
+%endmacro
+
+%macro PUNPCK 3
+ mova %3, %1
+ punpckldq %1, %2
+ punpckhdq %3, %2
+%endmacro
+
+%define Z(x) [r0+mmsize*x]
+%define Z2(x) [r0+mmsize*x]
+%define ZH(x) [r0+mmsize*x+mmsize/2]
+
+INIT_YMM avx
+
+%if HAVE_AVX_EXTERNAL
+align 16
+fft8_avx:
+ mova m0, Z(0)
+ mova m1, Z(1)
+ T8_AVX m0, m1, m2, m3, m4
+ mova Z(0), m0
+ mova Z(1), m1
+ ret
+
+
+align 16
+fft16_avx:
+ mova m2, Z(2)
+ mova m3, Z(3)
+ T4_SSE m2, m3, m7
+
+ mova m0, Z(0)
+ mova m1, Z(1)
+ T8_AVX m0, m1, m4, m5, m7
+
+ mova m4, [ps_cos16_1]
+ mova m5, [ps_cos16_2]
+ vmulps m6, m2, m4
+ vmulps m7, m3, m5
+ vaddps m7, m7, m6
+ vmulps m2, m2, m5
+ vmulps m3, m3, m4
+ vsubps m3, m3, m2
+ vblendps m2, m7, m3, 0xf0
+ vperm2f128 m3, m7, m3, 0x21
+ vaddps m4, m2, m3
+ vsubps m2, m3, m2
+ vperm2f128 m2, m2, m2, 0x01
+ vsubps m3, m1, m2
+ vaddps m1, m1, m2
+ vsubps m5, m0, m4
+ vaddps m0, m0, m4
+ vextractf128 Z(0), m0, 0
+ vextractf128 ZH(0), m1, 0
+ vextractf128 Z(1), m0, 1
+ vextractf128 ZH(1), m1, 1
+ vextractf128 Z(2), m5, 0
+ vextractf128 ZH(2), m3, 0
+ vextractf128 Z(3), m5, 1
+ vextractf128 ZH(3), m3, 1
+ ret
+
+align 16
+fft32_avx:
+ call fft16_avx
+
+ mova m0, Z(4)
+ mova m1, Z(5)
+
+ T4_SSE m0, m1, m4
+
+ mova m2, Z(6)
+ mova m3, Z(7)
+
+ T8_SSE m0, m1, m2, m3, m4, m6
+ ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
+ ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
+
+ vperm2f128 m4, m0, m2, 0x20
+ vperm2f128 m5, m1, m3, 0x20
+ vperm2f128 m6, m0, m2, 0x31
+ vperm2f128 m7, m1, m3, 0x31
+
+ PASS_SMALL 0, [cos_32], [cos_32+32]
+
+ ret
+
+fft32_interleave_avx:
+ call fft32_avx
+ mov r2d, 32
+.deint_loop:
+ mova m2, Z(0)
+ mova m3, Z(1)
+ vunpcklps m0, m2, m3
+ vunpckhps m1, m2, m3
+ vextractf128 Z(0), m0, 0
+ vextractf128 ZH(0), m1, 0
+ vextractf128 Z(1), m0, 1
+ vextractf128 ZH(1), m1, 1
+ add r0, mmsize*2
+ sub r2d, mmsize/4
+ jg .deint_loop
+ ret
+
+%endif
+
+INIT_XMM sse
+
+align 16
+fft4_avx:
+fft4_sse:
+ mova m0, Z(0)
+ mova m1, Z(1)
+ T4_SSE m0, m1, m2
+ mova Z(0), m0
+ mova Z(1), m1
+ ret
+
+align 16
+fft8_sse:
+ mova m0, Z(0)
+ mova m1, Z(1)
+ T4_SSE m0, m1, m2
+ mova m2, Z(2)
+ mova m3, Z(3)
+ T8_SSE m0, m1, m2, m3, m4, m5
+ mova Z(0), m0
+ mova Z(1), m1
+ mova Z(2), m2
+ mova Z(3), m3
+ ret
+
+align 16
+fft16_sse:
+ mova m0, Z(0)
+ mova m1, Z(1)
+ T4_SSE m0, m1, m2
+ mova m2, Z(2)
+ mova m3, Z(3)
+ T8_SSE m0, m1, m2, m3, m4, m5
+ mova m4, Z(4)
+ mova m5, Z(5)
+ mova Z(0), m0
+ mova Z(1), m1
+ mova Z(2), m2
+ mova Z(3), m3
+ T4_SSE m4, m5, m6
+ mova m6, Z2(6)
+ mova m7, Z2(7)
+ T4_SSE m6, m7, m0
+ PASS_SMALL 0, [cos_16], [cos_16+16]
+ ret
+
+
+%macro FFT48_3DNOW 0
+align 16
+fft4 %+ SUFFIX:
+ T2_3DNOW m0, m1, Z(0), Z(1)
+ mova m2, Z(2)
+ mova m3, Z(3)
+ T4_3DNOW m0, m1, m2, m3, m4, m5
+ PUNPCK m0, m1, m4
+ PUNPCK m2, m3, m5
+ mova Z(0), m0
+ mova Z(1), m4
+ mova Z(2), m2
+ mova Z(3), m5
+ ret
+
+align 16
+fft8 %+ SUFFIX:
+ T2_3DNOW m0, m1, Z(0), Z(1)
+ mova m2, Z(2)
+ mova m3, Z(3)
+ T4_3DNOW m0, m1, m2, m3, m4, m5
+ mova Z(0), m0
+ mova Z(2), m2
+ T2_3DNOW m4, m5, Z(4), Z(5)
+ T2_3DNOW m6, m7, Z2(6), Z2(7)
+ PSWAPD m0, m5
+ PSWAPD m2, m7
+ pxor m0, [ps_m1p1]
+ pxor m2, [ps_m1p1]
+ pfsub m5, m0
+ pfadd m7, m2
+ pfmul m5, [ps_root2]
+ pfmul m7, [ps_root2]
+ T4_3DNOW m1, m3, m5, m7, m0, m2
+ mova Z(5), m5
+ mova Z2(7), m7
+ mova m0, Z(0)
+ mova m2, Z(2)
+ T4_3DNOW m0, m2, m4, m6, m5, m7
+ PUNPCK m0, m1, m5
+ PUNPCK m2, m3, m7
+ mova Z(0), m0
+ mova Z(1), m5
+ mova Z(2), m2
+ mova Z(3), m7
+ PUNPCK m4, Z(5), m5
+ PUNPCK m6, Z2(7), m7
+ mova Z(4), m4
+ mova Z(5), m5
+ mova Z2(6), m6
+ mova Z2(7), m7
+ ret
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX 3dnowext
+FFT48_3DNOW
+
+INIT_MMX 3dnow
+FFT48_3DNOW
+%endif
+
+%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
+%define Z2(x) [zcq + o3q + mmsize*(x&1)]
+%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
+%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]
+
+%macro DECL_PASS 2+ ; name, payload
+align 16
+%1:
+DEFINE_ARGS zc, w, n, o1, o3
+ lea o3q, [nq*3]
+ lea o1q, [nq*8]
+ shl o3q, 4
+.loop:
+ %2
+ add zcq, mmsize*2
+ add wq, mmsize
+ sub nd, mmsize/8
+ jg .loop
+ rep ret
+%endmacro
+
+%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
+ lea r2, [dispatch_tab%1]
+ mov r2, [r2 + (%2q-2)*gprsize]
+%ifdef PIC
+ lea r3, [$$]
+ add r2, r3
+%endif
+ call r2
+%endmacro ; FFT_DISPATCH
+
+INIT_YMM avx
+
+%if HAVE_AVX_EXTERNAL
+%macro INTERL_AVX 5
+ vunpckhps %3, %2, %1
+ vunpcklps %2, %2, %1
+ vextractf128 %4(%5), %2, 0
+ vextractf128 %4 %+ H(%5), %3, 0
+ vextractf128 %4(%5 + 1), %2, 1
+ vextractf128 %4 %+ H(%5 + 1), %3, 1
+%endmacro
+
+%define INTERL INTERL_AVX
+
+DECL_PASS pass_avx, PASS_BIG 1
+DECL_PASS pass_interleave_avx, PASS_BIG 0
+
+cglobal fft_calc, 2,5,8
+ mov r3d, [r0 + FFTContext.nbits]
+ mov r0, r1
+ mov r1, r3
+ FFT_DISPATCH _interleave %+ SUFFIX, r1
+ REP_RET
+
+%endif
+
+INIT_XMM sse
+
+%macro INTERL_SSE 5
+ mova %3, %2
+ unpcklps %2, %1
+ unpckhps %3, %1
+ mova %4(%5), %2
+ mova %4(%5+1), %3
+%endmacro
+
+%define INTERL INTERL_SSE
+
+DECL_PASS pass_sse, PASS_BIG 1
+DECL_PASS pass_interleave_sse, PASS_BIG 0
+
+%macro FFT_CALC_FUNC 0
+cglobal fft_calc, 2,5,8
+ mov r3d, [r0 + FFTContext.nbits]
+ PUSH r1
+ PUSH r3
+ mov r0, r1
+ mov r1, r3
+ FFT_DISPATCH _interleave %+ SUFFIX, r1
+ POP rcx
+ POP r4
+ cmp rcx, 3+(mmsize/16)
+ jg .end
+ mov r2, -1
+ add rcx, 3
+ shl r2, cl
+ sub r4, r2
+.loop:
+%if mmsize == 8
+ PSWAPD m0, [r4 + r2 + 4]
+ mova [r4 + r2 + 4], m0
+%else
+ movaps xmm0, [r4 + r2]
+ movaps xmm1, xmm0
+ unpcklps xmm0, [r4 + r2 + 16]
+ unpckhps xmm1, [r4 + r2 + 16]
+ movaps [r4 + r2], xmm0
+ movaps [r4 + r2 + 16], xmm1
+%endif
+ add r2, mmsize*2
+ jl .loop
+.end:
+%if cpuflag(3dnow)
+ femms
+ RET
+%else
+ REP_RET
+%endif
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX 3dnow
+FFT_CALC_FUNC
+INIT_MMX 3dnowext
+FFT_CALC_FUNC
+%endif
+INIT_XMM sse
+FFT_CALC_FUNC
+
+cglobal fft_permute, 2,7,1
+ mov r4, [r0 + FFTContext.revtab]
+ mov r5, [r0 + FFTContext.tmpbuf]
+ mov ecx, [r0 + FFTContext.nbits]
+ mov r2, 1
+ shl r2, cl
+ xor r0, r0
+%if ARCH_X86_32
+ mov r1, r1m
+%endif
+.loop:
+ movaps xmm0, [r1 + 8*r0]
+ movzx r6, word [r4 + 2*r0]
+ movzx r3, word [r4 + 2*r0 + 2]
+ movlps [r5 + 8*r6], xmm0
+ movhps [r5 + 8*r3], xmm0
+ add r0, 2
+ cmp r0, r2
+ jl .loop
+ shl r2, 3
+ add r1, r2
+ add r5, r2
+ neg r2
+; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
+.loopcopy:
+ movaps xmm0, [r5 + r2]
+ movaps xmm1, [r5 + r2 + 16]
+ movaps [r1 + r2], xmm0
+ movaps [r1 + r2 + 16], xmm1
+ add r2, 32
+ jl .loopcopy
+ REP_RET
+
+%macro IMDCT_CALC_FUNC 0
+cglobal imdct_calc, 3,5,3
+ mov r3d, [r0 + FFTContext.mdctsize]
+ mov r4, [r0 + FFTContext.imdcthalf]
+ add r1, r3
+ PUSH r3
+ PUSH r1
+%if ARCH_X86_32
+ push r2
+ push r1
+ push r0
+%else
+ sub rsp, 8
+%endif
+ call r4
+%if ARCH_X86_32
+ add esp, 12
+%else
+ add rsp, 8
+%endif
+ POP r1
+ POP r3
+ lea r0, [r1 + 2*r3]
+ mov r2, r3
+ sub r3, mmsize
+ neg r2
+ mova m2, [ps_m1m1m1m1]
+.loop:
+%if mmsize == 8
+ PSWAPD m0, [r1 + r3]
+ PSWAPD m1, [r0 + r2]
+ pxor m0, m2
+%else
+ mova m0, [r1 + r3]
+ mova m1, [r0 + r2]
+ shufps m0, m0, 0x1b
+ shufps m1, m1, 0x1b
+ xorps m0, m2
+%endif
+ mova [r0 + r3], m1
+ mova [r1 + r2], m0
+ sub r3, mmsize
+ add r2, mmsize
+ jl .loop
+%if cpuflag(3dnow)
+ femms
+ RET
+%else
+ REP_RET
+%endif
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX 3dnow
+IMDCT_CALC_FUNC
+INIT_MMX 3dnowext
+IMDCT_CALC_FUNC
+%endif
+
+INIT_XMM sse
+IMDCT_CALC_FUNC
+
+%if ARCH_X86_32
+INIT_MMX 3dnow
+%define mulps pfmul
+%define addps pfadd
+%define subps pfsub
+%define unpcklps punpckldq
+%define unpckhps punpckhdq
+DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
+DECL_PASS pass_interleave_3dnow, PASS_BIG 0
+%define pass_3dnowext pass_3dnow
+%define pass_interleave_3dnowext pass_interleave_3dnow
+%endif
+
+%ifdef PIC
+%define SECTION_REL - $$
+%else
+%define SECTION_REL
+%endif
+
+%macro DECL_FFT 1-2 ; nbits, suffix
+%ifidn %0, 1
+%xdefine fullsuffix SUFFIX
+%else
+%xdefine fullsuffix %2 %+ SUFFIX
+%endif
+%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
+%if %1>=5
+%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
+%endif
+%if %1>=6
+%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
+%endif
+
+%assign n 1<<%1
+%rep 17-%1
+%assign n2 n/2
+%assign n4 n/4
+%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
+
+align 16
+fft %+ n %+ fullsuffix:
+ call fft %+ n2 %+ SUFFIX
+ add r0, n*4 - (n&(-2<<%1))
+ call fft %+ n4 %+ SUFFIX
+ add r0, n*2 - (n2&(-2<<%1))
+ call fft %+ n4 %+ SUFFIX
+ sub r0, n*6 + (n2&(-2<<%1))
+ lea r1, [cos_ %+ n]
+ mov r2d, n4/2
+ jmp pass %+ fullsuffix
+
+%assign n n*2
+%endrep
+%undef n
+
+align 8
+dispatch_tab %+ fullsuffix: pointer list_of_fft
+%endmacro ; DECL_FFT
+
+%if HAVE_AVX_EXTERNAL
+INIT_YMM avx
+DECL_FFT 6
+DECL_FFT 6, _interleave
+%endif
+INIT_XMM sse
+DECL_FFT 5
+DECL_FFT 5, _interleave
+%if ARCH_X86_32
+INIT_MMX 3dnow
+DECL_FFT 4
+DECL_FFT 4, _interleave
+INIT_MMX 3dnowext
+DECL_FFT 4
+DECL_FFT 4, _interleave
+%endif
+
+INIT_XMM sse
+%undef mulps
+%undef addps
+%undef subps
+%undef unpcklps
+%undef unpckhps
+
+%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
+%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
+ PSWAPD m0, [%3+%2*4]
+ movq m2, [%3+%1*4-8]
+ movq m3, m0
+ punpckldq m0, m2
+ punpckhdq m2, m3
+ movd m1, [%4+%1*2-4] ; tcos[j]
+ movd m3, [%4+%2*2] ; tcos[n4-j-1]
+ punpckldq m1, [%5+%1*2-4] ; tsin[j]
+ punpckldq m3, [%5+%2*2] ; tsin[n4-j-1]
+
+ mova m4, m0
+ PSWAPD m5, m1
+ pfmul m0, m1
+ pfmul m4, m5
+ mova m6, m2
+ PSWAPD m5, m3
+ pfmul m2, m3
+ pfmul m6, m5
+%if cpuflag(3dnowext)
+ pfpnacc m0, m4
+ pfpnacc m2, m6
+%else
+ SBUTTERFLY dq, 0, 4, 1
+ SBUTTERFLY dq, 2, 6, 3
+ pxor m4, m7
+ pxor m6, m7
+ pfadd m0, m4
+ pfadd m2, m6
+%endif
+%else
+ movaps xmm0, [%3+%2*4]
+ movaps xmm1, [%3+%1*4-0x10]
+ movaps xmm2, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm1, xmm2, 0x77
+ movlps xmm4, [%4+%2*2]
+ movlps xmm5, [%5+%2*2+0x0]
+ movhps xmm4, [%4+%1*2-0x8]
+ movhps xmm5, [%5+%1*2-0x8]
+ movaps xmm2, xmm0
+ movaps xmm3, xmm1
+ mulps xmm0, xmm5
+ mulps xmm1, xmm4
+ mulps xmm2, xmm4
+ mulps xmm3, xmm5
+ subps xmm1, xmm0
+ addps xmm2, xmm3
+ movaps xmm0, xmm1
+ unpcklps xmm1, xmm2
+ unpckhps xmm0, xmm2
+%endif
+%endmacro
+
+%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
+ mulps m6, %3, [%5+%1]
+ mulps m7, %2, [%5+%1]
+ mulps %2, %2, [%6+%1]
+ mulps %3, %3, [%6+%1]
+ subps %2, %2, m6
+ addps %3, %3, m7
+%endmacro
+
+%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
+.post:
+ vmovaps ymm1, [%3+%1*2]
+ vmovaps ymm0, [%3+%1*2+0x20]
+ vmovaps ymm3, [%3+%2*2]
+ vmovaps ymm2, [%3+%2*2+0x20]
+
+ CMUL %1, ymm0, ymm1, %3, %4, %5
+ CMUL %2, ymm2, ymm3, %3, %4, %5
+ vshufps ymm1, ymm1, ymm1, 0x1b
+ vshufps ymm3, ymm3, ymm3, 0x1b
+ vperm2f128 ymm1, ymm1, ymm1, 0x01
+ vperm2f128 ymm3, ymm3, ymm3, 0x01
+ vunpcklps ymm6, ymm2, ymm1
+ vunpckhps ymm4, ymm2, ymm1
+ vunpcklps ymm7, ymm0, ymm3
+ vunpckhps ymm5, ymm0, ymm3
+
+ vextractf128 [%3+%1*2], ymm7, 0
+ vextractf128 [%3+%1*2+0x10], ymm5, 0
+ vextractf128 [%3+%1*2+0x20], ymm7, 1
+ vextractf128 [%3+%1*2+0x30], ymm5, 1
+
+ vextractf128 [%3+%2*2], ymm6, 0
+ vextractf128 [%3+%2*2+0x10], ymm4, 0
+ vextractf128 [%3+%2*2+0x20], ymm6, 1
+ vextractf128 [%3+%2*2+0x30], ymm4, 1
+ sub %2, 0x20
+ add %1, 0x20
+ jl .post
+%endmacro
+
+%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
+.post:
+ movaps xmm1, [%3+%1*2]
+ movaps xmm0, [%3+%1*2+0x10]
+ CMUL %1, xmm0, xmm1, %3, %4, %5
+ movaps xmm5, [%3+%2*2]
+ movaps xmm4, [%3+%2*2+0x10]
+ CMUL %2, xmm4, xmm5, %3, %4, %5
+ shufps xmm1, xmm1, 0x1b
+ shufps xmm5, xmm5, 0x1b
+ movaps xmm6, xmm4
+ unpckhps xmm4, xmm1
+ unpcklps xmm6, xmm1
+ movaps xmm2, xmm0
+ unpcklps xmm0, xmm5
+ unpckhps xmm2, xmm5
+ movaps [%3+%2*2], xmm6
+ movaps [%3+%2*2+0x10], xmm4
+ movaps [%3+%1*2], xmm0
+ movaps [%3+%1*2+0x10], xmm2
+ sub %2, 0x10
+ add %1, 0x10
+ jl .post
+%endmacro
+
+%macro CMUL_3DNOW 6
+ mova m6, [%1+%2*2]
+ mova %3, [%1+%2*2+8]
+ mova %4, m6
+ mova m7, %3
+ pfmul m6, [%5+%2]
+ pfmul %3, [%6+%2]
+ pfmul %4, [%6+%2]
+ pfmul m7, [%5+%2]
+ pfsub %3, m6
+ pfadd %4, m7
+%endmacro
+
+%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8
+.post:
+ CMUL_3DNOW %3, %1, m0, m1, %4, %5
+ CMUL_3DNOW %3, %2, m2, m3, %4, %5
+ movd [%3+%1*2+ 0], m0
+ movd [%3+%2*2+12], m1
+ movd [%3+%2*2+ 0], m2
+ movd [%3+%1*2+12], m3
+ psrlq m0, 32
+ psrlq m1, 32
+ psrlq m2, 32
+ psrlq m3, 32
+ movd [%3+%1*2+ 8], m0
+ movd [%3+%2*2+ 4], m1
+ movd [%3+%2*2+ 8], m2
+ movd [%3+%1*2+ 4], m3
+ sub %2, 8
+ add %1, 8
+ jl .post
+%endmacro
+
+%macro DECL_IMDCT 1
+cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
+%if ARCH_X86_64
+%define rrevtab r7
+%define rtcos r8
+%define rtsin r9
+%else
+%define rrevtab r6
+%define rtsin r6
+%define rtcos r5
+%endif
+ mov r3d, [r0+FFTContext.mdctsize]
+ add r2, r3
+ shr r3, 1
+ mov rtcos, [r0+FFTContext.tcos]
+ mov rtsin, [r0+FFTContext.tsin]
+ add rtcos, r3
+ add rtsin, r3
+%if ARCH_X86_64 == 0
+ push rtcos
+ push rtsin
+%endif
+ shr r3, 1
+ mov rrevtab, [r0+FFTContext.revtab]
+ add rrevtab, r3
+%if ARCH_X86_64 == 0
+ push rrevtab
+%endif
+
+%if mmsize == 8
+ sub r3, 2
+%else
+ sub r3, 4
+%endif
+%if ARCH_X86_64 || mmsize == 8
+ xor r4, r4
+ sub r4, r3
+%endif
+%if notcpuflag(3dnowext) && mmsize == 8
+ movd m7, [ps_m1m1m1m1]
+%endif
+.pre:
+%if ARCH_X86_64 == 0
+;unspill
+%if mmsize != 8
+ xor r4, r4
+ sub r4, r3
+%endif
+ mov rtcos, [esp+8]
+ mov rtsin, [esp+4]
+%endif
+
+ PREROTATER r4, r3, r2, rtcos, rtsin
+%if mmsize == 8
+ mov r6, [esp] ; rrevtab = ptr+n8
+ movzx r5, word [rrevtab+r4-2] ; rrevtab[j]
+ movzx r6, word [rrevtab+r3] ; rrevtab[n4-j-1]
+ mova [r1+r5*8], m0
+ mova [r1+r6*8], m2
+ add r4, 2
+ sub r3, 2
+%else
+%if ARCH_X86_64
+ movzx r5, word [rrevtab+r4-4]
+ movzx r6, word [rrevtab+r4-2]
+ movzx r10, word [rrevtab+r3]
+ movzx r11, word [rrevtab+r3+2]
+ movlps [r1+r5 *8], xmm0
+ movhps [r1+r6 *8], xmm0
+ movlps [r1+r10*8], xmm1
+ movhps [r1+r11*8], xmm1
+ add r4, 4
+%else
+ mov r6, [esp]
+ movzx r5, word [r6+r4-4]
+ movzx r4, word [r6+r4-2]
+ movlps [r1+r5*8], xmm0
+ movhps [r1+r4*8], xmm0
+ movzx r5, word [r6+r3]
+ movzx r4, word [r6+r3+2]
+ movlps [r1+r5*8], xmm1
+ movhps [r1+r4*8], xmm1
+%endif
+ sub r3, 4
+%endif
+ jns .pre
+
+ mov r5, r0
+ mov r6, r1
+ mov r0, r1
+ mov r1d, [r5+FFTContext.nbits]
+
+ FFT_DISPATCH SUFFIX, r1
+
+ mov r0d, [r5+FFTContext.mdctsize]
+ add r6, r0
+ shr r0, 1
+%if ARCH_X86_64 == 0
+%define rtcos r2
+%define rtsin r3
+ mov rtcos, [esp+8]
+ mov rtsin, [esp+4]
+%endif
+ neg r0
+ mov r1, -mmsize
+ sub r1, r0
+ %1 r0, r1, r6, rtcos, rtsin
+%if ARCH_X86_64 == 0
+ add esp, 12
+%endif
+%if mmsize == 8
+ femms
+%endif
+ RET
+%endmacro
+
+DECL_IMDCT POSROTATESHUF
+
+%if ARCH_X86_32
+INIT_MMX 3dnow
+DECL_IMDCT POSROTATESHUF_3DNOW
+
+INIT_MMX 3dnowext
+DECL_IMDCT POSROTATESHUF_3DNOW
+%endif
+
+INIT_YMM avx
+
+%if HAVE_AVX_EXTERNAL
+DECL_IMDCT POSROTATESHUF_AVX
+%endif
diff --git a/ffmpeg/libavcodec/x86/fft.h b/ffmpeg/libavcodec/x86/fft.h
new file mode 100644
index 0000000..3f8b21d
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/fft.h
@@ -0,0 +1,41 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_FFT_H
+#define AVCODEC_X86_FFT_H
+
+#include "libavcodec/fft.h"
+
+void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z);
+
+void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
+void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in);
+void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
+
+#endif /* AVCODEC_X86_FFT_H */
diff --git a/ffmpeg/libavcodec/x86/fft_init.c b/ffmpeg/libavcodec/x86/fft_init.c
new file mode 100644
index 0000000..bfa7947
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/fft_init.c
@@ -0,0 +1,68 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/dct.h"
+#include "fft.h"
+
+av_cold void ff_fft_init_x86(FFTContext *s)
+{
+ int has_vectors = av_get_cpu_flags();
+#if ARCH_X86_32
+ if (EXTERNAL_AMD3DNOW(has_vectors)) {
+ /* 3DNow! for K6-2/3 */
+ s->imdct_calc = ff_imdct_calc_3dnow;
+ s->imdct_half = ff_imdct_half_3dnow;
+ s->fft_calc = ff_fft_calc_3dnow;
+ }
+ if (EXTERNAL_AMD3DNOWEXT(has_vectors)) {
+ /* 3DNowEx for K7 */
+ s->imdct_calc = ff_imdct_calc_3dnowext;
+ s->imdct_half = ff_imdct_half_3dnowext;
+ s->fft_calc = ff_fft_calc_3dnowext;
+ }
+#endif
+ if (EXTERNAL_SSE(has_vectors)) {
+ /* SSE for P3/P4/K8 */
+ s->imdct_calc = ff_imdct_calc_sse;
+ s->imdct_half = ff_imdct_half_sse;
+ s->fft_permute = ff_fft_permute_sse;
+ s->fft_calc = ff_fft_calc_sse;
+ s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
+ }
+ if (EXTERNAL_AVX(has_vectors) && s->nbits >= 5) {
+ /* AVX for SB */
+ s->imdct_half = ff_imdct_half_avx;
+ s->fft_calc = ff_fft_calc_avx;
+ s->fft_permutation = FF_FFT_PERM_AVX;
+ }
+}
+
+#if CONFIG_DCT
+av_cold void ff_dct_init_x86(DCTContext *s)
+{
+ int has_vectors = av_get_cpu_flags();
+ if (EXTERNAL_SSE(has_vectors))
+ s->dct32 = ff_dct32_float_sse;
+ if (EXTERNAL_SSE2(has_vectors))
+ s->dct32 = ff_dct32_float_sse2;
+ if (EXTERNAL_AVX(has_vectors))
+ s->dct32 = ff_dct32_float_avx;
+}
+#endif
diff --git a/ffmpeg/libavcodec/x86/fmtconvert.asm b/ffmpeg/libavcodec/x86/fmtconvert.asm
new file mode 100644
index 0000000..1bd13fc
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/fmtconvert.asm
@@ -0,0 +1,429 @@
+;******************************************************************************
+;* x86 optimized Format Conversion Utils
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+%macro CVTPS2PI 2
+%if cpuflag(sse)
+ cvtps2pi %1, %2
+%elif cpuflag(3dnow)
+ pf2id %1, %2
+%endif
+%endmacro
+
+;---------------------------------------------------------------------------------
+; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
+;---------------------------------------------------------------------------------
+%macro INT32_TO_FLOAT_FMUL_SCALAR 1
+%if UNIX64
+cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
+%else
+cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
+%endif
+%if WIN64
+ SWAP 0, 2
+%elif ARCH_X86_32
+ movss m0, mulm
+%endif
+ SPLATD m0
+ shl lenq, 2
+ add srcq, lenq
+ add dstq, lenq
+ neg lenq
+.loop:
+%if cpuflag(sse2)
+ cvtdq2ps m1, [srcq+lenq ]
+ cvtdq2ps m2, [srcq+lenq+16]
+%else
+ cvtpi2ps m1, [srcq+lenq ]
+ cvtpi2ps m3, [srcq+lenq+ 8]
+ cvtpi2ps m2, [srcq+lenq+16]
+ cvtpi2ps m4, [srcq+lenq+24]
+ movlhps m1, m3
+ movlhps m2, m4
+%endif
+ mulps m1, m0
+ mulps m2, m0
+ mova [dstq+lenq ], m1
+ mova [dstq+lenq+16], m2
+ add lenq, 32
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse
+INT32_TO_FLOAT_FMUL_SCALAR 5
+INIT_XMM sse2
+INT32_TO_FLOAT_FMUL_SCALAR 3
+
+
+;------------------------------------------------------------------------------
+; void ff_float_to_int16(int16_t *dst, const float *src, long len);
+;------------------------------------------------------------------------------
+%macro FLOAT_TO_INT16 1
+cglobal float_to_int16, 3, 3, %1, dst, src, len
+ add lenq, lenq
+ lea srcq, [srcq+2*lenq]
+ add dstq, lenq
+ neg lenq
+.loop:
+%if cpuflag(sse2)
+ cvtps2dq m0, [srcq+2*lenq ]
+ cvtps2dq m1, [srcq+2*lenq+16]
+ packssdw m0, m1
+ mova [dstq+lenq], m0
+%else
+ CVTPS2PI m0, [srcq+2*lenq ]
+ CVTPS2PI m1, [srcq+2*lenq+ 8]
+ CVTPS2PI m2, [srcq+2*lenq+16]
+ CVTPS2PI m3, [srcq+2*lenq+24]
+ packssdw m0, m1
+ packssdw m2, m3
+ mova [dstq+lenq ], m0
+ mova [dstq+lenq+8], m2
+%endif
+ add lenq, 16
+ js .loop
+%if mmsize == 8
+ emms
+%endif
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLOAT_TO_INT16 2
+INIT_MMX sse
+FLOAT_TO_INT16 0
+INIT_MMX 3dnow
+FLOAT_TO_INT16 0
+
+;------------------------------------------------------------------------------
+; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
+;------------------------------------------------------------------------------
+%macro FLOAT_TO_INT16_STEP 1
+cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2
+ add lenq, lenq
+ lea srcq, [srcq+2*lenq]
+ lea step3q, [stepq*3]
+ neg lenq
+.loop:
+%if cpuflag(sse2)
+ cvtps2dq m0, [srcq+2*lenq ]
+ cvtps2dq m1, [srcq+2*lenq+16]
+ packssdw m0, m1
+ movd v1d, m0
+ psrldq m0, 4
+ movd v2d, m0
+ psrldq m0, 4
+ mov [dstq], v1w
+ mov [dstq+stepq*4], v2w
+ shr v1d, 16
+ shr v2d, 16
+ mov [dstq+stepq*2], v1w
+ mov [dstq+step3q*2], v2w
+ lea dstq, [dstq+stepq*8]
+ movd v1d, m0
+ psrldq m0, 4
+ movd v2d, m0
+ mov [dstq], v1w
+ mov [dstq+stepq*4], v2w
+ shr v1d, 16
+ shr v2d, 16
+ mov [dstq+stepq*2], v1w
+ mov [dstq+step3q*2], v2w
+ lea dstq, [dstq+stepq*8]
+%else
+ CVTPS2PI m0, [srcq+2*lenq ]
+ CVTPS2PI m1, [srcq+2*lenq+ 8]
+ CVTPS2PI m2, [srcq+2*lenq+16]
+ CVTPS2PI m3, [srcq+2*lenq+24]
+ packssdw m0, m1
+ packssdw m2, m3
+ movd v1d, m0
+ psrlq m0, 32
+ movd v2d, m0
+ mov [dstq], v1w
+ mov [dstq+stepq*4], v2w
+ shr v1d, 16
+ shr v2d, 16
+ mov [dstq+stepq*2], v1w
+ mov [dstq+step3q*2], v2w
+ lea dstq, [dstq+stepq*8]
+ movd v1d, m2
+ psrlq m2, 32
+ movd v2d, m2
+ mov [dstq], v1w
+ mov [dstq+stepq*4], v2w
+ shr v1d, 16
+ shr v2d, 16
+ mov [dstq+stepq*2], v1w
+ mov [dstq+step3q*2], v2w
+ lea dstq, [dstq+stepq*8]
+%endif
+ add lenq, 16
+ js .loop
+%if mmsize == 8
+ emms
+%endif
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLOAT_TO_INT16_STEP 2
+INIT_MMX sse
+FLOAT_TO_INT16_STEP 0
+INIT_MMX 3dnow
+FLOAT_TO_INT16_STEP 0
+
+;-------------------------------------------------------------------------------
+; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
+;-------------------------------------------------------------------------------
+%macro FLOAT_TO_INT16_INTERLEAVE2 0
+cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len
+ lea lenq, [4*r2q]
+ mov src1q, [src0q+gprsize]
+ mov src0q, [src0q]
+ add dstq, lenq
+ add src0q, lenq
+ add src1q, lenq
+ neg lenq
+.loop:
+%if cpuflag(sse2)
+ cvtps2dq m0, [src0q+lenq]
+ cvtps2dq m1, [src1q+lenq]
+ packssdw m0, m1
+ movhlps m1, m0
+ punpcklwd m0, m1
+ mova [dstq+lenq], m0
+%else
+ CVTPS2PI m0, [src0q+lenq ]
+ CVTPS2PI m1, [src0q+lenq+8]
+ CVTPS2PI m2, [src1q+lenq ]
+ CVTPS2PI m3, [src1q+lenq+8]
+ packssdw m0, m1
+ packssdw m2, m3
+ mova m1, m0
+ punpcklwd m0, m2
+ punpckhwd m1, m2
+ mova [dstq+lenq ], m0
+ mova [dstq+lenq+8], m1
+%endif
+ add lenq, 16
+ js .loop
+%if mmsize == 8
+ emms
+%endif
+ REP_RET
+%endmacro
+
+INIT_MMX 3dnow
+FLOAT_TO_INT16_INTERLEAVE2
+INIT_MMX sse
+FLOAT_TO_INT16_INTERLEAVE2
+INIT_XMM sse2
+FLOAT_TO_INT16_INTERLEAVE2
+
+%macro FLOAT_TO_INT16_INTERLEAVE6 0
+; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
+cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len
+%if ARCH_X86_64
+ mov lend, r2d
+%else
+ %define lend dword r2m
+%endif
+ mov src1q, [srcq+1*gprsize]
+ mov src2q, [srcq+2*gprsize]
+ mov src3q, [srcq+3*gprsize]
+ mov src4q, [srcq+4*gprsize]
+ mov src5q, [srcq+5*gprsize]
+ mov srcq, [srcq]
+ sub src1q, srcq
+ sub src2q, srcq
+ sub src3q, srcq
+ sub src4q, srcq
+ sub src5q, srcq
+.loop:
+ CVTPS2PI mm0, [srcq]
+ CVTPS2PI mm1, [srcq+src1q]
+ CVTPS2PI mm2, [srcq+src2q]
+ CVTPS2PI mm3, [srcq+src3q]
+ CVTPS2PI mm4, [srcq+src4q]
+ CVTPS2PI mm5, [srcq+src5q]
+ packssdw mm0, mm3
+ packssdw mm1, mm4
+ packssdw mm2, mm5
+ PSWAPD mm3, mm0
+ punpcklwd mm0, mm1
+ punpckhwd mm1, mm2
+ punpcklwd mm2, mm3
+ PSWAPD mm3, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm2, mm1
+ punpckldq mm1, mm3
+ movq [dstq ], mm0
+ movq [dstq+16], mm2
+ movq [dstq+ 8], mm1
+ add srcq, 8
+ add dstq, 24
+ sub lend, 2
+ jg .loop
+ emms
+ RET
+%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
+
+INIT_MMX sse
+FLOAT_TO_INT16_INTERLEAVE6
+INIT_MMX 3dnow
+FLOAT_TO_INT16_INTERLEAVE6
+INIT_MMX 3dnowext
+FLOAT_TO_INT16_INTERLEAVE6
+
+;-----------------------------------------------------------------------------
+; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
+;-----------------------------------------------------------------------------
+
+%macro FLOAT_INTERLEAVE6 1
+cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len
+%if ARCH_X86_64
+ mov lend, r2d
+%else
+ %define lend dword r2m
+%endif
+ mov src1q, [srcq+1*gprsize]
+ mov src2q, [srcq+2*gprsize]
+ mov src3q, [srcq+3*gprsize]
+ mov src4q, [srcq+4*gprsize]
+ mov src5q, [srcq+5*gprsize]
+ mov srcq, [srcq]
+ sub src1q, srcq
+ sub src2q, srcq
+ sub src3q, srcq
+ sub src4q, srcq
+ sub src5q, srcq
+.loop:
+%if cpuflag(sse)
+ movaps m0, [srcq]
+ movaps m1, [srcq+src1q]
+ movaps m2, [srcq+src2q]
+ movaps m3, [srcq+src3q]
+ movaps m4, [srcq+src4q]
+ movaps m5, [srcq+src5q]
+
+ SBUTTERFLYPS 0, 1, 6
+ SBUTTERFLYPS 2, 3, 6
+ SBUTTERFLYPS 4, 5, 6
+
+ movaps m6, m4
+ shufps m4, m0, 0xe4
+ movlhps m0, m2
+ movhlps m6, m2
+ movaps [dstq ], m0
+ movaps [dstq+16], m4
+ movaps [dstq+32], m6
+
+ movaps m6, m5
+ shufps m5, m1, 0xe4
+ movlhps m1, m3
+ movhlps m6, m3
+ movaps [dstq+48], m1
+ movaps [dstq+64], m5
+ movaps [dstq+80], m6
+%else ; mmx
+ movq m0, [srcq]
+ movq m1, [srcq+src1q]
+ movq m2, [srcq+src2q]
+ movq m3, [srcq+src3q]
+ movq m4, [srcq+src4q]
+ movq m5, [srcq+src5q]
+
+ SBUTTERFLY dq, 0, 1, 6
+ SBUTTERFLY dq, 2, 3, 6
+ SBUTTERFLY dq, 4, 5, 6
+ movq [dstq ], m0
+ movq [dstq+ 8], m2
+ movq [dstq+16], m4
+ movq [dstq+24], m1
+ movq [dstq+32], m3
+ movq [dstq+40], m5
+%endif
+ add srcq, mmsize
+ add dstq, mmsize*6
+ sub lend, mmsize/4
+ jg .loop
+%if mmsize == 8
+ emms
+%endif
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+FLOAT_INTERLEAVE6 0
+INIT_XMM sse
+FLOAT_INTERLEAVE6 7
+
+;-----------------------------------------------------------------------------
+; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
+;-----------------------------------------------------------------------------
+
+%macro FLOAT_INTERLEAVE2 1
+cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
+ mov src1q, [srcq+gprsize]
+ mov srcq, [srcq ]
+ sub src1q, srcq
+.loop:
+ mova m0, [srcq ]
+ mova m1, [srcq+src1q ]
+ mova m3, [srcq +mmsize]
+ mova m4, [srcq+src1q+mmsize]
+
+ mova m2, m0
+ PUNPCKLDQ m0, m1
+ PUNPCKHDQ m2, m1
+
+ mova m1, m3
+ PUNPCKLDQ m3, m4
+ PUNPCKHDQ m1, m4
+
+ mova [dstq ], m0
+ mova [dstq+1*mmsize], m2
+ mova [dstq+2*mmsize], m3
+ mova [dstq+3*mmsize], m1
+
+ add srcq, mmsize*2
+ add dstq, mmsize*4
+ sub lend, mmsize/2
+ jg .loop
+%if mmsize == 8
+ emms
+%endif
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+%define PUNPCKLDQ punpckldq
+%define PUNPCKHDQ punpckhdq
+FLOAT_INTERLEAVE2 0
+INIT_XMM sse
+%define PUNPCKLDQ unpcklps
+%define PUNPCKHDQ unpckhps
+FLOAT_INTERLEAVE2 5
diff --git a/ffmpeg/libavcodec/x86/fmtconvert_init.c b/ffmpeg/libavcodec/x86/fmtconvert_init.c
new file mode 100644
index 0000000..4a4c017
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/fmtconvert_init.c
@@ -0,0 +1,148 @@
+/*
+ * Format Conversion Utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/fmtconvert.h"
+
+#if HAVE_YASM
+
+void ff_int32_to_float_fmul_scalar_sse (float *dst, const int *src, float mul, int len);
+void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len);
+
+void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
+void ff_float_to_int16_sse (int16_t *dst, const float *src, long len);
+void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
+
+void ff_float_to_int16_step_3dnow(int16_t *dst, const float *src, long len, long step);
+void ff_float_to_int16_step_sse (int16_t *dst, const float *src, long len, long step);
+void ff_float_to_int16_step_sse2 (int16_t *dst, const float *src, long len, long step);
+
+void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len);
+void ff_float_to_int16_interleave2_sse (int16_t *dst, const float **src, long len);
+void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len);
+
+void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
+void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
+void ff_float_to_int16_interleave6_3dnowext(int16_t *dst, const float **src, int len);
+
+#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
+
+#define FLOAT_TO_INT16_INTERLEAVE(cpu) \
+/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
+static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
+ int c;\
+ for(c=0; c<channels; c++){\
+ ff_float_to_int16_step_##cpu(dst+c, src[c], len, channels);\
+ }\
+}\
+\
+static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
+ if(channels==1)\
+ ff_float_to_int16_##cpu(dst, src[0], len);\
+ else if(channels==2){\
+ ff_float_to_int16_interleave2_##cpu(dst, src, len);\
+ }else if(channels==6){\
+ ff_float_to_int16_interleave6_##cpu(dst, src, len);\
+ }else\
+ float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
+}
+
+FLOAT_TO_INT16_INTERLEAVE(3dnow)
+FLOAT_TO_INT16_INTERLEAVE(sse)
+FLOAT_TO_INT16_INTERLEAVE(sse2)
+
+static void float_to_int16_interleave_3dnowext(int16_t *dst, const float **src,
+ long len, int channels)
+{
+ if(channels==6)
+ ff_float_to_int16_interleave6_3dnowext(dst, src, len);
+ else
+ float_to_int16_interleave_3dnow(dst, src, len, channels);
+}
+
+void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
+void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);
+
+void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len);
+void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len);
+
+static void float_interleave_mmx(float *dst, const float **src,
+ unsigned int len, int channels)
+{
+ if (channels == 2) {
+ ff_float_interleave2_mmx(dst, src, len);
+ } else if (channels == 6)
+ ff_float_interleave6_mmx(dst, src, len);
+ else
+ ff_float_interleave_c(dst, src, len, channels);
+}
+
+static void float_interleave_sse(float *dst, const float **src,
+ unsigned int len, int channels)
+{
+ if (channels == 2) {
+ ff_float_interleave2_sse(dst, src, len);
+ } else if (channels == 6)
+ ff_float_interleave6_sse(dst, src, len);
+ else
+ ff_float_interleave_c(dst, src, len, channels);
+}
+#endif /* HAVE_YASM */
+
+av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
+{
+#if HAVE_YASM
+ int mm_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(mm_flags)) {
+ c->float_interleave = float_interleave_mmx;
+
+ if (EXTERNAL_AMD3DNOW(mm_flags)) {
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->float_to_int16 = ff_float_to_int16_3dnow;
+ c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
+ }
+ }
+ if (EXTERNAL_AMD3DNOWEXT(mm_flags)) {
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->float_to_int16_interleave = float_to_int16_interleave_3dnowext;
+ }
+ }
+ if (EXTERNAL_SSE(mm_flags)) {
+ c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
+ c->float_to_int16 = ff_float_to_int16_sse;
+ c->float_to_int16_interleave = float_to_int16_interleave_sse;
+ c->float_interleave = float_interleave_sse;
+ }
+ if (EXTERNAL_SSE2(mm_flags)) {
+ c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
+ c->float_to_int16 = ff_float_to_int16_sse2;
+ c->float_to_int16_interleave = float_to_int16_interleave_sse2;
+ }
+ }
+#endif /* HAVE_YASM */
+}
diff --git a/ffmpeg/libavcodec/x86/fpelbase.asm b/ffmpeg/libavcodec/x86/fpelbase.asm
new file mode 100644
index 0000000..a327206
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/fpelbase.asm
@@ -0,0 +1,106 @@
+;******************************************************************************
+;* MMX optimized DSP utils
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+INIT_MMX mmxext
+; void pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+%macro PIXELS48 2
+%if %2 == 4
+%define OP movh
+%else
+%define OP mova
+%endif
+cglobal %1_pixels%2, 4,5
+ movsxdifnidn r2, r2d
+ lea r4, [r2*3]
+.loop:
+ OP m0, [r1]
+ OP m1, [r1+r2]
+ OP m2, [r1+r2*2]
+ OP m3, [r1+r4]
+ lea r1, [r1+r2*4]
+%ifidn %1, avg
+ pavgb m0, [r0]
+ pavgb m1, [r0+r2]
+ pavgb m2, [r0+r2*2]
+ pavgb m3, [r0+r4]
+%endif
+ OP [r0], m0
+ OP [r0+r2], m1
+ OP [r0+r2*2], m2
+ OP [r0+r4], m3
+ sub r3d, 4
+ lea r0, [r0+r2*4]
+ jne .loop
+ RET
+%endmacro
+
+PIXELS48 put, 4
+PIXELS48 avg, 4
+PIXELS48 put, 8
+PIXELS48 avg, 8
+
+
+INIT_XMM sse2
+; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+cglobal put_pixels16, 4,5,4
+ lea r4, [r2*3]
+.loop:
+ movu m0, [r1]
+ movu m1, [r1+r2]
+ movu m2, [r1+r2*2]
+ movu m3, [r1+r4]
+ lea r1, [r1+r2*4]
+ mova [r0], m0
+ mova [r0+r2], m1
+ mova [r0+r2*2], m2
+ mova [r0+r4], m3
+ sub r3d, 4
+ lea r0, [r0+r2*4]
+ jnz .loop
+ REP_RET
+
+; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+cglobal avg_pixels16, 4,5,4
+ lea r4, [r2*3]
+.loop:
+ movu m0, [r1]
+ movu m1, [r1+r2]
+ movu m2, [r1+r2*2]
+ movu m3, [r1+r4]
+ lea r1, [r1+r2*4]
+ pavgb m0, [r0]
+ pavgb m1, [r0+r2]
+ pavgb m2, [r0+r2*2]
+ pavgb m3, [r0+r4]
+ mova [r0], m0
+ mova [r0+r2], m1
+ mova [r0+r2*2], m2
+ mova [r0+r4], m3
+ sub r3d, 4
+ lea r0, [r0+r2*4]
+ jnz .loop
+ REP_RET
diff --git a/ffmpeg/libavcodec/x86/h263_loopfilter.asm b/ffmpeg/libavcodec/x86/h263_loopfilter.asm
new file mode 100644
index 0000000..a21baf1
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/h263_loopfilter.asm
@@ -0,0 +1,189 @@
+;******************************************************************************
+;* MMX-optimized H.263 loop filter
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+cextern pb_FC
+cextern h263_loop_filter_strength
+
+SECTION_TEXT
+
+%macro H263_LOOP_FILTER 5
+ pxor m7, m7
+ mova m0, [%1]
+ mova m1, [%1]
+ mova m2, [%4]
+ mova m3, [%4]
+ punpcklbw m0, m7
+ punpckhbw m1, m7
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ psubw m0, m2
+ psubw m1, m3
+ mova m2, [%2]
+ mova m3, [%2]
+ mova m4, [%3]
+ mova m5, [%3]
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ punpcklbw m4, m7
+ punpckhbw m5, m7
+ psubw m4, m2
+ psubw m5, m3
+ psllw m4, 2
+ psllw m5, 2
+ paddw m4, m0
+ paddw m5, m1
+ pxor m6, m6
+ pcmpgtw m6, m4
+ pcmpgtw m7, m5
+ pxor m4, m6
+ pxor m5, m7
+ psubw m4, m6
+ psubw m5, m7
+ psrlw m4, 3
+ psrlw m5, 3
+ packuswb m4, m5
+ packsswb m6, m7
+ pxor m7, m7
+ movd m2, %5
+ punpcklbw m2, m2
+ punpcklbw m2, m2
+ punpcklbw m2, m2
+ psubusb m2, m4
+ mova m3, m2
+ psubusb m3, m4
+ psubb m2, m3
+ mova m3, [%2]
+ mova m4, [%3]
+ pxor m3, m6
+ pxor m4, m6
+ paddusb m3, m2
+ psubusb m4, m2
+ pxor m3, m6
+ pxor m4, m6
+ paddusb m2, m2
+ packsswb m0, m1
+ pcmpgtb m7, m0
+ pxor m0, m7
+ psubb m0, m7
+ mova m1, m0
+ psubusb m0, m2
+ psubb m1, m0
+ pand m1, [pb_FC]
+ psrlw m1, 2
+ pxor m1, m7
+ psubb m1, m7
+ mova m5, [%1]
+ mova m6, [%4]
+ psubb m5, m1
+ paddb m6, m1
+%endmacro
+
+INIT_MMX mmx
+; void h263_v_loop_filter(uint8_t *src, int stride, int qscale)
+cglobal h263_v_loop_filter, 3,5
+ movsxdifnidn r1, r1d
+ movsxdifnidn r2, r2d
+
+ lea r4, [h263_loop_filter_strength]
+ movzx r3d, BYTE [r4+r2]
+ movsx r2, r3b
+ shl r2, 1
+
+ mov r3, r0
+ sub r3, r1
+ mov r4, r3
+ sub r4, r1
+ H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d
+
+ mova [r3], m3
+ mova [r0], m4
+ mova [r4], m5
+ mova [r0+r1], m6
+ RET
+
+%macro TRANSPOSE4X4 2
+ movd m0, [%1]
+ movd m1, [%1+r1]
+ movd m2, [%1+r1*2]
+ movd m3, [%1+r3]
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ mova m1, m0
+ punpcklwd m0, m2
+ punpckhwd m1, m2
+ movd [%2+ 0], m0
+ punpckhdq m0, m0
+ movd [%2+ 8], m0
+ movd [%2+16], m1
+ punpckhdq m1, m1
+ movd [%2+24], m1
+%endmacro
+
+
+; void h263_h_loop_filter(uint8_t *src, int stride, int qscale)
+INIT_MMX mmx
+cglobal h263_h_loop_filter, 3,5,0,32
+ movsxdifnidn r1, r1d
+ movsxdifnidn r2, r2d
+
+ lea r4, [h263_loop_filter_strength]
+ movzx r3d, BYTE [r4+r2]
+ movsx r2, r3b
+ shl r2, 1
+
+ sub r0, 2
+ lea r3, [r1*3]
+
+ TRANSPOSE4X4 r0, rsp
+ lea r4, [r0+r1*4]
+ TRANSPOSE4X4 r4, rsp+4
+
+ H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d
+
+ mova m1, m5
+ mova m0, m4
+ punpcklbw m5, m3
+ punpcklbw m4, m6
+ punpckhbw m1, m3
+ punpckhbw m0, m6
+ mova m3, m5
+ mova m6, m1
+ punpcklwd m5, m4
+ punpcklwd m1, m0
+ punpckhwd m3, m4
+ punpckhwd m6, m0
+ movd [r0], m5
+ punpckhdq m5, m5
+ movd [r0+r1*1], m5
+ movd [r0+r1*2], m3
+ punpckhdq m3, m3
+ movd [r0+r3], m3
+ movd [r4], m1
+ punpckhdq m1, m1
+ movd [r4+r1*1], m1
+ movd [r4+r1*2], m6
+ punpckhdq m6, m6
+ movd [r4+r3], m6
+ RET
diff --git a/ffmpeg/libavcodec/x86/h264_chromamc.asm b/ffmpeg/libavcodec/x86/h264_chromamc.asm
new file mode 100644
index 0000000..32681aa
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/h264_chromamc.asm
@@ -0,0 +1,678 @@
+;******************************************************************************
+;* MMX/SSSE3-optimized functions for H264 chroma MC
+;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
+;* 2005-2008 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+rnd_rv40_2d_tbl: times 4 dw 0
+ times 4 dw 16
+ times 4 dw 32
+ times 4 dw 16
+ times 4 dw 32
+ times 4 dw 28
+ times 4 dw 32
+ times 4 dw 28
+ times 4 dw 0
+ times 4 dw 32
+ times 4 dw 16
+ times 4 dw 32
+ times 4 dw 32
+ times 4 dw 28
+ times 4 dw 32
+ times 4 dw 28
+rnd_rv40_1d_tbl: times 4 dw 0
+ times 4 dw 2
+ times 4 dw 4
+ times 4 dw 2
+ times 4 dw 4
+ times 4 dw 3
+ times 4 dw 4
+ times 4 dw 3
+ times 4 dw 0
+ times 4 dw 4
+ times 4 dw 2
+ times 4 dw 4
+ times 4 dw 4
+ times 4 dw 3
+ times 4 dw 4
+ times 4 dw 3
+
+cextern pw_3
+cextern pw_4
+cextern pw_8
+pw_28: times 8 dw 28
+cextern pw_32
+cextern pw_64
+
+SECTION .text
+
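+; mx == 0 and my == 0: no interpolation, just copy (put) or average with the
+; destination (avg) four 8-pixel rows per iteration; CHROMAMC_AVG expands to
+; nothing in the put case and to PAVGB in the avg case.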
+%macro mv0_pixels_mc8 0
+ lea r4, [r2*2 ]
+.next4rows:
+ movq mm0, [r1 ]
+ movq mm1, [r1+r2]
+ add r1, r4
+ CHROMAMC_AVG mm0, [r0 ]
+ CHROMAMC_AVG mm1, [r0+r2]
+ movq [r0 ], mm0
+ movq [r0+r2], mm1
+ add r0, r4
+ movq mm0, [r1 ]
+ movq mm1, [r1+r2]
+ add r1, r4
+ CHROMAMC_AVG mm0, [r0 ]
+ CHROMAMC_AVG mm1, [r0+r2]
+ movq [r0 ], mm0
+ movq [r0+r2], mm1
+ add r0, r4
+ sub r3d, 4
+ jne .next4rows
+%endmacro
+
+%macro chroma_mc8_mmx_func 2-3
+%ifidn %2, rv40
+%ifdef PIC
+%define rnd_1d_rv40 r8
+%define rnd_2d_rv40 r8
+%define extra_regs 2
+%else ; no-PIC
+%define rnd_1d_rv40 rnd_rv40_1d_tbl
+%define rnd_2d_rv40 rnd_rv40_2d_tbl
+%define extra_regs 1
+%endif ; PIC
+%else
+%define extra_regs 0
+%endif ; rv40
+; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
+; int stride, int h, int mx, int my)
+cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
+%if ARCH_X86_64
+ movsxd r2, r2d
+%endif
+ mov r6d, r5d
+ or r6d, r4d
+ jne .at_least_one_non_zero
+ ; mx == 0 AND my == 0 - no filter needed
+ mv0_pixels_mc8
+ REP_RET
+
+.at_least_one_non_zero:
+%ifidn %2, rv40
+%if ARCH_X86_64
+ mov r7, r5
+ and r7, 6 ; &~1 for mx/my=[0,7]
+ lea r7, [r7*4+r4]
+ sar r7d, 1
+%define rnd_bias r7
+%define dest_reg r0
+%else ; x86-32
+ mov r0, r5
+ and r0, 6 ; &~1 for mx/my=[0,7]
+ lea r0, [r0*4+r4]
+ sar r0d, 1
+%define rnd_bias r0
+%define dest_reg r5
+%endif
+%else ; vc1, h264
+%define rnd_bias 0
+%define dest_reg r0
+%endif
+
+ test r5d, r5d
+ mov r6, 1
+ je .my_is_zero
+ test r4d, r4d
+ mov r6, r2 ; dxy = x ? 1 : stride
+ jne .both_non_zero
+.my_is_zero:
+ ; mx == 0 XOR my == 0 - 1 dimensional filter only
+ or r4d, r5d ; x + y
+
+%ifidn %2, rv40
+%ifdef PIC
+ lea r8, [rnd_rv40_1d_tbl]
+%endif
+%if ARCH_X86_64 == 0
+ mov r5, r0m
+%endif
+%endif
+
+ movd m5, r4d
+ movq m4, [pw_8]
+ movq m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
+ punpcklwd m5, m5
+ punpckldq m5, m5 ; mm5 = B = x
+ pxor m7, m7
+ psubw m4, m5 ; mm4 = A = 8-x
+
+.next1drow:
+ movq m0, [r1 ] ; mm0 = src[0..7]
+ movq m2, [r1+r6] ; mm2 = src[1..8]
+
+ movq m1, m0
+ movq m3, m2
+ punpcklbw m0, m7
+ punpckhbw m1, m7
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ pmullw m0, m4 ; [mm0,mm1] = A * src[0..7]
+ pmullw m1, m4
+ pmullw m2, m5 ; [mm2,mm3] = B * src[1..8]
+ pmullw m3, m5
+
+ paddw m0, m6
+ paddw m1, m6
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 3
+ psrlw m1, 3
+ packuswb m0, m1
+ CHROMAMC_AVG m0, [dest_reg]
+ movq [dest_reg], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
+
+ add dest_reg, r2
+ add r1, r2
+ dec r3d
+ jne .next1drow
+ REP_RET
+
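+; 2-D (bilinear) case below: with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y and
+; D = x*y (A is built as x*y - (8x+8y) + 64, which is the same value), each output
+; byte is roughly
+;   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + rnd) >> 6
+; with rnd taken from rnd_2d_* (32 for h264, 28 for vc1, or the RV40 table entry).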
+.both_non_zero: ; general case, bilinear
+ movd m4, r4d ; x
+ movd m6, r5d ; y
+%ifidn %2, rv40
+%ifdef PIC
+ lea r8, [rnd_rv40_2d_tbl]
+%endif
+%if ARCH_X86_64 == 0
+ mov r5, r0m
+%endif
+%endif
+ mov r6, rsp ; backup stack pointer
+ and rsp, ~(mmsize-1) ; align stack
+ sub rsp, 16 ; AA and DD
+
+ punpcklwd m4, m4
+ punpcklwd m6, m6
+ punpckldq m4, m4 ; mm4 = x words
+ punpckldq m6, m6 ; mm6 = y words
+ movq m5, m4
+ pmullw m4, m6 ; mm4 = x * y
+ psllw m5, 3
+ psllw m6, 3
+ movq m7, m5
+ paddw m7, m6
+ movq [rsp+8], m4 ; DD = x * y
+ psubw m5, m4 ; mm5 = B = 8x - xy
+ psubw m6, m4 ; mm6 = C = 8y - xy
+ paddw m4, [pw_64]
+ psubw m4, m7 ; mm4 = A = xy - (8x+8y) + 64
+ pxor m7, m7
+ movq [rsp ], m4
+
+ movq m0, [r1 ] ; mm0 = src[0..7]
+ movq m1, [r1+1] ; mm1 = src[1..8]
+.next2drow:
+ add r1, r2
+
+ movq m2, m0
+ movq m3, m1
+ punpckhbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ pmullw m0, [rsp]
+ pmullw m2, [rsp]
+ pmullw m1, m5
+ pmullw m3, m5
+ paddw m2, m1 ; mm2 = A * src[0..3] + B * src[1..4]
+ paddw m3, m0 ; mm3 = A * src[4..7] + B * src[5..8]
+
+ movq m0, [r1]
+ movq m1, m0
+ punpcklbw m0, m7
+ punpckhbw m1, m7
+ pmullw m0, m6
+ pmullw m1, m6
+ paddw m2, m0
+ paddw m3, m1 ; [mm2,mm3] += C * src[0..7]
+
+ movq m1, [r1+1]
+ movq m0, m1
+ movq m4, m1
+ punpcklbw m0, m7
+ punpckhbw m4, m7
+ pmullw m0, [rsp+8]
+ pmullw m4, [rsp+8]
+ paddw m2, m0
+ paddw m3, m4 ; [mm2,mm3] += D * src[1..8]
+ movq m0, [r1]
+
+ paddw m2, [rnd_2d_%2+rnd_bias*8]
+ paddw m3, [rnd_2d_%2+rnd_bias*8]
+ psrlw m2, 6
+ psrlw m3, 6
+ packuswb m2, m3
+ CHROMAMC_AVG m2, [dest_reg]
+ movq [dest_reg], m2 ; dst[0..7] = ([mm2,mm3] + rnd) >> 6
+
+ add dest_reg, r2
+ dec r3d
+ jne .next2drow
+ mov rsp, r6 ; restore stack pointer
+ RET
+%endmacro
+
+%macro chroma_mc4_mmx_func 2
+%define extra_regs 0
+%ifidn %2, rv40
+%ifdef PIC
+%define extra_regs 1
+%endif ; PIC
+%endif ; rv40
+cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
+%if ARCH_X86_64
+ movsxd r2, r2d
+%endif
+ pxor m7, m7
+ movd m2, r4d ; x
+ movd m3, r5d ; y
+ movq m4, [pw_8]
+ movq m5, [pw_8]
+ punpcklwd m2, m2
+ punpcklwd m3, m3
+ punpcklwd m2, m2
+ punpcklwd m3, m3
+ psubw m4, m2
+ psubw m5, m3
+
+%ifidn %2, rv40
+%ifdef PIC
+ lea r6, [rnd_rv40_2d_tbl]
+%define rnd_2d_rv40 r6
+%else
+%define rnd_2d_rv40 rnd_rv40_2d_tbl
+%endif
+ and r5, 6 ; &~1 for mx/my=[0,7]
+ lea r5, [r5*4+r4]
+ sar r5d, 1
+%define rnd_bias r5
+%else ; vc1, h264
+%define rnd_bias 0
+%endif
+
+ movd m0, [r1 ]
+ movd m6, [r1+1]
+ add r1, r2
+ punpcklbw m0, m7
+ punpcklbw m6, m7
+ pmullw m0, m4
+ pmullw m6, m2
+ paddw m6, m0
+
+.next2rows:
+ movd m0, [r1 ]
+ movd m1, [r1+1]
+ add r1, r2
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ pmullw m0, m4
+ pmullw m1, m2
+ paddw m1, m0
+ movq m0, m1
+
+ pmullw m6, m5
+ pmullw m1, m3
+ paddw m6, [rnd_2d_%2+rnd_bias*8]
+ paddw m1, m6
+ psrlw m1, 6
+ packuswb m1, m1
+ CHROMAMC_AVG4 m1, m6, [r0]
+ movd [r0], m1
+ add r0, r2
+
+ movd m6, [r1 ]
+ movd m1, [r1+1]
+ add r1, r2
+ punpcklbw m6, m7
+ punpcklbw m1, m7
+ pmullw m6, m4
+ pmullw m1, m2
+ paddw m1, m6
+ movq m6, m1
+ pmullw m0, m5
+ pmullw m1, m3
+ paddw m0, [rnd_2d_%2+rnd_bias*8]
+ paddw m1, m0
+ psrlw m1, 6
+ packuswb m1, m1
+ CHROMAMC_AVG4 m1, m0, [r0]
+ movd [r0], m1
+ add r0, r2
+ sub r3d, 2
+ jnz .next2rows
+ REP_RET
+%endmacro
+
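+; 2-pixel-wide case: the weights are packed as word pairs {A,B} = {(8-x)*(8-y), x*(8-y)}
+; and {C,D} = {(8-x)*y, x*y}, and the source is shuffled to src[0,1,1,2], so a single
+; pmaddwd per row produces A*src[i] + B*src[i+1] (resp. C/D for the next row) in each
+; dword lane.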
+%macro chroma_mc2_mmx_func 2
+cglobal %1_%2_chroma_mc2, 6, 7, 0
+%if ARCH_X86_64
+ movsxd r2, r2d
+%endif
+
+ mov r6d, r4d
+ shl r4d, 16
+ sub r4d, r6d
+ add r4d, 8
+ imul r5d, r4d ; x*y<<16 | y*(8-x)
+ shl r4d, 3
+ sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y)
+
+ movd m5, r4d
+ movd m6, r5d
+ punpckldq m5, m5 ; mm5 = {A,B,A,B}
+ punpckldq m6, m6 ; mm6 = {C,D,C,D}
+ pxor m7, m7
+ movd m2, [r1]
+ punpcklbw m2, m7
+ pshufw m2, m2, 0x94 ; mm2 = src[0,1,1,2]
+
+.nextrow:
+ add r1, r2
+ movq m1, m2
+ pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
+ movd m0, [r1]
+ punpcklbw m0, m7
+ pshufw m0, m0, 0x94 ; mm0 = src[0,1,1,2]
+ movq m2, m0
+ pmaddwd m0, m6
+ paddw m1, [rnd_2d_%2]
+ paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2]
+ psrlw m1, 6
+ packssdw m1, m7
+ packuswb m1, m7
+ CHROMAMC_AVG4 m1, m3, [r0]
+ movd r5d, m1
+ mov [r0], r5w
+ add r0, r2
+ sub r3d, 1
+ jnz .nextrow
+ REP_RET
+%endmacro
+
+%define rnd_1d_h264 pw_4
+%define rnd_2d_h264 pw_32
+%define rnd_1d_vc1 pw_3
+%define rnd_2d_vc1 pw_28
+
+%macro NOTHING 2-3
+%endmacro
+%macro DIRECT_AVG 2
+ PAVGB %1, %2
+%endmacro
+%macro COPY_AVG 3
+ movd %2, %3
+ PAVGB %1, %2
+%endmacro
+
+INIT_MMX mmx
+%define CHROMAMC_AVG NOTHING
+%define CHROMAMC_AVG4 NOTHING
+chroma_mc8_mmx_func put, h264, _rnd
+chroma_mc8_mmx_func put, vc1, _nornd
+chroma_mc8_mmx_func put, rv40
+chroma_mc4_mmx_func put, h264
+chroma_mc4_mmx_func put, rv40
+
+INIT_MMX mmxext
+chroma_mc2_mmx_func put, h264
+
+%define CHROMAMC_AVG DIRECT_AVG
+%define CHROMAMC_AVG4 COPY_AVG
+chroma_mc8_mmx_func avg, h264, _rnd
+chroma_mc8_mmx_func avg, vc1, _nornd
+chroma_mc8_mmx_func avg, rv40
+chroma_mc4_mmx_func avg, h264
+chroma_mc4_mmx_func avg, rv40
+chroma_mc2_mmx_func avg, h264
+
+INIT_MMX 3dnow
+chroma_mc8_mmx_func avg, h264, _rnd
+chroma_mc8_mmx_func avg, vc1, _nornd
+chroma_mc8_mmx_func avg, rv40
+chroma_mc4_mmx_func avg, h264
+chroma_mc4_mmx_func avg, rv40
+
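+; SSSE3 version: neighbouring source bytes are interleaved (src[i],src[i+1]) and the
+; two 8-bit weights are packed into each 16-bit lane, so one pmaddubsw yields
+; w0*src[i] + w1*src[i+1] for the whole row; the current-row and next-row products
+; are then summed, rounded and shifted as in the MMX path.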
+%macro chroma_mc8_ssse3_func 2-3
+cglobal %1_%2_chroma_mc8%3, 6, 7, 8
+%if ARCH_X86_64
+ movsxd r2, r2d
+%endif
+ mov r6d, r5d
+ or r6d, r4d
+ jne .at_least_one_non_zero
+ ; mx == 0 AND my == 0 - no filter needed
+ mv0_pixels_mc8
+ REP_RET
+
+.at_least_one_non_zero:
+ test r5d, r5d
+ je .my_is_zero
+ test r4d, r4d
+ je .mx_is_zero
+
+ ; general case, bilinear
+ mov r6d, r4d
+ shl r4d, 8
+ sub r4, r6
+ mov r6, 8
+ add r4, 8 ; x*255+8 = x<<8 | (8-x)
+ sub r6d, r5d
+ imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
+ imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)
+
+ movd m7, r6d
+ movd m6, r4d
+ movdqa m5, [rnd_2d_%2]
+ movq m0, [r1 ]
+ movq m1, [r1+1]
+ pshuflw m7, m7, 0
+ pshuflw m6, m6, 0
+ punpcklbw m0, m1
+ movlhps m7, m7
+ movlhps m6, m6
+
+.next2rows:
+ movq m1, [r1+r2*1 ]
+ movq m2, [r1+r2*1+1]
+ movq m3, [r1+r2*2 ]
+ movq m4, [r1+r2*2+1]
+ lea r1, [r1+r2*2]
+ punpcklbw m1, m2
+ movdqa m2, m1
+ punpcklbw m3, m4
+ movdqa m4, m3
+ pmaddubsw m0, m7
+ pmaddubsw m1, m6
+ pmaddubsw m2, m7
+ pmaddubsw m3, m6
+ paddw m0, m5
+ paddw m2, m5
+ paddw m1, m0
+ paddw m3, m2
+ psrlw m1, 6
+ movdqa m0, m4
+ psrlw m3, 6
+%ifidn %1, avg
+ movq m2, [r0 ]
+ movhps m2, [r0+r2]
+%endif
+ packuswb m1, m3
+ CHROMAMC_AVG m1, m2
+ movq [r0 ], m1
+ movhps [r0+r2], m1
+ sub r3d, 2
+ lea r0, [r0+r2*2]
+ jg .next2rows
+ REP_RET
+
+.my_is_zero:
+ mov r5d, r4d
+ shl r4d, 8
+ add r4, 8
+ sub r4, r5 ; 255*x+8 = x<<8 | (8-x)
+ movd m7, r4d
+ movdqa m6, [rnd_1d_%2]
+ pshuflw m7, m7, 0
+ movlhps m7, m7
+
+.next2xrows:
+ movq m0, [r1 ]
+ movq m1, [r1 +1]
+ movq m2, [r1+r2 ]
+ movq m3, [r1+r2+1]
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ pmaddubsw m0, m7
+ pmaddubsw m2, m7
+%ifidn %1, avg
+ movq m4, [r0 ]
+ movhps m4, [r0+r2]
+%endif
+ paddw m0, m6
+ paddw m2, m6
+ psrlw m0, 3
+ psrlw m2, 3
+ packuswb m0, m2
+ CHROMAMC_AVG m0, m4
+ movq [r0 ], m0
+ movhps [r0+r2], m0
+ sub r3d, 2
+ lea r0, [r0+r2*2]
+ lea r1, [r1+r2*2]
+ jg .next2xrows
+ REP_RET
+
+.mx_is_zero:
+ mov r4d, r5d
+ shl r5d, 8
+ add r5, 8
+ sub r5, r4 ; 255*y+8 = y<<8 | (8-y)
+ movd m7, r5d
+ movdqa m6, [rnd_1d_%2]
+ pshuflw m7, m7, 0
+ movlhps m7, m7
+
+.next2yrows:
+ movq m0, [r1 ]
+ movq m1, [r1+r2 ]
+ movdqa m2, m1
+ movq m3, [r1+r2*2]
+ lea r1, [r1+r2*2]
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ pmaddubsw m0, m7
+ pmaddubsw m2, m7
+%ifidn %1, avg
+ movq m4, [r0 ]
+ movhps m4, [r0+r2]
+%endif
+ paddw m0, m6
+ paddw m2, m6
+ psrlw m0, 3
+ psrlw m2, 3
+ packuswb m0, m2
+ CHROMAMC_AVG m0, m4
+ movq [r0 ], m0
+ movhps [r0+r2], m0
+ sub r3d, 2
+ lea r0, [r0+r2*2]
+ jg .next2yrows
+ REP_RET
+%endmacro
+
+%macro chroma_mc4_ssse3_func 2
+cglobal %1_%2_chroma_mc4, 6, 7, 0
+%if ARCH_X86_64
+ movsxd r2, r2d
+%endif
+ mov r6, r4
+ shl r4d, 8
+ sub r4d, r6d
+ mov r6, 8
+ add r4d, 8 ; x*255+8 = x<<8 | (8-x)
+ sub r6d, r5d
+ imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
+ imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)
+
+ movd m7, r6d
+ movd m6, r4d
+ movq m5, [pw_32]
+ movd m0, [r1 ]
+ pshufw m7, m7, 0
+ punpcklbw m0, [r1+1]
+ pshufw m6, m6, 0
+
+.next2rows:
+ movd m1, [r1+r2*1 ]
+ movd m3, [r1+r2*2 ]
+ punpcklbw m1, [r1+r2*1+1]
+ punpcklbw m3, [r1+r2*2+1]
+ lea r1, [r1+r2*2]
+ movq m2, m1
+ movq m4, m3
+ pmaddubsw m0, m7
+ pmaddubsw m1, m6
+ pmaddubsw m2, m7
+ pmaddubsw m3, m6
+ paddw m0, m5
+ paddw m2, m5
+ paddw m1, m0
+ paddw m3, m2
+ psrlw m1, 6
+ movq m0, m4
+ psrlw m3, 6
+ packuswb m1, m1
+ packuswb m3, m3
+ CHROMAMC_AVG m1, [r0 ]
+ CHROMAMC_AVG m3, [r0+r2]
+ movd [r0 ], m1
+ movd [r0+r2], m3
+ sub r3d, 2
+ lea r0, [r0+r2*2]
+ jg .next2rows
+ REP_RET
+%endmacro
+
+%define CHROMAMC_AVG NOTHING
+INIT_XMM ssse3
+chroma_mc8_ssse3_func put, h264, _rnd
+chroma_mc8_ssse3_func put, vc1, _nornd
+INIT_MMX ssse3
+chroma_mc4_ssse3_func put, h264
+
+%define CHROMAMC_AVG DIRECT_AVG
+INIT_XMM ssse3
+chroma_mc8_ssse3_func avg, h264, _rnd
+chroma_mc8_ssse3_func avg, vc1, _nornd
+INIT_MMX ssse3
+chroma_mc4_ssse3_func avg, h264
diff --git a/ffmpeg/libavcodec/x86/h264_chromamc_10bit.asm b/ffmpeg/libavcodec/x86/h264_chromamc_10bit.asm
new file mode 100644
index 0000000..b850551
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/h264_chromamc_10bit.asm
@@ -0,0 +1,271 @@
+;*****************************************************************************
+;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code
+;*****************************************************************************
+;* Copyright (C) 2005-2011 x264 project
+;*
+;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pw_4
+cextern pw_8
+cextern pw_32
+cextern pw_64
+
+SECTION .text
+
+
+%macro MV0_PIXELS_MC8 0
+ lea r4, [r2*3 ]
+ lea r5, [r2*4 ]
+.next4rows:
+ movu m0, [r1 ]
+ movu m1, [r1+r2 ]
+ CHROMAMC_AVG m0, [r0 ]
+ CHROMAMC_AVG m1, [r0+r2 ]
+ mova [r0 ], m0
+ mova [r0+r2 ], m1
+ movu m0, [r1+r2*2]
+ movu m1, [r1+r4 ]
+ CHROMAMC_AVG m0, [r0+r2*2]
+ CHROMAMC_AVG m1, [r0+r4 ]
+ mova [r0+r2*2], m0
+ mova [r0+r4 ], m1
+ add r1, r5
+ add r0, r5
+ sub r3d, 4
+ jne .next4rows
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h, int mx, int my)
+;-----------------------------------------------------------------------------
+%macro CHROMA_MC8 1
+; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
+; int stride, int h, int mx, int my)
+cglobal %1_h264_chroma_mc8_10, 6,7,8
+ movsxdifnidn r2, r2d
+ mov r6d, r5d
+ or r6d, r4d
+ jne .at_least_one_non_zero
+ ; mx == 0 AND my == 0 - no filter needed
+ MV0_PIXELS_MC8
+ REP_RET
+
+.at_least_one_non_zero:
+ mov r6d, 2
+ test r5d, r5d
+ je .x_interpolation
+ mov r6, r2 ; dxy = x ? 1 : stride
+ test r4d, r4d
+ jne .xy_interpolation
+.x_interpolation:
+ ; mx == 0 XOR my == 0 - 1 dimensional filter only
+ or r4d, r5d ; x + y
+ movd m5, r4d
+ mova m4, [pw_8]
+ mova m6, [pw_4] ; mm6 = rnd >> 3
+ SPLATW m5, m5 ; mm5 = B = x
+ psubw m4, m5 ; mm4 = A = 8-x
+
+.next1drow:
+ movu m0, [r1 ] ; mm0 = src[0..7]
+ movu m2, [r1+r6] ; mm2 = src[1..8]
+
+ pmullw m0, m4 ; mm0 = A * src[0..7]
+ pmullw m2, m5 ; mm2 = B * src[1..8]
+
+ paddw m0, m6
+ paddw m0, m2
+ psrlw m0, 3
+ CHROMAMC_AVG m0, [r0]
+ mova [r0], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
+
+ add r0, r2
+ add r1, r2
+ dec r3d
+ jne .next1drow
+ REP_RET
+
+.xy_interpolation: ; general case, bilinear
+ movd m4, r4m ; x
+ movd m6, r5m ; y
+
+ SPLATW m4, m4 ; mm4 = x words
+ SPLATW m6, m6 ; mm6 = y words
+ psllw m5, m4, 3 ; mm5 = 8x
+ pmullw m4, m6 ; mm4 = x * y
+ psllw m6, 3 ; mm6 = 8y
+ paddw m1, m5, m6 ; mm1 = 8x+8y
+ mova m7, m4 ; DD = x * y
+ psubw m5, m4 ; mm5 = B = 8x - xy
+ psubw m6, m4 ; mm6 = C = 8y - xy
+ paddw m4, [pw_64]
+ psubw m4, m1 ; mm4 = A = xy - (8x+8y) + 64
+
+ movu m0, [r1 ] ; mm0 = src[0..7]
+ movu m1, [r1+2] ; mm1 = src[1..8]
+.next2drow:
+ add r1, r2
+
+ pmullw m2, m0, m4
+ pmullw m1, m5
+ paddw m2, m1 ; mm2 = A * src[0..7] + B * src[1..8]
+
+ movu m0, [r1]
+ movu m1, [r1+2]
+ pmullw m3, m0, m6
+ paddw m2, m3 ; mm2 += C * src[0..7+stride]
+ pmullw m3, m1, m7
+ paddw m2, m3 ; mm2 += D * src[1..8+stride]
+
+ paddw m2, [pw_32]
+ psrlw m2, 6
+ CHROMAMC_AVG m2, [r0]
+ mova [r0], m2 ; dst[0..7] = (mm2 + 32) >> 6
+
+ add r0, r2
+ dec r3d
+ jne .next2drow
+ REP_RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void put/avg_h264_chroma_mc4(pixel *dst, pixel *src, int stride, int h, int mx, int my)
+;-----------------------------------------------------------------------------
+;TODO: xmm mc4
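+; MC4_OP filters one row of four pixels: %1 is overwritten with the horizontally
+; filtered current row ((8-x)*src[0..3] + x*src[1..4]) while %2 still holds the
+; previous row's horizontal result, so the two calls per loop iteration alternate
+; m0/m6 and each horizontal result is reused once for the vertical pass.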
+%macro MC4_OP 2
+ movq %1, [r1 ]
+ movq m1, [r1+2]
+ add r1, r2
+ pmullw %1, m4
+ pmullw m1, m2
+ paddw m1, %1
+ mova %1, m1
+
+ pmullw %2, m5
+ pmullw m1, m3
+ paddw %2, [pw_32]
+ paddw m1, %2
+ psrlw m1, 6
+ CHROMAMC_AVG m1, %2, [r0]
+ movq [r0], m1
+ add r0, r2
+%endmacro
+
+%macro CHROMA_MC4 1
+cglobal %1_h264_chroma_mc4_10, 6,6,7
+ movsxdifnidn r2, r2d
+ movd m2, r4m ; x
+ movd m3, r5m ; y
+ mova m4, [pw_8]
+ mova m5, m4
+ SPLATW m2, m2
+ SPLATW m3, m3
+ psubw m4, m2
+ psubw m5, m3
+
+ movq m0, [r1 ]
+ movq m6, [r1+2]
+ add r1, r2
+ pmullw m0, m4
+ pmullw m6, m2
+ paddw m6, m0
+
+.next2rows:
+ MC4_OP m0, m6
+ MC4_OP m6, m0
+ sub r3d, 2
+ jnz .next2rows
+ REP_RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h, int mx, int my)
+;-----------------------------------------------------------------------------
+%macro CHROMA_MC2 1
+cglobal %1_h264_chroma_mc2_10, 6,7
+ movsxdifnidn r2, r2d
+ mov r6d, r4d
+ shl r4d, 16
+ sub r4d, r6d
+ add r4d, 8
+ imul r5d, r4d ; x*y<<16 | y*(8-x)
+ shl r4d, 3
+ sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y)
+
+ movd m5, r4d
+ movd m6, r5d
+ punpckldq m5, m5 ; mm5 = {A,B,A,B}
+ punpckldq m6, m6 ; mm6 = {C,D,C,D}
+ pxor m7, m7
+ pshufw m2, [r1], 0x94 ; mm2 = src[0,1,1,2]
+
+.nextrow:
+ add r1, r2
+ movq m1, m2
+ pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
+ pshufw m0, [r1], 0x94 ; mm0 = src[0,1,1,2]
+ movq m2, m0
+ pmaddwd m0, m6
+ paddw m1, [pw_32]
+ paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2]
+ psrlw m1, 6
+ packssdw m1, m7
+ CHROMAMC_AVG m1, m3, [r0]
+ movd [r0], m1
+ add r0, r2
+ dec r3d
+ jnz .nextrow
+ REP_RET
+%endmacro
+
+%macro NOTHING 2-3
+%endmacro
+%macro AVG 2-3
+%if %0==3
+ movq %2, %3
+%endif
+ pavgw %1, %2
+%endmacro
+
+%define CHROMAMC_AVG NOTHING
+INIT_XMM sse2
+CHROMA_MC8 put
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+CHROMA_MC8 put
+%endif
+INIT_MMX mmxext
+CHROMA_MC4 put
+CHROMA_MC2 put
+
+%define CHROMAMC_AVG AVG
+INIT_XMM sse2
+CHROMA_MC8 avg
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+CHROMA_MC8 avg
+%endif
+INIT_MMX mmxext
+CHROMA_MC4 avg
+CHROMA_MC2 avg
diff --git a/ffmpeg/libavcodec/x86/h264_deblock.asm b/ffmpeg/libavcodec/x86/h264_deblock.asm
new file mode 100644
index 0000000..d58e16c
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/h264_deblock.asm
@@ -0,0 +1,1083 @@
+;*****************************************************************************
+;* MMX/SSE2/AVX-optimized H.264 deblocking code
+;*****************************************************************************
+;* Copyright (C) 2005-2011 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Jason Garrett-Glaser <darkshikari@gmail.com>
+;* Oskar Arvidsson <oskar@irock.se>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_A1: times 16 db 0xA1
+pb_3_1: times 4 db 3, 1
+
+SECTION .text
+
+cextern pb_0
+cextern pb_1
+cextern pb_3
+
+; expands to [base],...,[base+7*stride]
+%define PASS8ROWS(base, base3, stride, stride3) \
+ [base], [base+stride], [base+stride*2], [base3], \
+ [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
+
+%define PASS8ROWS(base, base3, stride, stride3, offset) \
+ PASS8ROWS(base+offset, base3+offset, stride, stride3)
+
+; in: 8 rows of 4 bytes in %4..%11
+; out: 4 rows of 8 bytes in m0..m3
+%macro TRANSPOSE4x8_LOAD 11
+ movh m0, %4
+ movh m2, %5
+ movh m1, %6
+ movh m3, %7
+ punpckl%1 m0, m2
+ punpckl%1 m1, m3
+ mova m2, m0
+ punpckl%2 m0, m1
+ punpckh%2 m2, m1
+
+ movh m4, %8
+ movh m6, %9
+ movh m5, %10
+ movh m7, %11
+ punpckl%1 m4, m6
+ punpckl%1 m5, m7
+ mova m6, m4
+ punpckl%2 m4, m5
+ punpckh%2 m6, m5
+
+ punpckh%3 m1, m0, m4
+ punpckh%3 m3, m2, m6
+ punpckl%3 m0, m4
+ punpckl%3 m2, m6
+%endmacro
+
+; in: 4 rows of 8 bytes in m0..m3
+; out: 8 rows of 4 bytes in %1..%8
+%macro TRANSPOSE8x4B_STORE 8
+ punpckhdq m4, m0, m0
+ punpckhdq m5, m1, m1
+ punpckhdq m6, m2, m2
+
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ movh %1, m1
+ punpckhdq m1, m1
+ movh %2, m1
+ movh %3, m0
+ punpckhdq m0, m0
+ movh %4, m0
+
+ punpckhdq m3, m3
+ punpcklbw m4, m5
+ punpcklbw m6, m3
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ movh %5, m5
+ punpckhdq m5, m5
+ movh %6, m5
+ movh %7, m4
+ punpckhdq m4, m4
+ movh %8, m4
+%endmacro
+
+%macro TRANSPOSE4x8B_LOAD 8
+ TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
+%endmacro
+
+%macro SBUTTERFLY3 4
+ punpckh%1 %4, %2, %3
+ punpckl%1 %2, %3
+%endmacro
+
+; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
+; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
+%macro TRANSPOSE6x8_MEM 9
+ RESET_MM_PERMUTATION
+ movq m0, %1
+ movq m1, %2
+ movq m2, %3
+ movq m3, %4
+ movq m4, %5
+ movq m5, %6
+ movq m6, %7
+ SBUTTERFLY bw, 0, 1, 7
+ SBUTTERFLY bw, 2, 3, 7
+ SBUTTERFLY bw, 4, 5, 7
+ movq [%9+0x10], m3
+ SBUTTERFLY3 bw, m6, %8, m7
+ SBUTTERFLY wd, 0, 2, 3
+ SBUTTERFLY wd, 4, 6, 3
+ punpckhdq m0, m4
+ movq [%9+0x00], m0
+ SBUTTERFLY3 wd, m1, [%9+0x10], m3
+ SBUTTERFLY wd, 5, 7, 0
+ SBUTTERFLY dq, 1, 5, 0
+ SBUTTERFLY dq, 2, 6, 0
+ punpckldq m3, m7
+ movq [%9+0x10], m2
+ movq [%9+0x20], m6
+ movq [%9+0x30], m1
+ movq [%9+0x40], m5
+ movq [%9+0x50], m3
+ RESET_MM_PERMUTATION
+%endmacro
+
+; in: 8 rows of 8 in %1..%8
+; out: 8 rows of 8 in %9..%16
+%macro TRANSPOSE8x8_MEM 16
+ RESET_MM_PERMUTATION
+ movq m0, %1
+ movq m1, %2
+ movq m2, %3
+ movq m3, %4
+ movq m4, %5
+ movq m5, %6
+ movq m6, %7
+ SBUTTERFLY bw, 0, 1, 7
+ SBUTTERFLY bw, 2, 3, 7
+ SBUTTERFLY bw, 4, 5, 7
+ SBUTTERFLY3 bw, m6, %8, m7
+ movq %9, m5
+ SBUTTERFLY wd, 0, 2, 5
+ SBUTTERFLY wd, 4, 6, 5
+ SBUTTERFLY wd, 1, 3, 5
+ movq %11, m6
+ movq m6, %9
+ SBUTTERFLY wd, 6, 7, 5
+ SBUTTERFLY dq, 0, 4, 5
+ SBUTTERFLY dq, 1, 6, 5
+ movq %9, m0
+ movq %10, m4
+ movq %13, m1
+ movq %14, m6
+ SBUTTERFLY3 dq, m2, %11, m0
+ SBUTTERFLY dq, 3, 7, 4
+ movq %11, m2
+ movq %12, m0
+ movq %15, m3
+ movq %16, m7
+ RESET_MM_PERMUTATION
+%endmacro
+
+; out: %4 = |%1-%2|>%3
+; clobbers: %5
+%macro DIFF_GT 5
+%if avx_enabled == 0
+ mova %5, %2
+ mova %4, %1
+ psubusb %5, %1
+ psubusb %4, %2
+%else
+ psubusb %5, %2, %1
+ psubusb %4, %1, %2
+%endif
+ por %4, %5
+ psubusb %4, %3
+%endmacro
+
+; out: %4 = |%1-%2|>%3
+; clobbers: %5
+%macro DIFF_GT2 5
+%if ARCH_X86_64
+ psubusb %5, %2, %1
+ psubusb %4, %1, %2
+%else
+ mova %5, %2
+ mova %4, %1
+ psubusb %5, %1
+ psubusb %4, %2
+%endif
+ psubusb %5, %3
+ psubusb %4, %3
+ pcmpeqb %4, %5
+%endmacro
+
+; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
+; out: m5=beta-1, m7=mask, %3=alpha-1
+; clobbers: m4,m6
+%macro LOAD_MASK 2-3
+ movd m4, %1
+ movd m5, %2
+ SPLATW m4, m4
+ SPLATW m5, m5
+ packuswb m4, m4 ; 16x alpha-1
+ packuswb m5, m5 ; 16x beta-1
+%if %0>2
+ mova %3, m4
+%endif
+ DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
+ DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
+ por m7, m4
+ DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
+ por m7, m4
+ pxor m6, m6
+ pcmpeqb m7, m6
+%endmacro
+
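+; Normal-filter p0/q0 update: delta = clip(((q0-p0)*4 + (p1-q1) + 4) >> 3, -tc, tc),
+; p0' = p0 + delta, q0' = q0 - delta. The macro evaluates this with unsigned byte
+; averages (pavgb) and saturating adds/subs on values biased by +128, so no
+; intermediate ever needs more than 8 bits.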
+; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
+; out: m1=p0' m2=q0'
+; clobbers: m0,3-6
+%macro DEBLOCK_P0_Q0 0
+ pcmpeqb m4, m4
+ pxor m5, m1, m2 ; p0^q0
+ pxor m3, m4
+ pand m5, [pb_1] ; (p0^q0)&1
+ pavgb m3, m0 ; (p1 - q1 + 256)>>1
+ pxor m4, m1
+ pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
+ pavgb m4, m2 ; (q0 - p0 + 256)>>1
+ pavgb m3, m5
+ mova m6, [pb_A1]
+ paddusb m3, m4 ; d+128+33
+ psubusb m6, m3
+ psubusb m3, [pb_A1]
+ pminub m6, m7
+ pminub m3, m7
+ psubusb m1, m6
+ psubusb m2, m3
+ paddusb m1, m3
+ paddusb m2, m6
+%endmacro
+
+; in: m1=p0 m2=q0
+; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
+; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
+; clobbers: q2, tmp, tc0
+%macro LUMA_Q1 6
+ pavgb %6, m1, m2
+ pavgb %2, %6 ; avg(p2,avg(p0,q0))
+ pxor %6, %3
+ pand %6, [pb_1] ; (p2^avg(p0,q0))&1
+ psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
+ psubusb %6, %1, %5
+ paddusb %5, %1
+ pmaxub %2, %6
+ pminub %2, %5
+ mova %4, %2
+%endmacro
+
+%if ARCH_X86_64
+;-----------------------------------------------------------------------------
+; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+%macro DEBLOCK_LUMA 0
+cglobal deblock_v_luma_8, 5,5,10
+ movd m8, [r4] ; tc0
+ lea r4, [r1*3]
+ dec r2d ; alpha-1
+ neg r4
+ dec r3d ; beta-1
+ add r4, r0 ; pix-3*stride
+
+ mova m0, [r4+r1] ; p1
+ mova m1, [r4+2*r1] ; p0
+ mova m2, [r0] ; q0
+ mova m3, [r0+r1] ; q1
+ LOAD_MASK r2d, r3d
+
+ punpcklbw m8, m8
+ punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
+ pcmpeqb m9, m9
+ pcmpeqb m9, m8
+ pandn m9, m7
+ pand m8, m9
+
+ movdqa m3, [r4] ; p2
+ DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
+ pand m6, m9
+ psubb m7, m8, m6
+ pand m6, m8
+ LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
+
+ movdqa m4, [r0+2*r1] ; q2
+ DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
+ pand m6, m9
+ pand m8, m6
+ psubb m7, m6
+ mova m3, [r0+r1]
+ LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
+
+ DEBLOCK_P0_Q0
+ mova [r4+2*r1], m1
+ mova [r0], m2
+ RET
+
+;-----------------------------------------------------------------------------
+; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+INIT_MMX cpuname
+cglobal deblock_h_luma_8, 5,9
+ movsxd r7, r1d
+ lea r8, [r7+r7*2]
+ lea r6, [r0-4]
+ lea r5, [r0-4+r8]
+%if WIN64
+ sub rsp, 0x98
+ %define pix_tmp rsp+0x30
+%else
+ sub rsp, 0x68
+ %define pix_tmp rsp
+%endif
+
+ ; transpose 6x16 -> tmp space
+ TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp
+ lea r6, [r6+r7*8]
+ lea r5, [r5+r7*8]
+ TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8
+
+ ; vertical filter
+ ; alpha, beta, tc0 are still in r2d, r3d, r4
+ ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
+ lea r0, [pix_tmp+0x30]
+ mov r1d, 0x10
+%if WIN64
+ mov [rsp+0x20], r4
+%endif
+ call deblock_v_luma_8
+
+ ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
+ add r6, 2
+ add r5, 2
+ movq m0, [pix_tmp+0x18]
+ movq m1, [pix_tmp+0x28]
+ movq m2, [pix_tmp+0x38]
+ movq m3, [pix_tmp+0x48]
+ TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
+
+ shl r7, 3
+ sub r6, r7
+ sub r5, r7
+ shr r7, 3
+ movq m0, [pix_tmp+0x10]
+ movq m1, [pix_tmp+0x20]
+ movq m2, [pix_tmp+0x30]
+ movq m3, [pix_tmp+0x40]
+ TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
+
+%if WIN64
+ add rsp, 0x98
+%else
+ add rsp, 0x68
+%endif
+ RET
+%endmacro
+
+INIT_XMM sse2
+DEBLOCK_LUMA
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+DEBLOCK_LUMA
+%endif
+
+%else
+
+%macro DEBLOCK_LUMA 2
+;-----------------------------------------------------------------------------
+; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+cglobal deblock_%1_luma_8, 5,5,8,2*%2
+ lea r4, [r1*3]
+ dec r2 ; alpha-1
+ neg r4
+ dec r3 ; beta-1
+ add r4, r0 ; pix-3*stride
+
+ mova m0, [r4+r1] ; p1
+ mova m1, [r4+2*r1] ; p0
+ mova m2, [r0] ; q0
+ mova m3, [r0+r1] ; q1
+ LOAD_MASK r2, r3
+
+ mov r3, r4mp
+ pcmpeqb m3, m3
+ movd m4, [r3] ; tc0
+ punpcklbw m4, m4
+ punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
+ mova [esp+%2], m4 ; tc
+ pcmpgtb m4, m3
+ mova m3, [r4] ; p2
+ pand m4, m7
+ mova [esp], m4 ; mask
+
+ DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
+ pand m6, m4
+ pand m4, [esp+%2] ; tc
+ psubb m7, m4, m6
+ pand m6, m4
+ LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
+
+ mova m4, [r0+2*r1] ; q2
+ DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
+ pand m6, [esp] ; mask
+ mova m5, [esp+%2] ; tc
+ psubb m7, m6
+ pand m5, m6
+ mova m3, [r0+r1]
+ LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
+
+ DEBLOCK_P0_Q0
+ mova [r4+2*r1], m1
+ mova [r0], m2
+ RET
+
+;-----------------------------------------------------------------------------
+; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+INIT_MMX cpuname
+cglobal deblock_h_luma_8, 0,5,8,0x60+HAVE_ALIGNED_STACK*12
+ mov r0, r0mp
+ mov r3, r1m
+ lea r4, [r3*3]
+ sub r0, 4
+ lea r1, [r0+r4]
+%define pix_tmp esp+12*HAVE_ALIGNED_STACK
+
+ ; transpose 6x16 -> tmp space
+ TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
+ lea r0, [r0+r3*8]
+ lea r1, [r1+r3*8]
+ TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8
+
+ ; vertical filter
+ lea r0, [pix_tmp+0x30]
+ PUSH dword r4m
+ PUSH dword r3m
+ PUSH dword r2m
+ PUSH dword 16
+ PUSH dword r0
+ call deblock_%1_luma_8
+%ifidn %1, v8
+ add dword [esp ], 8 ; pix_tmp+0x38
+ add dword [esp+16], 2 ; tc0+2
+ call deblock_%1_luma_8
+%endif
+ ADD esp, 20
+
+ ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
+ mov r0, r0mp
+ sub r0, 2
+
+ movq m0, [pix_tmp+0x10]
+ movq m1, [pix_tmp+0x20]
+ lea r1, [r0+r4]
+ movq m2, [pix_tmp+0x30]
+ movq m3, [pix_tmp+0x40]
+ TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
+
+ lea r0, [r0+r3*8]
+ lea r1, [r1+r3*8]
+ movq m0, [pix_tmp+0x18]
+ movq m1, [pix_tmp+0x28]
+ movq m2, [pix_tmp+0x38]
+ movq m3, [pix_tmp+0x48]
+ TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
+
+ RET
+%endmacro ; DEBLOCK_LUMA
+
+INIT_MMX mmxext
+DEBLOCK_LUMA v8, 8
+INIT_XMM sse2
+DEBLOCK_LUMA v, 16
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+DEBLOCK_LUMA v, 16
+%endif
+
+%endif ; ARCH
+
+
+
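+; Strong (intra) luma filter. Where mask1p holds, the macro computes
+;   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
+;   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
+;   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
+; and where only mask0 holds it falls back to p0' = (2*p1 + p0 + q1 + 2) >> 2, using
+; pavgb plus an LSB correction term so the averages round like the exact sums.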
+%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
+%if ARCH_X86_64
+ pavgb t0, p2, p1
+ pavgb t1, p0, q0
+%else
+ mova t0, p2
+ mova t1, p0
+ pavgb t0, p1
+ pavgb t1, q0
+%endif
+ pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
+ mova t5, t1
+%if ARCH_X86_64
+ paddb t2, p2, p1
+ paddb t3, p0, q0
+%else
+ mova t2, p2
+ mova t3, p0
+ paddb t2, p1
+ paddb t3, q0
+%endif
+ paddb t2, t3
+ mova t3, t2
+ mova t4, t2
+ psrlw t2, 1
+ pavgb t2, mpb_0
+ pxor t2, t0
+ pand t2, mpb_1
+ psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
+
+%if ARCH_X86_64
+ pavgb t1, p2, q1
+ psubb t2, p2, q1
+%else
+ mova t1, p2
+ mova t2, p2
+ pavgb t1, q1
+ psubb t2, q1
+%endif
+ paddb t3, t3
+ psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
+ pand t2, mpb_1
+ psubb t1, t2
+ pavgb t1, p1
+ pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
+ psrlw t3, 2
+ pavgb t3, mpb_0
+ pxor t3, t1
+ pand t3, mpb_1
+ psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
+
+ pxor t3, p0, q1
+ pavgb t2, p0, q1
+ pand t3, mpb_1
+ psubb t2, t3
+ pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4
+
+ pxor t1, t2
+ pxor t2, p0
+ pand t1, mask1p
+ pand t2, mask0
+ pxor t1, t2
+ pxor t1, p0
+ mova %1, t1 ; store p0
+
+ mova t1, %4 ; p3
+ paddb t2, t1, p2
+ pavgb t1, p2
+ pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
+ paddb t2, t2
+ paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
+ psrlw t2, 2
+ pavgb t2, mpb_0
+ pxor t2, t1
+ pand t2, mpb_1
+ psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
+
+ pxor t0, p1
+ pxor t1, p2
+ pand t0, mask1p
+ pand t1, mask1p
+ pxor t0, p1
+ pxor t1, p2
+ mova %2, t0 ; store p1
+ mova %3, t1 ; store p2
+%endmacro
+
+%macro LUMA_INTRA_SWAP_PQ 0
+ %define q1 m0
+ %define q0 m1
+ %define p0 m2
+ %define p1 m3
+ %define p2 q2
+ %define mask1p mask1q
+%endmacro
+
+%macro DEBLOCK_LUMA_INTRA 1
+ %define p1 m0
+ %define p0 m1
+ %define q0 m2
+ %define q1 m3
+ %define t0 m4
+ %define t1 m5
+ %define t2 m6
+ %define t3 m7
+%if ARCH_X86_64
+ %define p2 m8
+ %define q2 m9
+ %define t4 m10
+ %define t5 m11
+ %define mask0 m12
+ %define mask1p m13
+%if WIN64
+ %define mask1q [rsp]
+%else
+ %define mask1q [rsp-24]
+%endif
+ %define mpb_0 m14
+ %define mpb_1 m15
+%else
+ %define spill(x) [esp+16*x]
+ %define p2 [r4+r1]
+ %define q2 [r0+2*r1]
+ %define t4 spill(0)
+ %define t5 spill(1)
+ %define mask0 spill(2)
+ %define mask1p spill(3)
+ %define mask1q spill(4)
+ %define mpb_0 [pb_0]
+ %define mpb_1 [pb_1]
+%endif
+
+;-----------------------------------------------------------------------------
+; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+%if WIN64
+cglobal deblock_%1_luma_intra_8, 4,6,16,0x10
+%else
+cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
+%endif
+ lea r4, [r1*4]
+ lea r5, [r1*3] ; 3*stride
+ dec r2d ; alpha-1
+ jl .end
+ neg r4
+ dec r3d ; beta-1
+ jl .end
+ add r4, r0 ; pix-4*stride
+ mova p1, [r4+2*r1]
+ mova p0, [r4+r5]
+ mova q0, [r0]
+ mova q1, [r0+r1]
+%if ARCH_X86_64
+ pxor mpb_0, mpb_0
+ mova mpb_1, [pb_1]
+ LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
+ SWAP 7, 12 ; m12=mask0
+ pavgb t5, mpb_0
+ pavgb t5, mpb_1 ; alpha/4+1
+ movdqa p2, [r4+r1]
+ movdqa q2, [r0+2*r1]
+ DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
+ DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
+ DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
+ pand t0, mask0
+ pand t4, t0
+ pand t2, t0
+ mova mask1q, t4
+ mova mask1p, t2
+%else
+ LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
+ mova m4, t5
+ mova mask0, m7
+ pavgb m4, [pb_0]
+ pavgb m4, [pb_1] ; alpha/4+1
+ DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
+ pand m6, mask0
+ DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
+ pand m4, m6
+ mova mask1p, m4
+ DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
+ pand m4, m6
+ mova mask1q, m4
+%endif
+ LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
+ LUMA_INTRA_SWAP_PQ
+ LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
+.end:
+ RET
+
+INIT_MMX cpuname
+%if ARCH_X86_64
+;-----------------------------------------------------------------------------
+; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal deblock_h_luma_intra_8, 4,9
+ movsxd r7, r1d
+ lea r8, [r7*3]
+ lea r6, [r0-4]
+ lea r5, [r0-4+r8]
+ sub rsp, 0x88
+ %define pix_tmp rsp
+
+ ; transpose 8x16 -> tmp space
+ TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
+ lea r6, [r6+r7*8]
+ lea r5, [r5+r7*8]
+ TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
+
+ lea r0, [pix_tmp+0x40]
+ mov r1, 0x10
+ call deblock_v_luma_intra_8
+
+ ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
+ lea r5, [r6+r8]
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
+ shl r7, 3
+ sub r6, r7
+ sub r5, r7
+ shr r7, 3
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
+ add rsp, 0x88
+ RET
+%else
+cglobal deblock_h_luma_intra_8, 2,4,8,0x80
+ lea r3, [r1*3]
+ sub r0, 4
+ lea r2, [r0+r3]
+ %define pix_tmp rsp
+
+ ; transpose 8x16 -> tmp space
+ TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
+ lea r0, [r0+r1*8]
+ lea r2, [r2+r1*8]
+ TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
+
+ lea r0, [pix_tmp+0x40]
+ PUSH dword r3m
+ PUSH dword r2m
+ PUSH dword 16
+ PUSH r0
+ call deblock_%1_luma_intra_8
+%ifidn %1, v8
+ add dword [rsp], 8 ; pix_tmp+8
+ call deblock_%1_luma_intra_8
+%endif
+ ADD esp, 16
+
+ mov r1, r1m
+ mov r0, r0mp
+ lea r3, [r1*3]
+ sub r0, 4
+ lea r2, [r0+r3]
+ ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
+ lea r0, [r0+r1*8]
+ lea r2, [r2+r1*8]
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
+ RET
+%endif ; ARCH_X86_64
+%endmacro ; DEBLOCK_LUMA_INTRA
+
+INIT_XMM sse2
+DEBLOCK_LUMA_INTRA v
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+DEBLOCK_LUMA_INTRA v
+%endif
+%if ARCH_X86_64 == 0
+INIT_MMX mmxext
+DEBLOCK_LUMA_INTRA v8
+%endif
+
+INIT_MMX mmxext
+
+%macro CHROMA_V_START 0
+ dec r2d ; alpha-1
+ dec r3d ; beta-1
+ mov t5, r0
+ sub t5, r1
+ sub t5, r1
+%endmacro
+
+%macro CHROMA_H_START 0
+ dec r2d
+ dec r3d
+ sub r0, 2
+ lea t6, [r1*3]
+ mov t5, r0
+ add r0, t6
+%endmacro
+
+%define t5 r5
+%define t6 r6
+
+;-----------------------------------------------------------------------------
+; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+cglobal deblock_v_chroma_8, 5,6
+ CHROMA_V_START
+ movq m0, [t5]
+ movq m1, [t5+r1]
+ movq m2, [r0]
+ movq m3, [r0+r1]
+ call ff_chroma_inter_body_mmxext
+ movq [t5+r1], m1
+ movq [r0], m2
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+cglobal deblock_h_chroma_8, 5,7
+%if UNIX64
+ %define buf0 [rsp-24]
+ %define buf1 [rsp-16]
+%elif WIN64
+ sub rsp, 16
+ %define buf0 [rsp]
+ %define buf1 [rsp+8]
+%else
+ %define buf0 r0m
+ %define buf1 r2m
+%endif
+ CHROMA_H_START
+ TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
+ movq buf0, m0
+ movq buf1, m3
+ LOAD_MASK r2d, r3d
+ movd m6, [r4] ; tc0
+ punpcklbw m6, m6
+ pand m7, m6
+ DEBLOCK_P0_Q0
+ movq m0, buf0
+ movq m3, buf1
+ TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+%if WIN64
+ add rsp, 16
+%endif
+ RET
+
+ALIGN 16
+ff_chroma_inter_body_mmxext:
+ LOAD_MASK r2d, r3d
+ movd m6, [r4] ; tc0
+ punpcklbw m6, m6
+ pand m7, m6
+ DEBLOCK_P0_Q0
+ ret
+
+
+
+; in: %1=p0 %2=p1 %3=q1
+; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
+%macro CHROMA_INTRA_P0 3
+ movq m4, %1
+ pxor m4, %3
+ pand m4, [pb_1] ; m4 = (p0^q1)&1
+ pavgb %1, %3
+ psubusb %1, m4
+ pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
+%endmacro
+
+%define t5 r4
+%define t6 r5
+
+;-----------------------------------------------------------------------------
+; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal deblock_v_chroma_intra_8, 4,5
+ CHROMA_V_START
+ movq m0, [t5]
+ movq m1, [t5+r1]
+ movq m2, [r0]
+ movq m3, [r0+r1]
+ call ff_chroma_intra_body_mmxext
+ movq [t5+r1], m1
+ movq [r0], m2
+ RET
+
+;-----------------------------------------------------------------------------
+; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal deblock_h_chroma_intra_8, 4,6
+ CHROMA_H_START
+ TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
+ call ff_chroma_intra_body_mmxext
+ TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+ RET
+
+ALIGN 16
+ff_chroma_intra_body_mmxext:
+ LOAD_MASK r2d, r3d
+ movq m5, m1
+ movq m6, m2
+ CHROMA_INTRA_P0 m1, m0, m3
+ CHROMA_INTRA_P0 m2, m3, m0
+ psubb m1, m5
+ psubb m2, m6
+ pand m1, m7
+ pand m2, m7
+ paddb m1, m5
+ paddb m2, m6
+ ret
+
+;-----------------------------------------------------------------------------
+; void h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
+; int8_t ref[2][40], int16_t mv[2][40][2],
+; int bidir, int edges, int step,
+; int mask_mv0, int mask_mv1, int field);
+;
+; bidir is 0 or 1
+; edges is 1 or 4
+; step is 1 or 2
+; mask_mv0 is 0 or 3
+; mask_mv1 is 0 or 1
+; field is 0 or 1
+;-----------------------------------------------------------------------------
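+; Per 4x4 edge the boundary strength written to bs[] is
+;   2 if either neighbouring block has nonzero coefficients (nnz),
+;   1 if the references differ or a mv component differs by 4 or more
+;     (2 or more for the vertical component when field is set),
+;   0 otherwise;
+; the iteration below builds this as max(2*min(nnz,1), min(ref/mv result,1)).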
+%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
+ ; dir, d_idx, mask_dir, bidir
+%define edgesd %1
+%define stepd %2
+%define mask_mvd %3
+%define dir %4
+%define d_idx %5
+%define mask_dir %6
+%define bidir %7
+ xor b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step)
+%%.b_idx_loop:
+%if mask_dir == 0
+ pxor m0, m0
+%endif
+ test b_idxd, dword mask_mvd
+ jnz %%.skip_loop_iter ; if (!(b_idx & mask_mv))
+%if bidir == 1
+ movd m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] }
+ punpckldq m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] }
+ pshufw m0, [refq+b_idxq+12], 0x44 ; { ref0[b], ref0[b] }
+ pshufw m1, [refq+b_idxq+52], 0x44 ; { ref1[b], ref1[b] }
+ pshufw m3, m2, 0x4E ; { ref1[bn], ref0[bn] }
+ psubb m0, m2 ; { ref0[b] != ref0[bn],
+ ; ref0[b] != ref1[bn] }
+ psubb m1, m3 ; { ref1[b] != ref1[bn],
+ ; ref1[b] != ref0[bn] }
+
+ por m0, m1
+ mova m1, [mvq+b_idxq*4+(d_idx+12)*4]
+ mova m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
+ mova m3, m1
+ mova m4, m2
+ psubw m1, [mvq+b_idxq*4+12*4]
+ psubw m2, [mvq+b_idxq*4+12*4+mmsize]
+ psubw m3, [mvq+b_idxq*4+52*4]
+ psubw m4, [mvq+b_idxq*4+52*4+mmsize]
+ packsswb m1, m2
+ packsswb m3, m4
+ paddb m1, m6
+ paddb m3, m6
+ psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
+ psubusb m3, m5
+ packsswb m1, m3
+
+ por m0, m1
+ mova m1, [mvq+b_idxq*4+(d_idx+52)*4]
+ mova m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
+ mova m3, m1
+ mova m4, m2
+ psubw m1, [mvq+b_idxq*4+12*4]
+ psubw m2, [mvq+b_idxq*4+12*4+mmsize]
+ psubw m3, [mvq+b_idxq*4+52*4]
+ psubw m4, [mvq+b_idxq*4+52*4+mmsize]
+ packsswb m1, m2
+ packsswb m3, m4
+ paddb m1, m6
+ paddb m3, m6
+ psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
+ psubusb m3, m5
+ packsswb m1, m3
+
+ pshufw m1, m1, 0x4E
+ por m0, m1
+ pshufw m1, m0, 0x4E
+ pminub m0, m1
+%else ; bidir == 0
+ movd m0, [refq+b_idxq+12]
+ psubb m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn]
+
+ mova m1, [mvq+b_idxq*4+12*4]
+ mova m2, [mvq+b_idxq*4+12*4+mmsize]
+ psubw m1, [mvq+b_idxq*4+(d_idx+12)*4]
+ psubw m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
+ packsswb m1, m2
+ paddb m1, m6
+ psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
+ packsswb m1, m1
+ por m0, m1
+%endif ; bidir == 1/0
+
+%%.skip_loop_iter:
+ movd m1, [nnzq+b_idxq+12]
+ por m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn]
+
+ pminub m1, m7
+ pminub m0, m7
+ psllw m1, 1
+ pxor m2, m2
+ pmaxub m1, m0
+ punpcklbw m1, m2
+ movq [bsq+b_idxq+32*dir], m1
+
+ add b_idxd, dword stepd
+ cmp b_idxd, dword edgesd
+ jl %%.b_idx_loop
+%endmacro
+
+INIT_MMX mmxext
+cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \
+ step, mask_mv0, mask_mv1, field
+%define b_idxq bidirq
+%define b_idxd bidird
+ cmp dword fieldm, 0
+ mova m7, [pb_1]
+ mova m5, [pb_3]
+ je .nofield
+ mova m5, [pb_3_1]
+.nofield:
+ mova m6, m5
+ paddb m5, m5
+
+ shl dword stepd, 3
+ shl dword edgesd, 3
+%if ARCH_X86_32
+%define mask_mv0d mask_mv0m
+%define mask_mv1d mask_mv1m
+%endif
+ shl dword mask_mv1d, 3
+ shl dword mask_mv0d, 3
+
+ cmp dword bidird, 0
+ jne .bidir
+ loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 0
+ loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 0
+
+ mova m0, [bsq+mmsize*0]
+ mova m1, [bsq+mmsize*1]
+ mova m2, [bsq+mmsize*2]
+ mova m3, [bsq+mmsize*3]
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ mova [bsq+mmsize*0], m0
+ mova [bsq+mmsize*1], m1
+ mova [bsq+mmsize*2], m2
+ mova [bsq+mmsize*3], m3
+ RET
+
+.bidir:
+ loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 1
+ loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 1
+
+ mova m0, [bsq+mmsize*0]
+ mova m1, [bsq+mmsize*1]
+ mova m2, [bsq+mmsize*2]
+ mova m3, [bsq+mmsize*3]
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ mova [bsq+mmsize*0], m0
+ mova [bsq+mmsize*1], m1
+ mova [bsq+mmsize*2], m2
+ mova [bsq+mmsize*3], m3
+ RET
diff --git a/ffmpeg/libavcodec/x86/h264_deblock_10bit.asm b/ffmpeg/libavcodec/x86/h264_deblock_10bit.asm
new file mode 100644
index 0000000..d63ca02
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/h264_deblock_10bit.asm
@@ -0,0 +1,923 @@
+;*****************************************************************************
+;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
+;*****************************************************************************
+;* Copyright (C) 2005-2011 x264 project
+;*
+;* Authors: Oskar Arvidsson <oskar@irock.se>
+;* Loren Merritt <lorenm@u.washington.edu>
+;* Jason Garrett-Glaser <darkshikari@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_pixel_max: times 8 dw ((1 << 10)-1)
+
+SECTION .text
+
+cextern pw_2
+cextern pw_3
+cextern pw_4
+
+; out: %4 = |%1-%2|-%3
+; clobbers: %5
+%macro ABS_SUB 5
+ psubusw %5, %2, %1
+ psubusw %4, %1, %2
+ por %4, %5
+ psubw %4, %3
+%endmacro
+
+; out: %4 = |%1-%2|<%3
+%macro DIFF_LT 5
+ psubusw %4, %2, %1
+ psubusw %5, %1, %2
+ por %5, %4 ; |%1-%2|
+ pxor %4, %4
+ psubw %5, %3 ; |%1-%2|-%3
+ pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
+%endmacro
+
+%macro LOAD_AB 4
+ movd %1, %3
+ movd %2, %4
+ SPLATW %1, %1
+ SPLATW %2, %2
+%endmacro
+
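+; tc0 comes in as bytes; duplicating each byte into both halves of a word multiplies
+; it by 257, and the arithmetic shift right by 6 then gives tc0*4 (for tc0 < 64,
+; negative tc0 staying negative), i.e. the 8-bit threshold rescaled to the 10-bit
+; range, matching the alpha/beta << 2 done by the callers.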
+; in: %2=tc reg
+; out: %1=splatted tc
+%macro LOAD_TC 2
+ movd %1, [%2]
+ punpcklbw %1, %1
+%if mmsize == 8
+ pshufw %1, %1, 0
+%else
+ pshuflw %1, %1, 01010000b
+ pshufd %1, %1, 01010000b
+%endif
+ psraw %1, 6
+%endmacro
+
+; in: %1=p1, %2=p0, %3=q0, %4=q1
+; %5=alpha, %6=beta, %7-%9=tmp
+; out: %7=mask
+%macro LOAD_MASK 9
+ ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha
+ ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta
+ pand %8, %9
+ ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta
+ pxor %7, %7
+ pand %8, %9
+ pcmpgtw %7, %8
+%endmacro
+
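+; 10-bit p0/q0 update: 16-bit words leave enough headroom to compute
+; delta = ((q0-p0)*4 + (p1-q1) + 4) >> 3 directly with signed word arithmetic,
+; clip it to [-tc, tc] and then clip the updated p0/q0 to [0, pw_pixel_max],
+; instead of the byte-average tricks used by the 8-bit code.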
+; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
+; out: %1=p0', m2=q0'
+%macro DEBLOCK_P0_Q0 7
+ psubw %3, %4
+ pxor %7, %7
+ paddw %3, [pw_4]
+ psubw %7, %5
+ psubw %6, %2, %1
+ psllw %6, 2
+ paddw %3, %6
+ psraw %3, 3
+ mova %6, [pw_pixel_max]
+ CLIPW %3, %7, %5
+ pxor %7, %7
+ paddw %1, %3
+ psubw %2, %3
+ CLIPW %1, %7, %6
+ CLIPW %2, %7, %6
+%endmacro
+
+; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp
+%macro LUMA_Q1 6
+ pavgw %6, %3, %4 ; (p0+q0+1)>>1
+ paddw %1, %6
+ pxor %6, %6
+ psraw %1, 1
+ psubw %6, %5
+ psubw %1, %2
+ CLIPW %1, %6, %5
+ paddw %1, %2
+%endmacro
+
+%macro LUMA_DEBLOCK_ONE 3
+ DIFF_LT m5, %1, bm, m4, m6
+ pxor m6, m6
+ mova %3, m4
+ pcmpgtw m6, tcm
+ pand m4, tcm
+ pandn m6, m7
+ pand m4, m6
+ LUMA_Q1 m5, %2, m1, m2, m4, m6
+%endmacro
+
+%macro LUMA_H_STORE 2
+%if mmsize == 8
+ movq [r0-4], m0
+ movq [r0+r1-4], m1
+ movq [r0+r1*2-4], m2
+ movq [r0+%2-4], m3
+%else
+ movq [r0-4], m0
+ movhps [r0+r1-4], m0
+ movq [r0+r1*2-4], m1
+ movhps [%1-4], m1
+ movq [%1+r1-4], m2
+ movhps [%1+r1*2-4], m2
+ movq [%1+%2-4], m3
+ movhps [%1+r1*4-4], m3
+%endif
+%endmacro
+
+%macro DEBLOCK_LUMA 0
+;-----------------------------------------------------------------------------
+; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
+ %assign pad 5*mmsize+12-(stack_offset&15)
+ %define tcm [rsp]
+ %define ms1 [rsp+mmsize]
+ %define ms2 [rsp+mmsize*2]
+ %define am [rsp+mmsize*3]
+ %define bm [rsp+mmsize*4]
+ SUB rsp, pad
+ shl r2d, 2
+ shl r3d, 2
+ LOAD_AB m4, m5, r2d, r3d
+ mov r3, 32/mmsize
+ mov r2, r0
+ sub r0, r1
+ mova am, m4
+ sub r0, r1
+ mova bm, m5
+ sub r0, r1
+.loop:
+ mova m0, [r0+r1]
+ mova m1, [r0+r1*2]
+ mova m2, [r2]
+ mova m3, [r2+r1]
+
+ LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
+ LOAD_TC m6, r4
+ mova tcm, m6
+
+ mova m5, [r0]
+ LUMA_DEBLOCK_ONE m1, m0, ms1
+ mova [r0+r1], m5
+
+ mova m5, [r2+r1*2]
+ LUMA_DEBLOCK_ONE m2, m3, ms2
+ mova [r2+r1], m5
+
+ pxor m5, m5
+ mova m6, tcm
+ pcmpgtw m5, tcm
+ psubw m6, ms1
+ pandn m5, m7
+ psubw m6, ms2
+ pand m5, m6
+ DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
+ mova [r0+r1*2], m1
+ mova [r2], m2
+
+ add r0, mmsize
+ add r2, mmsize
+ add r4, mmsize/8
+ dec r3
+ jg .loop
+ ADD rsp, pad
+ RET
+
+cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
+ %assign pad 7*mmsize+12-(stack_offset&15)
+ %define tcm [rsp]
+ %define ms1 [rsp+mmsize]
+ %define ms2 [rsp+mmsize*2]
+ %define p1m [rsp+mmsize*3]
+ %define p2m [rsp+mmsize*4]
+ %define am [rsp+mmsize*5]
+ %define bm [rsp+mmsize*6]
+ SUB rsp, pad
+ shl r2d, 2
+ shl r3d, 2
+ LOAD_AB m4, m5, r2d, r3d
+ mov r3, r1
+ mova am, m4
+ add r3, r1
+ mov r5, 32/mmsize
+ mova bm, m5
+ add r3, r1
+%if mmsize == 16
+ mov r2, r0
+ add r2, r3
+%endif
+.loop:
+%if mmsize == 8
+ movq m2, [r0-8] ; y q2 q1 q0
+ movq m7, [r0+0]
+ movq m5, [r0+r1-8]
+ movq m3, [r0+r1+0]
+ movq m0, [r0+r1*2-8]
+ movq m6, [r0+r1*2+0]
+ movq m1, [r0+r3-8]
+ TRANSPOSE4x4W 2, 5, 0, 1, 4
+ SWAP 2, 7
+ movq m7, [r0+r3]
+ TRANSPOSE4x4W 2, 3, 6, 7, 4
+%else
+ movu m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
+ movu m0, [r0+r1-8]
+ movu m2, [r0+r1*2-8]
+ movu m3, [r2-8]
+ TRANSPOSE4x4W 5, 0, 2, 3, 6
+ mova tcm, m3
+
+ movu m4, [r2+r1-8]
+ movu m1, [r2+r1*2-8]
+ movu m3, [r2+r3-8]
+ movu m7, [r2+r1*4-8]
+ TRANSPOSE4x4W 4, 1, 3, 7, 6
+
+ mova m6, tcm
+ punpcklqdq m6, m7
+ punpckhqdq m5, m4
+ SBUTTERFLY qdq, 0, 1, 7
+ SBUTTERFLY qdq, 2, 3, 7
+%endif
+
+ mova p2m, m6
+ LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
+ LOAD_TC m6, r4
+ mova tcm, m6
+
+ LUMA_DEBLOCK_ONE m1, m0, ms1
+ mova p1m, m5
+
+ mova m5, p2m
+ LUMA_DEBLOCK_ONE m2, m3, ms2
+ mova p2m, m5
+
+ pxor m5, m5
+ mova m6, tcm
+ pcmpgtw m5, tcm
+ psubw m6, ms1
+ pandn m5, m7
+ psubw m6, ms2
+ pand m5, m6
+ DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
+ mova m0, p1m
+ mova m3, p2m
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ LUMA_H_STORE r2, r3
+
+ add r4, mmsize/8
+ lea r0, [r0+r1*(mmsize/2)]
+ lea r2, [r2+r1*(mmsize/2)]
+ dec r5
+ jg .loop
+ ADD rsp, pad
+ RET
+%endmacro
+
+%if ARCH_X86_64
+; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
+; m12=alpha, m13=beta
+; out: m0=p1', m3=q1', m1=p0', m2=q0'
+; clobbers: m4, m5, m6, m7, m10, m11, m14
+%macro DEBLOCK_LUMA_INTER_SSE2 0
+ LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6
+ LOAD_TC m6, r4
+ DIFF_LT m8, m1, m13, m10, m4
+ DIFF_LT m9, m2, m13, m11, m4
+ pand m6, m7
+
+ mova m14, m6
+ pxor m4, m4
+ pcmpgtw m6, m4
+ pand m6, m14
+
+ mova m5, m10
+ pand m5, m6
+ LUMA_Q1 m8, m0, m1, m2, m5, m4
+
+ mova m5, m11
+ pand m5, m6
+ LUMA_Q1 m9, m3, m1, m2, m5, m4
+
+ pxor m4, m4
+ psubw m6, m10
+ pcmpgtw m4, m14
+ pandn m4, m7
+ psubw m6, m11
+ pand m4, m6
+ DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6
+
+ SWAP 0, 8
+ SWAP 3, 9
+%endmacro
+
+%macro DEBLOCK_LUMA_64 0
+cglobal deblock_v_luma_10, 5,5,15
+ %define p2 m8
+ %define p1 m0
+ %define p0 m1
+ %define q0 m2
+ %define q1 m3
+ %define q2 m9
+ %define mask0 m7
+ %define mask1 m10
+ %define mask2 m11
+ shl r2d, 2
+ shl r3d, 2
+ LOAD_AB m12, m13, r2d, r3d
+ mov r2, r0
+ sub r0, r1
+ sub r0, r1
+ sub r0, r1
+ mov r3, 2
+.loop:
+ mova p2, [r0]
+ mova p1, [r0+r1]
+ mova p0, [r0+r1*2]
+ mova q0, [r2]
+ mova q1, [r2+r1]
+ mova q2, [r2+r1*2]
+ DEBLOCK_LUMA_INTER_SSE2
+ mova [r0+r1], p1
+ mova [r0+r1*2], p0
+ mova [r2], q0
+ mova [r2+r1], q1
+ add r0, mmsize
+ add r2, mmsize
+ add r4, 2
+ dec r3
+ jg .loop
+ REP_RET
+
+cglobal deblock_h_luma_10, 5,7,15
+ shl r2d, 2
+ shl r3d, 2
+ LOAD_AB m12, m13, r2d, r3d
+ mov r2, r1
+ add r2, r1
+ add r2, r1
+ mov r5, r0
+ add r5, r2
+ mov r6, 2
+.loop:
+ movu m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
+ movu m0, [r0+r1-8]
+ movu m2, [r0+r1*2-8]
+ movu m9, [r5-8]
+ movu m5, [r5+r1-8]
+ movu m1, [r5+r1*2-8]
+ movu m3, [r5+r2-8]
+ movu m7, [r5+r1*4-8]
+
+ TRANSPOSE4x4W 8, 0, 2, 9, 10
+ TRANSPOSE4x4W 5, 1, 3, 7, 10
+
+ punpckhqdq m8, m5
+ SBUTTERFLY qdq, 0, 1, 10
+ SBUTTERFLY qdq, 2, 3, 10
+ punpcklqdq m9, m7
+
+ DEBLOCK_LUMA_INTER_SSE2
+
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ LUMA_H_STORE r5, r2
+ add r4, 2
+ lea r0, [r0+r1*8]
+ lea r5, [r5+r1*8]
+ dec r6
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+DEBLOCK_LUMA_64
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+DEBLOCK_LUMA_64
+%endif
+%endif
+
+%macro SWAPMOVA 2
+%ifid %1
+ SWAP %1, %2
+%else
+ mova %1, %2
+%endif
+%endmacro
+
+; in: t0-t2: tmp registers
+; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
+; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
+%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
+%if ARCH_X86_64
+ paddw t0, %3, %2
+ mova t2, %4
+ paddw t2, %3
+%else
+ mova t0, %3
+ mova t2, %4
+ paddw t0, %2
+ paddw t2, %3
+%endif
+ paddw t0, %1
+ paddw t2, t2
+ paddw t0, %5
+ paddw t2, %9
+ paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2)
+ paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)
+
+ psrlw t2, 3
+ psrlw t1, t0, 2
+ psubw t2, %3
+ psubw t1, %2
+ pand t2, %8
+ pand t1, %8
+ paddw t2, %3
+ paddw t1, %2
+ SWAPMOVA %11, t1
+
+ psubw t1, t0, %3
+ paddw t0, t0
+ psubw t1, %5
+ psubw t0, %3
+ paddw t1, %6
+ paddw t1, %2
+ paddw t0, %6
+ psrlw t1, 2 ; (2*p1 + p0 + q1 + 2)/4
+ psrlw t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
+
+ pxor t0, t1
+ pxor t1, %1
+ pand t0, %8
+ pand t1, %7
+ pxor t0, t1
+ pxor t0, %1
+ SWAPMOVA %10, t0
+ SWAPMOVA %12, t2
+%endmacro
+
+%macro LUMA_INTRA_INIT 1
+ %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
+ %define t0 m4
+ %define t1 m5
+ %define t2 m6
+ %define t3 m7
+ %assign i 4
+%rep %1
+ CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
+ %assign i i+1
+%endrep
+ SUB rsp, pad
+%endmacro
+
+; in: %1-%3=tmp, %4=p2, %5=q2
+%macro LUMA_INTRA_INTER 5
+ LOAD_AB t0, t1, r2d, r3d
+ mova %1, t0
+ LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
+%if ARCH_X86_64
+ mova %2, t0 ; mask0
+ psrlw t3, %1, 2
+%else
+ mova t3, %1
+ mova %2, t0 ; mask0
+ psrlw t3, 2
+%endif
+ paddw t3, [pw_2] ; alpha/4+2
+ DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
+ pand t2, %2
+ mova t3, %5 ; q2
+ mova %1, t2 ; mask1
+ DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
+ pand t2, %1
+ mova t3, %4 ; p2
+ mova %3, t2 ; mask1q
+ DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
+ pand t2, %1
+ mova %1, t2 ; mask1p
+%endmacro
+
+%macro LUMA_H_INTRA_LOAD 0
+%if mmsize == 8
+ movu t0, [r0-8]
+ movu t1, [r0+r1-8]
+ movu m0, [r0+r1*2-8]
+ movu m1, [r0+r4-8]
+ TRANSPOSE4x4W 4, 5, 0, 1, 2
+ mova t4, t0 ; p3
+ mova t5, t1 ; p2
+
+ movu m2, [r0]
+ movu m3, [r0+r1]
+ movu t0, [r0+r1*2]
+ movu t1, [r0+r4]
+ TRANSPOSE4x4W 2, 3, 4, 5, 6
+ mova t6, t0 ; q2
+ mova t7, t1 ; q3
+%else
+ movu t0, [r0-8]
+ movu t1, [r0+r1-8]
+ movu m0, [r0+r1*2-8]
+ movu m1, [r0+r5-8]
+ movu m2, [r4-8]
+ movu m3, [r4+r1-8]
+ movu t2, [r4+r1*2-8]
+ movu t3, [r4+r5-8]
+ TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
+ mova t4, t0 ; p3
+ mova t5, t1 ; p2
+ mova t6, t2 ; q2
+ mova t7, t3 ; q3
+%endif
+%endmacro
+
+; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
+%macro LUMA_H_INTRA_STORE 9
+%if mmsize == 8
+ TRANSPOSE4x4W %1, %2, %3, %4, %9
+ movq [r0-8], m%1
+ movq [r0+r1-8], m%2
+ movq [r0+r1*2-8], m%3
+ movq [r0+r4-8], m%4
+ movq m%1, %8
+ TRANSPOSE4x4W %5, %6, %7, %1, %9
+ movq [r0], m%5
+ movq [r0+r1], m%6
+ movq [r0+r1*2], m%7
+ movq [r0+r4], m%1
+%else
+ TRANSPOSE2x4x4W %1, %2, %3, %4, %9
+ movq [r0-8], m%1
+ movq [r0+r1-8], m%2
+ movq [r0+r1*2-8], m%3
+ movq [r0+r5-8], m%4
+ movhps [r4-8], m%1
+ movhps [r4+r1-8], m%2
+ movhps [r4+r1*2-8], m%3
+ movhps [r4+r5-8], m%4
+%ifnum %8
+ SWAP %1, %8
+%else
+ mova m%1, %8
+%endif
+ TRANSPOSE2x4x4W %5, %6, %7, %1, %9
+ movq [r0], m%5
+ movq [r0+r1], m%6
+ movq [r0+r1*2], m%7
+ movq [r0+r5], m%1
+ movhps [r4], m%5
+ movhps [r4+r1], m%6
+ movhps [r4+r1*2], m%7
+ movhps [r4+r5], m%1
+%endif
+%endmacro
+
+%if ARCH_X86_64
+;-----------------------------------------------------------------------------
+; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+%macro DEBLOCK_LUMA_INTRA_64 0
+cglobal deblock_v_luma_intra_10, 4,7,16
+ %define t0 m1
+ %define t1 m2
+ %define t2 m4
+ %define p2 m8
+ %define p1 m9
+ %define p0 m10
+ %define q0 m11
+ %define q1 m12
+ %define q2 m13
+ %define aa m5
+ %define bb m14
+ lea r4, [r1*4]
+ lea r5, [r1*3] ; 3*stride
+ neg r4
+ add r4, r0 ; pix-4*stride
+ mov r6, 2
+ mova m0, [pw_2]
+ shl r2d, 2
+ shl r3d, 2
+ LOAD_AB aa, bb, r2d, r3d
+.loop:
+ mova p2, [r4+r1]
+ mova p1, [r4+2*r1]
+ mova p0, [r4+r5]
+ mova q0, [r0]
+ mova q1, [r0+r1]
+ mova q2, [r0+2*r1]
+
+ LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
+ mova t2, aa
+ psrlw t2, 2
+ paddw t2, m0 ; alpha/4+2
+ DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
+ DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
+ DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
+ pand m6, m3
+ pand m7, m6
+ pand m6, t1
+ LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
+ LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
+ add r0, mmsize
+ add r4, mmsize
+ dec r6
+ jg .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal deblock_h_luma_intra_10, 4,7,16
+ %define t0 m15
+ %define t1 m14
+ %define t2 m2
+ %define q3 m5
+ %define q2 m8
+ %define q1 m9
+ %define q0 m10
+ %define p0 m11
+ %define p1 m12
+ %define p2 m13
+ %define p3 m4
+ %define spill [rsp]
+ %assign pad 24-(stack_offset&15)
+ SUB rsp, pad
+ lea r4, [r1*4]
+ lea r5, [r1*3] ; 3*stride
+ add r4, r0 ; pix+4*stride
+ mov r6, 2
+ mova m0, [pw_2]
+ shl r2d, 2
+ shl r3d, 2
+.loop:
+ movu q3, [r0-8]
+ movu q2, [r0+r1-8]
+ movu q1, [r0+r1*2-8]
+ movu q0, [r0+r5-8]
+ movu p0, [r4-8]
+ movu p1, [r4+r1-8]
+ movu p2, [r4+r1*2-8]
+ movu p3, [r4+r5-8]
+ TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1
+
+ LOAD_AB m1, m2, r2d, r3d
+ LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
+ psrlw m1, 2
+ paddw m1, m0 ; alpha/4+2
+ DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
+ DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
+ DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
+ pand m6, m3
+ pand m7, m6
+ pand m6, t1
+
+ mova spill, q3
+ LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
+ LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
+ mova m7, spill
+
+ LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14
+
+ lea r0, [r0+r1*8]
+ lea r4, [r4+r1*8]
+ dec r6
+ jg .loop
+ ADD rsp, pad
+ RET
+%endmacro
+
+INIT_XMM sse2
+DEBLOCK_LUMA_INTRA_64
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+DEBLOCK_LUMA_INTRA_64
+%endif
+
+%endif
+
+%macro DEBLOCK_LUMA_INTRA 0
+;-----------------------------------------------------------------------------
+; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
+ LUMA_INTRA_INIT 3
+ lea r4, [r1*4]
+ lea r5, [r1*3]
+ neg r4
+ add r4, r0
+ mov r6, 32/mmsize
+ shl r2d, 2
+ shl r3d, 2
+.loop:
+ mova m0, [r4+r1*2] ; p1
+ mova m1, [r4+r5] ; p0
+ mova m2, [r0] ; q0
+ mova m3, [r0+r1] ; q1
+ LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
+ LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
+ mova t3, [r0+r1*2] ; q2
+ LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
+ add r0, mmsize
+ add r4, mmsize
+ dec r6
+ jg .loop
+ ADD rsp, pad
+ RET
+
+;-----------------------------------------------------------------------------
+; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
+ LUMA_INTRA_INIT 8
+%if mmsize == 8
+ lea r4, [r1*3]
+ mov r5, 32/mmsize
+%else
+ lea r4, [r1*4]
+ lea r5, [r1*3] ; 3*stride
+ add r4, r0 ; pix+4*stride
+ mov r6, 32/mmsize
+%endif
+ shl r2d, 2
+ shl r3d, 2
+.loop:
+ LUMA_H_INTRA_LOAD
+ LUMA_INTRA_INTER t8, t9, t10, t5, t6
+
+ LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
+ mova t3, t6 ; q2
+ LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5
+
+ mova m2, t4
+ mova m0, t11
+ mova m1, t5
+ mova m3, t8
+ mova m6, t6
+
+ LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7
+
+ lea r0, [r0+r1*(mmsize/2)]
+%if mmsize == 8
+ dec r5
+%else
+ lea r4, [r4+r1*(mmsize/2)]
+ dec r6
+%endif
+ jg .loop
+ ADD rsp, pad
+ RET
+%endmacro
+
+%if ARCH_X86_64 == 0
+INIT_MMX mmxext
+DEBLOCK_LUMA
+DEBLOCK_LUMA_INTRA
+INIT_XMM sse2
+DEBLOCK_LUMA
+DEBLOCK_LUMA_INTRA
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+DEBLOCK_LUMA
+DEBLOCK_LUMA_INTRA
+%endif
+%endif
+
+; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
+; out: %1=p0', %2=q0'
+%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
+ mova %6, [pw_2]
+ paddw %6, %3
+ paddw %6, %4
+ paddw %7, %6, %2
+ paddw %6, %1
+ paddw %6, %3
+ paddw %7, %4
+ psraw %6, 2
+ psraw %7, 2
+ psubw %6, %1
+ psubw %7, %2
+ pand %6, %5
+ pand %7, %5
+ paddw %1, %6
+ paddw %2, %7
+%endmacro
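Per lane, CHROMA_DEBLOCK_P0_Q0_INTRA reduces to the chroma intra filter in which only the two samples straddling the edge move. A scalar sketch (illustrative names, 10-bit samples):

    #include <stdint.h>

    /* Chroma intra deblocking of one column across the edge; 'filter' is the
     * per-sample mask from LOAD_MASK (|p0-q0|<alpha && |p1-p0|<beta && |q1-q0|<beta). */
    static void chroma_intra_filter(uint16_t *p0, uint16_t *q0,
                                    int p1, int q1, int filter)
    {
        if (filter) {
            int np0 = (2 * p1 + *p0 + q1 + 2) >> 2;
            int nq0 = (2 * q1 + *q0 + p1 + 2) >> 2;
            *p0 = np0;
            *q0 = nq0;
        }
    }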
+
+%macro CHROMA_V_LOAD 1
+ mova m0, [r0] ; p1
+ mova m1, [r0+r1] ; p0
+ mova m2, [%1] ; q0
+ mova m3, [%1+r1] ; q1
+%endmacro
+
+%macro CHROMA_V_STORE 0
+ mova [r0+1*r1], m1
+ mova [r0+2*r1], m2
+%endmacro
+
+%macro CHROMA_V_LOAD_TC 2
+ movd %1, [%2]
+ punpcklbw %1, %1
+ punpcklwd %1, %1
+ psraw %1, 6
+%endmacro
+
+%macro DEBLOCK_CHROMA 0
+;-----------------------------------------------------------------------------
+; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
+ mov r5, r0
+ sub r0, r1
+ sub r0, r1
+ shl r2d, 2
+ shl r3d, 2
+%if mmsize < 16
+ mov r6, 16/mmsize
+.loop:
+%endif
+ CHROMA_V_LOAD r5
+ LOAD_AB m4, m5, r2d, r3d
+ LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
+ pxor m4, m4
+ CHROMA_V_LOAD_TC m6, r4
+ psubw m6, [pw_3]
+ pmaxsw m6, m4
+ pand m7, m6
+ DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
+ CHROMA_V_STORE
+%if mmsize < 16
+ add r0, mmsize
+ add r5, mmsize
+ add r4, mmsize/4
+ dec r6
+ jg .loop
+ REP_RET
+%else
+ RET
+%endif
+
+;-----------------------------------------------------------------------------
+; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
+ mov r4, r0
+ sub r0, r1
+ sub r0, r1
+ shl r2d, 2
+ shl r3d, 2
+%if mmsize < 16
+ mov r5, 16/mmsize
+.loop:
+%endif
+ CHROMA_V_LOAD r4
+ LOAD_AB m4, m5, r2d, r3d
+ LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
+ CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
+ CHROMA_V_STORE
+%if mmsize < 16
+ add r0, mmsize
+ add r4, mmsize
+ dec r5
+ jg .loop
+ REP_RET
+%else
+ RET
+%endif
+%endmacro
+
+%if ARCH_X86_64 == 0
+INIT_MMX mmxext
+DEBLOCK_CHROMA
+%endif
+INIT_XMM sse2
+DEBLOCK_CHROMA
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+DEBLOCK_CHROMA
+%endif
diff --git a/ffmpeg/libavcodec/x86/h264_i386.h b/ffmpeg/libavcodec/x86/h264_i386.h
new file mode 100644
index 0000000..0dc0a7c
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/h264_i386.h
@@ -0,0 +1,204 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 / AVC / MPEG4 part10 codec.
+ * non-MMX i386-specific optimizations for H.264
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#ifndef AVCODEC_X86_H264_I386_H
+#define AVCODEC_X86_H264_I386_H
+
+#include <stddef.h>
+
+#include "libavcodec/cabac.h"
+#include "cabac.h"
+
+#if HAVE_INLINE_ASM
+
+//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
+//as that would make the optimizer's work harder)
+#if HAVE_7REGS
+#define decode_significance decode_significance_x86
+static int decode_significance_x86(CABACContext *c, int max_coeff,
+ uint8_t *significant_coeff_ctx_base,
+ int *index, x86_reg last_off){
+ void *end= significant_coeff_ctx_base + max_coeff - 1;
+ int minusstart= -(intptr_t)significant_coeff_ctx_base;
+ int minusindex= 4-(intptr_t)index;
+ int bit;
+ x86_reg coeff_count;
+
+#ifdef BROKEN_RELOCATIONS
+ void *tables;
+
+ __asm__ volatile(
+ "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
+ : "=&r"(tables)
+ );
+#endif
+
+ __asm__ volatile(
+ "3: \n\t"
+
+ BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
+ "%5", "%q5", "%k0", "%b0",
+ "%c11(%6)", "%c12(%6)",
+ AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
+ AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
+ AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
+ "%13")
+
+ "test $1, %4 \n\t"
+ " jz 4f \n\t"
+ "add %10, %1 \n\t"
+
+ BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
+ "%5", "%q5", "%k0", "%b0",
+ "%c11(%6)", "%c12(%6)",
+ AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
+ AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
+ AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
+ "%13")
+
+ "sub %10, %1 \n\t"
+ "mov %2, %0 \n\t"
+ "movl %7, %%ecx \n\t"
+ "add %1, %%"REG_c" \n\t"
+ "movl %%ecx, (%0) \n\t"
+
+ "test $1, %4 \n\t"
+ " jnz 5f \n\t"
+
+ "add"OPSIZE" $4, %2 \n\t"
+
+ "4: \n\t"
+ "add $1, %1 \n\t"
+ "cmp %8, %1 \n\t"
+ " jb 3b \n\t"
+ "mov %2, %0 \n\t"
+ "movl %7, %%ecx \n\t"
+ "add %1, %%"REG_c" \n\t"
+ "movl %%ecx, (%0) \n\t"
+ "5: \n\t"
+ "add %9, %k0 \n\t"
+ "shr $2, %k0 \n\t"
+ : "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
+ "+&r"(c->low), "=&r"(bit), "+&r"(c->range)
+ : "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end))
+ TABLES_ARG
+ : "%"REG_c, "memory"
+ );
+ return coeff_count;
+}
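Functionally this inline asm is a branch-optimized significance-map scan: for each coefficient position it decodes significant_coeff_flag, records the position when the flag is set, and then decodes last_significant_coeff_flag to decide whether to stop early. A rough scalar sketch of that control flow; the bit decoder is abstracted behind a function pointer here, and this is not the decoder's actual C fallback:

    #include <stdint.h>

    /* get_bit stands in for the CABAC bin decoder (get_cabac in FFmpeg). */
    typedef int (*cabac_bit_fn)(void *c, uint8_t *state);

    static int decode_significance_sketch(void *c, cabac_bit_fn get_bit,
                                          int max_coeff, uint8_t *sig_ctx,
                                          int last_off, int *index)
    {
        int coeff_count = 0, i;
        for (i = 0; i < max_coeff - 1; i++) {
            if (get_bit(c, sig_ctx + i)) {                /* significant_coeff_flag */
                index[coeff_count++] = i;
                if (get_bit(c, sig_ctx + i + last_off))   /* last_significant_coeff_flag */
                    return coeff_count;
            }
        }
        index[coeff_count++] = i;   /* final position is inferred significant */
        return coeff_count;
    }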
+
+#define decode_significance_8x8 decode_significance_8x8_x86
+static int decode_significance_8x8_x86(CABACContext *c,
+ uint8_t *significant_coeff_ctx_base,
+ int *index, uint8_t *last_coeff_ctx_base, const uint8_t *sig_off){
+ int minusindex= 4-(intptr_t)index;
+ int bit;
+ x86_reg coeff_count;
+ x86_reg last=0;
+ x86_reg state;
+
+#ifdef BROKEN_RELOCATIONS
+ void *tables;
+
+ __asm__ volatile(
+ "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
+ : "=&r"(tables)
+ );
+#endif
+
+ __asm__ volatile(
+ "mov %1, %6 \n\t"
+ "3: \n\t"
+
+ "mov %10, %0 \n\t"
+ "movzbl (%0, %6), %k6 \n\t"
+ "add %9, %6 \n\t"
+
+ BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
+ "%5", "%q5", "%k0", "%b0",
+ "%c12(%7)", "%c13(%7)",
+ AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
+ AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
+ AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
+ "%15")
+
+ "mov %1, %k6 \n\t"
+ "test $1, %4 \n\t"
+ " jz 4f \n\t"
+
+#ifdef BROKEN_RELOCATIONS
+ "movzbl %c14(%15, %q6), %k6\n\t"
+#else
+ "movzbl "MANGLE(ff_h264_cabac_tables)"+%c14(%k6), %k6\n\t"
+#endif
+ "add %11, %6 \n\t"
+
+ BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
+ "%5", "%q5", "%k0", "%b0",
+ "%c12(%7)", "%c13(%7)",
+ AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
+ AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
+ AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
+ "%15")
+
+ "mov %2, %0 \n\t"
+ "mov %1, %k6 \n\t"
+ "movl %k6, (%0) \n\t"
+
+ "test $1, %4 \n\t"
+ " jnz 5f \n\t"
+
+ "add"OPSIZE" $4, %2 \n\t"
+
+ "4: \n\t"
+ "addl $1, %k6 \n\t"
+ "mov %k6, %1 \n\t"
+ "cmpl $63, %k6 \n\t"
+ " jb 3b \n\t"
+ "mov %2, %0 \n\t"
+ "movl %k6, (%0) \n\t"
+ "5: \n\t"
+ "addl %8, %k0 \n\t"
+ "shr $2, %k0 \n\t"
+ : "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low),
+ "=&r"(bit), "+&r"(c->range), "=&r"(state)
+ : "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
+ "m"(sig_off), "m"(last_coeff_ctx_base),
+ "i"(offsetof(CABACContext, bytestream)),
+ "i"(offsetof(CABACContext, bytestream_end)),
+ "i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
+ : "%"REG_c, "memory"
+ );
+ return coeff_count;
+}
+#endif /* HAVE_7REGS */
+
+#endif /* HAVE_INLINE_ASM */
+#endif /* AVCODEC_X86_H264_I386_H */
diff --git a/ffmpeg/libavcodec/x86/h264_idct.asm b/ffmpeg/libavcodec/x86/h264_idct.asm
new file mode 100644
index 0000000..7bb1653
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/h264_idct.asm
@@ -0,0 +1,1073 @@
+;*****************************************************************************
+;* MMX/SSE2-optimized H.264 iDCT
+;*****************************************************************************
+;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
+;* Copyright (C) 2003-2008 x264 project
+;*
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
+;* Loren Merritt <lorenm@u.washington.edu>
+;* Holger Lubitz <hal@duncan.ol.sub.de>
+;* Min Chen <chenm001.163.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*****************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
+scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
+ db 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
+ db 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
+ db 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
+ db 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
+ db 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
+ db 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
+ db 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
+ db 4+11*8, 5+11*8, 4+12*8, 5+12*8
+ db 6+11*8, 7+11*8, 6+12*8, 7+12*8
+ db 4+13*8, 5+13*8, 4+14*8, 5+14*8
+ db 6+13*8, 7+13*8, 6+14*8, 7+14*8
+%ifdef PIC
+%define npicregs 1
+%define scan8 picregq
+%else
+%define npicregs 0
+%define scan8 scan8_mem
+%endif
+
+cextern pw_32
+cextern pw_1
+
+SECTION .text
+
+; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
+%macro IDCT4_ADD 3
+ ; Load dct coeffs
+ movq m0, [%2]
+ movq m1, [%2+8]
+ movq m2, [%2+16]
+ movq m3, [%2+24]
+
+ IDCT4_1D w, 0, 1, 2, 3, 4, 5
+ mova m6, [pw_32]
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ paddw m0, m6
+ IDCT4_1D w, 0, 1, 2, 3, 4, 5
+ pxor m7, m7
+ movq [%2+ 0], m7
+ movq [%2+ 8], m7
+ movq [%2+16], m7
+ movq [%2+24], m7
+
+ STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
+ lea %1, [%1+%3*2]
+ STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
+%endmacro
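IDCT4_ADD is the SIMD form of the usual H.264 4x4 inverse transform: one 1-D pass over rows, a transpose, a rounding bias folded into the first row, one 1-D pass over columns, then (x+32)>>6 added to the destination with clipping while the coefficient block is cleared. A scalar sketch of the same computation (8-bit path, illustrative names):

    #include <stdint.h>

    static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    /* One 1-D pass of the 4x4 H.264 inverse transform. */
    static void idct4_1d(int *b0, int *b1, int *b2, int *b3)
    {
        int z0 = *b0 + *b2;
        int z1 = *b0 - *b2;
        int z2 = (*b1 >> 1) - *b3;
        int z3 = *b1 + (*b3 >> 1);
        *b0 = z0 + z3; *b1 = z1 + z2; *b2 = z1 - z2; *b3 = z0 - z3;
    }

    static void idct4_add_sketch(uint8_t *dst, int16_t block[16], int stride)
    {
        int tmp[16];
        for (int i = 0; i < 16; i++) tmp[i] = block[i];
        for (int i = 0; i < 4; i++)   /* rows */
            idct4_1d(&tmp[4*i], &tmp[4*i+1], &tmp[4*i+2], &tmp[4*i+3]);
        for (int j = 0; j < 4; j++)   /* columns */
            idct4_1d(&tmp[j], &tmp[j+4], &tmp[j+8], &tmp[j+12]);
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                dst[y*stride + x] = clip_u8(dst[y*stride + x] + ((tmp[4*y+x] + 32) >> 6));
        for (int i = 0; i < 16; i++) block[i] = 0;   /* the asm clears the block too */
    }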
+
+INIT_MMX mmx
+; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
+cglobal h264_idct_add_8, 3, 3, 0
+ IDCT4_ADD r0, r1, r2
+ RET
+
+%macro IDCT8_1D 2
+ mova m0, m1
+ psraw m1, 1
+ mova m4, m5
+ psraw m4, 1
+ paddw m4, m5
+ paddw m1, m0
+ paddw m4, m7
+ paddw m1, m5
+ psubw m4, m0
+ paddw m1, m3
+
+ psubw m0, m3
+ psubw m5, m3
+ psraw m3, 1
+ paddw m0, m7
+ psubw m5, m7
+ psraw m7, 1
+ psubw m0, m3
+ psubw m5, m7
+
+ mova m7, m1
+ psraw m1, 2
+ mova m3, m4
+ psraw m3, 2
+ paddw m3, m0
+ psraw m0, 2
+ paddw m1, m5
+ psraw m5, 2
+ psubw m0, m4
+ psubw m7, m5
+
+ mova m5, m6
+ psraw m6, 1
+ mova m4, m2
+ psraw m4, 1
+ paddw m6, m2
+ psubw m4, m5
+
+ mova m2, %1
+ mova m5, %2
+ SUMSUB_BA w, 5, 2
+ SUMSUB_BA w, 6, 5
+ SUMSUB_BA w, 4, 2
+ SUMSUB_BA w, 7, 6
+ SUMSUB_BA w, 0, 4
+ SUMSUB_BA w, 3, 2
+ SUMSUB_BA w, 1, 5
+ SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
+%endmacro
+
+%macro IDCT8_1D_FULL 1
+ mova m7, [%1+112]
+ mova m6, [%1+ 96]
+ mova m5, [%1+ 80]
+ mova m3, [%1+ 48]
+ mova m2, [%1+ 32]
+ mova m1, [%1+ 16]
+ IDCT8_1D [%1], [%1+ 64]
+%endmacro
+
+; %1=int16_t *block, %2=int16_t *dstblock
+%macro IDCT8_ADD_MMX_START 2
+ IDCT8_1D_FULL %1
+ mova [%1], m7
+ TRANSPOSE4x4W 0, 1, 2, 3, 7
+ mova m7, [%1]
+ mova [%2 ], m0
+ mova [%2+16], m1
+ mova [%2+32], m2
+ mova [%2+48], m3
+ TRANSPOSE4x4W 4, 5, 6, 7, 3
+ mova [%2+ 8], m4
+ mova [%2+24], m5
+ mova [%2+40], m6
+ mova [%2+56], m7
+%endmacro
+
+; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
+%macro IDCT8_ADD_MMX_END 3-4
+ IDCT8_1D_FULL %2
+ mova [%2 ], m5
+ mova [%2+16], m6
+ mova [%2+32], m7
+
+ pxor m7, m7
+%if %0 == 4
+ movq [%4+ 0], m7
+ movq [%4+ 8], m7
+ movq [%4+ 16], m7
+ movq [%4+ 24], m7
+ movq [%4+ 32], m7
+ movq [%4+ 40], m7
+ movq [%4+ 48], m7
+ movq [%4+ 56], m7
+ movq [%4+ 64], m7
+ movq [%4+ 72], m7
+ movq [%4+ 80], m7
+ movq [%4+ 88], m7
+ movq [%4+ 96], m7
+ movq [%4+104], m7
+ movq [%4+112], m7
+ movq [%4+120], m7
+%endif
+ STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
+ lea %1, [%1+%3*2]
+ STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
+ mova m0, [%2 ]
+ mova m1, [%2+16]
+ mova m2, [%2+32]
+ lea %1, [%1+%3*2]
+ STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
+ lea %1, [%1+%3*2]
+ STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
+%endmacro
+
+INIT_MMX mmx
+; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
+cglobal h264_idct8_add_8, 3, 4, 0
+ %assign pad 128+4-(stack_offset&7)
+ SUB rsp, pad
+
+ add word [r1], 32
+ IDCT8_ADD_MMX_START r1 , rsp
+ IDCT8_ADD_MMX_START r1+8, rsp+64
+ lea r3, [r0+4]
+ IDCT8_ADD_MMX_END r0 , rsp, r2, r1
+ IDCT8_ADD_MMX_END r3 , rsp+8, r2
+
+ ADD rsp, pad
+ RET
+
+; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
+%macro IDCT8_ADD_SSE 4
+ IDCT8_1D_FULL %2
+%if ARCH_X86_64
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+%else
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
+%endif
+ paddw m0, [pw_32]
+
+%if ARCH_X86_64 == 0
+ mova [%2 ], m0
+ mova [%2+16], m4
+ IDCT8_1D [%2], [%2+ 16]
+ mova [%2 ], m6
+ mova [%2+16], m7
+%else
+ SWAP 0, 8
+ SWAP 4, 9
+ IDCT8_1D m8, m9
+ SWAP 6, 8
+ SWAP 7, 9
+%endif
+
+ pxor m7, m7
+ lea %4, [%3*3]
+ STORE_DIFF m0, m6, m7, [%1 ]
+ STORE_DIFF m1, m6, m7, [%1+%3 ]
+ STORE_DIFF m2, m6, m7, [%1+%3*2]
+ STORE_DIFF m3, m6, m7, [%1+%4 ]
+%if ARCH_X86_64 == 0
+ mova m0, [%2 ]
+ mova m1, [%2+16]
+%else
+ SWAP 0, 8
+ SWAP 1, 9
+%endif
+ mova [%2+ 0], m7
+ mova [%2+ 16], m7
+ mova [%2+ 32], m7
+ mova [%2+ 48], m7
+ mova [%2+ 64], m7
+ mova [%2+ 80], m7
+ mova [%2+ 96], m7
+ mova [%2+112], m7
+ lea %1, [%1+%3*4]
+ STORE_DIFF m4, m6, m7, [%1 ]
+ STORE_DIFF m5, m6, m7, [%1+%3 ]
+ STORE_DIFF m0, m6, m7, [%1+%3*2]
+ STORE_DIFF m1, m6, m7, [%1+%4 ]
+%endmacro
+
+INIT_XMM sse2
+; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
+cglobal h264_idct8_add_8, 3, 4, 10
+ IDCT8_ADD_SSE r0, r1, r2, r3
+ RET
+
+%macro DC_ADD_MMXEXT_INIT 2
+ add %1, 32
+ sar %1, 6
+ movd m0, %1d
+ lea %1, [%2*3]
+ pshufw m0, m0, 0
+ pxor m1, m1
+ psubw m1, m0
+ packuswb m0, m0
+ packuswb m1, m1
+%endmacro
+
+%macro DC_ADD_MMXEXT_OP 4
+ %1 m2, [%2 ]
+ %1 m3, [%2+%3 ]
+ %1 m4, [%2+%3*2]
+ %1 m5, [%2+%4 ]
+ paddusb m2, m0
+ paddusb m3, m0
+ paddusb m4, m0
+ paddusb m5, m0
+ psubusb m2, m1
+ psubusb m3, m1
+ psubusb m4, m1
+ psubusb m5, m1
+ %1 [%2 ], m2
+ %1 [%2+%3 ], m3
+ %1 [%2+%3*2], m4
+ %1 [%2+%4 ], m5
+%endmacro
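DC_ADD_MMXEXT_INIT rounds the DC coefficient ((dc+32)>>6) and DC_ADD_MMXEXT_OP adds it to four rows at a time; the paddusb/psubusb pair is the usual trick for a signed add with clipping to [0,255]. Scalar sketch:

    #include <stdint.h>

    static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    /* Add the rounded DC value to a size x size block (size = 4 for idct_dc_add,
     * 8 for idct8_dc_add). */
    static void idct_dc_add_sketch(uint8_t *dst, int dc_coeff, int stride, int size)
    {
        int dc = (dc_coeff + 32) >> 6;
        for (int y = 0; y < size; y++)
            for (int x = 0; x < size; x++)
                dst[y*stride + x] = clip_u8(dst[y*stride + x] + dc);
    }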
+
+INIT_MMX mmxext
+; ff_h264_idct_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
+%if ARCH_X86_64
+cglobal h264_idct_dc_add_8, 3, 4, 0
+ movsx r3, word [r1]
+ mov word [r1], 0
+ DC_ADD_MMXEXT_INIT r3, r2
+ DC_ADD_MMXEXT_OP movh, r0, r2, r3
+ RET
+
+; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
+cglobal h264_idct8_dc_add_8, 3, 4, 0
+ movsx r3, word [r1]
+ mov word [r1], 0
+ DC_ADD_MMXEXT_INIT r3, r2
+ DC_ADD_MMXEXT_OP mova, r0, r2, r3
+ lea r0, [r0+r2*4]
+ DC_ADD_MMXEXT_OP mova, r0, r2, r3
+ RET
+%else
+cglobal h264_idct_dc_add_8, 2, 3, 0
+ movsx r2, word [r1]
+ mov word [r1], 0
+ mov r1, r2m
+ DC_ADD_MMXEXT_INIT r2, r1
+ DC_ADD_MMXEXT_OP movh, r0, r1, r2
+ RET
+
+; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
+cglobal h264_idct8_dc_add_8, 2, 3, 0
+ movsx r2, word [r1]
+ mov word [r1], 0
+ mov r1, r2m
+ DC_ADD_MMXEXT_INIT r2, r1
+ DC_ADD_MMXEXT_OP mova, r0, r1, r2
+ lea r0, [r0+r1*4]
+ DC_ADD_MMXEXT_OP mova, r0, r1, r2
+ RET
+%endif
+
+INIT_MMX mmx
+; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
+; int16_t *block, int stride, const uint8_t nnzc[6*8])
+cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
+ xor r5, r5
+%ifdef PIC
+ lea picregq, [scan8_mem]
+%endif
+.nextblock:
+ movzx r6, byte [scan8+r5]
+ movzx r6, byte [r4+r6]
+ test r6, r6
+ jz .skipblock
+ mov r6d, dword [r1+r5*4]
+ lea r6, [r0+r6]
+ IDCT4_ADD r6, r2, r3
+.skipblock:
+ inc r5
+ add r2, 32
+ cmp r5, 16
+ jl .nextblock
+ REP_RET
+
+; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
+; int16_t *block, int stride, const uint8_t nnzc[6*8])
+cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
+ %assign pad 128+4-(stack_offset&7)
+ SUB rsp, pad
+
+ xor r5, r5
+%ifdef PIC
+ lea picregq, [scan8_mem]
+%endif
+.nextblock:
+ movzx r6, byte [scan8+r5]
+ movzx r6, byte [r4+r6]
+ test r6, r6
+ jz .skipblock
+ mov r6d, dword [r1+r5*4]
+ add r6, r0
+ add word [r2], 32
+ IDCT8_ADD_MMX_START r2 , rsp
+ IDCT8_ADD_MMX_START r2+8, rsp+64
+ IDCT8_ADD_MMX_END r6 , rsp, r3, r2
+ mov r6d, dword [r1+r5*4]
+ lea r6, [r0+r6+4]
+ IDCT8_ADD_MMX_END r6 , rsp+8, r3
+.skipblock:
+ add r5, 4
+ add r2, 128
+ cmp r5, 16
+ jl .nextblock
+ ADD rsp, pad
+ RET
+
+INIT_MMX mmxext
+; ff_h264_idct_add16_mmxext(uint8_t *dst, const int *block_offset,
+; int16_t *block, int stride, const uint8_t nnzc[6*8])
+cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+ xor r5, r5
+%ifdef PIC
+ lea picregq, [scan8_mem]
+%endif
+.nextblock:
+ movzx r6, byte [scan8+r5]
+ movzx r6, byte [r4+r6]
+ test r6, r6
+ jz .skipblock
+ cmp r6, 1
+ jnz .no_dc
+ movsx r6, word [r2]
+ test r6, r6
+ jz .no_dc
+ mov word [r2], 0
+ DC_ADD_MMXEXT_INIT r6, r3
+%if ARCH_X86_64 == 0
+%define dst2q r1
+%define dst2d r1d
+%endif
+ mov dst2d, dword [r1+r5*4]
+ lea dst2q, [r0+dst2q]
+ DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
+%if ARCH_X86_64 == 0
+ mov r1, r1m
+%endif
+ inc r5
+ add r2, 32
+ cmp r5, 16
+ jl .nextblock
+ REP_RET
+.no_dc:
+ mov r6d, dword [r1+r5*4]
+ add r6, r0
+ IDCT4_ADD r6, r2, r3
+.skipblock:
+ inc r5
+ add r2, 32
+ cmp r5, 16
+ jl .nextblock
+ REP_RET
+
+INIT_MMX mmx
+; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
+; int16_t *block, int stride, const uint8_t nnzc[6*8])
+cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
+ xor r5, r5
+%ifdef PIC
+ lea picregq, [scan8_mem]
+%endif
+.nextblock:
+ movzx r6, byte [scan8+r5]
+ movzx r6, byte [r4+r6]
+ or r6w, word [r2]
+ test r6, r6
+ jz .skipblock
+ mov r6d, dword [r1+r5*4]
+ add r6, r0
+ IDCT4_ADD r6, r2, r3
+.skipblock:
+ inc r5
+ add r2, 32
+ cmp r5, 16
+ jl .nextblock
+ REP_RET
+
+INIT_MMX mmxext
+; ff_h264_idct_add16intra_mmxext(uint8_t *dst, const int *block_offset,
+; int16_t *block, int stride,
+; const uint8_t nnzc[6*8])
+cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+ xor r5, r5
+%ifdef PIC
+ lea picregq, [scan8_mem]
+%endif
+.nextblock:
+ movzx r6, byte [scan8+r5]
+ movzx r6, byte [r4+r6]
+ test r6, r6
+ jz .try_dc
+ mov r6d, dword [r1+r5*4]
+ lea r6, [r0+r6]
+ IDCT4_ADD r6, r2, r3
+ inc r5
+ add r2, 32
+ cmp r5, 16
+ jl .nextblock
+ REP_RET
+.try_dc:
+ movsx r6, word [r2]
+ test r6, r6
+ jz .skipblock
+ mov word [r2], 0
+ DC_ADD_MMXEXT_INIT r6, r3
+%if ARCH_X86_64 == 0
+%define dst2q r1
+%define dst2d r1d
+%endif
+ mov dst2d, dword [r1+r5*4]
+ add dst2q, r0
+ DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
+%if ARCH_X86_64 == 0
+ mov r1, r1m
+%endif
+.skipblock:
+ inc r5
+ add r2, 32
+ cmp r5, 16
+ jl .nextblock
+ REP_RET
+
+; ff_h264_idct8_add4_mmxext(uint8_t *dst, const int *block_offset,
+; int16_t *block, int stride,
+; const uint8_t nnzc[6*8])
+cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+ %assign pad 128+4-(stack_offset&7)
+ SUB rsp, pad
+
+ xor r5, r5
+%ifdef PIC
+ lea picregq, [scan8_mem]
+%endif
+.nextblock:
+ movzx r6, byte [scan8+r5]
+ movzx r6, byte [r4+r6]
+ test r6, r6
+ jz .skipblock
+ cmp r6, 1
+ jnz .no_dc
+ movsx r6, word [r2]
+ test r6, r6
+ jz .no_dc
+ mov word [r2], 0
+ DC_ADD_MMXEXT_INIT r6, r3
+%if ARCH_X86_64 == 0
+%define dst2q r1
+%define dst2d r1d
+%endif
+ mov dst2d, dword [r1+r5*4]
+ lea dst2q, [r0+dst2q]
+ DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
+ lea dst2q, [dst2q+r3*4]
+ DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
+%if ARCH_X86_64 == 0
+ mov r1, r1m
+%endif
+ add r5, 4
+ add r2, 128
+ cmp r5, 16
+ jl .nextblock
+
+ ADD rsp, pad
+ RET
+.no_dc:
+ mov r6d, dword [r1+r5*4]
+ add r6, r0
+ add word [r2], 32
+ IDCT8_ADD_MMX_START r2 , rsp
+ IDCT8_ADD_MMX_START r2+8, rsp+64
+ IDCT8_ADD_MMX_END r6 , rsp, r3, r2
+ mov r6d, dword [r1+r5*4]
+ lea r6, [r0+r6+4]
+ IDCT8_ADD_MMX_END r6 , rsp+8, r3
+.skipblock:
+ add r5, 4
+ add r2, 128
+ cmp r5, 16
+ jl .nextblock
+
+ ADD rsp, pad
+ RET
+
+INIT_XMM sse2
+; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
+; int16_t *block, int stride, const uint8_t nnzc[6*8])
+cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+ xor r5, r5
+%ifdef PIC
+ lea picregq, [scan8_mem]
+%endif
+.nextblock:
+ movzx r6, byte [scan8+r5]
+ movzx r6, byte [r4+r6]
+ test r6, r6
+ jz .skipblock
+ cmp r6, 1
+ jnz .no_dc
+ movsx r6, word [r2]
+ test r6, r6
+ jz .no_dc
+INIT_MMX cpuname
+ mov word [r2], 0
+ DC_ADD_MMXEXT_INIT r6, r3
+%if ARCH_X86_64 == 0
+%define dst2q r1
+%define dst2d r1d
+%endif
+ mov dst2d, dword [r1+r5*4]
+ add dst2q, r0
+ DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
+ lea dst2q, [dst2q+r3*4]
+ DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
+%if ARCH_X86_64 == 0
+ mov r1, r1m
+%endif
+ add r5, 4
+ add r2, 128
+ cmp r5, 16
+ jl .nextblock
+ REP_RET
+.no_dc:
+INIT_XMM cpuname
+ mov dst2d, dword [r1+r5*4]
+ add dst2q, r0
+ IDCT8_ADD_SSE dst2q, r2, r3, r6
+%if ARCH_X86_64 == 0
+ mov r1, r1m
+%endif
+.skipblock:
+ add r5, 4
+ add r2, 128
+ cmp r5, 16
+ jl .nextblock
+ REP_RET
+
+INIT_MMX mmx
+h264_idct_add8_mmx_plane:
+.nextblock:
+ movzx r6, byte [scan8+r5]
+ movzx r6, byte [r4+r6]
+ or r6w, word [r2]
+ test r6, r6
+ jz .skipblock
+%if ARCH_X86_64
+ mov r0d, dword [r1+r5*4]
+ add r0, [dst2q]
+%else
+ mov r0, r1m ; XXX r1m here is actually r0m of the calling func
+ mov r0, [r0]
+ add r0, dword [r1+r5*4]
+%endif
+ IDCT4_ADD r0, r2, r3
+.skipblock:
+ inc r5
+ add r2, 32
+ test r5, 3
+ jnz .nextblock
+ rep ret
+
+; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
+; int16_t *block, int stride, const uint8_t nnzc[6*8])
+cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+ mov r5, 16
+ add r2, 512
+%ifdef PIC
+ lea picregq, [scan8_mem]
+%endif
+%if ARCH_X86_64
+ mov dst2q, r0
+%endif
+ call h264_idct_add8_mmx_plane
+ mov r5, 32
+ add r2, 384
+%if ARCH_X86_64
+ add dst2q, gprsize
+%else
+ add r0mp, gprsize
+%endif
+ call h264_idct_add8_mmx_plane
+ RET
+
+h264_idct_add8_mmxext_plane:
+.nextblock:
+ movzx r6, byte [scan8+r5]
+ movzx r6, byte [r4+r6]
+ test r6, r6
+ jz .try_dc
+%if ARCH_X86_64
+ mov r0d, dword [r1+r5*4]
+ add r0, [dst2q]
+%else
+ mov r0, r1m ; XXX r1m here is actually r0m of the calling func
+ mov r0, [r0]
+ add r0, dword [r1+r5*4]
+%endif
+ IDCT4_ADD r0, r2, r3
+ inc r5
+ add r2, 32
+ test r5, 3
+ jnz .nextblock
+ rep ret
+.try_dc:
+ movsx r6, word [r2]
+ test r6, r6
+ jz .skipblock
+ mov word [r2], 0
+ DC_ADD_MMXEXT_INIT r6, r3
+%if ARCH_X86_64
+ mov r0d, dword [r1+r5*4]
+ add r0, [dst2q]
+%else
+ mov r0, r1m ; XXX r1m here is actually r0m of the calling func
+ mov r0, [r0]
+ add r0, dword [r1+r5*4]
+%endif
+ DC_ADD_MMXEXT_OP movh, r0, r3, r6
+.skipblock:
+ inc r5
+ add r2, 32
+ test r5, 3
+ jnz .nextblock
+ rep ret
+
+INIT_MMX mmxext
+; ff_h264_idct_add8_mmxext(uint8_t **dest, const int *block_offset,
+; int16_t *block, int stride, const uint8_t nnzc[6*8])
+cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+ mov r5, 16
+ add r2, 512
+%if ARCH_X86_64
+ mov dst2q, r0
+%endif
+%ifdef PIC
+ lea picregq, [scan8_mem]
+%endif
+ call h264_idct_add8_mmxext_plane
+ mov r5, 32
+ add r2, 384
+%if ARCH_X86_64
+ add dst2q, gprsize
+%else
+ add r0mp, gprsize
+%endif
+ call h264_idct_add8_mmxext_plane
+ RET
+
+; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
+h264_idct_dc_add8_mmxext:
+ movd m0, [r2 ] ; 0 0 X D
+ mov word [r2+ 0], 0
+ punpcklwd m0, [r2+32] ; x X d D
+ mov word [r2+32], 0
+ paddsw m0, [pw_32]
+ psraw m0, 6
+ punpcklwd m0, m0 ; d d D D
+ pxor m1, m1 ; 0 0 0 0
+ psubw m1, m0 ; -d-d-D-D
+ packuswb m0, m1 ; -d-d-D-D d d D D
+ pshufw m1, m0, 0xFA ; -d-d-d-d-D-D-D-D
+ punpcklwd m0, m0 ; d d d d D D D D
+ lea r6, [r3*3]
+ DC_ADD_MMXEXT_OP movq, r0, r3, r6
+ ret
+
+ALIGN 16
+INIT_XMM sse2
+; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
+h264_add8x4_idct_sse2:
+ movq m0, [r2+ 0]
+ movq m1, [r2+ 8]
+ movq m2, [r2+16]
+ movq m3, [r2+24]
+ movhps m0, [r2+32]
+ movhps m1, [r2+40]
+ movhps m2, [r2+48]
+ movhps m3, [r2+56]
+ IDCT4_1D w,0,1,2,3,4,5
+ TRANSPOSE2x4x4W 0,1,2,3,4
+ paddw m0, [pw_32]
+ IDCT4_1D w,0,1,2,3,4,5
+ pxor m7, m7
+ mova [r2+ 0], m7
+ mova [r2+16], m7
+ mova [r2+32], m7
+ mova [r2+48], m7
+ STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
+ lea r0, [r0+r3*2]
+ STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
+ ret
+
+%macro add16_sse2_cycle 2
+ movzx r0, word [r4+%2]
+ test r0, r0
+ jz .cycle%1end
+ mov r0d, dword [r1+%1*8]
+%if ARCH_X86_64
+ add r0, r5
+%else
+ add r0, r0m
+%endif
+ call h264_add8x4_idct_sse2
+.cycle%1end:
+%if %1 < 7
+ add r2, 64
+%endif
+%endmacro
+
+; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
+; int16_t *block, int stride, const uint8_t nnzc[6*8])
+cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
+%if ARCH_X86_64
+ mov r5, r0
+%endif
+ ; unrolling of the loop leads to an average performance gain of
+ ; 20-25%
+ add16_sse2_cycle 0, 0xc
+ add16_sse2_cycle 1, 0x14
+ add16_sse2_cycle 2, 0xe
+ add16_sse2_cycle 3, 0x16
+ add16_sse2_cycle 4, 0x1c
+ add16_sse2_cycle 5, 0x24
+ add16_sse2_cycle 6, 0x1e
+ add16_sse2_cycle 7, 0x26
+ RET
+
+%macro add16intra_sse2_cycle 2
+ movzx r0, word [r4+%2]
+ test r0, r0
+ jz .try%1dc
+ mov r0d, dword [r1+%1*8]
+%if ARCH_X86_64
+ add r0, r7
+%else
+ add r0, r0m
+%endif
+ call h264_add8x4_idct_sse2
+ jmp .cycle%1end
+.try%1dc:
+ movsx r0, word [r2 ]
+ or r0w, word [r2+32]
+ jz .cycle%1end
+ mov r0d, dword [r1+%1*8]
+%if ARCH_X86_64
+ add r0, r7
+%else
+ add r0, r0m
+%endif
+ call h264_idct_dc_add8_mmxext
+.cycle%1end:
+%if %1 < 7
+ add r2, 64
+%endif
+%endmacro
+
+; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
+; int16_t *block, int stride, const uint8_t nnzc[6*8])
+cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
+%if ARCH_X86_64
+ mov r7, r0
+%endif
+ add16intra_sse2_cycle 0, 0xc
+ add16intra_sse2_cycle 1, 0x14
+ add16intra_sse2_cycle 2, 0xe
+ add16intra_sse2_cycle 3, 0x16
+ add16intra_sse2_cycle 4, 0x1c
+ add16intra_sse2_cycle 5, 0x24
+ add16intra_sse2_cycle 6, 0x1e
+ add16intra_sse2_cycle 7, 0x26
+ RET
+
+%macro add8_sse2_cycle 2
+ movzx r0, word [r4+%2]
+ test r0, r0
+ jz .try%1dc
+%if ARCH_X86_64
+ mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
+ add r0, [r7]
+%else
+ mov r0, r0m
+ mov r0, [r0]
+ add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
+%endif
+ call h264_add8x4_idct_sse2
+ jmp .cycle%1end
+.try%1dc:
+ movsx r0, word [r2 ]
+ or r0w, word [r2+32]
+ jz .cycle%1end
+%if ARCH_X86_64
+ mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
+ add r0, [r7]
+%else
+ mov r0, r0m
+ mov r0, [r0]
+ add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
+%endif
+ call h264_idct_dc_add8_mmxext
+.cycle%1end:
+%if %1 == 1
+ add r2, 384+64
+%elif %1 < 3
+ add r2, 64
+%endif
+%endmacro
+
+; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
+; int16_t *block, int stride, const uint8_t nnzc[6*8])
+cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
+ add r2, 512
+%if ARCH_X86_64
+ mov r7, r0
+%endif
+ add8_sse2_cycle 0, 0x34
+ add8_sse2_cycle 1, 0x3c
+%if ARCH_X86_64
+ add r7, gprsize
+%else
+ add r0mp, gprsize
+%endif
+ add8_sse2_cycle 2, 0x5c
+ add8_sse2_cycle 3, 0x64
+ RET
+
+;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)
+
+%macro WALSH4_1D 5
+ SUMSUB_BADC w, %4, %3, %2, %1, %5
+ SUMSUB_BADC w, %4, %2, %3, %1, %5
+ SWAP %1, %4, %3
+%endmacro
+
+%macro DEQUANT_MMX 3
+ mova m7, [pw_1]
+ mova m4, %1
+ punpcklwd %1, m7
+ punpckhwd m4, m7
+ mova m5, %2
+ punpcklwd %2, m7
+ punpckhwd m5, m7
+ movd m7, t3d
+ punpckldq m7, m7
+ pmaddwd %1, m7
+ pmaddwd %2, m7
+ pmaddwd m4, m7
+ pmaddwd m5, m7
+ psrad %1, %3
+ psrad %2, %3
+ psrad m4, %3
+ psrad m5, %3
+ packssdw %1, m4
+ packssdw %2, m5
+%endmacro
+
+%macro STORE_WORDS 5-9
+%if cpuflag(sse)
+ movd t0d, %1
+ psrldq %1, 4
+ movd t1d, %1
+ psrldq %1, 4
+ mov [t2+%2*32], t0w
+ mov [t2+%4*32], t1w
+ shr t0d, 16
+ shr t1d, 16
+ mov [t2+%3*32], t0w
+ mov [t2+%5*32], t1w
+ movd t0d, %1
+ psrldq %1, 4
+ movd t1d, %1
+ mov [t2+%6*32], t0w
+ mov [t2+%8*32], t1w
+ shr t0d, 16
+ shr t1d, 16
+ mov [t2+%7*32], t0w
+ mov [t2+%9*32], t1w
+%else
+ movd t0d, %1
+ psrlq %1, 32
+ movd t1d, %1
+ mov [t2+%2*32], t0w
+ mov [t2+%4*32], t1w
+ shr t0d, 16
+ shr t1d, 16
+ mov [t2+%3*32], t0w
+ mov [t2+%5*32], t1w
+%endif
+%endmacro
+
+%macro DEQUANT_STORE 1
+%if cpuflag(sse2)
+ movd xmm4, t3d
+ movq xmm5, [pw_1]
+ pshufd xmm4, xmm4, 0
+ movq2dq xmm0, m0
+ movq2dq xmm1, m1
+ movq2dq xmm2, m2
+ movq2dq xmm3, m3
+ punpcklwd xmm0, xmm5
+ punpcklwd xmm1, xmm5
+ punpcklwd xmm2, xmm5
+ punpcklwd xmm3, xmm5
+ pmaddwd xmm0, xmm4
+ pmaddwd xmm1, xmm4
+ pmaddwd xmm2, xmm4
+ pmaddwd xmm3, xmm4
+ psrad xmm0, %1
+ psrad xmm1, %1
+ psrad xmm2, %1
+ psrad xmm3, %1
+ packssdw xmm0, xmm1
+ packssdw xmm2, xmm3
+ STORE_WORDS xmm0, 0, 1, 4, 5, 2, 3, 6, 7
+ STORE_WORDS xmm2, 8, 9, 12, 13, 10, 11, 14, 15
+%else
+ DEQUANT_MMX m0, m1, %1
+ STORE_WORDS m0, 0, 1, 4, 5
+ STORE_WORDS m1, 2, 3, 6, 7
+
+ DEQUANT_MMX m2, m3, %1
+ STORE_WORDS m2, 8, 9, 12, 13
+ STORE_WORDS m3, 10, 11, 14, 15
+%endif
+%endmacro
+
+%macro IDCT_DC_DEQUANT 1
+cglobal h264_luma_dc_dequant_idct, 3, 4, %1
+ ; manually spill XMM registers for Win64 because
+ ; the code here is initialized with INIT_MMX
+ WIN64_SPILL_XMM %1
+ movq m3, [r1+24]
+ movq m2, [r1+16]
+ movq m1, [r1+ 8]
+ movq m0, [r1+ 0]
+ WALSH4_1D 0,1,2,3,4
+ TRANSPOSE4x4W 0,1,2,3,4
+ WALSH4_1D 0,1,2,3,4
+
+; shift, tmp, output, qmul
+%if WIN64
+ DECLARE_REG_TMP 0,3,1,2
+ ; we can't avoid this, because r0 is the shift register (ecx) on win64
+ xchg r0, t2
+%elif ARCH_X86_64
+ DECLARE_REG_TMP 3,1,0,2
+%else
+ DECLARE_REG_TMP 1,3,0,2
+%endif
+
+ cmp t3d, 32767
+ jg .big_qmul
+ add t3d, 128 << 16
+ DEQUANT_STORE 8
+ RET
+.big_qmul:
+ bsr t0d, t3d
+ add t3d, 128 << 16
+ mov t1d, 7
+ cmp t0d, t1d
+ cmovg t0d, t1d
+ inc t1d
+ shr t3d, t0b
+ sub t1d, t0d
+%if cpuflag(sse2)
+ movd xmm6, t1d
+ DEQUANT_STORE xmm6
+%else
+ movd m6, t1d
+ DEQUANT_STORE m6
+%endif
+ RET
+%endmacro
+
+INIT_MMX mmx
+IDCT_DC_DEQUANT 0
+INIT_MMX sse2
+IDCT_DC_DEQUANT 7
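h264_luma_dc_dequant_idct applies a 4x4 Walsh-Hadamard transform to the 16 luma DC coefficients and then scales each result by qmul with rounding, (coef*qmul+128)>>8; the .big_qmul path pre-shifts qmul and shrinks the shift so the products stay in range. A scalar sketch of the butterfly and the scaling; the scatter of the 16 results into every 16th position of the output block array is omitted, and the butterfly's output ordering is illustrative rather than the exact register permutation used above:

    #include <stdint.h>

    /* 4-point Walsh-Hadamard butterfly, as in WALSH4_1D (output order illustrative). */
    static void walsh4_1d(int v[4])
    {
        int z0 = v[0] + v[2], z1 = v[0] - v[2];
        int z2 = v[1] - v[3], z3 = v[1] + v[3];
        v[0] = z0 + z3; v[1] = z1 + z2; v[2] = z1 - z2; v[3] = z0 - z3;
    }

    /* Per-coefficient dequantization applied after the 2-D transform. */
    static int16_t dequant_dc(int coef, int qmul)
    {
        return (int16_t)((coef * qmul + 128) >> 8);
    }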
diff --git a/ffmpeg/libavcodec/x86/h264_idct_10bit.asm b/ffmpeg/libavcodec/x86/h264_idct_10bit.asm
new file mode 100644
index 0000000..88fdb84
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/h264_idct_10bit.asm
@@ -0,0 +1,589 @@
+;*****************************************************************************
+;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
+;*****************************************************************************
+;* Copyright (C) 2005-2011 x264 project
+;*
+;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_pixel_max: times 8 dw ((1 << 10)-1)
+pd_32: times 4 dd 32
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; void h264_idct_add(pixel *dst, dctcoef *block, int stride)
+;-----------------------------------------------------------------------------
+%macro STORE_DIFFx2 6
+ psrad %1, 6
+ psrad %2, 6
+ packssdw %1, %2
+ movq %3, [%5]
+ movhps %3, [%5+%6]
+ paddsw %1, %3
+ CLIPW %1, %4, [pw_pixel_max]
+ movq [%5], %1
+ movhps [%5+%6], %1
+%endmacro
+
+%macro STORE_DIFF16 5
+ psrad %1, 6
+ psrad %2, 6
+ packssdw %1, %2
+ paddsw %1, [%5]
+ CLIPW %1, %3, %4
+ mova [%5], %1
+%endmacro
+
+;dst, in, stride
+%macro IDCT4_ADD_10 3
+ mova m0, [%2+ 0]
+ mova m1, [%2+16]
+ mova m2, [%2+32]
+ mova m3, [%2+48]
+ IDCT4_1D d,0,1,2,3,4,5
+ TRANSPOSE4x4D 0,1,2,3,4
+ paddd m0, [pd_32]
+ IDCT4_1D d,0,1,2,3,4,5
+ pxor m5, m5
+ mova [%2+ 0], m5
+ mova [%2+16], m5
+ mova [%2+32], m5
+ mova [%2+48], m5
+ STORE_DIFFx2 m0, m1, m4, m5, %1, %3
+ lea %1, [%1+%3*2]
+ STORE_DIFFx2 m2, m3, m4, m5, %1, %3
+%endmacro
+
+%macro IDCT_ADD_10 0
+cglobal h264_idct_add_10, 3,3
+ IDCT4_ADD_10 r0, r1, r2
+ RET
+%endmacro
+
+INIT_XMM sse2
+IDCT_ADD_10
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT_ADD_10
+%endif
+
+;-----------------------------------------------------------------------------
+; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
+;-----------------------------------------------------------------------------
+;;;;;;; NO FATE SAMPLES TRIGGER THIS
+%macro ADD4x4IDCT 0
+add4x4_idct %+ SUFFIX:
+ add r5, r0
+ mova m0, [r2+ 0]
+ mova m1, [r2+16]
+ mova m2, [r2+32]
+ mova m3, [r2+48]
+ IDCT4_1D d,0,1,2,3,4,5
+ TRANSPOSE4x4D 0,1,2,3,4
+ paddd m0, [pd_32]
+ IDCT4_1D d,0,1,2,3,4,5
+ pxor m5, m5
+ mova [r2+ 0], m5
+ mova [r2+16], m5
+ mova [r2+32], m5
+ mova [r2+48], m5
+ STORE_DIFFx2 m0, m1, m4, m5, r5, r3
+ lea r5, [r5+r3*2]
+ STORE_DIFFx2 m2, m3, m4, m5, r5, r3
+ ret
+%endmacro
+
+INIT_XMM sse2
+ALIGN 16
+ADD4x4IDCT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+ALIGN 16
+ADD4x4IDCT
+%endif
+
+%macro ADD16_OP 2
+ cmp byte [r4+%2], 0
+ jz .skipblock%1
+ mov r5d, [r1+%1*4]
+ call add4x4_idct %+ SUFFIX
+.skipblock%1:
+%if %1<15
+ add r2, 64
+%endif
+%endmacro
+
+%macro IDCT_ADD16_10 0
+cglobal h264_idct_add16_10, 5,6
+ ADD16_OP 0, 4+1*8
+ ADD16_OP 1, 5+1*8
+ ADD16_OP 2, 4+2*8
+ ADD16_OP 3, 5+2*8
+ ADD16_OP 4, 6+1*8
+ ADD16_OP 5, 7+1*8
+ ADD16_OP 6, 6+2*8
+ ADD16_OP 7, 7+2*8
+ ADD16_OP 8, 4+3*8
+ ADD16_OP 9, 5+3*8
+ ADD16_OP 10, 4+4*8
+ ADD16_OP 11, 5+4*8
+ ADD16_OP 12, 6+3*8
+ ADD16_OP 13, 7+3*8
+ ADD16_OP 14, 6+4*8
+ ADD16_OP 15, 7+4*8
+ REP_RET
+%endmacro
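ADD16_OP is one unrolled step of the standard nnzc-gated dispatch: block i is transformed only when its non-zero count (looked up through the scan8 positions baked into the %2 arguments) is set, and the coefficient pointer advances 64 bytes, i.e. 16 32-bit coefficients, per block. In C the pattern is roughly as follows (names and the kernel pointer are illustrative):

    #include <stdint.h>

    /* idct4_add_10 stands for the per-block kernel (add4x4_idct above). */
    static void idct_add16_10_sketch(uint16_t *dst, const int *block_offset,
                                     int32_t *block, int stride,
                                     const uint8_t *nnzc, const uint8_t *scan8,
                                     void (*idct4_add_10)(uint16_t *, int32_t *, int))
    {
        for (int i = 0; i < 16; i++)
            if (nnzc[scan8[i]])                      /* skip all-zero 4x4 blocks */
                idct4_add_10(dst + block_offset[i], block + 16*i, stride);
    }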
+
+INIT_XMM sse2
+IDCT_ADD16_10
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT_ADD16_10
+%endif
+
+;-----------------------------------------------------------------------------
+; void h264_idct_dc_add(pixel *dst, dctcoef *block, int stride)
+;-----------------------------------------------------------------------------
+%macro IDCT_DC_ADD_OP_10 3
+ pxor m5, m5
+%if avx_enabled
+ paddw m1, m0, [%1+0 ]
+ paddw m2, m0, [%1+%2 ]
+ paddw m3, m0, [%1+%2*2]
+ paddw m4, m0, [%1+%3 ]
+%else
+ mova m1, [%1+0 ]
+ mova m2, [%1+%2 ]
+ mova m3, [%1+%2*2]
+ mova m4, [%1+%3 ]
+ paddw m1, m0
+ paddw m2, m0
+ paddw m3, m0
+ paddw m4, m0
+%endif
+ CLIPW m1, m5, m6
+ CLIPW m2, m5, m6
+ CLIPW m3, m5, m6
+ CLIPW m4, m5, m6
+ mova [%1+0 ], m1
+ mova [%1+%2 ], m2
+ mova [%1+%2*2], m3
+ mova [%1+%3 ], m4
+%endmacro
+
+INIT_MMX mmxext
+cglobal h264_idct_dc_add_10,3,3
+ movd m0, [r1]
+ mov dword [r1], 0
+ paddd m0, [pd_32]
+ psrad m0, 6
+ lea r1, [r2*3]
+ pshufw m0, m0, 0
+ mova m6, [pw_pixel_max]
+ IDCT_DC_ADD_OP_10 r0, r2, r1
+ RET
+
+;-----------------------------------------------------------------------------
+; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride)
+;-----------------------------------------------------------------------------
+%macro IDCT8_DC_ADD 0
+cglobal h264_idct8_dc_add_10,3,4,7
+ movd m0, [r1]
+ mov dword[r1], 0
+ paddd m0, [pd_32]
+ psrad m0, 6
+ lea r1, [r2*3]
+ SPLATW m0, m0, 0
+ mova m6, [pw_pixel_max]
+ IDCT_DC_ADD_OP_10 r0, r2, r1
+ lea r0, [r0+r2*4]
+ IDCT_DC_ADD_OP_10 r0, r2, r1
+ RET
+%endmacro
+
+INIT_XMM sse2
+IDCT8_DC_ADD
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT8_DC_ADD
+%endif
+
+;-----------------------------------------------------------------------------
+; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
+;-----------------------------------------------------------------------------
+%macro AC 1
+.ac%1:
+ mov r5d, [r1+(%1+0)*4]
+ call add4x4_idct %+ SUFFIX
+ mov r5d, [r1+(%1+1)*4]
+ add r2, 64
+ call add4x4_idct %+ SUFFIX
+ add r2, 64
+ jmp .skipadd%1
+%endmacro
+
+%assign last_block 16
+%macro ADD16_OP_INTRA 2
+ cmp word [r4+%2], 0
+ jnz .ac%1
+ mov r5d, [r2+ 0]
+ or r5d, [r2+64]
+ jz .skipblock%1
+ mov r5d, [r1+(%1+0)*4]
+ call idct_dc_add %+ SUFFIX
+.skipblock%1:
+%if %1<last_block-2
+ add r2, 128
+%endif
+.skipadd%1:
+%endmacro
+
+%macro IDCT_ADD16INTRA_10 0
+idct_dc_add %+ SUFFIX:
+ add r5, r0
+ movq m0, [r2+ 0]
+ movhps m0, [r2+64]
+ mov dword [r2+ 0], 0
+ mov dword [r2+64], 0
+ paddd m0, [pd_32]
+ psrad m0, 6
+ pshufhw m0, m0, 0
+ pshuflw m0, m0, 0
+ lea r6, [r3*3]
+ mova m6, [pw_pixel_max]
+ IDCT_DC_ADD_OP_10 r5, r3, r6
+ ret
+
+cglobal h264_idct_add16intra_10,5,7,8
+ ADD16_OP_INTRA 0, 4+1*8
+ ADD16_OP_INTRA 2, 4+2*8
+ ADD16_OP_INTRA 4, 6+1*8
+ ADD16_OP_INTRA 6, 6+2*8
+ ADD16_OP_INTRA 8, 4+3*8
+ ADD16_OP_INTRA 10, 4+4*8
+ ADD16_OP_INTRA 12, 6+3*8
+ ADD16_OP_INTRA 14, 6+4*8
+ REP_RET
+ AC 8
+ AC 10
+ AC 12
+ AC 14
+ AC 0
+ AC 2
+ AC 4
+ AC 6
+%endmacro
+
+INIT_XMM sse2
+IDCT_ADD16INTRA_10
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT_ADD16INTRA_10
+%endif
+
+%assign last_block 36
+;-----------------------------------------------------------------------------
+; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
+;-----------------------------------------------------------------------------
+%macro IDCT_ADD8 0
+cglobal h264_idct_add8_10,5,8,7
+%if ARCH_X86_64
+ mov r7, r0
+%endif
+ add r2, 1024
+ mov r0, [r0]
+ ADD16_OP_INTRA 16, 4+ 6*8
+ ADD16_OP_INTRA 18, 4+ 7*8
+ add r2, 1024-128*2
+%if ARCH_X86_64
+ mov r0, [r7+gprsize]
+%else
+ mov r0, r0m
+ mov r0, [r0+gprsize]
+%endif
+ ADD16_OP_INTRA 32, 4+11*8
+ ADD16_OP_INTRA 34, 4+12*8
+ REP_RET
+ AC 16
+ AC 18
+ AC 32
+ AC 34
+
+%endmacro ; IDCT_ADD8
+
+INIT_XMM sse2
+IDCT_ADD8
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT_ADD8
+%endif
+
+;-----------------------------------------------------------------------------
+; void h264_idct8_add(pixel *dst, dctcoef *block, int stride)
+;-----------------------------------------------------------------------------
+%macro IDCT8_1D 2
+ SWAP 0, 1
+ psrad m4, m5, 1
+ psrad m1, m0, 1
+ paddd m4, m5
+ paddd m1, m0
+ paddd m4, m7
+ paddd m1, m5
+ psubd m4, m0
+ paddd m1, m3
+
+ psubd m0, m3
+ psubd m5, m3
+ paddd m0, m7
+ psubd m5, m7
+ psrad m3, 1
+ psrad m7, 1
+ psubd m0, m3
+ psubd m5, m7
+
+ SWAP 1, 7
+ psrad m1, m7, 2
+ psrad m3, m4, 2
+ paddd m3, m0
+ psrad m0, 2
+ paddd m1, m5
+ psrad m5, 2
+ psubd m0, m4
+ psubd m7, m5
+
+ SWAP 5, 6
+ psrad m4, m2, 1
+ psrad m6, m5, 1
+ psubd m4, m5
+ paddd m6, m2
+
+ mova m2, %1
+ mova m5, %2
+ SUMSUB_BA d, 5, 2
+ SUMSUB_BA d, 6, 5
+ SUMSUB_BA d, 4, 2
+ SUMSUB_BA d, 7, 6
+ SUMSUB_BA d, 0, 4
+ SUMSUB_BA d, 3, 2
+ SUMSUB_BA d, 1, 5
+ SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
+%endmacro
+
+%macro IDCT8_1D_FULL 1
+ mova m7, [%1+112*2]
+ mova m6, [%1+ 96*2]
+ mova m5, [%1+ 80*2]
+ mova m3, [%1+ 48*2]
+ mova m2, [%1+ 32*2]
+ mova m1, [%1+ 16*2]
+ IDCT8_1D [%1], [%1+ 64*2]
+%endmacro
+
+; %1=int16_t *block, %2=int16_t *dstblock
+%macro IDCT8_ADD_SSE_START 2
+ IDCT8_1D_FULL %1
+%if ARCH_X86_64
+ TRANSPOSE4x4D 0,1,2,3,8
+ mova [%2 ], m0
+ TRANSPOSE4x4D 4,5,6,7,8
+ mova [%2+8*2], m4
+%else
+ mova [%1], m7
+ TRANSPOSE4x4D 0,1,2,3,7
+ mova m7, [%1]
+ mova [%2 ], m0
+ mova [%2+16*2], m1
+ mova [%2+32*2], m2
+ mova [%2+48*2], m3
+ TRANSPOSE4x4D 4,5,6,7,3
+ mova [%2+ 8*2], m4
+ mova [%2+24*2], m5
+ mova [%2+40*2], m6
+ mova [%2+56*2], m7
+%endif
+%endmacro
+
+; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
+%macro IDCT8_ADD_SSE_END 3
+ IDCT8_1D_FULL %2
+ mova [%2 ], m6
+ mova [%2+16*2], m7
+
+ pxor m7, m7
+ STORE_DIFFx2 m0, m1, m6, m7, %1, %3
+ lea %1, [%1+%3*2]
+ STORE_DIFFx2 m2, m3, m6, m7, %1, %3
+ mova m0, [%2 ]
+ mova m1, [%2+16*2]
+ lea %1, [%1+%3*2]
+ STORE_DIFFx2 m4, m5, m6, m7, %1, %3
+ lea %1, [%1+%3*2]
+ STORE_DIFFx2 m0, m1, m6, m7, %1, %3
+%endmacro
+
+%macro IDCT8_ADD 0
+cglobal h264_idct8_add_10, 3,4,16
+%if UNIX64 == 0
+ %assign pad 16-gprsize-(stack_offset&15)
+ sub rsp, pad
+ call h264_idct8_add1_10 %+ SUFFIX
+ add rsp, pad
+ RET
+%endif
+
+ALIGN 16
+; TODO: does not need to use stack
+h264_idct8_add1_10 %+ SUFFIX:
+%assign pad 256+16-gprsize
+ sub rsp, pad
+ add dword [r1], 32
+
+%if ARCH_X86_64
+ IDCT8_ADD_SSE_START r1, rsp
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+ SWAP 5, 13
+ SWAP 6, 14
+ SWAP 7, 15
+ IDCT8_ADD_SSE_START r1+16, rsp+128
+ PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
+ IDCT8_1D [rsp], [rsp+128]
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+ SWAP 4, 12
+ SWAP 5, 13
+ SWAP 6, 14
+ SWAP 7, 15
+ IDCT8_1D [rsp+16], [rsp+144]
+ psrad m8, 6
+ psrad m0, 6
+ packssdw m8, m0
+ paddsw m8, [r0]
+ pxor m0, m0
+ mova [r1+ 0], m0
+ mova [r1+ 16], m0
+ mova [r1+ 32], m0
+ mova [r1+ 48], m0
+ mova [r1+ 64], m0
+ mova [r1+ 80], m0
+ mova [r1+ 96], m0
+ mova [r1+112], m0
+ mova [r1+128], m0
+ mova [r1+144], m0
+ mova [r1+160], m0
+ mova [r1+176], m0
+ mova [r1+192], m0
+ mova [r1+208], m0
+ mova [r1+224], m0
+ mova [r1+240], m0
+ CLIPW m8, m0, [pw_pixel_max]
+ mova [r0], m8
+ mova m8, [pw_pixel_max]
+ STORE_DIFF16 m9, m1, m0, m8, r0+r2
+ lea r0, [r0+r2*2]
+ STORE_DIFF16 m10, m2, m0, m8, r0
+ STORE_DIFF16 m11, m3, m0, m8, r0+r2
+ lea r0, [r0+r2*2]
+ STORE_DIFF16 m12, m4, m0, m8, r0
+ STORE_DIFF16 m13, m5, m0, m8, r0+r2
+ lea r0, [r0+r2*2]
+ STORE_DIFF16 m14, m6, m0, m8, r0
+ STORE_DIFF16 m15, m7, m0, m8, r0+r2
+%else
+ IDCT8_ADD_SSE_START r1, rsp
+ IDCT8_ADD_SSE_START r1+16, rsp+128
+ lea r3, [r0+8]
+ IDCT8_ADD_SSE_END r0, rsp, r2
+ IDCT8_ADD_SSE_END r3, rsp+16, r2
+ mova [r1+ 0], m7
+ mova [r1+ 16], m7
+ mova [r1+ 32], m7
+ mova [r1+ 48], m7
+ mova [r1+ 64], m7
+ mova [r1+ 80], m7
+ mova [r1+ 96], m7
+ mova [r1+112], m7
+ mova [r1+128], m7
+ mova [r1+144], m7
+ mova [r1+160], m7
+ mova [r1+176], m7
+ mova [r1+192], m7
+ mova [r1+208], m7
+ mova [r1+224], m7
+ mova [r1+240], m7
+%endif ; ARCH_X86_64
+
+ add rsp, pad
+ ret
+%endmacro
+
+INIT_XMM sse2
+IDCT8_ADD
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT8_ADD
+%endif
+
+;-----------------------------------------------------------------------------
+; h264_idct8_add4(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
+;-----------------------------------------------------------------------------
+;;;;;;; NO FATE SAMPLES TRIGGER THIS
+%macro IDCT8_ADD4_OP 2
+ cmp byte [r4+%2], 0
+ jz .skipblock%1
+ mov r0d, [r6+%1*4]
+ add r0, r5
+ call h264_idct8_add1_10 %+ SUFFIX
+.skipblock%1:
+%if %1<12
+ add r1, 256
+%endif
+%endmacro
+
+%macro IDCT8_ADD4 0
+cglobal h264_idct8_add4_10, 0,7,16
+ %assign pad 16-gprsize-(stack_offset&15)
+ SUB rsp, pad
+ mov r5, r0mp
+ mov r6, r1mp
+ mov r1, r2mp
+ mov r2d, r3m
+ movifnidn r4, r4mp
+ IDCT8_ADD4_OP 0, 4+1*8
+ IDCT8_ADD4_OP 4, 6+1*8
+ IDCT8_ADD4_OP 8, 4+3*8
+ IDCT8_ADD4_OP 12, 6+3*8
+ ADD rsp, pad
+ RET
+%endmacro ; IDCT8_ADD4
+
+INIT_XMM sse2
+IDCT8_ADD4
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT8_ADD4
+%endif
diff --git a/ffmpeg/libavcodec/x86/h264_intrapred.asm b/ffmpeg/libavcodec/x86/h264_intrapred.asm
new file mode 100644
index 0000000..5c0dff4
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/h264_intrapred.asm
@@ -0,0 +1,2702 @@
+;******************************************************************************
+;* H.264 intra prediction asm optimizations
+;* Copyright (c) 2010 Jason Garrett-Glaser
+;* Copyright (c) 2010 Holger Lubitz
+;* Copyright (c) 2010 Loren Merritt
+;* Copyright (c) 2010 Ronald S. Bultje
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+tm_shuf: times 8 db 0x03, 0x80
+pw_ff00: times 8 dw 0xff00
+plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
+ db 1, 2, 3, 4, 5, 6, 7, 8
+plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
+ db 1, 2, 3, 4, 0, 0, 0, 0
+pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
+pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
+pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
+pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
+
+SECTION .text
+
+cextern pb_1
+cextern pb_3
+cextern pw_4
+cextern pw_5
+cextern pw_8
+cextern pw_16
+cextern pw_17
+cextern pw_32
+
+;-----------------------------------------------------------------------------
+; void pred16x16_vertical_8(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmx
+cglobal pred16x16_vertical_8, 2,3
+ sub r0, r1
+ mov r2, 8
+ movq mm0, [r0+0]
+ movq mm1, [r0+8]
+.loop:
+ movq [r0+r1*1+0], mm0
+ movq [r0+r1*1+8], mm1
+ movq [r0+r1*2+0], mm0
+ movq [r0+r1*2+8], mm1
+ lea r0, [r0+r1*2]
+ dec r2
+ jg .loop
+ REP_RET
+
+INIT_XMM sse
+cglobal pred16x16_vertical_8, 2,3
+ sub r0, r1
+ mov r2, 4
+ movaps xmm0, [r0]
+.loop:
+ movaps [r0+r1*1], xmm0
+ movaps [r0+r1*2], xmm0
+ lea r0, [r0+r1*2]
+ movaps [r0+r1*1], xmm0
+ movaps [r0+r1*2], xmm0
+ lea r0, [r0+r1*2]
+ dec r2
+ jg .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void pred16x16_horizontal_8(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED16x16_H 0
+cglobal pred16x16_horizontal_8, 2,3
+ mov r2, 8
+%if cpuflag(ssse3)
+ mova m2, [pb_3]
+%endif
+.loop:
+ movd m0, [r0+r1*0-4]
+ movd m1, [r0+r1*1-4]
+
+%if cpuflag(ssse3)
+ pshufb m0, m2
+ pshufb m1, m2
+%else
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+ SPLATW m0, m0, 3
+ SPLATW m1, m1, 3
+ mova [r0+r1*0+8], m0
+ mova [r0+r1*1+8], m1
+%endif
+
+ mova [r0+r1*0], m0
+ mova [r0+r1*1], m1
+ lea r0, [r0+r1*2]
+ dec r2
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+PRED16x16_H
+INIT_MMX mmxext
+PRED16x16_H
+INIT_XMM ssse3
+PRED16x16_H
+
+;-----------------------------------------------------------------------------
+; void pred16x16_dc_8(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED16x16_DC 0
+cglobal pred16x16_dc_8, 2,7
+ mov r4, r0
+ sub r0, r1
+ pxor mm0, mm0
+ pxor mm1, mm1
+ psadbw mm0, [r0+0]
+ psadbw mm1, [r0+8]
+ dec r0
+ movzx r5d, byte [r0+r1*1]
+ paddw mm0, mm1
+ movd r6d, mm0
+ lea r0, [r0+r1*2]
+%rep 7
+ movzx r2d, byte [r0+r1*0]
+ movzx r3d, byte [r0+r1*1]
+ add r5d, r2d
+ add r6d, r3d
+ lea r0, [r0+r1*2]
+%endrep
+ movzx r2d, byte [r0+r1*0]
+ add r5d, r6d
+ lea r2d, [r2+r5+16]
+ shr r2d, 5
+%if cpuflag(ssse3)
+ pxor m1, m1
+%endif
+ SPLATB_REG m0, r2, m1
+
+%if mmsize==8
+ mov r3d, 8
+.loop:
+ mova [r4+r1*0+0], m0
+ mova [r4+r1*0+8], m0
+ mova [r4+r1*1+0], m0
+ mova [r4+r1*1+8], m0
+%else
+ mov r3d, 4
+.loop:
+ mova [r4+r1*0], m0
+ mova [r4+r1*1], m0
+ lea r4, [r4+r1*2]
+ mova [r4+r1*0], m0
+ mova [r4+r1*1], m0
+%endif
+ lea r4, [r4+r1*2]
+ dec r3d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PRED16x16_DC
+INIT_XMM sse2
+PRED16x16_DC
+INIT_XMM ssse3
+PRED16x16_DC
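pred16x16_dc averages the 16 samples above and the 16 to the left with rounding: the psadbw pair sums the top row, the scalar loop accumulates the left column, and the splat fills the block. Scalar sketch (the variant above assumes both neighbours are available):

    #include <stdint.h>

    static void pred16x16_dc_sketch(uint8_t *src, int stride)
    {
        int sum = 16;                                        /* rounding term */
        for (int x = 0; x < 16; x++) sum += src[x - stride]; /* row above */
        for (int y = 0; y < 16; y++) sum += src[y*stride - 1]; /* column to the left */
        int dc = sum >> 5;
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++)
                src[y*stride + x] = (uint8_t)dc;
    }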
+
+;-----------------------------------------------------------------------------
+; void pred16x16_tm_vp8_8(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED16x16_TM 0
+cglobal pred16x16_tm_vp8_8, 2,5
+ sub r0, r1
+ pxor mm7, mm7
+ movq mm0, [r0+0]
+ movq mm2, [r0+8]
+ movq mm1, mm0
+ movq mm3, mm2
+ punpcklbw mm0, mm7
+ punpckhbw mm1, mm7
+ punpcklbw mm2, mm7
+ punpckhbw mm3, mm7
+ movzx r3d, byte [r0-1]
+ mov r4d, 16
+.loop:
+ movzx r2d, byte [r0+r1-1]
+ sub r2d, r3d
+ movd mm4, r2d
+ SPLATW mm4, mm4, 0
+ movq mm5, mm4
+ movq mm6, mm4
+ movq mm7, mm4
+ paddw mm4, mm0
+ paddw mm5, mm1
+ paddw mm6, mm2
+ paddw mm7, mm3
+ packuswb mm4, mm5
+ packuswb mm6, mm7
+ movq [r0+r1+0], mm4
+ movq [r0+r1+8], mm6
+ add r0, r1
+ dec r4d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+PRED16x16_TM
+INIT_MMX mmxext
+PRED16x16_TM
+
+INIT_XMM sse2
+cglobal pred16x16_tm_vp8_8, 2,6,6
+ sub r0, r1
+ pxor xmm2, xmm2
+ movdqa xmm0, [r0]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ movzx r4d, byte [r0-1]
+ mov r5d, 8
+.loop:
+ movzx r2d, byte [r0+r1*1-1]
+ movzx r3d, byte [r0+r1*2-1]
+ sub r2d, r4d
+ sub r3d, r4d
+ movd xmm2, r2d
+ movd xmm4, r3d
+ pshuflw xmm2, xmm2, 0
+ pshuflw xmm4, xmm4, 0
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm4, xmm4
+ movdqa xmm3, xmm2
+ movdqa xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm3, xmm1
+ paddw xmm4, xmm0
+ paddw xmm5, xmm1
+ packuswb xmm2, xmm3
+ packuswb xmm4, xmm5
+ movdqa [r0+r1*1], xmm2
+ movdqa [r0+r1*2], xmm4
+ lea r0, [r0+r1*2]
+ dec r5d
+ jg .loop
+ REP_RET
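The VP8 TM predictor propagates the gradient against the top-left sample: every output is top[x] + left[y] - topleft clipped to 8 bits, which is what the word-domain adds followed by packuswb implement. Scalar sketch:

    #include <stdint.h>

    static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    static void pred16x16_tm_sketch(uint8_t *src, int stride)
    {
        const uint8_t *top = src - stride;
        int topleft = src[-stride - 1];
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++)
                src[y*stride + x] = clip_u8(top[x] + src[y*stride - 1] - topleft);
    }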
+
+;-----------------------------------------------------------------------------
+; void pred16x16_plane_*_8(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+
+%macro H264_PRED16x16_PLANE 1
+cglobal pred16x16_plane_%1_8, 2,9,7
+ mov r2, r1 ; +stride
+ neg r1 ; -stride
+
+ movh m0, [r0+r1 -1]
+%if mmsize == 8
+ pxor m4, m4
+ movh m1, [r0+r1 +3 ]
+ movh m2, [r0+r1 +8 ]
+ movh m3, [r0+r1 +12]
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+ pmullw m0, [pw_m8tom1 ]
+ pmullw m1, [pw_m8tom1+8]
+ pmullw m2, [pw_1to8 ]
+ pmullw m3, [pw_1to8 +8]
+ paddw m0, m2
+ paddw m1, m3
+%else ; mmsize == 16
+%if cpuflag(ssse3)
+ movhps m0, [r0+r1 +8]
+ pmaddubsw m0, [plane_shuf] ; H coefficients
+%else ; sse2
+ pxor m2, m2
+ movh m1, [r0+r1 +8]
+ punpcklbw m0, m2
+ punpcklbw m1, m2
+ pmullw m0, [pw_m8tom1]
+ pmullw m1, [pw_1to8]
+ paddw m0, m1
+%endif
+ movhlps m1, m0
+%endif
+ paddw m0, m1
+%if cpuflag(mmxext)
+ PSHUFLW m1, m0, 0xE
+%elif cpuflag(mmx)
+ mova m1, m0
+ psrlq m1, 32
+%endif
+ paddw m0, m1
+%if cpuflag(mmxext)
+ PSHUFLW m1, m0, 0x1
+%elif cpuflag(mmx)
+ mova m1, m0
+ psrlq m1, 16
+%endif
+ paddw m0, m1 ; sum of H coefficients
+
+ lea r4, [r0+r2*8-1]
+ lea r3, [r0+r2*4-1]
+ add r4, r2
+
+%if ARCH_X86_64
+%define e_reg r8
+%else
+%define e_reg r0
+%endif
+
+ movzx e_reg, byte [r3+r2*2 ]
+ movzx r5, byte [r4+r1 ]
+ sub r5, e_reg
+
+ movzx e_reg, byte [r3+r2 ]
+ movzx r6, byte [r4 ]
+ sub r6, e_reg
+ lea r5, [r5+r6*2]
+
+ movzx e_reg, byte [r3+r1 ]
+ movzx r6, byte [r4+r2*2 ]
+ sub r6, e_reg
+ lea r5, [r5+r6*4]
+
+ movzx e_reg, byte [r3 ]
+%if ARCH_X86_64
+ movzx r7, byte [r4+r2 ]
+ sub r7, e_reg
+%else
+ movzx r6, byte [r4+r2 ]
+ sub r6, e_reg
+ lea r5, [r5+r6*4]
+ sub r5, r6
+%endif
+
+ lea e_reg, [r3+r1*4]
+ lea r3, [r4+r2*4]
+
+ movzx r4, byte [e_reg+r2 ]
+ movzx r6, byte [r3 ]
+ sub r6, r4
+%if ARCH_X86_64
+ lea r6, [r7+r6*2]
+ lea r5, [r5+r6*2]
+ add r5, r6
+%else
+ lea r5, [r5+r6*4]
+ lea r5, [r5+r6*2]
+%endif
+
+ movzx r4, byte [e_reg ]
+%if ARCH_X86_64
+ movzx r7, byte [r3 +r2 ]
+ sub r7, r4
+ sub r5, r7
+%else
+ movzx r6, byte [r3 +r2 ]
+ sub r6, r4
+ lea r5, [r5+r6*8]
+ sub r5, r6
+%endif
+
+ movzx r4, byte [e_reg+r1 ]
+ movzx r6, byte [r3 +r2*2]
+ sub r6, r4
+%if ARCH_X86_64
+ add r6, r7
+%endif
+ lea r5, [r5+r6*8]
+
+ movzx r4, byte [e_reg+r2*2]
+ movzx r6, byte [r3 +r1 ]
+ sub r6, r4
+ lea r5, [r5+r6*4]
+ add r5, r6 ; sum of V coefficients
+
+%if ARCH_X86_64 == 0
+ mov r0, r0m
+%endif
+
+%ifidn %1, h264
+ lea r5, [r5*5+32]
+ sar r5, 6
+%elifidn %1, rv40
+ lea r5, [r5*5]
+ sar r5, 6
+%elifidn %1, svq3
+ test r5, r5
+ lea r6, [r5+3]
+ cmovs r5, r6
+ sar r5, 2 ; V/4
+ lea r5, [r5*5] ; 5*(V/4)
+ test r5, r5
+ lea r6, [r5+15]
+ cmovs r5, r6
+ sar r5, 4 ; (5*(V/4))/16
+%endif
+
+ movzx r4, byte [r0+r1 +15]
+ movzx r3, byte [r3+r2*2 ]
+ lea r3, [r3+r4+1]
+ shl r3, 4
+
+ movd r1d, m0
+ movsx r1d, r1w
+%ifnidn %1, svq3
+%ifidn %1, h264
+ lea r1d, [r1d*5+32]
+%else ; rv40
+ lea r1d, [r1d*5]
+%endif
+ sar r1d, 6
+%else ; svq3
+ test r1d, r1d
+ lea r4d, [r1d+3]
+ cmovs r1d, r4d
+ sar r1d, 2 ; H/4
+ lea r1d, [r1d*5] ; 5*(H/4)
+ test r1d, r1d
+ lea r4d, [r1d+15]
+ cmovs r1d, r4d
+ sar r1d, 4 ; (5*(H/4))/16
+%endif
+ movd m0, r1d
+
+ add r1d, r5d
+ add r3d, r1d
+ shl r1d, 3
+ sub r3d, r1d ; a
+
+ movd m1, r5d
+ movd m3, r3d
+ SPLATW m0, m0, 0 ; H
+ SPLATW m1, m1, 0 ; V
+ SPLATW m3, m3, 0 ; a
+%ifidn %1, svq3
+ SWAP 0, 1
+%endif
+ mova m2, m0
+%if mmsize == 8
+ mova m5, m0
+%endif
+ pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
+%if mmsize == 16
+ psllw m2, 3
+%else
+ psllw m5, 3
+ psllw m2, 2
+ mova m6, m5
+ paddw m6, m2
+%endif
+ paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
+ paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
+%if mmsize == 8
+ paddw m5, m0 ; a + {8,9,10,11}*H
+ paddw m6, m0 ; a + {12,13,14,15}*H
+%endif
+
+ mov r4, 8
+.loop:
+ mova m3, m0 ; b[0..7]
+ mova m4, m2 ; b[8..15]
+ psraw m3, 5
+ psraw m4, 5
+ packuswb m3, m4
+ mova [r0], m3
+%if mmsize == 8
+ mova m3, m5 ; b[8..11]
+ mova m4, m6 ; b[12..15]
+ psraw m3, 5
+ psraw m4, 5
+ packuswb m3, m4
+ mova [r0+8], m3
+%endif
+ paddw m0, m1
+ paddw m2, m1
+%if mmsize == 8
+ paddw m5, m1
+ paddw m6, m1
+%endif
+
+ mova m3, m0 ; b[0..7]
+ mova m4, m2 ; b[8..15]
+ psraw m3, 5
+ psraw m4, 5
+ packuswb m3, m4
+ mova [r0+r2], m3
+%if mmsize == 8
+ mova m3, m5 ; b[8..11]
+ mova m4, m6 ; b[12..15]
+ psraw m3, 5
+ psraw m4, 5
+ packuswb m3, m4
+ mova [r0+r2+8], m3
+%endif
+ paddw m0, m1
+ paddw m2, m1
+%if mmsize == 8
+ paddw m5, m1
+ paddw m6, m1
+%endif
+
+ lea r0, [r0+r2*2]
+ dec r4
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+H264_PRED16x16_PLANE h264
+H264_PRED16x16_PLANE rv40
+H264_PRED16x16_PLANE svq3
+INIT_MMX mmxext
+H264_PRED16x16_PLANE h264
+H264_PRED16x16_PLANE rv40
+H264_PRED16x16_PLANE svq3
+INIT_XMM sse2
+H264_PRED16x16_PLANE h264
+H264_PRED16x16_PLANE rv40
+H264_PRED16x16_PLANE svq3
+INIT_XMM ssse3
+H264_PRED16x16_PLANE h264
+H264_PRED16x16_PLANE rv40
+H264_PRED16x16_PLANE svq3
+
+;-----------------------------------------------------------------------------
+; void pred8x8_plane_8(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+
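+; 8x8 analogue of the plane predictor above; here the gradients are scaled as
+; b = (17*H+16)>>5 and c = (17*V+16)>>5 (hence the imul by 17 and the
+; r6*9 + r6*8 lea pair below) before the same a + x*b + y*c fill loop.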
+%macro H264_PRED8x8_PLANE 0
+cglobal pred8x8_plane_8, 2,9,7
+ mov r2, r1 ; +stride
+ neg r1 ; -stride
+
+ movd m0, [r0+r1 -1]
+%if mmsize == 8
+ pxor m2, m2
+ movh m1, [r0+r1 +4 ]
+ punpcklbw m0, m2
+ punpcklbw m1, m2
+ pmullw m0, [pw_m4to4]
+ pmullw m1, [pw_m4to4+8]
+%else ; mmsize == 16
+%if cpuflag(ssse3)
+ movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
+ pmaddubsw m0, [plane8_shuf] ; H coefficients
+%else ; sse2
+ pxor m2, m2
+ movd m1, [r0+r1 +4]
+ punpckldq m0, m1
+ punpcklbw m0, m2
+ pmullw m0, [pw_m4to4]
+%endif
+ movhlps m1, m0
+%endif
+ paddw m0, m1
+
+%if notcpuflag(ssse3)
+%if cpuflag(mmxext)
+ PSHUFLW m1, m0, 0xE
+%elif cpuflag(mmx)
+ mova m1, m0
+ psrlq m1, 32
+%endif
+ paddw m0, m1
+%endif ; !ssse3
+
+%if cpuflag(mmxext)
+ PSHUFLW m1, m0, 0x1
+%elif cpuflag(mmx)
+ mova m1, m0
+ psrlq m1, 16
+%endif
+ paddw m0, m1 ; sum of H coefficients
+
+ lea r4, [r0+r2*4-1]
+ lea r3, [r0 -1]
+ add r4, r2
+
+%if ARCH_X86_64
+%define e_reg r8
+%else
+%define e_reg r0
+%endif
+
+ movzx e_reg, byte [r3+r2*2 ]
+ movzx r5, byte [r4+r1 ]
+ sub r5, e_reg
+
+ movzx e_reg, byte [r3 ]
+%if ARCH_X86_64
+ movzx r7, byte [r4+r2 ]
+ sub r7, e_reg
+ sub r5, r7
+%else
+ movzx r6, byte [r4+r2 ]
+ sub r6, e_reg
+ lea r5, [r5+r6*4]
+ sub r5, r6
+%endif
+
+ movzx e_reg, byte [r3+r1 ]
+ movzx r6, byte [r4+r2*2 ]
+ sub r6, e_reg
+%if ARCH_X86_64
+ add r6, r7
+%endif
+ lea r5, [r5+r6*4]
+
+ movzx e_reg, byte [r3+r2 ]
+ movzx r6, byte [r4 ]
+ sub r6, e_reg
+ lea r6, [r5+r6*2]
+
+ lea r5, [r6*9+16]
+ lea r5, [r5+r6*8]
+ sar r5, 5
+
+%if ARCH_X86_64 == 0
+ mov r0, r0m
+%endif
+
+ movzx r3, byte [r4+r2*2 ]
+ movzx r4, byte [r0+r1 +7]
+ lea r3, [r3+r4+1]
+ shl r3, 4
+ movd r1d, m0
+ movsx r1d, r1w
+ imul r1d, 17
+ add r1d, 16
+ sar r1d, 5
+ movd m0, r1d
+ add r1d, r5d
+ sub r3d, r1d
+ add r1d, r1d
+ sub r3d, r1d ; a
+
+ movd m1, r5d
+ movd m3, r3d
+ SPLATW m0, m0, 0 ; H
+ SPLATW m1, m1, 0 ; V
+ SPLATW m3, m3, 0 ; a
+%if mmsize == 8
+ mova m2, m0
+%endif
+ pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
+ paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
+%if mmsize == 8
+ psllw m2, 2
+ paddw m2, m0 ; a + {4,5,6,7}*H
+%endif
+
+ mov r4, 4
+ALIGN 16
+.loop:
+%if mmsize == 16
+ mova m3, m0 ; b[0..7]
+ paddw m0, m1
+ psraw m3, 5
+ mova m4, m0 ; V+b[0..7]
+ paddw m0, m1
+ psraw m4, 5
+ packuswb m3, m4
+ movh [r0], m3
+ movhps [r0+r2], m3
+%else ; mmsize == 8
+ mova m3, m0 ; b[0..3]
+ mova m4, m2 ; b[4..7]
+ paddw m0, m1
+ paddw m2, m1
+ psraw m3, 5
+ psraw m4, 5
+ mova m5, m0 ; V+b[0..3]
+ mova m6, m2 ; V+b[4..7]
+ paddw m0, m1
+ paddw m2, m1
+ psraw m5, 5
+ psraw m6, 5
+ packuswb m3, m4
+ packuswb m5, m6
+ mova [r0], m3
+ mova [r0+r2], m5
+%endif
+
+ lea r0, [r0+r2*2]
+ dec r4
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+H264_PRED8x8_PLANE
+INIT_MMX mmxext
+H264_PRED8x8_PLANE
+INIT_XMM sse2
+H264_PRED8x8_PLANE
+INIT_XMM ssse3
+H264_PRED8x8_PLANE
+
+;-----------------------------------------------------------------------------
+; void pred8x8_vertical_8(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmx
+cglobal pred8x8_vertical_8, 2,2
+ sub r0, r1
+ movq mm0, [r0]
+%rep 3
+ movq [r0+r1*1], mm0
+ movq [r0+r1*2], mm0
+ lea r0, [r0+r1*2]
+%endrep
+ movq [r0+r1*1], mm0
+ movq [r0+r1*2], mm0
+ RET
+
+;-----------------------------------------------------------------------------
+; void pred8x8_horizontal_8(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED8x8_H 0
+cglobal pred8x8_horizontal_8, 2,3
+ mov r2, 4
+%if cpuflag(ssse3)
+ mova m2, [pb_3]
+%endif
+.loop:
+ SPLATB_LOAD m0, r0+r1*0-1, m2
+ SPLATB_LOAD m1, r0+r1*1-1, m2
+ mova [r0+r1*0], m0
+ mova [r0+r1*1], m1
+ lea r0, [r0+r1*2]
+ dec r2
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+PRED8x8_H
+INIT_MMX mmxext
+PRED8x8_H
+INIT_MMX ssse3
+PRED8x8_H
+
+;-----------------------------------------------------------------------------
+; void pred8x8_top_dc_8_mmxext(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+INIT_MMX mmxext
+cglobal pred8x8_top_dc_8, 2,5
+ sub r0, r1
+ movq mm0, [r0]
+ pxor mm1, mm1
+ pxor mm2, mm2
+ lea r2, [r0+r1*2]
+ punpckhbw mm1, mm0
+ punpcklbw mm0, mm2
+ psadbw mm1, mm2 ; s1
+ lea r3, [r2+r1*2]
+ psadbw mm0, mm2 ; s0
+ psrlw mm1, 1
+ psrlw mm0, 1
+ pavgw mm1, mm2
+ lea r4, [r3+r1*2]
+ pavgw mm0, mm2
+ pshufw mm1, mm1, 0
+ pshufw mm0, mm0, 0 ; dc0 (w)
+ packuswb mm0, mm1 ; dc0,dc1 (b)
+ movq [r0+r1*1], mm0
+ movq [r0+r1*2], mm0
+ lea r0, [r3+r1*2]
+ movq [r2+r1*1], mm0
+ movq [r2+r1*2], mm0
+ movq [r3+r1*1], mm0
+ movq [r3+r1*2], mm0
+ movq [r0+r1*1], mm0
+ movq [r0+r1*2], mm0
+ RET
+
+;-----------------------------------------------------------------------------
+; void pred8x8_dc_8_mmxext(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred8x8_dc_8, 2,5
+ sub r0, r1
+ pxor m7, m7
+ movd m0, [r0+0]
+ movd m1, [r0+4]
+ psadbw m0, m7 ; s0
+ mov r4, r0
+ psadbw m1, m7 ; s1
+
+ movzx r2d, byte [r0+r1*1-1]
+ movzx r3d, byte [r0+r1*2-1]
+ lea r0, [r0+r1*2]
+ add r2d, r3d
+ movzx r3d, byte [r0+r1*1-1]
+ add r2d, r3d
+ movzx r3d, byte [r0+r1*2-1]
+ add r2d, r3d
+ lea r0, [r0+r1*2]
+ movd m2, r2d ; s2
+ movzx r2d, byte [r0+r1*1-1]
+ movzx r3d, byte [r0+r1*2-1]
+ lea r0, [r0+r1*2]
+ add r2d, r3d
+ movzx r3d, byte [r0+r1*1-1]
+ add r2d, r3d
+ movzx r3d, byte [r0+r1*2-1]
+ add r2d, r3d
+ movd m3, r2d ; s3
+
+ punpcklwd m0, m1
+ mov r0, r4
+ punpcklwd m2, m3
+ punpckldq m0, m2 ; s0, s1, s2, s3
+ pshufw m3, m0, 11110110b ; s2, s1, s3, s3
+ lea r2, [r0+r1*2]
+ pshufw m0, m0, 01110100b ; s0, s1, s3, s1
+ paddw m0, m3
+ lea r3, [r2+r1*2]
+ psrlw m0, 2
+ pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
+ lea r4, [r3+r1*2]
+ packuswb m0, m0
+ punpcklbw m0, m0
+ movq m1, m0
+ punpcklbw m0, m0
+ punpckhbw m1, m1
+ movq [r0+r1*1], m0
+ movq [r0+r1*2], m0
+ movq [r2+r1*1], m0
+ movq [r2+r1*2], m0
+ movq [r3+r1*1], m1
+ movq [r3+r1*2], m1
+ movq [r4+r1*1], m1
+ movq [r4+r1*2], m1
+ RET
+
+;-----------------------------------------------------------------------------
+; void pred8x8_dc_rv40_8(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred8x8_dc_rv40_8, 2,7
+ mov r4, r0
+ sub r0, r1
+ pxor mm0, mm0
+ psadbw mm0, [r0]
+ dec r0
+ movzx r5d, byte [r0+r1*1]
+ movd r6d, mm0
+ lea r0, [r0+r1*2]
+%rep 3
+ movzx r2d, byte [r0+r1*0]
+ movzx r3d, byte [r0+r1*1]
+ add r5d, r2d
+ add r6d, r3d
+ lea r0, [r0+r1*2]
+%endrep
+ movzx r2d, byte [r0+r1*0]
+ add r5d, r6d
+ lea r2d, [r2+r5+8]
+ shr r2d, 4
+ movd mm0, r2d
+ punpcklbw mm0, mm0
+ pshufw mm0, mm0, 0
+ mov r3d, 4
+.loop:
+ movq [r4+r1*0], mm0
+ movq [r4+r1*1], mm0
+ lea r4, [r4+r1*2]
+ dec r3d
+ jg .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void pred8x8_tm_vp8_8(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+
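+; Same TrueMotion recurrence as pred16x16_tm_vp8 above, applied to an
+; 8-pixel-wide block (two rows per loop iteration in the MMX version).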
+%macro PRED8x8_TM 0
+cglobal pred8x8_tm_vp8_8, 2,6
+ sub r0, r1
+ pxor mm7, mm7
+ movq mm0, [r0]
+ movq mm1, mm0
+ punpcklbw mm0, mm7
+ punpckhbw mm1, mm7
+ movzx r4d, byte [r0-1]
+ mov r5d, 4
+.loop:
+ movzx r2d, byte [r0+r1*1-1]
+ movzx r3d, byte [r0+r1*2-1]
+ sub r2d, r4d
+ sub r3d, r4d
+ movd mm2, r2d
+ movd mm4, r3d
+ SPLATW mm2, mm2, 0
+ SPLATW mm4, mm4, 0
+ movq mm3, mm2
+ movq mm5, mm4
+ paddw mm2, mm0
+ paddw mm3, mm1
+ paddw mm4, mm0
+ paddw mm5, mm1
+ packuswb mm2, mm3
+ packuswb mm4, mm5
+ movq [r0+r1*1], mm2
+ movq [r0+r1*2], mm4
+ lea r0, [r0+r1*2]
+ dec r5d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+PRED8x8_TM
+INIT_MMX mmxext
+PRED8x8_TM
+
+INIT_XMM sse2
+cglobal pred8x8_tm_vp8_8, 2,6,4
+ sub r0, r1
+ pxor xmm1, xmm1
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm1
+ movzx r4d, byte [r0-1]
+ mov r5d, 4
+.loop:
+ movzx r2d, byte [r0+r1*1-1]
+ movzx r3d, byte [r0+r1*2-1]
+ sub r2d, r4d
+ sub r3d, r4d
+ movd xmm2, r2d
+ movd xmm3, r3d
+ pshuflw xmm2, xmm2, 0
+ pshuflw xmm3, xmm3, 0
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
+ paddw xmm2, xmm0
+ paddw xmm3, xmm0
+ packuswb xmm2, xmm3
+ movq [r0+r1*1], xmm2
+ movhps [r0+r1*2], xmm2
+ lea r0, [r0+r1*2]
+ dec r5d
+ jg .loop
+ REP_RET
+
+INIT_XMM ssse3
+cglobal pred8x8_tm_vp8_8, 2,3,6
+ sub r0, r1
+ movdqa xmm4, [tm_shuf]
+ pxor xmm1, xmm1
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm1
+ movd xmm5, [r0-4]
+ pshufb xmm5, xmm4
+ mov r2d, 4
+.loop:
+ movd xmm2, [r0+r1*1-4]
+ movd xmm3, [r0+r1*2-4]
+ pshufb xmm2, xmm4
+ pshufb xmm3, xmm4
+ psubw xmm2, xmm5
+ psubw xmm3, xmm5
+ paddw xmm2, xmm0
+ paddw xmm3, xmm0
+ packuswb xmm2, xmm3
+ movq [r0+r1*1], xmm2
+ movhps [r0+r1*2], xmm2
+ lea r0, [r0+r1*2]
+ dec r2d
+ jg .loop
+ REP_RET
+
+; dest, left, right, src, tmp
+; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
+%macro PRED4x4_LOWPASS 5
+ mova %5, %2
+ pavgb %2, %3
+ pxor %3, %5
+ mova %1, %4
+ pand %3, [pb_1]
+ psubusb %2, %3
+ pavgb %1, %2
+%endmacro
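+; Worked example of the macro above: a direct (l + 2*c + r + 2) >> 2 would
+; overflow unsigned bytes, so it uses the pavgb trick: pavgb(l,r) = (l+r+1)>>1,
+; subtracting the rounding bit ((l^r)&1) turns that into (l+r)>>1, and a final
+; pavgb against c gives exactly (l + 2*c + r + 2) >> 2.  E.g. l=1, c=255, r=2:
+; pavgb(1,2)=2, minus (1^2)&1 gives 1, pavgb(255,1)=128 == (1+510+2+2)>>2.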
+
+;-----------------------------------------------------------------------------
+; void pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
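+; The pred8x8l_* modes below first run the top row (and, where needed, the
+; left column) through PRED4x4_LOWPASS.  When has_topleft/has_topright are
+; zero, the .fix_lt_*/.fix_tr_* branches patch the missing corner byte with a
+; copy of the nearest available edge sample before filtering.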
+%macro PRED8x8L_TOP_DC 0
+cglobal pred8x8l_top_dc_8, 4,4
+ sub r0, r3
+ pxor mm7, mm7
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1, r1 ; top_left
+ jz .fix_lt_2
+ test r2, r2 ; top_right
+ jz .fix_tr_1
+ jmp .body
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2, r2 ; top_right
+ jnz .body
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+.body:
+ PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
+ psadbw mm7, mm0
+ paddw mm7, [pw_4]
+ psrlw mm7, 3
+ pshufw mm7, mm7, 0
+ packuswb mm7, mm7
+%rep 3
+ movq [r0+r3*1], mm7
+ movq [r0+r3*2], mm7
+ lea r0, [r0+r3*2]
+%endrep
+ movq [r0+r3*1], mm7
+ movq [r0+r3*2], mm7
+ RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8L_TOP_DC
+INIT_MMX ssse3
+PRED8x8L_TOP_DC
+
+;-----------------------------------------------------------------------------
+; void pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED8x8L_DC 0
+cglobal pred8x8l_dc_8, 4,5
+ sub r0, r3
+ lea r4, [r0+r3*2]
+ movq mm0, [r0+r3*1-8]
+ punpckhbw mm0, [r0+r3*0-8]
+ movq mm1, [r4+r3*1-8]
+ punpckhbw mm1, [r0+r3*2-8]
+ mov r4, r0
+ punpckhwd mm1, mm0
+ lea r0, [r0+r3*4]
+ movq mm2, [r0+r3*1-8]
+ punpckhbw mm2, [r0+r3*0-8]
+ lea r0, [r0+r3*2]
+ movq mm3, [r0+r3*1-8]
+ punpckhbw mm3, [r0+r3*0-8]
+ punpckhwd mm3, mm2
+ punpckhdq mm3, mm1
+ lea r0, [r0+r3*2]
+ movq mm0, [r0+r3*0-8]
+ movq mm1, [r4]
+ mov r0, r4
+ movq mm4, mm3
+ movq mm2, mm3
+ PALIGNR mm4, mm0, 7, mm0
+ PALIGNR mm1, mm2, 1, mm2
+ test r1, r1
+ jnz .do_left
+.fix_lt_1:
+ movq mm5, mm3
+ pxor mm5, mm4
+ psrlq mm5, 56
+ psllq mm5, 48
+ pxor mm1, mm5
+ jmp .do_left
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2, r2
+ jnz .body
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .body
+.do_left:
+ movq mm0, mm4
+ PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+ movq mm4, mm0
+ movq mm7, mm2
+ PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
+ psllq mm1, 56
+ PALIGNR mm7, mm1, 7, mm3
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1, r1
+ jz .fix_lt_2
+ test r2, r2
+ jz .fix_tr_1
+.body:
+ lea r1, [r0+r3*2]
+ PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
+ pxor mm0, mm0
+ pxor mm1, mm1
+ lea r2, [r1+r3*2]
+ psadbw mm0, mm7
+ psadbw mm1, mm6
+ paddw mm0, [pw_8]
+ paddw mm0, mm1
+ lea r4, [r2+r3*2]
+ psrlw mm0, 4
+ pshufw mm0, mm0, 0
+ packuswb mm0, mm0
+ movq [r0+r3*1], mm0
+ movq [r0+r3*2], mm0
+ movq [r1+r3*1], mm0
+ movq [r1+r3*2], mm0
+ movq [r2+r3*1], mm0
+ movq [r2+r3*2], mm0
+ movq [r4+r3*1], mm0
+ movq [r4+r3*2], mm0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8L_DC
+INIT_MMX ssse3
+PRED8x8L_DC
+
+;-----------------------------------------------------------------------------
+; void pred8x8l_horizontal_8(uint8_t *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED8x8L_HORIZONTAL 0
+cglobal pred8x8l_horizontal_8, 4,4
+ sub r0, r3
+ lea r2, [r0+r3*2]
+ movq mm0, [r0+r3*1-8]
+ test r1, r1
+ lea r1, [r0+r3]
+ cmovnz r1, r0
+ punpckhbw mm0, [r1+r3*0-8]
+ movq mm1, [r2+r3*1-8]
+ punpckhbw mm1, [r0+r3*2-8]
+ mov r2, r0
+ punpckhwd mm1, mm0
+ lea r0, [r0+r3*4]
+ movq mm2, [r0+r3*1-8]
+ punpckhbw mm2, [r0+r3*0-8]
+ lea r0, [r0+r3*2]
+ movq mm3, [r0+r3*1-8]
+ punpckhbw mm3, [r0+r3*0-8]
+ punpckhwd mm3, mm2
+ punpckhdq mm3, mm1
+ lea r0, [r0+r3*2]
+ movq mm0, [r0+r3*0-8]
+ movq mm1, [r1+r3*0-8]
+ mov r0, r2
+ movq mm4, mm3
+ movq mm2, mm3
+ PALIGNR mm4, mm0, 7, mm0
+ PALIGNR mm1, mm2, 1, mm2
+ movq mm0, mm4
+ PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+ movq mm4, mm0
+ movq mm7, mm2
+ PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
+ psllq mm1, 56
+ PALIGNR mm7, mm1, 7, mm3
+ movq mm3, mm7
+ lea r1, [r0+r3*2]
+ movq mm7, mm3
+ punpckhbw mm3, mm3
+ punpcklbw mm7, mm7
+ pshufw mm0, mm3, 0xff
+ pshufw mm1, mm3, 0xaa
+ lea r2, [r1+r3*2]
+ pshufw mm2, mm3, 0x55
+ pshufw mm3, mm3, 0x00
+ pshufw mm4, mm7, 0xff
+ pshufw mm5, mm7, 0xaa
+ pshufw mm6, mm7, 0x55
+ pshufw mm7, mm7, 0x00
+ movq [r0+r3*1], mm0
+ movq [r0+r3*2], mm1
+ movq [r1+r3*1], mm2
+ movq [r1+r3*2], mm3
+ movq [r2+r3*1], mm4
+ movq [r2+r3*2], mm5
+ lea r0, [r2+r3*2]
+ movq [r0+r3*1], mm6
+ movq [r0+r3*2], mm7
+ RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8L_HORIZONTAL
+INIT_MMX ssse3
+PRED8x8L_HORIZONTAL
+
+;-----------------------------------------------------------------------------
+; void pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED8x8L_VERTICAL 0
+cglobal pred8x8l_vertical_8, 4,4
+ sub r0, r3
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1, r1 ; top_left
+ jz .fix_lt_2
+ test r2, r2 ; top_right
+ jz .fix_tr_1
+ jmp .body
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2, r2 ; top_right
+ jnz .body
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+.body:
+ PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
+%rep 3
+ movq [r0+r3*1], mm0
+ movq [r0+r3*2], mm0
+ lea r0, [r0+r3*2]
+%endrep
+ movq [r0+r3*1], mm0
+ movq [r0+r3*2], mm0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8L_VERTICAL
+INIT_MMX ssse3
+PRED8x8L_VERTICAL
+
+;-----------------------------------------------------------------------------
+; void pred8x8l_down_left_8(uint8_t *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred8x8l_down_left_8, 4,5
+ sub r0, r3
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1, r1
+ jz .fix_lt_2
+ test r2, r2
+ jz .fix_tr_1
+ jmp .do_top
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2, r2
+ jnz .do_top
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .do_top
+.fix_tr_2:
+ punpckhbw mm3, mm3
+ pshufw mm1, mm3, 0xFF
+ jmp .do_topright
+.do_top:
+ PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
+ movq mm7, mm4
+ test r2, r2
+ jz .fix_tr_2
+ movq mm0, [r0+8]
+ movq mm5, mm0
+ movq mm2, mm0
+ movq mm4, mm0
+ psrlq mm5, 56
+ PALIGNR mm2, mm3, 7, mm3
+ PALIGNR mm5, mm4, 1, mm4
+ PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
+.do_topright:
+ lea r1, [r0+r3*2]
+ movq mm6, mm1
+ psrlq mm1, 56
+ movq mm4, mm1
+ lea r2, [r1+r3*2]
+ movq mm2, mm6
+ PALIGNR mm2, mm7, 1, mm0
+ movq mm3, mm6
+ PALIGNR mm3, mm7, 7, mm0
+ PALIGNR mm4, mm6, 1, mm0
+ movq mm5, mm7
+ movq mm1, mm7
+ movq mm7, mm6
+ lea r4, [r2+r3*2]
+ psllq mm1, 8
+ PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
+ PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
+ movq [r4+r3*2], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+ movq [r4+r3*1], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+ movq [r2+r3*2], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+ movq [r2+r3*1], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+ movq [r1+r3*2], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+ movq [r1+r3*1], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+ movq [r0+r3*2], mm1
+ psllq mm1, 8
+ psrlq mm0, 56
+ por mm1, mm0
+ movq [r0+r3*1], mm1
+ RET
+
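+; The SSE2/SSSE3 variant below keeps the neighbour filtering in MMX registers,
+; then splices the two filtered halves into one XMM register with
+; movq2dq + pslldq/por; INIT_XMM cpuname re-targets the macros at XMM so each
+; output row is just a movq of the progressively shifted (psrldq) 16-byte edge.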
+%macro PRED8x8L_DOWN_LEFT 0
+cglobal pred8x8l_down_left_8, 4,4
+ sub r0, r3
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1, r1 ; top_left
+ jz .fix_lt_2
+ test r2, r2 ; top_right
+ jz .fix_tr_1
+ jmp .do_top
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2, r2 ; top_right
+ jnz .do_top
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .do_top
+.fix_tr_2:
+ punpckhbw mm3, mm3
+ pshufw mm1, mm3, 0xFF
+ jmp .do_topright
+.do_top:
+ PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
+ movq2dq xmm3, mm4
+ test r2, r2 ; top_right
+ jz .fix_tr_2
+ movq mm0, [r0+8]
+ movq mm5, mm0
+ movq mm2, mm0
+ movq mm4, mm0
+ psrlq mm5, 56
+ PALIGNR mm2, mm3, 7, mm3
+ PALIGNR mm5, mm4, 1, mm4
+ PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
+.do_topright:
+ movq2dq xmm4, mm1
+ psrlq mm1, 56
+ movq2dq xmm5, mm1
+ lea r1, [r0+r3*2]
+ pslldq xmm4, 8
+ por xmm3, xmm4
+ movdqa xmm2, xmm3
+ psrldq xmm2, 1
+ pslldq xmm5, 15
+ por xmm2, xmm5
+ lea r2, [r1+r3*2]
+ movdqa xmm1, xmm3
+ pslldq xmm1, 1
+INIT_XMM cpuname
+ PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
+ psrldq xmm0, 1
+ movq [r0+r3*1], xmm0
+ psrldq xmm0, 1
+ movq [r0+r3*2], xmm0
+ psrldq xmm0, 1
+ lea r0, [r2+r3*2]
+ movq [r1+r3*1], xmm0
+ psrldq xmm0, 1
+ movq [r1+r3*2], xmm0
+ psrldq xmm0, 1
+ movq [r2+r3*1], xmm0
+ psrldq xmm0, 1
+ movq [r2+r3*2], xmm0
+ psrldq xmm0, 1
+ movq [r0+r3*1], xmm0
+ psrldq xmm0, 1
+ movq [r0+r3*2], xmm0
+ RET
+%endmacro
+
+INIT_MMX sse2
+PRED8x8L_DOWN_LEFT
+INIT_MMX ssse3
+PRED8x8L_DOWN_LEFT
+
+;-----------------------------------------------------------------------------
+; void pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred8x8l_down_right_8, 4,5
+ sub r0, r3
+ lea r4, [r0+r3*2]
+ movq mm0, [r0+r3*1-8]
+ punpckhbw mm0, [r0+r3*0-8]
+ movq mm1, [r4+r3*1-8]
+ punpckhbw mm1, [r0+r3*2-8]
+ mov r4, r0
+ punpckhwd mm1, mm0
+ lea r0, [r0+r3*4]
+ movq mm2, [r0+r3*1-8]
+ punpckhbw mm2, [r0+r3*0-8]
+ lea r0, [r0+r3*2]
+ movq mm3, [r0+r3*1-8]
+ punpckhbw mm3, [r0+r3*0-8]
+ punpckhwd mm3, mm2
+ punpckhdq mm3, mm1
+ lea r0, [r0+r3*2]
+ movq mm0, [r0+r3*0-8]
+ movq mm1, [r4]
+ mov r0, r4
+ movq mm4, mm3
+ movq mm2, mm3
+ PALIGNR mm4, mm0, 7, mm0
+ PALIGNR mm1, mm2, 1, mm2
+ test r1, r1 ; top_left
+ jz .fix_lt_1
+.do_left:
+ movq mm0, mm4
+ PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+ movq mm4, mm0
+ movq mm7, mm2
+ movq mm6, mm2
+ PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
+ psllq mm1, 56
+ PALIGNR mm7, mm1, 7, mm3
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1, r1 ; top_left
+ jz .fix_lt_2
+ test r2, r2 ; top_right
+ jz .fix_tr_1
+.do_top:
+ PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
+ movq mm5, mm4
+ jmp .body
+.fix_lt_1:
+ movq mm5, mm3
+ pxor mm5, mm4
+ psrlq mm5, 56
+ psllq mm5, 48
+ pxor mm1, mm5
+ jmp .do_left
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2, r2 ; top_right
+ jnz .do_top
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .do_top
+.body:
+ lea r1, [r0+r3*2]
+ movq mm1, mm7
+ movq mm7, mm5
+ movq mm5, mm6
+ movq mm2, mm7
+ lea r2, [r1+r3*2]
+ PALIGNR mm2, mm6, 1, mm0
+ movq mm3, mm7
+ PALIGNR mm3, mm6, 7, mm0
+ movq mm4, mm7
+ lea r4, [r2+r3*2]
+ psrlq mm4, 8
+ PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
+ PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
+ movq [r4+r3*2], mm0
+ movq mm2, mm1
+ psrlq mm0, 8
+ psllq mm2, 56
+ psrlq mm1, 8
+ por mm0, mm2
+ movq [r4+r3*1], mm0
+ movq mm2, mm1
+ psrlq mm0, 8
+ psllq mm2, 56
+ psrlq mm1, 8
+ por mm0, mm2
+ movq [r2+r3*2], mm0
+ movq mm2, mm1
+ psrlq mm0, 8
+ psllq mm2, 56
+ psrlq mm1, 8
+ por mm0, mm2
+ movq [r2+r3*1], mm0
+ movq mm2, mm1
+ psrlq mm0, 8
+ psllq mm2, 56
+ psrlq mm1, 8
+ por mm0, mm2
+ movq [r1+r3*2], mm0
+ movq mm2, mm1
+ psrlq mm0, 8
+ psllq mm2, 56
+ psrlq mm1, 8
+ por mm0, mm2
+ movq [r1+r3*1], mm0
+ movq mm2, mm1
+ psrlq mm0, 8
+ psllq mm2, 56
+ psrlq mm1, 8
+ por mm0, mm2
+ movq [r0+r3*2], mm0
+ psrlq mm0, 8
+ psllq mm1, 56
+ por mm0, mm1
+ movq [r0+r3*1], mm0
+ RET
+
+%macro PRED8x8L_DOWN_RIGHT 0
+cglobal pred8x8l_down_right_8, 4,5
+ sub r0, r3
+ lea r4, [r0+r3*2]
+ movq mm0, [r0+r3*1-8]
+ punpckhbw mm0, [r0+r3*0-8]
+ movq mm1, [r4+r3*1-8]
+ punpckhbw mm1, [r0+r3*2-8]
+ mov r4, r0
+ punpckhwd mm1, mm0
+ lea r0, [r0+r3*4]
+ movq mm2, [r0+r3*1-8]
+ punpckhbw mm2, [r0+r3*0-8]
+ lea r0, [r0+r3*2]
+ movq mm3, [r0+r3*1-8]
+ punpckhbw mm3, [r0+r3*0-8]
+ punpckhwd mm3, mm2
+ punpckhdq mm3, mm1
+ lea r0, [r0+r3*2]
+ movq mm0, [r0+r3*0-8]
+ movq mm1, [r4]
+ mov r0, r4
+ movq mm4, mm3
+ movq mm2, mm3
+ PALIGNR mm4, mm0, 7, mm0
+ PALIGNR mm1, mm2, 1, mm2
+ test r1, r1
+ jz .fix_lt_1
+ jmp .do_left
+.fix_lt_1:
+ movq mm5, mm3
+ pxor mm5, mm4
+ psrlq mm5, 56
+ psllq mm5, 48
+ pxor mm1, mm5
+ jmp .do_left
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2, r2
+ jnz .do_top
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .do_top
+.do_left:
+ movq mm0, mm4
+ PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+ movq mm4, mm0
+ movq mm7, mm2
+ movq2dq xmm3, mm2
+ PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
+ psllq mm1, 56
+ PALIGNR mm7, mm1, 7, mm3
+ movq2dq xmm1, mm7
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1, r1
+ jz .fix_lt_2
+ test r2, r2
+ jz .fix_tr_1
+.do_top:
+ PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
+ movq2dq xmm4, mm4
+ lea r1, [r0+r3*2]
+ movdqa xmm0, xmm3
+ pslldq xmm4, 8
+ por xmm3, xmm4
+ lea r2, [r1+r3*2]
+ pslldq xmm4, 1
+ por xmm1, xmm4
+ psrldq xmm0, 7
+ pslldq xmm0, 15
+ psrldq xmm0, 7
+ por xmm1, xmm0
+ lea r0, [r2+r3*2]
+ movdqa xmm2, xmm3
+ psrldq xmm2, 1
+INIT_XMM cpuname
+ PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+ movq [r0+r3*2], xmm0
+ movq [r0+r3*1], xmm1
+ psrldq xmm0, 2
+ psrldq xmm1, 2
+ movq [r2+r3*2], xmm0
+ movq [r2+r3*1], xmm1
+ psrldq xmm0, 2
+ psrldq xmm1, 2
+ movq [r1+r3*2], xmm0
+ movq [r1+r3*1], xmm1
+ psrldq xmm0, 2
+ psrldq xmm1, 2
+ movq [r4+r3*2], xmm0
+ movq [r4+r3*1], xmm1
+ RET
+%endmacro
+
+INIT_MMX sse2
+PRED8x8L_DOWN_RIGHT
+INIT_MMX ssse3
+PRED8x8L_DOWN_RIGHT
+
+;-----------------------------------------------------------------------------
+; void pred8x8l_vertical_right_8(uint8_t *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred8x8l_vertical_right_8, 4,5
+ sub r0, r3
+ lea r4, [r0+r3*2]
+ movq mm0, [r0+r3*1-8]
+ punpckhbw mm0, [r0+r3*0-8]
+ movq mm1, [r4+r3*1-8]
+ punpckhbw mm1, [r0+r3*2-8]
+ mov r4, r0
+ punpckhwd mm1, mm0
+ lea r0, [r0+r3*4]
+ movq mm2, [r0+r3*1-8]
+ punpckhbw mm2, [r0+r3*0-8]
+ lea r0, [r0+r3*2]
+ movq mm3, [r0+r3*1-8]
+ punpckhbw mm3, [r0+r3*0-8]
+ punpckhwd mm3, mm2
+ punpckhdq mm3, mm1
+ lea r0, [r0+r3*2]
+ movq mm0, [r0+r3*0-8]
+ movq mm1, [r4]
+ mov r0, r4
+ movq mm4, mm3
+ movq mm2, mm3
+ PALIGNR mm4, mm0, 7, mm0
+ PALIGNR mm1, mm2, 1, mm2
+ test r1, r1
+ jz .fix_lt_1
+ jmp .do_left
+.fix_lt_1:
+ movq mm5, mm3
+ pxor mm5, mm4
+ psrlq mm5, 56
+ psllq mm5, 48
+ pxor mm1, mm5
+ jmp .do_left
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2, r2
+ jnz .do_top
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .do_top
+.do_left:
+ movq mm0, mm4
+ PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+ movq mm7, mm2
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1, r1
+ jz .fix_lt_2
+ test r2, r2
+ jz .fix_tr_1
+.do_top:
+ PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
+ lea r1, [r0+r3*2]
+ movq mm2, mm6
+ movq mm3, mm6
+ PALIGNR mm3, mm7, 7, mm0
+ PALIGNR mm6, mm7, 6, mm1
+ movq mm4, mm3
+ pavgb mm3, mm2
+ lea r2, [r1+r3*2]
+ PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
+ movq [r0+r3*1], mm3
+ movq [r0+r3*2], mm0
+ movq mm5, mm0
+ movq mm6, mm3
+ movq mm1, mm7
+ movq mm2, mm1
+ psllq mm2, 8
+ movq mm3, mm1
+ psllq mm3, 16
+ lea r4, [r2+r3*2]
+ PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
+ PALIGNR mm6, mm0, 7, mm2
+ movq [r1+r3*1], mm6
+ psllq mm0, 8
+ PALIGNR mm5, mm0, 7, mm1
+ movq [r1+r3*2], mm5
+ psllq mm0, 8
+ PALIGNR mm6, mm0, 7, mm2
+ movq [r2+r3*1], mm6
+ psllq mm0, 8
+ PALIGNR mm5, mm0, 7, mm1
+ movq [r2+r3*2], mm5
+ psllq mm0, 8
+ PALIGNR mm6, mm0, 7, mm2
+ movq [r4+r3*1], mm6
+ psllq mm0, 8
+ PALIGNR mm5, mm0, 7, mm1
+ movq [r4+r3*2], mm5
+ RET
+
+%macro PRED8x8L_VERTICAL_RIGHT 0
+cglobal pred8x8l_vertical_right_8, 4,5,7
+ ; manually spill XMM registers for Win64 because
+ ; the code here is initialized with INIT_MMX
+ WIN64_SPILL_XMM 7
+ sub r0, r3
+ lea r4, [r0+r3*2]
+ movq mm0, [r0+r3*1-8]
+ punpckhbw mm0, [r0+r3*0-8]
+ movq mm1, [r4+r3*1-8]
+ punpckhbw mm1, [r0+r3*2-8]
+ mov r4, r0
+ punpckhwd mm1, mm0
+ lea r0, [r0+r3*4]
+ movq mm2, [r0+r3*1-8]
+ punpckhbw mm2, [r0+r3*0-8]
+ lea r0, [r0+r3*2]
+ movq mm3, [r0+r3*1-8]
+ punpckhbw mm3, [r0+r3*0-8]
+ punpckhwd mm3, mm2
+ punpckhdq mm3, mm1
+ lea r0, [r0+r3*2]
+ movq mm0, [r0+r3*0-8]
+ movq mm1, [r4]
+ mov r0, r4
+ movq mm4, mm3
+ movq mm2, mm3
+ PALIGNR mm4, mm0, 7, mm0
+ PALIGNR mm1, mm2, 1, mm2
+ test r1, r1
+ jnz .do_left
+.fix_lt_1:
+ movq mm5, mm3
+ pxor mm5, mm4
+ psrlq mm5, 56
+ psllq mm5, 48
+ pxor mm1, mm5
+ jmp .do_left
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2, r2
+ jnz .do_top
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .do_top
+.do_left:
+ movq mm0, mm4
+ PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+ movq2dq xmm0, mm2
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1, r1
+ jz .fix_lt_2
+ test r2, r2
+ jz .fix_tr_1
+.do_top:
+ PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
+ lea r1, [r0+r3*2]
+ movq2dq xmm4, mm6
+ pslldq xmm4, 8
+ por xmm0, xmm4
+ movdqa xmm6, [pw_ff00]
+ movdqa xmm1, xmm0
+ lea r2, [r1+r3*2]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ pslldq xmm0, 1
+ pslldq xmm1, 2
+ pavgb xmm2, xmm0
+INIT_XMM cpuname
+ PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
+ pandn xmm6, xmm4
+ movdqa xmm5, xmm4
+ psrlw xmm4, 8
+ packuswb xmm6, xmm4
+ movhlps xmm4, xmm6
+ movhps [r0+r3*2], xmm5
+ movhps [r0+r3*1], xmm2
+ psrldq xmm5, 4
+ movss xmm5, xmm6
+ psrldq xmm2, 4
+ movss xmm2, xmm4
+ lea r0, [r2+r3*2]
+ psrldq xmm5, 1
+ psrldq xmm2, 1
+ movq [r0+r3*2], xmm5
+ movq [r0+r3*1], xmm2
+ psrldq xmm5, 1
+ psrldq xmm2, 1
+ movq [r2+r3*2], xmm5
+ movq [r2+r3*1], xmm2
+ psrldq xmm5, 1
+ psrldq xmm2, 1
+ movq [r1+r3*2], xmm5
+ movq [r1+r3*1], xmm2
+ RET
+%endmacro
+
+INIT_MMX sse2
+PRED8x8L_VERTICAL_RIGHT
+INIT_MMX ssse3
+PRED8x8L_VERTICAL_RIGHT
+
+;-----------------------------------------------------------------------------
+; void pred8x8l_vertical_left_8(uint8_t *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED8x8L_VERTICAL_LEFT 0
+cglobal pred8x8l_vertical_left_8, 4,4
+ sub r0, r3
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1, r1
+ jz .fix_lt_2
+ test r2, r2
+ jz .fix_tr_1
+ jmp .do_top
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2, r2
+ jnz .do_top
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .do_top
+.fix_tr_2:
+ punpckhbw mm3, mm3
+ pshufw mm1, mm3, 0xFF
+ jmp .do_topright
+.do_top:
+ PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
+ movq2dq xmm4, mm4
+ test r2, r2
+ jz .fix_tr_2
+ movq mm0, [r0+8]
+ movq mm5, mm0
+ movq mm2, mm0
+ movq mm4, mm0
+ psrlq mm5, 56
+ PALIGNR mm2, mm3, 7, mm3
+ PALIGNR mm5, mm4, 1, mm4
+ PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
+.do_topright:
+ movq2dq xmm3, mm1
+ lea r1, [r0+r3*2]
+ pslldq xmm3, 8
+ por xmm4, xmm3
+ movdqa xmm2, xmm4
+ movdqa xmm1, xmm4
+ movdqa xmm3, xmm4
+ psrldq xmm2, 1
+ pslldq xmm1, 1
+ pavgb xmm3, xmm2
+ lea r2, [r1+r3*2]
+INIT_XMM cpuname
+ PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
+ psrldq xmm0, 1
+ movq [r0+r3*1], xmm3
+ movq [r0+r3*2], xmm0
+ lea r0, [r2+r3*2]
+ psrldq xmm3, 1
+ psrldq xmm0, 1
+ movq [r1+r3*1], xmm3
+ movq [r1+r3*2], xmm0
+ psrldq xmm3, 1
+ psrldq xmm0, 1
+ movq [r2+r3*1], xmm3
+ movq [r2+r3*2], xmm0
+ psrldq xmm3, 1
+ psrldq xmm0, 1
+ movq [r0+r3*1], xmm3
+ movq [r0+r3*2], xmm0
+ RET
+%endmacro
+
+INIT_MMX sse2
+PRED8x8L_VERTICAL_LEFT
+INIT_MMX ssse3
+PRED8x8L_VERTICAL_LEFT
+
+;-----------------------------------------------------------------------------
+; void pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED8x8L_HORIZONTAL_UP 0
+cglobal pred8x8l_horizontal_up_8, 4,4
+ sub r0, r3
+ lea r2, [r0+r3*2]
+ movq mm0, [r0+r3*1-8]
+ test r1, r1
+ lea r1, [r0+r3]
+ cmovnz r1, r0
+ punpckhbw mm0, [r1+r3*0-8]
+ movq mm1, [r2+r3*1-8]
+ punpckhbw mm1, [r0+r3*2-8]
+ mov r2, r0
+ punpckhwd mm1, mm0
+ lea r0, [r0+r3*4]
+ movq mm2, [r0+r3*1-8]
+ punpckhbw mm2, [r0+r3*0-8]
+ lea r0, [r0+r3*2]
+ movq mm3, [r0+r3*1-8]
+ punpckhbw mm3, [r0+r3*0-8]
+ punpckhwd mm3, mm2
+ punpckhdq mm3, mm1
+ lea r0, [r0+r3*2]
+ movq mm0, [r0+r3*0-8]
+ movq mm1, [r1+r3*0-8]
+ mov r0, r2
+ movq mm4, mm3
+ movq mm2, mm3
+ PALIGNR mm4, mm0, 7, mm0
+ PALIGNR mm1, mm2, 1, mm2
+ movq mm0, mm4
+ PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+ movq mm4, mm0
+ movq mm7, mm2
+ PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
+ psllq mm1, 56
+ PALIGNR mm7, mm1, 7, mm3
+ lea r1, [r0+r3*2]
+ pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
+ psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
+ movq mm2, mm0
+ psllw mm0, 8
+ psrlw mm2, 8
+ por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
+ movq mm3, mm2
+ movq mm4, mm2
+ movq mm5, mm2
+ psrlq mm2, 8
+ psrlq mm3, 16
+ lea r2, [r1+r3*2]
+ por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
+ punpckhbw mm7, mm7
+ por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
+ pavgb mm4, mm2
+ PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
+ movq mm5, mm4
+ punpcklbw mm4, mm1 ; p4 p3 p2 p1
+ punpckhbw mm5, mm1 ; p8 p7 p6 p5
+ movq mm6, mm5
+ movq mm7, mm5
+ movq mm0, mm5
+ PALIGNR mm5, mm4, 2, mm1
+ pshufw mm1, mm6, 11111001b
+ PALIGNR mm6, mm4, 4, mm2
+ pshufw mm2, mm7, 11111110b
+ PALIGNR mm7, mm4, 6, mm3
+ pshufw mm3, mm0, 11111111b
+ movq [r0+r3*1], mm4
+ movq [r0+r3*2], mm5
+ lea r0, [r2+r3*2]
+ movq [r1+r3*1], mm6
+ movq [r1+r3*2], mm7
+ movq [r2+r3*1], mm0
+ movq [r2+r3*2], mm1
+ movq [r0+r3*1], mm2
+ movq [r0+r3*2], mm3
+ RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8L_HORIZONTAL_UP
+INIT_MMX ssse3
+PRED8x8L_HORIZONTAL_UP
+
+;-----------------------------------------------------------------------------
+; void pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred8x8l_horizontal_down_8, 4,5
+ sub r0, r3
+ lea r4, [r0+r3*2]
+ movq mm0, [r0+r3*1-8]
+ punpckhbw mm0, [r0+r3*0-8]
+ movq mm1, [r4+r3*1-8]
+ punpckhbw mm1, [r0+r3*2-8]
+ mov r4, r0
+ punpckhwd mm1, mm0
+ lea r0, [r0+r3*4]
+ movq mm2, [r0+r3*1-8]
+ punpckhbw mm2, [r0+r3*0-8]
+ lea r0, [r0+r3*2]
+ movq mm3, [r0+r3*1-8]
+ punpckhbw mm3, [r0+r3*0-8]
+ punpckhwd mm3, mm2
+ punpckhdq mm3, mm1
+ lea r0, [r0+r3*2]
+ movq mm0, [r0+r3*0-8]
+ movq mm1, [r4]
+ mov r0, r4
+ movq mm4, mm3
+ movq mm2, mm3
+ PALIGNR mm4, mm0, 7, mm0
+ PALIGNR mm1, mm2, 1, mm2
+ test r1, r1
+ jnz .do_left
+.fix_lt_1:
+ movq mm5, mm3
+ pxor mm5, mm4
+ psrlq mm5, 56
+ psllq mm5, 48
+ pxor mm1, mm5
+ jmp .do_left
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2, r2
+ jnz .do_top
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .do_top
+.do_left:
+ movq mm0, mm4
+ PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+ movq mm4, mm0
+ movq mm7, mm2
+ movq mm6, mm2
+ PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
+ psllq mm1, 56
+ PALIGNR mm7, mm1, 7, mm3
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1, r1
+ jz .fix_lt_2
+ test r2, r2
+ jz .fix_tr_1
+.do_top:
+ PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
+ movq mm5, mm4
+ lea r1, [r0+r3*2]
+ psllq mm7, 56
+ movq mm2, mm5
+ movq mm3, mm6
+ movq mm4, mm2
+ PALIGNR mm2, mm6, 7, mm5
+ PALIGNR mm6, mm7, 7, mm0
+ lea r2, [r1+r3*2]
+ PALIGNR mm4, mm3, 1, mm7
+ movq mm5, mm3
+ pavgb mm3, mm6
+ PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
+ movq mm4, mm2
+ movq mm1, mm2
+ lea r4, [r2+r3*2]
+ psrlq mm4, 16
+ psrlq mm1, 8
+ PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
+ movq mm7, mm3
+ punpcklbw mm3, mm0
+ punpckhbw mm7, mm0
+ movq mm1, mm7
+ movq mm0, mm7
+ movq mm4, mm7
+ movq [r4+r3*2], mm3
+ PALIGNR mm7, mm3, 2, mm5
+ movq [r4+r3*1], mm7
+ PALIGNR mm1, mm3, 4, mm5
+ movq [r2+r3*2], mm1
+ PALIGNR mm0, mm3, 6, mm3
+ movq [r2+r3*1], mm0
+ movq mm2, mm6
+ movq mm3, mm6
+ movq [r1+r3*2], mm4
+ PALIGNR mm6, mm4, 2, mm5
+ movq [r1+r3*1], mm6
+ PALIGNR mm2, mm4, 4, mm5
+ movq [r0+r3*2], mm2
+ PALIGNR mm3, mm4, 6, mm4
+ movq [r0+r3*1], mm3
+ RET
+
+%macro PRED8x8L_HORIZONTAL_DOWN 0
+cglobal pred8x8l_horizontal_down_8, 4,5
+ sub r0, r3
+ lea r4, [r0+r3*2]
+ movq mm0, [r0+r3*1-8]
+ punpckhbw mm0, [r0+r3*0-8]
+ movq mm1, [r4+r3*1-8]
+ punpckhbw mm1, [r0+r3*2-8]
+ mov r4, r0
+ punpckhwd mm1, mm0
+ lea r0, [r0+r3*4]
+ movq mm2, [r0+r3*1-8]
+ punpckhbw mm2, [r0+r3*0-8]
+ lea r0, [r0+r3*2]
+ movq mm3, [r0+r3*1-8]
+ punpckhbw mm3, [r0+r3*0-8]
+ punpckhwd mm3, mm2
+ punpckhdq mm3, mm1
+ lea r0, [r0+r3*2]
+ movq mm0, [r0+r3*0-8]
+ movq mm1, [r4]
+ mov r0, r4
+ movq mm4, mm3
+ movq mm2, mm3
+ PALIGNR mm4, mm0, 7, mm0
+ PALIGNR mm1, mm2, 1, mm2
+ test r1, r1
+ jnz .do_left
+.fix_lt_1:
+ movq mm5, mm3
+ pxor mm5, mm4
+ psrlq mm5, 56
+ psllq mm5, 48
+ pxor mm1, mm5
+ jmp .do_left
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2, r2
+ jnz .do_top
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .do_top
+.fix_tr_2:
+ punpckhbw mm3, mm3
+ pshufw mm1, mm3, 0xFF
+ jmp .do_topright
+.do_left:
+ movq mm0, mm4
+ PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+ movq2dq xmm0, mm2
+ pslldq xmm0, 8
+ movq mm4, mm0
+ PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
+ movq2dq xmm2, mm1
+ pslldq xmm2, 15
+ psrldq xmm2, 8
+ por xmm0, xmm2
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1, r1
+ jz .fix_lt_2
+ test r2, r2
+ jz .fix_tr_1
+.do_top:
+ PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
+ movq2dq xmm1, mm4
+ test r2, r2
+ jz .fix_tr_2
+ movq mm0, [r0+8]
+ movq mm5, mm0
+ movq mm2, mm0
+ movq mm4, mm0
+ psrlq mm5, 56
+ PALIGNR mm2, mm3, 7, mm3
+ PALIGNR mm5, mm4, 1, mm4
+ PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
+.do_topright:
+ movq2dq xmm5, mm1
+ pslldq xmm5, 8
+ por xmm1, xmm5
+INIT_XMM cpuname
+ lea r2, [r4+r3*2]
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm1
+ PALIGNR xmm1, xmm0, 7, xmm4
+ PALIGNR xmm2, xmm0, 9, xmm5
+ lea r1, [r2+r3*2]
+ PALIGNR xmm3, xmm0, 8, xmm0
+ movdqa xmm4, xmm1
+ pavgb xmm4, xmm3
+ lea r0, [r1+r3*2]
+ PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
+ punpcklbw xmm4, xmm0
+ movhlps xmm0, xmm4
+ movq [r0+r3*2], xmm4
+ movq [r2+r3*2], xmm0
+ psrldq xmm4, 2
+ psrldq xmm0, 2
+ movq [r0+r3*1], xmm4
+ movq [r2+r3*1], xmm0
+ psrldq xmm4, 2
+ psrldq xmm0, 2
+ movq [r1+r3*2], xmm4
+ movq [r4+r3*2], xmm0
+ psrldq xmm4, 2
+ psrldq xmm0, 2
+ movq [r1+r3*1], xmm4
+ movq [r4+r3*1], xmm0
+ RET
+%endmacro
+
+INIT_MMX sse2
+PRED8x8L_HORIZONTAL_DOWN
+INIT_MMX ssse3
+PRED8x8L_HORIZONTAL_DOWN
+
+;-----------------------------------------------------------------------------
+; void pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
+;-----------------------------------------------------------------------------
+
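+; DC = (sum of the 4 top + 4 left neighbours + 4) >> 3; psadbw against zero
+; sums the top bytes, and the imul by 0x01010101 replicates the result across
+; each 4-byte row store.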
+INIT_MMX mmxext
+cglobal pred4x4_dc_8, 3,5
+ pxor mm7, mm7
+ mov r4, r0
+ sub r0, r2
+ movd mm0, [r0]
+ psadbw mm0, mm7
+ movzx r1d, byte [r0+r2*1-1]
+ movd r3d, mm0
+ add r3d, r1d
+ movzx r1d, byte [r0+r2*2-1]
+ lea r0, [r0+r2*2]
+ add r3d, r1d
+ movzx r1d, byte [r0+r2*1-1]
+ add r3d, r1d
+ movzx r1d, byte [r0+r2*2-1]
+ add r3d, r1d
+ add r3d, 4
+ shr r3d, 3
+ imul r3d, 0x01010101
+ mov [r4+r2*0], r3d
+ mov [r0+r2*0], r3d
+ mov [r0+r2*1], r3d
+ mov [r0+r2*2], r3d
+ RET
+
+;-----------------------------------------------------------------------------
+; void pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED4x4_TM 0
+cglobal pred4x4_tm_vp8_8, 3,6
+ sub r0, r2
+ pxor mm7, mm7
+ movd mm0, [r0]
+ punpcklbw mm0, mm7
+ movzx r4d, byte [r0-1]
+ mov r5d, 2
+.loop:
+ movzx r1d, byte [r0+r2*1-1]
+ movzx r3d, byte [r0+r2*2-1]
+ sub r1d, r4d
+ sub r3d, r4d
+ movd mm2, r1d
+ movd mm4, r3d
+%if cpuflag(mmxext)
+ pshufw mm2, mm2, 0
+ pshufw mm4, mm4, 0
+%else
+ punpcklwd mm2, mm2
+ punpcklwd mm4, mm4
+ punpckldq mm2, mm2
+ punpckldq mm4, mm4
+%endif
+ paddw mm2, mm0
+ paddw mm4, mm0
+ packuswb mm2, mm2
+ packuswb mm4, mm4
+ movd [r0+r2*1], mm2
+ movd [r0+r2*2], mm4
+ lea r0, [r0+r2*2]
+ dec r5d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+PRED4x4_TM
+INIT_MMX mmxext
+PRED4x4_TM
+
+INIT_XMM ssse3
+cglobal pred4x4_tm_vp8_8, 3,3
+ sub r0, r2
+ movq mm6, [tm_shuf]
+ pxor mm1, mm1
+ movd mm0, [r0]
+ punpcklbw mm0, mm1
+ movd mm7, [r0-4]
+ pshufb mm7, mm6
+ lea r1, [r0+r2*2]
+ movd mm2, [r0+r2*1-4]
+ movd mm3, [r0+r2*2-4]
+ movd mm4, [r1+r2*1-4]
+ movd mm5, [r1+r2*2-4]
+ pshufb mm2, mm6
+ pshufb mm3, mm6
+ pshufb mm4, mm6
+ pshufb mm5, mm6
+ psubw mm2, mm7
+ psubw mm3, mm7
+ psubw mm4, mm7
+ psubw mm5, mm7
+ paddw mm2, mm0
+ paddw mm3, mm0
+ paddw mm4, mm0
+ paddw mm5, mm0
+ packuswb mm2, mm2
+ packuswb mm3, mm3
+ packuswb mm4, mm4
+ packuswb mm5, mm5
+ movd [r0+r2*1], mm2
+ movd [r0+r2*2], mm3
+ movd [r1+r2*1], mm4
+ movd [r1+r2*2], mm5
+ RET
+
+;-----------------------------------------------------------------------------
+; void pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred4x4_vertical_vp8_8, 3,3
+ sub r0, r2
+ movd m1, [r0-1]
+ movd m0, [r0]
+ mova m2, m0 ;t0 t1 t2 t3
+ punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
+ lea r1, [r0+r2*2]
+ psrlq m0, 8 ;t1 t2 t3 t4
+ PRED4x4_LOWPASS m3, m1, m0, m2, m4
+ movd [r0+r2*1], m3
+ movd [r0+r2*2], m3
+ movd [r1+r2*1], m3
+ movd [r1+r2*2], m3
+ RET
+
+;-----------------------------------------------------------------------------
+; void pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
+;-----------------------------------------------------------------------------
+INIT_MMX mmxext
+cglobal pred4x4_down_left_8, 3,3
+ sub r0, r2
+ movq m1, [r0]
+ punpckldq m1, [r1]
+ movq m2, m1
+ movq m3, m1
+ psllq m1, 8
+ pxor m2, m1
+ psrlq m2, 8
+ pxor m2, m3
+ PRED4x4_LOWPASS m0, m1, m2, m3, m4
+ lea r1, [r0+r2*2]
+ psrlq m0, 8
+ movd [r0+r2*1], m0
+ psrlq m0, 8
+ movd [r0+r2*2], m0
+ psrlq m0, 8
+ movd [r1+r2*1], m0
+ psrlq m0, 8
+ movd [r1+r2*2], m0
+ RET
+
+;-----------------------------------------------------------------------------
+; void pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred4x4_vertical_left_8, 3,3
+ sub r0, r2
+ movq m1, [r0]
+ punpckldq m1, [r1]
+ movq m3, m1
+ movq m2, m1
+ psrlq m3, 8
+ psrlq m2, 16
+ movq m4, m3
+ pavgb m4, m1
+ PRED4x4_LOWPASS m0, m1, m2, m3, m5
+ lea r1, [r0+r2*2]
+ movh [r0+r2*1], m4
+ movh [r0+r2*2], m0
+ psrlq m4, 8
+ psrlq m0, 8
+ movh [r1+r2*1], m4
+ movh [r1+r2*2], m0
+ RET
+
+;-----------------------------------------------------------------------------
+; void pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred4x4_horizontal_up_8, 3,3
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movd m0, [r0+r2*1-4]
+ punpcklbw m0, [r0+r2*2-4]
+ movd m1, [r1+r2*1-4]
+ punpcklbw m1, [r1+r2*2-4]
+ punpckhwd m0, m1
+ movq m1, m0
+ punpckhbw m1, m1
+ pshufw m1, m1, 0xFF
+ punpckhdq m0, m1
+ movq m2, m0
+ movq m3, m0
+ movq m7, m0
+ psrlq m2, 16
+ psrlq m3, 8
+ pavgb m7, m3
+ PRED4x4_LOWPASS m4, m0, m2, m3, m5
+ punpcklbw m7, m4
+ movd [r0+r2*1], m7
+ psrlq m7, 16
+ movd [r0+r2*2], m7
+ psrlq m7, 16
+ movd [r1+r2*1], m7
+ movd [r1+r2*2], m1
+ RET
+
+;-----------------------------------------------------------------------------
+; void pred4x4_horizontal_down_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred4x4_horizontal_down_8, 3,3
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movh m0, [r0-4] ; lt ..
+ punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
+ psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
+ movd m1, [r1+r2*2-4] ; l3
+ punpcklbw m1, [r1+r2*1-4] ; l2 l3
+ movd m2, [r0+r2*2-4] ; l1
+ punpcklbw m2, [r0+r2*1-4] ; l0 l1
+ punpckhwd m1, m2 ; l0 l1 l2 l3
+ punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
+ movq m0, m1
+ movq m2, m1
+ movq m5, m1
+ psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
+ psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
+ pavgb m5, m2
+ PRED4x4_LOWPASS m3, m1, m0, m2, m4
+ punpcklbw m5, m3
+ psrlq m3, 32
+ PALIGNR m3, m5, 6, m4
+ movh [r1+r2*2], m5
+ psrlq m5, 16
+ movh [r1+r2*1], m5
+ psrlq m5, 16
+ movh [r0+r2*2], m5
+ movh [r0+r2*1], m3
+ RET
+
+;-----------------------------------------------------------------------------
+; void pred4x4_vertical_right_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred4x4_vertical_right_8, 3,3
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movh m0, [r0] ; ........t3t2t1t0
+ movq m5, m0
+ PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
+ pavgb m5, m0
+ PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
+ movq m1, m0
+ PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
+ movq m2, m0
+ PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
+ PRED4x4_LOWPASS m3, m1, m0, m2, m4
+ movq m1, m3
+ psrlq m3, 16
+ psllq m1, 48
+ movh [r0+r2*1], m5
+ movh [r0+r2*2], m3
+ PALIGNR m5, m1, 7, m2
+ psllq m1, 8
+ movh [r1+r2*1], m5
+ PALIGNR m3, m1, 7, m1
+ movh [r1+r2*2], m3
+ RET
+
+;-----------------------------------------------------------------------------
+; void pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmxext
+cglobal pred4x4_down_right_8, 3,3
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movq m1, [r1-8]
+ movq m2, [r0+r2*1-8]
+ punpckhbw m2, [r0-8]
+ movh m3, [r0]
+ punpckhwd m1, m2
+ PALIGNR m3, m1, 5, m1
+ movq m1, m3
+ PALIGNR m3, [r1+r2*1-8], 7, m4
+ movq m2, m3
+ PALIGNR m3, [r1+r2*2-8], 7, m4
+ PRED4x4_LOWPASS m0, m3, m1, m2, m4
+ movh [r1+r2*2], m0
+ psrlq m0, 8
+ movh [r1+r2*1], m0
+ psrlq m0, 8
+ movh [r0+r2*2], m0
+ psrlq m0, 8
+ movh [r0+r2*1], m0
+ RET
diff --git a/ffmpeg/libavcodec/x86/h264_intrapred_10bit.asm b/ffmpeg/libavcodec/x86/h264_intrapred_10bit.asm
new file mode 100644
index 0000000..db2b25c
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/h264_intrapred_10bit.asm
@@ -0,0 +1,1199 @@
+;*****************************************************************************
+;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
+;*****************************************************************************
+;* Copyright (C) 2005-2011 x264 project
+;*
+;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pw_16
+cextern pw_8
+cextern pw_4
+cextern pw_2
+cextern pw_1
+
+pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
+pw_m3: times 8 dw -3
+pw_pixel_max: times 8 dw ((1 << 10)-1)
+pw_512: times 8 dw 512
+pd_17: times 4 dd 17
+pd_16: times 4 dd 16
+
+SECTION .text
+
+; dest, left, right, src
+; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
+%macro PRED4x4_LOWPASS 4
+ paddw %2, %3
+ psrlw %2, 1
+ pavgw %1, %4, %2
+%endmacro
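+; 10-bit samples sit in 16-bit words, so there is enough headroom to compute
+; (l+r)>>1 directly with paddw/psrlw; pavgw against the centre sample then
+; yields the same (l + 2*c + r + 2) >> 2 as the 8-bit macro, without the
+; byte-overflow workaround.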
+
+;-----------------------------------------------------------------------------
+; void pred4x4_down_right(pixel *src, const pixel *topright, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED4x4_DR 0
+cglobal pred4x4_down_right_10, 3, 3
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movhps m1, [r1-8]
+ movhps m2, [r0+r2*1-8]
+ movhps m4, [r0-8]
+ punpckhwd m2, m4
+ movq m3, [r0]
+ punpckhdq m1, m2
+ PALIGNR m3, m1, 10, m1
+ movhps m4, [r1+r2*1-8]
+ PALIGNR m0, m3, m4, 14, m4
+ movhps m4, [r1+r2*2-8]
+ PALIGNR m2, m0, m4, 14, m4
+ PRED4x4_LOWPASS m0, m2, m3, m0
+ movq [r1+r2*2], m0
+ psrldq m0, 2
+ movq [r1+r2*1], m0
+ psrldq m0, 2
+ movq [r0+r2*2], m0
+ psrldq m0, 2
+ movq [r0+r2*1], m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED4x4_DR
+INIT_XMM ssse3
+PRED4x4_DR
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED4x4_DR
+%endif
+
+;-----------------------------------------------------------------------------
+; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED4x4_VR 0
+cglobal pred4x4_vertical_right_10, 3, 3, 6
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movq m5, [r0] ; ........t3t2t1t0
+ movhps m1, [r0-8]
+ PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
+ pavgw m5, m0
+ movhps m1, [r0+r2*1-8]
+ PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
+ movhps m2, [r0+r2*2-8]
+ PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
+ movhps m3, [r1+r2*1-8]
+ PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
+ PRED4x4_LOWPASS m1, m0, m2, m1
+ pslldq m0, m1, 12
+ psrldq m1, 4
+ movq [r0+r2*1], m5
+ movq [r0+r2*2], m1
+ PALIGNR m5, m0, 14, m2
+ pslldq m0, 2
+ movq [r1+r2*1], m5
+ PALIGNR m1, m0, 14, m0
+ movq [r1+r2*2], m1
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED4x4_VR
+INIT_XMM ssse3
+PRED4x4_VR
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED4x4_VR
+%endif
+
+;-----------------------------------------------------------------------------
+; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED4x4_HD 0
+cglobal pred4x4_horizontal_down_10, 3, 3
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movq m0, [r0-8] ; lt ..
+ movhps m0, [r0]
+ pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
+ movq m1, [r1+r2*2-8] ; l3
+ movq m3, [r1+r2*1-8]
+ punpcklwd m1, m3 ; l2 l3
+ movq m2, [r0+r2*2-8] ; l1
+ movq m3, [r0+r2*1-8]
+ punpcklwd m2, m3 ; l0 l1
+ punpckhdq m1, m2 ; l0 l1 l2 l3
+ punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
+ psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
+ psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
+ pavgw m5, m1, m3
+ PRED4x4_LOWPASS m3, m1, m0, m3
+ punpcklwd m5, m3
+ psrldq m3, 8
+ PALIGNR m3, m5, 12, m4
+ movq [r1+r2*2], m5
+ movhps [r0+r2*2], m5
+ psrldq m5, 4
+ movq [r1+r2*1], m5
+ movq [r0+r2*1], m3
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED4x4_HD
+INIT_XMM ssse3
+PRED4x4_HD
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED4x4_HD
+%endif
+
+;-----------------------------------------------------------------------------
+; void pred4x4_dc(pixel *src, const pixel *topright, int stride)
+;-----------------------------------------------------------------------------
+%macro HADDD 2 ; sum junk
+%if mmsize == 16
+ movhlps %2, %1
+ paddd %1, %2
+ pshuflw %2, %1, 0xE
+ paddd %1, %2
+%else
+ pshufw %2, %1, 0xE
+ paddd %1, %2
+%endif
+%endmacro
+
+%macro HADDW 2
+ pmaddwd %1, [pw_1]
+ HADDD %1, %2
+%endmacro
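+; HADDW reduces a register of words to a single sum in the low dword:
+; pmaddwd against pw_1 adds adjacent word pairs into dwords, then HADDD folds
+; the upper halves down (movhlps/pshuflw for XMM, pshufw for MMX).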
+
+INIT_MMX mmxext
+cglobal pred4x4_dc_10, 3, 3
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movq m2, [r0+r2*1-8]
+ paddw m2, [r0+r2*2-8]
+ paddw m2, [r1+r2*1-8]
+ paddw m2, [r1+r2*2-8]
+ psrlq m2, 48
+ movq m0, [r0]
+ HADDW m0, m1
+ paddw m0, [pw_4]
+ paddw m0, m2
+ psrlw m0, 3
+ SPLATW m0, m0, 0
+ movq [r0+r2*1], m0
+ movq [r0+r2*2], m0
+ movq [r1+r2*1], m0
+ movq [r1+r2*2], m0
+ RET
+
+;-----------------------------------------------------------------------------
+; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED4x4_DL 0
+cglobal pred4x4_down_left_10, 3, 3
+ sub r0, r2
+ movq m0, [r0]
+ movhps m0, [r1]
+ psrldq m2, m0, 2
+ pslldq m3, m0, 2
+ pshufhw m2, m2, 10100100b
+ PRED4x4_LOWPASS m0, m3, m2, m0
+ lea r1, [r0+r2*2]
+ movhps [r1+r2*2], m0
+ psrldq m0, 2
+ movq [r0+r2*1], m0
+ psrldq m0, 2
+ movq [r0+r2*2], m0
+ psrldq m0, 2
+ movq [r1+r2*1], m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED4x4_DL
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED4x4_DL
+%endif
+
+;-----------------------------------------------------------------------------
+; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED4x4_VL 0
+cglobal pred4x4_vertical_left_10, 3, 3
+ sub r0, r2
+ movu m1, [r0]
+ movhps m1, [r1]
+ psrldq m0, m1, 2
+ psrldq m2, m1, 4
+ pavgw m4, m0, m1
+ PRED4x4_LOWPASS m0, m1, m2, m0
+ lea r1, [r0+r2*2]
+ movq [r0+r2*1], m4
+ movq [r0+r2*2], m0
+ psrldq m4, 2
+ psrldq m0, 2
+ movq [r1+r2*1], m4
+ movq [r1+r2*2], m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED4x4_VL
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED4x4_VL
+%endif
+
+;-----------------------------------------------------------------------------
+; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
+;-----------------------------------------------------------------------------
+INIT_MMX mmxext
+cglobal pred4x4_horizontal_up_10, 3, 3
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movq m0, [r0+r2*1-8]
+ punpckhwd m0, [r0+r2*2-8]
+ movq m1, [r1+r2*1-8]
+ punpckhwd m1, [r1+r2*2-8]
+ punpckhdq m0, m1
+ pshufw m1, m1, 0xFF
+ movq [r1+r2*2], m1
+ movd [r1+r2*1+4], m1
+ pshufw m2, m0, 11111001b
+ movq m1, m2
+ pavgw m2, m0
+
+ pshufw m5, m0, 11111110b
+ PRED4x4_LOWPASS m1, m0, m5, m1
+ movq m6, m2
+ punpcklwd m6, m1
+ movq [r0+r2*1], m6
+ psrlq m2, 16
+ psrlq m1, 16
+ punpcklwd m2, m1
+ movq [r0+r2*2], m2
+ psrlq m2, 32
+ movd [r1+r2*1], m2
+ RET
+
+
+
+;-----------------------------------------------------------------------------
+; void pred8x8_vertical(pixel *src, int stride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal pred8x8_vertical_10, 2, 2
+ sub r0, r1
+ mova m0, [r0]
+%rep 3
+ mova [r0+r1*1], m0
+ mova [r0+r1*2], m0
+ lea r0, [r0+r1*2]
+%endrep
+ mova [r0+r1*1], m0
+ mova [r0+r1*2], m0
+ RET
+
+;-----------------------------------------------------------------------------
+; void pred8x8_horizontal(pixel *src, int stride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal pred8x8_horizontal_10, 2, 3
+ mov r2d, 4
+.loop:
+ movq m0, [r0+r1*0-8]
+ movq m1, [r0+r1*1-8]
+ pshuflw m0, m0, 0xff
+ pshuflw m1, m1, 0xff
+ punpcklqdq m0, m0
+ punpcklqdq m1, m1
+ mova [r0+r1*0], m0
+ mova [r0+r1*1], m1
+ lea r0, [r0+r1*2]
+ dec r2d
+ jg .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void pred8x8_dc(pixel *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MOV8 2-3
+; sort of a hack, but it works: one call site covers both MMX (two movq stores) and SSE2 (a single movdqa)
+%if mmsize==8
+ movq [%1+0], %2
+ movq [%1+8], %3
+%else
+ movdqa [%1], %2
+%endif
+%endmacro
+
+%macro PRED8x8_DC 1
+cglobal pred8x8_dc_10, 2, 6
+ sub r0, r1
+ pxor m4, m4
+ movq m0, [r0+0]
+ movq m1, [r0+8]
+%if mmsize==16
+ punpcklwd m0, m1
+ movhlps m1, m0
+ paddw m0, m1
+%else
+ pshufw m2, m0, 00001110b
+ pshufw m3, m1, 00001110b
+ paddw m0, m2
+ paddw m1, m3
+ punpcklwd m0, m1
+%endif
+ %1 m2, m0, 00001110b
+ paddw m0, m2
+
+ lea r5, [r1*3]
+ lea r4, [r0+r1*4]
+ movzx r2d, word [r0+r1*1-2]
+ movzx r3d, word [r0+r1*2-2]
+ add r2d, r3d
+ movzx r3d, word [r0+r5*1-2]
+ add r2d, r3d
+ movzx r3d, word [r4-2]
+ add r2d, r3d
+ movd m2, r2d ; s2
+
+ movzx r2d, word [r4+r1*1-2]
+ movzx r3d, word [r4+r1*2-2]
+ add r2d, r3d
+ movzx r3d, word [r4+r5*1-2]
+ add r2d, r3d
+ movzx r3d, word [r4+r1*4-2]
+ add r2d, r3d
+ movd m3, r2d ; s3
+
+ punpcklwd m2, m3
+ punpckldq m0, m2 ; s0, s1, s2, s3
+ %1 m3, m0, 11110110b ; s2, s1, s3, s3
+ %1 m0, m0, 01110100b ; s0, s1, s3, s1
+ paddw m0, m3
+ psrlw m0, 2
+ pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
+%if mmsize==16
+ punpcklwd m0, m0
+ pshufd m3, m0, 11111010b
+ punpckldq m0, m0
+ SWAP 0,1
+%else
+ pshufw m1, m0, 0x00
+ pshufw m2, m0, 0x55
+ pshufw m3, m0, 0xaa
+ pshufw m4, m0, 0xff
+%endif
+ MOV8 r0+r1*1, m1, m2
+ MOV8 r0+r1*2, m1, m2
+ MOV8 r0+r5*1, m1, m2
+ MOV8 r0+r1*4, m1, m2
+ MOV8 r4+r1*1, m3, m4
+ MOV8 r4+r1*2, m3, m4
+ MOV8 r4+r5*1, m3, m4
+ MOV8 r4+r1*4, m3, m4
+ RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8_DC pshufw
+INIT_XMM sse2
+PRED8x8_DC pshuflw
+
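pred8x8_dc_10 builds four partial sums, s0 and s1 over the left and right halves of the row above and s2 and s3 over the upper and lower halves of the column to the left, and then combines them per 4x4 quadrant in line with the H.264 chroma DC rule. A hedged scalar sketch of that combination (illustration only; the spec is normative):

/* s0, s1: sums of the 4 samples above each half; s2, s3: sums of the 4
 * samples left of each half; every dc value is then broadcast over its
 * own 4x4 quadrant */
static void pred8x8_dc_combine(unsigned dc[4], unsigned s0, unsigned s1,
                               unsigned s2, unsigned s3)
{
    dc[0] = (s0 + s2 + 4) >> 3;   /* top-left: top and left available */
    dc[1] = (s1 + 2) >> 2;        /* top-right: top samples only      */
    dc[2] = (s3 + 2) >> 2;        /* bottom-left: left samples only   */
    dc[3] = (s1 + s3 + 4) >> 3;   /* bottom-right: both again         */
}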
+;-----------------------------------------------------------------------------
+; void pred8x8_top_dc(pixel *src, int stride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal pred8x8_top_dc_10, 2, 4
+ sub r0, r1
+ mova m0, [r0]
+ pshuflw m1, m0, 0x4e
+ pshufhw m1, m1, 0x4e
+ paddw m0, m1
+ pshuflw m1, m0, 0xb1
+ pshufhw m1, m1, 0xb1
+ paddw m0, m1
+ lea r2, [r1*3]
+ lea r3, [r0+r1*4]
+ paddw m0, [pw_2]
+ psrlw m0, 2
+ mova [r0+r1*1], m0
+ mova [r0+r1*2], m0
+ mova [r0+r2*1], m0
+ mova [r0+r1*4], m0
+ mova [r3+r1*1], m0
+ mova [r3+r1*2], m0
+ mova [r3+r2*1], m0
+ mova [r3+r1*4], m0
+ RET
+
+;-----------------------------------------------------------------------------
+; void pred8x8_plane(pixel *src, int stride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal pred8x8_plane_10, 2, 7, 7
+ sub r0, r1
+ lea r2, [r1*3]
+ lea r3, [r0+r1*4]
+ mova m2, [r0]
+ pmaddwd m2, [pw_m32101234]
+ HADDD m2, m1
+ movd m0, [r0-4]
+ psrld m0, 14
+ psubw m2, m0 ; H
+ movd m0, [r3+r1*4-4]
+ movd m1, [r0+12]
+ paddw m0, m1
+ psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
+ movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
+ movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
+ sub r4d, r5d
+ movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
+ movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
+ sub r6d, r5d
+ lea r4d, [r4+r6*2]
+ movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
+ movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
+ sub r5d, r6d
+ lea r5d, [r5*3]
+ add r4d, r5d
+ movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
+ movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
+ sub r6d, r5d
+ lea r4d, [r4+r6*4]
+ movd m3, r4d ; V
+ punpckldq m2, m3
+ pmaddwd m2, [pd_17]
+ paddd m2, [pd_16]
+ psrad m2, 5 ; b, c
+
+ mova m3, [pw_pixel_max]
+ pxor m1, m1
+ SPLATW m0, m0, 1
+ SPLATW m4, m2, 2
+ SPLATW m2, m2, 0
+ pmullw m2, [pw_m32101234] ; b
+ pmullw m5, m4, [pw_m3] ; c
+ paddw m5, [pw_16]
+ mov r2d, 8
+ add r0, r1
+.loop:
+ paddsw m6, m2, m5
+ paddsw m6, m0
+ psraw m6, 5
+ CLIPW m6, m1, m3
+ mova [r0], m6
+ paddw m5, m4
+ add r0, r1
+ dec r2d
+ jg .loop
+ REP_RET
+
+
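The plane predictor derives a horizontal gradient H by weighting the top row with pw_m32101234 (weights -3..4), a matching vertical gradient V from the left column, and scales both by 17 with rounding (pd_17, pd_16) to obtain the b and c coefficients. Under that reading, the per-pixel value produced by the .loop appears to be the following (a rough sketch, not the normative text):

/* a = 16 * (src[7*stride - 1] + src[-stride + 7]),
 * b = (17 * H + 16) >> 5, c = (17 * V + 16) >> 5 */
static int pred8x8_plane_pixel(int a, int b, int c, int x, int y, int pixel_max)
{
    int v = (a + b * (x - 3) + c * (y - 3) + 16) >> 5;
    if (v < 0)         v = 0;                 /* CLIPW against 0          */
    if (v > pixel_max) v = pixel_max;         /* and against pw_pixel_max */
    return v;
}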
+;-----------------------------------------------------------------------------
+; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED8x8L_128_DC 0
+cglobal pred8x8l_128_dc_10, 4, 4
+ mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
+ lea r1, [r3*3]
+ lea r2, [r0+r3*4]
+ MOV8 r0+r3*0, m0, m0
+ MOV8 r0+r3*1, m0, m0
+ MOV8 r0+r3*2, m0, m0
+ MOV8 r0+r1*1, m0, m0
+ MOV8 r2+r3*0, m0, m0
+ MOV8 r2+r3*1, m0, m0
+ MOV8 r2+r3*2, m0, m0
+ MOV8 r2+r1*1, m0, m0
+ RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8L_128_DC
+INIT_XMM sse2
+PRED8x8L_128_DC
+
+;-----------------------------------------------------------------------------
+; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED8x8L_TOP_DC 0
+cglobal pred8x8l_top_dc_10, 4, 4, 6
+ sub r0, r3
+ mova m0, [r0]
+ shr r1d, 14
+ shr r2d, 13
+ neg r1
+ pslldq m1, m0, 2
+ psrldq m2, m0, 2
+ pinsrw m1, [r0+r1], 0
+ pinsrw m2, [r0+r2+14], 7
+ lea r1, [r3*3]
+ lea r2, [r0+r3*4]
+ PRED4x4_LOWPASS m0, m2, m1, m0
+ HADDW m0, m1
+ paddw m0, [pw_4]
+ psrlw m0, 3
+ SPLATW m0, m0, 0
+ mova [r0+r3*1], m0
+ mova [r0+r3*2], m0
+ mova [r0+r1*1], m0
+ mova [r0+r3*4], m0
+ mova [r2+r3*1], m0
+ mova [r2+r3*2], m0
+ mova [r2+r1*1], m0
+ mova [r2+r3*4], m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED8x8L_TOP_DC
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED8x8L_TOP_DC
+%endif
+
+;-----------------------------------------------------------------------------
+; void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
+; TODO: see if scalar is faster
+%macro PRED8x8L_DC 0
+cglobal pred8x8l_dc_10, 4, 6, 6
+ sub r0, r3
+ lea r4, [r0+r3*4]
+ lea r5, [r3*3]
+ mova m0, [r0+r3*2-16]
+ punpckhwd m0, [r0+r3*1-16]
+ mova m1, [r4+r3*0-16]
+ punpckhwd m1, [r0+r5*1-16]
+ punpckhdq m1, m0
+ mova m2, [r4+r3*2-16]
+ punpckhwd m2, [r4+r3*1-16]
+ mova m3, [r4+r3*4-16]
+ punpckhwd m3, [r4+r5*1-16]
+ punpckhdq m3, m2
+ punpckhqdq m3, m1
+ mova m0, [r0]
+ shr r1d, 14
+ shr r2d, 13
+ neg r1
+ pslldq m1, m0, 2
+ psrldq m2, m0, 2
+ pinsrw m1, [r0+r1], 0
+ pinsrw m2, [r0+r2+14], 7
+ not r1
+ and r1, r3
+ pslldq m4, m3, 2
+ psrldq m5, m3, 2
+ pshuflw m4, m4, 11100101b
+ pinsrw m5, [r0+r1-2], 7
+ PRED4x4_LOWPASS m3, m4, m5, m3
+ PRED4x4_LOWPASS m0, m2, m1, m0
+ paddw m0, m3
+ HADDW m0, m1
+ paddw m0, [pw_8]
+ psrlw m0, 4
+ SPLATW m0, m0
+ mova [r0+r3*1], m0
+ mova [r0+r3*2], m0
+ mova [r0+r5*1], m0
+ mova [r0+r3*4], m0
+ mova [r4+r3*1], m0
+ mova [r4+r3*2], m0
+ mova [r4+r5*1], m0
+ mova [r4+r3*4], m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED8x8L_DC
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED8x8L_DC
+%endif
+
+;-----------------------------------------------------------------------------
+; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED8x8L_VERTICAL 0
+cglobal pred8x8l_vertical_10, 4, 4, 6
+ sub r0, r3
+ mova m0, [r0]
+ shr r1d, 14
+ shr r2d, 13
+ neg r1
+ pslldq m1, m0, 2
+ psrldq m2, m0, 2
+ pinsrw m1, [r0+r1], 0
+ pinsrw m2, [r0+r2+14], 7
+ lea r1, [r3*3]
+ lea r2, [r0+r3*4]
+ PRED4x4_LOWPASS m0, m2, m1, m0
+ mova [r0+r3*1], m0
+ mova [r0+r3*2], m0
+ mova [r0+r1*1], m0
+ mova [r0+r3*4], m0
+ mova [r2+r3*1], m0
+ mova [r2+r3*2], m0
+ mova [r2+r1*1], m0
+ mova [r2+r3*4], m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED8x8L_VERTICAL
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED8x8L_VERTICAL
+%endif
+
+;-----------------------------------------------------------------------------
+; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED8x8L_HORIZONTAL 0
+cglobal pred8x8l_horizontal_10, 4, 4, 5
+ mova m0, [r0-16]
+ shr r1d, 14
+ dec r1
+ and r1, r3
+ sub r1, r3
+ punpckhwd m0, [r0+r1-16]
+ mova m1, [r0+r3*2-16]
+ punpckhwd m1, [r0+r3*1-16]
+ lea r2, [r0+r3*4]
+ lea r1, [r3*3]
+ punpckhdq m1, m0
+ mova m2, [r2+r3*0-16]
+ punpckhwd m2, [r0+r1-16]
+ mova m3, [r2+r3*2-16]
+ punpckhwd m3, [r2+r3*1-16]
+ punpckhdq m3, m2
+ punpckhqdq m3, m1
+ PALIGNR m4, m3, [r2+r1-16], 14, m0
+ pslldq m0, m4, 2
+ pshuflw m0, m0, 11100101b
+ PRED4x4_LOWPASS m4, m3, m0, m4
+ punpckhwd m3, m4, m4
+ punpcklwd m4, m4
+ pshufd m0, m3, 0xff
+ pshufd m1, m3, 0xaa
+ pshufd m2, m3, 0x55
+ pshufd m3, m3, 0x00
+ mova [r0+r3*0], m0
+ mova [r0+r3*1], m1
+ mova [r0+r3*2], m2
+ mova [r0+r1*1], m3
+ pshufd m0, m4, 0xff
+ pshufd m1, m4, 0xaa
+ pshufd m2, m4, 0x55
+ pshufd m3, m4, 0x00
+ mova [r2+r3*0], m0
+ mova [r2+r3*1], m1
+ mova [r2+r3*2], m2
+ mova [r2+r1*1], m3
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED8x8L_HORIZONTAL
+INIT_XMM ssse3
+PRED8x8L_HORIZONTAL
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED8x8L_HORIZONTAL
+%endif
+
+;-----------------------------------------------------------------------------
+; void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED8x8L_DOWN_LEFT 0
+cglobal pred8x8l_down_left_10, 4, 4, 7
+ sub r0, r3
+ mova m3, [r0]
+ shr r1d, 14
+ neg r1
+ shr r2d, 13
+ pslldq m1, m3, 2
+ psrldq m2, m3, 2
+ pinsrw m1, [r0+r1], 0
+ pinsrw m2, [r0+r2+14], 7
+ PRED4x4_LOWPASS m6, m2, m1, m3
+ jz .fix_tr ; flags from shr r2d
+ mova m1, [r0+16]
+ psrldq m5, m1, 2
+ PALIGNR m2, m1, m3, 14, m3
+ pshufhw m5, m5, 10100100b
+ PRED4x4_LOWPASS m1, m2, m5, m1
+.do_topright:
+ lea r1, [r3*3]
+ psrldq m5, m1, 14
+ lea r2, [r0+r3*4]
+ PALIGNR m2, m1, m6, 2, m0
+ PALIGNR m3, m1, m6, 14, m0
+ PALIGNR m5, m1, 2, m0
+ pslldq m4, m6, 2
+ PRED4x4_LOWPASS m6, m4, m2, m6
+ PRED4x4_LOWPASS m1, m3, m5, m1
+ mova [r2+r3*4], m1
+ PALIGNR m1, m6, 14, m2
+ pslldq m6, 2
+ mova [r2+r1*1], m1
+ PALIGNR m1, m6, 14, m2
+ pslldq m6, 2
+ mova [r2+r3*2], m1
+ PALIGNR m1, m6, 14, m2
+ pslldq m6, 2
+ mova [r2+r3*1], m1
+ PALIGNR m1, m6, 14, m2
+ pslldq m6, 2
+ mova [r0+r3*4], m1
+ PALIGNR m1, m6, 14, m2
+ pslldq m6, 2
+ mova [r0+r1*1], m1
+ PALIGNR m1, m6, 14, m2
+ pslldq m6, 2
+ mova [r0+r3*2], m1
+ PALIGNR m1, m6, 14, m6
+ mova [r0+r3*1], m1
+ RET
+.fix_tr:
+ punpckhwd m3, m3
+ pshufd m1, m3, 0xFF
+ jmp .do_topright
+%endmacro
+
+INIT_XMM sse2
+PRED8x8L_DOWN_LEFT
+INIT_XMM ssse3
+PRED8x8L_DOWN_LEFT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED8x8L_DOWN_LEFT
+%endif
+
+;-----------------------------------------------------------------------------
+; void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED8x8L_DOWN_RIGHT 0
+; the standard forbids this mode when has_topleft is false,
+; so there is no need to check it
+cglobal pred8x8l_down_right_10, 4, 5, 8
+ sub r0, r3
+ lea r4, [r0+r3*4]
+ lea r1, [r3*3]
+ mova m0, [r0+r3*1-16]
+ punpckhwd m0, [r0+r3*0-16]
+ mova m1, [r0+r1*1-16]
+ punpckhwd m1, [r0+r3*2-16]
+ punpckhdq m1, m0
+ mova m2, [r4+r3*1-16]
+ punpckhwd m2, [r4+r3*0-16]
+ mova m3, [r4+r1*1-16]
+ punpckhwd m3, [r4+r3*2-16]
+ punpckhdq m3, m2
+ punpckhqdq m3, m1
+ mova m0, [r4+r3*4-16]
+ mova m1, [r0]
+ PALIGNR m4, m3, m0, 14, m0
+ PALIGNR m1, m3, 2, m2
+ pslldq m0, m4, 2
+ pshuflw m0, m0, 11100101b
+ PRED4x4_LOWPASS m6, m1, m4, m3
+ PRED4x4_LOWPASS m4, m3, m0, m4
+ mova m3, [r0]
+ shr r2d, 13
+ pslldq m1, m3, 2
+ psrldq m2, m3, 2
+ pinsrw m1, [r0-2], 0
+ pinsrw m2, [r0+r2+14], 7
+ PRED4x4_LOWPASS m3, m2, m1, m3
+ PALIGNR m2, m3, m6, 2, m0
+ PALIGNR m5, m3, m6, 14, m0
+ psrldq m7, m3, 2
+ PRED4x4_LOWPASS m6, m4, m2, m6
+ PRED4x4_LOWPASS m3, m5, m7, m3
+ mova [r4+r3*4], m6
+ PALIGNR m3, m6, 14, m2
+ pslldq m6, 2
+ mova [r0+r3*1], m3
+ PALIGNR m3, m6, 14, m2
+ pslldq m6, 2
+ mova [r0+r3*2], m3
+ PALIGNR m3, m6, 14, m2
+ pslldq m6, 2
+ mova [r0+r1*1], m3
+ PALIGNR m3, m6, 14, m2
+ pslldq m6, 2
+ mova [r0+r3*4], m3
+ PALIGNR m3, m6, 14, m2
+ pslldq m6, 2
+ mova [r4+r3*1], m3
+ PALIGNR m3, m6, 14, m2
+ pslldq m6, 2
+ mova [r4+r3*2], m3
+ PALIGNR m3, m6, 14, m6
+ mova [r4+r1*1], m3
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED8x8L_DOWN_RIGHT
+INIT_XMM ssse3
+PRED8x8L_DOWN_RIGHT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED8x8L_DOWN_RIGHT
+%endif
+
+;-----------------------------------------------------------------------------
+; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED8x8L_VERTICAL_RIGHT 0
+; as with pred8x8l_down_right above, has_topleft is guaranteed by the standard, so it is not checked
+cglobal pred8x8l_vertical_right_10, 4, 5, 7
+ sub r0, r3
+ lea r4, [r0+r3*4]
+ lea r1, [r3*3]
+ mova m0, [r0+r3*1-16]
+ punpckhwd m0, [r0+r3*0-16]
+ mova m1, [r0+r1*1-16]
+ punpckhwd m1, [r0+r3*2-16]
+ punpckhdq m1, m0
+ mova m2, [r4+r3*1-16]
+ punpckhwd m2, [r4+r3*0-16]
+ mova m3, [r4+r1*1-16]
+ punpckhwd m3, [r4+r3*2-16]
+ punpckhdq m3, m2
+ punpckhqdq m3, m1
+ mova m0, [r4+r3*4-16]
+ mova m1, [r0]
+ PALIGNR m4, m3, m0, 14, m0
+ PALIGNR m1, m3, 2, m2
+ PRED4x4_LOWPASS m3, m1, m4, m3
+ mova m2, [r0]
+ shr r2d, 13
+ pslldq m1, m2, 2
+ psrldq m5, m2, 2
+ pinsrw m1, [r0-2], 0
+ pinsrw m5, [r0+r2+14], 7
+ PRED4x4_LOWPASS m2, m5, m1, m2
+ PALIGNR m6, m2, m3, 12, m1
+ PALIGNR m5, m2, m3, 14, m0
+ PRED4x4_LOWPASS m0, m6, m2, m5
+ pavgw m2, m5
+ mova [r0+r3*2], m0
+ mova [r0+r3*1], m2
+ pslldq m6, m3, 4
+ pslldq m1, m3, 2
+ PRED4x4_LOWPASS m1, m3, m6, m1
+ PALIGNR m2, m1, 14, m4
+ mova [r0+r1*1], m2
+ pslldq m1, 2
+ PALIGNR m0, m1, 14, m3
+ mova [r0+r3*4], m0
+ pslldq m1, 2
+ PALIGNR m2, m1, 14, m4
+ mova [r4+r3*1], m2
+ pslldq m1, 2
+ PALIGNR m0, m1, 14, m3
+ mova [r4+r3*2], m0
+ pslldq m1, 2
+ PALIGNR m2, m1, 14, m4
+ mova [r4+r1*1], m2
+ pslldq m1, 2
+ PALIGNR m0, m1, 14, m1
+ mova [r4+r3*4], m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED8x8L_VERTICAL_RIGHT
+INIT_XMM ssse3
+PRED8x8L_VERTICAL_RIGHT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED8x8L_VERTICAL_RIGHT
+%endif
+
+;-----------------------------------------------------------------------------
+; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED8x8L_HORIZONTAL_UP 0
+cglobal pred8x8l_horizontal_up_10, 4, 4, 6
+ mova m0, [r0+r3*0-16]
+ punpckhwd m0, [r0+r3*1-16]
+ shr r1d, 14
+ dec r1
+ and r1, r3
+ sub r1, r3
+ mova m4, [r0+r1*1-16]
+ lea r1, [r3*3]
+ lea r2, [r0+r3*4]
+ mova m1, [r0+r3*2-16]
+ punpckhwd m1, [r0+r1*1-16]
+ punpckhdq m0, m1
+ mova m2, [r2+r3*0-16]
+ punpckhwd m2, [r2+r3*1-16]
+ mova m3, [r2+r3*2-16]
+ punpckhwd m3, [r2+r1*1-16]
+ punpckhdq m2, m3
+ punpckhqdq m0, m2
+ PALIGNR m1, m0, m4, 14, m4
+ psrldq m2, m0, 2
+ pshufhw m2, m2, 10100100b
+ PRED4x4_LOWPASS m0, m1, m2, m0
+ psrldq m1, m0, 2
+ psrldq m2, m0, 4
+ pshufhw m1, m1, 10100100b
+ pshufhw m2, m2, 01010100b
+ pavgw m4, m0, m1
+ PRED4x4_LOWPASS m1, m2, m0, m1
+ punpckhwd m5, m4, m1
+ punpcklwd m4, m1
+ mova [r2+r3*0], m5
+ mova [r0+r3*0], m4
+ pshufd m0, m5, 11111001b
+ pshufd m1, m5, 11111110b
+ pshufd m2, m5, 11111111b
+ mova [r2+r3*1], m0
+ mova [r2+r3*2], m1
+ mova [r2+r1*1], m2
+ PALIGNR m2, m5, m4, 4, m0
+ PALIGNR m3, m5, m4, 8, m1
+ PALIGNR m5, m5, m4, 12, m4
+ mova [r0+r3*1], m2
+ mova [r0+r3*2], m3
+ mova [r0+r1*1], m5
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRED8x8L_HORIZONTAL_UP
+INIT_XMM ssse3
+PRED8x8L_HORIZONTAL_UP
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+PRED8x8L_HORIZONTAL_UP
+%endif
+
+
+;-----------------------------------------------------------------------------
+; void pred16x16_vertical(pixel *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MOV16 3-5
+ mova [%1+ 0], %2
+ mova [%1+mmsize], %3
+%if mmsize==8
+ mova [%1+ 16], %4
+ mova [%1+ 24], %5
+%endif
+%endmacro
+
+%macro PRED16x16_VERTICAL 0
+cglobal pred16x16_vertical_10, 2, 3
+ sub r0, r1
+ mov r2d, 8
+ mova m0, [r0+ 0]
+ mova m1, [r0+mmsize]
+%if mmsize==8
+ mova m2, [r0+16]
+ mova m3, [r0+24]
+%endif
+.loop:
+ MOV16 r0+r1*1, m0, m1, m2, m3
+ MOV16 r0+r1*2, m0, m1, m2, m3
+ lea r0, [r0+r1*2]
+ dec r2d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PRED16x16_VERTICAL
+INIT_XMM sse2
+PRED16x16_VERTICAL
+
+;-----------------------------------------------------------------------------
+; void pred16x16_horizontal(pixel *src, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED16x16_HORIZONTAL 0
+cglobal pred16x16_horizontal_10, 2, 3
+ mov r2d, 8
+.vloop:
+ movd m0, [r0+r1*0-4]
+ movd m1, [r0+r1*1-4]
+ SPLATW m0, m0, 1
+ SPLATW m1, m1, 1
+ MOV16 r0+r1*0, m0, m0, m0, m0
+ MOV16 r0+r1*1, m1, m1, m1, m1
+ lea r0, [r0+r1*2]
+ dec r2d
+ jg .vloop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PRED16x16_HORIZONTAL
+INIT_XMM sse2
+PRED16x16_HORIZONTAL
+
+;-----------------------------------------------------------------------------
+; void pred16x16_dc(pixel *src, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED16x16_DC 0
+cglobal pred16x16_dc_10, 2, 6
+ mov r5, r0
+ sub r0, r1
+ mova m0, [r0+0]
+ paddw m0, [r0+mmsize]
+%if mmsize==8
+ paddw m0, [r0+16]
+ paddw m0, [r0+24]
+%endif
+ HADDW m0, m2
+
+ lea r0, [r0+r1-2]
+ movzx r3d, word [r0]
+ movzx r4d, word [r0+r1]
+%rep 7
+ lea r0, [r0+r1*2]
+ movzx r2d, word [r0]
+ add r3d, r2d
+ movzx r2d, word [r0+r1]
+ add r4d, r2d
+%endrep
+ lea r3d, [r3+r4+16]
+
+ movd m1, r3d
+ paddw m0, m1
+ psrlw m0, 5
+ SPLATW m0, m0
+ mov r3d, 8
+.loop:
+ MOV16 r5+r1*0, m0, m0, m0, m0
+ MOV16 r5+r1*1, m0, m0, m0, m0
+ lea r5, [r5+r1*2]
+ dec r3d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PRED16x16_DC
+INIT_XMM sse2
+PRED16x16_DC
+
+;-----------------------------------------------------------------------------
+; void pred16x16_top_dc(pixel *src, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED16x16_TOP_DC 0
+cglobal pred16x16_top_dc_10, 2, 3
+ sub r0, r1
+ mova m0, [r0+0]
+ paddw m0, [r0+mmsize]
+%if mmsize==8
+ paddw m0, [r0+16]
+ paddw m0, [r0+24]
+%endif
+ HADDW m0, m2
+
+ SPLATW m0, m0
+ paddw m0, [pw_8]
+ psrlw m0, 4
+ mov r2d, 8
+.loop:
+ MOV16 r0+r1*1, m0, m0, m0, m0
+ MOV16 r0+r1*2, m0, m0, m0, m0
+ lea r0, [r0+r1*2]
+ dec r2d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PRED16x16_TOP_DC
+INIT_XMM sse2
+PRED16x16_TOP_DC
+
+;-----------------------------------------------------------------------------
+; void pred16x16_left_dc(pixel *src, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED16x16_LEFT_DC 0
+cglobal pred16x16_left_dc_10, 2, 6
+ mov r5, r0
+
+ sub r0, 2
+ movzx r3d, word [r0]
+ movzx r4d, word [r0+r1]
+%rep 7
+ lea r0, [r0+r1*2]
+ movzx r2d, word [r0]
+ add r3d, r2d
+ movzx r2d, word [r0+r1]
+ add r4d, r2d
+%endrep
+ lea r3d, [r3+r4+8]
+ shr r3d, 4
+
+ movd m0, r3d
+ SPLATW m0, m0
+ mov r3d, 8
+.loop:
+ MOV16 r5+r1*0, m0, m0, m0, m0
+ MOV16 r5+r1*1, m0, m0, m0, m0
+ lea r5, [r5+r1*2]
+ dec r3d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PRED16x16_LEFT_DC
+INIT_XMM sse2
+PRED16x16_LEFT_DC
+
+;-----------------------------------------------------------------------------
+; void pred16x16_128_dc(pixel *src, int stride)
+;-----------------------------------------------------------------------------
+%macro PRED16x16_128_DC 0
+cglobal pred16x16_128_dc_10, 2,3
+ mova m0, [pw_512]
+ mov r2d, 8
+.loop:
+ MOV16 r0+r1*0, m0, m0, m0, m0
+ MOV16 r0+r1*1, m0, m0, m0, m0
+ lea r0, [r0+r1*2]
+ dec r2d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PRED16x16_128_DC
+INIT_XMM sse2
+PRED16x16_128_DC
diff --git a/ffmpeg/libavcodec/x86/h264_intrapred_init.c b/ffmpeg/libavcodec/x86/h264_intrapred_init.c
new file mode 100644
index 0000000..f5b5e3e
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/h264_intrapred_init.c
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2010 Jason Garrett-Glaser
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/h264pred.h"
+
+#define PRED4x4(TYPE, DEPTH, OPT) \
+void ff_pred4x4_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
+ const uint8_t *topright, \
+ ptrdiff_t stride);
+
+PRED4x4(dc, 10, mmxext)
+PRED4x4(down_left, 10, sse2)
+PRED4x4(down_left, 10, avx)
+PRED4x4(down_right, 10, sse2)
+PRED4x4(down_right, 10, ssse3)
+PRED4x4(down_right, 10, avx)
+PRED4x4(vertical_left, 10, sse2)
+PRED4x4(vertical_left, 10, avx)
+PRED4x4(vertical_right, 10, sse2)
+PRED4x4(vertical_right, 10, ssse3)
+PRED4x4(vertical_right, 10, avx)
+PRED4x4(horizontal_up, 10, mmxext)
+PRED4x4(horizontal_down, 10, sse2)
+PRED4x4(horizontal_down, 10, ssse3)
+PRED4x4(horizontal_down, 10, avx)
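Each PRED4x4/PRED8x8/PRED8x8L/PRED16x16 line in this file merely generates an extern prototype for the corresponding assembly routine; for example, PRED4x4(dc, 10, mmxext) above expands to roughly:

void ff_pred4x4_dc_10_mmxext(uint8_t *src, const uint8_t *topright,
                             ptrdiff_t stride);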
+
+#define PRED8x8(TYPE, DEPTH, OPT) \
+void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
+ ptrdiff_t stride);
+
+PRED8x8(dc, 10, mmxext)
+PRED8x8(dc, 10, sse2)
+PRED8x8(top_dc, 10, sse2)
+PRED8x8(plane, 10, sse2)
+PRED8x8(vertical, 10, sse2)
+PRED8x8(horizontal, 10, sse2)
+
+#define PRED8x8L(TYPE, DEPTH, OPT)\
+void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
+ int has_topleft, \
+ int has_topright, \
+ ptrdiff_t stride);
+
+PRED8x8L(dc, 10, sse2)
+PRED8x8L(dc, 10, avx)
+PRED8x8L(128_dc, 10, mmxext)
+PRED8x8L(128_dc, 10, sse2)
+PRED8x8L(top_dc, 10, sse2)
+PRED8x8L(top_dc, 10, avx)
+PRED8x8L(vertical, 10, sse2)
+PRED8x8L(vertical, 10, avx)
+PRED8x8L(horizontal, 10, sse2)
+PRED8x8L(horizontal, 10, ssse3)
+PRED8x8L(horizontal, 10, avx)
+PRED8x8L(down_left, 10, sse2)
+PRED8x8L(down_left, 10, ssse3)
+PRED8x8L(down_left, 10, avx)
+PRED8x8L(down_right, 10, sse2)
+PRED8x8L(down_right, 10, ssse3)
+PRED8x8L(down_right, 10, avx)
+PRED8x8L(vertical_right, 10, sse2)
+PRED8x8L(vertical_right, 10, ssse3)
+PRED8x8L(vertical_right, 10, avx)
+PRED8x8L(horizontal_up, 10, sse2)
+PRED8x8L(horizontal_up, 10, ssse3)
+PRED8x8L(horizontal_up, 10, avx)
+
+#define PRED16x16(TYPE, DEPTH, OPT)\
+void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
+ ptrdiff_t stride);
+
+PRED16x16(dc, 10, mmxext)
+PRED16x16(dc, 10, sse2)
+PRED16x16(top_dc, 10, mmxext)
+PRED16x16(top_dc, 10, sse2)
+PRED16x16(128_dc, 10, mmxext)
+PRED16x16(128_dc, 10, sse2)
+PRED16x16(left_dc, 10, mmxext)
+PRED16x16(left_dc, 10, sse2)
+PRED16x16(vertical, 10, mmxext)
+PRED16x16(vertical, 10, sse2)
+PRED16x16(horizontal, 10, mmxext)
+PRED16x16(horizontal, 10, sse2)
+
+/* 8-bit versions */
+PRED16x16(vertical, 8, mmx)
+PRED16x16(vertical, 8, sse)
+PRED16x16(horizontal, 8, mmx)
+PRED16x16(horizontal, 8, mmxext)
+PRED16x16(horizontal, 8, ssse3)
+PRED16x16(dc, 8, mmxext)
+PRED16x16(dc, 8, sse2)
+PRED16x16(dc, 8, ssse3)
+PRED16x16(plane_h264, 8, mmx)
+PRED16x16(plane_h264, 8, mmxext)
+PRED16x16(plane_h264, 8, sse2)
+PRED16x16(plane_h264, 8, ssse3)
+PRED16x16(plane_rv40, 8, mmx)
+PRED16x16(plane_rv40, 8, mmxext)
+PRED16x16(plane_rv40, 8, sse2)
+PRED16x16(plane_rv40, 8, ssse3)
+PRED16x16(plane_svq3, 8, mmx)
+PRED16x16(plane_svq3, 8, mmxext)
+PRED16x16(plane_svq3, 8, sse2)
+PRED16x16(plane_svq3, 8, ssse3)
+PRED16x16(tm_vp8, 8, mmx)
+PRED16x16(tm_vp8, 8, mmxext)
+PRED16x16(tm_vp8, 8, sse2)
+
+PRED8x8(top_dc, 8, mmxext)
+PRED8x8(dc_rv40, 8, mmxext)
+PRED8x8(dc, 8, mmxext)
+PRED8x8(vertical, 8, mmx)
+PRED8x8(horizontal, 8, mmx)
+PRED8x8(horizontal, 8, mmxext)
+PRED8x8(horizontal, 8, ssse3)
+PRED8x8(plane, 8, mmx)
+PRED8x8(plane, 8, mmxext)
+PRED8x8(plane, 8, sse2)
+PRED8x8(plane, 8, ssse3)
+PRED8x8(tm_vp8, 8, mmx)
+PRED8x8(tm_vp8, 8, mmxext)
+PRED8x8(tm_vp8, 8, sse2)
+PRED8x8(tm_vp8, 8, ssse3)
+
+PRED8x8L(top_dc, 8, mmxext)
+PRED8x8L(top_dc, 8, ssse3)
+PRED8x8L(dc, 8, mmxext)
+PRED8x8L(dc, 8, ssse3)
+PRED8x8L(horizontal, 8, mmxext)
+PRED8x8L(horizontal, 8, ssse3)
+PRED8x8L(vertical, 8, mmxext)
+PRED8x8L(vertical, 8, ssse3)
+PRED8x8L(down_left, 8, mmxext)
+PRED8x8L(down_left, 8, sse2)
+PRED8x8L(down_left, 8, ssse3)
+PRED8x8L(down_right, 8, mmxext)
+PRED8x8L(down_right, 8, sse2)
+PRED8x8L(down_right, 8, ssse3)
+PRED8x8L(vertical_right, 8, mmxext)
+PRED8x8L(vertical_right, 8, sse2)
+PRED8x8L(vertical_right, 8, ssse3)
+PRED8x8L(vertical_left, 8, sse2)
+PRED8x8L(vertical_left, 8, ssse3)
+PRED8x8L(horizontal_up, 8, mmxext)
+PRED8x8L(horizontal_up, 8, ssse3)
+PRED8x8L(horizontal_down, 8, mmxext)
+PRED8x8L(horizontal_down, 8, sse2)
+PRED8x8L(horizontal_down, 8, ssse3)
+
+PRED4x4(dc, 8, mmxext)
+PRED4x4(down_left, 8, mmxext)
+PRED4x4(down_right, 8, mmxext)
+PRED4x4(vertical_left, 8, mmxext)
+PRED4x4(vertical_right, 8, mmxext)
+PRED4x4(horizontal_up, 8, mmxext)
+PRED4x4(horizontal_down, 8, mmxext)
+PRED4x4(tm_vp8, 8, mmx)
+PRED4x4(tm_vp8, 8, mmxext)
+PRED4x4(tm_vp8, 8, ssse3)
+PRED4x4(vertical_vp8, 8, mmxext)
+
+av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
+ const int bit_depth,
+ const int chroma_format_idc)
+{
+ int mm_flags = av_get_cpu_flags();
+
+ if (bit_depth == 8) {
+ if (EXTERNAL_MMX(mm_flags)) {
+ h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_8_mmx;
+ h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmx;
+ if (chroma_format_idc == 1) {
+ h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_mmx;
+ h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmx;
+ }
+ if (codec_id == AV_CODEC_ID_VP8) {
+ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmx;
+ h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmx;
+ h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmx;
+ } else {
+ if (chroma_format_idc == 1)
+ h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmx;
+ if (codec_id == AV_CODEC_ID_SVQ3) {
+ if (mm_flags & AV_CPU_FLAG_CMOV)
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_mmx;
+ } else if (codec_id == AV_CODEC_ID_RV40) {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_mmx;
+ } else {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_mmx;
+ }
+ }
+ }
+
+ if (EXTERNAL_MMXEXT(mm_flags)) {
+ h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmxext;
+ h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_mmxext;
+ if (chroma_format_idc == 1)
+ h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmxext;
+ h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_mmxext;
+ h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_mmxext;
+ h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_mmxext;
+ h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_mmxext;
+ h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_mmxext;
+ h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_mmxext;
+ h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_mmxext;
+ h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_mmxext;
+ h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_mmxext;
+ h->pred4x4 [DIAG_DOWN_RIGHT_PRED ] = ff_pred4x4_down_right_8_mmxext;
+ h->pred4x4 [VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_8_mmxext;
+ h->pred4x4 [HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_8_mmxext;
+ h->pred4x4 [DC_PRED ] = ff_pred4x4_dc_8_mmxext;
+ if (codec_id == AV_CODEC_ID_VP8 || codec_id == AV_CODEC_ID_H264) {
+ h->pred4x4 [DIAG_DOWN_LEFT_PRED] = ff_pred4x4_down_left_8_mmxext;
+ }
+ if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
+ h->pred4x4 [VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_8_mmxext;
+ }
+ if (codec_id != AV_CODEC_ID_RV40) {
+ h->pred4x4 [HOR_UP_PRED ] = ff_pred4x4_horizontal_up_8_mmxext;
+ }
+ if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
+ if (chroma_format_idc == 1) {
+ h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_8_mmxext;
+ h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_8_mmxext;
+ }
+ }
+ if (codec_id == AV_CODEC_ID_VP8) {
+ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmxext;
+ h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_rv40_8_mmxext;
+ h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmxext;
+ h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmxext;
+ h->pred4x4 [VERT_PRED ] = ff_pred4x4_vertical_vp8_8_mmxext;
+ } else {
+ if (chroma_format_idc == 1)
+ h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmxext;
+ if (codec_id == AV_CODEC_ID_SVQ3) {
+ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_svq3_8_mmxext;
+ } else if (codec_id == AV_CODEC_ID_RV40) {
+ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_rv40_8_mmxext;
+ } else {
+ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_h264_8_mmxext;
+ }
+ }
+ }
+
+ if (EXTERNAL_SSE(mm_flags)) {
+ h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_sse;
+ }
+
+ if (EXTERNAL_SSE2(mm_flags)) {
+ h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_sse2;
+ h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_sse2;
+ h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_sse2;
+ h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_sse2;
+ h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_sse2;
+ h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_sse2;
+ if (codec_id == AV_CODEC_ID_VP8) {
+ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_sse2;
+ h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_sse2;
+ } else {
+ if (chroma_format_idc == 1)
+ h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_sse2;
+ if (codec_id == AV_CODEC_ID_SVQ3) {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_sse2;
+ } else if (codec_id == AV_CODEC_ID_RV40) {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_sse2;
+ } else {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_sse2;
+ }
+ }
+ }
+
+ if (EXTERNAL_SSSE3(mm_flags)) {
+ h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_ssse3;
+ h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_ssse3;
+ if (chroma_format_idc == 1)
+ h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_ssse3;
+ h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_ssse3;
+ h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_ssse3;
+ h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_ssse3;
+ h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_ssse3;
+ h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_ssse3;
+ h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_ssse3;
+ h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_ssse3;
+ h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_ssse3;
+ h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_ssse3;
+ h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_ssse3;
+ if (codec_id == AV_CODEC_ID_VP8) {
+ h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_ssse3;
+ h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_ssse3;
+ } else {
+ if (chroma_format_idc == 1)
+ h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_ssse3;
+ if (codec_id == AV_CODEC_ID_SVQ3) {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_ssse3;
+ } else if (codec_id == AV_CODEC_ID_RV40) {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_ssse3;
+ } else {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_ssse3;
+ }
+ }
+ }
+ } else if (bit_depth == 10) {
+ if (EXTERNAL_MMXEXT(mm_flags)) {
+ h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext;
+ h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext;
+
+ if (chroma_format_idc == 1)
+ h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext;
+
+ h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext;
+
+ h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_mmxext;
+ h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_mmxext;
+ h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_mmxext;
+ h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_mmxext;
+ h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_mmxext;
+ h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_mmxext;
+ }
+ if (EXTERNAL_SSE2(mm_flags)) {
+ h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2;
+ h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_sse2;
+ h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_sse2;
+ h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_sse2;
+ h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_sse2;
+
+ if (chroma_format_idc == 1) {
+ h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_sse2;
+ h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2;
+ h->pred8x8[PLANE_PRED8x8 ] = ff_pred8x8_plane_10_sse2;
+ h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vertical_10_sse2;
+ h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_10_sse2;
+ }
+
+ h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_sse2;
+ h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_sse2;
+ h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_sse2;
+ h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_sse2;
+ h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_sse2;
+ h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2;
+ h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2;
+ h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_sse2;
+ h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_sse2;
+
+ h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_sse2;
+ h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_sse2;
+ h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_sse2;
+ h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_sse2;
+ h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_sse2;
+ h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_sse2;
+ }
+ if (EXTERNAL_SSSE3(mm_flags)) {
+ h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_ssse3;
+ h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3;
+ h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_ssse3;
+
+ h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_ssse3;
+ h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3;
+ h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3;
+ h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_ssse3;
+ h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_ssse3;
+ }
+ if (EXTERNAL_AVX(mm_flags)) {
+ h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx;
+ h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx;
+ h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_avx;
+ h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_avx;
+ h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_avx;
+
+ h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_avx;
+ h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_avx;
+ h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_avx;
+ h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_avx;
+ h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx;
+ h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx;
+ h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_avx;
+ h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_avx;
+ }
+ }
+}
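For context, this dispatcher is meant to be called once per codec setup with the stream's bit depth and chroma format. A minimal sketch of such a call site follows; the surrounding context and the exact caller are assumptions, not taken from this patch:

#if ARCH_X86
    H264PredContext hpc;
    /* hypothetical call site: pick the x86 implementations for a 10-bit,
     * 4:2:0 (chroma_format_idc == 1) H.264 stream after parsing the SPS */
    ff_h264_pred_init_x86(&hpc, AV_CODEC_ID_H264, 10, 1);
#endif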
diff --git a/ffmpeg/libavcodec/x86/h264_qpel.c b/ffmpeg/libavcodec/x86/h264_qpel.c
new file mode 100644
index 0000000..96dec82
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/h264_qpel.c
@@ -0,0 +1,644 @@
+/*
+ * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
+ * Copyright (c) 2011 Daniel Kang
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/h264qpel.h"
+#include "libavcodec/mpegvideo.h"
+#include "dsputil_mmx.h"
+
+#if HAVE_YASM
+void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h)
+{
+ ff_put_pixels8_mmxext(block, pixels, line_size, h);
+ ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
+}
+static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h)
+{
+ ff_avg_pixels8_mmxext(block, pixels, line_size, h);
+ ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
+}
+void ff_put_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+ int dstStride, int src1Stride, int h);
+void ff_avg_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+ int dstStride, int src1Stride, int h);
+void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+ int dstStride, int src1Stride, int h);
+void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+ int dstStride, int src1Stride, int h);
+void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+ int dstStride, int src1Stride, int h);
+void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+ int dstStride, int src1Stride, int h);
+void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
+#define ff_put_pixels8_l2_sse2 ff_put_pixels8_l2_mmxext
+#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext
+#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
+#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
+
+#define DEF_QPEL(OPNAME)\
+void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
+void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
+void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
+void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
+void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
+void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
+void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
+void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_op_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\
+void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\
+void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(uint8_t *src, int16_t *tmp, int srcStride);\
+void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\
+void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_mmxext(uint8_t *src, int16_t *tmp, int srcStride, int size);\
+void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(uint8_t *src, int16_t *tmp, int srcStride, int size);\
+void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\
+void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\
+void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);\
+void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);
+
+DEF_QPEL(avg)
+DEF_QPEL(put)
+
+#define QPEL_H264(OPNAME, OP, MMX)\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ int w=3;\
+ src -= 2*srcStride+2;\
+ while(w--){\
+ ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
+ tmp += 4;\
+ src += 4;\
+ }\
+ tmp -= 3*4;\
+ ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
+}\
+\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ src -= 2*srcStride;\
+ ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
+ src += 4;\
+ dst += 4;\
+ ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
+}\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
+ int w = (size+8)>>2;\
+ src -= 2*srcStride+2;\
+ while(w--){\
+ ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_op_mmxext(src, tmp, srcStride, size);\
+ tmp += 4;\
+ src += 4;\
+ }\
+}\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
+ int w = size>>4;\
+ do{\
+ ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\
+ tmp += 8;\
+ dst += 8;\
+ }while(w--);\
+}\
+\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
+}\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
+ ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}\
+\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
+ ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+ src += 8*srcStride;\
+ dst += 8*dstStride;\
+ ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
+ ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+}\
+\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
+ ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+ src += 8*dstStride;\
+ dst += 8*dstStride;\
+ src2 += 8*src2Stride;\
+ ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
+ ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+}\
+\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
+ ff_put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
+ ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
+}\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
+}\
+\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
+}\
+\
+static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
+{\
+ ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
+ ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
+}\
+
+
+#if ARCH_X86_64
+#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+
+void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);
+void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);
+
+#else // ARCH_X86_64
+#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
+ ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+ src += 8*dstStride;\
+ dst += 8*dstStride;\
+ src2 += 8*src2Stride;\
+ ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
+ ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+}
+#endif // ARCH_X86_64
+
+#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
+QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
+ ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+ src += 8*srcStride;\
+ dst += 8*dstStride;\
+ ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
+ ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+}\
+
+#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
+}\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
+ ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}
+
+static av_always_inline void ff_put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
+ int w = (size+8)>>3;
+ src -= 2*srcStride+2;
+ while(w--){
+ ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size);
+ tmp += 8;
+ src += 8;
+ }
+}
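The hv1/hv2 split used here runs the lowpass filter vertically into a 16-bit temporary first (the helper above), then horizontally from that temporary into the destination (the macro below). Assuming the standard H.264 half-sample kernel, the filter itself is:

/* rounding and clipping are applied after the final pass */
static inline int h264_lowpass6(int a, int b, int c, int d, int e, int f)
{
    return a - 5 * b + 20 * c + 20 * d - 5 * e + f;
}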
+
+#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
+ ff_put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
+ ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
+}\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
+}\
+static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
+}\
+
+#define ff_put_h264_qpel8_h_lowpass_l2_sse2 ff_put_h264_qpel8_h_lowpass_l2_mmxext
+#define ff_avg_h264_qpel8_h_lowpass_l2_sse2 ff_avg_h264_qpel8_h_lowpass_l2_mmxext
+#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext
+#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext
+
+#define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2
+#define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2
+#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
+#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2
+
+#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
+#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
+
+#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
+H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
+
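A note on the naming used by the H264_MC_* templates below: the two digits in mcXY give the quarter-pel offset of the prediction, X horizontally and Y vertically, so mc00 is a plain copy, mc20 a horizontal half-pel, mc02 a vertical half-pel and mc22 the centre position that needs both passes. SET_QPEL_FUNCS further down stores the sixteen variants with the horizontal offset varying fastest; a small sketch of that index mapping (illustration only):

/* table slot for a given quarter-pel offset (each component 0..3) */
static inline int qpel_tab_index(int qpel_x, int qpel_y)
{
    return 4 * qpel_y + qpel_x;    /* mc00 -> 0, mc10 -> 1, ..., mc33 -> 15 */
}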
+static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src,
+ ptrdiff_t stride)
+{
+ ff_put_pixels16_sse2(dst, src, stride, 16);
+}
+static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src,
+ ptrdiff_t stride)
+{
+ ff_avg_pixels16_sse2(dst, src, stride, 16);
+}
+#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext
+#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext
+
+#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
+}\
+
+#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
+}\
+
+#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+ ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+ ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
+}\
+
+#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
+ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
+ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
+ ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+ uint8_t * const halfHV= temp;\
+ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+ assert(((int)temp & 7) == 0);\
+ ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+ uint8_t * const halfHV= temp;\
+ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+ assert(((int)temp & 7) == 0);\
+ ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+ uint8_t * const halfHV= temp;\
+ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+ assert(((int)temp & 7) == 0);\
+ ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+ ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
+{\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+ uint8_t * const halfHV= temp;\
+ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+ assert(((int)temp & 7) == 0);\
+ ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+ ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
+}\
+
+#define H264_MC_4816(MMX)\
+H264_MC(put_, 4, MMX, 8)\
+H264_MC(put_, 8, MMX, 8)\
+H264_MC(put_, 16,MMX, 8)\
+H264_MC(avg_, 4, MMX, 8)\
+H264_MC(avg_, 8, MMX, 8)\
+H264_MC(avg_, 16,MMX, 8)\
+
+#define H264_MC_816(QPEL, XMM)\
+QPEL(put_, 8, XMM, 16)\
+QPEL(put_, 16,XMM, 16)\
+QPEL(avg_, 8, XMM, 16)\
+QPEL(avg_, 16,XMM, 16)\
+
+#undef PAVGB
+#define PAVGB "pavgb"
+QPEL_H264(put_, PUT_OP, mmxext)
+QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext)
+QPEL_H264_V_XMM(put_, PUT_OP, sse2)
+QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2)
+QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
+QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2)
+QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
+QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
+QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
+QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
+#undef PAVGB
+
+H264_MC_4816(mmxext)
+H264_MC_816(H264_MC_V, sse2)
+H264_MC_816(H264_MC_HV, sse2)
+H264_MC_816(H264_MC_H, ssse3)
+H264_MC_816(H264_MC_HV, ssse3)
+
+
+//10bit
+#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
+void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
+ (uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+
+#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \
+ LUMA_MC_OP(put, 4, DEPTH, TYPE, OPT) \
+ LUMA_MC_OP(avg, 4, DEPTH, TYPE, OPT) \
+ LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
+ LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
+ LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
+ LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
+
+#define LUMA_MC_816(DEPTH, TYPE, OPT) \
+ LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
+ LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
+ LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
+ LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
+
+LUMA_MC_ALL(10, mc00, mmxext)
+LUMA_MC_ALL(10, mc10, mmxext)
+LUMA_MC_ALL(10, mc20, mmxext)
+LUMA_MC_ALL(10, mc30, mmxext)
+LUMA_MC_ALL(10, mc01, mmxext)
+LUMA_MC_ALL(10, mc11, mmxext)
+LUMA_MC_ALL(10, mc21, mmxext)
+LUMA_MC_ALL(10, mc31, mmxext)
+LUMA_MC_ALL(10, mc02, mmxext)
+LUMA_MC_ALL(10, mc12, mmxext)
+LUMA_MC_ALL(10, mc22, mmxext)
+LUMA_MC_ALL(10, mc32, mmxext)
+LUMA_MC_ALL(10, mc03, mmxext)
+LUMA_MC_ALL(10, mc13, mmxext)
+LUMA_MC_ALL(10, mc23, mmxext)
+LUMA_MC_ALL(10, mc33, mmxext)
+
+LUMA_MC_816(10, mc00, sse2)
+LUMA_MC_816(10, mc10, sse2)
+LUMA_MC_816(10, mc10, sse2_cache64)
+LUMA_MC_816(10, mc10, ssse3_cache64)
+LUMA_MC_816(10, mc20, sse2)
+LUMA_MC_816(10, mc20, sse2_cache64)
+LUMA_MC_816(10, mc20, ssse3_cache64)
+LUMA_MC_816(10, mc30, sse2)
+LUMA_MC_816(10, mc30, sse2_cache64)
+LUMA_MC_816(10, mc30, ssse3_cache64)
+LUMA_MC_816(10, mc01, sse2)
+LUMA_MC_816(10, mc11, sse2)
+LUMA_MC_816(10, mc21, sse2)
+LUMA_MC_816(10, mc31, sse2)
+LUMA_MC_816(10, mc02, sse2)
+LUMA_MC_816(10, mc12, sse2)
+LUMA_MC_816(10, mc22, sse2)
+LUMA_MC_816(10, mc32, sse2)
+LUMA_MC_816(10, mc03, sse2)
+LUMA_MC_816(10, mc13, sse2)
+LUMA_MC_816(10, mc23, sse2)
+LUMA_MC_816(10, mc33, sse2)
+
+#define QPEL16_OPMC(OP, MC, MMX)\
+void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride){\
+ ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
+ ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
+ src += 8*stride;\
+ dst += 8*stride;\
+ ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
+ ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
+}
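+
+/* Each 10-bit qpel16 wrapper tiles four qpel8 calls: pixels are 2 bytes wide,
+ * so the +16 byte offset steps 8 pixels to the right and src/dst += 8*stride
+ * steps down 8 rows. */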
+
+#define QPEL16_OP(MC, MMX)\
+QPEL16_OPMC(put, MC, MMX)\
+QPEL16_OPMC(avg, MC, MMX)
+
+#define QPEL16(MMX)\
+QPEL16_OP(mc00, MMX)\
+QPEL16_OP(mc01, MMX)\
+QPEL16_OP(mc02, MMX)\
+QPEL16_OP(mc03, MMX)\
+QPEL16_OP(mc10, MMX)\
+QPEL16_OP(mc11, MMX)\
+QPEL16_OP(mc12, MMX)\
+QPEL16_OP(mc13, MMX)\
+QPEL16_OP(mc20, MMX)\
+QPEL16_OP(mc21, MMX)\
+QPEL16_OP(mc22, MMX)\
+QPEL16_OP(mc23, MMX)\
+QPEL16_OP(mc30, MMX)\
+QPEL16_OP(mc31, MMX)\
+QPEL16_OP(mc32, MMX)\
+QPEL16_OP(mc33, MMX)
+
+#if ARCH_X86_32 && HAVE_YASM && CONFIG_H264QPEL // ARCH_X86_64 implies SSE2+
+QPEL16(mmxext)
+#endif
+
+#endif /* HAVE_YASM */
+
+#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
+ do { \
+ c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
+ } while (0)
+
+#define H264_QPEL_FUNCS(x, y, CPU) \
+ do { \
+ c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
+ c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
+ c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
+ c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
+ } while (0)
+
+#define H264_QPEL_FUNCS_10(x, y, CPU) \
+ do { \
+ c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
+ c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
+ c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
+ c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
+ } while (0)
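+
+/* In all three tables the index is x + 4*y, the quarter-pel fractional
+ * position of the motion vector; SET_QPEL_FUNCS spells the same 16 entries
+ * out explicitly in mc00..mc33 order. */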
+
+av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
+{
+#if HAVE_YASM
+ int high_bit_depth = bit_depth > 8;
+ int mm_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMXEXT(mm_flags)) {
+ if (!high_bit_depth) {
+ SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
+ SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, );
+ SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, );
+ SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
+ SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, );
+ SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
+ } else if (bit_depth == 10) {
+#if ARCH_X86_32
+ SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
+ SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
+ SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
+ SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
+#endif
+ SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
+ SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
+ }
+ }
+
+ if (EXTERNAL_SSE2(mm_flags)) {
+ if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW) && !high_bit_depth) {
+ // these functions are slower than mmx on AMD, but faster on Intel
+ H264_QPEL_FUNCS(0, 0, sse2);
+ }
+
+ if (!high_bit_depth) {
+ H264_QPEL_FUNCS(0, 1, sse2);
+ H264_QPEL_FUNCS(0, 2, sse2);
+ H264_QPEL_FUNCS(0, 3, sse2);
+ H264_QPEL_FUNCS(1, 1, sse2);
+ H264_QPEL_FUNCS(1, 2, sse2);
+ H264_QPEL_FUNCS(1, 3, sse2);
+ H264_QPEL_FUNCS(2, 1, sse2);
+ H264_QPEL_FUNCS(2, 2, sse2);
+ H264_QPEL_FUNCS(2, 3, sse2);
+ H264_QPEL_FUNCS(3, 1, sse2);
+ H264_QPEL_FUNCS(3, 2, sse2);
+ H264_QPEL_FUNCS(3, 3, sse2);
+ }
+
+ if (bit_depth == 10) {
+ SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
+ SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
+ SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
+ SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
+ H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
+ H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
+ H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
+ }
+ }
+
+ if (EXTERNAL_SSSE3(mm_flags)) {
+ if (!high_bit_depth) {
+ H264_QPEL_FUNCS(1, 0, ssse3);
+ H264_QPEL_FUNCS(1, 1, ssse3);
+ H264_QPEL_FUNCS(1, 2, ssse3);
+ H264_QPEL_FUNCS(1, 3, ssse3);
+ H264_QPEL_FUNCS(2, 0, ssse3);
+ H264_QPEL_FUNCS(2, 1, ssse3);
+ H264_QPEL_FUNCS(2, 2, ssse3);
+ H264_QPEL_FUNCS(2, 3, ssse3);
+ H264_QPEL_FUNCS(3, 0, ssse3);
+ H264_QPEL_FUNCS(3, 1, ssse3);
+ H264_QPEL_FUNCS(3, 2, ssse3);
+ H264_QPEL_FUNCS(3, 3, ssse3);
+ }
+
+ if (bit_depth == 10) {
+ H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
+ H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
+ H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
+ }
+ }
+
+ if (EXTERNAL_AVX(mm_flags)) {
+ /* AVX implies 64 byte cache lines without the need to avoid unaligned
+ * memory accesses that cross the boundary between two cache lines.
+ * TODO: Port X264_CPU_CACHELINE_32/64 detection from x264 to avoid
+ * having to treat SSE2 functions with such properties as AVX. */
+ if (bit_depth == 10) {
+ H264_QPEL_FUNCS_10(1, 0, sse2);
+ H264_QPEL_FUNCS_10(2, 0, sse2);
+ H264_QPEL_FUNCS_10(3, 0, sse2);
+ }
+ }
+#endif
+}
diff --git a/ffmpeg/libavcodec/x86/h264_qpel_10bit.asm b/ffmpeg/libavcodec/x86/h264_qpel_10bit.asm
new file mode 100644
index 0000000..e14df84
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/h264_qpel_10bit.asm
@@ -0,0 +1,884 @@
+;*****************************************************************************
+;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
+;*****************************************************************************
+;* Copyright (C) 2011 x264 project
+;*
+;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+cextern pw_16
+cextern pw_1
+cextern pb_0
+
+pw_pixel_max: times 8 dw ((1 << 10)-1)
+
+pad10: times 8 dw 10*1023
+pad20: times 8 dw 20*1023
+pad30: times 8 dw 30*1023
+depad: times 4 dd 32*20*1023 + 512
+depad2: times 8 dw 20*1023 + 16*1022 + 16
+unpad: times 8 dw 16*1022/32 ; needs to be mod 16
+
+tap1: times 4 dw 1, -5
+tap2: times 4 dw 20, 20
+tap3: times 4 dw -5, 1
+pd_0f: times 4 dd 0xffff
+
+SECTION .text
+
+
+%macro AVG_MOV 2
+ pavgw %2, %1
+ mova %1, %2
+%endmacro
+
+%macro ADDW 3
+%if mmsize == 8
+ paddw %1, %2
+%else
+ movu %3, %2
+ paddw %1, %3
+%endif
+%endmacro
+
+%macro FILT_H 4
+ paddw %1, %4
+ psubw %1, %2 ; a-b
+ psraw %1, 2 ; (a-b)/4
+ psubw %1, %2 ; (a-b)/4-b
+ paddw %1, %3 ; (a-b)/4-b+c
+ psraw %1, 2 ; ((a-b)/4-b+c)/4
+ paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+%endmacro
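+
+; Expanding the per-step comments above:
+;   ((a-b)/4 - b + c)/4 + c = (a-b)/16 - b/4 + c/4 + c = (a - 5*b + 20*c)/16
+; i.e. the 6-tap (1,-5,20,20,-5,1) filter once a, b and c hold the sums of
+; the outer, middle and inner sample pairs.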
+
+%macro PRELOAD_V 0
+ lea r3, [r2*3]
+ sub r1, r3
+ movu m0, [r1+r2]
+ movu m1, [r1+r2*2]
+ add r1, r3
+ movu m2, [r1]
+ movu m3, [r1+r2]
+ movu m4, [r1+r2*2]
+ add r1, r3
+%endmacro
+
+%macro FILT_V 8
+ movu %6, [r1]
+ paddw %1, %6
+ mova %7, %2
+ paddw %7, %5
+ mova %8, %3
+ paddw %8, %4
+ FILT_H %1, %7, %8, [pw_16]
+ psraw %1, 1
+ CLIPW %1, [pb_0], [pw_pixel_max]
+%endmacro
+
+%macro MC 1
+%define OP_MOV mova
+INIT_MMX mmxext
+%1 put, 4
+INIT_XMM sse2
+%1 put, 8
+
+%define OP_MOV AVG_MOV
+INIT_MMX mmxext
+%1 avg, 4
+INIT_XMM sse2
+%1 avg, 8
+%endmacro
+
+%macro MCAxA_OP 7
+%if ARCH_X86_32
+cglobal %1_h264_qpel%4_%2_10, %5,%6,%7
+ call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
+ mov r0, r0m
+ mov r1, r1m
+ add r0, %3*2
+ add r1, %3*2
+ call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
+ mov r0, r0m
+ mov r1, r1m
+ lea r0, [r0+r2*%3]
+ lea r1, [r1+r2*%3]
+ call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
+ mov r0, r0m
+ mov r1, r1m
+ lea r0, [r0+r2*%3+%3*2]
+ lea r1, [r1+r2*%3+%3*2]
+ call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
+ RET
+%else ; ARCH_X86_64
+cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
+ mov r%6, r0
+%assign p1 %6+1
+ mov r %+ p1, r1
+ call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
+ lea r0, [r%6+%3*2]
+ lea r1, [r %+ p1+%3*2]
+ call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
+ lea r0, [r%6+r2*%3]
+ lea r1, [r %+ p1+r2*%3]
+ call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
+ lea r0, [r%6+r2*%3+%3*2]
+ lea r1, [r %+ p1+r2*%3+%3*2]
+%if UNIX64 == 0 ; fall through to function
+ call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
+ RET
+%endif
+%endif
+%endmacro
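+
+; MCAxA_OP builds the %4-wide function from four calls to the %3-wide stub:
+; the two horizontal halves are %3*2 bytes apart (%3 pixels at 10 bit) and
+; the two vertical halves are r2*%3 bytes (%3 rows) apart.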
+
+;cpu, put/avg, mc, 4/8, ...
+%macro cglobal_mc 6
+%assign i %3*2
+%if ARCH_X86_32 || cpuflag(sse2)
+MCAxA_OP %1, %2, %3, i, %4,%5,%6
+%endif
+
+cglobal %1_h264_qpel%3_%2_10, %4,%5,%6
+%if UNIX64 == 0 ; no prologue or epilogue for UNIX64
+ call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
+ RET
+%endif
+
+stub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro COPY4 0
+ movu m0, [r1 ]
+ OP_MOV [r0 ], m0
+ movu m0, [r1+r2 ]
+ OP_MOV [r0+r2 ], m0
+ movu m0, [r1+r2*2]
+ OP_MOV [r0+r2*2], m0
+ movu m0, [r1+r3 ]
+ OP_MOV [r0+r3 ], m0
+%endmacro
+
+%macro MC00 1
+INIT_MMX mmxext
+cglobal_mc %1, mc00, 4, 3,4,0
+ lea r3, [r2*3]
+ COPY4
+ ret
+
+INIT_XMM sse2
+cglobal %1_h264_qpel8_mc00_10, 3,4
+ lea r3, [r2*3]
+ COPY4
+ lea r0, [r0+r2*4]
+ lea r1, [r1+r2*4]
+ COPY4
+ RET
+
+cglobal %1_h264_qpel16_mc00_10, 3,4
+ mov r3d, 8
+.loop:
+ movu m0, [r1 ]
+ movu m1, [r1 +16]
+ OP_MOV [r0 ], m0
+ OP_MOV [r0 +16], m1
+ movu m0, [r1+r2 ]
+ movu m1, [r1+r2+16]
+ OP_MOV [r0+r2 ], m0
+ OP_MOV [r0+r2+16], m1
+ lea r0, [r0+r2*2]
+ lea r1, [r1+r2*2]
+ dec r3d
+ jg .loop
+ REP_RET
+%endmacro
+
+%define OP_MOV mova
+MC00 put
+
+%define OP_MOV AVG_MOV
+MC00 avg
+
+;-----------------------------------------------------------------------------
+; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC_CACHE 1
+%define OP_MOV mova
+INIT_MMX mmxext
+%1 put, 4
+INIT_XMM sse2, cache64
+%1 put, 8
+INIT_XMM ssse3, cache64
+%1 put, 8
+INIT_XMM sse2
+%1 put, 8
+
+%define OP_MOV AVG_MOV
+INIT_MMX mmxext
+%1 avg, 4
+INIT_XMM sse2, cache64
+%1 avg, 8
+INIT_XMM ssse3, cache64
+%1 avg, 8
+INIT_XMM sse2
+%1 avg, 8
+%endmacro
+
+%macro MC20 2
+cglobal_mc %1, mc20, %2, 3,4,9
+ mov r3d, %2
+ mova m1, [pw_pixel_max]
+%if num_mmregs > 8
+ mova m8, [pw_16]
+ %define p16 m8
+%else
+ %define p16 [pw_16]
+%endif
+.nextrow:
+%if %0 == 4
+ movu m2, [r1-4]
+ movu m3, [r1-2]
+ movu m4, [r1+0]
+ ADDW m2, [r1+6], m5
+ ADDW m3, [r1+4], m5
+ ADDW m4, [r1+2], m5
+%else ; movu is slow on these processors
+%if mmsize==16
+ movu m2, [r1-4]
+ movu m0, [r1+6]
+ mova m6, m0
+ psrldq m0, 6
+
+ paddw m6, m2
+ PALIGNR m3, m0, m2, 2, m5
+ PALIGNR m7, m0, m2, 8, m5
+ paddw m3, m7
+ PALIGNR m4, m0, m2, 4, m5
+ PALIGNR m7, m0, m2, 6, m5
+ paddw m4, m7
+ SWAP 2, 6
+%else
+ movu m2, [r1-4]
+ movu m6, [r1+4]
+ PALIGNR m3, m6, m2, 2, m5
+ paddw m3, m6
+ PALIGNR m4, m6, m2, 4, m5
+ PALIGNR m7, m6, m2, 6, m5
+ paddw m4, m7
+ paddw m2, [r1+6]
+%endif
+%endif
+
+ FILT_H m2, m3, m4, p16
+ psraw m2, 1
+ pxor m0, m0
+ CLIPW m2, m0, m1
+ OP_MOV [r0], m2
+ add r0, r2
+ add r1, r2
+ dec r3d
+ jg .nextrow
+ rep ret
+%endmacro
+
+MC_CACHE MC20
+
+;-----------------------------------------------------------------------------
+; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC30 2
+cglobal_mc %1, mc30, %2, 3,5,9
+ lea r4, [r1+2]
+ jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
+%endmacro
+
+MC_CACHE MC30
+
+;-----------------------------------------------------------------------------
+; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC10 2
+cglobal_mc %1, mc10, %2, 3,5,9
+ mov r4, r1
+.body:
+ mov r3d, %2
+ mova m1, [pw_pixel_max]
+%if num_mmregs > 8
+ mova m8, [pw_16]
+ %define p16 m8
+%else
+ %define p16 [pw_16]
+%endif
+.nextrow:
+%if %0 == 4
+ movu m2, [r1-4]
+ movu m3, [r1-2]
+ movu m4, [r1+0]
+ ADDW m2, [r1+6], m5
+ ADDW m3, [r1+4], m5
+ ADDW m4, [r1+2], m5
+%else ; movu is slow on these processors
+%if mmsize==16
+ movu m2, [r1-4]
+ movu m0, [r1+6]
+ mova m6, m0
+ psrldq m0, 6
+
+ paddw m6, m2
+ PALIGNR m3, m0, m2, 2, m5
+ PALIGNR m7, m0, m2, 8, m5
+ paddw m3, m7
+ PALIGNR m4, m0, m2, 4, m5
+ PALIGNR m7, m0, m2, 6, m5
+ paddw m4, m7
+ SWAP 2, 6
+%else
+ movu m2, [r1-4]
+ movu m6, [r1+4]
+ PALIGNR m3, m6, m2, 2, m5
+ paddw m3, m6
+ PALIGNR m4, m6, m2, 4, m5
+ PALIGNR m7, m6, m2, 6, m5
+ paddw m4, m7
+ paddw m2, [r1+6]
+%endif
+%endif
+
+ FILT_H m2, m3, m4, p16
+ psraw m2, 1
+ pxor m0, m0
+ CLIPW m2, m0, m1
+ movu m3, [r4]
+ pavgw m2, m3
+ OP_MOV [r0], m2
+ add r0, r2
+ add r1, r2
+ add r4, r2
+ dec r3d
+ jg .nextrow
+ rep ret
+%endmacro
+
+MC_CACHE MC10
+
+;-----------------------------------------------------------------------------
+; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro V_FILT 10
+v_filt%9_%10_10
+ add r4, r2
+.no_addr4:
+ FILT_V m0, m1, m2, m3, m4, m5, m6, m7
+ add r1, r2
+ add r0, r2
+ ret
+%endmacro
+
+INIT_MMX mmxext
+RESET_MM_PERMUTATION
+%assign i 0
+%rep 4
+V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
+SWAP 0,1,2,3,4,5
+%assign i i+1
+%endrep
+
+INIT_XMM sse2
+RESET_MM_PERMUTATION
+%assign i 0
+%rep 6
+V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
+SWAP 0,1,2,3,4,5
+%assign i i+1
+%endrep
+
+%macro MC02 2
+cglobal_mc %1, mc02, %2, 3,4,8
+ PRELOAD_V
+
+ sub r0, r2
+%assign j 0
+%rep %2
+ %assign i (j % 6)
+ call v_filt%2_ %+ i %+ _10.no_addr4
+ OP_MOV [r0], m0
+ SWAP 0,1,2,3,4,5
+ %assign j j+1
+%endrep
+ ret
+%endmacro
+
+MC MC02
+
+;-----------------------------------------------------------------------------
+; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC01 2
+cglobal_mc %1, mc01, %2, 3,5,8
+ mov r4, r1
+.body:
+ PRELOAD_V
+
+ sub r4, r2
+ sub r0, r2
+%assign j 0
+%rep %2
+ %assign i (j % 6)
+ call v_filt%2_ %+ i %+ _10
+ movu m7, [r4]
+ pavgw m0, m7
+ OP_MOV [r0], m0
+ SWAP 0,1,2,3,4,5
+ %assign j j+1
+%endrep
+ ret
+%endmacro
+
+MC MC01
+
+;-----------------------------------------------------------------------------
+; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC03 2
+cglobal_mc %1, mc03, %2, 3,5,8
+ lea r4, [r1+r2]
+ jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
+%endmacro
+
+MC MC03
+
+;-----------------------------------------------------------------------------
+; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro H_FILT_AVG 2-3
+h_filt%1_%2_10:
+;FILT_H with fewer registers and averaged with the FILT_V result
+;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used in the next iteration
+;unfortunately I need three registers, so m5 will have to be re-read from memory
+ movu m5, [r4-4]
+ ADDW m5, [r4+6], m7
+ movu m6, [r4-2]
+ ADDW m6, [r4+4], m7
+ paddw m5, [pw_16]
+ psubw m5, m6 ; a-b
+ psraw m5, 2 ; (a-b)/4
+ psubw m5, m6 ; (a-b)/4-b
+ movu m6, [r4+0]
+ ADDW m6, [r4+2], m7
+ paddw m5, m6 ; (a-b)/4-b+c
+ psraw m5, 2 ; ((a-b)/4-b+c)/4
+ paddw m5, m6 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ psraw m5, 1
+ CLIPW m5, [pb_0], [pw_pixel_max]
+;avg FILT_V, FILT_H
+ pavgw m0, m5
+%if %0!=4
+ movu m5, [r1+r5]
+%endif
+ ret
+%endmacro
+
+INIT_MMX mmxext
+RESET_MM_PERMUTATION
+%assign i 0
+%rep 3
+H_FILT_AVG 4, i
+SWAP 0,1,2,3,4,5
+%assign i i+1
+%endrep
+H_FILT_AVG 4, i, 0
+
+INIT_XMM sse2
+RESET_MM_PERMUTATION
+%assign i 0
+%rep 6
+%if i==1
+H_FILT_AVG 8, i, 0
+%else
+H_FILT_AVG 8, i
+%endif
+SWAP 0,1,2,3,4,5
+%assign i i+1
+%endrep
+
+%macro MC11 2
+; this REALLY needs x86_64
+cglobal_mc %1, mc11, %2, 3,6,8
+ mov r4, r1
+.body:
+ PRELOAD_V
+
+ sub r0, r2
+ sub r4, r2
+ mov r5, r2
+ neg r5
+%assign j 0
+%rep %2
+ %assign i (j % 6)
+ call v_filt%2_ %+ i %+ _10
+ call h_filt%2_ %+ i %+ _10
+%if %2==8 && i==1
+ movu m5, [r1+r5]
+%endif
+ OP_MOV [r0], m0
+ SWAP 0,1,2,3,4,5
+ %assign j j+1
+%endrep
+ ret
+%endmacro
+
+MC MC11
+
+;-----------------------------------------------------------------------------
+; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC31 2
+cglobal_mc %1, mc31, %2, 3,6,8
+ mov r4, r1
+ add r1, 2
+ jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
+%endmacro
+
+MC MC31
+
+;-----------------------------------------------------------------------------
+; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC13 2
+cglobal_mc %1, mc13, %2, 3,7,12
+ lea r4, [r1+r2]
+ jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
+%endmacro
+
+MC MC13
+
+;-----------------------------------------------------------------------------
+; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC33 2
+cglobal_mc %1, mc33, %2, 3,6,8
+ lea r4, [r1+r2]
+ add r1, 2
+ jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
+%endmacro
+
+MC MC33
+
+;-----------------------------------------------------------------------------
+; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro FILT_H2 3
+ psubw %1, %2 ; a-b
+ psubw %2, %3 ; b-c
+ psllw %2, 2
+ psubw %1, %2 ; a-5*b+4*c
+ psllw %3, 4
+ paddw %1, %3 ; a-5*b+20*c
+%endmacro
+
+%macro FILT_VNRD 8
+ movu %6, [r1]
+ paddw %1, %6
+ mova %7, %2
+ paddw %7, %5
+ mova %8, %3
+ paddw %8, %4
+ FILT_H2 %1, %7, %8
+%endmacro
+
+%macro HV 1
+%if mmsize==16
+%define PAD 12
+%define COUNT 2
+%else
+%define PAD 4
+%define COUNT 3
+%endif
+put_hv%1_10:
+ neg r2 ; This actually saves instructions
+ lea r1, [r1+r2*2-mmsize+PAD]
+ lea r4, [rsp+PAD+gprsize]
+ mov r3d, COUNT
+.v_loop:
+ movu m0, [r1]
+ sub r1, r2
+ movu m1, [r1]
+ sub r1, r2
+ movu m2, [r1]
+ sub r1, r2
+ movu m3, [r1]
+ sub r1, r2
+ movu m4, [r1]
+ sub r1, r2
+%assign i 0
+%rep %1-1
+ FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
+ psubw m0, [pad20]
+ movu [r4+i*mmsize*3], m0
+ sub r1, r2
+ SWAP 0,1,2,3,4,5
+%assign i i+1
+%endrep
+ FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
+ psubw m0, [pad20]
+ movu [r4+i*mmsize*3], m0
+ add r4, mmsize
+ lea r1, [r1+r2*8+mmsize]
+%if %1==8
+ lea r1, [r1+r2*4]
+%endif
+ dec r3d
+ jg .v_loop
+ neg r2
+ ret
+%endmacro
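+
+; put_hv%1_10 runs the vertical 6-tap pass over COUNT column strips of mmsize
+; bytes, storing one row every mmsize*3 bytes into the caller's stack buffer.
+; pad20 (20*1023) is subtracted so the unscaled intermediates stay inside the
+; signed 16-bit range; the horizontal pass compensates for it via depad.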
+
+INIT_MMX mmxext
+HV 4
+INIT_XMM sse2
+HV 8
+
+%macro H_LOOP 1
+%if num_mmregs > 8
+ %define s1 m8
+ %define s2 m9
+ %define s3 m10
+ %define d1 m11
+%else
+ %define s1 [tap1]
+ %define s2 [tap2]
+ %define s3 [tap3]
+ %define d1 [depad]
+%endif
+h%1_loop_op:
+ movu m1, [r1+mmsize-4]
+ movu m2, [r1+mmsize-2]
+ mova m3, [r1+mmsize+0]
+ movu m4, [r1+mmsize+2]
+ movu m5, [r1+mmsize+4]
+ movu m6, [r1+mmsize+6]
+%if num_mmregs > 8
+ pmaddwd m1, s1
+ pmaddwd m2, s1
+ pmaddwd m3, s2
+ pmaddwd m4, s2
+ pmaddwd m5, s3
+ pmaddwd m6, s3
+ paddd m1, d1
+ paddd m2, d1
+%else
+ mova m0, s1
+ pmaddwd m1, m0
+ pmaddwd m2, m0
+ mova m0, s2
+ pmaddwd m3, m0
+ pmaddwd m4, m0
+ mova m0, s3
+ pmaddwd m5, m0
+ pmaddwd m6, m0
+ mova m0, d1
+ paddd m1, m0
+ paddd m2, m0
+%endif
+ paddd m3, m5
+ paddd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ psrad m1, 10
+ psrad m2, 10
+ pslld m2, 16
+ pand m1, [pd_0f]
+ por m1, m2
+%if num_mmregs <= 8
+ pxor m0, m0
+%endif
+ CLIPW m1, m0, m7
+ add r1, mmsize*3
+ ret
+%endmacro
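+
+; h%1_loop_op is the second (horizontal) pass over that intermediate: tap1,
+; tap2 and tap3 are the (1,-5), (20,20) and (-5,1) word pairs fed to pmaddwd
+; on overlapping samples, depad restores the pad20 bias (times the tap sum,
+; 32*20*1023) and adds the 512 rounding term, psrad by 10 rescales, and the
+; pslld/pand/por sequence repacks the even/odd dword results into words.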
+
+INIT_MMX mmxext
+H_LOOP 4
+INIT_XMM sse2
+H_LOOP 8
+
+%macro MC22 2
+cglobal_mc %1, mc22, %2, 3,7,12
+%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
+ mov r6, rsp ; backup stack pointer
+ and rsp, ~(mmsize-1) ; align stack
+ sub rsp, PAD
+
+ call put_hv%2_10
+
+ mov r3d, %2
+ mova m7, [pw_pixel_max]
+%if num_mmregs > 8
+ pxor m0, m0
+ mova m8, [tap1]
+ mova m9, [tap2]
+ mova m10, [tap3]
+ mova m11, [depad]
+%endif
+ mov r1, rsp
+.h_loop:
+ call h%2_loop_op
+
+ OP_MOV [r0], m1
+ add r0, r2
+ dec r3d
+ jg .h_loop
+
+ mov rsp, r6 ; restore stack pointer
+ ret
+%endmacro
+
+MC MC22
+
+;-----------------------------------------------------------------------------
+; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC12 2
+cglobal_mc %1, mc12, %2, 3,7,12
+%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
+ mov r6, rsp ; backup stack pointer
+ and rsp, ~(mmsize-1) ; align stack
+ sub rsp, PAD
+
+ call put_hv%2_10
+
+ xor r4d, r4d
+.body:
+ mov r3d, %2
+ pxor m0, m0
+ mova m7, [pw_pixel_max]
+%if num_mmregs > 8
+ mova m8, [tap1]
+ mova m9, [tap2]
+ mova m10, [tap3]
+ mova m11, [depad]
+%endif
+ mov r1, rsp
+.h_loop:
+ call h%2_loop_op
+
+ movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
+ paddw m3, [depad2]
+ psrlw m3, 5
+ psubw m3, [unpad]
+ CLIPW m3, m0, m7
+ pavgw m1, m3
+
+ OP_MOV [r0], m1
+ add r0, r2
+ dec r3d
+ jg .h_loop
+
+ mov rsp, r6 ; restore stack pointer
+ ret
+%endmacro
+
+MC MC12
+
+;-----------------------------------------------------------------------------
+; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC32 2
+cglobal_mc %1, mc32, %2, 3,7,12
+%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
+ mov r6, rsp ; backup stack pointer
+ and rsp, ~(mmsize-1) ; align stack
+ sub rsp, PAD
+
+ call put_hv%2_10
+
+ mov r4d, 2 ; sizeof(pixel)
+ jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
+%endmacro
+
+MC MC32
+
+;-----------------------------------------------------------------------------
+; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro H_NRD 1
+put_h%1_10:
+ add rsp, gprsize
+ mov r3d, %1
+ xor r4d, r4d
+ mova m6, [pad20]
+.nextrow:
+ movu m2, [r5-4]
+ movu m3, [r5-2]
+ movu m4, [r5+0]
+ ADDW m2, [r5+6], m5
+ ADDW m3, [r5+4], m5
+ ADDW m4, [r5+2], m5
+
+ FILT_H2 m2, m3, m4
+ psubw m2, m6
+ mova [rsp+r4], m2
+ add r4d, mmsize*3
+ add r5, r2
+ dec r3d
+ jg .nextrow
+ sub rsp, gprsize
+ ret
+%endmacro
+
+INIT_MMX mmxext
+H_NRD 4
+INIT_XMM sse2
+H_NRD 8
+
+%macro MC21 2
+cglobal_mc %1, mc21, %2, 3,7,12
+ mov r5, r1
+.body:
+%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
+ mov r6, rsp ; backup stack pointer
+ and rsp, ~(mmsize-1) ; align stack
+
+ sub rsp, PAD
+ call put_h%2_10
+
+ sub rsp, PAD
+ call put_hv%2_10
+
+ mov r4d, PAD-mmsize ; H buffer
+ jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
+%endmacro
+
+MC MC21
+
+;-----------------------------------------------------------------------------
+; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+%macro MC23 2
+cglobal_mc %1, mc23, %2, 3,7,12
+ lea r5, [r1+r2]
+ jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body
+%endmacro
+
+MC MC23
diff --git a/ffmpeg/libavcodec/x86/h264_qpel_8bit.asm b/ffmpeg/libavcodec/x86/h264_qpel_8bit.asm
new file mode 100644
index 0000000..2d287ba
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/h264_qpel_8bit.asm
@@ -0,0 +1,862 @@
+;*****************************************************************************
+;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
+;*****************************************************************************
+;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
+;* Copyright (C) 2012 Daniel Kang
+;*
+;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+cextern pw_16
+cextern pw_5
+cextern pb_0
+
+SECTION .text
+
+
+%macro op_avgh 3
+ movh %3, %2
+ pavgb %1, %3
+ movh %2, %1
+%endmacro
+
+%macro op_avg 2-3
+ pavgb %1, %2
+ mova %2, %1
+%endmacro
+
+%macro op_puth 2-3
+ movh %2, %1
+%endmacro
+
+%macro op_put 2-3
+ mova %2, %1
+%endmacro
+
+%macro QPEL4_H_LOWPASS_OP 1
+cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
+ movsxdifnidn r2, r2d
+ movsxdifnidn r3, r3d
+ pxor m7, m7
+ mova m4, [pw_5]
+ mova m5, [pw_16]
+ mov r4d, 4
+.loop:
+ movh m1, [r1-1]
+ movh m2, [r1+0]
+ movh m3, [r1+1]
+ movh m0, [r1+2]
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ punpcklbw m0, m7
+ paddw m1, m0
+ paddw m2, m3
+ movh m0, [r1-2]
+ movh m3, [r1+3]
+ punpcklbw m0, m7
+ punpcklbw m3, m7
+ paddw m0, m3
+ psllw m2, 2
+ psubw m2, m1
+ pmullw m2, m4
+ paddw m0, m5
+ paddw m0, m2
+ psraw m0, 5
+ packuswb m0, m0
+ op_%1h m0, [r0], m6
+ add r0, r2
+ add r1, r3
+ dec r4d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL4_H_LOWPASS_OP put
+QPEL4_H_LOWPASS_OP avg
+
+%macro QPEL8_H_LOWPASS_OP 1
+cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
+ movsxdifnidn r2, r2d
+ movsxdifnidn r3, r3d
+ mov r4d, 8
+ pxor m7, m7
+ mova m6, [pw_5]
+.loop:
+ mova m0, [r1]
+ mova m2, [r1+1]
+ mova m1, m0
+ mova m3, m2
+ punpcklbw m0, m7
+ punpckhbw m1, m7
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ paddw m0, m2
+ paddw m1, m3
+ psllw m0, 2
+ psllw m1, 2
+ mova m2, [r1-1]
+ mova m4, [r1+2]
+ mova m3, m2
+ mova m5, m4
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ punpcklbw m4, m7
+ punpckhbw m5, m7
+ paddw m2, m4
+ paddw m5, m3
+ psubw m0, m2
+ psubw m1, m5
+ pmullw m0, m6
+ pmullw m1, m6
+ movd m2, [r1-2]
+ movd m5, [r1+7]
+ punpcklbw m2, m7
+ punpcklbw m5, m7
+ paddw m2, m3
+ paddw m4, m5
+ mova m5, [pw_16]
+ paddw m2, m5
+ paddw m4, m5
+ paddw m0, m2
+ paddw m1, m4
+ psraw m0, 5
+ psraw m1, 5
+ packuswb m0, m1
+ op_%1 m0, [r0], m4
+ add r0, r2
+ add r1, r3
+ dec r4d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL8_H_LOWPASS_OP put
+QPEL8_H_LOWPASS_OP avg
+
+%macro QPEL8_H_LOWPASS_OP_XMM 1
+cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
+ movsxdifnidn r2, r2d
+ movsxdifnidn r3, r3d
+ mov r4d, 8
+ pxor m7, m7
+ mova m6, [pw_5]
+.loop:
+ movu m1, [r1-2]
+ mova m0, m1
+ punpckhbw m1, m7
+ punpcklbw m0, m7
+ mova m2, m1
+ mova m3, m1
+ mova m4, m1
+ mova m5, m1
+ palignr m4, m0, 2
+ palignr m3, m0, 4
+ palignr m2, m0, 6
+ palignr m1, m0, 8
+ palignr m5, m0, 10
+ paddw m0, m5
+ paddw m2, m3
+ paddw m1, m4
+ psllw m2, 2
+ psubw m2, m1
+ paddw m0, [pw_16]
+ pmullw m2, m6
+ paddw m2, m0
+ psraw m2, 5
+ packuswb m2, m2
+ op_%1h m2, [r0], m4
+ add r1, r3
+ add r0, r2
+ dec r4d
+ jne .loop
+ REP_RET
+%endmacro
+
+INIT_XMM ssse3
+QPEL8_H_LOWPASS_OP_XMM put
+QPEL8_H_LOWPASS_OP_XMM avg
+
+
+%macro QPEL4_H_LOWPASS_L2_OP 1
+cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
+ movsxdifnidn r3, r3d
+ movsxdifnidn r4, r4d
+ pxor m7, m7
+ mova m4, [pw_5]
+ mova m5, [pw_16]
+ mov r5d, 4
+.loop:
+ movh m1, [r1-1]
+ movh m2, [r1+0]
+ movh m3, [r1+1]
+ movh m0, [r1+2]
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ punpcklbw m0, m7
+ paddw m1, m0
+ paddw m2, m3
+ movh m0, [r1-2]
+ movh m3, [r1+3]
+ punpcklbw m0, m7
+ punpcklbw m3, m7
+ paddw m0, m3
+ psllw m2, 2
+ psubw m2, m1
+ pmullw m2, m4
+ paddw m0, m5
+ paddw m0, m2
+ movh m3, [r2]
+ psraw m0, 5
+ packuswb m0, m0
+ pavgb m0, m3
+ op_%1h m0, [r0], m6
+ add r0, r3
+ add r1, r3
+ add r2, r4
+ dec r5d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL4_H_LOWPASS_L2_OP put
+QPEL4_H_LOWPASS_L2_OP avg
+
+
+%macro QPEL8_H_LOWPASS_L2_OP 1
+cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
+ movsxdifnidn r3, r3d
+ movsxdifnidn r4, r4d
+ mov r5d, 8
+ pxor m7, m7
+ mova m6, [pw_5]
+.loop:
+ mova m0, [r1]
+ mova m2, [r1+1]
+ mova m1, m0
+ mova m3, m2
+ punpcklbw m0, m7
+ punpckhbw m1, m7
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ paddw m0, m2
+ paddw m1, m3
+ psllw m0, 2
+ psllw m1, 2
+ mova m2, [r1-1]
+ mova m4, [r1+2]
+ mova m3, m2
+ mova m5, m4
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ punpcklbw m4, m7
+ punpckhbw m5, m7
+ paddw m2, m4
+ paddw m5, m3
+ psubw m0, m2
+ psubw m1, m5
+ pmullw m0, m6
+ pmullw m1, m6
+ movd m2, [r1-2]
+ movd m5, [r1+7]
+ punpcklbw m2, m7
+ punpcklbw m5, m7
+ paddw m2, m3
+ paddw m4, m5
+ mova m5, [pw_16]
+ paddw m2, m5
+ paddw m4, m5
+ paddw m0, m2
+ paddw m1, m4
+ psraw m0, 5
+ psraw m1, 5
+ mova m4, [r2]
+ packuswb m0, m1
+ pavgb m0, m4
+ op_%1 m0, [r0], m4
+ add r0, r3
+ add r1, r3
+ add r2, r4
+ dec r5d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL8_H_LOWPASS_L2_OP put
+QPEL8_H_LOWPASS_L2_OP avg
+
+
+%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
+cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
+ movsxdifnidn r3, r3d
+ movsxdifnidn r4, r4d
+ mov r5d, 8
+ pxor m7, m7
+ mova m6, [pw_5]
+.loop:
+ lddqu m1, [r1-2]
+ mova m0, m1
+ punpckhbw m1, m7
+ punpcklbw m0, m7
+ mova m2, m1
+ mova m3, m1
+ mova m4, m1
+ mova m5, m1
+ palignr m4, m0, 2
+ palignr m3, m0, 4
+ palignr m2, m0, 6
+ palignr m1, m0, 8
+ palignr m5, m0, 10
+ paddw m0, m5
+ paddw m2, m3
+ paddw m1, m4
+ psllw m2, 2
+ movh m3, [r2]
+ psubw m2, m1
+ paddw m0, [pw_16]
+ pmullw m2, m6
+ paddw m2, m0
+ psraw m2, 5
+ packuswb m2, m2
+ pavgb m2, m3
+ op_%1h m2, [r0], m4
+ add r1, r3
+ add r0, r3
+ add r2, r4
+ dec r5d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_XMM ssse3
+QPEL8_H_LOWPASS_L2_OP_XMM put
+QPEL8_H_LOWPASS_L2_OP_XMM avg
+
+
+; All functions that call this are required to have function arguments of
+; dst, src, dstStride, srcStride
+%macro FILT_V 1
+ mova m6, m2
+ movh m5, [r1]
+ paddw m6, m3
+ psllw m6, 2
+ psubw m6, m1
+ psubw m6, m4
+ punpcklbw m5, m7
+ pmullw m6, [pw_5]
+ paddw m0, [pw_16]
+ add r1, r3
+ paddw m0, m5
+ paddw m6, m0
+ psraw m6, 5
+ packuswb m6, m6
+ op_%1h m6, [r0], m0 ; 1
+ add r0, r2
+ SWAP 0, 1, 2, 3, 4, 5
+%endmacro
+
+%macro QPEL4_V_LOWPASS_OP 1
+cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
+ movsxdifnidn r2, r2d
+ movsxdifnidn r3, r3d
+ sub r1, r3
+ sub r1, r3
+ pxor m7, m7
+ movh m0, [r1]
+ movh m1, [r1+r3]
+ lea r1, [r1+2*r3]
+ movh m2, [r1]
+ movh m3, [r1+r3]
+ lea r1, [r1+2*r3]
+ movh m4, [r1]
+ add r1, r3
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ punpcklbw m4, m7
+ FILT_V %1
+ FILT_V %1
+ FILT_V %1
+ FILT_V %1
+ RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL4_V_LOWPASS_OP put
+QPEL4_V_LOWPASS_OP avg
+
+
+
+%macro QPEL8OR16_V_LOWPASS_OP 1
+%if cpuflag(sse2)
+cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
+ movsxdifnidn r2, r2d
+ movsxdifnidn r3, r3d
+ sub r1, r3
+ sub r1, r3
+%else
+cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
+ movsxdifnidn r2, r2d
+ movsxdifnidn r3, r3d
+%endif
+ pxor m7, m7
+ movh m0, [r1]
+ movh m1, [r1+r3]
+ lea r1, [r1+2*r3]
+ movh m2, [r1]
+ movh m3, [r1+r3]
+ lea r1, [r1+2*r3]
+ movh m4, [r1]
+ add r1, r3
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ punpcklbw m4, m7
+ FILT_V %1
+ FILT_V %1
+ FILT_V %1
+ FILT_V %1
+ FILT_V %1
+ FILT_V %1
+ FILT_V %1
+ FILT_V %1
+ cmp r4d, 16
+ jne .end
+ FILT_V %1
+ FILT_V %1
+ FILT_V %1
+ FILT_V %1
+ FILT_V %1
+ FILT_V %1
+ FILT_V %1
+ FILT_V %1
+.end:
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL8OR16_V_LOWPASS_OP put
+QPEL8OR16_V_LOWPASS_OP avg
+
+INIT_XMM sse2
+QPEL8OR16_V_LOWPASS_OP put
+QPEL8OR16_V_LOWPASS_OP avg
+
+
+; All functions that use this are required to have args:
+; src, tmp, srcStride
+%macro FILT_HV 1 ; offset
+ mova m6, m2
+ movh m5, [r0]
+ paddw m6, m3
+ psllw m6, 2
+ paddw m0, [pw_16]
+ psubw m6, m1
+ psubw m6, m4
+ punpcklbw m5, m7
+ pmullw m6, [pw_5]
+ paddw m0, m5
+ add r0, r2
+ paddw m6, m0
+ mova [r1+%1], m6
+ SWAP 0, 1, 2, 3, 4, 5
+%endmacro
+
+%macro QPEL4_HV1_LOWPASS_OP 1
+cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
+ movsxdifnidn r2, r2d
+ pxor m7, m7
+ movh m0, [r0]
+ movh m1, [r0+r2]
+ lea r0, [r0+2*r2]
+ movh m2, [r0]
+ movh m3, [r0+r2]
+ lea r0, [r0+2*r2]
+ movh m4, [r0]
+ add r0, r2
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ punpcklbw m4, m7
+ FILT_HV 0*24
+ FILT_HV 1*24
+ FILT_HV 2*24
+ FILT_HV 3*24
+ RET
+
+cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
+ movsxdifnidn r2, r2d
+ mov r3d, 4
+.loop:
+ mova m0, [r0]
+ paddw m0, [r0+10]
+ mova m1, [r0+2]
+ paddw m1, [r0+8]
+ mova m2, [r0+4]
+ paddw m2, [r0+6]
+ psubw m0, m1
+ psraw m0, 2
+ psubw m0, m1
+ paddsw m0, m2
+ psraw m0, 2
+ paddw m0, m2
+ psraw m0, 6
+ packuswb m0, m0
+ op_%1h m0, [r1], m7
+ add r0, 24
+ add r1, r2
+ dec r3d
+ jnz .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL4_HV1_LOWPASS_OP put
+QPEL4_HV1_LOWPASS_OP avg
+
+%macro QPEL8OR16_HV1_LOWPASS_OP 1
+cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
+ movsxdifnidn r2, r2d
+ pxor m7, m7
+ movh m0, [r0]
+ movh m1, [r0+r2]
+ lea r0, [r0+2*r2]
+ movh m2, [r0]
+ movh m3, [r0+r2]
+ lea r0, [r0+2*r2]
+ movh m4, [r0]
+ add r0, r2
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ punpcklbw m4, m7
+ FILT_HV 0*48
+ FILT_HV 1*48
+ FILT_HV 2*48
+ FILT_HV 3*48
+ FILT_HV 4*48
+ FILT_HV 5*48
+ FILT_HV 6*48
+ FILT_HV 7*48
+ cmp r3d, 16
+ jne .end
+ FILT_HV 8*48
+ FILT_HV 9*48
+ FILT_HV 10*48
+ FILT_HV 11*48
+ FILT_HV 12*48
+ FILT_HV 13*48
+ FILT_HV 14*48
+ FILT_HV 15*48
+.end:
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL8OR16_HV1_LOWPASS_OP put
+QPEL8OR16_HV1_LOWPASS_OP avg
+
+INIT_XMM sse2
+QPEL8OR16_HV1_LOWPASS_OP put
+
+
+
+%macro QPEL8OR16_HV2_LOWPASS_OP 1
+; unused is to match ssse3 and mmxext args
+cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
+ movsxdifnidn r2, r2d
+.loop:
+ mova m0, [r1]
+ mova m3, [r1+8]
+ mova m1, [r1+2]
+ mova m4, [r1+10]
+ paddw m0, m4
+ paddw m1, m3
+ paddw m3, [r1+18]
+ paddw m4, [r1+16]
+ mova m2, [r1+4]
+ mova m5, [r1+12]
+ paddw m2, [r1+6]
+ paddw m5, [r1+14]
+ psubw m0, m1
+ psubw m3, m4
+ psraw m0, 2
+ psraw m3, 2
+ psubw m0, m1
+ psubw m3, m4
+ paddsw m0, m2
+ paddsw m3, m5
+ psraw m0, 2
+ psraw m3, 2
+ paddw m0, m2
+ paddw m3, m5
+ psraw m0, 6
+ psraw m3, 6
+ packuswb m0, m3
+ op_%1 m0, [r0], m7
+ add r1, 48
+ add r0, r2
+ dec r4d
+ jne .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+QPEL8OR16_HV2_LOWPASS_OP put
+QPEL8OR16_HV2_LOWPASS_OP avg
+
+%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
+cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
+ movsxdifnidn r2, r2d
+ movsxdifnidn r3, r3d
+ cmp r4d, 16
+ je .op16
+.loop8:
+ mova m1, [r1+16]
+ mova m0, [r1]
+ mova m2, m1
+ mova m3, m1
+ mova m4, m1
+ mova m5, m1
+ palignr m5, m0, 10
+ palignr m4, m0, 8
+ palignr m3, m0, 6
+ palignr m2, m0, 4
+ palignr m1, m0, 2
+ paddw m0, m5
+ paddw m1, m4
+ paddw m2, m3
+ psubw m0, m1
+ psraw m0, 2
+ psubw m0, m1
+ paddw m0, m2
+ psraw m0, 2
+ paddw m0, m2
+ psraw m0, 6
+ packuswb m0, m0
+ op_%1h m0, [r0], m7
+ add r1, 48
+ add r0, r2
+ dec r4d
+ jne .loop8
+ jmp .done
+.op16:
+ mova m4, [r1+32]
+ mova m5, [r1+16]
+ mova m7, [r1]
+ mova m3, m4
+ mova m2, m4
+ mova m1, m4
+ mova m0, m4
+ palignr m0, m5, 10
+ palignr m1, m5, 8
+ palignr m2, m5, 6
+ palignr m3, m5, 4
+ palignr m4, m5, 2
+ paddw m0, m5
+ paddw m1, m4
+ paddw m2, m3
+ mova m6, m5
+ mova m4, m5
+ mova m3, m5
+ palignr m4, m7, 8
+ palignr m6, m7, 2
+ palignr m3, m7, 10
+ paddw m4, m6
+ mova m6, m5
+ palignr m5, m7, 6
+ palignr m6, m7, 4
+ paddw m3, m7
+ paddw m5, m6
+ psubw m0, m1
+ psubw m3, m4
+ psraw m0, 2
+ psraw m3, 2
+ psubw m0, m1
+ psubw m3, m4
+ paddw m0, m2
+ paddw m3, m5
+ psraw m0, 2
+ psraw m3, 2
+ paddw m0, m2
+ paddw m3, m5
+ psraw m0, 6
+ psraw m3, 6
+ packuswb m3, m0
+ op_%1 m3, [r0], m7
+ add r1, 48
+ add r0, r2
+ dec r4d
+ jne .op16
+.done:
+ REP_RET
+%endmacro
+
+INIT_XMM ssse3
+QPEL8OR16_HV2_LOWPASS_OP_XMM put
+QPEL8OR16_HV2_LOWPASS_OP_XMM avg
+
+
+%macro PIXELS4_L2_SHIFT5 1
+cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h
+ movsxdifnidn r3, r3d
+ movsxdifnidn r4, r4d
+ mova m0, [r1]
+ mova m1, [r1+24]
+ psraw m0, 5
+ psraw m1, 5
+ packuswb m0, m0
+ packuswb m1, m1
+ pavgb m0, [r2]
+ pavgb m1, [r2+r4]
+ op_%1h m0, [r0], m4
+ op_%1h m1, [r0+r3], m5
+ lea r2, [r2+r4*2]
+ lea r0, [r0+r3*2]
+ mova m0, [r1+48]
+ mova m1, [r1+72]
+ psraw m0, 5
+ psraw m1, 5
+ packuswb m0, m0
+ packuswb m1, m1
+ pavgb m0, [r2]
+ pavgb m1, [r2+r4]
+ op_%1h m0, [r0], m4
+ op_%1h m1, [r0+r3], m5
+ RET
+%endmacro
+
+INIT_MMX mmxext
+PIXELS4_L2_SHIFT5 put
+PIXELS4_L2_SHIFT5 avg
+
+
+%macro PIXELS8_L2_SHIFT5 1
+cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
+ movsxdifnidn r3, r3d
+ movsxdifnidn r4, r4d
+.loop:
+ mova m0, [r1]
+ mova m1, [r1+8]
+ mova m2, [r1+48]
+ mova m3, [r1+48+8]
+ psraw m0, 5
+ psraw m1, 5
+ psraw m2, 5
+ psraw m3, 5
+ packuswb m0, m1
+ packuswb m2, m3
+ pavgb m0, [r2]
+ pavgb m2, [r2+r4]
+ op_%1 m0, [r0], m4
+ op_%1 m2, [r0+r3], m5
+ lea r2, [r2+2*r4]
+ add r1, 48*2
+ lea r0, [r0+2*r3]
+ sub r5d, 2
+ jne .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PIXELS8_L2_SHIFT5 put
+PIXELS8_L2_SHIFT5 avg
+
+
+%if ARCH_X86_64
+%macro QPEL16_H_LOWPASS_L2_OP 1
+cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
+ movsxdifnidn r3, r3d
+ movsxdifnidn r4, r4d
+ mov r5d, 16
+ pxor m15, m15
+ mova m14, [pw_5]
+ mova m13, [pw_16]
+.loop:
+ lddqu m1, [r1+6]
+ lddqu m7, [r1-2]
+ mova m0, m1
+ punpckhbw m1, m15
+ punpcklbw m0, m15
+ punpcklbw m7, m15
+ mova m2, m1
+ mova m6, m0
+ mova m3, m1
+ mova m8, m0
+ mova m4, m1
+ mova m9, m0
+ mova m12, m0
+ mova m11, m1
+ palignr m11, m0, 10
+ palignr m12, m7, 10
+ palignr m4, m0, 2
+ palignr m9, m7, 2
+ palignr m3, m0, 4
+ palignr m8, m7, 4
+ palignr m2, m0, 6
+ palignr m6, m7, 6
+ paddw m11, m0
+ palignr m1, m0, 8
+ palignr m0, m7, 8
+ paddw m7, m12
+ paddw m2, m3
+ paddw m6, m8
+ paddw m1, m4
+ paddw m0, m9
+ psllw m2, 2
+ psllw m6, 2
+ psubw m2, m1
+ psubw m6, m0
+ paddw m11, m13
+ paddw m7, m13
+ pmullw m2, m14
+ pmullw m6, m14
+ lddqu m3, [r2]
+ paddw m2, m11
+ paddw m6, m7
+ psraw m2, 5
+ psraw m6, 5
+ packuswb m6, m2
+ pavgb m6, m3
+ op_%1 m6, [r0], m11
+ add r1, r3
+ add r0, r3
+ add r2, r4
+ dec r5d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_XMM ssse3
+QPEL16_H_LOWPASS_L2_OP put
+QPEL16_H_LOWPASS_L2_OP avg
+%endif
diff --git a/ffmpeg/libavcodec/x86/h264_weight.asm b/ffmpeg/libavcodec/x86/h264_weight.asm
new file mode 100644
index 0000000..4759a06
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/h264_weight.asm
@@ -0,0 +1,317 @@
+;*****************************************************************************
+;* SSE2-optimized weighted prediction code
+;*****************************************************************************
+;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
+;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; biweight pred:
+;
+; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
+; int height, int log2_denom, int weightd,
+; int weights, int offset);
+; and
+; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
+; int log2_denom, int weight, int offset);
+;-----------------------------------------------------------------------------
+
+%macro WEIGHT_SETUP 0
+ add r5, r5
+ inc r5
+ movd m3, r4d
+ movd m5, r5d
+ movd m6, r3d
+ pslld m5, m6
+ psrld m5, 1
+%if mmsize == 16
+ pshuflw m3, m3, 0
+ pshuflw m5, m5, 0
+ punpcklqdq m3, m3
+ punpcklqdq m5, m5
+%else
+ pshufw m3, m3, 0
+ pshufw m5, m5, 0
+%endif
+ pxor m7, m7
+%endmacro
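+
+; After WEIGHT_SETUP: m3 = weight, m6 = log2_denom and
+; m5 = ((2*offset+1) << log2_denom) >> 1, which folds the +offset term and
+; the rounding bias into the single addend used by WEIGHT_OP's
+; pmullw / paddsw / psraw sequence.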
+
+%macro WEIGHT_OP 2
+ movh m0, [r0+%1]
+ movh m1, [r0+%2]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ pmullw m0, m3
+ pmullw m1, m3
+ paddsw m0, m5
+ paddsw m1, m5
+ psraw m0, m6
+ psraw m1, m6
+ packuswb m0, m1
+%endmacro
+
+INIT_MMX mmxext
+cglobal h264_weight_16, 6, 6, 0
+ WEIGHT_SETUP
+.nextrow:
+ WEIGHT_OP 0, 4
+ mova [r0 ], m0
+ WEIGHT_OP 8, 12
+ mova [r0+8], m0
+ add r0, r1
+ dec r2d
+ jnz .nextrow
+ REP_RET
+
+%macro WEIGHT_FUNC_MM 2
+cglobal h264_weight_%1, 6, 6, %2
+ WEIGHT_SETUP
+.nextrow:
+ WEIGHT_OP 0, mmsize/2
+ mova [r0], m0
+ add r0, r1
+ dec r2d
+ jnz .nextrow
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+WEIGHT_FUNC_MM 8, 0
+INIT_XMM sse2
+WEIGHT_FUNC_MM 16, 8
+
+%macro WEIGHT_FUNC_HALF_MM 2
+cglobal h264_weight_%1, 6, 6, %2
+ WEIGHT_SETUP
+ sar r2d, 1
+ lea r3, [r1*2]
+.nextrow:
+ WEIGHT_OP 0, r1
+ movh [r0], m0
+%if mmsize == 16
+ movhps [r0+r1], m0
+%else
+ psrlq m0, 32
+ movh [r0+r1], m0
+%endif
+ add r0, r3
+ dec r2d
+ jnz .nextrow
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+WEIGHT_FUNC_HALF_MM 4, 0
+INIT_XMM sse2
+WEIGHT_FUNC_HALF_MM 8, 8
+
+%macro BIWEIGHT_SETUP 0
+%if ARCH_X86_64
+%define off_regd r7d
+%else
+%define off_regd r3d
+%endif
+ mov off_regd, r7m
+ add off_regd, 1
+ or off_regd, 1
+ add r4, 1
+ cmp r5, 128
+ jne .normal
+ sar r5, 1
+ sar r6, 1
+ sar off_regd, 1
+ sub r4, 1
+.normal
+%if cpuflag(ssse3)
+ movd m4, r5d
+ movd m0, r6d
+%else
+ movd m3, r5d
+ movd m4, r6d
+%endif
+ movd m5, off_regd
+ movd m6, r4d
+ pslld m5, m6
+ psrld m5, 1
+%if cpuflag(ssse3)
+ punpcklbw m4, m0
+ pshuflw m4, m4, 0
+ pshuflw m5, m5, 0
+ punpcklqdq m4, m4
+ punpcklqdq m5, m5
+
+%else
+%if mmsize == 16
+ pshuflw m3, m3, 0
+ pshuflw m4, m4, 0
+ pshuflw m5, m5, 0
+ punpcklqdq m3, m3
+ punpcklqdq m4, m4
+ punpcklqdq m5, m5
+%else
+ pshufw m3, m3, 0
+ pshufw m4, m4, 0
+ pshufw m5, m5, 0
+%endif
+ pxor m7, m7
+%endif
+%endmacro
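+
+; When weightd is 128 the setup halves both weights, the offset term and the
+; shift, presumably so the weights fit the signed range used by the
+; multiplies (notably pmaddubsw in the SSSE3 path below).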
+
+%macro BIWEIGHT_STEPA 3
+ movh m%1, [r0+%3]
+ movh m%2, [r1+%3]
+ punpcklbw m%1, m7
+ punpcklbw m%2, m7
+ pmullw m%1, m3
+ pmullw m%2, m4
+ paddsw m%1, m%2
+%endmacro
+
+%macro BIWEIGHT_STEPB 0
+ paddsw m0, m5
+ paddsw m1, m5
+ psraw m0, m6
+ psraw m1, m6
+ packuswb m0, m1
+%endmacro
+
+INIT_MMX mmxext
+cglobal h264_biweight_16, 7, 8, 0
+ BIWEIGHT_SETUP
+ movifnidn r3d, r3m
+.nextrow:
+ BIWEIGHT_STEPA 0, 1, 0
+ BIWEIGHT_STEPA 1, 2, 4
+ BIWEIGHT_STEPB
+ mova [r0], m0
+ BIWEIGHT_STEPA 0, 1, 8
+ BIWEIGHT_STEPA 1, 2, 12
+ BIWEIGHT_STEPB
+ mova [r0+8], m0
+ add r0, r2
+ add r1, r2
+ dec r3d
+ jnz .nextrow
+ REP_RET
+
+%macro BIWEIGHT_FUNC_MM 2
+cglobal h264_biweight_%1, 7, 8, %2
+ BIWEIGHT_SETUP
+ movifnidn r3d, r3m
+.nextrow:
+ BIWEIGHT_STEPA 0, 1, 0
+ BIWEIGHT_STEPA 1, 2, mmsize/2
+ BIWEIGHT_STEPB
+ mova [r0], m0
+ add r0, r2
+ add r1, r2
+ dec r3d
+ jnz .nextrow
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+BIWEIGHT_FUNC_MM 8, 0
+INIT_XMM sse2
+BIWEIGHT_FUNC_MM 16, 8
+
+%macro BIWEIGHT_FUNC_HALF_MM 2
+cglobal h264_biweight_%1, 7, 8, %2
+ BIWEIGHT_SETUP
+ movifnidn r3d, r3m
+ sar r3, 1
+ lea r4, [r2*2]
+.nextrow:
+ BIWEIGHT_STEPA 0, 1, 0
+ BIWEIGHT_STEPA 1, 2, r2
+ BIWEIGHT_STEPB
+ movh [r0], m0
+%if mmsize == 16
+ movhps [r0+r2], m0
+%else
+ psrlq m0, 32
+ movh [r0+r2], m0
+%endif
+ add r0, r4
+ add r1, r4
+ dec r3d
+ jnz .nextrow
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+BIWEIGHT_FUNC_HALF_MM 4, 0
+INIT_XMM sse2
+BIWEIGHT_FUNC_HALF_MM 8, 8
+
+%macro BIWEIGHT_SSSE3_OP 0
+ pmaddubsw m0, m4
+ pmaddubsw m2, m4
+ paddsw m0, m5
+ paddsw m2, m5
+ psraw m0, m6
+ psraw m2, m6
+ packuswb m0, m2
+%endmacro
+
+INIT_XMM ssse3
+cglobal h264_biweight_16, 7, 8, 8
+ BIWEIGHT_SETUP
+ movifnidn r3d, r3m
+
+.nextrow:
+ movh m0, [r0]
+ movh m2, [r0+8]
+ movh m3, [r1+8]
+ punpcklbw m0, [r1]
+ punpcklbw m2, m3
+ BIWEIGHT_SSSE3_OP
+ mova [r0], m0
+ add r0, r2
+ add r1, r2
+ dec r3d
+ jnz .nextrow
+ REP_RET
+
+INIT_XMM ssse3
+cglobal h264_biweight_8, 7, 8, 8
+ BIWEIGHT_SETUP
+ movifnidn r3d, r3m
+ sar r3, 1
+ lea r4, [r2*2]
+
+.nextrow:
+ movh m0, [r0]
+ movh m1, [r1]
+ movh m2, [r0+r2]
+ movh m3, [r1+r2]
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ BIWEIGHT_SSSE3_OP
+ movh [r0], m0
+ movhps [r0+r2], m0
+ add r0, r4
+ add r1, r4
+ dec r3d
+ jnz .nextrow
+ REP_RET
diff --git a/ffmpeg/libavcodec/x86/h264_weight_10bit.asm b/ffmpeg/libavcodec/x86/h264_weight_10bit.asm
new file mode 100644
index 0000000..3b09e42
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/h264_weight_10bit.asm
@@ -0,0 +1,282 @@
+;*****************************************************************************
+;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
+;*****************************************************************************
+;* Copyright (C) 2005-2011 x264 project
+;*
+;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pw_pixel_max: times 8 dw ((1 << 10)-1)
+sq_1: dq 1
+ dq 0
+
+cextern pw_1
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
+; int weight, int offset);
+;-----------------------------------------------------------------------------
+%macro WEIGHT_PROLOGUE 0
+.prologue:
+ PROLOGUE 0,6,8
+ movifnidn r0, r0mp
+ movifnidn r1d, r1m
+ movifnidn r2d, r2m
+ movifnidn r4d, r4m
+ movifnidn r5d, r5m
+%endmacro
+
+%macro WEIGHT_SETUP 0
+ mova m0, [pw_1]
+ movd m2, r3m
+ pslld m0, m2 ; 1<<log2_denom
+ SPLATW m0, m0
+ shl r5, 19 ; *8, move to upper half of dword
+ lea r5, [r5+r4*2+0x10000]
+ movd m3, r5d ; weight<<1 | 1+(offset<<(3))
+ pshufd m3, m3, 0
+ mova m4, [pw_pixel_max]
+ paddw m2, [sq_1] ; log2_denom+1
+%if notcpuflag(sse4)
+ pxor m7, m7
+%endif
+%endmacro
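+
+; After WEIGHT_SETUP: m0 holds 1<<log2_denom per word, m3 holds weight*2 in
+; the low word and offset*8+1 in the high word of each dword, and m2 is
+; log2_denom+1. WEIGHT_OP interleaves each pixel with m0, so pmaddwd against
+; m3 gives pix*weight*2 + (offset*8+1)<<log2_denom, and the psrad by
+; log2_denom+1 yields the rounded weighted sample plus offset*4 (the 10-bit
+; offset scaling).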
+
+%macro WEIGHT_OP 1-2
+%if %0==1
+ mova m5, [r0+%1]
+ punpckhwd m6, m5, m0
+ punpcklwd m5, m0
+%else
+ movq m5, [r0+%1]
+ movq m6, [r0+%2]
+ punpcklwd m5, m0
+ punpcklwd m6, m0
+%endif
+ pmaddwd m5, m3
+ pmaddwd m6, m3
+ psrad m5, m2
+ psrad m6, m2
+%if cpuflag(sse4)
+ packusdw m5, m6
+ pminsw m5, m4
+%else
+ packssdw m5, m6
+ CLIPW m5, m7, m4
+%endif
+%endmacro
+
+%macro WEIGHT_FUNC_DBL 0
+cglobal h264_weight_16_10
+ WEIGHT_PROLOGUE
+ WEIGHT_SETUP
+.nextrow:
+ WEIGHT_OP 0
+ mova [r0 ], m5
+ WEIGHT_OP 16
+ mova [r0+16], m5
+ add r0, r1
+ dec r2d
+ jnz .nextrow
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+WEIGHT_FUNC_DBL
+INIT_XMM sse4
+WEIGHT_FUNC_DBL
+
+
+%macro WEIGHT_FUNC_MM 0
+cglobal h264_weight_8_10
+ WEIGHT_PROLOGUE
+ WEIGHT_SETUP
+.nextrow:
+ WEIGHT_OP 0
+ mova [r0], m5
+ add r0, r1
+ dec r2d
+ jnz .nextrow
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+WEIGHT_FUNC_MM
+INIT_XMM sse4
+WEIGHT_FUNC_MM
+
+
+%macro WEIGHT_FUNC_HALF_MM 0
+cglobal h264_weight_4_10
+ WEIGHT_PROLOGUE
+ sar r2d, 1
+ WEIGHT_SETUP
+ lea r3, [r1*2]
+.nextrow:
+ WEIGHT_OP 0, r1
+ movh [r0], m5
+ movhps [r0+r1], m5
+ add r0, r3
+ dec r2d
+ jnz .nextrow
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+WEIGHT_FUNC_HALF_MM
+INIT_XMM sse4
+WEIGHT_FUNC_HALF_MM
+
+
+;-----------------------------------------------------------------------------
+; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
+; int log2_denom, int weightd, int weights, int offset);
+;-----------------------------------------------------------------------------
+%if ARCH_X86_32
+DECLARE_REG_TMP 3
+%else
+DECLARE_REG_TMP 7
+%endif
+
+%macro BIWEIGHT_PROLOGUE 0
+.prologue:
+ PROLOGUE 0,8,8
+ movifnidn r0, r0mp
+ movifnidn r1, r1mp
+ movifnidn r2d, r2m
+ movifnidn r5d, r5m
+ movifnidn r6d, r6m
+ movifnidn t0d, r7m
+%endmacro
+
+%macro BIWEIGHT_SETUP 0
+ lea t0, [t0*4+1] ; (offset<<2)+1
+ or t0, 1
+ shl r6, 16
+ or r5, r6
+ movd m4, r5d ; weightd | weights
+ movd m5, t0d ; ((offset<<2)+1)|1
+ movd m6, r4m ; log2_denom
+ pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom
+ paddd m6, [sq_1]
+ pshufd m4, m4, 0
+ pshufd m5, m5, 0
+ mova m3, [pw_pixel_max]
+ movifnidn r3d, r3m
+%if notcpuflag(sse4)
+ pxor m7, m7
+%endif
+%endmacro
+
+%macro BIWEIGHT 1-2
+%if %0==1
+ mova m0, [r0+%1]
+ mova m1, [r1+%1]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+%else
+ movq m0, [r0+%1]
+ movq m1, [r1+%1]
+ punpcklwd m0, m1
+ movq m2, [r0+%2]
+ movq m1, [r1+%2]
+ punpcklwd m2, m1
+%endif
+ pmaddwd m0, m4
+ pmaddwd m2, m4
+ paddd m0, m5
+ paddd m2, m5
+ psrad m0, m6
+ psrad m2, m6
+%if cpuflag(sse4)
+ packusdw m0, m2
+ pminsw m0, m3
+%else
+ packssdw m0, m2
+ CLIPW m0, m7, m3
+%endif
+%endmacro
+
+%macro BIWEIGHT_FUNC_DBL 0
+cglobal h264_biweight_16_10
+ BIWEIGHT_PROLOGUE
+ BIWEIGHT_SETUP
+.nextrow:
+ BIWEIGHT 0
+ mova [r0 ], m0
+ BIWEIGHT 16
+ mova [r0+16], m0
+ add r0, r2
+ add r1, r2
+ dec r3d
+ jnz .nextrow
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+BIWEIGHT_FUNC_DBL
+INIT_XMM sse4
+BIWEIGHT_FUNC_DBL
+
+%macro BIWEIGHT_FUNC 0
+cglobal h264_biweight_8_10
+ BIWEIGHT_PROLOGUE
+ BIWEIGHT_SETUP
+.nextrow:
+ BIWEIGHT 0
+ mova [r0], m0
+ add r0, r2
+ add r1, r2
+ dec r3d
+ jnz .nextrow
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+BIWEIGHT_FUNC
+INIT_XMM sse4
+BIWEIGHT_FUNC
+
+%macro BIWEIGHT_FUNC_HALF 0
+cglobal h264_biweight_4_10
+ BIWEIGHT_PROLOGUE
+ BIWEIGHT_SETUP
+ sar r3d, 1
+ lea r4, [r2*2]
+.nextrow:
+ BIWEIGHT 0, r2
+ movh [r0 ], m0
+ movhps [r0+r2], m0
+ add r0, r4
+ add r1, r4
+ dec r3d
+ jnz .nextrow
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+BIWEIGHT_FUNC_HALF
+INIT_XMM sse4
+BIWEIGHT_FUNC_HALF
diff --git a/ffmpeg/libavcodec/x86/h264chroma_init.c b/ffmpeg/libavcodec/x86/h264chroma_init.c
new file mode 100644
index 0000000..b5c078f
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/h264chroma_init.c
@@ -0,0 +1,118 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/h264chroma.h"
+
+void ff_put_h264_chroma_mc8_rnd_mmx (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+
+void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc4_mmxext (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+
+void ff_put_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+
+void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+
+void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+
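+/*
+ * For reference, the put/avg chroma MC functions declared in this file follow
+ * the usual H.264 bilinear chroma interpolation; a scalar sketch (W is the
+ * block width of the mc2/mc4/mc8 variant, x/y are the 0..7 fractional offsets):
+ *
+ *     for (i = 0; i < h; i++, dst += stride, src += stride)
+ *         for (j = 0; j < W; j++)
+ *             dst[j] = ((8 - x) * (8 - y) * src[j]          + x * (8 - y) * src[j + 1] +
+ *                       (8 - x) * y       * src[j + stride] + x * y       * src[j + stride + 1] +
+ *                       32) >> 6;
+ *
+ * The "avg" variants average that result into dst instead of storing it.
+ */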
+#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
+void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
+ (uint8_t *dst, uint8_t *src, \
+ int stride, int h, int x, int y);
+
+CHROMA_MC(put, 2, 10, mmxext)
+CHROMA_MC(avg, 2, 10, mmxext)
+CHROMA_MC(put, 4, 10, mmxext)
+CHROMA_MC(avg, 4, 10, mmxext)
+CHROMA_MC(put, 8, 10, sse2)
+CHROMA_MC(avg, 8, 10, sse2)
+CHROMA_MC(put, 8, 10, avx)
+CHROMA_MC(avg, 8, 10, avx)
+
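+/*
+ * Hypothetical call-site sketch: after ff_h264chroma_init_x86(c, bit_depth)
+ * has filled the tables below, a decoder would invoke, for example,
+ * c->put_h264_chroma_pixels_tab[0](dst, src, stride, h, mx, my) for an
+ * 8-pixel-wide block; index 1 selects the 4-wide and index 2 the 2-wide
+ * versions, matching the mc8/mc4/mc2 assignments in the function below.
+ */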
+void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth)
+{
+#if HAVE_YASM
+ int high_bit_depth = bit_depth > 8;
+ int mm_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(mm_flags) && !high_bit_depth) {
+ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
+ c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
+ }
+
+ if (EXTERNAL_AMD3DNOW(mm_flags) && !high_bit_depth) {
+ c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
+ c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
+ }
+
+ if (EXTERNAL_MMXEXT(mm_flags) && !high_bit_depth) {
+ c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
+ c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
+ c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
+ c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
+ }
+
+ if (EXTERNAL_MMXEXT(mm_flags) && bit_depth > 8 && bit_depth <= 10) {
+ c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
+ c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
+ c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
+ c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
+ }
+
+ if (EXTERNAL_SSE2(mm_flags) && bit_depth > 8 && bit_depth <= 10) {
+ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
+ c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
+ }
+
+ if (EXTERNAL_SSSE3(mm_flags) && !high_bit_depth) {
+ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
+ c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
+ c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
+ c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
+ }
+
+ if (EXTERNAL_AVX(mm_flags) && bit_depth > 8 && bit_depth <= 10) {
+ // AVX implies !cache64.
+ // TODO: Port cache(32|64) detection from x264.
+ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
+ c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
+ }
+#endif
+}
diff --git a/ffmpeg/libavcodec/x86/h264dsp_init.c b/ffmpeg/libavcodec/x86/h264dsp_init.c
new file mode 100644
index 0000000..11aae77
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/h264dsp_init.c
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/h264dsp.h"
+#include "dsputil_mmx.h"
+
+/***********************************/
+/* IDCT */
+#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \
+void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \
+ int16_t *block, \
+ int stride);
+
+IDCT_ADD_FUNC(, 8, mmx)
+IDCT_ADD_FUNC(, 10, sse2)
+IDCT_ADD_FUNC(_dc, 8, mmxext)
+IDCT_ADD_FUNC(_dc, 10, mmxext)
+IDCT_ADD_FUNC(8_dc, 8, mmxext)
+IDCT_ADD_FUNC(8_dc, 10, sse2)
+IDCT_ADD_FUNC(8, 8, mmx)
+IDCT_ADD_FUNC(8, 8, sse2)
+IDCT_ADD_FUNC(8, 10, sse2)
+IDCT_ADD_FUNC(, 10, avx)
+IDCT_ADD_FUNC(8_dc, 10, avx)
+IDCT_ADD_FUNC(8, 10, avx)
+
+
+#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \
+void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
+ (uint8_t *dst, const int *block_offset, \
+ int16_t *block, int stride, const uint8_t nnzc[6 * 8]);
+
+IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
+IDCT_ADD_REP_FUNC(8, 4, 8, mmxext)
+IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
+IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
+IDCT_ADD_REP_FUNC(8, 4, 10, avx)
+IDCT_ADD_REP_FUNC(, 16, 8, mmx)
+IDCT_ADD_REP_FUNC(, 16, 8, mmxext)
+IDCT_ADD_REP_FUNC(, 16, 8, sse2)
+IDCT_ADD_REP_FUNC(, 16, 10, sse2)
+IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
+IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
+IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
+IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
+IDCT_ADD_REP_FUNC(, 16, 10, avx)
+IDCT_ADD_REP_FUNC(, 16intra, 10, avx)
+
+
+#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \
+void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
+ (uint8_t **dst, const int *block_offset, \
+ int16_t *block, int stride, const uint8_t nnzc[6 * 8]);
+
+IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
+IDCT_ADD_REP_FUNC2(, 8, 8, mmxext)
+IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
+IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
+IDCT_ADD_REP_FUNC2(, 8, 10, avx)
+
+void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul);
+void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul);
+
+/***********************************/
+/* deblocking */
+
+void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4], uint8_t nnz[40],
+ int8_t ref[2][40],
+ int16_t mv[2][40][2],
+ int bidir, int edges, int step,
+ int mask_mv0, int mask_mv1, int field);
+
+#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
+ int stride, \
+ int alpha, \
+ int beta, \
+ int8_t *tc0);
+#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
+ int stride, \
+ int alpha, \
+ int beta);
+
+#define LF_FUNCS(type, depth) \
+LF_FUNC(h, chroma, depth, mmxext) \
+LF_IFUNC(h, chroma_intra, depth, mmxext) \
+LF_FUNC(v, chroma, depth, mmxext) \
+LF_IFUNC(v, chroma_intra, depth, mmxext) \
+LF_FUNC(h, luma, depth, mmxext) \
+LF_IFUNC(h, luma_intra, depth, mmxext) \
+LF_FUNC(h, luma, depth, sse2) \
+LF_IFUNC(h, luma_intra, depth, sse2) \
+LF_FUNC(v, luma, depth, sse2) \
+LF_IFUNC(v, luma_intra, depth, sse2) \
+LF_FUNC(h, chroma, depth, sse2) \
+LF_IFUNC(h, chroma_intra, depth, sse2) \
+LF_FUNC(v, chroma, depth, sse2) \
+LF_IFUNC(v, chroma_intra, depth, sse2) \
+LF_FUNC(h, luma, depth, avx) \
+LF_IFUNC(h, luma_intra, depth, avx) \
+LF_FUNC(v, luma, depth, avx) \
+LF_IFUNC(v, luma_intra, depth, avx) \
+LF_FUNC(h, chroma, depth, avx) \
+LF_IFUNC(h, chroma_intra, depth, avx) \
+LF_FUNC(v, chroma, depth, avx) \
+LF_IFUNC(v, chroma_intra, depth, avx)
+
+LF_FUNCS(uint8_t, 8)
+LF_FUNCS(uint16_t, 10)
+
+#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
+LF_FUNC(v8, luma, 8, mmxext)
+static void ff_deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha,
+ int beta, int8_t *tc0)
+{
+ if ((tc0[0] & tc0[1]) >= 0)
+ ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0);
+ if ((tc0[2] & tc0[3]) >= 0)
+ ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2);
+}
+LF_IFUNC(v8, luma_intra, 8, mmxext)
+static void ff_deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride,
+ int alpha, int beta)
+{
+ ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta);
+ ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta);
+}
+#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
+
+LF_FUNC(v, luma, 10, mmxext)
+LF_IFUNC(v, luma_intra, 10, mmxext)
+
+/***********************************/
+/* weighted prediction */
+
+#define H264_WEIGHT(W, OPT) \
+void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, int stride, \
+ int height, int log2_denom, \
+ int weight, int offset);
+
+#define H264_BIWEIGHT(W, OPT) \
+void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \
+ int stride, int height, \
+ int log2_denom, int weightd, \
+ int weights, int offset);
+
+#define H264_BIWEIGHT_MMX(W) \
+ H264_WEIGHT(W, mmxext) \
+ H264_BIWEIGHT(W, mmxext)
+
+#define H264_BIWEIGHT_MMX_SSE(W) \
+ H264_BIWEIGHT_MMX(W) \
+ H264_WEIGHT(W, sse2) \
+ H264_BIWEIGHT(W, sse2) \
+ H264_BIWEIGHT(W, ssse3)
+
+H264_BIWEIGHT_MMX_SSE(16)
+H264_BIWEIGHT_MMX_SSE(8)
+H264_BIWEIGHT_MMX(4)
+
+#define H264_WEIGHT_10(W, DEPTH, OPT) \
+void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
+ int stride, \
+ int height, \
+ int log2_denom, \
+ int weight, \
+ int offset);
+
+#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
+void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
+ uint8_t *src, \
+ int stride, \
+ int height, \
+ int log2_denom, \
+ int weightd, \
+ int weights, \
+ int offset);
+
+#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
+ H264_WEIGHT_10(W, DEPTH, sse2) \
+ H264_WEIGHT_10(W, DEPTH, sse4) \
+ H264_BIWEIGHT_10(W, DEPTH, sse2) \
+ H264_BIWEIGHT_10(W, DEPTH, sse4)
+
+H264_BIWEIGHT_10_SSE(16, 10)
+H264_BIWEIGHT_10_SSE(8, 10)
+H264_BIWEIGHT_10_SSE(4, 10)
+
+av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
+ const int chroma_format_idc)
+{
+#if HAVE_YASM
+ int mm_flags = av_get_cpu_flags();
+
+ if (chroma_format_idc == 1 && EXTERNAL_MMXEXT(mm_flags))
+ c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmxext;
+
+ if (bit_depth == 8) {
+ if (EXTERNAL_MMX(mm_flags)) {
+ c->h264_idct_dc_add =
+ c->h264_idct_add = ff_h264_idct_add_8_mmx;
+ c->h264_idct8_dc_add =
+ c->h264_idct8_add = ff_h264_idct8_add_8_mmx;
+
+ c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
+ if (chroma_format_idc == 1)
+ c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
+ if (mm_flags & AV_CPU_FLAG_CMOV)
+ c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
+
+ if (EXTERNAL_MMXEXT(mm_flags)) {
+ c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmxext;
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
+ c->h264_idct_add16 = ff_h264_idct_add16_8_mmxext;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmxext;
+ if (chroma_format_idc == 1)
+ c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext;
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext;
+
+ c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_mmxext;
+ c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext;
+ if (chroma_format_idc == 1) {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmxext;
+ c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
+ }
+#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
+ c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_mmxext;
+ c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmxext;
+ c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
+ c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
+#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
+ c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext;
+ c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext;
+ c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;
+
+ c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
+ c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
+ c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;
+
+ if (EXTERNAL_SSE2(mm_flags)) {
+ c->h264_idct8_add = ff_h264_idct8_add_8_sse2;
+
+ c->h264_idct_add16 = ff_h264_idct_add16_8_sse2;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2;
+ if (chroma_format_idc == 1)
+ c->h264_idct_add8 = ff_h264_idct_add8_8_sse2;
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2;
+ c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2;
+
+ c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2;
+ c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2;
+
+ c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
+ c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;
+
+ c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
+ c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
+ c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
+ c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
+ }
+ if (EXTERNAL_SSSE3(mm_flags)) {
+ c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
+ c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
+ }
+ if (EXTERNAL_AVX(mm_flags)) {
+ c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
+ c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
+ c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
+ c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
+ }
+ }
+ }
+ } else if (bit_depth == 10) {
+ if (EXTERNAL_MMX(mm_flags)) {
+ if (EXTERNAL_MMXEXT(mm_flags)) {
+#if ARCH_X86_32
+ c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext;
+ c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
+ c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext;
+ c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext;
+ c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
+ c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
+#endif /* ARCH_X86_32 */
+ c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext;
+ if (EXTERNAL_SSE2(mm_flags)) {
+ c->h264_idct_add = ff_h264_idct_add_10_sse2;
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;
+
+ c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
+ if (chroma_format_idc == 1)
+ c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
+#if HAVE_ALIGNED_STACK
+ c->h264_idct8_add = ff_h264_idct8_add_10_sse2;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
+#endif /* HAVE_ALIGNED_STACK */
+
+ c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
+ c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
+ c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
+
+ c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
+ c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
+ c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
+
+ c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_sse2;
+ c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
+#if HAVE_ALIGNED_STACK
+ c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
+ c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
+ c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
+ c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
+#endif /* HAVE_ALIGNED_STACK */
+ }
+ if (EXTERNAL_SSE4(mm_flags)) {
+ c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
+ c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
+ c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
+
+ c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
+ c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
+ c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
+ }
+ if (EXTERNAL_AVX(mm_flags)) {
+ c->h264_idct_dc_add =
+ c->h264_idct_add = ff_h264_idct_add_10_avx;
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;
+
+ c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
+ if (chroma_format_idc == 1)
+ c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
+#if HAVE_ALIGNED_STACK
+ c->h264_idct8_add = ff_h264_idct8_add_10_avx;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx;
+#endif /* HAVE_ALIGNED_STACK */
+
+ c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_avx;
+ c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
+#if HAVE_ALIGNED_STACK
+ c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
+ c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
+ c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
+ c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
+#endif /* HAVE_ALIGNED_STACK */
+ }
+ }
+ }
+ }
+#endif
+}
diff --git a/ffmpeg/libavcodec/x86/hpeldsp.asm b/ffmpeg/libavcodec/x86/hpeldsp.asm
new file mode 100644
index 0000000..1a572a3
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/hpeldsp.asm
@@ -0,0 +1,461 @@
+;******************************************************************************
+;*
+;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
+;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
+;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
+;* Copyright (c) 2013 Daniel Kang
+;*
+;* MMX optimized hpel functions
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+cextern pb_1
+
+SECTION_TEXT
+
+; put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro PUT_PIXELS8_X2 0
+cglobal put_pixels8_x2, 4,5
+ lea r4, [r2*2]
+.loop:
+ mova m0, [r1]
+ mova m1, [r1+r2]
+ PAVGB m0, [r1+1]
+ PAVGB m1, [r1+r2+1]
+ mova [r0], m0
+ mova [r0+r2], m1
+ add r1, r4
+ add r0, r4
+ mova m0, [r1]
+ mova m1, [r1+r2]
+ PAVGB m0, [r1+1]
+ PAVGB m1, [r1+r2+1]
+ add r1, r4
+ mova [r0], m0
+ mova [r0+r2], m1
+ add r0, r4
+ sub r3d, 4
+ jne .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_PIXELS8_X2
+INIT_MMX 3dnow
+PUT_PIXELS8_X2
+
+
+; put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro PUT_PIXELS_16 0
+cglobal put_pixels16_x2, 4,5
+ lea r4, [r2*2]
+.loop:
+ mova m0, [r1]
+ mova m1, [r1+r2]
+ mova m2, [r1+8]
+ mova m3, [r1+r2+8]
+ PAVGB m0, [r1+1]
+ PAVGB m1, [r1+r2+1]
+ PAVGB m2, [r1+9]
+ PAVGB m3, [r1+r2+9]
+ mova [r0], m0
+ mova [r0+r2], m1
+ mova [r0+8], m2
+ mova [r0+r2+8], m3
+ add r1, r4
+ add r0, r4
+ mova m0, [r1]
+ mova m1, [r1+r2]
+ mova m2, [r1+8]
+ mova m3, [r1+r2+8]
+ PAVGB m0, [r1+1]
+ PAVGB m1, [r1+r2+1]
+ PAVGB m2, [r1+9]
+ PAVGB m3, [r1+r2+9]
+ add r1, r4
+ mova [r0], m0
+ mova [r0+r2], m1
+ mova [r0+8], m2
+ mova [r0+r2+8], m3
+ add r0, r4
+ sub r3d, 4
+ jne .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_PIXELS_16
+INIT_MMX 3dnow
+PUT_PIXELS_16
+
+
+; put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro PUT_NO_RND_PIXELS8_X2 0
+cglobal put_no_rnd_pixels8_x2, 4,5
+ mova m6, [pb_1]
+ lea r4, [r2*2]
+.loop:
+ mova m0, [r1]
+ mova m2, [r1+r2]
+ mova m1, [r1+1]
+ mova m3, [r1+r2+1]
+ add r1, r4
+ psubusb m0, m6
+ psubusb m2, m6
+ PAVGB m0, m1
+ PAVGB m2, m3
+ mova [r0], m0
+ mova [r0+r2], m2
+ mova m0, [r1]
+ mova m1, [r1+1]
+ mova m2, [r1+r2]
+ mova m3, [r1+r2+1]
+ add r0, r4
+ add r1, r4
+ psubusb m0, m6
+ psubusb m2, m6
+ PAVGB m0, m1
+ PAVGB m2, m3
+ mova [r0], m0
+ mova [r0+r2], m2
+ add r0, r4
+ sub r3d, 4
+ jne .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS8_X2
+INIT_MMX 3dnow
+PUT_NO_RND_PIXELS8_X2
+
+
+; put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro PUT_NO_RND_PIXELS8_X2_EXACT 0
+cglobal put_no_rnd_pixels8_x2_exact, 4,5
+ lea r4, [r2*3]
+ pcmpeqb m6, m6
+.loop:
+ mova m0, [r1]
+ mova m2, [r1+r2]
+ mova m1, [r1+1]
+ mova m3, [r1+r2+1]
+ pxor m0, m6
+ pxor m2, m6
+ pxor m1, m6
+ pxor m3, m6
+ PAVGB m0, m1
+ PAVGB m2, m3
+ pxor m0, m6
+ pxor m2, m6
+ mova [r0], m0
+ mova [r0+r2], m2
+ mova m0, [r1+r2*2]
+ mova m1, [r1+r2*2+1]
+ mova m2, [r1+r4]
+ mova m3, [r1+r4+1]
+ pxor m0, m6
+ pxor m1, m6
+ pxor m2, m6
+ pxor m3, m6
+ PAVGB m0, m1
+ PAVGB m2, m3
+ pxor m0, m6
+ pxor m2, m6
+ mova [r0+r2*2], m0
+ mova [r0+r4], m2
+ lea r1, [r1+r2*4]
+ lea r0, [r0+r2*4]
+ sub r3d, 4
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS8_X2_EXACT
+INIT_MMX 3dnow
+PUT_NO_RND_PIXELS8_X2_EXACT
+
+
+; put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro PUT_PIXELS8_Y2 0
+cglobal put_pixels8_y2, 4,5
+ lea r4, [r2*2]
+ mova m0, [r1]
+ sub r0, r2
+.loop:
+ mova m1, [r1+r2]
+ mova m2, [r1+r4]
+ add r1, r4
+ PAVGB m0, m1
+ PAVGB m1, m2
+ mova [r0+r2], m0
+ mova [r0+r4], m1
+ mova m1, [r1+r2]
+ mova m0, [r1+r4]
+ add r0, r4
+ add r1, r4
+ PAVGB m2, m1
+ PAVGB m1, m0
+ mova [r0+r2], m2
+ mova [r0+r4], m1
+ add r0, r4
+ sub r3d, 4
+ jne .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_PIXELS8_Y2
+INIT_MMX 3dnow
+PUT_PIXELS8_Y2
+
+
+; put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro PUT_NO_RND_PIXELS8_Y2 0
+cglobal put_no_rnd_pixels8_y2, 4,5
+ mova m6, [pb_1]
+ lea r4, [r2+r2]
+ mova m0, [r1]
+ sub r0, r2
+.loop:
+ mova m1, [r1+r2]
+ mova m2, [r1+r4]
+ add r1, r4
+ psubusb m1, m6
+ PAVGB m0, m1
+ PAVGB m1, m2
+ mova [r0+r2], m0
+ mova [r0+r4], m1
+ mova m1, [r1+r2]
+ mova m0, [r1+r4]
+ add r0, r4
+ add r1, r4
+ psubusb m1, m6
+ PAVGB m2, m1
+ PAVGB m1, m0
+ mova [r0+r2], m2
+ mova [r0+r4], m1
+ add r0, r4
+ sub r3d, 4
+ jne .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS8_Y2
+INIT_MMX 3dnow
+PUT_NO_RND_PIXELS8_Y2
+
+
+; put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0
+cglobal put_no_rnd_pixels8_y2_exact, 4,5
+ lea r4, [r2*3]
+ mova m0, [r1]
+ pcmpeqb m6, m6
+ add r1, r2
+ pxor m0, m6
+.loop:
+ mova m1, [r1]
+ mova m2, [r1+r2]
+ pxor m1, m6
+ pxor m2, m6
+ PAVGB m0, m1
+ PAVGB m1, m2
+ pxor m0, m6
+ pxor m1, m6
+ mova [r0], m0
+ mova [r0+r2], m1
+ mova m1, [r1+r2*2]
+ mova m0, [r1+r4]
+ pxor m1, m6
+ pxor m0, m6
+ PAVGB m2, m1
+ PAVGB m1, m0
+ pxor m2, m6
+ pxor m1, m6
+ mova [r0+r2*2], m2
+ mova [r0+r4], m1
+ lea r1, [r1+r2*4]
+ lea r0, [r0+r2*4]
+ sub r3d, 4
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS8_Y2_EXACT
+INIT_MMX 3dnow
+PUT_NO_RND_PIXELS8_Y2_EXACT
+
+
+; avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro AVG_PIXELS8 0
+cglobal avg_pixels8, 4,5
+ lea r4, [r2*2]
+.loop:
+ mova m0, [r0]
+ mova m1, [r0+r2]
+ PAVGB m0, [r1]
+ PAVGB m1, [r1+r2]
+ mova [r0], m0
+ mova [r0+r2], m1
+ add r1, r4
+ add r0, r4
+ mova m0, [r0]
+ mova m1, [r0+r2]
+ PAVGB m0, [r1]
+ PAVGB m1, [r1+r2]
+ add r1, r4
+ mova [r0], m0
+ mova [r0+r2], m1
+ add r0, r4
+ sub r3d, 4
+ jne .loop
+ REP_RET
+%endmacro
+
+INIT_MMX 3dnow
+AVG_PIXELS8
+
+
+; avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro AVG_PIXELS8_X2 0
+cglobal avg_pixels8_x2, 4,5
+ lea r4, [r2*2]
+.loop:
+ mova m0, [r1]
+ mova m2, [r1+r2]
+ PAVGB m0, [r1+1]
+ PAVGB m2, [r1+r2+1]
+ PAVGB m0, [r0]
+ PAVGB m2, [r0+r2]
+ add r1, r4
+ mova [r0], m0
+ mova [r0+r2], m2
+ mova m0, [r1]
+ mova m2, [r1+r2]
+ PAVGB m0, [r1+1]
+ PAVGB m2, [r1+r2+1]
+ add r0, r4
+ add r1, r4
+ PAVGB m0, [r0]
+ PAVGB m2, [r0+r2]
+ mova [r0], m0
+ mova [r0+r2], m2
+ add r0, r4
+ sub r3d, 4
+ jne .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+AVG_PIXELS8_X2
+INIT_MMX 3dnow
+AVG_PIXELS8_X2
+
+
+; avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro AVG_PIXELS8_Y2 0
+cglobal avg_pixels8_y2, 4,5
+ lea r4, [r2*2]
+ mova m0, [r1]
+ sub r0, r2
+.loop:
+ mova m1, [r1+r2]
+ mova m2, [r1+r4]
+ add r1, r4
+ PAVGB m0, m1
+ PAVGB m1, m2
+ mova m3, [r0+r2]
+ mova m4, [r0+r4]
+ PAVGB m0, m3
+ PAVGB m1, m4
+ mova [r0+r2], m0
+ mova [r0+r4], m1
+ mova m1, [r1+r2]
+ mova m0, [r1+r4]
+ PAVGB m2, m1
+ PAVGB m1, m0
+ add r0, r4
+ add r1, r4
+ mova m3, [r0+r2]
+ mova m4, [r0+r4]
+ PAVGB m2, m3
+ PAVGB m1, m4
+ mova [r0+r2], m2
+ mova [r0+r4], m1
+ add r0, r4
+ sub r3d, 4
+ jne .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+AVG_PIXELS8_Y2
+INIT_MMX 3dnow
+AVG_PIXELS8_Y2
+
+
+; avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro AVG_PIXELS8_XY2 0
+cglobal avg_pixels8_xy2, 4,5
+ mova m6, [pb_1]
+ lea r4, [r2*2]
+ mova m0, [r1]
+ pavgb m0, [r1+1]
+.loop:
+ mova m2, [r1+r4]
+ mova m1, [r1+r2]
+ psubusb m2, m6
+ pavgb m1, [r1+r2+1]
+ pavgb m2, [r1+r4+1]
+ add r1, r4
+ pavgb m0, m1
+ pavgb m1, m2
+ pavgb m0, [r0]
+ pavgb m1, [r0+r2]
+ mova [r0], m0
+ mova [r0+r2], m1
+ mova m1, [r1+r2]
+ mova m0, [r1+r4]
+ pavgb m1, [r1+r2+1]
+ pavgb m0, [r1+r4+1]
+ add r0, r4
+ add r1, r4
+ pavgb m2, m1
+ pavgb m1, m0
+ pavgb m2, [r0]
+ pavgb m1, [r0+r2]
+ mova [r0], m2
+ mova [r0+r2], m1
+ add r0, r4
+ sub r3d, 4
+ jne .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+AVG_PIXELS8_XY2
+INIT_MMX 3dnow
+AVG_PIXELS8_XY2
diff --git a/ffmpeg/libavcodec/x86/hpeldsp_avg_template.c b/ffmpeg/libavcodec/x86/hpeldsp_avg_template.c
new file mode 100644
index 0000000..b9a8f83
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/hpeldsp_avg_template.c
@@ -0,0 +1,77 @@
+/*
+ * DSP utils: average functions are compiled twice for 3dnow/mmxext
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
+ * and improved by Zdenek Kabelac <kabi@users.sf.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+//FIXME the following could be optimized too ...
+static void DEF(ff_put_no_rnd_pixels16_x2)(uint8_t *block,
+ const uint8_t *pixels,
+ ptrdiff_t line_size, int h)
+{
+ DEF(ff_put_no_rnd_pixels8_x2)(block, pixels, line_size, h);
+ DEF(ff_put_no_rnd_pixels8_x2)(block + 8, pixels + 8, line_size, h);
+}
+
+static void DEF(ff_put_pixels16_y2)(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h)
+{
+ DEF(ff_put_pixels8_y2)(block, pixels, line_size, h);
+ DEF(ff_put_pixels8_y2)(block + 8, pixels + 8, line_size, h);
+}
+
+static void DEF(ff_put_no_rnd_pixels16_y2)(uint8_t *block,
+ const uint8_t *pixels,
+ ptrdiff_t line_size, int h)
+{
+ DEF(ff_put_no_rnd_pixels8_y2)(block, pixels, line_size, h);
+ DEF(ff_put_no_rnd_pixels8_y2)(block + 8, pixels + 8, line_size, h);
+}
+
+static void DEF(ff_avg_pixels16)(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h)
+{
+ DEF(ff_avg_pixels8)(block, pixels, line_size, h);
+ DEF(ff_avg_pixels8)(block + 8, pixels + 8, line_size, h);
+}
+
+static void DEF(ff_avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h)
+{
+ DEF(ff_avg_pixels8_x2)(block, pixels, line_size, h);
+ DEF(ff_avg_pixels8_x2)(block + 8, pixels + 8, line_size, h);
+}
+
+static void DEF(ff_avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h)
+{
+ DEF(ff_avg_pixels8_y2)(block, pixels, line_size, h);
+ DEF(ff_avg_pixels8_y2)(block + 8, pixels + 8, line_size, h);
+}
+
+static void DEF(ff_avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h)
+{
+ DEF(ff_avg_pixels8_xy2)(block, pixels, line_size, h);
+ DEF(ff_avg_pixels8_xy2)(block + 8, pixels + 8, line_size, h);
+}
diff --git a/ffmpeg/libavcodec/x86/hpeldsp_init.c b/ffmpeg/libavcodec/x86/hpeldsp_init.c
new file mode 100644
index 0000000..4b877b8
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/hpeldsp_init.c
@@ -0,0 +1,415 @@
+/*
+ * MMX optimized DSP utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavcodec/hpeldsp.h"
+#include "dsputil_mmx.h"
+
+//#undef NDEBUG
+//#include <assert.h>
+
+#if HAVE_YASM
+void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
+ const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
+ const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
+ const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
+ const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+#endif /* HAVE_YASM */
+
+
+#if HAVE_INLINE_ASM
+
+#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
+#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
+
+#define MOVQ_BFE(regd) \
+ __asm__ volatile ( \
+ "pcmpeqd %%"#regd", %%"#regd" \n\t" \
+ "paddb %%"#regd", %%"#regd" \n\t" ::)
+
+#ifndef PIC
+#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
+#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
+#else
+// for shared libraries it is better to build constants in registers this way
+// pcmpeqd -> -1
+#define MOVQ_BONE(regd) \
+ __asm__ volatile ( \
+ "pcmpeqd %%"#regd", %%"#regd" \n\t" \
+ "psrlw $15, %%"#regd" \n\t" \
+ "packuswb %%"#regd", %%"#regd" \n\t" ::)
+
+#define MOVQ_WTWO(regd) \
+ __asm__ volatile ( \
+ "pcmpeqd %%"#regd", %%"#regd" \n\t" \
+ "psrlw $15, %%"#regd" \n\t" \
+ "psllw $1, %%"#regd" \n\t"::)
+
+#endif
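+/* Whichever variant is picked, MOVQ_BONE leaves the register filled with
+ * bytes of 1 (0x0101...01) and MOVQ_WTWO with 16-bit words of 2; the PIC
+ * versions simply synthesize those values from pcmpeqd and shifts instead of
+ * loading ff_bone/ff_wtwo from memory. */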
+
+// using regr as temporary and for the output result
+// first argument is unmodified and the second is trashed
+// regfe is supposed to contain 0xfefefefefefefefe
+#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
+ "movq "#rega", "#regr" \n\t" \
+ "pand "#regb", "#regr" \n\t" \
+ "pxor "#rega", "#regb" \n\t" \
+ "pand "#regfe", "#regb" \n\t" \
+ "psrlq $1, "#regb" \n\t" \
+ "paddb "#regb", "#regr" \n\t"
+
+#define PAVGB_MMX(rega, regb, regr, regfe) \
+ "movq "#rega", "#regr" \n\t" \
+ "por "#regb", "#regr" \n\t" \
+ "pxor "#rega", "#regb" \n\t" \
+ "pand "#regfe", "#regb" \n\t" \
+ "psrlq $1, "#regb" \n\t" \
+ "psubb "#regb", "#regr" \n\t"
+
+// mm6 is supposed to contain 0xfefefefefefefefe
+#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
+ "movq "#rega", "#regr" \n\t" \
+ "movq "#regc", "#regp" \n\t" \
+ "pand "#regb", "#regr" \n\t" \
+ "pand "#regd", "#regp" \n\t" \
+ "pxor "#rega", "#regb" \n\t" \
+ "pxor "#regc", "#regd" \n\t" \
+ "pand %%mm6, "#regb" \n\t" \
+ "pand %%mm6, "#regd" \n\t" \
+ "psrlq $1, "#regb" \n\t" \
+ "psrlq $1, "#regd" \n\t" \
+ "paddb "#regb", "#regr" \n\t" \
+ "paddb "#regd", "#regp" \n\t"
+
+#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
+ "movq "#rega", "#regr" \n\t" \
+ "movq "#regc", "#regp" \n\t" \
+ "por "#regb", "#regr" \n\t" \
+ "por "#regd", "#regp" \n\t" \
+ "pxor "#rega", "#regb" \n\t" \
+ "pxor "#regc", "#regd" \n\t" \
+ "pand %%mm6, "#regb" \n\t" \
+ "pand %%mm6, "#regd" \n\t" \
+ "psrlq $1, "#regd" \n\t" \
+ "psrlq $1, "#regb" \n\t" \
+ "psubb "#regb", "#regr" \n\t" \
+ "psubb "#regd", "#regp" \n\t"
+
+/***********************************/
+/* MMX no rounding */
+#define NO_RND 1
+#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
+#define SET_RND MOVQ_WONE
+#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
+#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
+#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
+
+#include "hpeldsp_rnd_template.c"
+
+#undef DEF
+#undef SET_RND
+#undef PAVGBP
+#undef PAVGB
+#undef NO_RND
+/***********************************/
+/* MMX rounding */
+
+#define DEF(x, y) x ## _ ## y ## _mmx
+#define SET_RND MOVQ_WTWO
+#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
+#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
+
+#include "hpeldsp_rnd_template.c"
+
+#undef DEF
+#undef SET_RND
+#undef PAVGBP
+#undef PAVGB
+#undef OP_AVG
+
+#endif /* HAVE_INLINE_ASM */
+
+
+#if HAVE_YASM
+#define ff_put_pixels8_mmx ff_put_pixels8_mmxext
+
+/***********************************/
+/* 3Dnow specific */
+
+#define DEF(x) x ## _3dnow
+
+#include "hpeldsp_avg_template.c"
+
+#undef DEF
+
+/***********************************/
+/* MMXEXT specific */
+
+#define DEF(x) x ## _mmxext
+
+#include "hpeldsp_avg_template.c"
+
+#undef DEF
+
+#endif /* HAVE_YASM */
+
+
+#if HAVE_INLINE_ASM
+#define put_no_rnd_pixels16_mmx put_pixels16_mmx
+#define put_no_rnd_pixels8_mmx put_pixels8_mmx
+#define put_pixels16_mmxext put_pixels16_mmx
+#define put_pixels8_mmxext put_pixels8_mmx
+#define put_pixels4_mmxext put_pixels4_mmx
+#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
+#define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx
+
+static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h)
+{
+ __asm__ volatile (
+ "lea (%3, %3), %%"REG_a" \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1 ), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1 ), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ : "+g"(h), "+r"(pixels), "+r"(block)
+ : "r"((x86_reg)line_size)
+ : "%"REG_a, "memory"
+ );
+}
+
+static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h)
+{
+ __asm__ volatile (
+ "lea (%3, %3), %%"REG_a" \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1 ), %%mm0 \n\t"
+ "movq 8(%1 ), %%mm4 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq 8(%1, %3), %%mm5 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm4, 8(%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "movq %%mm5, 8(%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1 ), %%mm0 \n\t"
+ "movq 8(%1 ), %%mm4 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq 8(%1, %3), %%mm5 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm4, 8(%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "movq %%mm5, 8(%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ : "+g"(h), "+r"(pixels), "+r"(block)
+ : "r"((x86_reg)line_size)
+ : "%"REG_a, "memory"
+ );
+}
+#endif /* HAVE_INLINE_ASM */
+
+void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+
+#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
+ do { \
+ c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
+ c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
+ c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
+ c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
+ } while (0)
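+/* A minimal usage sketch, assuming the usual hpel conventions (first index:
+ * 0 = 16-wide, 1 = 8-wide blocks; second index: half-pel position with
+ * 0 = full, 1 = x half, 2 = y half, 3 = xy half); mx/my are hypothetical
+ * motion-vector components:
+ *
+ *     int dxy = (mx & 1) + 2 * (my & 1);
+ *     c->put_pixels_tab[1][dxy](dst, src, stride, 8);
+ */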
+
+static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int mm_flags)
+{
+#if HAVE_INLINE_ASM
+ SET_HPEL_FUNCS(put, [0], 16, mmx);
+ SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
+ SET_HPEL_FUNCS(avg, [0], 16, mmx);
+ SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
+ SET_HPEL_FUNCS(put, [1], 8, mmx);
+ SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
+ SET_HPEL_FUNCS(avg, [1], 8, mmx);
+#endif /* HAVE_INLINE_ASM */
+}
+
+static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int mm_flags)
+{
+#if HAVE_YASM
+ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
+ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
+
+ c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
+ c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
+ c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
+
+ c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
+ c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
+
+ c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
+ c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
+ c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
+
+ if (!(flags & CODEC_FLAG_BITEXACT)) {
+ c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
+ c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
+
+ c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
+ c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
+ }
+#endif /* HAVE_YASM */
+
+#if HAVE_MMXEXT_EXTERNAL
+ if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
+ }
+#endif /* HAVE_MMXEXT_EXTERNAL */
+}
+
+static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int mm_flags)
+{
+#if HAVE_YASM
+ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
+ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
+
+ c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
+ c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
+ c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;
+
+ c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
+ c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
+
+ c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
+ c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
+ c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
+
+ if (!(flags & CODEC_FLAG_BITEXACT)){
+ c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
+ c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
+
+ c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
+ c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
+ }
+
+ if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
+ }
+#endif /* HAVE_YASM */
+}
+
+static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int mm_flags)
+{
+#if HAVE_SSE2_EXTERNAL
+ if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
+ // these functions are slower than mmx on AMD, but faster on Intel
+ c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
+ c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
+ c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
+ }
+#endif /* HAVE_SSE2_EXTERNAL */
+}
+
+void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
+{
+ int mm_flags = av_get_cpu_flags();
+
+ if (mm_flags & AV_CPU_FLAG_MMX)
+ hpeldsp_init_mmx(c, flags, mm_flags);
+
+ if (mm_flags & AV_CPU_FLAG_MMXEXT)
+ hpeldsp_init_mmxext(c, flags, mm_flags);
+
+ if (mm_flags & AV_CPU_FLAG_3DNOW)
+ hpeldsp_init_3dnow(c, flags, mm_flags);
+
+ if (mm_flags & AV_CPU_FLAG_SSE2)
+ hpeldsp_init_sse2(c, flags, mm_flags);
+}
diff --git a/ffmpeg/libavcodec/x86/hpeldsp_rnd_template.c b/ffmpeg/libavcodec/x86/hpeldsp_rnd_template.c
new file mode 100644
index 0000000..07de675
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/hpeldsp_rnd_template.c
@@ -0,0 +1,428 @@
+/*
+ * DSP utils: MMX functions are compiled twice for rnd/no_rnd
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
+ * and improved by Zdenek Kabelac <kabi@users.sf.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+// put_pixels
+static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
+
+static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "movq 8(%1), %%mm0 \n\t"
+ "movq 9(%1), %%mm1 \n\t"
+ "movq 8(%1, %3), %%mm2 \n\t"
+ "movq 9(%1, %3), %%mm3 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, 8(%2) \n\t"
+ "movq %%mm5, 8(%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "movq 8(%1), %%mm0 \n\t"
+ "movq 9(%1), %%mm1 \n\t"
+ "movq 8(%1, %3), %%mm2 \n\t"
+ "movq 9(%1, %3), %%mm3 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, 8(%2) \n\t"
+ "movq %%mm5, 8(%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
+
+static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "movq (%1), %%mm0 \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"),%%mm2 \n\t"
+ PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"),%%mm0 \n\t"
+ PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
+
+static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ MOVQ_ZERO(mm7);
+ SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm4 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "xor %%"REG_a", %%"REG_a" \n\t"
+ "add %3, %1 \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "paddusw %%mm2, %%mm0 \n\t"
+ "paddusw %%mm3, %%mm1 \n\t"
+ "paddusw %%mm6, %%mm4 \n\t"
+ "paddusw %%mm6, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "psrlw $2, %%mm4 \n\t"
+ "psrlw $2, %%mm5 \n\t"
+ "packuswb %%mm5, %%mm4 \n\t"
+ "movq %%mm4, (%2, %%"REG_a") \n\t"
+ "add %3, %%"REG_a" \n\t"
+
+ "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
+ "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm2, %%mm4 \n\t"
+ "paddusw %%mm3, %%mm5 \n\t"
+ "paddusw %%mm6, %%mm0 \n\t"
+ "paddusw %%mm6, %%mm1 \n\t"
+ "paddusw %%mm4, %%mm0 \n\t"
+ "paddusw %%mm5, %%mm1 \n\t"
+ "psrlw $2, %%mm0 \n\t"
+ "psrlw $2, %%mm1 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "movq %%mm0, (%2, %%"REG_a") \n\t"
+ "add %3, %%"REG_a" \n\t"
+
+ "subl $2, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels)
+ :"D"(block), "r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
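+/* A scalar sketch of the xy2 kernel above: each output byte is the 2x2
+ * average of its four source neighbours, with the SET_RND value (2 for the
+ * rnd build, 1 for no_rnd) folded in before the final shift:
+ *
+ *     dst[x] = (src[x] + src[x + 1] + src[x + stride] + src[x + stride + 1]
+ *               + rnd) >> 2;
+ */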
+
+// avg_pixels
+#ifndef NO_RND
+// in case more speed is needed - unrolling would certainly help
+static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movq %0, %%mm0 \n\t"
+ "movq %1, %%mm1 \n\t"
+ OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
+ "movq %%mm2, %0 \n\t"
+ :"+m"(*block)
+ :"m"(*pixels)
+ :"memory");
+ pixels += line_size;
+ block += line_size;
+ }
+ while (--h);
+}
+#endif // NO_RND
+
+static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movq %0, %%mm0 \n\t"
+ "movq %1, %%mm1 \n\t"
+ OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
+ "movq %%mm2, %0 \n\t"
+ "movq 8%0, %%mm0 \n\t"
+ "movq 8%1, %%mm1 \n\t"
+ OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
+ "movq %%mm2, 8%0 \n\t"
+ :"+m"(*block)
+ :"m"(*pixels)
+ :"memory");
+ pixels += line_size;
+ block += line_size;
+ }
+ while (--h);
+}
+
+#ifndef NO_RND
+static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movq %1, %%mm0 \n\t"
+ "movq 1%1, %%mm1 \n\t"
+ "movq %0, %%mm3 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
+ "movq %%mm0, %0 \n\t"
+ :"+m"(*block)
+ :"m"(*pixels)
+ :"memory");
+ pixels += line_size;
+ block += line_size;
+ } while (--h);
+}
+#endif // NO_RND
+
+static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movq %1, %%mm0 \n\t"
+ "movq 1%1, %%mm1 \n\t"
+ "movq %0, %%mm3 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
+ "movq %%mm0, %0 \n\t"
+ "movq 8%1, %%mm0 \n\t"
+ "movq 9%1, %%mm1 \n\t"
+ "movq 8%0, %%mm3 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
+ "movq %%mm0, 8%0 \n\t"
+ :"+m"(*block)
+ :"m"(*pixels)
+ :"memory");
+ pixels += line_size;
+ block += line_size;
+ } while (--h);
+}
+
+static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "movq (%1), %%mm0 \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm2 \n\t"
+ PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
+ "movq (%2), %%mm3 \n\t"
+ OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6)
+ "movq (%2, %3), %%mm3 \n\t"
+ OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
+ "movq (%2), %%mm3 \n\t"
+ OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6)
+ "movq (%2, %3), %%mm3 \n\t"
+ OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
+ "movq %%mm2, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
+
+// this routine is 'slightly' suboptimal but mostly unused
+static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ MOVQ_ZERO(mm7);
+ SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm4 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "xor %%"REG_a", %%"REG_a" \n\t"
+ "add %3, %1 \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "paddusw %%mm2, %%mm0 \n\t"
+ "paddusw %%mm3, %%mm1 \n\t"
+ "paddusw %%mm6, %%mm4 \n\t"
+ "paddusw %%mm6, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "psrlw $2, %%mm4 \n\t"
+ "psrlw $2, %%mm5 \n\t"
+ "movq (%2, %%"REG_a"), %%mm3 \n\t"
+ "packuswb %%mm5, %%mm4 \n\t"
+ "pcmpeqd %%mm2, %%mm2 \n\t"
+ "paddb %%mm2, %%mm2 \n\t"
+ OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2)
+ "movq %%mm5, (%2, %%"REG_a") \n\t"
+ "add %3, %%"REG_a" \n\t"
+
+ "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
+ "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm2, %%mm4 \n\t"
+ "paddusw %%mm3, %%mm5 \n\t"
+ "paddusw %%mm6, %%mm0 \n\t"
+ "paddusw %%mm6, %%mm1 \n\t"
+ "paddusw %%mm4, %%mm0 \n\t"
+ "paddusw %%mm5, %%mm1 \n\t"
+ "psrlw $2, %%mm0 \n\t"
+ "psrlw $2, %%mm1 \n\t"
+ "movq (%2, %%"REG_a"), %%mm3 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "pcmpeqd %%mm2, %%mm2 \n\t"
+ "paddb %%mm2, %%mm2 \n\t"
+ OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2)
+ "movq %%mm1, (%2, %%"REG_a") \n\t"
+ "add %3, %%"REG_a" \n\t"
+
+ "subl $2, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels)
+ :"D"(block), "r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
+
+//FIXME optimize
+static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
+ DEF(put, pixels8_y2)(block , pixels , line_size, h);
+ DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+
+static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
+ DEF(put, pixels8_xy2)(block , pixels , line_size, h);
+ DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
+}
+
+static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
+ DEF(avg, pixels8_y2)(block , pixels , line_size, h);
+ DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+
+static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
+ DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
+ DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
+}
diff --git a/ffmpeg/libavcodec/x86/idct_mmx_xvid.c b/ffmpeg/libavcodec/x86/idct_mmx_xvid.c
new file mode 100644
index 0000000..5e9f405
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/idct_mmx_xvid.c
@@ -0,0 +1,558 @@
+/*
+ * XVID MPEG-4 VIDEO CODEC
+ * - MMX and XMM inverse discrete cosine transform -
+ *
+ * Copyright(C) 2001 Peter Ross <pross@xvid.org>
+ *
+ * Originally provided by Intel at AP-922
+ * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+ * (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
+ * though only in a limited form.
+ * A new macro implements the column pass of the precise iDCT;
+ * the routine's precision now satisfies IEEE standard 1180-1990.
+ *
+ * Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
+ * Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
+ *
+ * http://www.elecard.com/peter/idct.html
+ * http://www.linuxvideo.org/mpeg2dec/
+ *
+ * These examples contain code fragments for first stage iDCT 8x8
+ * (for rows) and first stage DCT 8x8 (for columns)
+ *
+ * conversion to gcc syntax by Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with FFmpeg; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <inttypes.h>
+
+#include "config.h"
+#include "libavcodec/avcodec.h"
+#include "libavutil/mem.h"
+#include "dsputil_mmx.h"
+#include "idct_xvid.h"
+
+#if HAVE_INLINE_ASM
+
+//=============================================================================
+// Macros and other preprocessor constants
+//=============================================================================
+
+#define BITS_INV_ACC 5 // 4 or 5 for IEEE
+#define SHIFT_INV_ROW (16 - BITS_INV_ACC) //11
+#define SHIFT_INV_COL (1 + BITS_INV_ACC) //6
+#define RND_INV_ROW (1024 * (6 - BITS_INV_ACC))
+#define RND_INV_COL (16 * (BITS_INV_ACC - 3))
+#define RND_INV_CORR (RND_INV_COL - 1)
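+// (For BITS_INV_ACC == 5, RND_INV_ROW == 1 << (SHIFT_INV_ROW - 1) and
+// RND_INV_COL == 1 << (SHIFT_INV_COL - 1), i.e. the usual 0.5-LSB rounding
+// biases for the corresponding right shifts.)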
+
+#define BITS_FRW_ACC 3 // 2 or 3 for accuracy
+#define SHIFT_FRW_COL BITS_FRW_ACC
+#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17)
+#define RND_FRW_ROW (262144*(BITS_FRW_ACC - 1))
+
+
+//-----------------------------------------------------------------------------
+// Various memory constants (trigonometric values or rounding values)
+//-----------------------------------------------------------------------------
+
+
+DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4*4] = {
+ 13036,13036,13036,13036, // tan(pi/16) * (1<<16)
+ 27146,27146,27146,27146, // tan(2*pi/16) * (1<<16)
+ -21746,-21746,-21746,-21746, // (tan(3*pi/16) - 1) * (1<<16)
+ 23170,23170,23170,23170}; // cos(pi/4) * (1<<15)
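+// Note on scaling: pmulhw keeps only the high 16 bits of the 32-bit product,
+// so multiplying by round(t * (1<<16)) approximates multiplication by t.
+// tan(3*pi/16) > 0.5 does not fit as a positive int16_t at that scale, so
+// tan(3*pi/16) - 1 is stored and the column pass adds the input back in
+// afterwards; cos(pi/4) is stored at (1<<15) scale and the halved result is
+// doubled later (see the "b1/2" / "b2/2" comments in DCT_8_INV_COL).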
+
+DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2*8] = {
+ 65536,65536,
+ 3597,3597,
+ 2260,2260,
+ 1203,1203,
+ 0,0,
+ 120,120,
+ 512,512,
+ 512,512};
+
+//-----------------------------------------------------------------------------
+//
+// The first stage iDCT 8x8 - inverse DCTs of rows
+//
+//-----------------------------------------------------------------------------
+// The 8-point inverse DCT direct algorithm
+//-----------------------------------------------------------------------------
+//
+// static const short w[32] = {
+// FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16),
+// FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
+// FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16),
+// FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16),
+// FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16),
+// FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
+// FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16),
+// FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) };
+//
+// #define DCT_8_INV_ROW(x, y)
+// {
+// int a0, a1, a2, a3, b0, b1, b2, b3;
+//
+// a0 = x[0] * w[ 0] + x[2] * w[ 1] + x[4] * w[ 2] + x[6] * w[ 3];
+// a1 = x[0] * w[ 4] + x[2] * w[ 5] + x[4] * w[ 6] + x[6] * w[ 7];
+// a2 = x[0] * w[ 8] + x[2] * w[ 9] + x[4] * w[10] + x[6] * w[11];
+// a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
+// b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
+// b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
+// b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
+// b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
+//
+// y[0] = SHIFT_ROUND ( a0 + b0 );
+// y[1] = SHIFT_ROUND ( a1 + b1 );
+// y[2] = SHIFT_ROUND ( a2 + b2 );
+// y[3] = SHIFT_ROUND ( a3 + b3 );
+// y[4] = SHIFT_ROUND ( a3 - b3 );
+// y[5] = SHIFT_ROUND ( a2 - b2 );
+// y[6] = SHIFT_ROUND ( a1 - b1 );
+// y[7] = SHIFT_ROUND ( a0 - b0 );
+// }
+//
+//-----------------------------------------------------------------------------
+//
+// In this implementation the outputs of the iDCT-1D are multiplied
+// for rows 0,4 - by cos_4_16,
+// for rows 1,7 - by cos_1_16,
+// for rows 2,6 - by cos_2_16,
+// for rows 3,5 - by cos_3_16
+// and are shifted to the left for better accuracy
+//
+// For the constants used,
+// FIX(float_const) = (short) (float_const * (1<<15) + 0.5)
+//
+//-----------------------------------------------------------------------------
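+
+// As a plain scalar sketch, one row pass amounts to the following (illustrative
+// only; the tab_i_* tables below interleave the w[] coefficients for pmaddwd,
+// so the indexing here follows the flat w[32] layout documented above, and rnd
+// is the per-row entry of rounder_0):
+//
+// static void idct_row_ref(int16_t *y, const int16_t *x,
+//                          const int16_t *w, int rnd)
+// {
+//     int i;
+//     for (i = 0; i < 4; i++) {
+//         int a = x[0]*w[4*i+0] + x[2]*w[4*i+1] +
+//                 x[4]*w[4*i+2] + x[6]*w[4*i+3] + rnd;
+//         int b = x[1]*w[16+4*i+0] + x[3]*w[16+4*i+1] +
+//                 x[5]*w[16+4*i+2] + x[7]*w[16+4*i+3];
+//         y[i]     = (a + b) >> SHIFT_INV_ROW;
+//         y[7 - i] = (a - b) >> SHIFT_INV_ROW;
+//     }
+// }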
+
+//-----------------------------------------------------------------------------
+// Tables for mmx processors
+//-----------------------------------------------------------------------------
+
+// Table for rows 0,4 - constants are multiplied by cos_4_16
+DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmx)[32*4] = {
+ 16384,16384,16384,-16384, // movq-> w06 w04 w02 w00
+ 21407,8867,8867,-21407, // w07 w05 w03 w01
+ 16384,-16384,16384,16384, // w14 w12 w10 w08
+ -8867,21407,-21407,-8867, // w15 w13 w11 w09
+ 22725,12873,19266,-22725, // w22 w20 w18 w16
+ 19266,4520,-4520,-12873, // w23 w21 w19 w17
+ 12873,4520,4520,19266, // w30 w28 w26 w24
+ -22725,19266,-12873,-22725, // w31 w29 w27 w25
+// Table for rows 1,7 - constants are multiplied by cos_1_16
+ 22725,22725,22725,-22725, // movq-> w06 w04 w02 w00
+ 29692,12299,12299,-29692, // w07 w05 w03 w01
+ 22725,-22725,22725,22725, // w14 w12 w10 w08
+ -12299,29692,-29692,-12299, // w15 w13 w11 w09
+ 31521,17855,26722,-31521, // w22 w20 w18 w16
+ 26722,6270,-6270,-17855, // w23 w21 w19 w17
+ 17855,6270,6270,26722, // w30 w28 w26 w24
+ -31521,26722,-17855,-31521, // w31 w29 w27 w25
+// Table for rows 2,6 - constants are multiplied by cos_2_16
+ 21407,21407,21407,-21407, // movq-> w06 w04 w02 w00
+ 27969,11585,11585,-27969, // w07 w05 w03 w01
+ 21407,-21407,21407,21407, // w14 w12 w10 w08
+ -11585,27969,-27969,-11585, // w15 w13 w11 w09
+ 29692,16819,25172,-29692, // w22 w20 w18 w16
+ 25172,5906,-5906,-16819, // w23 w21 w19 w17
+ 16819,5906,5906,25172, // w30 w28 w26 w24
+ -29692,25172,-16819,-29692, // w31 w29 w27 w25
+// Table for rows 3,5 - constants are multiplied by cos_3_16
+ 19266,19266,19266,-19266, // movq-> w06 w04 w02 w00
+ 25172,10426,10426,-25172, // w07 w05 w03 w01
+ 19266,-19266,19266,19266, // w14 w12 w10 w08
+ -10426,25172,-25172,-10426, // w15 w13 w11 w09
+ 26722,15137,22654,-26722, // w22 w20 w18 w16
+ 22654,5315,-5315,-15137, // w23 w21 w19 w17
+ 15137,5315,5315,22654, // w30 w28 w26 w24
+ -26722,22654,-15137,-26722, // w31 w29 w27 w25
+};
+//-----------------------------------------------------------------------------
+// Tables for xmm processors
+//-----------------------------------------------------------------------------
+
+// Table for rows 0,4 - constants are multiplied by cos_4_16
+DECLARE_ALIGNED(8, static const int16_t, tab_i_04_xmm)[32*4] = {
+ 16384,21407,16384,8867, // movq-> w05 w04 w01 w00
+ 16384,8867,-16384,-21407, // w07 w06 w03 w02
+ 16384,-8867,16384,-21407, // w13 w12 w09 w08
+ -16384,21407,16384,-8867, // w15 w14 w11 w10
+ 22725,19266,19266,-4520, // w21 w20 w17 w16
+ 12873,4520,-22725,-12873, // w23 w22 w19 w18
+ 12873,-22725,4520,-12873, // w29 w28 w25 w24
+ 4520,19266,19266,-22725, // w31 w30 w27 w26
+// Table for rows 1,7 - constants are multiplied by cos_1_16
+ 22725,29692,22725,12299, // movq-> w05 w04 w01 w00
+ 22725,12299,-22725,-29692, // w07 w06 w03 w02
+ 22725,-12299,22725,-29692, // w13 w12 w09 w08
+ -22725,29692,22725,-12299, // w15 w14 w11 w10
+ 31521,26722,26722,-6270, // w21 w20 w17 w16
+ 17855,6270,-31521,-17855, // w23 w22 w19 w18
+ 17855,-31521,6270,-17855, // w29 w28 w25 w24
+ 6270,26722,26722,-31521, // w31 w30 w27 w26
+// Table for rows 2,6 - constants are multiplied by cos_2_16
+ 21407,27969,21407,11585, // movq-> w05 w04 w01 w00
+ 21407,11585,-21407,-27969, // w07 w06 w03 w02
+ 21407,-11585,21407,-27969, // w13 w12 w09 w08
+ -21407,27969,21407,-11585, // w15 w14 w11 w10
+ 29692,25172,25172,-5906, // w21 w20 w17 w16
+ 16819,5906,-29692,-16819, // w23 w22 w19 w18
+ 16819,-29692,5906,-16819, // w29 w28 w25 w24
+ 5906,25172,25172,-29692, // w31 w30 w27 w26
+// Table for rows 3,5 - constants are multiplied by cos_3_16
+ 19266,25172,19266,10426, // movq-> w05 w04 w01 w00
+ 19266,10426,-19266,-25172, // w07 w06 w03 w02
+ 19266,-10426,19266,-25172, // w13 w12 w09 w08
+ -19266,25172,19266,-10426, // w15 w14 w11 w10
+ 26722,22654,22654,-5315, // w21 w20 w17 w16
+ 15137,5315,-26722,-15137, // w23 w22 w19 w18
+ 15137,-26722,5315,-15137, // w29 w28 w25 w24
+ 5315,22654,22654,-26722, // w31 w30 w27 w26
+};
+//=============================================================================
+// Helper macros for the code
+//=============================================================================
+
+//-----------------------------------------------------------------------------
+// DCT_8_INV_ROW_MMX( INP, OUT, TABLE, ROUNDER )
+//-----------------------------------------------------------------------------
+
+#define DCT_8_INV_ROW_MMX(A1,A2,A3,A4)\
+ "movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\
+ "movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\
+ "movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\
+ "movq " #A3 ",%%mm3 \n\t"/* 3 ; w06 w04 w02 w00*/\
+ "punpcklwd %%mm1,%%mm0 \n\t"/* x5 x1 x4 x0*/\
+ "movq %%mm0,%%mm5 \n\t"/* 5 ; x5 x1 x4 x0*/\
+ "punpckldq %%mm0,%%mm0 \n\t"/* x4 x0 x4 x0*/\
+ "movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w05 w03 w01*/\
+ "punpckhwd %%mm1,%%mm2 \n\t"/* 1 ; x7 x3 x6 x2*/\
+ "pmaddwd %%mm0,%%mm3 \n\t"/* x4*w06+x0*w04 x4*w02+x0*w00*/\
+ "movq %%mm2,%%mm6 \n\t"/* 6 ; x7 x3 x6 x2*/\
+ "movq 32+" #A3 ",%%mm1 \n\t"/* 1 ; w22 w20 w18 w16*/\
+ "punpckldq %%mm2,%%mm2 \n\t"/* x6 x2 x6 x2*/\
+ "pmaddwd %%mm2,%%mm4 \n\t"/* x6*w07+x2*w05 x6*w03+x2*w01*/\
+ "punpckhdq %%mm5,%%mm5 \n\t"/* x5 x1 x5 x1*/\
+ "pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x4*w14+x0*w12 x4*w10+x0*w08*/\
+ "punpckhdq %%mm6,%%mm6 \n\t"/* x7 x3 x7 x3*/\
+ "movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w21 w19 w17*/\
+ "pmaddwd %%mm5,%%mm1 \n\t"/* x5*w22+x1*w20 x5*w18+x1*w16*/\
+ "paddd " #A4 ",%%mm3 \n\t"/* +%4*/\
+ "pmaddwd %%mm6,%%mm7 \n\t"/* x7*w23+x3*w21 x7*w19+x3*w17*/\
+ "pmaddwd 24+" #A3 ",%%mm2 \n\t"/* x6*w15+x2*w13 x6*w11+x2*w09*/\
+ "paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\
+ "pmaddwd 48+" #A3 ",%%mm5 \n\t"/* x5*w30+x1*w28 x5*w26+x1*w24*/\
+ "movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\
+ "pmaddwd 56+" #A3 ",%%mm6 \n\t"/* x7*w31+x3*w29 x7*w27+x3*w25*/\
+ "paddd %%mm7,%%mm1 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\
+ "paddd " #A4 ",%%mm0 \n\t"/* +%4*/\
+ "psubd %%mm1,%%mm3 \n\t"/* a1-b1 a0-b0*/\
+ "psrad $11,%%mm3 \n\t"/* y6=a1-b1 y7=a0-b0*/\
+ "paddd %%mm4,%%mm1 \n\t"/* 4 ; a1+b1 a0+b0*/\
+ "paddd %%mm2,%%mm0 \n\t"/* 2 ; a3=sum(even3) a2=sum(even2)*/\
+ "psrad $11,%%mm1 \n\t"/* y1=a1+b1 y0=a0+b0*/\
+ "paddd %%mm6,%%mm5 \n\t"/* 6 ; b3=sum(odd3) b2=sum(odd2)*/\
+ "movq %%mm0,%%mm4 \n\t"/* 4 ; a3 a2*/\
+ "paddd %%mm5,%%mm0 \n\t"/* a3+b3 a2+b2*/\
+ "psubd %%mm5,%%mm4 \n\t"/* 5 ; a3-b3 a2-b2*/\
+ "psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\
+ "psrad $11,%%mm4 \n\t"/* y4=a3-b3 y5=a2-b2*/\
+ "packssdw %%mm0,%%mm1 \n\t"/* 0 ; y3 y2 y1 y0*/\
+ "packssdw %%mm3,%%mm4 \n\t"/* 3 ; y6 y7 y4 y5*/\
+ "movq %%mm4,%%mm7 \n\t"/* 7 ; y6 y7 y4 y5*/\
+ "psrld $16,%%mm4 \n\t"/* 0 y6 0 y4*/\
+ "pslld $16,%%mm7 \n\t"/* y7 0 y5 0*/\
+ "movq %%mm1," #A2 " \n\t"/* 1 ; save y3 y2 y1 y0*/\
+ "por %%mm4,%%mm7 \n\t"/* 4 ; y7 y6 y5 y4*/\
+ "movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\
+
+
+//-----------------------------------------------------------------------------
+// DCT_8_INV_ROW_XMM( INP, OUT, TABLE, ROUNDER )
+//-----------------------------------------------------------------------------
+
+#define DCT_8_INV_ROW_XMM(A1,A2,A3,A4)\
+ "movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\
+ "movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\
+ "movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\
+ "movq " #A3 ",%%mm3 \n\t"/* 3 ; w05 w04 w01 w00*/\
+ "pshufw $0x88,%%mm0,%%mm0 \n\t"/* x2 x0 x2 x0*/\
+ "movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w06 w03 w02*/\
+ "movq %%mm1,%%mm5 \n\t"/* 5 ; x7 x6 x5 x4*/\
+ "pmaddwd %%mm0,%%mm3 \n\t"/* x2*w05+x0*w04 x2*w01+x0*w00*/\
+ "movq 32+" #A3 ",%%mm6 \n\t"/* 6 ; w21 w20 w17 w16*/\
+ "pshufw $0x88,%%mm1,%%mm1 \n\t"/* x6 x4 x6 x4*/\
+ "pmaddwd %%mm1,%%mm4 \n\t"/* x6*w07+x4*w06 x6*w03+x4*w02*/\
+ "movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w22 w19 w18*/\
+ "pshufw $0xdd,%%mm2,%%mm2 \n\t"/* x3 x1 x3 x1*/\
+ "pmaddwd %%mm2,%%mm6 \n\t"/* x3*w21+x1*w20 x3*w17+x1*w16*/\
+ "pshufw $0xdd,%%mm5,%%mm5 \n\t"/* x7 x5 x7 x5*/\
+ "pmaddwd %%mm5,%%mm7 \n\t"/* x7*w23+x5*w22 x7*w19+x5*w18*/\
+ "paddd " #A4 ",%%mm3 \n\t"/* +%4*/\
+ "pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x2*w13+x0*w12 x2*w09+x0*w08*/\
+ "paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\
+ "pmaddwd 24+" #A3 ",%%mm1 \n\t"/* x6*w15+x4*w14 x6*w11+x4*w10*/\
+ "movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\
+ "pmaddwd 48+" #A3 ",%%mm2 \n\t"/* x3*w29+x1*w28 x3*w25+x1*w24*/\
+ "paddd %%mm7,%%mm6 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\
+ "pmaddwd 56+" #A3 ",%%mm5 \n\t"/* x7*w31+x5*w30 x7*w27+x5*w26*/\
+ "paddd %%mm6,%%mm3 \n\t"/* a1+b1 a0+b0*/\
+ "paddd " #A4 ",%%mm0 \n\t"/* +%4*/\
+ "psrad $11,%%mm3 \n\t"/* y1=a1+b1 y0=a0+b0*/\
+ "paddd %%mm1,%%mm0 \n\t"/* 1 ; a3=sum(even3) a2=sum(even2)*/\
+ "psubd %%mm6,%%mm4 \n\t"/* 6 ; a1-b1 a0-b0*/\
+ "movq %%mm0,%%mm7 \n\t"/* 7 ; a3 a2*/\
+ "paddd %%mm5,%%mm2 \n\t"/* 5 ; b3=sum(odd3) b2=sum(odd2)*/\
+ "paddd %%mm2,%%mm0 \n\t"/* a3+b3 a2+b2*/\
+ "psrad $11,%%mm4 \n\t"/* y6=a1-b1 y7=a0-b0*/\
+ "psubd %%mm2,%%mm7 \n\t"/* 2 ; a3-b3 a2-b2*/\
+ "psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\
+ "psrad $11,%%mm7 \n\t"/* y4=a3-b3 y5=a2-b2*/\
+ "packssdw %%mm0,%%mm3 \n\t"/* 0 ; y3 y2 y1 y0*/\
+ "packssdw %%mm4,%%mm7 \n\t"/* 4 ; y6 y7 y4 y5*/\
+ "movq %%mm3, " #A2 " \n\t"/* 3 ; save y3 y2 y1 y0*/\
+ "pshufw $0xb1,%%mm7,%%mm7 \n\t"/* y7 y6 y5 y4*/\
+ "movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\
+
+
+//-----------------------------------------------------------------------------
+//
+// The first stage DCT 8x8 - forward DCTs of columns
+//
+// The inputs are multiplied
+// for rows 0,4 - by cos_4_16,
+// for rows 1,7 - by cos_1_16,
+// for rows 2,6 - by cos_2_16,
+// for rows 3,5 - by cos_3_16
+// and are shifted to the left for better accuracy
+//
+//-----------------------------------------------------------------------------
+//
+// The 8-point scaled forward DCT algorithm (26a8m)
+//
+//-----------------------------------------------------------------------------
+//
+// #define DCT_8_FRW_COL(x, y)
+//{
+// short t0, t1, t2, t3, t4, t5, t6, t7;
+// short tp03, tm03, tp12, tm12, tp65, tm65;
+// short tp465, tm465, tp765, tm765;
+//
+// t0 = LEFT_SHIFT ( x[0] + x[7] );
+// t1 = LEFT_SHIFT ( x[1] + x[6] );
+// t2 = LEFT_SHIFT ( x[2] + x[5] );
+// t3 = LEFT_SHIFT ( x[3] + x[4] );
+// t4 = LEFT_SHIFT ( x[3] - x[4] );
+// t5 = LEFT_SHIFT ( x[2] - x[5] );
+// t6 = LEFT_SHIFT ( x[1] - x[6] );
+// t7 = LEFT_SHIFT ( x[0] - x[7] );
+//
+// tp03 = t0 + t3;
+// tm03 = t0 - t3;
+// tp12 = t1 + t2;
+// tm12 = t1 - t2;
+//
+// y[0] = tp03 + tp12;
+// y[4] = tp03 - tp12;
+//
+// y[2] = tm03 + tm12 * tg_2_16;
+// y[6] = tm03 * tg_2_16 - tm12;
+//
+// tp65 =(t6 +t5 )*cos_4_16;
+// tm65 =(t6 -t5 )*cos_4_16;
+//
+// tp765 = t7 + tp65;
+// tm765 = t7 - tp65;
+// tp465 = t4 + tm65;
+// tm465 = t4 - tm65;
+//
+// y[1] = tp765 + tp465 * tg_1_16;
+// y[7] = tp765 * tg_1_16 - tp465;
+// y[5] = tm765 * tg_3_16 + tm465;
+// y[3] = tm765 - tm465 * tg_3_16;
+//}
+//
+//-----------------------------------------------------------------------------
+
+//-----------------------------------------------------------------------------
+// DCT_8_INV_COL( INP, OUT )
+//-----------------------------------------------------------------------------
+
+#define DCT_8_INV_COL(A1,A2)\
+ "movq 2*8(%3),%%mm0\n\t"\
+ "movq 16*3+" #A1 ",%%mm3\n\t"\
+ "movq %%mm0,%%mm1 \n\t"/* tg_3_16*/\
+ "movq 16*5+" #A1 ",%%mm5\n\t"\
+ "pmulhw %%mm3,%%mm0 \n\t"/* x3*(tg_3_16-1)*/\
+ "movq (%3),%%mm4\n\t"\
+ "pmulhw %%mm5,%%mm1 \n\t"/* x5*(tg_3_16-1)*/\
+ "movq 16*7+" #A1 ",%%mm7\n\t"\
+ "movq %%mm4,%%mm2 \n\t"/* tg_1_16*/\
+ "movq 16*1+" #A1 ",%%mm6\n\t"\
+ "pmulhw %%mm7,%%mm4 \n\t"/* x7*tg_1_16*/\
+ "paddsw %%mm3,%%mm0 \n\t"/* x3*tg_3_16*/\
+ "pmulhw %%mm6,%%mm2 \n\t"/* x1*tg_1_16*/\
+ "paddsw %%mm3,%%mm1 \n\t"/* x3+x5*(tg_3_16-1)*/\
+ "psubsw %%mm5,%%mm0 \n\t"/* x3*tg_3_16-x5 = tm35*/\
+ "movq 3*8(%3),%%mm3\n\t"\
+ "paddsw %%mm5,%%mm1 \n\t"/* x3+x5*tg_3_16 = tp35*/\
+ "paddsw %%mm6,%%mm4 \n\t"/* x1+tg_1_16*x7 = tp17*/\
+ "psubsw %%mm7,%%mm2 \n\t"/* x1*tg_1_16-x7 = tm17*/\
+ "movq %%mm4,%%mm5 \n\t"/* tp17*/\
+ "movq %%mm2,%%mm6 \n\t"/* tm17*/\
+ "paddsw %%mm1,%%mm5 \n\t"/* tp17+tp35 = b0*/\
+ "psubsw %%mm0,%%mm6 \n\t"/* tm17-tm35 = b3*/\
+ "psubsw %%mm1,%%mm4 \n\t"/* tp17-tp35 = t1*/\
+ "paddsw %%mm0,%%mm2 \n\t"/* tm17+tm35 = t2*/\
+ "movq 1*8(%3),%%mm7\n\t"\
+ "movq %%mm4,%%mm1 \n\t"/* t1*/\
+ "movq %%mm5,3*16 +" #A2 "\n\t"/* save b0*/\
+ "paddsw %%mm2,%%mm1 \n\t"/* t1+t2*/\
+ "movq %%mm6,5*16 +" #A2 "\n\t"/* save b3*/\
+ "psubsw %%mm2,%%mm4 \n\t"/* t1-t2*/\
+ "movq 2*16+" #A1 ",%%mm5\n\t"\
+ "movq %%mm7,%%mm0 \n\t"/* tg_2_16*/\
+ "movq 6*16+" #A1 ",%%mm6\n\t"\
+ "pmulhw %%mm5,%%mm0 \n\t"/* x2*tg_2_16*/\
+ "pmulhw %%mm6,%%mm7 \n\t"/* x6*tg_2_16*/\
+ "pmulhw %%mm3,%%mm1 \n\t"/* ocos_4_16*(t1+t2) = b1/2*/\
+ "movq 0*16+" #A1 ",%%mm2\n\t"\
+ "pmulhw %%mm3,%%mm4 \n\t"/* ocos_4_16*(t1-t2) = b2/2*/\
+ "psubsw %%mm6,%%mm0 \n\t"/* t2*tg_2_16-x6 = tm26*/\
+ "movq %%mm2,%%mm3 \n\t"/* x0*/\
+ "movq 4*16+" #A1 ",%%mm6\n\t"\
+ "paddsw %%mm5,%%mm7 \n\t"/* x2+x6*tg_2_16 = tp26*/\
+ "paddsw %%mm6,%%mm2 \n\t"/* x0+x4 = tp04*/\
+ "psubsw %%mm6,%%mm3 \n\t"/* x0-x4 = tm04*/\
+ "movq %%mm2,%%mm5 \n\t"/* tp04*/\
+ "movq %%mm3,%%mm6 \n\t"/* tm04*/\
+ "psubsw %%mm7,%%mm2 \n\t"/* tp04-tp26 = a3*/\
+ "paddsw %%mm0,%%mm3 \n\t"/* tm04+tm26 = a1*/\
+ "paddsw %%mm1,%%mm1 \n\t"/* b1*/\
+ "paddsw %%mm4,%%mm4 \n\t"/* b2*/\
+ "paddsw %%mm7,%%mm5 \n\t"/* tp04+tp26 = a0*/\
+ "psubsw %%mm0,%%mm6 \n\t"/* tm04-tm26 = a2*/\
+ "movq %%mm3,%%mm7 \n\t"/* a1*/\
+ "movq %%mm6,%%mm0 \n\t"/* a2*/\
+ "paddsw %%mm1,%%mm3 \n\t"/* a1+b1*/\
+ "paddsw %%mm4,%%mm6 \n\t"/* a2+b2*/\
+ "psraw $6,%%mm3 \n\t"/* dst1*/\
+ "psubsw %%mm1,%%mm7 \n\t"/* a1-b1*/\
+ "psraw $6,%%mm6 \n\t"/* dst2*/\
+ "psubsw %%mm4,%%mm0 \n\t"/* a2-b2*/\
+ "movq 3*16+" #A2 ",%%mm1 \n\t"/* load b0*/\
+ "psraw $6,%%mm7 \n\t"/* dst6*/\
+ "movq %%mm5,%%mm4 \n\t"/* a0*/\
+ "psraw $6,%%mm0 \n\t"/* dst5*/\
+ "movq %%mm3,1*16+" #A2 "\n\t"\
+ "paddsw %%mm1,%%mm5 \n\t"/* a0+b0*/\
+ "movq %%mm6,2*16+" #A2 "\n\t"\
+ "psubsw %%mm1,%%mm4 \n\t"/* a0-b0*/\
+ "movq 5*16+" #A2 ",%%mm3 \n\t"/* load b3*/\
+ "psraw $6,%%mm5 \n\t"/* dst0*/\
+ "movq %%mm2,%%mm6 \n\t"/* a3*/\
+ "psraw $6,%%mm4 \n\t"/* dst7*/\
+ "movq %%mm0,5*16+" #A2 "\n\t"\
+ "paddsw %%mm3,%%mm2 \n\t"/* a3+b3*/\
+ "movq %%mm7,6*16+" #A2 "\n\t"\
+ "psubsw %%mm3,%%mm6 \n\t"/* a3-b3*/\
+ "movq %%mm5,0*16+" #A2 "\n\t"\
+ "psraw $6,%%mm2 \n\t"/* dst3*/\
+ "movq %%mm4,7*16+" #A2 "\n\t"\
+ "psraw $6,%%mm6 \n\t"/* dst4*/\
+ "movq %%mm2,3*16+" #A2 "\n\t"\
+ "movq %%mm6,4*16+" #A2 "\n\t"
+
+//=============================================================================
+// Code
+//=============================================================================
+
+//-----------------------------------------------------------------------------
+// void ff_idct_xvid_mmx(int16_t block[64]);
+//-----------------------------------------------------------------------------
+
+
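+// Rows 0/4, 1/7, 2/6 and 3/5 share a coefficient table (scaled by cos_4_16,
+// cos_1_16, cos_2_16 and cos_3_16 respectively), hence the 0,1,2,3,0,3,2,1
+// table order in the row calls below.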
+void ff_idct_xvid_mmx(short *block){
+__asm__ volatile(
+ //# Process each row
+ DCT_8_INV_ROW_MMX(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
+ DCT_8_INV_ROW_MMX(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
+ DCT_8_INV_ROW_MMX(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
+ DCT_8_INV_ROW_MMX(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
+ DCT_8_INV_ROW_MMX(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
+ DCT_8_INV_ROW_MMX(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
+ DCT_8_INV_ROW_MMX(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
+ DCT_8_INV_ROW_MMX(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))
+
+ //# Process the columns (4 at a time)
+ DCT_8_INV_COL(0(%0), 0(%0))
+ DCT_8_INV_COL(8(%0), 8(%0))
+ :: "r"(block), "r"(rounder_0), "r"(tab_i_04_mmx), "r"(tg_1_16));
+}
+
+//-----------------------------------------------------------------------------
+// void ff_idct_xvid_mmxext(int16_t block[64]);
+//-----------------------------------------------------------------------------
+
+
+void ff_idct_xvid_mmxext(short *block)
+{
+__asm__ volatile(
+ //# Process each row
+ DCT_8_INV_ROW_XMM(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
+ DCT_8_INV_ROW_XMM(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
+ DCT_8_INV_ROW_XMM(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
+ DCT_8_INV_ROW_XMM(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
+ DCT_8_INV_ROW_XMM(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
+ DCT_8_INV_ROW_XMM(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
+ DCT_8_INV_ROW_XMM(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
+ DCT_8_INV_ROW_XMM(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))
+
+ //# Process the columns (4 at a time)
+ DCT_8_INV_COL(0(%0), 0(%0))
+ DCT_8_INV_COL(8(%0), 8(%0))
+ :: "r"(block), "r"(rounder_0), "r"(tab_i_04_xmm), "r"(tg_1_16));
+}
+
+void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, int16_t *block)
+{
+ ff_idct_xvid_mmx(block);
+ ff_put_pixels_clamped_mmx(block, dest, line_size);
+}
+
+void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, int16_t *block)
+{
+ ff_idct_xvid_mmx(block);
+ ff_add_pixels_clamped_mmx(block, dest, line_size);
+}
+
+void ff_idct_xvid_mmxext_put(uint8_t *dest, int line_size, int16_t *block)
+{
+ ff_idct_xvid_mmxext(block);
+ ff_put_pixels_clamped_mmx(block, dest, line_size);
+}
+
+void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, int16_t *block)
+{
+ ff_idct_xvid_mmxext(block);
+ ff_add_pixels_clamped_mmx(block, dest, line_size);
+}
+
+#endif /* HAVE_INLINE_ASM */
diff --git a/ffmpeg/libavcodec/x86/idct_sse2_xvid.c b/ffmpeg/libavcodec/x86/idct_sse2_xvid.c
new file mode 100644
index 0000000..b51466c
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/idct_sse2_xvid.c
@@ -0,0 +1,407 @@
+/*
+ * XVID MPEG-4 VIDEO CODEC
+ * - SSE2 inverse discrete cosine transform -
+ *
+ * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
+ *
+ * Conversion to gcc syntax with modifications
+ * by Alexander Strange <astrange@ithinksw.com>
+ *
+ * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
+ *
+ * This file is part of FFmpeg.
+ *
+ * Vertical pass is an implementation of the scheme:
+ * Loeffler C., Ligtenberg A., and Moschytz C.S.:
+ * Practical Fast 1D DCT Algorithm with Eleven Multiplications,
+ * Proc. ICASSP 1989, 988-991.
+ *
+ * Horizontal pass is a double 4x4 vector/matrix multiplication,
+ * (see also Intel's Application Note 922:
+ * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+ * Copyright (C) 1999 Intel Corporation)
+ *
+ * More details at http://skal.planet-d.net/coding/dct.html
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with FFmpeg; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "idct_xvid.h"
+#include "dsputil_mmx.h"
+
+#if HAVE_INLINE_ASM
+
+/**
+ * @file
+ * @brief SSE2 idct compatible with xvidmmx
+ */
+
+#define X8(x) x,x,x,x,x,x,x,x
+
+#define ROW_SHIFT 11
+#define COL_SHIFT 6
+
+DECLARE_ASM_CONST(16, int16_t, tan1)[] = {X8(13036)}; // tan( pi/16)
+DECLARE_ASM_CONST(16, int16_t, tan2)[] = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1
+DECLARE_ASM_CONST(16, int16_t, tan3)[] = {X8(43790)}; // tan(3pi/16)-1
+DECLARE_ASM_CONST(16, int16_t, sqrt2)[]= {X8(23170)}; // 0.5/sqrt(2)
+DECLARE_ASM_CONST(8, uint8_t, m127)[] = {X8(127)};
+
+DECLARE_ASM_CONST(16, int16_t, iTab1)[] = {
+ 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
+ 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
+ 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
+ 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
+};
+
+DECLARE_ASM_CONST(16, int16_t, iTab2)[] = {
+ 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
+ 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
+ 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
+ 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
+};
+
+DECLARE_ASM_CONST(16, int16_t, iTab3)[] = {
+ 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
+ 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
+ 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
+ 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
+};
+
+DECLARE_ASM_CONST(16, int16_t, iTab4)[] = {
+ 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
+ 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
+ 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
+ 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
+};
+
+DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
+ 65536, 65536, 65536, 65536,
+ 3597, 3597, 3597, 3597,
+ 2260, 2260, 2260, 2260,
+ 1203, 1203, 1203, 1203,
+ 120, 120, 120, 120,
+ 512, 512, 512, 512
+};
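+
+// Same values as rounder_0 in idct_mmx_xvid.c, minus the zero entry (row 4,
+// which is handled with "#" below and gets no rounding add) and the duplicate
+// 512: rows 6 and 7 both use walkenIdctRounders + 5*16.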
+
+// Temporary storage before the column pass
+#define ROW1 "%%xmm6"
+#define ROW3 "%%xmm4"
+#define ROW5 "%%xmm5"
+#define ROW7 "%%xmm7"
+
+#define CLEAR_ODD(r) "pxor "r","r" \n\t"
+#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"
+
+#if ARCH_X86_64
+
+# define ROW0 "%%xmm8"
+# define REG0 ROW0
+# define ROW2 "%%xmm9"
+# define REG2 ROW2
+# define ROW4 "%%xmm10"
+# define REG4 ROW4
+# define ROW6 "%%xmm11"
+# define REG6 ROW6
+# define CLEAR_EVEN(r) CLEAR_ODD(r)
+# define PUT_EVEN(dst) PUT_ODD(dst)
+# define XMMS "%%xmm12"
+# define MOV_32_ONLY "#"
+# define SREG2 REG2
+# define TAN3 "%%xmm13"
+# define TAN1 "%%xmm14"
+
+#else
+
+# define ROW0 "(%0)"
+# define REG0 "%%xmm4"
+# define ROW2 "2*16(%0)"
+# define REG2 "%%xmm4"
+# define ROW4 "4*16(%0)"
+# define REG4 "%%xmm6"
+# define ROW6 "6*16(%0)"
+# define REG6 "%%xmm6"
+# define CLEAR_EVEN(r)
+# define PUT_EVEN(dst) \
+ "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
+ "movdqa %%xmm2, "dst" \n\t"
+# define XMMS "%%xmm2"
+# define MOV_32_ONLY "movdqa "
+# define SREG2 "%%xmm7"
+# define TAN3 "%%xmm0"
+# define TAN1 "%%xmm2"
+
+#endif
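+
+// On x86-64 the even rows and the TAN1/TAN3 constants are kept in xmm8-xmm14
+// and MOV_32_ONLY turns the corresponding loads into comments; on x86-32 the
+// even rows live in memory at (%0) offsets and MOV_32_ONLY is a real movdqa.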
+
+#define ROUND(x) "paddd "MANGLE(x)
+
+#define JZ(reg, to) \
+ "testl "reg","reg" \n\t" \
+ "jz "to" \n\t"
+
+#define JNZ(reg, to) \
+ "testl "reg","reg" \n\t" \
+ "jnz "to" \n\t"
+
+#define TEST_ONE_ROW(src, reg, clear) \
+ clear \
+ "movq "src", %%mm1 \n\t" \
+ "por 8+"src", %%mm1 \n\t" \
+ "paddusb %%mm0, %%mm1 \n\t" \
+ "pmovmskb %%mm1, "reg" \n\t"
+
+#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
+ clear1 \
+ clear2 \
+ "movq "row1", %%mm1 \n\t" \
+ "por 8+"row1", %%mm1 \n\t" \
+ "movq "row2", %%mm2 \n\t" \
+ "por 8+"row2", %%mm2 \n\t" \
+ "paddusb %%mm0, %%mm1 \n\t" \
+ "paddusb %%mm0, %%mm2 \n\t" \
+ "pmovmskb %%mm1, "reg1" \n\t" \
+ "pmovmskb %%mm2, "reg2" \n\t"
+
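+// In TEST_ONE_ROW/TEST_TWO_ROWS above, m127 holds 0x7f in every byte:
+// paddusb saturates any nonzero byte of the row to >= 0x80, so pmovmskb
+// returns a nonzero mask iff the row has any nonzero coefficient.  This is
+// what drives the row-skipping branches in ff_idct_xvid_sse2().
+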
+///IDCT pass on rows.
+#define iMTX_MULT(src, table, rounder, put) \
+ "movdqa "src", %%xmm3 \n\t" \
+ "movdqa %%xmm3, %%xmm0 \n\t" \
+ "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \
+ "punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \
+ "pmaddwd "table", %%xmm0 \n\t" \
+ "pmaddwd 16+"table", %%xmm1 \n\t" \
+ "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \
+ "punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \
+ "pmaddwd 32+"table", %%xmm2 \n\t" \
+ "pmaddwd 48+"table", %%xmm3 \n\t" \
+ "paddd %%xmm1, %%xmm0 \n\t" \
+ "paddd %%xmm3, %%xmm2 \n\t" \
+ rounder", %%xmm0 \n\t" \
+ "movdqa %%xmm2, %%xmm3 \n\t" \
+ "paddd %%xmm0, %%xmm2 \n\t" \
+ "psubd %%xmm3, %%xmm0 \n\t" \
+ "psrad $11, %%xmm2 \n\t" \
+ "psrad $11, %%xmm0 \n\t" \
+ "packssdw %%xmm0, %%xmm2 \n\t" \
+ put \
+ "1: \n\t"
+
+#define iLLM_HEAD \
+ "movdqa "MANGLE(tan3)", "TAN3" \n\t" \
+ "movdqa "MANGLE(tan1)", "TAN1" \n\t" \
+
+///IDCT pass on columns.
+#define iLLM_PASS(dct) \
+ "movdqa "TAN3", %%xmm1 \n\t" \
+ "movdqa "TAN1", %%xmm3 \n\t" \
+ "pmulhw %%xmm4, "TAN3" \n\t" \
+ "pmulhw %%xmm5, %%xmm1 \n\t" \
+ "paddsw %%xmm4, "TAN3" \n\t" \
+ "paddsw %%xmm5, %%xmm1 \n\t" \
+ "psubsw %%xmm5, "TAN3" \n\t" \
+ "paddsw %%xmm4, %%xmm1 \n\t" \
+ "pmulhw %%xmm7, %%xmm3 \n\t" \
+ "pmulhw %%xmm6, "TAN1" \n\t" \
+ "paddsw %%xmm6, %%xmm3 \n\t" \
+ "psubsw %%xmm7, "TAN1" \n\t" \
+ "movdqa %%xmm3, %%xmm7 \n\t" \
+ "movdqa "TAN1", %%xmm6 \n\t" \
+ "psubsw %%xmm1, %%xmm3 \n\t" \
+ "psubsw "TAN3", "TAN1" \n\t" \
+ "paddsw %%xmm7, %%xmm1 \n\t" \
+ "paddsw %%xmm6, "TAN3" \n\t" \
+ "movdqa %%xmm3, %%xmm6 \n\t" \
+ "psubsw "TAN3", %%xmm3 \n\t" \
+ "paddsw %%xmm6, "TAN3" \n\t" \
+ "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
+ "pmulhw %%xmm4, %%xmm3 \n\t" \
+ "pmulhw %%xmm4, "TAN3" \n\t" \
+ "paddsw "TAN3", "TAN3" \n\t" \
+ "paddsw %%xmm3, %%xmm3 \n\t" \
+ "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
+ MOV_32_ONLY ROW2", "REG2" \n\t" \
+ MOV_32_ONLY ROW6", "REG6" \n\t" \
+ "movdqa %%xmm7, %%xmm5 \n\t" \
+ "pmulhw "REG6", %%xmm7 \n\t" \
+ "pmulhw "REG2", %%xmm5 \n\t" \
+ "paddsw "REG2", %%xmm7 \n\t" \
+ "psubsw "REG6", %%xmm5 \n\t" \
+ MOV_32_ONLY ROW0", "REG0" \n\t" \
+ MOV_32_ONLY ROW4", "REG4" \n\t" \
+ MOV_32_ONLY" "TAN1", (%0) \n\t" \
+ "movdqa "REG0", "XMMS" \n\t" \
+ "psubsw "REG4", "REG0" \n\t" \
+ "paddsw "XMMS", "REG4" \n\t" \
+ "movdqa "REG4", "XMMS" \n\t" \
+ "psubsw %%xmm7, "REG4" \n\t" \
+ "paddsw "XMMS", %%xmm7 \n\t" \
+ "movdqa "REG0", "XMMS" \n\t" \
+ "psubsw %%xmm5, "REG0" \n\t" \
+ "paddsw "XMMS", %%xmm5 \n\t" \
+ "movdqa %%xmm5, "XMMS" \n\t" \
+ "psubsw "TAN3", %%xmm5 \n\t" \
+ "paddsw "XMMS", "TAN3" \n\t" \
+ "movdqa "REG0", "XMMS" \n\t" \
+ "psubsw %%xmm3, "REG0" \n\t" \
+ "paddsw "XMMS", %%xmm3 \n\t" \
+ MOV_32_ONLY" (%0), "TAN1" \n\t" \
+ "psraw $6, %%xmm5 \n\t" \
+ "psraw $6, "REG0" \n\t" \
+ "psraw $6, "TAN3" \n\t" \
+ "psraw $6, %%xmm3 \n\t" \
+ "movdqa "TAN3", 1*16("dct") \n\t" \
+ "movdqa %%xmm3, 2*16("dct") \n\t" \
+ "movdqa "REG0", 5*16("dct") \n\t" \
+ "movdqa %%xmm5, 6*16("dct") \n\t" \
+ "movdqa %%xmm7, %%xmm0 \n\t" \
+ "movdqa "REG4", %%xmm4 \n\t" \
+ "psubsw %%xmm1, %%xmm7 \n\t" \
+ "psubsw "TAN1", "REG4" \n\t" \
+ "paddsw %%xmm0, %%xmm1 \n\t" \
+ "paddsw %%xmm4, "TAN1" \n\t" \
+ "psraw $6, %%xmm1 \n\t" \
+ "psraw $6, %%xmm7 \n\t" \
+ "psraw $6, "TAN1" \n\t" \
+ "psraw $6, "REG4" \n\t" \
+ "movdqa %%xmm1, ("dct") \n\t" \
+ "movdqa "TAN1", 3*16("dct") \n\t" \
+ "movdqa "REG4", 4*16("dct") \n\t" \
+ "movdqa %%xmm7, 7*16("dct") \n\t"
+
+///IDCT pass on columns, assuming rows 4-7 are zero.
+#define iLLM_PASS_SPARSE(dct) \
+ "pmulhw %%xmm4, "TAN3" \n\t" \
+ "paddsw %%xmm4, "TAN3" \n\t" \
+ "movdqa %%xmm6, %%xmm3 \n\t" \
+ "pmulhw %%xmm6, "TAN1" \n\t" \
+ "movdqa %%xmm4, %%xmm1 \n\t" \
+ "psubsw %%xmm1, %%xmm3 \n\t" \
+ "paddsw %%xmm6, %%xmm1 \n\t" \
+ "movdqa "TAN1", %%xmm6 \n\t" \
+ "psubsw "TAN3", "TAN1" \n\t" \
+ "paddsw %%xmm6, "TAN3" \n\t" \
+ "movdqa %%xmm3, %%xmm6 \n\t" \
+ "psubsw "TAN3", %%xmm3 \n\t" \
+ "paddsw %%xmm6, "TAN3" \n\t" \
+ "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
+ "pmulhw %%xmm4, %%xmm3 \n\t" \
+ "pmulhw %%xmm4, "TAN3" \n\t" \
+ "paddsw "TAN3", "TAN3" \n\t" \
+ "paddsw %%xmm3, %%xmm3 \n\t" \
+ "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
+ MOV_32_ONLY ROW2", "SREG2" \n\t" \
+ "pmulhw "SREG2", %%xmm5 \n\t" \
+ MOV_32_ONLY ROW0", "REG0" \n\t" \
+ "movdqa "REG0", %%xmm6 \n\t" \
+ "psubsw "SREG2", %%xmm6 \n\t" \
+ "paddsw "REG0", "SREG2" \n\t" \
+ MOV_32_ONLY" "TAN1", (%0) \n\t" \
+ "movdqa "REG0", "XMMS" \n\t" \
+ "psubsw %%xmm5, "REG0" \n\t" \
+ "paddsw "XMMS", %%xmm5 \n\t" \
+ "movdqa %%xmm5, "XMMS" \n\t" \
+ "psubsw "TAN3", %%xmm5 \n\t" \
+ "paddsw "XMMS", "TAN3" \n\t" \
+ "movdqa "REG0", "XMMS" \n\t" \
+ "psubsw %%xmm3, "REG0" \n\t" \
+ "paddsw "XMMS", %%xmm3 \n\t" \
+ MOV_32_ONLY" (%0), "TAN1" \n\t" \
+ "psraw $6, %%xmm5 \n\t" \
+ "psraw $6, "REG0" \n\t" \
+ "psraw $6, "TAN3" \n\t" \
+ "psraw $6, %%xmm3 \n\t" \
+ "movdqa "TAN3", 1*16("dct") \n\t" \
+ "movdqa %%xmm3, 2*16("dct") \n\t" \
+ "movdqa "REG0", 5*16("dct") \n\t" \
+ "movdqa %%xmm5, 6*16("dct") \n\t" \
+ "movdqa "SREG2", %%xmm0 \n\t" \
+ "movdqa %%xmm6, %%xmm4 \n\t" \
+ "psubsw %%xmm1, "SREG2" \n\t" \
+ "psubsw "TAN1", %%xmm6 \n\t" \
+ "paddsw %%xmm0, %%xmm1 \n\t" \
+ "paddsw %%xmm4, "TAN1" \n\t" \
+ "psraw $6, %%xmm1 \n\t" \
+ "psraw $6, "SREG2" \n\t" \
+ "psraw $6, "TAN1" \n\t" \
+ "psraw $6, %%xmm6 \n\t" \
+ "movdqa %%xmm1, ("dct") \n\t" \
+ "movdqa "TAN1", 3*16("dct") \n\t" \
+ "movdqa %%xmm6, 4*16("dct") \n\t" \
+ "movdqa "SREG2", 7*16("dct") \n\t"
+
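+// Row and column passes are interleaved with the zero-row tests: rows 0-2
+// are always transformed, rows 3-7 are individually tested and skipped when
+// all-zero, and if rows 4-7 are all zero the cheaper iLLM_PASS_SPARSE column
+// pass is used instead of iLLM_PASS.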
+inline void ff_idct_xvid_sse2(short *block)
+{
+ __asm__ volatile(
+ "movq "MANGLE(m127)", %%mm0 \n\t"
+ iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
+ iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
+ iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
+
+ TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
+ JZ("%%eax", "1f")
+ iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
+
+ TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
+ TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
+ iLLM_HEAD
+ ".p2align 4 \n\t"
+ JNZ("%%ecx", "2f")
+ JNZ("%%eax", "3f")
+ JNZ("%%edx", "4f")
+ JNZ("%%esi", "5f")
+ iLLM_PASS_SPARSE("%0")
+ "jmp 6f \n\t"
+ "2: \n\t"
+ iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
+ "3: \n\t"
+ iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
+ JZ("%%edx", "1f")
+ "4: \n\t"
+ iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
+ JZ("%%esi", "1f")
+ "5: \n\t"
+ iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
+#if ARCH_X86_32
+ iLLM_HEAD
+#endif
+ iLLM_PASS("%0")
+ "6: \n\t"
+ : "+r"(block)
+ :
+ : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" ,
+ "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" ,)
+#if ARCH_X86_64
+ XMM_CLOBBERS("%xmm8" , "%xmm9" , "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14",)
+#endif
+ "%eax", "%ecx", "%edx", "%esi", "memory"
+ );
+}
+
+void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
+{
+ ff_idct_xvid_sse2(block);
+ ff_put_pixels_clamped_mmx(block, dest, line_size);
+}
+
+void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
+{
+ ff_idct_xvid_sse2(block);
+ ff_add_pixels_clamped_mmx(block, dest, line_size);
+}
+
+#endif /* HAVE_INLINE_ASM */
diff --git a/ffmpeg/libavcodec/x86/idct_xvid.h b/ffmpeg/libavcodec/x86/idct_xvid.h
new file mode 100644
index 0000000..7a2847b
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/idct_xvid.h
@@ -0,0 +1,43 @@
+/*
+ * XVID MPEG-4 VIDEO CODEC
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * header for Xvid IDCT functions
+ */
+
+#ifndef AVCODEC_X86_IDCT_XVID_H
+#define AVCODEC_X86_IDCT_XVID_H
+
+#include <stdint.h>
+
+void ff_idct_xvid_mmx(short *block);
+void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, int16_t *block);
+void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, int16_t *block);
+
+void ff_idct_xvid_mmxext(short *block);
+void ff_idct_xvid_mmxext_put(uint8_t *dest, int line_size, int16_t *block);
+void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, int16_t *block);
+
+void ff_idct_xvid_sse2(short *block);
+void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block);
+void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block);
+
+#endif /* AVCODEC_X86_IDCT_XVID_H */
diff --git a/ffmpeg/libavcodec/x86/imdct36.asm b/ffmpeg/libavcodec/x86/imdct36.asm
new file mode 100644
index 0000000..d311fbe
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/imdct36.asm
@@ -0,0 +1,724 @@
+;******************************************************************************
+;* 36 point SSE-optimized IMDCT transform
+;* Copyright (c) 2011 Vitor Sessak
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+align 16
+ps_mask: dd 0, ~0, ~0, ~0
+ps_mask2: dd 0, ~0, 0, ~0
+ps_mask3: dd 0, 0, 0, ~0
+ps_mask4: dd 0, ~0, 0, 0
+
+ps_val1: dd -0.5, -0.5, -0.8660254038, -0.8660254038
+ps_val2: dd 1.0, 1.0, 0.8660254038, 0.8660254038
+ps_val3: dd 0.1736481777, 0.1736481777, 0.3420201433, 0.3420201433
+ps_val4: dd -0.7660444431, -0.7660444431, 0.8660254038, 0.8660254038
+ps_val5: dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
+ps_val6: dd 0.5, 0.5, -0.6427876097, -0.6427876097
+ps_val7: dd 1.0, 1.0, -0.6427876097, -0.6427876097
+
+ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
+ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
+
+ps_cosh: dd 1.0, 0.50190991877167369479, 1.0, 5.73685662283492756461
+ dd 1.0, 0.51763809020504152469, 1.0, 1.93185165257813657349
+ dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
+ dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
+ dd 1.0, 0.70710678118654752439, 0.0, 0.0
+
+ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461
+ dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349
+ dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896
+ dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991
+ dd 1.0, 0.70710678118654752439, 0.0, 0.0
+
+costabs: times 4 dd 0.98480773
+ times 4 dd 0.93969262
+ times 4 dd 0.86602539
+ times 4 dd -0.76604444
+ times 4 dd -0.64278764
+ times 4 dd 0.50000000
+ times 4 dd -0.50000000
+ times 4 dd -0.34202015
+ times 4 dd -0.17364818
+ times 4 dd 0.50190992
+ times 4 dd 0.51763808
+ times 4 dd 0.55168896
+ times 4 dd 0.61038726
+ times 4 dd 0.70710677
+ times 4 dd 0.87172341
+ times 4 dd 1.18310082
+ times 4 dd 1.93185163
+ times 4 dd 5.73685646
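+; The first nine rows above are +/-cos(k*pi/18) for k = 1..8 (0.5 appears
+; with both signs); the remaining nine are 0.5/cos((2*k+1)*pi/36) for
+; k = 0..8; the same values appear, interleaved with +/-1.0, in
+; ps_cosh / ps_cosh_sse3.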
+
+%define SBLIMIT 32
+SECTION_TEXT
+
+%macro PSHUFD 3
+%if cpuflag(sse2) && notcpuflag(avx)
+ pshufd %1, %2, %3
+%else
+ shufps %1, %2, %2, %3
+%endif
+%endmacro
+
+; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
+; output %1={x3,x4,y1,y2}
+%macro BUILDINVHIGHLOW 3
+%if cpuflag(avx)
+ shufps %1, %2, %3, 0x4e
+%else
+ movlhps %1, %3
+ movhlps %1, %2
+%endif
+%endmacro
+
+; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
+; output %1={x4,y1,y2,y3}
+%macro ROTLEFT 3
+%if cpuflag(ssse3)
+ palignr %1, %3, %2, 12
+%else
+ BUILDINVHIGHLOW %1, %2, %3
+ shufps %1, %1, %3, 0x99
+%endif
+%endmacro
+
+%macro INVERTHL 2
+%if cpuflag(sse2)
+ PSHUFD %1, %2, 0x4e
+%else
+ movhlps %1, %2
+ movlhps %1, %2
+%endif
+%endmacro
+
+%macro BUTTERF 3
+ INVERTHL %2, %1
+ xorps %1, [ps_p1p1m1m1]
+ addps %1, %2
+%if cpuflag(sse3)
+ mulps %1, %1, [ps_cosh_sse3 + %3]
+ PSHUFD %2, %1, 0xb1
+ addsubps %1, %1, %2
+%else
+ mulps %1, [ps_cosh + %3]
+ PSHUFD %2, %1, 0xb1
+ xorps %1, [ps_p1m1p1m1]
+ addps %1, %2
+%endif
+%endmacro
+
+%macro STORE 4
+ movhlps %2, %1
+ movss [%3 ], %1
+ movss [%3 + 2*%4], %2
+ shufps %1, %1, 0xb1
+ movss [%3 + %4], %1
+ movhlps %2, %1
+ movss [%3 + 3*%4], %2
+%endmacro
+
+%macro LOAD 4
+ movlps %1, [%3 ]
+ movhps %1, [%3 + %4]
+ movlps %2, [%3 + 2*%4]
+ movhps %2, [%3 + 3*%4]
+ shufps %1, %2, 0x88
+%endmacro
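+
+; LOAD gathers the first float of four rows spaced %4 bytes apart into %1
+; (%2 is scratch); STORE above scatters the four lanes of %1 back out to
+; [%3], [%3 + %4], [%3 + 2*%4] and [%3 + 3*%4].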
+
+%macro LOADA64 2
+%if cpuflag(avx)
+ movu %1, [%2]
+%else
+ movlps %1, [%2]
+ movhps %1, [%2 + 8]
+%endif
+%endmacro
+
+%macro DEFINE_IMDCT 0
+cglobal imdct36_float, 4,4,9, out, buf, in, win
+
+ ; for(i=17;i>=1;i--) in[i] += in[i-1];
+ LOADA64 m0, inq
+ LOADA64 m1, inq + 16
+
+ ROTLEFT m5, m0, m1
+
+ PSHUFD m6, m0, 0x93
+ andps m6, m6, [ps_mask]
+ addps m0, m0, m6
+
+ LOADA64 m2, inq + 32
+
+ ROTLEFT m7, m1, m2
+
+ addps m1, m1, m5
+ LOADA64 m3, inq + 48
+
+ ROTLEFT m5, m2, m3
+
+ xorps m4, m4, m4
+ movlps m4, [inq+64]
+ BUILDINVHIGHLOW m6, m3, m4
+ shufps m6, m6, m4, 0xa9
+
+ addps m4, m4, m6
+ addps m2, m2, m7
+ addps m3, m3, m5
+
+ ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
+ movlhps m5, m5, m0
+ andps m5, m5, [ps_mask3]
+
+ BUILDINVHIGHLOW m7, m0, m1
+ andps m7, m7, [ps_mask2]
+
+ addps m0, m0, m5
+
+ BUILDINVHIGHLOW m6, m1, m2
+ andps m6, m6, [ps_mask2]
+
+ addps m1, m1, m7
+
+ BUILDINVHIGHLOW m7, m2, m3
+ andps m7, m7, [ps_mask2]
+
+ addps m2, m2, m6
+
+ movhlps m6, m6, m3
+ andps m6, m6, [ps_mask4]
+
+ addps m3, m3, m7
+ addps m4, m4, m6
+
+ ; Populate tmp[]
+ movlhps m6, m1, m5 ; zero out high values
+ subps m6, m6, m4
+
+ subps m5, m0, m3
+
+%if ARCH_X86_64
+ SWAP m5, m8
+%endif
+
+ mulps m7, m2, [ps_val1]
+
+%if ARCH_X86_64
+ mulps m5, m8, [ps_val2]
+%else
+ mulps m5, m5, [ps_val2]
+%endif
+ addps m7, m7, m5
+
+ mulps m5, m6, [ps_val1]
+ subps m7, m7, m5
+
+%if ARCH_X86_64
+ SWAP m5, m8
+%else
+ subps m5, m0, m3
+%endif
+
+ subps m5, m5, m6
+ addps m5, m5, m2
+
+ shufps m6, m4, m3, 0xe4
+ subps m6, m6, m2
+ mulps m6, m6, [ps_val3]
+
+ addps m4, m4, m1
+ mulps m4, m4, [ps_val4]
+
+ shufps m1, m1, m0, 0xe4
+ addps m1, m1, m2
+ mulps m1, m1, [ps_val5]
+
+ mulps m3, m3, [ps_val6]
+ mulps m0, m0, [ps_val7]
+ addps m0, m0, m3
+
+ xorps m2, m1, [ps_p1p1m1m1]
+ subps m2, m2, m4
+ addps m2, m2, m0
+
+ addps m3, m4, m0
+ subps m3, m3, m6
+ xorps m3, m3, [ps_p1p1m1m1]
+
+ shufps m0, m0, m4, 0xe4
+ subps m0, m0, m1
+ addps m0, m0, m6
+
+ BUILDINVHIGHLOW m4, m2, m3
+ shufps m3, m3, m2, 0x4e
+
+ ; we have tmp = {SwAPLH(m0), SwAPLH(m7), m3, m4, m5}
+
+ BUTTERF m0, m1, 0
+ BUTTERF m7, m2, 16
+ BUTTERF m3, m6, 32
+ BUTTERF m4, m1, 48
+
+ mulps m5, m5, [ps_cosh + 64]
+ PSHUFD m1, m5, 0xe1
+ xorps m5, m5, [ps_p1m1p1m1]
+ addps m5, m5, m1
+
+ ; permutes:
+ ; m0 0 1 2 3 => 2 6 10 14 m1
+ ; m7 4 5 6 7 => 3 7 11 15 m2
+ ; m3 8 9 10 11 => 17 13 9 5 m3
+ ; m4 12 13 14 15 => 16 12 8 4 m5
+ ; m5 16 17 xx xx => 0 1 xx xx m0
+
+ unpckhps m1, m0, m7
+ unpckhps m6, m3, m4
+ movhlps m2, m6, m1
+ movlhps m1, m1, m6
+
+ unpcklps m5, m5, m4
+ unpcklps m3, m3, m7
+ movhlps m4, m3, m5
+ movlhps m5, m5, m3
+ SWAP m4, m3
+ ; permutation done
+
+ PSHUFD m6, m2, 0xb1
+ movss m4, [bufq + 4*68]
+ movss m7, [bufq + 4*64]
+ unpcklps m7, m7, m4
+ mulps m6, m6, [winq + 16*4]
+ addps m6, m6, m7
+ movss [outq + 64*SBLIMIT], m6
+ shufps m6, m6, m6, 0xb1
+ movss [outq + 68*SBLIMIT], m6
+
+ mulps m6, m3, [winq + 4*4]
+ LOAD m4, m7, bufq + 4*16, 16
+ addps m6, m6, m4
+ STORE m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT
+
+ shufps m4, m0, m3, 0xb5
+ mulps m4, m4, [winq + 8*4]
+ LOAD m7, m6, bufq + 4*32, 16
+ addps m4, m4, m7
+ STORE m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT
+
+ shufps m3, m3, m2, 0xb1
+ mulps m3, m3, [winq + 12*4]
+ LOAD m7, m6, bufq + 4*48, 16
+ addps m3, m3, m7
+ STORE m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT
+
+ mulps m2, m2, [winq]
+ LOAD m6, m7, bufq, 16
+ addps m2, m2, m6
+ STORE m2, m7, outq, 4*SBLIMIT
+
+ mulps m4, m1, [winq + 20*4]
+ STORE m4, m7, bufq, 16
+
+ mulps m3, m5, [winq + 24*4]
+ STORE m3, m7, bufq + 4*16, 16
+
+ shufps m0, m0, m5, 0xb0
+ mulps m0, m0, [winq + 28*4]
+ STORE m0, m7, bufq + 4*32, 16
+
+ shufps m5, m5, m1, 0xb1
+ mulps m5, m5, [winq + 32*4]
+ STORE m5, m7, bufq + 4*48, 16
+
+ shufps m1, m1, m1, 0xb1
+ mulps m1, m1, [winq + 36*4]
+ movss [bufq + 4*64], m1
+ shufps m1, m1, 0xb1
+ movss [bufq + 4*68], m1
+ RET
+%endmacro
+
+INIT_XMM sse
+DEFINE_IMDCT
+
+INIT_XMM sse2
+DEFINE_IMDCT
+
+INIT_XMM sse3
+DEFINE_IMDCT
+
+INIT_XMM ssse3
+DEFINE_IMDCT
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+DEFINE_IMDCT
+%endif
+
+INIT_XMM sse
+
+%if ARCH_X86_64
+%define SPILL SWAP
+%define UNSPILL SWAP
+%define SPILLED(x) m %+ x
+%else
+%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
+%macro SPILL 2 ; xmm#, mempos
+ movaps SPILLED(%2), m%1
+%endmacro
+%macro UNSPILL 2
+ movaps m%1, SPILLED(%2)
+%endmacro
+%endif
+
+%macro DEFINE_FOUR_IMDCT 0
+cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
+ movlps m0, [inq+64]
+ movhps m0, [inq+64 + 72]
+ movlps m3, [inq+64 + 2*72]
+ movhps m3, [inq+64 + 3*72]
+
+ shufps m5, m0, m3, 0xdd
+ shufps m0, m0, m3, 0x88
+
+ mova m1, [inq+48]
+ movu m6, [inq+48 + 72]
+ mova m7, [inq+48 + 2*72]
+ movu m3, [inq+48 + 3*72]
+
+ TRANSPOSE4x4PS 1, 6, 7, 3, 4
+
+ addps m4, m6, m7
+ mova [tmpq+4*28], m4
+
+ addps m7, m3
+ addps m6, m1
+ addps m3, m0
+ addps m0, m5
+ addps m0, m7
+ addps m7, m6
+ mova [tmpq+4*12], m7
+ SPILL 3, 12
+
+ mova m4, [inq+32]
+ movu m5, [inq+32 + 72]
+ mova m2, [inq+32 + 2*72]
+ movu m7, [inq+32 + 3*72]
+
+ TRANSPOSE4x4PS 4, 5, 2, 7, 3
+
+ addps m1, m7
+ SPILL 1, 11
+
+ addps m3, m5, m2
+ SPILL 3, 13
+
+ addps m7, m2
+ addps m5, m4
+ addps m6, m7
+ mova [tmpq], m6
+ addps m7, m5
+ mova [tmpq+4*16], m7
+
+ mova m2, [inq+16]
+ movu m7, [inq+16 + 72]
+ mova m1, [inq+16 + 2*72]
+ movu m6, [inq+16 + 3*72]
+
+ TRANSPOSE4x4PS 2, 7, 1, 6, 3
+
+ addps m4, m6
+ addps m6, m1
+ addps m1, m7
+ addps m7, m2
+ addps m5, m6
+ SPILL 5, 15
+ addps m6, m7
+ mulps m6, [costabs + 16*2]
+ mova [tmpq+4*8], m6
+ SPILL 1, 10
+ SPILL 0, 14
+
+ mova m1, [inq]
+ movu m6, [inq + 72]
+ mova m3, [inq + 2*72]
+ movu m5, [inq + 3*72]
+
+ TRANSPOSE4x4PS 1, 6, 3, 5, 0
+
+ addps m2, m5
+ addps m5, m3
+ addps m7, m5
+ addps m3, m6
+ addps m6, m1
+ SPILL 7, 8
+ addps m5, m6
+ SPILL 6, 9
+ addps m6, m4, SPILLED(12)
+ subps m6, m2
+ UNSPILL 7, 11
+ SPILL 5, 11
+ subps m5, m1, m7
+ mulps m7, [costabs + 16*5]
+ addps m7, m1
+ mulps m0, m6, [costabs + 16*6]
+ addps m0, m5
+ mova [tmpq+4*24], m0
+ addps m6, m5
+ mova [tmpq+4*4], m6
+ addps m6, m4, m2
+ mulps m6, [costabs + 16*1]
+ subps m4, SPILLED(12)
+ mulps m4, [costabs + 16*8]
+ addps m2, SPILLED(12)
+ mulps m2, [costabs + 16*3]
+ subps m5, m7, m6
+ subps m5, m2
+ addps m6, m7
+ addps m6, m4
+ addps m7, m2
+ subps m7, m4
+ mova [tmpq+4*20], m7
+ mova m2, [tmpq+4*28]
+ mova [tmpq+4*28], m5
+ UNSPILL 7, 13
+ subps m5, m7, m2
+ mulps m5, [costabs + 16*7]
+ UNSPILL 1, 10
+ mulps m1, [costabs + 16*2]
+ addps m4, m3, m2
+ mulps m4, [costabs + 16*4]
+ addps m2, m7
+ addps m7, m3
+ mulps m7, [costabs]
+ subps m3, m2
+ mulps m3, [costabs + 16*2]
+ addps m2, m7, m5
+ addps m2, m1
+ SPILL 2, 10
+ addps m7, m4
+ subps m7, m1
+ SPILL 7, 12
+ subps m5, m4
+ subps m5, m1
+ UNSPILL 0, 14
+ SPILL 5, 13
+ addps m1, m0, SPILLED(15)
+ subps m1, SPILLED(8)
+ mova m4, [costabs + 16*5]
+ mulps m4, [tmpq]
+ UNSPILL 2, 9
+ addps m4, m2
+ subps m2, [tmpq]
+ mulps m5, m1, [costabs + 16*6]
+ addps m5, m2
+ SPILL 5, 9
+ addps m2, m1
+ SPILL 2, 14
+ UNSPILL 5, 15
+ subps m7, m5, m0
+ addps m5, SPILLED(8)
+ mulps m5, [costabs + 16*1]
+ mulps m7, [costabs + 16*8]
+ addps m0, SPILLED(8)
+ mulps m0, [costabs + 16*3]
+ subps m2, m4, m5
+ subps m2, m0
+ SPILL 2, 15
+ addps m5, m4
+ addps m5, m7
+ addps m4, m0
+ subps m4, m7
+ SPILL 4, 8
+ mova m7, [tmpq+4*16]
+ mova m2, [tmpq+4*12]
+ addps m0, m7, m2
+ subps m0, SPILLED(11)
+ mulps m0, [costabs + 16*2]
+ addps m4, m7, SPILLED(11)
+ mulps m4, [costabs]
+ subps m7, m2
+ mulps m7, [costabs + 16*7]
+ addps m2, SPILLED(11)
+ mulps m2, [costabs + 16*4]
+ addps m1, m7, [tmpq+4*8]
+ addps m1, m4
+ addps m4, m2
+ subps m4, [tmpq+4*8]
+ SPILL 4, 11
+ subps m7, m2
+ subps m7, [tmpq+4*8]
+ addps m4, m6, SPILLED(10)
+ subps m6, SPILLED(10)
+ addps m2, m5, m1
+ mulps m2, [costabs + 16*9]
+ subps m5, m1
+ mulps m5, [costabs + 16*17]
+ subps m1, m4, m2
+ addps m4, m2
+ mulps m2, m1, [winq+4*36]
+ addps m2, [bufq+4*36]
+ mova [outq+1152], m2
+ mulps m1, [winq+4*32]
+ addps m1, [bufq+4*32]
+ mova [outq+1024], m1
+ mulps m1, m4, [winq+4*116]
+ mova [bufq+4*36], m1
+ mulps m4, [winq+4*112]
+ mova [bufq+4*32], m4
+ addps m2, m6, m5
+ subps m6, m5
+ mulps m1, m6, [winq+4*68]
+ addps m1, [bufq+4*68]
+ mova [outq+2176], m1
+ mulps m6, [winq]
+ addps m6, [bufq]
+ mova [outq], m6
+ mulps m1, m2, [winq+4*148]
+ mova [bufq+4*68], m1
+ mulps m2, [winq+4*80]
+ mova [bufq], m2
+ addps m5, m3, [tmpq+4*24]
+ mova m2, [tmpq+4*24]
+ subps m2, m3
+ mova m1, SPILLED(9)
+ subps m1, m0
+ mulps m1, [costabs + 16*10]
+ addps m0, SPILLED(9)
+ mulps m0, [costabs + 16*16]
+ addps m6, m5, m1
+ subps m5, m1
+ mulps m3, m5, [winq+4*40]
+ addps m3, [bufq+4*40]
+ mova [outq+1280], m3
+ mulps m5, [winq+4*28]
+ addps m5, [bufq+4*28]
+ mova [outq+896], m5
+ mulps m1, m6, [winq+4*120]
+ mova [bufq+4*40], m1
+ mulps m6, [winq+4*108]
+ mova [bufq+4*28], m6
+ addps m1, m2, m0
+ subps m2, m0
+ mulps m5, m2, [winq+4*64]
+ addps m5, [bufq+4*64]
+ mova [outq+2048], m5
+ mulps m2, [winq+4*4]
+ addps m2, [bufq+4*4]
+ mova [outq+128], m2
+ mulps m0, m1, [winq+4*144]
+ mova [bufq+4*64], m0
+ mulps m1, [winq+4*84]
+ mova [bufq+4*4], m1
+ mova m1, [tmpq+4*28]
+ mova m5, m1
+ addps m1, SPILLED(13)
+ subps m5, SPILLED(13)
+ UNSPILL 3, 15
+ addps m2, m7, m3
+ mulps m2, [costabs + 16*11]
+ subps m3, m7
+ mulps m3, [costabs + 16*15]
+ addps m0, m2, m1
+ subps m1, m2
+ SWAP m0, m2
+ mulps m6, m1, [winq+4*44]
+ addps m6, [bufq+4*44]
+ mova [outq+1408], m6
+ mulps m1, [winq+4*24]
+ addps m1, [bufq+4*24]
+ mova [outq+768], m1
+ mulps m0, m2, [winq+4*124]
+ mova [bufq+4*44], m0
+ mulps m2, [winq+4*104]
+ mova [bufq+4*24], m2
+ addps m0, m5, m3
+ subps m5, m3
+ mulps m1, m5, [winq+4*60]
+ addps m1, [bufq+4*60]
+ mova [outq+1920], m1
+ mulps m5, [winq+4*8]
+ addps m5, [bufq+4*8]
+ mova [outq+256], m5
+ mulps m1, m0, [winq+4*140]
+ mova [bufq+4*60], m1
+ mulps m0, [winq+4*88]
+ mova [bufq+4*8], m0
+ mova m1, [tmpq+4*20]
+ addps m1, SPILLED(12)
+ mova m2, [tmpq+4*20]
+ subps m2, SPILLED(12)
+ UNSPILL 7, 8
+ subps m0, m7, SPILLED(11)
+ addps m7, SPILLED(11)
+ mulps m4, m7, [costabs + 16*12]
+ mulps m0, [costabs + 16*14]
+ addps m5, m1, m4
+ subps m1, m4
+ mulps m7, m1, [winq+4*48]
+ addps m7, [bufq+4*48]
+ mova [outq+1536], m7
+ mulps m1, [winq+4*20]
+ addps m1, [bufq+4*20]
+ mova [outq+640], m1
+ mulps m1, m5, [winq+4*128]
+ mova [bufq+4*48], m1
+ mulps m5, [winq+4*100]
+ mova [bufq+4*20], m5
+ addps m6, m2, m0
+ subps m2, m0
+ mulps m1, m2, [winq+4*56]
+ addps m1, [bufq+4*56]
+ mova [outq+1792], m1
+ mulps m2, [winq+4*12]
+ addps m2, [bufq+4*12]
+ mova [outq+384], m2
+ mulps m0, m6, [winq+4*136]
+ mova [bufq+4*56], m0
+ mulps m6, [winq+4*92]
+ mova [bufq+4*12], m6
+ UNSPILL 0, 14
+ mulps m0, [costabs + 16*13]
+ mova m3, [tmpq+4*4]
+ addps m2, m0, m3
+ subps m3, m0
+ mulps m0, m3, [winq+4*52]
+ addps m0, [bufq+4*52]
+ mova [outq+1664], m0
+ mulps m3, [winq+4*16]
+ addps m3, [bufq+4*16]
+ mova [outq+512], m3
+ mulps m0, m2, [winq+4*132]
+ mova [bufq+4*52], m0
+ mulps m2, [winq+4*96]
+ mova [bufq+4*16], m2
+ RET
+%endmacro
+
+INIT_XMM sse
+DEFINE_FOUR_IMDCT
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+DEFINE_FOUR_IMDCT
+%endif
diff --git a/ffmpeg/libavcodec/x86/lpc.c b/ffmpeg/libavcodec/x86/lpc.c
new file mode 100644
index 0000000..1962212
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/lpc.c
@@ -0,0 +1,154 @@
+/*
+ * MMX optimized LPC DSP utils
+ * Copyright (c) 2007 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/asm.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavcodec/lpc.h"
+
+#if HAVE_SSE2_INLINE
+
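+/* Multiply the int32_t samples by a Welch window and write doubles, two
+ * mirrored samples per loop iteration (the window is symmetric).  As a rough
+ * scalar sketch (not the exact pairing used by the asm below):
+ *
+ *     double c = 2.0 / (len - 1.0);
+ *     for (int n = 0; n < len; n++)
+ *         w_data[n] = data[n] * (1.0 - (c*n - 1.0) * (c*n - 1.0));
+ */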
+static void lpc_apply_welch_window_sse2(const int32_t *data, int len,
+ double *w_data)
+{
+ double c = 2.0 / (len-1.0);
+ int n2 = len>>1;
+ x86_reg i = -n2*sizeof(int32_t);
+ x86_reg j = n2*sizeof(int32_t);
+ __asm__ volatile(
+ "movsd %4, %%xmm7 \n\t"
+ "movapd "MANGLE(ff_pd_1)", %%xmm6 \n\t"
+ "movapd "MANGLE(ff_pd_2)", %%xmm5 \n\t"
+ "movlhps %%xmm7, %%xmm7 \n\t"
+ "subpd %%xmm5, %%xmm7 \n\t"
+ "addsd %%xmm6, %%xmm7 \n\t"
+ "test $1, %5 \n\t"
+ "jz 2f \n\t"
+#define WELCH(MOVPD, offset)\
+ "1: \n\t"\
+ "movapd %%xmm7, %%xmm1 \n\t"\
+ "mulpd %%xmm1, %%xmm1 \n\t"\
+ "movapd %%xmm6, %%xmm0 \n\t"\
+ "subpd %%xmm1, %%xmm0 \n\t"\
+ "pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\
+ "cvtpi2pd (%3,%0), %%xmm2 \n\t"\
+ "cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\
+ "mulpd %%xmm0, %%xmm2 \n\t"\
+ "mulpd %%xmm1, %%xmm3 \n\t"\
+ "movapd %%xmm2, (%2,%0,2) \n\t"\
+ MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\
+ "subpd %%xmm5, %%xmm7 \n\t"\
+ "sub $8, %1 \n\t"\
+ "add $8, %0 \n\t"\
+ "jl 1b \n\t"\
+
+ WELCH("movupd", -1)
+ "jmp 3f \n\t"
+ "2: \n\t"
+ WELCH("movapd", -2)
+ "3: \n\t"
+ :"+&r"(i), "+&r"(j)
+ :"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len)
+ XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm5", "%xmm6", "%xmm7")
+ );
+#undef WELCH
+}
+
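+/* Essentially autoc[j] = sum over i of data[i] * data[i + j] for
+ * j = 0..lag-1, with the loops accumulating two (or, in the last step,
+ * three) lags at a time.  Sketch of the intent only; see the generic C
+ * version in libavcodec/lpc.c for the reference behaviour. */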
+static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
+ double *autoc)
+{
+ int j;
+
+ if((x86_reg)data & 15)
+ data++;
+
+ for(j=0; j<lag; j+=2){
+ x86_reg i = -len*sizeof(double);
+ if(j == lag-2) {
+ __asm__ volatile(
+ "movsd "MANGLE(ff_pd_1)", %%xmm0 \n\t"
+ "movsd "MANGLE(ff_pd_1)", %%xmm1 \n\t"
+ "movsd "MANGLE(ff_pd_1)", %%xmm2 \n\t"
+ "1: \n\t"
+ "movapd (%2,%0), %%xmm3 \n\t"
+ "movupd -8(%3,%0), %%xmm4 \n\t"
+ "movapd (%3,%0), %%xmm5 \n\t"
+ "mulpd %%xmm3, %%xmm4 \n\t"
+ "mulpd %%xmm3, %%xmm5 \n\t"
+ "mulpd -16(%3,%0), %%xmm3 \n\t"
+ "addpd %%xmm4, %%xmm1 \n\t"
+ "addpd %%xmm5, %%xmm0 \n\t"
+ "addpd %%xmm3, %%xmm2 \n\t"
+ "add $16, %0 \n\t"
+ "jl 1b \n\t"
+ "movhlps %%xmm0, %%xmm3 \n\t"
+ "movhlps %%xmm1, %%xmm4 \n\t"
+ "movhlps %%xmm2, %%xmm5 \n\t"
+ "addsd %%xmm3, %%xmm0 \n\t"
+ "addsd %%xmm4, %%xmm1 \n\t"
+ "addsd %%xmm5, %%xmm2 \n\t"
+ "movsd %%xmm0, (%1) \n\t"
+ "movsd %%xmm1, 8(%1) \n\t"
+ "movsd %%xmm2, 16(%1) \n\t"
+ :"+&r"(i)
+ :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
+ :"memory"
+ );
+ } else {
+ __asm__ volatile(
+ "movsd "MANGLE(ff_pd_1)", %%xmm0 \n\t"
+ "movsd "MANGLE(ff_pd_1)", %%xmm1 \n\t"
+ "1: \n\t"
+ "movapd (%3,%0), %%xmm3 \n\t"
+ "movupd -8(%4,%0), %%xmm4 \n\t"
+ "mulpd %%xmm3, %%xmm4 \n\t"
+ "mulpd (%4,%0), %%xmm3 \n\t"
+ "addpd %%xmm4, %%xmm1 \n\t"
+ "addpd %%xmm3, %%xmm0 \n\t"
+ "add $16, %0 \n\t"
+ "jl 1b \n\t"
+ "movhlps %%xmm0, %%xmm3 \n\t"
+ "movhlps %%xmm1, %%xmm4 \n\t"
+ "addsd %%xmm3, %%xmm0 \n\t"
+ "addsd %%xmm4, %%xmm1 \n\t"
+ "movsd %%xmm0, %1 \n\t"
+ "movsd %%xmm1, %2 \n\t"
+ :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
+ :"r"(data+len), "r"(data+len-j)
+ );
+ }
+ }
+}
+
+#endif /* HAVE_SSE2_INLINE */
+
+av_cold void ff_lpc_init_x86(LPCContext *c)
+{
+#if HAVE_SSE2_INLINE
+ int mm_flags = av_get_cpu_flags();
+
+ if (mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) {
+ c->lpc_apply_welch_window = lpc_apply_welch_window_sse2;
+ c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
+ }
+#endif /* HAVE_SSE2_INLINE */
+}
diff --git a/ffmpeg/libavcodec/x86/mathops.h b/ffmpeg/libavcodec/x86/mathops.h
new file mode 100644
index 0000000..79e29e6
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/mathops.h
@@ -0,0 +1,130 @@
+/*
+ * simple math operations
+ * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_MATHOPS_H
+#define AVCODEC_X86_MATHOPS_H
+
+#include "config.h"
+#include "libavutil/common.h"
+
+#if HAVE_INLINE_ASM
+
+#if ARCH_X86_32
+
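+/* MULL(a, b, shift): 32x32 -> 64-bit signed multiply, then return the low 32
+   bits of the product shifted right by 'shift', i.e. (int)(((int64_t)a * b) >> shift). */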
+#define MULL MULL
+static av_always_inline av_const int MULL(int a, int b, unsigned shift)
+{
+ int rt, dummy;
+ __asm__ (
+ "imull %3 \n\t"
+ "shrdl %4, %%edx, %%eax \n\t"
+ :"=a"(rt), "=d"(dummy)
+ :"a"(a), "rm"(b), "ci"((uint8_t)shift)
+ );
+ return rt;
+}
+
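+/* MULH(a, b): return the high 32 bits of the 64-bit signed product a*b. */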
+#define MULH MULH
+static av_always_inline av_const int MULH(int a, int b)
+{
+ int rt, dummy;
+ __asm__ (
+ "imull %3"
+ :"=d"(rt), "=a"(dummy)
+ :"a"(a), "rm"(b)
+ );
+ return rt;
+}
+
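+/* MUL64(a, b): return the full 64-bit signed product of a and b. */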
+#define MUL64 MUL64
+static av_always_inline av_const int64_t MUL64(int a, int b)
+{
+ int64_t rt;
+ __asm__ (
+ "imull %2"
+ :"=A"(rt)
+ :"a"(a), "rm"(b)
+ );
+ return rt;
+}
+
+#endif /* ARCH_X86_32 */
+
+#if HAVE_CMOV
+/* median of 3 */
+#define mid_pred mid_pred
+static inline av_const int mid_pred(int a, int b, int c)
+{
+ int i=b;
+ __asm__ volatile(
+ "cmp %2, %1 \n\t"
+ "cmovg %1, %0 \n\t"
+ "cmovg %2, %1 \n\t"
+ "cmp %3, %1 \n\t"
+ "cmovl %3, %1 \n\t"
+ "cmp %1, %0 \n\t"
+ "cmovg %1, %0 \n\t"
+ :"+&r"(i), "+&r"(a)
+ :"r"(b), "r"(c)
+ );
+ return i;
+}
+#endif
+
+#if HAVE_CMOV
+#define COPY3_IF_LT(x, y, a, b, c, d)\
+__asm__ volatile(\
+ "cmpl %0, %3 \n\t"\
+ "cmovl %3, %0 \n\t"\
+ "cmovl %4, %1 \n\t"\
+ "cmovl %5, %2 \n\t"\
+ : "+&r" (x), "+&r" (a), "+r" (c)\
+ : "r" (y), "r" (b), "r" (d)\
+);
+#endif
+
+#define MASK_ABS(mask, level) \
+ __asm__ ("cltd \n\t" \
+ "xorl %1, %0 \n\t" \
+ "subl %1, %0 \n\t" \
+ : "+a"(level), "=&d"(mask))
+
+// avoid +32 for shift optimization (gcc should do that ...)
+#define NEG_SSR32 NEG_SSR32
+static inline int32_t NEG_SSR32( int32_t a, int8_t s){
+ __asm__ ("sarl %1, %0\n\t"
+ : "+r" (a)
+ : "ic" ((uint8_t)(-s))
+ );
+ return a;
+}
+
+#define NEG_USR32 NEG_USR32
+static inline uint32_t NEG_USR32(uint32_t a, int8_t s){
+ __asm__ ("shrl %1, %0\n\t"
+ : "+r" (a)
+ : "ic" ((uint8_t)(-s))
+ );
+ return a;
+}
+
+#endif /* HAVE_INLINE_ASM */
+#endif /* AVCODEC_X86_MATHOPS_H */
diff --git a/ffmpeg/libavcodec/x86/mlpdsp.c b/ffmpeg/libavcodec/x86/mlpdsp.c
new file mode 100644
index 0000000..81cab5a
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/mlpdsp.c
@@ -0,0 +1,182 @@
+/*
+ * MLP DSP functions x86-optimized
+ * Copyright (c) 2009 Ramiro Polla
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/asm.h"
+#include "libavcodec/mlpdsp.h"
+#include "libavcodec/mlp.h"
+
+#if HAVE_7REGS && HAVE_INLINE_ASM
+
+extern char ff_mlp_firorder_8;
+extern char ff_mlp_firorder_7;
+extern char ff_mlp_firorder_6;
+extern char ff_mlp_firorder_5;
+extern char ff_mlp_firorder_4;
+extern char ff_mlp_firorder_3;
+extern char ff_mlp_firorder_2;
+extern char ff_mlp_firorder_1;
+extern char ff_mlp_firorder_0;
+
+extern char ff_mlp_iirorder_4;
+extern char ff_mlp_iirorder_3;
+extern char ff_mlp_iirorder_2;
+extern char ff_mlp_iirorder_1;
+extern char ff_mlp_iirorder_0;
+
+static const void *firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
+ &ff_mlp_firorder_2, &ff_mlp_firorder_3,
+ &ff_mlp_firorder_4, &ff_mlp_firorder_5,
+ &ff_mlp_firorder_6, &ff_mlp_firorder_7,
+ &ff_mlp_firorder_8 };
+static const void *iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
+ &ff_mlp_iirorder_2, &ff_mlp_iirorder_3,
+ &ff_mlp_iirorder_4 };
+
+#if ARCH_X86_64
+
+#define MLPMUL(label, offset, offs, offc) \
+ LABEL_MANGLE(label)": \n\t" \
+ "movslq "offset"+"offs"(%0), %%rax\n\t" \
+ "movslq "offset"+"offc"(%1), %%rdx\n\t" \
+ "imul %%rdx, %%rax\n\t" \
+ "add %%rax, %%rsi\n\t"
+
+#define FIRMULREG(label, offset, firc)\
+ LABEL_MANGLE(label)": \n\t" \
+ "movslq "#offset"(%0), %%rax\n\t" \
+ "imul %"#firc", %%rax\n\t" \
+ "add %%rax, %%rsi\n\t"
+
+#define CLEAR_ACCUM \
+ "xor %%rsi, %%rsi\n\t"
+
+#define SHIFT_ACCUM \
+ "shr %%cl, %%rsi\n\t"
+
+#define ACCUM "%%rdx"
+#define RESULT "%%rsi"
+#define RESULT32 "%%esi"
+
+#else /* if ARCH_X86_32 */
+
+#define MLPMUL(label, offset, offs, offc) \
+ LABEL_MANGLE(label)": \n\t" \
+ "mov "offset"+"offs"(%0), %%eax\n\t" \
+ "imull "offset"+"offc"(%1) \n\t" \
+ "add %%eax , %%esi\n\t" \
+ "adc %%edx , %%ecx\n\t"
+
+#define FIRMULREG(label, offset, firc) \
+ MLPMUL(label, #offset, "0", "0")
+
+#define CLEAR_ACCUM \
+ "xor %%esi, %%esi\n\t" \
+ "xor %%ecx, %%ecx\n\t"
+
+#define SHIFT_ACCUM \
+ "mov %%ecx, %%edx\n\t" \
+ "mov %%esi, %%eax\n\t" \
+ "movzbl %7 , %%ecx\n\t" \
+ "shrd %%cl, %%edx, %%eax\n\t" \
+
+#define ACCUM "%%edx"
+#define RESULT "%%eax"
+#define RESULT32 "%%eax"
+
+#endif /* !ARCH_X86_64 */
+
+#define BINC AV_STRINGIFY(4* MAX_CHANNELS)
+#define IOFFS AV_STRINGIFY(4*(MAX_FIR_ORDER + MAX_BLOCKSIZE))
+#define IOFFC AV_STRINGIFY(4* MAX_FIR_ORDER)
+
+#define FIRMUL(label, offset) MLPMUL(label, #offset, "0", "0")
+#define IIRMUL(label, offset) MLPMUL(label, #offset, IOFFS, IOFFC)
+
+static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
+ int firorder, int iirorder,
+ unsigned int filter_shift, int32_t mask,
+ int blocksize, int32_t *sample_buffer)
+{
+ const void *firjump = firtable[firorder];
+ const void *iirjump = iirtable[iirorder];
+
+ blocksize = -blocksize;
+
+ __asm__ volatile(
+ "1: \n\t"
+ CLEAR_ACCUM
+ "jmp *%5 \n\t"
+ FIRMUL (ff_mlp_firorder_8, 0x1c )
+ FIRMUL (ff_mlp_firorder_7, 0x18 )
+ FIRMUL (ff_mlp_firorder_6, 0x14 )
+ FIRMUL (ff_mlp_firorder_5, 0x10 )
+ FIRMUL (ff_mlp_firorder_4, 0x0c )
+ FIRMULREG(ff_mlp_firorder_3, 0x08,10)
+ FIRMULREG(ff_mlp_firorder_2, 0x04, 9)
+ FIRMULREG(ff_mlp_firorder_1, 0x00, 8)
+ LABEL_MANGLE(ff_mlp_firorder_0)":\n\t"
+ "jmp *%6 \n\t"
+ IIRMUL (ff_mlp_iirorder_4, 0x0c )
+ IIRMUL (ff_mlp_iirorder_3, 0x08 )
+ IIRMUL (ff_mlp_iirorder_2, 0x04 )
+ IIRMUL (ff_mlp_iirorder_1, 0x00 )
+ LABEL_MANGLE(ff_mlp_iirorder_0)":\n\t"
+ SHIFT_ACCUM
+ "mov "RESULT" ,"ACCUM" \n\t"
+ "add (%2) ,"RESULT" \n\t"
+ "and %4 ,"RESULT" \n\t"
+ "sub $4 , %0 \n\t"
+ "mov "RESULT32", (%0) \n\t"
+ "mov "RESULT32", (%2) \n\t"
+ "add $"BINC" , %2 \n\t"
+ "sub "ACCUM" ,"RESULT" \n\t"
+ "mov "RESULT32","IOFFS"(%0) \n\t"
+ "incl %3 \n\t"
+ "js 1b \n\t"
+ : /* 0*/"+r"(state),
+ /* 1*/"+r"(coeff),
+ /* 2*/"+r"(sample_buffer),
+#if ARCH_X86_64
+ /* 3*/"+r"(blocksize)
+ : /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump),
+ /* 6*/"r"(iirjump) , /* 7*/"c"(filter_shift)
+ , /* 8*/"r"((int64_t)coeff[0])
+ , /* 9*/"r"((int64_t)coeff[1])
+ , /*10*/"r"((int64_t)coeff[2])
+ : "rax", "rdx", "rsi"
+#else /* ARCH_X86_32 */
+ /* 3*/"+m"(blocksize)
+ : /* 4*/"m"( mask), /* 5*/"m"(firjump),
+ /* 6*/"m"(iirjump) , /* 7*/"m"(filter_shift)
+ : "eax", "edx", "esi", "ecx"
+#endif /* !ARCH_X86_64 */
+ );
+}
+
+#endif /* HAVE_7REGS && HAVE_INLINE_ASM */
+
+av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c)
+{
+#if HAVE_7REGS && HAVE_INLINE_ASM
+ c->mlp_filter_channel = mlp_filter_channel_x86;
+#endif
+}
diff --git a/ffmpeg/libavcodec/x86/motion_est.c b/ffmpeg/libavcodec/x86/motion_est.c
new file mode 100644
index 0000000..3ffb002
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/motion_est.c
@@ -0,0 +1,473 @@
+/*
+ * MMX optimized motion estimation
+ * Copyright (c) 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer
+ *
+ * mostly by Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "dsputil_mmx.h"
+
+#if HAVE_INLINE_ASM
+
+DECLARE_ASM_CONST(8, uint64_t, round_tab)[3]={
+0x0000000000000000ULL,
+0x0001000100010001ULL,
+0x0002000200020002ULL,
+};
+
+DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL;
+
+static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
+{
+ x86_reg len= -(x86_reg)stride*h;
+ __asm__ volatile(
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ "movq (%2, %%"REG_a"), %%mm2 \n\t"
+ "movq (%2, %%"REG_a"), %%mm4 \n\t"
+ "add %3, %%"REG_a" \n\t"
+ "psubusb %%mm0, %%mm2 \n\t"
+ "psubusb %%mm4, %%mm0 \n\t"
+ "movq (%1, %%"REG_a"), %%mm1 \n\t"
+ "movq (%2, %%"REG_a"), %%mm3 \n\t"
+ "movq (%2, %%"REG_a"), %%mm5 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm5, %%mm1 \n\t"
+ "por %%mm2, %%mm0 \n\t"
+ "por %%mm1, %%mm3 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm3, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "punpckhbw %%mm7, %%mm2 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm3, %%mm2 \n\t"
+ "paddw %%mm2, %%mm0 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+ "add %3, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ : "+a" (len)
+ : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride)
+ );
+}
+
+static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
+ int stride, int h)
+{
+ __asm__ volatile(
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "psadbw (%2), %%mm0 \n\t"
+ "psadbw (%2, %3), %%mm1 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+ "paddw %%mm1, %%mm6 \n\t"
+ "lea (%1,%3,2), %1 \n\t"
+ "lea (%2,%3,2), %2 \n\t"
+ "sub $2, %0 \n\t"
+ " jg 1b \n\t"
+ : "+r" (h), "+r" (blk1), "+r" (blk2)
+ : "r" ((x86_reg)stride)
+ );
+}
+
+static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
+{
+ int ret;
+ __asm__ volatile(
+ "pxor %%xmm2, %%xmm2 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movdqu (%1), %%xmm0 \n\t"
+ "movdqu (%1, %4), %%xmm1 \n\t"
+ "psadbw (%2), %%xmm0 \n\t"
+ "psadbw (%2, %4), %%xmm1 \n\t"
+ "paddw %%xmm0, %%xmm2 \n\t"
+ "paddw %%xmm1, %%xmm2 \n\t"
+ "lea (%1,%4,2), %1 \n\t"
+ "lea (%2,%4,2), %2 \n\t"
+ "sub $2, %0 \n\t"
+ " jg 1b \n\t"
+ "movhlps %%xmm2, %%xmm0 \n\t"
+ "paddw %%xmm0, %%xmm2 \n\t"
+ "movd %%xmm2, %3 \n\t"
+ : "+r" (h), "+r" (blk1), "+r" (blk2), "=r"(ret)
+ : "r" ((x86_reg)stride)
+ );
+ return ret;
+}
+
+static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
+ int stride, int h)
+{
+ __asm__ volatile(
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "pavgb 1(%1), %%mm0 \n\t"
+ "pavgb 1(%1, %3), %%mm1 \n\t"
+ "psadbw (%2), %%mm0 \n\t"
+ "psadbw (%2, %3), %%mm1 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+ "paddw %%mm1, %%mm6 \n\t"
+ "lea (%1,%3,2), %1 \n\t"
+ "lea (%2,%3,2), %2 \n\t"
+ "sub $2, %0 \n\t"
+ " jg 1b \n\t"
+ : "+r" (h), "+r" (blk1), "+r" (blk2)
+ : "r" ((x86_reg)stride)
+ );
+}
+
+static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
+ int stride, int h)
+{
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "add %3, %1 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "pavgb %%mm1, %%mm0 \n\t"
+ "pavgb %%mm2, %%mm1 \n\t"
+ "psadbw (%2), %%mm0 \n\t"
+ "psadbw (%2, %3), %%mm1 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+ "paddw %%mm1, %%mm6 \n\t"
+ "movq %%mm2, %%mm0 \n\t"
+ "lea (%1,%3,2), %1 \n\t"
+ "lea (%2,%3,2), %2 \n\t"
+ "sub $2, %0 \n\t"
+ " jg 1b \n\t"
+ : "+r" (h), "+r" (blk1), "+r" (blk2)
+ : "r" ((x86_reg)stride)
+ );
+}
+
+static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
+ int stride, int h)
+{
+ __asm__ volatile(
+ "movq "MANGLE(bone)", %%mm5 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "pavgb 1(%1), %%mm0 \n\t"
+ "add %3, %1 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "movq (%1,%3), %%mm2 \n\t"
+ "pavgb 1(%1), %%mm1 \n\t"
+ "pavgb 1(%1,%3), %%mm2 \n\t"
+ "psubusb %%mm5, %%mm1 \n\t"
+ "pavgb %%mm1, %%mm0 \n\t"
+ "pavgb %%mm2, %%mm1 \n\t"
+ "psadbw (%2), %%mm0 \n\t"
+ "psadbw (%2,%3), %%mm1 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+ "paddw %%mm1, %%mm6 \n\t"
+ "movq %%mm2, %%mm0 \n\t"
+ "lea (%1,%3,2), %1 \n\t"
+ "lea (%2,%3,2), %2 \n\t"
+ "sub $2, %0 \n\t"
+ " jg 1b \n\t"
+ : "+r" (h), "+r" (blk1), "+r" (blk2)
+ : "r" ((x86_reg)stride)
+ );
+}
+
+static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
+{
+ x86_reg len= -(x86_reg)stride*h;
+ __asm__ volatile(
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ "movq (%2, %%"REG_a"), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm2 \n\t"
+ "movq (%2, %%"REG_a"), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "paddw %%mm0, %%mm1 \n\t"
+ "paddw %%mm2, %%mm3 \n\t"
+ "movq (%3, %%"REG_a"), %%mm4 \n\t"
+ "movq (%3, %%"REG_a"), %%mm2 \n\t"
+ "paddw %%mm5, %%mm1 \n\t"
+ "paddw %%mm5, %%mm3 \n\t"
+ "psrlw $1, %%mm1 \n\t"
+ "psrlw $1, %%mm3 \n\t"
+ "packuswb %%mm3, %%mm1 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm2, %%mm1 \n\t"
+ "por %%mm4, %%mm1 \n\t"
+ "movq %%mm1, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+ "add %4, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ : "+a" (len)
+ : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride)
+ );
+}
+
+static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
+{
+ x86_reg len= -(x86_reg)stride*h;
+ __asm__ volatile(
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "paddw %%mm2, %%mm0 \n\t"
+ "paddw %%mm3, %%mm1 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%2, %%"REG_a"), %%mm2 \n\t"
+ "movq 1(%2, %%"REG_a"), %%mm4 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddw %%mm4, %%mm2 \n\t"
+ "paddw %%mm5, %%mm3 \n\t"
+ "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
+ "paddw %%mm2, %%mm0 \n\t"
+ "paddw %%mm3, %%mm1 \n\t"
+ "paddw %%mm5, %%mm0 \n\t"
+ "paddw %%mm5, %%mm1 \n\t"
+ "movq (%3, %%"REG_a"), %%mm4 \n\t"
+ "movq (%3, %%"REG_a"), %%mm5 \n\t"
+ "psrlw $2, %%mm0 \n\t"
+ "psrlw $2, %%mm1 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "psubusb %%mm0, %%mm4 \n\t"
+ "psubusb %%mm5, %%mm0 \n\t"
+ "por %%mm4, %%mm0 \n\t"
+ "movq %%mm0, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpckhbw %%mm7, %%mm4 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+ "paddw %%mm4, %%mm6 \n\t"
+ "movq %%mm2, %%mm0 \n\t"
+ "movq %%mm3, %%mm1 \n\t"
+ "add %4, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ : "+a" (len)
+ : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride)
+ );
+}
+
+static inline int sum_mmx(void)
+{
+ int ret;
+ __asm__ volatile(
+ "movq %%mm6, %%mm0 \n\t"
+ "psrlq $32, %%mm6 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+ "movq %%mm6, %%mm0 \n\t"
+ "psrlq $16, %%mm6 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+ "movd %%mm6, %0 \n\t"
+ : "=r" (ret)
+ );
+ return ret&0xFFFF;
+}
+
+static inline int sum_mmxext(void)
+{
+ int ret;
+ __asm__ volatile(
+ "movd %%mm6, %0 \n\t"
+ : "=r" (ret)
+ );
+ return ret;
+}
+
+static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
+{
+ sad8_2_mmx(blk1, blk1+1, blk2, stride, h);
+}
+static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
+{
+ sad8_2_mmx(blk1, blk1+stride, blk2, stride, h);
+}
+
+
+#define PIX_SAD(suf)\
+static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
+{\
+ av_assert2(h==8);\
+ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t":);\
+\
+ sad8_1_ ## suf(blk1, blk2, stride, 8);\
+\
+ return sum_ ## suf();\
+}\
+static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
+{\
+ av_assert2(h==8);\
+ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ "movq %0, %%mm5 \n\t"\
+ :: "m"(round_tab[1]) \
+ );\
+\
+ sad8_x2a_ ## suf(blk1, blk2, stride, 8);\
+\
+ return sum_ ## suf();\
+}\
+\
+static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
+{\
+ av_assert2(h==8);\
+ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ "movq %0, %%mm5 \n\t"\
+ :: "m"(round_tab[1]) \
+ );\
+\
+ sad8_y2a_ ## suf(blk1, blk2, stride, 8);\
+\
+ return sum_ ## suf();\
+}\
+\
+static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
+{\
+ av_assert2(h==8);\
+ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ ::);\
+\
+ sad8_4_ ## suf(blk1, blk2, stride, 8);\
+\
+ return sum_ ## suf();\
+}\
+\
+static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
+{\
+ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t":);\
+\
+ sad8_1_ ## suf(blk1 , blk2 , stride, h);\
+ sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
+\
+ return sum_ ## suf();\
+}\
+static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
+{\
+ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ "movq %0, %%mm5 \n\t"\
+ :: "m"(round_tab[1]) \
+ );\
+\
+ sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\
+ sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\
+\
+ return sum_ ## suf();\
+}\
+static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
+{\
+ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ "movq %0, %%mm5 \n\t"\
+ :: "m"(round_tab[1]) \
+ );\
+\
+ sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\
+ sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\
+\
+ return sum_ ## suf();\
+}\
+static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
+{\
+ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ ::);\
+\
+ sad8_4_ ## suf(blk1 , blk2 , stride, h);\
+ sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
+\
+ return sum_ ## suf();\
+}\
+
+PIX_SAD(mmx)
+PIX_SAD(mmxext)
+
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx)
+{
+#if HAVE_INLINE_ASM
+ int mm_flags = av_get_cpu_flags();
+
+ if (mm_flags & AV_CPU_FLAG_MMX) {
+ c->pix_abs[0][0] = sad16_mmx;
+ c->pix_abs[0][1] = sad16_x2_mmx;
+ c->pix_abs[0][2] = sad16_y2_mmx;
+ c->pix_abs[0][3] = sad16_xy2_mmx;
+ c->pix_abs[1][0] = sad8_mmx;
+ c->pix_abs[1][1] = sad8_x2_mmx;
+ c->pix_abs[1][2] = sad8_y2_mmx;
+ c->pix_abs[1][3] = sad8_xy2_mmx;
+
+ c->sad[0]= sad16_mmx;
+ c->sad[1]= sad8_mmx;
+ }
+ if (mm_flags & AV_CPU_FLAG_MMXEXT) {
+ c->pix_abs[0][0] = sad16_mmxext;
+ c->pix_abs[1][0] = sad8_mmxext;
+
+ c->sad[0] = sad16_mmxext;
+ c->sad[1] = sad8_mmxext;
+
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->pix_abs[0][1] = sad16_x2_mmxext;
+ c->pix_abs[0][2] = sad16_y2_mmxext;
+ c->pix_abs[0][3] = sad16_xy2_mmxext;
+ c->pix_abs[1][1] = sad8_x2_mmxext;
+ c->pix_abs[1][2] = sad8_y2_mmxext;
+ c->pix_abs[1][3] = sad8_xy2_mmxext;
+ }
+ }
+ if ((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
+ c->sad[0]= sad16_sse2;
+ }
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/ffmpeg/libavcodec/x86/mpeg4qpel.asm b/ffmpeg/libavcodec/x86/mpeg4qpel.asm
new file mode 100644
index 0000000..ca52375
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/mpeg4qpel.asm
@@ -0,0 +1,560 @@
+;******************************************************************************
+;* mpeg4 qpel
+;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2013 Daniel Kang
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+cextern pb_1
+cextern pw_3
+cextern pw_15
+cextern pw_16
+cextern pw_20
+
+
+SECTION_TEXT
+
+; put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+%macro PUT_NO_RND_PIXELS8_L2 0
+cglobal put_no_rnd_pixels8_l2, 6,6
+ movsxdifnidn r4, r4d
+ movsxdifnidn r3, r3d
+ pcmpeqb m6, m6
+ test r5d, 1
+ je .loop
+ mova m0, [r1]
+ mova m1, [r2]
+ add r1, r4
+ add r2, 8
+ pxor m0, m6
+ pxor m1, m6
+ PAVGB m0, m1
+ pxor m0, m6
+ mova [r0], m0
+ add r0, r3
+ dec r5d
+.loop:
+ mova m0, [r1]
+ add r1, r4
+ mova m1, [r1]
+ add r1, r4
+ mova m2, [r2]
+ mova m3, [r2+8]
+ pxor m0, m6
+ pxor m1, m6
+ pxor m2, m6
+ pxor m3, m6
+ PAVGB m0, m2
+ PAVGB m1, m3
+ pxor m0, m6
+ pxor m1, m6
+ mova [r0], m0
+ add r0, r3
+ mova [r0], m1
+ add r0, r3
+ mova m0, [r1]
+ add r1, r4
+ mova m1, [r1]
+ add r1, r4
+ mova m2, [r2+16]
+ mova m3, [r2+24]
+ pxor m0, m6
+ pxor m1, m6
+ pxor m2, m6
+ pxor m3, m6
+ PAVGB m0, m2
+ PAVGB m1, m3
+ pxor m0, m6
+ pxor m1, m6
+ mova [r0], m0
+ add r0, r3
+ mova [r0], m1
+ add r0, r3
+ add r2, 32
+ sub r5d, 4
+ jne .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS8_L2
+
+
+; put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+%macro PUT_NO_RND_PIXELS16_l2 0
+cglobal put_no_rnd_pixels16_l2, 6,6
+ movsxdifnidn r3, r3d
+ movsxdifnidn r4, r4d
+ pcmpeqb m6, m6
+ test r5d, 1
+ je .loop
+ mova m0, [r1]
+ mova m1, [r1+8]
+ mova m2, [r2]
+ mova m3, [r2+8]
+ pxor m0, m6
+ pxor m1, m6
+ pxor m2, m6
+ pxor m3, m6
+ PAVGB m0, m2
+ PAVGB m1, m3
+ pxor m0, m6
+ pxor m1, m6
+ add r1, r4
+ add r2, 16
+ mova [r0], m0
+ mova [r0+8], m1
+ add r0, r3
+ dec r5d
+.loop:
+ mova m0, [r1]
+ mova m1, [r1+8]
+ add r1, r4
+ mova m2, [r2]
+ mova m3, [r2+8]
+ pxor m0, m6
+ pxor m1, m6
+ pxor m2, m6
+ pxor m3, m6
+ PAVGB m0, m2
+ PAVGB m1, m3
+ pxor m0, m6
+ pxor m1, m6
+ mova [r0], m0
+ mova [r0+8], m1
+ add r0, r3
+ mova m0, [r1]
+ mova m1, [r1+8]
+ add r1, r4
+ mova m2, [r2+16]
+ mova m3, [r2+24]
+ pxor m0, m6
+ pxor m1, m6
+ pxor m2, m6
+ pxor m3, m6
+ PAVGB m0, m2
+ PAVGB m1, m3
+ pxor m0, m6
+ pxor m1, m6
+ mova [r0], m0
+ mova [r0+8], m1
+ add r0, r3
+ add r2, 32
+ sub r5d, 2
+ jne .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS16_l2
+INIT_MMX 3dnow
+PUT_NO_RND_PIXELS16_l2
+
+%macro MPEG4_QPEL16_H_LOWPASS 1
+cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16
+ movsxdifnidn r2, r2d
+ movsxdifnidn r3, r3d
+ pxor m7, m7
+.loop:
+ mova m0, [r1]
+ mova m1, m0
+ mova m2, m0
+ punpcklbw m0, m7
+ punpckhbw m1, m7
+ pshufw m5, m0, 0x90
+ pshufw m6, m0, 0x41
+ mova m3, m2
+ mova m4, m2
+ psllq m2, 8
+ psllq m3, 16
+ psllq m4, 24
+ punpckhbw m2, m7
+ punpckhbw m3, m7
+ punpckhbw m4, m7
+ paddw m5, m3
+ paddw m6, m2
+ paddw m5, m5
+ psubw m6, m5
+ pshufw m5, m0, 6
+ pmullw m6, [pw_3]
+ paddw m0, m4
+ paddw m5, m1
+ pmullw m0, [pw_20]
+ psubw m0, m5
+ paddw m6, [PW_ROUND]
+ paddw m0, m6
+ psraw m0, 5
+ mova [rsp+8], m0
+ mova m0, [r1+5]
+ mova m5, m0
+ mova m6, m0
+ psrlq m0, 8
+ psrlq m5, 16
+ punpcklbw m0, m7
+ punpcklbw m5, m7
+ paddw m2, m0
+ paddw m3, m5
+ paddw m2, m2
+ psubw m3, m2
+ mova m2, m6
+ psrlq m6, 24
+ punpcklbw m2, m7
+ punpcklbw m6, m7
+ pmullw m3, [pw_3]
+ paddw m1, m2
+ paddw m4, m6
+ pmullw m1, [pw_20]
+ psubw m3, m4
+ paddw m1, [PW_ROUND]
+ paddw m3, m1
+ psraw m3, 5
+ mova m1, [rsp+8]
+ packuswb m1, m3
+ OP_MOV [r0], m1, m4
+ mova m1, [r1+9]
+ mova m4, m1
+ mova m3, m1
+ psrlq m1, 8
+ psrlq m4, 16
+ punpcklbw m1, m7
+ punpcklbw m4, m7
+ paddw m5, m1
+ paddw m0, m4
+ paddw m5, m5
+ psubw m0, m5
+ mova m5, m3
+ psrlq m3, 24
+ pmullw m0, [pw_3]
+ punpcklbw m3, m7
+ paddw m2, m3
+ psubw m0, m2
+ mova m2, m5
+ punpcklbw m2, m7
+ punpckhbw m5, m7
+ paddw m6, m2
+ pmullw m6, [pw_20]
+ paddw m0, [PW_ROUND]
+ paddw m0, m6
+ psraw m0, 5
+ paddw m3, m5
+ pshufw m6, m5, 0xf9
+ paddw m6, m4
+ pshufw m4, m5, 0xbe
+ pshufw m5, m5, 0x6f
+ paddw m4, m1
+ paddw m5, m2
+ paddw m6, m6
+ psubw m4, m6
+ pmullw m3, [pw_20]
+ pmullw m4, [pw_3]
+ psubw m3, m5
+ paddw m4, [PW_ROUND]
+ paddw m4, m3
+ psraw m4, 5
+ packuswb m0, m4
+ OP_MOV [r0+8], m0, m4
+ add r1, r3
+ add r0, r2
+ dec r4d
+ jne .loop
+ REP_RET
+%endmacro
+
+%macro PUT_OP 2-3
+ mova %1, %2
+%endmacro
+
+%macro AVG_OP 2-3
+ mova %3, %1
+ pavgb %2, %3
+ mova %1, %2
+%endmacro
+
+INIT_MMX mmxext
+%define PW_ROUND pw_16
+%define OP_MOV PUT_OP
+MPEG4_QPEL16_H_LOWPASS put
+%define PW_ROUND pw_16
+%define OP_MOV AVG_OP
+MPEG4_QPEL16_H_LOWPASS avg
+%define PW_ROUND pw_15
+%define OP_MOV PUT_OP
+MPEG4_QPEL16_H_LOWPASS put_no_rnd
+
+
+
+%macro MPEG4_QPEL8_H_LOWPASS 1
+cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8
+ movsxdifnidn r2, r2d
+ movsxdifnidn r3, r3d
+ pxor m7, m7
+.loop:
+ mova m0, [r1]
+ mova m1, m0
+ mova m2, m0
+ punpcklbw m0, m7
+ punpckhbw m1, m7
+ pshufw m5, m0, 0x90
+ pshufw m6, m0, 0x41
+ mova m3, m2
+ mova m4, m2
+ psllq m2, 8
+ psllq m3, 16
+ psllq m4, 24
+ punpckhbw m2, m7
+ punpckhbw m3, m7
+ punpckhbw m4, m7
+ paddw m5, m3
+ paddw m6, m2
+ paddw m5, m5
+ psubw m6, m5
+ pshufw m5, m0, 0x6
+ pmullw m6, [pw_3]
+ paddw m0, m4
+ paddw m5, m1
+ pmullw m0, [pw_20]
+ psubw m0, m5
+ paddw m6, [PW_ROUND]
+ paddw m0, m6
+ psraw m0, 5
+ movh m5, [r1+5]
+ punpcklbw m5, m7
+ pshufw m6, m5, 0xf9
+ paddw m1, m5
+ paddw m2, m6
+ pshufw m6, m5, 0xbe
+ pshufw m5, m5, 0x6f
+ paddw m3, m6
+ paddw m4, m5
+ paddw m2, m2
+ psubw m3, m2
+ pmullw m1, [pw_20]
+ pmullw m3, [pw_3]
+ psubw m3, m4
+ paddw m1, [PW_ROUND]
+ paddw m3, m1
+ psraw m3, 5
+ packuswb m0, m3
+ OP_MOV [r0], m0, m4
+ add r1, r3
+ add r0, r2
+ dec r4d
+ jne .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+%define PW_ROUND pw_16
+%define OP_MOV PUT_OP
+MPEG4_QPEL8_H_LOWPASS put
+%define PW_ROUND pw_16
+%define OP_MOV AVG_OP
+MPEG4_QPEL8_H_LOWPASS avg
+%define PW_ROUND pw_15
+%define OP_MOV PUT_OP
+MPEG4_QPEL8_H_LOWPASS put_no_rnd
+
+
+
+%macro QPEL_V_LOW 5
+ paddw m0, m1
+ mova m4, [pw_20]
+ pmullw m4, m0
+ mova m0, %4
+ mova m5, %1
+ paddw m5, m0
+ psubw m4, m5
+ mova m5, %2
+ mova m6, %3
+ paddw m5, m3
+ paddw m6, m2
+ paddw m6, m6
+ psubw m5, m6
+ pmullw m5, [pw_3]
+ paddw m4, [PW_ROUND]
+ paddw m5, m4
+ psraw m5, 5
+ packuswb m5, m5
+ OP_MOV %5, m5, m7
+ SWAP 0,1,2,3
+%endmacro
+
+%macro MPEG4_QPEL16_V_LOWPASS 1
+cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544
+ movsxdifnidn r2, r2d
+ movsxdifnidn r3, r3d
+
+ mov r4d, 17
+ mov r5, rsp
+ pxor m7, m7
+.looph:
+ mova m0, [r1]
+ mova m1, [r1]
+ mova m2, [r1+8]
+ mova m3, [r1+8]
+ punpcklbw m0, m7
+ punpckhbw m1, m7
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ mova [r5], m0
+ mova [r5+0x88], m1
+ mova [r5+0x110], m2
+ mova [r5+0x198], m3
+ add r5, 8
+ add r1, r3
+ dec r4d
+ jne .looph
+
+
+ ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 14*dstStride
+ mov r4d, 4
+ mov r1, 4
+ neg r2
+ lea r1, [r1+r2*8]
+ lea r1, [r1+r2*4]
+ lea r1, [r1+r2*2]
+ neg r2
+ mov r5, rsp
+.loopv:
+ pxor m7, m7
+ mova m0, [r5+ 0x0]
+ mova m1, [r5+ 0x8]
+ mova m2, [r5+0x10]
+ mova m3, [r5+0x18]
+ QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
+ QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
+ QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
+ QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2]
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0]
+ QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2]
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0]
+ QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2]
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0]
+ QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2]
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0]
+ QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2]
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0]
+ QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2]
+
+ add r5, 0x88
+ add r0, r1
+ dec r4d
+ jne .loopv
+ REP_RET
+%endmacro
+
+%macro PUT_OPH 2-3
+ movh %1, %2
+%endmacro
+
+%macro AVG_OPH 2-3
+ movh %3, %1
+ pavgb %2, %3
+ movh %1, %2
+%endmacro
+
+INIT_MMX mmxext
+%define PW_ROUND pw_16
+%define OP_MOV PUT_OPH
+MPEG4_QPEL16_V_LOWPASS put
+%define PW_ROUND pw_16
+%define OP_MOV AVG_OPH
+MPEG4_QPEL16_V_LOWPASS avg
+%define PW_ROUND pw_15
+%define OP_MOV PUT_OPH
+MPEG4_QPEL16_V_LOWPASS put_no_rnd
+
+
+
+%macro MPEG4_QPEL8_V_LOWPASS 1
+cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288
+ movsxdifnidn r2, r2d
+ movsxdifnidn r3, r3d
+
+ mov r4d, 9
+ mov r5, rsp
+ pxor m7, m7
+.looph:
+ mova m0, [r1]
+ mova m1, [r1]
+ punpcklbw m0, m7
+ punpckhbw m1, m7
+ mova [r5], m0
+ mova [r5+0x48], m1
+ add r5, 8
+ add r1, r3
+ dec r4d
+ jne .looph
+
+
+ ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 6*dstStride
+ mov r4d, 2
+ mov r1, 4
+ neg r2
+ lea r1, [r1+r2*4]
+ lea r1, [r1+r2*2]
+ neg r2
+ mov r5, rsp
+.loopv:
+ pxor m7, m7
+ mova m0, [r5+ 0x0]
+ mova m1, [r5+ 0x8]
+ mova m2, [r5+0x10]
+ mova m3, [r5+0x18]
+ QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
+ QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
+ QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
+ QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2]
+ lea r0, [r0+r2*2]
+ QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0]
+ QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2]
+
+ add r5, 0x48
+ add r0, r1
+ dec r4d
+ jne .loopv
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+%define PW_ROUND pw_16
+%define OP_MOV PUT_OPH
+MPEG4_QPEL8_V_LOWPASS put
+%define PW_ROUND pw_16
+%define OP_MOV AVG_OPH
+MPEG4_QPEL8_V_LOWPASS avg
+%define PW_ROUND pw_15
+%define OP_MOV PUT_OPH
+MPEG4_QPEL8_V_LOWPASS put_no_rnd
diff --git a/ffmpeg/libavcodec/x86/mpegaudiodec.c b/ffmpeg/libavcodec/x86/mpegaudiodec.c
new file mode 100644
index 0000000..287d8ff
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/mpegaudiodec.c
@@ -0,0 +1,273 @@
+/*
+ * MMX optimized MP3 decoding functions
+ * Copyright (c) 2010 Vitor Sessak
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/internal.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/mpegaudiodsp.h"
+
+#define DECL(CPU)\
+static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
+void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
+
+DECL(sse)
+DECL(sse2)
+DECL(sse3)
+DECL(ssse3)
+DECL(avx)
+
+void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
+ float *tmpbuf);
+void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
+ float *tmpbuf);
+
+DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
+
+#if HAVE_SSE2_INLINE
+
+#define MACS(rt, ra, rb) rt+=(ra)*(rb)
+#define MLSS(rt, ra, rb) rt-=(ra)*(rb)
+
+#define SUM8(op, sum, w, p) \
+{ \
+ op(sum, (w)[0 * 64], (p)[0 * 64]); \
+ op(sum, (w)[1 * 64], (p)[1 * 64]); \
+ op(sum, (w)[2 * 64], (p)[2 * 64]); \
+ op(sum, (w)[3 * 64], (p)[3 * 64]); \
+ op(sum, (w)[4 * 64], (p)[4 * 64]); \
+ op(sum, (w)[5 * 64], (p)[5 * 64]); \
+ op(sum, (w)[6 * 64], (p)[6 * 64]); \
+ op(sum, (w)[7 * 64], (p)[7 * 64]); \
+}
+
+static void apply_window(const float *buf, const float *win1,
+ const float *win2, float *sum1, float *sum2, int len)
+{
+ x86_reg count = - 4*len;
+ const float *win1a = win1+len;
+ const float *win2a = win2+len;
+ const float *bufa = buf+len;
+ float *sum1a = sum1+len;
+ float *sum2a = sum2+len;
+
+
+#define MULT(a, b) \
+ "movaps " #a "(%1,%0), %%xmm1 \n\t" \
+ "movaps " #a "(%3,%0), %%xmm2 \n\t" \
+ "mulps %%xmm2, %%xmm1 \n\t" \
+ "subps %%xmm1, %%xmm0 \n\t" \
+ "mulps " #b "(%2,%0), %%xmm2 \n\t" \
+ "subps %%xmm2, %%xmm4 \n\t" \
+
+ __asm__ volatile(
+ "1: \n\t"
+ "xorps %%xmm0, %%xmm0 \n\t"
+ "xorps %%xmm4, %%xmm4 \n\t"
+
+ MULT( 0, 0)
+ MULT( 256, 64)
+ MULT( 512, 128)
+ MULT( 768, 192)
+ MULT(1024, 256)
+ MULT(1280, 320)
+ MULT(1536, 384)
+ MULT(1792, 448)
+
+ "movaps %%xmm0, (%4,%0) \n\t"
+ "movaps %%xmm4, (%5,%0) \n\t"
+ "add $16, %0 \n\t"
+ "jl 1b \n\t"
+ :"+&r"(count)
+ :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
+ );
+
+#undef MULT
+}
+
+static void apply_window_mp3(float *in, float *win, int *unused, float *out,
+ int incr)
+{
+ LOCAL_ALIGNED_16(float, suma, [17]);
+ LOCAL_ALIGNED_16(float, sumb, [17]);
+ LOCAL_ALIGNED_16(float, sumc, [17]);
+ LOCAL_ALIGNED_16(float, sumd, [17]);
+
+ float sum;
+
+ /* copy to avoid wrap */
+ __asm__ volatile(
+ "movaps 0(%0), %%xmm0 \n\t" \
+ "movaps 16(%0), %%xmm1 \n\t" \
+ "movaps 32(%0), %%xmm2 \n\t" \
+ "movaps 48(%0), %%xmm3 \n\t" \
+ "movaps %%xmm0, 0(%1) \n\t" \
+ "movaps %%xmm1, 16(%1) \n\t" \
+ "movaps %%xmm2, 32(%1) \n\t" \
+ "movaps %%xmm3, 48(%1) \n\t" \
+ "movaps 64(%0), %%xmm0 \n\t" \
+ "movaps 80(%0), %%xmm1 \n\t" \
+ "movaps 96(%0), %%xmm2 \n\t" \
+ "movaps 112(%0), %%xmm3 \n\t" \
+ "movaps %%xmm0, 64(%1) \n\t" \
+ "movaps %%xmm1, 80(%1) \n\t" \
+ "movaps %%xmm2, 96(%1) \n\t" \
+ "movaps %%xmm3, 112(%1) \n\t"
+ ::"r"(in), "r"(in+512)
+ :"memory"
+ );
+
+ apply_window(in + 16, win , win + 512, suma, sumc, 16);
+ apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
+
+ SUM8(MACS, suma[0], win + 32, in + 48);
+
+ sumc[ 0] = 0;
+ sumb[16] = 0;
+ sumd[16] = 0;
+
+#define SUMS(suma, sumb, sumc, sumd, out1, out2) \
+ "movups " #sumd "(%4), %%xmm0 \n\t" \
+ "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
+ "subps " #suma "(%1), %%xmm0 \n\t" \
+ "movaps %%xmm0," #out1 "(%0) \n\t" \
+\
+ "movups " #sumc "(%3), %%xmm0 \n\t" \
+ "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
+ "addps " #sumb "(%2), %%xmm0 \n\t" \
+ "movaps %%xmm0," #out2 "(%0) \n\t"
+
+ if (incr == 1) {
+ __asm__ volatile(
+ SUMS( 0, 48, 4, 52, 0, 112)
+ SUMS(16, 32, 20, 36, 16, 96)
+ SUMS(32, 16, 36, 20, 32, 80)
+ SUMS(48, 0, 52, 4, 48, 64)
+
+ :"+&r"(out)
+ :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
+ :"memory"
+ );
+ out += 16*incr;
+ } else {
+ int j;
+ float *out2 = out + 32 * incr;
+ out[0 ] = -suma[ 0];
+ out += incr;
+ out2 -= incr;
+ for(j=1;j<16;j++) {
+ *out = -suma[ j] + sumd[16-j];
+ *out2 = sumb[16-j] + sumc[ j];
+ out += incr;
+ out2 -= incr;
+ }
+ }
+
+ sum = 0;
+ SUM8(MLSS, sum, win + 16 + 32, in + 32);
+ *out = sum;
+}
+
+#endif /* HAVE_SSE2_INLINE */
+
+#if HAVE_YASM
+#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \
+static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \
+ int count, int switch_point, int block_type) \
+{ \
+ int align_end = count - (count & 3); \
+ int j; \
+ for (j = 0; j < align_end; j+= 4) { \
+ LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \
+ float *win = mdct_win_sse[switch_point && j < 4][block_type]; \
+ /* apply window & overlap with previous buffer */ \
+ \
+ /* select window */ \
+ ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \
+ in += 4*18; \
+ buf += 4*18; \
+ out += 4; \
+ } \
+ for (; j < count; j++) { \
+ /* apply window & overlap with previous buffer */ \
+ \
+ /* select window */ \
+ int win_idx = (switch_point && j < 2) ? 0 : block_type; \
+ float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \
+ \
+ ff_imdct36_float_ ## CPU1(out, buf, in, win); \
+ \
+ in += 18; \
+ buf++; \
+ out++; \
+ } \
+}
+
+#if HAVE_SSE
+DECL_IMDCT_BLOCKS(sse,sse)
+DECL_IMDCT_BLOCKS(sse2,sse)
+DECL_IMDCT_BLOCKS(sse3,sse)
+DECL_IMDCT_BLOCKS(ssse3,sse)
+#endif
+#if HAVE_AVX_EXTERNAL
+DECL_IMDCT_BLOCKS(avx,avx)
+#endif
+#endif /* HAVE_YASM */
+
+av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
+{
+ int mm_flags = av_get_cpu_flags();
+
+ int i, j;
+ for (j = 0; j < 4; j++) {
+ for (i = 0; i < 40; i ++) {
+ mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i];
+ mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
+ mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i];
+ mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
+ mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i];
+ mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i];
+ mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i];
+ mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
+ }
+ }
+
+#if HAVE_SSE2_INLINE
+ if (mm_flags & AV_CPU_FLAG_SSE2) {
+ s->apply_window_float = apply_window_mp3;
+ }
+#endif /* HAVE_SSE2_INLINE */
+
+#if HAVE_YASM
+ if (EXTERNAL_AVX(mm_flags)) {
+ s->imdct36_blocks_float = imdct36_blocks_avx;
+ } else if (EXTERNAL_SSSE3(mm_flags)) {
+ s->imdct36_blocks_float = imdct36_blocks_ssse3;
+ } else if (EXTERNAL_SSE3(mm_flags)) {
+ s->imdct36_blocks_float = imdct36_blocks_sse3;
+ } else if (EXTERNAL_SSE2(mm_flags)) {
+ s->imdct36_blocks_float = imdct36_blocks_sse2;
+ } else if (EXTERNAL_SSE(mm_flags)) {
+ s->imdct36_blocks_float = imdct36_blocks_sse;
+ }
+#endif /* HAVE_YASM */
+}
diff --git a/ffmpeg/libavcodec/x86/mpegvideo.c b/ffmpeg/libavcodec/x86/mpegvideo.c
new file mode 100644
index 0000000..903ad62
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/mpegvideo.c
@@ -0,0 +1,600 @@
+/*
+ * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
+ * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/mpegvideo.h"
+#include "dsputil_mmx.h"
+
+#if HAVE_INLINE_ASM
+
+static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
+ int16_t *block, int n, int qscale)
+{
+ x86_reg level, qmul, qadd, nCoeffs;
+
+ qmul = qscale << 1;
+
+ av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
+
+ if (!s->h263_aic) {
+ if (n < 4)
+ level = block[0] * s->y_dc_scale;
+ else
+ level = block[0] * s->c_dc_scale;
+ qadd = (qscale - 1) | 1;
+ }else{
+ qadd = 0;
+ level= block[0];
+ }
+ if(s->ac_pred)
+ nCoeffs=63;
+ else
+ nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+
+__asm__ volatile(
+ "movd %1, %%mm6 \n\t" //qmul
+ "packssdw %%mm6, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "movd %2, %%mm5 \n\t" //qadd
+ "pxor %%mm7, %%mm7 \n\t"
+ "packssdw %%mm5, %%mm5 \n\t"
+ "packssdw %%mm5, %%mm5 \n\t"
+ "psubw %%mm5, %%mm7 \n\t"
+ "pxor %%mm4, %%mm4 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%0, %3), %%mm0 \n\t"
+ "movq 8(%0, %3), %%mm1 \n\t"
+
+ "pmullw %%mm6, %%mm0 \n\t"
+ "pmullw %%mm6, %%mm1 \n\t"
+
+ "movq (%0, %3), %%mm2 \n\t"
+ "movq 8(%0, %3), %%mm3 \n\t"
+
+ "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
+ "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
+
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+
+ "paddw %%mm7, %%mm0 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+
+ "pxor %%mm0, %%mm2 \n\t"
+ "pxor %%mm1, %%mm3 \n\t"
+
+ "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
+
+ "pandn %%mm2, %%mm0 \n\t"
+ "pandn %%mm3, %%mm1 \n\t"
+
+ "movq %%mm0, (%0, %3) \n\t"
+ "movq %%mm1, 8(%0, %3) \n\t"
+
+ "add $16, %3 \n\t"
+ "jng 1b \n\t"
+ ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
+ : "memory"
+ );
+ block[0]= level;
+}
+
+
+static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
+ int16_t *block, int n, int qscale)
+{
+ x86_reg qmul, qadd, nCoeffs;
+
+ qmul = qscale << 1;
+ qadd = (qscale - 1) | 1;
+
+ assert(s->block_last_index[n]>=0 || s->h263_aic);
+
+ nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+
+__asm__ volatile(
+ "movd %1, %%mm6 \n\t" //qmul
+ "packssdw %%mm6, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "movd %2, %%mm5 \n\t" //qadd
+ "pxor %%mm7, %%mm7 \n\t"
+ "packssdw %%mm5, %%mm5 \n\t"
+ "packssdw %%mm5, %%mm5 \n\t"
+ "psubw %%mm5, %%mm7 \n\t"
+ "pxor %%mm4, %%mm4 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%0, %3), %%mm0 \n\t"
+ "movq 8(%0, %3), %%mm1 \n\t"
+
+ "pmullw %%mm6, %%mm0 \n\t"
+ "pmullw %%mm6, %%mm1 \n\t"
+
+ "movq (%0, %3), %%mm2 \n\t"
+ "movq 8(%0, %3), %%mm3 \n\t"
+
+ "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
+ "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
+
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+
+ "paddw %%mm7, %%mm0 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+
+ "pxor %%mm0, %%mm2 \n\t"
+ "pxor %%mm1, %%mm3 \n\t"
+
+ "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
+
+ "pandn %%mm2, %%mm0 \n\t"
+ "pandn %%mm3, %%mm1 \n\t"
+
+ "movq %%mm0, (%0, %3) \n\t"
+ "movq %%mm1, 8(%0, %3) \n\t"
+
+ "add $16, %3 \n\t"
+ "jng 1b \n\t"
+ ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
+ : "memory"
+ );
+}
+
+
+/*
+  We can assume that the result of the two multiplications cannot exceed 0xFFFF,
+  i.e. it fits in 16 bits, so only the PMULLW instruction is needed here and a
+  full (widening) multiplication can be avoided.
+=====================================================
+ Full formula for the multiplication of two integers
+ represented as high:low word pairs:
+ input:  value1 = high1:low1
+         value2 = high2:low2
+ output: value3 = value1*value2
+         value3 = high3:low3 (on overflow: modulo 2^32 wrap-around)
+ This means that for 0x123456 * 0x123456 the correct result is 0x14b66cb0ce4,
+ but this algorithm computes only the low 32 bits, 0x66cb0ce4;
+ it is limited by the 16-bit size of the operands.
+ ---------------------------------
+ tlow1 = high1*low2
+ tlow2 = high2*low1
+ tlow1 = tlow1 + tlow2
+ high3:low3 = low1*low2
+ high3 += tlow1
+*/
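+/* A minimal C sketch of the decomposition above (illustrative only; the helper
+   name is hypothetical and not used elsewhere). The dequantizers below assume
+   each product fits in 16 bits, so a single PMULLW per coefficient suffices:
+
+   static uint32_t mul32_from_16bit_parts(uint32_t value1, uint32_t value2)
+   {
+       uint32_t low1  = value1 & 0xFFFF, high1 = value1 >> 16;
+       uint32_t low2  = value2 & 0xFFFF, high2 = value2 >> 16;
+       uint32_t tlow1 = high1 * low2 + high2 * low1;  // cross terms
+       return low1 * low2 + (tlow1 << 16);            // wraps modulo 2^32
+   }
+*/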
+static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
+ int16_t *block, int n, int qscale)
+{
+ x86_reg nCoeffs;
+ const uint16_t *quant_matrix;
+ int block0;
+
+ av_assert2(s->block_last_index[n]>=0);
+
+ nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
+
+ if (n < 4)
+ block0 = block[0] * s->y_dc_scale;
+ else
+ block0 = block[0] * s->c_dc_scale;
+ /* XXX: only mpeg1 */
+ quant_matrix = s->intra_matrix;
+__asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "psrlw $15, %%mm7 \n\t"
+ "movd %2, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "mov %3, %%"REG_a" \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%0, %%"REG_a"), %%mm0 \n\t"
+ "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm4 \n\t"
+ "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
+ "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
+ "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
+ "pxor %%mm2, %%mm2 \n\t"
+ "pxor %%mm3, %%mm3 \n\t"
+ "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
+ "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+ "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
+ "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
+ "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
+ "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
+ "pxor %%mm4, %%mm4 \n\t"
+ "pxor %%mm5, %%mm5 \n\t" // FIXME slow
+ "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
+ "psraw $3, %%mm0 \n\t"
+ "psraw $3, %%mm1 \n\t"
+ "psubw %%mm7, %%mm0 \n\t"
+ "psubw %%mm7, %%mm1 \n\t"
+ "por %%mm7, %%mm0 \n\t"
+ "por %%mm7, %%mm1 \n\t"
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+ "psubw %%mm2, %%mm0 \n\t"
+ "psubw %%mm3, %%mm1 \n\t"
+ "pandn %%mm0, %%mm4 \n\t"
+ "pandn %%mm1, %%mm5 \n\t"
+ "movq %%mm4, (%0, %%"REG_a") \n\t"
+ "movq %%mm5, 8(%0, %%"REG_a") \n\t"
+
+ "add $16, %%"REG_a" \n\t"
+ "js 1b \n\t"
+ ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
+ : "%"REG_a, "memory"
+ );
+ block[0]= block0;
+}
+
+static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
+ int16_t *block, int n, int qscale)
+{
+ x86_reg nCoeffs;
+ const uint16_t *quant_matrix;
+
+ av_assert2(s->block_last_index[n]>=0);
+
+ nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
+
+ quant_matrix = s->inter_matrix;
+__asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "psrlw $15, %%mm7 \n\t"
+ "movd %2, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "mov %3, %%"REG_a" \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%0, %%"REG_a"), %%mm0 \n\t"
+ "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm4 \n\t"
+ "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
+ "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
+ "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
+ "pxor %%mm2, %%mm2 \n\t"
+ "pxor %%mm3, %%mm3 \n\t"
+ "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
+ "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+ "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
+ "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
+ "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
+ "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
+ "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1
+ "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1
+ "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
+ "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
+ "pxor %%mm4, %%mm4 \n\t"
+ "pxor %%mm5, %%mm5 \n\t" // FIXME slow
+ "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
+ "psraw $4, %%mm0 \n\t"
+ "psraw $4, %%mm1 \n\t"
+ "psubw %%mm7, %%mm0 \n\t"
+ "psubw %%mm7, %%mm1 \n\t"
+ "por %%mm7, %%mm0 \n\t"
+ "por %%mm7, %%mm1 \n\t"
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+ "psubw %%mm2, %%mm0 \n\t"
+ "psubw %%mm3, %%mm1 \n\t"
+ "pandn %%mm0, %%mm4 \n\t"
+ "pandn %%mm1, %%mm5 \n\t"
+ "movq %%mm4, (%0, %%"REG_a") \n\t"
+ "movq %%mm5, 8(%0, %%"REG_a") \n\t"
+
+ "add $16, %%"REG_a" \n\t"
+ "js 1b \n\t"
+ ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
+ : "%"REG_a, "memory"
+ );
+}
+
+static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
+ int16_t *block, int n, int qscale)
+{
+ x86_reg nCoeffs;
+ const uint16_t *quant_matrix;
+ int block0;
+
+ av_assert2(s->block_last_index[n]>=0);
+
+ if(s->alternate_scan) nCoeffs= 63; //FIXME
+ else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
+
+ if (n < 4)
+ block0 = block[0] * s->y_dc_scale;
+ else
+ block0 = block[0] * s->c_dc_scale;
+ quant_matrix = s->intra_matrix;
+__asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "psrlw $15, %%mm7 \n\t"
+ "movd %2, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "mov %3, %%"REG_a" \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%0, %%"REG_a"), %%mm0 \n\t"
+ "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm4 \n\t"
+ "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
+ "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
+ "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
+ "pxor %%mm2, %%mm2 \n\t"
+ "pxor %%mm3, %%mm3 \n\t"
+ "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
+ "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+ "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
+ "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
+ "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
+ "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
+ "pxor %%mm4, %%mm4 \n\t"
+ "pxor %%mm5, %%mm5 \n\t" // FIXME slow
+ "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
+ "psraw $3, %%mm0 \n\t"
+ "psraw $3, %%mm1 \n\t"
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+ "psubw %%mm2, %%mm0 \n\t"
+ "psubw %%mm3, %%mm1 \n\t"
+ "pandn %%mm0, %%mm4 \n\t"
+ "pandn %%mm1, %%mm5 \n\t"
+ "movq %%mm4, (%0, %%"REG_a") \n\t"
+ "movq %%mm5, 8(%0, %%"REG_a") \n\t"
+
+ "add $16, %%"REG_a" \n\t"
+ "jng 1b \n\t"
+ ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
+ : "%"REG_a, "memory"
+ );
+ block[0]= block0;
+    // Note: we do not do mismatch control for intra, as errors cannot accumulate
+}
+
+static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
+ int16_t *block, int n, int qscale)
+{
+ x86_reg nCoeffs;
+ const uint16_t *quant_matrix;
+
+ av_assert2(s->block_last_index[n]>=0);
+
+ if(s->alternate_scan) nCoeffs= 63; //FIXME
+ else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
+
+ quant_matrix = s->inter_matrix;
+__asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "psrlq $48, %%mm7 \n\t"
+ "movd %2, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "mov %3, %%"REG_a" \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%0, %%"REG_a"), %%mm0 \n\t"
+ "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm4 \n\t"
+ "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
+ "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
+ "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
+ "pxor %%mm2, %%mm2 \n\t"
+ "pxor %%mm3, %%mm3 \n\t"
+ "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
+ "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+ "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
+ "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
+ "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
+ "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
+ "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q
+ "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q
+ "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
+ "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
+ "pxor %%mm4, %%mm4 \n\t"
+ "pxor %%mm5, %%mm5 \n\t" // FIXME slow
+ "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
+ "psrlw $4, %%mm0 \n\t"
+ "psrlw $4, %%mm1 \n\t"
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+ "psubw %%mm2, %%mm0 \n\t"
+ "psubw %%mm3, %%mm1 \n\t"
+ "pandn %%mm0, %%mm4 \n\t"
+ "pandn %%mm1, %%mm5 \n\t"
+ "pxor %%mm4, %%mm7 \n\t"
+ "pxor %%mm5, %%mm7 \n\t"
+ "movq %%mm4, (%0, %%"REG_a") \n\t"
+ "movq %%mm5, 8(%0, %%"REG_a") \n\t"
+
+ "add $16, %%"REG_a" \n\t"
+ "jng 1b \n\t"
+ "movd 124(%0, %3), %%mm0 \n\t"
+ "movq %%mm7, %%mm6 \n\t"
+ "psrlq $32, %%mm7 \n\t"
+ "pxor %%mm6, %%mm7 \n\t"
+ "movq %%mm7, %%mm6 \n\t"
+ "psrlq $16, %%mm7 \n\t"
+ "pxor %%mm6, %%mm7 \n\t"
+ "pslld $31, %%mm7 \n\t"
+ "psrlq $15, %%mm7 \n\t"
+ "pxor %%mm7, %%mm0 \n\t"
+ "movd %%mm0, 124(%0, %3) \n\t"
+
+ ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
+ : "%"REG_a, "memory"
+ );
+}
+
+static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){
+ const int intra= s->mb_intra;
+ int *sum= s->dct_error_sum[intra];
+ uint16_t *offset= s->dct_offset[intra];
+
+ s->dct_count[intra]++;
+
+ __asm__ volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "1: \n\t"
+ "pxor %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "movq (%0), %%mm2 \n\t"
+ "movq 8(%0), %%mm3 \n\t"
+ "pcmpgtw %%mm2, %%mm0 \n\t"
+ "pcmpgtw %%mm3, %%mm1 \n\t"
+ "pxor %%mm0, %%mm2 \n\t"
+ "pxor %%mm1, %%mm3 \n\t"
+ "psubw %%mm0, %%mm2 \n\t"
+ "psubw %%mm1, %%mm3 \n\t"
+ "movq %%mm2, %%mm4 \n\t"
+ "movq %%mm3, %%mm5 \n\t"
+ "psubusw (%2), %%mm2 \n\t"
+ "psubusw 8(%2), %%mm3 \n\t"
+ "pxor %%mm0, %%mm2 \n\t"
+ "pxor %%mm1, %%mm3 \n\t"
+ "psubw %%mm0, %%mm2 \n\t"
+ "psubw %%mm1, %%mm3 \n\t"
+ "movq %%mm2, (%0) \n\t"
+ "movq %%mm3, 8(%0) \n\t"
+ "movq %%mm4, %%mm2 \n\t"
+ "movq %%mm5, %%mm3 \n\t"
+ "punpcklwd %%mm7, %%mm4 \n\t"
+ "punpckhwd %%mm7, %%mm2 \n\t"
+ "punpcklwd %%mm7, %%mm5 \n\t"
+ "punpckhwd %%mm7, %%mm3 \n\t"
+ "paddd (%1), %%mm4 \n\t"
+ "paddd 8(%1), %%mm2 \n\t"
+ "paddd 16(%1), %%mm5 \n\t"
+ "paddd 24(%1), %%mm3 \n\t"
+ "movq %%mm4, (%1) \n\t"
+ "movq %%mm2, 8(%1) \n\t"
+ "movq %%mm5, 16(%1) \n\t"
+ "movq %%mm3, 24(%1) \n\t"
+ "add $16, %0 \n\t"
+ "add $32, %1 \n\t"
+ "add $16, %2 \n\t"
+ "cmp %3, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (block), "+r" (sum), "+r" (offset)
+ : "r"(block+64)
+ );
+}
+
+static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){
+ const int intra= s->mb_intra;
+ int *sum= s->dct_error_sum[intra];
+ uint16_t *offset= s->dct_offset[intra];
+
+ s->dct_count[intra]++;
+
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7 \n\t"
+ "1: \n\t"
+ "pxor %%xmm0, %%xmm0 \n\t"
+ "pxor %%xmm1, %%xmm1 \n\t"
+ "movdqa (%0), %%xmm2 \n\t"
+ "movdqa 16(%0), %%xmm3 \n\t"
+ "pcmpgtw %%xmm2, %%xmm0 \n\t"
+ "pcmpgtw %%xmm3, %%xmm1 \n\t"
+ "pxor %%xmm0, %%xmm2 \n\t"
+ "pxor %%xmm1, %%xmm3 \n\t"
+ "psubw %%xmm0, %%xmm2 \n\t"
+ "psubw %%xmm1, %%xmm3 \n\t"
+ "movdqa %%xmm2, %%xmm4 \n\t"
+ "movdqa %%xmm3, %%xmm5 \n\t"
+ "psubusw (%2), %%xmm2 \n\t"
+ "psubusw 16(%2), %%xmm3 \n\t"
+ "pxor %%xmm0, %%xmm2 \n\t"
+ "pxor %%xmm1, %%xmm3 \n\t"
+ "psubw %%xmm0, %%xmm2 \n\t"
+ "psubw %%xmm1, %%xmm3 \n\t"
+ "movdqa %%xmm2, (%0) \n\t"
+ "movdqa %%xmm3, 16(%0) \n\t"
+ "movdqa %%xmm4, %%xmm6 \n\t"
+ "movdqa %%xmm5, %%xmm0 \n\t"
+ "punpcklwd %%xmm7, %%xmm4 \n\t"
+ "punpckhwd %%xmm7, %%xmm6 \n\t"
+ "punpcklwd %%xmm7, %%xmm5 \n\t"
+ "punpckhwd %%xmm7, %%xmm0 \n\t"
+ "paddd (%1), %%xmm4 \n\t"
+ "paddd 16(%1), %%xmm6 \n\t"
+ "paddd 32(%1), %%xmm5 \n\t"
+ "paddd 48(%1), %%xmm0 \n\t"
+ "movdqa %%xmm4, (%1) \n\t"
+ "movdqa %%xmm6, 16(%1) \n\t"
+ "movdqa %%xmm5, 32(%1) \n\t"
+ "movdqa %%xmm0, 48(%1) \n\t"
+ "add $32, %0 \n\t"
+ "add $64, %1 \n\t"
+ "add $32, %2 \n\t"
+ "cmp %3, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (block), "+r" (sum), "+r" (offset)
+ : "r"(block+64)
+ XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7")
+ );
+}
+
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void ff_MPV_common_init_x86(MpegEncContext *s)
+{
+#if HAVE_INLINE_ASM
+ int mm_flags = av_get_cpu_flags();
+
+ if (mm_flags & AV_CPU_FLAG_MMX) {
+ s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
+ s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
+ s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
+ s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
+ if(!(s->flags & CODEC_FLAG_BITEXACT))
+ s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
+ s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
+
+ if (mm_flags & AV_CPU_FLAG_SSE2) {
+ s->denoise_dct= denoise_dct_sse2;
+ } else {
+ s->denoise_dct= denoise_dct_mmx;
+ }
+ }
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/ffmpeg/libavcodec/x86/mpegvideoenc.c b/ffmpeg/libavcodec/x86/mpegvideoenc.c
new file mode 100644
index 0000000..6219667
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/mpegvideoenc.c
@@ -0,0 +1,107 @@
+/*
+ * The simplest mpeg encoder (well, it was the simplest!)
+ * Copyright (c) 2000,2001 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dct.h"
+#include "libavcodec/mpegvideo.h"
+#include "dsputil_mmx.h"
+
+extern uint16_t ff_inv_zigzag_direct16[64];
+
+#if HAVE_MMX_INLINE
+#define COMPILE_TEMPLATE_MMXEXT 0
+#define COMPILE_TEMPLATE_SSE2 0
+#define COMPILE_TEMPLATE_SSSE3 0
+#define RENAME(a) a ## _MMX
+#define RENAMEl(a) a ## _mmx
+#include "mpegvideoenc_template.c"
+#endif /* HAVE_MMX_INLINE */
+
+#if HAVE_MMXEXT_INLINE
+#undef COMPILE_TEMPLATE_SSSE3
+#undef COMPILE_TEMPLATE_SSE2
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 1
+#define COMPILE_TEMPLATE_SSE2 0
+#define COMPILE_TEMPLATE_SSSE3 0
+#undef RENAME
+#undef RENAMEl
+#define RENAME(a) a ## _MMXEXT
+#define RENAMEl(a) a ## _mmxext
+#include "mpegvideoenc_template.c"
+#endif /* HAVE_MMXEXT_INLINE */
+
+#if HAVE_SSE2_INLINE
+#undef COMPILE_TEMPLATE_MMXEXT
+#undef COMPILE_TEMPLATE_SSE2
+#undef COMPILE_TEMPLATE_SSSE3
+#define COMPILE_TEMPLATE_MMXEXT 0
+#define COMPILE_TEMPLATE_SSE2 1
+#define COMPILE_TEMPLATE_SSSE3 0
+#undef RENAME
+#undef RENAMEl
+#define RENAME(a) a ## _SSE2
+#define RENAMEl(a) a ## _sse2
+#include "mpegvideoenc_template.c"
+#endif /* HAVE_SSE2_INLINE */
+
+#if HAVE_SSSE3_INLINE
+#undef COMPILE_TEMPLATE_MMXEXT
+#undef COMPILE_TEMPLATE_SSE2
+#undef COMPILE_TEMPLATE_SSSE3
+#define COMPILE_TEMPLATE_MMXEXT 0
+#define COMPILE_TEMPLATE_SSE2 1
+#define COMPILE_TEMPLATE_SSSE3 1
+#undef RENAME
+#undef RENAMEl
+#define RENAME(a) a ## _SSSE3
+#define RENAMEl(a) a ## _sse2
+#include "mpegvideoenc_template.c"
+#endif /* HAVE_SSSE3_INLINE */
+
+av_cold void ff_dct_encode_init_x86(MpegEncContext *s)
+{
+ int mm_flags = av_get_cpu_flags();
+ const int dct_algo = s->avctx->dct_algo;
+
+ if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) {
+#if HAVE_MMX_INLINE
+ if (INLINE_MMX(mm_flags))
+ s->dct_quantize = dct_quantize_MMX;
+#endif
+#if HAVE_MMXEXT_INLINE
+ if (INLINE_MMXEXT(mm_flags))
+ s->dct_quantize = dct_quantize_MMXEXT;
+#endif
+#if HAVE_SSE2_INLINE
+ if (INLINE_SSE2(mm_flags))
+ s->dct_quantize = dct_quantize_SSE2;
+#endif
+#if HAVE_SSSE3_INLINE
+ if (INLINE_SSSE3(mm_flags))
+ s->dct_quantize = dct_quantize_SSSE3;
+#endif
+ }
+}
diff --git a/ffmpeg/libavcodec/x86/mpegvideoenc_template.c b/ffmpeg/libavcodec/x86/mpegvideoenc_template.c
new file mode 100644
index 0000000..1e0505e
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/mpegvideoenc_template.c
@@ -0,0 +1,364 @@
+/*
+ * MPEG video MMX templates
+ *
+ * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#undef MMREG_WIDTH
+#undef MM
+#undef MOVQ
+#undef SPREADW
+#undef PMAXW
+#undef PMAX
+#undef SAVE_SIGN
+#undef RESTORE_SIGN
+
+#if COMPILE_TEMPLATE_SSE2
+#define MMREG_WIDTH "16"
+#define MM "%%xmm"
+#define MOVQ "movdqa"
+#define SPREADW(a) \
+ "pshuflw $0, "a", "a" \n\t"\
+ "punpcklwd "a", "a" \n\t"
+#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
+#define PMAX(a,b) \
+ "movhlps "a", "b" \n\t"\
+ PMAXW(b, a)\
+ "pshuflw $0x0E, "a", "b" \n\t"\
+ PMAXW(b, a)\
+ "pshuflw $0x01, "a", "b" \n\t"\
+ PMAXW(b, a)
+#else
+#define MMREG_WIDTH "8"
+#define MM "%%mm"
+#define MOVQ "movq"
+#if COMPILE_TEMPLATE_MMXEXT
+#define SPREADW(a) "pshufw $0, "a", "a" \n\t"
+#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
+#define PMAX(a,b) \
+ "pshufw $0x0E, "a", "b" \n\t"\
+ PMAXW(b, a)\
+ "pshufw $0x01, "a", "b" \n\t"\
+ PMAXW(b, a)
+#else
+#define SPREADW(a) \
+ "punpcklwd "a", "a" \n\t"\
+ "punpcklwd "a", "a" \n\t"
+#define PMAXW(a,b) \
+ "psubusw "a", "b" \n\t"\
+ "paddw "a", "b" \n\t"
+#define PMAX(a,b) \
+ "movq "a", "b" \n\t"\
+ "psrlq $32, "a" \n\t"\
+ PMAXW(b, a)\
+ "movq "a", "b" \n\t"\
+ "psrlq $16, "a" \n\t"\
+ PMAXW(b, a)
+
+#endif
+#endif
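+/* Note (illustrative): the plain-MMX PMAXW above emulates pmaxsw with
+ * unsigned arithmetic: "psubusw a,b" leaves b = FFMAX(b - a, 0) and the
+ * following "paddw a,b" gives b = FFMAX(a, b), which is valid here because
+ * only non-negative word values are compared. */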
+
+#if COMPILE_TEMPLATE_SSSE3
+#define SAVE_SIGN(a,b) \
+ "movdqa "b", "a" \n\t"\
+ "pabsw "b", "b" \n\t"
+#define RESTORE_SIGN(a,b) \
+ "psignw "a", "b" \n\t"
+#else
+#define SAVE_SIGN(a,b) \
+ "pxor "a", "a" \n\t"\
+    "pcmpgtw "b", "a"            \n\t" /* block[i] < 0 ? 0xFFFF : 0x0000 */\
+ "pxor "a", "b" \n\t"\
+ "psubw "a", "b" \n\t" /* ABS(block[i]) */
+#define RESTORE_SIGN(a,b) \
+ "pxor "a", "b" \n\t"\
+ "psubw "a", "b" \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
+#endif
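+/* Scalar sketch of the SAVE_SIGN/RESTORE_SIGN pair above (illustrative):
+ *     mask = block[i] < 0 ? -1 : 0;          // pcmpgtw
+ *     abs  = (block[i]  ^ mask) - mask;      // SAVE_SIGN
+ *     out  = (quantized ^ mask) - mask;      // RESTORE_SIGN
+ * the branch-free abs/copy-sign trick that the SSSE3 path gets directly
+ * from pabsw/psignw. */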
+
+static int RENAME(dct_quantize)(MpegEncContext *s,
+ int16_t *block, int n,
+ int qscale, int *overflow)
+{
+ x86_reg last_non_zero_p1;
+    int level=0, q; // =0 silences a spurious gcc "may be used uninitialized" warning
+ const uint16_t *qmat, *bias;
+ LOCAL_ALIGNED_16(int16_t, temp_block, [64]);
+
+ av_assert2((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
+
+ //s->fdct (block);
+ RENAMEl(ff_fdct) (block); //cannot be anything else ...
+
+ if(s->dct_error_sum)
+ s->denoise_dct(s, block);
+
+ if (s->mb_intra) {
+ int dummy;
+ if (n < 4){
+ q = s->y_dc_scale;
+ bias = s->q_intra_matrix16[qscale][1];
+ qmat = s->q_intra_matrix16[qscale][0];
+ }else{
+ q = s->c_dc_scale;
+ bias = s->q_chroma_intra_matrix16[qscale][1];
+ qmat = s->q_chroma_intra_matrix16[qscale][0];
+ }
+ /* note: block[0] is assumed to be positive */
+ if (!s->h263_aic) {
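+            /* Multiply-by-reciprocal division (illustrative sketch): with
+             * ff_inverse[x] ~= 2^32 / x, the high 32 bits of the mul below
+             * land in %edx ("=d"), so effectively
+             *     level = ((block[0] >> 2) + q) / (2 * q)
+             * i.e. the intra DC coefficient quantized with rounding. */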
+ __asm__ volatile (
+ "mul %%ecx \n\t"
+ : "=d" (level), "=a"(dummy)
+ : "a" ((block[0]>>2) + q), "c" (ff_inverse[q<<1])
+ );
+ } else
+ /* For AIC we skip quant/dequant of INTRADC */
+ level = (block[0] + 4)>>3;
+
+ block[0]=0; //avoid fake overflow
+// temp_block[0] = (block[0] + (q >> 1)) / q;
+ last_non_zero_p1 = 1;
+ } else {
+ last_non_zero_p1 = 0;
+ bias = s->q_inter_matrix16[qscale][1];
+ qmat = s->q_inter_matrix16[qscale][0];
+ }
+
+ if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){
+
+ __asm__ volatile(
+ "movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1
+ SPREADW(MM"3")
+ "pxor "MM"7, "MM"7 \n\t" // 0
+ "pxor "MM"4, "MM"4 \n\t" // 0
+ MOVQ" (%2), "MM"5 \n\t" // qmat[0]
+ "pxor "MM"6, "MM"6 \n\t"
+ "psubw (%3), "MM"6 \n\t" // -bias[0]
+ "mov $-128, %%"REG_a" \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i]
+ SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
+ "psubusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0]
+ "pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
+ "por "MM"0, "MM"4 \n\t"
+ RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
+ MOVQ" "MM"0, (%5, %%"REG_a") \n\t"
+ "pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
+ MOVQ" (%4, %%"REG_a"), "MM"1 \n\t"
+ MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0
+ "pandn "MM"1, "MM"0 \n\t"
+ PMAXW(MM"0", MM"3")
+ "add $"MMREG_WIDTH", %%"REG_a" \n\t"
+ " js 1b \n\t"
+ PMAX(MM"3", MM"0")
+ "movd "MM"3, %%"REG_a" \n\t"
+ "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
+ : "+a" (last_non_zero_p1)
+ : "r" (block+64), "r" (qmat), "r" (bias),
+ "r" (ff_inv_zigzag_direct16+64), "r" (temp_block+64)
+ XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7")
+ );
+ }else{ // FMT_H263
+ __asm__ volatile(
+ "movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1
+ SPREADW(MM"3")
+ "pxor "MM"7, "MM"7 \n\t" // 0
+ "pxor "MM"4, "MM"4 \n\t" // 0
+ "mov $-128, %%"REG_a" \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i]
+ SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
+ MOVQ" (%3, %%"REG_a"), "MM"6 \n\t" // bias[0]
+ "paddusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0]
+ MOVQ" (%2, %%"REG_a"), "MM"5 \n\t" // qmat[i]
+ "pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
+ "por "MM"0, "MM"4 \n\t"
+ RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
+ MOVQ" "MM"0, (%5, %%"REG_a") \n\t"
+ "pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
+ MOVQ" (%4, %%"REG_a"), "MM"1 \n\t"
+ MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0
+ "pandn "MM"1, "MM"0 \n\t"
+ PMAXW(MM"0", MM"3")
+ "add $"MMREG_WIDTH", %%"REG_a" \n\t"
+ " js 1b \n\t"
+ PMAX(MM"3", MM"0")
+ "movd "MM"3, %%"REG_a" \n\t"
+ "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
+ : "+a" (last_non_zero_p1)
+ : "r" (block+64), "r" (qmat+64), "r" (bias+64),
+ "r" (ff_inv_zigzag_direct16+64), "r" (temp_block+64)
+ XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7")
+ );
+ }
+ __asm__ volatile(
+ "movd %1, "MM"1 \n\t" // max_qcoeff
+ SPREADW(MM"1")
+ "psubusw "MM"1, "MM"4 \n\t"
+ "packuswb "MM"4, "MM"4 \n\t"
+#if COMPILE_TEMPLATE_SSE2
+ "packuswb "MM"4, "MM"4 \n\t"
+#endif
+ "movd "MM"4, %0 \n\t" // *overflow
+ : "=g" (*overflow)
+ : "g" (s->max_qcoeff)
+ );
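+    /* The asm block above amounts to a scalar range check (illustrative):
+     * MM/XMM register 4 has accumulated the bitwise OR of all quantized
+     * magnitudes, an upper bound on the largest level, so *overflow becomes
+     * nonzero whenever some level may exceed s->max_qcoeff (the OR makes
+     * this a conservative test). */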
+
+ if(s->mb_intra) block[0]= level;
+ else block[0]= temp_block[0];
+
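+    /* temp_block[] now holds the quantized coefficients in natural (raster)
+     * order; the unrolled copies below scatter them into block[] at the
+     * positions required by the active IDCT permutation, visiting the
+     * coefficients in zigzag order so the copy can stop at last_non_zero_p1. */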
+ if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){
+ if(last_non_zero_p1 <= 1) goto end;
+ block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08];
+ block[0x20] = temp_block[0x10];
+ if(last_non_zero_p1 <= 4) goto end;
+ block[0x18] = temp_block[0x09]; block[0x04] = temp_block[0x02];
+ block[0x09] = temp_block[0x03];
+ if(last_non_zero_p1 <= 7) goto end;
+ block[0x14] = temp_block[0x0A]; block[0x28] = temp_block[0x11];
+ block[0x12] = temp_block[0x18]; block[0x02] = temp_block[0x20];
+ if(last_non_zero_p1 <= 11) goto end;
+ block[0x1A] = temp_block[0x19]; block[0x24] = temp_block[0x12];
+ block[0x19] = temp_block[0x0B]; block[0x01] = temp_block[0x04];
+ block[0x0C] = temp_block[0x05];
+ if(last_non_zero_p1 <= 16) goto end;
+ block[0x11] = temp_block[0x0C]; block[0x29] = temp_block[0x13];
+ block[0x16] = temp_block[0x1A]; block[0x0A] = temp_block[0x21];
+ block[0x30] = temp_block[0x28]; block[0x22] = temp_block[0x30];
+ block[0x38] = temp_block[0x29]; block[0x06] = temp_block[0x22];
+ if(last_non_zero_p1 <= 24) goto end;
+ block[0x1B] = temp_block[0x1B]; block[0x21] = temp_block[0x14];
+ block[0x1C] = temp_block[0x0D]; block[0x05] = temp_block[0x06];
+ block[0x0D] = temp_block[0x07]; block[0x15] = temp_block[0x0E];
+ block[0x2C] = temp_block[0x15]; block[0x13] = temp_block[0x1C];
+ if(last_non_zero_p1 <= 32) goto end;
+ block[0x0B] = temp_block[0x23]; block[0x34] = temp_block[0x2A];
+ block[0x2A] = temp_block[0x31]; block[0x32] = temp_block[0x38];
+ block[0x3A] = temp_block[0x39]; block[0x26] = temp_block[0x32];
+ block[0x39] = temp_block[0x2B]; block[0x03] = temp_block[0x24];
+ if(last_non_zero_p1 <= 40) goto end;
+ block[0x1E] = temp_block[0x1D]; block[0x25] = temp_block[0x16];
+ block[0x1D] = temp_block[0x0F]; block[0x2D] = temp_block[0x17];
+ block[0x17] = temp_block[0x1E]; block[0x0E] = temp_block[0x25];
+ block[0x31] = temp_block[0x2C]; block[0x2B] = temp_block[0x33];
+ if(last_non_zero_p1 <= 48) goto end;
+ block[0x36] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
+ block[0x23] = temp_block[0x34]; block[0x3C] = temp_block[0x2D];
+ block[0x07] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
+ block[0x0F] = temp_block[0x27]; block[0x35] = temp_block[0x2E];
+ if(last_non_zero_p1 <= 56) goto end;
+ block[0x2E] = temp_block[0x35]; block[0x33] = temp_block[0x3C];
+ block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36];
+ block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37];
+ block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
+ }else if(s->dsp.idct_permutation_type == FF_LIBMPEG2_IDCT_PERM){
+ if(last_non_zero_p1 <= 1) goto end;
+ block[0x04] = temp_block[0x01];
+ block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
+ if(last_non_zero_p1 <= 4) goto end;
+ block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02];
+ block[0x05] = temp_block[0x03];
+ if(last_non_zero_p1 <= 7) goto end;
+ block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11];
+ block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
+ if(last_non_zero_p1 <= 11) goto end;
+ block[0x1C] = temp_block[0x19];
+ block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B];
+ block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05];
+ if(last_non_zero_p1 <= 16) goto end;
+ block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13];
+ block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21];
+ block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
+ block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22];
+ if(last_non_zero_p1 <= 24) goto end;
+ block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14];
+ block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06];
+ block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E];
+ block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C];
+ if(last_non_zero_p1 <= 32) goto end;
+ block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A];
+ block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38];
+ block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32];
+ block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24];
+ if(last_non_zero_p1 <= 40) goto end;
+ block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16];
+ block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
+ block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25];
+ block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33];
+ if(last_non_zero_p1 <= 48) goto end;
+ block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B];
+ block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D];
+ block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
+ block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E];
+ if(last_non_zero_p1 <= 56) goto end;
+ block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C];
+ block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36];
+ block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
+ block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
+ }else{
+ if(last_non_zero_p1 <= 1) goto end;
+ block[0x01] = temp_block[0x01];
+ block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
+ if(last_non_zero_p1 <= 4) goto end;
+ block[0x09] = temp_block[0x09]; block[0x02] = temp_block[0x02];
+ block[0x03] = temp_block[0x03];
+ if(last_non_zero_p1 <= 7) goto end;
+ block[0x0A] = temp_block[0x0A]; block[0x11] = temp_block[0x11];
+ block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
+ if(last_non_zero_p1 <= 11) goto end;
+ block[0x19] = temp_block[0x19];
+ block[0x12] = temp_block[0x12]; block[0x0B] = temp_block[0x0B];
+ block[0x04] = temp_block[0x04]; block[0x05] = temp_block[0x05];
+ if(last_non_zero_p1 <= 16) goto end;
+ block[0x0C] = temp_block[0x0C]; block[0x13] = temp_block[0x13];
+ block[0x1A] = temp_block[0x1A]; block[0x21] = temp_block[0x21];
+ block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
+ block[0x29] = temp_block[0x29]; block[0x22] = temp_block[0x22];
+ if(last_non_zero_p1 <= 24) goto end;
+ block[0x1B] = temp_block[0x1B]; block[0x14] = temp_block[0x14];
+ block[0x0D] = temp_block[0x0D]; block[0x06] = temp_block[0x06];
+ block[0x07] = temp_block[0x07]; block[0x0E] = temp_block[0x0E];
+ block[0x15] = temp_block[0x15]; block[0x1C] = temp_block[0x1C];
+ if(last_non_zero_p1 <= 32) goto end;
+ block[0x23] = temp_block[0x23]; block[0x2A] = temp_block[0x2A];
+ block[0x31] = temp_block[0x31]; block[0x38] = temp_block[0x38];
+ block[0x39] = temp_block[0x39]; block[0x32] = temp_block[0x32];
+ block[0x2B] = temp_block[0x2B]; block[0x24] = temp_block[0x24];
+ if(last_non_zero_p1 <= 40) goto end;
+ block[0x1D] = temp_block[0x1D]; block[0x16] = temp_block[0x16];
+ block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
+ block[0x1E] = temp_block[0x1E]; block[0x25] = temp_block[0x25];
+ block[0x2C] = temp_block[0x2C]; block[0x33] = temp_block[0x33];
+ if(last_non_zero_p1 <= 48) goto end;
+ block[0x3A] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
+ block[0x34] = temp_block[0x34]; block[0x2D] = temp_block[0x2D];
+ block[0x26] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
+ block[0x27] = temp_block[0x27]; block[0x2E] = temp_block[0x2E];
+ if(last_non_zero_p1 <= 56) goto end;
+ block[0x35] = temp_block[0x35]; block[0x3C] = temp_block[0x3C];
+ block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
+ block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
+ block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
+ }
+ end:
+ return last_non_zero_p1 - 1;
+}
diff --git a/ffmpeg/libavcodec/x86/pngdsp.asm b/ffmpeg/libavcodec/x86/pngdsp.asm
new file mode 100644
index 0000000..c05f3da
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/pngdsp.asm
@@ -0,0 +1,173 @@
+;******************************************************************************
+;* x86 optimizations for PNG decoding
+;*
+;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
+;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pw_255
+
+SECTION_TEXT
+
+; %1 = nr. of xmm registers used
+%macro ADD_BYTES_FN 1
+cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
+%if ARCH_X86_64
+ movsxd waq, wad
+%endif
+ xor iq, iq
+
+ ; vector loop
+ mov wq, waq
+ and waq, ~(mmsize*2-1)
+ jmp .end_v
+.loop_v:
+ mova m0, [src1q+iq]
+ mova m1, [src1q+iq+mmsize]
+ paddb m0, [src2q+iq]
+ paddb m1, [src2q+iq+mmsize]
+ mova [dstq+iq ], m0
+ mova [dstq+iq+mmsize], m1
+ add iq, mmsize*2
+.end_v:
+ cmp iq, waq
+ jl .loop_v
+
+%if mmsize == 16
+    ; qword loop for the remaining 8-byte chunks (sse2 build only)
+ mov waq, wq
+ and waq, ~7
+ jmp .end_l
+.loop_l:
+ movq mm0, [src1q+iq]
+ paddb mm0, [src2q+iq]
+ movq [dstq+iq ], mm0
+ add iq, 8
+.end_l:
+ cmp iq, waq
+ jl .loop_l
+%endif
+
+ ; scalar loop for leftover
+ jmp .end_s
+.loop_s:
+ mov wab, [src1q+iq]
+ add wab, [src2q+iq]
+ mov [dstq+iq], wab
+ inc iq
+.end_s:
+ cmp iq, wq
+ jl .loop_s
+ REP_RET
+%endmacro
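+; Reference behaviour of add_bytes_l2 (illustrative sketch):
+;     for (i = 0; i < w; i++)
+;         dst[i] = src1[i] + src2[i];
+; handled as a 2*mmsize-byte vector loop, an 8-byte qword loop (sse2 build
+; only) and a scalar tail.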
+
+%if ARCH_X86_32
+INIT_MMX mmx
+ADD_BYTES_FN 0
+%endif
+
+INIT_XMM sse2
+ADD_BYTES_FN 2
+
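+; The macro below applies the PNG Paeth predictor per channel; an
+; illustrative per-byte sketch of the computation:
+;     p  = left + top - topleft;
+;     pa = abs(p - left);  pb = abs(p - top);  pc = abs(p - topleft);
+;     pred   = (pa <= pb && pa <= pc) ? left : (pb <= pc) ? top : topleft;
+;     dst[i] = src[i] + pred;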
+%macro ADD_PAETH_PRED_FN 1
+cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
+%if ARCH_X86_64
+ movsxd bppq, bppd
+ movsxd wq, wd
+%endif
+ lea endq, [dstq+wq-(mmsize/2-1)]
+ sub topq, dstq
+ sub srcq, dstq
+ sub dstq, bppq
+ pxor m7, m7
+
+ PUSH dstq
+ lea cntrq, [bppq-1]
+ shr cntrq, 2 + mmsize/16
+.bpp_loop:
+ lea dstq, [dstq+cntrq*(mmsize/2)]
+ movh m0, [dstq]
+ movh m1, [topq+dstq]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ add dstq, bppq
+.loop:
+ mova m2, m1
+ movh m1, [topq+dstq]
+ mova m3, m2
+ punpcklbw m1, m7
+ mova m4, m2
+ psubw m3, m1
+ psubw m4, m0
+ mova m5, m3
+ paddw m5, m4
+%if cpuflag(ssse3)
+ pabsw m3, m3
+ pabsw m4, m4
+ pabsw m5, m5
+%else ; !cpuflag(ssse3)
+ psubw m7, m5
+ pmaxsw m5, m7
+ pxor m6, m6
+ pxor m7, m7
+ psubw m6, m3
+ psubw m7, m4
+ pmaxsw m3, m6
+ pmaxsw m4, m7
+ pxor m7, m7
+%endif ; cpuflag(ssse3)
+ mova m6, m4
+ pminsw m6, m5
+ pcmpgtw m3, m6
+ pcmpgtw m4, m5
+ mova m6, m4
+ pand m4, m3
+ pandn m6, m3
+ pandn m3, m0
+ movh m0, [srcq+dstq]
+ pand m6, m1
+ pand m2, m4
+ punpcklbw m0, m7
+ paddw m0, m6
+ paddw m3, m2
+ paddw m0, m3
+ pand m0, [pw_255]
+ mova m3, m0
+ packuswb m3, m3
+ movh [dstq], m3
+ add dstq, bppq
+ cmp dstq, endq
+ jle .loop
+
+ mov dstq, [rsp]
+ dec cntrq
+ jge .bpp_loop
+ POP dstq
+ RET
+%endmacro
+
+INIT_MMX mmxext
+ADD_PAETH_PRED_FN 0
+
+INIT_MMX ssse3
+ADD_PAETH_PRED_FN 0
diff --git a/ffmpeg/libavcodec/x86/pngdsp_init.c b/ffmpeg/libavcodec/x86/pngdsp_init.c
new file mode 100644
index 0000000..4c54ed3
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/pngdsp_init.c
@@ -0,0 +1,50 @@
+/*
+ * x86 PNG optimizations.
+ * Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/common.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/pngdsp.h"
+
+void ff_add_png_paeth_prediction_mmxext(uint8_t *dst, uint8_t *src,
+ uint8_t *top, int w, int bpp);
+void ff_add_png_paeth_prediction_ssse3(uint8_t *dst, uint8_t *src,
+ uint8_t *top, int w, int bpp);
+void ff_add_bytes_l2_mmx (uint8_t *dst, uint8_t *src1,
+ uint8_t *src2, int w);
+void ff_add_bytes_l2_sse2(uint8_t *dst, uint8_t *src1,
+ uint8_t *src2, int w);
+
+av_cold void ff_pngdsp_init_x86(PNGDSPContext *dsp)
+{
+ int flags = av_get_cpu_flags();
+
+#if ARCH_X86_32
+ if (EXTERNAL_MMX(flags))
+ dsp->add_bytes_l2 = ff_add_bytes_l2_mmx;
+#endif
+ if (EXTERNAL_MMXEXT(flags))
+ dsp->add_paeth_prediction = ff_add_png_paeth_prediction_mmxext;
+ if (EXTERNAL_SSE2(flags))
+ dsp->add_bytes_l2 = ff_add_bytes_l2_sse2;
+ if (EXTERNAL_SSSE3(flags))
+ dsp->add_paeth_prediction = ff_add_png_paeth_prediction_ssse3;
+}
diff --git a/ffmpeg/libavcodec/x86/proresdsp.asm b/ffmpeg/libavcodec/x86/proresdsp.asm
new file mode 100644
index 0000000..aedacc2
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/proresdsp.asm
@@ -0,0 +1,326 @@
+;******************************************************************************
+;* x86-SIMD-optimized IDCT for ProRes
+;* this is identical to the "simple" IDCT written by Michael Niedermayer
+;* except for the clip range
+;*
+;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1
+%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1
+%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2
+%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1
+%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1
+%define W6sh2 8867 ; W6 = 35468 = 8867<<2
+%define W7sh2 4520 ; W7 = 18081 = 4520<<2 + 1
+
+%if ARCH_X86_64
+
+SECTION_RODATA
+
+w4_plus_w2: times 4 dw W4sh2, +W2sh2
+w4_min_w2: times 4 dw W4sh2, -W2sh2
+w4_plus_w6: times 4 dw W4sh2, +W6sh2
+w4_min_w6: times 4 dw W4sh2, -W6sh2
+w1_plus_w3: times 4 dw W1sh2, +W3sh2
+w3_min_w1: times 4 dw W3sh2, -W1sh2
+w7_plus_w3: times 4 dw W7sh2, +W3sh2
+w3_min_w7: times 4 dw W3sh2, -W7sh2
+w1_plus_w5: times 4 dw W1sh2, +W5sh2
+w5_min_w1: times 4 dw W5sh2, -W1sh2
+w5_plus_w7: times 4 dw W5sh2, +W7sh2
+w7_min_w5: times 4 dw W7sh2, -W5sh2
+pw_88: times 8 dw 0x2008
+
+cextern pw_1
+cextern pw_4
+cextern pw_512
+cextern pw_1019
+
+section .text align=16
+
+; interleave data while maintaining source
+; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
+%macro SBUTTERFLY3 5
+ punpckl%1 m%2, m%4, m%5
+ punpckh%1 m%3, m%4, m%5
+%endmacro
+
+; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
+; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
+; %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
+%macro SUMSUB_SHPK 7
+ psubd %3, %1, %5 ; { a0 - b0 }[0-3]
+ psubd %4, %2, %6 ; { a0 - b0 }[4-7]
+ paddd %1, %5 ; { a0 + b0 }[0-3]
+ paddd %2, %6 ; { a0 + b0 }[4-7]
+ psrad %1, %7
+ psrad %2, %7
+ psrad %3, %7
+ psrad %4, %7
+ packssdw %1, %2 ; row[0]
+ packssdw %3, %4 ; row[7]
+%endmacro
+
+; %1 = row or col (for rounding variable)
+; %2 = number of bits to shift at the end
+%macro IDCT_1D 2
+ ; a0 = (W4 * row[0]) + (1 << (15 - 1));
+ ; a1 = a0;
+ ; a2 = a0;
+ ; a3 = a0;
+ ; a0 += W2 * row[2];
+ ; a1 += W6 * row[2];
+ ; a2 -= W6 * row[2];
+ ; a3 -= W2 * row[2];
+%ifidn %1, col
+ paddw m10,[pw_88]
+%endif
+%ifidn %1, row
+ paddw m10,[pw_1]
+%endif
+ SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7]
+ pmaddwd m2, m0, [w4_plus_w6]
+ pmaddwd m3, m1, [w4_plus_w6]
+ pmaddwd m4, m0, [w4_min_w6]
+ pmaddwd m5, m1, [w4_min_w6]
+ pmaddwd m6, m0, [w4_min_w2]
+ pmaddwd m7, m1, [w4_min_w2]
+ pmaddwd m0, [w4_plus_w2]
+ pmaddwd m1, [w4_plus_w2]
+
+ ; a0: -1*row[0]-1*row[2]
+ ; a1: -1*row[0]
+ ; a2: -1*row[0]
+ ; a3: -1*row[0]+1*row[2]
+
+ ; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4]
+ ; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
+ ; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
+ ; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4]
+ SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
+ pmaddwd m10, m8, [w4_plus_w6]
+ pmaddwd m11, m9, [w4_plus_w6]
+ paddd m0, m10 ; a0[0-3]
+ paddd m1, m11 ; a0[4-7]
+ pmaddwd m10, m8, [w4_min_w6]
+ pmaddwd m11, m9, [w4_min_w6]
+ paddd m6, m10 ; a3[0-3]
+ paddd m7, m11 ; a3[4-7]
+ pmaddwd m10, m8, [w4_min_w2]
+ pmaddwd m11, m9, [w4_min_w2]
+ pmaddwd m8, [w4_plus_w2]
+ pmaddwd m9, [w4_plus_w2]
+ psubd m4, m10 ; a2[0-3] intermediate
+ psubd m5, m11 ; a2[4-7] intermediate
+ psubd m2, m8 ; a1[0-3] intermediate
+ psubd m3, m9 ; a1[4-7] intermediate
+
+ ; load/store
+ mova [r2+ 0], m0
+ mova [r2+ 32], m2
+ mova [r2+ 64], m4
+ mova [r2+ 96], m6
+ mova m10,[r2+ 16] ; { row[1] }[0-7]
+ mova m8, [r2+ 48] ; { row[3] }[0-7]
+ mova m13,[r2+ 80] ; { row[5] }[0-7]
+ mova m14,[r2+112] ; { row[7] }[0-7]
+ mova [r2+ 16], m1
+ mova [r2+ 48], m3
+ mova [r2+ 80], m5
+ mova [r2+112], m7
+%ifidn %1, row
+ pmullw m10,[r3+ 16]
+ pmullw m8, [r3+ 48]
+ pmullw m13,[r3+ 80]
+ pmullw m14,[r3+112]
+%endif
+
+ ; b0 = MUL(W1, row[1]);
+ ; MAC(b0, W3, row[3]);
+ ; b1 = MUL(W3, row[1]);
+ ; MAC(b1, -W7, row[3]);
+ ; b2 = MUL(W5, row[1]);
+ ; MAC(b2, -W1, row[3]);
+ ; b3 = MUL(W7, row[1]);
+ ; MAC(b3, -W5, row[3]);
+ SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7]
+ pmaddwd m2, m0, [w3_min_w7]
+ pmaddwd m3, m1, [w3_min_w7]
+ pmaddwd m4, m0, [w5_min_w1]
+ pmaddwd m5, m1, [w5_min_w1]
+ pmaddwd m6, m0, [w7_min_w5]
+ pmaddwd m7, m1, [w7_min_w5]
+ pmaddwd m0, [w1_plus_w3]
+ pmaddwd m1, [w1_plus_w3]
+
+ ; b0: +1*row[1]+2*row[3]
+ ; b1: +2*row[1]-1*row[3]
+ ; b2: -1*row[1]-1*row[3]
+ ; b3: +1*row[1]+1*row[3]
+
+ ; MAC(b0, W5, row[5]);
+ ; MAC(b0, W7, row[7]);
+ ; MAC(b1, -W1, row[5]);
+ ; MAC(b1, -W5, row[7]);
+ ; MAC(b2, W7, row[5]);
+ ; MAC(b2, W3, row[7]);
+ ; MAC(b3, W3, row[5]);
+ ; MAC(b3, -W1, row[7]);
+ SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]
+
+ ; b0: -1*row[5]+1*row[7]
+ ; b1: -1*row[5]+1*row[7]
+ ; b2: +1*row[5]+2*row[7]
+ ; b3: +2*row[5]-1*row[7]
+
+ pmaddwd m10, m8, [w1_plus_w5]
+ pmaddwd m11, m9, [w1_plus_w5]
+ pmaddwd m12, m8, [w5_plus_w7]
+ pmaddwd m13, m9, [w5_plus_w7]
+ psubd m2, m10 ; b1[0-3]
+ psubd m3, m11 ; b1[4-7]
+ paddd m0, m12 ; b0[0-3]
+ paddd m1, m13 ; b0[4-7]
+ pmaddwd m12, m8, [w7_plus_w3]
+ pmaddwd m13, m9, [w7_plus_w3]
+ pmaddwd m8, [w3_min_w1]
+ pmaddwd m9, [w3_min_w1]
+ paddd m4, m12 ; b2[0-3]
+ paddd m5, m13 ; b2[4-7]
+ paddd m6, m8 ; b3[0-3]
+ paddd m7, m9 ; b3[4-7]
+
+ ; row[0] = (a0 + b0) >> 15;
+ ; row[7] = (a0 - b0) >> 15;
+ ; row[1] = (a1 + b1) >> 15;
+ ; row[6] = (a1 - b1) >> 15;
+ ; row[2] = (a2 + b2) >> 15;
+ ; row[5] = (a2 - b2) >> 15;
+ ; row[3] = (a3 + b3) >> 15;
+ ; row[4] = (a3 - b3) >> 15;
+ mova m8, [r2+ 0] ; a0[0-3]
+ mova m9, [r2+16] ; a0[4-7]
+ SUMSUB_SHPK m8, m9, m10, m11, m0, m1, %2
+ mova m0, [r2+32] ; a1[0-3]
+ mova m1, [r2+48] ; a1[4-7]
+ SUMSUB_SHPK m0, m1, m9, m11, m2, m3, %2
+ mova m1, [r2+64] ; a2[0-3]
+ mova m2, [r2+80] ; a2[4-7]
+ SUMSUB_SHPK m1, m2, m11, m3, m4, m5, %2
+ mova m2, [r2+96] ; a3[0-3]
+ mova m3, [r2+112] ; a3[4-7]
+ SUMSUB_SHPK m2, m3, m4, m5, m6, m7, %2
+%endmacro
+
+; void prores_idct_put_10_<opt>(uint8_t *pixels, int stride,
+; int16_t *block, const int16_t *qmat);
+%macro idct_put_fn 1
+cglobal prores_idct_put_10, 4, 4, %1
+ movsxd r1, r1d
+ pxor m15, m15 ; zero
+
+ ; for (i = 0; i < 8; i++)
+ ; idctRowCondDC(block + i*8);
+ mova m10,[r2+ 0] ; { row[0] }[0-7]
+ mova m8, [r2+32] ; { row[2] }[0-7]
+ mova m13,[r2+64] ; { row[4] }[0-7]
+ mova m12,[r2+96] ; { row[6] }[0-7]
+
+ pmullw m10,[r3+ 0]
+ pmullw m8, [r3+32]
+ pmullw m13,[r3+64]
+ pmullw m12,[r3+96]
+
+ IDCT_1D row, 15
+
+ ; transpose for second part of IDCT
+ TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
+ mova [r2+ 16], m0
+ mova [r2+ 48], m2
+ mova [r2+ 80], m11
+ mova [r2+112], m10
+ SWAP 8, 10
+ SWAP 1, 8
+ SWAP 4, 13
+ SWAP 9, 12
+
+ ; for (i = 0; i < 8; i++)
+ ; idctSparseColAdd(dest + i, line_size, block + i);
+ IDCT_1D col, 18
+
+ ; clip/store
+ mova m3, [pw_4]
+ mova m5, [pw_1019]
+ pmaxsw m8, m3
+ pmaxsw m0, m3
+ pmaxsw m1, m3
+ pmaxsw m2, m3
+ pmaxsw m4, m3
+ pmaxsw m11, m3
+ pmaxsw m9, m3
+ pmaxsw m10, m3
+ pminsw m8, m5
+ pminsw m0, m5
+ pminsw m1, m5
+ pminsw m2, m5
+ pminsw m4, m5
+ pminsw m11, m5
+ pminsw m9, m5
+ pminsw m10, m5
+
+ lea r2, [r1*3]
+ mova [r0 ], m8
+ mova [r0+r1 ], m0
+ mova [r0+r1*2], m1
+ mova [r0+r2 ], m2
+ lea r0, [r0+r1*4]
+ mova [r0 ], m4
+ mova [r0+r1 ], m11
+ mova [r0+r1*2], m9
+ mova [r0+r2 ], m10
+ RET
+%endmacro
+
+%macro SIGNEXTEND 2-3
+%if cpuflag(sse4) ; dstlow, dsthigh
+ movhlps %2, %1
+ pmovsxwd %1, %1
+ pmovsxwd %2, %2
+%elif cpuflag(sse2) ; dstlow, dsthigh, tmp
+ pxor %3, %3
+ pcmpgtw %3, %1
+ mova %2, %1
+ punpcklwd %1, %3
+ punpckhwd %2, %3
+%endif
+%endmacro
+
+INIT_XMM sse2
+idct_put_fn 16
+INIT_XMM sse4
+idct_put_fn 16
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+idct_put_fn 16
+%endif
+
+%endif
diff --git a/ffmpeg/libavcodec/x86/proresdsp_init.c b/ffmpeg/libavcodec/x86/proresdsp_init.c
new file mode 100644
index 0000000..91ff257
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/proresdsp_init.c
@@ -0,0 +1,57 @@
+/*
+ * Apple ProRes compatible decoder
+ *
+ * Copyright (c) 2010-2011 Maxim Poliakovski
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/proresdsp.h"
+
+void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
+ int16_t *block, const int16_t *qmat);
+void ff_prores_idct_put_10_sse4(uint16_t *dst, int linesize,
+ int16_t *block, const int16_t *qmat);
+void ff_prores_idct_put_10_avx (uint16_t *dst, int linesize,
+ int16_t *block, const int16_t *qmat);
+
+void ff_proresdsp_x86_init(ProresDSPContext *dsp, AVCodecContext *avctx)
+{
+#if ARCH_X86_64
+ int flags = av_get_cpu_flags();
+
+ if(avctx->flags & CODEC_FLAG_BITEXACT)
+ return;
+
+ if (EXTERNAL_SSE2(flags)) {
+ dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
+ dsp->idct_put = ff_prores_idct_put_10_sse2;
+ }
+
+ if (EXTERNAL_SSE4(flags)) {
+ dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
+ dsp->idct_put = ff_prores_idct_put_10_sse4;
+ }
+
+ if (EXTERNAL_AVX(flags)) {
+ dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
+ dsp->idct_put = ff_prores_idct_put_10_avx;
+ }
+#endif /* ARCH_X86_64 */
+}
diff --git a/ffmpeg/libavcodec/x86/qpelbase.asm b/ffmpeg/libavcodec/x86/qpelbase.asm
new file mode 100644
index 0000000..c2ffb86
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/qpelbase.asm
@@ -0,0 +1,176 @@
+;******************************************************************************
+;* MMX optimized DSP utils
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro op_avgh 3
+ movh %3, %2
+ pavgb %1, %3
+ movh %2, %1
+%endmacro
+
+%macro op_avg 2
+ pavgb %1, %2
+ mova %2, %1
+%endmacro
+
+%macro op_puth 2-3
+ movh %2, %1
+%endmacro
+
+%macro op_put 2
+ mova %2, %1
+%endmacro
+
+; void pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+%macro PIXELS4_L2 1
+%define OP op_%1h
+cglobal %1_pixels4_l2, 6,6
+ movsxdifnidn r3, r3d
+ movsxdifnidn r4, r4d
+ test r5d, 1
+ je .loop
+ movd m0, [r1]
+ movd m1, [r2]
+ add r1, r4
+ add r2, 4
+ pavgb m0, m1
+ OP m0, [r0], m3
+ add r0, r3
+ dec r5d
+.loop:
+ mova m0, [r1]
+ mova m1, [r1+r4]
+ lea r1, [r1+2*r4]
+ pavgb m0, [r2]
+ pavgb m1, [r2+4]
+ OP m0, [r0], m3
+ OP m1, [r0+r3], m3
+ lea r0, [r0+2*r3]
+ mova m0, [r1]
+ mova m1, [r1+r4]
+ lea r1, [r1+2*r4]
+ pavgb m0, [r2+8]
+ pavgb m1, [r2+12]
+ OP m0, [r0], m3
+ OP m1, [r0+r3], m3
+ lea r0, [r0+2*r3]
+ add r2, 16
+ sub r5d, 4
+ jne .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PIXELS4_L2 put
+PIXELS4_L2 avg
+
+; void pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+%macro PIXELS8_L2 1
+%define OP op_%1
+cglobal %1_pixels8_l2, 6,6
+ movsxdifnidn r3, r3d
+ movsxdifnidn r4, r4d
+ test r5d, 1
+ je .loop
+ mova m0, [r1]
+ mova m1, [r2]
+ add r1, r4
+ add r2, 8
+ pavgb m0, m1
+ OP m0, [r0]
+ add r0, r3
+ dec r5d
+.loop:
+ mova m0, [r1]
+ mova m1, [r1+r4]
+ lea r1, [r1+2*r4]
+ pavgb m0, [r2]
+ pavgb m1, [r2+8]
+ OP m0, [r0]
+ OP m1, [r0+r3]
+ lea r0, [r0+2*r3]
+ mova m0, [r1]
+ mova m1, [r1+r4]
+ lea r1, [r1+2*r4]
+ pavgb m0, [r2+16]
+ pavgb m1, [r2+24]
+ OP m0, [r0]
+ OP m1, [r0+r3]
+ lea r0, [r0+2*r3]
+ add r2, 32
+ sub r5d, 4
+ jne .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PIXELS8_L2 put
+PIXELS8_L2 avg
+
+; void pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+%macro PIXELS16_L2 1
+%define OP op_%1
+cglobal %1_pixels16_l2, 6,6
+ movsxdifnidn r3, r3d
+ movsxdifnidn r4, r4d
+ test r5d, 1
+ je .loop
+ mova m0, [r1]
+ mova m1, [r1+8]
+ pavgb m0, [r2]
+ pavgb m1, [r2+8]
+ add r1, r4
+ add r2, 16
+ OP m0, [r0]
+ OP m1, [r0+8]
+ add r0, r3
+ dec r5d
+.loop:
+ mova m0, [r1]
+ mova m1, [r1+8]
+ add r1, r4
+ pavgb m0, [r2]
+ pavgb m1, [r2+8]
+ OP m0, [r0]
+ OP m1, [r0+8]
+ add r0, r3
+ mova m0, [r1]
+ mova m1, [r1+8]
+ add r1, r4
+ pavgb m0, [r2+16]
+ pavgb m1, [r2+24]
+ OP m0, [r0]
+ OP m1, [r0+8]
+ add r0, r3
+ add r2, 32
+ sub r5d, 2
+ jne .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PIXELS16_L2 put
+PIXELS16_L2 avg
diff --git a/ffmpeg/libavcodec/x86/rv34dsp.asm b/ffmpeg/libavcodec/x86/rv34dsp.asm
new file mode 100644
index 0000000..4d9c35b
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/rv34dsp.asm
@@ -0,0 +1,196 @@
+;******************************************************************************
+;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
+;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_row_coeffs: times 4 dw 13
+ times 4 dw 17
+ times 4 dw 7
+pd_512: times 2 dd 0x200
+pw_col_coeffs: dw 13, 13, 13, -13
+ dw 17, 7, 7, -17
+ dw 13, -13, 13, 13
+ dw -7, 17, -17, -7
+
+SECTION .text
+
+%macro IDCT_DC_NOROUND 1
+ imul %1, 13*13*3
+ sar %1, 11
+%endmacro
+
+%macro IDCT_DC_ROUND 1
+ imul %1, 13*13
+ add %1, 0x200
+ sar %1, 10
+%endmacro
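+; DC-only inverse transform: the row and column passes each scale the DC
+; coefficient by 13, hence the 13*13 factor above.  Sketch:
+;     rounded:   dc = (13*13 * dc + 0x200) >> 10
+;     noround:   dc = (13*13 * 3 * dc) >> 11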
+
+%macro rv34_idct 1
+cglobal rv34_idct_%1, 1, 2, 0
+ movsx r1, word [r0]
+ IDCT_DC r1
+ movd m0, r1d
+ pshufw m0, m0, 0
+ movq [r0+ 0], m0
+ movq [r0+ 8], m0
+ movq [r0+16], m0
+ movq [r0+24], m0
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+%define IDCT_DC IDCT_DC_ROUND
+rv34_idct dc
+%define IDCT_DC IDCT_DC_NOROUND
+rv34_idct dc_noround
+
+; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
+INIT_MMX mmx
+cglobal rv34_idct_dc_add, 3, 3
+ ; calculate DC
+ IDCT_DC_ROUND r2
+ pxor m1, m1
+ movd m0, r2d
+ psubw m1, m0
+ packuswb m0, m0
+ packuswb m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+ punpcklwd m0, m0
+ punpcklwd m1, m1
+
+ ; add DC
+ lea r2, [r0+r1*2]
+ movh m2, [r0]
+ movh m3, [r0+r1]
+ movh m4, [r2]
+ movh m5, [r2+r1]
+ paddusb m2, m0
+ paddusb m3, m0
+ paddusb m4, m0
+ paddusb m5, m0
+ psubusb m2, m1
+ psubusb m3, m1
+ psubusb m4, m1
+ psubusb m5, m1
+ movh [r0], m2
+ movh [r0+r1], m3
+ movh [r2], m4
+ movh [r2+r1], m5
+ RET
+
+; Load coeffs and perform row transform
+; Output: coeffs in mm[0467], rounder in mm5
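+; Sketch of the per-row arithmetic (matches the inline comments below):
+;     z0 = 13*(b0 + b2)      z1 = 13*(b0 - b2)
+;     z3 = 17*b1 + 7*b3      z2 =  7*b1 - 17*b3
+;     row out = { z0+z3, z1+z2, z1-z2, z0-z3 }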
+%macro ROW_TRANSFORM 1
+ pxor mm7, mm7
+ mova mm0, [%1+ 0*8]
+ mova mm1, [%1+ 1*8]
+ mova mm2, [%1+ 2*8]
+ mova mm3, [%1+ 3*8]
+ mova [%1+ 0*8], mm7
+ mova [%1+ 1*8], mm7
+ mova [%1+ 2*8], mm7
+ mova [%1+ 3*8], mm7
+ mova mm4, mm0
+ mova mm6, [pw_row_coeffs+ 0]
+ paddsw mm0, mm2 ; b0 + b2
+ psubsw mm4, mm2 ; b0 - b2
+ pmullw mm0, mm6 ; *13 = z0
+ pmullw mm4, mm6 ; *13 = z1
+ mova mm5, mm1
+ pmullw mm1, [pw_row_coeffs+ 8] ; b1*17
+ pmullw mm5, [pw_row_coeffs+16] ; b1* 7
+ mova mm7, mm3
+ pmullw mm3, [pw_row_coeffs+ 8] ; b3*17
+ pmullw mm7, [pw_row_coeffs+16] ; b3* 7
+ paddsw mm1, mm7 ; z3 = b1*17 + b3* 7
+ psubsw mm5, mm3 ; z2 = b1* 7 - b3*17
+ mova mm7, mm0
+ mova mm6, mm4
+ paddsw mm0, mm1 ; z0 + z3
+ psubsw mm7, mm1 ; z0 - z3
+ paddsw mm4, mm5 ; z1 + z2
+ psubsw mm6, mm5 ; z1 - z2
+ mova mm5, [pd_512] ; 0x200
+%endmacro
+
+; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
+%macro COL_TRANSFORM 4
+ pshufw mm3, %2, 0xDD ; col. 1,3,1,3
+ pshufw %2, %2, 0x88 ; col. 0,2,0,2
+ pmaddwd %2, %3 ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
+ pmaddwd mm3, %4 ; 17*c1+ 7*c3 | 7*c1-17*c3 = z3 | z2
+ paddd %2, mm5
+ pshufw mm1, %2, 01001110b ; z1 | z0
+ pshufw mm2, mm3, 01001110b ; z2 | z3
+ paddd %2, mm3 ; z0+z3 | z1+z2
+ psubd mm1, mm2 ; z1-z2 | z0-z3
+ movd mm3, %1
+ psrad %2, 10
+ pxor mm2, mm2
+ psrad mm1, 10
+ punpcklbw mm3, mm2
+ packssdw %2, mm1
+ paddw %2, mm3
+ packuswb %2, %2
+ movd %1, %2
+%endmacro
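+; Each COL_TRANSFORM invocation finishes one 4-pixel output row (sketch,
+; matching the inline comments above):
+;     z0 = 13*c0 + 13*c2     z1 = 13*c0 - 13*c2
+;     z3 = 17*c1 +  7*c3     z2 =  7*c1 - 17*c3
+;     out = { z0+z3, z1+z2, z1-z2, z0-z3 }, each rounded as (x + 0x200) >> 10,
+;     added to the existing dst pixels and clipped to 8 bits.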
+INIT_MMX mmxext
+cglobal rv34_idct_add, 3,3,0, d, s, b
+ ROW_TRANSFORM bq
+ COL_TRANSFORM [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]
+ mova mm0, [pw_col_coeffs+ 0]
+ COL_TRANSFORM [dq+sq], mm4, mm0, [pw_col_coeffs+ 8]
+ mova mm4, [pw_col_coeffs+ 8]
+ lea dq, [dq + 2*sq]
+ COL_TRANSFORM [dq], mm6, mm0, mm4
+ COL_TRANSFORM [dq+sq], mm7, mm0, mm4
+ ret
+
+; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
+INIT_XMM sse4
+cglobal rv34_idct_dc_add, 3, 3, 6
+ ; load data
+ IDCT_DC_ROUND r2
+ pxor m1, m1
+
+ ; calculate DC
+ movd m0, r2d
+ lea r2, [r0+r1*2]
+ movd m2, [r0]
+ movd m3, [r0+r1]
+ pshuflw m0, m0, 0
+ movd m4, [r2]
+ movd m5, [r2+r1]
+ punpcklqdq m0, m0
+ punpckldq m2, m3
+ punpckldq m4, m5
+ punpcklbw m2, m1
+ punpcklbw m4, m1
+ paddw m2, m0
+ paddw m4, m0
+ packuswb m2, m4
+ movd [r0], m2
+ pextrd [r0+r1], m2, 1
+ pextrd [r2], m2, 2
+ pextrd [r2+r1], m2, 3
+ RET
diff --git a/ffmpeg/libavcodec/x86/rv34dsp_init.c b/ffmpeg/libavcodec/x86/rv34dsp_init.c
new file mode 100644
index 0000000..a2dea74
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/rv34dsp_init.c
@@ -0,0 +1,45 @@
+/*
+ * RV30/40 MMX/SSE2 optimizations
+ * Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/rv34dsp.h"
+
+void ff_rv34_idct_dc_mmxext(int16_t *block);
+void ff_rv34_idct_dc_noround_mmxext(int16_t *block);
+void ff_rv34_idct_dc_add_mmx(uint8_t *dst, ptrdiff_t stride, int dc);
+void ff_rv34_idct_dc_add_sse4(uint8_t *dst, ptrdiff_t stride, int dc);
+void ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
+
+av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c)
+{
+ int mm_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(mm_flags))
+ c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
+ if (EXTERNAL_MMXEXT(mm_flags)) {
+ c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmxext;
+ c->rv34_idct_add = ff_rv34_idct_add_mmxext;
+ }
+ if (EXTERNAL_SSE4(mm_flags))
+ c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4;
+}
diff --git a/ffmpeg/libavcodec/x86/rv40dsp.asm b/ffmpeg/libavcodec/x86/rv40dsp.asm
new file mode 100644
index 0000000..7ec72be
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/rv40dsp.asm
@@ -0,0 +1,505 @@
+;******************************************************************************
+;* MMX/SSE2-optimized functions for the RV40 decoder
+;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
+;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
+;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+align 16
+pw_1024: times 8 dw 1 << (16 - 6) ; pw_1024
+
+sixtap_filter_hb_m: times 8 db 1, -5
+ times 8 db 52, 20
+ ; multiplied by 2 to have the same shift
+ times 8 db 2, -10
+ times 8 db 40, 40
+ ; back to normal
+ times 8 db 1, -5
+ times 8 db 20, 52
+
+sixtap_filter_v_m: times 8 dw 1
+ times 8 dw -5
+ times 8 dw 52
+ times 8 dw 20
+ ; multiplied by 2 to have the same shift
+ times 8 dw 2
+ times 8 dw -10
+ times 8 dw 40
+ times 8 dw 40
+ ; back to normal
+ times 8 dw 1
+ times 8 dw -5
+ times 8 dw 20
+ times 8 dw 52
+
+%ifdef PIC
+%define sixtap_filter_hw picregq
+%define sixtap_filter_hb picregq
+%define sixtap_filter_v picregq
+%define npicregs 1
+%else
+%define sixtap_filter_hw sixtap_filter_hw_m
+%define sixtap_filter_hb sixtap_filter_hb_m
+%define sixtap_filter_v sixtap_filter_v_m
+%define npicregs 0
+%endif
+
+filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
+
+cextern pw_32
+cextern pw_16
+cextern pw_512
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; subpel MC functions:
+;
+; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
+; uint8_t *src, int srcstride,
+; int len, int m);
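+;
+; Illustrative sketch of one output pixel (first coefficient row of
+; sixtap_filter_v_m shown; the other rows cover the remaining subpel phases):
+;     dst[x] = clip_uint8((1*s[-2] - 5*s[-1] + 52*s[0] + 20*s[1]
+;                          - 5*s[2] + 1*s[3] + 32) >> 6)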
+;----------------------------------------------------------------------
+%macro LOAD 2
+%if WIN64
+ movsxd %1q, %1d
+%endif
+%ifdef PIC
+ add %1q, picregq
+%else
+ add %1q, %2
+%endif
+%endmacro
+
+%macro STORE 3
+%ifidn %3, avg
+ movh %2, [dstq]
+%endif
+ packuswb %1, %1
+%ifidn %3, avg
+%if cpuflag(3dnow)
+ pavgusb %1, %2
+%else
+ pavgb %1, %2
+%endif
+%endif
+ movh [dstq], %1
+%endmacro
+
+%macro FILTER_V 1
+cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
+%ifdef PIC
+ lea picregq, [sixtap_filter_v_m]
+%endif
+ pxor m7, m7
+ LOAD my, sixtap_filter_v
+
+ ; read 5 lines
+ sub srcq, srcstrideq
+ sub srcq, srcstrideq
+ movh m0, [srcq]
+ movh m1, [srcq+srcstrideq]
+ movh m2, [srcq+srcstrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ add srcq, srcstrideq
+ movh m3, [srcq]
+ movh m4, [srcq+srcstrideq]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ punpcklbw m4, m7
+
+%ifdef m8
+ mova m8, [myq+ 0]
+ mova m9, [myq+16]
+ mova m10, [myq+32]
+ mova m11, [myq+48]
+%define COEFF05 m8
+%define COEFF14 m9
+%define COEFF2 m10
+%define COEFF3 m11
+%else
+%define COEFF05 [myq+ 0]
+%define COEFF14 [myq+16]
+%define COEFF2 [myq+32]
+%define COEFF3 [myq+48]
+%endif
+.nextrow:
+ mova m6, m1
+ movh m5, [srcq+2*srcstrideq] ; read new row
+ paddw m6, m4
+ punpcklbw m5, m7
+ pmullw m6, COEFF14
+ paddw m0, m5
+ pmullw m0, COEFF05
+ paddw m6, m0
+ mova m0, m1
+ paddw m6, [pw_32]
+ mova m1, m2
+ pmullw m2, COEFF2
+ paddw m6, m2
+ mova m2, m3
+ pmullw m3, COEFF3
+ paddw m6, m3
+
+ ; round/clip/store
+ mova m3, m4
+ psraw m6, 6
+ mova m4, m5
+ STORE m6, m5, %1
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+%endmacro
+
+%macro FILTER_H 1
+cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
+%ifdef PIC
+ lea picregq, [sixtap_filter_v_m]
+%endif
+ pxor m7, m7
+ LOAD mx, sixtap_filter_v
+ mova m6, [pw_32]
+%ifdef m8
+ mova m8, [mxq+ 0]
+ mova m9, [mxq+16]
+ mova m10, [mxq+32]
+ mova m11, [mxq+48]
+%define COEFF05 m8
+%define COEFF14 m9
+%define COEFF2 m10
+%define COEFF3 m11
+%else
+%define COEFF05 [mxq+ 0]
+%define COEFF14 [mxq+16]
+%define COEFF2 [mxq+32]
+%define COEFF3 [mxq+48]
+%endif
+.nextrow:
+ movq m0, [srcq-2]
+ movq m5, [srcq+3]
+ movq m1, [srcq-1]
+ movq m4, [srcq+2]
+ punpcklbw m0, m7
+ punpcklbw m5, m7
+ punpcklbw m1, m7
+ punpcklbw m4, m7
+ movq m2, [srcq-0]
+ movq m3, [srcq+1]
+ paddw m0, m5
+ paddw m1, m4
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ pmullw m0, COEFF05
+ pmullw m1, COEFF14
+ pmullw m2, COEFF2
+ pmullw m3, COEFF3
+ paddw m0, m6
+ paddw m1, m2
+ paddw m0, m3
+ paddw m0, m1
+ psraw m0, 6
+ STORE m0, m1, %1
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+FILTER_V put
+FILTER_H put
+
+INIT_MMX mmxext
+FILTER_V avg
+FILTER_H avg
+
+INIT_MMX 3dnow
+FILTER_V avg
+FILTER_H avg
+%endif
+
+INIT_XMM sse2
+FILTER_H put
+FILTER_H avg
+FILTER_V put
+FILTER_V avg
+
+%macro FILTER_SSSE3 1
+cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
+%ifdef PIC
+ lea picregq, [sixtap_filter_hb_m]
+%endif
+
+ ; read 5 lines
+ sub srcq, srcstrideq
+ LOAD my, sixtap_filter_hb
+ sub srcq, srcstrideq
+ movh m0, [srcq]
+ movh m1, [srcq+srcstrideq]
+ movh m2, [srcq+srcstrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ add srcq, srcstrideq
+ mova m5, [myq]
+ movh m3, [srcq]
+ movh m4, [srcq+srcstrideq]
+ lea srcq, [srcq+2*srcstrideq]
+
+.nextrow:
+ mova m6, m2
+ punpcklbw m0, m1
+ punpcklbw m6, m3
+ pmaddubsw m0, m5
+ pmaddubsw m6, [myq+16]
+ movh m7, [srcq] ; read new row
+ paddw m6, m0
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ mova m3, m4
+ mova m4, m7
+ punpcklbw m7, m3
+ pmaddubsw m7, m5
+ paddw m6, m7
+ pmulhrsw m6, [pw_512]
+ STORE m6, m7, %1
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+
+cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
+%ifdef PIC
+ lea picregq, [sixtap_filter_hb_m]
+%endif
+ mova m3, [filter_h6_shuf2]
+ mova m4, [filter_h6_shuf3]
+ LOAD mx, sixtap_filter_hb
+ mova m5, [mxq] ; set up 6tap filter in bytes
+ mova m6, [mxq+16]
+ mova m7, [filter_h6_shuf1]
+
+.nextrow:
+ movu m0, [srcq-2]
+ mova m1, m0
+ mova m2, m0
+ pshufb m0, m7
+ pshufb m1, m3
+ pshufb m2, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m6
+ pmaddubsw m2, m5
+ paddw m0, m1
+ paddw m0, m2
+ pmulhrsw m0, [pw_512]
+ STORE m0, m1, %1
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+%endmacro
+
+INIT_XMM ssse3
+FILTER_SSSE3 put
+FILTER_SSSE3 avg
+
+; %1=1 for 5-bit weights, 0 for full 14-bit weights; %2=dst, %3=src1, %4=src2, %5=stride (only for the sse2 8x8 case)
+%macro RV40_WCORE 4-5
+ movh m4, [%3 + r6 + 0]
+ movh m5, [%4 + r6 + 0]
+%if %0 == 4
+%define OFFSET r6 + mmsize / 2
+%else
+ ; 8x8 block and sse2, stride was provided
+%define OFFSET r6
+ add r6, r5
+%endif
+ movh m6, [%3 + OFFSET]
+ movh m7, [%4 + OFFSET]
+
+%if %1 == 0
+ ; 14bits weights
+ punpcklbw m4, m0
+ punpcklbw m5, m0
+ punpcklbw m6, m0
+ punpcklbw m7, m0
+
+ psllw m4, 7
+ psllw m5, 7
+ psllw m6, 7
+ psllw m7, 7
+ pmulhw m4, m3
+ pmulhw m5, m2
+ pmulhw m6, m3
+ pmulhw m7, m2
+
+ paddw m4, m5
+ paddw m6, m7
+%else
+ ; 5bits weights
+%if cpuflag(ssse3)
+ punpcklbw m4, m5
+ punpcklbw m6, m7
+
+ pmaddubsw m4, m3
+ pmaddubsw m6, m3
+%else
+ punpcklbw m4, m0
+ punpcklbw m5, m0
+ punpcklbw m6, m0
+ punpcklbw m7, m0
+
+ pmullw m4, m3
+ pmullw m5, m2
+ pmullw m6, m3
+ pmullw m7, m2
+ paddw m4, m5
+ paddw m6, m7
+%endif
+
+%endif
+
+ ; bias and shift down
+%if cpuflag(ssse3)
+ pmulhrsw m4, m1
+ pmulhrsw m6, m1
+%else
+ paddw m4, m1
+ paddw m6, m1
+ psrlw m4, 5
+ psrlw m6, 5
+%endif
+
+ packuswb m4, m6
+%if %0 == 5
+ ; Only called for 8x8 blocks and sse2
+ sub r6, r5
+ movh [%2 + r6], m4
+ add r6, r5
+ movhps [%2 + r6], m4
+%else
+ mova [%2 + r6], m4
+%endif
+%endmacro
+
+
+%macro MAIN_LOOP 2
+%if mmsize == 8
+ RV40_WCORE %2, r0, r1, r2
+%if %1 == 16
+ RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
+%endif
+
+ ; Prepare for next loop
+ add r6, r5
+%else
+%ifidn %1, 8
+ RV40_WCORE %2, r0, r1, r2, r5
+ ; Prepare 2 next lines
+ add r6, r5
+%else
+ RV40_WCORE %2, r0, r1, r2
+ ; Prepare single next line
+ add r6, r5
+%endif
+%endif
+
+%endmacro
+
+; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
+; %1=size %2=num of xmm regs
+; The weights are FP0.14 notation of fractions depending on pts.
+; For timebases without rounding error (i.e. PAL), the fractions
+; can be simplified, and several operations can be avoided.
+; Therefore, we check here whether they are multiples of 2^9 for
+; those simplifications to occur.
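+; Illustrative sketch of the rounded variant per pixel (full 14-bit weights):
+;     dst[i] = (((w2 * src1[i]) >> 9) + ((w1 * src2[i]) >> 9) + 0x10) >> 5
+; the no-round variant assumes both weights are multiples of 512 and works
+; with the reduced 5-bit weights instead.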
+%macro RV40_WEIGHT 3
+cglobal rv40_weight_func_%1_%2, 6, 7, 8
+%if cpuflag(ssse3)
+ mova m1, [pw_1024]
+%else
+ mova m1, [pw_16]
+%endif
+ pxor m0, m0
+ ; Set loop counter and increments
+ mov r6, r5
+ shl r6, %3
+ add r0, r6
+ add r1, r6
+ add r2, r6
+ neg r6
+
+ movd m2, r3d
+ movd m3, r4d
+%ifidn %1,rnd
+%define RND 0
+ SPLATW m2, m2
+%else
+%define RND 1
+%if cpuflag(ssse3)
+ punpcklbw m3, m2
+%else
+ SPLATW m2, m2
+%endif
+%endif
+ SPLATW m3, m3
+
+.loop:
+ MAIN_LOOP %2, RND
+ jnz .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+RV40_WEIGHT rnd, 8, 3
+RV40_WEIGHT rnd, 16, 4
+RV40_WEIGHT nornd, 8, 3
+RV40_WEIGHT nornd, 16, 4
+
+INIT_XMM sse2
+RV40_WEIGHT rnd, 8, 3
+RV40_WEIGHT rnd, 16, 4
+RV40_WEIGHT nornd, 8, 3
+RV40_WEIGHT nornd, 16, 4
+
+INIT_XMM ssse3
+RV40_WEIGHT rnd, 8, 3
+RV40_WEIGHT rnd, 16, 4
+RV40_WEIGHT nornd, 8, 3
+RV40_WEIGHT nornd, 16, 4
diff --git a/ffmpeg/libavcodec/x86/rv40dsp_init.c b/ffmpeg/libavcodec/x86/rv40dsp_init.c
new file mode 100644
index 0000000..2f97518
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/rv40dsp_init.c
@@ -0,0 +1,243 @@
+/*
+ * RV40 decoder motion compensation functions x86-optimised
+ * Copyright (c) 2008 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * RV40 decoder motion compensation functions x86-optimised
+ * 2,0 and 0,2 have h264 equivalents.
+ * 3,3 is bugged in the rv40 format and maps to _xy2 version
+ */
+
+#include "libavcodec/rv34dsp.h"
+#include "libavutil/attributes.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "dsputil_mmx.h"
+
+#if HAVE_YASM
+void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_rv40_chroma_mc8_mmxext(uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_rv40_chroma_mc8_3dnow(uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+
+void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_rv40_chroma_mc4_mmxext(uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+
+#define DECLARE_WEIGHT(opt) \
+void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+ int w1, int w2, ptrdiff_t stride); \
+void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+ int w1, int w2, ptrdiff_t stride); \
+void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+ int w1, int w2, ptrdiff_t stride); \
+void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+ int w1, int w2, ptrdiff_t stride);
+DECLARE_WEIGHT(mmxext)
+DECLARE_WEIGHT(sse2)
+DECLARE_WEIGHT(ssse3)
+
+/** @{ */
+/**
+ * Define one qpel function.
+ * LOOPSIZE must already be set to the number of pixels processed per
+ * iteration in the inner loop of the called functions.
+ * COFF(x) must already be defined to provide the offset into any
+ * array of coefficients used by the called function for the qpel position x.
+ */
+#define QPEL_FUNC_DECL(OP, SIZE, PH, PV, OPT) \
+static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst, \
+ uint8_t *src, \
+ ptrdiff_t stride) \
+{ \
+ int i; \
+ if (PH && PV) { \
+ DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)]; \
+ uint8_t *tmpptr = tmp + SIZE * 2; \
+ src -= stride * 2; \
+ \
+ for (i = 0; i < SIZE; i += LOOPSIZE) \
+ ff_put_rv40_qpel_h ##OPT(tmp + i, SIZE, src + i, stride, \
+ SIZE + 5, HCOFF(PH)); \
+ for (i = 0; i < SIZE; i += LOOPSIZE) \
+ ff_ ##OP ##rv40_qpel_v ##OPT(dst + i, stride, tmpptr + i, \
+ SIZE, SIZE, VCOFF(PV)); \
+ } else if (PV) { \
+ for (i = 0; i < SIZE; i += LOOPSIZE) \
+ ff_ ##OP ##rv40_qpel_v ## OPT(dst + i, stride, src + i, \
+ stride, SIZE, VCOFF(PV)); \
+ } else { \
+ for (i = 0; i < SIZE; i += LOOPSIZE) \
+ ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i, \
+ stride, SIZE, HCOFF(PH)); \
+ } \
+};
+
+/** Declare functions for sizes 8 and 16 for a given operation
+ * and qpel position. */
+#define QPEL_FUNCS_DECL(OP, PH, PV, OPT) \
+ QPEL_FUNC_DECL(OP, 8, PH, PV, OPT) \
+ QPEL_FUNC_DECL(OP, 16, PH, PV, OPT)
+
+/** Declare all functions for all sizes and qpel positions */
+#define QPEL_MC_DECL(OP, OPT) \
+void ff_ ##OP ##rv40_qpel_h ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
+ const uint8_t *src, \
+ ptrdiff_t srcStride, \
+ int len, int m); \
+void ff_ ##OP ##rv40_qpel_v ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
+ const uint8_t *src, \
+ ptrdiff_t srcStride, \
+ int len, int m); \
+QPEL_FUNCS_DECL(OP, 0, 1, OPT) \
+QPEL_FUNCS_DECL(OP, 0, 3, OPT) \
+QPEL_FUNCS_DECL(OP, 1, 0, OPT) \
+QPEL_FUNCS_DECL(OP, 1, 1, OPT) \
+QPEL_FUNCS_DECL(OP, 1, 2, OPT) \
+QPEL_FUNCS_DECL(OP, 1, 3, OPT) \
+QPEL_FUNCS_DECL(OP, 2, 1, OPT) \
+QPEL_FUNCS_DECL(OP, 2, 2, OPT) \
+QPEL_FUNCS_DECL(OP, 2, 3, OPT) \
+QPEL_FUNCS_DECL(OP, 3, 0, OPT) \
+QPEL_FUNCS_DECL(OP, 3, 1, OPT) \
+QPEL_FUNCS_DECL(OP, 3, 2, OPT)
+/** @} */
+
+#define LOOPSIZE 8
+#define HCOFF(x) (32 * (x - 1))
+#define VCOFF(x) (32 * (x - 1))
+QPEL_MC_DECL(put_, _ssse3)
+QPEL_MC_DECL(avg_, _ssse3)
+
+#undef LOOPSIZE
+#undef HCOFF
+#undef VCOFF
+#define LOOPSIZE 8
+#define HCOFF(x) (64 * (x - 1))
+#define VCOFF(x) (64 * (x - 1))
+QPEL_MC_DECL(put_, _sse2)
+QPEL_MC_DECL(avg_, _sse2)
+
+#if ARCH_X86_32
+#undef LOOPSIZE
+#undef HCOFF
+#undef VCOFF
+#define LOOPSIZE 4
+#define HCOFF(x) (64 * (x - 1))
+#define VCOFF(x) (64 * (x - 1))
+
+QPEL_MC_DECL(put_, _mmx)
+
+#define ff_put_rv40_qpel_h_mmxext ff_put_rv40_qpel_h_mmx
+#define ff_put_rv40_qpel_v_mmxext ff_put_rv40_qpel_v_mmx
+QPEL_MC_DECL(avg_, _mmxext)
+
+#define ff_put_rv40_qpel_h_3dnow ff_put_rv40_qpel_h_mmx
+#define ff_put_rv40_qpel_v_3dnow ff_put_rv40_qpel_v_mmx
+QPEL_MC_DECL(avg_, _3dnow)
+#endif
+
+/** @{ */
+/** Set one function */
+#define QPEL_FUNC_SET(OP, SIZE, PH, PV, OPT) \
+ c-> OP ## pixels_tab[2 - SIZE / 8][4 * PV + PH] = OP ## rv40_qpel ##SIZE ## _mc ##PH ##PV ##OPT;
+
+/** Set functions for sizes 8 and 16 for a given operation and qpel position */
+#define QPEL_FUNCS_SET(OP, PH, PV, OPT) \
+ QPEL_FUNC_SET(OP, 8, PH, PV, OPT) \
+ QPEL_FUNC_SET(OP, 16, PH, PV, OPT)
+
+/** Set all functions for all sizes and qpel positions */
+#define QPEL_MC_SET(OP, OPT) \
+QPEL_FUNCS_SET (OP, 0, 1, OPT) \
+QPEL_FUNCS_SET (OP, 0, 3, OPT) \
+QPEL_FUNCS_SET (OP, 1, 0, OPT) \
+QPEL_FUNCS_SET (OP, 1, 1, OPT) \
+QPEL_FUNCS_SET (OP, 1, 2, OPT) \
+QPEL_FUNCS_SET (OP, 1, 3, OPT) \
+QPEL_FUNCS_SET (OP, 2, 1, OPT) \
+QPEL_FUNCS_SET (OP, 2, 2, OPT) \
+QPEL_FUNCS_SET (OP, 2, 3, OPT) \
+QPEL_FUNCS_SET (OP, 3, 0, OPT) \
+QPEL_FUNCS_SET (OP, 3, 1, OPT) \
+QPEL_FUNCS_SET (OP, 3, 2, OPT)
+/** @} */
+
+#endif /* HAVE_YASM */
+
+av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
+{
+#if HAVE_YASM
+ int mm_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(mm_flags)) {
+ c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
+ c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
+#if HAVE_MMX_INLINE
+ c->put_pixels_tab[0][15] = ff_put_rv40_qpel16_mc33_mmx;
+ c->put_pixels_tab[1][15] = ff_put_rv40_qpel8_mc33_mmx;
+ c->avg_pixels_tab[0][15] = ff_avg_rv40_qpel16_mc33_mmx;
+ c->avg_pixels_tab[1][15] = ff_avg_rv40_qpel8_mc33_mmx;
+#endif /* HAVE_MMX_INLINE */
+#if ARCH_X86_32
+ QPEL_MC_SET(put_, _mmx)
+#endif
+ }
+ if (EXTERNAL_MMXEXT(mm_flags)) {
+ c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmxext;
+ c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmxext;
+ c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext;
+ c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmxext;
+ c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmxext;
+ c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmxext;
+#if ARCH_X86_32
+ QPEL_MC_SET(avg_, _mmxext)
+#endif
+ } else if (EXTERNAL_AMD3DNOW(mm_flags)) {
+ c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow;
+ c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
+#if ARCH_X86_32
+ QPEL_MC_SET(avg_, _3dnow)
+#endif
+ }
+ if (EXTERNAL_SSE2(mm_flags)) {
+ c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
+ c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
+ c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
+ c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
+ QPEL_MC_SET(put_, _sse2)
+ QPEL_MC_SET(avg_, _sse2)
+ }
+ if (EXTERNAL_SSSE3(mm_flags)) {
+ c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
+ c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
+ c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
+ c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
+ QPEL_MC_SET(put_, _ssse3)
+ QPEL_MC_SET(avg_, _ssse3)
+ }
+#endif /* HAVE_YASM */
+}
diff --git a/ffmpeg/libavcodec/x86/sbrdsp.asm b/ffmpeg/libavcodec/x86/sbrdsp.asm
new file mode 100644
index 0000000..1b7f3a8
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/sbrdsp.asm
@@ -0,0 +1,222 @@
+;******************************************************************************
+;* AAC Spectral Band Replication decoding functions
+;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+; sign mask equivalent to multiplying by -1.0, 1.0
+ps_mask times 2 dd 1<<31, 0
+ps_neg times 4 dd 1<<31
+
+SECTION_TEXT
+
+INIT_XMM sse
+cglobal sbr_sum_square, 2, 3, 6
+ mov r2, r1
+ xorps m0, m0
+ xorps m1, m1
+ sar r2, 3
+ jz .prepare
+.loop:
+ movu m2, [r0 + 0]
+ movu m3, [r0 + 16]
+ movu m4, [r0 + 32]
+ movu m5, [r0 + 48]
+ mulps m2, m2
+ mulps m3, m3
+ mulps m4, m4
+ mulps m5, m5
+ addps m0, m2
+ addps m1, m3
+ addps m0, m4
+ addps m1, m5
+ add r0, 64
+ dec r2
+ jnz .loop
+.prepare:
+ and r1, 7
+ sar r1, 1
+ jz .end
+; len is a multiple of 2, so the remainder is processed 4 floats at a time
+.endloop:
+ movu m2, [r0]
+ add r0, 16
+ mulps m2, m2
+ dec r1
+ addps m0, m2
+ jnz .endloop
+.end:
+ addps m0, m1
+ movhlps m2, m0
+ addps m0, m2
+ movss m1, m0
+ shufps m0, m0, 1
+ addss m0, m1
+%if ARCH_X86_64 == 0
+ movss r0m, m0
+ fld dword r0m
+%endif
+ RET
+
+%define STEP 40*4*2
+cglobal sbr_hf_g_filt, 5, 6, 5
+ lea r1, [r1 + 8*r4] ; offset by ixh elements into X_high
+ mov r5, r3
+ and r3, 0xFC
+ lea r2, [r2 + r3*4]
+ lea r0, [r0 + r3*8]
+ neg r3
+ jz .loop1
+.loop4:
+ movlps m0, [r2 + 4*r3 + 0]
+ movlps m1, [r2 + 4*r3 + 8]
+ movlps m2, [r1 + 0*STEP]
+ movlps m3, [r1 + 2*STEP]
+ movhps m2, [r1 + 1*STEP]
+ movhps m3, [r1 + 3*STEP]
+ unpcklps m0, m0
+ unpcklps m1, m1
+ mulps m0, m2
+ mulps m1, m3
+ movu [r0 + 8*r3 + 0], m0
+ movu [r0 + 8*r3 + 16], m1
+ add r1, 4*STEP
+ add r3, 4
+ jnz .loop4
+ and r5, 3 ; number of single element loops
+ jz .end
+.loop1: ; element 0 and 1 can be computed at the same time
+ movss m0, [r2]
+ movlps m2, [r1]
+ unpcklps m0, m0
+ mulps m2, m0
+ movlps [r0], m2
+ add r0, 8
+ add r2, 4
+ add r1, STEP
+ dec r5
+ jnz .loop1
+.end:
+ RET
+
+; static void sbr_hf_gen_c(float (*X_high)[2], const float (*X_low)[2],
+; const float alpha0[2], const float alpha1[2],
+; float bw, int start, int end)
+;
+cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
+ ; load alpha factors
+%define bw m0
+%if ARCH_X86_64 == 0 || WIN64
+ movss bw, BWm
+%endif
+ movlps m2, [alpha1q]
+ movlps m1, [alpha0q]
+ shufps bw, bw, 0
+ mulps m2, bw ; (a1[0] a1[1])*bw
+ mulps m1, bw ; (a0[0] a0[1])*bw = (a2 a3)
+ mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)
+ mova m3, m1
+ mova m4, m2
+
+ ; Set pointers
+%if ARCH_X86_64 == 0 || WIN64
+ ; start and end are the 6th and 7th args, passed on the stack
+ mov r2d, Sm
+ mov r3d, Em
+%define start r2q
+%define end r3q
+%else
+; BW does not actually occupy a register, so shift by 1
+%define start BWq
+%define end Sq
+%endif
+ sub start, end ; neg num of loops
+ lea X_highq, [X_highq + end*2*4]
+ lea X_lowq, [X_lowq + end*2*4 - 2*2*4]
+ shl start, 3 ; offset from num loops
+
+ mova m0, [X_lowq + start]
+ shufps m3, m3, q1111
+ shufps m4, m4, q1111
+ xorps m3, [ps_mask]
+ shufps m1, m1, q0000
+ shufps m2, m2, q0000
+ xorps m4, [ps_mask]
+.loop2:
+ movu m7, [X_lowq + start + 8] ; BbCc
+ mova m6, m0
+ mova m5, m7
+ shufps m0, m0, q2301 ; aAbB
+ shufps m7, m7, q2301 ; bBcC
+ mulps m0, m4
+ mulps m7, m3
+ mulps m6, m2
+ mulps m5, m1
+ addps m7, m0
+ mova m0, [X_lowq + start +16] ; CcDd
+ addps m7, m0
+ addps m6, m5
+ addps m7, m6
+ mova [X_highq + start], m7
+ add start, 16
+ jnz .loop2
+ RET
+
+cglobal sbr_sum64x5, 1,2,4,z
+ lea r1q, [zq+ 256]
+.loop:
+ mova m0, [zq+ 0]
+ mova m2, [zq+ 16]
+ mova m1, [zq+ 256]
+ mova m3, [zq+ 272]
+ addps m0, [zq+ 512]
+ addps m2, [zq+ 528]
+ addps m1, [zq+ 768]
+ addps m3, [zq+ 784]
+ addps m0, [zq+1024]
+ addps m2, [zq+1040]
+ addps m0, m1
+ addps m2, m3
+ mova [zq], m0
+ mova [zq+16], m2
+ add zq, 32
+ cmp zq, r1q
+ jne .loop
+ REP_RET
+
+INIT_XMM sse
+cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
+ lea r2q, [zq + (64-4)*4]
+ mova m3, [ps_neg]
+.loop:
+ mova m1, [zq]
+ xorps m0, m3, [r2q]
+ shufps m0, m0, m0, q0123
+ unpcklps m2, m0, m1
+ unpckhps m0, m0, m1
+ mova [Wq + 0], m2
+ mova [Wq + 16], m0
+ add Wq, 32
+ sub r2q, 16
+ add zq, 16
+ cmp zq, r2q
+ jl .loop
+ REP_RET
diff --git a/ffmpeg/libavcodec/x86/sbrdsp_init.c b/ffmpeg/libavcodec/x86/sbrdsp_init.c
new file mode 100644
index 0000000..27fade1
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/sbrdsp_init.c
@@ -0,0 +1,48 @@
+/*
+ * AAC Spectral Band Replication decoding functions
+ * Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/sbrdsp.h"
+
+float ff_sbr_sum_square_sse(float (*x)[2], int n);
+void ff_sbr_sum64x5_sse(float *z);
+void ff_sbr_hf_g_filt_sse(float (*Y)[2], const float (*X_high)[40][2],
+ const float *g_filt, int m_max, intptr_t ixh);
+void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
+ const float alpha0[2], const float alpha1[2],
+ float bw, int start, int end);
+void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z);
+
+av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
+{
+ int mm_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSE(mm_flags)) {
+ s->sum_square = ff_sbr_sum_square_sse;
+ s->sum64x5 = ff_sbr_sum64x5_sse;
+ s->hf_g_filt = ff_sbr_hf_g_filt_sse;
+ s->hf_gen = ff_sbr_hf_gen_sse;
+ s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse;
+ }
+}
diff --git a/ffmpeg/libavcodec/x86/simple_idct.c b/ffmpeg/libavcodec/x86/simple_idct.c
new file mode 100644
index 0000000..f27d2b9
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/simple_idct.c
@@ -0,0 +1,1167 @@
+/*
+ * Simple IDCT MMX
+ *
+ * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "libavcodec/simple_idct.h"
+#include "libavutil/mem.h"
+#include "dsputil_mmx.h"
+
+#if HAVE_INLINE_ASM
+
+/*
+23170.475006
+22725.260826
+21406.727617
+19265.545870
+16384.000000
+12872.826198
+8866.956905
+4520.335430
+*/
+#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
+#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+
+#define ROW_SHIFT 11
+#define COL_SHIFT 20 // 6
+
+DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
+DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
+
+DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
+ 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
+// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
+// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
+ 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
+ // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
+// 0, 0, 0, 0,
+// 0, 0, 0, 0,
+
+ C4, C4, C4, C4,
+ C4, -C4, C4, -C4,
+
+ C2, C6, C2, C6,
+ C6, -C2, C6, -C2,
+
+ C1, C3, C1, C3,
+ C5, C7, C5, C7,
+
+ C3, -C7, C3, -C7,
+-C1, -C5, -C1, -C5,
+
+ C5, -C1, C5, -C1,
+ C7, C3, C7, C3,
+
+ C7, -C5, C7, -C5,
+ C3, -C1, C3, -C1
+};
+
+static inline void idct(int16_t *block)
+{
+ DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
+ int16_t * const temp= (int16_t*)align_tmp;
+
+ __asm__ volatile(
+#if 0 //Alternative, simpler variant
+
+#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ #rounder ", %%mm4 \n\t"\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ #rounder ", %%mm0 \n\t"\
+ "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
+ "paddd %%mm0, %%mm0 \n\t" \
+ "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
+ "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
+ "movq %%mm7, " #dst " \n\t"\
+ "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "movq %%mm2, 24+" #dst " \n\t"\
+ "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
+ "movq %%mm2, 8+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
+ "movq %%mm4, 16+" #dst " \n\t"\
+
+#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
+ "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
+ "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm7, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm2, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm2, 32+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "movd %%mm4, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"\
+
+
+#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq "MANGLE(wm1010)", %%mm4 \n\t"\
+ "pand %%mm0, %%mm4 \n\t"\
+ "por %%mm1, %%mm4 \n\t"\
+ "por %%mm2, %%mm4 \n\t"\
+ "por %%mm3, %%mm4 \n\t"\
+ "packssdw %%mm4,%%mm4 \n\t"\
+ "movd %%mm4, %%eax \n\t"\
+ "orl %%eax, %%eax \n\t"\
+ "jz 1f \n\t"\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ #rounder ", %%mm4 \n\t"\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ #rounder ", %%mm0 \n\t"\
+ "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
+ "paddd %%mm0, %%mm0 \n\t" \
+ "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
+ "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
+ "movq %%mm7, " #dst " \n\t"\
+ "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "movq %%mm2, 24+" #dst " \n\t"\
+ "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
+ "movq %%mm2, 8+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
+ "movq %%mm4, 16+" #dst " \n\t"\
+ "jmp 2f \n\t"\
+ "1: \n\t"\
+ "pslld $16, %%mm0 \n\t"\
+ "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
+ "psrad $13, %%mm0 \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t"\
+ "movq %%mm0, " #dst " \n\t"\
+ "movq %%mm0, 8+" #dst " \n\t"\
+ "movq %%mm0, 16+" #dst " \n\t"\
+ "movq %%mm0, 24+" #dst " \n\t"\
+ "2: \n\t"
+
+
+//IDCT( src0, src4, src1, src5, dst, rounder, shift)
+ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
+/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
+ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
+ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
+
+DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
+DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
+DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
+
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+
+#else
+
+#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq "MANGLE(wm1010)", %%mm4 \n\t"\
+ "pand %%mm0, %%mm4 \n\t"\
+ "por %%mm1, %%mm4 \n\t"\
+ "por %%mm2, %%mm4 \n\t"\
+ "por %%mm3, %%mm4 \n\t"\
+ "packssdw %%mm4,%%mm4 \n\t"\
+ "movd %%mm4, %%eax \n\t"\
+ "orl %%eax, %%eax \n\t"\
+ "jz 1f \n\t"\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ #rounder ", %%mm4 \n\t"\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ #rounder ", %%mm0 \n\t"\
+ "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
+ "paddd %%mm0, %%mm0 \n\t" \
+ "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
+ "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
+ "movq %%mm7, " #dst " \n\t"\
+ "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "movq %%mm2, 24+" #dst " \n\t"\
+ "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
+ "movq %%mm2, 8+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
+ "movq %%mm4, 16+" #dst " \n\t"\
+ "jmp 2f \n\t"\
+ "1: \n\t"\
+ "pslld $16, %%mm0 \n\t"\
+ "paddd "MANGLE(d40000)", %%mm0 \n\t"\
+ "psrad $13, %%mm0 \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t"\
+ "movq %%mm0, " #dst " \n\t"\
+ "movq %%mm0, 8+" #dst " \n\t"\
+ "movq %%mm0, 16+" #dst " \n\t"\
+ "movq %%mm0, 24+" #dst " \n\t"\
+ "2: \n\t"
+
+#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq %%mm0, %%mm4 \n\t"\
+ "por %%mm1, %%mm4 \n\t"\
+ "por %%mm2, %%mm4 \n\t"\
+ "por %%mm3, %%mm4 \n\t"\
+ "packssdw %%mm4,%%mm4 \n\t"\
+ "movd %%mm4, %%eax \n\t"\
+ "orl %%eax, %%eax \n\t"\
+ "jz " #bt " \n\t"\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ #rounder ", %%mm4 \n\t"\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ #rounder ", %%mm0 \n\t"\
+ "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
+ "paddd %%mm0, %%mm0 \n\t" \
+ "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
+ "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
+ "movq %%mm7, " #dst " \n\t"\
+ "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "movq %%mm2, 24+" #dst " \n\t"\
+ "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
+ "movq %%mm2, 8+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
+ "movq %%mm4, 16+" #dst " \n\t"\
+
+#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ #rounder ", %%mm4 \n\t"\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ #rounder ", %%mm0 \n\t"\
+ "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
+ "paddd %%mm0, %%mm0 \n\t" \
+ "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
+ "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
+ "movq %%mm7, " #dst " \n\t"\
+ "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "movq %%mm2, 24+" #dst " \n\t"\
+ "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
+ "movq %%mm2, 8+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
+ "movq %%mm4, 16+" #dst " \n\t"\
+
+//IDCT( src0, src4, src1, src5, dst, rounder, shift)
+DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
+Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
+Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
+Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
+
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
+ "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
+ "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm7, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm2, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm2, 32+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "movd %%mm4, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"
+
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+ "# .p2align 4 \n\t"\
+ "4: \n\t"
+Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
+Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
+
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
+ "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
+ "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm1, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm2, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm2, 32+" #dst " \n\t"\
+ "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "movd %%mm1, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+ "# .p2align 4 \n\t"\
+ "6: \n\t"
+Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
+
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm1, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm2, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm2, 32+" #dst " \n\t"\
+ "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "movd %%mm1, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"
+
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+ "# .p2align 4 \n\t"\
+ "2: \n\t"
+Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
+
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm7, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm2, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm2, 32+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "movd %%mm4, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+ "# .p2align 4 \n\t"\
+ "3: \n\t"
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 64(%2), %%mm3 \n\t"\
+ "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
+ "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm7, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm1, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
+ "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm1, 32+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "movd %%mm4, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"
+
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+ "# .p2align 4 \n\t"\
+ "5: \n\t"
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
+ "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
+ "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
+ "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
+ "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
+ "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
+ "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
+ "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
+ "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
+ "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm3 \n\t"\
+ "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
+ "movq %%mm4, " #dst " \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
+ "movq %%mm0, 16+" #dst " \n\t"\
+ "movq %%mm0, 96+" #dst " \n\t"\
+ "movq %%mm4, 112+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movq %%mm5, 32+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movq %%mm6, 48+" #dst " \n\t"\
+ "movq %%mm6, 64+" #dst " \n\t"\
+ "movq %%mm5, 80+" #dst " \n\t"
+
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+
+ "# .p2align 4 \n\t"\
+ "1: \n\t"
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
+ "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
+ "movq 64(%2), %%mm1 \n\t"\
+ "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm3 \n\t"\
+ "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm7, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm3, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
+ "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm3 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
+ "movd %%mm3, 32+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm4, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"
+
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+
+ "# .p2align 4 \n\t"
+ "7: \n\t"
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
+ "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
+ "movq %%mm4, " #dst " \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
+ "movq %%mm0, 16+" #dst " \n\t"\
+ "movq %%mm0, 96+" #dst " \n\t"\
+ "movq %%mm4, 112+" #dst " \n\t"\
+ "movq %%mm0, 32+" #dst " \n\t"\
+ "movq %%mm4, 48+" #dst " \n\t"\
+ "movq %%mm4, 64+" #dst " \n\t"\
+ "movq %%mm0, 80+" #dst " \n\t"
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+
+
+#endif
+
+/*
+Input
+ 00 40 04 44 20 60 24 64
+ 10 30 14 34 50 70 54 74
+ 01 41 03 43 21 61 23 63
+ 11 31 13 33 51 71 53 73
+ 02 42 06 46 22 62 26 66
+ 12 32 16 36 52 72 56 76
+ 05 45 07 47 25 65 27 67
+ 15 35 17 37 55 75 57 77
+
+Temp
+ 00 04 10 14 20 24 30 34
+ 40 44 50 54 60 64 70 74
+ 01 03 11 13 21 23 31 33
+ 41 43 51 53 61 63 71 73
+ 02 06 12 16 22 26 32 36
+ 42 46 52 56 62 66 72 76
+ 05 07 15 17 25 27 35 37
+ 45 47 55 57 65 67 75 77
+*/
+
+"9: \n\t"
+ :: "r" (block), "r" (temp), "r" (coeffs)
+ : "%eax"
+ );
+}
+
+void ff_simple_idct_mmx(int16_t *block)
+{
+ idct(block);
+}
+
+//FIXME merge add/put into the idct
+
+void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block)
+{
+ idct(block);
+ ff_put_pixels_clamped_mmx(block, dest, line_size);
+}
+void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block)
+{
+ idct(block);
+ ff_add_pixels_clamped_mmx(block, dest, line_size);
+}
+
+#endif /* HAVE_INLINE_ASM */
diff --git a/ffmpeg/libavcodec/x86/snowdsp.c b/ffmpeg/libavcodec/x86/snowdsp.c
new file mode 100644
index 0000000..5505ee8
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/snowdsp.c
@@ -0,0 +1,902 @@
+/*
+ * MMX and SSE2 optimized snow DSP utils
+ * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/snow.h"
+#include "libavcodec/snow_dwt.h"
+#include "dsputil_mmx.h"
+
+#if HAVE_INLINE_ASM
+
+static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){
+ const int w2= (width+1)>>1;
+ const int w_l= (width>>1);
+ const int w_r= w2 - 1;
+ int i;
+
+ { // Lift 0
+ IDWTELEM * const ref = b + w2 - 1;
+ IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
+ // (the first time erroneously), we allow the SSE2 code to run an extra pass.
+ // The savings in code and time are well worth having to store this value and
+ // calculate b[0] correctly afterwards.
+
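+        // For reference, the vectorized loop below roughly corresponds to the
+        // scalar lifting step (cf. the scalar fallback used by the vertical
+        // compose functions further down):
+        //     b[i] -= (W_DM * (ref[i] + ref[i + 1]) + W_DO) >> W_DS;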
+ i = 0;
+ __asm__ volatile(
+ "pcmpeqd %%xmm7, %%xmm7 \n\t"
+ "pcmpeqd %%xmm3, %%xmm3 \n\t"
+ "psllw $1, %%xmm3 \n\t"
+ "paddw %%xmm7, %%xmm3 \n\t"
+ "psllw $13, %%xmm3 \n\t"
+ ::);
+ for(; i<w_l-15; i+=16){
+ __asm__ volatile(
+ "movdqu (%1), %%xmm1 \n\t"
+ "movdqu 16(%1), %%xmm5 \n\t"
+ "movdqu 2(%1), %%xmm2 \n\t"
+ "movdqu 18(%1), %%xmm6 \n\t"
+ "paddw %%xmm1, %%xmm2 \n\t"
+ "paddw %%xmm5, %%xmm6 \n\t"
+ "paddw %%xmm7, %%xmm2 \n\t"
+ "paddw %%xmm7, %%xmm6 \n\t"
+ "pmulhw %%xmm3, %%xmm2 \n\t"
+ "pmulhw %%xmm3, %%xmm6 \n\t"
+ "paddw (%0), %%xmm2 \n\t"
+ "paddw 16(%0), %%xmm6 \n\t"
+ "movdqa %%xmm2, (%0) \n\t"
+ "movdqa %%xmm6, 16(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+ b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+ }
+
+ { // Lift 1
+ IDWTELEM * const dst = b+w2;
+
+ i = 0;
+ for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
+ dst[i] = dst[i] - (b[i] + b[i + 1]);
+ }
+ for(; i<w_r-15; i+=16){
+ __asm__ volatile(
+ "movdqu (%1), %%xmm1 \n\t"
+ "movdqu 16(%1), %%xmm5 \n\t"
+ "movdqu 2(%1), %%xmm2 \n\t"
+ "movdqu 18(%1), %%xmm6 \n\t"
+ "paddw %%xmm1, %%xmm2 \n\t"
+ "paddw %%xmm5, %%xmm6 \n\t"
+ "movdqa (%0), %%xmm0 \n\t"
+ "movdqa 16(%0), %%xmm4 \n\t"
+ "psubw %%xmm2, %%xmm0 \n\t"
+ "psubw %%xmm6, %%xmm4 \n\t"
+ "movdqa %%xmm0, (%0) \n\t"
+ "movdqa %%xmm4, 16(%0) \n\t"
+ :: "r"(&dst[i]), "r"(&b[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+ }
+
+ { // Lift 2
+ IDWTELEM * const ref = b+w2 - 1;
+ IDWTELEM b_0 = b[0];
+
+ i = 0;
+ __asm__ volatile(
+ "psllw $15, %%xmm7 \n\t"
+ "pcmpeqw %%xmm6, %%xmm6 \n\t"
+ "psrlw $13, %%xmm6 \n\t"
+ "paddw %%xmm7, %%xmm6 \n\t"
+ ::);
+ for(; i<w_l-15; i+=16){
+ __asm__ volatile(
+ "movdqu (%1), %%xmm0 \n\t"
+ "movdqu 16(%1), %%xmm4 \n\t"
+ "movdqu 2(%1), %%xmm1 \n\t"
+ "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
+ "paddw %%xmm6, %%xmm0 \n\t"
+ "paddw %%xmm6, %%xmm4 \n\t"
+ "paddw %%xmm7, %%xmm1 \n\t"
+ "paddw %%xmm7, %%xmm5 \n\t"
+ "pavgw %%xmm1, %%xmm0 \n\t"
+ "pavgw %%xmm5, %%xmm4 \n\t"
+ "psubw %%xmm7, %%xmm0 \n\t"
+ "psubw %%xmm7, %%xmm4 \n\t"
+ "psraw $1, %%xmm0 \n\t"
+ "psraw $1, %%xmm4 \n\t"
+ "movdqa (%0), %%xmm1 \n\t"
+ "movdqa 16(%0), %%xmm5 \n\t"
+ "paddw %%xmm1, %%xmm0 \n\t"
+ "paddw %%xmm5, %%xmm4 \n\t"
+ "psraw $2, %%xmm0 \n\t"
+ "psraw $2, %%xmm4 \n\t"
+ "paddw %%xmm1, %%xmm0 \n\t"
+ "paddw %%xmm5, %%xmm4 \n\t"
+ "movdqa %%xmm0, (%0) \n\t"
+ "movdqa %%xmm4, 16(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+ b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
+ }
+
+ { // Lift 3
+ IDWTELEM * const src = b+w2;
+
+ i = 0;
+ for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
+ temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
+ }
+ for(; i<w_r-7; i+=8){
+ __asm__ volatile(
+ "movdqu 2(%1), %%xmm2 \n\t"
+ "movdqu 18(%1), %%xmm6 \n\t"
+ "paddw (%1), %%xmm2 \n\t"
+ "paddw 16(%1), %%xmm6 \n\t"
+ "movdqu (%0), %%xmm0 \n\t"
+ "movdqu 16(%0), %%xmm4 \n\t"
+ "paddw %%xmm2, %%xmm0 \n\t"
+ "paddw %%xmm6, %%xmm4 \n\t"
+ "psraw $1, %%xmm2 \n\t"
+ "psraw $1, %%xmm6 \n\t"
+ "paddw %%xmm0, %%xmm2 \n\t"
+ "paddw %%xmm4, %%xmm6 \n\t"
+ "movdqa %%xmm2, (%2) \n\t"
+ "movdqa %%xmm6, 16(%2) \n\t"
+ :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
+ }
+
+ {
+ snow_interleave_line_header(&i, width, b, temp);
+
+ for (; (i & 0x3E) != 0x3E; i-=2){
+ b[i+1] = temp[i>>1];
+ b[i] = b[i>>1];
+ }
+ for (i-=62; i>=0; i-=64){
+ __asm__ volatile(
+ "movdqa (%1), %%xmm0 \n\t"
+ "movdqa 16(%1), %%xmm2 \n\t"
+ "movdqa 32(%1), %%xmm4 \n\t"
+ "movdqa 48(%1), %%xmm6 \n\t"
+ "movdqa (%1), %%xmm1 \n\t"
+ "movdqa 16(%1), %%xmm3 \n\t"
+ "movdqa 32(%1), %%xmm5 \n\t"
+ "movdqa 48(%1), %%xmm7 \n\t"
+ "punpcklwd (%2), %%xmm0 \n\t"
+ "punpcklwd 16(%2), %%xmm2 \n\t"
+ "punpcklwd 32(%2), %%xmm4 \n\t"
+ "punpcklwd 48(%2), %%xmm6 \n\t"
+ "movdqa %%xmm0, (%0) \n\t"
+ "movdqa %%xmm2, 32(%0) \n\t"
+ "movdqa %%xmm4, 64(%0) \n\t"
+ "movdqa %%xmm6, 96(%0) \n\t"
+ "punpckhwd (%2), %%xmm1 \n\t"
+ "punpckhwd 16(%2), %%xmm3 \n\t"
+ "punpckhwd 32(%2), %%xmm5 \n\t"
+ "punpckhwd 48(%2), %%xmm7 \n\t"
+ "movdqa %%xmm1, 16(%0) \n\t"
+ "movdqa %%xmm3, 48(%0) \n\t"
+ "movdqa %%xmm5, 80(%0) \n\t"
+ "movdqa %%xmm7, 112(%0) \n\t"
+ :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
+ : "memory"
+ );
+ }
+ }
+}
+
+static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
+ const int w2= (width+1)>>1;
+ const int w_l= (width>>1);
+ const int w_r= w2 - 1;
+ int i;
+
+ { // Lift 0
+ IDWTELEM * const ref = b + w2 - 1;
+
+ i = 1;
+ b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+ __asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "pcmpeqw %%mm3, %%mm3 \n\t"
+ "psllw $1, %%mm3 \n\t"
+ "paddw %%mm7, %%mm3 \n\t"
+ "psllw $13, %%mm3 \n\t"
+ ::);
+ for(; i<w_l-7; i+=8){
+ __asm__ volatile(
+ "movq (%1), %%mm2 \n\t"
+ "movq 8(%1), %%mm6 \n\t"
+ "paddw 2(%1), %%mm2 \n\t"
+ "paddw 10(%1), %%mm6 \n\t"
+ "paddw %%mm7, %%mm2 \n\t"
+ "paddw %%mm7, %%mm6 \n\t"
+ "pmulhw %%mm3, %%mm2 \n\t"
+ "pmulhw %%mm3, %%mm6 \n\t"
+ "paddw (%0), %%mm2 \n\t"
+ "paddw 8(%0), %%mm6 \n\t"
+ "movq %%mm2, (%0) \n\t"
+ "movq %%mm6, 8(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+ }
+
+ { // Lift 1
+ IDWTELEM * const dst = b+w2;
+
+ i = 0;
+ for(; i<w_r-7; i+=8){
+ __asm__ volatile(
+ "movq (%1), %%mm2 \n\t"
+ "movq 8(%1), %%mm6 \n\t"
+ "paddw 2(%1), %%mm2 \n\t"
+ "paddw 10(%1), %%mm6 \n\t"
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm4 \n\t"
+ "psubw %%mm2, %%mm0 \n\t"
+ "psubw %%mm6, %%mm4 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm4, 8(%0) \n\t"
+ :: "r"(&dst[i]), "r"(&b[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+ }
+
+ { // Lift 2
+ IDWTELEM * const ref = b+w2 - 1;
+
+ i = 1;
+ b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
+ __asm__ volatile(
+ "psllw $15, %%mm7 \n\t"
+ "pcmpeqw %%mm6, %%mm6 \n\t"
+ "psrlw $13, %%mm6 \n\t"
+ "paddw %%mm7, %%mm6 \n\t"
+ ::);
+ for(; i<w_l-7; i+=8){
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm4 \n\t"
+ "movq 2(%1), %%mm1 \n\t"
+ "movq 10(%1), %%mm5 \n\t"
+ "paddw %%mm6, %%mm0 \n\t"
+ "paddw %%mm6, %%mm4 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+ "paddw %%mm7, %%mm5 \n\t"
+ "pavgw %%mm1, %%mm0 \n\t"
+ "pavgw %%mm5, %%mm4 \n\t"
+ "psubw %%mm7, %%mm0 \n\t"
+ "psubw %%mm7, %%mm4 \n\t"
+ "psraw $1, %%mm0 \n\t"
+ "psraw $1, %%mm4 \n\t"
+ "movq (%0), %%mm1 \n\t"
+ "movq 8(%0), %%mm5 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm5, %%mm4 \n\t"
+ "psraw $2, %%mm0 \n\t"
+ "psraw $2, %%mm4 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm5, %%mm4 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm4, 8(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+ }
+
+ { // Lift 3
+ IDWTELEM * const src = b+w2;
+ i = 0;
+
+ for(; i<w_r-7; i+=8){
+ __asm__ volatile(
+ "movq 2(%1), %%mm2 \n\t"
+ "movq 10(%1), %%mm6 \n\t"
+ "paddw (%1), %%mm2 \n\t"
+ "paddw 8(%1), %%mm6 \n\t"
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm4 \n\t"
+ "paddw %%mm2, %%mm0 \n\t"
+ "paddw %%mm6, %%mm4 \n\t"
+ "psraw $1, %%mm2 \n\t"
+ "psraw $1, %%mm6 \n\t"
+ "paddw %%mm0, %%mm2 \n\t"
+ "paddw %%mm4, %%mm6 \n\t"
+ "movq %%mm2, (%2) \n\t"
+ "movq %%mm6, 8(%2) \n\t"
+ :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
+ }
+
+ {
+ snow_interleave_line_header(&i, width, b, temp);
+
+ for (; (i & 0x1E) != 0x1E; i-=2){
+ b[i+1] = temp[i>>1];
+ b[i] = b[i>>1];
+ }
+ for (i-=30; i>=0; i-=32){
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "movq 16(%1), %%mm4 \n\t"
+ "movq 24(%1), %%mm6 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "movq 8(%1), %%mm3 \n\t"
+ "movq 16(%1), %%mm5 \n\t"
+ "movq 24(%1), %%mm7 \n\t"
+ "punpcklwd (%2), %%mm0 \n\t"
+ "punpcklwd 8(%2), %%mm2 \n\t"
+ "punpcklwd 16(%2), %%mm4 \n\t"
+ "punpcklwd 24(%2), %%mm6 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm2, 16(%0) \n\t"
+ "movq %%mm4, 32(%0) \n\t"
+ "movq %%mm6, 48(%0) \n\t"
+ "punpckhwd (%2), %%mm1 \n\t"
+ "punpckhwd 8(%2), %%mm3 \n\t"
+ "punpckhwd 16(%2), %%mm5 \n\t"
+ "punpckhwd 24(%2), %%mm7 \n\t"
+ "movq %%mm1, 8(%0) \n\t"
+ "movq %%mm3, 24(%0) \n\t"
+ "movq %%mm5, 40(%0) \n\t"
+ "movq %%mm7, 56(%0) \n\t"
+ :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
+ : "memory"
+ );
+ }
+ }
+}
+
+#if HAVE_7REGS
+#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
+ ""op" ("r",%%"REG_d"), %%"t0" \n\t"\
+ ""op" 16("r",%%"REG_d"), %%"t1" \n\t"\
+ ""op" 32("r",%%"REG_d"), %%"t2" \n\t"\
+ ""op" 48("r",%%"REG_d"), %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
+ snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
+ snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "psubw %%"s0", %%"t0" \n\t"\
+ "psubw %%"s1", %%"t1" \n\t"\
+ "psubw %%"s2", %%"t2" \n\t"\
+ "psubw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
+ "movdqa %%"s0", ("w",%%"REG_d") \n\t"\
+ "movdqa %%"s1", 16("w",%%"REG_d") \n\t"\
+ "movdqa %%"s2", 32("w",%%"REG_d") \n\t"\
+ "movdqa %%"s3", 48("w",%%"REG_d") \n\t"
+
+#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
+ "psraw $"n", %%"t0" \n\t"\
+ "psraw $"n", %%"t1" \n\t"\
+ "psraw $"n", %%"t2" \n\t"\
+ "psraw $"n", %%"t3" \n\t"
+
+#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "paddw %%"s0", %%"t0" \n\t"\
+ "paddw %%"s1", %%"t1" \n\t"\
+ "paddw %%"s2", %%"t2" \n\t"\
+ "paddw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "pmulhw %%"s0", %%"t0" \n\t"\
+ "pmulhw %%"s1", %%"t1" \n\t"\
+ "pmulhw %%"s2", %%"t2" \n\t"\
+ "pmulhw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "movdqa %%"s0", %%"t0" \n\t"\
+ "movdqa %%"s1", %%"t1" \n\t"\
+ "movdqa %%"s2", %%"t2" \n\t"\
+ "movdqa %%"s3", %%"t3" \n\t"
+
+static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
+ x86_reg i = width;
+
+ while(i & 0x1F)
+ {
+ i--;
+ b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+ b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+ b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+ b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+ }
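+    /* IDWTELEM is a 16-bit type here, so doubling i below turns the remaining
+     * element count into the byte offset that the asm loop counts down from. */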
+ i+=i;
+
+ __asm__ volatile (
+ "jmp 2f \n\t"
+ "1: \n\t"
+ snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")
+
+
+ "pcmpeqw %%xmm0, %%xmm0 \n\t"
+ "pcmpeqw %%xmm2, %%xmm2 \n\t"
+ "paddw %%xmm2, %%xmm2 \n\t"
+ "paddw %%xmm0, %%xmm2 \n\t"
+ "psllw $13, %%xmm2 \n\t"
+ snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
+
+ "pcmpeqw %%xmm7, %%xmm7 \n\t"
+ "pcmpeqw %%xmm5, %%xmm5 \n\t"
+ "psllw $15, %%xmm7 \n\t"
+ "psrlw $13, %%xmm5 \n\t"
+ "paddw %%xmm7, %%xmm5 \n\t"
+ snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
+ "movq (%2,%%"REG_d"), %%xmm1 \n\t"
+ "movq 8(%2,%%"REG_d"), %%xmm3 \n\t"
+ "paddw %%xmm7, %%xmm1 \n\t"
+ "paddw %%xmm7, %%xmm3 \n\t"
+ "pavgw %%xmm1, %%xmm0 \n\t"
+ "pavgw %%xmm3, %%xmm2 \n\t"
+ "movq 16(%2,%%"REG_d"), %%xmm1 \n\t"
+ "movq 24(%2,%%"REG_d"), %%xmm3 \n\t"
+ "paddw %%xmm7, %%xmm1 \n\t"
+ "paddw %%xmm7, %%xmm3 \n\t"
+ "pavgw %%xmm1, %%xmm4 \n\t"
+ "pavgw %%xmm3, %%xmm6 \n\t"
+ snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
+
+ snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
+
+ "2: \n\t"
+ "sub $64, %%"REG_d" \n\t"
+ "jge 1b \n\t"
+ :"+d"(i)
+ :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
+}
+
+#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
+ ""op" ("r",%%"REG_d"), %%"t0" \n\t"\
+ ""op" 8("r",%%"REG_d"), %%"t1" \n\t"\
+ ""op" 16("r",%%"REG_d"), %%"t2" \n\t"\
+ ""op" 24("r",%%"REG_d"), %%"t3" \n\t"
+
+#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
+ snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
+ snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
+ "movq %%"s0", ("w",%%"REG_d") \n\t"\
+ "movq %%"s1", 8("w",%%"REG_d") \n\t"\
+ "movq %%"s2", 16("w",%%"REG_d") \n\t"\
+ "movq %%"s3", 24("w",%%"REG_d") \n\t"
+
+#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "movq %%"s0", %%"t0" \n\t"\
+ "movq %%"s1", %%"t1" \n\t"\
+ "movq %%"s2", %%"t2" \n\t"\
+ "movq %%"s3", %%"t3" \n\t"
+
+
+static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
+ x86_reg i = width;
+ while(i & 15)
+ {
+ i--;
+ b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+ b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+ b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+ b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+ }
+ i+=i;
+ __asm__ volatile(
+ "jmp 2f \n\t"
+ "1: \n\t"
+
+ snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
+ "pcmpeqw %%mm0, %%mm0 \n\t"
+ "pcmpeqw %%mm2, %%mm2 \n\t"
+ "paddw %%mm2, %%mm2 \n\t"
+ "paddw %%mm0, %%mm2 \n\t"
+ "psllw $13, %%mm2 \n\t"
+ snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "pcmpeqw %%mm5, %%mm5 \n\t"
+ "psllw $15, %%mm7 \n\t"
+ "psrlw $13, %%mm5 \n\t"
+ "paddw %%mm7, %%mm5 \n\t"
+ snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
+ "movq (%2,%%"REG_d"), %%mm1 \n\t"
+ "movq 8(%2,%%"REG_d"), %%mm3 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+ "paddw %%mm7, %%mm3 \n\t"
+ "pavgw %%mm1, %%mm0 \n\t"
+ "pavgw %%mm3, %%mm2 \n\t"
+ "movq 16(%2,%%"REG_d"), %%mm1 \n\t"
+ "movq 24(%2,%%"REG_d"), %%mm3 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+ "paddw %%mm7, %%mm3 \n\t"
+ "pavgw %%mm1, %%mm4 \n\t"
+ "pavgw %%mm3, %%mm6 \n\t"
+ snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
+
+ snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
+
+ "2: \n\t"
+ "sub $32, %%"REG_d" \n\t"
+ "jge 1b \n\t"
+ :"+d"(i)
+ :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
+}
+#endif //HAVE_7REGS
+
+#define snow_inner_add_yblock_sse2_header \
+ IDWTELEM * * dst_array = sb->line + src_y;\
+ x86_reg tmp;\
+ __asm__ volatile(\
+ "mov %7, %%"REG_c" \n\t"\
+ "mov %6, %2 \n\t"\
+ "mov %4, %%"REG_S" \n\t"\
+ "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
+ "pcmpeqd %%xmm3, %%xmm3 \n\t"\
+ "psllw $15, %%xmm3 \n\t"\
+ "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
+ "1: \n\t"\
+ "mov %1, %%"REG_D" \n\t"\
+ "mov (%%"REG_D"), %%"REG_D" \n\t"\
+ "add %3, %%"REG_D" \n\t"
+
+#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
+ "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
+ "movq (%%"REG_d"), %%"out_reg1" \n\t"\
+ "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
+ "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
+ "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
+ "punpcklbw %%xmm7, %%xmm0 \n\t"\
+ "punpcklbw %%xmm7, %%xmm4 \n\t"\
+ "pmullw %%xmm0, %%"out_reg1" \n\t"\
+ "pmullw %%xmm4, %%"out_reg2" \n\t"
+
+#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
+ "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
+ "movq (%%"REG_d"), %%"out_reg1" \n\t"\
+ "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
+ "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
+ "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
+ "punpcklbw %%xmm7, %%xmm0 \n\t"\
+ "punpcklbw %%xmm7, %%xmm4 \n\t"\
+ "pmullw %%xmm0, %%"out_reg1" \n\t"\
+ "pmullw %%xmm4, %%"out_reg2" \n\t"
+
+#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
+ snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
+ "paddusw %%xmm2, %%xmm1 \n\t"\
+ "paddusw %%xmm6, %%xmm5 \n\t"
+
+#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
+ snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
+ "paddusw %%xmm2, %%xmm1 \n\t"\
+ "paddusw %%xmm6, %%xmm5 \n\t"
+
+#define snow_inner_add_yblock_sse2_end_common1\
+ "add $32, %%"REG_S" \n\t"\
+ "add %%"REG_c", %0 \n\t"\
+ "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
+ "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
+ "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
+ "add %%"REG_c", (%%"REG_a") \n\t"
+
+#define snow_inner_add_yblock_sse2_end_common2\
+ "jnz 1b \n\t"\
+ :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
+ :\
+ "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
+ "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+
+#define snow_inner_add_yblock_sse2_end_8\
+ "sal $1, %%"REG_c" \n\t"\
+ "add"OPSIZE" $"PTR_SIZE"*2, %1 \n\t"\
+ snow_inner_add_yblock_sse2_end_common1\
+ "sar $1, %%"REG_c" \n\t"\
+ "sub $2, %2 \n\t"\
+ snow_inner_add_yblock_sse2_end_common2
+
+#define snow_inner_add_yblock_sse2_end_16\
+ "add"OPSIZE" $"PTR_SIZE"*1, %1 \n\t"\
+ snow_inner_add_yblock_sse2_end_common1\
+ "dec %2 \n\t"\
+ snow_inner_add_yblock_sse2_end_common2
+
+static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_sse2_header
+snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
+snow_inner_add_yblock_sse2_accum_8("2", "8")
+snow_inner_add_yblock_sse2_accum_8("1", "128")
+snow_inner_add_yblock_sse2_accum_8("0", "136")
+
+ "mov %0, %%"REG_d" \n\t"
+ "movdqa (%%"REG_D"), %%xmm0 \n\t"
+ "movdqa %%xmm1, %%xmm2 \n\t"
+
+ "punpckhwd %%xmm7, %%xmm1 \n\t"
+ "punpcklwd %%xmm7, %%xmm2 \n\t"
+ "paddd %%xmm2, %%xmm0 \n\t"
+ "movdqa 16(%%"REG_D"), %%xmm2 \n\t"
+ "paddd %%xmm1, %%xmm2 \n\t"
+ "paddd %%xmm3, %%xmm0 \n\t"
+ "paddd %%xmm3, %%xmm2 \n\t"
+
+ "mov %1, %%"REG_D" \n\t"
+ "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
+ "add %3, %%"REG_D" \n\t"
+
+ "movdqa (%%"REG_D"), %%xmm4 \n\t"
+ "movdqa %%xmm5, %%xmm6 \n\t"
+ "punpckhwd %%xmm7, %%xmm5 \n\t"
+ "punpcklwd %%xmm7, %%xmm6 \n\t"
+ "paddd %%xmm6, %%xmm4 \n\t"
+ "movdqa 16(%%"REG_D"), %%xmm6 \n\t"
+ "paddd %%xmm5, %%xmm6 \n\t"
+ "paddd %%xmm3, %%xmm4 \n\t"
+ "paddd %%xmm3, %%xmm6 \n\t"
+
+ "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
+ "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */
+ "packssdw %%xmm2, %%xmm0 \n\t"
+ "packuswb %%xmm7, %%xmm0 \n\t"
+ "movq %%xmm0, (%%"REG_d") \n\t"
+
+ "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
+ "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */
+ "packssdw %%xmm6, %%xmm4 \n\t"
+ "packuswb %%xmm7, %%xmm4 \n\t"
+ "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
+snow_inner_add_yblock_sse2_end_8
+}
+
+static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_sse2_header
+snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
+snow_inner_add_yblock_sse2_accum_16("2", "16")
+snow_inner_add_yblock_sse2_accum_16("1", "512")
+snow_inner_add_yblock_sse2_accum_16("0", "528")
+
+ "mov %0, %%"REG_d" \n\t"
+ "psrlw $4, %%xmm1 \n\t"
+ "psrlw $4, %%xmm5 \n\t"
+ "paddw (%%"REG_D"), %%xmm1 \n\t"
+ "paddw 16(%%"REG_D"), %%xmm5 \n\t"
+ "paddw %%xmm3, %%xmm1 \n\t"
+ "paddw %%xmm3, %%xmm5 \n\t"
+ "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */
+ "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */
+ "packuswb %%xmm5, %%xmm1 \n\t"
+
+ "movdqu %%xmm1, (%%"REG_d") \n\t"
+
+snow_inner_add_yblock_sse2_end_16
+}
+
+#define snow_inner_add_yblock_mmx_header \
+ IDWTELEM * * dst_array = sb->line + src_y;\
+ x86_reg tmp;\
+ __asm__ volatile(\
+ "mov %7, %%"REG_c" \n\t"\
+ "mov %6, %2 \n\t"\
+ "mov %4, %%"REG_S" \n\t"\
+ "pxor %%mm7, %%mm7 \n\t" /* 0 */\
+ "pcmpeqd %%mm3, %%mm3 \n\t"\
+ "psllw $15, %%mm3 \n\t"\
+ "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\
+ "1: \n\t"\
+ "mov %1, %%"REG_D" \n\t"\
+ "mov (%%"REG_D"), %%"REG_D" \n\t"\
+ "add %3, %%"REG_D" \n\t"
+
+#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
+ "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
+ "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
+ "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
+ "punpcklbw %%mm7, %%"out_reg1" \n\t"\
+ "punpcklbw %%mm7, %%"out_reg2" \n\t"\
+ "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\
+ "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ "pmullw %%mm0, %%"out_reg1" \n\t"\
+ "pmullw %%mm4, %%"out_reg2" \n\t"
+
+#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
+ snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
+ "paddusw %%mm2, %%mm1 \n\t"\
+ "paddusw %%mm6, %%mm5 \n\t"
+
+#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
+ "mov %0, %%"REG_d" \n\t"\
+ "psrlw $4, %%mm1 \n\t"\
+ "psrlw $4, %%mm5 \n\t"\
+ "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\
+ "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\
+ "paddw %%mm3, %%mm1 \n\t"\
+ "paddw %%mm3, %%mm5 \n\t"\
+ "psraw $4, %%mm1 \n\t"\
+ "psraw $4, %%mm5 \n\t"\
+ "packuswb %%mm5, %%mm1 \n\t"\
+ "movq %%mm1, "write_offset"(%%"REG_d") \n\t"
+
+#define snow_inner_add_yblock_mmx_end(s_step)\
+ "add $"s_step", %%"REG_S" \n\t"\
+ "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
+ "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
+ "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
+ "add %%"REG_c", (%%"REG_a") \n\t"\
+ "add"OPSIZE " $"PTR_SIZE"*1, %1 \n\t"\
+ "add %%"REG_c", %0 \n\t"\
+ "dec %2 \n\t"\
+ "jnz 1b \n\t"\
+ :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
+ :\
+ "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
+ "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+
+static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_mmx_header
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
+snow_inner_add_yblock_mmx_accum("2", "8", "0")
+snow_inner_add_yblock_mmx_accum("1", "128", "0")
+snow_inner_add_yblock_mmx_accum("0", "136", "0")
+snow_inner_add_yblock_mmx_mix("0", "0")
+snow_inner_add_yblock_mmx_end("16")
+}
+
+static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_mmx_header
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
+snow_inner_add_yblock_mmx_accum("2", "16", "0")
+snow_inner_add_yblock_mmx_accum("1", "512", "0")
+snow_inner_add_yblock_mmx_accum("0", "528", "0")
+snow_inner_add_yblock_mmx_mix("0", "0")
+
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
+snow_inner_add_yblock_mmx_accum("2", "24", "8")
+snow_inner_add_yblock_mmx_accum("1", "520", "8")
+snow_inner_add_yblock_mmx_accum("0", "536", "8")
+snow_inner_add_yblock_mmx_mix("16", "8")
+snow_inner_add_yblock_mmx_end("32")
+}
+
+static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+ int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+
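+    /* Note: the 8-wide even-height SSE2 kernel processes two rows per
+     * iteration (see the "sub $2" in snow_inner_add_yblock_sse2_end_8),
+     * so odd block heights fall back to the MMX variant. */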
+ if (b_w == 16)
+ inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else if (b_w == 8 && obmc_stride == 16) {
+ if (!(b_h & 1))
+ inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else
+ inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ } else
+ ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}
+
+static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+ int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+ if (b_w == 16)
+ inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else if (b_w == 8 && obmc_stride == 16)
+ inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else
+ ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}
+
+#endif /* HAVE_INLINE_ASM */
+
+void ff_dwt_init_x86(SnowDWTContext *c)
+{
+#if HAVE_INLINE_ASM
+ int mm_flags = av_get_cpu_flags();
+
+ if (mm_flags & AV_CPU_FLAG_MMX) {
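+        /* The "& 0" below makes this condition always false, so the SSE2
+         * compose/yblock assignments are currently never selected. */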
+ if(mm_flags & AV_CPU_FLAG_SSE2 & 0){
+ c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
+#if HAVE_7REGS
+ c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
+#endif
+ c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
+ }
+ else{
+ if (mm_flags & AV_CPU_FLAG_MMXEXT) {
+ c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
+#if HAVE_7REGS
+ c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
+#endif
+ }
+ c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
+ }
+ }
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/ffmpeg/libavcodec/x86/v210-init.c b/ffmpeg/libavcodec/x86/v210-init.c
new file mode 100644
index 0000000..02c5eaa
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/v210-init.c
@@ -0,0 +1,48 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavcodec/v210dec.h"
+
+extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+av_cold void v210_x86_init(V210DecContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_YASM
+ if (s->aligned_input) {
+ if (cpu_flags & AV_CPU_FLAG_SSSE3)
+ s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3;
+
+ if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
+ s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
+ }
+ else {
+ if (cpu_flags & AV_CPU_FLAG_SSSE3)
+ s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3;
+
+ if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
+ s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
+ }
+#endif
+}
diff --git a/ffmpeg/libavcodec/x86/v210.asm b/ffmpeg/libavcodec/x86/v210.asm
new file mode 100644
index 0000000..5473126
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/v210.asm
@@ -0,0 +1,88 @@
+;******************************************************************************
+;* V210 SIMD unpack
+;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu>
+;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+v210_mask: times 4 dd 0x3ff
+v210_mult: dw 64,4,64,4,64,4,64,4
+v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
+v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
+
+SECTION .text
+
+%macro v210_planar_unpack 2
+
+; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
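+; For orientation (as implied by the register comments in the loop body),
+; each iteration consumes four 32-bit words, each packing three 10-bit
+; samples in bits 0-9 / 10-19 / 20-29, listed here from the low bits upward:
+;     word 0: u0 y0 v0,  word 1: y1 u1 y2,  word 2: v1 y3 u2,  word 3: y4 v2 y5
+; i.e. six luma and 3+3 chroma samples per 16 bytes of input.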
+cglobal v210_planar_unpack_%1_%2, 5, 5, 7
+ movsxdifnidn r4, r4d
+ lea r1, [r1+2*r4]
+ add r2, r4
+ add r3, r4
+ neg r4
+
+ mova m3, [v210_mult]
+ mova m4, [v210_mask]
+ mova m5, [v210_luma_shuf]
+ mova m6, [v210_chroma_shuf]
+.loop
+%ifidn %1, unaligned
+ movu m0, [r0]
+%else
+ mova m0, [r0]
+%endif
+
+ pmullw m1, m0, m3
+ psrld m0, 10
+ psrlw m1, 6 ; u0 v0 y1 y2 v1 u2 y4 y5
+ pand m0, m4 ; y0 __ u1 __ y3 __ v2 __
+
+ shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __
+ pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __
+ movu [r1+2*r4], m2
+
+ shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __
+ pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __
+ movq [r2+r4], m1
+ movhps [r3+r4], m1
+
+ add r0, mmsize
+ add r4, 6
+ jl .loop
+
+ REP_RET
+%endmacro
+
+INIT_XMM
+v210_planar_unpack unaligned, ssse3
+%if HAVE_AVX_EXTERNAL
+INIT_AVX
+v210_planar_unpack unaligned, avx
+%endif
+
+INIT_XMM
+v210_planar_unpack aligned, ssse3
+%if HAVE_AVX_EXTERNAL
+INIT_AVX
+v210_planar_unpack aligned, avx
+%endif
diff --git a/ffmpeg/libavcodec/x86/vc1dsp.asm b/ffmpeg/libavcodec/x86/vc1dsp.asm
new file mode 100644
index 0000000..546688c
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/vc1dsp.asm
@@ -0,0 +1,317 @@
+;******************************************************************************
+;* VC1 deblocking optimizations
+;* Copyright (c) 2009 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+cextern pw_4
+cextern pw_5
+
+section .text
+
+; size suffix, dst_low, dst_high (src), zero
+; zero-extends one vector from 8 to 16 bits
+%macro UNPACK_8TO16 4
+ mova m%2, m%3
+ punpckh%1 m%3, m%4
+ punpckl%1 m%2, m%4
+%endmacro
+
+%macro STORE_4_WORDS 6
+%if cpuflag(sse4)
+ pextrw %1, %5, %6+0
+ pextrw %2, %5, %6+1
+ pextrw %3, %5, %6+2
+ pextrw %4, %5, %6+3
+%else
+ movd %6d, %5
+%if mmsize==16
+ psrldq %5, 4
+%else
+ psrlq %5, 32
+%endif
+ mov %1, %6w
+ shr %6, 16
+ mov %2, %6w
+ movd %6d, %5
+ mov %3, %6w
+ shr %6, 16
+ mov %4, %6w
+%endif
+%endmacro
+
+; in: p1 p0 q0 q1, clobbers p0
+; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
+%macro VC1_LOOP_FILTER_A0 4
+ psubw %1, %4
+ psubw %2, %3
+ paddw %1, %1
+ pmullw %2, [pw_5]
+ psubw %1, %2
+ paddw %1, [pw_4]
+ psraw %1, 3
+%endmacro
+
+; in: p0 q0 a0 a1 a2
+; m0 m1 m7 m6 m5
+; %1: size
+; out: m0=p0' m1=q0'
+%macro VC1_FILTER 1
+ PABSW m4, m7
+ PABSW m3, m6
+ PABSW m2, m5
+ mova m6, m4
+ pminsw m3, m2
+ pcmpgtw m6, m3 ; if (a2 < a0 || a1 < a0)
+ psubw m3, m4
+ pmullw m3, [pw_5] ; 5*(a3 - a0)
+ PABSW m2, m3
+ psraw m2, 3 ; abs(d/8)
+ pxor m7, m3 ; d_sign ^= a0_sign
+
+ pxor m5, m5
+ movd m3, r2d
+%if %1 > 4
+ punpcklbw m3, m3
+%endif
+ punpcklbw m3, m5
+ pcmpgtw m3, m4 ; if (a0 < pq)
+ pand m6, m3
+
+ mova m3, m0
+ psubw m3, m1
+ PABSW m4, m3
+ psraw m4, 1
+ pxor m3, m7 ; d_sign ^ clip_sign
+ psraw m3, 15
+ pminsw m2, m4 ; min(d, clip)
+ pcmpgtw m4, m5
+ pand m6, m4 ; filt3 (C return value)
+
+; each set of 4 pixels is not filtered if its 3rd pixel is not flagged for filtering
+%if mmsize==16
+ pshuflw m4, m6, 0xaa
+%if %1 > 4
+ pshufhw m4, m4, 0xaa
+%endif
+%else
+ pshufw m4, m6, 0xaa
+%endif
+ pandn m3, m4
+ pand m2, m6
+ pand m3, m2 ; d final
+
+ psraw m7, 15
+ pxor m3, m7
+ psubw m3, m7
+ psubw m0, m3
+ paddw m1, m3
+ packuswb m0, m0
+ packuswb m1, m1
+%endmacro
+
+; 1st param: size of filter
+; 2nd param: mov suffix equivalent to the filter size
+%macro VC1_V_LOOP_FILTER 2
+ pxor m5, m5
+ mov%2 m6, [r4]
+ mov%2 m4, [r4+r1]
+ mov%2 m7, [r4+2*r1]
+ mov%2 m0, [r4+r3]
+ punpcklbw m6, m5
+ punpcklbw m4, m5
+ punpcklbw m7, m5
+ punpcklbw m0, m5
+
+ VC1_LOOP_FILTER_A0 m6, m4, m7, m0
+ mov%2 m1, [r0]
+ mov%2 m2, [r0+r1]
+ punpcklbw m1, m5
+ punpcklbw m2, m5
+ mova m4, m0
+ VC1_LOOP_FILTER_A0 m7, m4, m1, m2
+ mov%2 m3, [r0+2*r1]
+ mov%2 m4, [r0+r3]
+ punpcklbw m3, m5
+ punpcklbw m4, m5
+ mova m5, m1
+ VC1_LOOP_FILTER_A0 m5, m2, m3, m4
+
+ VC1_FILTER %1
+ mov%2 [r4+r3], m0
+ mov%2 [r0], m1
+%endmacro
+
+; 1st param: size of filter
+; NOTE: for UNPACK_8TO16, this many 8-bit values fit in half a register
+; 2nd (optional) param: temp register to use for storing words
+%macro VC1_H_LOOP_FILTER 1-2
+%if %1 == 4
+ movq m0, [r0 -4]
+ movq m1, [r0+ r1-4]
+ movq m2, [r0+2*r1-4]
+ movq m3, [r0+ r3-4]
+ TRANSPOSE4x4B 0, 1, 2, 3, 4
+%else
+ movq m0, [r0 -4]
+ movq m4, [r0+ r1-4]
+ movq m1, [r0+2*r1-4]
+ movq m5, [r0+ r3-4]
+ movq m2, [r4 -4]
+ movq m6, [r4+ r1-4]
+ movq m3, [r4+2*r1-4]
+ movq m7, [r4+ r3-4]
+ punpcklbw m0, m4
+ punpcklbw m1, m5
+ punpcklbw m2, m6
+ punpcklbw m3, m7
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+%endif
+ pxor m5, m5
+
+ UNPACK_8TO16 bw, 6, 0, 5
+ UNPACK_8TO16 bw, 7, 1, 5
+ VC1_LOOP_FILTER_A0 m6, m0, m7, m1
+ UNPACK_8TO16 bw, 4, 2, 5
+ mova m0, m1 ; m0 = p0
+ VC1_LOOP_FILTER_A0 m7, m1, m4, m2
+ UNPACK_8TO16 bw, 1, 3, 5
+ mova m5, m4
+ VC1_LOOP_FILTER_A0 m5, m2, m1, m3
+ SWAP 1, 4 ; m1 = q0
+
+ VC1_FILTER %1
+ punpcklbw m0, m1
+%if %0 > 1
+ STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
+%if %1 > 4
+ psrldq m0, 4
+ STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
+%endif
+%else
+ STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
+ STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
+%endif
+%endmacro
+
+
+%macro START_V_FILTER 0
+ mov r4, r0
+ lea r3, [4*r1]
+ sub r4, r3
+ lea r3, [r1+2*r1]
+ imul r2, 0x01010101
+%endmacro
+
+%macro START_H_FILTER 1
+ lea r3, [r1+2*r1]
+%if %1 > 4
+ lea r4, [r0+4*r1]
+%endif
+ imul r2, 0x01010101
+%endmacro
+
+%macro VC1_LF 0
+cglobal vc1_v_loop_filter_internal
+ VC1_V_LOOP_FILTER 4, d
+ ret
+
+cglobal vc1_h_loop_filter_internal
+ VC1_H_LOOP_FILTER 4, r4
+ ret
+
+; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter4, 3,5,0
+ START_V_FILTER
+ call vc1_v_loop_filter_internal
+ RET
+
+; void ff_vc1_h_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter4, 3,5,0
+ START_H_FILTER 4
+ call vc1_h_loop_filter_internal
+ RET
+
+; void ff_vc1_v_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter8, 3,5,0
+ START_V_FILTER
+ call vc1_v_loop_filter_internal
+ add r4, 4
+ add r0, 4
+ call vc1_v_loop_filter_internal
+ RET
+
+; void ff_vc1_h_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter8, 3,5,0
+ START_H_FILTER 4
+ call vc1_h_loop_filter_internal
+ lea r0, [r0+4*r1]
+ call vc1_h_loop_filter_internal
+ RET
+%endmacro
+
+INIT_MMX mmxext
+VC1_LF
+
+INIT_XMM sse2
+; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter8, 3,5,8
+ START_V_FILTER
+ VC1_V_LOOP_FILTER 8, q
+ RET
+
+; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter8, 3,6,8
+ START_H_FILTER 8
+ VC1_H_LOOP_FILTER 8, r5
+ RET
+
+INIT_MMX ssse3
+; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter4, 3,5,0
+ START_V_FILTER
+ VC1_V_LOOP_FILTER 4, d
+ RET
+
+; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter4, 3,5,0
+ START_H_FILTER 4
+ VC1_H_LOOP_FILTER 4, r4
+ RET
+
+INIT_XMM ssse3
+; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter8, 3,5,8
+ START_V_FILTER
+ VC1_V_LOOP_FILTER 8, q
+ RET
+
+; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter8, 3,6,8
+ START_H_FILTER 8
+ VC1_H_LOOP_FILTER 8, r5
+ RET
+
+INIT_XMM sse4
+; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter8, 3,5,8
+ START_H_FILTER 8
+ VC1_H_LOOP_FILTER 8
+ RET
diff --git a/ffmpeg/libavcodec/x86/vc1dsp.h b/ffmpeg/libavcodec/x86/vc1dsp.h
new file mode 100644
index 0000000..fdd4de1
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/vc1dsp.h
@@ -0,0 +1,29 @@
+/*
+ * VC-1 and WMV3 decoder - X86 DSP init functions
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VC1DSP_H
+#define AVCODEC_X86_VC1DSP_H
+
+#include "libavcodec/vc1dsp.h"
+
+void ff_vc1dsp_init_mmx(VC1DSPContext *dsp);
+void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp);
+
+#endif /* AVCODEC_X86_VC1DSP_H */
diff --git a/ffmpeg/libavcodec/x86/vc1dsp_init.c b/ffmpeg/libavcodec/x86/vc1dsp_init.c
new file mode 100644
index 0000000..228f4dc
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/vc1dsp_init.c
@@ -0,0 +1,132 @@
+/*
+ * VC-1 and WMV3 - DSP functions MMX-optimized
+ * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vc1dsp.h"
+#include "dsputil_mmx.h"
+#include "vc1dsp.h"
+#include "config.h"
+
+#define LOOP_FILTER(EXT) \
+void ff_vc1_v_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
+void ff_vc1_h_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
+void ff_vc1_v_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
+void ff_vc1_h_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
+\
+static void vc1_v_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
+{ \
+ ff_vc1_v_loop_filter8_ ## EXT(src, stride, pq); \
+ ff_vc1_v_loop_filter8_ ## EXT(src+8, stride, pq); \
+} \
+\
+static void vc1_h_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
+{ \
+ ff_vc1_h_loop_filter8_ ## EXT(src, stride, pq); \
+ ff_vc1_h_loop_filter8_ ## EXT(src+8*stride, stride, pq); \
+}
+
+#if HAVE_YASM
+LOOP_FILTER(mmxext)
+LOOP_FILTER(sse2)
+LOOP_FILTER(ssse3)
+
+void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq);
+
+static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
+{
+ ff_vc1_h_loop_filter8_sse4(src, stride, pq);
+ ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
+}
+
+static void avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride, int rnd)
+{
+ ff_avg_pixels8_mmxext(dst, src, stride, 8);
+}
+#endif /* HAVE_YASM */
+
+void ff_put_vc1_chroma_mc8_nornd_mmx (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_vc1_chroma_mc8_nornd_mmxext(uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_vc1_chroma_mc8_nornd_3dnow(uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+
+
+av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
+{
+ int mm_flags = av_get_cpu_flags();
+
+ if (INLINE_MMX(mm_flags))
+ ff_vc1dsp_init_mmx(dsp);
+
+ if (INLINE_MMXEXT(mm_flags))
+ ff_vc1dsp_init_mmxext(dsp);
+
+#define ASSIGN_LF(EXT) \
+ dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_ ## EXT; \
+ dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_ ## EXT; \
+ dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_ ## EXT; \
+ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_ ## EXT; \
+ dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_ ## EXT; \
+ dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_ ## EXT
+
+#if HAVE_YASM
+ if (mm_flags & AV_CPU_FLAG_MMX) {
+ dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx;
+ }
+
+ if (mm_flags & AV_CPU_FLAG_MMXEXT) {
+ ASSIGN_LF(mmxext);
+ dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext;
+
+ dsp->avg_vc1_mspel_pixels_tab[0] = avg_vc1_mspel_mc00_mmxext;
+ } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
+ dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_3dnow;
+ }
+
+ if (mm_flags & AV_CPU_FLAG_SSE2) {
+ dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2;
+ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse2;
+ dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2;
+ dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2;
+ }
+ if (mm_flags & AV_CPU_FLAG_SSSE3) {
+ ASSIGN_LF(ssse3);
+ dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_ssse3;
+ dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_ssse3;
+ }
+ if (mm_flags & AV_CPU_FLAG_SSE4) {
+ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse4;
+ dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse4;
+ }
+#endif /* HAVE_YASM */
+}
diff --git a/ffmpeg/libavcodec/x86/vc1dsp_mmx.c b/ffmpeg/libavcodec/x86/vc1dsp_mmx.c
new file mode 100644
index 0000000..df0385f
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/vc1dsp_mmx.c
@@ -0,0 +1,750 @@
+/*
+ * VC-1 and WMV3 - DSP functions MMX-optimized
+ * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "dsputil_mmx.h"
+#include "libavcodec/vc1dsp.h"
+#include "vc1dsp.h"
+
+#if HAVE_INLINE_ASM
+
+#define OP_PUT(S,D)
+#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
+
+/** Add the rounder in mm7 to mm3 and mm4, then shift both right by SHIFT */
+#define NORMALIZE_MMX(SHIFT) \
+ "paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \
+ "paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \
+ "psraw "SHIFT", %%mm3 \n\t" \
+ "psraw "SHIFT", %%mm4 \n\t"
+
+#define TRANSFER_DO_PACK(OP) \
+ "packuswb %%mm4, %%mm3 \n\t" \
+ OP((%2), %%mm3) \
+ "movq %%mm3, (%2) \n\t"
+
+#define TRANSFER_DONT_PACK(OP) \
+ OP(0(%2), %%mm3) \
+ OP(8(%2), %%mm4) \
+ "movq %%mm3, 0(%2) \n\t" \
+ "movq %%mm4, 8(%2) \n\t"
+
+/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
+#define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t"
+#define DONT_UNPACK(reg)
+
+/** Compute the rounder 32-r or 8-r and unpack it to mm7 */
+#define LOAD_ROUNDER_MMX(ROUND) \
+ "movd "ROUND", %%mm7 \n\t" \
+ "punpcklwd %%mm7, %%mm7 \n\t" \
+ "punpckldq %%mm7, %%mm7 \n\t"
+
+#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
+ "paddw %%mm"#R2", %%mm"#R1" \n\t" \
+ "movd (%0,%3), %%mm"#R0" \n\t" \
+ "pmullw %%mm6, %%mm"#R1" \n\t" \
+ "punpcklbw %%mm0, %%mm"#R0" \n\t" \
+ "movd (%0,%2), %%mm"#R3" \n\t" \
+ "psubw %%mm"#R0", %%mm"#R1" \n\t" \
+ "punpcklbw %%mm0, %%mm"#R3" \n\t" \
+ "paddw %%mm7, %%mm"#R1" \n\t" \
+ "psubw %%mm"#R3", %%mm"#R1" \n\t" \
+ "psraw %4, %%mm"#R1" \n\t" \
+ "movq %%mm"#R1", "#OFF"(%1) \n\t" \
+ "add %2, %0 \n\t"
+
+/** Sacrificing mm6 allows pipelining loads from src */
+static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
+ const uint8_t *src, x86_reg stride,
+ int rnd, int64_t shift)
+{
+ __asm__ volatile(
+ "mov $3, %%"REG_c" \n\t"
+ LOAD_ROUNDER_MMX("%5")
+ "movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
+ "1: \n\t"
+ "movd (%0), %%mm2 \n\t"
+ "add %2, %0 \n\t"
+ "movd (%0), %%mm3 \n\t"
+ "punpcklbw %%mm0, %%mm2 \n\t"
+ "punpcklbw %%mm0, %%mm3 \n\t"
+ SHIFT2_LINE( 0, 1, 2, 3, 4)
+ SHIFT2_LINE( 24, 2, 3, 4, 1)
+ SHIFT2_LINE( 48, 3, 4, 1, 2)
+ SHIFT2_LINE( 72, 4, 1, 2, 3)
+ SHIFT2_LINE( 96, 1, 2, 3, 4)
+ SHIFT2_LINE(120, 2, 3, 4, 1)
+ SHIFT2_LINE(144, 3, 4, 1, 2)
+ SHIFT2_LINE(168, 4, 1, 2, 3)
+ "sub %6, %0 \n\t"
+ "add $8, %1 \n\t"
+ "dec %%"REG_c" \n\t"
+ "jnz 1b \n\t"
+ : "+r"(src), "+r"(dst)
+ : "r"(stride), "r"(-2*stride),
+ "m"(shift), "m"(rnd), "r"(9*stride-4)
+ : "%"REG_c, "memory"
+ );
+}
+
+/**
+ * Data is already unpacked, so some operations can be performed directly
+ * from memory.
+ */
+#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
+static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
+ const int16_t *src, int rnd)\
+{\
+ int h = 8;\
+\
+ src -= 1;\
+ rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
+ __asm__ volatile(\
+ LOAD_ROUNDER_MMX("%4")\
+ "movq "MANGLE(ff_pw_128)", %%mm6\n\t"\
+ "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\
+ "1: \n\t"\
+ "movq 2*0+0(%1), %%mm1 \n\t"\
+ "movq 2*0+8(%1), %%mm2 \n\t"\
+ "movq 2*1+0(%1), %%mm3 \n\t"\
+ "movq 2*1+8(%1), %%mm4 \n\t"\
+ "paddw 2*3+0(%1), %%mm1 \n\t"\
+ "paddw 2*3+8(%1), %%mm2 \n\t"\
+ "paddw 2*2+0(%1), %%mm3 \n\t"\
+ "paddw 2*2+8(%1), %%mm4 \n\t"\
+ "pmullw %%mm5, %%mm3 \n\t"\
+ "pmullw %%mm5, %%mm4 \n\t"\
+ "psubw %%mm1, %%mm3 \n\t"\
+ "psubw %%mm2, %%mm4 \n\t"\
+ NORMALIZE_MMX("$7")\
+ /* Remove bias */\
+ "paddw %%mm6, %%mm3 \n\t"\
+ "paddw %%mm6, %%mm4 \n\t"\
+ TRANSFER_DO_PACK(OP)\
+ "add $24, %1 \n\t"\
+ "add %3, %2 \n\t"\
+ "decl %0 \n\t"\
+ "jnz 1b \n\t"\
+ : "+r"(h), "+r" (src), "+r" (dst)\
+ : "r"(stride), "m"(rnd)\
+ : "memory"\
+ );\
+}
+
+VC1_HOR_16b_SHIFT2(OP_PUT, put_)
+VC1_HOR_16b_SHIFT2(OP_AVG, avg_)
+
+
+/**
+ * Purely vertical or horizontal 1/2 shift interpolation.
+ * Sacrifice mm6 for the *9 factor.
+ */
+#define VC1_SHIFT2(OP, OPNAME)\
+static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
+ x86_reg stride, int rnd, x86_reg offset)\
+{\
+ rnd = 8-rnd;\
+ __asm__ volatile(\
+ "mov $8, %%"REG_c" \n\t"\
+ LOAD_ROUNDER_MMX("%5")\
+ "movq "MANGLE(ff_pw_9)", %%mm6\n\t"\
+ "1: \n\t"\
+ "movd 0(%0 ), %%mm3 \n\t"\
+ "movd 4(%0 ), %%mm4 \n\t"\
+ "movd 0(%0,%2), %%mm1 \n\t"\
+ "movd 4(%0,%2), %%mm2 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm0, %%mm3 \n\t"\
+ "punpcklbw %%mm0, %%mm4 \n\t"\
+ "punpcklbw %%mm0, %%mm1 \n\t"\
+ "punpcklbw %%mm0, %%mm2 \n\t"\
+ "paddw %%mm1, %%mm3 \n\t"\
+ "paddw %%mm2, %%mm4 \n\t"\
+ "movd 0(%0,%3), %%mm1 \n\t"\
+ "movd 4(%0,%3), %%mm2 \n\t"\
+ "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\
+ "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\
+ "punpcklbw %%mm0, %%mm1 \n\t"\
+ "punpcklbw %%mm0, %%mm2 \n\t"\
+ "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\
+ "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\
+ "movd 0(%0,%2), %%mm1 \n\t"\
+ "movd 4(%0,%2), %%mm2 \n\t"\
+ "punpcklbw %%mm0, %%mm1 \n\t"\
+ "punpcklbw %%mm0, %%mm2 \n\t"\
+ "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\
+ "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\
+ NORMALIZE_MMX("$4")\
+ "packuswb %%mm4, %%mm3 \n\t"\
+ OP((%1), %%mm3)\
+ "movq %%mm3, (%1) \n\t"\
+ "add %6, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "dec %%"REG_c" \n\t"\
+ "jnz 1b \n\t"\
+ : "+r"(src), "+r"(dst)\
+ : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
+ "g"(stride-offset)\
+ : "%"REG_c, "memory"\
+ );\
+}
+
+VC1_SHIFT2(OP_PUT, put_)
+VC1_SHIFT2(OP_AVG, avg_)
+
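+/*
+ * Illustrative sketch (not part of the optimized code above): per output
+ * pixel, the put/avg_vc1_shift2_mmx routines compute roughly
+ *
+ *     av_clip_uint8((-p[-d] + 9*p[0] + 9*p[d] - p[2*d] + 8 - rnd) >> 4)
+ *
+ * where d is the stride for a vertical pass and 1 for a horizontal pass.
+ * The 16-bit shift2 variant above uses the same (-1,9,9,-1) taps with a
+ * different normalization. Only the taps, rounder and shift are taken from
+ * the code; the pointer and name conventions in this sketch are illustrative.
+ */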
+/**
+ * Core of the 1/4 and 3/4 shift bicubic interpolation.
+ *
+ * @param UNPACK Macro unpacking arguments from 8 to 16 bits (can be empty).
+ * @param MOVQ "movd 1" or "movq 2", if data read is already unpacked.
+ * @param A1 Address of 1st tap (beware of unpacked/packed).
+ * @param A2 Address of 2nd tap
+ * @param A3 Address of 3rd tap
+ * @param A4 Address of 4th tap
+ */
+#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
+ MOVQ "*0+"A1", %%mm1 \n\t" \
+ MOVQ "*4+"A1", %%mm2 \n\t" \
+ UNPACK("%%mm1") \
+ UNPACK("%%mm2") \
+ "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \
+ "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \
+ MOVQ "*0+"A2", %%mm3 \n\t" \
+ MOVQ "*4+"A2", %%mm4 \n\t" \
+ UNPACK("%%mm3") \
+ UNPACK("%%mm4") \
+ "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
+ "pmullw %%mm6, %%mm4 \n\t" /* *18 */ \
+ "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
+ "psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \
+ MOVQ "*0+"A4", %%mm1 \n\t" \
+ MOVQ "*4+"A4", %%mm2 \n\t" \
+ UNPACK("%%mm1") \
+ UNPACK("%%mm2") \
+ "psllw $2, %%mm1 \n\t" /* 4* */ \
+ "psllw $2, %%mm2 \n\t" /* 4* */ \
+ "psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \
+ "psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \
+ MOVQ "*0+"A3", %%mm1 \n\t" \
+ MOVQ "*4+"A3", %%mm2 \n\t" \
+ UNPACK("%%mm1") \
+ UNPACK("%%mm2") \
+ "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
+ "pmullw %%mm5, %%mm2 \n\t" /* *53 */ \
+ "paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \
+ "paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */
+
+/**
+ * Macro to build the vertical 16bits version of vc1_put_shift[13].
+ * Here, offset=src_stride. Parameters passed A1 to A4 must use
+ * %3 (src_stride) and %4 (3*src_stride).
+ *
+ * @param NAME Either 1 or 3
+ * @see MSPEL_FILTER13_CORE for information on A1->A4
+ */
+#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
+static void \
+vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
+ x86_reg src_stride, \
+ int rnd, int64_t shift) \
+{ \
+ int h = 8; \
+ src -= src_stride; \
+ __asm__ volatile( \
+ LOAD_ROUNDER_MMX("%5") \
+ "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \
+ "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \
+ ".p2align 3 \n\t" \
+ "1: \n\t" \
+ MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
+ NORMALIZE_MMX("%6") \
+ TRANSFER_DONT_PACK(OP_PUT) \
+ /* Last 3 (in fact 4) bytes on the line */ \
+ "movd 8+"A1", %%mm1 \n\t" \
+ DO_UNPACK("%%mm1") \
+ "movq %%mm1, %%mm3 \n\t" \
+ "paddw %%mm1, %%mm1 \n\t" \
+ "paddw %%mm3, %%mm1 \n\t" /* 3* */ \
+ "movd 8+"A2", %%mm3 \n\t" \
+ DO_UNPACK("%%mm3") \
+ "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
+ "psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \
+ "movd 8+"A3", %%mm1 \n\t" \
+ DO_UNPACK("%%mm1") \
+ "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
+ "paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \
+ "movd 8+"A4", %%mm1 \n\t" \
+ DO_UNPACK("%%mm1") \
+ "psllw $2, %%mm1 \n\t" /* 4* */ \
+ "psubw %%mm1, %%mm3 \n\t" \
+ "paddw %%mm7, %%mm3 \n\t" \
+ "psraw %6, %%mm3 \n\t" \
+ "movq %%mm3, 16(%2) \n\t" \
+ "add %3, %1 \n\t" \
+ "add $24, %2 \n\t" \
+ "decl %0 \n\t" \
+ "jnz 1b \n\t" \
+ : "+r"(h), "+r" (src), "+r" (dst) \
+ : "r"(src_stride), "r"(3*src_stride), \
+ "m"(rnd), "m"(shift) \
+ : "memory" \
+ ); \
+}
+
+/**
+ * Macro to build the horizontal 16bits version of vc1_put_shift[13].
+ * Here, offset=16bits, so parameters passed A1 to A4 should be simple.
+ *
+ * @param NAME Either 1 or 3
+ * @see MSPEL_FILTER13_CORE for information on A1->A4
+ */
+#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
+static void \
+OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
+ const int16_t *src, int rnd) \
+{ \
+ int h = 8; \
+ src -= 1; \
+ rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
+ __asm__ volatile( \
+ LOAD_ROUNDER_MMX("%4") \
+ "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
+ "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
+ ".p2align 3 \n\t" \
+ "1: \n\t" \
+ MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
+ NORMALIZE_MMX("$7") \
+ /* Remove bias */ \
+ "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \
+ "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \
+ TRANSFER_DO_PACK(OP) \
+ "add $24, %1 \n\t" \
+ "add %3, %2 \n\t" \
+ "decl %0 \n\t" \
+ "jnz 1b \n\t" \
+ : "+r"(h), "+r" (src), "+r" (dst) \
+ : "r"(stride), "m"(rnd) \
+ : "memory" \
+ ); \
+}
+
+/**
+ * Macro to build the 8bits, any direction, version of vc1_put_shift[13].
+ * Here, offset=src_stride. Parameters passed A1 to A4 must use
+ * %3 (offset) and %4 (3*offset).
+ *
+ * @param NAME Either 1 or 3
+ * @see MSPEL_FILTER13_CORE for information on A1->A4
+ */
+#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
+static void \
+OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
+ x86_reg stride, int rnd, x86_reg offset) \
+{ \
+ int h = 8; \
+ src -= offset; \
+ rnd = 32-rnd; \
+ __asm__ volatile ( \
+ LOAD_ROUNDER_MMX("%6") \
+ "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
+ "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
+ ".p2align 3 \n\t" \
+ "1: \n\t" \
+ MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
+ NORMALIZE_MMX("$6") \
+ TRANSFER_DO_PACK(OP) \
+ "add %5, %1 \n\t" \
+ "add %5, %2 \n\t" \
+ "decl %0 \n\t" \
+ "jnz 1b \n\t" \
+ : "+r"(h), "+r" (src), "+r" (dst) \
+ : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
+ : "memory" \
+ ); \
+}
+
+/** 1/4 shift bicubic interpolation */
+MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
+MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
+MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
+MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
+MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)
+
+/** 3/4 shift bicubic interpolation */
+MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
+MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
+MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
+MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
+MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)
+
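+/*
+ * Illustrative sketch (not part of the optimized code above): the shift1 and
+ * shift3 paths implement the VC-1 bicubic quarter-pel filter. For the 8-bit
+ * shift3 (3/4-pel) case the scalar equivalent is roughly
+ *
+ *     av_clip_uint8((-3*p[-d] + 18*p[0] + 53*p[d] - 4*p[2*d] + 32 - rnd) >> 6)
+ *
+ * and shift1 (1/4-pel) uses the mirrored taps (-4, 53, 18, -3). Only the
+ * coefficients, rounder and shift come from the code above; the pointer
+ * notation is illustrative.
+ */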
+typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
+typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
+typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);
+
+/**
+ * Interpolate fractional pel values by applying proper vertical then
+ * horizontal filter.
+ *
+ * @param dst Destination buffer for interpolated pels.
+ * @param src Source buffer.
+ * @param stride Stride for both src and dst buffers.
+ * @param hmode Horizontal filter (expressed in quarter pixels shift).
+ * @param vmode Vertical filter.
+ * @param rnd Rounding bias.
+ */
+#define VC1_MSPEL_MC(OP)\
+static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
+ int hmode, int vmode, int rnd)\
+{\
+ static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
+ { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
+ static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
+ { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
+ static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
+ { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
+\
+ __asm__ volatile(\
+ "pxor %%mm0, %%mm0 \n\t"\
+ ::: "memory"\
+ );\
+\
+ if (vmode) { /* Vertical filter to apply */\
+ if (hmode) { /* Horizontal filter to apply, output to tmp */\
+ static const int shift_value[] = { 0, 5, 1, 5 };\
+ int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
+ int r;\
+ DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
+\
+ r = (1<<(shift-1)) + rnd-1;\
+ vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
+\
+ vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
+ return;\
+ }\
+ else { /* No horizontal filter, output 8 lines to dst */\
+ vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
+ return;\
+ }\
+ }\
+\
+ /* Horizontal mode with no vertical mode */\
+ vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
+}
+
+VC1_MSPEL_MC(put_)
+VC1_MSPEL_MC(avg_)
+
+/** Macro to ease declaration of the bicubic filter interpolation functions */
+#define DECLARE_FUNCTION(a, b) \
+static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, \
+ const uint8_t *src, \
+ ptrdiff_t stride, \
+ int rnd) \
+{ \
+ put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
+}\
+static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \
+ const uint8_t *src, \
+ ptrdiff_t stride, \
+ int rnd) \
+{ \
+ avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
+}
+
+DECLARE_FUNCTION(0, 1)
+DECLARE_FUNCTION(0, 2)
+DECLARE_FUNCTION(0, 3)
+
+DECLARE_FUNCTION(1, 0)
+DECLARE_FUNCTION(1, 1)
+DECLARE_FUNCTION(1, 2)
+DECLARE_FUNCTION(1, 3)
+
+DECLARE_FUNCTION(2, 0)
+DECLARE_FUNCTION(2, 1)
+DECLARE_FUNCTION(2, 2)
+DECLARE_FUNCTION(2, 3)
+
+DECLARE_FUNCTION(3, 0)
+DECLARE_FUNCTION(3, 1)
+DECLARE_FUNCTION(3, 2)
+DECLARE_FUNCTION(3, 3)
+
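+/*
+ * Illustrative note: in the mspel tables filled in below, the functions are
+ * laid out as index = hmode + 4*vmode, so e.g. put_vc1_mspel_mc13_mmx
+ * (hmode = 1, vmode = 3) lands in slot 13. A caller would therefore select
+ * an entry roughly as follows (sketch only):
+ *
+ *     dsp->put_vc1_mspel_pixels_tab[hmode + 4*vmode](dst, src, stride, rnd);
+ */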
+static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
+ int16_t *block)
+{
+ int dc = block[0];
+ dc = (17 * dc + 4) >> 3;
+ dc = (17 * dc + 64) >> 7;
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "pshufw $0, %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "psubw %%mm0, %%mm1 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ ::"r"(dc)
+ );
+ __asm__ volatile(
+ "movd %0, %%mm2 \n\t"
+ "movd %1, %%mm3 \n\t"
+ "movd %2, %%mm4 \n\t"
+ "movd %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movd %%mm2, %0 \n\t"
+ "movd %%mm3, %1 \n\t"
+ "movd %%mm4, %2 \n\t"
+ "movd %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dest+0*linesize)),
+ "+m"(*(uint32_t*)(dest+1*linesize)),
+ "+m"(*(uint32_t*)(dest+2*linesize)),
+ "+m"(*(uint32_t*)(dest+3*linesize))
+ );
+}
+
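+/*
+ * Illustrative sketch: the *_dc_mmxext transforms handle blocks whose only
+ * nonzero coefficient is DC. After the two rounded shifts that scale dc at
+ * the top of each function, the remaining work is a saturated add of that
+ * constant over the whole block, roughly
+ *
+ *     for (i = 0; i < h; i++)
+ *         for (j = 0; j < w; j++)
+ *             dest[i*linesize + j] = av_clip_uint8(dest[i*linesize + j] + dc);
+ *
+ * which the MMX code achieves with paddusb/psubusb against dc and -dc.
+ */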
+static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
+ int16_t *block)
+{
+ int dc = block[0];
+ dc = (17 * dc + 4) >> 3;
+ dc = (12 * dc + 64) >> 7;
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "pshufw $0, %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "psubw %%mm0, %%mm1 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ ::"r"(dc)
+ );
+ __asm__ volatile(
+ "movd %0, %%mm2 \n\t"
+ "movd %1, %%mm3 \n\t"
+ "movd %2, %%mm4 \n\t"
+ "movd %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movd %%mm2, %0 \n\t"
+ "movd %%mm3, %1 \n\t"
+ "movd %%mm4, %2 \n\t"
+ "movd %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dest+0*linesize)),
+ "+m"(*(uint32_t*)(dest+1*linesize)),
+ "+m"(*(uint32_t*)(dest+2*linesize)),
+ "+m"(*(uint32_t*)(dest+3*linesize))
+ );
+ dest += 4*linesize;
+ __asm__ volatile(
+ "movd %0, %%mm2 \n\t"
+ "movd %1, %%mm3 \n\t"
+ "movd %2, %%mm4 \n\t"
+ "movd %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movd %%mm2, %0 \n\t"
+ "movd %%mm3, %1 \n\t"
+ "movd %%mm4, %2 \n\t"
+ "movd %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dest+0*linesize)),
+ "+m"(*(uint32_t*)(dest+1*linesize)),
+ "+m"(*(uint32_t*)(dest+2*linesize)),
+ "+m"(*(uint32_t*)(dest+3*linesize))
+ );
+}
+
+static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
+ int16_t *block)
+{
+ int dc = block[0];
+ dc = ( 3 * dc + 1) >> 1;
+ dc = (17 * dc + 64) >> 7;
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "pshufw $0, %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "psubw %%mm0, %%mm1 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ ::"r"(dc)
+ );
+ __asm__ volatile(
+ "movq %0, %%mm2 \n\t"
+ "movq %1, %%mm3 \n\t"
+ "movq %2, %%mm4 \n\t"
+ "movq %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movq %%mm2, %0 \n\t"
+ "movq %%mm3, %1 \n\t"
+ "movq %%mm4, %2 \n\t"
+ "movq %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dest+0*linesize)),
+ "+m"(*(uint32_t*)(dest+1*linesize)),
+ "+m"(*(uint32_t*)(dest+2*linesize)),
+ "+m"(*(uint32_t*)(dest+3*linesize))
+ );
+}
+
+static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
+ int16_t *block)
+{
+ int dc = block[0];
+ dc = (3 * dc + 1) >> 1;
+ dc = (3 * dc + 16) >> 5;
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "pshufw $0, %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "psubw %%mm0, %%mm1 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ ::"r"(dc)
+ );
+ __asm__ volatile(
+ "movq %0, %%mm2 \n\t"
+ "movq %1, %%mm3 \n\t"
+ "movq %2, %%mm4 \n\t"
+ "movq %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movq %%mm2, %0 \n\t"
+ "movq %%mm3, %1 \n\t"
+ "movq %%mm4, %2 \n\t"
+ "movq %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dest+0*linesize)),
+ "+m"(*(uint32_t*)(dest+1*linesize)),
+ "+m"(*(uint32_t*)(dest+2*linesize)),
+ "+m"(*(uint32_t*)(dest+3*linesize))
+ );
+ dest += 4*linesize;
+ __asm__ volatile(
+ "movq %0, %%mm2 \n\t"
+ "movq %1, %%mm3 \n\t"
+ "movq %2, %%mm4 \n\t"
+ "movq %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movq %%mm2, %0 \n\t"
+ "movq %%mm3, %1 \n\t"
+ "movq %%mm4, %2 \n\t"
+ "movq %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dest+0*linesize)),
+ "+m"(*(uint32_t*)(dest+1*linesize)),
+ "+m"(*(uint32_t*)(dest+2*linesize)),
+ "+m"(*(uint32_t*)(dest+3*linesize))
+ );
+}
+
+av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
+{
+ dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx;
+ dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
+ dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
+ dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
+
+ dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
+ dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
+ dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
+ dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;
+
+ dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
+ dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
+ dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
+ dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;
+
+ dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
+ dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
+ dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
+ dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
+}
+
+av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
+{
+ dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext;
+ dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext;
+ dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext;
+
+ dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmxext;
+ dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext;
+ dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext;
+ dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext;
+
+ dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmxext;
+ dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext;
+ dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext;
+ dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext;
+
+ dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext;
+ dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext;
+ dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext;
+ dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext;
+
+ dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
+ dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
+ dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
+ dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
+}
+#endif /* HAVE_INLINE_ASM */
diff --git a/ffmpeg/libavcodec/x86/videodsp.asm b/ffmpeg/libavcodec/x86/videodsp.asm
new file mode 100644
index 0000000..0eb4721
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/videodsp.asm
@@ -0,0 +1,612 @@
+;******************************************************************************
+;* Core video DSP functions
+;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
+; x86_reg start_y, x86_reg end_y, x86_reg block_h,
+; x86_reg start_x, x86_reg end_x, x86_reg block_w);
+;
+; The actual function itself is below. It basically wraps a very simple
+; w = end_x - start_x
+; if (w) {
+; if (w > 22) {
+; jump to the slow loop functions
+; } else {
+; jump to the fast loop functions
+; }
+; }
+;
+; ... and then the same for left/right extend also. See below for loop
+; function implementations. The fast versions are fixed-width, the slow ones variable-width.
+
+%macro EMU_EDGE_FUNC 0
+%if ARCH_X86_64
+%define w_reg r7
+cglobal emu_edge_core, 6, 9, 1
+ mov r8, r5 ; save block_h
+%else
+%define w_reg r6
+cglobal emu_edge_core, 2, 7, 0
+ mov r4, r4m ; end_y
+ mov r5, r5m ; block_h
+%endif
+
+ ; start with vertical extend (top/bottom) and body pixel copy
+ mov w_reg, r7m
+ sub w_reg, r6m ; w = end_x - start_x
+ sub r5, r4
+%if ARCH_X86_64
+ sub r4, r3
+%else
+ sub r4, dword r3m
+%endif
+ cmp w_reg, 22
+ jg .slow_v_extend_loop
+%if ARCH_X86_32
+ mov r2, r2m ; linesize
+%endif
+ sal w_reg, 7 ; w * 128
+%ifdef PIC
+ lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
+ add w_reg, rax
+%else
+ lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
+%endif
+ call w_reg ; fast top extend, body copy and bottom extend
+.v_extend_end:
+
+ ; horizontal extend (left/right)
+ mov w_reg, r6m ; start_x
+ sub r0, w_reg
+%if ARCH_X86_64
+ mov r3, r0 ; backup of buf+block_h*linesize
+ mov r5, r8
+%else
+ mov r0m, r0 ; backup of buf+block_h*linesize
+ mov r5, r5m
+%endif
+ test w_reg, w_reg
+ jz .right_extend
+ cmp w_reg, 22
+ jg .slow_left_extend_loop
+ mov r1, w_reg
+ dec w_reg
+ ; FIXME we could special-case size == 1 here if that makes any speed difference, test me
+ sar w_reg, 1
+ sal w_reg, 6
+ ; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs
+ ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
+%ifdef PIC
+ lea rax, [.emuedge_extend_left_2]
+ add w_reg, rax
+%else
+ lea w_reg, [.emuedge_extend_left_2+w_reg]
+%endif
+ call w_reg
+
+ ; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w
+.right_extend:
+%if ARCH_X86_32
+ mov r0, r0m
+ mov r5, r5m
+%endif
+ mov w_reg, r7m ; end_x
+ mov r1, r8m ; block_w
+ mov r4, r1
+ sub r1, w_reg
+ jz .h_extend_end ; if (end_x == block_w) goto h_extend_end
+ cmp r1, 22
+ jg .slow_right_extend_loop
+ dec r1
+ ; FIXME we could special-case size == 1 here if that makes any speed difference, test me
+ sar r1, 1
+ sal r1, 6
+%ifdef PIC
+ lea rax, [.emuedge_extend_right_2]
+ add r1, rax
+%else
+ lea r1, [.emuedge_extend_right_2+r1]
+%endif
+ call r1
+.h_extend_end:
+ RET
+
+%if ARCH_X86_64
+%define vall al
+%define valh ah
+%define valw ax
+%define valw2 r7w
+%define valw3 r3w
+%if WIN64
+%define valw4 r7w
+%else ; unix64
+%define valw4 r3w
+%endif
+%define vald eax
+%else
+%define vall bl
+%define valh bh
+%define valw bx
+%define valw2 r6w
+%define valw3 valw2
+%define valw4 valw3
+%define vald ebx
+%define stack_offset 0x14
+%endif
+
+%endmacro
+
+; macro to read/write a horizontal number of pixels (%2) to/from registers
+; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
+; - if (%2 & 15 == 8) fills the last 8 bytes into rax
+; - else if (%2 & 8) fills 8 bytes into mm0
+; - if (%2 & 7 == 4) fills the last 4 bytes into rax
+; - else if (%2 & 4) fills 4 bytes into mm0-1
+; - if (%2 & 3 == 3) fills 2 bytes into r7/r3, and 1 into eax
+; (note that we're using r3 for body/bottom because it's a shorter
+; opcode, and then the loop fits in 128 bytes)
+; - else fills remaining bytes into rax
+; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
+; - if (%2 & 7 == 4) fills 4 bytes into ebx
+; - else if (%2 & 4) fills 4 bytes into mm0-7
+; - if (%2 & 3 == 3) fills 2 bytes into r6, and 1 into ebx
+; - else fills remaining bytes into ebx
+; writing data out is in the same way
+%macro READ_NUM_BYTES 2
+%assign %%src_off 0 ; offset in source buffer
+%assign %%smidx 0 ; mmx register idx
+%assign %%sxidx 0 ; xmm register idx
+
+%if cpuflag(sse)
+%rep %2/16
+ movups xmm %+ %%sxidx, [r1+%%src_off]
+%assign %%src_off %%src_off+16
+%assign %%sxidx %%sxidx+1
+%endrep ; %2/16
+%endif
+
+%if ARCH_X86_64
+%if (%2-%%src_off) == 8
+ mov rax, [r1+%%src_off]
+%assign %%src_off %%src_off+8
+%endif ; (%2-%%src_off) == 8
+%endif ; x86-64
+
+%rep (%2-%%src_off)/8
+ movq mm %+ %%smidx, [r1+%%src_off]
+%assign %%src_off %%src_off+8
+%assign %%smidx %%smidx+1
+%endrep ; (%2-%%src_off)/8
+
+%if (%2-%%src_off) == 4
+ mov vald, [r1+%%src_off]
+%elif (%2-%%src_off) & 4
+ movd mm %+ %%smidx, [r1+%%src_off]
+%assign %%src_off %%src_off+4
+%endif ; (%2-%%src_off) ==/& 4
+
+%if (%2-%%src_off) == 1
+ mov vall, [r1+%%src_off]
+%elif (%2-%%src_off) == 2
+ mov valw, [r1+%%src_off]
+%elif (%2-%%src_off) == 3
+%ifidn %1, top
+ mov valw2, [r1+%%src_off]
+%elifidn %1, body
+ mov valw3, [r1+%%src_off]
+%elifidn %1, bottom
+ mov valw4, [r1+%%src_off]
+%endif ; %1 ==/!= top
+ mov vall, [r1+%%src_off+2]
+%endif ; (%2-%%src_off) == 1/2/3
+%endmacro ; READ_NUM_BYTES
+
+%macro WRITE_NUM_BYTES 2
+%assign %%dst_off 0 ; offset in destination buffer
+%assign %%dmidx 0 ; mmx register idx
+%assign %%dxidx 0 ; xmm register idx
+
+%if cpuflag(sse)
+%rep %2/16
+ movups [r0+%%dst_off], xmm %+ %%dxidx
+%assign %%dst_off %%dst_off+16
+%assign %%dxidx %%dxidx+1
+%endrep ; %2/16
+%endif
+
+%if ARCH_X86_64
+%if (%2-%%dst_off) == 8
+ mov [r0+%%dst_off], rax
+%assign %%dst_off %%dst_off+8
+%endif ; (%2-%%dst_off) == 8
+%endif ; x86-64
+
+%rep (%2-%%dst_off)/8
+ movq [r0+%%dst_off], mm %+ %%dmidx
+%assign %%dst_off %%dst_off+8
+%assign %%dmidx %%dmidx+1
+%endrep ; (%2-%%dst_off)/8
+
+%if (%2-%%dst_off) == 4
+ mov [r0+%%dst_off], vald
+%elif (%2-%%dst_off) & 4
+ movd [r0+%%dst_off], mm %+ %%dmidx
+%assign %%dst_off %%dst_off+4
+%endif ; (%2-%%dst_off) ==/& 4
+
+%if (%2-%%dst_off) == 1
+ mov [r0+%%dst_off], vall
+%elif (%2-%%dst_off) == 2
+ mov [r0+%%dst_off], valw
+%elif (%2-%%dst_off) == 3
+%ifidn %1, top
+ mov [r0+%%dst_off], valw2
+%elifidn %1, body
+ mov [r0+%%dst_off], valw3
+%elifidn %1, bottom
+ mov [r0+%%dst_off], valw4
+%endif ; %1 ==/!= top
+ mov [r0+%%dst_off+2], vall
+%endif ; (%2-%%dst_off) == 1/2/3
+%endmacro ; WRITE_NUM_BYTES
+
+; vertical top/bottom extend and body copy fast loops
+; these are function pointers to set-width line copy functions, i.e.
+; they read a fixed number of pixels into set registers, and write
+; those out into the destination buffer
+; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
+; r6(eax/64)/r3(ebx/32)=val_reg
+%macro VERTICAL_EXTEND 0
+%assign %%n 1
+%rep 22
+ALIGN 128
+.emuedge_v_extend_ %+ %%n:
+ ; extend pixels above body
+%if ARCH_X86_64
+ test r3 , r3 ; if (!start_y)
+ jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body
+%else ; ARCH_X86_32
+ cmp dword r3m, 0
+ je .emuedge_copy_body_ %+ %%n %+ _loop
+%endif ; ARCH_X86_64/32
+ READ_NUM_BYTES top, %%n ; read bytes
+.emuedge_extend_top_ %+ %%n %+ _loop: ; do {
+ WRITE_NUM_BYTES top, %%n ; write bytes
+ add r0 , r2 ; dst += linesize
+%if ARCH_X86_64
+ dec r3d
+%else ; ARCH_X86_32
+ dec dword r3m
+%endif ; ARCH_X86_64/32
+ jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)
+
+ ; copy body pixels
+.emuedge_copy_body_ %+ %%n %+ _loop: ; do {
+ READ_NUM_BYTES body, %%n ; read bytes
+ WRITE_NUM_BYTES body, %%n ; write bytes
+ add r0 , r2 ; dst += linesize
+ add r1 , r2 ; src += linesize
+ dec r4d
+ jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y)
+
+ ; copy bottom pixels
+ test r5 , r5 ; if (!block_h)
+ jz .emuedge_v_extend_end_ %+ %%n ; goto end
+ sub r1 , r2 ; src -= linesize
+ READ_NUM_BYTES bottom, %%n ; read bytes
+.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do {
+ WRITE_NUM_BYTES bottom, %%n ; write bytes
+ add r0 , r2 ; dst += linesize
+ dec r5d
+ jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)
+
+.emuedge_v_extend_end_ %+ %%n:
+%if ARCH_X86_64
+ ret
+%else ; ARCH_X86_32
+ rep ret
+%endif ; ARCH_X86_64/32
+%assign %%n %%n+1
+%endrep
+%endmacro ; VERTICAL_EXTEND
+
+; left/right (horizontal) fast extend functions
+; these are essentially identical to the vertical extend ones above,
+; just left/right separated because number of pixels to extend is
+; obviously not the same on both sides.
+; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
+; lowest two bytes of the register (so val*0x0101), and are splatted
+; into each byte of mm0 as well if n_pixels >= 8
+
+%macro READ_V_PIXEL 2
+ mov vall, %2
+ mov valh, vall
+%if %1 >= 8
+ movd mm0, vald
+%if cpuflag(mmxext)
+ pshufw mm0, mm0, 0
+%else ; mmx
+ punpcklwd mm0, mm0
+ punpckldq mm0, mm0
+%endif ; mmxext
+%endif ; %1 >= 8
+%endmacro
+
+%macro WRITE_V_PIXEL 2
+%assign %%dst_off 0
+%rep %1/8
+ movq [%2+%%dst_off], mm0
+%assign %%dst_off %%dst_off+8
+%endrep
+%if %1 & 4
+%if %1 >= 8
+ movd [%2+%%dst_off], mm0
+%else ; %1 < 8
+ mov [%2+%%dst_off] , valw
+ mov [%2+%%dst_off+2], valw
+%endif ; %1 >=/< 8
+%assign %%dst_off %%dst_off+4
+%endif ; %1 & 4
+%if %1&2
+ mov [%2+%%dst_off], valw
+%endif ; %1 & 2
+%endmacro
+
+; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
+%macro LEFT_EXTEND 0
+%assign %%n 2
+%rep 11
+ALIGN 64
+.emuedge_extend_left_ %+ %%n: ; do {
+ sub r0, r2 ; dst -= linesize
+ READ_V_PIXEL %%n, [r0+r1] ; read pixels
+ WRITE_V_PIXEL %%n, r0 ; write pixels
+ dec r5
+ jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h)
+%if ARCH_X86_64
+ ret
+%else ; ARCH_X86_32
+ rep ret
+%endif ; ARCH_X86_64/32
+%assign %%n %%n+2
+%endrep
+%endmacro ; LEFT_EXTEND
+
+; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val
+%macro RIGHT_EXTEND 0
+%assign %%n 2
+%rep 11
+ALIGN 64
+.emuedge_extend_right_ %+ %%n: ; do {
+%if ARCH_X86_64
+ sub r3, r2 ; dst -= linesize
+ READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels
+ WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
+ dec r8
+%else ; ARCH_X86_32
+ sub r0, r2 ; dst -= linesize
+ READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels
+ WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels
+ dec r5
+%endif ; ARCH_X86_64/32
+ jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h)
+%if ARCH_X86_64
+ ret
+%else ; ARCH_X86_32
+ rep ret
+%endif ; ARCH_X86_64/32
+%assign %%n %%n+2
+%endrep
+
+%if ARCH_X86_32
+%define stack_offset 0x10
+%endif
+%endmacro ; RIGHT_EXTEND
+
+; below follow the "slow" copy/extend functions, these act on a non-fixed
+; width specified in a register, and run a loop to copy the full amount
+; of bytes. They are optimized for copying of large amounts of pixels per
+; line, so they unconditionally splat data into mm registers to copy 8
+; bytes per loop iteration. Using xmm registers could also be considered on
+; x86-64, but that path has not been optimized as much (i.e. FIXME)
+%macro V_COPY_NPX 4-5
+%if %0 == 4
+ test w_reg, %4
+ jz .%1_skip_%4_px
+%else ; %0 == 5
+.%1_%4_px_loop:
+%endif
+ %3 %2, [r1+cnt_reg]
+ %3 [r0+cnt_reg], %2
+ add cnt_reg, %4
+%if %0 == 5
+ sub w_reg, %4
+ test w_reg, %5
+ jnz .%1_%4_px_loop
+%endif
+.%1_skip_%4_px:
+%endmacro
+
+%macro V_COPY_ROW 2
+%ifidn %1, bottom
+ sub r1, linesize
+%endif
+.%1_copy_loop:
+ xor cnt_reg, cnt_reg
+%if notcpuflag(sse)
+%define linesize r2m
+ V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8
+%else ; sse
+ V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0
+%if ARCH_X86_64
+%define linesize r2
+ V_COPY_NPX %1, rax , mov, 8
+%else ; ARCH_X86_32
+%define linesize r2m
+ V_COPY_NPX %1, mm0, movq, 8
+%endif ; ARCH_X86_64/32
+%endif ; sse
+ V_COPY_NPX %1, vald, mov, 4
+ V_COPY_NPX %1, valw, mov, 2
+ V_COPY_NPX %1, vall, mov, 1
+ mov w_reg, cnt_reg
+%ifidn %1, body
+ add r1, linesize
+%endif
+ add r0, linesize
+ dec %2
+ jnz .%1_copy_loop
+%endmacro
+
+%macro SLOW_V_EXTEND 0
+.slow_v_extend_loop:
+; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
+; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x
+%if ARCH_X86_64
+ push r8 ; save old value of block_h
+ test r3, r3
+%define cnt_reg r8
+ jz .do_body_copy ; if (!start_y) goto do_body_copy
+ V_COPY_ROW top, r3
+%else
+ cmp dword r3m, 0
+%define cnt_reg r2
+ je .do_body_copy ; if (!start_y) goto do_body_copy
+ V_COPY_ROW top, dword r3m
+%endif
+
+.do_body_copy:
+ V_COPY_ROW body, r4
+
+%if ARCH_X86_64
+ pop r8 ; restore old value of block_h
+%define cnt_reg r3
+%endif
+ test r5, r5
+%if ARCH_X86_64
+ jz .v_extend_end
+%else
+ jz .skip_bottom_extend
+%endif
+ V_COPY_ROW bottom, r5
+%if ARCH_X86_32
+.skip_bottom_extend:
+ mov r2, r2m
+%endif
+ jmp .v_extend_end
+%endmacro
+
+%macro SLOW_LEFT_EXTEND 0
+.slow_left_extend_loop:
+; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x
+ mov r4, 8
+ sub r0, linesize
+ READ_V_PIXEL 8, [r0+w_reg]
+.left_extend_8px_loop:
+ movq [r0+r4-8], mm0
+ add r4, 8
+ cmp r4, w_reg
+ jle .left_extend_8px_loop
+ sub r4, 8
+ cmp r4, w_reg
+ jge .left_extend_loop_end
+.left_extend_2px_loop:
+ mov [r0+r4], valw
+ add r4, 2
+ cmp r4, w_reg
+ jl .left_extend_2px_loop
+.left_extend_loop_end:
+ dec r5
+ jnz .slow_left_extend_loop
+%if ARCH_X86_32
+ mov r2, r2m
+%endif
+ jmp .right_extend
+%endmacro
+
+%macro SLOW_RIGHT_EXTEND 0
+.slow_right_extend_loop:
+; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h,
+; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr
+%if ARCH_X86_64
+%define buf_reg r3
+%define bh_reg r8
+%else
+%define buf_reg r0
+%define bh_reg r5
+%endif
+ lea r1, [r4-8]
+ sub buf_reg, linesize
+ READ_V_PIXEL 8, [buf_reg+w_reg-1]
+.right_extend_8px_loop:
+ movq [buf_reg+r1], mm0
+ sub r1, 8
+ cmp r1, w_reg
+ jge .right_extend_8px_loop
+ add r1, 8
+ cmp r1, w_reg
+ je .right_extend_loop_end
+.right_extend_2px_loop:
+ sub r1, 2
+ mov [buf_reg+r1], valw
+ cmp r1, w_reg
+ jg .right_extend_2px_loop
+.right_extend_loop_end:
+ dec bh_reg
+ jnz .slow_right_extend_loop
+ jmp .h_extend_end
+%endmacro
+
+%macro emu_edge 1
+INIT_XMM %1
+EMU_EDGE_FUNC
+VERTICAL_EXTEND
+LEFT_EXTEND
+RIGHT_EXTEND
+SLOW_V_EXTEND
+SLOW_LEFT_EXTEND
+SLOW_RIGHT_EXTEND
+%endmacro
+
+emu_edge sse
+%if ARCH_X86_32
+emu_edge mmx
+%endif
+
+%macro PREFETCH_FN 1
+cglobal prefetch, 3, 3, 0, buf, stride, h
+.loop:
+ %1 [bufq]
+ add bufq, strideq
+ dec hd
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PREFETCH_FN prefetcht0
+%if ARCH_X86_32
+INIT_MMX 3dnow
+PREFETCH_FN prefetch
+%endif
diff --git a/ffmpeg/libavcodec/x86/videodsp_init.c b/ffmpeg/libavcodec/x86/videodsp_init.c
new file mode 100644
index 0000000..902450e
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/videodsp_init.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright (C) 2002-2012 Michael Niedermayer
+ * Copyright (C) 2012 Ronald S. Bultje
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/common.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavcodec/videodsp.h"
+
+#if HAVE_YASM
+typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
+ x86_reg linesize, x86_reg start_y,
+ x86_reg end_y, x86_reg block_h,
+ x86_reg start_x, x86_reg end_x,
+ x86_reg block_w);
+extern emu_edge_core_func ff_emu_edge_core_mmx;
+extern emu_edge_core_func ff_emu_edge_core_sse;
+
+static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
+ ptrdiff_t linesize_arg,
+ int block_w, int block_h,
+ int src_x, int src_y,
+ int w, int h,
+ emu_edge_core_func *core_fn)
+{
+ int start_y, start_x, end_y, end_x, src_y_add = 0;
+ int linesize = linesize_arg;
+
+ if(!w || !h)
+ return;
+
+ if (src_y >= h) {
+ src -= src_y*linesize;
+ src_y_add = h - 1;
+ src_y = h - 1;
+ } else if (src_y <= -block_h) {
+ src -= src_y*linesize;
+ src_y_add = 1 - block_h;
+ src_y = 1 - block_h;
+ }
+ if (src_x >= w) {
+ src += w - 1 - src_x;
+ src_x = w - 1;
+ } else if (src_x <= -block_w) {
+ src += 1 - block_w - src_x;
+ src_x = 1 - block_w;
+ }
+
+ start_y = FFMAX(0, -src_y);
+ start_x = FFMAX(0, -src_x);
+ end_y = FFMIN(block_h, h-src_y);
+ end_x = FFMIN(block_w, w-src_x);
+ av_assert2(start_x < end_x && block_w > 0);
+ av_assert2(start_y < end_y && block_h > 0);
+
+ // fill in the to-be-copied part plus all above/below
+ src += (src_y_add + start_y) * linesize + start_x;
+ buf += start_x;
+ core_fn(buf, src, linesize, start_y, end_y,
+ block_h, start_x, end_x, block_w);
+}
+
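+/*
+ * Illustrative usage sketch (names outside this file are hypothetical): a
+ * decoder typically calls the emulated_edge_mc pointer when a motion vector
+ * makes a block read cross the picture border, e.g.
+ *
+ *     if (src_x < 0 || src_y < 0 ||
+ *         src_x + block_w > pic_width || src_y + block_h > pic_height) {
+ *         ctx->emulated_edge_mc(edge_buf, src, linesize, block_w, block_h,
+ *                               src_x, src_y, pic_width, pic_height);
+ *         src = edge_buf;
+ *     }
+ */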
+#if ARCH_X86_32
+static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
+ ptrdiff_t linesize,
+ int block_w, int block_h,
+ int src_x, int src_y, int w, int h)
+{
+ emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
+ w, h, &ff_emu_edge_core_mmx);
+}
+#endif
+
+static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
+ ptrdiff_t linesize,
+ int block_w, int block_h,
+ int src_x, int src_y, int w, int h)
+{
+ emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
+ w, h, &ff_emu_edge_core_sse);
+}
+#endif /* HAVE_YASM */
+
+void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
+void ff_prefetch_3dnow(uint8_t *buf, ptrdiff_t stride, int h);
+
+av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
+{
+#if HAVE_YASM
+ int mm_flags = av_get_cpu_flags();
+
+#if ARCH_X86_32
+ if (bpc <= 8 && mm_flags & AV_CPU_FLAG_MMX) {
+ ctx->emulated_edge_mc = emulated_edge_mc_mmx;
+ }
+ if (mm_flags & AV_CPU_FLAG_3DNOW) {
+ ctx->prefetch = ff_prefetch_3dnow;
+ }
+#endif /* ARCH_X86_32 */
+ if (mm_flags & AV_CPU_FLAG_MMXEXT) {
+ ctx->prefetch = ff_prefetch_mmxext;
+ }
+ if (bpc <= 8 && mm_flags & AV_CPU_FLAG_SSE) {
+ ctx->emulated_edge_mc = emulated_edge_mc_sse;
+ }
+#endif /* HAVE_YASM */
+}
diff --git a/ffmpeg/libavcodec/x86/vorbisdsp.asm b/ffmpeg/libavcodec/x86/vorbisdsp.asm
new file mode 100644
index 0000000..b25d838
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/vorbisdsp.asm
@@ -0,0 +1,83 @@
+;******************************************************************************
+;* Vorbis x86 optimizations
+;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pdw_80000000: times 4 dd 0x80000000
+
+SECTION .text
+
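+; Illustrative note: both versions below implement the same channel coupling
+; as the scalar C reference, roughly (sketch only):
+;     for (i = 0; i < blocksize; i++) {
+;         float m = mag[i], a = ang[i];
+;         if (m > 0) { if (a > 0) ang[i] = m - a;
+;                      else       { ang[i] = m; mag[i] = m + a; } }
+;         else       { if (a > 0) ang[i] = m + a;
+;                      else       { ang[i] = m; mag[i] = m - a; } }
+;     }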
+%if ARCH_X86_32
+INIT_MMX 3dnow
+cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size
+ pxor m7, m7
+ lea magq, [magq+block_sizeq*4]
+ lea angq, [angq+block_sizeq*4]
+ neg block_sizeq
+.loop:
+ mova m0, [magq+block_sizeq*4]
+ mova m1, [angq+block_sizeq*4]
+ mova m2, m0
+ mova m3, m1
+ pfcmpge m2, m7 ; m <= 0.0
+ pfcmpge m3, m7 ; a <= 0.0
+ pslld m2, 31 ; keep only the sign bit
+ pxor m1, m2
+ mova m4, m3
+ pand m3, m1
+ pandn m4, m1
+ pfadd m3, m0 ; a = m + ((a < 0) & (a ^ sign(m)))
+ pfsub m0, m4 ; m = m + ((a > 0) & (a ^ sign(m)))
+ mova [angq+block_sizeq*4], m3
+ mova [magq+block_sizeq*4], m0
+ add block_sizeq, 2
+ jl .loop
+ femms
+ RET
+%endif
+
+INIT_XMM sse
+cglobal vorbis_inverse_coupling, 3, 4, 6, mag, ang, block_size, cntr
+ mova m5, [pdw_80000000]
+ xor cntrq, cntrq
+align 16
+.loop:
+ mova m0, [magq+cntrq*4]
+ mova m1, [angq+cntrq*4]
+ xorps m2, m2
+ xorps m3, m3
+ cmpleps m2, m0 ; m <= 0.0
+ cmpleps m3, m1 ; a <= 0.0
+ andps m2, m5 ; keep only the sign bit
+ xorps m1, m2
+ mova m4, m3
+ andps m3, m1
+ andnps m4, m1
+ addps m3, m0 ; a = m + ((a < 0) & (a ^ sign(m)))
+ subps m0, m4 ; m = m + ((a > 0) & (a ^ sign(m)))
+ mova [angq+cntrq*4], m3
+ mova [magq+cntrq*4], m0
+ add cntrq, 4
+ cmp cntrq, block_sizeq
+ jl .loop
+ RET
diff --git a/ffmpeg/libavcodec/x86/vorbisdsp_init.c b/ffmpeg/libavcodec/x86/vorbisdsp_init.c
new file mode 100644
index 0000000..08a2c09
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/vorbisdsp_init.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavcodec/vorbisdsp.h"
+
+void ff_vorbis_inverse_coupling_3dnow(float *mag, float *ang,
+ intptr_t blocksize);
+void ff_vorbis_inverse_coupling_sse(float *mag, float *ang,
+ intptr_t blocksize);
+
+av_cold void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp)
+{
+#if HAVE_YASM
+ int mm_flags = av_get_cpu_flags();
+
+#if ARCH_X86_32
+ if (mm_flags & AV_CPU_FLAG_3DNOW)
+ dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_3dnow;
+#endif /* ARCH_X86_32 */
+ if (mm_flags & AV_CPU_FLAG_SSE)
+ dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_sse;
+#endif /* HAVE_YASM */
+}
diff --git a/ffmpeg/libavcodec/x86/vp3dsp.asm b/ffmpeg/libavcodec/x86/vp3dsp.asm
new file mode 100644
index 0000000..a47b8f2
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/vp3dsp.asm
@@ -0,0 +1,709 @@
+;******************************************************************************
+;* MMX/SSE2-optimized functions for the VP3 decoder
+;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+; MMX-optimized functions cribbed from the original VP3 source code.
+
+SECTION_RODATA
+
+vp3_idct_data: times 8 dw 64277
+ times 8 dw 60547
+ times 8 dw 54491
+ times 8 dw 46341
+ times 8 dw 36410
+ times 8 dw 25080
+ times 8 dw 12785
+
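+; Illustrative note: the seven rows of vp3_idct_data above appear to be the
+; IDCT cosine constants in 0.16 fixed point, i.e. round(cos(k*pi/16) * 65536)
+; for k = 1..7 (64277 ~ 0.98079*65536, ..., 12785 ~ 0.19509*65536).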
+pb_7: times 8 db 7
+pb_1F: times 8 db 0x1f
+pb_81: times 8 db 0x81
+
+cextern pb_1
+cextern pb_3
+cextern pb_80
+
+cextern pw_8
+
+SECTION .text
+
+; this is off by one or two for some cases when filter_limit is greater than 63
+; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
+; out: p1 in mm4, p2 in mm3
+%macro VP3_LOOP_FILTER 0
+ movq m7, m6
+ pand m6, [pb_7] ; p0&7
+ psrlw m7, 3
+ pand m7, [pb_1F] ; p0>>3
+ movq m3, m2 ; p2
+ pxor m2, m4
+ pand m2, [pb_1] ; (p2^p1)&1
+ movq m5, m2
+ paddb m2, m2
+ paddb m2, m5 ; 3*(p2^p1)&1
+ paddb m2, m6 ; extra bits lost in shifts
+ pcmpeqb m0, m0
+ pxor m1, m0 ; 255 - p3
+ pavgb m1, m2 ; (256 - p3 + extrabits) >> 1
+ pxor m0, m4 ; 255 - p1
+ pavgb m0, m3 ; (256 + p2-p1) >> 1
+ paddb m1, [pb_3]
+ pavgb m1, m0 ; 128+2+( p2-p1 - p3) >> 2
+ pavgb m1, m0 ; 128+1+(3*(p2-p1) - p3) >> 3
+ paddusb m7, m1 ; d+128+1
+ movq m6, [pb_81]
+ psubusb m6, m7
+ psubusb m7, [pb_81]
+
+ movq m5, [r2+516] ; flim
+ pminub m6, m5
+ pminub m7, m5
+ movq m0, m6
+ movq m1, m7
+ paddb m6, m6
+ paddb m7, m7
+ pminub m6, m5
+ pminub m7, m5
+ psubb m6, m0
+ psubb m7, m1
+ paddusb m4, m7
+ psubusb m4, m6
+ psubusb m3, m7
+ paddusb m3, m6
+%endmacro
+
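+; Illustrative sketch of what the macro above approximates in scalar form
+; (bounding through the filter limit shown only schematically):
+;     f  = bound((p0 - p3 + 3*(p2 - p1) + 4) >> 3, filter_limit);
+;     p1 = av_clip_uint8(p1 + f);
+;     p2 = av_clip_uint8(p2 - f);
+; which is consistent with the "off by one or two when filter_limit is
+; greater than 63" caveat noted before the macro.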
+%macro STORE_4_WORDS 1
+ movd r2d, %1
+ mov [r0 -1], r2w
+ psrlq %1, 32
+ shr r2, 16
+ mov [r0+r1 -1], r2w
+ movd r2d, %1
+ mov [r0+r1*2-1], r2w
+ shr r2, 16
+ mov [r0+r3 -1], r2w
+%endmacro
+
+INIT_MMX mmxext
+cglobal vp3_v_loop_filter, 3, 4
+%if ARCH_X86_64
+ movsxd r1, r1d
+%endif
+ mov r3, r1
+ neg r1
+ movq m6, [r0+r1*2]
+ movq m4, [r0+r1 ]
+ movq m2, [r0 ]
+ movq m1, [r0+r3 ]
+
+ VP3_LOOP_FILTER
+
+ movq [r0+r1], m4
+ movq [r0 ], m3
+ RET
+
+cglobal vp3_h_loop_filter, 3, 4
+%if ARCH_X86_64
+ movsxd r1, r1d
+%endif
+ lea r3, [r1*3]
+
+ movd m6, [r0 -2]
+ movd m4, [r0+r1 -2]
+ movd m2, [r0+r1*2-2]
+ movd m1, [r0+r3 -2]
+ lea r0, [r0+r1*4 ]
+ punpcklbw m6, [r0 -2]
+ punpcklbw m4, [r0+r1 -2]
+ punpcklbw m2, [r0+r1*2-2]
+ punpcklbw m1, [r0+r3 -2]
+ sub r0, r3
+ sub r0, r1
+
+ TRANSPOSE4x4B 6, 4, 2, 1, 0
+ VP3_LOOP_FILTER
+ SBUTTERFLY bw, 4, 3, 5
+
+ STORE_4_WORDS m4
+ lea r0, [r0+r1*4 ]
+ STORE_4_WORDS m3
+ RET
+
+; from original comments: The Macro does IDct on 4 1-D Dcts
+%macro BeginIDCT 0
+ movq m2, I(3)
+ movq m6, C(3)
+ movq m4, m2
+ movq m7, J(5)
+ pmulhw m4, m6 ; r4 = c3*i3 - i3
+ movq m1, C(5)
+ pmulhw m6, m7 ; r6 = c3*i5 - i5
+ movq m5, m1
+ pmulhw m1, m2 ; r1 = c5*i3 - i3
+ movq m3, I(1)
+ pmulhw m5, m7 ; r5 = c5*i5 - i5
+ movq m0, C(1)
+ paddw m4, m2 ; r4 = c3*i3
+ paddw m6, m7 ; r6 = c3*i5
+ paddw m2, m1 ; r2 = c5*i3
+ movq m1, J(7)
+ paddw m7, m5 ; r7 = c5*i5
+ movq m5, m0 ; r5 = c1
+ pmulhw m0, m3 ; r0 = c1*i1 - i1
+ paddsw m4, m7 ; r4 = C = c3*i3 + c5*i5
+ pmulhw m5, m1 ; r5 = c1*i7 - i7
+ movq m7, C(7)
+ psubsw m6, m2 ; r6 = D = c3*i5 - c5*i3
+ paddw m0, m3 ; r0 = c1*i1
+ pmulhw m3, m7 ; r3 = c7*i1
+ movq m2, I(2)
+ pmulhw m7, m1 ; r7 = c7*i7
+ paddw m5, m1 ; r5 = c1*i7
+ movq m1, m2 ; r1 = i2
+ pmulhw m2, C(2) ; r2 = c2*i2 - i2
+ psubsw m3, m5 ; r3 = B = c7*i1 - c1*i7
+ movq m5, J(6)
+ paddsw m0, m7 ; r0 = A = c1*i1 + c7*i7
+ movq m7, m5 ; r7 = i6
+ psubsw m0, m4 ; r0 = A - C
+ pmulhw m5, C(2) ; r5 = c2*i6 - i6
+ paddw m2, m1 ; r2 = c2*i2
+ pmulhw m1, C(6) ; r1 = c6*i2
+ paddsw m4, m4 ; r4 = C + C
+ paddsw m4, m0 ; r4 = C. = A + C
+ psubsw m3, m6 ; r3 = B - D
+ paddw m5, m7 ; r5 = c2*i6
+ paddsw m6, m6 ; r6 = D + D
+ pmulhw m7, C(6) ; r7 = c6*i6
+ paddsw m6, m3 ; r6 = D. = B + D
+ movq I(1), m4 ; save C. at I(1)
+ psubsw m1, m5 ; r1 = H = c6*i2 - c2*i6
+ movq m4, C(4)
+ movq m5, m3 ; r5 = B - D
+ pmulhw m3, m4 ; r3 = (c4 - 1) * (B - D)
+ paddsw m7, m2 ; r7 = G = c2*i2 + c6*i6
+ movq I(2), m6 ; save D. at I(2)
+ movq m2, m0 ; r2 = A - C
+ movq m6, I(0)
+ pmulhw m0, m4 ; r0 = (c4 - 1) * (A - C)
+ paddw m5, m3 ; r5 = B. = c4 * (B - D)
+ movq m3, J(4)
+ psubsw m5, m1 ; r5 = B.. = B. - H
+ paddw m2, m0 ; r0 = A. = c4 * (A - C)
+ psubsw m6, m3 ; r6 = i0 - i4
+ movq m0, m6
+ pmulhw m6, m4 ; r6 = (c4 - 1) * (i0 - i4)
+ paddsw m3, m3 ; r3 = i4 + i4
+ paddsw m1, m1 ; r1 = H + H
+ paddsw m3, m0 ; r3 = i0 + i4
+ paddsw m1, m5 ; r1 = H. = B + H
+ pmulhw m4, m3 ; r4 = (c4 - 1) * (i0 + i4)
+ paddsw m6, m0 ; r6 = F = c4 * (i0 - i4)
+ psubsw m6, m2 ; r6 = F. = F - A.
+ paddsw m2, m2 ; r2 = A. + A.
+ movq m0, I(1) ; r0 = C.
+ paddsw m2, m6 ; r2 = A.. = F + A.
+ paddw m4, m3 ; r4 = E = c4 * (i0 + i4)
+ psubsw m2, m1 ; r2 = R2 = A.. - H.
+%endmacro
+
+; RowIDCT gets ready to transpose
+%macro RowIDCT 0
+ BeginIDCT
+ movq m3, I(2) ; r3 = D.
+ psubsw m4, m7 ; r4 = E. = E - G
+ paddsw m1, m1 ; r1 = H. + H.
+ paddsw m7, m7 ; r7 = G + G
+ paddsw m1, m2 ; r1 = R1 = A.. + H.
+ paddsw m7, m4 ; r7 = G. = E + G
+ psubsw m4, m3 ; r4 = R4 = E. - D.
+ paddsw m3, m3
+ psubsw m6, m5 ; r6 = R6 = F. - B..
+ paddsw m5, m5
+ paddsw m3, m4 ; r3 = R3 = E. + D.
+ paddsw m5, m6 ; r5 = R5 = F. + B..
+ psubsw m7, m0 ; r7 = R7 = G. - C.
+ paddsw m0, m0
+ movq I(1), m1 ; save R1
+ paddsw m0, m7 ; r0 = R0 = G. + C.
+%endmacro
+
+; Column IDCT normalizes and stores final results
+%macro ColumnIDCT 0
+ BeginIDCT
+ paddsw m2, OC_8 ; adjust R2 (and R1) for shift
+ paddsw m1, m1 ; r1 = H. + H.
+ paddsw m1, m2 ; r1 = R1 = A.. + H.
+ psraw m2, 4 ; r2 = NR2
+ psubsw m4, m7 ; r4 = E. = E - G
+ psraw m1, 4 ; r1 = NR1
+ movq m3, I(2) ; r3 = D.
+ paddsw m7, m7 ; r7 = G + G
+ movq I(2), m2 ; store NR2 at I2
+ paddsw m7, m4 ; r7 = G. = E + G
+ movq I(1), m1 ; store NR1 at I1
+ psubsw m4, m3 ; r4 = R4 = E. - D.
+ paddsw m4, OC_8 ; adjust R4 (and R3) for shift
+ paddsw m3, m3 ; r3 = D. + D.
+ paddsw m3, m4 ; r3 = R3 = E. + D.
+ psraw m4, 4 ; r4 = NR4
+ psubsw m6, m5 ; r6 = R6 = F. - B..
+ psraw m3, 4 ; r3 = NR3
+ paddsw m6, OC_8 ; adjust R6 (and R5) for shift
+ paddsw m5, m5 ; r5 = B.. + B..
+ paddsw m5, m6 ; r5 = R5 = F. + B..
+ psraw m6, 4 ; r6 = NR6
+ movq J(4), m4 ; store NR4 at J4
+ psraw m5, 4 ; r5 = NR5
+ movq I(3), m3 ; store NR3 at I3
+ psubsw m7, m0 ; r7 = R7 = G. - C.
+ paddsw m7, OC_8 ; adjust R7 (and R0) for shift
+ paddsw m0, m0 ; r0 = C. + C.
+ paddsw m0, m7 ; r0 = R0 = G. + C.
+ psraw m7, 4 ; r7 = NR7
+ movq J(6), m6 ; store NR6 at J6
+ psraw m0, 4 ; r0 = NR0
+ movq J(5), m5 ; store NR5 at J5
+ movq J(7), m7 ; store NR7 at J7
+ movq I(0), m0 ; store NR0 at I0
+%endmacro
+
+; Following macro does two 4x4 transposes in place.
+;
+; At entry (we assume):
+;
+; r0 = a3 a2 a1 a0
+; I(1) = b3 b2 b1 b0
+; r2 = c3 c2 c1 c0
+; r3 = d3 d2 d1 d0
+;
+; r4 = e3 e2 e1 e0
+; r5 = f3 f2 f1 f0
+; r6 = g3 g2 g1 g0
+; r7 = h3 h2 h1 h0
+;
+; At exit, we have:
+;
+; I(0) = d0 c0 b0 a0
+; I(1) = d1 c1 b1 a1
+; I(2) = d2 c2 b2 a2
+; I(3) = d3 c3 b3 a3
+;
+; J(4) = h0 g0 f0 e0
+; J(5) = h1 g1 f1 e1
+; J(6) = h2 g2 f2 e2
+; J(7) = h3 g3 f3 e3
+;
+; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
+; J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
+;
+; Since r1 is free at entry, we calculate the Js first.
+%macro Transpose 0
+ movq m1, m4 ; r1 = e3 e2 e1 e0
+ punpcklwd m4, m5 ; r4 = f1 e1 f0 e0
+ movq I(0), m0 ; save a3 a2 a1 a0
+ punpckhwd m1, m5 ; r1 = f3 e3 f2 e2
+ movq m0, m6 ; r0 = g3 g2 g1 g0
+ punpcklwd m6, m7 ; r6 = h1 g1 h0 g0
+ movq m5, m4 ; r5 = f1 e1 f0 e0
+ punpckldq m4, m6 ; r4 = h0 g0 f0 e0 = R4
+ punpckhdq m5, m6 ; r5 = h1 g1 f1 e1 = R5
+ movq m6, m1 ; r6 = f3 e3 f2 e2
+ movq J(4), m4
+ punpckhwd m0, m7 ; r0 = h3 g3 h2 g2
+ movq J(5), m5
+ punpckhdq m6, m0 ; r6 = h3 g3 f3 e3 = R7
+ movq m4, I(0) ; r4 = a3 a2 a1 a0
+ punpckldq m1, m0 ; r1 = h2 g2 f2 e2 = R6
+ movq m5, I(1) ; r5 = b3 b2 b1 b0
+ movq m0, m4 ; r0 = a3 a2 a1 a0
+ movq J(7), m6
+ punpcklwd m0, m5 ; r0 = b1 a1 b0 a0
+ movq J(6), m1
+ punpckhwd m4, m5 ; r4 = b3 a3 b2 a2
+ movq m5, m2 ; r5 = c3 c2 c1 c0
+ punpcklwd m2, m3 ; r2 = d1 c1 d0 c0
+ movq m1, m0 ; r1 = b1 a1 b0 a0
+ punpckldq m0, m2 ; r0 = d0 c0 b0 a0 = R0
+ punpckhdq m1, m2 ; r1 = d1 c1 b1 a1 = R1
+ movq m2, m4 ; r2 = b3 a3 b2 a2
+ movq I(0), m0
+ punpckhwd m5, m3 ; r5 = d3 c3 d2 c2
+ movq I(1), m1
+ punpckhdq m4, m5 ; r4 = d3 c3 b3 a3 = R3
+ punpckldq m2, m5 ; r2 = d2 c2 b2 a2 = R2
+ movq I(3), m4
+ movq I(2), m2
+%endmacro
+
+%macro VP3_1D_IDCT_SSE2 0
+ movdqa m2, I(3) ; xmm2 = i3
+ movdqa m6, C(3) ; xmm6 = c3
+ movdqa m4, m2 ; xmm4 = i3
+ movdqa m7, I(5) ; xmm7 = i5
+ pmulhw m4, m6 ; xmm4 = c3 * i3 - i3
+ movdqa m1, C(5) ; xmm1 = c5
+ pmulhw m6, m7 ; xmm6 = c3 * i5 - i5
+ movdqa m5, m1 ; xmm5 = c5
+ pmulhw m1, m2 ; xmm1 = c5 * i3 - i3
+ movdqa m3, I(1) ; xmm3 = i1
+ pmulhw m5, m7 ; xmm5 = c5 * i5 - i5
+ movdqa m0, C(1) ; xmm0 = c1
+ paddw m4, m2 ; xmm4 = c3 * i3
+ paddw m6, m7 ; xmm6 = c3 * i5
+ paddw m2, m1 ; xmm2 = c5 * i3
+ movdqa m1, I(7) ; xmm1 = i7
+ paddw m7, m5 ; xmm7 = c5 * i5
+ movdqa m5, m0 ; xmm5 = c1
+ pmulhw m0, m3 ; xmm0 = c1 * i1 - i1
+ paddsw m4, m7 ; xmm4 = c3 * i3 + c5 * i5 = C
+ pmulhw m5, m1 ; xmm5 = c1 * i7 - i7
+ movdqa m7, C(7) ; xmm7 = c7
+ psubsw m6, m2 ; xmm6 = c3 * i5 - c5 * i3 = D
+ paddw m0, m3 ; xmm0 = c1 * i1
+ pmulhw m3, m7 ; xmm3 = c7 * i1
+ movdqa m2, I(2) ; xmm2 = i2
+ pmulhw m7, m1 ; xmm7 = c7 * i7
+ paddw m5, m1 ; xmm5 = c1 * i7
+ movdqa m1, m2 ; xmm1 = i2
+ pmulhw m2, C(2) ; xmm2 = i2 * c2 -i2
+ psubsw m3, m5 ; xmm3 = c7 * i1 - c1 * i7 = B
+ movdqa m5, I(6) ; xmm5 = i6
+ paddsw m0, m7 ; xmm0 = c1 * i1 + c7 * i7 = A
+ movdqa m7, m5 ; xmm7 = i6
+ psubsw m0, m4 ; xmm0 = A - C
+ pmulhw m5, C(2) ; xmm5 = c2 * i6 - i6
+ paddw m2, m1 ; xmm2 = i2 * c2
+ pmulhw m1, C(6) ; xmm1 = c6 * i2
+ paddsw m4, m4 ; xmm4 = C + C
+ paddsw m4, m0 ; xmm4 = A + C = C.
+ psubsw m3, m6 ; xmm3 = B - D
+ paddw m5, m7 ; xmm5 = c2 * i6
+ paddsw m6, m6 ; xmm6 = D + D
+ pmulhw m7, C(6) ; xmm7 = c6 * i6
+ paddsw m6, m3 ; xmm6 = B + D = D.
+ movdqa I(1), m4 ; Save C. at I(1)
+ psubsw m1, m5 ; xmm1 = c6 * i2 - c2 * i6 = H
+ movdqa m4, C(4) ; xmm4 = C4
+ movdqa m5, m3 ; xmm5 = B - D
+ pmulhw m3, m4 ; xmm3 = ( c4 -1 ) * ( B - D )
+ paddsw m7, m2 ; xmm7 = c2 * i2 + c6 * i6 = G
+ movdqa I(2), m6 ; save D. at I(2)
+ movdqa m2, m0 ; xmm2 = A - C
+ movdqa m6, I(0) ; xmm6 = i0
+ pmulhw m0, m4 ; xmm0 = ( c4 - 1 ) * ( A - C ) = A.
+ paddw m5, m3 ; xmm5 = c4 * ( B - D ) = B.
+ movdqa m3, I(4) ; xmm3 = i4
+ psubsw m5, m1 ; xmm5 = B. - H = B..
+ paddw m2, m0 ; xmm2 = c4 * ( A - C) = A.
+ psubsw m6, m3 ; xmm6 = i0 - i4
+ movdqa m0, m6 ; xmm0 = i0 - i4
+ pmulhw m6, m4 ; xmm6 = (c4 - 1) * (i0 - i4) = F
+ paddsw m3, m3 ; xmm3 = i4 + i4
+ paddsw m1, m1 ; xmm1 = H + H
+ paddsw m3, m0 ; xmm3 = i0 + i4
+ paddsw m1, m5 ; xmm1 = B. + H = H.
+ pmulhw m4, m3 ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
+ paddw m6, m0 ; xmm6 = c4 * ( i0 - i4 )
+ psubsw m6, m2 ; xmm6 = F - A. = F.
+ paddsw m2, m2 ; xmm2 = A. + A.
+ movdqa m0, I(1) ; Load C. from I(1)
+ paddsw m2, m6 ; xmm2 = F + A. = A..
+ paddw m4, m3 ; xmm4 = c4 * ( i0 + i4 ) = E
+ psubsw m2, m1 ; xmm2 = A.. - H. = R2
+ ADD(m2) ; Adjust R2 and R1 before shifting
+ paddsw m1, m1 ; xmm1 = H. + H.
+ paddsw m1, m2 ; xmm1 = A.. + H. = R1
+ SHIFT(m2) ; xmm2 = op2
+ psubsw m4, m7 ; xmm4 = E - G = E.
+ SHIFT(m1) ; xmm1 = op1
+ movdqa m3, I(2) ; Load D. from I(2)
+ paddsw m7, m7 ; xmm7 = G + G
+ paddsw m7, m4 ; xmm7 = E + G = G.
+ psubsw m4, m3 ; xmm4 = E. - D. = R4
+ ADD(m4) ; Adjust R4 and R3 before shifting
+ paddsw m3, m3 ; xmm3 = D. + D.
+ paddsw m3, m4 ; xmm3 = E. + D. = R3
+ SHIFT(m4) ; xmm4 = op4
+ psubsw m6, m5 ; xmm6 = F. - B..= R6
+ SHIFT(m3) ; xmm3 = op3
+ ADD(m6) ; Adjust R6 and R5 before shifting
+ paddsw m5, m5 ; xmm5 = B.. + B..
+ paddsw m5, m6 ; xmm5 = F. + B.. = R5
+ SHIFT(m6) ; xmm6 = op6
+ SHIFT(m5) ; xmm5 = op5
+ psubsw m7, m0 ; xmm7 = G. - C. = R7
+ ADD(m7) ; Adjust R7 and R0 before shifting
+ paddsw m0, m0 ; xmm0 = C. + C.
+ paddsw m0, m7 ; xmm0 = G. + C.
+ SHIFT(m7) ; xmm7 = op7
+ SHIFT(m0) ; xmm0 = op0
+%endmacro
+
+%macro PUT_BLOCK 8
+ movdqa O(0), m%1
+ movdqa O(1), m%2
+ movdqa O(2), m%3
+ movdqa O(3), m%4
+ movdqa O(4), m%5
+ movdqa O(5), m%6
+ movdqa O(6), m%7
+ movdqa O(7), m%8
+%endmacro
+
+%macro VP3_IDCT 1
+%if mmsize == 16
+%define I(x) [%1+16*x]
+%define O(x) [%1+16*x]
+%define C(x) [vp3_idct_data+16*(x-1)]
+%define SHIFT(x)
+%define ADD(x)
+ VP3_1D_IDCT_SSE2
+%if ARCH_X86_64
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+%else
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]
+%endif
+ PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
+
+%define SHIFT(x) psraw x, 4
+%define ADD(x) paddsw x, [pw_8]
+ VP3_1D_IDCT_SSE2
+ PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
+%else ; mmsize == 8
+ ; eax = quantized input
+ ; ebx = dequantizer matrix
+ ; ecx = IDCT constants
+ ; M(I) = ecx + MaskOffset(0) + I * 8
+ ; C(I) = ecx + CosineOffset(32) + (I-1) * 8
+ ; edx = output
+ ; r0..r7 = mm0..mm7
+%define OC_8 [pw_8]
+%define C(x) [vp3_idct_data+16*(x-1)]
+
+ ; at this point, function has completed dequantization + dezigzag +
+ ; partial transposition; now do the idct itself
+%define I(x) [%1+16*x]
+%define J(x) [%1+16*x]
+ RowIDCT
+ Transpose
+
+%define I(x) [%1+16*x+8]
+%define J(x) [%1+16*x+8]
+ RowIDCT
+ Transpose
+
+%define I(x) [%1+16* x]
+%define J(x) [%1+16*(x-4)+8]
+ ColumnIDCT
+
+%define I(x) [%1+16* x +64]
+%define J(x) [%1+16*(x-4)+72]
+ ColumnIDCT
+%endif ; mmsize == 16/8
+%endmacro
+
+%macro vp3_idct_funcs 0
+cglobal vp3_idct_put, 3, 4, 9
+ VP3_IDCT r2
+
+ movsxdifnidn r1, r1d
+ mova m4, [pb_80]
+ lea r3, [r1*3]
+%assign %%i 0
+%rep 16/mmsize
+ mova m0, [r2+mmsize*0+%%i]
+ mova m1, [r2+mmsize*2+%%i]
+ mova m2, [r2+mmsize*4+%%i]
+ mova m3, [r2+mmsize*6+%%i]
+%if mmsize == 8
+ packsswb m0, [r2+mmsize*8+%%i]
+ packsswb m1, [r2+mmsize*10+%%i]
+ packsswb m2, [r2+mmsize*12+%%i]
+ packsswb m3, [r2+mmsize*14+%%i]
+%else
+ packsswb m0, [r2+mmsize*1+%%i]
+ packsswb m1, [r2+mmsize*3+%%i]
+ packsswb m2, [r2+mmsize*5+%%i]
+ packsswb m3, [r2+mmsize*7+%%i]
+%endif
+ paddb m0, m4
+ paddb m1, m4
+ paddb m2, m4
+ paddb m3, m4
+ movq [r0 ], m0
+%if mmsize == 8
+ movq [r0+r1 ], m1
+ movq [r0+r1*2], m2
+ movq [r0+r3 ], m3
+%else
+ movhps [r0+r1 ], m0
+ movq [r0+r1*2], m1
+ movhps [r0+r3 ], m1
+%endif
+%if %%i == 0
+ lea r0, [r0+r1*4]
+%endif
+%if mmsize == 16
+ movq [r0 ], m2
+ movhps [r0+r1 ], m2
+ movq [r0+r1*2], m3
+ movhps [r0+r3 ], m3
+%endif
+%assign %%i %%i+8
+%endrep
+
+ pxor m0, m0
+%assign %%offset 0
+%rep 128/mmsize
+ mova [r2+%%offset], m0
+%assign %%offset %%offset+mmsize
+%endrep
+ RET
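A hedged note on the store path of vp3_idct_put above (helper name is illustrative): packsswb saturates each 16-bit result to [-128,127], and the following paddb with pb_80 maps that signed range onto unsigned pixels. Per sample this amounts to:

#include <stdint.h>

/* What packsswb + paddb 0x80 compute for one coefficient in the
 * vp3_idct_put store loop. */
static uint8_t vp3_put_pixel(int v)
{
    if (v < -128) v = -128;
    if (v >  127) v =  127;
    return (uint8_t)(v + 128);   /* clamp to [-128,127], then bias to [0,255] */
}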
+
+cglobal vp3_idct_add, 3, 4, 9
+ VP3_IDCT r2
+
+ movsxdifnidn r1, r1d
+ lea r3, [r1*3]
+ pxor m4, m4
+%if mmsize == 16
+%assign %%i 0
+%rep 2
+ movq m0, [r0]
+ movq m1, [r0+r1]
+ movq m2, [r0+r1*2]
+ movq m3, [r0+r3]
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+ paddsw m0, [r2+ 0+%%i]
+ paddsw m1, [r2+16+%%i]
+ paddsw m2, [r2+32+%%i]
+ paddsw m3, [r2+48+%%i]
+ packuswb m0, m1
+ packuswb m2, m3
+ movq [r0 ], m0
+ movhps [r0+r1 ], m0
+ movq [r0+r1*2], m2
+ movhps [r0+r3 ], m2
+%if %%i == 0
+ lea r0, [r0+r1*4]
+%endif
+%assign %%i %%i+64
+%endrep
+%else
+%assign %%i 0
+%rep 2
+ movq m0, [r0]
+ movq m1, [r0+r1]
+ movq m2, [r0+r1*2]
+ movq m3, [r0+r3]
+ movq m5, m0
+ movq m6, m1
+ movq m7, m2
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpckhbw m5, m4
+ punpckhbw m6, m4
+ punpckhbw m7, m4
+ paddsw m0, [r2+ 0+%%i]
+ paddsw m1, [r2+16+%%i]
+ paddsw m2, [r2+32+%%i]
+ paddsw m5, [r2+64+%%i]
+ paddsw m6, [r2+80+%%i]
+ paddsw m7, [r2+96+%%i]
+ packuswb m0, m5
+ movq m5, m3
+ punpcklbw m3, m4
+ punpckhbw m5, m4
+ packuswb m1, m6
+ paddsw m3, [r2+48+%%i]
+ paddsw m5, [r2+112+%%i]
+ packuswb m2, m7
+ packuswb m3, m5
+ movq [r0 ], m0
+ movq [r0+r1 ], m1
+ movq [r0+r1*2], m2
+ movq [r0+r3 ], m3
+%if %%i == 0
+ lea r0, [r0+r1*4]
+%endif
+%assign %%i %%i+8
+%endrep
+%endif
+%assign %%i 0
+%rep 128/mmsize
+ mova [r2+%%i], m4
+%assign %%i %%i+mmsize
+%endrep
+ RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+vp3_idct_funcs
+%endif
+
+INIT_XMM sse2
+vp3_idct_funcs
+
+%macro DC_ADD 0
+ movq m2, [r0 ]
+ movq m3, [r0+r1 ]
+ paddusb m2, m0
+ movq m4, [r0+r1*2]
+ paddusb m3, m0
+ movq m5, [r0+r2 ]
+ paddusb m4, m0
+ paddusb m5, m0
+ psubusb m2, m1
+ psubusb m3, m1
+ movq [r0 ], m2
+ psubusb m4, m1
+ movq [r0+r1 ], m3
+ psubusb m5, m1
+ movq [r0+r1*2], m4
+ movq [r0+r2 ], m5
+%endmacro
+
+INIT_MMX mmxext
+cglobal vp3_idct_dc_add, 3, 4
+%if ARCH_X86_64
+ movsxd r1, r1d
+%endif
+ movsx r3, word [r2]
+ mov word [r2], 0
+ lea r2, [r1*3]
+ add r3, 15
+ sar r3, 5
+ movd m0, r3d
+ pshufw m0, m0, 0x0
+ pxor m1, m1
+ psubw m1, m0
+ packuswb m0, m0
+ packuswb m1, m1
+ DC_ADD
+ lea r0, [r0+r1*4]
+ DC_ADD
+ RET
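A hedged scalar equivalent of vp3_idct_dc_add above (the function name is illustrative): the DC term is rounded with (dc + 15) >> 5, cleared in the block, and added to all 8x8 pixels with clamping; the asm handles both signs in one pass by splitting the value into a paddusb part and a psubusb part.

#include <stdint.h>

static void vp3_idct_dc_add_sketch(uint8_t *dest, int line_size, int16_t *block)
{
    int dc = (block[0] + 15) >> 5;
    block[0] = 0;
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++) {
            int v = dest[x] + dc;
            dest[x] = v < 0 ? 0 : v > 255 ? 255 : v;   /* clamp to [0,255] */
        }
        dest += line_size;
    }
}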
diff --git a/ffmpeg/libavcodec/x86/vp3dsp_init.c b/ffmpeg/libavcodec/x86/vp3dsp_init.c
new file mode 100644
index 0000000..252b40a
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/vp3dsp_init.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2009 David Conrad <lessen42@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/vp3dsp.h"
+#include "config.h"
+
+void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block);
+void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block);
+
+void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, int16_t *block);
+void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, int16_t *block);
+
+void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, int line_size,
+ int16_t *block);
+
+void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride,
+ int *bounding_values);
+void ff_vp3_h_loop_filter_mmxext(uint8_t *src, int stride,
+ int *bounding_values);
+
+#if HAVE_INLINE_ASM
+
+#define MOVQ_BFE(regd) \
+ __asm__ volatile ( \
+ "pcmpeqd %%"#regd", %%"#regd" \n\t" \
+ "paddb %%"#regd", %%"#regd" \n\t" ::)
+
+#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
+ "movq "#rega", "#regr" \n\t" \
+ "movq "#regc", "#regp" \n\t" \
+ "pand "#regb", "#regr" \n\t" \
+ "pand "#regd", "#regp" \n\t" \
+ "pxor "#rega", "#regb" \n\t" \
+ "pxor "#regc", "#regd" \n\t" \
+ "pand %%mm6, "#regb" \n\t" \
+ "pand %%mm6, "#regd" \n\t" \
+ "psrlq $1, "#regb" \n\t" \
+ "psrlq $1, "#regd" \n\t" \
+ "paddb "#regb", "#regr" \n\t" \
+ "paddb "#regd", "#regp" \n\t"
+
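A hedged note on the macro pair above: MOVQ_BFE builds a 0xFE byte mask (all-ones doubled per byte), and PAVGBP_MMX_NO_RND then uses the overflow-free identity for a downward-rounding byte average. Per byte it computes:

#include <stdint.h>

/* Scalar form of the non-rounding average used above:
 * (a + b) >> 1 without ever forming the 9-bit sum. */
static uint8_t avg_no_rnd(uint8_t a, uint8_t b)
{
    return (a & b) + (((a ^ b) & 0xFE) >> 1);
}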
+static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, ptrdiff_t stride, int h)
+{
+// START_TIMER
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%2), %%mm1 \n\t"
+ "movq (%1,%4), %%mm2 \n\t"
+ "movq (%2,%4), %%mm3 \n\t"
+ PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%3) \n\t"
+ "movq %%mm5, (%3,%4) \n\t"
+
+ "movq (%1,%4,2), %%mm0 \n\t"
+ "movq (%2,%4,2), %%mm1 \n\t"
+ "movq (%1,%5), %%mm2 \n\t"
+ "movq (%2,%5), %%mm3 \n\t"
+ "lea (%1,%4,4), %1 \n\t"
+ "lea (%2,%4,4), %2 \n\t"
+ PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%3,%4,2) \n\t"
+ "movq %%mm5, (%3,%5) \n\t"
+ "lea (%3,%4,4), %3 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
+ :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
+ :"memory");
+// STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
+}
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
+{
+ int cpuflags = av_get_cpu_flags();
+
+#if HAVE_INLINE_ASM
+ c->put_no_rnd_pixels_l2 = put_vp_no_rnd_pixels8_l2_mmx;
+#endif /* HAVE_INLINE_ASM */
+
+#if ARCH_X86_32
+ if (EXTERNAL_MMX(cpuflags)) {
+ c->idct_put = ff_vp3_idct_put_mmx;
+ c->idct_add = ff_vp3_idct_add_mmx;
+ }
+#endif
+
+ if (EXTERNAL_MMXEXT(cpuflags)) {
+ c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
+
+ if (!(flags & CODEC_FLAG_BITEXACT)) {
+ c->v_loop_filter = ff_vp3_v_loop_filter_mmxext;
+ c->h_loop_filter = ff_vp3_h_loop_filter_mmxext;
+ }
+ }
+
+ if (EXTERNAL_SSE2(cpuflags)) {
+ c->idct_put = ff_vp3_idct_put_sse2;
+ c->idct_add = ff_vp3_idct_add_sse2;
+ }
+}
diff --git a/ffmpeg/libavcodec/x86/vp56_arith.h b/ffmpeg/libavcodec/x86/vp56_arith.h
new file mode 100644
index 0000000..e71dbf8
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/vp56_arith.h
@@ -0,0 +1,54 @@
+/**
+ * VP5 and VP6 compatible video decoder (arith decoder)
+ *
+ * Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (C) 2010 Eli Friedman
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VP56_ARITH_H
+#define AVCODEC_X86_VP56_ARITH_H
+
+#if HAVE_INLINE_ASM && HAVE_FAST_CMOV
+#define vp56_rac_get_prob vp56_rac_get_prob
+static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob)
+{
+ unsigned int code_word = vp56_rac_renorm(c);
+ unsigned int high = c->high;
+ unsigned int low = 1 + (((high - 1) * prob) >> 8);
+ unsigned int low_shift = low << 16;
+ int bit = 0;
+
+ __asm__(
+ "subl %4, %1 \n\t"
+ "subl %3, %2 \n\t"
+ "leal (%2, %3), %3 \n\t"
+ "setae %b0 \n\t"
+ "cmovb %4, %1 \n\t"
+ "cmovb %3, %2 \n\t"
+ : "+q"(bit), "+r"(high), "+r"(code_word), "+r"(low_shift)
+ : "r"(low)
+ );
+
+ c->high = high;
+ c->code_word = code_word;
+ return bit;
+}
+#endif
+
+#endif /* AVCODEC_X86_VP56_ARITH_H */
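For comparison, a hedged portable sketch of the bit decode the cmov-based asm above implements, assuming the VP56RangeCoder definitions and vp56_rac_renorm() from libavcodec/vp56.h (the function name below is illustrative): split the range at low, then pick the branch the code word falls into without a conditional jump.

static inline int vp56_rac_get_prob_sketch(VP56RangeCoder *c, uint8_t prob)
{
    unsigned int code_word = vp56_rac_renorm(c);
    unsigned int low       = 1 + (((c->high - 1) * prob) >> 8);
    unsigned int low_shift = low << 16;
    int          bit       = code_word >= low_shift;

    c->high      = bit ? c->high - low : low;
    c->code_word = bit ? code_word - low_shift : code_word;
    return bit;
}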
diff --git a/ffmpeg/libavcodec/x86/vp56dsp.asm b/ffmpeg/libavcodec/x86/vp56dsp.asm
new file mode 100644
index 0000000..3d874ea
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/vp56dsp.asm
@@ -0,0 +1,170 @@
+;******************************************************************************
+;* MMX/SSE2-optimized functions for the VP6 decoder
+;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
+;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+cextern pw_64
+
+SECTION .text
+
+%macro DIAG4 6
+%if mmsize == 8
+ movq m0, [%1+%2]
+ movq m1, [%1+%3]
+ movq m3, m0
+ movq m4, m1
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpckhbw m3, m7
+ punpckhbw m4, m7
+ pmullw m0, [rsp+8*11] ; src[x-8 ] * biweight [0]
+ pmullw m1, [rsp+8*12] ; src[x ] * biweight [1]
+ pmullw m3, [rsp+8*11] ; src[x-8 ] * biweight [0]
+ pmullw m4, [rsp+8*12] ; src[x ] * biweight [1]
+ paddw m0, m1
+ paddw m3, m4
+ movq m1, [%1+%4]
+ movq m2, [%1+%5]
+ movq m4, m1
+ movq m5, m2
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpckhbw m4, m7
+ punpckhbw m5, m7
+ pmullw m1, [rsp+8*13] ; src[x+8 ] * biweight [2]
+ pmullw m2, [rsp+8*14] ; src[x+16] * biweight [3]
+ pmullw m4, [rsp+8*13] ; src[x+8 ] * biweight [2]
+ pmullw m5, [rsp+8*14] ; src[x+16] * biweight [3]
+ paddw m1, m2
+ paddw m4, m5
+ paddsw m0, m1
+ paddsw m3, m4
+ paddsw m0, m6 ; Add 64
+ paddsw m3, m6 ; Add 64
+ psraw m0, 7
+ psraw m3, 7
+ packuswb m0, m3
+ movq [%6], m0
+%else ; mmsize == 16
+ movq m0, [%1+%2]
+ movq m1, [%1+%3]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ pmullw m0, m4 ; src[x-8 ] * biweight [0]
+ pmullw m1, m5 ; src[x ] * biweight [1]
+ paddw m0, m1
+ movq m1, [%1+%4]
+ movq m2, [%1+%5]
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ pmullw m1, m6 ; src[x+8 ] * biweight [2]
+ pmullw m2, m3 ; src[x+16] * biweight [3]
+ paddw m1, m2
+ paddsw m0, m1
+ paddsw m0, [pw_64] ; Add 64
+ psraw m0, 7
+ packuswb m0, m0
+ movq [%6], m0
+%endif ; mmsize == 8/16
+%endmacro
+
+%macro SPLAT4REGS 0
+%if mmsize == 8
+ movq m5, m3
+ punpcklwd m3, m3
+ movq m4, m3
+ punpckldq m3, m3
+ punpckhdq m4, m4
+ punpckhwd m5, m5
+ movq m2, m5
+ punpckhdq m2, m2
+ punpckldq m5, m5
+ movq [rsp+8*11], m3
+ movq [rsp+8*12], m4
+ movq [rsp+8*13], m5
+ movq [rsp+8*14], m2
+%else ; mmsize == 16
+ pshuflw m4, m3, 0x0
+ pshuflw m5, m3, 0x55
+ pshuflw m6, m3, 0xAA
+ pshuflw m3, m3, 0xFF
+ punpcklqdq m4, m4
+ punpcklqdq m5, m5
+ punpcklqdq m6, m6
+ punpcklqdq m3, m3
+%endif ; mmsize == 8/16
+%endmacro
+
+%macro vp6_filter_diag4 0
+; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, int stride,
+; const int16_t h_weights[4], const int16_t v_weights[4])
+cglobal vp6_filter_diag4, 5, 7, 8
+ mov r5, rsp ; backup stack pointer
+ and rsp, ~(mmsize-1) ; align stack
+%if mmsize == 16
+ sub rsp, 8*11
+%else
+ sub rsp, 8*15
+ movq m6, [pw_64]
+%endif
+%if ARCH_X86_64
+ movsxd r2, r2d
+%endif
+
+ sub r1, r2
+
+ pxor m7, m7
+ movq m3, [r3]
+ SPLAT4REGS
+
+ mov r3, rsp
+ mov r6, 11
+.nextrow:
+ DIAG4 r1, -1, 0, 1, 2, r3
+ add r3, 8
+ add r1, r2
+ dec r6
+ jnz .nextrow
+
+ movq m3, [r4]
+ SPLAT4REGS
+
+ lea r3, [rsp+8]
+ mov r6, 8
+.nextcol:
+ DIAG4 r3, -8, 0, 8, 16, r0
+ add r3, 8
+ add r0, r2
+ dec r6
+ jnz .nextcol
+
+ mov rsp, r5 ; restore stack pointer
+ RET
+%endmacro
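A hedged scalar sketch of the two-pass structure in vp6_filter_diag4 above (names are illustrative): an 11-row horizontal 4-tap pass into a temporary buffer, then an 8-row vertical 4-tap pass into dst, each tap sum rounded with +64 and shifted right by 7, matching the paddsw pw_64 / psraw 7 sequence in DIAG4.

#include <stdint.h>

static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

static void vp6_filter_diag4_sketch(uint8_t *dst, const uint8_t *src, int stride,
                                    const int16_t h_weights[4],
                                    const int16_t v_weights[4])
{
    uint8_t tmp[11 * 8];
    src -= stride;                               /* the asm does "sub r1, r2" */

    for (int y = 0; y < 11; y++)                 /* horizontal pass */
        for (int x = 0; x < 8; x++) {
            int sum = src[y * stride + x - 1] * h_weights[0]
                    + src[y * stride + x    ] * h_weights[1]
                    + src[y * stride + x + 1] * h_weights[2]
                    + src[y * stride + x + 2] * h_weights[3];
            tmp[y * 8 + x] = clip_u8((sum + 64) >> 7);
        }

    for (int y = 0; y < 8; y++)                  /* vertical pass */
        for (int x = 0; x < 8; x++) {
            int sum = tmp[(y + 0) * 8 + x] * v_weights[0]
                    + tmp[(y + 1) * 8 + x] * v_weights[1]
                    + tmp[(y + 2) * 8 + x] * v_weights[2]
                    + tmp[(y + 3) * 8 + x] * v_weights[3];
            dst[y * stride + x] = clip_u8((sum + 64) >> 7);
        }
}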
+
+%if ARCH_X86_32
+INIT_MMX mmx
+vp6_filter_diag4
+%endif
+
+INIT_XMM sse2
+vp6_filter_diag4
diff --git a/ffmpeg/libavcodec/x86/vp56dsp_init.c b/ffmpeg/libavcodec/x86/vp56dsp_init.c
new file mode 100644
index 0000000..defc63b
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/vp56dsp_init.c
@@ -0,0 +1,48 @@
+/*
+ * VP6 MMX/SSE2 optimizations
+ * Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
+ * Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vp56dsp.h"
+
+void ff_vp6_filter_diag4_mmx(uint8_t *dst, uint8_t *src, int stride,
+ const int16_t *h_weights,const int16_t *v_weights);
+void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, int stride,
+ const int16_t *h_weights,const int16_t *v_weights);
+
+av_cold void ff_vp56dsp_init_x86(VP56DSPContext* c, enum AVCodecID codec)
+{
+ int mm_flags = av_get_cpu_flags();
+
+ if (CONFIG_VP6_DECODER && codec == AV_CODEC_ID_VP6) {
+#if ARCH_X86_32
+ if (EXTERNAL_MMX(mm_flags)) {
+ c->vp6_filter_diag4 = ff_vp6_filter_diag4_mmx;
+ }
+#endif
+
+ if (EXTERNAL_SSE2(mm_flags)) {
+ c->vp6_filter_diag4 = ff_vp6_filter_diag4_sse2;
+ }
+ }
+}
diff --git a/ffmpeg/libavcodec/x86/vp8dsp.asm b/ffmpeg/libavcodec/x86/vp8dsp.asm
new file mode 100644
index 0000000..ca07333
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/vp8dsp.asm
@@ -0,0 +1,2780 @@
+;******************************************************************************
+;* VP8 MMXEXT optimizations
+;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
+;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+fourtap_filter_hw_m: times 4 dw -6, 123
+ times 4 dw 12, -1
+ times 4 dw -9, 93
+ times 4 dw 50, -6
+ times 4 dw -6, 50
+ times 4 dw 93, -9
+ times 4 dw -1, 12
+ times 4 dw 123, -6
+
+sixtap_filter_hw_m: times 4 dw 2, -11
+ times 4 dw 108, 36
+ times 4 dw -8, 1
+ times 4 dw 3, -16
+ times 4 dw 77, 77
+ times 4 dw -16, 3
+ times 4 dw 1, -8
+ times 4 dw 36, 108
+ times 4 dw -11, 2
+
+fourtap_filter_hb_m: times 8 db -6, 123
+ times 8 db 12, -1
+ times 8 db -9, 93
+ times 8 db 50, -6
+ times 8 db -6, 50
+ times 8 db 93, -9
+ times 8 db -1, 12
+ times 8 db 123, -6
+
+sixtap_filter_hb_m: times 8 db 2, 1
+ times 8 db -11, 108
+ times 8 db 36, -8
+ times 8 db 3, 3
+ times 8 db -16, 77
+ times 8 db 77, -16
+ times 8 db 1, 2
+ times 8 db -8, 36
+ times 8 db 108, -11
+
+fourtap_filter_v_m: times 8 dw -6
+ times 8 dw 123
+ times 8 dw 12
+ times 8 dw -1
+ times 8 dw -9
+ times 8 dw 93
+ times 8 dw 50
+ times 8 dw -6
+ times 8 dw -6
+ times 8 dw 50
+ times 8 dw 93
+ times 8 dw -9
+ times 8 dw -1
+ times 8 dw 12
+ times 8 dw 123
+ times 8 dw -6
+
+sixtap_filter_v_m: times 8 dw 2
+ times 8 dw -11
+ times 8 dw 108
+ times 8 dw 36
+ times 8 dw -8
+ times 8 dw 1
+ times 8 dw 3
+ times 8 dw -16
+ times 8 dw 77
+ times 8 dw 77
+ times 8 dw -16
+ times 8 dw 3
+ times 8 dw 1
+ times 8 dw -8
+ times 8 dw 36
+ times 8 dw 108
+ times 8 dw -11
+ times 8 dw 2
+
+bilinear_filter_vw_m: times 8 dw 1
+ times 8 dw 2
+ times 8 dw 3
+ times 8 dw 4
+ times 8 dw 5
+ times 8 dw 6
+ times 8 dw 7
+
+bilinear_filter_vb_m: times 8 db 7, 1
+ times 8 db 6, 2
+ times 8 db 5, 3
+ times 8 db 4, 4
+ times 8 db 3, 5
+ times 8 db 2, 6
+ times 8 db 1, 7
+
+%ifdef PIC
+%define fourtap_filter_hw picregq
+%define sixtap_filter_hw picregq
+%define fourtap_filter_hb picregq
+%define sixtap_filter_hb picregq
+%define fourtap_filter_v picregq
+%define sixtap_filter_v picregq
+%define bilinear_filter_vw picregq
+%define bilinear_filter_vb picregq
+%define npicregs 1
+%else
+%define fourtap_filter_hw fourtap_filter_hw_m
+%define sixtap_filter_hw sixtap_filter_hw_m
+%define fourtap_filter_hb fourtap_filter_hb_m
+%define sixtap_filter_hb sixtap_filter_hb_m
+%define fourtap_filter_v fourtap_filter_v_m
+%define sixtap_filter_v sixtap_filter_v_m
+%define bilinear_filter_vw bilinear_filter_vw_m
+%define bilinear_filter_vb bilinear_filter_vb_m
+%define npicregs 0
+%endif
+
+filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+
+filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
+filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
+filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
+
+pw_27: times 8 dw 27
+pw_63: times 8 dw 63
+pw_256: times 8 dw 256
+pw_20091: times 4 dw 20091
+pw_17734: times 4 dw 17734
+
+pb_4: times 16 db 4
+pb_F8: times 16 db 0xF8
+pb_FE: times 16 db 0xFE
+pb_27_63: times 8 db 27, 63
+pb_18_63: times 8 db 18, 63
+pb_9_63: times 8 db 9, 63
+
+cextern pb_1
+cextern pw_3
+cextern pb_3
+cextern pw_4
+cextern pw_9
+cextern pw_18
+cextern pw_64
+cextern pb_80
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; subpel MC functions:
+;
+; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
+; uint8_t *src, int srcstride,
+; int height, int mx, int my);
+;-----------------------------------------------------------------------------
+
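Before the implementations, a hedged scalar sketch of what one row of the 6-tap horizontal variants computes (names are illustrative): a 6-tap FIR over src[x-2..x+3] using one row of the subpel filter table selected by mx, rounded with +64 and shifted right by 7 (the ssse3 path's pmulhrsw by pw_256 and the mmxext/sse2 paths' paddsw pw_64 / psraw 7 perform equivalent rounding).

#include <stdint.h>

static void epel_h6_row_sketch(uint8_t *dst, const uint8_t *src,
                               const int16_t filt[6], int width)
{
    for (int x = 0; x < width; x++) {
        int sum = 0;
        for (int t = 0; t < 6; t++)
            sum += src[x - 2 + t] * filt[t];
        sum = (sum + 64) >> 7;
        dst[x] = sum < 0 ? 0 : sum > 255 ? 255 : sum;   /* packuswb clamp */
    }
}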
+%macro FILTER_SSSE3 1
+cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
+ lea mxd, [mxq*3]
+ mova m3, [filter_h6_shuf2]
+ mova m4, [filter_h6_shuf3]
+%ifdef PIC
+ lea picregq, [sixtap_filter_hb_m]
+%endif
+ mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
+ mova m6, [sixtap_filter_hb+mxq*8-32]
+ mova m7, [sixtap_filter_hb+mxq*8-16]
+
+.nextrow:
+ movu m0, [srcq-2]
+ mova m1, m0
+ mova m2, m0
+%if mmsize == 8
+; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
+; shuffle with a memory operand
+ punpcklbw m0, [srcq+3]
+%else
+ pshufb m0, [filter_h6_shuf1]
+%endif
+ pshufb m1, m3
+ pshufb m2, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m6
+ pmaddubsw m2, m7
+ paddsw m0, m1
+ paddsw m0, m2
+ pmulhrsw m0, [pw_256]
+ packuswb m0, m0
+ movh [dstq], m0 ; store
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+
+cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
+ shl mxd, 4
+ mova m2, [pw_256]
+ mova m3, [filter_h2_shuf]
+ mova m4, [filter_h4_shuf]
+%ifdef PIC
+ lea picregq, [fourtap_filter_hb_m]
+%endif
+ mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
+ mova m6, [fourtap_filter_hb+mxq]
+
+.nextrow:
+ movu m0, [srcq-1]
+ mova m1, m0
+ pshufb m0, m3
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m6
+ paddsw m0, m1
+ pmulhrsw m0, m2
+ packuswb m0, m0
+ movh [dstq], m0 ; store
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+
+cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+ shl myd, 4
+%ifdef PIC
+ lea picregq, [fourtap_filter_hb_m]
+%endif
+ mova m5, [fourtap_filter_hb+myq-16]
+ mova m6, [fourtap_filter_hb+myq]
+ mova m7, [pw_256]
+
+ ; read 3 lines
+ sub srcq, srcstrideq
+ movh m0, [srcq]
+ movh m1, [srcq+ srcstrideq]
+ movh m2, [srcq+2*srcstrideq]
+ add srcq, srcstrideq
+
+.nextrow:
+ movh m3, [srcq+2*srcstrideq] ; read new row
+ mova m4, m0
+ mova m0, m1
+ punpcklbw m4, m1
+ mova m1, m2
+ punpcklbw m2, m3
+ pmaddubsw m4, m5
+ pmaddubsw m2, m6
+ paddsw m4, m2
+ mova m2, m3
+ pmulhrsw m4, m7
+ packuswb m4, m4
+ movh [dstq], m4
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+
+cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+ lea myd, [myq*3]
+%ifdef PIC
+ lea picregq, [sixtap_filter_hb_m]
+%endif
+ lea myq, [sixtap_filter_hb+myq*8]
+
+ ; read 5 lines
+ sub srcq, srcstrideq
+ sub srcq, srcstrideq
+ movh m0, [srcq]
+ movh m1, [srcq+srcstrideq]
+ movh m2, [srcq+srcstrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ add srcq, srcstrideq
+ movh m3, [srcq]
+ movh m4, [srcq+srcstrideq]
+
+.nextrow:
+ movh m5, [srcq+2*srcstrideq] ; read new row
+ mova m6, m0
+ punpcklbw m6, m5
+ mova m0, m1
+ punpcklbw m1, m2
+ mova m7, m3
+ punpcklbw m7, m4
+ pmaddubsw m6, [myq-48]
+ pmaddubsw m1, [myq-32]
+ pmaddubsw m7, [myq-16]
+ paddsw m6, m1
+ paddsw m6, m7
+ mova m1, m2
+ mova m2, m3
+ pmulhrsw m6, [pw_256]
+ mova m3, m4
+ packuswb m6, m6
+ mova m4, m5
+ movh [dstq], m6
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+%endmacro
+
+INIT_MMX ssse3
+FILTER_SSSE3 4
+INIT_XMM ssse3
+FILTER_SSSE3 8
+
+; 4x4 block, H-only 4-tap filter
+INIT_MMX mmxext
+cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
+ shl mxd, 4
+%ifdef PIC
+ lea picregq, [fourtap_filter_hw_m]
+%endif
+ movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
+ movq mm5, [fourtap_filter_hw+mxq]
+ movq mm7, [pw_64]
+ pxor mm6, mm6
+
+.nextrow:
+ movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels
+
+ ; first set of 2 pixels
+ movq mm2, mm1 ; byte ABCD..
+ punpcklbw mm1, mm6 ; byte->word ABCD
+ pshufw mm0, mm2, 9 ; byte CDEF..
+ punpcklbw mm0, mm6 ; byte->word CDEF
+ pshufw mm3, mm1, 0x94 ; word ABBC
+ pshufw mm1, mm0, 0x94 ; word CDDE
+ pmaddwd mm3, mm4 ; multiply 2px with F0/F1
+ movq mm0, mm1 ; backup for second set of pixels
+ pmaddwd mm1, mm5 ; multiply 2px with F2/F3
+ paddd mm3, mm1 ; finish 1st 2px
+
+ ; second set of 2 pixels, use backup of above
+ punpckhbw mm2, mm6 ; byte->word EFGH
+ pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
+ pshufw mm1, mm2, 0x94 ; word EFFG
+ pmaddwd mm1, mm5 ; multiply 2px with F2/F3
+ paddd mm0, mm1 ; finish 2nd 2px
+
+ ; merge two sets of 2 pixels into one set of 4, round/clip/store
+ packssdw mm3, mm0 ; merge dword->word (4px)
+ paddsw mm3, mm7 ; rounding
+ psraw mm3, 7
+ packuswb mm3, mm6 ; clip and word->bytes
+ movd [dstq], mm3 ; store
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+
+; 4x4 block, H-only 6-tap filter
+INIT_MMX mmxext
+cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
+ lea mxd, [mxq*3]
+%ifdef PIC
+ lea picregq, [sixtap_filter_hw_m]
+%endif
+ movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
+ movq mm5, [sixtap_filter_hw+mxq*8-32]
+ movq mm6, [sixtap_filter_hw+mxq*8-16]
+ movq mm7, [pw_64]
+ pxor mm3, mm3
+
+.nextrow:
+ movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels
+
+ ; first set of 2 pixels
+ movq mm2, mm1 ; byte ABCD..
+ punpcklbw mm1, mm3 ; byte->word ABCD
+ pshufw mm0, mm2, 0x9 ; byte CDEF..
+ punpckhbw mm2, mm3 ; byte->word EFGH
+ punpcklbw mm0, mm3 ; byte->word CDEF
+ pshufw mm1, mm1, 0x94 ; word ABBC
+ pshufw mm2, mm2, 0x94 ; word EFFG
+ pmaddwd mm1, mm4 ; multiply 2px with F0/F1
+ pshufw mm3, mm0, 0x94 ; word CDDE
+ movq mm0, mm3 ; backup for second set of pixels
+ pmaddwd mm3, mm5 ; multiply 2px with F2/F3
+ paddd mm1, mm3 ; add to 1st 2px cache
+ movq mm3, mm2 ; backup for second set of pixels
+ pmaddwd mm2, mm6 ; multiply 2px with F4/F5
+ paddd mm1, mm2 ; finish 1st 2px
+
+ ; second set of 2 pixels, use backup of above
+ movd mm2, [srcq+3] ; byte FGHI (prevent overreads)
+ pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
+ pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
+ paddd mm0, mm3 ; add to 2nd 2px cache
+ pxor mm3, mm3
+ punpcklbw mm2, mm3 ; byte->word FGHI
+ pshufw mm2, mm2, 0xE9 ; word GHHI
+ pmaddwd mm2, mm6 ; multiply 2px with F4/F5
+ paddd mm0, mm2 ; finish 2nd 2px
+
+ ; merge two sets of 2 pixels into one set of 4, round/clip/store
+ packssdw mm1, mm0 ; merge dword->word (4px)
+ paddsw mm1, mm7 ; rounding
+ psraw mm1, 7
+ packuswb mm1, mm3 ; clip and word->bytes
+ movd [dstq], mm1 ; store
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+
+INIT_XMM sse2
+cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
+ shl mxd, 5
+%ifdef PIC
+ lea picregq, [fourtap_filter_v_m]
+%endif
+ lea mxq, [fourtap_filter_v+mxq-32]
+ pxor m7, m7
+ mova m4, [pw_64]
+ mova m5, [mxq+ 0]
+ mova m6, [mxq+16]
+%ifdef m8
+ mova m8, [mxq+32]
+ mova m9, [mxq+48]
+%endif
+.nextrow:
+ movq m0, [srcq-1]
+ movq m1, [srcq-0]
+ movq m2, [srcq+1]
+ movq m3, [srcq+2]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ pmullw m0, m5
+ pmullw m1, m6
+%ifdef m8
+ pmullw m2, m8
+ pmullw m3, m9
+%else
+ pmullw m2, [mxq+32]
+ pmullw m3, [mxq+48]
+%endif
+ paddsw m0, m1
+ paddsw m2, m3
+ paddsw m0, m2
+ paddsw m0, m4
+ psraw m0, 7
+ packuswb m0, m7
+ movh [dstq], m0 ; store
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+
+INIT_XMM sse2
+cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
+ lea mxd, [mxq*3]
+ shl mxd, 4
+%ifdef PIC
+ lea picregq, [sixtap_filter_v_m]
+%endif
+ lea mxq, [sixtap_filter_v+mxq-96]
+ pxor m7, m7
+ mova m6, [pw_64]
+%ifdef m8
+ mova m8, [mxq+ 0]
+ mova m9, [mxq+16]
+ mova m10, [mxq+32]
+ mova m11, [mxq+48]
+ mova m12, [mxq+64]
+ mova m13, [mxq+80]
+%endif
+.nextrow:
+ movq m0, [srcq-2]
+ movq m1, [srcq-1]
+ movq m2, [srcq-0]
+ movq m3, [srcq+1]
+ movq m4, [srcq+2]
+ movq m5, [srcq+3]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ punpcklbw m4, m7
+ punpcklbw m5, m7
+%ifdef m8
+ pmullw m0, m8
+ pmullw m1, m9
+ pmullw m2, m10
+ pmullw m3, m11
+ pmullw m4, m12
+ pmullw m5, m13
+%else
+ pmullw m0, [mxq+ 0]
+ pmullw m1, [mxq+16]
+ pmullw m2, [mxq+32]
+ pmullw m3, [mxq+48]
+ pmullw m4, [mxq+64]
+ pmullw m5, [mxq+80]
+%endif
+ paddsw m1, m4
+ paddsw m0, m5
+ paddsw m1, m2
+ paddsw m0, m3
+ paddsw m0, m1
+ paddsw m0, m6
+ psraw m0, 7
+ packuswb m0, m7
+ movh [dstq], m0 ; store
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+
+%macro FILTER_V 1
+; 4x4 block, V-only 4-tap filter
+cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+ shl myd, 5
+%ifdef PIC
+ lea picregq, [fourtap_filter_v_m]
+%endif
+ lea myq, [fourtap_filter_v+myq-32]
+ mova m6, [pw_64]
+ pxor m7, m7
+ mova m5, [myq+48]
+
+ ; read 3 lines
+ sub srcq, srcstrideq
+ movh m0, [srcq]
+ movh m1, [srcq+ srcstrideq]
+ movh m2, [srcq+2*srcstrideq]
+ add srcq, srcstrideq
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+
+.nextrow:
+ ; first calculate negative taps (to prevent losing positive overflows)
+ movh m4, [srcq+2*srcstrideq] ; read new row
+ punpcklbw m4, m7
+ mova m3, m4
+ pmullw m0, [myq+0]
+ pmullw m4, m5
+ paddsw m4, m0
+
+ ; then calculate positive taps
+ mova m0, m1
+ pmullw m1, [myq+16]
+ paddsw m4, m1
+ mova m1, m2
+ pmullw m2, [myq+32]
+ paddsw m4, m2
+ mova m2, m3
+
+ ; round/clip/store
+ paddsw m4, m6
+ psraw m4, 7
+ packuswb m4, m7
+ movh [dstq], m4
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+
+
+; 4x4 block, V-only 6-tap filter
+cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+ shl myd, 4
+ lea myq, [myq*3]
+%ifdef PIC
+ lea picregq, [sixtap_filter_v_m]
+%endif
+ lea myq, [sixtap_filter_v+myq-96]
+ pxor m7, m7
+
+ ; read 5 lines
+ sub srcq, srcstrideq
+ sub srcq, srcstrideq
+ movh m0, [srcq]
+ movh m1, [srcq+srcstrideq]
+ movh m2, [srcq+srcstrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ add srcq, srcstrideq
+ movh m3, [srcq]
+ movh m4, [srcq+srcstrideq]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ punpcklbw m4, m7
+
+.nextrow:
+ ; first calculate negative taps (to prevent losing positive overflows)
+ mova m5, m1
+ pmullw m5, [myq+16]
+ mova m6, m4
+ pmullw m6, [myq+64]
+ paddsw m6, m5
+
+ ; then calculate positive taps
+ movh m5, [srcq+2*srcstrideq] ; read new row
+ punpcklbw m5, m7
+ pmullw m0, [myq+0]
+ paddsw m6, m0
+ mova m0, m1
+ mova m1, m2
+ pmullw m2, [myq+32]
+ paddsw m6, m2
+ mova m2, m3
+ pmullw m3, [myq+48]
+ paddsw m6, m3
+ mova m3, m4
+ mova m4, m5
+ pmullw m5, [myq+80]
+ paddsw m6, m5
+
+ ; round/clip/store
+ paddsw m6, [pw_64]
+ psraw m6, 7
+ packuswb m6, m7
+ movh [dstq], m6
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+FILTER_V 4
+INIT_XMM sse2
+FILTER_V 8
+
+%macro FILTER_BILINEAR 1
+cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
+ shl myd, 4
+%ifdef PIC
+ lea picregq, [bilinear_filter_vw_m]
+%endif
+ pxor m6, m6
+ mova m5, [bilinear_filter_vw+myq-1*16]
+ neg myq
+ mova m4, [bilinear_filter_vw+myq+7*16]
+.nextrow:
+ movh m0, [srcq+srcstrideq*0]
+ movh m1, [srcq+srcstrideq*1]
+ movh m3, [srcq+srcstrideq*2]
+ punpcklbw m0, m6
+ punpcklbw m1, m6
+ punpcklbw m3, m6
+ mova m2, m1
+ pmullw m0, m4
+ pmullw m1, m5
+ pmullw m2, m4
+ pmullw m3, m5
+ paddsw m0, m1
+ paddsw m2, m3
+ psraw m0, 2
+ psraw m2, 2
+ pavgw m0, m6
+ pavgw m2, m6
+%if mmsize == 8
+ packuswb m0, m0
+ packuswb m2, m2
+ movh [dstq+dststrideq*0], m0
+ movh [dstq+dststrideq*1], m2
+%else
+ packuswb m0, m2
+ movh [dstq+dststrideq*0], m0
+ movhps [dstq+dststrideq*1], m0
+%endif
+
+ lea dstq, [dstq+dststrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ sub heightd, 2
+ jg .nextrow
+ REP_RET
+
+cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
+ shl mxd, 4
+%ifdef PIC
+ lea picregq, [bilinear_filter_vw_m]
+%endif
+ pxor m6, m6
+ mova m5, [bilinear_filter_vw+mxq-1*16]
+ neg mxq
+ mova m4, [bilinear_filter_vw+mxq+7*16]
+.nextrow:
+ movh m0, [srcq+srcstrideq*0+0]
+ movh m1, [srcq+srcstrideq*0+1]
+ movh m2, [srcq+srcstrideq*1+0]
+ movh m3, [srcq+srcstrideq*1+1]
+ punpcklbw m0, m6
+ punpcklbw m1, m6
+ punpcklbw m2, m6
+ punpcklbw m3, m6
+ pmullw m0, m4
+ pmullw m1, m5
+ pmullw m2, m4
+ pmullw m3, m5
+ paddsw m0, m1
+ paddsw m2, m3
+ psraw m0, 2
+ psraw m2, 2
+ pavgw m0, m6
+ pavgw m2, m6
+%if mmsize == 8
+ packuswb m0, m0
+ packuswb m2, m2
+ movh [dstq+dststrideq*0], m0
+ movh [dstq+dststrideq*1], m2
+%else
+ packuswb m0, m2
+ movh [dstq+dststrideq*0], m0
+ movhps [dstq+dststrideq*1], m0
+%endif
+
+ lea dstq, [dstq+dststrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ sub heightd, 2
+ jg .nextrow
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+FILTER_BILINEAR 4
+INIT_XMM sse2
+FILTER_BILINEAR 8
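A hedged note on the rounding trick used by FILTER_BILINEAR above: the result is (a*(8-frac) + b*frac + 4) >> 3, and the asm obtains the "+4, >>3" by shifting right by 2 (psraw) and then averaging with zero (pavgw), which contributes the final rounded halving. An illustrative scalar form:

#include <stdint.h>

/* frac is the subpel phase in 1..7; a and b are the two source samples. */
static uint8_t bilin_sketch(uint8_t a, uint8_t b, int frac)
{
    return (uint8_t)((a * (8 - frac) + b * frac + 4) >> 3);
}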
+
+%macro FILTER_BILINEAR_SSSE3 1
+cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
+ shl myd, 4
+%ifdef PIC
+ lea picregq, [bilinear_filter_vb_m]
+%endif
+ pxor m4, m4
+ mova m3, [bilinear_filter_vb+myq-16]
+.nextrow:
+ movh m0, [srcq+srcstrideq*0]
+ movh m1, [srcq+srcstrideq*1]
+ movh m2, [srcq+srcstrideq*2]
+ punpcklbw m0, m1
+ punpcklbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+ pavgw m0, m4
+ pavgw m1, m4
+%if mmsize==8
+ packuswb m0, m0
+ packuswb m1, m1
+ movh [dstq+dststrideq*0], m0
+ movh [dstq+dststrideq*1], m1
+%else
+ packuswb m0, m1
+ movh [dstq+dststrideq*0], m0
+ movhps [dstq+dststrideq*1], m0
+%endif
+
+ lea dstq, [dstq+dststrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ sub heightd, 2
+ jg .nextrow
+ REP_RET
+
+cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
+ shl mxd, 4
+%ifdef PIC
+ lea picregq, [bilinear_filter_vb_m]
+%endif
+ pxor m4, m4
+ mova m2, [filter_h2_shuf]
+ mova m3, [bilinear_filter_vb+mxq-16]
+.nextrow:
+ movu m0, [srcq+srcstrideq*0]
+ movu m1, [srcq+srcstrideq*1]
+ pshufb m0, m2
+ pshufb m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+ pavgw m0, m4
+ pavgw m1, m4
+%if mmsize==8
+ packuswb m0, m0
+ packuswb m1, m1
+ movh [dstq+dststrideq*0], m0
+ movh [dstq+dststrideq*1], m1
+%else
+ packuswb m0, m1
+ movh [dstq+dststrideq*0], m0
+ movhps [dstq+dststrideq*1], m0
+%endif
+
+ lea dstq, [dstq+dststrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ sub heightd, 2
+ jg .nextrow
+ REP_RET
+%endmacro
+
+INIT_MMX ssse3
+FILTER_BILINEAR_SSSE3 4
+INIT_XMM ssse3
+FILTER_BILINEAR_SSSE3 8
+
+INIT_MMX mmx
+cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
+.nextrow:
+ movq mm0, [srcq+srcstrideq*0]
+ movq mm1, [srcq+srcstrideq*1]
+ lea srcq, [srcq+srcstrideq*2]
+ movq [dstq+dststrideq*0], mm0
+ movq [dstq+dststrideq*1], mm1
+ lea dstq, [dstq+dststrideq*2]
+ sub heightd, 2
+ jg .nextrow
+ REP_RET
+
+%if ARCH_X86_32
+INIT_MMX mmx
+cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
+.nextrow:
+ movq mm0, [srcq+srcstrideq*0+0]
+ movq mm1, [srcq+srcstrideq*0+8]
+ movq mm2, [srcq+srcstrideq*1+0]
+ movq mm3, [srcq+srcstrideq*1+8]
+ lea srcq, [srcq+srcstrideq*2]
+ movq [dstq+dststrideq*0+0], mm0
+ movq [dstq+dststrideq*0+8], mm1
+ movq [dstq+dststrideq*1+0], mm2
+ movq [dstq+dststrideq*1+8], mm3
+ lea dstq, [dstq+dststrideq*2]
+ sub heightd, 2
+ jg .nextrow
+ REP_RET
+%endif
+
+INIT_XMM sse
+cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
+.nextrow:
+ movups xmm0, [srcq+srcstrideq*0]
+ movups xmm1, [srcq+srcstrideq*1]
+ lea srcq, [srcq+srcstrideq*2]
+ movaps [dstq+dststrideq*0], xmm0
+ movaps [dstq+dststrideq*1], xmm1
+ lea dstq, [dstq+dststrideq*2]
+ sub heightd, 2
+ jg .nextrow
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
+;-----------------------------------------------------------------------------
+
+%macro ADD_DC 4
+ %4 m2, [dst1q+%3]
+ %4 m3, [dst1q+strideq+%3]
+ %4 m4, [dst2q+%3]
+ %4 m5, [dst2q+strideq+%3]
+ paddusb m2, %1
+ paddusb m3, %1
+ paddusb m4, %1
+ paddusb m5, %1
+ psubusb m2, %2
+ psubusb m3, %2
+ psubusb m4, %2
+ psubusb m5, %2
+ %4 [dst1q+%3], m2
+ %4 [dst1q+strideq+%3], m3
+ %4 [dst2q+%3], m4
+ %4 [dst2q+strideq+%3], m5
+%endmacro
+
+INIT_MMX mmx
+cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
+ ; load data
+ movd m0, [blockq]
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ pxor m1, m1
+ psraw m0, 3
+ movd [blockq], m1
+ psubw m1, m0
+ packuswb m0, m0
+ packuswb m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+ punpcklwd m0, m0
+ punpcklwd m1, m1
+
+ ; add DC
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
+ ADD_DC m0, m1, 0, movh
+ RET
+
+INIT_XMM sse4
+cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
+ ; load data
+ movd m0, [blockq]
+ pxor m1, m1
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ movd [blockq], m1
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
+ movd m2, [dst1q]
+ movd m3, [dst1q+strideq]
+ movd m4, [dst2q]
+ movd m5, [dst2q+strideq]
+ psraw m0, 3
+ pshuflw m0, m0, 0
+ punpcklqdq m0, m0
+ punpckldq m2, m3
+ punpckldq m4, m5
+ punpcklbw m2, m1
+ punpcklbw m4, m1
+ paddw m2, m0
+ paddw m4, m0
+ packuswb m2, m4
+ movd [dst1q], m2
+ pextrd [dst1q+strideq], m2, 1
+ pextrd [dst2q], m2, 2
+ pextrd [dst2q+strideq], m2, 3
+ RET
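A hedged scalar equivalent of vp8_idct_dc_add above (the function name is illustrative): the DC is rounded with (block[0] + 4) >> 3, cleared, and added to the 4x4 block with clamping; the mmx version splits the value into paddusb/psubusb halves, while the sse4 version widens to words and relies on packuswb.

#include <stdint.h>

static void vp8_idct_dc_add_sketch(uint8_t *dst, int16_t block[16], int stride)
{
    int dc = (block[0] + 4) >> 3;
    block[0] = 0;
    for (int y = 0; y < 4; y++) {
        for (int x = 0; x < 4; x++) {
            int v = dst[x] + dc;
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;   /* clamp to [0,255] */
        }
        dst += stride;
    }
}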
+
+;-----------------------------------------------------------------------------
+; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
+;-----------------------------------------------------------------------------
+
+%if ARCH_X86_32
+INIT_MMX mmx
+cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
+ ; load data
+ movd m0, [blockq+32*0] ; A
+ movd m1, [blockq+32*2] ; C
+ punpcklwd m0, [blockq+32*1] ; A B
+ punpcklwd m1, [blockq+32*3] ; C D
+ punpckldq m0, m1 ; A B C D
+ pxor m6, m6
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ movd [blockq+32*0], m6
+ movd [blockq+32*1], m6
+ movd [blockq+32*2], m6
+ movd [blockq+32*3], m6
+ psraw m0, 3
+ psubw m6, m0
+ packuswb m0, m0
+ packuswb m6, m6
+ punpcklbw m0, m0 ; AABBCCDD
+ punpcklbw m6, m6 ; AABBCCDD
+ movq m1, m0
+ movq m7, m6
+ punpcklbw m0, m0 ; AAAABBBB
+ punpckhbw m1, m1 ; CCCCDDDD
+ punpcklbw m6, m6 ; AAAABBBB
+ punpckhbw m7, m7 ; CCCCDDDD
+
+ ; add DC
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
+ ADD_DC m0, m6, 0, mova
+ ADD_DC m1, m7, 8, mova
+ RET
+%endif
+
+INIT_XMM sse2
+cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
+ ; load data
+ movd m0, [blockq+32*0] ; A
+ movd m1, [blockq+32*2] ; C
+ punpcklwd m0, [blockq+32*1] ; A B
+ punpcklwd m1, [blockq+32*3] ; C D
+ punpckldq m0, m1 ; A B C D
+ pxor m1, m1
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ movd [blockq+32*0], m1
+ movd [blockq+32*1], m1
+ movd [blockq+32*2], m1
+ movd [blockq+32*3], m1
+ psraw m0, 3
+ psubw m1, m0
+ packuswb m0, m0
+ packuswb m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+
+ ; add DC
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
+ ADD_DC m0, m1, 0, mova
+ RET
+
+;-----------------------------------------------------------------------------
+; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
+;-----------------------------------------------------------------------------
+
+INIT_MMX mmx
+cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
+ ; load data
+ movd m0, [blockq+32*0] ; A
+ movd m1, [blockq+32*2] ; C
+ punpcklwd m0, [blockq+32*1] ; A B
+ punpcklwd m1, [blockq+32*3] ; C D
+ punpckldq m0, m1 ; A B C D
+ pxor m6, m6
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ movd [blockq+32*0], m6
+ movd [blockq+32*1], m6
+ movd [blockq+32*2], m6
+ movd [blockq+32*3], m6
+ psraw m0, 3
+ psubw m6, m0
+ packuswb m0, m0
+ packuswb m6, m6
+ punpcklbw m0, m0 ; AABBCCDD
+ punpcklbw m6, m6 ; AABBCCDD
+ movq m1, m0
+ movq m7, m6
+ punpcklbw m0, m0 ; AAAABBBB
+ punpckhbw m1, m1 ; CCCCDDDD
+ punpcklbw m6, m6 ; AAAABBBB
+ punpckhbw m7, m7 ; CCCCDDDD
+
+ ; add DC
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+strideq*2]
+ ADD_DC m0, m6, 0, mova
+ lea dst1q, [dst1q+strideq*4]
+ lea dst2q, [dst2q+strideq*4]
+ ADD_DC m1, m7, 0, mova
+ RET
+
+;-----------------------------------------------------------------------------
+; void vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
+;-----------------------------------------------------------------------------
+
+; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
+; this macro assumes that m6/m7 have words for 20091/17734 loaded
+%macro VP8_MULTIPLY_SUMSUB 4
+ mova %3, %1
+ mova %4, %2
+ pmulhw %3, m6 ;20091(1)
+ pmulhw %4, m6 ;20091(2)
+ paddw %3, %1
+ paddw %4, %2
+ paddw %1, %1
+ paddw %2, %2
+ pmulhw %1, m7 ;35468(1)
+ pmulhw %2, m7 ;35468(2)
+ psubw %1, %4
+ paddw %2, %3
+%endmacro
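A hedged sketch of the fixed-point helpers the comment above refers to (20091 and 35468 are the VP8 IDCT constants; the pw_17734 table entry is 35468/2 because the input is doubled with paddw before the pmulhw):

/* Approximate scalar forms, ignoring the 16-bit wraparound of the
 * intermediate paddw (illustrative only). */
static int mul_20091(int v) { return v + ((v * 20091) >> 16); }  /* ~1.30656*v */
static int mul_35468(int v) { return (v * 35468) >> 16; }        /* ~0.54120*v */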
+
+; calculate x0=%1+%3; x1=%1-%3
+; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
+; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
+; %5/%6 are temporary registers
+; we assume m6/m7 have constant words 20091/17734 loaded in them
+%macro VP8_IDCT_TRANSFORM4x4_1D 6
+ SUMSUB_BA w, %3, %1, %5 ;t0, t1
+ VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
+ SUMSUB_BA w, %4, %3, %5 ;tmp0, tmp3
+ SUMSUB_BA w, %2, %1, %5 ;tmp1, tmp2
+ SWAP %4, %1
+ SWAP %4, %3
+%endmacro
+
+%macro VP8_IDCT_ADD 0
+cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
+ ; load block data
+ movq m0, [blockq+ 0]
+ movq m1, [blockq+ 8]
+ movq m2, [blockq+16]
+ movq m3, [blockq+24]
+ movq m6, [pw_20091]
+ movq m7, [pw_17734]
+%if cpuflag(sse)
+ xorps xmm0, xmm0
+ movaps [blockq+ 0], xmm0
+ movaps [blockq+16], xmm0
+%else
+ pxor m4, m4
+ movq [blockq+ 0], m4
+ movq [blockq+ 8], m4
+ movq [blockq+16], m4
+ movq [blockq+24], m4
+%endif
+
+ ; actual IDCT
+ VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ paddw m0, [pw_4]
+ VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+
+ ; store
+ pxor m4, m4
+ DEFINE_ARGS dst1, dst2, stride
+ lea dst2q, [dst1q+2*strideq]
+ STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
+ STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
+
+ RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+VP8_IDCT_ADD
+%endif
+INIT_MMX sse
+VP8_IDCT_ADD
+
+;-----------------------------------------------------------------------------
+; void vp8_luma_dc_wht_<opt>(int16_t block[4][4][16], int16_t dc[16])
+;-----------------------------------------------------------------------------
+
+%macro SCATTER_WHT 3
+ movd dc1d, m%1
+ movd dc2d, m%2
+ mov [blockq+2*16*(0+%3)], dc1w
+ mov [blockq+2*16*(1+%3)], dc2w
+ shr dc1d, 16
+ shr dc2d, 16
+ psrlq m%1, 32
+ psrlq m%2, 32
+ mov [blockq+2*16*(4+%3)], dc1w
+ mov [blockq+2*16*(5+%3)], dc2w
+ movd dc1d, m%1
+ movd dc2d, m%2
+ mov [blockq+2*16*(8+%3)], dc1w
+ mov [blockq+2*16*(9+%3)], dc2w
+ shr dc1d, 16
+ shr dc2d, 16
+ mov [blockq+2*16*(12+%3)], dc1w
+ mov [blockq+2*16*(13+%3)], dc2w
+%endmacro
+
+%macro HADAMARD4_1D 4
+ SUMSUB_BADC w, %2, %1, %4, %3
+ SUMSUB_BADC w, %4, %2, %3, %1
+ SWAP %1, %4, %3
+%endmacro
+
+%macro VP8_DC_WHT 0
+cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
+ movq m0, [dc1q]
+ movq m1, [dc1q+8]
+ movq m2, [dc1q+16]
+ movq m3, [dc1q+24]
+%if cpuflag(sse)
+ xorps xmm0, xmm0
+ movaps [dc1q+ 0], xmm0
+ movaps [dc1q+16], xmm0
+%else
+ pxor m4, m4
+ movq [dc1q+ 0], m4
+ movq [dc1q+ 8], m4
+ movq [dc1q+16], m4
+ movq [dc1q+24], m4
+%endif
+ HADAMARD4_1D 0, 1, 2, 3
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ paddw m0, [pw_3]
+ HADAMARD4_1D 0, 1, 2, 3
+ psraw m0, 3
+ psraw m1, 3
+ psraw m2, 3
+ psraw m3, 3
+ SCATTER_WHT 0, 1, 0
+ SCATTER_WHT 2, 3, 2
+ RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+VP8_DC_WHT
+%endif
+INIT_MMX sse
+VP8_DC_WHT
+
+;-----------------------------------------------------------------------------
+; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
+;-----------------------------------------------------------------------------
+
+; macro called with 7 mm register indexes as argument, and 4 regular registers
+;
+; first 4 mm registers will carry the transposed pixel data
+; the other three are scratchspace (one would be sufficient, but this allows
+; for more spreading/pipelining and thus faster execution on OOE CPUs)
+;
+; first two regular registers are buf+4*stride and buf+5*stride
+; third is -stride, fourth is +stride
+%macro READ_8x4_INTERLEAVED 11
+ ; interleave 8 (A-H) rows of 4 pixels each
+ movd m%1, [%8+%10*4] ; A0-3
+ movd m%5, [%9+%10*4] ; B0-3
+ movd m%2, [%8+%10*2] ; C0-3
+ movd m%6, [%8+%10] ; D0-3
+ movd m%3, [%8] ; E0-3
+ movd m%7, [%9] ; F0-3
+ movd m%4, [%9+%11] ; G0-3
+ punpcklbw m%1, m%5 ; A/B interleaved
+ movd m%5, [%9+%11*2] ; H0-3
+ punpcklbw m%2, m%6 ; C/D interleaved
+ punpcklbw m%3, m%7 ; E/F interleaved
+ punpcklbw m%4, m%5 ; G/H interleaved
+%endmacro
+
+; macro called with 7 mm register indexes as argument, and 5 regular registers
+; first 11 mean the same as READ_8x4_INTERLEAVED above
+; fifth regular register is scratchspace to reach the bottom 8 rows, it
+; will be set to second regular register + 8*stride at the end
+%macro READ_16x4_INTERLEAVED 12
+ ; transpose 16 (A-P) rows of 4 pixels each
+ lea %12, [r0+8*r2]
+
+ ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
+ movd m%1, [%8+%10*4] ; A0-3
+ movd m%3, [%12+%10*4] ; I0-3
+ movd m%2, [%8+%10*2] ; C0-3
+ movd m%4, [%12+%10*2] ; K0-3
+ movd m%6, [%8+%10] ; D0-3
+ movd m%5, [%12+%10] ; L0-3
+ movd m%7, [%12] ; M0-3
+ add %12, %11
+ punpcklbw m%1, m%3 ; A/I
+ movd m%3, [%8] ; E0-3
+ punpcklbw m%2, m%4 ; C/K
+ punpcklbw m%6, m%5 ; D/L
+ punpcklbw m%3, m%7 ; E/M
+ punpcklbw m%2, m%6 ; C/D/K/L interleaved
+
+ ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
+ movd m%5, [%9+%10*4] ; B0-3
+ movd m%4, [%12+%10*4] ; J0-3
+ movd m%7, [%9] ; F0-3
+ movd m%6, [%12] ; N0-3
+ punpcklbw m%5, m%4 ; B/J
+ punpcklbw m%7, m%6 ; F/N
+ punpcklbw m%1, m%5 ; A/B/I/J interleaved
+ punpcklbw m%3, m%7 ; E/F/M/N interleaved
+ movd m%4, [%9+%11] ; G0-3
+ movd m%6, [%12+%11] ; O0-3
+ movd m%5, [%9+%11*2] ; H0-3
+ movd m%7, [%12+%11*2] ; P0-3
+ punpcklbw m%4, m%6 ; G/O
+ punpcklbw m%5, m%7 ; H/P
+ punpcklbw m%4, m%5 ; G/H/O/P interleaved
+%endmacro
+
+; write 4 mm registers of 2 dwords each
+; first four arguments are mm register indexes containing source data
+; last four are registers containing buf+4*stride, buf+5*stride,
+; -stride and +stride
+%macro WRITE_4x2D 8
+ ; write out (2 dwords per register)
+ movd [%5+%7*4], m%1
+ movd [%5+%7*2], m%2
+ movd [%5], m%3
+ movd [%6+%8], m%4
+ punpckhdq m%1, m%1
+ punpckhdq m%2, m%2
+ punpckhdq m%3, m%3
+ punpckhdq m%4, m%4
+ movd [%6+%7*4], m%1
+ movd [%5+%7], m%2
+ movd [%6], m%3
+ movd [%6+%8*2], m%4
+%endmacro
+
+; write 4 xmm registers of 4 dwords each
+; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
+; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
+; we add 1*stride to the third regular register in the process
+; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
+; same memory region), or 8 if they cover two separate buffers (third one points to
+; a different memory region than the first two), allowing for more optimal code for
+; the 16-width case
+%macro WRITE_4x4D 10
+ ; write out (4 dwords per register), start with dwords zero
+ movd [%5+%8*4], m%1
+ movd [%5], m%2
+ movd [%7+%8*4], m%3
+ movd [%7], m%4
+
+ ; store dwords 1
+ psrldq m%1, 4
+ psrldq m%2, 4
+ psrldq m%3, 4
+ psrldq m%4, 4
+ movd [%6+%8*4], m%1
+ movd [%6], m%2
+%if %10 == 16
+ movd [%6+%9*4], m%3
+%endif
+ movd [%7+%9], m%4
+
+ ; write dwords 2
+ psrldq m%1, 4
+ psrldq m%2, 4
+%if %10 == 8
+ movd [%5+%8*2], m%1
+ movd %5d, m%3
+%endif
+ psrldq m%3, 4
+ psrldq m%4, 4
+%if %10 == 16
+ movd [%5+%8*2], m%1
+%endif
+ movd [%6+%9], m%2
+ movd [%7+%8*2], m%3
+ movd [%7+%9*2], m%4
+ add %7, %9
+
+ ; store dwords 3
+ psrldq m%1, 4
+ psrldq m%2, 4
+ psrldq m%3, 4
+ psrldq m%4, 4
+%if %10 == 8
+ mov [%7+%8*4], %5d
+ movd [%6+%8*2], m%1
+%else
+ movd [%5+%8], m%1
+%endif
+ movd [%6+%9*2], m%2
+ movd [%7+%8*2], m%3
+ movd [%7+%9*2], m%4
+%endmacro
+
+; write 4 or 8 words in the mmx/xmm registers as 8 lines
+; 1 and 2 are the registers to write, this can be the same (for SSE2)
+; for pre-SSE4:
+; 3 is a general-purpose register that we will clobber
+; for SSE4:
+; 3 is a pointer to the destination's 5th line
+; 4 is a pointer to the destination's 4th line
+; 5/6 is -stride and +stride
+%macro WRITE_2x4W 6
+ movd %3d, %1
+ punpckhdq %1, %1
+ mov [%4+%5*4], %3w
+ shr %3, 16
+ add %4, %6
+ mov [%4+%5*4], %3w
+
+ movd %3d, %1
+ add %4, %5
+ mov [%4+%5*2], %3w
+ shr %3, 16
+ mov [%4+%5 ], %3w
+
+ movd %3d, %2
+ punpckhdq %2, %2
+ mov [%4 ], %3w
+ shr %3, 16
+ mov [%4+%6 ], %3w
+
+ movd %3d, %2
+ add %4, %6
+ mov [%4+%6 ], %3w
+ shr %3, 16
+ mov [%4+%6*2], %3w
+ add %4, %5
+%endmacro
+
+%macro WRITE_8W 5
+%if cpuflag(sse4)
+ pextrw [%3+%4*4], %1, 0
+ pextrw [%2+%4*4], %1, 1
+ pextrw [%3+%4*2], %1, 2
+ pextrw [%3+%4 ], %1, 3
+ pextrw [%3 ], %1, 4
+ pextrw [%2 ], %1, 5
+ pextrw [%2+%5 ], %1, 6
+ pextrw [%2+%5*2], %1, 7
+%else
+ movd %2d, %1
+ psrldq %1, 4
+ mov [%3+%4*4], %2w
+ shr %2, 16
+ add %3, %5
+ mov [%3+%4*4], %2w
+
+ movd %2d, %1
+ psrldq %1, 4
+ add %3, %4
+ mov [%3+%4*2], %2w
+ shr %2, 16
+ mov [%3+%4 ], %2w
+
+ movd %2d, %1
+ psrldq %1, 4
+ mov [%3 ], %2w
+ shr %2, 16
+ mov [%3+%5 ], %2w
+
+ movd %2d, %1
+ add %3, %5
+ mov [%3+%5 ], %2w
+ shr %2, 16
+ mov [%3+%5*2], %2w
+%endif
+%endmacro
+
+%macro SIMPLE_LOOPFILTER 2
+cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
+%if mmsize == 8 ; mmx/mmxext
+ mov cntrq, 2
+%endif
+%if cpuflag(ssse3)
+ pxor m0, m0
+%endif
+ SPLATB_REG m7, flim, m0 ; splat "flim" into register
+
+ ; set up indexes to address 4 rows
+%if mmsize == 8
+ DEFINE_ARGS dst1, mstride, stride, cntr, dst2
+%else
+ DEFINE_ARGS dst1, mstride, stride, dst3, dst2
+%endif
+ mov strideq, mstrideq
+ neg mstrideq
+%ifidn %1, h
+ lea dst1q, [dst1q+4*strideq-2]
+%endif
+
+%if mmsize == 8 ; mmx / mmxext
+.next8px:
+%endif
+%ifidn %1, v
+ ; read 4 half/full rows of pixels
+ mova m0, [dst1q+mstrideq*2] ; p1
+ mova m1, [dst1q+mstrideq] ; p0
+ mova m2, [dst1q] ; q0
+ mova m3, [dst1q+ strideq] ; q1
+%else ; h
+ lea dst2q, [dst1q+ strideq]
+
+%if mmsize == 8 ; mmx/mmxext
+ READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
+%else ; sse2
+ READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
+%endif
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+%endif
+
+ ; simple_limit
+ mova m5, m2 ; m5=backup of q0
+ mova m6, m1 ; m6=backup of p0
+ psubusb m1, m2 ; p0-q0
+ psubusb m2, m6 ; q0-p0
+ por m1, m2 ; FFABS(p0-q0)
+ paddusb m1, m1 ; m1=FFABS(p0-q0)*2
+
+ mova m4, m3
+ mova m2, m0
+ psubusb m3, m0 ; q1-p1
+ psubusb m0, m4 ; p1-q1
+ por m3, m0 ; FFABS(p1-q1)
+ mova m0, [pb_80]
+ pxor m2, m0
+ pxor m4, m0
+ psubsb m2, m4 ; m2=p1-q1 (signed) backup for below
+ pand m3, [pb_FE]
+ psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed
+ paddusb m3, m1
+ psubusb m3, m7
+ pxor m1, m1
+ pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)
+
+ ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
+ mova m4, m5
+ pxor m5, m0
+ pxor m0, m6
+ psubsb m5, m0 ; q0-p0 (signed)
+ paddsb m2, m5
+ paddsb m2, m5
+ paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0)
+ pand m2, m3 ; apply filter mask (m3)
+
+ mova m3, [pb_F8]
+ mova m1, m2
+ paddsb m2, [pb_4] ; f1<<3=a+4
+ paddsb m1, [pb_3] ; f2<<3=a+3
+ pand m2, m3
+ pand m1, m3 ; cache f2<<3
+
+ pxor m0, m0
+ pxor m3, m3
+ pcmpgtb m0, m2 ; which values are <0?
+ psubb m3, m2 ; -f1<<3
+ psrlq m2, 3 ; +f1
+ psrlq m3, 3 ; -f1
+ pand m3, m0
+ pandn m0, m2
+ psubusb m4, m0
+ paddusb m4, m3 ; q0-f1
+
+ pxor m0, m0
+ pxor m3, m3
+ pcmpgtb m0, m1 ; which values are <0?
+ psubb m3, m1 ; -f2<<3
+ psrlq m1, 3 ; +f2
+ psrlq m3, 3 ; -f2
+ pand m3, m0
+ pandn m0, m1
+ paddusb m6, m0
+ psubusb m6, m3 ; p0+f2
+
+ ; store
+%ifidn %1, v
+ mova [dst1q], m4
+ mova [dst1q+mstrideq], m6
+%else ; h
+ inc dst1q
+ SBUTTERFLY bw, 6, 4, 0
+
+%if mmsize == 16 ; sse2
+%if cpuflag(sse4)
+ inc dst2q
+%endif
+ WRITE_8W m6, dst2q, dst1q, mstrideq, strideq
+ lea dst2q, [dst3q+mstrideq+1]
+%if cpuflag(sse4)
+ inc dst3q
+%endif
+ WRITE_8W m4, dst3q, dst2q, mstrideq, strideq
+%else ; mmx/mmxext
+ WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq
+%endif
+%endif
+
+%if mmsize == 8 ; mmx/mmxext
+ ; next 8 pixels
+%ifidn %1, v
+ add dst1q, 8 ; advance 8 cols = pixels
+%else ; h
+ lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines
+%endif
+ dec cntrq
+ jg .next8px
+ REP_RET
+%else ; sse2
+ RET
+%endif
+%endmacro
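A hedged per-pixel sketch of the simple loop filter above (names are illustrative): the edge is filtered only when 2*|p0-q0| + |p1-q1|/2 <= flim; when it is, p0 and q0 are nudged by f2 and f1 derived from a = clip(p1-q1) + 3*(q0-p0), which is what the pb_3/pb_4 additions and the >>3 steps compute with byte saturation.

#include <stdint.h>

static int clip_i8(int v) { return v < -128 ? -128 : v > 127 ? 127 : v; }
static int clip_u8(int v) { return v <    0 ?    0 : v > 255 ? 255 : v; }

static void simple_filter_edge(uint8_t *p1, uint8_t *p0,
                               uint8_t *q0, uint8_t *q1, int flim)
{
    int d0 = *p0 > *q0 ? *p0 - *q0 : *q0 - *p0;
    int d1 = *p1 > *q1 ? *p1 - *q1 : *q1 - *p1;
    if (2 * d0 + (d1 >> 1) > flim)
        return;

    int a  = clip_i8(clip_i8(*p1 - *q1) + 3 * (*q0 - *p0));
    int f1 = clip_i8(a + 4) >> 3;       /* arithmetic >> assumed for negatives */
    int f2 = clip_i8(a + 3) >> 3;
    *q0 = clip_u8(*q0 - f1);
    *p0 = clip_u8(*p0 + f2);
}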
+
+%if ARCH_X86_32
+INIT_MMX mmx
+SIMPLE_LOOPFILTER v, 4
+SIMPLE_LOOPFILTER h, 5
+INIT_MMX mmxext
+SIMPLE_LOOPFILTER v, 4
+SIMPLE_LOOPFILTER h, 5
+%endif
+
+INIT_XMM sse2
+SIMPLE_LOOPFILTER v, 3
+SIMPLE_LOOPFILTER h, 5
+INIT_XMM ssse3
+SIMPLE_LOOPFILTER v, 3
+SIMPLE_LOOPFILTER h, 5
+INIT_XMM sse4
+SIMPLE_LOOPFILTER h, 5
+
+;-----------------------------------------------------------------------------
+; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
+; int flimE, int flimI, int hev_thr);
+;-----------------------------------------------------------------------------
+
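+; The inner filter first builds a "normal_limit" mask: every neighbour
+; difference |p3-p2| .. |q2-q1|, |p1-p0|, |q1-q0| must be <= I and the simple
+; condition 2*|p0-q0| + |p1-q1|/2 must be <= E. It also derives a hev
+; ("high edge variance") mask from |p1-p0|/|q1-q0| vs hev_thr: where hev, the
+; filter input is the 4-tap (p1-q1) + 3*(q0-p0) and only p0/q0 move; where
+; not hev, the input is 3*(q0-p0) and p1/q1 are also nudged by (f1+1)>>1.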
+%macro INNER_LOOPFILTER 2
+%define stack_size 0
+%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
+%ifidn %1, v ; [3]=hev() result
+%define stack_size mmsize * -4
+%else ; h ; extra storage space for transposes
+%define stack_size mmsize * -5
+%endif
+%endif
+
+%if %2 == 8 ; chroma
+cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr
+%else ; luma
+cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr
+%endif
+
+%if cpuflag(ssse3)
+ pxor m7, m7
+%endif
+
+%ifndef m8
+ ; splat function arguments
+ SPLATB_REG m0, flimEq, m7 ; E
+ SPLATB_REG m1, flimIq, m7 ; I
+ SPLATB_REG m2, hevthrq, m7 ; hev_thresh
+
+%define m_flimE [rsp]
+%define m_flimI [rsp+mmsize]
+%define m_hevthr [rsp+mmsize*2]
+%define m_maskres [rsp+mmsize*3]
+%define m_p0backup [rsp+mmsize*3]
+%define m_q0backup [rsp+mmsize*4]
+
+ mova m_flimE, m0
+ mova m_flimI, m1
+ mova m_hevthr, m2
+%else
+%define m_flimE m9
+%define m_flimI m10
+%define m_hevthr m11
+%define m_maskres m12
+%define m_p0backup m12
+%define m_q0backup m8
+
+ ; splat function arguments
+ SPLATB_REG m_flimE, flimEq, m7 ; E
+ SPLATB_REG m_flimI, flimIq, m7 ; I
+ SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
+%endif
+
+%if %2 == 8 ; chroma
+ DEFINE_ARGS dst1, dst8, mstride, stride, dst2
+%elif mmsize == 8
+ DEFINE_ARGS dst1, mstride, stride, dst2, cntr
+ mov cntrq, 2
+%else
+ DEFINE_ARGS dst1, mstride, stride, dst2, dst8
+%endif
+ mov strideq, mstrideq
+ neg mstrideq
+%ifidn %1, h
+ lea dst1q, [dst1q+strideq*4-4]
+%if %2 == 8 ; chroma
+ lea dst8q, [dst8q+strideq*4-4]
+%endif
+%endif
+
+%if mmsize == 8
+.next8px:
+%endif
+ ; read
+ lea dst2q, [dst1q+strideq]
+%ifidn %1, v
+%if %2 == 8 && mmsize == 16
+%define movrow movh
+%else
+%define movrow mova
+%endif
+ movrow m0, [dst1q+mstrideq*4] ; p3
+ movrow m1, [dst2q+mstrideq*4] ; p2
+ movrow m2, [dst1q+mstrideq*2] ; p1
+ movrow m5, [dst2q] ; q1
+ movrow m6, [dst2q+ strideq*1] ; q2
+ movrow m7, [dst2q+ strideq*2] ; q3
+%if mmsize == 16 && %2 == 8
+ movhps m0, [dst8q+mstrideq*4]
+ movhps m2, [dst8q+mstrideq*2]
+ add dst8q, strideq
+ movhps m1, [dst8q+mstrideq*4]
+ movhps m5, [dst8q]
+ movhps m6, [dst8q+ strideq ]
+ movhps m7, [dst8q+ strideq*2]
+ add dst8q, mstrideq
+%endif
+%elif mmsize == 8 ; mmx/mmxext (h)
+ ; read 8 rows of 8px each
+ movu m0, [dst1q+mstrideq*4]
+ movu m1, [dst2q+mstrideq*4]
+ movu m2, [dst1q+mstrideq*2]
+ movu m3, [dst1q+mstrideq ]
+ movu m4, [dst1q]
+ movu m5, [dst2q]
+ movu m6, [dst2q+ strideq ]
+
+ ; 8x8 transpose
+ TRANSPOSE4x4B 0, 1, 2, 3, 7
+ mova m_q0backup, m1
+ movu m7, [dst2q+ strideq*2]
+ TRANSPOSE4x4B 4, 5, 6, 7, 1
+ SBUTTERFLY dq, 0, 4, 1 ; p3/p2
+ SBUTTERFLY dq, 2, 6, 1 ; q0/q1
+ SBUTTERFLY dq, 3, 7, 1 ; q2/q3
+ mova m1, m_q0backup
+ mova m_q0backup, m2 ; store q0
+ SBUTTERFLY dq, 1, 5, 2 ; p1/p0
+ mova m_p0backup, m5 ; store p0
+ SWAP 1, 4
+ SWAP 2, 4
+ SWAP 6, 3
+ SWAP 5, 3
+%else ; sse2 (h)
+%if %2 == 16
+ lea dst8q, [dst1q+ strideq*8]
+%endif
+
+ ; read 16 rows of 8px each, interleave
+ movh m0, [dst1q+mstrideq*4]
+ movh m1, [dst8q+mstrideq*4]
+ movh m2, [dst1q+mstrideq*2]
+ movh m5, [dst8q+mstrideq*2]
+ movh m3, [dst1q+mstrideq ]
+ movh m6, [dst8q+mstrideq ]
+ movh m4, [dst1q]
+ movh m7, [dst8q]
+ punpcklbw m0, m1 ; A/I
+ punpcklbw m2, m5 ; C/K
+ punpcklbw m3, m6 ; D/L
+ punpcklbw m4, m7 ; E/M
+
+ add dst8q, strideq
+ movh m1, [dst2q+mstrideq*4]
+ movh m6, [dst8q+mstrideq*4]
+ movh m5, [dst2q]
+ movh m7, [dst8q]
+ punpcklbw m1, m6 ; B/J
+ punpcklbw m5, m7 ; F/N
+ movh m6, [dst2q+ strideq ]
+ movh m7, [dst8q+ strideq ]
+ punpcklbw m6, m7 ; G/O
+
+ ; 8x16 transpose
+ TRANSPOSE4x4B 0, 1, 2, 3, 7
+%ifdef m8
+ SWAP 1, 8
+%else
+ mova m_q0backup, m1
+%endif
+ movh m7, [dst2q+ strideq*2]
+ movh m1, [dst8q+ strideq*2]
+ punpcklbw m7, m1 ; H/P
+ TRANSPOSE4x4B 4, 5, 6, 7, 1
+ SBUTTERFLY dq, 0, 4, 1 ; p3/p2
+ SBUTTERFLY dq, 2, 6, 1 ; q0/q1
+ SBUTTERFLY dq, 3, 7, 1 ; q2/q3
+%ifdef m8
+ SWAP 1, 8
+ SWAP 2, 8
+%else
+ mova m1, m_q0backup
+ mova m_q0backup, m2 ; store q0
+%endif
+ SBUTTERFLY dq, 1, 5, 2 ; p1/p0
+%ifdef m12
+ SWAP 5, 12
+%else
+ mova m_p0backup, m5 ; store p0
+%endif
+ SWAP 1, 4
+ SWAP 2, 4
+ SWAP 6, 3
+ SWAP 5, 3
+%endif
+
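+ ; at this point m0=p3, m1=p2, m2=p1, m5=q1, m6=q2, m7=q3; p0 and q0 are
+ ; loaded directly (v) or pulled back out of the transpose backups (h) below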
+ ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
+ mova m4, m1
+ SWAP 4, 1
+ psubusb m4, m0 ; p2-p3
+ psubusb m0, m1 ; p3-p2
+ por m0, m4 ; abs(p3-p2)
+
+ mova m4, m2
+ SWAP 4, 2
+ psubusb m4, m1 ; p1-p2
+ psubusb m1, m2 ; p2-p1
+ por m1, m4 ; abs(p2-p1)
+
+ mova m4, m6
+ SWAP 4, 6
+ psubusb m4, m7 ; q2-q3
+ psubusb m7, m6 ; q3-q2
+ por m7, m4 ; abs(q3-q2)
+
+ mova m4, m5
+ SWAP 4, 5
+ psubusb m4, m6 ; q1-q2
+ psubusb m6, m5 ; q2-q1
+ por m6, m4 ; abs(q2-q1)
+
+%if notcpuflag(mmxext)
+ mova m4, m_flimI
+ pxor m3, m3
+ psubusb m0, m4
+ psubusb m1, m4
+ psubusb m7, m4
+ psubusb m6, m4
+ pcmpeqb m0, m3 ; abs(p3-p2) <= I
+ pcmpeqb m1, m3 ; abs(p2-p1) <= I
+ pcmpeqb m7, m3 ; abs(q3-q2) <= I
+ pcmpeqb m6, m3 ; abs(q2-q1) <= I
+ pand m0, m1
+ pand m7, m6
+ pand m0, m7
+%else ; mmxext/sse2
+ pmaxub m0, m1
+ pmaxub m6, m7
+ pmaxub m0, m6
+%endif
+
+ ; normal_limit and high_edge_variance for p1-p0, q1-q0
+ SWAP 7, 3 ; now m7 is zero
+%ifidn %1, v
+ movrow m3, [dst1q+mstrideq ] ; p0
+%if mmsize == 16 && %2 == 8
+ movhps m3, [dst8q+mstrideq ]
+%endif
+%elifdef m12
+ SWAP 3, 12
+%else
+ mova m3, m_p0backup
+%endif
+
+ mova m1, m2
+ SWAP 1, 2
+ mova m6, m3
+ SWAP 3, 6
+ psubusb m1, m3 ; p1-p0
+ psubusb m6, m2 ; p0-p1
+ por m1, m6 ; abs(p1-p0)
+%if notcpuflag(mmxext)
+ mova m6, m1
+ psubusb m1, m4
+ psubusb m6, m_hevthr
+ pcmpeqb m1, m7 ; abs(p1-p0) <= I
+ pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
+ pand m0, m1
+ mova m_maskres, m6
+%else ; mmxext/sse2
+ pmaxub m0, m1 ; max_I
+ SWAP 1, 4 ; max_hev_thresh
+%endif
+
+ SWAP 6, 4 ; now m6 is I
+%ifidn %1, v
+ movrow m4, [dst1q] ; q0
+%if mmsize == 16 && %2 == 8
+ movhps m4, [dst8q]
+%endif
+%elifdef m8
+ SWAP 4, 8
+%else
+ mova m4, m_q0backup
+%endif
+ mova m1, m4
+ SWAP 1, 4
+ mova m7, m5
+ SWAP 7, 5
+ psubusb m1, m5 ; q0-q1
+ psubusb m7, m4 ; q1-q0
+ por m1, m7 ; abs(q1-q0)
+%if notcpuflag(mmxext)
+ mova m7, m1
+ psubusb m1, m6
+ psubusb m7, m_hevthr
+ pxor m6, m6
+ pcmpeqb m1, m6 ; abs(q1-q0) <= I
+ pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
+ mova m6, m_maskres
+ pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
+ pand m6, m7
+%else ; mmxext/sse2
+ pxor m7, m7
+ pmaxub m0, m1
+ pmaxub m6, m1
+ psubusb m0, m_flimI
+ psubusb m6, m_hevthr
+ pcmpeqb m0, m7 ; max(abs(..)) <= I
+ pcmpeqb m6, m7 ; !(max(abs..) > thresh)
+%endif
+%ifdef m12
+ SWAP 6, 12
+%else
+ mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
+%endif
+
+ ; simple_limit
+ mova m1, m3
+ SWAP 1, 3
+ mova m6, m4 ; keep copies of p0/q0 around for later use
+ SWAP 6, 4
+ psubusb m1, m4 ; p0-q0
+ psubusb m6, m3 ; q0-p0
+ por m1, m6 ; abs(q0-p0)
+ paddusb m1, m1 ; m1=2*abs(q0-p0)
+
+ mova m7, m2
+ SWAP 7, 2
+ mova m6, m5
+ SWAP 6, 5
+ psubusb m7, m5 ; p1-q1
+ psubusb m6, m2 ; q1-p1
+ por m7, m6 ; abs(q1-p1)
+ pxor m6, m6
+ pand m7, [pb_FE]
+ psrlq m7, 1 ; abs(q1-p1)/2
+ paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
+ psubusb m7, m_flimE
+ pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
+ pand m0, m7 ; normal_limit result
+
+ ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
+%ifdef m8 ; x86-64 && sse2
+ mova m8, [pb_80]
+%define m_pb_80 m8
+%else ; x86-32 or mmx/mmxext
+%define m_pb_80 [pb_80]
+%endif
+ mova m1, m4
+ mova m7, m3
+ pxor m1, m_pb_80
+ pxor m7, m_pb_80
+ psubsb m1, m7 ; (signed) q0-p0
+ mova m6, m2
+ mova m7, m5
+ pxor m6, m_pb_80
+ pxor m7, m_pb_80
+ psubsb m6, m7 ; (signed) p1-q1
+ mova m7, m_maskres
+ pandn m7, m6
+ paddsb m7, m1
+ paddsb m7, m1
+ paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1)
+
+ pand m7, m0
+ mova m1, [pb_F8]
+ mova m6, m7
+ paddsb m7, [pb_3]
+ paddsb m6, [pb_4]
+ pand m7, m1
+ pand m6, m1
+
+ pxor m1, m1
+ pxor m0, m0
+ pcmpgtb m1, m7
+ psubb m0, m7
+ psrlq m7, 3 ; +f2
+ psrlq m0, 3 ; -f2
+ pand m0, m1
+ pandn m1, m7
+ psubusb m3, m0
+ paddusb m3, m1 ; p0+f2
+
+ pxor m1, m1
+ pxor m0, m0
+ pcmpgtb m0, m6
+ psubb m1, m6
+ psrlq m6, 3 ; +f1
+ psrlq m1, 3 ; -f1
+ pand m1, m0
+ pandn m0, m6
+ psubusb m4, m0
+ paddusb m4, m1 ; q0-f1
+
+%ifdef m12
+ SWAP 6, 12
+%else
+ mova m6, m_maskres
+%endif
+%if notcpuflag(mmxext)
+ mova m7, [pb_1]
+%else ; mmxext/sse2
+ pxor m7, m7
+%endif
+ pand m0, m6
+ pand m1, m6
+%if notcpuflag(mmxext)
+ paddusb m0, m7
+ pand m1, [pb_FE]
+ pandn m7, m0
+ psrlq m1, 1
+ psrlq m7, 1
+ SWAP 0, 7
+%else ; mmxext/sse2
+ psubusb m1, [pb_1]
+ pavgb m0, m7 ; a
+ pavgb m1, m7 ; -a
+%endif
+ psubusb m5, m0
+ psubusb m2, m1
+ paddusb m5, m1 ; q1-a
+ paddusb m2, m0 ; p1+a
+
+ ; store
+%ifidn %1, v
+ movrow [dst1q+mstrideq*2], m2
+ movrow [dst1q+mstrideq ], m3
+ movrow [dst1q], m4
+ movrow [dst1q+ strideq ], m5
+%if mmsize == 16 && %2 == 8
+ movhps [dst8q+mstrideq*2], m2
+ movhps [dst8q+mstrideq ], m3
+ movhps [dst8q], m4
+ movhps [dst8q+ strideq ], m5
+%endif
+%else ; h
+ add dst1q, 2
+ add dst2q, 2
+
+ ; 4x8/16 transpose
+ TRANSPOSE4x4B 2, 3, 4, 5, 6
+
+%if mmsize == 8 ; mmx/mmxext (h)
+ WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq
+%else ; sse2 (h)
+ lea dst8q, [dst8q+mstrideq +2]
+ WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
+%endif
+%endif
+
+%if mmsize == 8
+%if %2 == 8 ; chroma
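+ ; chroma: the first pass filtered the 8px-wide U plane (dst1q); swap in the
+ ; V plane pointer (dst8q) and run the loop once more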
+%ifidn %1, h
+ sub dst1q, 2
+%endif
+ cmp dst1q, dst8q
+ mov dst1q, dst8q
+ jnz .next8px
+%else
+%ifidn %1, h
+ lea dst1q, [dst1q+ strideq*8-2]
+%else ; v
+ add dst1q, 8
+%endif
+ dec cntrq
+ jg .next8px
+%endif
+ REP_RET
+%else ; mmsize == 16
+ RET
+%endif
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+INNER_LOOPFILTER v, 16
+INNER_LOOPFILTER h, 16
+INNER_LOOPFILTER v, 8
+INNER_LOOPFILTER h, 8
+
+INIT_MMX mmxext
+INNER_LOOPFILTER v, 16
+INNER_LOOPFILTER h, 16
+INNER_LOOPFILTER v, 8
+INNER_LOOPFILTER h, 8
+%endif
+
+INIT_XMM sse2
+INNER_LOOPFILTER v, 16
+INNER_LOOPFILTER h, 16
+INNER_LOOPFILTER v, 8
+INNER_LOOPFILTER h, 8
+
+INIT_XMM ssse3
+INNER_LOOPFILTER v, 16
+INNER_LOOPFILTER h, 16
+INNER_LOOPFILTER v, 8
+INNER_LOOPFILTER h, 8
+
+;-----------------------------------------------------------------------------
+; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
+; int flimE, int flimI, int hev_thr);
+;-----------------------------------------------------------------------------
+
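+; The macroblock-edge filter shares the normal_limit/hev masks with the inner
+; filter, but pixels without hev get a wider filter: w = clamp(clamp(p1-q1) +
+; 3*(q0-p0)), then a0=(27*w+63)>>7 adjusts p0/q0, a1=(18*w+63)>>7 adjusts
+; p1/q1 and a2=(9*w+63)>>7 adjusts p2/q2. Pixels with hev fall back to the
+; common 4-tap filter on p0/q0 only.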
+%macro MBEDGE_LOOPFILTER 2
+%define stack_size 0
+%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
+%if mmsize == 16 ; [3]=hev() result
+ ; [4]=filter tmp result
+ ; [5]/[6] = p2/q2 backup
+ ; [7]=lim_res sign result
+%define stack_size mmsize * -7
+%else ; 8 ; extra storage space for transposes
+%define stack_size mmsize * -8
+%endif
+%endif
+
+%if %2 == 8 ; chroma
+cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr
+%else ; luma
+cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr
+%endif
+
+%if cpuflag(ssse3)
+ pxor m7, m7
+%endif
+
+%ifndef m8
+ ; splat function arguments
+ SPLATB_REG m0, flimEq, m7 ; E
+ SPLATB_REG m1, flimIq, m7 ; I
+ SPLATB_REG m2, hevthrq, m7 ; hev_thresh
+
+%define m_flimE [rsp]
+%define m_flimI [rsp+mmsize]
+%define m_hevthr [rsp+mmsize*2]
+%define m_maskres [rsp+mmsize*3]
+%define m_limres [rsp+mmsize*4]
+%define m_p0backup [rsp+mmsize*3]
+%define m_q0backup [rsp+mmsize*4]
+%define m_p2backup [rsp+mmsize*5]
+%define m_q2backup [rsp+mmsize*6]
+%if mmsize == 16
+%define m_limsign [rsp]
+%else
+%define m_limsign [rsp+mmsize*7]
+%endif
+
+ mova m_flimE, m0
+ mova m_flimI, m1
+ mova m_hevthr, m2
+%else ; sse2 on x86-64
+%define m_flimE m9
+%define m_flimI m10
+%define m_hevthr m11
+%define m_maskres m12
+%define m_limres m8
+%define m_p0backup m12
+%define m_q0backup m8
+%define m_p2backup m13
+%define m_q2backup m14
+%define m_limsign m9
+
+ ; splat function arguments
+ SPLATB_REG m_flimE, flimEq, m7 ; E
+ SPLATB_REG m_flimI, flimIq, m7 ; I
+ SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
+%endif
+
+%if %2 == 8 ; chroma
+ DEFINE_ARGS dst1, dst8, mstride, stride, dst2
+%elif mmsize == 8
+ DEFINE_ARGS dst1, mstride, stride, dst2, cntr
+ mov cntrq, 2
+%else
+ DEFINE_ARGS dst1, mstride, stride, dst2, dst8
+%endif
+ mov strideq, mstrideq
+ neg mstrideq
+%ifidn %1, h
+ lea dst1q, [dst1q+strideq*4-4]
+%if %2 == 8 ; chroma
+ lea dst8q, [dst8q+strideq*4-4]
+%endif
+%endif
+
+%if mmsize == 8
+.next8px:
+%endif
+ ; read
+ lea dst2q, [dst1q+ strideq ]
+%ifidn %1, v
+%if %2 == 8 && mmsize == 16
+%define movrow movh
+%else
+%define movrow mova
+%endif
+ movrow m0, [dst1q+mstrideq*4] ; p3
+ movrow m1, [dst2q+mstrideq*4] ; p2
+ movrow m2, [dst1q+mstrideq*2] ; p1
+ movrow m5, [dst2q] ; q1
+ movrow m6, [dst2q+ strideq ] ; q2
+ movrow m7, [dst2q+ strideq*2] ; q3
+%if mmsize == 16 && %2 == 8
+ movhps m0, [dst8q+mstrideq*4]
+ movhps m2, [dst8q+mstrideq*2]
+ add dst8q, strideq
+ movhps m1, [dst8q+mstrideq*4]
+ movhps m5, [dst8q]
+ movhps m6, [dst8q+ strideq ]
+ movhps m7, [dst8q+ strideq*2]
+ add dst8q, mstrideq
+%endif
+%elif mmsize == 8 ; mmx/mmxext (h)
+ ; read 8 rows of 8px each
+ movu m0, [dst1q+mstrideq*4]
+ movu m1, [dst2q+mstrideq*4]
+ movu m2, [dst1q+mstrideq*2]
+ movu m3, [dst1q+mstrideq ]
+ movu m4, [dst1q]
+ movu m5, [dst2q]
+ movu m6, [dst2q+ strideq ]
+
+ ; 8x8 transpose
+ TRANSPOSE4x4B 0, 1, 2, 3, 7
+ mova m_q0backup, m1
+ movu m7, [dst2q+ strideq*2]
+ TRANSPOSE4x4B 4, 5, 6, 7, 1
+ SBUTTERFLY dq, 0, 4, 1 ; p3/p2
+ SBUTTERFLY dq, 2, 6, 1 ; q0/q1
+ SBUTTERFLY dq, 3, 7, 1 ; q2/q3
+ mova m1, m_q0backup
+ mova m_q0backup, m2 ; store q0
+ SBUTTERFLY dq, 1, 5, 2 ; p1/p0
+ mova m_p0backup, m5 ; store p0
+ SWAP 1, 4
+ SWAP 2, 4
+ SWAP 6, 3
+ SWAP 5, 3
+%else ; sse2 (h)
+%if %2 == 16
+ lea dst8q, [dst1q+ strideq*8 ]
+%endif
+
+ ; read 16 rows of 8px each, interleave
+ movh m0, [dst1q+mstrideq*4]
+ movh m1, [dst8q+mstrideq*4]
+ movh m2, [dst1q+mstrideq*2]
+ movh m5, [dst8q+mstrideq*2]
+ movh m3, [dst1q+mstrideq ]
+ movh m6, [dst8q+mstrideq ]
+ movh m4, [dst1q]
+ movh m7, [dst8q]
+ punpcklbw m0, m1 ; A/I
+ punpcklbw m2, m5 ; C/K
+ punpcklbw m3, m6 ; D/L
+ punpcklbw m4, m7 ; E/M
+
+ add dst8q, strideq
+ movh m1, [dst2q+mstrideq*4]
+ movh m6, [dst8q+mstrideq*4]
+ movh m5, [dst2q]
+ movh m7, [dst8q]
+ punpcklbw m1, m6 ; B/J
+ punpcklbw m5, m7 ; F/N
+ movh m6, [dst2q+ strideq ]
+ movh m7, [dst8q+ strideq ]
+ punpcklbw m6, m7 ; G/O
+
+ ; 8x16 transpose
+ TRANSPOSE4x4B 0, 1, 2, 3, 7
+%ifdef m8
+ SWAP 1, 8
+%else
+ mova m_q0backup, m1
+%endif
+ movh m7, [dst2q+ strideq*2]
+ movh m1, [dst8q+ strideq*2]
+ punpcklbw m7, m1 ; H/P
+ TRANSPOSE4x4B 4, 5, 6, 7, 1
+ SBUTTERFLY dq, 0, 4, 1 ; p3/p2
+ SBUTTERFLY dq, 2, 6, 1 ; q0/q1
+ SBUTTERFLY dq, 3, 7, 1 ; q2/q3
+%ifdef m8
+ SWAP 1, 8
+ SWAP 2, 8
+%else
+ mova m1, m_q0backup
+ mova m_q0backup, m2 ; store q0
+%endif
+ SBUTTERFLY dq, 1, 5, 2 ; p1/p0
+%ifdef m12
+ SWAP 5, 12
+%else
+ mova m_p0backup, m5 ; store p0
+%endif
+ SWAP 1, 4
+ SWAP 2, 4
+ SWAP 6, 3
+ SWAP 5, 3
+%endif
+
+ ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
+ mova m4, m1
+ SWAP 4, 1
+ psubusb m4, m0 ; p2-p3
+ psubusb m0, m1 ; p3-p2
+ por m0, m4 ; abs(p3-p2)
+
+ mova m4, m2
+ SWAP 4, 2
+ psubusb m4, m1 ; p1-p2
+ mova m_p2backup, m1
+ psubusb m1, m2 ; p2-p1
+ por m1, m4 ; abs(p2-p1)
+
+ mova m4, m6
+ SWAP 4, 6
+ psubusb m4, m7 ; q2-q3
+ psubusb m7, m6 ; q3-q2
+ por m7, m4 ; abs(q3-q2)
+
+ mova m4, m5
+ SWAP 4, 5
+ psubusb m4, m6 ; q1-q2
+ mova m_q2backup, m6
+ psubusb m6, m5 ; q2-q1
+ por m6, m4 ; abs(q2-q1)
+
+%if notcpuflag(mmxext)
+ mova m4, m_flimI
+ pxor m3, m3
+ psubusb m0, m4
+ psubusb m1, m4
+ psubusb m7, m4
+ psubusb m6, m4
+ pcmpeqb m0, m3 ; abs(p3-p2) <= I
+ pcmpeqb m1, m3 ; abs(p2-p1) <= I
+ pcmpeqb m7, m3 ; abs(q3-q2) <= I
+ pcmpeqb m6, m3 ; abs(q2-q1) <= I
+ pand m0, m1
+ pand m7, m6
+ pand m0, m7
+%else ; mmxext/sse2
+ pmaxub m0, m1
+ pmaxub m6, m7
+ pmaxub m0, m6
+%endif
+
+ ; normal_limit and high_edge_variance for p1-p0, q1-q0
+ SWAP 7, 3 ; now m7 is zero
+%ifidn %1, v
+ movrow m3, [dst1q+mstrideq ] ; p0
+%if mmsize == 16 && %2 == 8
+ movhps m3, [dst8q+mstrideq ]
+%endif
+%elifdef m12
+ SWAP 3, 12
+%else
+ mova m3, m_p0backup
+%endif
+
+ mova m1, m2
+ SWAP 1, 2
+ mova m6, m3
+ SWAP 3, 6
+ psubusb m1, m3 ; p1-p0
+ psubusb m6, m2 ; p0-p1
+ por m1, m6 ; abs(p1-p0)
+%if notcpuflag(mmxext)
+ mova m6, m1
+ psubusb m1, m4
+ psubusb m6, m_hevthr
+ pcmpeqb m1, m7 ; abs(p1-p0) <= I
+ pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
+ pand m0, m1
+ mova m_maskres, m6
+%else ; mmxext/sse2
+ pmaxub m0, m1 ; max_I
+ SWAP 1, 4 ; max_hev_thresh
+%endif
+
+ SWAP 6, 4 ; now m6 is I
+%ifidn %1, v
+ movrow m4, [dst1q] ; q0
+%if mmsize == 16 && %2 == 8
+ movhps m4, [dst8q]
+%endif
+%elifdef m8
+ SWAP 4, 8
+%else
+ mova m4, m_q0backup
+%endif
+ mova m1, m4
+ SWAP 1, 4
+ mova m7, m5
+ SWAP 7, 5
+ psubusb m1, m5 ; q0-q1
+ psubusb m7, m4 ; q1-q0
+ por m1, m7 ; abs(q1-q0)
+%if notcpuflag(mmxext)
+ mova m7, m1
+ psubusb m1, m6
+ psubusb m7, m_hevthr
+ pxor m6, m6
+ pcmpeqb m1, m6 ; abs(q1-q0) <= I
+ pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
+ mova m6, m_maskres
+ pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
+ pand m6, m7
+%else ; mmxext/sse2
+ pxor m7, m7
+ pmaxub m0, m1
+ pmaxub m6, m1
+ psubusb m0, m_flimI
+ psubusb m6, m_hevthr
+ pcmpeqb m0, m7 ; max(abs(..)) <= I
+ pcmpeqb m6, m7 ; !(max(abs..) > thresh)
+%endif
+%ifdef m12
+ SWAP 6, 12
+%else
+ mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
+%endif
+
+ ; simple_limit
+ mova m1, m3
+ SWAP 1, 3
+ mova m6, m4 ; keep copies of p0/q0 around for later use
+ SWAP 6, 4
+ psubusb m1, m4 ; p0-q0
+ psubusb m6, m3 ; q0-p0
+ por m1, m6 ; abs(q0-p0)
+ paddusb m1, m1 ; m1=2*abs(q0-p0)
+
+ mova m7, m2
+ SWAP 7, 2
+ mova m6, m5
+ SWAP 6, 5
+ psubusb m7, m5 ; p1-q1
+ psubusb m6, m2 ; q1-p1
+ por m7, m6 ; abs(q1-p1)
+ pxor m6, m6
+ pand m7, [pb_FE]
+ psrlq m7, 1 ; abs(q1-p1)/2
+ paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
+ psubusb m7, m_flimE
+ pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
+ pand m0, m7 ; normal_limit result
+
+ ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
+%ifdef m8 ; x86-64 && sse2
+ mova m8, [pb_80]
+%define m_pb_80 m8
+%else ; x86-32 or mmx/mmxext
+%define m_pb_80 [pb_80]
+%endif
+ mova m1, m4
+ mova m7, m3
+ pxor m1, m_pb_80
+ pxor m7, m_pb_80
+ psubsb m1, m7 ; (signed) q0-p0
+ mova m6, m2
+ mova m7, m5
+ pxor m6, m_pb_80
+ pxor m7, m_pb_80
+ psubsb m6, m7 ; (signed) p1-q1
+ mova m7, m_maskres
+ paddsb m6, m1
+ paddsb m6, m1
+ paddsb m6, m1
+ pand m6, m0
+%ifdef m8
+ mova m_limres, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
+ pand m_limres, m7
+%else
+ mova m0, m6
+ pand m0, m7
+ mova m_limres, m0
+%endif
+ pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common
+
+ mova m1, [pb_F8]
+ mova m6, m7
+ paddsb m7, [pb_3]
+ paddsb m6, [pb_4]
+ pand m7, m1
+ pand m6, m1
+
+ pxor m1, m1
+ pxor m0, m0
+ pcmpgtb m1, m7
+ psubb m0, m7
+ psrlq m7, 3 ; +f2
+ psrlq m0, 3 ; -f2
+ pand m0, m1
+ pandn m1, m7
+ psubusb m3, m0
+ paddusb m3, m1 ; p0+f2
+
+ pxor m1, m1
+ pxor m0, m0
+ pcmpgtb m0, m6
+ psubb m1, m6
+ psrlq m6, 3 ; +f1
+ psrlq m1, 3 ; -f1
+ pand m1, m0
+ pandn m0, m6
+ psubusb m4, m0
+ paddusb m4, m1 ; q0-f1
+
+ ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
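+ ; ssse3: w is interleaved with "1" (punpck*bw against pb_1) so that a single
+ ; pmaddubsw against {27,63}/{18,63}/{9,63} yields the rounded products
+ ; 27*w+63 etc.; older cpus use pmullw by pw_27/18/9 plus paddw pw_63 instead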
+%if cpuflag(ssse3)
+ mova m7, [pb_1]
+%else
+ mova m7, [pw_63]
+%endif
+%ifdef m8
+ SWAP 1, 8
+%else
+ mova m1, m_limres
+%endif
+ pxor m0, m0
+ mova m6, m1
+ pcmpgtb m0, m1 ; which are negative
+%if cpuflag(ssse3)
+ punpcklbw m6, m7 ; interleave with "1" for rounding
+ punpckhbw m1, m7
+%else
+ punpcklbw m6, m0 ; signed byte->word
+ punpckhbw m1, m0
+%endif
+ mova m_limsign, m0
+%if cpuflag(ssse3)
+ mova m7, [pb_27_63]
+%ifndef m8
+ mova m_limres, m1
+%endif
+%ifdef m10
+ SWAP 0, 10 ; don't lose lim_sign copy
+%endif
+ mova m0, m7
+ pmaddubsw m7, m6
+ SWAP 6, 7
+ pmaddubsw m0, m1
+ SWAP 1, 0
+%ifdef m10
+ SWAP 0, 10
+%else
+ mova m0, m_limsign
+%endif
+%else
+ mova m_maskres, m6 ; backup for later in filter
+ mova m_limres, m1
+ pmullw m6, [pw_27]
+ pmullw m1, [pw_27]
+ paddw m6, m7
+ paddw m1, m7
+%endif
+ psraw m6, 7
+ psraw m1, 7
+ packsswb m6, m1 ; a0
+ pxor m1, m1
+ psubb m1, m6
+ pand m1, m0 ; -a0
+ pandn m0, m6 ; +a0
+%if cpuflag(ssse3)
+ mova m6, [pb_18_63] ; pipelining
+%endif
+ psubusb m3, m1
+ paddusb m4, m1
+ paddusb m3, m0 ; p0+a0
+ psubusb m4, m0 ; q0-a0
+
+%if cpuflag(ssse3)
+ SWAP 6, 7
+%ifdef m10
+ SWAP 1, 10
+%else
+ mova m1, m_limres
+%endif
+ mova m0, m7
+ pmaddubsw m7, m6
+ SWAP 6, 7
+ pmaddubsw m0, m1
+ SWAP 1, 0
+%ifdef m10
+ SWAP 0, 10
+%endif
+ mova m0, m_limsign
+%else
+ mova m6, m_maskres
+ mova m1, m_limres
+ pmullw m6, [pw_18]
+ pmullw m1, [pw_18]
+ paddw m6, m7
+ paddw m1, m7
+%endif
+ mova m0, m_limsign
+ psraw m6, 7
+ psraw m1, 7
+ packsswb m6, m1 ; a1
+ pxor m1, m1
+ psubb m1, m6
+ pand m1, m0 ; -a1
+ pandn m0, m6 ; +a1
+%if cpuflag(ssse3)
+ mova m6, [pb_9_63]
+%endif
+ psubusb m2, m1
+ paddusb m5, m1
+ paddusb m2, m0 ; p1+a1
+ psubusb m5, m0 ; q1-a1
+
+%if cpuflag(ssse3)
+ SWAP 6, 7
+%ifdef m10
+ SWAP 1, 10
+%else
+ mova m1, m_limres
+%endif
+ mova m0, m7
+ pmaddubsw m7, m6
+ SWAP 6, 7
+ pmaddubsw m0, m1
+ SWAP 1, 0
+%else
+%ifdef m8
+ SWAP 6, 12
+ SWAP 1, 8
+%else
+ mova m6, m_maskres
+ mova m1, m_limres
+%endif
+ pmullw m6, [pw_9]
+ pmullw m1, [pw_9]
+ paddw m6, m7
+ paddw m1, m7
+%endif
+%ifdef m9
+ SWAP 7, 9
+%else
+ mova m7, m_limsign
+%endif
+ psraw m6, 7
+ psraw m1, 7
+ packsswb m6, m1 ; a2
+ pxor m0, m0
+ psubb m0, m6
+ pand m0, m7 ; -a2
+ pandn m7, m6 ; +a2
+%ifdef m8
+ SWAP 1, 13
+ SWAP 6, 14
+%else
+ mova m1, m_p2backup
+ mova m6, m_q2backup
+%endif
+ psubusb m1, m0
+ paddusb m6, m0
+ paddusb m1, m7 ; p2+a2
+ psubusb m6, m7 ; q2-a2
+
+ ; store
+%ifidn %1, v
+ movrow [dst2q+mstrideq*4], m1
+ movrow [dst1q+mstrideq*2], m2
+ movrow [dst1q+mstrideq ], m3
+ movrow [dst1q], m4
+ movrow [dst2q], m5
+ movrow [dst2q+ strideq ], m6
+%if mmsize == 16 && %2 == 8
+ add dst8q, mstrideq
+ movhps [dst8q+mstrideq*2], m1
+ movhps [dst8q+mstrideq ], m2
+ movhps [dst8q], m3
+ add dst8q, strideq
+ movhps [dst8q], m4
+ movhps [dst8q+ strideq ], m5
+ movhps [dst8q+ strideq*2], m6
+%endif
+%else ; h
+ inc dst1q
+ inc dst2q
+
+ ; 4x8/16 transpose
+ TRANSPOSE4x4B 1, 2, 3, 4, 0
+ SBUTTERFLY bw, 5, 6, 0
+
+%if mmsize == 8 ; mmx/mmxext (h)
+ WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq
+ add dst1q, 4
+ WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq
+%else ; sse2 (h)
+ lea dst8q, [dst8q+mstrideq+1]
+ WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2
+ lea dst1q, [dst2q+mstrideq+4]
+ lea dst8q, [dst8q+mstrideq+4]
+%if cpuflag(sse4)
+ add dst2q, 4
+%endif
+ WRITE_8W m5, dst2q, dst1q, mstrideq, strideq
+%if cpuflag(sse4)
+ lea dst2q, [dst8q+ strideq ]
+%endif
+ WRITE_8W m6, dst2q, dst8q, mstrideq, strideq
+%endif
+%endif
+
+%if mmsize == 8
+%if %2 == 8 ; chroma
+%ifidn %1, h
+ sub dst1q, 5
+%endif
+ cmp dst1q, dst8q
+ mov dst1q, dst8q
+ jnz .next8px
+%else
+%ifidn %1, h
+ lea dst1q, [dst1q+ strideq*8-5]
+%else ; v
+ add dst1q, 8
+%endif
+ dec cntrq
+ jg .next8px
+%endif
+ REP_RET
+%else ; mmsize == 16
+ RET
+%endif
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+MBEDGE_LOOPFILTER v, 16
+MBEDGE_LOOPFILTER h, 16
+MBEDGE_LOOPFILTER v, 8
+MBEDGE_LOOPFILTER h, 8
+
+INIT_MMX mmxext
+MBEDGE_LOOPFILTER v, 16
+MBEDGE_LOOPFILTER h, 16
+MBEDGE_LOOPFILTER v, 8
+MBEDGE_LOOPFILTER h, 8
+%endif
+
+INIT_XMM sse2
+MBEDGE_LOOPFILTER v, 16
+MBEDGE_LOOPFILTER h, 16
+MBEDGE_LOOPFILTER v, 8
+MBEDGE_LOOPFILTER h, 8
+
+INIT_XMM ssse3
+MBEDGE_LOOPFILTER v, 16
+MBEDGE_LOOPFILTER h, 16
+MBEDGE_LOOPFILTER v, 8
+MBEDGE_LOOPFILTER h, 8
+
+INIT_XMM sse4
+MBEDGE_LOOPFILTER h, 16
+MBEDGE_LOOPFILTER h, 8
diff --git a/ffmpeg/libavcodec/x86/vp8dsp_init.c b/ffmpeg/libavcodec/x86/vp8dsp_init.c
new file mode 100644
index 0000000..09e2d91
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/vp8dsp_init.c
@@ -0,0 +1,442 @@
+/*
+ * VP8 DSP functions x86-optimized
+ * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
+ * Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavcodec/vp8dsp.h"
+
+#if HAVE_YASM
+
+/*
+ * MC functions
+ */
+extern void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+
+extern void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_epel8_h6_sse2 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_epel8_v4_sse2 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_epel8_v6_sse2 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+
+extern void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+
+extern void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+
+extern void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+
+
+extern void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+extern void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int mx, int my);
+
+#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
+static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
+ uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+ ptrdiff_t srcstride, int height, int mx, int my) \
+{ \
+ ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
+ dst, dststride, src, srcstride, height, mx, my); \
+ ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
+ dst + 8, dststride, src + 8, srcstride, height, mx, my); \
+}
+#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
+static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
+ uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+ ptrdiff_t srcstride, int height, int mx, int my) \
+{ \
+ ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
+ dst, dststride, src, srcstride, height, mx, my); \
+ ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
+ dst + 4, dststride, src + 4, srcstride, height, mx, my); \
+}
+
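+/* The asm only provides 4- and 8-pixel-wide MC kernels for some ISAs; the
+ * TAP_W8/TAP_W16 wrappers above stitch the wider variants together by
+ * running the narrower kernel on the left and right halves of the block. */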
+#if ARCH_X86_32
+TAP_W8 (mmxext, epel, h4)
+TAP_W8 (mmxext, epel, h6)
+TAP_W16(mmxext, epel, h6)
+TAP_W8 (mmxext, epel, v4)
+TAP_W8 (mmxext, epel, v6)
+TAP_W16(mmxext, epel, v6)
+TAP_W8 (mmxext, bilinear, h)
+TAP_W16(mmxext, bilinear, h)
+TAP_W8 (mmxext, bilinear, v)
+TAP_W16(mmxext, bilinear, v)
+#endif
+
+TAP_W16(sse2, epel, h6)
+TAP_W16(sse2, epel, v6)
+TAP_W16(sse2, bilinear, h)
+TAP_W16(sse2, bilinear, v)
+
+TAP_W16(ssse3, epel, h6)
+TAP_W16(ssse3, epel, v6)
+TAP_W16(ssse3, bilinear, h)
+TAP_W16(ssse3, bilinear, v)
+
+#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
+static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
+ uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+ ptrdiff_t srcstride, int height, int mx, int my) \
+{ \
+ DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
+ uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
+ src -= srcstride * (TAPNUMY / 2 - 1); \
+ ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
+ tmp, SIZE, src, srcstride, height + TAPNUMY - 1, mx, my); \
+ ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
+ dst, dststride, tmpptr, SIZE, height, mx, my); \
+}
+
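+/* HVTAP above builds 2D (h+v) subpel MC as two passes: the horizontal filter
+ * writes into an aligned temporary buffer, starting (TAPNUMY/2 - 1) rows
+ * above the block and producing height + TAPNUMY - 1 rows, and the vertical
+ * filter then reads from that buffer. */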
+#if ARCH_X86_32
+#define HVTAPMMX(x, y) \
+HVTAP(mmxext, 8, x, y, 4, 8) \
+HVTAP(mmxext, 8, x, y, 8, 16)
+
+HVTAP(mmxext, 8, 6, 6, 16, 16)
+#else
+#define HVTAPMMX(x, y) \
+HVTAP(mmxext, 8, x, y, 4, 8)
+#endif
+
+HVTAPMMX(4, 4)
+HVTAPMMX(4, 6)
+HVTAPMMX(6, 4)
+HVTAPMMX(6, 6)
+
+#define HVTAPSSE2(x, y, w) \
+HVTAP(sse2, 16, x, y, w, 16) \
+HVTAP(ssse3, 16, x, y, w, 16)
+
+HVTAPSSE2(4, 4, 8)
+HVTAPSSE2(4, 6, 8)
+HVTAPSSE2(6, 4, 8)
+HVTAPSSE2(6, 6, 8)
+HVTAPSSE2(6, 6, 16)
+
+HVTAP(ssse3, 16, 4, 4, 4, 8)
+HVTAP(ssse3, 16, 4, 6, 4, 8)
+HVTAP(ssse3, 16, 6, 4, 4, 8)
+HVTAP(ssse3, 16, 6, 6, 4, 8)
+
+#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
+static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
+ uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+ ptrdiff_t srcstride, int height, int mx, int my) \
+{ \
+ DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \
+ ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
+ tmp, SIZE, src, srcstride, height + 1, mx, my); \
+ ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
+ dst, dststride, tmp, SIZE, height, mx, my); \
+}
+
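+/* HVBILIN uses the same two-pass scheme for bilinear hv: a horizontal pass
+ * into a temporary buffer with one extra row, then the vertical pass. */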
+HVBILIN(mmxext, 8, 4, 8)
+#if ARCH_X86_32
+HVBILIN(mmxext, 8, 8, 16)
+HVBILIN(mmxext, 8, 16, 16)
+#endif
+HVBILIN(sse2, 8, 8, 16)
+HVBILIN(sse2, 8, 16, 16)
+HVBILIN(ssse3, 8, 4, 8)
+HVBILIN(ssse3, 8, 8, 16)
+HVBILIN(ssse3, 8, 16, 16)
+
+extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16],
+ ptrdiff_t stride);
+extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16],
+ ptrdiff_t stride);
+extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16],
+ ptrdiff_t stride);
+extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16],
+ ptrdiff_t stride);
+extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16],
+ ptrdiff_t stride);
+extern void ff_vp8_luma_dc_wht_mmx(int16_t block[4][4][16], int16_t dc[16]);
+extern void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]);
+extern void ff_vp8_idct_add_mmx(uint8_t *dst, int16_t block[16],
+ ptrdiff_t stride);
+extern void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16],
+ ptrdiff_t stride);
+
+#define DECLARE_LOOP_FILTER(NAME)\
+extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \
+ ptrdiff_t stride, \
+ int flim);\
+extern void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \
+ ptrdiff_t stride, \
+ int flim);\
+extern void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
+ ptrdiff_t stride,\
+ int e, int i, int hvt);\
+extern void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
+ ptrdiff_t stride,\
+ int e, int i, int hvt);\
+extern void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
+ uint8_t *dstV,\
+ ptrdiff_t s, \
+ int e, int i, int hvt);\
+extern void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
+ uint8_t *dstV,\
+ ptrdiff_t s, \
+ int e, int i, int hvt);\
+extern void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
+ ptrdiff_t stride,\
+ int e, int i, int hvt);\
+extern void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
+ ptrdiff_t stride,\
+ int e, int i, int hvt);\
+extern void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
+ uint8_t *dstV,\
+ ptrdiff_t s, \
+ int e, int i, int hvt);\
+extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
+ uint8_t *dstV,\
+ ptrdiff_t s, \
+ int e, int i, int hvt);
+
+DECLARE_LOOP_FILTER(mmx)
+DECLARE_LOOP_FILTER(mmxext)
+DECLARE_LOOP_FILTER(sse2)
+DECLARE_LOOP_FILTER(ssse3)
+DECLARE_LOOP_FILTER(sse4)
+
+#endif /* HAVE_YASM */
+
+#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
+ c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
+ c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
+ c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT
+
+#define VP8_MC_FUNC(IDX, SIZE, OPT) \
+ c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
+ c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
+ c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
+ c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
+ c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
+ VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)
+
+#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
+ c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
+ c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
+ c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
+ c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
+ c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
+ c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
+ c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
+ c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT
+
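+/* put_vp8_*_pixels_tab[a][b][c]: a selects the block width (0=16, 1=8, 2=4);
+ * judging by the assignments above, b/c select the vertical/horizontal
+ * subpel filter (epel: 0=copy, 1=4-tap, 2=6-tap; bilinear: any nonzero
+ * index uses the bilinear filter). */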
+
+av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
+{
+#if HAVE_YASM
+ int mm_flags = av_get_cpu_flags();
+
+ if (mm_flags & AV_CPU_FLAG_MMX) {
+ c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
+ c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
+#if ARCH_X86_32
+ c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
+ c->vp8_idct_add = ff_vp8_idct_add_mmx;
+ c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
+ c->put_vp8_epel_pixels_tab[0][0][0] =
+ c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
+#endif
+ c->put_vp8_epel_pixels_tab[1][0][0] =
+ c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
+
+#if ARCH_X86_32
+ c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
+
+ c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
+ c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
+ c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
+ c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;
+
+ c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx;
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx;
+ c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx;
+ c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx;
+#endif
+ }
+
+ /* note that 4-tap width=16 functions are missing because w=16
+ * is only used for luma, and luma is always a copy or sixtap. */
+ if (mm_flags & AV_CPU_FLAG_MMXEXT) {
+ VP8_MC_FUNC(2, 4, mmxext);
+ VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
+#if ARCH_X86_32
+ VP8_LUMA_MC_FUNC(0, 16, mmxext);
+ VP8_MC_FUNC(1, 8, mmxext);
+ VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
+ VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
+
+ c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
+
+ c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
+ c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
+ c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
+ c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;
+
+ c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
+ c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
+ c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
+#endif
+ }
+
+ if (mm_flags & AV_CPU_FLAG_SSE) {
+ c->vp8_idct_add = ff_vp8_idct_add_sse;
+ c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
+ c->put_vp8_epel_pixels_tab[0][0][0] =
+ c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
+ }
+
+ if (mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) {
+ VP8_LUMA_MC_FUNC(0, 16, sse2);
+ VP8_MC_FUNC(1, 8, sse2);
+ VP8_BILINEAR_MC_FUNC(0, 16, sse2);
+ VP8_BILINEAR_MC_FUNC(1, 8, sse2);
+
+ c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
+
+ c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
+ c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
+
+ c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
+ c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
+ }
+
+ if (mm_flags & AV_CPU_FLAG_SSE2) {
+ c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
+
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
+
+ c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
+ c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
+
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
+ c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
+ }
+
+ if (mm_flags & AV_CPU_FLAG_SSSE3) {
+ VP8_LUMA_MC_FUNC(0, 16, ssse3);
+ VP8_MC_FUNC(1, 8, ssse3);
+ VP8_MC_FUNC(2, 4, ssse3);
+ VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
+ VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
+ VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
+
+ c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
+
+ c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
+ c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
+ c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
+ c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
+
+ c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
+ c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
+ c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
+ }
+
+ if (mm_flags & AV_CPU_FLAG_SSE4) {
+ c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
+
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
+ c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
+ }
+#endif /* HAVE_YASM */
+}
diff --git a/ffmpeg/libavcodec/x86/w64xmmtest.c b/ffmpeg/libavcodec/x86/w64xmmtest.c
new file mode 100644
index 0000000..f6e3de9
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/w64xmmtest.c
@@ -0,0 +1,80 @@
+/*
+ * check XMM registers for clobbers on Win64
+ * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/avcodec.h"
+#include "libavutil/x86/w64xmmtest.h"
+
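+/* wrap()/testxmmclobbers() come from w64xmmtest.h: each public libavcodec
+ * entry point below is wrapped so a test run can detect callees that clobber
+ * the XMM registers the Win64 ABI requires to be preserved (xmm6-xmm15). */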
+wrap(avcodec_open2(AVCodecContext *avctx,
+ AVCodec *codec,
+ AVDictionary **options))
+{
+ testxmmclobbers(avcodec_open2, avctx, codec, options);
+}
+
+wrap(avcodec_decode_audio4(AVCodecContext *avctx,
+ AVFrame *frame,
+ int *got_frame_ptr,
+ AVPacket *avpkt))
+{
+ testxmmclobbers(avcodec_decode_audio4, avctx, frame,
+ got_frame_ptr, avpkt);
+}
+
+wrap(avcodec_decode_video2(AVCodecContext *avctx,
+ AVFrame *picture,
+ int *got_picture_ptr,
+ AVPacket *avpkt))
+{
+ testxmmclobbers(avcodec_decode_video2, avctx, picture,
+ got_picture_ptr, avpkt);
+}
+
+wrap(avcodec_decode_subtitle2(AVCodecContext *avctx,
+ AVSubtitle *sub,
+ int *got_sub_ptr,
+ AVPacket *avpkt))
+{
+ testxmmclobbers(avcodec_decode_subtitle2, avctx, sub,
+ got_sub_ptr, avpkt);
+}
+
+wrap(avcodec_encode_audio2(AVCodecContext *avctx,
+ AVPacket *avpkt,
+ const AVFrame *frame,
+ int *got_packet_ptr))
+{
+ testxmmclobbers(avcodec_encode_audio2, avctx, avpkt, frame,
+ got_packet_ptr);
+}
+
+wrap(avcodec_encode_video(AVCodecContext *avctx,
+ uint8_t *buf, int buf_size,
+ const AVFrame *pict))
+{
+ testxmmclobbers(avcodec_encode_video, avctx, buf, buf_size, pict);
+}
+
+wrap(avcodec_encode_subtitle(AVCodecContext *avctx,
+ uint8_t *buf, int buf_size,
+ const AVSubtitle *sub))
+{
+ testxmmclobbers(avcodec_encode_subtitle, avctx, buf, buf_size, sub);
+}