author     Tim Redfern <tim@eclectronics.org>  2014-02-17 13:36:38 +0000
committer  Tim Redfern <tim@eclectronics.org>  2014-02-17 13:36:38 +0000
commit     22e28216336da876e1fd17f380ce42eaf1446769 (patch)
tree       444dad3dc7e2656992d29f34f7bce31970c122a5  /ffmpeg/libavcodec/x86
parent     ae5e8541f6e06e64c28719467cdf366ac57aff31 (diff)

chasing indexing error
Diffstat (limited to 'ffmpeg/libavcodec/x86')
-rw-r--r--  ffmpeg/libavcodec/x86/Makefile  106
-rw-r--r--  ffmpeg/libavcodec/x86/ac3dsp.asm  421
-rw-r--r--  ffmpeg/libavcodec/x86/ac3dsp_init.c  257
-rw-r--r--  ffmpeg/libavcodec/x86/cabac.h  299
-rw-r--r--  ffmpeg/libavcodec/x86/cavsdsp.c  558
-rw-r--r--  ffmpeg/libavcodec/x86/constants.c  53
-rw-r--r--  ffmpeg/libavcodec/x86/dct32.asm  490
-rw-r--r--  ffmpeg/libavcodec/x86/deinterlace.asm  82
-rw-r--r--  ffmpeg/libavcodec/x86/dirac_dwt.c  202
-rw-r--r--  ffmpeg/libavcodec/x86/dirac_dwt.h  30
-rw-r--r--  ffmpeg/libavcodec/x86/diracdsp_mmx.c  104
-rw-r--r--  ffmpeg/libavcodec/x86/diracdsp_mmx.h  47
-rw-r--r--  ffmpeg/libavcodec/x86/diracdsp_yasm.asm  264
-rw-r--r--  ffmpeg/libavcodec/x86/dnxhdenc.c  67
-rw-r--r--  ffmpeg/libavcodec/x86/dsputil.asm  653
-rw-r--r--  ffmpeg/libavcodec/x86/dsputil_mmx.c  638
-rw-r--r--  ffmpeg/libavcodec/x86/dsputil_qns_template.c  101
-rw-r--r--  ffmpeg/libavcodec/x86/dsputilenc.asm  487
-rw-r--r--  ffmpeg/libavcodec/x86/dsputilenc_mmx.c  1061
-rw-r--r--  ffmpeg/libavcodec/x86/dwt_yasm.asm  306
-rw-r--r--  ffmpeg/libavcodec/x86/fdct.c  594
-rw-r--r--  ffmpeg/libavcodec/x86/fft.asm  1092
-rw-r--r--  ffmpeg/libavcodec/x86/fft.h  38
-rw-r--r--  ffmpeg/libavcodec/x86/fft_init.c  57
-rw-r--r--  ffmpeg/libavcodec/x86/fmtconvert.asm  429
-rw-r--r--  ffmpeg/libavcodec/x86/fmtconvert_init.c  147
-rw-r--r--  ffmpeg/libavcodec/x86/h263_loopfilter.asm  189
-rw-r--r--  ffmpeg/libavcodec/x86/h264_chromamc.asm  678
-rw-r--r--  ffmpeg/libavcodec/x86/h264_chromamc_10bit.asm  271
-rw-r--r--  ffmpeg/libavcodec/x86/h264_deblock.asm  1078
-rw-r--r--  ffmpeg/libavcodec/x86/h264_deblock_10bit.asm  923
-rw-r--r--  ffmpeg/libavcodec/x86/h264_i386.h  204
-rw-r--r--  ffmpeg/libavcodec/x86/h264_idct.asm  1082
-rw-r--r--  ffmpeg/libavcodec/x86/h264_idct_10bit.asm  589
-rw-r--r--  ffmpeg/libavcodec/x86/h264_intrapred.asm  2699
-rw-r--r--  ffmpeg/libavcodec/x86/h264_intrapred_10bit.asm  1199
-rw-r--r--  ffmpeg/libavcodec/x86/h264_intrapred_init.c  402
-rw-r--r--  ffmpeg/libavcodec/x86/h264_qpel.c  634
-rw-r--r--  ffmpeg/libavcodec/x86/h264_qpel_10bit.asm  884
-rw-r--r--  ffmpeg/libavcodec/x86/h264_qpel_8bit.asm  862
-rw-r--r--  ffmpeg/libavcodec/x86/h264_weight.asm  317
-rw-r--r--  ffmpeg/libavcodec/x86/h264_weight_10bit.asm  282
-rw-r--r--  ffmpeg/libavcodec/x86/h264chroma_init.c  119
-rw-r--r--  ffmpeg/libavcodec/x86/h264dsp_init.c  371
-rw-r--r--  ffmpeg/libavcodec/x86/hpeldsp.asm  461
-rw-r--r--  ffmpeg/libavcodec/x86/hpeldsp_init.c  269
-rw-r--r--  ffmpeg/libavcodec/x86/hpeldsp_rnd_template.c  198
-rw-r--r--  ffmpeg/libavcodec/x86/idct_mmx_xvid.c  562
-rw-r--r--  ffmpeg/libavcodec/x86/idct_sse2_xvid.c  407
-rw-r--r--  ffmpeg/libavcodec/x86/idct_xvid.h  43
-rw-r--r--  ffmpeg/libavcodec/x86/imdct36.asm  724
-rw-r--r--  ffmpeg/libavcodec/x86/lpc.c  159
-rw-r--r--  ffmpeg/libavcodec/x86/mathops.h  128
-rw-r--r--  ffmpeg/libavcodec/x86/mlpdsp.c  186
-rw-r--r--  ffmpeg/libavcodec/x86/motion_est.c  474
-rw-r--r--  ffmpeg/libavcodec/x86/mpeg4qpel.asm  560
-rw-r--r--  ffmpeg/libavcodec/x86/mpegvideo.c  463
-rw-r--r--  ffmpeg/libavcodec/x86/mpegvideoenc.c  228
-rw-r--r--  ffmpeg/libavcodec/x86/mpegvideoenc_template.c  364
-rw-r--r--  ffmpeg/libavcodec/x86/pngdsp.asm  173
-rw-r--r--  ffmpeg/libavcodec/x86/pngdsp_init.c  50
-rw-r--r--  ffmpeg/libavcodec/x86/proresdsp.asm  326
-rw-r--r--  ffmpeg/libavcodec/x86/proresdsp_init.c  58
-rw-r--r--  ffmpeg/libavcodec/x86/rv34dsp.asm  196
-rw-r--r--  ffmpeg/libavcodec/x86/rv34dsp_init.c  45
-rw-r--r--  ffmpeg/libavcodec/x86/rv40dsp.asm  501
-rw-r--r--  ffmpeg/libavcodec/x86/rv40dsp_init.c  270
-rw-r--r--  ffmpeg/libavcodec/x86/sbrdsp.asm  425
-rw-r--r--  ffmpeg/libavcodec/x86/sbrdsp_init.c  76
-rw-r--r--  ffmpeg/libavcodec/x86/simple_idct.c  1167
-rw-r--r--  ffmpeg/libavcodec/x86/snowdsp.c  902
-rw-r--r--  ffmpeg/libavcodec/x86/v210-init.c  48
-rw-r--r--  ffmpeg/libavcodec/x86/v210.asm  88
-rw-r--r--  ffmpeg/libavcodec/x86/vc1dsp.asm  317
-rw-r--r--  ffmpeg/libavcodec/x86/vc1dsp.h  29
-rw-r--r--  ffmpeg/libavcodec/x86/vc1dsp_init.c  131
-rw-r--r--  ffmpeg/libavcodec/x86/vc1dsp_mmx.c  757
-rw-r--r--  ffmpeg/libavcodec/x86/videodsp.asm  444
-rw-r--r--  ffmpeg/libavcodec/x86/videodsp_init.c  270
-rw-r--r--  ffmpeg/libavcodec/x86/vorbisdsp.asm  83
-rw-r--r--  ffmpeg/libavcodec/x86/vorbisdsp_init.c  44
-rw-r--r--  ffmpeg/libavcodec/x86/vp3dsp.asm  709
-rw-r--r--  ffmpeg/libavcodec/x86/vp3dsp_init.c  128
-rw-r--r--  ffmpeg/libavcodec/x86/vp56_arith.h  54
-rw-r--r--  ffmpeg/libavcodec/x86/vp8dsp.asm  1225
-rw-r--r--  ffmpeg/libavcodec/x86/vp8dsp_init.c  441
-rw-r--r--  ffmpeg/libavcodec/x86/w64xmmtest.c  86
87 files changed, 0 insertions, 35635 deletions
diff --git a/ffmpeg/libavcodec/x86/Makefile b/ffmpeg/libavcodec/x86/Makefile
deleted file mode 100644
index 2d2d5a0..0000000
--- a/ffmpeg/libavcodec/x86/Makefile
+++ /dev/null
@@ -1,106 +0,0 @@
-OBJS += x86/constants.o \
- x86/fmtconvert_init.o \
-
-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o
-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o
-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
-OBJS-$(CONFIG_DCT) += x86/dct_init.o
-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_init.o \
- x86/dsputil_x86.o
-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o \
- x86/fdct.o \
- x86/motion_est.o
-OBJS-$(CONFIG_FFT) += x86/fft_init.o
-OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o
-OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
-OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o
-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o
-OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o
-OBJS-$(CONFIG_LPC) += x86/lpc.o
-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o
-OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o
-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o
-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o
-OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp_init.o
-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp_init.o \
- x86/rv40dsp_init.o
-OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_init.o
-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o
-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o
-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o
-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp_init.o
-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o
-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o
-OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
-
-MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o \
- x86/fpel_mmx.o \
- x86/idct_mmx_xvid.o \
- x86/idct_sse2_xvid.o \
- x86/rnd_mmx.o \
- x86/simple_idct.o
-MMX-OBJS-$(CONFIG_DIRAC_DECODER) += x86/dirac_dwt.o
-MMX-OBJS-$(CONFIG_HPELDSP) += x86/fpel_mmx.o \
- x86/hpeldsp_mmx.o \
- x86/rnd_mmx.o
-MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o
-MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o
-MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o
-
-YASM-OBJS += x86/deinterlace.o \
- x86/fmtconvert.o \
-
-YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
-YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o
-YASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
-YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_mmx.o x86/diracdsp_yasm.o\
- x86/dwt_yasm.o
-YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \
- x86/fpel.o \
- x86/mpeg4qpel.o \
- x86/qpel.o
-YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o
-YASM-OBJS-$(CONFIG_FFT) += x86/fft.o
-YASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o
-YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
- x86/h264_chromamc_10bit.o
-YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \
- x86/h264_deblock_10bit.o \
- x86/h264_idct.o \
- x86/h264_idct_10bit.o \
- x86/h264_weight.o \
- x86/h264_weight_10bit.o
-YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \
- x86/h264_intrapred_10bit.o
-YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
- x86/h264_qpel_10bit.o \
- x86/fpel.o \
- x86/qpel.o
-YASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \
- x86/hpeldsp.o
-YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
-YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
-YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
-YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
-YASM-OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp.o
-YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp.o \
- x86/rv40dsp.o
-YASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
-YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o
-YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
-YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
-YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
-YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
-YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o \
- x86/vp8dsp_loopfilter.o
-YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9itxfm.o \
- x86/vp9mc.o
-YASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o
diff --git a/ffmpeg/libavcodec/x86/ac3dsp.asm b/ffmpeg/libavcodec/x86/ac3dsp.asm
deleted file mode 100644
index 89a64f5..0000000
--- a/ffmpeg/libavcodec/x86/ac3dsp.asm
+++ /dev/null
@@ -1,421 +0,0 @@
-;*****************************************************************************
-;* x86-optimized AC-3 DSP utils
-;* Copyright (c) 2011 Justin Ruggles
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-; 16777216.0f - used in ff_float_to_fixed24()
-pf_1_24: times 4 dd 0x4B800000
-
-; used in ff_ac3_compute_mantissa_size()
-cextern ac3_bap_bits
-pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
-pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
-
-; used in ff_ac3_extract_exponents()
-pd_1: times 4 dd 1
-pd_151: times 4 dd 151
-
-SECTION .text
-
-;-----------------------------------------------------------------------------
-; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
-;-----------------------------------------------------------------------------
-
-%macro AC3_EXPONENT_MIN 0
-cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
- shl reuse_blksq, 8
- jz .end
- LOOP_ALIGN
-.nextexp:
- mov offsetq, reuse_blksq
- mova m0, [expq+offsetq]
- sub offsetq, 256
- LOOP_ALIGN
-.nextblk:
- PMINUB m0, [expq+offsetq], m1
- sub offsetq, 256
- jae .nextblk
- mova [expq], m0
- add expq, mmsize
- sub expnq, mmsize
- jg .nextexp
-.end:
- REP_RET
-%endmacro
-
-%define LOOP_ALIGN
-INIT_MMX mmx
-AC3_EXPONENT_MIN
-%if HAVE_MMXEXT_EXTERNAL
-%define LOOP_ALIGN ALIGN 16
-INIT_MMX mmxext
-AC3_EXPONENT_MIN
-%endif
-%if HAVE_SSE2_EXTERNAL
-INIT_XMM sse2
-AC3_EXPONENT_MIN
-%endif
-%undef LOOP_ALIGN
-
-;-----------------------------------------------------------------------------
-; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
-;
-; This function uses 2 different methods to calculate a valid result.
-; 1) logical 'or' of abs of each element
-; This is used for ssse3 because of the pabsw instruction.
-; It is also used for mmx because of the lack of min/max instructions.
-; 2) calculate min/max for the array, then or(abs(min),abs(max))
-; This is used for mmxext and sse2 because they have pminsw/pmaxsw.
-;-----------------------------------------------------------------------------
-
-; logical 'or' of 4 or 8 words in an mmx or xmm register into the low word
-%macro OR_WORDS_HORIZ 2 ; src, tmp
-%if cpuflag(sse2)
- movhlps %2, %1
- por %1, %2
- pshuflw %2, %1, q0032
- por %1, %2
- pshuflw %2, %1, q0001
- por %1, %2
-%elif cpuflag(mmxext)
- pshufw %2, %1, q0032
- por %1, %2
- pshufw %2, %1, q0001
- por %1, %2
-%else ; mmx
- movq %2, %1
- psrlq %2, 32
- por %1, %2
- movq %2, %1
- psrlq %2, 16
- por %1, %2
-%endif
-%endmacro
-
-%macro AC3_MAX_MSB_ABS_INT16 1
-cglobal ac3_max_msb_abs_int16, 2,2,5, src, len
- pxor m2, m2
- pxor m3, m3
-.loop:
-%ifidn %1, min_max
- mova m0, [srcq]
- mova m1, [srcq+mmsize]
- pminsw m2, m0
- pminsw m2, m1
- pmaxsw m3, m0
- pmaxsw m3, m1
-%else ; or_abs
-%if notcpuflag(ssse3)
- mova m0, [srcq]
- mova m1, [srcq+mmsize]
- ABS2 m0, m1, m3, m4
-%else ; ssse3
- ; using memory args is faster for ssse3
- pabsw m0, [srcq]
- pabsw m1, [srcq+mmsize]
-%endif
- por m2, m0
- por m2, m1
-%endif
- add srcq, mmsize*2
- sub lend, mmsize
- ja .loop
-%ifidn %1, min_max
- ABS2 m2, m3, m0, m1
- por m2, m3
-%endif
- OR_WORDS_HORIZ m2, m0
- movd eax, m2
- and eax, 0xFFFF
- RET
-%endmacro
-
-INIT_MMX mmx
-AC3_MAX_MSB_ABS_INT16 or_abs
-INIT_MMX mmxext
-AC3_MAX_MSB_ABS_INT16 min_max
-INIT_XMM sse2
-AC3_MAX_MSB_ABS_INT16 min_max
-INIT_XMM ssse3
-AC3_MAX_MSB_ABS_INT16 or_abs
-
-;-----------------------------------------------------------------------------
-; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
-;-----------------------------------------------------------------------------
-
-%macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction, instruction set
-cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift
- movd m0, shiftd
-.loop:
- mova m1, [srcq ]
- mova m2, [srcq+mmsize ]
- mova m3, [srcq+mmsize*2]
- mova m4, [srcq+mmsize*3]
- %3 m1, m0
- %3 m2, m0
- %3 m3, m0
- %3 m4, m0
- mova [srcq ], m1
- mova [srcq+mmsize ], m2
- mova [srcq+mmsize*2], m3
- mova [srcq+mmsize*3], m4
- add srcq, mmsize*4
- sub lend, mmsize*32/%2
- ja .loop
-.end:
- REP_RET
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmx
-AC3_SHIFT l, 16, psllw
-INIT_XMM sse2
-AC3_SHIFT l, 16, psllw
-
-;-----------------------------------------------------------------------------
-; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmx
-AC3_SHIFT r, 32, psrad
-INIT_XMM sse2
-AC3_SHIFT r, 32, psrad
-
-;-----------------------------------------------------------------------------
-; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
-;-----------------------------------------------------------------------------
-
-; The 3DNow! version is not bit-identical because pf2id uses truncation rather
-; than round-to-nearest.
-INIT_MMX 3dnow
-cglobal float_to_fixed24, 3, 3, 0, dst, src, len
- movq m0, [pf_1_24]
-.loop:
- movq m1, [srcq ]
- movq m2, [srcq+8 ]
- movq m3, [srcq+16]
- movq m4, [srcq+24]
- pfmul m1, m0
- pfmul m2, m0
- pfmul m3, m0
- pfmul m4, m0
- pf2id m1, m1
- pf2id m2, m2
- pf2id m3, m3
- pf2id m4, m4
- movq [dstq ], m1
- movq [dstq+8 ], m2
- movq [dstq+16], m3
- movq [dstq+24], m4
- add srcq, 32
- add dstq, 32
- sub lend, 8
- ja .loop
- femms
- RET
-
-INIT_XMM sse
-cglobal float_to_fixed24, 3, 3, 3, dst, src, len
- movaps m0, [pf_1_24]
-.loop:
- movaps m1, [srcq ]
- movaps m2, [srcq+16]
- mulps m1, m0
- mulps m2, m0
- cvtps2pi mm0, m1
- movhlps m1, m1
- cvtps2pi mm1, m1
- cvtps2pi mm2, m2
- movhlps m2, m2
- cvtps2pi mm3, m2
- movq [dstq ], mm0
- movq [dstq+ 8], mm1
- movq [dstq+16], mm2
- movq [dstq+24], mm3
- add srcq, 32
- add dstq, 32
- sub lend, 8
- ja .loop
- emms
- RET
-
-INIT_XMM sse2
-cglobal float_to_fixed24, 3, 3, 9, dst, src, len
- movaps m0, [pf_1_24]
-.loop:
- movaps m1, [srcq ]
- movaps m2, [srcq+16 ]
- movaps m3, [srcq+32 ]
- movaps m4, [srcq+48 ]
-%ifdef m8
- movaps m5, [srcq+64 ]
- movaps m6, [srcq+80 ]
- movaps m7, [srcq+96 ]
- movaps m8, [srcq+112]
-%endif
- mulps m1, m0
- mulps m2, m0
- mulps m3, m0
- mulps m4, m0
-%ifdef m8
- mulps m5, m0
- mulps m6, m0
- mulps m7, m0
- mulps m8, m0
-%endif
- cvtps2dq m1, m1
- cvtps2dq m2, m2
- cvtps2dq m3, m3
- cvtps2dq m4, m4
-%ifdef m8
- cvtps2dq m5, m5
- cvtps2dq m6, m6
- cvtps2dq m7, m7
- cvtps2dq m8, m8
-%endif
- movdqa [dstq ], m1
- movdqa [dstq+16 ], m2
- movdqa [dstq+32 ], m3
- movdqa [dstq+48 ], m4
-%ifdef m8
- movdqa [dstq+64 ], m5
- movdqa [dstq+80 ], m6
- movdqa [dstq+96 ], m7
- movdqa [dstq+112], m8
- add srcq, 128
- add dstq, 128
- sub lenq, 32
-%else
- add srcq, 64
- add dstq, 64
- sub lenq, 16
-%endif
- ja .loop
- REP_RET
-
-;------------------------------------------------------------------------------
-; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
-;------------------------------------------------------------------------------
-
-%macro PHADDD4 2 ; xmm src, xmm tmp
- movhlps %2, %1
- paddd %1, %2
- pshufd %2, %1, 0x1
- paddd %1, %2
-%endmacro
-
-INIT_XMM sse2
-cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
- movdqa m0, [mant_cntq ]
- movdqa m1, [mant_cntq+ 1*16]
- paddw m0, [mant_cntq+ 2*16]
- paddw m1, [mant_cntq+ 3*16]
- paddw m0, [mant_cntq+ 4*16]
- paddw m1, [mant_cntq+ 5*16]
- paddw m0, [mant_cntq+ 6*16]
- paddw m1, [mant_cntq+ 7*16]
- paddw m0, [mant_cntq+ 8*16]
- paddw m1, [mant_cntq+ 9*16]
- paddw m0, [mant_cntq+10*16]
- paddw m1, [mant_cntq+11*16]
- pmaddwd m0, [ac3_bap_bits ]
- pmaddwd m1, [ac3_bap_bits+16]
- paddd m0, m1
- PHADDD4 m0, m1
- movd sumd, m0
- movdqa m3, [pw_bap_mul1]
- movhpd m0, [mant_cntq +2]
- movlpd m0, [mant_cntq+1*32+2]
- movhpd m1, [mant_cntq+2*32+2]
- movlpd m1, [mant_cntq+3*32+2]
- movhpd m2, [mant_cntq+4*32+2]
- movlpd m2, [mant_cntq+5*32+2]
- pmulhuw m0, m3
- pmulhuw m1, m3
- pmulhuw m2, m3
- paddusw m0, m1
- paddusw m0, m2
- pmaddwd m0, [pw_bap_mul2]
- PHADDD4 m0, m1
- movd eax, m0
- add eax, sumd
- RET
-
-;------------------------------------------------------------------------------
-; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
-;------------------------------------------------------------------------------
-
-%macro PABSD 1-2 ; src/dst, unused
-%if cpuflag(ssse3)
- pabsd %1, %1
-%else ; src/dst, tmp
- pxor %2, %2
- pcmpgtd %2, %1
- pxor %1, %2
- psubd %1, %2
-%endif
-%endmacro
-
-%macro AC3_EXTRACT_EXPONENTS 0
-cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
- add expq, lenq
- lea coefq, [coefq+4*lenq]
- neg lenq
- mova m2, [pd_1]
- mova m3, [pd_151]
-.loop:
- ; move 4 32-bit coefs to xmm0
- mova m0, [coefq+4*lenq]
- ; absolute value
- PABSD m0, m1
- ; convert to float and extract exponents
- pslld m0, 1
- por m0, m2
- cvtdq2ps m1, m0
- psrld m1, 23
- mova m0, m3
- psubd m0, m1
- ; move the lowest byte in each of 4 dwords to the low dword
- ; NOTE: We cannot just extract the low bytes with pshufb because the dword
- ; result for 16777215 is -1 due to float inaccuracy. Using packuswb
- ; clips this to 0, which is the correct exponent.
- packssdw m0, m0
- packuswb m0, m0
- movd [expq+lenq], m0
-
- add lenq, 4
- jl .loop
- REP_RET
-%endmacro
-
-%if HAVE_SSE2_EXTERNAL
-INIT_XMM sse2
-AC3_EXTRACT_EXPONENTS
-%endif
-%if HAVE_SSSE3_EXTERNAL
-INIT_XMM ssse3
-AC3_EXTRACT_EXPONENTS
-%endif
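
Two of the routines deleted above are easier to follow as scalar C. The sketch below is derived only from the constants and comments in the asm (pf_1_24 = 16777216.0f = 2^24; the exponent trick takes the biased float exponent of 2*|coef|+1 and subtracts it from 151, i.e. 23 - floor(log2(|coef|))). Function names and helpers here are illustrative, not the codec's actual C fallbacks.

    #include <math.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar view of float_to_fixed24: scale by 2^24 (the pf_1_24 constant)
     * and round to nearest, which is what cvtps2dq does in the SSE2 version
     * (the 3DNow! pf2id variant truncates instead, as noted above). */
    static void float_to_fixed24_ref(int32_t *dst, const float *src, unsigned int len)
    {
        for (unsigned int i = 0; i < len; i++)
            dst[i] = (int32_t)lrintf(src[i] * 16777216.0f);
    }

    /* Scalar view of ac3_extract_exponents: 23 - floor(log2(|coef|)) for
     * nonzero coefficients and 24 for zero.  (The asm additionally clips
     * negative results to 0 via packssdw/packuswb, per the NOTE above.) */
    static void ac3_extract_exponents_ref(uint8_t *exp, const int32_t *coef, int nb_coefs)
    {
        for (int i = 0; i < nb_coefs; i++) {
            int v = abs(coef[i]);
            int log2v = 0;
            while (v >> (log2v + 1))
                log2v++;                   /* floor(log2(v)) for v >= 1 */
            exp[i] = v ? 23 - log2v : 24;
        }
    }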
diff --git a/ffmpeg/libavcodec/x86/ac3dsp_init.c b/ffmpeg/libavcodec/x86/ac3dsp_init.c
deleted file mode 100644
index 5819d00..0000000
--- a/ffmpeg/libavcodec/x86/ac3dsp_init.c
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
- * x86-optimized AC-3 DSP utils
- * Copyright (c) 2011 Justin Ruggles
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "dsputil_x86.h"
-#include "libavcodec/ac3.h"
-#include "libavcodec/ac3dsp.h"
-
-void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
-void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
-void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
-
-int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len);
-int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
-int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len);
-int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len);
-
-void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
-void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);
-
-void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
-void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);
-
-void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
-void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len);
-void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);
-
-int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
-
-void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs);
-void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
-void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);
-
-void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
- const int16_t *window, unsigned int len);
-void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
- const int16_t *window, unsigned int len);
-void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
- const int16_t *window, unsigned int len);
-void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
- const int16_t *window, unsigned int len);
-void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
- const int16_t *window, unsigned int len);
-void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
- const int16_t *window, unsigned int len);
-
-#if ARCH_X86_32 && defined(__INTEL_COMPILER)
-# undef HAVE_7REGS
-# define HAVE_7REGS 0
-#endif
-
-#if HAVE_SSE_INLINE && HAVE_7REGS
-
-#define IF1(x) x
-#define IF0(x)
-
-#define MIX5(mono, stereo) \
- __asm__ volatile ( \
- "movss 0(%1), %%xmm5 \n" \
- "movss 8(%1), %%xmm6 \n" \
- "movss 24(%1), %%xmm7 \n" \
- "shufps $0, %%xmm5, %%xmm5 \n" \
- "shufps $0, %%xmm6, %%xmm6 \n" \
- "shufps $0, %%xmm7, %%xmm7 \n" \
- "1: \n" \
- "movaps (%0, %2), %%xmm0 \n" \
- "movaps (%0, %3), %%xmm1 \n" \
- "movaps (%0, %4), %%xmm2 \n" \
- "movaps (%0, %5), %%xmm3 \n" \
- "movaps (%0, %6), %%xmm4 \n" \
- "mulps %%xmm5, %%xmm0 \n" \
- "mulps %%xmm6, %%xmm1 \n" \
- "mulps %%xmm5, %%xmm2 \n" \
- "mulps %%xmm7, %%xmm3 \n" \
- "mulps %%xmm7, %%xmm4 \n" \
- stereo("addps %%xmm1, %%xmm0 \n") \
- "addps %%xmm1, %%xmm2 \n" \
- "addps %%xmm3, %%xmm0 \n" \
- "addps %%xmm4, %%xmm2 \n" \
- mono("addps %%xmm2, %%xmm0 \n") \
- "movaps %%xmm0, (%0, %2) \n" \
- stereo("movaps %%xmm2, (%0, %3) \n") \
- "add $16, %0 \n" \
- "jl 1b \n" \
- : "+&r"(i) \
- : "r"(matrix), \
- "r"(samples[0] + len), \
- "r"(samples[1] + len), \
- "r"(samples[2] + len), \
- "r"(samples[3] + len), \
- "r"(samples[4] + len) \
- : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",) \
- "memory" \
- );
-
-#define MIX_MISC(stereo) \
- __asm__ volatile ( \
- "mov %5, %2 \n" \
- "1: \n" \
- "mov -%c7(%6, %2, %c8), %3 \n" \
- "movaps (%3, %0), %%xmm0 \n" \
- stereo("movaps %%xmm0, %%xmm1 \n") \
- "mulps %%xmm4, %%xmm0 \n" \
- stereo("mulps %%xmm5, %%xmm1 \n") \
- "2: \n" \
- "mov (%6, %2, %c8), %1 \n" \
- "movaps (%1, %0), %%xmm2 \n" \
- stereo("movaps %%xmm2, %%xmm3 \n") \
- "mulps (%4, %2, 8), %%xmm2 \n" \
- stereo("mulps 16(%4, %2, 8), %%xmm3 \n") \
- "addps %%xmm2, %%xmm0 \n" \
- stereo("addps %%xmm3, %%xmm1 \n") \
- "add $4, %2 \n" \
- "jl 2b \n" \
- "mov %5, %2 \n" \
- stereo("mov (%6, %2, %c8), %1 \n") \
- "movaps %%xmm0, (%3, %0) \n" \
- stereo("movaps %%xmm1, (%1, %0) \n") \
- "add $16, %0 \n" \
- "jl 1b \n" \
- : "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m) \
- : "r"(matrix_simd + in_ch), \
- "g"((intptr_t) - 4 * (in_ch - 1)), \
- "r"(samp + in_ch), \
- "i"(sizeof(float *)), "i"(sizeof(float *)/4) \
- : "memory" \
- );
-
-static void ac3_downmix_sse(float **samples, float (*matrix)[2],
- int out_ch, int in_ch, int len)
-{
- int (*matrix_cmp)[2] = (int(*)[2])matrix;
- intptr_t i, j, k, m;
-
- i = -len * sizeof(float);
- if (in_ch == 5 && out_ch == 2 &&
- !(matrix_cmp[0][1] | matrix_cmp[2][0] |
- matrix_cmp[3][1] | matrix_cmp[4][0] |
- (matrix_cmp[1][0] ^ matrix_cmp[1][1]) |
- (matrix_cmp[0][0] ^ matrix_cmp[2][1]))) {
- MIX5(IF0, IF1);
- } else if (in_ch == 5 && out_ch == 1 &&
- matrix_cmp[0][0] == matrix_cmp[2][0] &&
- matrix_cmp[3][0] == matrix_cmp[4][0]) {
- MIX5(IF1, IF0);
- } else {
- DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
- float *samp[AC3_MAX_CHANNELS];
-
- for (j = 0; j < in_ch; j++)
- samp[j] = samples[j] + len;
-
- j = 2 * in_ch * sizeof(float);
- __asm__ volatile (
- "1: \n"
- "sub $8, %0 \n"
- "movss (%2, %0), %%xmm4 \n"
- "movss 4(%2, %0), %%xmm5 \n"
- "shufps $0, %%xmm4, %%xmm4 \n"
- "shufps $0, %%xmm5, %%xmm5 \n"
- "movaps %%xmm4, (%1, %0, 4) \n"
- "movaps %%xmm5, 16(%1, %0, 4) \n"
- "jg 1b \n"
- : "+&r"(j)
- : "r"(matrix_simd), "r"(matrix)
- : "memory"
- );
- if (out_ch == 2) {
- MIX_MISC(IF1);
- } else {
- MIX_MISC(IF0);
- }
- }
-}
-
-#endif /* HAVE_SSE_INLINE && HAVE_7REGS */
-
-av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (EXTERNAL_MMX(cpu_flags)) {
- c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
- c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
- c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
- c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
- }
- if (EXTERNAL_AMD3DNOW(cpu_flags)) {
- if (!bit_exact) {
- c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
- }
- }
- if (EXTERNAL_MMXEXT(cpu_flags)) {
- c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
- c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
- if (bit_exact) {
- c->apply_window_int16 = ff_apply_window_int16_mmxext;
- } else {
- c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
- }
- }
- if (EXTERNAL_SSE(cpu_flags)) {
- c->float_to_fixed24 = ff_float_to_fixed24_sse;
- }
- if (EXTERNAL_SSE2(cpu_flags)) {
- c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
- c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
- c->float_to_fixed24 = ff_float_to_fixed24_sse2;
- c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
- c->extract_exponents = ff_ac3_extract_exponents_sse2;
- if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
- c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
- c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
- }
- if (bit_exact) {
- c->apply_window_int16 = ff_apply_window_int16_sse2;
- } else if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
- c->apply_window_int16 = ff_apply_window_int16_round_sse2;
- }
- }
- if (EXTERNAL_SSSE3(cpu_flags)) {
- c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
- if (cpu_flags & AV_CPU_FLAG_ATOM) {
- c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
- } else {
- c->extract_exponents = ff_ac3_extract_exponents_ssse3;
- c->apply_window_int16 = ff_apply_window_int16_ssse3;
- }
- }
-
-#if HAVE_SSE_INLINE && HAVE_7REGS
- if (INLINE_SSE(cpu_flags)) {
- c->downmix = ac3_downmix_sse;
- }
-#endif
-}
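
The inline-asm downmix removed above (MIX5/MIX_MISC) boils down to a matrix-weighted sum of the input channels written back in place. A rough scalar sketch of that operation with the same argument layout as ac3_downmix_sse follows; the name is mine and this is illustrative only, not the decoder's real C path.

    /* Each output channel becomes the matrix-weighted sum of all input
     * channels, written back in place over the first out_ch buffers --
     * the same operation MIX5/MIX_MISC vectorize four samples at a time. */
    static void downmix_ref(float **samples, float (*matrix)[2],
                            int out_ch, int in_ch, int len)
    {
        for (int i = 0; i < len; i++) {
            float v[2] = { 0.0f, 0.0f };
            for (int j = 0; j < in_ch; j++)
                for (int k = 0; k < out_ch; k++)
                    v[k] += samples[j][i] * matrix[j][k];
            for (int k = 0; k < out_ch; k++)
                samples[k][i] = v[k];
        }
    }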
diff --git a/ffmpeg/libavcodec/x86/cabac.h b/ffmpeg/libavcodec/x86/cabac.h
deleted file mode 100644
index 558d287..0000000
--- a/ffmpeg/libavcodec/x86/cabac.h
+++ /dev/null
@@ -1,299 +0,0 @@
-/*
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_CABAC_H
-#define AVCODEC_X86_CABAC_H
-
-#include "libavcodec/cabac.h"
-#include "libavutil/attributes.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/internal.h"
-#include "config.h"
-
-#if (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
- || ( !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)
-# define BROKEN_COMPILER 1
-#else
-# define BROKEN_COMPILER 0
-#endif
-
-#if HAVE_INLINE_ASM
-
-#ifndef UNCHECKED_BITSTREAM_READER
-#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
-#endif
-
-#if UNCHECKED_BITSTREAM_READER
-#define END_CHECK(end) ""
-#else
-#define END_CHECK(end) \
- "cmp "end" , %%"REG_c" \n\t"\
- "jge 1f \n\t"
-#endif
-
-#ifdef BROKEN_RELOCATIONS
-#define TABLES_ARG , "r"(tables)
-
-#if HAVE_FAST_CMOV
-#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
- "cmp "low" , "tmp" \n\t"\
- "cmova %%ecx , "range" \n\t"\
- "sbb %%rcx , %%rcx \n\t"\
- "and %%ecx , "tmp" \n\t"\
- "xor %%rcx , "retq" \n\t"\
- "sub "tmp" , "low" \n\t"
-#else /* HAVE_FAST_CMOV */
-#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
-/* P4 Prescott has crappy cmov,sbb,64bit shift so avoid them */ \
- "sub "low" , "tmp" \n\t"\
- "sar $31 , "tmp" \n\t"\
- "sub %%ecx , "range" \n\t"\
- "and "tmp" , "range" \n\t"\
- "add %%ecx , "range" \n\t"\
- "shl $17 , %%ecx \n\t"\
- "and "tmp" , %%ecx \n\t"\
- "sub %%ecx , "low" \n\t"\
- "xor "tmp" , "ret" \n\t"\
- "movslq "ret" , "retq" \n\t"
-#endif /* HAVE_FAST_CMOV */
-
-#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
- "movzbl "statep" , "ret" \n\t"\
- "mov "range" , "tmp" \n\t"\
- "and $0xC0 , "range" \n\t"\
- "lea ("ret", "range", 2), %%ecx \n\t"\
- "movzbl "lps_off"("tables", %%rcx), "range" \n\t"\
- "sub "range" , "tmp" \n\t"\
- "mov "tmp" , %%ecx \n\t"\
- "shl $17 , "tmp" \n\t"\
- BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
- "movzbl "norm_off"("tables", "rangeq"), %%ecx \n\t"\
- "shl %%cl , "range" \n\t"\
- "movzbl "mlps_off"+128("tables", "retq"), "tmp" \n\t"\
- "shl %%cl , "low" \n\t"\
- "mov "tmpbyte" , "statep" \n\t"\
- "test "lowword" , "lowword" \n\t"\
- "jnz 2f \n\t"\
- "mov "byte" , %%"REG_c" \n\t"\
- END_CHECK(end)\
- "add"OPSIZE" $2 , "byte" \n\t"\
- "1: \n\t"\
- "movzwl (%%"REG_c") , "tmp" \n\t"\
- "lea -1("low") , %%ecx \n\t"\
- "xor "low" , %%ecx \n\t"\
- "shr $15 , %%ecx \n\t"\
- "bswap "tmp" \n\t"\
- "shr $15 , "tmp" \n\t"\
- "movzbl "norm_off"("tables", %%rcx), %%ecx \n\t"\
- "sub $0xFFFF , "tmp" \n\t"\
- "neg %%ecx \n\t"\
- "add $7 , %%ecx \n\t"\
- "shl %%cl , "tmp" \n\t"\
- "add "tmp" , "low" \n\t"\
- "2: \n\t"
-
-#else /* BROKEN_RELOCATIONS */
-#define TABLES_ARG
-#define RIP_ARG
-
-#if HAVE_FAST_CMOV
-#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
- "mov "tmp" , %%ecx \n\t"\
- "shl $17 , "tmp" \n\t"\
- "cmp "low" , "tmp" \n\t"\
- "cmova %%ecx , "range" \n\t"\
- "sbb %%ecx , %%ecx \n\t"\
- "and %%ecx , "tmp" \n\t"\
- "xor %%ecx , "ret" \n\t"\
- "sub "tmp" , "low" \n\t"
-#else /* HAVE_FAST_CMOV */
-#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
- "mov "tmp" , %%ecx \n\t"\
- "shl $17 , "tmp" \n\t"\
- "sub "low" , "tmp" \n\t"\
- "sar $31 , "tmp" \n\t" /*lps_mask*/\
- "sub %%ecx , "range" \n\t" /*RangeLPS - range*/\
- "and "tmp" , "range" \n\t" /*(RangeLPS - range)&lps_mask*/\
- "add %%ecx , "range" \n\t" /*new range*/\
- "shl $17 , %%ecx \n\t"\
- "and "tmp" , %%ecx \n\t"\
- "sub %%ecx , "low" \n\t"\
- "xor "tmp" , "ret" \n\t"
-#endif /* HAVE_FAST_CMOV */
-
-#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
- "movzbl "statep" , "ret" \n\t"\
- "mov "range" , "tmp" \n\t"\
- "and $0xC0 , "range" \n\t"\
- "movzbl "MANGLE(ff_h264_cabac_tables)"+"lps_off"("ret", "range", 2), "range" \n\t"\
- "sub "range" , "tmp" \n\t"\
- BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp) \
- "movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"("range"), %%ecx \n\t"\
- "shl %%cl , "range" \n\t"\
- "movzbl "MANGLE(ff_h264_cabac_tables)"+"mlps_off"+128("ret"), "tmp" \n\t"\
- "shl %%cl , "low" \n\t"\
- "mov "tmpbyte" , "statep" \n\t"\
- "test "lowword" , "lowword" \n\t"\
- " jnz 2f \n\t"\
- "mov "byte" , %%"REG_c" \n\t"\
- END_CHECK(end)\
- "add"OPSIZE" $2 , "byte" \n\t"\
- "1: \n\t"\
- "movzwl (%%"REG_c") , "tmp" \n\t"\
- "lea -1("low") , %%ecx \n\t"\
- "xor "low" , %%ecx \n\t"\
- "shr $15 , %%ecx \n\t"\
- "bswap "tmp" \n\t"\
- "shr $15 , "tmp" \n\t"\
- "movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"(%%ecx), %%ecx \n\t"\
- "sub $0xFFFF , "tmp" \n\t"\
- "neg %%ecx \n\t"\
- "add $7 , %%ecx \n\t"\
- "shl %%cl , "tmp" \n\t"\
- "add "tmp" , "low" \n\t"\
- "2: \n\t"
-
-#endif /* BROKEN_RELOCATIONS */
-
-#if HAVE_7REGS && !BROKEN_COMPILER
-#define get_cabac_inline get_cabac_inline_x86
-static av_always_inline int get_cabac_inline_x86(CABACContext *c,
- uint8_t *const state)
-{
- int bit, tmp;
-#ifdef BROKEN_RELOCATIONS
- void *tables;
-
- __asm__ volatile(
- "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
- : "=&r"(tables)
- );
-#endif
-
- __asm__ volatile(
- BRANCHLESS_GET_CABAC("%0", "%q0", "(%4)", "%1", "%w1",
- "%2", "%q2", "%3", "%b3",
- "%c6(%5)", "%c7(%5)",
- AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
- AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
- AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
- "%8")
- : "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp)
- : "r"(state), "r"(c),
- "i"(offsetof(CABACContext, bytestream)),
- "i"(offsetof(CABACContext, bytestream_end))
- TABLES_ARG
- ,"1"(c->low), "2"(c->range)
- : "%"REG_c, "memory"
- );
- return bit & 1;
-}
-#endif /* HAVE_7REGS */
-
-#if !BROKEN_COMPILER
-#define get_cabac_bypass_sign get_cabac_bypass_sign_x86
-static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
-{
- x86_reg tmp;
- __asm__ volatile(
- "movl %c6(%2), %k1 \n\t"
- "movl %c3(%2), %%eax \n\t"
- "shl $17, %k1 \n\t"
- "add %%eax, %%eax \n\t"
- "sub %k1, %%eax \n\t"
- "cltd \n\t"
- "and %%edx, %k1 \n\t"
- "add %k1, %%eax \n\t"
- "xor %%edx, %%ecx \n\t"
- "sub %%edx, %%ecx \n\t"
- "test %%ax, %%ax \n\t"
- "jnz 1f \n\t"
- "mov %c4(%2), %1 \n\t"
- "subl $0xFFFF, %%eax \n\t"
- "movzwl (%1), %%edx \n\t"
- "bswap %%edx \n\t"
- "shrl $15, %%edx \n\t"
-#if UNCHECKED_BITSTREAM_READER
- "add $2, %1 \n\t"
- "addl %%edx, %%eax \n\t"
- "mov %1, %c4(%2) \n\t"
-#else
- "addl %%edx, %%eax \n\t"
- "cmp %c5(%2), %1 \n\t"
- "jge 1f \n\t"
- "add"OPSIZE" $2, %c4(%2) \n\t"
-#endif
- "1: \n\t"
- "movl %%eax, %c3(%2) \n\t"
-
- : "+c"(val), "=&r"(tmp)
- : "r"(c),
- "i"(offsetof(CABACContext, low)),
- "i"(offsetof(CABACContext, bytestream)),
- "i"(offsetof(CABACContext, bytestream_end)),
- "i"(offsetof(CABACContext, range))
- : "%eax", "%edx", "memory"
- );
- return val;
-}
-
-#define get_cabac_bypass get_cabac_bypass_x86
-static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
-{
- x86_reg tmp;
- int res;
- __asm__ volatile(
- "movl %c6(%2), %k1 \n\t"
- "movl %c3(%2), %%eax \n\t"
- "shl $17, %k1 \n\t"
- "add %%eax, %%eax \n\t"
- "sub %k1, %%eax \n\t"
- "cltd \n\t"
- "and %%edx, %k1 \n\t"
- "add %k1, %%eax \n\t"
- "inc %%edx \n\t"
- "test %%ax, %%ax \n\t"
- "jnz 1f \n\t"
- "mov %c4(%2), %1 \n\t"
- "subl $0xFFFF, %%eax \n\t"
- "movzwl (%1), %%ecx \n\t"
- "bswap %%ecx \n\t"
- "shrl $15, %%ecx \n\t"
- "addl %%ecx, %%eax \n\t"
- "cmp %c5(%2), %1 \n\t"
- "jge 1f \n\t"
- "add"OPSIZE" $2, %c4(%2) \n\t"
- "1: \n\t"
- "movl %%eax, %c3(%2) \n\t"
-
- : "=&d"(res), "=&r"(tmp)
- : "r"(c),
- "i"(offsetof(CABACContext, low)),
- "i"(offsetof(CABACContext, bytestream)),
- "i"(offsetof(CABACContext, bytestream_end)),
- "i"(offsetof(CABACContext, range))
- : "%eax", "%ecx", "memory"
- );
- return res;
-}
-#endif /* !BROKEN_COMPILER */
-
-#endif /* HAVE_INLINE_ASM */
-#endif /* AVCODEC_X86_CABAC_H */
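
The heart of the deleted macros is the branchless MPS/LPS selection. Transcribed into C from the non-cmov path of BRANCHLESS_GET_CABAC_UPDATE (variable and function names are mine; renormalisation, the table lookups and the state write-back are omitted), the arithmetic is:

    /* range_mps is assumed to already hold old_range - range_lps, and *bit
     * the MPS value; the caller keeps only bit & 1.  lps_mask is all ones
     * exactly when the LPS branch is taken (the "sar $31" in the asm). */
    static inline void cabac_branchless_update(int *bit, int *low, int *range,
                                               int range_lps, int range_mps)
    {
        int lps_mask = ((range_mps << 17) - *low) >> 31;          /* 0 or -1 */
        *range = range_mps + ((range_lps - range_mps) & lps_mask); /* select range */
        *low  -= (range_mps << 17) & lps_mask;                    /* only on LPS */
        *bit  ^= lps_mask;                                        /* flip on LPS */
    }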
diff --git a/ffmpeg/libavcodec/x86/cavsdsp.c b/ffmpeg/libavcodec/x86/cavsdsp.c
deleted file mode 100644
index aaa09d1..0000000
--- a/ffmpeg/libavcodec/x86/cavsdsp.c
+++ /dev/null
@@ -1,558 +0,0 @@
-/*
- * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
- * Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
- *
- * MMX-optimized DSP functions, based on H.264 optimizations by
- * Michael Niedermayer and Loren Merritt
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/common.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/cavsdsp.h"
-#include "constants.h"
-#include "dsputil_x86.h"
-#include "config.h"
-
-#if HAVE_MMX_INLINE
-
-/* in/out: mma=mma+mmb, mmb=mmb-mma */
-#define SUMSUB_BA( a, b ) \
- "paddw "#b", "#a" \n\t"\
- "paddw "#b", "#b" \n\t"\
- "psubw "#a", "#b" \n\t"
-
-/*****************************************************************************
- *
- * inverse transform
- *
- ****************************************************************************/
-
-static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
-{
- __asm__ volatile(
- "movq 112(%0), %%mm4 \n\t" /* mm4 = src7 */
- "movq 16(%0), %%mm5 \n\t" /* mm5 = src1 */
- "movq 80(%0), %%mm2 \n\t" /* mm2 = src5 */
- "movq 48(%0), %%mm7 \n\t" /* mm7 = src3 */
- "movq %%mm4, %%mm0 \n\t"
- "movq %%mm5, %%mm3 \n\t"
- "movq %%mm2, %%mm6 \n\t"
- "movq %%mm7, %%mm1 \n\t"
-
- "paddw %%mm4, %%mm4 \n\t" /* mm4 = 2*src7 */
- "paddw %%mm3, %%mm3 \n\t" /* mm3 = 2*src1 */
- "paddw %%mm6, %%mm6 \n\t" /* mm6 = 2*src5 */
- "paddw %%mm1, %%mm1 \n\t" /* mm1 = 2*src3 */
- "paddw %%mm4, %%mm0 \n\t" /* mm0 = 3*src7 */
- "paddw %%mm3, %%mm5 \n\t" /* mm5 = 3*src1 */
- "paddw %%mm6, %%mm2 \n\t" /* mm2 = 3*src5 */
- "paddw %%mm1, %%mm7 \n\t" /* mm7 = 3*src3 */
- "psubw %%mm4, %%mm5 \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */
- "paddw %%mm6, %%mm7 \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */
- "psubw %%mm2, %%mm1 \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */
- "paddw %%mm0, %%mm3 \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */
-
- "movq %%mm5, %%mm4 \n\t"
- "movq %%mm7, %%mm6 \n\t"
- "movq %%mm3, %%mm0 \n\t"
- "movq %%mm1, %%mm2 \n\t"
- SUMSUB_BA( %%mm7, %%mm5 ) /* mm7 = a0 + a1 mm5 = a0 - a1 */
- "paddw %%mm3, %%mm7 \n\t" /* mm7 = a0 + a1 + a3 */
- "paddw %%mm1, %%mm5 \n\t" /* mm5 = a0 - a1 + a2 */
- "paddw %%mm7, %%mm7 \n\t"
- "paddw %%mm5, %%mm5 \n\t"
- "paddw %%mm6, %%mm7 \n\t" /* mm7 = b4 */
- "paddw %%mm4, %%mm5 \n\t" /* mm5 = b5 */
-
- SUMSUB_BA( %%mm1, %%mm3 ) /* mm1 = a3 + a2 mm3 = a3 - a2 */
- "psubw %%mm1, %%mm4 \n\t" /* mm4 = a0 - a2 - a3 */
- "movq %%mm4, %%mm1 \n\t" /* mm1 = a0 - a2 - a3 */
- "psubw %%mm6, %%mm3 \n\t" /* mm3 = a3 - a2 - a1 */
- "paddw %%mm1, %%mm1 \n\t"
- "paddw %%mm3, %%mm3 \n\t"
- "psubw %%mm2, %%mm1 \n\t" /* mm1 = b7 */
- "paddw %%mm0, %%mm3 \n\t" /* mm3 = b6 */
-
- "movq 32(%0), %%mm2 \n\t" /* mm2 = src2 */
- "movq 96(%0), %%mm6 \n\t" /* mm6 = src6 */
- "movq %%mm2, %%mm4 \n\t"
- "movq %%mm6, %%mm0 \n\t"
- "psllw $2, %%mm4 \n\t" /* mm4 = 4*src2 */
- "psllw $2, %%mm6 \n\t" /* mm6 = 4*src6 */
- "paddw %%mm4, %%mm2 \n\t" /* mm2 = 5*src2 */
- "paddw %%mm6, %%mm0 \n\t" /* mm0 = 5*src6 */
- "paddw %%mm2, %%mm2 \n\t"
- "paddw %%mm0, %%mm0 \n\t"
- "psubw %%mm0, %%mm4 \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */
- "paddw %%mm2, %%mm6 \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */
-
- "movq (%0), %%mm2 \n\t" /* mm2 = src0 */
- "movq 64(%0), %%mm0 \n\t" /* mm0 = src4 */
- SUMSUB_BA( %%mm0, %%mm2 ) /* mm0 = src0+src4 mm2 = src0-src4 */
- "psllw $3, %%mm0 \n\t"
- "psllw $3, %%mm2 \n\t"
- "paddw %1, %%mm0 \n\t" /* add rounding bias */
- "paddw %1, %%mm2 \n\t" /* add rounding bias */
-
- SUMSUB_BA( %%mm6, %%mm0 ) /* mm6 = a4 + a6 mm0 = a4 - a6 */
- SUMSUB_BA( %%mm4, %%mm2 ) /* mm4 = a5 + a7 mm2 = a5 - a7 */
- SUMSUB_BA( %%mm7, %%mm6 ) /* mm7 = dst0 mm6 = dst7 */
- SUMSUB_BA( %%mm5, %%mm4 ) /* mm5 = dst1 mm4 = dst6 */
- SUMSUB_BA( %%mm3, %%mm2 ) /* mm3 = dst2 mm2 = dst5 */
- SUMSUB_BA( %%mm1, %%mm0 ) /* mm1 = dst3 mm0 = dst4 */
- :: "r"(block), "m"(bias)
- );
-}
-
-#define SBUTTERFLY(a,b,t,n,m)\
- "mov" #m " " #a ", " #t " \n\t" /* abcd */\
- "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
- "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
-
-#define TRANSPOSE4(a,b,c,d,t)\
- SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
- SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
- SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
- SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
-
-static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
-{
- int i;
- DECLARE_ALIGNED(8, int16_t, b2)[64];
-
- for(i=0; i<2; i++){
- DECLARE_ALIGNED(8, uint64_t, tmp);
-
- cavs_idct8_1d(block+4*i, ff_pw_4.a);
-
- __asm__ volatile(
- "psraw $3, %%mm7 \n\t"
- "psraw $3, %%mm6 \n\t"
- "psraw $3, %%mm5 \n\t"
- "psraw $3, %%mm4 \n\t"
- "psraw $3, %%mm3 \n\t"
- "psraw $3, %%mm2 \n\t"
- "psraw $3, %%mm1 \n\t"
- "psraw $3, %%mm0 \n\t"
- "movq %%mm7, %0 \n\t"
- TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
- "movq %%mm0, 8(%1) \n\t"
- "movq %%mm6, 24(%1) \n\t"
- "movq %%mm7, 40(%1) \n\t"
- "movq %%mm4, 56(%1) \n\t"
- "movq %0, %%mm7 \n\t"
- TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
- "movq %%mm7, (%1) \n\t"
- "movq %%mm1, 16(%1) \n\t"
- "movq %%mm0, 32(%1) \n\t"
- "movq %%mm3, 48(%1) \n\t"
- : "=m"(tmp)
- : "r"(b2+32*i)
- : "memory"
- );
- }
-
- for(i=0; i<2; i++){
- cavs_idct8_1d(b2+4*i, ff_pw_64.a);
-
- __asm__ volatile(
- "psraw $7, %%mm7 \n\t"
- "psraw $7, %%mm6 \n\t"
- "psraw $7, %%mm5 \n\t"
- "psraw $7, %%mm4 \n\t"
- "psraw $7, %%mm3 \n\t"
- "psraw $7, %%mm2 \n\t"
- "psraw $7, %%mm1 \n\t"
- "psraw $7, %%mm0 \n\t"
- "movq %%mm7, (%0) \n\t"
- "movq %%mm5, 16(%0) \n\t"
- "movq %%mm3, 32(%0) \n\t"
- "movq %%mm1, 48(%0) \n\t"
- "movq %%mm0, 64(%0) \n\t"
- "movq %%mm2, 80(%0) \n\t"
- "movq %%mm4, 96(%0) \n\t"
- "movq %%mm6, 112(%0) \n\t"
- :: "r"(b2+4*i)
- : "memory"
- );
- }
-
- ff_add_pixels_clamped_mmx(b2, dst, stride);
-}
-
-#endif /* HAVE_MMX_INLINE */
-
-#if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE)
-
-/*****************************************************************************
- *
- * motion compensation
- *
- ****************************************************************************/
-
-/* vertical filter [-1 -2 96 42 -7 0] */
-#define QPEL_CAVSV1(A,B,C,D,E,F,OP,MUL2) \
- "movd (%0), "#F" \n\t"\
- "movq "#C", %%mm6 \n\t"\
- "pmullw %5, %%mm6 \n\t"\
- "movq "#D", %%mm7 \n\t"\
- "pmullw "MANGLE(MUL2)", %%mm7\n\t"\
- "psllw $3, "#E" \n\t"\
- "psubw "#E", %%mm6 \n\t"\
- "psraw $3, "#E" \n\t"\
- "paddw %%mm7, %%mm6 \n\t"\
- "paddw "#E", %%mm6 \n\t"\
- "paddw "#B", "#B" \n\t"\
- "pxor %%mm7, %%mm7 \n\t"\
- "add %2, %0 \n\t"\
- "punpcklbw %%mm7, "#F" \n\t"\
- "psubw "#B", %%mm6 \n\t"\
- "psraw $1, "#B" \n\t"\
- "psubw "#A", %%mm6 \n\t"\
- "paddw %4, %%mm6 \n\t"\
- "psraw $7, %%mm6 \n\t"\
- "packuswb %%mm6, %%mm6 \n\t"\
- OP(%%mm6, (%1), A, d) \
- "add %3, %1 \n\t"
-
-/* vertical filter [ 0 -1 5 5 -1 0] */
-#define QPEL_CAVSV2(A,B,C,D,E,F,OP,MUL2) \
- "movd (%0), "#F" \n\t"\
- "movq "#C", %%mm6 \n\t"\
- "paddw "#D", %%mm6 \n\t"\
- "pmullw %5, %%mm6 \n\t"\
- "add %2, %0 \n\t"\
- "punpcklbw %%mm7, "#F" \n\t"\
- "psubw "#B", %%mm6 \n\t"\
- "psubw "#E", %%mm6 \n\t"\
- "paddw %4, %%mm6 \n\t"\
- "psraw $3, %%mm6 \n\t"\
- "packuswb %%mm6, %%mm6 \n\t"\
- OP(%%mm6, (%1), A, d) \
- "add %3, %1 \n\t"
-
-/* vertical filter [ 0 -7 42 96 -2 -1] */
-#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \
- "movd (%0), "#F" \n\t"\
- "movq "#C", %%mm6 \n\t"\
- "pmullw "MANGLE(MUL2)", %%mm6\n\t"\
- "movq "#D", %%mm7 \n\t"\
- "pmullw %5, %%mm7 \n\t"\
- "psllw $3, "#B" \n\t"\
- "psubw "#B", %%mm6 \n\t"\
- "psraw $3, "#B" \n\t"\
- "paddw %%mm7, %%mm6 \n\t"\
- "paddw "#B", %%mm6 \n\t"\
- "paddw "#E", "#E" \n\t"\
- "pxor %%mm7, %%mm7 \n\t"\
- "add %2, %0 \n\t"\
- "punpcklbw %%mm7, "#F" \n\t"\
- "psubw "#E", %%mm6 \n\t"\
- "psraw $1, "#E" \n\t"\
- "psubw "#F", %%mm6 \n\t"\
- "paddw %4, %%mm6 \n\t"\
- "psraw $7, %%mm6 \n\t"\
- "packuswb %%mm6, %%mm6 \n\t"\
- OP(%%mm6, (%1), A, d) \
- "add %3, %1 \n\t"
-
-
-#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
- int w= 2;\
- src -= 2*srcStride;\
- \
- while(w--){\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movd (%0), %%mm0 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm1 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm2 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm3 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm4 \n\t"\
- "add %2, %0 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpcklbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm4 \n\t"\
- VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
- VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
- VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
- VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
- VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
- VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
- VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
- VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
- \
- : "+a"(src), "+c"(dst)\
- : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
- : "memory"\
- );\
- if(h==16){\
- __asm__ volatile(\
- VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
- VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
- VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
- VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
- VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
- VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
- VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
- VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
- \
- : "+a"(src), "+c"(dst)\
- : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
- : "memory"\
- );\
- }\
- src += 4-(h+5)*srcStride;\
- dst += 4-h*dstStride;\
- }
-
-#define QPEL_CAVS(OPNAME, OP, MMX)\
-static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- int h=8;\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movq %5, %%mm6 \n\t"\
- "1: \n\t"\
- "movq (%0), %%mm0 \n\t"\
- "movq 1(%0), %%mm2 \n\t"\
- "movq %%mm0, %%mm1 \n\t"\
- "movq %%mm2, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpckhbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpckhbw %%mm7, %%mm3 \n\t"\
- "paddw %%mm2, %%mm0 \n\t"\
- "paddw %%mm3, %%mm1 \n\t"\
- "pmullw %%mm6, %%mm0 \n\t"\
- "pmullw %%mm6, %%mm1 \n\t"\
- "movq -1(%0), %%mm2 \n\t"\
- "movq 2(%0), %%mm4 \n\t"\
- "movq %%mm2, %%mm3 \n\t"\
- "movq %%mm4, %%mm5 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpckhbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm4 \n\t"\
- "punpckhbw %%mm7, %%mm5 \n\t"\
- "paddw %%mm4, %%mm2 \n\t"\
- "paddw %%mm3, %%mm5 \n\t"\
- "psubw %%mm2, %%mm0 \n\t"\
- "psubw %%mm5, %%mm1 \n\t"\
- "movq %6, %%mm5 \n\t"\
- "paddw %%mm5, %%mm0 \n\t"\
- "paddw %%mm5, %%mm1 \n\t"\
- "psraw $3, %%mm0 \n\t"\
- "psraw $3, %%mm1 \n\t"\
- "packuswb %%mm1, %%mm0 \n\t"\
- OP(%%mm0, (%1),%%mm5, q) \
- "add %3, %0 \n\t"\
- "add %4, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+a"(src), "+c"(dst), "+m"(h)\
- : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
- : "memory"\
- );\
-}\
-\
-static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
- QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
-}\
-\
-static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
- QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5) \
-}\
-\
-static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
- QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
-}\
-\
-static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\
-}\
-static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\
- OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
-}\
-\
-static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\
-}\
-static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\
- OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
-}\
-\
-static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\
-}\
-static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\
- OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
-}\
-\
-static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
- OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
- src += 8*srcStride;\
- dst += 8*dstStride;\
- OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
- OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
-}\
-
-#define CAVS_MC(OPNAME, SIZE, MMX) \
-static void OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
-}\
-\
-static void OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
-}\
-\
-static void OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
-}\
-\
-static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
-}\
-
-#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
-#define AVG_3DNOW_OP(a,b,temp, size) \
-"mov" #size " " #b ", " #temp " \n\t"\
-"pavgusb " #temp ", " #a " \n\t"\
-"mov" #size " " #a ", " #b " \n\t"
-#define AVG_MMXEXT_OP(a, b, temp, size) \
-"mov" #size " " #b ", " #temp " \n\t"\
-"pavgb " #temp ", " #a " \n\t"\
-"mov" #size " " #a ", " #b " \n\t"
-
-#endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */
-
-#if HAVE_MMX_INLINE
-static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride)
-{
- ff_put_pixels8_mmx(dst, src, stride, 8);
-}
-
-static void avg_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride)
-{
- ff_avg_pixels8_mmx(dst, src, stride, 8);
-}
-
-static void put_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride)
-{
- ff_put_pixels16_mmx(dst, src, stride, 16);
-}
-
-static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride)
-{
- ff_avg_pixels16_mmx(dst, src, stride, 16);
-}
-
-static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
- AVCodecContext *avctx)
-{
- c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx;
- c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
- c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
- c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx;
-
- c->cavs_idct8_add = cavs_idct8_add_mmx;
- c->idct_perm = FF_TRANSPOSE_IDCT_PERM;
-}
-#endif /* HAVE_MMX_INLINE */
-
-#define DSPFUNC(PFX, IDX, NUM, EXT) \
- c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \
- c->PFX ## _cavs_qpel_pixels_tab[IDX][ 4] = PFX ## _cavs_qpel ## NUM ## _mc01_ ## EXT; \
- c->PFX ## _cavs_qpel_pixels_tab[IDX][ 8] = PFX ## _cavs_qpel ## NUM ## _mc02_ ## EXT; \
- c->PFX ## _cavs_qpel_pixels_tab[IDX][12] = PFX ## _cavs_qpel ## NUM ## _mc03_ ## EXT; \
-
-#if HAVE_MMXEXT_INLINE
-QPEL_CAVS(put_, PUT_OP, mmxext)
-QPEL_CAVS(avg_, AVG_MMXEXT_OP, mmxext)
-
-CAVS_MC(put_, 8, mmxext)
-CAVS_MC(put_, 16, mmxext)
-CAVS_MC(avg_, 8, mmxext)
-CAVS_MC(avg_, 16, mmxext)
-
-static av_cold void cavsdsp_init_mmxext(CAVSDSPContext *c,
- AVCodecContext *avctx)
-{
- DSPFUNC(put, 0, 16, mmxext);
- DSPFUNC(put, 1, 8, mmxext);
- DSPFUNC(avg, 0, 16, mmxext);
- DSPFUNC(avg, 1, 8, mmxext);
-}
-#endif /* HAVE_MMXEXT_INLINE */
-
-#if HAVE_AMD3DNOW_INLINE
-QPEL_CAVS(put_, PUT_OP, 3dnow)
-QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)
-
-CAVS_MC(put_, 8, 3dnow)
-CAVS_MC(put_, 16,3dnow)
-CAVS_MC(avg_, 8, 3dnow)
-CAVS_MC(avg_, 16,3dnow)
-
-static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c,
- AVCodecContext *avctx)
-{
- DSPFUNC(put, 0, 16, 3dnow);
- DSPFUNC(put, 1, 8, 3dnow);
- DSPFUNC(avg, 0, 16, 3dnow);
- DSPFUNC(avg, 1, 8, 3dnow);
-}
-#endif /* HAVE_AMD3DNOW_INLINE */
-
-av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
-{
-#if HAVE_MMX_INLINE
- int cpu_flags = av_get_cpu_flags();
-
- if (INLINE_MMX(cpu_flags))
- cavsdsp_init_mmx(c, avctx);
-#endif /* HAVE_MMX_INLINE */
-#if HAVE_AMD3DNOW_INLINE
- if (INLINE_AMD3DNOW(cpu_flags))
- cavsdsp_init_3dnow(c, avctx);
-#endif /* HAVE_AMD3DNOW_INLINE */
-#if HAVE_MMXEXT_INLINE
- if (INLINE_MMXEXT(cpu_flags))
- cavsdsp_init_mmxext(c, avctx);
-#endif /* HAVE_MMXEXT_INLINE */
-}
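
For reference, the quarter-pel vertical filters that the QPEL_CAVSV1/V2/V3 macros above implement reduce to the following scalar form. Taps and rounding come from the macro comments and the ff_pw_* operands passed in QPEL_CAVSVNUM; the helper names are mine and this is only a sketch.

    #include <stdint.h>

    static inline uint8_t clip_u8(int v)
    {
        if (v < 0)   return 0;
        if (v > 255) return 255;
        return (uint8_t)v;
    }

    /* a..f are six vertically adjacent source pixels; a and f carry zero
     * taps in one or both filters and are kept only to mirror the macro's
     * six-pixel window. */
    static inline uint8_t cavs_qpel_v1(int a, int b, int c, int d, int e, int f)
    {
        /* [-1 -2 96 42 -7 0], bias ff_pw_64, shift 7.  QPEL_CAVSV3 is the
         * mirrored form [0 -7 42 96 -2 -1] with the same bias and shift. */
        return clip_u8((-a - 2*b + 96*c + 42*d - 7*e + 64) >> 7);
    }

    static inline uint8_t cavs_qpel_v2(int a, int b, int c, int d, int e, int f)
    {
        /* [0 -1 5 5 -1 0], bias ff_pw_4, shift 3 -- the horizontal filter
         * in cavs_qpel8_h uses the same taps and rounding. */
        return clip_u8((-b + 5*c + 5*d - e + 4) >> 3);
    }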
diff --git a/ffmpeg/libavcodec/x86/constants.c b/ffmpeg/libavcodec/x86/constants.c
deleted file mode 100644
index 3bba80b..0000000
--- a/ffmpeg/libavcodec/x86/constants.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * MMX/SSE constants used across x86 dsp optimizations.
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/mem.h"
-#include "libavutil/x86/asm.h" // for xmm_reg
-#include "constants.h"
-
-DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
-
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
-
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
-DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
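The pw_/pb_ names above follow the usual packed-word / packed-byte convention: each constant is a single 16-bit or 8-bit value replicated across every lane of the 64- or 128-bit register. A minimal sketch of that replication in plain C, assuming nothing beyond stdint.h (the helper name pw_splat64 is mine, for illustration only):

    #include <stdint.h>

    /* Replicate a 16-bit value across the four word lanes of a 64-bit
     * "pw_" constant, e.g. pw_splat64(2) == 0x0002000200020002ULL. */
    static uint64_t pw_splat64(uint16_t v)
    {
        uint64_t x = v;
        return x | x << 16 | x << 32 | x << 48;
    }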
diff --git a/ffmpeg/libavcodec/x86/dct32.asm b/ffmpeg/libavcodec/x86/dct32.asm
deleted file mode 100644
index 6fd5ba3..0000000
--- a/ffmpeg/libavcodec/x86/dct32.asm
+++ /dev/null
@@ -1,490 +0,0 @@
-;******************************************************************************
-;* 32 point SSE-optimized DCT transform
-;* Copyright (c) 2010 Vitor Sessak
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA 32
-
-align 32
-ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043
- dd 0.553104, 0.582935, 0.622504, 0.674808
- dd -10.190008, -3.407609, -2.057781, -1.484165
- dd -1.169440, -0.972568, -0.839350, -0.744536
- dd 0.502419, 0.522499, 0.566944, 0.646822
- dd 0.788155, 1.060678, 1.722447, 5.101149
- dd 0.509796, 0.601345, 0.899976, 2.562916
- dd 0.509796, 0.601345, 0.899976, 2.562916
- dd 1.000000, 1.000000, 1.306563, 0.541196
- dd 1.000000, 1.000000, 1.306563, 0.541196
- dd 1.000000, 0.707107, 1.000000, -0.707107
- dd 1.000000, 0.707107, 1.000000, -0.707107
- dd 0.707107, 0.707107, 0.707107, 0.707107
-
-align 32
-ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
-
-%macro BUTTERFLY 4
- subps %4, %1, %2
- addps %2, %2, %1
- mulps %1, %4, %3
-%endmacro
-
-%macro BUTTERFLY0 5
-%if cpuflag(sse2) && notcpuflag(avx)
- pshufd %4, %1, %5
- xorps %1, %2
- addps %1, %4
- mulps %1, %3
-%else
- shufps %4, %1, %1, %5
- xorps %1, %1, %2
- addps %4, %4, %1
- mulps %1, %4, %3
-%endif
-%endmacro
-
-%macro BUTTERFLY2 4
- BUTTERFLY0 %1, %2, %3, %4, 0x1b
-%endmacro
-
-%macro BUTTERFLY3 4
- BUTTERFLY0 %1, %2, %3, %4, 0xb1
-%endmacro
-
-%macro BUTTERFLY3V 5
- movaps m%5, m%1
- addps m%1, m%2
- subps m%5, m%2
- SWAP %2, %5
- mulps m%2, [ps_cos_vec+192]
- movaps m%5, m%3
- addps m%3, m%4
- subps m%4, m%5
- mulps m%4, [ps_cos_vec+192]
-%endmacro
-
-%macro PASS6_AND_PERMUTE 0
- mov tmpd, [outq+4]
- movss m7, [outq+72]
- addss m7, [outq+76]
- movss m3, [outq+56]
- addss m3, [outq+60]
- addss m4, m3
- movss m2, [outq+52]
- addss m2, m3
- movss m3, [outq+104]
- addss m3, [outq+108]
- addss m1, m3
- addss m5, m4
- movss [outq+ 16], m1
- movss m1, [outq+100]
- addss m1, m3
- movss m3, [outq+40]
- movss [outq+ 48], m1
- addss m3, [outq+44]
- movss m1, [outq+100]
- addss m4, m3
- addss m3, m2
- addss m1, [outq+108]
- movss [outq+ 40], m3
- addss m2, [outq+36]
- movss m3, [outq+8]
- movss [outq+ 56], m2
- addss m3, [outq+12]
- movss [outq+ 32], m3
- movss m3, [outq+80]
- movss [outq+ 8], m5
- movss [outq+ 80], m1
- movss m2, [outq+52]
- movss m5, [outq+120]
- addss m5, [outq+124]
- movss m1, [outq+64]
- addss m2, [outq+60]
- addss m0, m5
- addss m5, [outq+116]
- mov [outq+64], tmpd
- addss m6, m0
- addss m1, m6
- mov tmpd, [outq+12]
- mov [outq+ 96], tmpd
- movss [outq+ 4], m1
- movss m1, [outq+24]
- movss [outq+ 24], m4
- movss m4, [outq+88]
- addss m4, [outq+92]
- addss m3, m4
- addss m4, [outq+84]
- mov tmpd, [outq+108]
- addss m1, [outq+28]
- addss m0, m1
- addss m1, m5
- addss m6, m3
- addss m3, m0
- addss m0, m7
- addss m5, [outq+20]
- addss m7, m1
- movss [outq+ 12], m6
- mov [outq+112], tmpd
- movss m6, [outq+28]
- movss [outq+ 28], m0
- movss m0, [outq+36]
- movss [outq+ 36], m7
- addss m1, m4
- movss m7, [outq+116]
- addss m0, m2
- addss m7, [outq+124]
- movss [outq+ 72], m0
- movss m0, [outq+44]
- addss m2, m0
- movss [outq+ 44], m1
- movss [outq+ 88], m2
- addss m0, [outq+60]
- mov tmpd, [outq+60]
- mov [outq+120], tmpd
- movss [outq+104], m0
- addss m4, m5
- addss m5, [outq+68]
- movss [outq+52], m4
- movss [outq+60], m5
- movss m4, [outq+68]
- movss m5, [outq+20]
- movss [outq+ 20], m3
- addss m5, m7
- addss m7, m6
- addss m4, m5
- movss m2, [outq+84]
- addss m2, [outq+92]
- addss m5, m2
- movss [outq+ 68], m4
- addss m2, m7
- movss m4, [outq+76]
- movss [outq+ 84], m2
- movss [outq+ 76], m5
- addss m7, m4
- addss m6, [outq+124]
- addss m4, m6
- addss m6, [outq+92]
- movss [outq+100], m4
- movss [outq+108], m6
- movss m6, [outq+92]
- movss [outq+92], m7
- addss m6, [outq+124]
- movss [outq+116], m6
-%endmacro
-
-INIT_YMM avx
-SECTION_TEXT
-%if HAVE_AVX_EXTERNAL
-; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
-cglobal dct32_float, 2,3,8, out, in, tmp
- ; pass 1
- vmovaps m4, [inq+0]
- vinsertf128 m5, m5, [inq+96], 1
- vinsertf128 m5, m5, [inq+112], 0
- vshufps m5, m5, m5, 0x1b
- BUTTERFLY m4, m5, [ps_cos_vec], m6
-
- vmovaps m2, [inq+64]
- vinsertf128 m6, m6, [inq+32], 1
- vinsertf128 m6, m6, [inq+48], 0
- vshufps m6, m6, m6, 0x1b
- BUTTERFLY m2, m6, [ps_cos_vec+32], m0
-
- ; pass 2
-
- BUTTERFLY m5, m6, [ps_cos_vec+64], m0
- BUTTERFLY m4, m2, [ps_cos_vec+64], m7
-
-
- ; pass 3
- vperm2f128 m3, m6, m4, 0x31
- vperm2f128 m1, m6, m4, 0x20
- vshufps m3, m3, m3, 0x1b
-
- BUTTERFLY m1, m3, [ps_cos_vec+96], m6
-
-
- vperm2f128 m4, m5, m2, 0x20
- vperm2f128 m5, m5, m2, 0x31
- vshufps m5, m5, m5, 0x1b
-
- BUTTERFLY m4, m5, [ps_cos_vec+96], m6
-
- ; pass 4
- vmovaps m6, [ps_p1p1m1m1+0]
- vmovaps m2, [ps_cos_vec+128]
-
- BUTTERFLY2 m5, m6, m2, m7
- BUTTERFLY2 m4, m6, m2, m7
- BUTTERFLY2 m1, m6, m2, m7
- BUTTERFLY2 m3, m6, m2, m7
-
-
- ; pass 5
- vshufps m6, m6, m6, 0xcc
- vmovaps m2, [ps_cos_vec+160]
-
- BUTTERFLY3 m5, m6, m2, m7
- BUTTERFLY3 m4, m6, m2, m7
- BUTTERFLY3 m1, m6, m2, m7
- BUTTERFLY3 m3, m6, m2, m7
-
- vperm2f128 m6, m3, m3, 0x31
- vmovaps [outq], m3
-
- vextractf128 [outq+64], m5, 1
- vextractf128 [outq+32], m5, 0
-
- vextractf128 [outq+80], m4, 1
- vextractf128 [outq+48], m4, 0
-
- vperm2f128 m0, m1, m1, 0x31
- vmovaps [outq+96], m1
-
- vzeroupper
-
- ; pass 6, no SIMD...
-INIT_XMM
- PASS6_AND_PERMUTE
- RET
-%endif
-
-%if ARCH_X86_64
-%define SPILL SWAP
-%define UNSPILL SWAP
-
-%macro PASS5 0
- nop ; FIXME code alignment
- SWAP 5, 8
- SWAP 4, 12
- SWAP 6, 14
- SWAP 7, 13
- SWAP 0, 15
- PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
- TRANSPOSE4x4PS 8, 9, 10, 11, 0
- BUTTERFLY3V 8, 9, 10, 11, 0
- addps m10, m11
- TRANSPOSE4x4PS 12, 13, 14, 15, 0
- BUTTERFLY3V 12, 13, 14, 15, 0
- addps m14, m15
- addps m12, m14
- addps m14, m13
- addps m13, m15
-%endmacro
-
-%macro PASS6 0
- SWAP 9, 12
- SWAP 11, 14
- movss [outq+0x00], m8
- pshuflw m0, m8, 0xe
- movss [outq+0x10], m9
- pshuflw m1, m9, 0xe
- movss [outq+0x20], m10
- pshuflw m2, m10, 0xe
- movss [outq+0x30], m11
- pshuflw m3, m11, 0xe
- movss [outq+0x40], m12
- pshuflw m4, m12, 0xe
- movss [outq+0x50], m13
- pshuflw m5, m13, 0xe
- movss [outq+0x60], m14
- pshuflw m6, m14, 0xe
- movaps [outq+0x70], m15
- pshuflw m7, m15, 0xe
- addss m0, m1
- addss m1, m2
- movss [outq+0x08], m0
- addss m2, m3
- movss [outq+0x18], m1
- addss m3, m4
- movss [outq+0x28], m2
- addss m4, m5
- movss [outq+0x38], m3
- addss m5, m6
- movss [outq+0x48], m4
- addss m6, m7
- movss [outq+0x58], m5
- movss [outq+0x68], m6
- movss [outq+0x78], m7
-
- PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
- movhlps m0, m1
- pshufd m1, m1, 3
- SWAP 0, 2, 4, 6, 8, 10, 12, 14
- SWAP 1, 3, 5, 7, 9, 11, 13, 15
-%rep 7
- movhlps m0, m1
- pshufd m1, m1, 3
- addss m15, m1
- SWAP 0, 2, 4, 6, 8, 10, 12, 14
- SWAP 1, 3, 5, 7, 9, 11, 13, 15
-%endrep
-%assign i 4
-%rep 15
- addss m0, m1
- movss [outq+i], m0
- SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
- %assign i i+8
-%endrep
-%endmacro
-
-%else ; ARCH_X86_32
-%macro SPILL 2 ; xmm#, mempos
- movaps [outq+(%2-8)*16], m%1
-%endmacro
-%macro UNSPILL 2
- movaps m%1, [outq+(%2-8)*16]
-%endmacro
-
-%define PASS6 PASS6_AND_PERMUTE
-%macro PASS5 0
- movaps m2, [ps_cos_vec+160]
- shufps m3, m3, 0xcc
-
- BUTTERFLY3 m5, m3, m2, m1
- SPILL 5, 8
-
- UNSPILL 1, 9
- BUTTERFLY3 m1, m3, m2, m5
- SPILL 1, 14
-
- BUTTERFLY3 m4, m3, m2, m5
- SPILL 4, 12
-
- BUTTERFLY3 m7, m3, m2, m5
- SPILL 7, 13
-
- UNSPILL 5, 10
- BUTTERFLY3 m5, m3, m2, m7
- SPILL 5, 10
-
- UNSPILL 4, 11
- BUTTERFLY3 m4, m3, m2, m7
- SPILL 4, 11
-
- BUTTERFLY3 m6, m3, m2, m7
- SPILL 6, 9
-
- BUTTERFLY3 m0, m3, m2, m7
- SPILL 0, 15
-%endmacro
-%endif
-
-
-; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
-%macro DCT32_FUNC 0
-cglobal dct32_float, 2, 3, 16, out, in, tmp
- ; pass 1
-
- movaps m0, [inq+0]
- LOAD_INV m1, [inq+112]
- BUTTERFLY m0, m1, [ps_cos_vec], m3
-
- movaps m7, [inq+64]
- LOAD_INV m4, [inq+48]
- BUTTERFLY m7, m4, [ps_cos_vec+32], m3
-
- ; pass 2
- movaps m2, [ps_cos_vec+64]
- BUTTERFLY m1, m4, m2, m3
- SPILL 1, 11
- SPILL 4, 8
-
- ; pass 1
- movaps m1, [inq+16]
- LOAD_INV m6, [inq+96]
- BUTTERFLY m1, m6, [ps_cos_vec+16], m3
-
- movaps m4, [inq+80]
- LOAD_INV m5, [inq+32]
- BUTTERFLY m4, m5, [ps_cos_vec+48], m3
-
- ; pass 2
- BUTTERFLY m0, m7, m2, m3
-
- movaps m2, [ps_cos_vec+80]
- BUTTERFLY m6, m5, m2, m3
-
- BUTTERFLY m1, m4, m2, m3
-
- ; pass 3
- movaps m2, [ps_cos_vec+96]
- shufps m1, m1, 0x1b
- BUTTERFLY m0, m1, m2, m3
- SPILL 0, 15
- SPILL 1, 14
-
- UNSPILL 0, 8
- shufps m5, m5, 0x1b
- BUTTERFLY m0, m5, m2, m3
-
- UNSPILL 1, 11
- shufps m6, m6, 0x1b
- BUTTERFLY m1, m6, m2, m3
- SPILL 1, 11
-
- shufps m4, m4, 0x1b
- BUTTERFLY m7, m4, m2, m3
-
- ; pass 4
- movaps m3, [ps_p1p1m1m1+0]
- movaps m2, [ps_cos_vec+128]
-
- BUTTERFLY2 m5, m3, m2, m1
-
- BUTTERFLY2 m0, m3, m2, m1
- SPILL 0, 9
-
- BUTTERFLY2 m6, m3, m2, m1
- SPILL 6, 10
-
- UNSPILL 0, 11
- BUTTERFLY2 m0, m3, m2, m1
- SPILL 0, 11
-
- BUTTERFLY2 m4, m3, m2, m1
-
- BUTTERFLY2 m7, m3, m2, m1
-
- UNSPILL 6, 14
- BUTTERFLY2 m6, m3, m2, m1
-
- UNSPILL 0, 15
- BUTTERFLY2 m0, m3, m2, m1
-
- PASS5
- PASS6
- RET
-%endmacro
-
-%macro LOAD_INV 2
-%if cpuflag(sse2)
- pshufd %1, %2, 0x1b
-%elif cpuflag(sse)
- movaps %1, %2
- shufps %1, %1, 0x1b
-%endif
-%endmacro
-
-INIT_XMM sse
-DCT32_FUNC
-INIT_XMM sse2
-DCT32_FUNC
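The BUTTERFLY macro applied in every pass of the deleted DCT32 code is a tiny per-element operation: keep the sum, scale the difference by a cosine coefficient. A scalar sketch under my reading of the subps/addps/mulps sequence (names are mine, not from the file):

    /* One DCT32 butterfly: b becomes the sum, a the scaled difference. */
    static void butterfly(float *a, float *b, float cos_coef)
    {
        float diff = *a - *b;   /* subps */
        *b += *a;               /* addps */
        *a  = diff * cos_coef;  /* mulps */
    }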
diff --git a/ffmpeg/libavcodec/x86/deinterlace.asm b/ffmpeg/libavcodec/x86/deinterlace.asm
deleted file mode 100644
index 3812dbe..0000000
--- a/ffmpeg/libavcodec/x86/deinterlace.asm
+++ /dev/null
@@ -1,82 +0,0 @@
-;******************************************************************************
-;* MMX optimized deinterlacing functions
-;* Copyright (c) 2010 Vitor Sessak
-;* Copyright (c) 2002 Michael Niedermayer
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-cextern pw_4
-
-SECTION .text
-
-%macro DEINTERLACE 1
-%ifidn %1, inplace
-;void ff_deinterlace_line_inplace_mmx(const uint8_t *lum_m4, const uint8_t *lum_m3, const uint8_t *lum_m2, const uint8_t *lum_m1, const uint8_t *lum, int size)
-cglobal deinterlace_line_inplace_mmx, 6,6,7, lum_m4, lum_m3, lum_m2, lum_m1, lum, size
-%else
-;void ff_deinterlace_line_mmx(uint8_t *dst, const uint8_t *lum_m4, const uint8_t *lum_m3, const uint8_t *lum_m2, const uint8_t *lum_m1, const uint8_t *lum, int size)
-cglobal deinterlace_line_mmx, 7,7,7, dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size
-%endif
- pxor mm7, mm7
- movq mm6, [pw_4]
-.nextrow:
- movd mm0, [lum_m4q]
- movd mm1, [lum_m3q]
- movd mm2, [lum_m2q]
-%ifidn %1, inplace
- movd [lum_m4q], mm2
-%endif
- movd mm3, [lum_m1q]
- movd mm4, [lumq]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpcklbw mm2, mm7
- punpcklbw mm3, mm7
- punpcklbw mm4, mm7
- paddw mm1, mm3
- psllw mm2, 1
- paddw mm0, mm4
- psllw mm1, 2
- paddw mm2, mm6
- paddw mm1, mm2
- psubusw mm1, mm0
- psrlw mm1, 3
- packuswb mm1, mm7
-%ifidn %1, inplace
- movd [lum_m2q], mm1
-%else
- movd [dstq], mm1
- add dstq, 4
-%endif
- add lum_m4q, 4
- add lum_m3q, 4
- add lum_m2q, 4
- add lum_m1q, 4
- add lumq, 4
- sub sized, 4
- jg .nextrow
- REP_RET
-%endmacro
-
-DEINTERLACE ""
-
-DEINTERLACE inplace
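Read back into scalar C, the MMX loop above applies a vertical (-1, 4, 2, 4, -1)/8 filter with rounding; a minimal sketch under that reading (helper names are mine, not part of the deleted file):

    #include <stdint.h>

    static uint8_t clip_uint8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    /* One output pixel of the deinterlacer: lum_m4..lum are five
     * vertically adjacent source pixels, weighted -1, 4, 2, 4, -1. */
    static uint8_t deinterlace_pixel(int lum_m4, int lum_m3, int lum_m2,
                                     int lum_m1, int lum)
    {
        int sum = 4 * (lum_m3 + lum_m1) + 2 * lum_m2 + 4 - (lum_m4 + lum);
        return clip_uint8(sum >> 3);
    }

The asm saturates the subtraction at zero (psubusw) before the shift; clamping after the shift, as above, gives the same result.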
diff --git a/ffmpeg/libavcodec/x86/dirac_dwt.c b/ffmpeg/libavcodec/x86/dirac_dwt.c
deleted file mode 100644
index 04c514f..0000000
--- a/ffmpeg/libavcodec/x86/dirac_dwt.c
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * MMX optimized discrete wavelet transform
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- * Copyright (c) 2010 David Conrad
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/x86/asm.h"
-#include "dsputil_x86.h"
-#include "dirac_dwt.h"
-
-#define COMPOSE_VERTICAL(ext, align) \
-void ff_vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
-void ff_vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
-void ff_vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
-void ff_vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
-void ff_vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width); \
-void ff_horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
-void ff_horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
-\
-static void vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
-{ \
- int i, width_align = width&~(align-1); \
-\
- for(i=width_align; i<width; i++) \
- b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
-\
- ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
-} \
-\
-static void vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
-{ \
- int i, width_align = width&~(align-1); \
-\
- for(i=width_align; i<width; i++) \
- b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
-\
- ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
-} \
-\
-static void vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
- IDWTELEM *b3, IDWTELEM *b4, int width) \
-{ \
- int i, width_align = width&~(align-1); \
-\
- for(i=width_align; i<width; i++) \
- b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
-\
- ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
-} \
-\
-static void vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
- IDWTELEM *b3, IDWTELEM *b4, int width) \
-{ \
- int i, width_align = width&~(align-1); \
-\
- for(i=width_align; i<width; i++) \
- b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
-\
- ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
-} \
-static void vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width) \
-{ \
- int i, width_align = width&~(align-1); \
-\
- for(i=width_align; i<width; i++) { \
- b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
- b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
- } \
-\
- ff_vertical_compose_haar##ext(b0, b1, width_align); \
-} \
-static void horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
-{\
- int w2= w>>1;\
- int x= w2 - (w2&(align-1));\
- ff_horizontal_compose_haar0i##ext(b, tmp, w);\
-\
- for (; x < w2; x++) {\
- b[2*x ] = tmp[x];\
- b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
- }\
-}\
-static void horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
-{\
- int w2= w>>1;\
- int x= w2 - (w2&(align-1));\
- ff_horizontal_compose_haar1i##ext(b, tmp, w);\
-\
- for (; x < w2; x++) {\
- b[2*x ] = (tmp[x] + 1)>>1;\
- b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
- }\
-}\
-\
-
-#if HAVE_YASM
-#if !ARCH_X86_64
-COMPOSE_VERTICAL(_mmx, 4)
-#endif
-COMPOSE_VERTICAL(_sse2, 8)
-
-
-void ff_horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w);
-
-static void horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w)
-{
- int w2= w>>1;
- int x= w2 - (w2&7);
- ff_horizontal_compose_dd97i_ssse3(b, tmp, w);
-
- for (; x < w2; x++) {
- b[2*x ] = (tmp[x] + 1)>>1;
- b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
- }
-}
-#endif
-
-void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type)
-{
-#if HAVE_YASM
- int mm_flags = av_get_cpu_flags();
-
-#if !ARCH_X86_64
- if (!(mm_flags & AV_CPU_FLAG_MMX))
- return;
-
- switch (type) {
- case DWT_DIRAC_DD9_7:
- d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
- d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
- break;
- case DWT_DIRAC_LEGALL5_3:
- d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
- d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
- break;
- case DWT_DIRAC_DD13_7:
- d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
- d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
- break;
- case DWT_DIRAC_HAAR0:
- d->vertical_compose = (void*)vertical_compose_haar_mmx;
- d->horizontal_compose = horizontal_compose_haar0i_mmx;
- break;
- case DWT_DIRAC_HAAR1:
- d->vertical_compose = (void*)vertical_compose_haar_mmx;
- d->horizontal_compose = horizontal_compose_haar1i_mmx;
- break;
- }
-#endif
-
- if (!(mm_flags & AV_CPU_FLAG_SSE2))
- return;
-
- switch (type) {
- case DWT_DIRAC_DD9_7:
- d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
- d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
- break;
- case DWT_DIRAC_LEGALL5_3:
- d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
- d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
- break;
- case DWT_DIRAC_DD13_7:
- d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
- d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
- break;
- case DWT_DIRAC_HAAR0:
- d->vertical_compose = (void*)vertical_compose_haar_sse2;
- d->horizontal_compose = horizontal_compose_haar0i_sse2;
- break;
- case DWT_DIRAC_HAAR1:
- d->vertical_compose = (void*)vertical_compose_haar_sse2;
- d->horizontal_compose = horizontal_compose_haar1i_sse2;
- break;
- }
-
- if (!(mm_flags & AV_CPU_FLAG_SSSE3))
- return;
-
- switch (type) {
- case DWT_DIRAC_DD9_7:
- d->horizontal_compose = horizontal_compose_dd97i_ssse3;
- break;
- }
-#endif // HAVE_YASM
-}
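All of the wrappers above share one split: the yasm routine handles the alignment-sized prefix of each row and plain C finishes the remaining few elements. A sketch of that pattern for the 5/3 low-pass case, assuming the usual LeGall update step for COMPOSE_53iL0 and passing the assembly routine as a function pointer (both assumptions are mine):

    #include <stdint.h>

    typedef int16_t IDWTELEM;

    static void compose53iL0_row(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
                                 int width, int align,
                                 void (*simd)(IDWTELEM *, IDWTELEM *,
                                              IDWTELEM *, int))
    {
        int i, width_align = width & ~(align - 1);

        /* scalar tail, mirroring the loop in the macro above */
        for (i = width_align; i < width; i++)
            b1[i] -= (b0[i] + b2[i] + 2) >> 2;   /* assumed 5/3 update */

        simd(b0, b1, b2, width_align);           /* aligned bulk in asm */
    }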
diff --git a/ffmpeg/libavcodec/x86/dirac_dwt.h b/ffmpeg/libavcodec/x86/dirac_dwt.h
deleted file mode 100644
index 126b290..0000000
--- a/ffmpeg/libavcodec/x86/dirac_dwt.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_DIRAC_DWT_H
-#define AVCODEC_X86_DIRAC_DWT_H
-
-#include "libavcodec/dirac_dwt.h"
-
-void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
-void ff_horizontal_compose_haar1i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
-void ff_horizontal_compose_haar0i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
-
-void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type);
-
-#endif
diff --git a/ffmpeg/libavcodec/x86/diracdsp_mmx.c b/ffmpeg/libavcodec/x86/diracdsp_mmx.c
deleted file mode 100644
index a28bb82..0000000
--- a/ffmpeg/libavcodec/x86/diracdsp_mmx.c
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (C) 2010 David Conrad
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "dsputil_x86.h"
-#include "diracdsp_mmx.h"
-
-void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
-void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
-void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
-void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
-
-#define HPEL_FILTER(MMSIZE, EXT) \
- void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int); \
- void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int); \
- \
- static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
- const uint8_t *src, int stride, int width, int height) \
- { \
- while( height-- ) \
- { \
- ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
- ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width); \
- ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width); \
- \
- dsth += stride; \
- dstv += stride; \
- dstc += stride; \
- src += stride; \
- } \
- }
-
-#if !ARCH_X86_64
-HPEL_FILTER(8, mmx)
-#endif
-HPEL_FILTER(16, sse2)
-
-#define PIXFUNC(PFX, IDX, EXT) \
-    /* MMX DISABLED: c->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT; */ \
- c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
- c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT
-
-void ff_diracdsp_init_mmx(DiracDSPContext* c)
-{
- int mm_flags = av_get_cpu_flags();
-
- if (!(mm_flags & AV_CPU_FLAG_MMX))
- return;
-
-#if HAVE_YASM
- c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
-#if !ARCH_X86_64
- c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx;
- c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx;
- c->dirac_hpel_filter = dirac_hpel_filter_mmx;
- c->add_rect_clamped = ff_add_rect_clamped_mmx;
- c->put_signed_rect_clamped = ff_put_signed_rect_clamped_mmx;
-#endif
-#endif
-
-#if HAVE_MMX_INLINE
- PIXFUNC(put, 0, mmx);
- PIXFUNC(avg, 0, mmx);
-#endif
-
-#if HAVE_MMXEXT_INLINE
- if (mm_flags & AV_CPU_FLAG_MMX2) {
- PIXFUNC(avg, 0, mmxext);
- }
-#endif
-
- if (mm_flags & AV_CPU_FLAG_SSE2) {
-#if HAVE_YASM
- c->dirac_hpel_filter = dirac_hpel_filter_sse2;
- c->add_rect_clamped = ff_add_rect_clamped_sse2;
- c->put_signed_rect_clamped = ff_put_signed_rect_clamped_sse2;
-
- c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
- c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
-#endif
-#if HAVE_SSE2_INLINE
- c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2;
- c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2;
- c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
- c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
-#endif
- }
-}
diff --git a/ffmpeg/libavcodec/x86/diracdsp_mmx.h b/ffmpeg/libavcodec/x86/diracdsp_mmx.h
deleted file mode 100644
index 8985854..0000000
--- a/ffmpeg/libavcodec/x86/diracdsp_mmx.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2010 David Conrad
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_DIRACDSP_H
-#define AVCODEC_X86_DIRACDSP_H
-
-#include "libavcodec/diracdsp.h"
-
-void ff_diracdsp_init_mmx(DiracDSPContext* c);
-
-DECL_DIRAC_PIXOP(put, mmx);
-DECL_DIRAC_PIXOP(avg, mmx);
-DECL_DIRAC_PIXOP(avg, mmxext);
-
-void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
-void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
-void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
-void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
-
-void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
-void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
-
-void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
-void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
-void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
-
-void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
-void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
-
-#endif
diff --git a/ffmpeg/libavcodec/x86/diracdsp_yasm.asm b/ffmpeg/libavcodec/x86/diracdsp_yasm.asm
deleted file mode 100644
index 3e9765b..0000000
--- a/ffmpeg/libavcodec/x86/diracdsp_yasm.asm
+++ /dev/null
@@ -1,264 +0,0 @@
-;******************************************************************************
-;* Copyright (c) 2010 David Conrad
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-pw_3: times 8 dw 3
-pw_7: times 8 dw 7
-pw_16: times 8 dw 16
-pw_32: times 8 dw 32
-pb_128: times 16 db 128
-
-section .text
-
-%macro UNPACK_ADD 6
- mov%5 %1, %3
- mov%6 m5, %4
- mova m4, %1
- mova %2, m5
- punpcklbw %1, m7
- punpcklbw m5, m7
- punpckhbw m4, m7
- punpckhbw %2, m7
- paddw %1, m5
- paddw %2, m4
-%endmacro
-
-%macro HPEL_FILTER 1
-; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
-cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
- mov src0q, srcq
- lea stridex3q, [3*strideq]
- sub src0q, stridex3q
- pxor m7, m7
-.loop:
- ; 7*(src[0] + src[1])
- UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
- pmullw m0, [pw_7]
- pmullw m1, [pw_7]
-
- ; 3*( ... + src[-2] + src[3])
- UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
- paddw m0, m2
- paddw m1, m3
- pmullw m0, [pw_3]
- pmullw m1, [pw_3]
-
- ; ... - 7*(src[-1] + src[2])
- UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
- pmullw m2, [pw_7]
- pmullw m3, [pw_7]
- psubw m0, m2
- psubw m1, m3
-
- ; ... - (src[-3] + src[4])
- UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
- psubw m0, m2
- psubw m1, m3
-
- paddw m0, [pw_16]
- paddw m1, [pw_16]
- psraw m0, 5
- psraw m1, 5
- packuswb m0, m1
- mova [dstq], m0
- add dstq, mmsize
- add srcq, mmsize
- add src0q, mmsize
- sub widthd, mmsize
- jg .loop
- RET
-
-; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
-cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
- dec widthd
- pxor m7, m7
- and widthd, ~(mmsize-1)
-.loop:
- ; 7*(src[0] + src[1])
- UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
- pmullw m0, [pw_7]
- pmullw m1, [pw_7]
-
- ; 3*( ... + src[-2] + src[3])
- UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
- paddw m0, m2
- paddw m1, m3
- pmullw m0, [pw_3]
- pmullw m1, [pw_3]
-
- ; ... - 7*(src[-1] + src[2])
- UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
- pmullw m2, [pw_7]
- pmullw m3, [pw_7]
- psubw m0, m2
- psubw m1, m3
-
- ; ... - (src[-3] + src[4])
- UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
- psubw m0, m2
- psubw m1, m3
-
- paddw m0, [pw_16]
- paddw m1, [pw_16]
- psraw m0, 5
- psraw m1, 5
- packuswb m0, m1
- mova [dstq + widthq], m0
- sub widthd, mmsize
- jge .loop
- RET
-%endmacro
-
-%macro PUT_RECT 1
-; void put_signed_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
-cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
- mova m0, [pb_128]
- add wd, (mmsize-1)
- and wd, ~(mmsize-1)
-
-%if ARCH_X86_64
- movsxd dst_strideq, dst_strided
- movsxd src_strideq, src_strided
- mov r7d, r5m
- mov r8d, wd
- %define wspill r8d
- %define hd r7d
-%else
- mov r4m, wd
- %define wspill r4m
- %define hd r5mp
-%endif
-
-.loopy:
- lea src2q, [srcq+src_strideq*2]
- lea dst2q, [dstq+dst_strideq]
-.loopx:
- sub wd, mmsize
- mova m1, [srcq +2*wq]
- mova m2, [src2q+2*wq]
- packsswb m1, [srcq +2*wq+mmsize]
- packsswb m2, [src2q+2*wq+mmsize]
- paddb m1, m0
- paddb m2, m0
- mova [dstq +wq], m1
- mova [dst2q+wq], m2
- jg .loopx
-
- lea srcq, [srcq+src_strideq*4]
- lea dstq, [dstq+dst_strideq*2]
- sub hd, 2
- mov wd, wspill
- jg .loopy
- RET
-%endm
-
-%macro ADD_RECT 1
-; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
-cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
- mova m0, [pw_32]
- add wd, (mmsize-1)
- and wd, ~(mmsize-1)
-
-%if ARCH_X86_64
- movsxd strideq, strided
- movsxd idwt_strideq, idwt_strided
- mov r8d, wd
- %define wspill r8d
-%else
- mov r5m, wd
- %define wspill r5m
-%endif
-
-.loop:
- sub wd, mmsize
- movu m1, [srcq +2*wq] ; FIXME: ensure alignment
- paddw m1, m0
- psraw m1, 6
- movu m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
- paddw m2, m0
- psraw m2, 6
- paddw m1, [idwtq+2*wq]
- paddw m2, [idwtq+2*wq+mmsize]
- packuswb m1, m2
- mova [dstq +wq], m1
- jg .loop
-
- lea srcq, [srcq + 2*strideq]
- add dstq, strideq
- lea idwtq, [idwtq+ 2*idwt_strideq]
- sub hd, 1
- mov wd, wspill
- jg .loop
- RET
-%endm
-
-%macro ADD_OBMC 2
-; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
-cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
- pxor m4, m4
-.loop:
-%assign i 0
-%rep %1 / mmsize
- mova m0, [srcq+i]
- mova m1, m0
- punpcklbw m0, m4
- punpckhbw m1, m4
- mova m2, [obmcq+i]
- mova m3, m2
- punpcklbw m2, m4
- punpckhbw m3, m4
- pmullw m0, m2
- pmullw m1, m3
- movu m2, [dstq+2*i]
- movu m3, [dstq+2*i+mmsize]
- paddw m0, m2
- paddw m1, m3
- movu [dstq+2*i], m0
- movu [dstq+2*i+mmsize], m1
-%assign i i+mmsize
-%endrep
- lea srcq, [srcq+strideq]
- lea dstq, [dstq+2*strideq]
- add obmcq, 32
- sub yblend, 1
- jg .loop
- RET
-%endm
-
-INIT_MMX
-%if ARCH_X86_64 == 0
-PUT_RECT mmx
-ADD_RECT mmx
-
-HPEL_FILTER mmx
-ADD_OBMC 32, mmx
-ADD_OBMC 16, mmx
-%endif
-ADD_OBMC 8, mmx
-
-INIT_XMM
-PUT_RECT sse2
-ADD_RECT sse2
-
-HPEL_FILTER sse2
-ADD_OBMC 32, sse2
-ADD_OBMC 16, sse2
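The comments inside HPEL_FILTER above spell out the tap structure; put together, each half-pel output is an 8-tap (-1, 3, -7, 21, 21, -7, 3, -1)/32 filter with rounding. A scalar sketch under that reading (names are mine, not part of the deleted file):

    #include <stdint.h>

    static uint8_t clip_uint8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    /* One horizontal half-pel sample centred between src[0] and src[1]. */
    static uint8_t dirac_hpel_sample(const uint8_t *src)
    {
        int sum = 21 * (src[0]  + src[1])
                +  3 * (src[-2] + src[3])
                -  7 * (src[-1] + src[2])
                -      (src[-3] + src[4]);
        return clip_uint8((sum + 16) >> 5);
    }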
diff --git a/ffmpeg/libavcodec/x86/dnxhdenc.c b/ffmpeg/libavcodec/x86/dnxhdenc.c
deleted file mode 100644
index c7e776a..0000000
--- a/ffmpeg/libavcodec/x86/dnxhdenc.c
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * VC3/DNxHD SIMD functions
- * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
- *
- * VC-3 encoder funded by the British Broadcasting Corporation
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/dnxhdenc.h"
-
-#if HAVE_SSE2_INLINE
-
-static void get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels, int line_size)
-{
- __asm__ volatile(
- "pxor %%xmm5, %%xmm5 \n\t"
- "movq (%0), %%xmm0 \n\t"
- "add %2, %0 \n\t"
- "movq (%0), %%xmm1 \n\t"
- "movq (%0, %2), %%xmm2 \n\t"
- "movq (%0, %2,2), %%xmm3 \n\t"
- "punpcklbw %%xmm5, %%xmm0 \n\t"
- "punpcklbw %%xmm5, %%xmm1 \n\t"
- "punpcklbw %%xmm5, %%xmm2 \n\t"
- "punpcklbw %%xmm5, %%xmm3 \n\t"
- "movdqa %%xmm0, (%1) \n\t"
- "movdqa %%xmm1, 16(%1) \n\t"
- "movdqa %%xmm2, 32(%1) \n\t"
- "movdqa %%xmm3, 48(%1) \n\t"
- "movdqa %%xmm3 , 64(%1) \n\t"
- "movdqa %%xmm2 , 80(%1) \n\t"
- "movdqa %%xmm1 , 96(%1) \n\t"
- "movdqa %%xmm0, 112(%1) \n\t"
- : "+r" (pixels)
- : "r" (block), "r" ((x86_reg)line_size)
- );
-}
-
-#endif /* HAVE_SSE2_INLINE */
-
-av_cold void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx)
-{
-#if HAVE_SSE2_INLINE
- if (INLINE_SSE2(av_get_cpu_flags())) {
- if (ctx->cid_table->bit_depth == 8)
- ctx->get_pixels_8x4_sym = get_pixels_8x4_sym_sse2;
- }
-#endif /* HAVE_SSE2_INLINE */
-}
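Spelled out in scalar C, the SSE2 block above widens four 8-pixel rows to 16 bits and writes them out twice, once in order and once mirrored, so the resulting 8x8 block is vertically symmetric. A sketch under that reading (the _c suffix is mine, for illustration):

    #include <stdint.h>

    static void get_pixels_8x4_sym_c(int16_t *block, const uint8_t *pixels,
                                     int line_size)
    {
        int x, y;
        for (y = 0; y < 4; y++) {
            const uint8_t *row = pixels + y * line_size;
            for (x = 0; x < 8; x++) {
                block[ y      * 8 + x] = row[x];   /* rows 0..3 */
                block[(7 - y) * 8 + x] = row[x];   /* rows 7..4, mirrored */
            }
        }
    }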
diff --git a/ffmpeg/libavcodec/x86/dsputil.asm b/ffmpeg/libavcodec/x86/dsputil.asm
deleted file mode 100644
index 77069e2..0000000
--- a/ffmpeg/libavcodec/x86/dsputil.asm
+++ /dev/null
@@ -1,653 +0,0 @@
-;******************************************************************************
-;* MMX optimized DSP utils
-;* Copyright (c) 2008 Loren Merritt
-;* Copyright (c) 2003-2013 Michael Niedermayer
-;* Copyright (c) 2013 Daniel Kang
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-pb_f: times 16 db 15
-pb_zzzzzzzz77777777: times 8 db -1
-pb_7: times 8 db 7
-pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
-pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
-pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
-pd_16384: times 4 dd 16384
-pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
-
-SECTION_TEXT
-
-%macro SCALARPRODUCT 0
-; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
-cglobal scalarproduct_int16, 3,3,3, v1, v2, order
- shl orderq, 1
- add v1q, orderq
- add v2q, orderq
- neg orderq
- pxor m2, m2
-.loop:
- movu m0, [v1q + orderq]
- movu m1, [v1q + orderq + mmsize]
- pmaddwd m0, [v2q + orderq]
- pmaddwd m1, [v2q + orderq + mmsize]
- paddd m2, m0
- paddd m2, m1
- add orderq, mmsize*2
- jl .loop
-%if mmsize == 16
- movhlps m0, m2
- paddd m2, m0
- pshuflw m0, m2, 0x4e
-%else
- pshufw m0, m2, 0x4e
-%endif
- paddd m2, m0
- movd eax, m2
- RET
-
-; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
-cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
- shl orderq, 1
- movd m7, mulm
-%if mmsize == 16
- pshuflw m7, m7, 0
- punpcklqdq m7, m7
-%else
- pshufw m7, m7, 0
-%endif
- pxor m6, m6
- add v1q, orderq
- add v2q, orderq
- add v3q, orderq
- neg orderq
-.loop:
- movu m0, [v2q + orderq]
- movu m1, [v2q + orderq + mmsize]
- mova m4, [v1q + orderq]
- mova m5, [v1q + orderq + mmsize]
- movu m2, [v3q + orderq]
- movu m3, [v3q + orderq + mmsize]
- pmaddwd m0, m4
- pmaddwd m1, m5
- pmullw m2, m7
- pmullw m3, m7
- paddd m6, m0
- paddd m6, m1
- paddw m2, m4
- paddw m3, m5
- mova [v1q + orderq], m2
- mova [v1q + orderq + mmsize], m3
- add orderq, mmsize*2
- jl .loop
-%if mmsize == 16
- movhlps m0, m6
- paddd m6, m0
- pshuflw m0, m6, 0x4e
-%else
- pshufw m0, m6, 0x4e
-%endif
- paddd m6, m0
- movd eax, m6
- RET
-%endmacro
-
-INIT_MMX mmxext
-SCALARPRODUCT
-INIT_XMM sse2
-SCALARPRODUCT
-
-%macro SCALARPRODUCT_LOOP 1
-align 16
-.loop%1:
- sub orderq, mmsize*2
-%if %1
- mova m1, m4
- mova m4, [v2q + orderq]
- mova m0, [v2q + orderq + mmsize]
- palignr m1, m0, %1
- palignr m0, m4, %1
- mova m3, m5
- mova m5, [v3q + orderq]
- mova m2, [v3q + orderq + mmsize]
- palignr m3, m2, %1
- palignr m2, m5, %1
-%else
- mova m0, [v2q + orderq]
- mova m1, [v2q + orderq + mmsize]
- mova m2, [v3q + orderq]
- mova m3, [v3q + orderq + mmsize]
-%endif
- %define t0 [v1q + orderq]
- %define t1 [v1q + orderq + mmsize]
-%if ARCH_X86_64
- mova m8, t0
- mova m9, t1
- %define t0 m8
- %define t1 m9
-%endif
- pmaddwd m0, t0
- pmaddwd m1, t1
- pmullw m2, m7
- pmullw m3, m7
- paddw m2, t0
- paddw m3, t1
- paddd m6, m0
- paddd m6, m1
- mova [v1q + orderq], m2
- mova [v1q + orderq + mmsize], m3
- jg .loop%1
-%if %1
- jmp .end
-%endif
-%endmacro
-
-; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
-INIT_XMM ssse3
-cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
- shl orderq, 1
- movd m7, mulm
- pshuflw m7, m7, 0
- punpcklqdq m7, m7
- pxor m6, m6
- mov r4d, v2d
- and r4d, 15
- and v2q, ~15
- and v3q, ~15
- mova m4, [v2q + orderq]
- mova m5, [v3q + orderq]
- ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
- cmp r4d, 0
- je .loop0
- cmp r4d, 2
- je .loop2
- cmp r4d, 4
- je .loop4
- cmp r4d, 6
- je .loop6
- cmp r4d, 8
- je .loop8
- cmp r4d, 10
- je .loop10
- cmp r4d, 12
- je .loop12
-SCALARPRODUCT_LOOP 14
-SCALARPRODUCT_LOOP 12
-SCALARPRODUCT_LOOP 10
-SCALARPRODUCT_LOOP 8
-SCALARPRODUCT_LOOP 6
-SCALARPRODUCT_LOOP 4
-SCALARPRODUCT_LOOP 2
-SCALARPRODUCT_LOOP 0
-.end:
- movhlps m0, m6
- paddd m6, m0
- pshuflw m0, m6, 0x4e
- paddd m6, m0
- movd eax, m6
- RET
-
-
-;-----------------------------------------------------------------------------
-; void ff_apply_window_int16(int16_t *output, const int16_t *input,
-; const int16_t *window, unsigned int len)
-;-----------------------------------------------------------------------------
-
-%macro REVERSE_WORDS 1-2
-%if cpuflag(ssse3) && notcpuflag(atom)
- pshufb %1, %2
-%elif cpuflag(sse2)
- pshuflw %1, %1, 0x1B
- pshufhw %1, %1, 0x1B
- pshufd %1, %1, 0x4E
-%elif cpuflag(mmxext)
- pshufw %1, %1, 0x1B
-%endif
-%endmacro
-
-%macro MUL16FIXED 3
-%if cpuflag(ssse3) ; dst, src, unused
-; dst = ((dst * src) + (1<<14)) >> 15
- pmulhrsw %1, %2
-%elif cpuflag(mmxext) ; dst, src, temp
-; dst = (dst * src) >> 15
-; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
-; in from the pmullw result.
- mova %3, %1
- pmulhw %1, %2
- pmullw %3, %2
- psrlw %3, 15
- psllw %1, 1
- por %1, %3
-%endif
-%endmacro
-
-%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
-%if %1
-cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
-%else
-cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
-%endif
- lea offset2q, [offsetq-mmsize]
-%if cpuflag(ssse3) && notcpuflag(atom)
- mova m5, [pb_revwords]
- ALIGN 16
-%elif %1
- mova m5, [pd_16384]
-%endif
-.loop:
-%if cpuflag(ssse3)
- ; This version does the 16x16->16 multiplication in-place without expanding
- ; to 32-bit. The ssse3 version is bit-identical.
- mova m0, [windowq+offset2q]
- mova m1, [ inputq+offset2q]
- pmulhrsw m1, m0
- REVERSE_WORDS m0, m5
- pmulhrsw m0, [ inputq+offsetq ]
- mova [outputq+offset2q], m1
- mova [outputq+offsetq ], m0
-%elif %1
- ; This version expands 16-bit to 32-bit, multiplies by the window,
- ; adds 16384 for rounding, right shifts 15, then repacks back to words to
- ; save to the output. The window is reversed for the second half.
- mova m3, [windowq+offset2q]
- mova m4, [ inputq+offset2q]
- pxor m0, m0
- punpcklwd m0, m3
- punpcklwd m1, m4
- pmaddwd m0, m1
- paddd m0, m5
- psrad m0, 15
- pxor m2, m2
- punpckhwd m2, m3
- punpckhwd m1, m4
- pmaddwd m2, m1
- paddd m2, m5
- psrad m2, 15
- packssdw m0, m2
- mova [outputq+offset2q], m0
- REVERSE_WORDS m3
- mova m4, [ inputq+offsetq]
- pxor m0, m0
- punpcklwd m0, m3
- punpcklwd m1, m4
- pmaddwd m0, m1
- paddd m0, m5
- psrad m0, 15
- pxor m2, m2
- punpckhwd m2, m3
- punpckhwd m1, m4
- pmaddwd m2, m1
- paddd m2, m5
- psrad m2, 15
- packssdw m0, m2
- mova [outputq+offsetq], m0
-%else
- ; This version does the 16x16->16 multiplication in-place without expanding
- ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
- ; therefore are not bit-identical to the C version.
- mova m0, [windowq+offset2q]
- mova m1, [ inputq+offset2q]
- mova m2, [ inputq+offsetq ]
- MUL16FIXED m1, m0, m3
- REVERSE_WORDS m0
- MUL16FIXED m2, m0, m3
- mova [outputq+offset2q], m1
- mova [outputq+offsetq ], m2
-%endif
- add offsetd, mmsize
- sub offset2d, mmsize
- jae .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-APPLY_WINDOW_INT16 0
-INIT_XMM sse2
-APPLY_WINDOW_INT16 0
-
-INIT_MMX mmxext
-APPLY_WINDOW_INT16 1
-INIT_XMM sse2
-APPLY_WINDOW_INT16 1
-INIT_XMM ssse3
-APPLY_WINDOW_INT16 1
-INIT_XMM ssse3, atom
-APPLY_WINDOW_INT16 1
-
-
-; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
-INIT_MMX mmxext
-cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
- movq mm0, [topq]
- movq mm2, mm0
- movd mm4, [left_topq]
- psllq mm2, 8
- movq mm1, mm0
- por mm4, mm2
- movd mm3, [leftq]
- psubb mm0, mm4 ; t-tl
- add dstq, wq
- add topq, wq
- add diffq, wq
- neg wq
- jmp .skip
-.loop:
- movq mm4, [topq+wq]
- movq mm0, mm4
- psllq mm4, 8
- por mm4, mm1
- movq mm1, mm0 ; t
- psubb mm0, mm4 ; t-tl
-.skip:
- movq mm2, [diffq+wq]
-%assign i 0
-%rep 8
- movq mm4, mm0
- paddb mm4, mm3 ; t-tl+l
- movq mm5, mm3
- pmaxub mm3, mm1
- pminub mm5, mm1
- pminub mm3, mm4
- pmaxub mm3, mm5 ; median
- paddb mm3, mm2 ; +residual
-%if i==0
- movq mm7, mm3
- psllq mm7, 56
-%else
- movq mm6, mm3
- psrlq mm7, 8
- psllq mm6, 56
- por mm7, mm6
-%endif
-%if i<7
- psrlq mm0, 8
- psrlq mm1, 8
- psrlq mm2, 8
-%endif
-%assign i i+1
-%endrep
- movq [dstq+wq], mm7
- add wq, 8
- jl .loop
- movzx r2d, byte [dstq-1]
- mov [leftq], r2d
- movzx r2d, byte [topq-1]
- mov [left_topq], r2d
- RET
-
-
-%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
- add srcq, wq
- add dstq, wq
- neg wq
-%%.loop:
-%if %2
- mova m1, [srcq+wq]
-%else
- movu m1, [srcq+wq]
-%endif
- mova m2, m1
- psllw m1, 8
- paddb m1, m2
- mova m2, m1
- pshufb m1, m3
- paddb m1, m2
- pshufb m0, m5
- mova m2, m1
- pshufb m1, m4
- paddb m1, m2
-%if mmsize == 16
- mova m2, m1
- pshufb m1, m6
- paddb m1, m2
-%endif
- paddb m0, m1
-%if %1
- mova [dstq+wq], m0
-%else
- movq [dstq+wq], m0
- movhps [dstq+wq+8], m0
-%endif
- add wq, mmsize
- jl %%.loop
- mov eax, mmsize-1
- sub eax, wd
- movd m1, eax
- pshufb m0, m1
- movd eax, m0
- RET
-%endmacro
-
-; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
-INIT_MMX ssse3
-cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
-.skip_prologue:
- mova m5, [pb_7]
- mova m4, [pb_zzzz3333zzzzbbbb]
- mova m3, [pb_zz11zz55zz99zzdd]
- movd m0, leftm
- psllq m0, 56
- ADD_HFYU_LEFT_LOOP 1, 1
-
-INIT_XMM sse4
-cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
- mova m5, [pb_f]
- mova m6, [pb_zzzzzzzz77777777]
- mova m4, [pb_zzzz3333zzzzbbbb]
- mova m3, [pb_zz11zz55zz99zzdd]
- movd m0, leftm
- pslldq m0, 15
- test srcq, 15
- jnz .src_unaligned
- test dstq, 15
- jnz .dst_unaligned
- ADD_HFYU_LEFT_LOOP 1, 1
-.dst_unaligned:
- ADD_HFYU_LEFT_LOOP 0, 1
-.src_unaligned:
- ADD_HFYU_LEFT_LOOP 0, 0
-
-;-----------------------------------------------------------------------------
-; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
-; int32_t max, unsigned int len)
-;-----------------------------------------------------------------------------
-
-; %1 = number of xmm registers used
-; %2 = number of inline load/process/store loops per asm loop
-; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
-; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
-; %5 = suffix
-%macro VECTOR_CLIP_INT32 4-5
-cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
-%if %4
- cvtsi2ss m4, minm
- cvtsi2ss m5, maxm
-%else
- movd m4, minm
- movd m5, maxm
-%endif
- SPLATD m4
- SPLATD m5
-.loop:
-%assign %%i 1
-%rep %2
- mova m0, [srcq+mmsize*0*%%i]
- mova m1, [srcq+mmsize*1*%%i]
- mova m2, [srcq+mmsize*2*%%i]
- mova m3, [srcq+mmsize*3*%%i]
-%if %3
- mova m7, [srcq+mmsize*4*%%i]
- mova m8, [srcq+mmsize*5*%%i]
- mova m9, [srcq+mmsize*6*%%i]
- mova m10, [srcq+mmsize*7*%%i]
-%endif
- CLIPD m0, m4, m5, m6
- CLIPD m1, m4, m5, m6
- CLIPD m2, m4, m5, m6
- CLIPD m3, m4, m5, m6
-%if %3
- CLIPD m7, m4, m5, m6
- CLIPD m8, m4, m5, m6
- CLIPD m9, m4, m5, m6
- CLIPD m10, m4, m5, m6
-%endif
- mova [dstq+mmsize*0*%%i], m0
- mova [dstq+mmsize*1*%%i], m1
- mova [dstq+mmsize*2*%%i], m2
- mova [dstq+mmsize*3*%%i], m3
-%if %3
- mova [dstq+mmsize*4*%%i], m7
- mova [dstq+mmsize*5*%%i], m8
- mova [dstq+mmsize*6*%%i], m9
- mova [dstq+mmsize*7*%%i], m10
-%endif
-%assign %%i %%i+1
-%endrep
- add srcq, mmsize*4*(%2+%3)
- add dstq, mmsize*4*(%2+%3)
- sub lend, mmsize*(%2+%3)
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmx
-%define CLIPD CLIPD_MMX
-VECTOR_CLIP_INT32 0, 1, 0, 0
-INIT_XMM sse2
-VECTOR_CLIP_INT32 6, 1, 0, 0, _int
-%define CLIPD CLIPD_SSE2
-VECTOR_CLIP_INT32 6, 2, 0, 1
-INIT_XMM sse4
-%define CLIPD CLIPD_SSE41
-%ifdef m8
-VECTOR_CLIP_INT32 11, 1, 1, 0
-%else
-VECTOR_CLIP_INT32 6, 1, 0, 0
-%endif
-
-; %1 = aligned/unaligned
-%macro BSWAP_LOOPS 1
- mov r3, r2
- sar r2, 3
- jz .left4_%1
-.loop8_%1:
- mov%1 m0, [r1 + 0]
- mov%1 m1, [r1 + 16]
-%if cpuflag(ssse3)
- pshufb m0, m2
- pshufb m1, m2
- mov%1 [r0 + 0], m0
- mov%1 [r0 + 16], m1
-%else
- pshuflw m0, m0, 10110001b
- pshuflw m1, m1, 10110001b
- pshufhw m0, m0, 10110001b
- pshufhw m1, m1, 10110001b
- mova m2, m0
- mova m3, m1
- psllw m0, 8
- psllw m1, 8
- psrlw m2, 8
- psrlw m3, 8
- por m2, m0
- por m3, m1
- mov%1 [r0 + 0], m2
- mov%1 [r0 + 16], m3
-%endif
- add r0, 32
- add r1, 32
- dec r2
- jnz .loop8_%1
-.left4_%1:
- mov r2, r3
- and r3, 4
- jz .left
- mov%1 m0, [r1]
-%if cpuflag(ssse3)
- pshufb m0, m2
- mov%1 [r0], m0
-%else
- pshuflw m0, m0, 10110001b
- pshufhw m0, m0, 10110001b
- mova m2, m0
- psllw m0, 8
- psrlw m2, 8
- por m2, m0
- mov%1 [r0], m2
-%endif
- add r1, 16
- add r0, 16
-%endmacro
-
-; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
-%macro BSWAP32_BUF 0
-%if cpuflag(ssse3)
-cglobal bswap32_buf, 3,4,3
- mov r3, r1
- mova m2, [pb_bswap32]
-%else
-cglobal bswap32_buf, 3,4,5
- mov r3, r1
-%endif
- or r3, r0
- and r3, 15
- jz .start_align
- BSWAP_LOOPS u
- jmp .left
-.start_align:
- BSWAP_LOOPS a
-.left:
-%if cpuflag(ssse3)
- mov r3, r2
- and r2, 2
- jz .left1
- movq m0, [r1]
- pshufb m0, m2
- movq [r0], m0
- add r1, 8
- add r0, 8
-.left1:
- and r3, 1
- jz .end
- mov r2d, [r1]
- bswap r2d
- mov [r0], r2d
-%else
- and r2, 3
- jz .end
-.loop2:
- mov r3d, [r1]
- bswap r3d
- mov [r0], r3d
- add r1, 4
- add r0, 4
- dec r2
- jnz .loop2
-%endif
-.end:
- RET
-%endmacro
-
-INIT_XMM sse2
-BSWAP32_BUF
-
-INIT_XMM ssse3
-BSWAP32_BUF
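As a scalar reference for the two integer dot-product routines at the top of this file: scalarproduct_int16 is a plain dot product, and scalarproduct_and_madd_int16 additionally updates v1 in place with mul*v3. A sketch of my reading of the SIMD loops (the _c suffixes are mine):

    #include <stdint.h>

    static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
                                         int order)
    {
        int32_t res = 0;
        while (order--)
            res += *v1++ * *v2++;
        return res;
    }

    static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                                  const int16_t *v3,
                                                  int order, int mul)
    {
        int32_t res = 0;
        while (order--) {
            res   += *v1 * *v2++;
            *v1++ += mul * *v3++;
        }
        return res;
    }

The asm gets the same result by accumulating pairs of products per lane with pmaddwd and letting the pmullw/paddw update wrap at 16 bits, just as the int16_t store does here.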
diff --git a/ffmpeg/libavcodec/x86/dsputil_mmx.c b/ffmpeg/libavcodec/x86/dsputil_mmx.c
deleted file mode 100644
index df8cfdb..0000000
--- a/ffmpeg/libavcodec/x86/dsputil_mmx.c
+++ /dev/null
@@ -1,638 +0,0 @@
-/*
- * MMX optimized DSP utils
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
- */
-
-#include "config.h"
-#include "libavutil/avassert.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavcodec/videodsp.h"
-#include "constants.h"
-#include "dsputil_x86.h"
-#include "diracdsp_mmx.h"
-
-#if HAVE_INLINE_ASM
-
-void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
- int line_size)
-{
- const int16_t *p;
- uint8_t *pix;
-
- /* read the pixels */
- p = block;
- pix = pixels;
- /* unrolled loop */
- __asm__ volatile (
- "movq (%3), %%mm0 \n\t"
- "movq 8(%3), %%mm1 \n\t"
- "movq 16(%3), %%mm2 \n\t"
- "movq 24(%3), %%mm3 \n\t"
- "movq 32(%3), %%mm4 \n\t"
- "movq 40(%3), %%mm5 \n\t"
- "movq 48(%3), %%mm6 \n\t"
- "movq 56(%3), %%mm7 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "packuswb %%mm5, %%mm4 \n\t"
- "packuswb %%mm7, %%mm6 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm2, (%0, %1) \n\t"
- "movq %%mm4, (%0, %1, 2) \n\t"
- "movq %%mm6, (%0, %2) \n\t"
- :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
- "r"(p)
- : "memory");
- pix += line_size * 4;
- p += 32;
-
-    // If this were an exact copy of the code above, the compiler would
-    // generate some very strange code; hence the "r" constraint.
- __asm__ volatile (
- "movq (%3), %%mm0 \n\t"
- "movq 8(%3), %%mm1 \n\t"
- "movq 16(%3), %%mm2 \n\t"
- "movq 24(%3), %%mm3 \n\t"
- "movq 32(%3), %%mm4 \n\t"
- "movq 40(%3), %%mm5 \n\t"
- "movq 48(%3), %%mm6 \n\t"
- "movq 56(%3), %%mm7 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "packuswb %%mm5, %%mm4 \n\t"
- "packuswb %%mm7, %%mm6 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm2, (%0, %1) \n\t"
- "movq %%mm4, (%0, %1, 2) \n\t"
- "movq %%mm6, (%0, %2) \n\t"
- :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
- : "memory");
-}
-
-#define put_signed_pixels_clamped_mmx_half(off) \
- "movq "#off"(%2), %%mm1 \n\t" \
- "movq 16 + "#off"(%2), %%mm2 \n\t" \
- "movq 32 + "#off"(%2), %%mm3 \n\t" \
- "movq 48 + "#off"(%2), %%mm4 \n\t" \
- "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
- "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
- "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
- "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
- "paddb %%mm0, %%mm1 \n\t" \
- "paddb %%mm0, %%mm2 \n\t" \
- "paddb %%mm0, %%mm3 \n\t" \
- "paddb %%mm0, %%mm4 \n\t" \
- "movq %%mm1, (%0) \n\t" \
- "movq %%mm2, (%0, %3) \n\t" \
- "movq %%mm3, (%0, %3, 2) \n\t" \
- "movq %%mm4, (%0, %1) \n\t"
-
-void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
- int line_size)
-{
- x86_reg line_skip = line_size;
- x86_reg line_skip3;
-
- __asm__ volatile (
- "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
- "lea (%3, %3, 2), %1 \n\t"
- put_signed_pixels_clamped_mmx_half(0)
- "lea (%0, %3, 4), %0 \n\t"
- put_signed_pixels_clamped_mmx_half(64)
- : "+&r"(pixels), "=&r"(line_skip3)
- : "r"(block), "r"(line_skip)
- : "memory");
-}
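-
-/* Illustrative scalar sketch (reference only, not wired into any table) of
- * the trick used above: packsswb clamps each coefficient to [-128, 127] and
- * the wrapping paddb with ff_pb_80 biases that range into [0, 255], i.e. the
- * result is block[i] + 128 clamped to an unsigned byte. */
-static inline void put_signed_pixels_clamped_ref(const int16_t *block,
-                                                 uint8_t *pixels,
-                                                 int line_size)
-{
-    int i, j;
-
-    for (i = 0; i < 8; i++)
-        for (j = 0; j < 8; j++) {
-            int v = block[i * 8 + j] + 128;
-            pixels[i * line_size + j] = v < 0 ? 0 : v > 255 ? 255 : v;
-        }
-}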
-
-void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
- int line_size)
-{
- const int16_t *p;
- uint8_t *pix;
- int i;
-
- /* read the pixels */
- p = block;
- pix = pixels;
- MOVQ_ZERO(mm7);
- i = 4;
- do {
- __asm__ volatile (
- "movq (%2), %%mm0 \n\t"
- "movq 8(%2), %%mm1 \n\t"
- "movq 16(%2), %%mm2 \n\t"
- "movq 24(%2), %%mm3 \n\t"
- "movq %0, %%mm4 \n\t"
- "movq %1, %%mm6 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddsw %%mm4, %%mm0 \n\t"
- "paddsw %%mm5, %%mm1 \n\t"
- "movq %%mm6, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm6 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddsw %%mm6, %%mm2 \n\t"
- "paddsw %%mm5, %%mm3 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "movq %%mm0, %0 \n\t"
- "movq %%mm2, %1 \n\t"
- : "+m"(*pix), "+m"(*(pix + line_size))
- : "r"(p)
- : "memory");
- pix += line_size * 2;
- p += 16;
- } while (--i);
-}
-
-#define CLEAR_BLOCKS(name, n) \
-void name(int16_t *blocks) \
-{ \
- __asm__ volatile ( \
- "pxor %%mm7, %%mm7 \n\t" \
- "mov %1, %%"REG_a" \n\t" \
- "1: \n\t" \
- "movq %%mm7, (%0, %%"REG_a") \n\t" \
- "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
- "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
- "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
- "add $32, %%"REG_a" \n\t" \
- "js 1b \n\t" \
- :: "r"(((uint8_t *)blocks) + 128 * n), \
- "i"(-128 * n) \
- : "%"REG_a \
- ); \
-}
-CLEAR_BLOCKS(ff_clear_blocks_mmx, 6)
-CLEAR_BLOCKS(ff_clear_block_mmx, 1)
-
-void ff_clear_block_sse(int16_t *block)
-{
- __asm__ volatile (
- "xorps %%xmm0, %%xmm0 \n"
- "movaps %%xmm0, (%0) \n"
- "movaps %%xmm0, 16(%0) \n"
- "movaps %%xmm0, 32(%0) \n"
- "movaps %%xmm0, 48(%0) \n"
- "movaps %%xmm0, 64(%0) \n"
- "movaps %%xmm0, 80(%0) \n"
- "movaps %%xmm0, 96(%0) \n"
- "movaps %%xmm0, 112(%0) \n"
- :: "r"(block)
- : "memory"
- );
-}
-
-void ff_clear_blocks_sse(int16_t *blocks)
-{
- __asm__ volatile (
- "xorps %%xmm0, %%xmm0 \n"
- "mov %1, %%"REG_a" \n"
- "1: \n"
- "movaps %%xmm0, (%0, %%"REG_a") \n"
- "movaps %%xmm0, 16(%0, %%"REG_a") \n"
- "movaps %%xmm0, 32(%0, %%"REG_a") \n"
- "movaps %%xmm0, 48(%0, %%"REG_a") \n"
- "movaps %%xmm0, 64(%0, %%"REG_a") \n"
- "movaps %%xmm0, 80(%0, %%"REG_a") \n"
- "movaps %%xmm0, 96(%0, %%"REG_a") \n"
- "movaps %%xmm0, 112(%0, %%"REG_a") \n"
- "add $128, %%"REG_a" \n"
- "js 1b \n"
- :: "r"(((uint8_t *)blocks) + 128 * 6),
- "i"(-128 * 6)
- : "%"REG_a
- );
-}
-
-void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
-{
- x86_reg i = 0;
- __asm__ volatile (
- "jmp 2f \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq (%2, %0), %%mm1 \n\t"
- "paddb %%mm0, %%mm1 \n\t"
- "movq %%mm1, (%2, %0) \n\t"
- "movq 8(%1, %0), %%mm0 \n\t"
- "movq 8(%2, %0), %%mm1 \n\t"
- "paddb %%mm0, %%mm1 \n\t"
- "movq %%mm1, 8(%2, %0) \n\t"
- "add $16, %0 \n\t"
- "2: \n\t"
- "cmp %3, %0 \n\t"
- "js 1b \n\t"
- : "+r"(i)
- : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
- );
- for ( ; i < w; i++)
- dst[i + 0] += src[i + 0];
-}
-
-/* Draw the edges of width 'w' of an image of size width, height.
- * This MMX version can only handle w == 4, w == 8 or w == 16. */
-void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
- int w, int h, int sides)
-{
- uint8_t *ptr, *last_line;
- int i;
-
- last_line = buf + (height - 1) * wrap;
- /* left and right */
- ptr = buf;
- if (w == 8) {
- __asm__ volatile (
- "1: \n\t"
- "movd (%0), %%mm0 \n\t"
- "punpcklbw %%mm0, %%mm0 \n\t"
- "punpcklwd %%mm0, %%mm0 \n\t"
- "punpckldq %%mm0, %%mm0 \n\t"
- "movq %%mm0, -8(%0) \n\t"
- "movq -8(%0, %2), %%mm1 \n\t"
- "punpckhbw %%mm1, %%mm1 \n\t"
- "punpckhwd %%mm1, %%mm1 \n\t"
- "punpckhdq %%mm1, %%mm1 \n\t"
- "movq %%mm1, (%0, %2) \n\t"
- "add %1, %0 \n\t"
- "cmp %3, %0 \n\t"
- "jb 1b \n\t"
- : "+r"(ptr)
- : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
- );
-    } else if (w == 16) {
- __asm__ volatile (
- "1: \n\t"
- "movd (%0), %%mm0 \n\t"
- "punpcklbw %%mm0, %%mm0 \n\t"
- "punpcklwd %%mm0, %%mm0 \n\t"
- "punpckldq %%mm0, %%mm0 \n\t"
- "movq %%mm0, -8(%0) \n\t"
- "movq %%mm0, -16(%0) \n\t"
- "movq -8(%0, %2), %%mm1 \n\t"
- "punpckhbw %%mm1, %%mm1 \n\t"
- "punpckhwd %%mm1, %%mm1 \n\t"
- "punpckhdq %%mm1, %%mm1 \n\t"
- "movq %%mm1, (%0, %2) \n\t"
- "movq %%mm1, 8(%0, %2) \n\t"
- "add %1, %0 \n\t"
- "cmp %3, %0 \n\t"
- "jb 1b \n\t"
- : "+r"(ptr)
- : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
- );
- } else {
- av_assert1(w == 4);
- __asm__ volatile (
- "1: \n\t"
- "movd (%0), %%mm0 \n\t"
- "punpcklbw %%mm0, %%mm0 \n\t"
- "punpcklwd %%mm0, %%mm0 \n\t"
- "movd %%mm0, -4(%0) \n\t"
- "movd -4(%0, %2), %%mm1 \n\t"
- "punpcklbw %%mm1, %%mm1 \n\t"
- "punpckhwd %%mm1, %%mm1 \n\t"
- "punpckhdq %%mm1, %%mm1 \n\t"
- "movd %%mm1, (%0, %2) \n\t"
- "add %1, %0 \n\t"
- "cmp %3, %0 \n\t"
- "jb 1b \n\t"
- : "+r"(ptr)
- : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
- );
- }
-
- /* top and bottom (and hopefully also the corners) */
- if (sides & EDGE_TOP) {
- for (i = 0; i < h; i += 4) {
- ptr = buf - (i + 1) * wrap - w;
- __asm__ volatile (
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm0, (%0, %2) \n\t"
- "movq %%mm0, (%0, %2, 2) \n\t"
- "movq %%mm0, (%0, %3) \n\t"
- "add $8, %0 \n\t"
- "cmp %4, %0 \n\t"
- "jb 1b \n\t"
- : "+r"(ptr)
- : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
- "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
- );
- }
- }
-
- if (sides & EDGE_BOTTOM) {
- for (i = 0; i < h; i += 4) {
- ptr = last_line + (i + 1) * wrap - w;
- __asm__ volatile (
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm0, (%0, %2) \n\t"
- "movq %%mm0, (%0, %2, 2) \n\t"
- "movq %%mm0, (%0, %3) \n\t"
- "add $8, %0 \n\t"
- "cmp %4, %0 \n\t"
- "jb 1b \n\t"
- : "+r"(ptr)
- : "r"((x86_reg)last_line - (x86_reg)ptr - w),
- "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
- "r"(ptr + width + 2 * w)
- );
- }
- }
-}
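-
-/* Rough scalar equivalent of the border padding above (illustrative sketch
- * only): every row is extended sideways with its first and last pixel, and
- * the already-extended top and bottom rows are then replicated h times. */
-static inline void draw_edges_ref(uint8_t *buf, int wrap, int width,
-                                  int height, int w, int h, int sides)
-{
-    uint8_t *last_line = buf + (height - 1) * wrap;
-    int i, j;
-
-    for (i = 0; i < height; i++) {                    /* left and right */
-        for (j = 1; j <= w; j++) {
-            buf[i * wrap - j]             = buf[i * wrap];
-            buf[i * wrap + width - 1 + j] = buf[i * wrap + width - 1];
-        }
-    }
-    if (sides & EDGE_TOP)                             /* top (and corners) */
-        for (i = 1; i <= h; i++)
-            for (j = -w; j < width + w; j++)
-                buf[-i * wrap + j] = buf[j];
-    if (sides & EDGE_BOTTOM)                          /* bottom (and corners) */
-        for (i = 1; i <= h; i++)
-            for (j = -w; j < width + w; j++)
-                last_line[i * wrap + j] = last_line[j];
-}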
-
-typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride,
- ptrdiff_t src_linesize,
- int block_w, int block_h,
- int src_x, int src_y, int w, int h);
-
-static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
- int stride, int h, int ox, int oy,
- int dxx, int dxy, int dyx, int dyy,
- int shift, int r, int width, int height,
- emulated_edge_mc_func *emu_edge_fn)
-{
- const int w = 8;
- const int ix = ox >> (16 + shift);
- const int iy = oy >> (16 + shift);
- const int oxs = ox >> 4;
- const int oys = oy >> 4;
- const int dxxs = dxx >> 4;
- const int dxys = dxy >> 4;
- const int dyxs = dyx >> 4;
- const int dyys = dyy >> 4;
- const uint16_t r4[4] = { r, r, r, r };
- const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
- const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
- const uint64_t shift2 = 2 * shift;
-#define MAX_STRIDE 4096U
-#define MAX_H 8U
- uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
- int x, y;
-
- const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
- const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
- const int dxh = dxy * (h - 1);
- const int dyw = dyx * (w - 1);
- int need_emu = (unsigned)ix >= width - w ||
- (unsigned)iy >= height - h;
-
- if ( // non-constant fullpel offset (3% of blocks)
- ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
- (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
- // uses more than 16 bits of subpel mv (only at huge resolution)
- || (dxx | dxy | dyx | dyy) & 15
- || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
- // FIXME could still use mmx for some of the rows
- ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
- shift, r, width, height);
- return;
- }
-
- src += ix + iy * stride;
- if (need_emu) {
- emu_edge_fn(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height);
- src = edge_buf;
- }
-
- __asm__ volatile (
- "movd %0, %%mm6 \n\t"
- "pxor %%mm7, %%mm7 \n\t"
- "punpcklwd %%mm6, %%mm6 \n\t"
- "punpcklwd %%mm6, %%mm6 \n\t"
- :: "r"(1<<shift)
- );
-
- for (x = 0; x < w; x += 4) {
- uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
- oxs - dxys + dxxs * (x + 1),
- oxs - dxys + dxxs * (x + 2),
- oxs - dxys + dxxs * (x + 3) };
- uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
- oys - dyys + dyxs * (x + 1),
- oys - dyys + dyxs * (x + 2),
- oys - dyys + dyxs * (x + 3) };
-
- for (y = 0; y < h; y++) {
- __asm__ volatile (
- "movq %0, %%mm4 \n\t"
- "movq %1, %%mm5 \n\t"
- "paddw %2, %%mm4 \n\t"
- "paddw %3, %%mm5 \n\t"
- "movq %%mm4, %0 \n\t"
- "movq %%mm5, %1 \n\t"
- "psrlw $12, %%mm4 \n\t"
- "psrlw $12, %%mm5 \n\t"
- : "+m"(*dx4), "+m"(*dy4)
- : "m"(*dxy4), "m"(*dyy4)
- );
-
- __asm__ volatile (
- "movq %%mm6, %%mm2 \n\t"
- "movq %%mm6, %%mm1 \n\t"
- "psubw %%mm4, %%mm2 \n\t"
- "psubw %%mm5, %%mm1 \n\t"
- "movq %%mm2, %%mm0 \n\t"
- "movq %%mm4, %%mm3 \n\t"
- "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
- "pmullw %%mm5, %%mm3 \n\t" // dx * dy
- "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
- "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
-
- "movd %4, %%mm5 \n\t"
- "movd %3, %%mm4 \n\t"
- "punpcklbw %%mm7, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
- "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
-
- "movd %2, %%mm5 \n\t"
- "movd %1, %%mm4 \n\t"
- "punpcklbw %%mm7, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
- "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
- "paddw %5, %%mm1 \n\t"
- "paddw %%mm3, %%mm2 \n\t"
- "paddw %%mm1, %%mm0 \n\t"
- "paddw %%mm2, %%mm0 \n\t"
-
- "psrlw %6, %%mm0 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "movd %%mm0, %0 \n\t"
-
- : "=m"(dst[x + y * stride])
- : "m"(src[0]), "m"(src[1]),
- "m"(src[stride]), "m"(src[stride + 1]),
- "m"(*r4), "m"(shift2)
- );
- src += stride;
- }
- src += 4 - h * stride;
- }
-}
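-
-/* Per-pixel computation performed by the loop above (scalar sketch for
- * illustration; roughly, dx and dy are the subpel fractions in units of
- * 1/(1 << shift), s = 1 << shift and r is the rounding bias):
- *
- *     dst[x + y * stride] = ( src[0]          * (s - dx) * (s - dy)
- *                           + src[1]          *      dx  * (s - dy)
- *                           + src[stride]     * (s - dx) *      dy
- *                           + src[stride + 1] *      dx  *      dy
- *                           + r ) >> (2 * shift);
- *
- * i.e. plain bilinear interpolation with a caller-supplied rounding bias,
- * matching the per-instruction comments in the asm block. */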
-
-#if CONFIG_VIDEODSP
-#if HAVE_YASM
-#if ARCH_X86_32
-void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
- int stride, int h, int ox, int oy,
- int dxx, int dxy, int dyx, int dyy,
- int shift, int r, int width, int height)
-{
- gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
- width, height, &ff_emulated_edge_mc_8);
-}
-#endif
-void ff_gmc_sse(uint8_t *dst, uint8_t *src,
- int stride, int h, int ox, int oy,
- int dxx, int dxy, int dyx, int dyy,
- int shift, int r, int width, int height)
-{
- gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
- width, height, &ff_emulated_edge_mc_8);
-}
-#else
-void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
- int stride, int h, int ox, int oy,
- int dxx, int dxy, int dyx, int dyy,
- int shift, int r, int width, int height)
-{
- gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
- width, height, &ff_emulated_edge_mc_8);
-}
-#endif
-#endif
-
-#if CONFIG_DIRAC_DECODER
-#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
-void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
-{\
- if (h&3)\
- ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
- else\
- OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
-}\
-void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
-{\
- if (h&3)\
- ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
- else\
- OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
-}\
-void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
-{\
- if (h&3) {\
- ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
- } else {\
- OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
- OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
- }\
-}
-
-#if HAVE_MMX_INLINE
-PIXELS16(static, ff_avg, , , _mmxext)
-DIRAC_PIXOP(put, ff_put, mmx)
-DIRAC_PIXOP(avg, ff_avg, mmx)
-#endif
-
-#if HAVE_YASM
-DIRAC_PIXOP(avg, ff_avg, mmxext)
-
-void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
-{
- if (h&3)
- ff_put_dirac_pixels16_c(dst, src, stride, h);
- else
- ff_put_pixels16_sse2(dst, src[0], stride, h);
-}
-void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
-{
- if (h&3)
- ff_avg_dirac_pixels16_c(dst, src, stride, h);
- else
- ff_avg_pixels16_sse2(dst, src[0], stride, h);
-}
-void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
-{
- if (h&3) {
- ff_put_dirac_pixels32_c(dst, src, stride, h);
- } else {
- ff_put_pixels16_sse2(dst , src[0] , stride, h);
- ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
- }
-}
-void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
-{
- if (h&3) {
- ff_avg_dirac_pixels32_c(dst, src, stride, h);
- } else {
- ff_avg_pixels16_sse2(dst , src[0] , stride, h);
- ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
- }
-}
-#endif
-#endif
-
-void ff_vector_clipf_sse(float *dst, const float *src,
- float min, float max, int len)
-{
- x86_reg i = (len - 16) * 4;
- __asm__ volatile (
- "movss %3, %%xmm4 \n\t"
- "movss %4, %%xmm5 \n\t"
- "shufps $0, %%xmm4, %%xmm4 \n\t"
- "shufps $0, %%xmm5, %%xmm5 \n\t"
- "1: \n\t"
- "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
- "movaps 16(%2, %0), %%xmm1 \n\t"
- "movaps 32(%2, %0), %%xmm2 \n\t"
- "movaps 48(%2, %0), %%xmm3 \n\t"
- "maxps %%xmm4, %%xmm0 \n\t"
- "maxps %%xmm4, %%xmm1 \n\t"
- "maxps %%xmm4, %%xmm2 \n\t"
- "maxps %%xmm4, %%xmm3 \n\t"
- "minps %%xmm5, %%xmm0 \n\t"
- "minps %%xmm5, %%xmm1 \n\t"
- "minps %%xmm5, %%xmm2 \n\t"
- "minps %%xmm5, %%xmm3 \n\t"
- "movaps %%xmm0, (%1, %0) \n\t"
- "movaps %%xmm1, 16(%1, %0) \n\t"
- "movaps %%xmm2, 32(%1, %0) \n\t"
- "movaps %%xmm3, 48(%1, %0) \n\t"
- "sub $64, %0 \n\t"
- "jge 1b \n\t"
- : "+&r"(i)
- : "r"(dst), "r"(src), "m"(min), "m"(max)
- : "memory"
- );
-}
-
-#endif /* HAVE_INLINE_ASM */
diff --git a/ffmpeg/libavcodec/x86/dsputil_qns_template.c b/ffmpeg/libavcodec/x86/dsputil_qns_template.c
deleted file mode 100644
index bde6b0a..0000000
--- a/ffmpeg/libavcodec/x86/dsputil_qns_template.c
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * DSP utils : QNS functions are compiled 3 times for mmx/3dnow/ssse3
- * Copyright (c) 2004 Michael Niedermayer
- *
- * MMX optimization by Michael Niedermayer <michaelni@gmx.at>
- * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0))
-
-static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale)
-{
- x86_reg i=0;
-
- av_assert2(FFABS(scale) < MAX_ABS);
- scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
-
- SET_RND(mm6);
- __asm__ volatile(
- "pxor %%mm7, %%mm7 \n\t"
- "movd %4, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
- "paddw (%2, %0), %%mm0 \n\t"
- "paddw 8(%2, %0), %%mm1 \n\t"
- "psraw $6, %%mm0 \n\t"
- "psraw $6, %%mm1 \n\t"
- "pmullw (%3, %0), %%mm0 \n\t"
- "pmullw 8(%3, %0), %%mm1 \n\t"
- "pmaddwd %%mm0, %%mm0 \n\t"
- "pmaddwd %%mm1, %%mm1 \n\t"
- "paddd %%mm1, %%mm0 \n\t"
- "psrld $4, %%mm0 \n\t"
- "paddd %%mm0, %%mm7 \n\t"
- "add $16, %0 \n\t"
- "cmp $128, %0 \n\t" //FIXME optimize & bench
- " jb 1b \n\t"
- PHADDD(%%mm7, %%mm6)
- "psrld $2, %%mm7 \n\t"
- "movd %%mm7, %0 \n\t"
-
- : "+r" (i)
- : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
- );
- return i;
-}
-
-static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale)
-{
- x86_reg i=0;
-
- if(FFABS(scale) < MAX_ABS){
- scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
- SET_RND(mm6);
- __asm__ volatile(
- "movd %3, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
- "paddw (%2, %0), %%mm0 \n\t"
- "paddw 8(%2, %0), %%mm1 \n\t"
- "movq %%mm0, (%2, %0) \n\t"
- "movq %%mm1, 8(%2, %0) \n\t"
- "add $16, %0 \n\t"
- "cmp $128, %0 \n\t" // FIXME optimize & bench
- " jb 1b \n\t"
-
- : "+r" (i)
- : "r"(basis), "r"(rem), "g"(scale)
- );
- }else{
- for(i=0; i<8*8; i++){
- rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
- }
- }
-}
diff --git a/ffmpeg/libavcodec/x86/dsputilenc.asm b/ffmpeg/libavcodec/x86/dsputilenc.asm
deleted file mode 100644
index 1839bee..0000000
--- a/ffmpeg/libavcodec/x86/dsputilenc.asm
+++ /dev/null
@@ -1,487 +0,0 @@
-;*****************************************************************************
-;* MMX optimized DSP utils
-;*****************************************************************************
-;* Copyright (c) 2000, 2001 Fabrice Bellard
-;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;*****************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION .text
-
-%macro DIFF_PIXELS_1 4
- movh %1, %3
- movh %2, %4
- punpcklbw %2, %1
- punpcklbw %1, %1
- psubw %1, %2
-%endmacro
-
-; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
-; %6=temporary storage location
-; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
-%macro DIFF_PIXELS_8 6
- DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3]
- DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3]
- DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
- add %1, %5
- add %2, %5
- DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3]
- DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3]
- DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
- DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3]
-%ifdef m8
- DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
-%else
- mova [%6], m0
- DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
- mova m0, [%6]
-%endif
- sub %1, %5
- sub %2, %5
-%endmacro
-
-%macro HADAMARD8 0
- SUMSUB_BADC w, 0, 1, 2, 3
- SUMSUB_BADC w, 4, 5, 6, 7
- SUMSUB_BADC w, 0, 2, 1, 3
- SUMSUB_BADC w, 4, 6, 5, 7
- SUMSUB_BADC w, 0, 4, 1, 5
- SUMSUB_BADC w, 2, 6, 3, 7
-%endmacro
-
-%macro ABS1_SUM 3
- ABS1 %1, %2
- paddusw %3, %1
-%endmacro
-
-%macro ABS2_SUM 6
- ABS2 %1, %2, %3, %4
- paddusw %5, %1
- paddusw %6, %2
-%endmacro
-
-%macro ABS_SUM_8x8_64 1
- ABS2 m0, m1, m8, m9
- ABS2_SUM m2, m3, m8, m9, m0, m1
- ABS2_SUM m4, m5, m8, m9, m0, m1
- ABS2_SUM m6, m7, m8, m9, m0, m1
- paddusw m0, m1
-%endmacro
-
-%macro ABS_SUM_8x8_32 1
- mova [%1], m7
- ABS1 m0, m7
- ABS1 m1, m7
- ABS1_SUM m2, m7, m0
- ABS1_SUM m3, m7, m1
- ABS1_SUM m4, m7, m0
- ABS1_SUM m5, m7, m1
- ABS1_SUM m6, m7, m0
- mova m2, [%1]
- ABS1_SUM m2, m7, m1
- paddusw m0, m1
-%endmacro
-
-; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
-; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
-; and it's even more unlikely that no alternative mvs/modes with a lower cost exist.
-%macro HSUM 3
-%if cpuflag(sse2)
- movhlps %2, %1
- paddusw %1, %2
- pshuflw %2, %1, 0xE
- paddusw %1, %2
- pshuflw %2, %1, 0x1
- paddusw %1, %2
- movd %3, %1
-%elif cpuflag(mmxext)
- pshufw %2, %1, 0xE
- paddusw %1, %2
- pshufw %2, %1, 0x1
- paddusw %1, %2
- movd %3, %1
-%elif cpuflag(mmx)
- mova %2, %1
- psrlq %1, 32
- paddusw %1, %2
- mova %2, %1
- psrlq %1, 16
- paddusw %1, %2
- movd %3, %1
-%endif
-%endmacro
-
-%macro STORE4 5
- mova [%1+mmsize*0], %2
- mova [%1+mmsize*1], %3
- mova [%1+mmsize*2], %4
- mova [%1+mmsize*3], %5
-%endmacro
-
-%macro LOAD4 5
- mova %2, [%1+mmsize*0]
- mova %3, [%1+mmsize*1]
- mova %4, [%1+mmsize*2]
- mova %5, [%1+mmsize*3]
-%endmacro
-
-%macro hadamard8_16_wrapper 2
-cglobal hadamard8_diff, 4, 4, %1
-%ifndef m8
- %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
- SUB rsp, pad
-%endif
- call hadamard8x8_diff %+ SUFFIX
-%ifndef m8
- ADD rsp, pad
-%endif
- RET
-
-cglobal hadamard8_diff16, 5, 6, %1
-%ifndef m8
- %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
- SUB rsp, pad
-%endif
-
- call hadamard8x8_diff %+ SUFFIX
- mov r5d, eax
-
- add r1, 8
- add r2, 8
- call hadamard8x8_diff %+ SUFFIX
- add r5d, eax
-
- cmp r4d, 16
- jne .done
-
- lea r1, [r1+r3*8-8]
- lea r2, [r2+r3*8-8]
- call hadamard8x8_diff %+ SUFFIX
- add r5d, eax
-
- add r1, 8
- add r2, 8
- call hadamard8x8_diff %+ SUFFIX
- add r5d, eax
-
-.done:
- mov eax, r5d
-%ifndef m8
- ADD rsp, pad
-%endif
- RET
-%endmacro
-
-%macro HADAMARD8_DIFF 0-1
-%if cpuflag(sse2)
-hadamard8x8_diff %+ SUFFIX:
- lea r0, [r3*3]
- DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize
- HADAMARD8
-%if ARCH_X86_64
- TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
-%else
- TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
-%endif
- HADAMARD8
- ABS_SUM_8x8 rsp+gprsize
- HSUM m0, m1, eax
- and eax, 0xFFFF
- ret
-
-hadamard8_16_wrapper %1, 3
-%elif cpuflag(mmx)
-ALIGN 16
-; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
-; int stride, int h)
-; r0 = void *s = unused, int h = unused (always 8)
-; note how r1, r2 and r3 are not clobbered in this function, so the 16x16
-; version can simply call this 2x2 times (and that's why we access rsp+gprsize
-; everywhere, which is the rsp of the calling function)
-hadamard8x8_diff %+ SUFFIX:
- lea r0, [r3*3]
-
- ; first 4x8 pixels
- DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60
- HADAMARD8
- mova [rsp+gprsize+0x60], m7
- TRANSPOSE4x4W 0, 1, 2, 3, 7
- STORE4 rsp+gprsize, m0, m1, m2, m3
- mova m7, [rsp+gprsize+0x60]
- TRANSPOSE4x4W 4, 5, 6, 7, 0
- STORE4 rsp+gprsize+0x40, m4, m5, m6, m7
-
- ; second 4x8 pixels
- DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60
- HADAMARD8
- mova [rsp+gprsize+0x60], m7
- TRANSPOSE4x4W 0, 1, 2, 3, 7
- STORE4 rsp+gprsize+0x20, m0, m1, m2, m3
- mova m7, [rsp+gprsize+0x60]
- TRANSPOSE4x4W 4, 5, 6, 7, 0
-
- LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3
- HADAMARD8
- ABS_SUM_8x8_32 rsp+gprsize+0x60
- mova [rsp+gprsize+0x60], m0
-
- LOAD4 rsp+gprsize , m0, m1, m2, m3
- LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7
- HADAMARD8
- ABS_SUM_8x8_32 rsp+gprsize
- paddusw m0, [rsp+gprsize+0x60]
-
- HSUM m0, m1, eax
- and rax, 0xFFFF
- ret
-
-hadamard8_16_wrapper 0, 14
-%endif
-%endmacro
-
-INIT_MMX mmx
-HADAMARD8_DIFF
-
-INIT_MMX mmxext
-HADAMARD8_DIFF
-
-INIT_XMM sse2
-%if ARCH_X86_64
-%define ABS_SUM_8x8 ABS_SUM_8x8_64
-%else
-%define ABS_SUM_8x8 ABS_SUM_8x8_32
-%endif
-HADAMARD8_DIFF 10
-
-INIT_XMM ssse3
-%define ABS_SUM_8x8 ABS_SUM_8x8_64
-HADAMARD8_DIFF 9
-
-INIT_XMM sse2
-; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
-cglobal sse16, 5, 5, 8
- shr r4d, 1
- pxor m0, m0 ; mm0 = 0
- pxor m7, m7 ; mm7 holds the sum
-
-.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
- movu m1, [r1 ] ; mm1 = pix1[0][0-15]
- movu m2, [r2 ] ; mm2 = pix2[0][0-15]
- movu m3, [r1+r3] ; mm3 = pix1[1][0-15]
- movu m4, [r2+r3] ; mm4 = pix2[1][0-15]
-
- ; todo: mm1-mm2, mm3-mm4
- ; algo: subtract mm1 from mm2 with saturation and vice versa
- ; OR the result to get the absolute difference
- mova m5, m1
- mova m6, m3
- psubusb m1, m2
- psubusb m3, m4
- psubusb m2, m5
- psubusb m4, m6
-
- por m2, m1
- por m4, m3
-
- ; now convert to 16-bit vectors so we can square them
- mova m1, m2
- mova m3, m4
-
- punpckhbw m2, m0
- punpckhbw m4, m0
-    punpcklbw    m1, m0 ; mm1 now spread over (mm1,mm2)
-    punpcklbw    m3, m0 ; mm4 now spread over (mm3,mm4)
-
- pmaddwd m2, m2
- pmaddwd m4, m4
- pmaddwd m1, m1
- pmaddwd m3, m3
-
- lea r1, [r1+r3*2] ; pix1 += 2*line_size
- lea r2, [r2+r3*2] ; pix2 += 2*line_size
-
- paddd m1, m2
- paddd m3, m4
- paddd m7, m1
- paddd m7, m3
-
- dec r4
- jnz .next2lines
-
- mova m1, m7
- psrldq m7, 8 ; shift hi qword to lo
- paddd m7, m1
- mova m1, m7
- psrldq m7, 4 ; shift hi dword to lo
- paddd m7, m1
- movd eax, m7 ; return value
- RET
-
-INIT_MMX mmx
-; get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
-cglobal get_pixels, 3,4
- movsxdifnidn r2, r2d
- add r0, 128
- mov r3, -128
- pxor m7, m7
-.loop:
- mova m0, [r1]
- mova m2, [r1+r2]
- mova m1, m0
- mova m3, m2
- punpcklbw m0, m7
- punpckhbw m1, m7
- punpcklbw m2, m7
- punpckhbw m3, m7
- mova [r0+r3+ 0], m0
- mova [r0+r3+ 8], m1
- mova [r0+r3+16], m2
- mova [r0+r3+24], m3
- lea r1, [r1+r2*2]
- add r3, 32
- js .loop
- REP_RET
-
-INIT_XMM sse2
-cglobal get_pixels, 3, 4
- movsxdifnidn r2, r2d
- lea r3, [r2*3]
- pxor m4, m4
- movh m0, [r1]
- movh m1, [r1+r2]
- movh m2, [r1+r2*2]
- movh m3, [r1+r3]
- lea r1, [r1+r2*4]
- punpcklbw m0, m4
- punpcklbw m1, m4
- punpcklbw m2, m4
- punpcklbw m3, m4
- mova [r0], m0
- mova [r0+0x10], m1
- mova [r0+0x20], m2
- mova [r0+0x30], m3
- movh m0, [r1]
- movh m1, [r1+r2*1]
- movh m2, [r1+r2*2]
- movh m3, [r1+r3]
- punpcklbw m0, m4
- punpcklbw m1, m4
- punpcklbw m2, m4
- punpcklbw m3, m4
- mova [r0+0x40], m0
- mova [r0+0x50], m1
- mova [r0+0x60], m2
- mova [r0+0x70], m3
- RET
-
-INIT_MMX mmx
-; diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride)
-cglobal diff_pixels, 4,5
- movsxdifnidn r3, r3d
- pxor m7, m7
- add r0, 128
- mov r4, -128
-.loop:
- mova m0, [r1]
- mova m2, [r2]
- mova m1, m0
- mova m3, m2
- punpcklbw m0, m7
- punpckhbw m1, m7
- punpcklbw m2, m7
- punpckhbw m3, m7
- psubw m0, m2
- psubw m1, m3
- mova [r0+r4+0], m0
- mova [r0+r4+8], m1
- add r1, r3
- add r2, r3
- add r4, 16
- jne .loop
- REP_RET
-
-INIT_MMX mmx
-; pix_sum16_mmx(uint8_t * pix, int line_size)
-cglobal pix_sum16, 2, 3
- movsxdifnidn r1, r1d
- mov r2, r1
- neg r2
- shl r2, 4
- sub r0, r2
- pxor m7, m7
- pxor m6, m6
-.loop:
- mova m0, [r0+r2+0]
- mova m1, [r0+r2+0]
- mova m2, [r0+r2+8]
- mova m3, [r0+r2+8]
- punpcklbw m0, m7
- punpckhbw m1, m7
- punpcklbw m2, m7
- punpckhbw m3, m7
- paddw m1, m0
- paddw m3, m2
- paddw m3, m1
- paddw m6, m3
- add r2, r1
- js .loop
- mova m5, m6
- psrlq m6, 32
- paddw m6, m5
- mova m5, m6
- psrlq m6, 16
- paddw m6, m5
- movd eax, m6
- and eax, 0xffff
- RET
-
-INIT_MMX mmx
-; pix_norm1_mmx(uint8_t *pix, int line_size)
-cglobal pix_norm1, 2, 4
- movsxdifnidn r1, r1d
- mov r2, 16
- pxor m0, m0
- pxor m7, m7
-.loop:
- mova m2, [r0+0]
- mova m3, [r0+8]
- mova m1, m2
- punpckhbw m1, m0
- punpcklbw m2, m0
- mova m4, m3
- punpckhbw m3, m0
- punpcklbw m4, m0
- pmaddwd m1, m1
- pmaddwd m2, m2
- pmaddwd m3, m3
- pmaddwd m4, m4
- paddd m2, m1
- paddd m4, m3
- paddd m7, m2
- add r0, r1
- paddd m7, m4
- dec r2
- jne .loop
- mova m1, m7
- psrlq m7, 32
- paddd m1, m7
- movd eax, m1
- RET
-
diff --git a/ffmpeg/libavcodec/x86/dsputilenc_mmx.c b/ffmpeg/libavcodec/x86/dsputilenc_mmx.c
deleted file mode 100644
index 5de8ade..0000000
--- a/ffmpeg/libavcodec/x86/dsputilenc_mmx.c
+++ /dev/null
@@ -1,1061 +0,0 @@
-/*
- * MMX optimized DSP utils
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/dct.h"
-#include "libavcodec/dsputil.h"
-#include "libavcodec/mpegvideo.h"
-#include "libavcodec/mathops.h"
-#include "dsputil_x86.h"
-
-void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
-void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
-void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride);
-int ff_pix_sum16_mmx(uint8_t * pix, int line_size);
-int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
-
-#if HAVE_INLINE_ASM
-
-static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
- int tmp;
- __asm__ volatile (
- "movl %4,%%ecx\n"
- "shr $1,%%ecx\n"
- "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
- "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
- "1:\n"
- "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
- "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
- "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
- "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
-
- /* todo: mm1-mm2, mm3-mm4 */
- /* algo: subtract mm1 from mm2 with saturation and vice versa */
- /* OR the results to get absolute difference */
- "movq %%mm1,%%mm5\n"
- "movq %%mm3,%%mm6\n"
- "psubusb %%mm2,%%mm1\n"
- "psubusb %%mm4,%%mm3\n"
- "psubusb %%mm5,%%mm2\n"
- "psubusb %%mm6,%%mm4\n"
-
- "por %%mm1,%%mm2\n"
- "por %%mm3,%%mm4\n"
-
- /* now convert to 16-bit vectors so we can square them */
- "movq %%mm2,%%mm1\n"
- "movq %%mm4,%%mm3\n"
-
- "punpckhbw %%mm0,%%mm2\n"
- "punpckhbw %%mm0,%%mm4\n"
- "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
- "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
-
- "pmaddwd %%mm2,%%mm2\n"
- "pmaddwd %%mm4,%%mm4\n"
- "pmaddwd %%mm1,%%mm1\n"
- "pmaddwd %%mm3,%%mm3\n"
-
- "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
- "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
-
- "paddd %%mm2,%%mm1\n"
- "paddd %%mm4,%%mm3\n"
- "paddd %%mm1,%%mm7\n"
- "paddd %%mm3,%%mm7\n"
-
- "decl %%ecx\n"
- "jnz 1b\n"
-
- "movq %%mm7,%%mm1\n"
- "psrlq $32, %%mm7\n" /* shift hi dword to lo */
- "paddd %%mm7,%%mm1\n"
- "movd %%mm1,%2\n"
- : "+r" (pix1), "+r" (pix2), "=r"(tmp)
- : "r" ((x86_reg)line_size) , "m" (h)
- : "%ecx");
- return tmp;
-}
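-
-/* Scalar sketch (illustration only) of the absolute-difference trick used in
- * the loop above: for unsigned bytes, |a - b| is obtained by ORing the two
- * saturating subtractions, which is what the psubusb/por pairs implement. */
-static inline int abs_diff_u8_ref(uint8_t a, uint8_t b)
-{
-    uint8_t d1 = a > b ? a - b : 0; /* psubusb: saturates at 0 */
-    uint8_t d2 = b > a ? b - a : 0;
-    return d1 | d2;                 /* one of the two is always 0 */
-}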
-
-static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
- int tmp;
- __asm__ volatile (
- "movl %4,%%ecx\n"
- "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
- "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
- "1:\n"
- "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
- "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
- "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
- "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
-
- /* todo: mm1-mm2, mm3-mm4 */
- /* algo: subtract mm1 from mm2 with saturation and vice versa */
- /* OR the results to get absolute difference */
- "movq %%mm1,%%mm5\n"
- "movq %%mm3,%%mm6\n"
- "psubusb %%mm2,%%mm1\n"
- "psubusb %%mm4,%%mm3\n"
- "psubusb %%mm5,%%mm2\n"
- "psubusb %%mm6,%%mm4\n"
-
- "por %%mm1,%%mm2\n"
- "por %%mm3,%%mm4\n"
-
- /* now convert to 16-bit vectors so we can square them */
- "movq %%mm2,%%mm1\n"
- "movq %%mm4,%%mm3\n"
-
- "punpckhbw %%mm0,%%mm2\n"
- "punpckhbw %%mm0,%%mm4\n"
- "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
- "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
-
- "pmaddwd %%mm2,%%mm2\n"
- "pmaddwd %%mm4,%%mm4\n"
- "pmaddwd %%mm1,%%mm1\n"
- "pmaddwd %%mm3,%%mm3\n"
-
- "add %3,%0\n"
- "add %3,%1\n"
-
- "paddd %%mm2,%%mm1\n"
- "paddd %%mm4,%%mm3\n"
- "paddd %%mm1,%%mm7\n"
- "paddd %%mm3,%%mm7\n"
-
- "decl %%ecx\n"
- "jnz 1b\n"
-
- "movq %%mm7,%%mm1\n"
- "psrlq $32, %%mm7\n" /* shift hi dword to lo */
- "paddd %%mm7,%%mm1\n"
- "movd %%mm1,%2\n"
- : "+r" (pix1), "+r" (pix2), "=r"(tmp)
- : "r" ((x86_reg)line_size) , "m" (h)
- : "%ecx");
- return tmp;
-}
-
-static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
- int tmp;
- __asm__ volatile (
- "movl %3,%%ecx\n"
- "pxor %%mm7,%%mm7\n"
- "pxor %%mm6,%%mm6\n"
-
- "movq (%0),%%mm0\n"
- "movq %%mm0, %%mm1\n"
- "psllq $8, %%mm0\n"
- "psrlq $8, %%mm1\n"
- "psrlq $8, %%mm0\n"
- "movq %%mm0, %%mm2\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7,%%mm0\n"
- "punpcklbw %%mm7,%%mm1\n"
- "punpckhbw %%mm7,%%mm2\n"
- "punpckhbw %%mm7,%%mm3\n"
- "psubw %%mm1, %%mm0\n"
- "psubw %%mm3, %%mm2\n"
-
- "add %2,%0\n"
-
- "movq (%0),%%mm4\n"
- "movq %%mm4, %%mm1\n"
- "psllq $8, %%mm4\n"
- "psrlq $8, %%mm1\n"
- "psrlq $8, %%mm4\n"
- "movq %%mm4, %%mm5\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7,%%mm4\n"
- "punpcklbw %%mm7,%%mm1\n"
- "punpckhbw %%mm7,%%mm5\n"
- "punpckhbw %%mm7,%%mm3\n"
- "psubw %%mm1, %%mm4\n"
- "psubw %%mm3, %%mm5\n"
- "psubw %%mm4, %%mm0\n"
- "psubw %%mm5, %%mm2\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm0, %%mm3\n\t"
- "pcmpgtw %%mm2, %%mm1\n\t"
- "pxor %%mm3, %%mm0\n"
- "pxor %%mm1, %%mm2\n"
- "psubw %%mm3, %%mm0\n"
- "psubw %%mm1, %%mm2\n"
- "paddw %%mm0, %%mm2\n"
- "paddw %%mm2, %%mm6\n"
-
- "add %2,%0\n"
- "1:\n"
-
- "movq (%0),%%mm0\n"
- "movq %%mm0, %%mm1\n"
- "psllq $8, %%mm0\n"
- "psrlq $8, %%mm1\n"
- "psrlq $8, %%mm0\n"
- "movq %%mm0, %%mm2\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7,%%mm0\n"
- "punpcklbw %%mm7,%%mm1\n"
- "punpckhbw %%mm7,%%mm2\n"
- "punpckhbw %%mm7,%%mm3\n"
- "psubw %%mm1, %%mm0\n"
- "psubw %%mm3, %%mm2\n"
- "psubw %%mm0, %%mm4\n"
- "psubw %%mm2, %%mm5\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm4, %%mm3\n\t"
- "pcmpgtw %%mm5, %%mm1\n\t"
- "pxor %%mm3, %%mm4\n"
- "pxor %%mm1, %%mm5\n"
- "psubw %%mm3, %%mm4\n"
- "psubw %%mm1, %%mm5\n"
- "paddw %%mm4, %%mm5\n"
- "paddw %%mm5, %%mm6\n"
-
- "add %2,%0\n"
-
- "movq (%0),%%mm4\n"
- "movq %%mm4, %%mm1\n"
- "psllq $8, %%mm4\n"
- "psrlq $8, %%mm1\n"
- "psrlq $8, %%mm4\n"
- "movq %%mm4, %%mm5\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7,%%mm4\n"
- "punpcklbw %%mm7,%%mm1\n"
- "punpckhbw %%mm7,%%mm5\n"
- "punpckhbw %%mm7,%%mm3\n"
- "psubw %%mm1, %%mm4\n"
- "psubw %%mm3, %%mm5\n"
- "psubw %%mm4, %%mm0\n"
- "psubw %%mm5, %%mm2\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm0, %%mm3\n\t"
- "pcmpgtw %%mm2, %%mm1\n\t"
- "pxor %%mm3, %%mm0\n"
- "pxor %%mm1, %%mm2\n"
- "psubw %%mm3, %%mm0\n"
- "psubw %%mm1, %%mm2\n"
- "paddw %%mm0, %%mm2\n"
- "paddw %%mm2, %%mm6\n"
-
- "add %2,%0\n"
- "subl $2, %%ecx\n"
- " jnz 1b\n"
-
- "movq %%mm6, %%mm0\n"
- "punpcklwd %%mm7,%%mm0\n"
- "punpckhwd %%mm7,%%mm6\n"
- "paddd %%mm0, %%mm6\n"
-
- "movq %%mm6,%%mm0\n"
- "psrlq $32, %%mm6\n"
- "paddd %%mm6,%%mm0\n"
- "movd %%mm0,%1\n"
- : "+r" (pix1), "=r"(tmp)
- : "r" ((x86_reg)line_size) , "g" (h-2)
- : "%ecx");
- return tmp;
-}
-
-static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
- int tmp;
- uint8_t * pix= pix1;
- __asm__ volatile (
- "movl %3,%%ecx\n"
- "pxor %%mm7,%%mm7\n"
- "pxor %%mm6,%%mm6\n"
-
- "movq (%0),%%mm0\n"
- "movq 1(%0),%%mm1\n"
- "movq %%mm0, %%mm2\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7,%%mm0\n"
- "punpcklbw %%mm7,%%mm1\n"
- "punpckhbw %%mm7,%%mm2\n"
- "punpckhbw %%mm7,%%mm3\n"
- "psubw %%mm1, %%mm0\n"
- "psubw %%mm3, %%mm2\n"
-
- "add %2,%0\n"
-
- "movq (%0),%%mm4\n"
- "movq 1(%0),%%mm1\n"
- "movq %%mm4, %%mm5\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7,%%mm4\n"
- "punpcklbw %%mm7,%%mm1\n"
- "punpckhbw %%mm7,%%mm5\n"
- "punpckhbw %%mm7,%%mm3\n"
- "psubw %%mm1, %%mm4\n"
- "psubw %%mm3, %%mm5\n"
- "psubw %%mm4, %%mm0\n"
- "psubw %%mm5, %%mm2\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm0, %%mm3\n\t"
- "pcmpgtw %%mm2, %%mm1\n\t"
- "pxor %%mm3, %%mm0\n"
- "pxor %%mm1, %%mm2\n"
- "psubw %%mm3, %%mm0\n"
- "psubw %%mm1, %%mm2\n"
- "paddw %%mm0, %%mm2\n"
- "paddw %%mm2, %%mm6\n"
-
- "add %2,%0\n"
- "1:\n"
-
- "movq (%0),%%mm0\n"
- "movq 1(%0),%%mm1\n"
- "movq %%mm0, %%mm2\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7,%%mm0\n"
- "punpcklbw %%mm7,%%mm1\n"
- "punpckhbw %%mm7,%%mm2\n"
- "punpckhbw %%mm7,%%mm3\n"
- "psubw %%mm1, %%mm0\n"
- "psubw %%mm3, %%mm2\n"
- "psubw %%mm0, %%mm4\n"
- "psubw %%mm2, %%mm5\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm4, %%mm3\n\t"
- "pcmpgtw %%mm5, %%mm1\n\t"
- "pxor %%mm3, %%mm4\n"
- "pxor %%mm1, %%mm5\n"
- "psubw %%mm3, %%mm4\n"
- "psubw %%mm1, %%mm5\n"
- "paddw %%mm4, %%mm5\n"
- "paddw %%mm5, %%mm6\n"
-
- "add %2,%0\n"
-
- "movq (%0),%%mm4\n"
- "movq 1(%0),%%mm1\n"
- "movq %%mm4, %%mm5\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7,%%mm4\n"
- "punpcklbw %%mm7,%%mm1\n"
- "punpckhbw %%mm7,%%mm5\n"
- "punpckhbw %%mm7,%%mm3\n"
- "psubw %%mm1, %%mm4\n"
- "psubw %%mm3, %%mm5\n"
- "psubw %%mm4, %%mm0\n"
- "psubw %%mm5, %%mm2\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm0, %%mm3\n\t"
- "pcmpgtw %%mm2, %%mm1\n\t"
- "pxor %%mm3, %%mm0\n"
- "pxor %%mm1, %%mm2\n"
- "psubw %%mm3, %%mm0\n"
- "psubw %%mm1, %%mm2\n"
- "paddw %%mm0, %%mm2\n"
- "paddw %%mm2, %%mm6\n"
-
- "add %2,%0\n"
- "subl $2, %%ecx\n"
- " jnz 1b\n"
-
- "movq %%mm6, %%mm0\n"
- "punpcklwd %%mm7,%%mm0\n"
- "punpckhwd %%mm7,%%mm6\n"
- "paddd %%mm0, %%mm6\n"
-
- "movq %%mm6,%%mm0\n"
- "psrlq $32, %%mm6\n"
- "paddd %%mm6,%%mm0\n"
- "movd %%mm0,%1\n"
- : "+r" (pix1), "=r"(tmp)
- : "r" ((x86_reg)line_size) , "g" (h-2)
- : "%ecx");
- return tmp + hf_noise8_mmx(pix+8, line_size, h);
-}
-
-static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
- MpegEncContext *c = p;
- int score1, score2;
-
- if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
- else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
- score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
-
- if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
- else return score1 + FFABS(score2)*8;
-}
-
-static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
- MpegEncContext *c = p;
- int score1= sse8_mmx(c, pix1, pix2, line_size, h);
- int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
-
- if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
- else return score1 + FFABS(score2)*8;
-}
-
-static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
- int tmp;
-
- av_assert2( (((int)pix) & 7) == 0);
- av_assert2((line_size &7) ==0);
-
-#define SUM(in0, in1, out0, out1) \
- "movq (%0), %%mm2\n"\
- "movq 8(%0), %%mm3\n"\
- "add %2,%0\n"\
- "movq %%mm2, " #out0 "\n"\
- "movq %%mm3, " #out1 "\n"\
- "psubusb " #in0 ", %%mm2\n"\
- "psubusb " #in1 ", %%mm3\n"\
- "psubusb " #out0 ", " #in0 "\n"\
- "psubusb " #out1 ", " #in1 "\n"\
- "por %%mm2, " #in0 "\n"\
- "por %%mm3, " #in1 "\n"\
- "movq " #in0 ", %%mm2\n"\
- "movq " #in1 ", %%mm3\n"\
- "punpcklbw %%mm7, " #in0 "\n"\
- "punpcklbw %%mm7, " #in1 "\n"\
- "punpckhbw %%mm7, %%mm2\n"\
- "punpckhbw %%mm7, %%mm3\n"\
- "paddw " #in1 ", " #in0 "\n"\
- "paddw %%mm3, %%mm2\n"\
- "paddw %%mm2, " #in0 "\n"\
- "paddw " #in0 ", %%mm6\n"
-
-
- __asm__ volatile (
- "movl %3,%%ecx\n"
- "pxor %%mm6,%%mm6\n"
- "pxor %%mm7,%%mm7\n"
- "movq (%0),%%mm0\n"
- "movq 8(%0),%%mm1\n"
- "add %2,%0\n"
- "jmp 2f\n"
- "1:\n"
-
- SUM(%%mm4, %%mm5, %%mm0, %%mm1)
- "2:\n"
- SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
- "subl $2, %%ecx\n"
- "jnz 1b\n"
-
- "movq %%mm6,%%mm0\n"
- "psrlq $32, %%mm6\n"
- "paddw %%mm6,%%mm0\n"
- "movq %%mm0,%%mm6\n"
- "psrlq $16, %%mm0\n"
- "paddw %%mm6,%%mm0\n"
- "movd %%mm0,%1\n"
- : "+r" (pix), "=r"(tmp)
- : "r" ((x86_reg)line_size) , "m" (h)
- : "%ecx");
- return tmp & 0xFFFF;
-}
-#undef SUM
-
-static int vsad_intra16_mmxext(void *v, uint8_t *pix, uint8_t *dummy,
- int line_size, int h)
-{
- int tmp;
-
- av_assert2( (((int)pix) & 7) == 0);
- av_assert2((line_size &7) ==0);
-
-#define SUM(in0, in1, out0, out1) \
- "movq (%0), " #out0 "\n"\
- "movq 8(%0), " #out1 "\n"\
- "add %2,%0\n"\
- "psadbw " #out0 ", " #in0 "\n"\
- "psadbw " #out1 ", " #in1 "\n"\
- "paddw " #in1 ", " #in0 "\n"\
- "paddw " #in0 ", %%mm6\n"
-
- __asm__ volatile (
- "movl %3,%%ecx\n"
- "pxor %%mm6,%%mm6\n"
- "pxor %%mm7,%%mm7\n"
- "movq (%0),%%mm0\n"
- "movq 8(%0),%%mm1\n"
- "add %2,%0\n"
- "jmp 2f\n"
- "1:\n"
-
- SUM(%%mm4, %%mm5, %%mm0, %%mm1)
- "2:\n"
- SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
- "subl $2, %%ecx\n"
- "jnz 1b\n"
-
- "movd %%mm6,%1\n"
- : "+r" (pix), "=r"(tmp)
- : "r" ((x86_reg)line_size) , "m" (h)
- : "%ecx");
- return tmp;
-}
-#undef SUM
-
-static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
- int tmp;
-
- av_assert2( (((int)pix1) & 7) == 0);
- av_assert2( (((int)pix2) & 7) == 0);
- av_assert2((line_size &7) ==0);
-
-#define SUM(in0, in1, out0, out1) \
- "movq (%0),%%mm2\n"\
- "movq (%1)," #out0 "\n"\
- "movq 8(%0),%%mm3\n"\
- "movq 8(%1)," #out1 "\n"\
- "add %3,%0\n"\
- "add %3,%1\n"\
- "psubb " #out0 ", %%mm2\n"\
- "psubb " #out1 ", %%mm3\n"\
- "pxor %%mm7, %%mm2\n"\
- "pxor %%mm7, %%mm3\n"\
- "movq %%mm2, " #out0 "\n"\
- "movq %%mm3, " #out1 "\n"\
- "psubusb " #in0 ", %%mm2\n"\
- "psubusb " #in1 ", %%mm3\n"\
- "psubusb " #out0 ", " #in0 "\n"\
- "psubusb " #out1 ", " #in1 "\n"\
- "por %%mm2, " #in0 "\n"\
- "por %%mm3, " #in1 "\n"\
- "movq " #in0 ", %%mm2\n"\
- "movq " #in1 ", %%mm3\n"\
- "punpcklbw %%mm7, " #in0 "\n"\
- "punpcklbw %%mm7, " #in1 "\n"\
- "punpckhbw %%mm7, %%mm2\n"\
- "punpckhbw %%mm7, %%mm3\n"\
- "paddw " #in1 ", " #in0 "\n"\
- "paddw %%mm3, %%mm2\n"\
- "paddw %%mm2, " #in0 "\n"\
- "paddw " #in0 ", %%mm6\n"
-
-
- __asm__ volatile (
- "movl %4,%%ecx\n"
- "pxor %%mm6,%%mm6\n"
- "pcmpeqw %%mm7,%%mm7\n"
- "psllw $15, %%mm7\n"
- "packsswb %%mm7, %%mm7\n"
- "movq (%0),%%mm0\n"
- "movq (%1),%%mm2\n"
- "movq 8(%0),%%mm1\n"
- "movq 8(%1),%%mm3\n"
- "add %3,%0\n"
- "add %3,%1\n"
- "psubb %%mm2, %%mm0\n"
- "psubb %%mm3, %%mm1\n"
- "pxor %%mm7, %%mm0\n"
- "pxor %%mm7, %%mm1\n"
- "jmp 2f\n"
- "1:\n"
-
- SUM(%%mm4, %%mm5, %%mm0, %%mm1)
- "2:\n"
- SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
- "subl $2, %%ecx\n"
- "jnz 1b\n"
-
- "movq %%mm6,%%mm0\n"
- "psrlq $32, %%mm6\n"
- "paddw %%mm6,%%mm0\n"
- "movq %%mm0,%%mm6\n"
- "psrlq $16, %%mm0\n"
- "paddw %%mm6,%%mm0\n"
- "movd %%mm0,%2\n"
- : "+r" (pix1), "+r" (pix2), "=r"(tmp)
- : "r" ((x86_reg)line_size) , "m" (h)
- : "%ecx");
- return tmp & 0x7FFF;
-}
-#undef SUM
-
-static int vsad16_mmxext(void *v, uint8_t *pix1, uint8_t *pix2,
- int line_size, int h)
-{
- int tmp;
-
- av_assert2( (((int)pix1) & 7) == 0);
- av_assert2( (((int)pix2) & 7) == 0);
- av_assert2((line_size &7) ==0);
-
-#define SUM(in0, in1, out0, out1) \
- "movq (%0)," #out0 "\n"\
- "movq (%1),%%mm2\n"\
- "movq 8(%0)," #out1 "\n"\
- "movq 8(%1),%%mm3\n"\
- "add %3,%0\n"\
- "add %3,%1\n"\
- "psubb %%mm2, " #out0 "\n"\
- "psubb %%mm3, " #out1 "\n"\
- "pxor %%mm7, " #out0 "\n"\
- "pxor %%mm7, " #out1 "\n"\
- "psadbw " #out0 ", " #in0 "\n"\
- "psadbw " #out1 ", " #in1 "\n"\
- "paddw " #in1 ", " #in0 "\n"\
- "paddw " #in0 ", %%mm6\n"
-
- __asm__ volatile (
- "movl %4,%%ecx\n"
- "pxor %%mm6,%%mm6\n"
- "pcmpeqw %%mm7,%%mm7\n"
- "psllw $15, %%mm7\n"
- "packsswb %%mm7, %%mm7\n"
- "movq (%0),%%mm0\n"
- "movq (%1),%%mm2\n"
- "movq 8(%0),%%mm1\n"
- "movq 8(%1),%%mm3\n"
- "add %3,%0\n"
- "add %3,%1\n"
- "psubb %%mm2, %%mm0\n"
- "psubb %%mm3, %%mm1\n"
- "pxor %%mm7, %%mm0\n"
- "pxor %%mm7, %%mm1\n"
- "jmp 2f\n"
- "1:\n"
-
- SUM(%%mm4, %%mm5, %%mm0, %%mm1)
- "2:\n"
- SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
- "subl $2, %%ecx\n"
- "jnz 1b\n"
-
- "movd %%mm6,%2\n"
- : "+r" (pix1), "+r" (pix2), "=r"(tmp)
- : "r" ((x86_reg)line_size) , "m" (h)
- : "%ecx");
- return tmp;
-}
-#undef SUM
-
-static void diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
- x86_reg i=0;
- if(w>=16)
- __asm__ volatile(
- "1: \n\t"
- "movq (%2, %0), %%mm0 \n\t"
- "movq (%1, %0), %%mm1 \n\t"
- "psubb %%mm0, %%mm1 \n\t"
- "movq %%mm1, (%3, %0) \n\t"
- "movq 8(%2, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- "psubb %%mm0, %%mm1 \n\t"
- "movq %%mm1, 8(%3, %0) \n\t"
- "add $16, %0 \n\t"
- "cmp %4, %0 \n\t"
- " jb 1b \n\t"
- : "+r" (i)
- : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
- );
- for(; i<w; i++)
- dst[i+0] = src1[i+0]-src2[i+0];
-}
-
-static void sub_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *src1,
- const uint8_t *src2, int w,
- int *left, int *left_top)
-{
- x86_reg i=0;
- uint8_t l, lt;
-
- __asm__ volatile(
- "movq (%1, %0), %%mm0 \n\t" // LT
- "psllq $8, %%mm0 \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm1 \n\t" // T
- "movq -1(%2, %0), %%mm2 \n\t" // L
- "movq (%2, %0), %%mm3 \n\t" // X
- "movq %%mm2, %%mm4 \n\t" // L
- "psubb %%mm0, %%mm2 \n\t"
- "paddb %%mm1, %%mm2 \n\t" // L + T - LT
- "movq %%mm4, %%mm5 \n\t" // L
- "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
- "pminub %%mm5, %%mm1 \n\t" // min(T, L)
- "pminub %%mm2, %%mm4 \n\t"
- "pmaxub %%mm1, %%mm4 \n\t"
- "psubb %%mm4, %%mm3 \n\t" // dst - pred
- "movq %%mm3, (%3, %0) \n\t"
- "add $8, %0 \n\t"
- "movq -1(%1, %0), %%mm0 \n\t" // LT
- "cmp %4, %0 \n\t"
- " jb 1b \n\t"
- : "+r" (i)
- : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
- );
-
- l= *left;
- lt= *left_top;
-
- dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
-
- *left_top= src1[w-1];
- *left = src2[w-1];
-}
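-
-/* The pmaxub/pminub sequence above evaluates the HuffYUV median predictor
- * without branches, using median(a, b, c) = max(min(a, b), min(max(a, b), c)).
- * Scalar sketch (illustration only; a, b and c stand for the left, top and
- * left + top - topleft predictors): */
-static inline uint8_t median3_u8_ref(uint8_t a, uint8_t b, uint8_t c)
-{
-    uint8_t lo = a < b ? a : b;   /* pminub */
-    uint8_t hi = a > b ? a : b;   /* pmaxub */
-    uint8_t m  = hi < c ? hi : c; /* pminub with the third predictor */
-    return lo > m ? lo : m;       /* pmaxub */
-}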
-
-#define MMABS_MMX(a,z)\
- "pxor " #z ", " #z " \n\t"\
- "pcmpgtw " #a ", " #z " \n\t"\
- "pxor " #z ", " #a " \n\t"\
- "psubw " #z ", " #a " \n\t"
-
-#define MMABS_MMXEXT(a, z) \
- "pxor " #z ", " #z " \n\t"\
- "psubw " #a ", " #z " \n\t"\
- "pmaxsw " #z ", " #a " \n\t"
-
-#define MMABS_SSSE3(a,z)\
- "pabsw " #a ", " #a " \n\t"
-
-#define MMABS_SUM(a,z, sum)\
- MMABS(a,z)\
- "paddusw " #a ", " #sum " \n\t"
-
-/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
- * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
- * and it's even more unlikely that no alternative mvs/modes with a lower cost exist. */
-#define HSUM_MMX(a, t, dst)\
- "movq "#a", "#t" \n\t"\
- "psrlq $32, "#a" \n\t"\
- "paddusw "#t", "#a" \n\t"\
- "movq "#a", "#t" \n\t"\
- "psrlq $16, "#a" \n\t"\
- "paddusw "#t", "#a" \n\t"\
- "movd "#a", "#dst" \n\t"\
-
-#define HSUM_MMXEXT(a, t, dst) \
- "pshufw $0x0E, "#a", "#t" \n\t"\
- "paddusw "#t", "#a" \n\t"\
- "pshufw $0x01, "#a", "#t" \n\t"\
- "paddusw "#t", "#a" \n\t"\
- "movd "#a", "#dst" \n\t"\
-
-#define HSUM_SSE2(a, t, dst)\
- "movhlps "#a", "#t" \n\t"\
- "paddusw "#t", "#a" \n\t"\
- "pshuflw $0x0E, "#a", "#t" \n\t"\
- "paddusw "#t", "#a" \n\t"\
- "pshuflw $0x01, "#a", "#t" \n\t"\
- "paddusw "#t", "#a" \n\t"\
- "movd "#a", "#dst" \n\t"\
-
-#define DCT_SAD4(m,mm,o)\
- "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
- "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
- "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
- "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
- MMABS_SUM(mm##2, mm##6, mm##0)\
- MMABS_SUM(mm##3, mm##7, mm##1)\
- MMABS_SUM(mm##4, mm##6, mm##0)\
- MMABS_SUM(mm##5, mm##7, mm##1)\
-
-#define DCT_SAD_MMX\
- "pxor %%mm0, %%mm0 \n\t"\
- "pxor %%mm1, %%mm1 \n\t"\
- DCT_SAD4(q, %%mm, 0)\
- DCT_SAD4(q, %%mm, 8)\
- DCT_SAD4(q, %%mm, 64)\
- DCT_SAD4(q, %%mm, 72)\
- "paddusw %%mm1, %%mm0 \n\t"\
- HSUM(%%mm0, %%mm1, %0)
-
-#define DCT_SAD_SSE2\
- "pxor %%xmm0, %%xmm0 \n\t"\
- "pxor %%xmm1, %%xmm1 \n\t"\
- DCT_SAD4(dqa, %%xmm, 0)\
- DCT_SAD4(dqa, %%xmm, 64)\
- "paddusw %%xmm1, %%xmm0 \n\t"\
- HSUM(%%xmm0, %%xmm1, %0)
-
-#define DCT_SAD_FUNC(cpu) \
-static int sum_abs_dctelem_##cpu(int16_t *block){\
- int sum;\
- __asm__ volatile(\
- DCT_SAD\
- :"=r"(sum)\
- :"r"(block)\
- );\
- return sum&0xFFFF;\
-}
-
-#define DCT_SAD DCT_SAD_MMX
-#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
-#define MMABS(a,z) MMABS_MMX(a,z)
-DCT_SAD_FUNC(mmx)
-#undef MMABS
-#undef HSUM
-
-#define HSUM(a,t,dst) HSUM_MMXEXT(a,t,dst)
-#define MMABS(a,z) MMABS_MMXEXT(a,z)
-DCT_SAD_FUNC(mmxext)
-#undef HSUM
-#undef DCT_SAD
-
-#define DCT_SAD DCT_SAD_SSE2
-#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
-DCT_SAD_FUNC(sse2)
-#undef MMABS
-
-#if HAVE_SSSE3_INLINE
-#define MMABS(a,z) MMABS_SSSE3(a,z)
-DCT_SAD_FUNC(ssse3)
-#undef MMABS
-#endif
-#undef HSUM
-#undef DCT_SAD
-
-static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
- int sum;
- x86_reg i=size;
- __asm__ volatile(
- "pxor %%mm4, %%mm4 \n"
- "1: \n"
- "sub $8, %0 \n"
- "movq (%2,%0), %%mm2 \n"
- "movq (%3,%0,2), %%mm0 \n"
- "movq 8(%3,%0,2), %%mm1 \n"
- "punpckhbw %%mm2, %%mm3 \n"
- "punpcklbw %%mm2, %%mm2 \n"
- "psraw $8, %%mm3 \n"
- "psraw $8, %%mm2 \n"
- "psubw %%mm3, %%mm1 \n"
- "psubw %%mm2, %%mm0 \n"
- "pmaddwd %%mm1, %%mm1 \n"
- "pmaddwd %%mm0, %%mm0 \n"
- "paddd %%mm1, %%mm4 \n"
- "paddd %%mm0, %%mm4 \n"
- "jg 1b \n"
- "movq %%mm4, %%mm3 \n"
- "psrlq $32, %%mm3 \n"
- "paddd %%mm3, %%mm4 \n"
- "movd %%mm4, %1 \n"
- :"+r"(i), "=r"(sum)
- :"r"(pix1), "r"(pix2)
- );
- return sum;
-}
-
-#define PHADDD(a, t)\
- "movq "#a", "#t" \n\t"\
- "psrlq $32, "#a" \n\t"\
- "paddd "#t", "#a" \n\t"
-/*
- pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
- pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
- pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
- */
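-
-/* Scalar equivalents of the three per-element multiplies described above
- * (sketch for illustration only; a and b are signed 16-bit lanes): */
-static inline int16_t pmulhw_ref(int16_t a, int16_t b)
-{
-    return (int16_t)(((int32_t)a * b) >> 16);
-}
-static inline int16_t pmulhrw_ref(int16_t a, int16_t b)
-{
-    return (int16_t)((((int32_t)a * b) + 0x8000) >> 16);
-}
-static inline int16_t pmulhrsw_ref(int16_t a, int16_t b)
-{
-    return (int16_t)((((int32_t)a * b) + 0x4000) >> 15);
-}
-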
-#define PMULHRW(x, y, s, o)\
- "pmulhw " #s ", "#x " \n\t"\
- "pmulhw " #s ", "#y " \n\t"\
- "paddw " #o ", "#x " \n\t"\
- "paddw " #o ", "#y " \n\t"\
- "psraw $1, "#x " \n\t"\
- "psraw $1, "#y " \n\t"
-#define DEF(x) x ## _mmx
-#define SET_RND MOVQ_WONE
-#define SCALE_OFFSET 1
-
-#include "dsputil_qns_template.c"
-
-#undef DEF
-#undef SET_RND
-#undef SCALE_OFFSET
-#undef PMULHRW
-
-#define DEF(x) x ## _3dnow
-#define SET_RND(x)
-#define SCALE_OFFSET 0
-#define PMULHRW(x, y, s, o)\
- "pmulhrw " #s ", "#x " \n\t"\
- "pmulhrw " #s ", "#y " \n\t"
-
-#include "dsputil_qns_template.c"
-
-#undef DEF
-#undef SET_RND
-#undef SCALE_OFFSET
-#undef PMULHRW
-
-#if HAVE_SSSE3_INLINE
-#undef PHADDD
-#define DEF(x) x ## _ssse3
-#define SET_RND(x)
-#define SCALE_OFFSET -1
-#define PHADDD(a, t)\
- "pshufw $0x0E, "#a", "#t" \n\t"\
- "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
-#define PMULHRW(x, y, s, o)\
- "pmulhrsw " #s ", "#x " \n\t"\
- "pmulhrsw " #s ", "#y " \n\t"
-
-#include "dsputil_qns_template.c"
-
-#undef DEF
-#undef SET_RND
-#undef SCALE_OFFSET
-#undef PMULHRW
-#undef PHADDD
-#endif /* HAVE_SSSE3_INLINE */
-
-#endif /* HAVE_INLINE_ASM */
-
-int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h);
-
-#define hadamard_func(cpu) \
-int ff_hadamard8_diff_##cpu (void *s, uint8_t *src1, uint8_t *src2, \
- int stride, int h); \
-int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
- int stride, int h);
-
-hadamard_func(mmx)
-hadamard_func(mmxext)
-hadamard_func(sse2)
-hadamard_func(ssse3)
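-
-/* The hadamard8_diff functions declared above implement a SATD-style metric:
- * the 8x8 block of pixel differences is run through a 2-D Walsh-Hadamard
- * transform and the absolute values of the coefficients are summed (capped to
- * 16 bits).  Rough scalar sketch of one 8-point pass, applied first to the
- * rows and then to the columns (illustration only, ignoring normalization and
- * output ordering): */
-static inline void hadamard8_1d_ref(int16_t x[8])
-{
-    int stage, j;
-
-    for (stage = 1; stage < 8; stage <<= 1) { /* butterflies at distance 1, 2, 4 */
-        int16_t tmp[8];
-        for (j = 0; j < 8; j++)
-            tmp[j] = (j & stage) ? x[j ^ stage] - x[j]
-                                 : x[j] + x[j ^ stage];
-        for (j = 0; j < 8; j++)
-            x[j] = tmp[j];
-    }
-}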
-
-av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx)
-{
- int cpu_flags = av_get_cpu_flags();
- const int dct_algo = avctx->dct_algo;
-
-#if HAVE_YASM
- int bit_depth = avctx->bits_per_raw_sample;
-
- if (EXTERNAL_MMX(cpu_flags)) {
- if (bit_depth <= 8)
- c->get_pixels = ff_get_pixels_mmx;
- c->diff_pixels = ff_diff_pixels_mmx;
- c->pix_sum = ff_pix_sum16_mmx;
-
- c->pix_norm1 = ff_pix_norm1_mmx;
- }
- if (EXTERNAL_SSE2(cpu_flags))
- if (bit_depth <= 8)
- c->get_pixels = ff_get_pixels_sse2;
-#endif /* HAVE_YASM */
-
-#if HAVE_INLINE_ASM
- if (INLINE_MMX(cpu_flags)) {
- if (avctx->bits_per_raw_sample <= 8 &&
- (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
- c->fdct = ff_fdct_mmx;
-
- c->diff_bytes= diff_bytes_mmx;
- c->sum_abs_dctelem= sum_abs_dctelem_mmx;
-
- c->sse[0] = sse16_mmx;
- c->sse[1] = sse8_mmx;
- c->vsad[4]= vsad_intra16_mmx;
-
- c->nsse[0] = nsse16_mmx;
- c->nsse[1] = nsse8_mmx;
- if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
- c->vsad[0] = vsad16_mmx;
- }
-
- if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
- c->try_8x8basis= try_8x8basis_mmx;
- }
- c->add_8x8basis= add_8x8basis_mmx;
-
- c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
- }
-
- if (INLINE_AMD3DNOW(cpu_flags)) {
- if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
- c->try_8x8basis = try_8x8basis_3dnow;
- }
- c->add_8x8basis = add_8x8basis_3dnow;
- }
-
- if (INLINE_MMXEXT(cpu_flags)) {
- if (avctx->bits_per_raw_sample <= 8 &&
- (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
- c->fdct = ff_fdct_mmxext;
-
- c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
- c->vsad[4] = vsad_intra16_mmxext;
-
- if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
- c->vsad[0] = vsad16_mmxext;
- }
-
- c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_mmxext;
- }
-
- if (INLINE_SSE2(cpu_flags)) {
- if (avctx->bits_per_raw_sample <= 8 &&
- (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
- c->fdct = ff_fdct_sse2;
-
- c->sum_abs_dctelem= sum_abs_dctelem_sse2;
- }
-
-#if HAVE_SSSE3_INLINE
- if (INLINE_SSSE3(cpu_flags)) {
- if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
- c->try_8x8basis = try_8x8basis_ssse3;
- }
- c->add_8x8basis = add_8x8basis_ssse3;
- c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
- }
-#endif
-#endif /* HAVE_INLINE_ASM */
-
- if (EXTERNAL_MMX(cpu_flags)) {
- c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
- c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
- }
-
- if (EXTERNAL_MMXEXT(cpu_flags)) {
- c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
- c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
- }
-
- if (EXTERNAL_SSE2(cpu_flags)) {
- c->sse[0] = ff_sse16_sse2;
-
-#if HAVE_ALIGNED_STACK
- c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
- c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
-#endif
- }
-
- if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
- c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
- c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
- }
-
- ff_dsputil_init_pix_mmx(c, avctx);
-}
diff --git a/ffmpeg/libavcodec/x86/dwt_yasm.asm b/ffmpeg/libavcodec/x86/dwt_yasm.asm
deleted file mode 100644
index 5253abc..0000000
--- a/ffmpeg/libavcodec/x86/dwt_yasm.asm
+++ /dev/null
@@ -1,306 +0,0 @@
-;******************************************************************************
-;* MMX optimized discrete wavelet transform
-;* Copyright (c) 2010 David Conrad
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-pw_1: times 8 dw 1
-pw_2: times 8 dw 2
-pw_8: times 8 dw 8
-pw_16: times 8 dw 16
-pw_1991: times 4 dw 9,-1
-
-section .text
-
-; %1 -= (%2 + %3 + 2)>>2 %4 is pw_2
-%macro COMPOSE_53iL0 4
- paddw %2, %3
- paddw %2, %4
- psraw %2, 2
- psubw %1, %2
-%endm
-
-; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
-; if %4 is supplied, %1 is loaded unaligned from there
-; m2: clobbered m3: pw_8 m4: pw_1991
-%macro COMPOSE_DD97iH0 3-4
- paddw m0, %3
- paddw m1, %2
- psubw m0, m3
- mova m2, m1
- punpcklwd m1, m0
- punpckhwd m2, m0
- pmaddwd m1, m4
- pmaddwd m2, m4
-%if %0 > 3
- movu %1, %4
-%endif
- psrad m1, 4
- psrad m2, 4
- packssdw m1, m2
- paddw m1, %1
-%endm
-
-%macro COMPOSE_VERTICAL 1
-; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
-; int width)
-cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
- mova m2, [pw_2]
-%if ARCH_X86_64
- mov widthd, widthd
-%endif
-.loop:
- sub widthq, mmsize/2
- mova m1, [b0q+2*widthq]
- mova m0, [b1q+2*widthq]
- COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
- mova [b1q+2*widthq], m0
- jg .loop
- REP_RET
-
-; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
-; int width)
-cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
- mova m1, [pw_1]
-%if ARCH_X86_64
- mov widthd, widthd
-%endif
-.loop:
- sub widthq, mmsize/2
- mova m0, [b0q+2*widthq]
- paddw m0, [b2q+2*widthq]
- paddw m0, m1
- psraw m0, 1
- paddw m0, [b1q+2*widthq]
- mova [b1q+2*widthq], m0
- jg .loop
- REP_RET
-
-; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
-; IDWTELEM *b3, IDWTELEM *b4, int width)
-cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
- mova m3, [pw_8]
- mova m4, [pw_1991]
-%if ARCH_X86_64
- mov widthd, widthd
-%endif
-.loop:
- sub widthq, mmsize/2
- mova m0, [b0q+2*widthq]
- mova m1, [b1q+2*widthq]
- COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
- mova [b2q+2*widthq], m1
- jg .loop
- REP_RET
-
-; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
-; IDWTELEM *b3, IDWTELEM *b4, int width)
-cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
- mova m3, [pw_16]
- mova m4, [pw_1991]
-%if ARCH_X86_64
- mov widthd, widthd
-%endif
-.loop:
- sub widthq, mmsize/2
- mova m0, [b0q+2*widthq]
- mova m1, [b1q+2*widthq]
- mova m5, [b2q+2*widthq]
- paddw m0, [b4q+2*widthq]
- paddw m1, [b3q+2*widthq]
- psubw m0, m3
- mova m2, m1
- punpcklwd m1, m0
- punpckhwd m2, m0
- pmaddwd m1, m4
- pmaddwd m2, m4
- psrad m1, 5
- psrad m2, 5
- packssdw m1, m2
- psubw m5, m1
- mova [b2q+2*widthq], m5
- jg .loop
- REP_RET
-
-; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
-cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
- mova m3, [pw_1]
-%if ARCH_X86_64
- mov widthd, widthd
-%endif
-.loop:
- sub widthq, mmsize/2
- mova m1, [b1q+2*widthq]
- mova m0, [b0q+2*widthq]
- mova m2, m1
- paddw m1, m3
- psraw m1, 1
- psubw m0, m1
- mova [b0q+2*widthq], m0
- paddw m2, m0
- mova [b1q+2*widthq], m2
- jg .loop
- REP_RET
-%endmacro
-
-; extend the left and right edges of the tmp array by %1 and %2 respectively
-%macro EDGE_EXTENSION 3
- mov %3, [tmpq]
-%assign %%i 1
-%rep %1
- mov [tmpq-2*%%i], %3
- %assign %%i %%i+1
-%endrep
- mov %3, [tmpq+2*w2q-2]
-%assign %%i 0
-%rep %2
- mov [tmpq+2*w2q+2*%%i], %3
- %assign %%i %%i+1
-%endrep
-%endmacro
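
EDGE_EXTENSION replicates the first element of tmp %1 times to the left and the last element %2 times past position w2, so the filter taps in the horizontal pass never read outside the buffer. A scalar sketch of the same idea (illustrative):

    #include <stdint.h>

    /* Replicate the edge samples of tmp[0..w2-1]: 'left' copies before the
     * start, 'right' copies after the end (illustrative sketch). */
    static void edge_extend(int16_t *tmp, int w2, int left, int right)
    {
        for (int i = 1; i <= left; i++)
            tmp[-i] = tmp[0];
        for (int i = 0; i < right; i++)
            tmp[w2 + i] = tmp[w2 - 1];
    }
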
-
-
-%macro HAAR_HORIZONTAL 2
-; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
-cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
- mov w2d, wd
- xor xq, xq
- shr w2d, 1
- lea b_w2q, [bq+wq]
- mova m3, [pw_1]
-.lowpass_loop:
- movu m1, [b_w2q + 2*xq]
- mova m0, [bq + 2*xq]
- paddw m1, m3
- psraw m1, 1
- psubw m0, m1
- mova [tmpq + 2*xq], m0
- add xq, mmsize/2
- cmp xq, w2q
- jl .lowpass_loop
-
- xor xq, xq
- and w2q, ~(mmsize/2 - 1)
- cmp w2q, mmsize/2
- jl .end
-
-.highpass_loop:
- movu m1, [b_w2q + 2*xq]
- mova m0, [tmpq + 2*xq]
- paddw m1, m0
-
- ; shift and interleave
-%if %2 == 1
- paddw m0, m3
- paddw m1, m3
- psraw m0, 1
- psraw m1, 1
-%endif
- mova m2, m0
- punpcklwd m0, m1
- punpckhwd m2, m1
- mova [bq+4*xq], m0
- mova [bq+4*xq+mmsize], m2
-
- add xq, mmsize/2
- cmp xq, w2q
- jl .highpass_loop
-.end:
- REP_RET
-%endmacro
-
-
-INIT_XMM
-; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
-cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
- mov w2d, wd
- xor xd, xd
- shr w2d, 1
- lea b_w2q, [bq+wq]
- movu m4, [bq+wq]
- mova m7, [pw_2]
- pslldq m4, 14
-.lowpass_loop:
- movu m1, [b_w2q + 2*xq]
- mova m0, [bq + 2*xq]
- mova m2, m1
- palignr m1, m4, 14
- mova m4, m2
- COMPOSE_53iL0 m0, m1, m2, m7
- mova [tmpq + 2*xq], m0
- add xd, mmsize/2
- cmp xd, w2d
- jl .lowpass_loop
-
- EDGE_EXTENSION 1, 2, xw
- ; leave the last up to 7 (sse) or 3 (mmx) values for C
- xor xd, xd
- and w2d, ~(mmsize/2 - 1)
- cmp w2d, mmsize/2
- jl .end
-
- mova m7, [tmpq-mmsize]
- mova m0, [tmpq]
- mova m5, [pw_1]
- mova m3, [pw_8]
- mova m4, [pw_1991]
-.highpass_loop:
- mova m6, m0
- palignr m0, m7, 14
- mova m7, [tmpq + 2*xq + 16]
- mova m1, m7
- mova m2, m7
- palignr m1, m6, 2
- palignr m2, m6, 4
- COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
- mova m0, m7
- mova m7, m6
-
- ; shift and interleave
- paddw m6, m5
- paddw m1, m5
- psraw m6, 1
- psraw m1, 1
- mova m2, m6
- punpcklwd m6, m1
- punpckhwd m2, m1
- mova [bq+4*xq], m6
- mova [bq+4*xq+mmsize], m2
-
- add xd, mmsize/2
- cmp xd, w2d
- jl .highpass_loop
-.end:
- REP_RET
-
-
-%if ARCH_X86_64 == 0
-INIT_MMX
-COMPOSE_VERTICAL mmx
-HAAR_HORIZONTAL mmx, 0
-HAAR_HORIZONTAL mmx, 1
-%endif
-
-;;INIT_XMM
-INIT_XMM
-COMPOSE_VERTICAL sse2
-HAAR_HORIZONTAL sse2, 0
-HAAR_HORIZONTAL sse2, 1
diff --git a/ffmpeg/libavcodec/x86/fdct.c b/ffmpeg/libavcodec/x86/fdct.c
deleted file mode 100644
index 11a13bb..0000000
--- a/ffmpeg/libavcodec/x86/fdct.c
+++ /dev/null
@@ -1,594 +0,0 @@
-/*
- * MMX optimized forward DCT
- * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
- * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
- *
- * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
- *
- * Intel Application Note AP-922 - fast, precise implementation of DCT
- * http://developer.intel.com/vtune/cbts/appnotes.htm
- *
- * Also of inspiration:
- * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
- * Skal's fdct at http://skal.planet-d.net/coding/dct.html
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/common.h"
-#include "libavutil/x86/asm.h"
-#include "libavcodec/dct.h"
-
-#if HAVE_MMX_INLINE
-
-//////////////////////////////////////////////////////////////////////
-//
-// constants for the forward DCT
-// -----------------------------
-//
-// Be sure to check that your compiler is aligning all constants to QWORD
-// (8-byte) memory boundaries! Otherwise the unaligned memory access will
-// severely stall MMX execution.
-//
-//////////////////////////////////////////////////////////////////////
-
-#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
-#define SHIFT_FRW_COL BITS_FRW_ACC
-#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)
-#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
-//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))
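
With BITS_FRW_ACC set to 3 this gives SHIFT_FRW_COL = 3 and SHIFT_FRW_ROW = 3 + 17 - 3 = 17, so the row rounding constant RND_FRW_ROW works out to 1 << 16 = 65536.
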
-
-#define X8(x) x,x,x,x,x,x,x,x
-
-//concatenated table, for forward DCT transformation
-DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = {
- X8(13036), // tg * (2<<16) + 0.5
- X8(27146), // tg * (2<<16) + 0.5
- X8(-21746) // tg * (2<<16) + 0.5
-};
-
-DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = {
- X8(23170) //cos * (2<<15) + 0.5
-};
-
-DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
-
-DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };
-
-static const struct
-{
- DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
-} fdct_r_row_sse2 =
-{{
- RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
-}};
-//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
-
-DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table
- 16384, 16384, 22725, 19266,
- 16384, 16384, 12873, 4520,
- 21407, 8867, 19266, -4520,
- -8867, -21407, -22725, -12873,
- 16384, -16384, 12873, -22725,
- -16384, 16384, 4520, 19266,
- 8867, -21407, 4520, -12873,
- 21407, -8867, 19266, -22725,
-
- 22725, 22725, 31521, 26722,
- 22725, 22725, 17855, 6270,
- 29692, 12299, 26722, -6270,
- -12299, -29692, -31521, -17855,
- 22725, -22725, 17855, -31521,
- -22725, 22725, 6270, 26722,
- 12299, -29692, 6270, -17855,
- 29692, -12299, 26722, -31521,
-
- 21407, 21407, 29692, 25172,
- 21407, 21407, 16819, 5906,
- 27969, 11585, 25172, -5906,
- -11585, -27969, -29692, -16819,
- 21407, -21407, 16819, -29692,
- -21407, 21407, 5906, 25172,
- 11585, -27969, 5906, -16819,
- 27969, -11585, 25172, -29692,
-
- 19266, 19266, 26722, 22654,
- 19266, 19266, 15137, 5315,
- 25172, 10426, 22654, -5315,
- -10426, -25172, -26722, -15137,
- 19266, -19266, 15137, -26722,
- -19266, 19266, 5315, 22654,
- 10426, -25172, 5315, -15137,
- 25172, -10426, 22654, -26722,
-
- 16384, 16384, 22725, 19266,
- 16384, 16384, 12873, 4520,
- 21407, 8867, 19266, -4520,
- -8867, -21407, -22725, -12873,
- 16384, -16384, 12873, -22725,
- -16384, 16384, 4520, 19266,
- 8867, -21407, 4520, -12873,
- 21407, -8867, 19266, -22725,
-
- 19266, 19266, 26722, 22654,
- 19266, 19266, 15137, 5315,
- 25172, 10426, 22654, -5315,
- -10426, -25172, -26722, -15137,
- 19266, -19266, 15137, -26722,
- -19266, 19266, 5315, 22654,
- 10426, -25172, 5315, -15137,
- 25172, -10426, 22654, -26722,
-
- 21407, 21407, 29692, 25172,
- 21407, 21407, 16819, 5906,
- 27969, 11585, 25172, -5906,
- -11585, -27969, -29692, -16819,
- 21407, -21407, 16819, -29692,
- -21407, 21407, 5906, 25172,
- 11585, -27969, 5906, -16819,
- 27969, -11585, 25172, -29692,
-
- 22725, 22725, 31521, 26722,
- 22725, 22725, 17855, 6270,
- 29692, 12299, 26722, -6270,
- -12299, -29692, -31521, -17855,
- 22725, -22725, 17855, -31521,
- -22725, 22725, 6270, 26722,
- 12299, -29692, 6270, -17855,
- 29692, -12299, 26722, -31521,
-};
-
-static const struct
-{
- DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
-} tab_frw_01234567_sse2 =
-{{
-//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table
-#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
- C4, C4, C5, C7, C2, C6, C3, -C7, \
- -C4, C4, C7, C3, C6, -C2, C7, -C5, \
- C4, -C4, C5, -C1, C2, -C6, C3, -C1,
-// c1..c7 * cos(pi/4) * 2^15
-#define C1 22725
-#define C2 21407
-#define C3 19266
-#define C4 16384
-#define C5 12873
-#define C6 8867
-#define C7 4520
-TABLE_SSE2
-
-#undef C1
-#undef C2
-#undef C3
-#undef C4
-#undef C5
-#undef C6
-#undef C7
-#define C1 31521
-#define C2 29692
-#define C3 26722
-#define C4 22725
-#define C5 17855
-#define C6 12299
-#define C7 6270
-TABLE_SSE2
-
-#undef C1
-#undef C2
-#undef C3
-#undef C4
-#undef C5
-#undef C6
-#undef C7
-#define C1 29692
-#define C2 27969
-#define C3 25172
-#define C4 21407
-#define C5 16819
-#define C6 11585
-#define C7 5906
-TABLE_SSE2
-
-#undef C1
-#undef C2
-#undef C3
-#undef C4
-#undef C5
-#undef C6
-#undef C7
-#define C1 26722
-#define C2 25172
-#define C3 22654
-#define C4 19266
-#define C5 15137
-#define C6 10426
-#define C7 5315
-TABLE_SSE2
-
-#undef C1
-#undef C2
-#undef C3
-#undef C4
-#undef C5
-#undef C6
-#undef C7
-#define C1 22725
-#define C2 21407
-#define C3 19266
-#define C4 16384
-#define C5 12873
-#define C6 8867
-#define C7 4520
-TABLE_SSE2
-
-#undef C1
-#undef C2
-#undef C3
-#undef C4
-#undef C5
-#undef C6
-#undef C7
-#define C1 26722
-#define C2 25172
-#define C3 22654
-#define C4 19266
-#define C5 15137
-#define C6 10426
-#define C7 5315
-TABLE_SSE2
-
-#undef C1
-#undef C2
-#undef C3
-#undef C4
-#undef C5
-#undef C6
-#undef C7
-#define C1 29692
-#define C2 27969
-#define C3 25172
-#define C4 21407
-#define C5 16819
-#define C6 11585
-#define C7 5906
-TABLE_SSE2
-
-#undef C1
-#undef C2
-#undef C3
-#undef C4
-#undef C5
-#undef C6
-#undef C7
-#define C1 31521
-#define C2 29692
-#define C3 26722
-#define C4 22725
-#define C5 17855
-#define C6 12299
-#define C7 6270
-TABLE_SSE2
-}};
-
-#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
-
-#define FDCT_COL(cpu, mm, mov)\
-static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
-{\
- __asm__ volatile (\
- #mov" 16(%0), %%"#mm"0 \n\t" \
- #mov" 96(%0), %%"#mm"1 \n\t" \
- #mov" %%"#mm"0, %%"#mm"2 \n\t" \
- #mov" 32(%0), %%"#mm"3 \n\t" \
- "paddsw %%"#mm"1, %%"#mm"0 \n\t" \
- #mov" 80(%0), %%"#mm"4 \n\t" \
- "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
- #mov" (%0), %%"#mm"5 \n\t" \
- "paddsw %%"#mm"3, %%"#mm"4 \n\t" \
- "paddsw 112(%0), %%"#mm"5 \n\t" \
- "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
- #mov" %%"#mm"0, %%"#mm"6 \n\t" \
- "psubsw %%"#mm"1, %%"#mm"2 \n\t" \
- #mov" 16(%1), %%"#mm"1 \n\t" \
- "psubsw %%"#mm"4, %%"#mm"0 \n\t" \
- #mov" 48(%0), %%"#mm"7 \n\t" \
- "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \
- "paddsw 64(%0), %%"#mm"7 \n\t" \
- "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
- "paddsw %%"#mm"4, %%"#mm"6 \n\t" \
- "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
- #mov" %%"#mm"5, %%"#mm"4 \n\t" \
- "psubsw %%"#mm"7, %%"#mm"5 \n\t" \
- "paddsw %%"#mm"5, %%"#mm"1 \n\t" \
- "paddsw %%"#mm"7, %%"#mm"4 \n\t" \
- "por (%2), %%"#mm"1 \n\t" \
- "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
- "pmulhw 16(%1), %%"#mm"5 \n\t" \
- #mov" %%"#mm"4, %%"#mm"7 \n\t" \
- "psubsw 80(%0), %%"#mm"3 \n\t" \
- "psubsw %%"#mm"6, %%"#mm"4 \n\t" \
- #mov" %%"#mm"1, 32(%3) \n\t" \
- "paddsw %%"#mm"6, %%"#mm"7 \n\t" \
- #mov" 48(%0), %%"#mm"1 \n\t" \
- "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
- "psubsw 64(%0), %%"#mm"1 \n\t" \
- #mov" %%"#mm"2, %%"#mm"6 \n\t" \
- #mov" %%"#mm"4, 64(%3) \n\t" \
- "paddsw %%"#mm"3, %%"#mm"2 \n\t" \
- "pmulhw (%4), %%"#mm"2 \n\t" \
- "psubsw %%"#mm"3, %%"#mm"6 \n\t" \
- "pmulhw (%4), %%"#mm"6 \n\t" \
- "psubsw %%"#mm"0, %%"#mm"5 \n\t" \
- "por (%2), %%"#mm"5 \n\t" \
- "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
- "por (%2), %%"#mm"2 \n\t" \
- #mov" %%"#mm"1, %%"#mm"4 \n\t" \
- #mov" (%0), %%"#mm"3 \n\t" \
- "paddsw %%"#mm"6, %%"#mm"1 \n\t" \
- "psubsw 112(%0), %%"#mm"3 \n\t" \
- "psubsw %%"#mm"6, %%"#mm"4 \n\t" \
- #mov" (%1), %%"#mm"0 \n\t" \
- "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
- #mov" 32(%1), %%"#mm"6 \n\t" \
- "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \
- #mov" %%"#mm"7, (%3) \n\t" \
- "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \
- #mov" %%"#mm"5, 96(%3) \n\t" \
- #mov" %%"#mm"3, %%"#mm"7 \n\t" \
- #mov" 32(%1), %%"#mm"5 \n\t" \
- "psubsw %%"#mm"2, %%"#mm"7 \n\t" \
- "paddsw %%"#mm"2, %%"#mm"3 \n\t" \
- "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \
- "paddsw %%"#mm"3, %%"#mm"0 \n\t" \
- "paddsw %%"#mm"4, %%"#mm"6 \n\t" \
- "pmulhw (%1), %%"#mm"3 \n\t" \
- "por (%2), %%"#mm"0 \n\t" \
- "paddsw %%"#mm"7, %%"#mm"5 \n\t" \
- "psubsw %%"#mm"6, %%"#mm"7 \n\t" \
- #mov" %%"#mm"0, 16(%3) \n\t" \
- "paddsw %%"#mm"4, %%"#mm"5 \n\t" \
- #mov" %%"#mm"7, 48(%3) \n\t" \
- "psubsw %%"#mm"1, %%"#mm"3 \n\t" \
- #mov" %%"#mm"5, 80(%3) \n\t" \
- #mov" %%"#mm"3, 112(%3) \n\t" \
- : \
- : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
- "r" (out + offset), "r" (ocos_4_16)); \
-}
-
-FDCT_COL(mmx, mm, movq)
-FDCT_COL(sse2, xmm, movdqa)
-
-static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
-{
- __asm__ volatile(
-#define FDCT_ROW_SSE2_H1(i,t) \
- "movq " #i "(%0), %%xmm2 \n\t" \
- "movq " #i "+8(%0), %%xmm0 \n\t" \
- "movdqa " #t "+32(%1), %%xmm3 \n\t" \
- "movdqa " #t "+48(%1), %%xmm7 \n\t" \
- "movdqa " #t "(%1), %%xmm4 \n\t" \
- "movdqa " #t "+16(%1), %%xmm5 \n\t"
-
-#define FDCT_ROW_SSE2_H2(i,t) \
- "movq " #i "(%0), %%xmm2 \n\t" \
- "movq " #i "+8(%0), %%xmm0 \n\t" \
- "movdqa " #t "+32(%1), %%xmm3 \n\t" \
- "movdqa " #t "+48(%1), %%xmm7 \n\t"
-
-#define FDCT_ROW_SSE2(i) \
- "movq %%xmm2, %%xmm1 \n\t" \
- "pshuflw $27, %%xmm0, %%xmm0 \n\t" \
- "paddsw %%xmm0, %%xmm1 \n\t" \
- "psubsw %%xmm0, %%xmm2 \n\t" \
- "punpckldq %%xmm2, %%xmm1 \n\t" \
- "pshufd $78, %%xmm1, %%xmm2 \n\t" \
- "pmaddwd %%xmm2, %%xmm3 \n\t" \
- "pmaddwd %%xmm1, %%xmm7 \n\t" \
- "pmaddwd %%xmm5, %%xmm2 \n\t" \
- "pmaddwd %%xmm4, %%xmm1 \n\t" \
- "paddd %%xmm7, %%xmm3 \n\t" \
- "paddd %%xmm2, %%xmm1 \n\t" \
- "paddd %%xmm6, %%xmm3 \n\t" \
- "paddd %%xmm6, %%xmm1 \n\t" \
- "psrad %3, %%xmm3 \n\t" \
- "psrad %3, %%xmm1 \n\t" \
- "packssdw %%xmm3, %%xmm1 \n\t" \
- "movdqa %%xmm1, " #i "(%4) \n\t"
-
- "movdqa (%2), %%xmm6 \n\t"
- FDCT_ROW_SSE2_H1(0,0)
- FDCT_ROW_SSE2(0)
- FDCT_ROW_SSE2_H2(64,0)
- FDCT_ROW_SSE2(64)
-
- FDCT_ROW_SSE2_H1(16,64)
- FDCT_ROW_SSE2(16)
- FDCT_ROW_SSE2_H2(112,64)
- FDCT_ROW_SSE2(112)
-
- FDCT_ROW_SSE2_H1(32,128)
- FDCT_ROW_SSE2(32)
- FDCT_ROW_SSE2_H2(96,128)
- FDCT_ROW_SSE2(96)
-
- FDCT_ROW_SSE2_H1(48,192)
- FDCT_ROW_SSE2(48)
- FDCT_ROW_SSE2_H2(80,192)
- FDCT_ROW_SSE2(80)
- :
- : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2),
- "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
- XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7")
- );
-}
-
-static av_always_inline void fdct_row_mmxext(const int16_t *in, int16_t *out,
- const int16_t *table)
-{
- __asm__ volatile (
- "pshufw $0x1B, 8(%0), %%mm5 \n\t"
- "movq (%0), %%mm0 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "paddsw %%mm5, %%mm0 \n\t"
- "psubsw %%mm5, %%mm1 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "punpckldq %%mm1, %%mm0 \n\t"
- "punpckhdq %%mm1, %%mm2 \n\t"
- "movq (%1), %%mm1 \n\t"
- "movq 8(%1), %%mm3 \n\t"
- "movq 16(%1), %%mm4 \n\t"
- "movq 24(%1), %%mm5 \n\t"
- "movq 32(%1), %%mm6 \n\t"
- "movq 40(%1), %%mm7 \n\t"
- "pmaddwd %%mm0, %%mm1 \n\t"
- "pmaddwd %%mm2, %%mm3 \n\t"
- "pmaddwd %%mm0, %%mm4 \n\t"
- "pmaddwd %%mm2, %%mm5 \n\t"
- "pmaddwd %%mm0, %%mm6 \n\t"
- "pmaddwd %%mm2, %%mm7 \n\t"
- "pmaddwd 48(%1), %%mm0 \n\t"
- "pmaddwd 56(%1), %%mm2 \n\t"
- "paddd %%mm1, %%mm3 \n\t"
- "paddd %%mm4, %%mm5 \n\t"
- "paddd %%mm6, %%mm7 \n\t"
- "paddd %%mm0, %%mm2 \n\t"
- "movq (%2), %%mm0 \n\t"
- "paddd %%mm0, %%mm3 \n\t"
- "paddd %%mm0, %%mm5 \n\t"
- "paddd %%mm0, %%mm7 \n\t"
- "paddd %%mm0, %%mm2 \n\t"
- "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
- "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
- "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
- "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
- "packssdw %%mm5, %%mm3 \n\t"
- "packssdw %%mm2, %%mm7 \n\t"
- "movq %%mm3, (%3) \n\t"
- "movq %%mm7, 8(%3) \n\t"
- :
- : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
-}
-
-static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
-{
- //FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
- __asm__ volatile(
- "movd 12(%0), %%mm1 \n\t"
- "punpcklwd 8(%0), %%mm1 \n\t"
- "movq %%mm1, %%mm2 \n\t"
- "psrlq $0x20, %%mm1 \n\t"
- "movq 0(%0), %%mm0 \n\t"
- "punpcklwd %%mm2, %%mm1 \n\t"
- "movq %%mm0, %%mm5 \n\t"
- "paddsw %%mm1, %%mm0 \n\t"
- "psubsw %%mm1, %%mm5 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "punpckldq %%mm5, %%mm0 \n\t"
- "punpckhdq %%mm5, %%mm2 \n\t"
- "movq 0(%1), %%mm1 \n\t"
- "movq 8(%1), %%mm3 \n\t"
- "movq 16(%1), %%mm4 \n\t"
- "movq 24(%1), %%mm5 \n\t"
- "movq 32(%1), %%mm6 \n\t"
- "movq 40(%1), %%mm7 \n\t"
- "pmaddwd %%mm0, %%mm1 \n\t"
- "pmaddwd %%mm2, %%mm3 \n\t"
- "pmaddwd %%mm0, %%mm4 \n\t"
- "pmaddwd %%mm2, %%mm5 \n\t"
- "pmaddwd %%mm0, %%mm6 \n\t"
- "pmaddwd %%mm2, %%mm7 \n\t"
- "pmaddwd 48(%1), %%mm0 \n\t"
- "pmaddwd 56(%1), %%mm2 \n\t"
- "paddd %%mm1, %%mm3 \n\t"
- "paddd %%mm4, %%mm5 \n\t"
- "paddd %%mm6, %%mm7 \n\t"
- "paddd %%mm0, %%mm2 \n\t"
- "movq (%2), %%mm0 \n\t"
- "paddd %%mm0, %%mm3 \n\t"
- "paddd %%mm0, %%mm5 \n\t"
- "paddd %%mm0, %%mm7 \n\t"
- "paddd %%mm0, %%mm2 \n\t"
- "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
- "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
- "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
- "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
- "packssdw %%mm5, %%mm3 \n\t"
- "packssdw %%mm2, %%mm7 \n\t"
- "movq %%mm3, 0(%3) \n\t"
- "movq %%mm7, 8(%3) \n\t"
- :
- : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
-}
-
-void ff_fdct_mmx(int16_t *block)
-{
- DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
- int16_t * block1= (int16_t*)align_tmp;
- const int16_t *table= tab_frw_01234567;
- int i;
-
- fdct_col_mmx(block, block1, 0);
- fdct_col_mmx(block, block1, 4);
-
- for(i=8;i>0;i--) {
- fdct_row_mmx(block1, block, table);
- block1 += 8;
- table += 32;
- block += 8;
- }
-}
-
-#endif /* HAVE_MMX_INLINE */
-
-#if HAVE_MMXEXT_INLINE
-
-void ff_fdct_mmxext(int16_t *block)
-{
- DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
- int16_t *block1= (int16_t*)align_tmp;
- const int16_t *table= tab_frw_01234567;
- int i;
-
- fdct_col_mmx(block, block1, 0);
- fdct_col_mmx(block, block1, 4);
-
- for(i=8;i>0;i--) {
- fdct_row_mmxext(block1, block, table);
- block1 += 8;
- table += 32;
- block += 8;
- }
-}
-
-#endif /* HAVE_MMXEXT_INLINE */
-
-#if HAVE_SSE2_INLINE
-
-void ff_fdct_sse2(int16_t *block)
-{
- DECLARE_ALIGNED(16, int64_t, align_tmp)[16];
- int16_t * const block1= (int16_t*)align_tmp;
-
- fdct_col_sse2(block, block1, 0);
- fdct_row_sse2(block1, block);
-}
-
-#endif /* HAVE_SSE2_INLINE */
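
All three entry points transform one 8x8 block of int16_t coefficients in place; the SSE2 variant uses aligned movdqa loads and stores, so the block must be 16-byte aligned. A minimal usage sketch (assuming the DECLARE_ALIGNED macro from libavutil, as used throughout this file):

    #include <stdint.h>

    void fdct_example(void)
    {
        /* 16-byte alignment is required by the movdqa accesses above. */
        DECLARE_ALIGNED(16, int16_t, block)[64] = { 0 };
        block[0] = 255;        /* arbitrary sample data */
        ff_fdct_sse2(block);   /* coefficients overwrite the input block */
    }
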
diff --git a/ffmpeg/libavcodec/x86/fft.asm b/ffmpeg/libavcodec/x86/fft.asm
deleted file mode 100644
index cae404c..0000000
--- a/ffmpeg/libavcodec/x86/fft.asm
+++ /dev/null
@@ -1,1092 +0,0 @@
-;******************************************************************************
-;* FFT transform with SSE/3DNow optimizations
-;* Copyright (c) 2008 Loren Merritt
-;* Copyright (c) 2011 Vitor Sessak
-;*
-;* This algorithm (though not any of the implementation details) is
-;* based on libdjbfft by D. J. Bernstein.
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-; These functions are not individually interchangeable with the C versions.
-; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
-; in blocks as convenient to the vector size.
-; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
-
-%include "libavutil/x86/x86util.asm"
-
-%if ARCH_X86_64
-%define pointer resq
-%else
-%define pointer resd
-%endif
-
-SECTION_RODATA 32
-
-struc FFTContext
- .nbits: resd 1
- .reverse: resd 1
- .revtab: pointer 1
- .tmpbuf: pointer 1
- .mdctsize: resd 1
- .mdctbits: resd 1
- .tcos: pointer 1
- .tsin: pointer 1
- .fftperm: pointer 1
- .fftcalc: pointer 1
- .imdctcalc:pointer 1
- .imdcthalf:pointer 1
-endstruc
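
The struc above has to mirror the leading fields of the C-side FFTContext, since the code below loads .nbits, .revtab, .tcos, .tsin, .mdctsize and .imdcthalf by these offsets. Roughly, the layout being assumed is as follows (illustrative only; the authoritative definition lives in libavcodec/fft.h):

    #include <stdint.h>

    /* Rough picture of the field order the asm struc assumes; the function
     * pointer types are simplified for illustration. */
    typedef struct {
        int       nbits;
        int       inverse;      /* ".reverse" above */
        uint16_t *revtab;
        void     *tmp_buf;
        int       mdct_size;
        int       mdct_bits;
        float    *tcos;
        float    *tsin;
        void    (*fft_permute)(void);
        void    (*fft_calc)(void);
        void    (*imdct_calc)(void);
        void    (*imdct_half)(void);
    } FFTContextLayout;
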
-
-%define M_SQRT1_2 0.70710678118654752440
-%define M_COS_PI_1_8 0.923879532511287
-%define M_COS_PI_3_8 0.38268343236509
-
-ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
-ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
-
-ps_root2: times 8 dd M_SQRT1_2
-ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
-ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
-
-perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
-perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
-ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
-ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
-ps_m1m1m1m1: times 4 dd 1<<31
-ps_m1p1: dd 1<<31, 0
-
-%assign i 16
-%rep 13
-cextern cos_ %+ i
-%assign i i<<1
-%endrep
-
-%if ARCH_X86_64
- %define pointer dq
-%else
- %define pointer dd
-%endif
-
-%macro IF0 1+
-%endmacro
-%macro IF1 1+
- %1
-%endmacro
-
-SECTION_TEXT
-
-%macro T2_3DNOW 4 ; z0, z1, mem0, mem1
- mova %1, %3
- mova %2, %1
- pfadd %1, %4
- pfsub %2, %4
-%endmacro
-
-%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
- mova %5, %3
- pfsub %3, %4
- pfadd %5, %4 ; {t6,t5}
- pxor %3, [ps_m1p1] ; {t8,t7}
- mova %6, %1
- movd [r0+12], %3
- punpckhdq %3, [r0+8]
- pfadd %1, %5 ; {r0,i0}
- pfsub %6, %5 ; {r2,i2}
- mova %4, %2
- pfadd %2, %3 ; {r1,i1}
- pfsub %4, %3 ; {r3,i3}
- SWAP %3, %6
-%endmacro
-
-; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
-; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
-; %3, %4, %5 tmp
-; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
-; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
-%macro T8_AVX 5
- vsubps %5, %1, %2 ; v = %1 - %2
- vaddps %3, %1, %2 ; w = %1 + %2
- vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
- vpermilps %2, %2, [perm1]
- vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
- vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
- vsubps %4, %5, %1 ; s = r - q
- vaddps %1, %5, %1 ; u = r + q
- vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
- vshufps %5, %4, %1, 0xbb
- vshufps %3, %4, %1, 0xee
- vperm2f128 %3, %3, %5, 0x13
- vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
- vshufps %2, %1, %4, 0xdd
- vshufps %1, %1, %4, 0x88
- vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
- vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
- vsubps %5, %1, %3
- vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
- vsubps %2, %4, %1 ; %2 = v - w
- vaddps %1, %4, %1 ; %1 = v + w
-%endmacro
-
-; In SSE mode do one fft4 transform
-; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
-; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
-;
-; In AVX mode do two fft4 transforms
-; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
-; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
-%macro T4_SSE 3
- subps %3, %1, %2 ; {t3,t4,-t8,t7}
- addps %1, %1, %2 ; {t1,t2,t6,t5}
- xorps %3, %3, [ps_p1p1m1p1]
- shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
- shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
- subps %3, %1, %2 ; {r2,i2,r3,i3}
- addps %1, %1, %2 ; {r0,i0,r1,i1}
- shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
- shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
-%endmacro
-
-; In SSE mode do one FFT8
-; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
-; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
-;
-; In AVX mode do two FFT8
-; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
-; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
-; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
-; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
-%macro T8_SSE 6
- addps %6, %3, %4 ; {t1,t2,t3,t4}
- subps %3, %3, %4 ; {r5,i5,r7,i7}
- shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
- mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
- mulps %4, %4, [ps_root2]
- addps %3, %3, %4 ; {t8,t7,ta,t9}
- shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
- shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
- subps %3, %6, %4 ; {t6,t5,tc,tb}
- addps %6, %6, %4 ; {t1,t2,t9,ta}
- shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
- shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
- subps %3, %1, %6 ; {r4,r5,r6,r7}
- addps %1, %1, %6 ; {r0,r1,r2,r3}
- subps %4, %2, %5 ; {i4,i5,i6,i7}
- addps %2, %2, %5 ; {i0,i1,i2,i3}
-%endmacro
-
-; scheduled for cpu-bound sizes
-%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
-IF%1 mova m4, Z(4)
-IF%1 mova m5, Z(5)
- mova m0, %2 ; wre
- mova m1, %3 ; wim
- mulps m2, m4, m0 ; r2*wre
-IF%1 mova m6, Z2(6)
- mulps m3, m5, m1 ; i2*wim
-IF%1 mova m7, Z2(7)
- mulps m4, m4, m1 ; r2*wim
- mulps m5, m5, m0 ; i2*wre
- addps m2, m2, m3 ; r2*wre + i2*wim
- mulps m3, m1, m7 ; i3*wim
- subps m5, m5, m4 ; i2*wre - r2*wim
- mulps m1, m1, m6 ; r3*wim
- mulps m4, m0, m6 ; r3*wre
- mulps m0, m0, m7 ; i3*wre
- subps m4, m4, m3 ; r3*wre - i3*wim
- mova m3, Z(0)
- addps m0, m0, m1 ; i3*wre + r3*wim
- subps m1, m4, m2 ; t3
- addps m4, m4, m2 ; t5
- subps m3, m3, m4 ; r2
- addps m4, m4, Z(0) ; r0
- mova m6, Z(2)
- mova Z(4), m3
- mova Z(0), m4
- subps m3, m5, m0 ; t4
- subps m4, m6, m3 ; r3
- addps m3, m3, m6 ; r1
- mova Z2(6), m4
- mova Z(2), m3
- mova m2, Z(3)
- addps m3, m5, m0 ; t6
- subps m2, m2, m1 ; i3
- mova m7, Z(1)
- addps m1, m1, Z(3) ; i1
- mova Z2(7), m2
- mova Z(3), m1
- subps m4, m7, m3 ; i2
- addps m3, m3, m7 ; i0
- mova Z(5), m4
- mova Z(1), m3
-%endmacro
-
-; scheduled to avoid store->load aliasing
-%macro PASS_BIG 1 ; (!interleave)
- mova m4, Z(4) ; r2
- mova m5, Z(5) ; i2
- mova m0, [wq] ; wre
- mova m1, [wq+o1q] ; wim
- mulps m2, m4, m0 ; r2*wre
- mova m6, Z2(6) ; r3
- mulps m3, m5, m1 ; i2*wim
- mova m7, Z2(7) ; i3
- mulps m4, m4, m1 ; r2*wim
- mulps m5, m5, m0 ; i2*wre
- addps m2, m2, m3 ; r2*wre + i2*wim
- mulps m3, m1, m7 ; i3*wim
- mulps m1, m1, m6 ; r3*wim
- subps m5, m5, m4 ; i2*wre - r2*wim
- mulps m4, m0, m6 ; r3*wre
- mulps m0, m0, m7 ; i3*wre
- subps m4, m4, m3 ; r3*wre - i3*wim
- mova m3, Z(0)
- addps m0, m0, m1 ; i3*wre + r3*wim
- subps m1, m4, m2 ; t3
- addps m4, m4, m2 ; t5
- subps m3, m3, m4 ; r2
- addps m4, m4, Z(0) ; r0
- mova m6, Z(2)
- mova Z(4), m3
- mova Z(0), m4
- subps m3, m5, m0 ; t4
- subps m4, m6, m3 ; r3
- addps m3, m3, m6 ; r1
-IF%1 mova Z2(6), m4
-IF%1 mova Z(2), m3
- mova m2, Z(3)
- addps m5, m5, m0 ; t6
- subps m2, m2, m1 ; i3
- mova m7, Z(1)
- addps m1, m1, Z(3) ; i1
-IF%1 mova Z2(7), m2
-IF%1 mova Z(3), m1
- subps m6, m7, m5 ; i2
- addps m5, m5, m7 ; i0
-IF%1 mova Z(5), m6
-IF%1 mova Z(1), m5
-%if %1==0
- INTERL m1, m3, m7, Z, 2
- INTERL m2, m4, m0, Z2, 6
-
- mova m1, Z(0)
- mova m2, Z(4)
-
- INTERL m5, m1, m3, Z, 0
- INTERL m6, m2, m7, Z, 4
-%endif
-%endmacro
-
-%macro PUNPCK 3
- mova %3, %1
- punpckldq %1, %2
- punpckhdq %3, %2
-%endmacro
-
-%define Z(x) [r0+mmsize*x]
-%define Z2(x) [r0+mmsize*x]
-%define ZH(x) [r0+mmsize*x+mmsize/2]
-
-INIT_YMM avx
-
-%if HAVE_AVX_EXTERNAL
-align 16
-fft8_avx:
- mova m0, Z(0)
- mova m1, Z(1)
- T8_AVX m0, m1, m2, m3, m4
- mova Z(0), m0
- mova Z(1), m1
- ret
-
-
-align 16
-fft16_avx:
- mova m2, Z(2)
- mova m3, Z(3)
- T4_SSE m2, m3, m7
-
- mova m0, Z(0)
- mova m1, Z(1)
- T8_AVX m0, m1, m4, m5, m7
-
- mova m4, [ps_cos16_1]
- mova m5, [ps_cos16_2]
- vmulps m6, m2, m4
- vmulps m7, m3, m5
- vaddps m7, m7, m6
- vmulps m2, m2, m5
- vmulps m3, m3, m4
- vsubps m3, m3, m2
- vblendps m2, m7, m3, 0xf0
- vperm2f128 m3, m7, m3, 0x21
- vaddps m4, m2, m3
- vsubps m2, m3, m2
- vperm2f128 m2, m2, m2, 0x01
- vsubps m3, m1, m2
- vaddps m1, m1, m2
- vsubps m5, m0, m4
- vaddps m0, m0, m4
- vextractf128 Z(0), m0, 0
- vextractf128 ZH(0), m1, 0
- vextractf128 Z(1), m0, 1
- vextractf128 ZH(1), m1, 1
- vextractf128 Z(2), m5, 0
- vextractf128 ZH(2), m3, 0
- vextractf128 Z(3), m5, 1
- vextractf128 ZH(3), m3, 1
- ret
-
-align 16
-fft32_avx:
- call fft16_avx
-
- mova m0, Z(4)
- mova m1, Z(5)
-
- T4_SSE m0, m1, m4
-
- mova m2, Z(6)
- mova m3, Z(7)
-
- T8_SSE m0, m1, m2, m3, m4, m6
- ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
- ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
-
- vperm2f128 m4, m0, m2, 0x20
- vperm2f128 m5, m1, m3, 0x20
- vperm2f128 m6, m0, m2, 0x31
- vperm2f128 m7, m1, m3, 0x31
-
- PASS_SMALL 0, [cos_32], [cos_32+32]
-
- ret
-
-fft32_interleave_avx:
- call fft32_avx
- mov r2d, 32
-.deint_loop:
- mova m2, Z(0)
- mova m3, Z(1)
- vunpcklps m0, m2, m3
- vunpckhps m1, m2, m3
- vextractf128 Z(0), m0, 0
- vextractf128 ZH(0), m1, 0
- vextractf128 Z(1), m0, 1
- vextractf128 ZH(1), m1, 1
- add r0, mmsize*2
- sub r2d, mmsize/4
- jg .deint_loop
- ret
-
-%endif
-
-INIT_XMM sse
-
-align 16
-fft4_avx:
-fft4_sse:
- mova m0, Z(0)
- mova m1, Z(1)
- T4_SSE m0, m1, m2
- mova Z(0), m0
- mova Z(1), m1
- ret
-
-align 16
-fft8_sse:
- mova m0, Z(0)
- mova m1, Z(1)
- T4_SSE m0, m1, m2
- mova m2, Z(2)
- mova m3, Z(3)
- T8_SSE m0, m1, m2, m3, m4, m5
- mova Z(0), m0
- mova Z(1), m1
- mova Z(2), m2
- mova Z(3), m3
- ret
-
-align 16
-fft16_sse:
- mova m0, Z(0)
- mova m1, Z(1)
- T4_SSE m0, m1, m2
- mova m2, Z(2)
- mova m3, Z(3)
- T8_SSE m0, m1, m2, m3, m4, m5
- mova m4, Z(4)
- mova m5, Z(5)
- mova Z(0), m0
- mova Z(1), m1
- mova Z(2), m2
- mova Z(3), m3
- T4_SSE m4, m5, m6
- mova m6, Z2(6)
- mova m7, Z2(7)
- T4_SSE m6, m7, m0
- PASS_SMALL 0, [cos_16], [cos_16+16]
- ret
-
-
-%macro FFT48_3DNOW 0
-align 16
-fft4 %+ SUFFIX:
- T2_3DNOW m0, m1, Z(0), Z(1)
- mova m2, Z(2)
- mova m3, Z(3)
- T4_3DNOW m0, m1, m2, m3, m4, m5
- PUNPCK m0, m1, m4
- PUNPCK m2, m3, m5
- mova Z(0), m0
- mova Z(1), m4
- mova Z(2), m2
- mova Z(3), m5
- ret
-
-align 16
-fft8 %+ SUFFIX:
- T2_3DNOW m0, m1, Z(0), Z(1)
- mova m2, Z(2)
- mova m3, Z(3)
- T4_3DNOW m0, m1, m2, m3, m4, m5
- mova Z(0), m0
- mova Z(2), m2
- T2_3DNOW m4, m5, Z(4), Z(5)
- T2_3DNOW m6, m7, Z2(6), Z2(7)
- PSWAPD m0, m5
- PSWAPD m2, m7
- pxor m0, [ps_m1p1]
- pxor m2, [ps_m1p1]
- pfsub m5, m0
- pfadd m7, m2
- pfmul m5, [ps_root2]
- pfmul m7, [ps_root2]
- T4_3DNOW m1, m3, m5, m7, m0, m2
- mova Z(5), m5
- mova Z2(7), m7
- mova m0, Z(0)
- mova m2, Z(2)
- T4_3DNOW m0, m2, m4, m6, m5, m7
- PUNPCK m0, m1, m5
- PUNPCK m2, m3, m7
- mova Z(0), m0
- mova Z(1), m5
- mova Z(2), m2
- mova Z(3), m7
- PUNPCK m4, Z(5), m5
- PUNPCK m6, Z2(7), m7
- mova Z(4), m4
- mova Z(5), m5
- mova Z2(6), m6
- mova Z2(7), m7
- ret
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX 3dnowext
-FFT48_3DNOW
-
-INIT_MMX 3dnow
-FFT48_3DNOW
-%endif
-
-%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
-%define Z2(x) [zcq + o3q + mmsize*(x&1)]
-%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
-%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]
-
-%macro DECL_PASS 2+ ; name, payload
-align 16
-%1:
-DEFINE_ARGS zc, w, n, o1, o3
- lea o3q, [nq*3]
- lea o1q, [nq*8]
- shl o3q, 4
-.loop:
- %2
- add zcq, mmsize*2
- add wq, mmsize
- sub nd, mmsize/8
- jg .loop
- rep ret
-%endmacro
-
-%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
- lea r2, [dispatch_tab%1]
- mov r2, [r2 + (%2q-2)*gprsize]
-%ifdef PIC
- lea r3, [$$]
- add r2, r3
-%endif
- call r2
-%endmacro ; FFT_DISPATCH
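
FFT_DISPATCH picks the routine for a 2^nbits-point transform from a table indexed by nbits - 2 (FFT4 being the smallest entry), adding the section base when assembled as position-independent code. In C terms the idea reduces to an indexed call through a function-pointer table (illustrative sketch):

    /* Illustrative equivalent of the FFT_DISPATCH jump table. */
    typedef void (*fft_func)(float *z);

    static void fft_dispatch(const fft_func *dispatch_tab, int nbits, float *z)
    {
        dispatch_tab[nbits - 2](z);   /* nbits >= 2, i.e. at least an FFT4 */
    }
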
-
-INIT_YMM avx
-
-%if HAVE_AVX_EXTERNAL
-%macro INTERL_AVX 5
- vunpckhps %3, %2, %1
- vunpcklps %2, %2, %1
- vextractf128 %4(%5), %2, 0
- vextractf128 %4 %+ H(%5), %3, 0
- vextractf128 %4(%5 + 1), %2, 1
- vextractf128 %4 %+ H(%5 + 1), %3, 1
-%endmacro
-
-%define INTERL INTERL_AVX
-
-DECL_PASS pass_avx, PASS_BIG 1
-DECL_PASS pass_interleave_avx, PASS_BIG 0
-
-cglobal fft_calc, 2,5,8
- mov r3d, [r0 + FFTContext.nbits]
- mov r0, r1
- mov r1, r3
- FFT_DISPATCH _interleave %+ SUFFIX, r1
- REP_RET
-
-%endif
-
-INIT_XMM sse
-
-%macro INTERL_SSE 5
- mova %3, %2
- unpcklps %2, %1
- unpckhps %3, %1
- mova %4(%5), %2
- mova %4(%5+1), %3
-%endmacro
-
-%define INTERL INTERL_SSE
-
-DECL_PASS pass_sse, PASS_BIG 1
-DECL_PASS pass_interleave_sse, PASS_BIG 0
-
-%macro FFT_CALC_FUNC 0
-cglobal fft_calc, 2,5,8
- mov r3d, [r0 + FFTContext.nbits]
- PUSH r1
- PUSH r3
- mov r0, r1
- mov r1, r3
- FFT_DISPATCH _interleave %+ SUFFIX, r1
- POP rcx
- POP r4
- cmp rcx, 3+(mmsize/16)
- jg .end
- mov r2, -1
- add rcx, 3
- shl r2, cl
- sub r4, r2
-.loop:
-%if mmsize == 8
- PSWAPD m0, [r4 + r2 + 4]
- mova [r4 + r2 + 4], m0
-%else
- movaps xmm0, [r4 + r2]
- movaps xmm1, xmm0
- unpcklps xmm0, [r4 + r2 + 16]
- unpckhps xmm1, [r4 + r2 + 16]
- movaps [r4 + r2], xmm0
- movaps [r4 + r2 + 16], xmm1
-%endif
- add r2, mmsize*2
- jl .loop
-.end:
-%if cpuflag(3dnow)
- femms
- RET
-%else
- REP_RET
-%endif
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX 3dnow
-FFT_CALC_FUNC
-INIT_MMX 3dnowext
-FFT_CALC_FUNC
-%endif
-INIT_XMM sse
-FFT_CALC_FUNC
-
-cglobal fft_permute, 2,7,1
- mov r4, [r0 + FFTContext.revtab]
- mov r5, [r0 + FFTContext.tmpbuf]
- mov ecx, [r0 + FFTContext.nbits]
- mov r2, 1
- shl r2, cl
- xor r0, r0
-%if ARCH_X86_32
- mov r1, r1m
-%endif
-.loop:
- movaps xmm0, [r1 + 8*r0]
- movzx r6, word [r4 + 2*r0]
- movzx r3, word [r4 + 2*r0 + 2]
- movlps [r5 + 8*r6], xmm0
- movhps [r5 + 8*r3], xmm0
- add r0, 2
- cmp r0, r2
- jl .loop
- shl r2, 3
- add r1, r2
- add r5, r2
- neg r2
-; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
-.loopcopy:
- movaps xmm0, [r5 + r2]
- movaps xmm1, [r5 + r2 + 16]
- movaps [r1 + r2], xmm0
- movaps [r1 + r2 + 16], xmm1
- add r2, 32
- jl .loopcopy
- REP_RET
-
-%macro IMDCT_CALC_FUNC 0
-cglobal imdct_calc, 3,5,3
- mov r3d, [r0 + FFTContext.mdctsize]
- mov r4, [r0 + FFTContext.imdcthalf]
- add r1, r3
- PUSH r3
- PUSH r1
-%if ARCH_X86_32
- push r2
- push r1
- push r0
-%else
- sub rsp, 8+32*WIN64 ; allocate win64 shadow space
-%endif
- call r4
-%if ARCH_X86_32
- add esp, 12
-%else
- add rsp, 8+32*WIN64
-%endif
- POP r1
- POP r3
- lea r0, [r1 + 2*r3]
- mov r2, r3
- sub r3, mmsize
- neg r2
- mova m2, [ps_m1m1m1m1]
-.loop:
-%if mmsize == 8
- PSWAPD m0, [r1 + r3]
- PSWAPD m1, [r0 + r2]
- pxor m0, m2
-%else
- mova m0, [r1 + r3]
- mova m1, [r0 + r2]
- shufps m0, m0, 0x1b
- shufps m1, m1, 0x1b
- xorps m0, m2
-%endif
- mova [r0 + r3], m1
- mova [r1 + r2], m0
- sub r3, mmsize
- add r2, mmsize
- jl .loop
-%if cpuflag(3dnow)
- femms
- RET
-%else
- REP_RET
-%endif
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX 3dnow
-IMDCT_CALC_FUNC
-INIT_MMX 3dnowext
-IMDCT_CALC_FUNC
-%endif
-
-INIT_XMM sse
-IMDCT_CALC_FUNC
-
-%if ARCH_X86_32
-INIT_MMX 3dnow
-%define mulps pfmul
-%define addps pfadd
-%define subps pfsub
-%define unpcklps punpckldq
-%define unpckhps punpckhdq
-DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
-DECL_PASS pass_interleave_3dnow, PASS_BIG 0
-%define pass_3dnowext pass_3dnow
-%define pass_interleave_3dnowext pass_interleave_3dnow
-%endif
-
-%ifdef PIC
-%define SECTION_REL - $$
-%else
-%define SECTION_REL
-%endif
-
-%macro DECL_FFT 1-2 ; nbits, suffix
-%ifidn %0, 1
-%xdefine fullsuffix SUFFIX
-%else
-%xdefine fullsuffix %2 %+ SUFFIX
-%endif
-%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
-%if %1>=5
-%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
-%endif
-%if %1>=6
-%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
-%endif
-
-%assign n 1<<%1
-%rep 17-%1
-%assign n2 n/2
-%assign n4 n/4
-%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
-
-align 16
-fft %+ n %+ fullsuffix:
- call fft %+ n2 %+ SUFFIX
- add r0, n*4 - (n&(-2<<%1))
- call fft %+ n4 %+ SUFFIX
- add r0, n*2 - (n2&(-2<<%1))
- call fft %+ n4 %+ SUFFIX
- sub r0, n*6 + (n2&(-2<<%1))
- lea r1, [cos_ %+ n]
- mov r2d, n4/2
- jmp pass %+ fullsuffix
-
-%assign n n*2
-%endrep
-%undef n
-
-align 8
-dispatch_tab %+ fullsuffix: pointer list_of_fft
-%endmacro ; DECL_FFT
-
-%if HAVE_AVX_EXTERNAL
-INIT_YMM avx
-DECL_FFT 6
-DECL_FFT 6, _interleave
-%endif
-INIT_XMM sse
-DECL_FFT 5
-DECL_FFT 5, _interleave
-%if ARCH_X86_32
-INIT_MMX 3dnow
-DECL_FFT 4
-DECL_FFT 4, _interleave
-INIT_MMX 3dnowext
-DECL_FFT 4
-DECL_FFT 4, _interleave
-%endif
-
-INIT_XMM sse
-%undef mulps
-%undef addps
-%undef subps
-%undef unpcklps
-%undef unpckhps
-
-%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
-%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
- PSWAPD m0, [%3+%2*4]
- movq m2, [%3+%1*4-8]
- movq m3, m0
- punpckldq m0, m2
- punpckhdq m2, m3
- movd m1, [%4+%1*2-4] ; tcos[j]
- movd m3, [%4+%2*2] ; tcos[n4-j-1]
- punpckldq m1, [%5+%1*2-4] ; tsin[j]
- punpckldq m3, [%5+%2*2] ; tsin[n4-j-1]
-
- mova m4, m0
- PSWAPD m5, m1
- pfmul m0, m1
- pfmul m4, m5
- mova m6, m2
- PSWAPD m5, m3
- pfmul m2, m3
- pfmul m6, m5
-%if cpuflag(3dnowext)
- pfpnacc m0, m4
- pfpnacc m2, m6
-%else
- SBUTTERFLY dq, 0, 4, 1
- SBUTTERFLY dq, 2, 6, 3
- pxor m4, m7
- pxor m6, m7
- pfadd m0, m4
- pfadd m2, m6
-%endif
-%else
- movaps xmm0, [%3+%2*4]
- movaps xmm1, [%3+%1*4-0x10]
- movaps xmm2, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm1, xmm2, 0x77
- movlps xmm4, [%4+%2*2]
- movlps xmm5, [%5+%2*2+0x0]
- movhps xmm4, [%4+%1*2-0x8]
- movhps xmm5, [%5+%1*2-0x8]
- movaps xmm2, xmm0
- movaps xmm3, xmm1
- mulps xmm0, xmm5
- mulps xmm1, xmm4
- mulps xmm2, xmm4
- mulps xmm3, xmm5
- subps xmm1, xmm0
- addps xmm2, xmm3
- movaps xmm0, xmm1
- unpcklps xmm1, xmm2
- unpckhps xmm0, xmm2
-%endif
-%endmacro
-
-%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
- mulps m6, %3, [%5+%1]
- mulps m7, %2, [%5+%1]
- mulps %2, %2, [%6+%1]
- mulps %3, %3, [%6+%1]
- subps %2, %2, m6
- addps %3, %3, m7
-%endmacro
-
-%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
-.post:
- vmovaps ymm1, [%3+%1*2]
- vmovaps ymm0, [%3+%1*2+0x20]
- vmovaps ymm3, [%3+%2*2]
- vmovaps ymm2, [%3+%2*2+0x20]
-
- CMUL %1, ymm0, ymm1, %3, %4, %5
- CMUL %2, ymm2, ymm3, %3, %4, %5
- vshufps ymm1, ymm1, ymm1, 0x1b
- vshufps ymm3, ymm3, ymm3, 0x1b
- vperm2f128 ymm1, ymm1, ymm1, 0x01
- vperm2f128 ymm3, ymm3, ymm3, 0x01
- vunpcklps ymm6, ymm2, ymm1
- vunpckhps ymm4, ymm2, ymm1
- vunpcklps ymm7, ymm0, ymm3
- vunpckhps ymm5, ymm0, ymm3
-
- vextractf128 [%3+%1*2], ymm7, 0
- vextractf128 [%3+%1*2+0x10], ymm5, 0
- vextractf128 [%3+%1*2+0x20], ymm7, 1
- vextractf128 [%3+%1*2+0x30], ymm5, 1
-
- vextractf128 [%3+%2*2], ymm6, 0
- vextractf128 [%3+%2*2+0x10], ymm4, 0
- vextractf128 [%3+%2*2+0x20], ymm6, 1
- vextractf128 [%3+%2*2+0x30], ymm4, 1
- sub %2, 0x20
- add %1, 0x20
- jl .post
-%endmacro
-
-%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
-.post:
- movaps xmm1, [%3+%1*2]
- movaps xmm0, [%3+%1*2+0x10]
- CMUL %1, xmm0, xmm1, %3, %4, %5
- movaps xmm5, [%3+%2*2]
- movaps xmm4, [%3+%2*2+0x10]
- CMUL %2, xmm4, xmm5, %3, %4, %5
- shufps xmm1, xmm1, 0x1b
- shufps xmm5, xmm5, 0x1b
- movaps xmm6, xmm4
- unpckhps xmm4, xmm1
- unpcklps xmm6, xmm1
- movaps xmm2, xmm0
- unpcklps xmm0, xmm5
- unpckhps xmm2, xmm5
- movaps [%3+%2*2], xmm6
- movaps [%3+%2*2+0x10], xmm4
- movaps [%3+%1*2], xmm0
- movaps [%3+%1*2+0x10], xmm2
- sub %2, 0x10
- add %1, 0x10
- jl .post
-%endmacro
-
-%macro CMUL_3DNOW 6
- mova m6, [%1+%2*2]
- mova %3, [%1+%2*2+8]
- mova %4, m6
- mova m7, %3
- pfmul m6, [%5+%2]
- pfmul %3, [%6+%2]
- pfmul %4, [%6+%2]
- pfmul m7, [%5+%2]
- pfsub %3, m6
- pfadd %4, m7
-%endmacro
-
-%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8
-.post:
- CMUL_3DNOW %3, %1, m0, m1, %4, %5
- CMUL_3DNOW %3, %2, m2, m3, %4, %5
- movd [%3+%1*2+ 0], m0
- movd [%3+%2*2+12], m1
- movd [%3+%2*2+ 0], m2
- movd [%3+%1*2+12], m3
- psrlq m0, 32
- psrlq m1, 32
- psrlq m2, 32
- psrlq m3, 32
- movd [%3+%1*2+ 8], m0
- movd [%3+%2*2+ 4], m1
- movd [%3+%2*2+ 8], m2
- movd [%3+%1*2+ 4], m3
- sub %2, 8
- add %1, 8
- jl .post
-%endmacro
-
-%macro DECL_IMDCT 1
-cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
-%if ARCH_X86_64
-%define rrevtab r7
-%define rtcos r8
-%define rtsin r9
-%else
-%define rrevtab r6
-%define rtsin r6
-%define rtcos r5
-%endif
- mov r3d, [r0+FFTContext.mdctsize]
- add r2, r3
- shr r3, 1
- mov rtcos, [r0+FFTContext.tcos]
- mov rtsin, [r0+FFTContext.tsin]
- add rtcos, r3
- add rtsin, r3
-%if ARCH_X86_64 == 0
- push rtcos
- push rtsin
-%endif
- shr r3, 1
- mov rrevtab, [r0+FFTContext.revtab]
- add rrevtab, r3
-%if ARCH_X86_64 == 0
- push rrevtab
-%endif
-
-%if mmsize == 8
- sub r3, 2
-%else
- sub r3, 4
-%endif
-%if ARCH_X86_64 || mmsize == 8
- xor r4, r4
- sub r4, r3
-%endif
-%if notcpuflag(3dnowext) && mmsize == 8
- movd m7, [ps_m1m1m1m1]
-%endif
-.pre:
-%if ARCH_X86_64 == 0
-;unspill
-%if mmsize != 8
- xor r4, r4
- sub r4, r3
-%endif
- mov rtcos, [esp+8]
- mov rtsin, [esp+4]
-%endif
-
- PREROTATER r4, r3, r2, rtcos, rtsin
-%if mmsize == 8
- mov r6, [esp] ; rrevtab = ptr+n8
- movzx r5, word [rrevtab+r4-2] ; rrevtab[j]
- movzx r6, word [rrevtab+r3] ; rrevtab[n4-j-1]
- mova [r1+r5*8], m0
- mova [r1+r6*8], m2
- add r4, 2
- sub r3, 2
-%else
-%if ARCH_X86_64
- movzx r5, word [rrevtab+r4-4]
- movzx r6, word [rrevtab+r4-2]
- movzx r10, word [rrevtab+r3]
- movzx r11, word [rrevtab+r3+2]
- movlps [r1+r5 *8], xmm0
- movhps [r1+r6 *8], xmm0
- movlps [r1+r10*8], xmm1
- movhps [r1+r11*8], xmm1
- add r4, 4
-%else
- mov r6, [esp]
- movzx r5, word [r6+r4-4]
- movzx r4, word [r6+r4-2]
- movlps [r1+r5*8], xmm0
- movhps [r1+r4*8], xmm0
- movzx r5, word [r6+r3]
- movzx r4, word [r6+r3+2]
- movlps [r1+r5*8], xmm1
- movhps [r1+r4*8], xmm1
-%endif
- sub r3, 4
-%endif
- jns .pre
-
- mov r5, r0
- mov r6, r1
- mov r0, r1
- mov r1d, [r5+FFTContext.nbits]
-
- FFT_DISPATCH SUFFIX, r1
-
- mov r0d, [r5+FFTContext.mdctsize]
- add r6, r0
- shr r0, 1
-%if ARCH_X86_64 == 0
-%define rtcos r2
-%define rtsin r3
- mov rtcos, [esp+8]
- mov rtsin, [esp+4]
-%endif
- neg r0
- mov r1, -mmsize
- sub r1, r0
- %1 r0, r1, r6, rtcos, rtsin
-%if ARCH_X86_64 == 0
- add esp, 12
-%endif
-%if mmsize == 8
- femms
-%endif
- RET
-%endmacro
-
-DECL_IMDCT POSROTATESHUF
-
-%if ARCH_X86_32
-INIT_MMX 3dnow
-DECL_IMDCT POSROTATESHUF_3DNOW
-
-INIT_MMX 3dnowext
-DECL_IMDCT POSROTATESHUF_3DNOW
-%endif
-
-INIT_YMM avx
-
-%if HAVE_AVX_EXTERNAL
-DECL_IMDCT POSROTATESHUF_AVX
-%endif
diff --git a/ffmpeg/libavcodec/x86/fft.h b/ffmpeg/libavcodec/x86/fft.h
deleted file mode 100644
index 398091e..0000000
--- a/ffmpeg/libavcodec/x86/fft.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_FFT_H
-#define AVCODEC_X86_FFT_H
-
-#include "libavcodec/fft.h"
-
-void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
-void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
-void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
-void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
-void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z);
-
-void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-#endif /* AVCODEC_X86_FFT_H */
diff --git a/ffmpeg/libavcodec/x86/fft_init.c b/ffmpeg/libavcodec/x86/fft_init.c
deleted file mode 100644
index 5682230..0000000
--- a/ffmpeg/libavcodec/x86/fft_init.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/cpu.h"
-#include "fft.h"
-
-av_cold void ff_fft_init_x86(FFTContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
-#if ARCH_X86_32
- if (EXTERNAL_AMD3DNOW(cpu_flags)) {
- /* 3DNow! for K6-2/3 */
- s->imdct_calc = ff_imdct_calc_3dnow;
- s->imdct_half = ff_imdct_half_3dnow;
- s->fft_calc = ff_fft_calc_3dnow;
- }
- if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
- /* 3DNowEx for K7 */
- s->imdct_calc = ff_imdct_calc_3dnowext;
- s->imdct_half = ff_imdct_half_3dnowext;
- s->fft_calc = ff_fft_calc_3dnowext;
- }
-#endif
- if (EXTERNAL_SSE(cpu_flags)) {
- /* SSE for P3/P4/K8 */
- s->imdct_calc = ff_imdct_calc_sse;
- s->imdct_half = ff_imdct_half_sse;
- s->fft_permute = ff_fft_permute_sse;
- s->fft_calc = ff_fft_calc_sse;
- s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
- }
- if (EXTERNAL_AVX(cpu_flags) && s->nbits >= 5) {
- /* AVX for SB */
- s->imdct_half = ff_imdct_half_avx;
- s->fft_calc = ff_fft_calc_avx;
- s->fft_permutation = FF_FFT_PERM_AVX;
- }
-}
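
Callers never reference these SIMD symbols directly: the init above installs them into the context, and everything goes through the function pointers. Through the public wrappers in libavcodec/avfft.h the flow is roughly as follows (hedged usage sketch):

    #include "libavcodec/avfft.h"

    /* 1 << 10 = 1024-point forward FFT over buf, in place. */
    void fft_example(FFTComplex *buf)
    {
        FFTContext *ctx = av_fft_init(10, 0);
        if (!ctx)
            return;
        av_fft_permute(ctx, buf);   /* reorder input as the transform expects */
        av_fft_calc(ctx, buf);      /* dispatches to the SSE/AVX code above */
        av_fft_end(ctx);
    }
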
diff --git a/ffmpeg/libavcodec/x86/fmtconvert.asm b/ffmpeg/libavcodec/x86/fmtconvert.asm
deleted file mode 100644
index 60078e2..0000000
--- a/ffmpeg/libavcodec/x86/fmtconvert.asm
+++ /dev/null
@@ -1,429 +0,0 @@
-;******************************************************************************
-;* x86 optimized Format Conversion Utils
-;* Copyright (c) 2008 Loren Merritt
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_TEXT
-
-%macro CVTPS2PI 2
-%if cpuflag(sse)
- cvtps2pi %1, %2
-%elif cpuflag(3dnow)
- pf2id %1, %2
-%endif
-%endmacro
-
-;---------------------------------------------------------------------------------
-; void int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul, int len);
-;---------------------------------------------------------------------------------
-%macro INT32_TO_FLOAT_FMUL_SCALAR 1
-%if UNIX64
-cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
-%else
-cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
-%endif
-%if WIN64
- SWAP 0, 2
-%elif ARCH_X86_32
- movss m0, mulm
-%endif
- SPLATD m0
- shl lenq, 2
- add srcq, lenq
- add dstq, lenq
- neg lenq
-.loop:
-%if cpuflag(sse2)
- cvtdq2ps m1, [srcq+lenq ]
- cvtdq2ps m2, [srcq+lenq+16]
-%else
- cvtpi2ps m1, [srcq+lenq ]
- cvtpi2ps m3, [srcq+lenq+ 8]
- cvtpi2ps m2, [srcq+lenq+16]
- cvtpi2ps m4, [srcq+lenq+24]
- movlhps m1, m3
- movlhps m2, m4
-%endif
- mulps m1, m0
- mulps m2, m0
- mova [dstq+lenq ], m1
- mova [dstq+lenq+16], m2
- add lenq, 32
- jl .loop
- REP_RET
-%endmacro
-
-INIT_XMM sse
-INT32_TO_FLOAT_FMUL_SCALAR 5
-INIT_XMM sse2
-INT32_TO_FLOAT_FMUL_SCALAR 3
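
The contract of int32_to_float_fmul_scalar is a straight convert-and-scale; the loops above consume 32 bytes per iteration, so len is expected to be a multiple of 8 with aligned buffers. Scalar reference (illustrative):

    #include <stdint.h>

    static void int32_to_float_fmul_scalar_ref(float *dst, const int32_t *src,
                                               float mul, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src[i] * mul;
    }
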
-
-
-;------------------------------------------------------------------------------
-; void ff_float_to_int16(int16_t *dst, const float *src, long len);
-;------------------------------------------------------------------------------
-%macro FLOAT_TO_INT16 1
-cglobal float_to_int16, 3, 3, %1, dst, src, len
- add lenq, lenq
- lea srcq, [srcq+2*lenq]
- add dstq, lenq
- neg lenq
-.loop:
-%if cpuflag(sse2)
- cvtps2dq m0, [srcq+2*lenq ]
- cvtps2dq m1, [srcq+2*lenq+16]
- packssdw m0, m1
- mova [dstq+lenq], m0
-%else
- CVTPS2PI m0, [srcq+2*lenq ]
- CVTPS2PI m1, [srcq+2*lenq+ 8]
- CVTPS2PI m2, [srcq+2*lenq+16]
- CVTPS2PI m3, [srcq+2*lenq+24]
- packssdw m0, m1
- packssdw m2, m3
- mova [dstq+lenq ], m0
- mova [dstq+lenq+8], m2
-%endif
- add lenq, 16
- js .loop
-%if mmsize == 8
- emms
-%endif
- REP_RET
-%endmacro
-
-INIT_XMM sse2
-FLOAT_TO_INT16 2
-INIT_MMX sse
-FLOAT_TO_INT16 0
-INIT_MMX 3dnow
-FLOAT_TO_INT16 0
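
float_to_int16 converts with rounding (cvtps2dq) and then saturates to the int16_t range (packssdw). A scalar sketch of the same behaviour (illustrative):

    #include <math.h>
    #include <stdint.h>

    static void float_to_int16_ref(int16_t *dst, const float *src, long len)
    {
        for (long i = 0; i < len; i++) {
            long v = lrintf(src[i]);        /* round to nearest, like cvtps2dq */
            if (v < -32768) v = -32768;     /* saturate, like packssdw */
            if (v >  32767) v =  32767;
            dst[i] = (int16_t)v;
        }
    }
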
-
-;------------------------------------------------------------------------------
-; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
-;------------------------------------------------------------------------------
-%macro FLOAT_TO_INT16_STEP 1
-cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2
- add lenq, lenq
- lea srcq, [srcq+2*lenq]
- lea step3q, [stepq*3]
- neg lenq
-.loop:
-%if cpuflag(sse2)
- cvtps2dq m0, [srcq+2*lenq ]
- cvtps2dq m1, [srcq+2*lenq+16]
- packssdw m0, m1
- movd v1d, m0
- psrldq m0, 4
- movd v2d, m0
- psrldq m0, 4
- mov [dstq], v1w
- mov [dstq+stepq*4], v2w
- shr v1d, 16
- shr v2d, 16
- mov [dstq+stepq*2], v1w
- mov [dstq+step3q*2], v2w
- lea dstq, [dstq+stepq*8]
- movd v1d, m0
- psrldq m0, 4
- movd v2d, m0
- mov [dstq], v1w
- mov [dstq+stepq*4], v2w
- shr v1d, 16
- shr v2d, 16
- mov [dstq+stepq*2], v1w
- mov [dstq+step3q*2], v2w
- lea dstq, [dstq+stepq*8]
-%else
- CVTPS2PI m0, [srcq+2*lenq ]
- CVTPS2PI m1, [srcq+2*lenq+ 8]
- CVTPS2PI m2, [srcq+2*lenq+16]
- CVTPS2PI m3, [srcq+2*lenq+24]
- packssdw m0, m1
- packssdw m2, m3
- movd v1d, m0
- psrlq m0, 32
- movd v2d, m0
- mov [dstq], v1w
- mov [dstq+stepq*4], v2w
- shr v1d, 16
- shr v2d, 16
- mov [dstq+stepq*2], v1w
- mov [dstq+step3q*2], v2w
- lea dstq, [dstq+stepq*8]
- movd v1d, m2
- psrlq m2, 32
- movd v2d, m2
- mov [dstq], v1w
- mov [dstq+stepq*4], v2w
- shr v1d, 16
- shr v2d, 16
- mov [dstq+stepq*2], v1w
- mov [dstq+step3q*2], v2w
- lea dstq, [dstq+stepq*8]
-%endif
- add lenq, 16
- js .loop
-%if mmsize == 8
- emms
-%endif
- REP_RET
-%endmacro
-
-INIT_XMM sse2
-FLOAT_TO_INT16_STEP 2
-INIT_MMX sse
-FLOAT_TO_INT16_STEP 0
-INIT_MMX 3dnow
-FLOAT_TO_INT16_STEP 0
-
-;-------------------------------------------------------------------------------
-; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
-;-------------------------------------------------------------------------------
-%macro FLOAT_TO_INT16_INTERLEAVE2 0
-cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len
- lea lenq, [4*r2q]
- mov src1q, [src0q+gprsize]
- mov src0q, [src0q]
- add dstq, lenq
- add src0q, lenq
- add src1q, lenq
- neg lenq
-.loop:
-%if cpuflag(sse2)
- cvtps2dq m0, [src0q+lenq]
- cvtps2dq m1, [src1q+lenq]
- packssdw m0, m1
- movhlps m1, m0
- punpcklwd m0, m1
- mova [dstq+lenq], m0
-%else
- CVTPS2PI m0, [src0q+lenq ]
- CVTPS2PI m1, [src0q+lenq+8]
- CVTPS2PI m2, [src1q+lenq ]
- CVTPS2PI m3, [src1q+lenq+8]
- packssdw m0, m1
- packssdw m2, m3
- mova m1, m0
- punpcklwd m0, m2
- punpckhwd m1, m2
- mova [dstq+lenq ], m0
- mova [dstq+lenq+8], m1
-%endif
- add lenq, 16
- js .loop
-%if mmsize == 8
- emms
-%endif
- REP_RET
-%endmacro
-
-INIT_MMX 3dnow
-FLOAT_TO_INT16_INTERLEAVE2
-INIT_MMX sse
-FLOAT_TO_INT16_INTERLEAVE2
-INIT_XMM sse2
-FLOAT_TO_INT16_INTERLEAVE2
-
-%macro FLOAT_TO_INT16_INTERLEAVE6 0
-; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
-cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len
-%if ARCH_X86_64
- mov lend, r2d
-%else
- %define lend dword r2m
-%endif
- mov src1q, [srcq+1*gprsize]
- mov src2q, [srcq+2*gprsize]
- mov src3q, [srcq+3*gprsize]
- mov src4q, [srcq+4*gprsize]
- mov src5q, [srcq+5*gprsize]
- mov srcq, [srcq]
- sub src1q, srcq
- sub src2q, srcq
- sub src3q, srcq
- sub src4q, srcq
- sub src5q, srcq
-.loop:
- CVTPS2PI mm0, [srcq]
- CVTPS2PI mm1, [srcq+src1q]
- CVTPS2PI mm2, [srcq+src2q]
- CVTPS2PI mm3, [srcq+src3q]
- CVTPS2PI mm4, [srcq+src4q]
- CVTPS2PI mm5, [srcq+src5q]
- packssdw mm0, mm3
- packssdw mm1, mm4
- packssdw mm2, mm5
- PSWAPD mm3, mm0
- punpcklwd mm0, mm1
- punpckhwd mm1, mm2
- punpcklwd mm2, mm3
- PSWAPD mm3, mm0
- punpckldq mm0, mm2
- punpckhdq mm2, mm1
- punpckldq mm1, mm3
- movq [dstq ], mm0
- movq [dstq+16], mm2
- movq [dstq+ 8], mm1
- add srcq, 8
- add dstq, 24
- sub lend, 2
- jg .loop
- emms
- RET
-%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
-
-INIT_MMX sse
-FLOAT_TO_INT16_INTERLEAVE6
-INIT_MMX 3dnow
-FLOAT_TO_INT16_INTERLEAVE6
-INIT_MMX 3dnowext
-FLOAT_TO_INT16_INTERLEAVE6
-
-;-----------------------------------------------------------------------------
-; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
-;-----------------------------------------------------------------------------
-
-%macro FLOAT_INTERLEAVE6 1
-cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len
-%if ARCH_X86_64
- mov lend, r2d
-%else
- %define lend dword r2m
-%endif
- mov src1q, [srcq+1*gprsize]
- mov src2q, [srcq+2*gprsize]
- mov src3q, [srcq+3*gprsize]
- mov src4q, [srcq+4*gprsize]
- mov src5q, [srcq+5*gprsize]
- mov srcq, [srcq]
- sub src1q, srcq
- sub src2q, srcq
- sub src3q, srcq
- sub src4q, srcq
- sub src5q, srcq
-.loop:
-%if cpuflag(sse)
- movaps m0, [srcq]
- movaps m1, [srcq+src1q]
- movaps m2, [srcq+src2q]
- movaps m3, [srcq+src3q]
- movaps m4, [srcq+src4q]
- movaps m5, [srcq+src5q]
-
- SBUTTERFLYPS 0, 1, 6
- SBUTTERFLYPS 2, 3, 6
- SBUTTERFLYPS 4, 5, 6
-
- movaps m6, m4
- shufps m4, m0, 0xe4
- movlhps m0, m2
- movhlps m6, m2
- movaps [dstq ], m0
- movaps [dstq+16], m4
- movaps [dstq+32], m6
-
- movaps m6, m5
- shufps m5, m1, 0xe4
- movlhps m1, m3
- movhlps m6, m3
- movaps [dstq+48], m1
- movaps [dstq+64], m5
- movaps [dstq+80], m6
-%else ; mmx
- movq m0, [srcq]
- movq m1, [srcq+src1q]
- movq m2, [srcq+src2q]
- movq m3, [srcq+src3q]
- movq m4, [srcq+src4q]
- movq m5, [srcq+src5q]
-
- SBUTTERFLY dq, 0, 1, 6
- SBUTTERFLY dq, 2, 3, 6
- SBUTTERFLY dq, 4, 5, 6
- movq [dstq ], m0
- movq [dstq+ 8], m2
- movq [dstq+16], m4
- movq [dstq+24], m1
- movq [dstq+32], m3
- movq [dstq+40], m5
-%endif
- add srcq, mmsize
- add dstq, mmsize*6
- sub lend, mmsize/4
- jg .loop
-%if mmsize == 8
- emms
-%endif
- REP_RET
-%endmacro
-
-INIT_MMX mmx
-FLOAT_INTERLEAVE6 0
-INIT_XMM sse
-FLOAT_INTERLEAVE6 7
-
-;-----------------------------------------------------------------------------
-; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
-;-----------------------------------------------------------------------------
-
-%macro FLOAT_INTERLEAVE2 1
-cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
- mov src1q, [srcq+gprsize]
- mov srcq, [srcq ]
- sub src1q, srcq
-.loop:
- mova m0, [srcq ]
- mova m1, [srcq+src1q ]
- mova m3, [srcq +mmsize]
- mova m4, [srcq+src1q+mmsize]
-
- mova m2, m0
- PUNPCKLDQ m0, m1
- PUNPCKHDQ m2, m1
-
- mova m1, m3
- PUNPCKLDQ m3, m4
- PUNPCKHDQ m1, m4
-
- mova [dstq ], m0
- mova [dstq+1*mmsize], m2
- mova [dstq+2*mmsize], m3
- mova [dstq+3*mmsize], m1
-
- add srcq, mmsize*2
- add dstq, mmsize*4
- sub lend, mmsize/2
- jg .loop
-%if mmsize == 8
- emms
-%endif
- REP_RET
-%endmacro
-
-INIT_MMX mmx
-%define PUNPCKLDQ punpckldq
-%define PUNPCKHDQ punpckhdq
-FLOAT_INTERLEAVE2 0
-INIT_XMM sse
-%define PUNPCKLDQ unpcklps
-%define PUNPCKHDQ unpckhps
-FLOAT_INTERLEAVE2 5
diff --git a/ffmpeg/libavcodec/x86/fmtconvert_init.c b/ffmpeg/libavcodec/x86/fmtconvert_init.c
deleted file mode 100644
index d300dfd..0000000
--- a/ffmpeg/libavcodec/x86/fmtconvert_init.c
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Format Conversion Utils
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/fmtconvert.h"
-
-#if HAVE_YASM
-
-void ff_int32_to_float_fmul_scalar_sse (float *dst, const int32_t *src, float mul, int len);
-void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int32_t *src, float mul, int len);
-
-void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
-void ff_float_to_int16_sse (int16_t *dst, const float *src, long len);
-void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
-
-void ff_float_to_int16_step_3dnow(int16_t *dst, const float *src, long len, long step);
-void ff_float_to_int16_step_sse (int16_t *dst, const float *src, long len, long step);
-void ff_float_to_int16_step_sse2 (int16_t *dst, const float *src, long len, long step);
-
-void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len);
-void ff_float_to_int16_interleave2_sse (int16_t *dst, const float **src, long len);
-void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len);
-
-void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
-void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
-void ff_float_to_int16_interleave6_3dnowext(int16_t *dst, const float **src, int len);
-
-#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
-
-#define FLOAT_TO_INT16_INTERLEAVE(cpu) \
-/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
-static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
- int c;\
- for(c=0; c<channels; c++){\
- ff_float_to_int16_step_##cpu(dst+c, src[c], len, channels);\
- }\
-}\
-\
-static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
- if(channels==1)\
- ff_float_to_int16_##cpu(dst, src[0], len);\
- else if(channels==2){\
- ff_float_to_int16_interleave2_##cpu(dst, src, len);\
- }else if(channels==6){\
- ff_float_to_int16_interleave6_##cpu(dst, src, len);\
- }else\
- float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
-}
-
-FLOAT_TO_INT16_INTERLEAVE(3dnow)
-FLOAT_TO_INT16_INTERLEAVE(sse)
-FLOAT_TO_INT16_INTERLEAVE(sse2)
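For readability, the sse2 instantiation of the macro above expands to roughly the following (note that ff_float_to_int16_interleave6_sse2 is #defined to the sse version further up):

    static av_noinline void float_to_int16_interleave_misc_sse2(int16_t *dst, const float **src,
                                                                long len, int channels)
    {
        int c;
        for (c = 0; c < channels; c++)
            ff_float_to_int16_step_sse2(dst + c, src[c], len, channels);
    }

    static void float_to_int16_interleave_sse2(int16_t *dst, const float **src,
                                               long len, int channels)
    {
        if (channels == 1)
            ff_float_to_int16_sse2(dst, src[0], len);
        else if (channels == 2)
            ff_float_to_int16_interleave2_sse2(dst, src, len);
        else if (channels == 6)
            ff_float_to_int16_interleave6_sse2(dst, src, len);
        else
            float_to_int16_interleave_misc_sse2(dst, src, len, channels);
    }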
-
-static void float_to_int16_interleave_3dnowext(int16_t *dst, const float **src,
- long len, int channels)
-{
- if(channels==6)
- ff_float_to_int16_interleave6_3dnowext(dst, src, len);
- else
- float_to_int16_interleave_3dnow(dst, src, len, channels);
-}
-
-void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
-void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);
-
-void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len);
-void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len);
-
-static void float_interleave_mmx(float *dst, const float **src,
- unsigned int len, int channels)
-{
- if (channels == 2) {
- ff_float_interleave2_mmx(dst, src, len);
- } else if (channels == 6)
- ff_float_interleave6_mmx(dst, src, len);
- else
- ff_float_interleave_c(dst, src, len, channels);
-}
-
-static void float_interleave_sse(float *dst, const float **src,
- unsigned int len, int channels)
-{
- if (channels == 2) {
- ff_float_interleave2_sse(dst, src, len);
- } else if (channels == 6)
- ff_float_interleave6_sse(dst, src, len);
- else
- ff_float_interleave_c(dst, src, len, channels);
-}
-#endif /* HAVE_YASM */
-
-av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
-{
-#if HAVE_YASM
- int cpu_flags = av_get_cpu_flags();
-
- if (EXTERNAL_MMX(cpu_flags)) {
- c->float_interleave = float_interleave_mmx;
- }
- if (EXTERNAL_AMD3DNOW(cpu_flags)) {
- if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
- c->float_to_int16 = ff_float_to_int16_3dnow;
- c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
- }
- }
- if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
- if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
- c->float_to_int16_interleave = float_to_int16_interleave_3dnowext;
- }
- }
- if (EXTERNAL_SSE(cpu_flags)) {
- c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
- c->float_to_int16 = ff_float_to_int16_sse;
- c->float_to_int16_interleave = float_to_int16_interleave_sse;
- c->float_interleave = float_interleave_sse;
- }
- if (EXTERNAL_SSE2(cpu_flags)) {
- c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
- c->float_to_int16 = ff_float_to_int16_sse2;
- c->float_to_int16_interleave = float_to_int16_interleave_sse2;
- }
-#endif /* HAVE_YASM */
-}
diff --git a/ffmpeg/libavcodec/x86/h263_loopfilter.asm b/ffmpeg/libavcodec/x86/h263_loopfilter.asm
deleted file mode 100644
index a21baf1..0000000
--- a/ffmpeg/libavcodec/x86/h263_loopfilter.asm
+++ /dev/null
@@ -1,189 +0,0 @@
-;******************************************************************************
-;* MMX-optimized H.263 loop filter
-;* Copyright (c) 2003-2013 Michael Niedermayer
-;* Copyright (c) 2013 Daniel Kang
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-cextern pb_FC
-cextern h263_loop_filter_strength
-
-SECTION_TEXT
-
-%macro H263_LOOP_FILTER 5
- pxor m7, m7
- mova m0, [%1]
- mova m1, [%1]
- mova m2, [%4]
- mova m3, [%4]
- punpcklbw m0, m7
- punpckhbw m1, m7
- punpcklbw m2, m7
- punpckhbw m3, m7
- psubw m0, m2
- psubw m1, m3
- mova m2, [%2]
- mova m3, [%2]
- mova m4, [%3]
- mova m5, [%3]
- punpcklbw m2, m7
- punpckhbw m3, m7
- punpcklbw m4, m7
- punpckhbw m5, m7
- psubw m4, m2
- psubw m5, m3
- psllw m4, 2
- psllw m5, 2
- paddw m4, m0
- paddw m5, m1
- pxor m6, m6
- pcmpgtw m6, m4
- pcmpgtw m7, m5
- pxor m4, m6
- pxor m5, m7
- psubw m4, m6
- psubw m5, m7
- psrlw m4, 3
- psrlw m5, 3
- packuswb m4, m5
- packsswb m6, m7
- pxor m7, m7
- movd m2, %5
- punpcklbw m2, m2
- punpcklbw m2, m2
- punpcklbw m2, m2
- psubusb m2, m4
- mova m3, m2
- psubusb m3, m4
- psubb m2, m3
- mova m3, [%2]
- mova m4, [%3]
- pxor m3, m6
- pxor m4, m6
- paddusb m3, m2
- psubusb m4, m2
- pxor m3, m6
- pxor m4, m6
- paddusb m2, m2
- packsswb m0, m1
- pcmpgtb m7, m0
- pxor m0, m7
- psubb m0, m7
- mova m1, m0
- psubusb m0, m2
- psubb m1, m0
- pand m1, [pb_FC]
- psrlw m1, 2
- pxor m1, m7
- psubb m1, m7
- mova m5, [%1]
- mova m6, [%4]
- psubb m5, m1
- paddb m6, m1
-%endmacro
-
-INIT_MMX mmx
-; void h263_v_loop_filter(uint8_t *src, int stride, int qscale)
-cglobal h263_v_loop_filter, 3,5
- movsxdifnidn r1, r1d
- movsxdifnidn r2, r2d
-
- lea r4, [h263_loop_filter_strength]
- movzx r3d, BYTE [r4+r2]
- movsx r2, r3b
- shl r2, 1
-
- mov r3, r0
- sub r3, r1
- mov r4, r3
- sub r4, r1
- H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d
-
- mova [r3], m3
- mova [r0], m4
- mova [r4], m5
- mova [r0+r1], m6
- RET
-
-%macro TRANSPOSE4X4 2
- movd m0, [%1]
- movd m1, [%1+r1]
- movd m2, [%1+r1*2]
- movd m3, [%1+r3]
- punpcklbw m0, m1
- punpcklbw m2, m3
- mova m1, m0
- punpcklwd m0, m2
- punpckhwd m1, m2
- movd [%2+ 0], m0
- punpckhdq m0, m0
- movd [%2+ 8], m0
- movd [%2+16], m1
- punpckhdq m1, m1
- movd [%2+24], m1
-%endmacro
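In scalar terms, the macro transposes a 4x4 block of bytes from stride-pitched source rows into a temporary buffer with an 8-byte row pitch; a sketch with a hypothetical name:

    #include <stdint.h>

    static void transpose4x4_ref(uint8_t *dst /* 8-byte row pitch */,
                                 const uint8_t *src, int stride)
    {
        int row, col;
        for (row = 0; row < 4; row++)
            for (col = 0; col < 4; col++)
                dst[col * 8 + row] = src[row * stride + col];
    }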
-
-
-; void h263_h_loop_filter(uint8_t *src, int stride, int qscale)
-INIT_MMX mmx
-cglobal h263_h_loop_filter, 3,5,0,32
- movsxdifnidn r1, r1d
- movsxdifnidn r2, r2d
-
- lea r4, [h263_loop_filter_strength]
- movzx r3d, BYTE [r4+r2]
- movsx r2, r3b
- shl r2, 1
-
- sub r0, 2
- lea r3, [r1*3]
-
- TRANSPOSE4X4 r0, rsp
- lea r4, [r0+r1*4]
- TRANSPOSE4X4 r4, rsp+4
-
- H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d
-
- mova m1, m5
- mova m0, m4
- punpcklbw m5, m3
- punpcklbw m4, m6
- punpckhbw m1, m3
- punpckhbw m0, m6
- mova m3, m5
- mova m6, m1
- punpcklwd m5, m4
- punpcklwd m1, m0
- punpckhwd m3, m4
- punpckhwd m6, m0
- movd [r0], m5
- punpckhdq m5, m5
- movd [r0+r1*1], m5
- movd [r0+r1*2], m3
- punpckhdq m3, m3
- movd [r0+r3], m3
- movd [r4], m1
- punpckhdq m1, m1
- movd [r4+r1*1], m1
- movd [r4+r1*2], m6
- punpckhdq m6, m6
- movd [r4+r3], m6
- RET
diff --git a/ffmpeg/libavcodec/x86/h264_chromamc.asm b/ffmpeg/libavcodec/x86/h264_chromamc.asm
deleted file mode 100644
index 32681aa..0000000
--- a/ffmpeg/libavcodec/x86/h264_chromamc.asm
+++ /dev/null
@@ -1,678 +0,0 @@
-;******************************************************************************
-;* MMX/SSSE3-optimized functions for H264 chroma MC
-;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
-;* 2005-2008 Loren Merritt
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-rnd_rv40_2d_tbl: times 4 dw 0
- times 4 dw 16
- times 4 dw 32
- times 4 dw 16
- times 4 dw 32
- times 4 dw 28
- times 4 dw 32
- times 4 dw 28
- times 4 dw 0
- times 4 dw 32
- times 4 dw 16
- times 4 dw 32
- times 4 dw 32
- times 4 dw 28
- times 4 dw 32
- times 4 dw 28
-rnd_rv40_1d_tbl: times 4 dw 0
- times 4 dw 2
- times 4 dw 4
- times 4 dw 2
- times 4 dw 4
- times 4 dw 3
- times 4 dw 4
- times 4 dw 3
- times 4 dw 0
- times 4 dw 4
- times 4 dw 2
- times 4 dw 4
- times 4 dw 4
- times 4 dw 3
- times 4 dw 4
- times 4 dw 3
-
-cextern pw_3
-cextern pw_4
-cextern pw_8
-pw_28: times 8 dw 28
-cextern pw_32
-cextern pw_64
-
-SECTION .text
-
-%macro mv0_pixels_mc8 0
- lea r4, [r2*2 ]
-.next4rows:
- movq mm0, [r1 ]
- movq mm1, [r1+r2]
- add r1, r4
- CHROMAMC_AVG mm0, [r0 ]
- CHROMAMC_AVG mm1, [r0+r2]
- movq [r0 ], mm0
- movq [r0+r2], mm1
- add r0, r4
- movq mm0, [r1 ]
- movq mm1, [r1+r2]
- add r1, r4
- CHROMAMC_AVG mm0, [r0 ]
- CHROMAMC_AVG mm1, [r0+r2]
- movq [r0 ], mm0
- movq [r0+r2], mm1
- add r0, r4
- sub r3d, 4
- jne .next4rows
-%endmacro
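The macro above is the mx == my == 0 fast path: an 8-byte-wide copy, unrolled four rows per iteration, with the avg variants additionally averaging into the destination via CHROMAMC_AVG. A scalar sketch, with a hypothetical name:

    #include <stdint.h>
    #include <string.h>

    static void mv0_pixels_mc8_ref(uint8_t *dst, const uint8_t *src, int stride, int h)
    {
        int y;
        for (y = 0; y < h; y++) {
            memcpy(dst, src, 8);   /* avg variants: rounding-average each byte with dst instead */
            dst += stride;
            src += stride;
        }
    }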
-
-%macro chroma_mc8_mmx_func 2-3
-%ifidn %2, rv40
-%ifdef PIC
-%define rnd_1d_rv40 r8
-%define rnd_2d_rv40 r8
-%define extra_regs 2
-%else ; no-PIC
-%define rnd_1d_rv40 rnd_rv40_1d_tbl
-%define rnd_2d_rv40 rnd_rv40_2d_tbl
-%define extra_regs 1
-%endif ; PIC
-%else
-%define extra_regs 0
-%endif ; rv40
-; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
-; int stride, int h, int mx, int my)
-cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
-%if ARCH_X86_64
- movsxd r2, r2d
-%endif
- mov r6d, r5d
- or r6d, r4d
- jne .at_least_one_non_zero
- ; mx == 0 AND my == 0 - no filter needed
- mv0_pixels_mc8
- REP_RET
-
-.at_least_one_non_zero:
-%ifidn %2, rv40
-%if ARCH_X86_64
- mov r7, r5
- and r7, 6 ; &~1 for mx/my=[0,7]
- lea r7, [r7*4+r4]
- sar r7d, 1
-%define rnd_bias r7
-%define dest_reg r0
-%else ; x86-32
- mov r0, r5
- and r0, 6 ; &~1 for mx/my=[0,7]
- lea r0, [r0*4+r4]
- sar r0d, 1
-%define rnd_bias r0
-%define dest_reg r5
-%endif
-%else ; vc1, h264
-%define rnd_bias 0
-%define dest_reg r0
-%endif
-
- test r5d, r5d
- mov r6, 1
- je .my_is_zero
- test r4d, r4d
- mov r6, r2 ; dxy = x ? 1 : stride
- jne .both_non_zero
-.my_is_zero:
- ; mx == 0 XOR my == 0 - 1 dimensional filter only
- or r4d, r5d ; x + y
-
-%ifidn %2, rv40
-%ifdef PIC
- lea r8, [rnd_rv40_1d_tbl]
-%endif
-%if ARCH_X86_64 == 0
- mov r5, r0m
-%endif
-%endif
-
- movd m5, r4d
- movq m4, [pw_8]
- movq m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
- punpcklwd m5, m5
- punpckldq m5, m5 ; mm5 = B = x
- pxor m7, m7
- psubw m4, m5 ; mm4 = A = 8-x
-
-.next1drow:
- movq m0, [r1 ] ; mm0 = src[0..7]
- movq m2, [r1+r6] ; mm1 = src[1..8]
-
- movq m1, m0
- movq m3, m2
- punpcklbw m0, m7
- punpckhbw m1, m7
- punpcklbw m2, m7
- punpckhbw m3, m7
- pmullw m0, m4 ; [mm0,mm1] = A * src[0..7]
- pmullw m1, m4
- pmullw m2, m5 ; [mm2,mm3] = B * src[1..8]
- pmullw m3, m5
-
- paddw m0, m6
- paddw m1, m6
- paddw m0, m2
- paddw m1, m3
- psrlw m0, 3
- psrlw m1, 3
- packuswb m0, m1
- CHROMAMC_AVG m0, [dest_reg]
- movq [dest_reg], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
-
- add dest_reg, r2
- add r1, r2
- dec r3d
- jne .next1drow
- REP_RET
-
-.both_non_zero: ; general case, bilinear
- movd m4, r4d ; x
- movd m6, r5d ; y
-%ifidn %2, rv40
-%ifdef PIC
- lea r8, [rnd_rv40_2d_tbl]
-%endif
-%if ARCH_X86_64 == 0
- mov r5, r0m
-%endif
-%endif
- mov r6, rsp ; backup stack pointer
- and rsp, ~(mmsize-1) ; align stack
- sub rsp, 16 ; AA and DD
-
- punpcklwd m4, m4
- punpcklwd m6, m6
- punpckldq m4, m4 ; mm4 = x words
- punpckldq m6, m6 ; mm6 = y words
- movq m5, m4
- pmullw m4, m6 ; mm4 = x * y
- psllw m5, 3
- psllw m6, 3
- movq m7, m5
- paddw m7, m6
- movq [rsp+8], m4 ; DD = x * y
- psubw m5, m4 ; mm5 = B = 8x - xy
- psubw m6, m4 ; mm6 = C = 8y - xy
- paddw m4, [pw_64]
- psubw m4, m7 ; mm4 = A = xy - (8x+8y) + 64
- pxor m7, m7
- movq [rsp ], m4
-
- movq m0, [r1 ] ; mm0 = src[0..7]
- movq m1, [r1+1] ; mm1 = src[1..8]
-.next2drow:
- add r1, r2
-
- movq m2, m0
- movq m3, m1
- punpckhbw m0, m7
- punpcklbw m1, m7
- punpcklbw m2, m7
- punpckhbw m3, m7
- pmullw m0, [rsp]
- pmullw m2, [rsp]
- pmullw m1, m5
- pmullw m3, m5
- paddw m2, m1 ; mm2 = A * src[0..3] + B * src[1..4]
- paddw m3, m0 ; mm3 = A * src[4..7] + B * src[5..8]
-
- movq m0, [r1]
- movq m1, m0
- punpcklbw m0, m7
- punpckhbw m1, m7
- pmullw m0, m6
- pmullw m1, m6
- paddw m2, m0
- paddw m3, m1 ; [mm2,mm3] += C * src[0..7]
-
- movq m1, [r1+1]
- movq m0, m1
- movq m4, m1
- punpcklbw m0, m7
- punpckhbw m4, m7
- pmullw m0, [rsp+8]
- pmullw m4, [rsp+8]
- paddw m2, m0
- paddw m3, m4 ; [mm2,mm3] += D * src[1..8]
- movq m0, [r1]
-
- paddw m2, [rnd_2d_%2+rnd_bias*8]
- paddw m3, [rnd_2d_%2+rnd_bias*8]
- psrlw m2, 6
- psrlw m3, 6
- packuswb m2, m3
- CHROMAMC_AVG m2, [dest_reg]
- movq [dest_reg], m2 ; dst[0..7] = ([mm2,mm3] + rnd) >> 6
-
- add dest_reg, r2
- dec r3d
- jne .next2drow
- mov rsp, r6 ; restore stack pointer
- RET
-%endmacro
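For the general (mx && my) case, the macro above computes the standard bilinear chroma interpolation. A scalar sketch of the put/h264 flavour (the vc1/rv40 flavours essentially differ only in the rounding bias taken from rnd_2d_*; the _ref name is hypothetical):

    #include <stdint.h>

    static void put_chroma_mc8_bilin_ref(uint8_t *dst, const uint8_t *src,
                                         int stride, int h, int mx, int my)
    {
        const int A = (8 - mx) * (8 - my);
        const int B =      mx  * (8 - my);
        const int C = (8 - mx) *      my;
        const int D =      mx  *      my;
        int x, y;

        for (y = 0; y < h; y++) {
            for (x = 0; x < 8; x++)
                dst[x] = (A * src[x]          + B * src[x + 1] +
                          C * src[x + stride] + D * src[x + stride + 1] + 32) >> 6;
            dst += stride;
            src += stride;
        }
    }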
-
-%macro chroma_mc4_mmx_func 2
-%define extra_regs 0
-%ifidn %2, rv40
-%ifdef PIC
-%define extra_regs 1
-%endif ; PIC
-%endif ; rv40
-cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
-%if ARCH_X86_64
- movsxd r2, r2d
-%endif
- pxor m7, m7
- movd m2, r4d ; x
- movd m3, r5d ; y
- movq m4, [pw_8]
- movq m5, [pw_8]
- punpcklwd m2, m2
- punpcklwd m3, m3
- punpcklwd m2, m2
- punpcklwd m3, m3
- psubw m4, m2
- psubw m5, m3
-
-%ifidn %2, rv40
-%ifdef PIC
- lea r6, [rnd_rv40_2d_tbl]
-%define rnd_2d_rv40 r6
-%else
-%define rnd_2d_rv40 rnd_rv40_2d_tbl
-%endif
- and r5, 6 ; &~1 for mx/my=[0,7]
- lea r5, [r5*4+r4]
- sar r5d, 1
-%define rnd_bias r5
-%else ; vc1, h264
-%define rnd_bias 0
-%endif
-
- movd m0, [r1 ]
- movd m6, [r1+1]
- add r1, r2
- punpcklbw m0, m7
- punpcklbw m6, m7
- pmullw m0, m4
- pmullw m6, m2
- paddw m6, m0
-
-.next2rows:
- movd m0, [r1 ]
- movd m1, [r1+1]
- add r1, r2
- punpcklbw m0, m7
- punpcklbw m1, m7
- pmullw m0, m4
- pmullw m1, m2
- paddw m1, m0
- movq m0, m1
-
- pmullw m6, m5
- pmullw m1, m3
- paddw m6, [rnd_2d_%2+rnd_bias*8]
- paddw m1, m6
- psrlw m1, 6
- packuswb m1, m1
- CHROMAMC_AVG4 m1, m6, [r0]
- movd [r0], m1
- add r0, r2
-
- movd m6, [r1 ]
- movd m1, [r1+1]
- add r1, r2
- punpcklbw m6, m7
- punpcklbw m1, m7
- pmullw m6, m4
- pmullw m1, m2
- paddw m1, m6
- movq m6, m1
- pmullw m0, m5
- pmullw m1, m3
- paddw m0, [rnd_2d_%2+rnd_bias*8]
- paddw m1, m0
- psrlw m1, 6
- packuswb m1, m1
- CHROMAMC_AVG4 m1, m0, [r0]
- movd [r0], m1
- add r0, r2
- sub r3d, 2
- jnz .next2rows
- REP_RET
-%endmacro
-
-%macro chroma_mc2_mmx_func 2
-cglobal %1_%2_chroma_mc2, 6, 7, 0
-%if ARCH_X86_64
- movsxd r2, r2d
-%endif
-
- mov r6d, r4d
- shl r4d, 16
- sub r4d, r6d
- add r4d, 8
- imul r5d, r4d ; x*y<<16 | y*(8-x)
- shl r4d, 3
- sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y)
-
- movd m5, r4d
- movd m6, r5d
- punpckldq m5, m5 ; mm5 = {A,B,A,B}
- punpckldq m6, m6 ; mm6 = {C,D,C,D}
- pxor m7, m7
- movd m2, [r1]
- punpcklbw m2, m7
- pshufw m2, m2, 0x94 ; mm0 = src[0,1,1,2]
-
-.nextrow:
- add r1, r2
- movq m1, m2
- pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
- movd m0, [r1]
- punpcklbw m0, m7
- pshufw m0, m0, 0x94 ; mm0 = src[0,1,1,2]
- movq m2, m0
- pmaddwd m0, m6
- paddw m1, [rnd_2d_%2]
- paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2]
- psrlw m1, 6
- packssdw m1, m7
- packuswb m1, m7
- CHROMAMC_AVG4 m1, m3, [r0]
- movd r5d, m1
- mov [r0], r5w
- add r0, r2
- sub r3d, 1
- jnz .nextrow
- REP_RET
-%endmacro
-
-%define rnd_1d_h264 pw_4
-%define rnd_2d_h264 pw_32
-%define rnd_1d_vc1 pw_3
-%define rnd_2d_vc1 pw_28
-
-%macro NOTHING 2-3
-%endmacro
-%macro DIRECT_AVG 2
- PAVGB %1, %2
-%endmacro
-%macro COPY_AVG 3
- movd %2, %3
- PAVGB %1, %2
-%endmacro
-
-INIT_MMX mmx
-%define CHROMAMC_AVG NOTHING
-%define CHROMAMC_AVG4 NOTHING
-chroma_mc8_mmx_func put, h264, _rnd
-chroma_mc8_mmx_func put, vc1, _nornd
-chroma_mc8_mmx_func put, rv40
-chroma_mc4_mmx_func put, h264
-chroma_mc4_mmx_func put, rv40
-
-INIT_MMX mmxext
-chroma_mc2_mmx_func put, h264
-
-%define CHROMAMC_AVG DIRECT_AVG
-%define CHROMAMC_AVG4 COPY_AVG
-chroma_mc8_mmx_func avg, h264, _rnd
-chroma_mc8_mmx_func avg, vc1, _nornd
-chroma_mc8_mmx_func avg, rv40
-chroma_mc4_mmx_func avg, h264
-chroma_mc4_mmx_func avg, rv40
-chroma_mc2_mmx_func avg, h264
-
-INIT_MMX 3dnow
-chroma_mc8_mmx_func avg, h264, _rnd
-chroma_mc8_mmx_func avg, vc1, _nornd
-chroma_mc8_mmx_func avg, rv40
-chroma_mc4_mmx_func avg, h264
-chroma_mc4_mmx_func avg, rv40
-
-%macro chroma_mc8_ssse3_func 2-3
-cglobal %1_%2_chroma_mc8%3, 6, 7, 8
-%if ARCH_X86_64
- movsxd r2, r2d
-%endif
- mov r6d, r5d
- or r6d, r4d
- jne .at_least_one_non_zero
- ; mx == 0 AND my == 0 - no filter needed
- mv0_pixels_mc8
- REP_RET
-
-.at_least_one_non_zero:
- test r5d, r5d
- je .my_is_zero
- test r4d, r4d
- je .mx_is_zero
-
- ; general case, bilinear
- mov r6d, r4d
- shl r4d, 8
- sub r4, r6
- mov r6, 8
- add r4, 8 ; x*255+8 = x<<8 | (8-x)
- sub r6d, r5d
- imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
- imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)
-
- movd m7, r6d
- movd m6, r4d
- movdqa m5, [rnd_2d_%2]
- movq m0, [r1 ]
- movq m1, [r1+1]
- pshuflw m7, m7, 0
- pshuflw m6, m6, 0
- punpcklbw m0, m1
- movlhps m7, m7
- movlhps m6, m6
-
-.next2rows:
- movq m1, [r1+r2*1 ]
- movq m2, [r1+r2*1+1]
- movq m3, [r1+r2*2 ]
- movq m4, [r1+r2*2+1]
- lea r1, [r1+r2*2]
- punpcklbw m1, m2
- movdqa m2, m1
- punpcklbw m3, m4
- movdqa m4, m3
- pmaddubsw m0, m7
- pmaddubsw m1, m6
- pmaddubsw m2, m7
- pmaddubsw m3, m6
- paddw m0, m5
- paddw m2, m5
- paddw m1, m0
- paddw m3, m2
- psrlw m1, 6
- movdqa m0, m4
- psrlw m3, 6
-%ifidn %1, avg
- movq m2, [r0 ]
- movhps m2, [r0+r2]
-%endif
- packuswb m1, m3
- CHROMAMC_AVG m1, m2
- movq [r0 ], m1
- movhps [r0+r2], m1
- sub r3d, 2
- lea r0, [r0+r2*2]
- jg .next2rows
- REP_RET
-
-.my_is_zero:
- mov r5d, r4d
- shl r4d, 8
- add r4, 8
- sub r4, r5 ; 255*x+8 = x<<8 | (8-x)
- movd m7, r4d
- movdqa m6, [rnd_1d_%2]
- pshuflw m7, m7, 0
- movlhps m7, m7
-
-.next2xrows:
- movq m0, [r1 ]
- movq m1, [r1 +1]
- movq m2, [r1+r2 ]
- movq m3, [r1+r2+1]
- punpcklbw m0, m1
- punpcklbw m2, m3
- pmaddubsw m0, m7
- pmaddubsw m2, m7
-%ifidn %1, avg
- movq m4, [r0 ]
- movhps m4, [r0+r2]
-%endif
- paddw m0, m6
- paddw m2, m6
- psrlw m0, 3
- psrlw m2, 3
- packuswb m0, m2
- CHROMAMC_AVG m0, m4
- movq [r0 ], m0
- movhps [r0+r2], m0
- sub r3d, 2
- lea r0, [r0+r2*2]
- lea r1, [r1+r2*2]
- jg .next2xrows
- REP_RET
-
-.mx_is_zero:
- mov r4d, r5d
- shl r5d, 8
- add r5, 8
- sub r5, r4 ; 255*y+8 = y<<8 | (8-y)
- movd m7, r5d
- movdqa m6, [rnd_1d_%2]
- pshuflw m7, m7, 0
- movlhps m7, m7
-
-.next2yrows:
- movq m0, [r1 ]
- movq m1, [r1+r2 ]
- movdqa m2, m1
- movq m3, [r1+r2*2]
- lea r1, [r1+r2*2]
- punpcklbw m0, m1
- punpcklbw m2, m3
- pmaddubsw m0, m7
- pmaddubsw m2, m7
-%ifidn %1, avg
- movq m4, [r0 ]
- movhps m4, [r0+r2]
-%endif
- paddw m0, m6
- paddw m2, m6
- psrlw m0, 3
- psrlw m2, 3
- packuswb m0, m2
- CHROMAMC_AVG m0, m4
- movq [r0 ], m0
- movhps [r0+r2], m0
- sub r3d, 2
- lea r0, [r0+r2*2]
- jg .next2yrows
- REP_RET
-%endmacro
-
-%macro chroma_mc4_ssse3_func 2
-cglobal %1_%2_chroma_mc4, 6, 7, 0
-%if ARCH_X86_64
- movsxd r2, r2d
-%endif
- mov r6, r4
- shl r4d, 8
- sub r4d, r6d
- mov r6, 8
- add r4d, 8 ; x*255+8
- sub r6d, r5d
- imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
- imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)
-
- movd m7, r6d
- movd m6, r4d
- movq m5, [pw_32]
- movd m0, [r1 ]
- pshufw m7, m7, 0
- punpcklbw m0, [r1+1]
- pshufw m6, m6, 0
-
-.next2rows:
- movd m1, [r1+r2*1 ]
- movd m3, [r1+r2*2 ]
- punpcklbw m1, [r1+r2*1+1]
- punpcklbw m3, [r1+r2*2+1]
- lea r1, [r1+r2*2]
- movq m2, m1
- movq m4, m3
- pmaddubsw m0, m7
- pmaddubsw m1, m6
- pmaddubsw m2, m7
- pmaddubsw m3, m6
- paddw m0, m5
- paddw m2, m5
- paddw m1, m0
- paddw m3, m2
- psrlw m1, 6
- movq m0, m4
- psrlw m3, 6
- packuswb m1, m1
- packuswb m3, m3
- CHROMAMC_AVG m1, [r0 ]
- CHROMAMC_AVG m3, [r0+r2]
- movd [r0 ], m1
- movd [r0+r2], m3
- sub r3d, 2
- lea r0, [r0+r2*2]
- jg .next2rows
- REP_RET
-%endmacro
-
-%define CHROMAMC_AVG NOTHING
-INIT_XMM ssse3
-chroma_mc8_ssse3_func put, h264, _rnd
-chroma_mc8_ssse3_func put, vc1, _nornd
-INIT_MMX ssse3
-chroma_mc4_ssse3_func put, h264
-
-%define CHROMAMC_AVG DIRECT_AVG
-INIT_XMM ssse3
-chroma_mc8_ssse3_func avg, h264, _rnd
-chroma_mc8_ssse3_func avg, vc1, _nornd
-INIT_MMX ssse3
-chroma_mc4_ssse3_func avg, h264
diff --git a/ffmpeg/libavcodec/x86/h264_chromamc_10bit.asm b/ffmpeg/libavcodec/x86/h264_chromamc_10bit.asm
deleted file mode 100644
index beb7c0f..0000000
--- a/ffmpeg/libavcodec/x86/h264_chromamc_10bit.asm
+++ /dev/null
@@ -1,271 +0,0 @@
-;*****************************************************************************
-;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code
-;*****************************************************************************
-;* Copyright (C) 2005-2011 x264 project
-;*
-;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-cextern pw_4
-cextern pw_8
-cextern pw_32
-cextern pw_64
-
-SECTION .text
-
-
-%macro MV0_PIXELS_MC8 0
- lea r4, [r2*3 ]
- lea r5, [r2*4 ]
-.next4rows:
- movu m0, [r1 ]
- movu m1, [r1+r2 ]
- CHROMAMC_AVG m0, [r0 ]
- CHROMAMC_AVG m1, [r0+r2 ]
- mova [r0 ], m0
- mova [r0+r2 ], m1
- movu m0, [r1+r2*2]
- movu m1, [r1+r4 ]
- CHROMAMC_AVG m0, [r0+r2*2]
- CHROMAMC_AVG m1, [r0+r4 ]
- mova [r0+r2*2], m0
- mova [r0+r4 ], m1
- add r1, r5
- add r0, r5
- sub r3d, 4
- jne .next4rows
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h, int mx, int my)
-;-----------------------------------------------------------------------------
-%macro CHROMA_MC8 1
-; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
-; int stride, int h, int mx, int my)
-cglobal %1_h264_chroma_mc8_10, 6,7,8
- movsxdifnidn r2, r2d
- mov r6d, r5d
- or r6d, r4d
- jne .at_least_one_non_zero
- ; mx == 0 AND my == 0 - no filter needed
- MV0_PIXELS_MC8
- REP_RET
-
-.at_least_one_non_zero:
- mov r6d, 2
- test r5d, r5d
- je .x_interpolation
- mov r6, r2 ; dxy = x ? 1 : stride
- test r4d, r4d
- jne .xy_interpolation
-.x_interpolation:
- ; mx == 0 XOR my == 0 - 1 dimensional filter only
- or r4d, r5d ; x + y
- movd m5, r4d
- mova m4, [pw_8]
- mova m6, [pw_4] ; mm6 = rnd >> 3
- SPLATW m5, m5 ; mm5 = B = x
- psubw m4, m5 ; mm4 = A = 8-x
-
-.next1drow:
- movu m0, [r1 ] ; mm0 = src[0..7]
- movu m2, [r1+r6] ; mm2 = src[1..8]
-
- pmullw m0, m4 ; mm0 = A * src[0..7]
- pmullw m2, m5 ; mm2 = B * src[1..8]
-
- paddw m0, m6
- paddw m0, m2
- psrlw m0, 3
- CHROMAMC_AVG m0, [r0]
- mova [r0], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
-
- add r0, r2
- add r1, r2
- dec r3d
- jne .next1drow
- REP_RET
-
-.xy_interpolation: ; general case, bilinear
- movd m4, r4m ; x
- movd m6, r5m ; y
-
- SPLATW m4, m4 ; mm4 = x words
- SPLATW m6, m6 ; mm6 = y words
- psllw m5, m4, 3 ; mm5 = 8x
- pmullw m4, m6 ; mm4 = x * y
- psllw m6, 3 ; mm6 = 8y
- paddw m1, m5, m6 ; mm1 = 8x+8y
- mova m7, m4 ; DD = x * y
- psubw m5, m4 ; mm5 = B = 8x - xy
- psubw m6, m4 ; mm6 = C = 8y - xy
- paddw m4, [pw_64]
- psubw m4, m1 ; mm4 = A = xy - (8x+8y) + 64
-
- movu m0, [r1 ] ; mm0 = src[0..7]
- movu m1, [r1+2] ; mm1 = src[1..8]
-.next2drow:
- add r1, r2
-
- pmullw m2, m0, m4
- pmullw m1, m5
- paddw m2, m1 ; mm2 = A * src[0..7] + B * src[1..8]
-
- movu m0, [r1]
- movu m1, [r1+2]
- pmullw m3, m0, m6
- paddw m2, m3 ; mm2 += C * src[0..7+stride]
- pmullw m3, m1, m7
- paddw m2, m3 ; mm2 += D * src[1..8+stride]
-
- paddw m2, [pw_32]
- psrlw m2, 6
- CHROMAMC_AVG m2, [r0]
- mova [r0], m2 ; dst[0..7] = (mm2 + 32) >> 6
-
- add r0, r2
- dec r3d
- jne .next2drow
- REP_RET
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void put/avg_h264_chroma_mc4(pixel *dst, pixel *src, int stride, int h, int mx, int my)
-;-----------------------------------------------------------------------------
-;TODO: xmm mc4
-%macro MC4_OP 2
- movq %1, [r1 ]
- movq m1, [r1+2]
- add r1, r2
- pmullw %1, m4
- pmullw m1, m2
- paddw m1, %1
- mova %1, m1
-
- pmullw %2, m5
- pmullw m1, m3
- paddw %2, [pw_32]
- paddw m1, %2
- psrlw m1, 6
- CHROMAMC_AVG m1, %2, [r0]
- movq [r0], m1
- add r0, r2
-%endmacro
-
-%macro CHROMA_MC4 1
-cglobal %1_h264_chroma_mc4_10, 6,6,7
- movsxdifnidn r2, r2d
- movd m2, r4m ; x
- movd m3, r5m ; y
- mova m4, [pw_8]
- mova m5, m4
- SPLATW m2, m2
- SPLATW m3, m3
- psubw m4, m2
- psubw m5, m3
-
- movq m0, [r1 ]
- movq m6, [r1+2]
- add r1, r2
- pmullw m0, m4
- pmullw m6, m2
- paddw m6, m0
-
-.next2rows:
- MC4_OP m0, m6
- MC4_OP m6, m0
- sub r3d, 2
- jnz .next2rows
- REP_RET
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h, int mx, int my)
-;-----------------------------------------------------------------------------
-%macro CHROMA_MC2 1
-cglobal %1_h264_chroma_mc2_10, 6,7
- movsxdifnidn r2, r2d
- mov r6d, r4d
- shl r4d, 16
- sub r4d, r6d
- add r4d, 8
- imul r5d, r4d ; x*y<<16 | y*(8-x)
- shl r4d, 3
- sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y)
-
- movd m5, r4d
- movd m6, r5d
- punpckldq m5, m5 ; mm5 = {A,B,A,B}
- punpckldq m6, m6 ; mm6 = {C,D,C,D}
- pxor m7, m7
- pshufw m2, [r1], 0x94 ; mm0 = src[0,1,1,2]
-
-.nextrow:
- add r1, r2
- movq m1, m2
- pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
- pshufw m0, [r1], 0x94 ; mm0 = src[0,1,1,2]
- movq m2, m0
- pmaddwd m0, m6
- paddw m1, [pw_32]
- paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2]
- psrlw m1, 6
- packssdw m1, m7
- CHROMAMC_AVG m1, m3, [r0]
- movd [r0], m1
- add r0, r2
- dec r3d
- jnz .nextrow
- REP_RET
-%endmacro
-
-%macro NOTHING 2-3
-%endmacro
-%macro AVG 2-3
-%if %0==3
- movq %2, %3
-%endif
- pavgw %1, %2
-%endmacro
-
-%define CHROMAMC_AVG NOTHING
-INIT_XMM sse2
-CHROMA_MC8 put
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-CHROMA_MC8 put
-%endif
-INIT_MMX mmxext
-CHROMA_MC4 put
-CHROMA_MC2 put
-
-%define CHROMAMC_AVG AVG
-INIT_XMM sse2
-CHROMA_MC8 avg
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-CHROMA_MC8 avg
-%endif
-INIT_MMX mmxext
-CHROMA_MC4 avg
-CHROMA_MC2 avg
diff --git a/ffmpeg/libavcodec/x86/h264_deblock.asm b/ffmpeg/libavcodec/x86/h264_deblock.asm
deleted file mode 100644
index 1317783..0000000
--- a/ffmpeg/libavcodec/x86/h264_deblock.asm
+++ /dev/null
@@ -1,1078 +0,0 @@
-;*****************************************************************************
-;* MMX/SSE2/AVX-optimized H.264 deblocking code
-;*****************************************************************************
-;* Copyright (C) 2005-2011 x264 project
-;*
-;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;* Jason Garrett-Glaser <darkshikari@gmail.com>
-;* Oskar Arvidsson <oskar@irock.se>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-pb_A1: times 16 db 0xA1
-pb_3_1: times 4 db 3, 1
-
-SECTION .text
-
-cextern pb_0
-cextern pb_1
-cextern pb_3
-
-; expands to [base],...,[base+7*stride]
-%define PASS8ROWS(base, base3, stride, stride3) \
- [base], [base+stride], [base+stride*2], [base3], \
- [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
-
-%define PASS8ROWS(base, base3, stride, stride3, offset) \
- PASS8ROWS(base+offset, base3+offset, stride, stride3)
-
-; in: 8 rows of 4 bytes in %4..%11
-; out: 4 rows of 8 bytes in m0..m3
-%macro TRANSPOSE4x8_LOAD 11
- movh m0, %4
- movh m2, %5
- movh m1, %6
- movh m3, %7
- punpckl%1 m0, m2
- punpckl%1 m1, m3
- mova m2, m0
- punpckl%2 m0, m1
- punpckh%2 m2, m1
-
- movh m4, %8
- movh m6, %9
- movh m5, %10
- movh m7, %11
- punpckl%1 m4, m6
- punpckl%1 m5, m7
- mova m6, m4
- punpckl%2 m4, m5
- punpckh%2 m6, m5
-
- punpckh%3 m1, m0, m4
- punpckh%3 m3, m2, m6
- punpckl%3 m0, m4
- punpckl%3 m2, m6
-%endmacro
-
-; in: 4 rows of 8 bytes in m0..m3
-; out: 8 rows of 4 bytes in %1..%8
-%macro TRANSPOSE8x4B_STORE 8
- punpckhdq m4, m0, m0
- punpckhdq m5, m1, m1
- punpckhdq m6, m2, m2
-
- punpcklbw m0, m1
- punpcklbw m2, m3
- punpcklwd m1, m0, m2
- punpckhwd m0, m2
- movh %1, m1
- punpckhdq m1, m1
- movh %2, m1
- movh %3, m0
- punpckhdq m0, m0
- movh %4, m0
-
- punpckhdq m3, m3
- punpcklbw m4, m5
- punpcklbw m6, m3
- punpcklwd m5, m4, m6
- punpckhwd m4, m6
- movh %5, m5
- punpckhdq m5, m5
- movh %6, m5
- movh %7, m4
- punpckhdq m4, m4
- movh %8, m4
-%endmacro
-
-%macro TRANSPOSE4x8B_LOAD 8
- TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
-%endmacro
-
-%macro SBUTTERFLY3 4
- punpckh%1 %4, %2, %3
- punpckl%1 %2, %3
-%endmacro
-
-; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
-; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
-%macro TRANSPOSE6x8_MEM 9
- RESET_MM_PERMUTATION
- movq m0, %1
- movq m1, %2
- movq m2, %3
- movq m3, %4
- movq m4, %5
- movq m5, %6
- movq m6, %7
- SBUTTERFLY bw, 0, 1, 7
- SBUTTERFLY bw, 2, 3, 7
- SBUTTERFLY bw, 4, 5, 7
- movq [%9+0x10], m3
- SBUTTERFLY3 bw, m6, %8, m7
- SBUTTERFLY wd, 0, 2, 3
- SBUTTERFLY wd, 4, 6, 3
- punpckhdq m0, m4
- movq [%9+0x00], m0
- SBUTTERFLY3 wd, m1, [%9+0x10], m3
- SBUTTERFLY wd, 5, 7, 0
- SBUTTERFLY dq, 1, 5, 0
- SBUTTERFLY dq, 2, 6, 0
- punpckldq m3, m7
- movq [%9+0x10], m2
- movq [%9+0x20], m6
- movq [%9+0x30], m1
- movq [%9+0x40], m5
- movq [%9+0x50], m3
- RESET_MM_PERMUTATION
-%endmacro
-
-; in: 8 rows of 8 in %1..%8
-; out: 8 rows of 8 in %9..%16
-%macro TRANSPOSE8x8_MEM 16
- RESET_MM_PERMUTATION
- movq m0, %1
- movq m1, %2
- movq m2, %3
- movq m3, %4
- movq m4, %5
- movq m5, %6
- movq m6, %7
- SBUTTERFLY bw, 0, 1, 7
- SBUTTERFLY bw, 2, 3, 7
- SBUTTERFLY bw, 4, 5, 7
- SBUTTERFLY3 bw, m6, %8, m7
- movq %9, m5
- SBUTTERFLY wd, 0, 2, 5
- SBUTTERFLY wd, 4, 6, 5
- SBUTTERFLY wd, 1, 3, 5
- movq %11, m6
- movq m6, %9
- SBUTTERFLY wd, 6, 7, 5
- SBUTTERFLY dq, 0, 4, 5
- SBUTTERFLY dq, 1, 6, 5
- movq %9, m0
- movq %10, m4
- movq %13, m1
- movq %14, m6
- SBUTTERFLY3 dq, m2, %11, m0
- SBUTTERFLY dq, 3, 7, 4
- movq %11, m2
- movq %12, m0
- movq %15, m3
- movq %16, m7
- RESET_MM_PERMUTATION
-%endmacro
-
-; out: %4 = |%1-%2|>%3
-; clobbers: %5
-%macro DIFF_GT 5
-%if avx_enabled == 0
- mova %5, %2
- mova %4, %1
- psubusb %5, %1
- psubusb %4, %2
-%else
- psubusb %5, %2, %1
- psubusb %4, %1, %2
-%endif
- por %4, %5
- psubusb %4, %3
-%endmacro
-
-; out: %4 = |%1-%2|>%3
-; clobbers: %5
-%macro DIFF_GT2 5
-%if ARCH_X86_64
- psubusb %5, %2, %1
- psubusb %4, %1, %2
-%else
- mova %5, %2
- mova %4, %1
- psubusb %5, %1
- psubusb %4, %2
-%endif
- psubusb %5, %3
- psubusb %4, %3
- pcmpeqb %4, %5
-%endmacro
-
-; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
-; out: m5=beta-1, m7=mask, %3=alpha-1
-; clobbers: m4,m6
-%macro LOAD_MASK 2-3
- movd m4, %1
- movd m5, %2
- SPLATW m4, m4
- SPLATW m5, m5
- packuswb m4, m4 ; 16x alpha-1
- packuswb m5, m5 ; 16x beta-1
-%if %0>2
- mova %3, m4
-%endif
- DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
- DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
- por m7, m4
- DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
- por m7, m4
- pxor m6, m6
- pcmpeqb m7, m6
-%endmacro
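In scalar terms, the mask computed above enables filtering at a byte position exactly when the usual H.264 edge conditions hold; a minimal sketch (hypothetical name, FFABS from libavutil):

    #include "libavutil/common.h"   /* FFABS */

    static int deblock_filter_mask_ref(int p1, int p0, int q0, int q1,
                                       int alpha, int beta)
    {
        return FFABS(p0 - q0) < alpha &&
               FFABS(p1 - p0) < beta  &&
               FFABS(q1 - q0) < beta;
    }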
-
-; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
-; out: m1=p0' m2=q0'
-; clobbers: m0,3-6
-%macro DEBLOCK_P0_Q0 0
- pcmpeqb m4, m4
- pxor m5, m1, m2 ; p0^q0
- pxor m3, m4
- pand m5, [pb_1] ; (p0^q0)&1
- pavgb m3, m0 ; (p1 - q1 + 256)>>1
- pxor m4, m1
- pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
- pavgb m4, m2 ; (q0 - p0 + 256)>>1
- pavgb m3, m5
- mova m6, [pb_A1]
- paddusb m3, m4 ; d+128+33
- psubusb m6, m3
- psubusb m3, [pb_A1]
- pminub m6, m7
- pminub m3, m7
- psubusb m1, m6
- psubusb m2, m3
- paddusb m1, m3
- paddusb m2, m6
-%endmacro
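The pavgb arithmetic above is a branch-free form of the normal-strength H.264 p0/q0 update; per pixel, with tc already masked as in m7, it is equivalent to the following scalar sketch (hypothetical name):

    #include <stdint.h>
    #include "libavutil/common.h"

    static void deblock_p0_q0_ref(uint8_t *p0, uint8_t *q0, int p1, int q1, int tc)
    {
        /* delta = clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3) */
        int delta = av_clip((((*q0 - *p0) * 4) + (p1 - q1) + 4) >> 3, -tc, tc);
        *p0 = av_clip_uint8(*p0 + delta);
        *q0 = av_clip_uint8(*q0 - delta);
    }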
-
-; in: m1=p0 m2=q0
-; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
-; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
-; clobbers: q2, tmp, tc0
-%macro LUMA_Q1 6
- pavgb %6, m1, m2
- pavgb %2, %6 ; avg(p2,avg(p0,q0))
- pxor %6, %3
- pand %6, [pb_1] ; (p2^avg(p0,q0))&1
- psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
- psubusb %6, %1, %5
- paddusb %5, %1
- pmaxub %2, %6
- pminub %2, %5
- mova %4, %2
-%endmacro
-
-%if ARCH_X86_64
-;-----------------------------------------------------------------------------
-; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-%macro DEBLOCK_LUMA 0
-cglobal deblock_v_luma_8, 5,5,10
- movd m8, [r4] ; tc0
- lea r4, [r1*3]
- dec r2d ; alpha-1
- neg r4
- dec r3d ; beta-1
- add r4, r0 ; pix-3*stride
-
- mova m0, [r4+r1] ; p1
- mova m1, [r4+2*r1] ; p0
- mova m2, [r0] ; q0
- mova m3, [r0+r1] ; q1
- LOAD_MASK r2d, r3d
-
- punpcklbw m8, m8
- punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
- pcmpeqb m9, m9
- pcmpeqb m9, m8
- pandn m9, m7
- pand m8, m9
-
- movdqa m3, [r4] ; p2
- DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
- pand m6, m9
- psubb m7, m8, m6
- pand m6, m8
- LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
-
- movdqa m4, [r0+2*r1] ; q2
- DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
- pand m6, m9
- pand m8, m6
- psubb m7, m6
- mova m3, [r0+r1]
- LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
-
- DEBLOCK_P0_Q0
- mova [r4+2*r1], m1
- mova [r0], m2
- RET
-
-;-----------------------------------------------------------------------------
-; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-INIT_MMX cpuname
-cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
- movsxd r7, r1d
- lea r8, [r7+r7*2]
- lea r6, [r0-4]
- lea r5, [r0-4+r8]
-%if WIN64
- %define pix_tmp rsp+0x30 ; shadow space + r4
-%else
- %define pix_tmp rsp
-%endif
-
- ; transpose 6x16 -> tmp space
- TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp
- lea r6, [r6+r7*8]
- lea r5, [r5+r7*8]
- TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8
-
- ; vertical filter
- ; alpha, beta, tc0 are still in r2d, r3d, r4
- ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
- lea r0, [pix_tmp+0x30]
- mov r1d, 0x10
-%if WIN64
- mov [rsp+0x20], r4
-%endif
- call deblock_v_luma_8
-
- ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
- add r6, 2
- add r5, 2
- movq m0, [pix_tmp+0x18]
- movq m1, [pix_tmp+0x28]
- movq m2, [pix_tmp+0x38]
- movq m3, [pix_tmp+0x48]
- TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
-
- shl r7, 3
- sub r6, r7
- sub r5, r7
- shr r7, 3
- movq m0, [pix_tmp+0x10]
- movq m1, [pix_tmp+0x20]
- movq m2, [pix_tmp+0x30]
- movq m3, [pix_tmp+0x40]
- TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
-
- RET
-%endmacro
-
-INIT_XMM sse2
-DEBLOCK_LUMA
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-DEBLOCK_LUMA
-%endif
-
-%else
-
-%macro DEBLOCK_LUMA 2
-;-----------------------------------------------------------------------------
-; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal deblock_%1_luma_8, 5,5,8,2*%2
- lea r4, [r1*3]
- dec r2 ; alpha-1
- neg r4
- dec r3 ; beta-1
- add r4, r0 ; pix-3*stride
-
- mova m0, [r4+r1] ; p1
- mova m1, [r4+2*r1] ; p0
- mova m2, [r0] ; q0
- mova m3, [r0+r1] ; q1
- LOAD_MASK r2, r3
-
- mov r3, r4mp
- pcmpeqb m3, m3
- movd m4, [r3] ; tc0
- punpcklbw m4, m4
- punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
- mova [esp+%2], m4 ; tc
- pcmpgtb m4, m3
- mova m3, [r4] ; p2
- pand m4, m7
- mova [esp], m4 ; mask
-
- DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
- pand m6, m4
- pand m4, [esp+%2] ; tc
- psubb m7, m4, m6
- pand m6, m4
- LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
-
- mova m4, [r0+2*r1] ; q2
- DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
- pand m6, [esp] ; mask
- mova m5, [esp+%2] ; tc
- psubb m7, m6
- pand m5, m6
- mova m3, [r0+r1]
- LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
-
- DEBLOCK_P0_Q0
- mova [r4+2*r1], m1
- mova [r0], m2
- RET
-
-;-----------------------------------------------------------------------------
-; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-INIT_MMX cpuname
-cglobal deblock_h_luma_8, 0,5,8,0x60+HAVE_ALIGNED_STACK*12
- mov r0, r0mp
- mov r3, r1m
- lea r4, [r3*3]
- sub r0, 4
- lea r1, [r0+r4]
-%define pix_tmp esp+12*HAVE_ALIGNED_STACK
-
- ; transpose 6x16 -> tmp space
- TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
- lea r0, [r0+r3*8]
- lea r1, [r1+r3*8]
- TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8
-
- ; vertical filter
- lea r0, [pix_tmp+0x30]
- PUSH dword r4m
- PUSH dword r3m
- PUSH dword r2m
- PUSH dword 16
- PUSH dword r0
- call deblock_%1_luma_8
-%ifidn %1, v8
- add dword [esp ], 8 ; pix_tmp+0x38
- add dword [esp+16], 2 ; tc0+2
- call deblock_%1_luma_8
-%endif
- ADD esp, 20
-
- ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
- mov r0, r0mp
- sub r0, 2
-
- movq m0, [pix_tmp+0x10]
- movq m1, [pix_tmp+0x20]
- lea r1, [r0+r4]
- movq m2, [pix_tmp+0x30]
- movq m3, [pix_tmp+0x40]
- TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
-
- lea r0, [r0+r3*8]
- lea r1, [r1+r3*8]
- movq m0, [pix_tmp+0x18]
- movq m1, [pix_tmp+0x28]
- movq m2, [pix_tmp+0x38]
- movq m3, [pix_tmp+0x48]
- TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
-
- RET
-%endmacro ; DEBLOCK_LUMA
-
-INIT_MMX mmxext
-DEBLOCK_LUMA v8, 8
-INIT_XMM sse2
-DEBLOCK_LUMA v, 16
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-DEBLOCK_LUMA v, 16
-%endif
-
-%endif ; ARCH
-
-
-
-%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
-%if ARCH_X86_64
- pavgb t0, p2, p1
- pavgb t1, p0, q0
-%else
- mova t0, p2
- mova t1, p0
- pavgb t0, p1
- pavgb t1, q0
-%endif
- pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
- mova t5, t1
-%if ARCH_X86_64
- paddb t2, p2, p1
- paddb t3, p0, q0
-%else
- mova t2, p2
- mova t3, p0
- paddb t2, p1
- paddb t3, q0
-%endif
- paddb t2, t3
- mova t3, t2
- mova t4, t2
- psrlw t2, 1
- pavgb t2, mpb_0
- pxor t2, t0
- pand t2, mpb_1
- psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
-
-%if ARCH_X86_64
- pavgb t1, p2, q1
- psubb t2, p2, q1
-%else
- mova t1, p2
- mova t2, p2
- pavgb t1, q1
- psubb t2, q1
-%endif
- paddb t3, t3
- psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
- pand t2, mpb_1
- psubb t1, t2
- pavgb t1, p1
- pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
- psrlw t3, 2
- pavgb t3, mpb_0
- pxor t3, t1
- pand t3, mpb_1
- psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
-
- pxor t3, p0, q1
- pavgb t2, p0, q1
- pand t3, mpb_1
- psubb t2, t3
- pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
-
- pxor t1, t2
- pxor t2, p0
- pand t1, mask1p
- pand t2, mask0
- pxor t1, t2
- pxor t1, p0
- mova %1, t1 ; store p0
-
- mova t1, %4 ; p3
- paddb t2, t1, p2
- pavgb t1, p2
- pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
- paddb t2, t2
- paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
- psrlw t2, 2
- pavgb t2, mpb_0
- pxor t2, t1
- pand t2, mpb_1
- psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
-
- pxor t0, p1
- pxor t1, p2
- pand t0, mask1p
- pand t1, mask1p
- pxor t0, p1
- pxor t1, p2
- mova %2, t0 ; store p1
- mova %3, t1 ; store p2
-%endmacro
-
-%macro LUMA_INTRA_SWAP_PQ 0
- %define q1 m0
- %define q0 m1
- %define p0 m2
- %define p1 m3
- %define p2 q2
- %define mask1p mask1q
-%endmacro
-
-%macro DEBLOCK_LUMA_INTRA 1
- %define p1 m0
- %define p0 m1
- %define q0 m2
- %define q1 m3
- %define t0 m4
- %define t1 m5
- %define t2 m6
- %define t3 m7
-%if ARCH_X86_64
- %define p2 m8
- %define q2 m9
- %define t4 m10
- %define t5 m11
- %define mask0 m12
- %define mask1p m13
-%if WIN64
- %define mask1q [rsp]
-%else
- %define mask1q [rsp-24]
-%endif
- %define mpb_0 m14
- %define mpb_1 m15
-%else
- %define spill(x) [esp+16*x]
- %define p2 [r4+r1]
- %define q2 [r0+2*r1]
- %define t4 spill(0)
- %define t5 spill(1)
- %define mask0 spill(2)
- %define mask1p spill(3)
- %define mask1q spill(4)
- %define mpb_0 [pb_0]
- %define mpb_1 [pb_1]
-%endif
-
-;-----------------------------------------------------------------------------
-; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-%if WIN64
-cglobal deblock_%1_luma_intra_8, 4,6,16,0x10
-%else
-cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
-%endif
- lea r4, [r1*4]
- lea r5, [r1*3] ; 3*stride
- dec r2d ; alpha-1
- jl .end
- neg r4
- dec r3d ; beta-1
- jl .end
- add r4, r0 ; pix-4*stride
- mova p1, [r4+2*r1]
- mova p0, [r4+r5]
- mova q0, [r0]
- mova q1, [r0+r1]
-%if ARCH_X86_64
- pxor mpb_0, mpb_0
- mova mpb_1, [pb_1]
- LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
- SWAP 7, 12 ; m12=mask0
- pavgb t5, mpb_0
- pavgb t5, mpb_1 ; alpha/4+1
- movdqa p2, [r4+r1]
- movdqa q2, [r0+2*r1]
- DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
- DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
- DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
- pand t0, mask0
- pand t4, t0
- pand t2, t0
- mova mask1q, t4
- mova mask1p, t2
-%else
- LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
- mova m4, t5
- mova mask0, m7
- pavgb m4, [pb_0]
- pavgb m4, [pb_1] ; alpha/4+1
- DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
- pand m6, mask0
- DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
- pand m4, m6
- mova mask1p, m4
- DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
- pand m4, m6
- mova mask1q, m4
-%endif
- LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
- LUMA_INTRA_SWAP_PQ
- LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
-.end:
- RET
-
-INIT_MMX cpuname
-%if ARCH_X86_64
-;-----------------------------------------------------------------------------
-; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_8, 4,9,0,0x80
- movsxd r7, r1d
- lea r8, [r7*3]
- lea r6, [r0-4]
- lea r5, [r0-4+r8]
-%if WIN64
- %define pix_tmp rsp+0x20 ; shadow space
-%else
- %define pix_tmp rsp
-%endif
-
- ; transpose 8x16 -> tmp space
- TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
- lea r6, [r6+r7*8]
- lea r5, [r5+r7*8]
- TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
-
- lea r0, [pix_tmp+0x40]
- mov r1, 0x10
- call deblock_v_luma_intra_8
-
- ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
- lea r5, [r6+r8]
- TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
- shl r7, 3
- sub r6, r7
- sub r5, r7
- shr r7, 3
- TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
- RET
-%else
-cglobal deblock_h_luma_intra_8, 2,4,8,0x80
- lea r3, [r1*3]
- sub r0, 4
- lea r2, [r0+r3]
- %define pix_tmp rsp
-
- ; transpose 8x16 -> tmp space
- TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
- lea r0, [r0+r1*8]
- lea r2, [r2+r1*8]
- TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
-
- lea r0, [pix_tmp+0x40]
- PUSH dword r3m
- PUSH dword r2m
- PUSH dword 16
- PUSH r0
- call deblock_%1_luma_intra_8
-%ifidn %1, v8
- add dword [rsp], 8 ; pix_tmp+8
- call deblock_%1_luma_intra_8
-%endif
- ADD esp, 16
-
- mov r1, r1m
- mov r0, r0mp
- lea r3, [r1*3]
- sub r0, 4
- lea r2, [r0+r3]
- ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
- TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
- lea r0, [r0+r1*8]
- lea r2, [r2+r1*8]
- TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
- RET
-%endif ; ARCH_X86_64
-%endmacro ; DEBLOCK_LUMA_INTRA
-
-INIT_XMM sse2
-DEBLOCK_LUMA_INTRA v
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-DEBLOCK_LUMA_INTRA v
-%endif
-%if ARCH_X86_64 == 0
-INIT_MMX mmxext
-DEBLOCK_LUMA_INTRA v8
-%endif
-
-INIT_MMX mmxext
-
-%macro CHROMA_V_START 0
- dec r2d ; alpha-1
- dec r3d ; beta-1
- mov t5, r0
- sub t5, r1
- sub t5, r1
-%endmacro
-
-%macro CHROMA_H_START 0
- dec r2d
- dec r3d
- sub r0, 2
- lea t6, [r1*3]
- mov t5, r0
- add r0, t6
-%endmacro
-
-%define t5 r5
-%define t6 r6
-
-;-----------------------------------------------------------------------------
-; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_8, 5,6
- CHROMA_V_START
- movq m0, [t5]
- movq m1, [t5+r1]
- movq m2, [r0]
- movq m3, [r0+r1]
- call ff_chroma_inter_body_mmxext
- movq [t5+r1], m1
- movq [r0], m2
- RET
-
-;-----------------------------------------------------------------------------
-; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_8, 5,7
-%if UNIX64
- %define buf0 [rsp-24]
- %define buf1 [rsp-16]
-%elif WIN64
- sub rsp, 16
- %define buf0 [rsp]
- %define buf1 [rsp+8]
-%else
- %define buf0 r0m
- %define buf1 r2m
-%endif
- CHROMA_H_START
- TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
- movq buf0, m0
- movq buf1, m3
- LOAD_MASK r2d, r3d
- movd m6, [r4] ; tc0
- punpcklbw m6, m6
- pand m7, m6
- DEBLOCK_P0_Q0
- movq m0, buf0
- movq m3, buf1
- TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
-%if WIN64
- add rsp, 16
-%endif
- RET
-
-ALIGN 16
-ff_chroma_inter_body_mmxext:
- LOAD_MASK r2d, r3d
- movd m6, [r4] ; tc0
- punpcklbw m6, m6
- pand m7, m6
- DEBLOCK_P0_Q0
- ret
-
-
-
-; in: %1=p0 %2=p1 %3=q1
-; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
-%macro CHROMA_INTRA_P0 3
- movq m4, %1
- pxor m4, %3
- pand m4, [pb_1] ; m4 = (p0^q1)&1
- pavgb %1, %3
- psubusb %1, m4
- pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
-%endmacro
-
-%define t5 r4
-%define t6 r5
-
-;-----------------------------------------------------------------------------
-; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_intra_8, 4,5
- CHROMA_V_START
- movq m0, [t5]
- movq m1, [t5+r1]
- movq m2, [r0]
- movq m3, [r0+r1]
- call ff_chroma_intra_body_mmxext
- movq [t5+r1], m1
- movq [r0], m2
- RET
-
-;-----------------------------------------------------------------------------
-; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_intra_8, 4,6
- CHROMA_H_START
- TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
- call ff_chroma_intra_body_mmxext
- TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
- RET
-
-ALIGN 16
-ff_chroma_intra_body_mmxext:
- LOAD_MASK r2d, r3d
- movq m5, m1
- movq m6, m2
- CHROMA_INTRA_P0 m1, m0, m3
- CHROMA_INTRA_P0 m2, m3, m0
- psubb m1, m5
- psubb m2, m6
- pand m1, m7
- pand m2, m7
- paddb m1, m5
- paddb m2, m6
- ret
-
-;-----------------------------------------------------------------------------
-; void h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
-; int8_t ref[2][40], int16_t mv[2][40][2],
-; int bidir, int edges, int step,
-; int mask_mv0, int mask_mv1, int field);
-;
-; bidir is 0 or 1
-; edges is 1 or 4
-; step is 1 or 2
-; mask_mv0 is 0 or 3
-; mask_mv1 is 0 or 1
-; field is 0 or 1
-;-----------------------------------------------------------------------------
-%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
- ; dir, d_idx, mask_dir, bidir
-%define edgesd %1
-%define stepd %2
-%define mask_mvd %3
-%define dir %4
-%define d_idx %5
-%define mask_dir %6
-%define bidir %7
- xor b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step)
-%%.b_idx_loop:
-%if mask_dir == 0
- pxor m0, m0
-%endif
- test b_idxd, dword mask_mvd
- jnz %%.skip_loop_iter ; if (!(b_idx & mask_mv))
-%if bidir == 1
- movd m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] }
- punpckldq m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] }
- pshufw m0, [refq+b_idxq+12], 0x44 ; { ref0[b], ref0[b] }
- pshufw m1, [refq+b_idxq+52], 0x44 ; { ref1[b], ref1[b] }
- pshufw m3, m2, 0x4E ; { ref1[bn], ref0[bn] }
- psubb m0, m2 ; { ref0[b] != ref0[bn],
- ; ref0[b] != ref1[bn] }
- psubb m1, m3 ; { ref1[b] != ref1[bn],
- ; ref1[b] != ref0[bn] }
-
- por m0, m1
- mova m1, [mvq+b_idxq*4+(d_idx+12)*4]
- mova m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
- mova m3, m1
- mova m4, m2
- psubw m1, [mvq+b_idxq*4+12*4]
- psubw m2, [mvq+b_idxq*4+12*4+mmsize]
- psubw m3, [mvq+b_idxq*4+52*4]
- psubw m4, [mvq+b_idxq*4+52*4+mmsize]
- packsswb m1, m2
- packsswb m3, m4
- paddb m1, m6
- paddb m3, m6
- psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
- psubusb m3, m5
- packsswb m1, m3
-
- por m0, m1
- mova m1, [mvq+b_idxq*4+(d_idx+52)*4]
- mova m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
- mova m3, m1
- mova m4, m2
- psubw m1, [mvq+b_idxq*4+12*4]
- psubw m2, [mvq+b_idxq*4+12*4+mmsize]
- psubw m3, [mvq+b_idxq*4+52*4]
- psubw m4, [mvq+b_idxq*4+52*4+mmsize]
- packsswb m1, m2
- packsswb m3, m4
- paddb m1, m6
- paddb m3, m6
- psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
- psubusb m3, m5
- packsswb m1, m3
-
- pshufw m1, m1, 0x4E
- por m0, m1
- pshufw m1, m0, 0x4E
- pminub m0, m1
-%else ; bidir == 0
- movd m0, [refq+b_idxq+12]
- psubb m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn]
-
- mova m1, [mvq+b_idxq*4+12*4]
- mova m2, [mvq+b_idxq*4+12*4+mmsize]
- psubw m1, [mvq+b_idxq*4+(d_idx+12)*4]
- psubw m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
- packsswb m1, m2
- paddb m1, m6
- psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
- packsswb m1, m1
- por m0, m1
-%endif ; bidir == 1/0
-
-%%.skip_loop_iter:
- movd m1, [nnzq+b_idxq+12]
- por m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn]
-
- pminub m1, m7
- pminub m0, m7
- psllw m1, 1
- pxor m2, m2
- pmaxub m1, m0
- punpcklbw m1, m2
- movq [bsq+b_idxq+32*dir], m1
-
- add b_idxd, dword stepd
- cmp b_idxd, dword edgesd
- jl %%.b_idx_loop
-%endmacro
-
-INIT_MMX mmxext
-cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \
- step, mask_mv0, mask_mv1, field
-%define b_idxq bidirq
-%define b_idxd bidird
- cmp dword fieldm, 0
- mova m7, [pb_1]
- mova m5, [pb_3]
- je .nofield
- mova m5, [pb_3_1]
-.nofield:
- mova m6, m5
- paddb m5, m5
-
- shl dword stepd, 3
- shl dword edgesd, 3
-%if ARCH_X86_32
-%define mask_mv0d mask_mv0m
-%define mask_mv1d mask_mv1m
-%endif
- shl dword mask_mv1d, 3
- shl dword mask_mv0d, 3
-
- cmp dword bidird, 0
- jne .bidir
- loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 0
- loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 0
-
- mova m0, [bsq+mmsize*0]
- mova m1, [bsq+mmsize*1]
- mova m2, [bsq+mmsize*2]
- mova m3, [bsq+mmsize*3]
- TRANSPOSE4x4W 0, 1, 2, 3, 4
- mova [bsq+mmsize*0], m0
- mova [bsq+mmsize*1], m1
- mova [bsq+mmsize*2], m2
- mova [bsq+mmsize*3], m3
- RET
-
-.bidir:
- loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 1
- loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 1
-
- mova m0, [bsq+mmsize*0]
- mova m1, [bsq+mmsize*1]
- mova m2, [bsq+mmsize*2]
- mova m3, [bsq+mmsize*3]
- TRANSPOSE4x4W 0, 1, 2, 3, 4
- mova [bsq+mmsize*0], m0
- mova [bsq+mmsize*1], m1
- mova [bsq+mmsize*2], m2
- mova [bsq+mmsize*3], m3
- RET
diff --git a/ffmpeg/libavcodec/x86/h264_deblock_10bit.asm b/ffmpeg/libavcodec/x86/h264_deblock_10bit.asm
deleted file mode 100644
index fdaf510..0000000
--- a/ffmpeg/libavcodec/x86/h264_deblock_10bit.asm
+++ /dev/null
@@ -1,923 +0,0 @@
-;*****************************************************************************
-;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
-;*****************************************************************************
-;* Copyright (C) 2005-2011 x264 project
-;*
-;* Authors: Oskar Arvidsson <oskar@irock.se>
-;* Loren Merritt <lorenm@u.washington.edu>
-;* Jason Garrett-Glaser <darkshikari@gmail.com>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-
-SECTION .text
-
-cextern pw_2
-cextern pw_3
-cextern pw_4
-
-; out: %4 = |%1-%2|-%3
-; clobbers: %5
-%macro ABS_SUB 5
- psubusw %5, %2, %1
- psubusw %4, %1, %2
- por %4, %5
- psubw %4, %3
-%endmacro
-
-; out: %4 = |%1-%2|<%3
-%macro DIFF_LT 5
- psubusw %4, %2, %1
- psubusw %5, %1, %2
- por %5, %4 ; |%1-%2|
- pxor %4, %4
- psubw %5, %3 ; |%1-%2|-%3
- pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
-%endmacro
-
-%macro LOAD_AB 4
- movd %1, %3
- movd %2, %4
- SPLATW %1, %1
- SPLATW %2, %2
-%endmacro
-
-; in: %2=tc reg
-; out: %1=splatted tc
-%macro LOAD_TC 2
- movd %1, [%2]
- punpcklbw %1, %1
-%if mmsize == 8
- pshufw %1, %1, 0
-%else
- pshuflw %1, %1, 01010000b
- pshufd %1, %1, 01010000b
-%endif
- psraw %1, 6
-%endmacro
-
-; in: %1=p1, %2=p0, %3=q0, %4=q1
-; %5=alpha, %6=beta, %7-%9=tmp
-; out: %7=mask
-%macro LOAD_MASK 9
- ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha
- ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta
- pand %8, %9
- ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta
- pxor %7, %7
- pand %8, %9
- pcmpgtw %7, %8
-%endmacro
-
-; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
-; out: %1=p0', %2=q0'
-%macro DEBLOCK_P0_Q0 7
- psubw %3, %4
- pxor %7, %7
- paddw %3, [pw_4]
- psubw %7, %5
- psubw %6, %2, %1
- psllw %6, 2
- paddw %3, %6
- psraw %3, 3
- mova %6, [pw_pixel_max]
- CLIPW %3, %7, %5
- pxor %7, %7
- paddw %1, %3
- psubw %2, %3
- CLIPW %1, %7, %6
- CLIPW %2, %7, %6
-%endmacro
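For reference, LOAD_MASK plus DEBLOCK_P0_Q0 correspond to the normal (tc-clipped) H.264 filter for p0/q0. The following C is an editor's sketch only (deblock_p0_q0_scalar and pix_max are illustrative names), not code from this file:

/* Illustrative scalar sketch: normal-filter update of p0/q0 for one sample.
 * Applied only where |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta
 * (the LOAD_MASK result); tc is the already-scaled clip value. */
static void deblock_p0_q0_scalar(int *p0, int *q0, int p1, int q1,
                                 int tc, int pix_max)
{
    int delta = ((*q0 - *p0) * 4 + (p1 - q1) + 4) >> 3;
    if (delta < -tc) delta = -tc;
    if (delta >  tc) delta =  tc;
    *p0 += delta;
    *q0 -= delta;
    if (*p0 < 0) *p0 = 0; else if (*p0 > pix_max) *p0 = pix_max;
    if (*q0 < 0) *q0 = 0; else if (*q0 > pix_max) *q0 = pix_max;
}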
-
-; in: %1=x2, %2=x1, %3=p0, %4=q0, %5=mask&tc, %6=tmp
-%macro LUMA_Q1 6
- pavgw %6, %3, %4 ; (p0+q0+1)>>1
- paddw %1, %6
- pxor %6, %6
- psraw %1, 1
- psubw %6, %5
- psubw %1, %2
- CLIPW %1, %6, %5
- paddw %1, %2
-%endmacro
-
-%macro LUMA_DEBLOCK_ONE 3
- DIFF_LT m5, %1, bm, m4, m6
- pxor m6, m6
- mova %3, m4
- pcmpgtw m6, tcm
- pand m4, tcm
- pandn m6, m7
- pand m4, m6
- LUMA_Q1 m5, %2, m1, m2, m4, m6
-%endmacro
-
-%macro LUMA_H_STORE 2
-%if mmsize == 8
- movq [r0-4], m0
- movq [r0+r1-4], m1
- movq [r0+r1*2-4], m2
- movq [r0+%2-4], m3
-%else
- movq [r0-4], m0
- movhps [r0+r1-4], m0
- movq [r0+r1*2-4], m1
- movhps [%1-4], m1
- movq [%1+r1-4], m2
- movhps [%1+r1*2-4], m2
- movq [%1+%2-4], m3
- movhps [%1+r1*4-4], m3
-%endif
-%endmacro
-
-%macro DEBLOCK_LUMA 0
-;-----------------------------------------------------------------------------
-; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
- %assign pad 5*mmsize+12-(stack_offset&15)
- %define tcm [rsp]
- %define ms1 [rsp+mmsize]
- %define ms2 [rsp+mmsize*2]
- %define am [rsp+mmsize*3]
- %define bm [rsp+mmsize*4]
- SUB rsp, pad
- shl r2d, 2
- shl r3d, 2
- LOAD_AB m4, m5, r2d, r3d
- mov r3, 32/mmsize
- mov r2, r0
- sub r0, r1
- mova am, m4
- sub r0, r1
- mova bm, m5
- sub r0, r1
-.loop:
- mova m0, [r0+r1]
- mova m1, [r0+r1*2]
- mova m2, [r2]
- mova m3, [r2+r1]
-
- LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
- LOAD_TC m6, r4
- mova tcm, m6
-
- mova m5, [r0]
- LUMA_DEBLOCK_ONE m1, m0, ms1
- mova [r0+r1], m5
-
- mova m5, [r2+r1*2]
- LUMA_DEBLOCK_ONE m2, m3, ms2
- mova [r2+r1], m5
-
- pxor m5, m5
- mova m6, tcm
- pcmpgtw m5, tcm
- psubw m6, ms1
- pandn m5, m7
- psubw m6, ms2
- pand m5, m6
- DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
- mova [r0+r1*2], m1
- mova [r2], m2
-
- add r0, mmsize
- add r2, mmsize
- add r4, mmsize/8
- dec r3
- jg .loop
- ADD rsp, pad
- RET
-
-cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
- %assign pad 7*mmsize+12-(stack_offset&15)
- %define tcm [rsp]
- %define ms1 [rsp+mmsize]
- %define ms2 [rsp+mmsize*2]
- %define p1m [rsp+mmsize*3]
- %define p2m [rsp+mmsize*4]
- %define am [rsp+mmsize*5]
- %define bm [rsp+mmsize*6]
- SUB rsp, pad
- shl r2d, 2
- shl r3d, 2
- LOAD_AB m4, m5, r2d, r3d
- mov r3, r1
- mova am, m4
- add r3, r1
- mov r5, 32/mmsize
- mova bm, m5
- add r3, r1
-%if mmsize == 16
- mov r2, r0
- add r2, r3
-%endif
-.loop:
-%if mmsize == 8
- movq m2, [r0-8] ; y q2 q1 q0
- movq m7, [r0+0]
- movq m5, [r0+r1-8]
- movq m3, [r0+r1+0]
- movq m0, [r0+r1*2-8]
- movq m6, [r0+r1*2+0]
- movq m1, [r0+r3-8]
- TRANSPOSE4x4W 2, 5, 0, 1, 4
- SWAP 2, 7
- movq m7, [r0+r3]
- TRANSPOSE4x4W 2, 3, 6, 7, 4
-%else
- movu m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
- movu m0, [r0+r1-8]
- movu m2, [r0+r1*2-8]
- movu m3, [r2-8]
- TRANSPOSE4x4W 5, 0, 2, 3, 6
- mova tcm, m3
-
- movu m4, [r2+r1-8]
- movu m1, [r2+r1*2-8]
- movu m3, [r2+r3-8]
- movu m7, [r2+r1*4-8]
- TRANSPOSE4x4W 4, 1, 3, 7, 6
-
- mova m6, tcm
- punpcklqdq m6, m7
- punpckhqdq m5, m4
- SBUTTERFLY qdq, 0, 1, 7
- SBUTTERFLY qdq, 2, 3, 7
-%endif
-
- mova p2m, m6
- LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
- LOAD_TC m6, r4
- mova tcm, m6
-
- LUMA_DEBLOCK_ONE m1, m0, ms1
- mova p1m, m5
-
- mova m5, p2m
- LUMA_DEBLOCK_ONE m2, m3, ms2
- mova p2m, m5
-
- pxor m5, m5
- mova m6, tcm
- pcmpgtw m5, tcm
- psubw m6, ms1
- pandn m5, m7
- psubw m6, ms2
- pand m5, m6
- DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
- mova m0, p1m
- mova m3, p2m
- TRANSPOSE4x4W 0, 1, 2, 3, 4
- LUMA_H_STORE r2, r3
-
- add r4, mmsize/8
- lea r0, [r0+r1*(mmsize/2)]
- lea r2, [r2+r1*(mmsize/2)]
- dec r5
- jg .loop
- ADD rsp, pad
- RET
-%endmacro
-
-%if ARCH_X86_64
-; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
-; m12=alpha, m13=beta
-; out: m0=p1', m3=q1', m1=p0', m2=q0'
-; clobbers: m4, m5, m6, m7, m10, m11, m14
-%macro DEBLOCK_LUMA_INTER_SSE2 0
- LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6
- LOAD_TC m6, r4
- DIFF_LT m8, m1, m13, m10, m4
- DIFF_LT m9, m2, m13, m11, m4
- pand m6, m7
-
- mova m14, m6
- pxor m4, m4
- pcmpgtw m6, m4
- pand m6, m14
-
- mova m5, m10
- pand m5, m6
- LUMA_Q1 m8, m0, m1, m2, m5, m4
-
- mova m5, m11
- pand m5, m6
- LUMA_Q1 m9, m3, m1, m2, m5, m4
-
- pxor m4, m4
- psubw m6, m10
- pcmpgtw m4, m14
- pandn m4, m7
- psubw m6, m11
- pand m4, m6
- DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6
-
- SWAP 0, 8
- SWAP 3, 9
-%endmacro
-
-%macro DEBLOCK_LUMA_64 0
-cglobal deblock_v_luma_10, 5,5,15
- %define p2 m8
- %define p1 m0
- %define p0 m1
- %define q0 m2
- %define q1 m3
- %define q2 m9
- %define mask0 m7
- %define mask1 m10
- %define mask2 m11
- shl r2d, 2
- shl r3d, 2
- LOAD_AB m12, m13, r2d, r3d
- mov r2, r0
- sub r0, r1
- sub r0, r1
- sub r0, r1
- mov r3, 2
-.loop:
- mova p2, [r0]
- mova p1, [r0+r1]
- mova p0, [r0+r1*2]
- mova q0, [r2]
- mova q1, [r2+r1]
- mova q2, [r2+r1*2]
- DEBLOCK_LUMA_INTER_SSE2
- mova [r0+r1], p1
- mova [r0+r1*2], p0
- mova [r2], q0
- mova [r2+r1], q1
- add r0, mmsize
- add r2, mmsize
- add r4, 2
- dec r3
- jg .loop
- REP_RET
-
-cglobal deblock_h_luma_10, 5,7,15
- shl r2d, 2
- shl r3d, 2
- LOAD_AB m12, m13, r2d, r3d
- mov r2, r1
- add r2, r1
- add r2, r1
- mov r5, r0
- add r5, r2
- mov r6, 2
-.loop:
- movu m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
- movu m0, [r0+r1-8]
- movu m2, [r0+r1*2-8]
- movu m9, [r5-8]
- movu m5, [r5+r1-8]
- movu m1, [r5+r1*2-8]
- movu m3, [r5+r2-8]
- movu m7, [r5+r1*4-8]
-
- TRANSPOSE4x4W 8, 0, 2, 9, 10
- TRANSPOSE4x4W 5, 1, 3, 7, 10
-
- punpckhqdq m8, m5
- SBUTTERFLY qdq, 0, 1, 10
- SBUTTERFLY qdq, 2, 3, 10
- punpcklqdq m9, m7
-
- DEBLOCK_LUMA_INTER_SSE2
-
- TRANSPOSE4x4W 0, 1, 2, 3, 4
- LUMA_H_STORE r5, r2
- add r4, 2
- lea r0, [r0+r1*8]
- lea r5, [r5+r1*8]
- dec r6
- jg .loop
- REP_RET
-%endmacro
-
-INIT_XMM sse2
-DEBLOCK_LUMA_64
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-DEBLOCK_LUMA_64
-%endif
-%endif
-
-%macro SWAPMOVA 2
-%ifid %1
- SWAP %1, %2
-%else
- mova %1, %2
-%endif
-%endmacro
-
-; in: t0-t2: tmp registers
-; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
-; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
-%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
-%if ARCH_X86_64
- paddw t0, %3, %2
- mova t2, %4
- paddw t2, %3
-%else
- mova t0, %3
- mova t2, %4
- paddw t0, %2
- paddw t2, %3
-%endif
- paddw t0, %1
- paddw t2, t2
- paddw t0, %5
- paddw t2, %9
- paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2)
- paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)
-
- psrlw t2, 3
- psrlw t1, t0, 2
- psubw t2, %3
- psubw t1, %2
- pand t2, %8
- pand t1, %8
- paddw t2, %3
- paddw t1, %2
- SWAPMOVA %11, t1
-
- psubw t1, t0, %3
- paddw t0, t0
- psubw t1, %5
- psubw t0, %3
- paddw t1, %6
- paddw t1, %2
- paddw t0, %6
- psrlw t1, 2 ; (2*p1 + p0 + q1 + 2)/4
- psrlw t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
-
- pxor t0, t1
- pxor t1, %1
- pand t0, %8
- pand t1, %7
- pxor t0, t1
- pxor t0, %1
- SWAPMOVA %10, t0
- SWAPMOVA %12, t2
-%endmacro
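Spelled out as scalar C, the strong/weak selection LUMA_INTRA_P012 performs looks like the sketch below (luma_intra_p012_scalar is an illustrative name, not from this file; the macro blends the two results with the masks in %7/%8 rather than branching):

/* Illustrative scalar sketch of the p-side update; the q side is symmetric.
 * Assumes the edge already passed the alpha/beta test (mask0); otherwise the
 * pixels are left unchanged. */
static void luma_intra_p012_scalar(int p3, int p2, int p1, int p0,
                                   int q0, int q1, int strong /* mask1p */,
                                   int *np0, int *np1, int *np2)
{
    if (strong) {
        *np0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3;
        *np1 = (p2 + p1 + p0 + q0 + 2) >> 2;
        *np2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3;
    } else {
        *np0 = (2*p1 + p0 + q1 + 2) >> 2;   /* short intra filter */
        *np1 = p1;
        *np2 = p2;
    }
}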
-
-%macro LUMA_INTRA_INIT 1
- %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
- %define t0 m4
- %define t1 m5
- %define t2 m6
- %define t3 m7
- %assign i 4
-%rep %1
- CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
- %assign i i+1
-%endrep
- SUB rsp, pad
-%endmacro
-
-; in: %1-%3=tmp, %4=p2, %5=q2
-%macro LUMA_INTRA_INTER 5
- LOAD_AB t0, t1, r2d, r3d
- mova %1, t0
- LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
-%if ARCH_X86_64
- mova %2, t0 ; mask0
- psrlw t3, %1, 2
-%else
- mova t3, %1
- mova %2, t0 ; mask0
- psrlw t3, 2
-%endif
- paddw t3, [pw_2] ; alpha/4+2
- DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
- pand t2, %2
- mova t3, %5 ; q2
- mova %1, t2 ; mask1
- DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
- pand t2, %1
- mova t3, %4 ; p2
- mova %3, t2 ; mask1q
- DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
- pand t2, %1
- mova %1, t2 ; mask1p
-%endmacro
-
-%macro LUMA_H_INTRA_LOAD 0
-%if mmsize == 8
- movu t0, [r0-8]
- movu t1, [r0+r1-8]
- movu m0, [r0+r1*2-8]
- movu m1, [r0+r4-8]
- TRANSPOSE4x4W 4, 5, 0, 1, 2
- mova t4, t0 ; p3
- mova t5, t1 ; p2
-
- movu m2, [r0]
- movu m3, [r0+r1]
- movu t0, [r0+r1*2]
- movu t1, [r0+r4]
- TRANSPOSE4x4W 2, 3, 4, 5, 6
- mova t6, t0 ; q2
- mova t7, t1 ; q3
-%else
- movu t0, [r0-8]
- movu t1, [r0+r1-8]
- movu m0, [r0+r1*2-8]
- movu m1, [r0+r5-8]
- movu m2, [r4-8]
- movu m3, [r4+r1-8]
- movu t2, [r4+r1*2-8]
- movu t3, [r4+r5-8]
- TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
- mova t4, t0 ; p3
- mova t5, t1 ; p2
- mova t6, t2 ; q2
- mova t7, t3 ; q3
-%endif
-%endmacro
-
-; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
-%macro LUMA_H_INTRA_STORE 9
-%if mmsize == 8
- TRANSPOSE4x4W %1, %2, %3, %4, %9
- movq [r0-8], m%1
- movq [r0+r1-8], m%2
- movq [r0+r1*2-8], m%3
- movq [r0+r4-8], m%4
- movq m%1, %8
- TRANSPOSE4x4W %5, %6, %7, %1, %9
- movq [r0], m%5
- movq [r0+r1], m%6
- movq [r0+r1*2], m%7
- movq [r0+r4], m%1
-%else
- TRANSPOSE2x4x4W %1, %2, %3, %4, %9
- movq [r0-8], m%1
- movq [r0+r1-8], m%2
- movq [r0+r1*2-8], m%3
- movq [r0+r5-8], m%4
- movhps [r4-8], m%1
- movhps [r4+r1-8], m%2
- movhps [r4+r1*2-8], m%3
- movhps [r4+r5-8], m%4
-%ifnum %8
- SWAP %1, %8
-%else
- mova m%1, %8
-%endif
- TRANSPOSE2x4x4W %5, %6, %7, %1, %9
- movq [r0], m%5
- movq [r0+r1], m%6
- movq [r0+r1*2], m%7
- movq [r0+r5], m%1
- movhps [r4], m%5
- movhps [r4+r1], m%6
- movhps [r4+r1*2], m%7
- movhps [r4+r5], m%1
-%endif
-%endmacro
-
-%if ARCH_X86_64
-;-----------------------------------------------------------------------------
-; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-%macro DEBLOCK_LUMA_INTRA_64 0
-cglobal deblock_v_luma_intra_10, 4,7,16
- %define t0 m1
- %define t1 m2
- %define t2 m4
- %define p2 m8
- %define p1 m9
- %define p0 m10
- %define q0 m11
- %define q1 m12
- %define q2 m13
- %define aa m5
- %define bb m14
- lea r4, [r1*4]
- lea r5, [r1*3] ; 3*stride
- neg r4
- add r4, r0 ; pix-4*stride
- mov r6, 2
- mova m0, [pw_2]
- shl r2d, 2
- shl r3d, 2
- LOAD_AB aa, bb, r2d, r3d
-.loop:
- mova p2, [r4+r1]
- mova p1, [r4+2*r1]
- mova p0, [r4+r5]
- mova q0, [r0]
- mova q1, [r0+r1]
- mova q2, [r0+2*r1]
-
- LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
- mova t2, aa
- psrlw t2, 2
- paddw t2, m0 ; alpha/4+2
- DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
- DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
- DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
- pand m6, m3
- pand m7, m6
- pand m6, t1
- LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
- LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
- add r0, mmsize
- add r4, mmsize
- dec r6
- jg .loop
- REP_RET
-
-;-----------------------------------------------------------------------------
-; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_10, 4,7,16
- %define t0 m15
- %define t1 m14
- %define t2 m2
- %define q3 m5
- %define q2 m8
- %define q1 m9
- %define q0 m10
- %define p0 m11
- %define p1 m12
- %define p2 m13
- %define p3 m4
- %define spill [rsp]
- %assign pad 24-(stack_offset&15)
- SUB rsp, pad
- lea r4, [r1*4]
- lea r5, [r1*3] ; 3*stride
- add r4, r0 ; pix+4*stride
- mov r6, 2
- mova m0, [pw_2]
- shl r2d, 2
- shl r3d, 2
-.loop:
- movu q3, [r0-8]
- movu q2, [r0+r1-8]
- movu q1, [r0+r1*2-8]
- movu q0, [r0+r5-8]
- movu p0, [r4-8]
- movu p1, [r4+r1-8]
- movu p2, [r4+r1*2-8]
- movu p3, [r4+r5-8]
- TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1
-
- LOAD_AB m1, m2, r2d, r3d
- LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
- psrlw m1, 2
- paddw m1, m0 ; alpha/4+2
- DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
- DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
- DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
- pand m6, m3
- pand m7, m6
- pand m6, t1
-
- mova spill, q3
- LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
- LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
- mova m7, spill
-
- LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14
-
- lea r0, [r0+r1*8]
- lea r4, [r4+r1*8]
- dec r6
- jg .loop
- ADD rsp, pad
- RET
-%endmacro
-
-INIT_XMM sse2
-DEBLOCK_LUMA_INTRA_64
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-DEBLOCK_LUMA_INTRA_64
-%endif
-
-%endif
-
-%macro DEBLOCK_LUMA_INTRA 0
-;-----------------------------------------------------------------------------
-; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
- LUMA_INTRA_INIT 3
- lea r4, [r1*4]
- lea r5, [r1*3]
- neg r4
- add r4, r0
- mov r6, 32/mmsize
- shl r2d, 2
- shl r3d, 2
-.loop:
- mova m0, [r4+r1*2] ; p1
- mova m1, [r4+r5] ; p0
- mova m2, [r0] ; q0
- mova m3, [r0+r1] ; q1
- LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
- LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
- mova t3, [r0+r1*2] ; q2
- LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
- add r0, mmsize
- add r4, mmsize
- dec r6
- jg .loop
- ADD rsp, pad
- RET
-
-;-----------------------------------------------------------------------------
-; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
- LUMA_INTRA_INIT 8
-%if mmsize == 8
- lea r4, [r1*3]
- mov r5, 32/mmsize
-%else
- lea r4, [r1*4]
- lea r5, [r1*3] ; 3*stride
- add r4, r0 ; pix+4*stride
- mov r6, 32/mmsize
-%endif
- shl r2d, 2
- shl r3d, 2
-.loop:
- LUMA_H_INTRA_LOAD
- LUMA_INTRA_INTER t8, t9, t10, t5, t6
-
- LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
- mova t3, t6 ; q2
- LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5
-
- mova m2, t4
- mova m0, t11
- mova m1, t5
- mova m3, t8
- mova m6, t6
-
- LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7
-
- lea r0, [r0+r1*(mmsize/2)]
-%if mmsize == 8
- dec r5
-%else
- lea r4, [r4+r1*(mmsize/2)]
- dec r6
-%endif
- jg .loop
- ADD rsp, pad
- RET
-%endmacro
-
-%if ARCH_X86_64 == 0
-INIT_MMX mmxext
-DEBLOCK_LUMA
-DEBLOCK_LUMA_INTRA
-INIT_XMM sse2
-DEBLOCK_LUMA
-DEBLOCK_LUMA_INTRA
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-DEBLOCK_LUMA
-DEBLOCK_LUMA_INTRA
-%endif
-%endif
-
-; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
-; out: %1=p0', %2=q0'
-%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
- mova %6, [pw_2]
- paddw %6, %3
- paddw %6, %4
- paddw %7, %6, %2
- paddw %6, %1
- paddw %6, %3
- paddw %7, %4
- psraw %6, 2
- psraw %7, 2
- psubw %6, %1
- psubw %7, %2
- pand %6, %5
- pand %7, %5
- paddw %1, %6
- paddw %2, %7
-%endmacro
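This is the standard chroma intra filter; as a scalar sketch (editor's illustration only, applied only where the alpha/beta mask in %5 is set):

/* Illustrative scalar sketch (not from this file): chroma intra p0/q0 update. */
static void chroma_intra_p0_q0_scalar(int *p0, int *q0, int p1, int q1)
{
    *p0 = (2*p1 + *p0 + q1 + 2) >> 2;
    *q0 = (2*q1 + *q0 + p1 + 2) >> 2;
}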
-
-%macro CHROMA_V_LOAD 1
- mova m0, [r0] ; p1
- mova m1, [r0+r1] ; p0
- mova m2, [%1] ; q0
- mova m3, [%1+r1] ; q1
-%endmacro
-
-%macro CHROMA_V_STORE 0
- mova [r0+1*r1], m1
- mova [r0+2*r1], m2
-%endmacro
-
-%macro CHROMA_V_LOAD_TC 2
- movd %1, [%2]
- punpcklbw %1, %1
- punpcklwd %1, %1
- psraw %1, 6
-%endmacro
-
-%macro DEBLOCK_CHROMA 0
-;-----------------------------------------------------------------------------
-; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
-;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
- mov r5, r0
- sub r0, r1
- sub r0, r1
- shl r2d, 2
- shl r3d, 2
-%if mmsize < 16
- mov r6, 16/mmsize
-.loop:
-%endif
- CHROMA_V_LOAD r5
- LOAD_AB m4, m5, r2d, r3d
- LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
- pxor m4, m4
- CHROMA_V_LOAD_TC m6, r4
- psubw m6, [pw_3]
- pmaxsw m6, m4
- pand m7, m6
- DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
- CHROMA_V_STORE
-%if mmsize < 16
- add r0, mmsize
- add r5, mmsize
- add r4, mmsize/4
- dec r6
- jg .loop
- REP_RET
-%else
- RET
-%endif
-
-;-----------------------------------------------------------------------------
-; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
-;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
- mov r4, r0
- sub r0, r1
- sub r0, r1
- shl r2d, 2
- shl r3d, 2
-%if mmsize < 16
- mov r5, 16/mmsize
-.loop:
-%endif
- CHROMA_V_LOAD r4
- LOAD_AB m4, m5, r2d, r3d
- LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
- CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
- CHROMA_V_STORE
-%if mmsize < 16
- add r0, mmsize
- add r4, mmsize
- dec r5
- jg .loop
- REP_RET
-%else
- RET
-%endif
-%endmacro
-
-%if ARCH_X86_64 == 0
-INIT_MMX mmxext
-DEBLOCK_CHROMA
-%endif
-INIT_XMM sse2
-DEBLOCK_CHROMA
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-DEBLOCK_CHROMA
-%endif
diff --git a/ffmpeg/libavcodec/x86/h264_i386.h b/ffmpeg/libavcodec/x86/h264_i386.h
deleted file mode 100644
index 0dc0a7c..0000000
--- a/ffmpeg/libavcodec/x86/h264_i386.h
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * H.264 / AVC / MPEG4 part10 codec.
- * non-MMX i386-specific optimizations for H.264
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#ifndef AVCODEC_X86_H264_I386_H
-#define AVCODEC_X86_H264_I386_H
-
-#include <stddef.h>
-
-#include "libavcodec/cabac.h"
-#include "cabac.h"
-
-#if HAVE_INLINE_ASM
-
-//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet,
-//as that would make the optimizer's work harder)
-#if HAVE_7REGS
-#define decode_significance decode_significance_x86
-static int decode_significance_x86(CABACContext *c, int max_coeff,
- uint8_t *significant_coeff_ctx_base,
- int *index, x86_reg last_off){
- void *end= significant_coeff_ctx_base + max_coeff - 1;
- int minusstart= -(intptr_t)significant_coeff_ctx_base;
- int minusindex= 4-(intptr_t)index;
- int bit;
- x86_reg coeff_count;
-
-#ifdef BROKEN_RELOCATIONS
- void *tables;
-
- __asm__ volatile(
- "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
- : "=&r"(tables)
- );
-#endif
-
- __asm__ volatile(
- "3: \n\t"
-
- BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
- "%5", "%q5", "%k0", "%b0",
- "%c11(%6)", "%c12(%6)",
- AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
- AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
- AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
- "%13")
-
- "test $1, %4 \n\t"
- " jz 4f \n\t"
- "add %10, %1 \n\t"
-
- BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
- "%5", "%q5", "%k0", "%b0",
- "%c11(%6)", "%c12(%6)",
- AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
- AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
- AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
- "%13")
-
- "sub %10, %1 \n\t"
- "mov %2, %0 \n\t"
- "movl %7, %%ecx \n\t"
- "add %1, %%"REG_c" \n\t"
- "movl %%ecx, (%0) \n\t"
-
- "test $1, %4 \n\t"
- " jnz 5f \n\t"
-
- "add"OPSIZE" $4, %2 \n\t"
-
- "4: \n\t"
- "add $1, %1 \n\t"
- "cmp %8, %1 \n\t"
- " jb 3b \n\t"
- "mov %2, %0 \n\t"
- "movl %7, %%ecx \n\t"
- "add %1, %%"REG_c" \n\t"
- "movl %%ecx, (%0) \n\t"
- "5: \n\t"
- "add %9, %k0 \n\t"
- "shr $2, %k0 \n\t"
- : "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
- "+&r"(c->low), "=&r"(bit), "+&r"(c->range)
- : "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
- "i"(offsetof(CABACContext, bytestream)),
- "i"(offsetof(CABACContext, bytestream_end))
- TABLES_ARG
- : "%"REG_c, "memory"
- );
- return coeff_count;
-}
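Schematically, the branchless inline asm above replaces a scalar loop of the following shape. This is an editor's sketch, not the exact reference in libavcodec/h264_cabac.c; get_cabac() is the scalar CABAC bin decoder from libavcodec/cabac_functions.h, and the other types come from the headers included at the top of this file:

/* Illustrative sketch of the scalar significance-map scan that
 * decode_significance_x86() accelerates.  The "last coefficient" contexts sit
 * last_off bytes after the significance contexts, as in the asm above. */
static int decode_significance_ref(CABACContext *c, int max_coeff,
                                   uint8_t *sig_ctx, int *index,
                                   x86_reg last_off)
{
    int i, coeff_count = 0;
    for (i = 0; i < max_coeff - 1; i++) {
        if (get_cabac(c, sig_ctx + i)) {
            index[coeff_count++] = i;
            if (get_cabac(c, sig_ctx + i + last_off))
                return coeff_count;       /* this was the last coefficient */
        }
    }
    index[coeff_count++] = i;             /* position max_coeff-1 is implicit */
    return coeff_count;
}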
-
-#define decode_significance_8x8 decode_significance_8x8_x86
-static int decode_significance_8x8_x86(CABACContext *c,
- uint8_t *significant_coeff_ctx_base,
- int *index, uint8_t *last_coeff_ctx_base, const uint8_t *sig_off){
- int minusindex= 4-(intptr_t)index;
- int bit;
- x86_reg coeff_count;
- x86_reg last=0;
- x86_reg state;
-
-#ifdef BROKEN_RELOCATIONS
- void *tables;
-
- __asm__ volatile(
- "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
- : "=&r"(tables)
- );
-#endif
-
- __asm__ volatile(
- "mov %1, %6 \n\t"
- "3: \n\t"
-
- "mov %10, %0 \n\t"
- "movzbl (%0, %6), %k6 \n\t"
- "add %9, %6 \n\t"
-
- BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
- "%5", "%q5", "%k0", "%b0",
- "%c12(%7)", "%c13(%7)",
- AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
- AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
- AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
- "%15")
-
- "mov %1, %k6 \n\t"
- "test $1, %4 \n\t"
- " jz 4f \n\t"
-
-#ifdef BROKEN_RELOCATIONS
- "movzbl %c14(%15, %q6), %k6\n\t"
-#else
- "movzbl "MANGLE(ff_h264_cabac_tables)"+%c14(%k6), %k6\n\t"
-#endif
- "add %11, %6 \n\t"
-
- BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
- "%5", "%q5", "%k0", "%b0",
- "%c12(%7)", "%c13(%7)",
- AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
- AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
- AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
- "%15")
-
- "mov %2, %0 \n\t"
- "mov %1, %k6 \n\t"
- "movl %k6, (%0) \n\t"
-
- "test $1, %4 \n\t"
- " jnz 5f \n\t"
-
- "add"OPSIZE" $4, %2 \n\t"
-
- "4: \n\t"
- "addl $1, %k6 \n\t"
- "mov %k6, %1 \n\t"
- "cmpl $63, %k6 \n\t"
- " jb 3b \n\t"
- "mov %2, %0 \n\t"
- "movl %k6, (%0) \n\t"
- "5: \n\t"
- "addl %8, %k0 \n\t"
- "shr $2, %k0 \n\t"
- : "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low),
- "=&r"(bit), "+&r"(c->range), "=&r"(state)
- : "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
- "m"(sig_off), "m"(last_coeff_ctx_base),
- "i"(offsetof(CABACContext, bytestream)),
- "i"(offsetof(CABACContext, bytestream_end)),
- "i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
- : "%"REG_c, "memory"
- );
- return coeff_count;
-}
-#endif /* HAVE_7REGS */
-
-#endif /* HAVE_INLINE_ASM */
-#endif /* AVCODEC_X86_H264_I386_H */
diff --git a/ffmpeg/libavcodec/x86/h264_idct.asm b/ffmpeg/libavcodec/x86/h264_idct.asm
deleted file mode 100644
index 9af98a9..0000000
--- a/ffmpeg/libavcodec/x86/h264_idct.asm
+++ /dev/null
@@ -1,1082 +0,0 @@
-;*****************************************************************************
-;* MMX/SSE2-optimized H.264 iDCT
-;*****************************************************************************
-;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
-;* Copyright (C) 2003-2008 x264 project
-;*
-;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
-;* Loren Merritt <lorenm@u.washington.edu>
-;* Holger Lubitz <hal@duncan.ol.sub.de>
-;* Min Chen <chenm001.163.com>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;*****************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
- db 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
- db 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
- db 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
- db 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
- db 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
- db 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
- db 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
- db 4+11*8, 5+11*8, 4+12*8, 5+12*8
- db 6+11*8, 7+11*8, 6+12*8, 7+12*8
- db 4+13*8, 5+13*8, 4+14*8, 5+14*8
- db 6+13*8, 7+13*8, 6+14*8, 7+14*8
-%ifdef PIC
-%define npicregs 1
-%define scan8 picregq
-%else
-%define npicregs 0
-%define scan8 scan8_mem
-%endif
-
-cextern pw_32
-cextern pw_1
-
-SECTION .text
-
-; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
-%macro IDCT4_ADD 3
- ; Load dct coeffs
- movq m0, [%2]
- movq m1, [%2+8]
- movq m2, [%2+16]
- movq m3, [%2+24]
-
- IDCT4_1D w, 0, 1, 2, 3, 4, 5
- mova m6, [pw_32]
- TRANSPOSE4x4W 0, 1, 2, 3, 4
- paddw m0, m6
- IDCT4_1D w, 0, 1, 2, 3, 4, 5
- pxor m7, m7
- movq [%2+ 0], m7
- movq [%2+ 8], m7
- movq [%2+16], m7
- movq [%2+24], m7
-
- STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
- lea %1, [%1+%3*2]
- STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
-%endmacro
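For orientation, the scalar operation IDCT4_ADD vectorizes is sketched below (idct4_add_sketch is an illustrative name; the C reference lives in libavcodec/h264idct_template.c): the standard H.264 4x4 butterfly over rows and columns, (x+32)>>6 rounding, a clipped add to dst, and the coefficient block cleared.

#include <stdint.h>
#include <string.h>

/* Illustrative scalar sketch (not from this file) of the 4x4 IDCT-and-add. */
static void idct4_add_sketch(uint8_t *dst, int16_t *block, int stride)
{
    int i, j, tmp[16];
    for (i = 0; i < 4; i++) {                              /* rows */
        int a = block[i*4+0], b = block[i*4+1], c = block[i*4+2], d = block[i*4+3];
        int z0 = a + c, z1 = a - c, z2 = (b >> 1) - d, z3 = b + (d >> 1);
        tmp[i*4+0] = z0 + z3; tmp[i*4+1] = z1 + z2;
        tmp[i*4+2] = z1 - z2; tmp[i*4+3] = z0 - z3;
    }
    for (j = 0; j < 4; j++) {                              /* columns + add */
        int a = tmp[0*4+j], b = tmp[1*4+j], c = tmp[2*4+j], d = tmp[3*4+j];
        int z0 = a + c, z1 = a - c, z2 = (b >> 1) - d, z3 = b + (d >> 1);
        int o[4] = { z0 + z3, z1 + z2, z1 - z2, z0 - z3 };
        for (i = 0; i < 4; i++) {
            int v = dst[i*stride + j] + ((o[i] + 32) >> 6);
            dst[i*stride + j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
    }
    memset(block, 0, 16 * sizeof(*block));                 /* asm clears it too */
}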
-
-INIT_MMX mmx
-; ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
-cglobal h264_idct_add_8, 3, 3, 0
- IDCT4_ADD r0, r1, r2
- RET
-
-%macro IDCT8_1D 2
- mova m0, m1
- psraw m1, 1
- mova m4, m5
- psraw m4, 1
- paddw m4, m5
- paddw m1, m0
- paddw m4, m7
- paddw m1, m5
- psubw m4, m0
- paddw m1, m3
-
- psubw m0, m3
- psubw m5, m3
- psraw m3, 1
- paddw m0, m7
- psubw m5, m7
- psraw m7, 1
- psubw m0, m3
- psubw m5, m7
-
- mova m7, m1
- psraw m1, 2
- mova m3, m4
- psraw m3, 2
- paddw m3, m0
- psraw m0, 2
- paddw m1, m5
- psraw m5, 2
- psubw m0, m4
- psubw m7, m5
-
- mova m5, m6
- psraw m6, 1
- mova m4, m2
- psraw m4, 1
- paddw m6, m2
- psubw m4, m5
-
- mova m2, %1
- mova m5, %2
- SUMSUB_BA w, 5, 2
- SUMSUB_BA w, 6, 5
- SUMSUB_BA w, 4, 2
- SUMSUB_BA w, 7, 6
- SUMSUB_BA w, 0, 4
- SUMSUB_BA w, 3, 2
- SUMSUB_BA w, 1, 5
- SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
-%endmacro
-
-%macro IDCT8_1D_FULL 1
- mova m7, [%1+112]
- mova m6, [%1+ 96]
- mova m5, [%1+ 80]
- mova m3, [%1+ 48]
- mova m2, [%1+ 32]
- mova m1, [%1+ 16]
- IDCT8_1D [%1], [%1+ 64]
-%endmacro
-
-; %1=int16_t *block, %2=int16_t *dstblock
-%macro IDCT8_ADD_MMX_START 2
- IDCT8_1D_FULL %1
- mova [%1], m7
- TRANSPOSE4x4W 0, 1, 2, 3, 7
- mova m7, [%1]
- mova [%2 ], m0
- mova [%2+16], m1
- mova [%2+32], m2
- mova [%2+48], m3
- TRANSPOSE4x4W 4, 5, 6, 7, 3
- mova [%2+ 8], m4
- mova [%2+24], m5
- mova [%2+40], m6
- mova [%2+56], m7
-%endmacro
-
-; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
-%macro IDCT8_ADD_MMX_END 3-4
- IDCT8_1D_FULL %2
- mova [%2 ], m5
- mova [%2+16], m6
- mova [%2+32], m7
-
- pxor m7, m7
-%if %0 == 4
- movq [%4+ 0], m7
- movq [%4+ 8], m7
- movq [%4+ 16], m7
- movq [%4+ 24], m7
- movq [%4+ 32], m7
- movq [%4+ 40], m7
- movq [%4+ 48], m7
- movq [%4+ 56], m7
- movq [%4+ 64], m7
- movq [%4+ 72], m7
- movq [%4+ 80], m7
- movq [%4+ 88], m7
- movq [%4+ 96], m7
- movq [%4+104], m7
- movq [%4+112], m7
- movq [%4+120], m7
-%endif
- STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
- lea %1, [%1+%3*2]
- STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
- mova m0, [%2 ]
- mova m1, [%2+16]
- mova m2, [%2+32]
- lea %1, [%1+%3*2]
- STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
- lea %1, [%1+%3*2]
- STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
-%endmacro
-
-INIT_MMX mmx
-; ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
-cglobal h264_idct8_add_8, 3, 4, 0
- %assign pad 128+4-(stack_offset&7)
- SUB rsp, pad
-
- add word [r1], 32
- IDCT8_ADD_MMX_START r1 , rsp
- IDCT8_ADD_MMX_START r1+8, rsp+64
- lea r3, [r0+4]
- IDCT8_ADD_MMX_END r0 , rsp, r2, r1
- IDCT8_ADD_MMX_END r3 , rsp+8, r2
-
- ADD rsp, pad
- RET
-
-; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
-%macro IDCT8_ADD_SSE 4
- IDCT8_1D_FULL %2
-%if ARCH_X86_64
- TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
-%else
- TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
-%endif
- paddw m0, [pw_32]
-
-%if ARCH_X86_64 == 0
- mova [%2 ], m0
- mova [%2+16], m4
- IDCT8_1D [%2], [%2+ 16]
- mova [%2 ], m6
- mova [%2+16], m7
-%else
- SWAP 0, 8
- SWAP 4, 9
- IDCT8_1D m8, m9
- SWAP 6, 8
- SWAP 7, 9
-%endif
-
- pxor m7, m7
- lea %4, [%3*3]
- STORE_DIFF m0, m6, m7, [%1 ]
- STORE_DIFF m1, m6, m7, [%1+%3 ]
- STORE_DIFF m2, m6, m7, [%1+%3*2]
- STORE_DIFF m3, m6, m7, [%1+%4 ]
-%if ARCH_X86_64 == 0
- mova m0, [%2 ]
- mova m1, [%2+16]
-%else
- SWAP 0, 8
- SWAP 1, 9
-%endif
- mova [%2+ 0], m7
- mova [%2+ 16], m7
- mova [%2+ 32], m7
- mova [%2+ 48], m7
- mova [%2+ 64], m7
- mova [%2+ 80], m7
- mova [%2+ 96], m7
- mova [%2+112], m7
- lea %1, [%1+%3*4]
- STORE_DIFF m4, m6, m7, [%1 ]
- STORE_DIFF m5, m6, m7, [%1+%3 ]
- STORE_DIFF m0, m6, m7, [%1+%3*2]
- STORE_DIFF m1, m6, m7, [%1+%4 ]
-%endmacro
-
-INIT_XMM sse2
-; ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
-cglobal h264_idct8_add_8, 3, 4, 10
- IDCT8_ADD_SSE r0, r1, r2, r3
- RET
-
-%macro DC_ADD_MMXEXT_INIT 2
- add %1, 32
- sar %1, 6
- movd m0, %1d
- lea %1, [%2*3]
- pshufw m0, m0, 0
- pxor m1, m1
- psubw m1, m0
- packuswb m0, m0
- packuswb m1, m1
-%endmacro
-
-%macro DC_ADD_MMXEXT_OP 4
- %1 m2, [%2 ]
- %1 m3, [%2+%3 ]
- %1 m4, [%2+%3*2]
- %1 m5, [%2+%4 ]
- paddusb m2, m0
- paddusb m3, m0
- paddusb m4, m0
- paddusb m5, m0
- psubusb m2, m1
- psubusb m3, m1
- psubusb m4, m1
- psubusb m5, m1
- %1 [%2 ], m2
- %1 [%2+%3 ], m3
- %1 [%2+%3*2], m4
- %1 [%2+%4 ], m5
-%endmacro
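DC_ADD_MMXEXT_INIT rounds the lone coefficient and DC_ADD_MMXEXT_OP splats it over the block with saturating adds; as a scalar sketch (editor's illustration, idct_dc_add_sketch is not a name from this file):

#include <stdint.h>

/* Illustrative scalar sketch of the DC-only IDCT-and-add path
 * (size is 4 for idct_dc_add, 8 for idct8_dc_add). */
static void idct_dc_add_sketch(uint8_t *dst, int dc_coeff, int stride, int size)
{
    int i, j, dc = (dc_coeff + 32) >> 6;
    for (i = 0; i < size; i++)
        for (j = 0; j < size; j++) {
            int v = dst[i*stride + j] + dc;
            dst[i*stride + j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
}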
-
-INIT_MMX mmxext
-; ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
-%if ARCH_X86_64
-cglobal h264_idct_dc_add_8, 3, 4, 0
- movsx r3, word [r1]
- mov dword [r1], 0
- DC_ADD_MMXEXT_INIT r3, r2
- DC_ADD_MMXEXT_OP movh, r0, r2, r3
- RET
-
-; ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
-cglobal h264_idct8_dc_add_8, 3, 4, 0
- movsx r3, word [r1]
- mov dword [r1], 0
- DC_ADD_MMXEXT_INIT r3, r2
- DC_ADD_MMXEXT_OP mova, r0, r2, r3
- lea r0, [r0+r2*4]
- DC_ADD_MMXEXT_OP mova, r0, r2, r3
- RET
-%else
-; ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
-cglobal h264_idct_dc_add_8, 2, 3, 0
- movsx r2, word [r1]
- mov dword [r1], 0
- mov r1, r2m
- DC_ADD_MMXEXT_INIT r2, r1
- DC_ADD_MMXEXT_OP movh, r0, r1, r2
- RET
-
-; ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
-cglobal h264_idct8_dc_add_8, 2, 3, 0
- movsx r2, word [r1]
- mov dword [r1], 0
- mov r1, r2m
- DC_ADD_MMXEXT_INIT r2, r1
- DC_ADD_MMXEXT_OP mova, r0, r1, r2
- lea r0, [r0+r1*4]
- DC_ADD_MMXEXT_OP mova, r0, r1, r2
- RET
-%endif
-
-INIT_MMX mmx
-; ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
-; int16_t *block, int stride,
-; const uint8_t nnzc[6 * 8])
-cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
- xor r5, r5
-%ifdef PIC
- lea picregq, [scan8_mem]
-%endif
-.nextblock:
- movzx r6, byte [scan8+r5]
- movzx r6, byte [r4+r6]
- test r6, r6
- jz .skipblock
- mov r6d, dword [r1+r5*4]
- lea r6, [r0+r6]
- IDCT4_ADD r6, r2, r3
-.skipblock:
- inc r5
- add r2, 32
- cmp r5, 16
- jl .nextblock
- REP_RET
-
-; ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
-; int16_t *block, int stride,
-; const uint8_t nnzc[6 * 8])
-cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
- %assign pad 128+4-(stack_offset&7)
- SUB rsp, pad
-
- xor r5, r5
-%ifdef PIC
- lea picregq, [scan8_mem]
-%endif
-.nextblock:
- movzx r6, byte [scan8+r5]
- movzx r6, byte [r4+r6]
- test r6, r6
- jz .skipblock
- mov r6d, dword [r1+r5*4]
- add r6, r0
- add word [r2], 32
- IDCT8_ADD_MMX_START r2 , rsp
- IDCT8_ADD_MMX_START r2+8, rsp+64
- IDCT8_ADD_MMX_END r6 , rsp, r3, r2
- mov r6d, dword [r1+r5*4]
- lea r6, [r0+r6+4]
- IDCT8_ADD_MMX_END r6 , rsp+8, r3
-.skipblock:
- add r5, 4
- add r2, 128
- cmp r5, 16
- jl .nextblock
- ADD rsp, pad
- RET
-
-INIT_MMX mmxext
-; ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset,
-; int16_t *block, int stride,
-; const uint8_t nnzc[6 * 8])
-cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
- xor r5, r5
-%ifdef PIC
- lea picregq, [scan8_mem]
-%endif
-.nextblock:
- movzx r6, byte [scan8+r5]
- movzx r6, byte [r4+r6]
- test r6, r6
- jz .skipblock
- cmp r6, 1
- jnz .no_dc
- movsx r6, word [r2]
- test r6, r6
- jz .no_dc
- mov word [r2], 0
- DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64 == 0
-%define dst2q r1
-%define dst2d r1d
-%endif
- mov dst2d, dword [r1+r5*4]
- lea dst2q, [r0+dst2q]
- DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
-%if ARCH_X86_64 == 0
- mov r1, r1m
-%endif
- inc r5
- add r2, 32
- cmp r5, 16
- jl .nextblock
- REP_RET
-.no_dc:
- mov r6d, dword [r1+r5*4]
- add r6, r0
- IDCT4_ADD r6, r2, r3
-.skipblock:
- inc r5
- add r2, 32
- cmp r5, 16
- jl .nextblock
- REP_RET
-
-INIT_MMX mmx
-; ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
-; int16_t *block, int stride,
-; const uint8_t nnzc[6 * 8])
-cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
- xor r5, r5
-%ifdef PIC
- lea picregq, [scan8_mem]
-%endif
-.nextblock:
- movzx r6, byte [scan8+r5]
- movzx r6, byte [r4+r6]
- or r6w, word [r2]
- test r6, r6
- jz .skipblock
- mov r6d, dword [r1+r5*4]
- add r6, r0
- IDCT4_ADD r6, r2, r3
-.skipblock:
- inc r5
- add r2, 32
- cmp r5, 16
- jl .nextblock
- REP_RET
-
-INIT_MMX mmxext
-; ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset,
-; int16_t *block, int stride,
-; const uint8_t nnzc[6 * 8])
-cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
- xor r5, r5
-%ifdef PIC
- lea picregq, [scan8_mem]
-%endif
-.nextblock:
- movzx r6, byte [scan8+r5]
- movzx r6, byte [r4+r6]
- test r6, r6
- jz .try_dc
- mov r6d, dword [r1+r5*4]
- lea r6, [r0+r6]
- IDCT4_ADD r6, r2, r3
- inc r5
- add r2, 32
- cmp r5, 16
- jl .nextblock
- REP_RET
-.try_dc:
- movsx r6, word [r2]
- test r6, r6
- jz .skipblock
- mov word [r2], 0
- DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64 == 0
-%define dst2q r1
-%define dst2d r1d
-%endif
- mov dst2d, dword [r1+r5*4]
- add dst2q, r0
- DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
-%if ARCH_X86_64 == 0
- mov r1, r1m
-%endif
-.skipblock:
- inc r5
- add r2, 32
- cmp r5, 16
- jl .nextblock
- REP_RET
-
-; ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset,
-; int16_t *block, int stride,
-; const uint8_t nnzc[6 * 8])
-cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
- %assign pad 128+4-(stack_offset&7)
- SUB rsp, pad
-
- xor r5, r5
-%ifdef PIC
- lea picregq, [scan8_mem]
-%endif
-.nextblock:
- movzx r6, byte [scan8+r5]
- movzx r6, byte [r4+r6]
- test r6, r6
- jz .skipblock
- cmp r6, 1
- jnz .no_dc
- movsx r6, word [r2]
- test r6, r6
- jz .no_dc
- mov word [r2], 0
- DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64 == 0
-%define dst2q r1
-%define dst2d r1d
-%endif
- mov dst2d, dword [r1+r5*4]
- lea dst2q, [r0+dst2q]
- DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
- lea dst2q, [dst2q+r3*4]
- DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
-%if ARCH_X86_64 == 0
- mov r1, r1m
-%endif
- add r5, 4
- add r2, 128
- cmp r5, 16
- jl .nextblock
-
- ADD rsp, pad
- RET
-.no_dc:
- mov r6d, dword [r1+r5*4]
- add r6, r0
- add word [r2], 32
- IDCT8_ADD_MMX_START r2 , rsp
- IDCT8_ADD_MMX_START r2+8, rsp+64
- IDCT8_ADD_MMX_END r6 , rsp, r3, r2
- mov r6d, dword [r1+r5*4]
- lea r6, [r0+r6+4]
- IDCT8_ADD_MMX_END r6 , rsp+8, r3
-.skipblock:
- add r5, 4
- add r2, 128
- cmp r5, 16
- jl .nextblock
-
- ADD rsp, pad
- RET
-
-INIT_XMM sse2
-; ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
-; int16_t *block, int stride,
-; const uint8_t nnzc[6 * 8])
-cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
- xor r5, r5
-%ifdef PIC
- lea picregq, [scan8_mem]
-%endif
-.nextblock:
- movzx r6, byte [scan8+r5]
- movzx r6, byte [r4+r6]
- test r6, r6
- jz .skipblock
- cmp r6, 1
- jnz .no_dc
- movsx r6, word [r2]
- test r6, r6
- jz .no_dc
-INIT_MMX cpuname
- mov word [r2], 0
- DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64 == 0
-%define dst2q r1
-%define dst2d r1d
-%endif
- mov dst2d, dword [r1+r5*4]
- add dst2q, r0
- DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
- lea dst2q, [dst2q+r3*4]
- DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
-%if ARCH_X86_64 == 0
- mov r1, r1m
-%endif
- add r5, 4
- add r2, 128
- cmp r5, 16
- jl .nextblock
- REP_RET
-.no_dc:
-INIT_XMM cpuname
- mov dst2d, dword [r1+r5*4]
- add dst2q, r0
- IDCT8_ADD_SSE dst2q, r2, r3, r6
-%if ARCH_X86_64 == 0
- mov r1, r1m
-%endif
-.skipblock:
- add r5, 4
- add r2, 128
- cmp r5, 16
- jl .nextblock
- REP_RET
-
-INIT_MMX mmx
-h264_idct_add8_mmx_plane:
-.nextblock:
- movzx r6, byte [scan8+r5]
- movzx r6, byte [r4+r6]
- or r6w, word [r2]
- test r6, r6
- jz .skipblock
-%if ARCH_X86_64
- mov r0d, dword [r1+r5*4]
- add r0, [dst2q]
-%else
- mov r0, r1m ; XXX r1m here is actually r0m of the calling func
- mov r0, [r0]
- add r0, dword [r1+r5*4]
-%endif
- IDCT4_ADD r0, r2, r3
-.skipblock:
- inc r5
- add r2, 32
- test r5, 3
- jnz .nextblock
- rep ret
-
-; ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
-; int16_t *block, int stride, const uint8_t nnzc[6 * 8])
-cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
- mov r5, 16
- add r2, 512
-%ifdef PIC
- lea picregq, [scan8_mem]
-%endif
-%if ARCH_X86_64
- mov dst2q, r0
-%endif
- call h264_idct_add8_mmx_plane
- mov r5, 32
- add r2, 384
-%if ARCH_X86_64
- add dst2q, gprsize
-%else
- add r0mp, gprsize
-%endif
- call h264_idct_add8_mmx_plane
- RET
-
-h264_idct_add8_mmxext_plane:
-.nextblock:
- movzx r6, byte [scan8+r5]
- movzx r6, byte [r4+r6]
- test r6, r6
- jz .try_dc
-%if ARCH_X86_64
- mov r0d, dword [r1+r5*4]
- add r0, [dst2q]
-%else
- mov r0, r1m ; XXX r1m here is actually r0m of the calling func
- mov r0, [r0]
- add r0, dword [r1+r5*4]
-%endif
- IDCT4_ADD r0, r2, r3
- inc r5
- add r2, 32
- test r5, 3
- jnz .nextblock
- rep ret
-.try_dc:
- movsx r6, word [r2]
- test r6, r6
- jz .skipblock
- mov word [r2], 0
- DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64
- mov r0d, dword [r1+r5*4]
- add r0, [dst2q]
-%else
- mov r0, r1m ; XXX r1m here is actually r0m of the calling func
- mov r0, [r0]
- add r0, dword [r1+r5*4]
-%endif
- DC_ADD_MMXEXT_OP movh, r0, r3, r6
-.skipblock:
- inc r5
- add r2, 32
- test r5, 3
- jnz .nextblock
- rep ret
-
-INIT_MMX mmxext
-; ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset,
-; int16_t *block, int stride,
-; const uint8_t nnzc[6 * 8])
-cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
- mov r5, 16
- add r2, 512
-%if ARCH_X86_64
- mov dst2q, r0
-%endif
-%ifdef PIC
- lea picregq, [scan8_mem]
-%endif
- call h264_idct_add8_mmxext_plane
- mov r5, 32
- add r2, 384
-%if ARCH_X86_64
- add dst2q, gprsize
-%else
- add r0mp, gprsize
-%endif
- call h264_idct_add8_mmxext_plane
- RET
-
-; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
-h264_idct_dc_add8_mmxext:
- movd m0, [r2 ] ; 0 0 X D
- mov word [r2+ 0], 0
- punpcklwd m0, [r2+32] ; x X d D
- mov word [r2+32], 0
- paddsw m0, [pw_32]
- psraw m0, 6
- punpcklwd m0, m0 ; d d D D
- pxor m1, m1 ; 0 0 0 0
- psubw m1, m0 ; -d-d-D-D
- packuswb m0, m1 ; -d-d-D-D d d D D
- pshufw m1, m0, 0xFA ; -d-d-d-d-D-D-D-D
- punpcklwd m0, m0 ; d d d d D D D D
- lea r6, [r3*3]
- DC_ADD_MMXEXT_OP movq, r0, r3, r6
- ret
-
-ALIGN 16
-INIT_XMM sse2
-; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
-h264_add8x4_idct_sse2:
- movq m0, [r2+ 0]
- movq m1, [r2+ 8]
- movq m2, [r2+16]
- movq m3, [r2+24]
- movhps m0, [r2+32]
- movhps m1, [r2+40]
- movhps m2, [r2+48]
- movhps m3, [r2+56]
- IDCT4_1D w,0,1,2,3,4,5
- TRANSPOSE2x4x4W 0,1,2,3,4
- paddw m0, [pw_32]
- IDCT4_1D w,0,1,2,3,4,5
- pxor m7, m7
- mova [r2+ 0], m7
- mova [r2+16], m7
- mova [r2+32], m7
- mova [r2+48], m7
- STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
- lea r0, [r0+r3*2]
- STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
- ret
-
-%macro add16_sse2_cycle 2
- movzx r0, word [r4+%2]
- test r0, r0
- jz .cycle%1end
- mov r0d, dword [r1+%1*8]
-%if ARCH_X86_64
- add r0, r5
-%else
- add r0, r0m
-%endif
- call h264_add8x4_idct_sse2
-.cycle%1end:
-%if %1 < 7
- add r2, 64
-%endif
-%endmacro
-
-; ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
-; int16_t *block, int stride,
-; const uint8_t nnzc[6 * 8])
-cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
-%if ARCH_X86_64
- mov r5, r0
-%endif
- ; unrolling of the loop leads to an average performance gain of
- ; 20-25%
- add16_sse2_cycle 0, 0xc
- add16_sse2_cycle 1, 0x14
- add16_sse2_cycle 2, 0xe
- add16_sse2_cycle 3, 0x16
- add16_sse2_cycle 4, 0x1c
- add16_sse2_cycle 5, 0x24
- add16_sse2_cycle 6, 0x1e
- add16_sse2_cycle 7, 0x26
- RET
-
-%macro add16intra_sse2_cycle 2
- movzx r0, word [r4+%2]
- test r0, r0
- jz .try%1dc
- mov r0d, dword [r1+%1*8]
-%if ARCH_X86_64
- add r0, r7
-%else
- add r0, r0m
-%endif
- call h264_add8x4_idct_sse2
- jmp .cycle%1end
-.try%1dc:
- movsx r0, word [r2 ]
- or r0w, word [r2+32]
- jz .cycle%1end
- mov r0d, dword [r1+%1*8]
-%if ARCH_X86_64
- add r0, r7
-%else
- add r0, r0m
-%endif
- call h264_idct_dc_add8_mmxext
-.cycle%1end:
-%if %1 < 7
- add r2, 64
-%endif
-%endmacro
-
-; ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
-; int16_t *block, int stride,
-; const uint8_t nnzc[6 * 8])
-cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
-%if ARCH_X86_64
- mov r7, r0
-%endif
- add16intra_sse2_cycle 0, 0xc
- add16intra_sse2_cycle 1, 0x14
- add16intra_sse2_cycle 2, 0xe
- add16intra_sse2_cycle 3, 0x16
- add16intra_sse2_cycle 4, 0x1c
- add16intra_sse2_cycle 5, 0x24
- add16intra_sse2_cycle 6, 0x1e
- add16intra_sse2_cycle 7, 0x26
- RET
-
-%macro add8_sse2_cycle 2
- movzx r0, word [r4+%2]
- test r0, r0
- jz .try%1dc
-%if ARCH_X86_64
- mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
- add r0, [r7]
-%else
- mov r0, r0m
- mov r0, [r0]
- add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
-%endif
- call h264_add8x4_idct_sse2
- jmp .cycle%1end
-.try%1dc:
- movsx r0, word [r2 ]
- or r0w, word [r2+32]
- jz .cycle%1end
-%if ARCH_X86_64
- mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
- add r0, [r7]
-%else
- mov r0, r0m
- mov r0, [r0]
- add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
-%endif
- call h264_idct_dc_add8_mmxext
-.cycle%1end:
-%if %1 == 1
- add r2, 384+64
-%elif %1 < 3
- add r2, 64
-%endif
-%endmacro
-
-; ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
-; int16_t *block, int stride,
-; const uint8_t nnzc[6 * 8])
-cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
- add r2, 512
-%if ARCH_X86_64
- mov r7, r0
-%endif
- add8_sse2_cycle 0, 0x34
- add8_sse2_cycle 1, 0x3c
-%if ARCH_X86_64
- add r7, gprsize
-%else
- add r0mp, gprsize
-%endif
- add8_sse2_cycle 2, 0x5c
- add8_sse2_cycle 3, 0x64
- RET
-
-;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)
-
-%macro WALSH4_1D 5
- SUMSUB_BADC w, %4, %3, %2, %1, %5
- SUMSUB_BADC w, %4, %2, %3, %1, %5
- SWAP %1, %4, %3
-%endmacro
-
-%macro DEQUANT_MMX 3
- mova m7, [pw_1]
- mova m4, %1
- punpcklwd %1, m7
- punpckhwd m4, m7
- mova m5, %2
- punpcklwd %2, m7
- punpckhwd m5, m7
- movd m7, t3d
- punpckldq m7, m7
- pmaddwd %1, m7
- pmaddwd %2, m7
- pmaddwd m4, m7
- pmaddwd m5, m7
- psrad %1, %3
- psrad %2, %3
- psrad m4, %3
- psrad m5, %3
- packssdw %1, m4
- packssdw %2, m5
-%endmacro
-
-%macro STORE_WORDS 5-9
-%if cpuflag(sse)
- movd t0d, %1
- psrldq %1, 4
- movd t1d, %1
- psrldq %1, 4
- mov [t2+%2*32], t0w
- mov [t2+%4*32], t1w
- shr t0d, 16
- shr t1d, 16
- mov [t2+%3*32], t0w
- mov [t2+%5*32], t1w
- movd t0d, %1
- psrldq %1, 4
- movd t1d, %1
- mov [t2+%6*32], t0w
- mov [t2+%8*32], t1w
- shr t0d, 16
- shr t1d, 16
- mov [t2+%7*32], t0w
- mov [t2+%9*32], t1w
-%else
- movd t0d, %1
- psrlq %1, 32
- movd t1d, %1
- mov [t2+%2*32], t0w
- mov [t2+%4*32], t1w
- shr t0d, 16
- shr t1d, 16
- mov [t2+%3*32], t0w
- mov [t2+%5*32], t1w
-%endif
-%endmacro
-
-%macro DEQUANT_STORE 1
-%if cpuflag(sse2)
- movd xmm4, t3d
- movq xmm5, [pw_1]
- pshufd xmm4, xmm4, 0
- movq2dq xmm0, m0
- movq2dq xmm1, m1
- movq2dq xmm2, m2
- movq2dq xmm3, m3
- punpcklwd xmm0, xmm5
- punpcklwd xmm1, xmm5
- punpcklwd xmm2, xmm5
- punpcklwd xmm3, xmm5
- pmaddwd xmm0, xmm4
- pmaddwd xmm1, xmm4
- pmaddwd xmm2, xmm4
- pmaddwd xmm3, xmm4
- psrad xmm0, %1
- psrad xmm1, %1
- psrad xmm2, %1
- psrad xmm3, %1
- packssdw xmm0, xmm1
- packssdw xmm2, xmm3
- STORE_WORDS xmm0, 0, 1, 4, 5, 2, 3, 6, 7
- STORE_WORDS xmm2, 8, 9, 12, 13, 10, 11, 14, 15
-%else
- DEQUANT_MMX m0, m1, %1
- STORE_WORDS m0, 0, 1, 4, 5
- STORE_WORDS m1, 2, 3, 6, 7
-
- DEQUANT_MMX m2, m3, %1
- STORE_WORDS m2, 8, 9, 12, 13
- STORE_WORDS m3, 10, 11, 14, 15
-%endif
-%endmacro
-
-%macro IDCT_DC_DEQUANT 1
-cglobal h264_luma_dc_dequant_idct, 3, 4, %1
- ; manually spill XMM registers for Win64 because
- ; the code here is initialized with INIT_MMX
- WIN64_SPILL_XMM %1
- movq m3, [r1+24]
- movq m2, [r1+16]
- movq m1, [r1+ 8]
- movq m0, [r1+ 0]
- WALSH4_1D 0,1,2,3,4
- TRANSPOSE4x4W 0,1,2,3,4
- WALSH4_1D 0,1,2,3,4
-
-; shift, tmp, output, qmul
-%if WIN64
- DECLARE_REG_TMP 0,3,1,2
- ; we can't avoid this, because r0 is the shift register (ecx) on win64
- xchg r0, t2
-%elif ARCH_X86_64
- DECLARE_REG_TMP 3,1,0,2
-%else
- DECLARE_REG_TMP 1,3,0,2
-%endif
-
- cmp t3d, 32767
- jg .big_qmul
- add t3d, 128 << 16
- DEQUANT_STORE 8
- RET
-.big_qmul:
- bsr t0d, t3d
- add t3d, 128 << 16
- mov t1d, 7
- cmp t0d, t1d
- cmovg t0d, t1d
- inc t1d
- shr t3d, t0b
- sub t1d, t0d
-%if cpuflag(sse2)
- movd xmm6, t1d
- DEQUANT_STORE xmm6
-%else
- movd m6, t1d
- DEQUANT_STORE m6
-%endif
- RET
-%endmacro
-
-INIT_MMX mmx
-IDCT_DC_DEQUANT 0
-INIT_MMX sse2
-IDCT_DC_DEQUANT 7
diff --git a/ffmpeg/libavcodec/x86/h264_idct_10bit.asm b/ffmpeg/libavcodec/x86/h264_idct_10bit.asm
deleted file mode 100644
index df21288..0000000
--- a/ffmpeg/libavcodec/x86/h264_idct_10bit.asm
+++ /dev/null
@@ -1,589 +0,0 @@
-;*****************************************************************************
-;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
-;*****************************************************************************
-;* Copyright (C) 2005-2011 x264 project
-;*
-;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-pd_32: times 4 dd 32
-
-SECTION .text
-
-;-----------------------------------------------------------------------------
-; void h264_idct_add(pixel *dst, dctcoef *block, int stride)
-;-----------------------------------------------------------------------------
-%macro STORE_DIFFx2 6
- psrad %1, 6
- psrad %2, 6
- packssdw %1, %2
- movq %3, [%5]
- movhps %3, [%5+%6]
- paddsw %1, %3
- CLIPW %1, %4, [pw_pixel_max]
- movq [%5], %1
- movhps [%5+%6], %1
-%endmacro
-
-%macro STORE_DIFF16 5
- psrad %1, 6
- psrad %2, 6
- packssdw %1, %2
- paddsw %1, [%5]
- CLIPW %1, %3, %4
- mova [%5], %1
-%endmacro
-
-;dst, in, stride
-%macro IDCT4_ADD_10 3
- mova m0, [%2+ 0]
- mova m1, [%2+16]
- mova m2, [%2+32]
- mova m3, [%2+48]
- IDCT4_1D d,0,1,2,3,4,5
- TRANSPOSE4x4D 0,1,2,3,4
- paddd m0, [pd_32]
- IDCT4_1D d,0,1,2,3,4,5
- pxor m5, m5
- mova [%2+ 0], m5
- mova [%2+16], m5
- mova [%2+32], m5
- mova [%2+48], m5
- STORE_DIFFx2 m0, m1, m4, m5, %1, %3
- lea %1, [%1+%3*2]
- STORE_DIFFx2 m2, m3, m4, m5, %1, %3
-%endmacro
-
-%macro IDCT_ADD_10 0
-cglobal h264_idct_add_10, 3,3
- IDCT4_ADD_10 r0, r1, r2
- RET
-%endmacro
-
-INIT_XMM sse2
-IDCT_ADD_10
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-IDCT_ADD_10
-%endif
-
-;-----------------------------------------------------------------------------
-; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
-;-----------------------------------------------------------------------------
-;;;;;;; NO FATE SAMPLES TRIGGER THIS
-%macro ADD4x4IDCT 0
-add4x4_idct %+ SUFFIX:
- add r5, r0
- mova m0, [r2+ 0]
- mova m1, [r2+16]
- mova m2, [r2+32]
- mova m3, [r2+48]
- IDCT4_1D d,0,1,2,3,4,5
- TRANSPOSE4x4D 0,1,2,3,4
- paddd m0, [pd_32]
- IDCT4_1D d,0,1,2,3,4,5
- pxor m5, m5
- mova [r2+ 0], m5
- mova [r2+16], m5
- mova [r2+32], m5
- mova [r2+48], m5
- STORE_DIFFx2 m0, m1, m4, m5, r5, r3
- lea r5, [r5+r3*2]
- STORE_DIFFx2 m2, m3, m4, m5, r5, r3
- ret
-%endmacro
-
-INIT_XMM sse2
-ALIGN 16
-ADD4x4IDCT
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-ALIGN 16
-ADD4x4IDCT
-%endif
-
-%macro ADD16_OP 2
- cmp byte [r4+%2], 0
- jz .skipblock%1
- mov r5d, [r1+%1*4]
- call add4x4_idct %+ SUFFIX
-.skipblock%1:
-%if %1<15
- add r2, 64
-%endif
-%endmacro
-
-%macro IDCT_ADD16_10 0
-cglobal h264_idct_add16_10, 5,6
- ADD16_OP 0, 4+1*8
- ADD16_OP 1, 5+1*8
- ADD16_OP 2, 4+2*8
- ADD16_OP 3, 5+2*8
- ADD16_OP 4, 6+1*8
- ADD16_OP 5, 7+1*8
- ADD16_OP 6, 6+2*8
- ADD16_OP 7, 7+2*8
- ADD16_OP 8, 4+3*8
- ADD16_OP 9, 5+3*8
- ADD16_OP 10, 4+4*8
- ADD16_OP 11, 5+4*8
- ADD16_OP 12, 6+3*8
- ADD16_OP 13, 7+3*8
- ADD16_OP 14, 6+4*8
- ADD16_OP 15, 7+4*8
- REP_RET
-%endmacro
-
-INIT_XMM sse2
-IDCT_ADD16_10
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-IDCT_ADD16_10
-%endif
-
-;-----------------------------------------------------------------------------
-; void h264_idct_dc_add(pixel *dst, dctcoef *block, int stride)
-;-----------------------------------------------------------------------------
-%macro IDCT_DC_ADD_OP_10 3
- pxor m5, m5
-%if avx_enabled
- paddw m1, m0, [%1+0 ]
- paddw m2, m0, [%1+%2 ]
- paddw m3, m0, [%1+%2*2]
- paddw m4, m0, [%1+%3 ]
-%else
- mova m1, [%1+0 ]
- mova m2, [%1+%2 ]
- mova m3, [%1+%2*2]
- mova m4, [%1+%3 ]
- paddw m1, m0
- paddw m2, m0
- paddw m3, m0
- paddw m4, m0
-%endif
- CLIPW m1, m5, m6
- CLIPW m2, m5, m6
- CLIPW m3, m5, m6
- CLIPW m4, m5, m6
- mova [%1+0 ], m1
- mova [%1+%2 ], m2
- mova [%1+%2*2], m3
- mova [%1+%3 ], m4
-%endmacro
-
-INIT_MMX mmxext
-cglobal h264_idct_dc_add_10,3,3
- movd m0, [r1]
- mov dword [r1], 0
- paddd m0, [pd_32]
- psrad m0, 6
- lea r1, [r2*3]
- pshufw m0, m0, 0
- mova m6, [pw_pixel_max]
- IDCT_DC_ADD_OP_10 r0, r2, r1
- RET
-
-;-----------------------------------------------------------------------------
-; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride)
-;-----------------------------------------------------------------------------
-%macro IDCT8_DC_ADD 0
-cglobal h264_idct8_dc_add_10,3,4,7
- movd m0, [r1]
- mov dword[r1], 0
- paddd m0, [pd_32]
- psrad m0, 6
- lea r1, [r2*3]
- SPLATW m0, m0, 0
- mova m6, [pw_pixel_max]
- IDCT_DC_ADD_OP_10 r0, r2, r1
- lea r0, [r0+r2*4]
- IDCT_DC_ADD_OP_10 r0, r2, r1
- RET
-%endmacro
-
-INIT_XMM sse2
-IDCT8_DC_ADD
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-IDCT8_DC_ADD
-%endif
-
-;-----------------------------------------------------------------------------
-; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
-;-----------------------------------------------------------------------------
-%macro AC 1
-.ac%1:
- mov r5d, [r1+(%1+0)*4]
- call add4x4_idct %+ SUFFIX
- mov r5d, [r1+(%1+1)*4]
- add r2, 64
- call add4x4_idct %+ SUFFIX
- add r2, 64
- jmp .skipadd%1
-%endmacro
-
-%assign last_block 16
-%macro ADD16_OP_INTRA 2
- cmp word [r4+%2], 0
- jnz .ac%1
- mov r5d, [r2+ 0]
- or r5d, [r2+64]
- jz .skipblock%1
- mov r5d, [r1+(%1+0)*4]
- call idct_dc_add %+ SUFFIX
-.skipblock%1:
-%if %1<last_block-2
- add r2, 128
-%endif
-.skipadd%1:
-%endmacro
-
-%macro IDCT_ADD16INTRA_10 0
-idct_dc_add %+ SUFFIX:
- add r5, r0
- movq m0, [r2+ 0]
- movhps m0, [r2+64]
- mov dword [r2+ 0], 0
- mov dword [r2+64], 0
- paddd m0, [pd_32]
- psrad m0, 6
- pshufhw m0, m0, 0
- pshuflw m0, m0, 0
- lea r6, [r3*3]
- mova m6, [pw_pixel_max]
- IDCT_DC_ADD_OP_10 r5, r3, r6
- ret
-
-cglobal h264_idct_add16intra_10,5,7,8
- ADD16_OP_INTRA 0, 4+1*8
- ADD16_OP_INTRA 2, 4+2*8
- ADD16_OP_INTRA 4, 6+1*8
- ADD16_OP_INTRA 6, 6+2*8
- ADD16_OP_INTRA 8, 4+3*8
- ADD16_OP_INTRA 10, 4+4*8
- ADD16_OP_INTRA 12, 6+3*8
- ADD16_OP_INTRA 14, 6+4*8
- REP_RET
- AC 8
- AC 10
- AC 12
- AC 14
- AC 0
- AC 2
- AC 4
- AC 6
-%endmacro
-
-INIT_XMM sse2
-IDCT_ADD16INTRA_10
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-IDCT_ADD16INTRA_10
-%endif
-
-%assign last_block 36
-;-----------------------------------------------------------------------------
-; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
-;-----------------------------------------------------------------------------
-%macro IDCT_ADD8 0
-cglobal h264_idct_add8_10,5,8,7
-%if ARCH_X86_64
- mov r7, r0
-%endif
- add r2, 1024
- mov r0, [r0]
- ADD16_OP_INTRA 16, 4+ 6*8
- ADD16_OP_INTRA 18, 4+ 7*8
- add r2, 1024-128*2
-%if ARCH_X86_64
- mov r0, [r7+gprsize]
-%else
- mov r0, r0m
- mov r0, [r0+gprsize]
-%endif
- ADD16_OP_INTRA 32, 4+11*8
- ADD16_OP_INTRA 34, 4+12*8
- REP_RET
- AC 16
- AC 18
- AC 32
- AC 34
-
-%endmacro ; IDCT_ADD8
-
-INIT_XMM sse2
-IDCT_ADD8
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-IDCT_ADD8
-%endif
-
-;-----------------------------------------------------------------------------
-; void h264_idct8_add(pixel *dst, dctcoef *block, int stride)
-;-----------------------------------------------------------------------------
-%macro IDCT8_1D 2
- SWAP 0, 1
- psrad m4, m5, 1
- psrad m1, m0, 1
- paddd m4, m5
- paddd m1, m0
- paddd m4, m7
- paddd m1, m5
- psubd m4, m0
- paddd m1, m3
-
- psubd m0, m3
- psubd m5, m3
- paddd m0, m7
- psubd m5, m7
- psrad m3, 1
- psrad m7, 1
- psubd m0, m3
- psubd m5, m7
-
- SWAP 1, 7
- psrad m1, m7, 2
- psrad m3, m4, 2
- paddd m3, m0
- psrad m0, 2
- paddd m1, m5
- psrad m5, 2
- psubd m0, m4
- psubd m7, m5
-
- SWAP 5, 6
- psrad m4, m2, 1
- psrad m6, m5, 1
- psubd m4, m5
- paddd m6, m2
-
- mova m2, %1
- mova m5, %2
- SUMSUB_BA d, 5, 2
- SUMSUB_BA d, 6, 5
- SUMSUB_BA d, 4, 2
- SUMSUB_BA d, 7, 6
- SUMSUB_BA d, 0, 4
- SUMSUB_BA d, 3, 2
- SUMSUB_BA d, 1, 5
- SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
-%endmacro
-
-%macro IDCT8_1D_FULL 1
- mova m7, [%1+112*2]
- mova m6, [%1+ 96*2]
- mova m5, [%1+ 80*2]
- mova m3, [%1+ 48*2]
- mova m2, [%1+ 32*2]
- mova m1, [%1+ 16*2]
- IDCT8_1D [%1], [%1+ 64*2]
-%endmacro
-
-; %1=int16_t *block, %2=int16_t *dstblock
-%macro IDCT8_ADD_SSE_START 2
- IDCT8_1D_FULL %1
-%if ARCH_X86_64
- TRANSPOSE4x4D 0,1,2,3,8
- mova [%2 ], m0
- TRANSPOSE4x4D 4,5,6,7,8
- mova [%2+8*2], m4
-%else
- mova [%1], m7
- TRANSPOSE4x4D 0,1,2,3,7
- mova m7, [%1]
- mova [%2 ], m0
- mova [%2+16*2], m1
- mova [%2+32*2], m2
- mova [%2+48*2], m3
- TRANSPOSE4x4D 4,5,6,7,3
- mova [%2+ 8*2], m4
- mova [%2+24*2], m5
- mova [%2+40*2], m6
- mova [%2+56*2], m7
-%endif
-%endmacro
-
-; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
-%macro IDCT8_ADD_SSE_END 3
- IDCT8_1D_FULL %2
- mova [%2 ], m6
- mova [%2+16*2], m7
-
- pxor m7, m7
- STORE_DIFFx2 m0, m1, m6, m7, %1, %3
- lea %1, [%1+%3*2]
- STORE_DIFFx2 m2, m3, m6, m7, %1, %3
- mova m0, [%2 ]
- mova m1, [%2+16*2]
- lea %1, [%1+%3*2]
- STORE_DIFFx2 m4, m5, m6, m7, %1, %3
- lea %1, [%1+%3*2]
- STORE_DIFFx2 m0, m1, m6, m7, %1, %3
-%endmacro
-
-%macro IDCT8_ADD 0
-cglobal h264_idct8_add_10, 3,4,16
-%if UNIX64 == 0
- %assign pad 16-gprsize-(stack_offset&15)
- sub rsp, pad
- call h264_idct8_add1_10 %+ SUFFIX
- add rsp, pad
- RET
-%endif
-
-ALIGN 16
-; TODO: does not need to use stack
-h264_idct8_add1_10 %+ SUFFIX:
-%assign pad 256+16-gprsize
- sub rsp, pad
- add dword [r1], 32
-
-%if ARCH_X86_64
- IDCT8_ADD_SSE_START r1, rsp
- SWAP 1, 9
- SWAP 2, 10
- SWAP 3, 11
- SWAP 5, 13
- SWAP 6, 14
- SWAP 7, 15
- IDCT8_ADD_SSE_START r1+16, rsp+128
- PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
- IDCT8_1D [rsp], [rsp+128]
- SWAP 0, 8
- SWAP 1, 9
- SWAP 2, 10
- SWAP 3, 11
- SWAP 4, 12
- SWAP 5, 13
- SWAP 6, 14
- SWAP 7, 15
- IDCT8_1D [rsp+16], [rsp+144]
- psrad m8, 6
- psrad m0, 6
- packssdw m8, m0
- paddsw m8, [r0]
- pxor m0, m0
- mova [r1+ 0], m0
- mova [r1+ 16], m0
- mova [r1+ 32], m0
- mova [r1+ 48], m0
- mova [r1+ 64], m0
- mova [r1+ 80], m0
- mova [r1+ 96], m0
- mova [r1+112], m0
- mova [r1+128], m0
- mova [r1+144], m0
- mova [r1+160], m0
- mova [r1+176], m0
- mova [r1+192], m0
- mova [r1+208], m0
- mova [r1+224], m0
- mova [r1+240], m0
- CLIPW m8, m0, [pw_pixel_max]
- mova [r0], m8
- mova m8, [pw_pixel_max]
- STORE_DIFF16 m9, m1, m0, m8, r0+r2
- lea r0, [r0+r2*2]
- STORE_DIFF16 m10, m2, m0, m8, r0
- STORE_DIFF16 m11, m3, m0, m8, r0+r2
- lea r0, [r0+r2*2]
- STORE_DIFF16 m12, m4, m0, m8, r0
- STORE_DIFF16 m13, m5, m0, m8, r0+r2
- lea r0, [r0+r2*2]
- STORE_DIFF16 m14, m6, m0, m8, r0
- STORE_DIFF16 m15, m7, m0, m8, r0+r2
-%else
- IDCT8_ADD_SSE_START r1, rsp
- IDCT8_ADD_SSE_START r1+16, rsp+128
- lea r3, [r0+8]
- IDCT8_ADD_SSE_END r0, rsp, r2
- IDCT8_ADD_SSE_END r3, rsp+16, r2
- mova [r1+ 0], m7
- mova [r1+ 16], m7
- mova [r1+ 32], m7
- mova [r1+ 48], m7
- mova [r1+ 64], m7
- mova [r1+ 80], m7
- mova [r1+ 96], m7
- mova [r1+112], m7
- mova [r1+128], m7
- mova [r1+144], m7
- mova [r1+160], m7
- mova [r1+176], m7
- mova [r1+192], m7
- mova [r1+208], m7
- mova [r1+224], m7
- mova [r1+240], m7
-%endif ; ARCH_X86_64
-
- add rsp, pad
- ret
-%endmacro
-
-INIT_XMM sse2
-IDCT8_ADD
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-IDCT8_ADD
-%endif
-
-;-----------------------------------------------------------------------------
-; h264_idct8_add4(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
-;-----------------------------------------------------------------------------
-;;;;;;; NO FATE SAMPLES TRIGGER THIS
-%macro IDCT8_ADD4_OP 2
- cmp byte [r4+%2], 0
- jz .skipblock%1
- mov r0d, [r6+%1*4]
- add r0, r5
- call h264_idct8_add1_10 %+ SUFFIX
-.skipblock%1:
-%if %1<12
- add r1, 256
-%endif
-%endmacro
-
-%macro IDCT8_ADD4 0
-cglobal h264_idct8_add4_10, 0,7,16
- %assign pad 16-gprsize-(stack_offset&15)
- SUB rsp, pad
- mov r5, r0mp
- mov r6, r1mp
- mov r1, r2mp
- mov r2d, r3m
- movifnidn r4, r4mp
- IDCT8_ADD4_OP 0, 4+1*8
- IDCT8_ADD4_OP 4, 6+1*8
- IDCT8_ADD4_OP 8, 4+3*8
- IDCT8_ADD4_OP 12, 6+3*8
- ADD rsp, pad
- RET
-%endmacro ; IDCT8_ADD4
-
-INIT_XMM sse2
-IDCT8_ADD4
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-IDCT8_ADD4
-%endif
diff --git a/ffmpeg/libavcodec/x86/h264_intrapred.asm b/ffmpeg/libavcodec/x86/h264_intrapred.asm
deleted file mode 100644
index 3064ec5..0000000
--- a/ffmpeg/libavcodec/x86/h264_intrapred.asm
+++ /dev/null
@@ -1,2699 +0,0 @@
-;******************************************************************************
-;* H.264 intra prediction asm optimizations
-;* Copyright (c) 2010 Jason Garrett-Glaser
-;* Copyright (c) 2010 Holger Lubitz
-;* Copyright (c) 2010 Loren Merritt
-;* Copyright (c) 2010 Ronald S. Bultje
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-tm_shuf: times 8 db 0x03, 0x80
-pw_ff00: times 8 dw 0xff00
-plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
- db 1, 2, 3, 4, 5, 6, 7, 8
-plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
- db 1, 2, 3, 4, 0, 0, 0, 0
-pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
-pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
-pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
-pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
-
-SECTION .text
-
-cextern pb_1
-cextern pb_3
-cextern pw_4
-cextern pw_5
-cextern pw_8
-cextern pw_16
-cextern pw_17
-cextern pw_32
-
-;-----------------------------------------------------------------------------
-; void pred16x16_vertical_8(uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmx
-cglobal pred16x16_vertical_8, 2,3
- sub r0, r1
- mov r2, 8
- movq mm0, [r0+0]
- movq mm1, [r0+8]
-.loop:
- movq [r0+r1*1+0], mm0
- movq [r0+r1*1+8], mm1
- movq [r0+r1*2+0], mm0
- movq [r0+r1*2+8], mm1
- lea r0, [r0+r1*2]
- dec r2
- jg .loop
- REP_RET
-
-INIT_XMM sse
-cglobal pred16x16_vertical_8, 2,3
- sub r0, r1
- mov r2, 4
- movaps xmm0, [r0]
-.loop:
- movaps [r0+r1*1], xmm0
- movaps [r0+r1*2], xmm0
- lea r0, [r0+r1*2]
- movaps [r0+r1*1], xmm0
- movaps [r0+r1*2], xmm0
- lea r0, [r0+r1*2]
- dec r2
- jg .loop
- REP_RET
-
-;-----------------------------------------------------------------------------
-; void pred16x16_horizontal_8(uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-
-%macro PRED16x16_H 0
-cglobal pred16x16_horizontal_8, 2,3
- mov r2, 8
-%if cpuflag(ssse3)
- mova m2, [pb_3]
-%endif
-.loop:
- movd m0, [r0+r1*0-4]
- movd m1, [r0+r1*1-4]
-
-%if cpuflag(ssse3)
- pshufb m0, m2
- pshufb m1, m2
-%else
- punpcklbw m0, m0
- punpcklbw m1, m1
- SPLATW m0, m0, 3
- SPLATW m1, m1, 3
- mova [r0+r1*0+8], m0
- mova [r0+r1*1+8], m1
-%endif
-
- mova [r0+r1*0], m0
- mova [r0+r1*1], m1
- lea r0, [r0+r1*2]
- dec r2
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmx
-PRED16x16_H
-INIT_MMX mmxext
-PRED16x16_H
-INIT_XMM ssse3
-PRED16x16_H
-
-;-----------------------------------------------------------------------------
-; void pred16x16_dc_8(uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-
-%macro PRED16x16_DC 0
-cglobal pred16x16_dc_8, 2,7
- mov r4, r0
- sub r0, r1
- pxor mm0, mm0
- pxor mm1, mm1
- psadbw mm0, [r0+0]
- psadbw mm1, [r0+8]
- dec r0
- movzx r5d, byte [r0+r1*1]
- paddw mm0, mm1
- movd r6d, mm0
- lea r0, [r0+r1*2]
-%rep 7
- movzx r2d, byte [r0+r1*0]
- movzx r3d, byte [r0+r1*1]
- add r5d, r2d
- add r6d, r3d
- lea r0, [r0+r1*2]
-%endrep
- movzx r2d, byte [r0+r1*0]
- add r5d, r6d
- lea r2d, [r2+r5+16]
- shr r2d, 5
-%if cpuflag(ssse3)
- pxor m1, m1
-%endif
- SPLATB_REG m0, r2, m1
-
-%if mmsize==8
- mov r3d, 8
-.loop:
- mova [r4+r1*0+0], m0
- mova [r4+r1*0+8], m0
- mova [r4+r1*1+0], m0
- mova [r4+r1*1+8], m0
-%else
- mov r3d, 4
-.loop:
- mova [r4+r1*0], m0
- mova [r4+r1*1], m0
- lea r4, [r4+r1*2]
- mova [r4+r1*0], m0
- mova [r4+r1*1], m0
-%endif
- lea r4, [r4+r1*2]
- dec r3d
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-PRED16x16_DC
-INIT_XMM sse2
-PRED16x16_DC
-INIT_XMM ssse3
-PRED16x16_DC
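The DC predictor above reduces, in scalar terms, to one average over the 32 neighbouring pixels: psadbw sums the 16 pixels of the row above, the movzx loop accumulates the 16 pixels of the column to the left, and the result is rounded and shifted by 5 before being broadcast over the block. A hedged C model (the function name and the assumption that both neighbours are available are mine):

#include <stdint.h>

/* Scalar model of pred16x16_dc_8 with top and left neighbours present:
 * dc = (sum of 16 top pixels + sum of 16 left pixels + 16) >> 5. */
static void pred16x16_dc_c(uint8_t *src, int stride)
{
    int sum = 16;                        /* rounding term */
    for (int i = 0; i < 16; i++)
        sum += src[i - stride] + src[i * stride - 1];
    int dc = sum >> 5;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            src[y * stride + x] = dc;
}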
-
-;-----------------------------------------------------------------------------
-; void pred16x16_tm_vp8_8(uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-
-%macro PRED16x16_TM 0
-cglobal pred16x16_tm_vp8_8, 2,5
- sub r0, r1
- pxor mm7, mm7
- movq mm0, [r0+0]
- movq mm2, [r0+8]
- movq mm1, mm0
- movq mm3, mm2
- punpcklbw mm0, mm7
- punpckhbw mm1, mm7
- punpcklbw mm2, mm7
- punpckhbw mm3, mm7
- movzx r3d, byte [r0-1]
- mov r4d, 16
-.loop:
- movzx r2d, byte [r0+r1-1]
- sub r2d, r3d
- movd mm4, r2d
- SPLATW mm4, mm4, 0
- movq mm5, mm4
- movq mm6, mm4
- movq mm7, mm4
- paddw mm4, mm0
- paddw mm5, mm1
- paddw mm6, mm2
- paddw mm7, mm3
- packuswb mm4, mm5
- packuswb mm6, mm7
- movq [r0+r1+0], mm4
- movq [r0+r1+8], mm6
- add r0, r1
- dec r4d
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmx
-PRED16x16_TM
-INIT_MMX mmxext
-PRED16x16_TM
-
-INIT_XMM sse2
-cglobal pred16x16_tm_vp8_8, 2,6,6
- sub r0, r1
- pxor xmm2, xmm2
- movdqa xmm0, [r0]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm2
- punpckhbw xmm1, xmm2
- movzx r4d, byte [r0-1]
- mov r5d, 8
-.loop:
- movzx r2d, byte [r0+r1*1-1]
- movzx r3d, byte [r0+r1*2-1]
- sub r2d, r4d
- sub r3d, r4d
- movd xmm2, r2d
- movd xmm4, r3d
- pshuflw xmm2, xmm2, 0
- pshuflw xmm4, xmm4, 0
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm4, xmm4
- movdqa xmm3, xmm2
- movdqa xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm3, xmm1
- paddw xmm4, xmm0
- paddw xmm5, xmm1
- packuswb xmm2, xmm3
- packuswb xmm4, xmm5
- movdqa [r0+r1*1], xmm2
- movdqa [r0+r1*2], xmm4
- lea r0, [r0+r1*2]
- dec r5d
- jg .loop
- REP_RET
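All of the tm_vp8 (TrueMotion) predictors in this file, including the 8x8 and 4x4 variants further down, apply the same per-pixel rule: predict top[x] + left[y] - topleft and clip to 8 bits; the MMX/SSE2/SSSE3 versions only differ in how many pixels they process per iteration. A minimal scalar sketch, with the function and parameter names chosen here for illustration:

#include <stdint.h>

static inline uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : (v > 255 ? 255 : v);
}

/* Scalar model of the tm_vp8 predictors; size is 16, 8 or 4 and the row
 * above plus the column to the left of src must be valid. */
static void pred_tm_c(uint8_t *src, int stride, int size)
{
    const uint8_t *top = src - stride;
    int tl = top[-1];                          /* top-left neighbour */
    for (int y = 0; y < size; y++)
        for (int x = 0; x < size; x++)
            src[y * stride + x] = clip_u8(top[x] + src[y * stride - 1] - tl);
}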
-
-;-----------------------------------------------------------------------------
-; void pred16x16_plane_*_8(uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-
-%macro H264_PRED16x16_PLANE 1
-cglobal pred16x16_plane_%1_8, 2,9,7
- mov r2, r1 ; +stride
- neg r1 ; -stride
-
- movh m0, [r0+r1 -1]
-%if mmsize == 8
- pxor m4, m4
- movh m1, [r0+r1 +3 ]
- movh m2, [r0+r1 +8 ]
- movh m3, [r0+r1 +12]
- punpcklbw m0, m4
- punpcklbw m1, m4
- punpcklbw m2, m4
- punpcklbw m3, m4
- pmullw m0, [pw_m8tom1 ]
- pmullw m1, [pw_m8tom1+8]
- pmullw m2, [pw_1to8 ]
- pmullw m3, [pw_1to8 +8]
- paddw m0, m2
- paddw m1, m3
-%else ; mmsize == 16
-%if cpuflag(ssse3)
- movhps m0, [r0+r1 +8]
- pmaddubsw m0, [plane_shuf] ; H coefficients
-%else ; sse2
- pxor m2, m2
- movh m1, [r0+r1 +8]
- punpcklbw m0, m2
- punpcklbw m1, m2
- pmullw m0, [pw_m8tom1]
- pmullw m1, [pw_1to8]
- paddw m0, m1
-%endif
- movhlps m1, m0
-%endif
- paddw m0, m1
-%if cpuflag(mmxext)
- PSHUFLW m1, m0, 0xE
-%elif cpuflag(mmx)
- mova m1, m0
- psrlq m1, 32
-%endif
- paddw m0, m1
-%if cpuflag(mmxext)
- PSHUFLW m1, m0, 0x1
-%elif cpuflag(mmx)
- mova m1, m0
- psrlq m1, 16
-%endif
- paddw m0, m1 ; sum of H coefficients
-
- lea r4, [r0+r2*8-1]
- lea r3, [r0+r2*4-1]
- add r4, r2
-
-%if ARCH_X86_64
-%define e_reg r8
-%else
-%define e_reg r0
-%endif
-
- movzx e_reg, byte [r3+r2*2 ]
- movzx r5, byte [r4+r1 ]
- sub r5, e_reg
-
- movzx e_reg, byte [r3+r2 ]
- movzx r6, byte [r4 ]
- sub r6, e_reg
- lea r5, [r5+r6*2]
-
- movzx e_reg, byte [r3+r1 ]
- movzx r6, byte [r4+r2*2 ]
- sub r6, e_reg
- lea r5, [r5+r6*4]
-
- movzx e_reg, byte [r3 ]
-%if ARCH_X86_64
- movzx r7, byte [r4+r2 ]
- sub r7, e_reg
-%else
- movzx r6, byte [r4+r2 ]
- sub r6, e_reg
- lea r5, [r5+r6*4]
- sub r5, r6
-%endif
-
- lea e_reg, [r3+r1*4]
- lea r3, [r4+r2*4]
-
- movzx r4, byte [e_reg+r2 ]
- movzx r6, byte [r3 ]
- sub r6, r4
-%if ARCH_X86_64
- lea r6, [r7+r6*2]
- lea r5, [r5+r6*2]
- add r5, r6
-%else
- lea r5, [r5+r6*4]
- lea r5, [r5+r6*2]
-%endif
-
- movzx r4, byte [e_reg ]
-%if ARCH_X86_64
- movzx r7, byte [r3 +r2 ]
- sub r7, r4
- sub r5, r7
-%else
- movzx r6, byte [r3 +r2 ]
- sub r6, r4
- lea r5, [r5+r6*8]
- sub r5, r6
-%endif
-
- movzx r4, byte [e_reg+r1 ]
- movzx r6, byte [r3 +r2*2]
- sub r6, r4
-%if ARCH_X86_64
- add r6, r7
-%endif
- lea r5, [r5+r6*8]
-
- movzx r4, byte [e_reg+r2*2]
- movzx r6, byte [r3 +r1 ]
- sub r6, r4
- lea r5, [r5+r6*4]
- add r5, r6 ; sum of V coefficients
-
-%if ARCH_X86_64 == 0
- mov r0, r0m
-%endif
-
-%ifidn %1, h264
- lea r5, [r5*5+32]
- sar r5, 6
-%elifidn %1, rv40
- lea r5, [r5*5]
- sar r5, 6
-%elifidn %1, svq3
- test r5, r5
- lea r6, [r5+3]
- cmovs r5, r6
- sar r5, 2 ; V/4
- lea r5, [r5*5] ; 5*(V/4)
- test r5, r5
- lea r6, [r5+15]
- cmovs r5, r6
- sar r5, 4 ; (5*(V/4))/16
-%endif
-
- movzx r4, byte [r0+r1 +15]
- movzx r3, byte [r3+r2*2 ]
- lea r3, [r3+r4+1]
- shl r3, 4
-
- movd r1d, m0
- movsx r1d, r1w
-%ifnidn %1, svq3
-%ifidn %1, h264
- lea r1d, [r1d*5+32]
-%else ; rv40
- lea r1d, [r1d*5]
-%endif
- sar r1d, 6
-%else ; svq3
- test r1d, r1d
- lea r4d, [r1d+3]
- cmovs r1d, r4d
- sar r1d, 2 ; H/4
- lea r1d, [r1d*5] ; 5*(H/4)
- test r1d, r1d
- lea r4d, [r1d+15]
- cmovs r1d, r4d
- sar r1d, 4 ; (5*(H/4))/16
-%endif
- movd m0, r1d
-
- add r1d, r5d
- add r3d, r1d
- shl r1d, 3
- sub r3d, r1d ; a
-
- movd m1, r5d
- movd m3, r3d
- SPLATW m0, m0, 0 ; H
- SPLATW m1, m1, 0 ; V
- SPLATW m3, m3, 0 ; a
-%ifidn %1, svq3
- SWAP 0, 1
-%endif
- mova m2, m0
-%if mmsize == 8
- mova m5, m0
-%endif
- pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
-%if mmsize == 16
- psllw m2, 3
-%else
- psllw m5, 3
- psllw m2, 2
- mova m6, m5
- paddw m6, m2
-%endif
- paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
- paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
-%if mmsize == 8
- paddw m5, m0 ; a + {8,9,10,11}*H
- paddw m6, m0 ; a + {12,13,14,15}*H
-%endif
-
- mov r4, 8
-.loop:
- mova m3, m0 ; b[0..7]
- mova m4, m2 ; b[8..15]
- psraw m3, 5
- psraw m4, 5
- packuswb m3, m4
- mova [r0], m3
-%if mmsize == 8
- mova m3, m5 ; b[8..11]
- mova m4, m6 ; b[12..15]
- psraw m3, 5
- psraw m4, 5
- packuswb m3, m4
- mova [r0+8], m3
-%endif
- paddw m0, m1
- paddw m2, m1
-%if mmsize == 8
- paddw m5, m1
- paddw m6, m1
-%endif
-
- mova m3, m0 ; b[0..7]
- mova m4, m2 ; b[8..15]
- psraw m3, 5
- psraw m4, 5
- packuswb m3, m4
- mova [r0+r2], m3
-%if mmsize == 8
- mova m3, m5 ; b[8..11]
- mova m4, m6 ; b[12..15]
- psraw m3, 5
- psraw m4, 5
- packuswb m3, m4
- mova [r0+r2+8], m3
-%endif
- paddw m0, m1
- paddw m2, m1
-%if mmsize == 8
- paddw m5, m1
- paddw m6, m1
-%endif
-
- lea r0, [r0+r2*2]
- dec r4
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmx
-H264_PRED16x16_PLANE h264
-H264_PRED16x16_PLANE rv40
-H264_PRED16x16_PLANE svq3
-INIT_MMX mmxext
-H264_PRED16x16_PLANE h264
-H264_PRED16x16_PLANE rv40
-H264_PRED16x16_PLANE svq3
-INIT_XMM sse2
-H264_PRED16x16_PLANE h264
-H264_PRED16x16_PLANE rv40
-H264_PRED16x16_PLANE svq3
-INIT_XMM ssse3
-H264_PRED16x16_PLANE h264
-H264_PRED16x16_PLANE rv40
-H264_PRED16x16_PLANE svq3
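The three plane flavours instantiated above share the whole gradient computation: the weighted passes over the top row build H, the byte-gathering passes over the left column build V, and the %ifidn branches only change how those sums are scaled into the b and c slopes (h264 uses (5*X+32)>>6, rv40 drops the +32 rounding, svq3 swaps H and V and rescales them). For the h264 flavour the result matches the usual plane-prediction formula; a scalar sketch, with clip_u8 and the function name being my own naming:

#include <stdint.h>

static inline uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : (v > 255 ? 255 : v);
}

/* Scalar model of pred16x16_plane_h264_8: top = src - stride,
 * left[y] = src[y*stride - 1], top[-1] is the corner pixel. */
static void pred16x16_plane_h264_c(uint8_t *src, int stride)
{
    const uint8_t *top = src - stride;
    int H = 0, V = 0;
    for (int i = 1; i <= 8; i++) {
        H += i * (top[7 + i] - top[7 - i]);
        V += i * (src[(7 + i) * stride - 1] - src[(7 - i) * stride - 1]);
    }
    int a = 16 * (top[15] + src[15 * stride - 1]);
    int b = (5 * H + 32) >> 6;     /* the lea r1d,[r1d*5+32]; sar 6 step */
    int c = (5 * V + 32) >> 6;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            src[y * stride + x] =
                clip_u8((a + b * (x - 7) + c * (y - 7) + 16) >> 5);
}

The asm folds the "+ 16 - 7*(b+c)" part into a single starting value, builds a whole row at once with the pw_0to7 multiply, and then just adds the V slope once per row, which is why the inner loop above has no direct counterpart in the SIMD code.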
-
-;-----------------------------------------------------------------------------
-; void pred8x8_plane_8(uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-
-%macro H264_PRED8x8_PLANE 0
-cglobal pred8x8_plane_8, 2,9,7
- mov r2, r1 ; +stride
- neg r1 ; -stride
-
- movd m0, [r0+r1 -1]
-%if mmsize == 8
- pxor m2, m2
- movh m1, [r0+r1 +4 ]
- punpcklbw m0, m2
- punpcklbw m1, m2
- pmullw m0, [pw_m4to4]
- pmullw m1, [pw_m4to4+8]
-%else ; mmsize == 16
-%if cpuflag(ssse3)
- movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
- pmaddubsw m0, [plane8_shuf] ; H coefficients
-%else ; sse2
- pxor m2, m2
- movd m1, [r0+r1 +4]
- punpckldq m0, m1
- punpcklbw m0, m2
- pmullw m0, [pw_m4to4]
-%endif
- movhlps m1, m0
-%endif
- paddw m0, m1
-
-%if notcpuflag(ssse3)
-%if cpuflag(mmxext)
- PSHUFLW m1, m0, 0xE
-%elif cpuflag(mmx)
- mova m1, m0
- psrlq m1, 32
-%endif
- paddw m0, m1
-%endif ; !ssse3
-
-%if cpuflag(mmxext)
- PSHUFLW m1, m0, 0x1
-%elif cpuflag(mmx)
- mova m1, m0
- psrlq m1, 16
-%endif
- paddw m0, m1 ; sum of H coefficients
-
- lea r4, [r0+r2*4-1]
- lea r3, [r0 -1]
- add r4, r2
-
-%if ARCH_X86_64
-%define e_reg r8
-%else
-%define e_reg r0
-%endif
-
- movzx e_reg, byte [r3+r2*2 ]
- movzx r5, byte [r4+r1 ]
- sub r5, e_reg
-
- movzx e_reg, byte [r3 ]
-%if ARCH_X86_64
- movzx r7, byte [r4+r2 ]
- sub r7, e_reg
- sub r5, r7
-%else
- movzx r6, byte [r4+r2 ]
- sub r6, e_reg
- lea r5, [r5+r6*4]
- sub r5, r6
-%endif
-
- movzx e_reg, byte [r3+r1 ]
- movzx r6, byte [r4+r2*2 ]
- sub r6, e_reg
-%if ARCH_X86_64
- add r6, r7
-%endif
- lea r5, [r5+r6*4]
-
- movzx e_reg, byte [r3+r2 ]
- movzx r6, byte [r4 ]
- sub r6, e_reg
- lea r6, [r5+r6*2]
-
- lea r5, [r6*9+16]
- lea r5, [r5+r6*8]
- sar r5, 5
-
-%if ARCH_X86_64 == 0
- mov r0, r0m
-%endif
-
- movzx r3, byte [r4+r2*2 ]
- movzx r4, byte [r0+r1 +7]
- lea r3, [r3+r4+1]
- shl r3, 4
- movd r1d, m0
- movsx r1d, r1w
- imul r1d, 17
- add r1d, 16
- sar r1d, 5
- movd m0, r1d
- add r1d, r5d
- sub r3d, r1d
- add r1d, r1d
- sub r3d, r1d ; a
-
- movd m1, r5d
- movd m3, r3d
- SPLATW m0, m0, 0 ; H
- SPLATW m1, m1, 0 ; V
- SPLATW m3, m3, 0 ; a
-%if mmsize == 8
- mova m2, m0
-%endif
- pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
- paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
-%if mmsize == 8
- psllw m2, 2
- paddw m2, m0 ; a + {4,5,6,7}*H
-%endif
-
- mov r4, 4
-ALIGN 16
-.loop:
-%if mmsize == 16
- mova m3, m0 ; b[0..7]
- paddw m0, m1
- psraw m3, 5
- mova m4, m0 ; V+b[0..7]
- paddw m0, m1
- psraw m4, 5
- packuswb m3, m4
- movh [r0], m3
- movhps [r0+r2], m3
-%else ; mmsize == 8
- mova m3, m0 ; b[0..3]
- mova m4, m2 ; b[4..7]
- paddw m0, m1
- paddw m2, m1
- psraw m3, 5
- psraw m4, 5
- mova m5, m0 ; V+b[0..3]
- mova m6, m2 ; V+b[4..7]
- paddw m0, m1
- paddw m2, m1
- psraw m5, 5
- psraw m6, 5
- packuswb m3, m4
- packuswb m5, m6
- mova [r0], m3
- mova [r0+r2], m5
-%endif
-
- lea r0, [r0+r2*2]
- dec r4
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmx
-H264_PRED8x8_PLANE
-INIT_MMX mmxext
-H264_PRED8x8_PLANE
-INIT_XMM sse2
-H264_PRED8x8_PLANE
-INIT_XMM ssse3
-H264_PRED8x8_PLANE
-
-;-----------------------------------------------------------------------------
-; void pred8x8_vertical_8(uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmx
-cglobal pred8x8_vertical_8, 2,2
- sub r0, r1
- movq mm0, [r0]
-%rep 3
- movq [r0+r1*1], mm0
- movq [r0+r1*2], mm0
- lea r0, [r0+r1*2]
-%endrep
- movq [r0+r1*1], mm0
- movq [r0+r1*2], mm0
- RET
-
-;-----------------------------------------------------------------------------
-; void pred8x8_horizontal_8(uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-
-%macro PRED8x8_H 0
-cglobal pred8x8_horizontal_8, 2,3
- mov r2, 4
-%if cpuflag(ssse3)
- mova m2, [pb_3]
-%endif
-.loop:
- SPLATB_LOAD m0, r0+r1*0-1, m2
- SPLATB_LOAD m1, r0+r1*1-1, m2
- mova [r0+r1*0], m0
- mova [r0+r1*1], m1
- lea r0, [r0+r1*2]
- dec r2
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmx
-PRED8x8_H
-INIT_MMX mmxext
-PRED8x8_H
-INIT_MMX ssse3
-PRED8x8_H
-
-;-----------------------------------------------------------------------------
-; void pred8x8_top_dc_8_mmxext(uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-INIT_MMX mmxext
-cglobal pred8x8_top_dc_8, 2,5
- sub r0, r1
- movq mm0, [r0]
- pxor mm1, mm1
- pxor mm2, mm2
- lea r2, [r0+r1*2]
- punpckhbw mm1, mm0
- punpcklbw mm0, mm2
- psadbw mm1, mm2 ; s1
- lea r3, [r2+r1*2]
- psadbw mm0, mm2 ; s0
- psrlw mm1, 1
- psrlw mm0, 1
- pavgw mm1, mm2
- lea r4, [r3+r1*2]
- pavgw mm0, mm2
- pshufw mm1, mm1, 0
- pshufw mm0, mm0, 0 ; dc0 (w)
- packuswb mm0, mm1 ; dc0,dc1 (b)
- movq [r0+r1*1], mm0
- movq [r0+r1*2], mm0
- lea r0, [r3+r1*2]
- movq [r2+r1*1], mm0
- movq [r2+r1*2], mm0
- movq [r3+r1*1], mm0
- movq [r3+r1*2], mm0
- movq [r0+r1*1], mm0
- movq [r0+r1*2], mm0
- RET
-
-;-----------------------------------------------------------------------------
-; void pred8x8_dc_8_mmxext(uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmxext
-cglobal pred8x8_dc_8, 2,5
- sub r0, r1
- pxor m7, m7
- movd m0, [r0+0]
- movd m1, [r0+4]
- psadbw m0, m7 ; s0
- mov r4, r0
- psadbw m1, m7 ; s1
-
- movzx r2d, byte [r0+r1*1-1]
- movzx r3d, byte [r0+r1*2-1]
- lea r0, [r0+r1*2]
- add r2d, r3d
- movzx r3d, byte [r0+r1*1-1]
- add r2d, r3d
- movzx r3d, byte [r0+r1*2-1]
- add r2d, r3d
- lea r0, [r0+r1*2]
- movd m2, r2d ; s2
- movzx r2d, byte [r0+r1*1-1]
- movzx r3d, byte [r0+r1*2-1]
- lea r0, [r0+r1*2]
- add r2d, r3d
- movzx r3d, byte [r0+r1*1-1]
- add r2d, r3d
- movzx r3d, byte [r0+r1*2-1]
- add r2d, r3d
- movd m3, r2d ; s3
-
- punpcklwd m0, m1
- mov r0, r4
- punpcklwd m2, m3
- punpckldq m0, m2 ; s0, s1, s2, s3
- pshufw m3, m0, 11110110b ; s2, s1, s3, s3
- lea r2, [r0+r1*2]
- pshufw m0, m0, 01110100b ; s0, s1, s3, s1
- paddw m0, m3
- lea r3, [r2+r1*2]
- psrlw m0, 2
- pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
- lea r4, [r3+r1*2]
- packuswb m0, m0
- punpcklbw m0, m0
- movq m1, m0
- punpcklbw m0, m0
- punpckhbw m1, m1
- movq [r0+r1*1], m0
- movq [r0+r1*2], m0
- movq [r2+r1*1], m0
- movq [r2+r1*2], m0
- movq [r3+r1*1], m1
- movq [r3+r1*2], m1
- movq [r4+r1*1], m1
- movq [r4+r1*2], m1
- RET
-
-;-----------------------------------------------------------------------------
-; void pred8x8_dc_rv40_8(uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmxext
-cglobal pred8x8_dc_rv40_8, 2,7
- mov r4, r0
- sub r0, r1
- pxor mm0, mm0
- psadbw mm0, [r0]
- dec r0
- movzx r5d, byte [r0+r1*1]
- movd r6d, mm0
- lea r0, [r0+r1*2]
-%rep 3
- movzx r2d, byte [r0+r1*0]
- movzx r3d, byte [r0+r1*1]
- add r5d, r2d
- add r6d, r3d
- lea r0, [r0+r1*2]
-%endrep
- movzx r2d, byte [r0+r1*0]
- add r5d, r6d
- lea r2d, [r2+r5+8]
- shr r2d, 4
- movd mm0, r2d
- punpcklbw mm0, mm0
- pshufw mm0, mm0, 0
- mov r3d, 4
-.loop:
- movq [r4+r1*0], mm0
- movq [r4+r1*1], mm0
- lea r4, [r4+r1*2]
- dec r3d
- jg .loop
- REP_RET
-
-;-----------------------------------------------------------------------------
-; void pred8x8_tm_vp8_8(uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-
-%macro PRED8x8_TM 0
-cglobal pred8x8_tm_vp8_8, 2,6
- sub r0, r1
- pxor mm7, mm7
- movq mm0, [r0]
- movq mm1, mm0
- punpcklbw mm0, mm7
- punpckhbw mm1, mm7
- movzx r4d, byte [r0-1]
- mov r5d, 4
-.loop:
- movzx r2d, byte [r0+r1*1-1]
- movzx r3d, byte [r0+r1*2-1]
- sub r2d, r4d
- sub r3d, r4d
- movd mm2, r2d
- movd mm4, r3d
- SPLATW mm2, mm2, 0
- SPLATW mm4, mm4, 0
- movq mm3, mm2
- movq mm5, mm4
- paddw mm2, mm0
- paddw mm3, mm1
- paddw mm4, mm0
- paddw mm5, mm1
- packuswb mm2, mm3
- packuswb mm4, mm5
- movq [r0+r1*1], mm2
- movq [r0+r1*2], mm4
- lea r0, [r0+r1*2]
- dec r5d
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmx
-PRED8x8_TM
-INIT_MMX mmxext
-PRED8x8_TM
-
-INIT_XMM sse2
-cglobal pred8x8_tm_vp8_8, 2,6,4
- sub r0, r1
- pxor xmm1, xmm1
- movq xmm0, [r0]
- punpcklbw xmm0, xmm1
- movzx r4d, byte [r0-1]
- mov r5d, 4
-.loop:
- movzx r2d, byte [r0+r1*1-1]
- movzx r3d, byte [r0+r1*2-1]
- sub r2d, r4d
- sub r3d, r4d
- movd xmm2, r2d
- movd xmm3, r3d
- pshuflw xmm2, xmm2, 0
- pshuflw xmm3, xmm3, 0
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm3, xmm3
- paddw xmm2, xmm0
- paddw xmm3, xmm0
- packuswb xmm2, xmm3
- movq [r0+r1*1], xmm2
- movhps [r0+r1*2], xmm2
- lea r0, [r0+r1*2]
- dec r5d
- jg .loop
- REP_RET
-
-INIT_XMM ssse3
-cglobal pred8x8_tm_vp8_8, 2,3,6
- sub r0, r1
- movdqa xmm4, [tm_shuf]
- pxor xmm1, xmm1
- movq xmm0, [r0]
- punpcklbw xmm0, xmm1
- movd xmm5, [r0-4]
- pshufb xmm5, xmm4
- mov r2d, 4
-.loop:
- movd xmm2, [r0+r1*1-4]
- movd xmm3, [r0+r1*2-4]
- pshufb xmm2, xmm4
- pshufb xmm3, xmm4
- psubw xmm2, xmm5
- psubw xmm3, xmm5
- paddw xmm2, xmm0
- paddw xmm3, xmm0
- packuswb xmm2, xmm3
- movq [r0+r1*1], xmm2
- movhps [r0+r1*2], xmm2
- lea r0, [r0+r1*2]
- dec r2d
- jg .loop
- REP_RET
-
-; dest, left, right, src, tmp
-; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
-%macro PRED4x4_LOWPASS 5
- mova %5, %2
- pavgb %2, %3
- pxor %3, %5
- mova %1, %4
- pand %3, [pb_1]
- psubusb %2, %3
- pavgb %1, %2
-%endmacro
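PRED4x4_LOWPASS is the 3-tap filter used by most of the directional 8x8l and 4x4 predictors that follow. The comment above already gives the per-sample formula; the macro just evaluates it for a whole register of bytes with the pavgb/pxor/pand trick so no widening to words is needed. Scalar form, for reference (helper name is mine):

#include <stdint.h>

/* One sample of PRED4x4_LOWPASS: out = (left + 2*centre + right + 2) >> 2 */
static inline uint8_t lowpass3(uint8_t left, uint8_t centre, uint8_t right)
{
    return (left + 2 * centre + right + 2) >> 2;
}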
-
-;-----------------------------------------------------------------------------
-; void pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright, int stride)
-;-----------------------------------------------------------------------------
-%macro PRED8x8L_TOP_DC 0
-cglobal pred8x8l_top_dc_8, 4,4
- sub r0, r3
- pxor mm7, mm7
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1 ; top_left
- jz .fix_lt_2
- test r2, r2 ; top_right
- jz .fix_tr_1
- jmp .body
-.fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2 ; top_right
- jnz .body
-.fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
-.body:
- PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
- psadbw mm7, mm0
- paddw mm7, [pw_4]
- psrlw mm7, 3
- pshufw mm7, mm7, 0
- packuswb mm7, mm7
-%rep 3
- movq [r0+r3*1], mm7
- movq [r0+r3*2], mm7
- lea r0, [r0+r3*2]
-%endrep
- movq [r0+r3*1], mm7
- movq [r0+r3*2], mm7
- RET
-%endmacro
-
-INIT_MMX mmxext
-PRED8x8L_TOP_DC
-INIT_MMX ssse3
-PRED8x8L_TOP_DC
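The 8x8 "L" predictors differ from the plain 8x8 ones in that the edge samples are first smoothed with PRED4x4_LOWPASS, with the .fix_lt_* / .fix_tr_* blocks patching in substitute samples when has_topleft or has_topright is zero. For top_dc the rest is a plain average of the eight filtered top samples; roughly, assuming a filtered_top[8] array has already been built:

#include <stdint.h>

/* Scalar tail of pred8x8l_top_dc_8: average the 8 lowpass-filtered top
 * samples and broadcast the result.  filtered_top[] is assumed to have
 * been produced by the lowpass step above. */
static void pred8x8l_top_dc_fill(uint8_t *src, int stride,
                                 const uint8_t filtered_top[8])
{
    int sum = 4;                       /* pw_4 rounding term */
    for (int i = 0; i < 8; i++)
        sum += filtered_top[i];
    int dc = sum >> 3;
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            src[y * stride + x] = dc;
}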
-
-;-----------------------------------------------------------------------------
-;void pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright, int stride)
-;-----------------------------------------------------------------------------
-
-%macro PRED8x8L_DC 0
-cglobal pred8x8l_dc_8, 4,5
- sub r0, r3
- lea r4, [r0+r3*2]
- movq mm0, [r0+r3*1-8]
- punpckhbw mm0, [r0+r3*0-8]
- movq mm1, [r4+r3*1-8]
- punpckhbw mm1, [r0+r3*2-8]
- mov r4, r0
- punpckhwd mm1, mm0
- lea r0, [r0+r3*4]
- movq mm2, [r0+r3*1-8]
- punpckhbw mm2, [r0+r3*0-8]
- lea r0, [r0+r3*2]
- movq mm3, [r0+r3*1-8]
- punpckhbw mm3, [r0+r3*0-8]
- punpckhwd mm3, mm2
- punpckhdq mm3, mm1
- lea r0, [r0+r3*2]
- movq mm0, [r0+r3*0-8]
- movq mm1, [r4]
- mov r0, r4
- movq mm4, mm3
- movq mm2, mm3
- PALIGNR mm4, mm0, 7, mm0
- PALIGNR mm1, mm2, 1, mm2
- test r1, r1
- jnz .do_left
-.fix_lt_1:
- movq mm5, mm3
- pxor mm5, mm4
- psrlq mm5, 56
- psllq mm5, 48
- pxor mm1, mm5
- jmp .do_left
-.fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2
- jnz .body
-.fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .body
-.do_left:
- movq mm0, mm4
- PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
- movq mm4, mm0
- movq mm7, mm2
- PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
- psllq mm1, 56
- PALIGNR mm7, mm1, 7, mm3
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1
- jz .fix_lt_2
- test r2, r2
- jz .fix_tr_1
-.body:
- lea r1, [r0+r3*2]
- PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
- pxor mm0, mm0
- pxor mm1, mm1
- lea r2, [r1+r3*2]
- psadbw mm0, mm7
- psadbw mm1, mm6
- paddw mm0, [pw_8]
- paddw mm0, mm1
- lea r4, [r2+r3*2]
- psrlw mm0, 4
- pshufw mm0, mm0, 0
- packuswb mm0, mm0
- movq [r0+r3*1], mm0
- movq [r0+r3*2], mm0
- movq [r1+r3*1], mm0
- movq [r1+r3*2], mm0
- movq [r2+r3*1], mm0
- movq [r2+r3*2], mm0
- movq [r4+r3*1], mm0
- movq [r4+r3*2], mm0
- RET
-%endmacro
-
-INIT_MMX mmxext
-PRED8x8L_DC
-INIT_MMX ssse3
-PRED8x8L_DC
-
-;-----------------------------------------------------------------------------
-; void pred8x8l_horizontal_8(uint8_t *src, int has_topleft, int has_topright, int stride)
-;-----------------------------------------------------------------------------
-
-%macro PRED8x8L_HORIZONTAL 0
-cglobal pred8x8l_horizontal_8, 4,4
- sub r0, r3
- lea r2, [r0+r3*2]
- movq mm0, [r0+r3*1-8]
- test r1, r1
- lea r1, [r0+r3]
- cmovnz r1, r0
- punpckhbw mm0, [r1+r3*0-8]
- movq mm1, [r2+r3*1-8]
- punpckhbw mm1, [r0+r3*2-8]
- mov r2, r0
- punpckhwd mm1, mm0
- lea r0, [r0+r3*4]
- movq mm2, [r0+r3*1-8]
- punpckhbw mm2, [r0+r3*0-8]
- lea r0, [r0+r3*2]
- movq mm3, [r0+r3*1-8]
- punpckhbw mm3, [r0+r3*0-8]
- punpckhwd mm3, mm2
- punpckhdq mm3, mm1
- lea r0, [r0+r3*2]
- movq mm0, [r0+r3*0-8]
- movq mm1, [r1+r3*0-8]
- mov r0, r2
- movq mm4, mm3
- movq mm2, mm3
- PALIGNR mm4, mm0, 7, mm0
- PALIGNR mm1, mm2, 1, mm2
- movq mm0, mm4
- PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
- movq mm4, mm0
- movq mm7, mm2
- PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
- psllq mm1, 56
- PALIGNR mm7, mm1, 7, mm3
- movq mm3, mm7
- lea r1, [r0+r3*2]
- movq mm7, mm3
- punpckhbw mm3, mm3
- punpcklbw mm7, mm7
- pshufw mm0, mm3, 0xff
- pshufw mm1, mm3, 0xaa
- lea r2, [r1+r3*2]
- pshufw mm2, mm3, 0x55
- pshufw mm3, mm3, 0x00
- pshufw mm4, mm7, 0xff
- pshufw mm5, mm7, 0xaa
- pshufw mm6, mm7, 0x55
- pshufw mm7, mm7, 0x00
- movq [r0+r3*1], mm0
- movq [r0+r3*2], mm1
- movq [r1+r3*1], mm2
- movq [r1+r3*2], mm3
- movq [r2+r3*1], mm4
- movq [r2+r3*2], mm5
- lea r0, [r2+r3*2]
- movq [r0+r3*1], mm6
- movq [r0+r3*2], mm7
- RET
-%endmacro
-
-INIT_MMX mmxext
-PRED8x8L_HORIZONTAL
-INIT_MMX ssse3
-PRED8x8L_HORIZONTAL
-
-;-----------------------------------------------------------------------------
-; void pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright, int stride)
-;-----------------------------------------------------------------------------
-
-%macro PRED8x8L_VERTICAL 0
-cglobal pred8x8l_vertical_8, 4,4
- sub r0, r3
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1 ; top_left
- jz .fix_lt_2
- test r2, r2 ; top_right
- jz .fix_tr_1
- jmp .body
-.fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2 ; top_right
- jnz .body
-.fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
-.body:
- PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
-%rep 3
- movq [r0+r3*1], mm0
- movq [r0+r3*2], mm0
- lea r0, [r0+r3*2]
-%endrep
- movq [r0+r3*1], mm0
- movq [r0+r3*2], mm0
- RET
-%endmacro
-
-INIT_MMX mmxext
-PRED8x8L_VERTICAL
-INIT_MMX ssse3
-PRED8x8L_VERTICAL
-
-;-----------------------------------------------------------------------------
-;void pred8x8l_down_left_8(uint8_t *src, int has_topleft, int has_topright, int stride)
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmxext
-cglobal pred8x8l_down_left_8, 4,5
- sub r0, r3
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1
- jz .fix_lt_2
- test r2, r2
- jz .fix_tr_1
- jmp .do_top
-.fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2
- jnz .do_top
-.fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .do_top
-.fix_tr_2:
- punpckhbw mm3, mm3
- pshufw mm1, mm3, 0xFF
- jmp .do_topright
-.do_top:
- PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
- movq mm7, mm4
- test r2, r2
- jz .fix_tr_2
- movq mm0, [r0+8]
- movq mm5, mm0
- movq mm2, mm0
- movq mm4, mm0
- psrlq mm5, 56
- PALIGNR mm2, mm3, 7, mm3
- PALIGNR mm5, mm4, 1, mm4
- PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
-.do_topright:
- lea r1, [r0+r3*2]
- movq mm6, mm1
- psrlq mm1, 56
- movq mm4, mm1
- lea r2, [r1+r3*2]
- movq mm2, mm6
- PALIGNR mm2, mm7, 1, mm0
- movq mm3, mm6
- PALIGNR mm3, mm7, 7, mm0
- PALIGNR mm4, mm6, 1, mm0
- movq mm5, mm7
- movq mm1, mm7
- movq mm7, mm6
- lea r4, [r2+r3*2]
- psllq mm1, 8
- PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
- PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
- movq [r4+r3*2], mm1
- movq mm2, mm0
- psllq mm1, 8
- psrlq mm2, 56
- psllq mm0, 8
- por mm1, mm2
- movq [r4+r3*1], mm1
- movq mm2, mm0
- psllq mm1, 8
- psrlq mm2, 56
- psllq mm0, 8
- por mm1, mm2
- movq [r2+r3*2], mm1
- movq mm2, mm0
- psllq mm1, 8
- psrlq mm2, 56
- psllq mm0, 8
- por mm1, mm2
- movq [r2+r3*1], mm1
- movq mm2, mm0
- psllq mm1, 8
- psrlq mm2, 56
- psllq mm0, 8
- por mm1, mm2
- movq [r1+r3*2], mm1
- movq mm2, mm0
- psllq mm1, 8
- psrlq mm2, 56
- psllq mm0, 8
- por mm1, mm2
- movq [r1+r3*1], mm1
- movq mm2, mm0
- psllq mm1, 8
- psrlq mm2, 56
- psllq mm0, 8
- por mm1, mm2
- movq [r0+r3*2], mm1
- psllq mm1, 8
- psrlq mm0, 56
- por mm1, mm0
- movq [r0+r3*1], mm1
- RET
-
-%macro PRED8x8L_DOWN_LEFT 0
-cglobal pred8x8l_down_left_8, 4,4
- sub r0, r3
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1 ; top_left
- jz .fix_lt_2
- test r2, r2 ; top_right
- jz .fix_tr_1
- jmp .do_top
-.fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2 ; top_right
- jnz .do_top
-.fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .do_top
-.fix_tr_2:
- punpckhbw mm3, mm3
- pshufw mm1, mm3, 0xFF
- jmp .do_topright
-.do_top:
- PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
- movq2dq xmm3, mm4
- test r2, r2 ; top_right
- jz .fix_tr_2
- movq mm0, [r0+8]
- movq mm5, mm0
- movq mm2, mm0
- movq mm4, mm0
- psrlq mm5, 56
- PALIGNR mm2, mm3, 7, mm3
- PALIGNR mm5, mm4, 1, mm4
- PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
-.do_topright:
- movq2dq xmm4, mm1
- psrlq mm1, 56
- movq2dq xmm5, mm1
- lea r1, [r0+r3*2]
- pslldq xmm4, 8
- por xmm3, xmm4
- movdqa xmm2, xmm3
- psrldq xmm2, 1
- pslldq xmm5, 15
- por xmm2, xmm5
- lea r2, [r1+r3*2]
- movdqa xmm1, xmm3
- pslldq xmm1, 1
-INIT_XMM cpuname
- PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
- psrldq xmm0, 1
- movq [r0+r3*1], xmm0
- psrldq xmm0, 1
- movq [r0+r3*2], xmm0
- psrldq xmm0, 1
- lea r0, [r2+r3*2]
- movq [r1+r3*1], xmm0
- psrldq xmm0, 1
- movq [r1+r3*2], xmm0
- psrldq xmm0, 1
- movq [r2+r3*1], xmm0
- psrldq xmm0, 1
- movq [r2+r3*2], xmm0
- psrldq xmm0, 1
- movq [r0+r3*1], xmm0
- psrldq xmm0, 1
- movq [r0+r3*2], xmm0
- RET
-%endmacro
-
-INIT_MMX sse2
-PRED8x8L_DOWN_LEFT
-INIT_MMX ssse3
-PRED8x8L_DOWN_LEFT
-
-;-----------------------------------------------------------------------------
-;void pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmxext
-cglobal pred8x8l_down_right_8, 4,5
- sub r0, r3
- lea r4, [r0+r3*2]
- movq mm0, [r0+r3*1-8]
- punpckhbw mm0, [r0+r3*0-8]
- movq mm1, [r4+r3*1-8]
- punpckhbw mm1, [r0+r3*2-8]
- mov r4, r0
- punpckhwd mm1, mm0
- lea r0, [r0+r3*4]
- movq mm2, [r0+r3*1-8]
- punpckhbw mm2, [r0+r3*0-8]
- lea r0, [r0+r3*2]
- movq mm3, [r0+r3*1-8]
- punpckhbw mm3, [r0+r3*0-8]
- punpckhwd mm3, mm2
- punpckhdq mm3, mm1
- lea r0, [r0+r3*2]
- movq mm0, [r0+r3*0-8]
- movq mm1, [r4]
- mov r0, r4
- movq mm4, mm3
- movq mm2, mm3
- PALIGNR mm4, mm0, 7, mm0
- PALIGNR mm1, mm2, 1, mm2
- test r1, r1 ; top_left
- jz .fix_lt_1
-.do_left:
- movq mm0, mm4
- PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
- movq mm4, mm0
- movq mm7, mm2
- movq mm6, mm2
- PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
- psllq mm1, 56
- PALIGNR mm7, mm1, 7, mm3
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1 ; top_left
- jz .fix_lt_2
- test r2, r2 ; top_right
- jz .fix_tr_1
-.do_top:
- PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
- movq mm5, mm4
- jmp .body
-.fix_lt_1:
- movq mm5, mm3
- pxor mm5, mm4
- psrlq mm5, 56
- psllq mm5, 48
- pxor mm1, mm5
- jmp .do_left
-.fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2 ; top_right
- jnz .do_top
-.fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .do_top
-.body:
- lea r1, [r0+r3*2]
- movq mm1, mm7
- movq mm7, mm5
- movq mm5, mm6
- movq mm2, mm7
- lea r2, [r1+r3*2]
- PALIGNR mm2, mm6, 1, mm0
- movq mm3, mm7
- PALIGNR mm3, mm6, 7, mm0
- movq mm4, mm7
- lea r4, [r2+r3*2]
- psrlq mm4, 8
- PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
- PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
- movq [r4+r3*2], mm0
- movq mm2, mm1
- psrlq mm0, 8
- psllq mm2, 56
- psrlq mm1, 8
- por mm0, mm2
- movq [r4+r3*1], mm0
- movq mm2, mm1
- psrlq mm0, 8
- psllq mm2, 56
- psrlq mm1, 8
- por mm0, mm2
- movq [r2+r3*2], mm0
- movq mm2, mm1
- psrlq mm0, 8
- psllq mm2, 56
- psrlq mm1, 8
- por mm0, mm2
- movq [r2+r3*1], mm0
- movq mm2, mm1
- psrlq mm0, 8
- psllq mm2, 56
- psrlq mm1, 8
- por mm0, mm2
- movq [r1+r3*2], mm0
- movq mm2, mm1
- psrlq mm0, 8
- psllq mm2, 56
- psrlq mm1, 8
- por mm0, mm2
- movq [r1+r3*1], mm0
- movq mm2, mm1
- psrlq mm0, 8
- psllq mm2, 56
- psrlq mm1, 8
- por mm0, mm2
- movq [r0+r3*2], mm0
- psrlq mm0, 8
- psllq mm1, 56
- por mm0, mm1
- movq [r0+r3*1], mm0
- RET
-
-%macro PRED8x8L_DOWN_RIGHT 0
-cglobal pred8x8l_down_right_8, 4,5
- sub r0, r3
- lea r4, [r0+r3*2]
- movq mm0, [r0+r3*1-8]
- punpckhbw mm0, [r0+r3*0-8]
- movq mm1, [r4+r3*1-8]
- punpckhbw mm1, [r0+r3*2-8]
- mov r4, r0
- punpckhwd mm1, mm0
- lea r0, [r0+r3*4]
- movq mm2, [r0+r3*1-8]
- punpckhbw mm2, [r0+r3*0-8]
- lea r0, [r0+r3*2]
- movq mm3, [r0+r3*1-8]
- punpckhbw mm3, [r0+r3*0-8]
- punpckhwd mm3, mm2
- punpckhdq mm3, mm1
- lea r0, [r0+r3*2]
- movq mm0, [r0+r3*0-8]
- movq mm1, [r4]
- mov r0, r4
- movq mm4, mm3
- movq mm2, mm3
- PALIGNR mm4, mm0, 7, mm0
- PALIGNR mm1, mm2, 1, mm2
- test r1, r1
- jz .fix_lt_1
- jmp .do_left
-.fix_lt_1:
- movq mm5, mm3
- pxor mm5, mm4
- psrlq mm5, 56
- psllq mm5, 48
- pxor mm1, mm5
- jmp .do_left
-.fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2
- jnz .do_top
-.fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .do_top
-.do_left:
- movq mm0, mm4
- PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
- movq mm4, mm0
- movq mm7, mm2
- movq2dq xmm3, mm2
- PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
- psllq mm1, 56
- PALIGNR mm7, mm1, 7, mm3
- movq2dq xmm1, mm7
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1
- jz .fix_lt_2
- test r2, r2
- jz .fix_tr_1
-.do_top:
- PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
- movq2dq xmm4, mm4
- lea r1, [r0+r3*2]
- movdqa xmm0, xmm3
- pslldq xmm4, 8
- por xmm3, xmm4
- lea r2, [r1+r3*2]
- pslldq xmm4, 1
- por xmm1, xmm4
- psrldq xmm0, 7
- pslldq xmm0, 15
- psrldq xmm0, 7
- por xmm1, xmm0
- lea r0, [r2+r3*2]
- movdqa xmm2, xmm3
- psrldq xmm2, 1
-INIT_XMM cpuname
- PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
- movdqa xmm1, xmm0
- psrldq xmm1, 1
- movq [r0+r3*2], xmm0
- movq [r0+r3*1], xmm1
- psrldq xmm0, 2
- psrldq xmm1, 2
- movq [r2+r3*2], xmm0
- movq [r2+r3*1], xmm1
- psrldq xmm0, 2
- psrldq xmm1, 2
- movq [r1+r3*2], xmm0
- movq [r1+r3*1], xmm1
- psrldq xmm0, 2
- psrldq xmm1, 2
- movq [r4+r3*2], xmm0
- movq [r4+r3*1], xmm1
- RET
-%endmacro
-
-INIT_MMX sse2
-PRED8x8L_DOWN_RIGHT
-INIT_MMX ssse3
-PRED8x8L_DOWN_RIGHT
-
-;-----------------------------------------------------------------------------
-; void pred8x8l_vertical_right_8(uint8_t *src, int has_topleft, int has_topright, int stride)
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmxext
-cglobal pred8x8l_vertical_right_8, 4,5
- sub r0, r3
- lea r4, [r0+r3*2]
- movq mm0, [r0+r3*1-8]
- punpckhbw mm0, [r0+r3*0-8]
- movq mm1, [r4+r3*1-8]
- punpckhbw mm1, [r0+r3*2-8]
- mov r4, r0
- punpckhwd mm1, mm0
- lea r0, [r0+r3*4]
- movq mm2, [r0+r3*1-8]
- punpckhbw mm2, [r0+r3*0-8]
- lea r0, [r0+r3*2]
- movq mm3, [r0+r3*1-8]
- punpckhbw mm3, [r0+r3*0-8]
- punpckhwd mm3, mm2
- punpckhdq mm3, mm1
- lea r0, [r0+r3*2]
- movq mm0, [r0+r3*0-8]
- movq mm1, [r4]
- mov r0, r4
- movq mm4, mm3
- movq mm2, mm3
- PALIGNR mm4, mm0, 7, mm0
- PALIGNR mm1, mm2, 1, mm2
- test r1, r1
- jz .fix_lt_1
- jmp .do_left
-.fix_lt_1:
- movq mm5, mm3
- pxor mm5, mm4
- psrlq mm5, 56
- psllq mm5, 48
- pxor mm1, mm5
- jmp .do_left
-.fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2
- jnz .do_top
-.fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .do_top
-.do_left:
- movq mm0, mm4
- PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
- movq mm7, mm2
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1
- jz .fix_lt_2
- test r2, r2
- jz .fix_tr_1
-.do_top:
- PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
- lea r1, [r0+r3*2]
- movq mm2, mm6
- movq mm3, mm6
- PALIGNR mm3, mm7, 7, mm0
- PALIGNR mm6, mm7, 6, mm1
- movq mm4, mm3
- pavgb mm3, mm2
- lea r2, [r1+r3*2]
- PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
- movq [r0+r3*1], mm3
- movq [r0+r3*2], mm0
- movq mm5, mm0
- movq mm6, mm3
- movq mm1, mm7
- movq mm2, mm1
- psllq mm2, 8
- movq mm3, mm1
- psllq mm3, 16
- lea r4, [r2+r3*2]
- PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
- PALIGNR mm6, mm0, 7, mm2
- movq [r1+r3*1], mm6
- psllq mm0, 8
- PALIGNR mm5, mm0, 7, mm1
- movq [r1+r3*2], mm5
- psllq mm0, 8
- PALIGNR mm6, mm0, 7, mm2
- movq [r2+r3*1], mm6
- psllq mm0, 8
- PALIGNR mm5, mm0, 7, mm1
- movq [r2+r3*2], mm5
- psllq mm0, 8
- PALIGNR mm6, mm0, 7, mm2
- movq [r4+r3*1], mm6
- psllq mm0, 8
- PALIGNR mm5, mm0, 7, mm1
- movq [r4+r3*2], mm5
- RET
-
-%macro PRED8x8L_VERTICAL_RIGHT 0
-cglobal pred8x8l_vertical_right_8, 4,5,7
- ; manually spill XMM registers for Win64 because
- ; the code here is initialized with INIT_MMX
- WIN64_SPILL_XMM 7
- sub r0, r3
- lea r4, [r0+r3*2]
- movq mm0, [r0+r3*1-8]
- punpckhbw mm0, [r0+r3*0-8]
- movq mm1, [r4+r3*1-8]
- punpckhbw mm1, [r0+r3*2-8]
- mov r4, r0
- punpckhwd mm1, mm0
- lea r0, [r0+r3*4]
- movq mm2, [r0+r3*1-8]
- punpckhbw mm2, [r0+r3*0-8]
- lea r0, [r0+r3*2]
- movq mm3, [r0+r3*1-8]
- punpckhbw mm3, [r0+r3*0-8]
- punpckhwd mm3, mm2
- punpckhdq mm3, mm1
- lea r0, [r0+r3*2]
- movq mm0, [r0+r3*0-8]
- movq mm1, [r4]
- mov r0, r4
- movq mm4, mm3
- movq mm2, mm3
- PALIGNR mm4, mm0, 7, mm0
- PALIGNR mm1, mm2, 1, mm2
- test r1, r1
- jnz .do_left
-.fix_lt_1:
- movq mm5, mm3
- pxor mm5, mm4
- psrlq mm5, 56
- psllq mm5, 48
- pxor mm1, mm5
- jmp .do_left
-.fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2
- jnz .do_top
-.fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .do_top
-.do_left:
- movq mm0, mm4
- PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
- movq2dq xmm0, mm2
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1
- jz .fix_lt_2
- test r2, r2
- jz .fix_tr_1
-.do_top:
- PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
- lea r1, [r0+r3*2]
- movq2dq xmm4, mm6
- pslldq xmm4, 8
- por xmm0, xmm4
- movdqa xmm6, [pw_ff00]
- movdqa xmm1, xmm0
- lea r2, [r1+r3*2]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm0
- pslldq xmm0, 1
- pslldq xmm1, 2
- pavgb xmm2, xmm0
-INIT_XMM cpuname
- PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
- pandn xmm6, xmm4
- movdqa xmm5, xmm4
- psrlw xmm4, 8
- packuswb xmm6, xmm4
- movhlps xmm4, xmm6
- movhps [r0+r3*2], xmm5
- movhps [r0+r3*1], xmm2
- psrldq xmm5, 4
- movss xmm5, xmm6
- psrldq xmm2, 4
- movss xmm2, xmm4
- lea r0, [r2+r3*2]
- psrldq xmm5, 1
- psrldq xmm2, 1
- movq [r0+r3*2], xmm5
- movq [r0+r3*1], xmm2
- psrldq xmm5, 1
- psrldq xmm2, 1
- movq [r2+r3*2], xmm5
- movq [r2+r3*1], xmm2
- psrldq xmm5, 1
- psrldq xmm2, 1
- movq [r1+r3*2], xmm5
- movq [r1+r3*1], xmm2
- RET
-%endmacro
-
-INIT_MMX sse2
-PRED8x8L_VERTICAL_RIGHT
-INIT_MMX ssse3
-PRED8x8L_VERTICAL_RIGHT
-
-;-----------------------------------------------------------------------------
-;void pred8x8l_vertical_left_8(uint8_t *src, int has_topleft, int has_topright, int stride)
-;-----------------------------------------------------------------------------
-
-%macro PRED8x8L_VERTICAL_LEFT 0
-cglobal pred8x8l_vertical_left_8, 4,4
- sub r0, r3
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1
- jz .fix_lt_2
- test r2, r2
- jz .fix_tr_1
- jmp .do_top
-.fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2
- jnz .do_top
-.fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .do_top
-.fix_tr_2:
- punpckhbw mm3, mm3
- pshufw mm1, mm3, 0xFF
- jmp .do_topright
-.do_top:
- PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
- movq2dq xmm4, mm4
- test r2, r2
- jz .fix_tr_2
- movq mm0, [r0+8]
- movq mm5, mm0
- movq mm2, mm0
- movq mm4, mm0
- psrlq mm5, 56
- PALIGNR mm2, mm3, 7, mm3
- PALIGNR mm5, mm4, 1, mm4
- PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
-.do_topright:
- movq2dq xmm3, mm1
- lea r1, [r0+r3*2]
- pslldq xmm3, 8
- por xmm4, xmm3
- movdqa xmm2, xmm4
- movdqa xmm1, xmm4
- movdqa xmm3, xmm4
- psrldq xmm2, 1
- pslldq xmm1, 1
- pavgb xmm3, xmm2
- lea r2, [r1+r3*2]
-INIT_XMM cpuname
- PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
- psrldq xmm0, 1
- movq [r0+r3*1], xmm3
- movq [r0+r3*2], xmm0
- lea r0, [r2+r3*2]
- psrldq xmm3, 1
- psrldq xmm0, 1
- movq [r1+r3*1], xmm3
- movq [r1+r3*2], xmm0
- psrldq xmm3, 1
- psrldq xmm0, 1
- movq [r2+r3*1], xmm3
- movq [r2+r3*2], xmm0
- psrldq xmm3, 1
- psrldq xmm0, 1
- movq [r0+r3*1], xmm3
- movq [r0+r3*2], xmm0
- RET
-%endmacro
-
-INIT_MMX sse2
-PRED8x8L_VERTICAL_LEFT
-INIT_MMX ssse3
-PRED8x8L_VERTICAL_LEFT
-
-;-----------------------------------------------------------------------------
-; void pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft, int has_topright, int stride)
-;-----------------------------------------------------------------------------
-
-%macro PRED8x8L_HORIZONTAL_UP 0
-cglobal pred8x8l_horizontal_up_8, 4,4
- sub r0, r3
- lea r2, [r0+r3*2]
- movq mm0, [r0+r3*1-8]
- test r1, r1
- lea r1, [r0+r3]
- cmovnz r1, r0
- punpckhbw mm0, [r1+r3*0-8]
- movq mm1, [r2+r3*1-8]
- punpckhbw mm1, [r0+r3*2-8]
- mov r2, r0
- punpckhwd mm1, mm0
- lea r0, [r0+r3*4]
- movq mm2, [r0+r3*1-8]
- punpckhbw mm2, [r0+r3*0-8]
- lea r0, [r0+r3*2]
- movq mm3, [r0+r3*1-8]
- punpckhbw mm3, [r0+r3*0-8]
- punpckhwd mm3, mm2
- punpckhdq mm3, mm1
- lea r0, [r0+r3*2]
- movq mm0, [r0+r3*0-8]
- movq mm1, [r1+r3*0-8]
- mov r0, r2
- movq mm4, mm3
- movq mm2, mm3
- PALIGNR mm4, mm0, 7, mm0
- PALIGNR mm1, mm2, 1, mm2
- movq mm0, mm4
- PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
- movq mm4, mm0
- movq mm7, mm2
- PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
- psllq mm1, 56
- PALIGNR mm7, mm1, 7, mm3
- lea r1, [r0+r3*2]
- pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
- psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
- movq mm2, mm0
- psllw mm0, 8
- psrlw mm2, 8
- por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
- movq mm3, mm2
- movq mm4, mm2
- movq mm5, mm2
- psrlq mm2, 8
- psrlq mm3, 16
- lea r2, [r1+r3*2]
- por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
- punpckhbw mm7, mm7
- por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
- pavgb mm4, mm2
- PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
- movq mm5, mm4
- punpcklbw mm4, mm1 ; p4 p3 p2 p1
- punpckhbw mm5, mm1 ; p8 p7 p6 p5
- movq mm6, mm5
- movq mm7, mm5
- movq mm0, mm5
- PALIGNR mm5, mm4, 2, mm1
- pshufw mm1, mm6, 11111001b
- PALIGNR mm6, mm4, 4, mm2
- pshufw mm2, mm7, 11111110b
- PALIGNR mm7, mm4, 6, mm3
- pshufw mm3, mm0, 11111111b
- movq [r0+r3*1], mm4
- movq [r0+r3*2], mm5
- lea r0, [r2+r3*2]
- movq [r1+r3*1], mm6
- movq [r1+r3*2], mm7
- movq [r2+r3*1], mm0
- movq [r2+r3*2], mm1
- movq [r0+r3*1], mm2
- movq [r0+r3*2], mm3
- RET
-%endmacro
-
-INIT_MMX mmxext
-PRED8x8L_HORIZONTAL_UP
-INIT_MMX ssse3
-PRED8x8L_HORIZONTAL_UP
-
-;-----------------------------------------------------------------------------
-;void pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft, int has_topright, int stride)
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmxext
-cglobal pred8x8l_horizontal_down_8, 4,5
- sub r0, r3
- lea r4, [r0+r3*2]
- movq mm0, [r0+r3*1-8]
- punpckhbw mm0, [r0+r3*0-8]
- movq mm1, [r4+r3*1-8]
- punpckhbw mm1, [r0+r3*2-8]
- mov r4, r0
- punpckhwd mm1, mm0
- lea r0, [r0+r3*4]
- movq mm2, [r0+r3*1-8]
- punpckhbw mm2, [r0+r3*0-8]
- lea r0, [r0+r3*2]
- movq mm3, [r0+r3*1-8]
- punpckhbw mm3, [r0+r3*0-8]
- punpckhwd mm3, mm2
- punpckhdq mm3, mm1
- lea r0, [r0+r3*2]
- movq mm0, [r0+r3*0-8]
- movq mm1, [r4]
- mov r0, r4
- movq mm4, mm3
- movq mm2, mm3
- PALIGNR mm4, mm0, 7, mm0
- PALIGNR mm1, mm2, 1, mm2
- test r1, r1
- jnz .do_left
-.fix_lt_1:
- movq mm5, mm3
- pxor mm5, mm4
- psrlq mm5, 56
- psllq mm5, 48
- pxor mm1, mm5
- jmp .do_left
-.fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2
- jnz .do_top
-.fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .do_top
-.do_left:
- movq mm0, mm4
- PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
- movq mm4, mm0
- movq mm7, mm2
- movq mm6, mm2
- PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
- psllq mm1, 56
- PALIGNR mm7, mm1, 7, mm3
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1
- jz .fix_lt_2
- test r2, r2
- jz .fix_tr_1
-.do_top:
- PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
- movq mm5, mm4
- lea r1, [r0+r3*2]
- psllq mm7, 56
- movq mm2, mm5
- movq mm3, mm6
- movq mm4, mm2
- PALIGNR mm2, mm6, 7, mm5
- PALIGNR mm6, mm7, 7, mm0
- lea r2, [r1+r3*2]
- PALIGNR mm4, mm3, 1, mm7
- movq mm5, mm3
- pavgb mm3, mm6
- PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
- movq mm4, mm2
- movq mm1, mm2
- lea r4, [r2+r3*2]
- psrlq mm4, 16
- psrlq mm1, 8
- PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
- movq mm7, mm3
- punpcklbw mm3, mm0
- punpckhbw mm7, mm0
- movq mm1, mm7
- movq mm0, mm7
- movq mm4, mm7
- movq [r4+r3*2], mm3
- PALIGNR mm7, mm3, 2, mm5
- movq [r4+r3*1], mm7
- PALIGNR mm1, mm3, 4, mm5
- movq [r2+r3*2], mm1
- PALIGNR mm0, mm3, 6, mm3
- movq [r2+r3*1], mm0
- movq mm2, mm6
- movq mm3, mm6
- movq [r1+r3*2], mm4
- PALIGNR mm6, mm4, 2, mm5
- movq [r1+r3*1], mm6
- PALIGNR mm2, mm4, 4, mm5
- movq [r0+r3*2], mm2
- PALIGNR mm3, mm4, 6, mm4
- movq [r0+r3*1], mm3
- RET
-
-%macro PRED8x8L_HORIZONTAL_DOWN 0
-cglobal pred8x8l_horizontal_down_8, 4,5
- sub r0, r3
- lea r4, [r0+r3*2]
- movq mm0, [r0+r3*1-8]
- punpckhbw mm0, [r0+r3*0-8]
- movq mm1, [r4+r3*1-8]
- punpckhbw mm1, [r0+r3*2-8]
- mov r4, r0
- punpckhwd mm1, mm0
- lea r0, [r0+r3*4]
- movq mm2, [r0+r3*1-8]
- punpckhbw mm2, [r0+r3*0-8]
- lea r0, [r0+r3*2]
- movq mm3, [r0+r3*1-8]
- punpckhbw mm3, [r0+r3*0-8]
- punpckhwd mm3, mm2
- punpckhdq mm3, mm1
- lea r0, [r0+r3*2]
- movq mm0, [r0+r3*0-8]
- movq mm1, [r4]
- mov r0, r4
- movq mm4, mm3
- movq mm2, mm3
- PALIGNR mm4, mm0, 7, mm0
- PALIGNR mm1, mm2, 1, mm2
- test r1, r1
- jnz .do_left
-.fix_lt_1:
- movq mm5, mm3
- pxor mm5, mm4
- psrlq mm5, 56
- psllq mm5, 48
- pxor mm1, mm5
- jmp .do_left
-.fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2
- jnz .do_top
-.fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .do_top
-.fix_tr_2:
- punpckhbw mm3, mm3
- pshufw mm1, mm3, 0xFF
- jmp .do_topright
-.do_left:
- movq mm0, mm4
- PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
- movq2dq xmm0, mm2
- pslldq xmm0, 8
- movq mm4, mm0
- PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
- movq2dq xmm2, mm1
- pslldq xmm2, 15
- psrldq xmm2, 8
- por xmm0, xmm2
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1
- jz .fix_lt_2
- test r2, r2
- jz .fix_tr_1
-.do_top:
- PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
- movq2dq xmm1, mm4
- test r2, r2
- jz .fix_tr_2
- movq mm0, [r0+8]
- movq mm5, mm0
- movq mm2, mm0
- movq mm4, mm0
- psrlq mm5, 56
- PALIGNR mm2, mm3, 7, mm3
- PALIGNR mm5, mm4, 1, mm4
- PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
-.do_topright:
- movq2dq xmm5, mm1
- pslldq xmm5, 8
- por xmm1, xmm5
-INIT_XMM cpuname
- lea r2, [r4+r3*2]
- movdqa xmm2, xmm1
- movdqa xmm3, xmm1
- PALIGNR xmm1, xmm0, 7, xmm4
- PALIGNR xmm2, xmm0, 9, xmm5
- lea r1, [r2+r3*2]
- PALIGNR xmm3, xmm0, 8, xmm0
- movdqa xmm4, xmm1
- pavgb xmm4, xmm3
- lea r0, [r1+r3*2]
- PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
- punpcklbw xmm4, xmm0
- movhlps xmm0, xmm4
- movq [r0+r3*2], xmm4
- movq [r2+r3*2], xmm0
- psrldq xmm4, 2
- psrldq xmm0, 2
- movq [r0+r3*1], xmm4
- movq [r2+r3*1], xmm0
- psrldq xmm4, 2
- psrldq xmm0, 2
- movq [r1+r3*2], xmm4
- movq [r4+r3*2], xmm0
- psrldq xmm4, 2
- psrldq xmm0, 2
- movq [r1+r3*1], xmm4
- movq [r4+r3*1], xmm0
- RET
-%endmacro
-
-INIT_MMX sse2
-PRED8x8L_HORIZONTAL_DOWN
-INIT_MMX ssse3
-PRED8x8L_HORIZONTAL_DOWN
-
-;-----------------------------------------------------------------------------
-; void pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmxext
-cglobal pred4x4_dc_8, 3,5
- pxor mm7, mm7
- mov r4, r0
- sub r0, r2
- movd mm0, [r0]
- psadbw mm0, mm7
- movzx r1d, byte [r0+r2*1-1]
- movd r3d, mm0
- add r3d, r1d
- movzx r1d, byte [r0+r2*2-1]
- lea r0, [r0+r2*2]
- add r3d, r1d
- movzx r1d, byte [r0+r2*1-1]
- add r3d, r1d
- movzx r1d, byte [r0+r2*2-1]
- add r3d, r1d
- add r3d, 4
- shr r3d, 3
- imul r3d, 0x01010101
- mov [r4+r2*0], r3d
- mov [r0+r2*0], r3d
- mov [r0+r2*1], r3d
- mov [r0+r2*2], r3d
- RET
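The same DC averaging as the larger blocks, at quarter scale: four top plus four left pixels, rounded and shifted by 3, and the resulting byte replicated across each row with the 0x01010101 multiply instead of a shuffle. Scalar sketch (naming mine):

#include <stdint.h>
#include <string.h>

/* Scalar model of pred4x4_dc_8: dc = (4 top + 4 left pixels + 4) >> 3. */
static void pred4x4_dc_c(uint8_t *src, int stride)
{
    int sum = 4;                                 /* rounding term */
    for (int i = 0; i < 4; i++)
        sum += src[i - stride] + src[i * stride - 1];
    int dc = sum >> 3;
    for (int y = 0; y < 4; y++)
        memset(src + y * stride, dc, 4);
}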
-
-;-----------------------------------------------------------------------------
-; void pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
-;-----------------------------------------------------------------------------
-
-%macro PRED4x4_TM 0
-cglobal pred4x4_tm_vp8_8, 3,6
- sub r0, r2
- pxor mm7, mm7
- movd mm0, [r0]
- punpcklbw mm0, mm7
- movzx r4d, byte [r0-1]
- mov r5d, 2
-.loop:
- movzx r1d, byte [r0+r2*1-1]
- movzx r3d, byte [r0+r2*2-1]
- sub r1d, r4d
- sub r3d, r4d
- movd mm2, r1d
- movd mm4, r3d
-%if cpuflag(mmxext)
- pshufw mm2, mm2, 0
- pshufw mm4, mm4, 0
-%else
- punpcklwd mm2, mm2
- punpcklwd mm4, mm4
- punpckldq mm2, mm2
- punpckldq mm4, mm4
-%endif
- paddw mm2, mm0
- paddw mm4, mm0
- packuswb mm2, mm2
- packuswb mm4, mm4
- movd [r0+r2*1], mm2
- movd [r0+r2*2], mm4
- lea r0, [r0+r2*2]
- dec r5d
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmx
-PRED4x4_TM
-INIT_MMX mmxext
-PRED4x4_TM
-
-INIT_XMM ssse3
-cglobal pred4x4_tm_vp8_8, 3,3
- sub r0, r2
- movq mm6, [tm_shuf]
- pxor mm1, mm1
- movd mm0, [r0]
- punpcklbw mm0, mm1
- movd mm7, [r0-4]
- pshufb mm7, mm6
- lea r1, [r0+r2*2]
- movd mm2, [r0+r2*1-4]
- movd mm3, [r0+r2*2-4]
- movd mm4, [r1+r2*1-4]
- movd mm5, [r1+r2*2-4]
- pshufb mm2, mm6
- pshufb mm3, mm6
- pshufb mm4, mm6
- pshufb mm5, mm6
- psubw mm0, mm7
- paddw mm2, mm0
- paddw mm3, mm0
- paddw mm4, mm0
- paddw mm5, mm0
- packuswb mm2, mm2
- packuswb mm3, mm3
- packuswb mm4, mm4
- packuswb mm5, mm5
- movd [r0+r2*1], mm2
- movd [r0+r2*2], mm3
- movd [r1+r2*1], mm4
- movd [r1+r2*2], mm5
- RET
-
-;-----------------------------------------------------------------------------
-; void pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmxext
-cglobal pred4x4_vertical_vp8_8, 3,3
- sub r0, r2
- movd m1, [r0-1]
- movd m0, [r0]
- mova m2, m0 ;t0 t1 t2 t3
- punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
- lea r1, [r0+r2*2]
- psrlq m0, 8 ;t1 t2 t3 t4
- PRED4x4_LOWPASS m3, m1, m0, m2, m4
- movd [r0+r2*1], m3
- movd [r0+r2*2], m3
- movd [r1+r2*1], m3
- movd [r1+r2*2], m3
- RET
-
-;-----------------------------------------------------------------------------
-; void pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
-;-----------------------------------------------------------------------------
-INIT_MMX mmxext
-cglobal pred4x4_down_left_8, 3,3
- sub r0, r2
- movq m1, [r0]
- punpckldq m1, [r1]
- movq m2, m1
- movq m3, m1
- psllq m1, 8
- pxor m2, m1
- psrlq m2, 8
- pxor m2, m3
- PRED4x4_LOWPASS m0, m1, m2, m3, m4
- lea r1, [r0+r2*2]
- psrlq m0, 8
- movd [r0+r2*1], m0
- psrlq m0, 8
- movd [r0+r2*2], m0
- psrlq m0, 8
- movd [r1+r2*1], m0
- psrlq m0, 8
- movd [r1+r2*2], m0
- RET
-
-;-----------------------------------------------------------------------------
-; void pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmxext
-cglobal pred4x4_vertical_left_8, 3,3
- sub r0, r2
- movq m1, [r0]
- punpckldq m1, [r1]
- movq m3, m1
- movq m2, m1
- psrlq m3, 8
- psrlq m2, 16
- movq m4, m3
- pavgb m4, m1
- PRED4x4_LOWPASS m0, m1, m2, m3, m5
- lea r1, [r0+r2*2]
- movh [r0+r2*1], m4
- movh [r0+r2*2], m0
- psrlq m4, 8
- psrlq m0, 8
- movh [r1+r2*1], m4
- movh [r1+r2*2], m0
- RET
-
-;-----------------------------------------------------------------------------
-; void pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmxext
-cglobal pred4x4_horizontal_up_8, 3,3
- sub r0, r2
- lea r1, [r0+r2*2]
- movd m0, [r0+r2*1-4]
- punpcklbw m0, [r0+r2*2-4]
- movd m1, [r1+r2*1-4]
- punpcklbw m1, [r1+r2*2-4]
- punpckhwd m0, m1
- movq m1, m0
- punpckhbw m1, m1
- pshufw m1, m1, 0xFF
- punpckhdq m0, m1
- movq m2, m0
- movq m3, m0
- movq m7, m0
- psrlq m2, 16
- psrlq m3, 8
- pavgb m7, m3
- PRED4x4_LOWPASS m4, m0, m2, m3, m5
- punpcklbw m7, m4
- movd [r0+r2*1], m7
- psrlq m7, 16
- movd [r0+r2*2], m7
- psrlq m7, 16
- movd [r1+r2*1], m7
- movd [r1+r2*2], m1
- RET
-
-;-----------------------------------------------------------------------------
-; void pred4x4_horizontal_down_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmxext
-cglobal pred4x4_horizontal_down_8, 3,3
- sub r0, r2
- lea r1, [r0+r2*2]
- movh m0, [r0-4] ; lt ..
- punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
- psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
- movd m1, [r1+r2*2-4] ; l3
- punpcklbw m1, [r1+r2*1-4] ; l2 l3
- movd m2, [r0+r2*2-4] ; l1
- punpcklbw m2, [r0+r2*1-4] ; l0 l1
- punpckhwd m1, m2 ; l0 l1 l2 l3
- punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
- movq m0, m1
- movq m2, m1
- movq m5, m1
- psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
- psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
- pavgb m5, m2
- PRED4x4_LOWPASS m3, m1, m0, m2, m4
- punpcklbw m5, m3
- psrlq m3, 32
- PALIGNR m3, m5, 6, m4
- movh [r1+r2*2], m5
- psrlq m5, 16
- movh [r1+r2*1], m5
- psrlq m5, 16
- movh [r0+r2*2], m5
- movh [r0+r2*1], m3
- RET
-
-;-----------------------------------------------------------------------------
-; void pred4x4_vertical_right_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmxext
-cglobal pred4x4_vertical_right_8, 3,3
- sub r0, r2
- lea r1, [r0+r2*2]
- movh m0, [r0] ; ........t3t2t1t0
- movq m5, m0
- PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
- pavgb m5, m0
- PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
- movq m1, m0
- PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
- movq m2, m0
- PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
- PRED4x4_LOWPASS m3, m1, m0, m2, m4
- movq m1, m3
- psrlq m3, 16
- psllq m1, 48
- movh [r0+r2*1], m5
- movh [r0+r2*2], m3
- PALIGNR m5, m1, 7, m2
- psllq m1, 8
- movh [r1+r2*1], m5
- PALIGNR m3, m1, 7, m1
- movh [r1+r2*2], m3
- RET
-
-;-----------------------------------------------------------------------------
-; void pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmxext
-cglobal pred4x4_down_right_8, 3,3
- sub r0, r2
- lea r1, [r0+r2*2]
- movq m1, [r1-8]
- movq m2, [r0+r2*1-8]
- punpckhbw m2, [r0-8]
- movh m3, [r0]
- punpckhwd m1, m2
- PALIGNR m3, m1, 5, m1
- movq m1, m3
- PALIGNR m3, [r1+r2*1-8], 7, m4
- movq m2, m3
- PALIGNR m3, [r1+r2*2-8], 7, m4
- PRED4x4_LOWPASS m0, m3, m1, m2, m4
- movh [r1+r2*2], m0
- psrlq m0, 8
- movh [r1+r2*1], m0
- psrlq m0, 8
- movh [r0+r2*2], m0
- psrlq m0, 8
- movh [r0+r2*1], m0
- RET
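All of the directional predictors in the file above are built on the 3-tap lowpass filter from the H.264 spec, out = (left + 2*center + right + 2) >> 2, which the PRED4x4_LOWPASS macro evaluates with packed adds, a shift and a rounding average. As a rough scalar reference only (not the deleted implementation; the helper names lowpass() and pred4x4_down_right_ref() and the plain byte layout are illustrative), the 8-bit down-right predictor can be sketched in C as:

#include <stdint.h>

/* 3-tap lowpass used throughout H.264 intra prediction:
 * out = (left + 2*center + right + 2) >> 2 */
static inline uint8_t lowpass(int l, int c, int r)
{
    return (uint8_t)((l + 2 * c + r + 2) >> 2);
}

/* Scalar sketch of 4x4 diagonal down-right prediction (8-bit):
 * src points at the top-left pixel of the block; the row above
 * and the column to the left are assumed available. */
static void pred4x4_down_right_ref(uint8_t *src, int stride)
{
    const uint8_t *top = src - stride;
    int lt   = top[-1];
    int t[4] = { top[0], top[1], top[2], top[3] };
    int l[4] = { src[-1], src[stride - 1],
                 src[2 * stride - 1], src[3 * stride - 1] };

    /* filtered samples along the l3..lt..t3 edge;
     * pixel (x,y) of the block takes e[x - y + 3] */
    int e[7] = {
        lowpass(l[1], l[2], l[3]),   /* x - y == -3 */
        lowpass(l[0], l[1], l[2]),
        lowpass(lt,   l[0], l[1]),
        lowpass(t[0], lt,   l[0]),   /* main diagonal */
        lowpass(t[1], t[0], lt),
        lowpass(t[2], t[1], t[0]),
        lowpass(t[3], t[2], t[1]),   /* x - y == +3 */
    };

    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            src[y * stride + x] = (uint8_t)e[x - y + 3];
}

The SIMD routines above compute the same filtered edge in a single register and then store successively shifted 4-pixel windows of it, which is why each movh/movd store is followed by a psrlq or psrldq of one pixel.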
diff --git a/ffmpeg/libavcodec/x86/h264_intrapred_10bit.asm b/ffmpeg/libavcodec/x86/h264_intrapred_10bit.asm
deleted file mode 100644
index 54eaee5..0000000
--- a/ffmpeg/libavcodec/x86/h264_intrapred_10bit.asm
+++ /dev/null
@@ -1,1199 +0,0 @@
-;*****************************************************************************
-;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
-;*****************************************************************************
-;* Copyright (C) 2005-2011 x264 project
-;*
-;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-cextern pw_16
-cextern pw_8
-cextern pw_4
-cextern pw_2
-cextern pw_1
-
-pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
-pw_m3: times 8 dw -3
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-pw_512: times 8 dw 512
-pd_17: times 4 dd 17
-pd_16: times 4 dd 16
-
-SECTION .text
-
-; dest, left, right, src
-; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
-%macro PRED4x4_LOWPASS 4
- paddw %2, %3
- psrlw %2, 1
- pavgw %1, %4, %2
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void pred4x4_down_right(pixel *src, const pixel *topright, int stride)
-;-----------------------------------------------------------------------------
-%macro PRED4x4_DR 0
-cglobal pred4x4_down_right_10, 3, 3
- sub r0, r2
- lea r1, [r0+r2*2]
- movhps m1, [r1-8]
- movhps m2, [r0+r2*1-8]
- movhps m4, [r0-8]
- punpckhwd m2, m4
- movq m3, [r0]
- punpckhdq m1, m2
- PALIGNR m3, m1, 10, m1
- movhps m4, [r1+r2*1-8]
- PALIGNR m0, m3, m4, 14, m4
- movhps m4, [r1+r2*2-8]
- PALIGNR m2, m0, m4, 14, m4
- PRED4x4_LOWPASS m0, m2, m3, m0
- movq [r1+r2*2], m0
- psrldq m0, 2
- movq [r1+r2*1], m0
- psrldq m0, 2
- movq [r0+r2*2], m0
- psrldq m0, 2
- movq [r0+r2*1], m0
- RET
-%endmacro
-
-INIT_XMM sse2
-PRED4x4_DR
-INIT_XMM ssse3
-PRED4x4_DR
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-PRED4x4_DR
-%endif
-
-;-----------------------------------------------------------------------------
-; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
-;-----------------------------------------------------------------------------
-%macro PRED4x4_VR 0
-cglobal pred4x4_vertical_right_10, 3, 3, 6
- sub r0, r2
- lea r1, [r0+r2*2]
- movq m5, [r0] ; ........t3t2t1t0
- movhps m1, [r0-8]
- PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
- pavgw m5, m0
- movhps m1, [r0+r2*1-8]
- PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
- movhps m2, [r0+r2*2-8]
- PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
- movhps m3, [r1+r2*1-8]
- PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
- PRED4x4_LOWPASS m1, m0, m2, m1
- pslldq m0, m1, 12
- psrldq m1, 4
- movq [r0+r2*1], m5
- movq [r0+r2*2], m1
- PALIGNR m5, m0, 14, m2
- pslldq m0, 2
- movq [r1+r2*1], m5
- PALIGNR m1, m0, 14, m0
- movq [r1+r2*2], m1
- RET
-%endmacro
-
-INIT_XMM sse2
-PRED4x4_VR
-INIT_XMM ssse3
-PRED4x4_VR
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-PRED4x4_VR
-%endif
-
-;-----------------------------------------------------------------------------
-; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
-;-----------------------------------------------------------------------------
-%macro PRED4x4_HD 0
-cglobal pred4x4_horizontal_down_10, 3, 3
- sub r0, r2
- lea r1, [r0+r2*2]
- movq m0, [r0-8] ; lt ..
- movhps m0, [r0]
- pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
- movq m1, [r1+r2*2-8] ; l3
- movq m3, [r1+r2*1-8]
- punpcklwd m1, m3 ; l2 l3
- movq m2, [r0+r2*2-8] ; l1
- movq m3, [r0+r2*1-8]
- punpcklwd m2, m3 ; l0 l1
- punpckhdq m1, m2 ; l0 l1 l2 l3
- punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
- psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
- psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
- pavgw m5, m1, m3
- PRED4x4_LOWPASS m3, m1, m0, m3
- punpcklwd m5, m3
- psrldq m3, 8
- PALIGNR m3, m5, 12, m4
- movq [r1+r2*2], m5
- movhps [r0+r2*2], m5
- psrldq m5, 4
- movq [r1+r2*1], m5
- movq [r0+r2*1], m3
- RET
-%endmacro
-
-INIT_XMM sse2
-PRED4x4_HD
-INIT_XMM ssse3
-PRED4x4_HD
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-PRED4x4_HD
-%endif
-
-;-----------------------------------------------------------------------------
-; void pred4x4_dc(pixel *src, const pixel *topright, int stride)
-;-----------------------------------------------------------------------------
-%macro HADDD 2 ; sum junk
-%if mmsize == 16
- movhlps %2, %1
- paddd %1, %2
- pshuflw %2, %1, 0xE
- paddd %1, %2
-%else
- pshufw %2, %1, 0xE
- paddd %1, %2
-%endif
-%endmacro
-
-%macro HADDW 2
- pmaddwd %1, [pw_1]
- HADDD %1, %2
-%endmacro
-
-INIT_MMX mmxext
-cglobal pred4x4_dc_10, 3, 3
- sub r0, r2
- lea r1, [r0+r2*2]
- movq m2, [r0+r2*1-8]
- paddw m2, [r0+r2*2-8]
- paddw m2, [r1+r2*1-8]
- paddw m2, [r1+r2*2-8]
- psrlq m2, 48
- movq m0, [r0]
- HADDW m0, m1
- paddw m0, [pw_4]
- paddw m0, m2
- psrlw m0, 3
- SPLATW m0, m0, 0
- movq [r0+r2*1], m0
- movq [r0+r2*2], m0
- movq [r1+r2*1], m0
- movq [r1+r2*2], m0
- RET
-
-;-----------------------------------------------------------------------------
-; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
-;-----------------------------------------------------------------------------
-%macro PRED4x4_DL 0
-cglobal pred4x4_down_left_10, 3, 3
- sub r0, r2
- movq m0, [r0]
- movhps m0, [r1]
- psrldq m2, m0, 2
- pslldq m3, m0, 2
- pshufhw m2, m2, 10100100b
- PRED4x4_LOWPASS m0, m3, m2, m0
- lea r1, [r0+r2*2]
- movhps [r1+r2*2], m0
- psrldq m0, 2
- movq [r0+r2*1], m0
- psrldq m0, 2
- movq [r0+r2*2], m0
- psrldq m0, 2
- movq [r1+r2*1], m0
- RET
-%endmacro
-
-INIT_XMM sse2
-PRED4x4_DL
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-PRED4x4_DL
-%endif
-
-;-----------------------------------------------------------------------------
-; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
-;-----------------------------------------------------------------------------
-%macro PRED4x4_VL 0
-cglobal pred4x4_vertical_left_10, 3, 3
- sub r0, r2
- movu m1, [r0]
- movhps m1, [r1]
- psrldq m0, m1, 2
- psrldq m2, m1, 4
- pavgw m4, m0, m1
- PRED4x4_LOWPASS m0, m1, m2, m0
- lea r1, [r0+r2*2]
- movq [r0+r2*1], m4
- movq [r0+r2*2], m0
- psrldq m4, 2
- psrldq m0, 2
- movq [r1+r2*1], m4
- movq [r1+r2*2], m0
- RET
-%endmacro
-
-INIT_XMM sse2
-PRED4x4_VL
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-PRED4x4_VL
-%endif
-
-;-----------------------------------------------------------------------------
-; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
-;-----------------------------------------------------------------------------
-INIT_MMX mmxext
-cglobal pred4x4_horizontal_up_10, 3, 3
- sub r0, r2
- lea r1, [r0+r2*2]
- movq m0, [r0+r2*1-8]
- punpckhwd m0, [r0+r2*2-8]
- movq m1, [r1+r2*1-8]
- punpckhwd m1, [r1+r2*2-8]
- punpckhdq m0, m1
- pshufw m1, m1, 0xFF
- movq [r1+r2*2], m1
- movd [r1+r2*1+4], m1
- pshufw m2, m0, 11111001b
- movq m1, m2
- pavgw m2, m0
-
- pshufw m5, m0, 11111110b
- PRED4x4_LOWPASS m1, m0, m5, m1
- movq m6, m2
- punpcklwd m6, m1
- movq [r0+r2*1], m6
- psrlq m2, 16
- psrlq m1, 16
- punpcklwd m2, m1
- movq [r0+r2*2], m2
- psrlq m2, 32
- movd [r1+r2*1], m2
- RET
-
-
-
-;-----------------------------------------------------------------------------
-; void pred8x8_vertical(pixel *src, int stride)
-;-----------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal pred8x8_vertical_10, 2, 2
- sub r0, r1
- mova m0, [r0]
-%rep 3
- mova [r0+r1*1], m0
- mova [r0+r1*2], m0
- lea r0, [r0+r1*2]
-%endrep
- mova [r0+r1*1], m0
- mova [r0+r1*2], m0
- RET
-
-;-----------------------------------------------------------------------------
-; void pred8x8_horizontal(pixel *src, int stride)
-;-----------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal pred8x8_horizontal_10, 2, 3
- mov r2d, 4
-.loop:
- movq m0, [r0+r1*0-8]
- movq m1, [r0+r1*1-8]
- pshuflw m0, m0, 0xff
- pshuflw m1, m1, 0xff
- punpcklqdq m0, m0
- punpcklqdq m1, m1
- mova [r0+r1*0], m0
- mova [r0+r1*1], m1
- lea r0, [r0+r1*2]
- dec r2d
- jg .loop
- REP_RET
-
-;-----------------------------------------------------------------------------
-; void predict_8x8_dc(pixel *src, int stride)
-;-----------------------------------------------------------------------------
-%macro MOV8 2-3
-; sort of a hack, but it works
-%if mmsize==8
- movq [%1+0], %2
- movq [%1+8], %3
-%else
- movdqa [%1], %2
-%endif
-%endmacro
-
-%macro PRED8x8_DC 1
-cglobal pred8x8_dc_10, 2, 6
- sub r0, r1
- pxor m4, m4
- movq m0, [r0+0]
- movq m1, [r0+8]
-%if mmsize==16
- punpcklwd m0, m1
- movhlps m1, m0
- paddw m0, m1
-%else
- pshufw m2, m0, 00001110b
- pshufw m3, m1, 00001110b
- paddw m0, m2
- paddw m1, m3
- punpcklwd m0, m1
-%endif
- %1 m2, m0, 00001110b
- paddw m0, m2
-
- lea r5, [r1*3]
- lea r4, [r0+r1*4]
- movzx r2d, word [r0+r1*1-2]
- movzx r3d, word [r0+r1*2-2]
- add r2d, r3d
- movzx r3d, word [r0+r5*1-2]
- add r2d, r3d
- movzx r3d, word [r4-2]
- add r2d, r3d
- movd m2, r2d ; s2
-
- movzx r2d, word [r4+r1*1-2]
- movzx r3d, word [r4+r1*2-2]
- add r2d, r3d
- movzx r3d, word [r4+r5*1-2]
- add r2d, r3d
- movzx r3d, word [r4+r1*4-2]
- add r2d, r3d
- movd m3, r2d ; s3
-
- punpcklwd m2, m3
- punpckldq m0, m2 ; s0, s1, s2, s3
- %1 m3, m0, 11110110b ; s2, s1, s3, s3
- %1 m0, m0, 01110100b ; s0, s1, s3, s1
- paddw m0, m3
- psrlw m0, 2
- pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
-%if mmsize==16
- punpcklwd m0, m0
- pshufd m3, m0, 11111010b
- punpckldq m0, m0
- SWAP 0,1
-%else
- pshufw m1, m0, 0x00
- pshufw m2, m0, 0x55
- pshufw m3, m0, 0xaa
- pshufw m4, m0, 0xff
-%endif
- MOV8 r0+r1*1, m1, m2
- MOV8 r0+r1*2, m1, m2
- MOV8 r0+r5*1, m1, m2
- MOV8 r0+r1*4, m1, m2
- MOV8 r4+r1*1, m3, m4
- MOV8 r4+r1*2, m3, m4
- MOV8 r4+r5*1, m3, m4
- MOV8 r4+r1*4, m3, m4
- RET
-%endmacro
-
-INIT_MMX mmxext
-PRED8x8_DC pshufw
-INIT_XMM sse2
-PRED8x8_DC pshuflw
-
-;-----------------------------------------------------------------------------
-; void pred8x8_top_dc(pixel *src, int stride)
-;-----------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal pred8x8_top_dc_10, 2, 4
- sub r0, r1
- mova m0, [r0]
- pshuflw m1, m0, 0x4e
- pshufhw m1, m1, 0x4e
- paddw m0, m1
- pshuflw m1, m0, 0xb1
- pshufhw m1, m1, 0xb1
- paddw m0, m1
- lea r2, [r1*3]
- lea r3, [r0+r1*4]
- paddw m0, [pw_2]
- psrlw m0, 2
- mova [r0+r1*1], m0
- mova [r0+r1*2], m0
- mova [r0+r2*1], m0
- mova [r0+r1*4], m0
- mova [r3+r1*1], m0
- mova [r3+r1*2], m0
- mova [r3+r2*1], m0
- mova [r3+r1*4], m0
- RET
-
-;-----------------------------------------------------------------------------
-; void pred8x8_plane(pixel *src, int stride)
-;-----------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal pred8x8_plane_10, 2, 7, 7
- sub r0, r1
- lea r2, [r1*3]
- lea r3, [r0+r1*4]
- mova m2, [r0]
- pmaddwd m2, [pw_m32101234]
- HADDD m2, m1
- movd m0, [r0-4]
- psrld m0, 14
- psubw m2, m0 ; H
- movd m0, [r3+r1*4-4]
- movd m1, [r0+12]
- paddw m0, m1
- psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
- movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
- movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
- sub r4d, r5d
- movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
- movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
- sub r6d, r5d
- lea r4d, [r4+r6*2]
- movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
- movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
- sub r5d, r6d
- lea r5d, [r5*3]
- add r4d, r5d
- movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
- movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
- sub r6d, r5d
- lea r4d, [r4+r6*4]
- movd m3, r4d ; V
- punpckldq m2, m3
- pmaddwd m2, [pd_17]
- paddd m2, [pd_16]
- psrad m2, 5 ; b, c
-
- mova m3, [pw_pixel_max]
- pxor m1, m1
- SPLATW m0, m0, 1
- SPLATW m4, m2, 2
- SPLATW m2, m2, 0
- pmullw m2, [pw_m32101234] ; b
- pmullw m5, m4, [pw_m3] ; c
- paddw m5, [pw_16]
- mov r2d, 8
- add r0, r1
-.loop:
- paddsw m6, m2, m5
- paddsw m6, m0
- psraw m6, 5
- CLIPW m6, m1, m3
- mova [r0], m6
- paddw m5, m4
- add r0, r1
- dec r2d
- jg .loop
- REP_RET
-
-
-;-----------------------------------------------------------------------------
-; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
-;-----------------------------------------------------------------------------
-%macro PRED8x8L_128_DC 0
-cglobal pred8x8l_128_dc_10, 4, 4
- mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
- lea r1, [r3*3]
- lea r2, [r0+r3*4]
- MOV8 r0+r3*0, m0, m0
- MOV8 r0+r3*1, m0, m0
- MOV8 r0+r3*2, m0, m0
- MOV8 r0+r1*1, m0, m0
- MOV8 r2+r3*0, m0, m0
- MOV8 r2+r3*1, m0, m0
- MOV8 r2+r3*2, m0, m0
- MOV8 r2+r1*1, m0, m0
- RET
-%endmacro
-
-INIT_MMX mmxext
-PRED8x8L_128_DC
-INIT_XMM sse2
-PRED8x8L_128_DC
-
-;-----------------------------------------------------------------------------
-; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
-;-----------------------------------------------------------------------------
-%macro PRED8x8L_TOP_DC 0
-cglobal pred8x8l_top_dc_10, 4, 4, 6
- sub r0, r3
- mova m0, [r0]
- shr r1d, 14
- shr r2d, 13
- neg r1
- pslldq m1, m0, 2
- psrldq m2, m0, 2
- pinsrw m1, [r0+r1], 0
- pinsrw m2, [r0+r2+14], 7
- lea r1, [r3*3]
- lea r2, [r0+r3*4]
- PRED4x4_LOWPASS m0, m2, m1, m0
- HADDW m0, m1
- paddw m0, [pw_4]
- psrlw m0, 3
- SPLATW m0, m0, 0
- mova [r0+r3*1], m0
- mova [r0+r3*2], m0
- mova [r0+r1*1], m0
- mova [r0+r3*4], m0
- mova [r2+r3*1], m0
- mova [r2+r3*2], m0
- mova [r2+r1*1], m0
- mova [r2+r3*4], m0
- RET
-%endmacro
-
-INIT_XMM sse2
-PRED8x8L_TOP_DC
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-PRED8x8L_TOP_DC
-%endif
-
-;-----------------------------------------------------------------------------
-;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
-;-----------------------------------------------------------------------------
-;TODO: see if scalar is faster
-%macro PRED8x8L_DC 0
-cglobal pred8x8l_dc_10, 4, 6, 6
- sub r0, r3
- lea r4, [r0+r3*4]
- lea r5, [r3*3]
- mova m0, [r0+r3*2-16]
- punpckhwd m0, [r0+r3*1-16]
- mova m1, [r4+r3*0-16]
- punpckhwd m1, [r0+r5*1-16]
- punpckhdq m1, m0
- mova m2, [r4+r3*2-16]
- punpckhwd m2, [r4+r3*1-16]
- mova m3, [r4+r3*4-16]
- punpckhwd m3, [r4+r5*1-16]
- punpckhdq m3, m2
- punpckhqdq m3, m1
- mova m0, [r0]
- shr r1d, 14
- shr r2d, 13
- neg r1
- pslldq m1, m0, 2
- psrldq m2, m0, 2
- pinsrw m1, [r0+r1], 0
- pinsrw m2, [r0+r2+14], 7
- not r1
- and r1, r3
- pslldq m4, m3, 2
- psrldq m5, m3, 2
- pshuflw m4, m4, 11100101b
- pinsrw m5, [r0+r1-2], 7
- PRED4x4_LOWPASS m3, m4, m5, m3
- PRED4x4_LOWPASS m0, m2, m1, m0
- paddw m0, m3
- HADDW m0, m1
- paddw m0, [pw_8]
- psrlw m0, 4
- SPLATW m0, m0
- mova [r0+r3*1], m0
- mova [r0+r3*2], m0
- mova [r0+r5*1], m0
- mova [r0+r3*4], m0
- mova [r4+r3*1], m0
- mova [r4+r3*2], m0
- mova [r4+r5*1], m0
- mova [r4+r3*4], m0
- RET
-%endmacro
-
-INIT_XMM sse2
-PRED8x8L_DC
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-PRED8x8L_DC
-%endif
-
-;-----------------------------------------------------------------------------
-; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
-;-----------------------------------------------------------------------------
-%macro PRED8x8L_VERTICAL 0
-cglobal pred8x8l_vertical_10, 4, 4, 6
- sub r0, r3
- mova m0, [r0]
- shr r1d, 14
- shr r2d, 13
- neg r1
- pslldq m1, m0, 2
- psrldq m2, m0, 2
- pinsrw m1, [r0+r1], 0
- pinsrw m2, [r0+r2+14], 7
- lea r1, [r3*3]
- lea r2, [r0+r3*4]
- PRED4x4_LOWPASS m0, m2, m1, m0
- mova [r0+r3*1], m0
- mova [r0+r3*2], m0
- mova [r0+r1*1], m0
- mova [r0+r3*4], m0
- mova [r2+r3*1], m0
- mova [r2+r3*2], m0
- mova [r2+r1*1], m0
- mova [r2+r3*4], m0
- RET
-%endmacro
-
-INIT_XMM sse2
-PRED8x8L_VERTICAL
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-PRED8x8L_VERTICAL
-%endif
-
-;-----------------------------------------------------------------------------
-; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
-;-----------------------------------------------------------------------------
-%macro PRED8x8L_HORIZONTAL 0
-cglobal pred8x8l_horizontal_10, 4, 4, 5
- mova m0, [r0-16]
- shr r1d, 14
- dec r1
- and r1, r3
- sub r1, r3
- punpckhwd m0, [r0+r1-16]
- mova m1, [r0+r3*2-16]
- punpckhwd m1, [r0+r3*1-16]
- lea r2, [r0+r3*4]
- lea r1, [r3*3]
- punpckhdq m1, m0
- mova m2, [r2+r3*0-16]
- punpckhwd m2, [r0+r1-16]
- mova m3, [r2+r3*2-16]
- punpckhwd m3, [r2+r3*1-16]
- punpckhdq m3, m2
- punpckhqdq m3, m1
- PALIGNR m4, m3, [r2+r1-16], 14, m0
- pslldq m0, m4, 2
- pshuflw m0, m0, 11100101b
- PRED4x4_LOWPASS m4, m3, m0, m4
- punpckhwd m3, m4, m4
- punpcklwd m4, m4
- pshufd m0, m3, 0xff
- pshufd m1, m3, 0xaa
- pshufd m2, m3, 0x55
- pshufd m3, m3, 0x00
- mova [r0+r3*0], m0
- mova [r0+r3*1], m1
- mova [r0+r3*2], m2
- mova [r0+r1*1], m3
- pshufd m0, m4, 0xff
- pshufd m1, m4, 0xaa
- pshufd m2, m4, 0x55
- pshufd m3, m4, 0x00
- mova [r2+r3*0], m0
- mova [r2+r3*1], m1
- mova [r2+r3*2], m2
- mova [r2+r1*1], m3
- RET
-%endmacro
-
-INIT_XMM sse2
-PRED8x8L_HORIZONTAL
-INIT_XMM ssse3
-PRED8x8L_HORIZONTAL
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-PRED8x8L_HORIZONTAL
-%endif
-
-;-----------------------------------------------------------------------------
-;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
-;-----------------------------------------------------------------------------
-%macro PRED8x8L_DOWN_LEFT 0
-cglobal pred8x8l_down_left_10, 4, 4, 7
- sub r0, r3
- mova m3, [r0]
- shr r1d, 14
- neg r1
- shr r2d, 13
- pslldq m1, m3, 2
- psrldq m2, m3, 2
- pinsrw m1, [r0+r1], 0
- pinsrw m2, [r0+r2+14], 7
- PRED4x4_LOWPASS m6, m2, m1, m3
- jz .fix_tr ; flags from shr r2d
- mova m1, [r0+16]
- psrldq m5, m1, 2
- PALIGNR m2, m1, m3, 14, m3
- pshufhw m5, m5, 10100100b
- PRED4x4_LOWPASS m1, m2, m5, m1
-.do_topright:
- lea r1, [r3*3]
- psrldq m5, m1, 14
- lea r2, [r0+r3*4]
- PALIGNR m2, m1, m6, 2, m0
- PALIGNR m3, m1, m6, 14, m0
- PALIGNR m5, m1, 2, m0
- pslldq m4, m6, 2
- PRED4x4_LOWPASS m6, m4, m2, m6
- PRED4x4_LOWPASS m1, m3, m5, m1
- mova [r2+r3*4], m1
- PALIGNR m1, m6, 14, m2
- pslldq m6, 2
- mova [r2+r1*1], m1
- PALIGNR m1, m6, 14, m2
- pslldq m6, 2
- mova [r2+r3*2], m1
- PALIGNR m1, m6, 14, m2
- pslldq m6, 2
- mova [r2+r3*1], m1
- PALIGNR m1, m6, 14, m2
- pslldq m6, 2
- mova [r0+r3*4], m1
- PALIGNR m1, m6, 14, m2
- pslldq m6, 2
- mova [r0+r1*1], m1
- PALIGNR m1, m6, 14, m2
- pslldq m6, 2
- mova [r0+r3*2], m1
- PALIGNR m1, m6, 14, m6
- mova [r0+r3*1], m1
- RET
-.fix_tr:
- punpckhwd m3, m3
- pshufd m1, m3, 0xFF
- jmp .do_topright
-%endmacro
-
-INIT_XMM sse2
-PRED8x8L_DOWN_LEFT
-INIT_XMM ssse3
-PRED8x8L_DOWN_LEFT
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-PRED8x8L_DOWN_LEFT
-%endif
-
-;-----------------------------------------------------------------------------
-;void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
-;-----------------------------------------------------------------------------
-%macro PRED8x8L_DOWN_RIGHT 0
-; the H.264 standard forbids this mode when has_topleft is false,
-; so there is no need to check it here
-cglobal pred8x8l_down_right_10, 4, 5, 8
- sub r0, r3
- lea r4, [r0+r3*4]
- lea r1, [r3*3]
- mova m0, [r0+r3*1-16]
- punpckhwd m0, [r0+r3*0-16]
- mova m1, [r0+r1*1-16]
- punpckhwd m1, [r0+r3*2-16]
- punpckhdq m1, m0
- mova m2, [r4+r3*1-16]
- punpckhwd m2, [r4+r3*0-16]
- mova m3, [r4+r1*1-16]
- punpckhwd m3, [r4+r3*2-16]
- punpckhdq m3, m2
- punpckhqdq m3, m1
- mova m0, [r4+r3*4-16]
- mova m1, [r0]
- PALIGNR m4, m3, m0, 14, m0
- PALIGNR m1, m3, 2, m2
- pslldq m0, m4, 2
- pshuflw m0, m0, 11100101b
- PRED4x4_LOWPASS m6, m1, m4, m3
- PRED4x4_LOWPASS m4, m3, m0, m4
- mova m3, [r0]
- shr r2d, 13
- pslldq m1, m3, 2
- psrldq m2, m3, 2
- pinsrw m1, [r0-2], 0
- pinsrw m2, [r0+r2+14], 7
- PRED4x4_LOWPASS m3, m2, m1, m3
- PALIGNR m2, m3, m6, 2, m0
- PALIGNR m5, m3, m6, 14, m0
- psrldq m7, m3, 2
- PRED4x4_LOWPASS m6, m4, m2, m6
- PRED4x4_LOWPASS m3, m5, m7, m3
- mova [r4+r3*4], m6
- PALIGNR m3, m6, 14, m2
- pslldq m6, 2
- mova [r0+r3*1], m3
- PALIGNR m3, m6, 14, m2
- pslldq m6, 2
- mova [r0+r3*2], m3
- PALIGNR m3, m6, 14, m2
- pslldq m6, 2
- mova [r0+r1*1], m3
- PALIGNR m3, m6, 14, m2
- pslldq m6, 2
- mova [r0+r3*4], m3
- PALIGNR m3, m6, 14, m2
- pslldq m6, 2
- mova [r4+r3*1], m3
- PALIGNR m3, m6, 14, m2
- pslldq m6, 2
- mova [r4+r3*2], m3
- PALIGNR m3, m6, 14, m6
- mova [r4+r1*1], m3
- RET
-%endmacro
-
-INIT_XMM sse2
-PRED8x8L_DOWN_RIGHT
-INIT_XMM ssse3
-PRED8x8L_DOWN_RIGHT
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-PRED8x8L_DOWN_RIGHT
-%endif
-
-;-----------------------------------------------------------------------------
-; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
-;-----------------------------------------------------------------------------
-%macro PRED8x8L_VERTICAL_RIGHT 0
-; as with pred8x8l_down_right above, has_topleft is guaranteed by the standard
-cglobal pred8x8l_vertical_right_10, 4, 5, 7
- sub r0, r3
- lea r4, [r0+r3*4]
- lea r1, [r3*3]
- mova m0, [r0+r3*1-16]
- punpckhwd m0, [r0+r3*0-16]
- mova m1, [r0+r1*1-16]
- punpckhwd m1, [r0+r3*2-16]
- punpckhdq m1, m0
- mova m2, [r4+r3*1-16]
- punpckhwd m2, [r4+r3*0-16]
- mova m3, [r4+r1*1-16]
- punpckhwd m3, [r4+r3*2-16]
- punpckhdq m3, m2
- punpckhqdq m3, m1
- mova m0, [r4+r3*4-16]
- mova m1, [r0]
- PALIGNR m4, m3, m0, 14, m0
- PALIGNR m1, m3, 2, m2
- PRED4x4_LOWPASS m3, m1, m4, m3
- mova m2, [r0]
- shr r2d, 13
- pslldq m1, m2, 2
- psrldq m5, m2, 2
- pinsrw m1, [r0-2], 0
- pinsrw m5, [r0+r2+14], 7
- PRED4x4_LOWPASS m2, m5, m1, m2
- PALIGNR m6, m2, m3, 12, m1
- PALIGNR m5, m2, m3, 14, m0
- PRED4x4_LOWPASS m0, m6, m2, m5
- pavgw m2, m5
- mova [r0+r3*2], m0
- mova [r0+r3*1], m2
- pslldq m6, m3, 4
- pslldq m1, m3, 2
- PRED4x4_LOWPASS m1, m3, m6, m1
- PALIGNR m2, m1, 14, m4
- mova [r0+r1*1], m2
- pslldq m1, 2
- PALIGNR m0, m1, 14, m3
- mova [r0+r3*4], m0
- pslldq m1, 2
- PALIGNR m2, m1, 14, m4
- mova [r4+r3*1], m2
- pslldq m1, 2
- PALIGNR m0, m1, 14, m3
- mova [r4+r3*2], m0
- pslldq m1, 2
- PALIGNR m2, m1, 14, m4
- mova [r4+r1*1], m2
- pslldq m1, 2
- PALIGNR m0, m1, 14, m1
- mova [r4+r3*4], m0
- RET
-%endmacro
-
-INIT_XMM sse2
-PRED8x8L_VERTICAL_RIGHT
-INIT_XMM ssse3
-PRED8x8L_VERTICAL_RIGHT
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-PRED8x8L_VERTICAL_RIGHT
-%endif
-
-;-----------------------------------------------------------------------------
-; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
-;-----------------------------------------------------------------------------
-%macro PRED8x8L_HORIZONTAL_UP 0
-cglobal pred8x8l_horizontal_up_10, 4, 4, 6
- mova m0, [r0+r3*0-16]
- punpckhwd m0, [r0+r3*1-16]
- shr r1d, 14
- dec r1
- and r1, r3
- sub r1, r3
- mova m4, [r0+r1*1-16]
- lea r1, [r3*3]
- lea r2, [r0+r3*4]
- mova m1, [r0+r3*2-16]
- punpckhwd m1, [r0+r1*1-16]
- punpckhdq m0, m1
- mova m2, [r2+r3*0-16]
- punpckhwd m2, [r2+r3*1-16]
- mova m3, [r2+r3*2-16]
- punpckhwd m3, [r2+r1*1-16]
- punpckhdq m2, m3
- punpckhqdq m0, m2
- PALIGNR m1, m0, m4, 14, m4
- psrldq m2, m0, 2
- pshufhw m2, m2, 10100100b
- PRED4x4_LOWPASS m0, m1, m2, m0
- psrldq m1, m0, 2
- psrldq m2, m0, 4
- pshufhw m1, m1, 10100100b
- pshufhw m2, m2, 01010100b
- pavgw m4, m0, m1
- PRED4x4_LOWPASS m1, m2, m0, m1
- punpckhwd m5, m4, m1
- punpcklwd m4, m1
- mova [r2+r3*0], m5
- mova [r0+r3*0], m4
- pshufd m0, m5, 11111001b
- pshufd m1, m5, 11111110b
- pshufd m2, m5, 11111111b
- mova [r2+r3*1], m0
- mova [r2+r3*2], m1
- mova [r2+r1*1], m2
- PALIGNR m2, m5, m4, 4, m0
- PALIGNR m3, m5, m4, 8, m1
- PALIGNR m5, m5, m4, 12, m4
- mova [r0+r3*1], m2
- mova [r0+r3*2], m3
- mova [r0+r1*1], m5
- RET
-%endmacro
-
-INIT_XMM sse2
-PRED8x8L_HORIZONTAL_UP
-INIT_XMM ssse3
-PRED8x8L_HORIZONTAL_UP
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-PRED8x8L_HORIZONTAL_UP
-%endif
-
-
-;-----------------------------------------------------------------------------
-; void pred16x16_vertical(pixel *src, int stride)
-;-----------------------------------------------------------------------------
-%macro MOV16 3-5
- mova [%1+ 0], %2
- mova [%1+mmsize], %3
-%if mmsize==8
- mova [%1+ 16], %4
- mova [%1+ 24], %5
-%endif
-%endmacro
-
-%macro PRED16x16_VERTICAL 0
-cglobal pred16x16_vertical_10, 2, 3
- sub r0, r1
- mov r2d, 8
- mova m0, [r0+ 0]
- mova m1, [r0+mmsize]
-%if mmsize==8
- mova m2, [r0+16]
- mova m3, [r0+24]
-%endif
-.loop:
- MOV16 r0+r1*1, m0, m1, m2, m3
- MOV16 r0+r1*2, m0, m1, m2, m3
- lea r0, [r0+r1*2]
- dec r2d
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-PRED16x16_VERTICAL
-INIT_XMM sse2
-PRED16x16_VERTICAL
-
-;-----------------------------------------------------------------------------
-; void pred16x16_horizontal(pixel *src, int stride)
-;-----------------------------------------------------------------------------
-%macro PRED16x16_HORIZONTAL 0
-cglobal pred16x16_horizontal_10, 2, 3
- mov r2d, 8
-.vloop:
- movd m0, [r0+r1*0-4]
- movd m1, [r0+r1*1-4]
- SPLATW m0, m0, 1
- SPLATW m1, m1, 1
- MOV16 r0+r1*0, m0, m0, m0, m0
- MOV16 r0+r1*1, m1, m1, m1, m1
- lea r0, [r0+r1*2]
- dec r2d
- jg .vloop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-PRED16x16_HORIZONTAL
-INIT_XMM sse2
-PRED16x16_HORIZONTAL
-
-;-----------------------------------------------------------------------------
-; void pred16x16_dc(pixel *src, int stride)
-;-----------------------------------------------------------------------------
-%macro PRED16x16_DC 0
-cglobal pred16x16_dc_10, 2, 6
- mov r5, r0
- sub r0, r1
- mova m0, [r0+0]
- paddw m0, [r0+mmsize]
-%if mmsize==8
- paddw m0, [r0+16]
- paddw m0, [r0+24]
-%endif
- HADDW m0, m2
-
- lea r0, [r0+r1-2]
- movzx r3d, word [r0]
- movzx r4d, word [r0+r1]
-%rep 7
- lea r0, [r0+r1*2]
- movzx r2d, word [r0]
- add r3d, r2d
- movzx r2d, word [r0+r1]
- add r4d, r2d
-%endrep
- lea r3d, [r3+r4+16]
-
- movd m1, r3d
- paddw m0, m1
- psrlw m0, 5
- SPLATW m0, m0
- mov r3d, 8
-.loop:
- MOV16 r5+r1*0, m0, m0, m0, m0
- MOV16 r5+r1*1, m0, m0, m0, m0
- lea r5, [r5+r1*2]
- dec r3d
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-PRED16x16_DC
-INIT_XMM sse2
-PRED16x16_DC
-
-;-----------------------------------------------------------------------------
-; void pred16x16_top_dc(pixel *src, int stride)
-;-----------------------------------------------------------------------------
-%macro PRED16x16_TOP_DC 0
-cglobal pred16x16_top_dc_10, 2, 3
- sub r0, r1
- mova m0, [r0+0]
- paddw m0, [r0+mmsize]
-%if mmsize==8
- paddw m0, [r0+16]
- paddw m0, [r0+24]
-%endif
- HADDW m0, m2
-
- SPLATW m0, m0
- paddw m0, [pw_8]
- psrlw m0, 4
- mov r2d, 8
-.loop:
- MOV16 r0+r1*1, m0, m0, m0, m0
- MOV16 r0+r1*2, m0, m0, m0, m0
- lea r0, [r0+r1*2]
- dec r2d
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-PRED16x16_TOP_DC
-INIT_XMM sse2
-PRED16x16_TOP_DC
-
-;-----------------------------------------------------------------------------
-; void pred16x16_left_dc(pixel *src, int stride)
-;-----------------------------------------------------------------------------
-%macro PRED16x16_LEFT_DC 0
-cglobal pred16x16_left_dc_10, 2, 6
- mov r5, r0
-
- sub r0, 2
- movzx r3d, word [r0]
- movzx r4d, word [r0+r1]
-%rep 7
- lea r0, [r0+r1*2]
- movzx r2d, word [r0]
- add r3d, r2d
- movzx r2d, word [r0+r1]
- add r4d, r2d
-%endrep
- lea r3d, [r3+r4+8]
- shr r3d, 4
-
- movd m0, r3d
- SPLATW m0, m0
- mov r3d, 8
-.loop:
- MOV16 r5+r1*0, m0, m0, m0, m0
- MOV16 r5+r1*1, m0, m0, m0, m0
- lea r5, [r5+r1*2]
- dec r3d
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-PRED16x16_LEFT_DC
-INIT_XMM sse2
-PRED16x16_LEFT_DC
-
-;-----------------------------------------------------------------------------
-; void pred16x16_128_dc(pixel *src, int stride)
-;-----------------------------------------------------------------------------
-%macro PRED16x16_128_DC 0
-cglobal pred16x16_128_dc_10, 2,3
- mova m0, [pw_512]
- mov r2d, 8
-.loop:
- MOV16 r0+r1*0, m0, m0, m0, m0
- MOV16 r0+r1*1, m0, m0, m0, m0
- lea r0, [r0+r1*2]
- dec r2d
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-PRED16x16_128_DC
-INIT_XMM sse2
-PRED16x16_128_DC
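The 10-bit routines in this file operate on 16-bit samples but follow the same arithmetic as the 8-bit versions; pred4x4_dc_10 above, for instance, sums the four samples above and the four to the left of the block, rounds, and fills the block with the average. A minimal scalar sketch, assuming plain uint16_t storage and an illustrative function name (the unused topright argument of the real prototype is omitted):

#include <stdint.h>
#include <stddef.h>

/* Scalar sketch of 4x4 DC prediction at 10 bits:
 * dc = (4 top samples + 4 left samples + 4) >> 3 */
static void pred4x4_dc_10_ref(uint16_t *src, ptrdiff_t stride)
{
    const uint16_t *top = src - stride;
    unsigned sum = 4;                  /* rounding term */
    for (int i = 0; i < 4; i++) {
        sum += top[i];                 /* row above the block */
        sum += src[i * stride - 1];    /* column left of the block */
    }
    uint16_t dc = (uint16_t)(sum >> 3);
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            src[y * stride + x] = dc;
}

The depth-specific parts are mostly the wider data type and constants such as pw_512 and pw_pixel_max, which encode (1 << (BIT_DEPTH-1)) and the 10-bit clip ceiling.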
diff --git a/ffmpeg/libavcodec/x86/h264_intrapred_init.c b/ffmpeg/libavcodec/x86/h264_intrapred_init.c
deleted file mode 100644
index ad2984b..0000000
--- a/ffmpeg/libavcodec/x86/h264_intrapred_init.c
+++ /dev/null
@@ -1,402 +0,0 @@
-/*
- * Copyright (c) 2010 Jason Garrett-Glaser
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/avcodec.h"
-#include "libavcodec/h264pred.h"
-
-#define PRED4x4(TYPE, DEPTH, OPT) \
-void ff_pred4x4_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
- const uint8_t *topright, \
- ptrdiff_t stride);
-
-PRED4x4(dc, 10, mmxext)
-PRED4x4(down_left, 10, sse2)
-PRED4x4(down_left, 10, avx)
-PRED4x4(down_right, 10, sse2)
-PRED4x4(down_right, 10, ssse3)
-PRED4x4(down_right, 10, avx)
-PRED4x4(vertical_left, 10, sse2)
-PRED4x4(vertical_left, 10, avx)
-PRED4x4(vertical_right, 10, sse2)
-PRED4x4(vertical_right, 10, ssse3)
-PRED4x4(vertical_right, 10, avx)
-PRED4x4(horizontal_up, 10, mmxext)
-PRED4x4(horizontal_down, 10, sse2)
-PRED4x4(horizontal_down, 10, ssse3)
-PRED4x4(horizontal_down, 10, avx)
-
-#define PRED8x8(TYPE, DEPTH, OPT) \
-void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
- ptrdiff_t stride);
-
-PRED8x8(dc, 10, mmxext)
-PRED8x8(dc, 10, sse2)
-PRED8x8(top_dc, 10, sse2)
-PRED8x8(plane, 10, sse2)
-PRED8x8(vertical, 10, sse2)
-PRED8x8(horizontal, 10, sse2)
-
-#define PRED8x8L(TYPE, DEPTH, OPT)\
-void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
- int has_topleft, \
- int has_topright, \
- ptrdiff_t stride);
-
-PRED8x8L(dc, 10, sse2)
-PRED8x8L(dc, 10, avx)
-PRED8x8L(128_dc, 10, mmxext)
-PRED8x8L(128_dc, 10, sse2)
-PRED8x8L(top_dc, 10, sse2)
-PRED8x8L(top_dc, 10, avx)
-PRED8x8L(vertical, 10, sse2)
-PRED8x8L(vertical, 10, avx)
-PRED8x8L(horizontal, 10, sse2)
-PRED8x8L(horizontal, 10, ssse3)
-PRED8x8L(horizontal, 10, avx)
-PRED8x8L(down_left, 10, sse2)
-PRED8x8L(down_left, 10, ssse3)
-PRED8x8L(down_left, 10, avx)
-PRED8x8L(down_right, 10, sse2)
-PRED8x8L(down_right, 10, ssse3)
-PRED8x8L(down_right, 10, avx)
-PRED8x8L(vertical_right, 10, sse2)
-PRED8x8L(vertical_right, 10, ssse3)
-PRED8x8L(vertical_right, 10, avx)
-PRED8x8L(horizontal_up, 10, sse2)
-PRED8x8L(horizontal_up, 10, ssse3)
-PRED8x8L(horizontal_up, 10, avx)
-
-#define PRED16x16(TYPE, DEPTH, OPT)\
-void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
- ptrdiff_t stride);
-
-PRED16x16(dc, 10, mmxext)
-PRED16x16(dc, 10, sse2)
-PRED16x16(top_dc, 10, mmxext)
-PRED16x16(top_dc, 10, sse2)
-PRED16x16(128_dc, 10, mmxext)
-PRED16x16(128_dc, 10, sse2)
-PRED16x16(left_dc, 10, mmxext)
-PRED16x16(left_dc, 10, sse2)
-PRED16x16(vertical, 10, mmxext)
-PRED16x16(vertical, 10, sse2)
-PRED16x16(horizontal, 10, mmxext)
-PRED16x16(horizontal, 10, sse2)
-
-/* 8-bit versions */
-PRED16x16(vertical, 8, mmx)
-PRED16x16(vertical, 8, sse)
-PRED16x16(horizontal, 8, mmx)
-PRED16x16(horizontal, 8, mmxext)
-PRED16x16(horizontal, 8, ssse3)
-PRED16x16(dc, 8, mmxext)
-PRED16x16(dc, 8, sse2)
-PRED16x16(dc, 8, ssse3)
-PRED16x16(plane_h264, 8, mmx)
-PRED16x16(plane_h264, 8, mmxext)
-PRED16x16(plane_h264, 8, sse2)
-PRED16x16(plane_h264, 8, ssse3)
-PRED16x16(plane_rv40, 8, mmx)
-PRED16x16(plane_rv40, 8, mmxext)
-PRED16x16(plane_rv40, 8, sse2)
-PRED16x16(plane_rv40, 8, ssse3)
-PRED16x16(plane_svq3, 8, mmx)
-PRED16x16(plane_svq3, 8, mmxext)
-PRED16x16(plane_svq3, 8, sse2)
-PRED16x16(plane_svq3, 8, ssse3)
-PRED16x16(tm_vp8, 8, mmx)
-PRED16x16(tm_vp8, 8, mmxext)
-PRED16x16(tm_vp8, 8, sse2)
-
-PRED8x8(top_dc, 8, mmxext)
-PRED8x8(dc_rv40, 8, mmxext)
-PRED8x8(dc, 8, mmxext)
-PRED8x8(vertical, 8, mmx)
-PRED8x8(horizontal, 8, mmx)
-PRED8x8(horizontal, 8, mmxext)
-PRED8x8(horizontal, 8, ssse3)
-PRED8x8(plane, 8, mmx)
-PRED8x8(plane, 8, mmxext)
-PRED8x8(plane, 8, sse2)
-PRED8x8(plane, 8, ssse3)
-PRED8x8(tm_vp8, 8, mmx)
-PRED8x8(tm_vp8, 8, mmxext)
-PRED8x8(tm_vp8, 8, sse2)
-PRED8x8(tm_vp8, 8, ssse3)
-
-PRED8x8L(top_dc, 8, mmxext)
-PRED8x8L(top_dc, 8, ssse3)
-PRED8x8L(dc, 8, mmxext)
-PRED8x8L(dc, 8, ssse3)
-PRED8x8L(horizontal, 8, mmxext)
-PRED8x8L(horizontal, 8, ssse3)
-PRED8x8L(vertical, 8, mmxext)
-PRED8x8L(vertical, 8, ssse3)
-PRED8x8L(down_left, 8, mmxext)
-PRED8x8L(down_left, 8, sse2)
-PRED8x8L(down_left, 8, ssse3)
-PRED8x8L(down_right, 8, mmxext)
-PRED8x8L(down_right, 8, sse2)
-PRED8x8L(down_right, 8, ssse3)
-PRED8x8L(vertical_right, 8, mmxext)
-PRED8x8L(vertical_right, 8, sse2)
-PRED8x8L(vertical_right, 8, ssse3)
-PRED8x8L(vertical_left, 8, sse2)
-PRED8x8L(vertical_left, 8, ssse3)
-PRED8x8L(horizontal_up, 8, mmxext)
-PRED8x8L(horizontal_up, 8, ssse3)
-PRED8x8L(horizontal_down, 8, mmxext)
-PRED8x8L(horizontal_down, 8, sse2)
-PRED8x8L(horizontal_down, 8, ssse3)
-
-PRED4x4(dc, 8, mmxext)
-PRED4x4(down_left, 8, mmxext)
-PRED4x4(down_right, 8, mmxext)
-PRED4x4(vertical_left, 8, mmxext)
-PRED4x4(vertical_right, 8, mmxext)
-PRED4x4(horizontal_up, 8, mmxext)
-PRED4x4(horizontal_down, 8, mmxext)
-PRED4x4(tm_vp8, 8, mmx)
-PRED4x4(tm_vp8, 8, mmxext)
-PRED4x4(tm_vp8, 8, ssse3)
-PRED4x4(vertical_vp8, 8, mmxext)
-
-av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
- const int bit_depth,
- const int chroma_format_idc)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (bit_depth == 8) {
- if (EXTERNAL_MMX(cpu_flags)) {
- h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_8_mmx;
- h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmx;
- if (chroma_format_idc == 1) {
- h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_mmx;
- h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmx;
- }
- if (codec_id == AV_CODEC_ID_VP8) {
- h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmx;
- h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmx;
- h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmx;
- } else {
- if (chroma_format_idc == 1)
- h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmx;
- if (codec_id == AV_CODEC_ID_SVQ3) {
- if (cpu_flags & AV_CPU_FLAG_CMOV)
- h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_mmx;
- } else if (codec_id == AV_CODEC_ID_RV40) {
- h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_mmx;
- } else {
- h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_mmx;
- }
- }
- }
-
- if (EXTERNAL_MMXEXT(cpu_flags)) {
- h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmxext;
- h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_mmxext;
- if (chroma_format_idc == 1)
- h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmxext;
- h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_mmxext;
- h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_mmxext;
- h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_mmxext;
- h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_mmxext;
- h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_mmxext;
- h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_mmxext;
- h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_mmxext;
- h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_mmxext;
- h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_mmxext;
- h->pred4x4 [DIAG_DOWN_RIGHT_PRED ] = ff_pred4x4_down_right_8_mmxext;
- h->pred4x4 [VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_8_mmxext;
- h->pred4x4 [HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_8_mmxext;
- h->pred4x4 [DC_PRED ] = ff_pred4x4_dc_8_mmxext;
- if (codec_id == AV_CODEC_ID_VP8 || codec_id == AV_CODEC_ID_H264) {
- h->pred4x4 [DIAG_DOWN_LEFT_PRED] = ff_pred4x4_down_left_8_mmxext;
- }
- if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
- h->pred4x4 [VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_8_mmxext;
- }
- if (codec_id != AV_CODEC_ID_RV40) {
- h->pred4x4 [HOR_UP_PRED ] = ff_pred4x4_horizontal_up_8_mmxext;
- }
- if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
- if (chroma_format_idc == 1) {
- h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_8_mmxext;
- h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_8_mmxext;
- }
- }
- if (codec_id == AV_CODEC_ID_VP8) {
- h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmxext;
- h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_rv40_8_mmxext;
- h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmxext;
- h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmxext;
- h->pred4x4 [VERT_PRED ] = ff_pred4x4_vertical_vp8_8_mmxext;
- } else {
- if (chroma_format_idc == 1)
- h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmxext;
- if (codec_id == AV_CODEC_ID_SVQ3) {
- h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_svq3_8_mmxext;
- } else if (codec_id == AV_CODEC_ID_RV40) {
- h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_rv40_8_mmxext;
- } else {
- h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_h264_8_mmxext;
- }
- }
- }
-
- if (EXTERNAL_SSE(cpu_flags)) {
- h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_sse;
- }
-
- if (EXTERNAL_SSE2(cpu_flags)) {
- h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_sse2;
- h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_sse2;
- h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_sse2;
- h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_sse2;
- h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_sse2;
- h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_sse2;
- if (codec_id == AV_CODEC_ID_VP8) {
- h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_sse2;
- h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_sse2;
- } else {
- if (chroma_format_idc == 1)
- h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_sse2;
- if (codec_id == AV_CODEC_ID_SVQ3) {
- h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_sse2;
- } else if (codec_id == AV_CODEC_ID_RV40) {
- h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_sse2;
- } else {
- h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_sse2;
- }
- }
- }
-
- if (EXTERNAL_SSSE3(cpu_flags)) {
- h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_ssse3;
- h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_ssse3;
- if (chroma_format_idc == 1)
- h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_ssse3;
- h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_ssse3;
- h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_ssse3;
- h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_ssse3;
- h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_ssse3;
- h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_ssse3;
- h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_ssse3;
- h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_ssse3;
- h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_ssse3;
- h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_ssse3;
- h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_ssse3;
- if (codec_id == AV_CODEC_ID_VP8) {
- h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_ssse3;
- h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_ssse3;
- } else {
- if (chroma_format_idc == 1)
- h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_ssse3;
- if (codec_id == AV_CODEC_ID_SVQ3) {
- h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_ssse3;
- } else if (codec_id == AV_CODEC_ID_RV40) {
- h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_ssse3;
- } else {
- h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_ssse3;
- }
- }
- }
- } else if (bit_depth == 10) {
- if (EXTERNAL_MMXEXT(cpu_flags)) {
- h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext;
- h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext;
-
- if (chroma_format_idc == 1)
- h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext;
-
- h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext;
-
- h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_mmxext;
- h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_mmxext;
- h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_mmxext;
- h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_mmxext;
- h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_mmxext;
- h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_mmxext;
- }
- if (EXTERNAL_SSE2(cpu_flags)) {
- h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2;
- h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_sse2;
- h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_sse2;
- h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_sse2;
- h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_sse2;
-
- if (chroma_format_idc == 1) {
- h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_sse2;
- h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2;
- h->pred8x8[PLANE_PRED8x8 ] = ff_pred8x8_plane_10_sse2;
- h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vertical_10_sse2;
- h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_10_sse2;
- }
-
- h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_sse2;
- h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_sse2;
- h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_sse2;
- h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_sse2;
- h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_sse2;
- h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2;
- h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2;
- h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_sse2;
- h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_sse2;
-
- h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_sse2;
- h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_sse2;
- h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_sse2;
- h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_sse2;
- h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_sse2;
- h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_sse2;
- }
- if (EXTERNAL_SSSE3(cpu_flags)) {
- h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_ssse3;
- h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3;
- h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_ssse3;
-
- h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_ssse3;
- h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3;
- h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3;
- h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_ssse3;
- h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_ssse3;
- }
- if (EXTERNAL_AVX(cpu_flags)) {
- h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx;
- h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx;
- h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_avx;
- h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_avx;
- h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_avx;
-
- h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_avx;
- h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_avx;
- h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_avx;
- h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_avx;
- h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx;
- h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx;
- h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_avx;
- h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_avx;
- }
- }
-}
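The init function above follows FFmpeg's usual dispatch pattern: it runs once per codec context and overwrites entries of the generic C prediction tables with the best assembly version the CPU supports, with later (newer instruction set) checks taking precedence over earlier ones. A minimal sketch of how a caller then goes through the table; the wrapper name and buffer handling are illustrative, while mode indices such as DC_PRED come from h264pred.h:

#include <stdint.h>
#include <stddef.h>
#include "libavcodec/h264pred.h"

/* Hypothetical helper: predict one 4x4 block through the dispatch
 * table filled in by ff_h264_pred_init_x86() (normally reached via
 * the generic prediction init). The same call resolves to
 * ff_pred4x4_dc_8_mmxext, a 10-bit SSE2 version, or the C fallback
 * depending on bit depth and CPU flags. */
static void predict_dc_4x4(H264PredContext *h, uint8_t *src,
                           const uint8_t *topright, ptrdiff_t stride)
{
    h->pred4x4[DC_PRED](src, topright, stride);
}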
diff --git a/ffmpeg/libavcodec/x86/h264_qpel.c b/ffmpeg/libavcodec/x86/h264_qpel.c
deleted file mode 100644
index fd6068f..0000000
--- a/ffmpeg/libavcodec/x86/h264_qpel.c
+++ /dev/null
@@ -1,634 +0,0 @@
-/*
- * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
- * Copyright (c) 2011 Daniel Kang
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/h264qpel.h"
-#include "libavcodec/mpegvideo.h"
-#include "dsputil_x86.h"
-
-#if HAVE_YASM
-void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_put_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
- int dstStride, int src1Stride, int h);
-void ff_avg_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
- int dstStride, int src1Stride, int h);
-void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
- int dstStride, int src1Stride, int h);
-void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
- int dstStride, int src1Stride, int h);
-void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
- int dstStride, int src1Stride, int h);
-void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
- int dstStride, int src1Stride, int h);
-#define ff_put_pixels8_l2_sse2 ff_put_pixels8_l2_mmxext
-#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext
-#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
-#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
-
-PIXELS16(static, ff_avg, , , _mmxext)
-PIXELS16(static, ff_put, , , _mmxext)
-
-#define DEF_QPEL(OPNAME)\
-void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
-void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
-void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
-void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
-void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
-void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
-void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
-void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_op_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\
-void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\
-void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(uint8_t *src, int16_t *tmp, int srcStride);\
-void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\
-void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_mmxext(uint8_t *src, int16_t *tmp, int srcStride, int size);\
-void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(uint8_t *src, int16_t *tmp, int srcStride, int size);\
-void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\
-void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\
-void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);\
-void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);
-
-DEF_QPEL(avg)
-DEF_QPEL(put)
-
-#define QPEL_H264(OPNAME, OP, MMX)\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- int w=3;\
- src -= 2*srcStride+2;\
- while(w--){\
- ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
- tmp += 4;\
- src += 4;\
- }\
- tmp -= 3*4;\
- ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
-}\
-\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
- src -= 2*srcStride;\
- ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
- src += 4;\
- dst += 4;\
- ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
-}\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
- int w = (size+8)>>2;\
- src -= 2*srcStride+2;\
- while(w--){\
- ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_op_mmxext(src, tmp, srcStride, size);\
- tmp += 4;\
- src += 4;\
- }\
-}\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
- int w = size>>4;\
- do{\
- ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\
- tmp += 8;\
- dst += 8;\
- }while(w--);\
-}\
-\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
-}\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
- ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
-}\
-\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
- src += 8*srcStride;\
- dst += 8*dstStride;\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
-}\
-\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
- src += 8*dstStride;\
- dst += 8*dstStride;\
- src2 += 8*src2Stride;\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
-}\
-\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
- ff_put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
- ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
-}\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
-}\
-\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
-}\
-\
-static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
-{\
- ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
- ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
-}\
-
-
-#if ARCH_X86_64
-#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
-
-void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);
-void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);
-
-#else // ARCH_X86_64
-#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
- src += 8*dstStride;\
- dst += 8*dstStride;\
- src2 += 8*src2Stride;\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
-}
-#endif // ARCH_X86_64
-
-#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
-QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
- src += 8*srcStride;\
- dst += 8*dstStride;\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
-}\
-
-#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
-}\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
- ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
-}
-
-static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp,
- uint8_t *src,
- int tmpStride,
- int srcStride,
- int size)
-{
- int w = (size+8)>>3;
- src -= 2*srcStride+2;
- while(w--){
- ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size);
- tmp += 8;
- src += 8;
- }
-}
-
-#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
- put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
- ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
-}\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
-}\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
-}\
-
-#define ff_put_h264_qpel8_h_lowpass_l2_sse2 ff_put_h264_qpel8_h_lowpass_l2_mmxext
-#define ff_avg_h264_qpel8_h_lowpass_l2_sse2 ff_avg_h264_qpel8_h_lowpass_l2_mmxext
-#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext
-#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext
-
-#define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2
-#define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2
-#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
-#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2
-
-#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
-#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
-
-#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
-H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
-H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
-H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
-H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
-
-static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src,
- ptrdiff_t stride)
-{
- ff_put_pixels16_sse2(dst, src, stride, 16);
-}
-static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src,
- ptrdiff_t stride)
-{
- ff_avg_pixels16_sse2(dst, src, stride, 16);
-}
-#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext
-#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext
-
-#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
-}\
-
-#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
-}\
-
-#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
- ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
- ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
- ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
- ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
-}\
-
-#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
- ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
- ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
- ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
- ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
- ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
- ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
- ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
- ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
- ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
- uint8_t * const halfHV= temp;\
- int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- av_assert2(((int)temp & 7) == 0);\
- ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
- ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
- uint8_t * const halfHV= temp;\
- int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- av_assert2(((int)temp & 7) == 0);\
- ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
- ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
- uint8_t * const halfHV= temp;\
- int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- av_assert2(((int)temp & 7) == 0);\
- ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
- ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
-{\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
- uint8_t * const halfHV= temp;\
- int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- av_assert2(((int)temp & 7) == 0);\
- ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
- ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
-}\
-
-#define H264_MC_4816(MMX)\
-H264_MC(put_, 4, MMX, 8)\
-H264_MC(put_, 8, MMX, 8)\
-H264_MC(put_, 16,MMX, 8)\
-H264_MC(avg_, 4, MMX, 8)\
-H264_MC(avg_, 8, MMX, 8)\
-H264_MC(avg_, 16,MMX, 8)\
-
-#define H264_MC_816(QPEL, XMM)\
-QPEL(put_, 8, XMM, 16)\
-QPEL(put_, 16,XMM, 16)\
-QPEL(avg_, 8, XMM, 16)\
-QPEL(avg_, 16,XMM, 16)\
-
-QPEL_H264(put_, PUT_OP, mmxext)
-QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext)
-QPEL_H264_V_XMM(put_, PUT_OP, sse2)
-QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2)
-QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
-QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2)
-QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
-QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
-QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
-QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
-
-H264_MC_4816(mmxext)
-H264_MC_816(H264_MC_V, sse2)
-H264_MC_816(H264_MC_HV, sse2)
-H264_MC_816(H264_MC_H, ssse3)
-H264_MC_816(H264_MC_HV, ssse3)
-
-
-//10bit
-#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
-void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
- (uint8_t *dst, uint8_t *src, ptrdiff_t stride);
-
-#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \
- LUMA_MC_OP(put, 4, DEPTH, TYPE, OPT) \
- LUMA_MC_OP(avg, 4, DEPTH, TYPE, OPT) \
- LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
- LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
- LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
- LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
-
-#define LUMA_MC_816(DEPTH, TYPE, OPT) \
- LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
- LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
- LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
- LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
-
-LUMA_MC_ALL(10, mc00, mmxext)
-LUMA_MC_ALL(10, mc10, mmxext)
-LUMA_MC_ALL(10, mc20, mmxext)
-LUMA_MC_ALL(10, mc30, mmxext)
-LUMA_MC_ALL(10, mc01, mmxext)
-LUMA_MC_ALL(10, mc11, mmxext)
-LUMA_MC_ALL(10, mc21, mmxext)
-LUMA_MC_ALL(10, mc31, mmxext)
-LUMA_MC_ALL(10, mc02, mmxext)
-LUMA_MC_ALL(10, mc12, mmxext)
-LUMA_MC_ALL(10, mc22, mmxext)
-LUMA_MC_ALL(10, mc32, mmxext)
-LUMA_MC_ALL(10, mc03, mmxext)
-LUMA_MC_ALL(10, mc13, mmxext)
-LUMA_MC_ALL(10, mc23, mmxext)
-LUMA_MC_ALL(10, mc33, mmxext)
-
-LUMA_MC_816(10, mc00, sse2)
-LUMA_MC_816(10, mc10, sse2)
-LUMA_MC_816(10, mc10, sse2_cache64)
-LUMA_MC_816(10, mc10, ssse3_cache64)
-LUMA_MC_816(10, mc20, sse2)
-LUMA_MC_816(10, mc20, sse2_cache64)
-LUMA_MC_816(10, mc20, ssse3_cache64)
-LUMA_MC_816(10, mc30, sse2)
-LUMA_MC_816(10, mc30, sse2_cache64)
-LUMA_MC_816(10, mc30, ssse3_cache64)
-LUMA_MC_816(10, mc01, sse2)
-LUMA_MC_816(10, mc11, sse2)
-LUMA_MC_816(10, mc21, sse2)
-LUMA_MC_816(10, mc31, sse2)
-LUMA_MC_816(10, mc02, sse2)
-LUMA_MC_816(10, mc12, sse2)
-LUMA_MC_816(10, mc22, sse2)
-LUMA_MC_816(10, mc32, sse2)
-LUMA_MC_816(10, mc03, sse2)
-LUMA_MC_816(10, mc13, sse2)
-LUMA_MC_816(10, mc23, sse2)
-LUMA_MC_816(10, mc33, sse2)
-
-#define QPEL16_OPMC(OP, MC, MMX)\
-void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride){\
- ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
- ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
- src += 8*stride;\
- dst += 8*stride;\
- ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
- ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
-}
-
-#define QPEL16_OP(MC, MMX)\
-QPEL16_OPMC(put, MC, MMX)\
-QPEL16_OPMC(avg, MC, MMX)
-
-#define QPEL16(MMX)\
-QPEL16_OP(mc00, MMX)\
-QPEL16_OP(mc01, MMX)\
-QPEL16_OP(mc02, MMX)\
-QPEL16_OP(mc03, MMX)\
-QPEL16_OP(mc10, MMX)\
-QPEL16_OP(mc11, MMX)\
-QPEL16_OP(mc12, MMX)\
-QPEL16_OP(mc13, MMX)\
-QPEL16_OP(mc20, MMX)\
-QPEL16_OP(mc21, MMX)\
-QPEL16_OP(mc22, MMX)\
-QPEL16_OP(mc23, MMX)\
-QPEL16_OP(mc30, MMX)\
-QPEL16_OP(mc31, MMX)\
-QPEL16_OP(mc32, MMX)\
-QPEL16_OP(mc33, MMX)
-
-#if ARCH_X86_32 && HAVE_YASM && CONFIG_H264QPEL // ARCH_X86_64 implies SSE2+
-QPEL16(mmxext)
-#endif
-
-#endif /* HAVE_YASM */
-
-#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
- do { \
- c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
- } while (0)
-
-#define H264_QPEL_FUNCS(x, y, CPU) \
- do { \
- c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
- c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
- c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
- c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
- } while (0)
-
-#define H264_QPEL_FUNCS_10(x, y, CPU) \
- do { \
- c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
- c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
- c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
- c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
- } while (0)
-
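The two macros above fill the 4x4 quarter-pel dispatch tables in x/y order: entry [x + y*4] holds the function for fractional offset (x, y). A minimal sketch of how a caller would derive the slot from the fractional part of a motion vector (the wrapper name and the is_16x16 flag are illustrative, not lavc API):

static void call_qpel_sketch(H264QpelContext *c, uint8_t *dst, uint8_t *src,
                             ptrdiff_t stride, int mx, int my, int is_16x16)
{
    /* low two bits of each MV component -> table slot, same layout as x + y*4 */
    int dxy = (mx & 3) + ((my & 3) << 2);
    c->put_h264_qpel_pixels_tab[is_16x16 ? 0 : 1][dxy](dst, src, stride);
}
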
-av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
-{
-#if HAVE_YASM
- int high_bit_depth = bit_depth > 8;
- int cpu_flags = av_get_cpu_flags();
-
- if (EXTERNAL_MMXEXT(cpu_flags)) {
- if (!high_bit_depth) {
- SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
- SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, );
- SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, );
- SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
- SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, );
- SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
- } else if (bit_depth == 10) {
-#if ARCH_X86_32
- SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
- SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
- SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
- SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
-#endif
- SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
- SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
- }
- }
-
- if (EXTERNAL_SSE2(cpu_flags)) {
- if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && !high_bit_depth) {
- // these functions are slower than mmx on AMD, but faster on Intel
- H264_QPEL_FUNCS(0, 0, sse2);
- }
-
- if (!high_bit_depth) {
- H264_QPEL_FUNCS(0, 1, sse2);
- H264_QPEL_FUNCS(0, 2, sse2);
- H264_QPEL_FUNCS(0, 3, sse2);
- H264_QPEL_FUNCS(1, 1, sse2);
- H264_QPEL_FUNCS(1, 2, sse2);
- H264_QPEL_FUNCS(1, 3, sse2);
- H264_QPEL_FUNCS(2, 1, sse2);
- H264_QPEL_FUNCS(2, 2, sse2);
- H264_QPEL_FUNCS(2, 3, sse2);
- H264_QPEL_FUNCS(3, 1, sse2);
- H264_QPEL_FUNCS(3, 2, sse2);
- H264_QPEL_FUNCS(3, 3, sse2);
- }
-
- if (bit_depth == 10) {
- SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
- SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
- SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
- SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
- H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
- H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
- H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
- }
- }
-
- if (EXTERNAL_SSSE3(cpu_flags)) {
- if (!high_bit_depth) {
- H264_QPEL_FUNCS(1, 0, ssse3);
- H264_QPEL_FUNCS(1, 1, ssse3);
- H264_QPEL_FUNCS(1, 2, ssse3);
- H264_QPEL_FUNCS(1, 3, ssse3);
- H264_QPEL_FUNCS(2, 0, ssse3);
- H264_QPEL_FUNCS(2, 1, ssse3);
- H264_QPEL_FUNCS(2, 2, ssse3);
- H264_QPEL_FUNCS(2, 3, ssse3);
- H264_QPEL_FUNCS(3, 0, ssse3);
- H264_QPEL_FUNCS(3, 1, ssse3);
- H264_QPEL_FUNCS(3, 2, ssse3);
- H264_QPEL_FUNCS(3, 3, ssse3);
- }
-
- if (bit_depth == 10) {
- H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
- H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
- H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
- }
- }
-
- if (EXTERNAL_AVX(cpu_flags)) {
- /* AVX implies 64 byte cache lines without the need to avoid unaligned
- * memory accesses that cross the boundary between two cache lines.
- * TODO: Port X264_CPU_CACHELINE_32/64 detection from x264 to avoid
- * having to treat SSE2 functions with such properties as AVX. */
- if (bit_depth == 10) {
- H264_QPEL_FUNCS_10(1, 0, sse2);
- H264_QPEL_FUNCS_10(2, 0, sse2);
- H264_QPEL_FUNCS_10(3, 0, sse2);
- }
- }
-#endif
-}
diff --git a/ffmpeg/libavcodec/x86/h264_qpel_10bit.asm b/ffmpeg/libavcodec/x86/h264_qpel_10bit.asm
deleted file mode 100644
index 4561871..0000000
--- a/ffmpeg/libavcodec/x86/h264_qpel_10bit.asm
+++ /dev/null
@@ -1,884 +0,0 @@
-;*****************************************************************************
-;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
-;*****************************************************************************
-;* Copyright (C) 2011 x264 project
-;*
-;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA 32
-
-cextern pw_16
-cextern pw_1
-cextern pb_0
-
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-
-pad10: times 8 dw 10*1023
-pad20: times 8 dw 20*1023
-pad30: times 8 dw 30*1023
-depad: times 4 dd 32*20*1023 + 512
-depad2: times 8 dw 20*1023 + 16*1022 + 16
-unpad: times 8 dw 16*1022/32 ; needs to be mod 16
-
-tap1: times 4 dw 1, -5
-tap2: times 4 dw 20, 20
-tap3: times 4 dw -5, 1
-pd_0f: times 4 dd 0xffff
-
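The pad/depad constants appear to exist to keep the 6-tap intermediates inside signed 16-bit range for 10-bit input: with samples in [0, 1023] the raw filter output a + f - 5*(b + e) + 20*(c + d) can reach 42*1023, which overflows int16, so pad20 = 20*1023 is subtracted after the vertical pass and depad = 32*20*1023 + 512 (the taps sum to 32, and 512 is the rounding term for the later >> 10) folds the bias back in during the horizontal pass. A worked range check under that reading:

\[
\min = -5\,(1023+1023) = -10230 > -32768, \qquad
\max = 20\,(1023+1023) + 2\cdot 1023 = 42966 > 32767
\]
\[
[-10230,\ 42966] - 20\cdot 1023 = [-30690,\ 22506] \subset [-32768,\ 32767]
\]
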
-SECTION .text
-
-
-%macro AVG_MOV 2
- pavgw %2, %1
- mova %1, %2
-%endmacro
-
-%macro ADDW 3
-%if mmsize == 8
- paddw %1, %2
-%else
- movu %3, %2
- paddw %1, %3
-%endif
-%endmacro
-
-%macro FILT_H 4
- paddw %1, %4
- psubw %1, %2 ; a-b
- psraw %1, 2 ; (a-b)/4
- psubw %1, %2 ; (a-b)/4-b
- paddw %1, %3 ; (a-b)/4-b+c
- psraw %1, 2 ; ((a-b)/4-b+c)/4
- paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-%endmacro
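The step comments in FILT_H track the standard 6-tap identity, which in exact arithmetic reads

\[
\frac{\frac{a-b}{4} - b + c}{4} + c \;=\; \frac{a - 5b + 4c}{16} + c \;=\; \frac{a - 5b + 20c}{16},
\]

where, at the MC20/MC10 call sites below, a, b and c are the symmetric pixel-pair sums (outer, middle and centre taps) and the %4 operand, pw_16 at those sites, folds the rounding bias into a before the shifts; the psraw steps are truncating arithmetic shifts on those biased values.
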
-
-%macro PRELOAD_V 0
- lea r3, [r2*3]
- sub r1, r3
- movu m0, [r1+r2]
- movu m1, [r1+r2*2]
- add r1, r3
- movu m2, [r1]
- movu m3, [r1+r2]
- movu m4, [r1+r2*2]
- add r1, r3
-%endmacro
-
-%macro FILT_V 8
- movu %6, [r1]
- paddw %1, %6
- mova %7, %2
- paddw %7, %5
- mova %8, %3
- paddw %8, %4
- FILT_H %1, %7, %8, [pw_16]
- psraw %1, 1
- CLIPW %1, [pb_0], [pw_pixel_max]
-%endmacro
-
-%macro MC 1
-%define OP_MOV mova
-INIT_MMX mmxext
-%1 put, 4
-INIT_XMM sse2
-%1 put, 8
-
-%define OP_MOV AVG_MOV
-INIT_MMX mmxext
-%1 avg, 4
-INIT_XMM sse2
-%1 avg, 8
-%endmacro
-
-%macro MCAxA_OP 7
-%if ARCH_X86_32
-cglobal %1_h264_qpel%4_%2_10, %5,%6,%7
- call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
- mov r0, r0m
- mov r1, r1m
- add r0, %3*2
- add r1, %3*2
- call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
- mov r0, r0m
- mov r1, r1m
- lea r0, [r0+r2*%3]
- lea r1, [r1+r2*%3]
- call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
- mov r0, r0m
- mov r1, r1m
- lea r0, [r0+r2*%3+%3*2]
- lea r1, [r1+r2*%3+%3*2]
- call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
- RET
-%else ; ARCH_X86_64
-cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
- mov r%6, r0
-%assign p1 %6+1
- mov r %+ p1, r1
- call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
- lea r0, [r%6+%3*2]
- lea r1, [r %+ p1+%3*2]
- call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
- lea r0, [r%6+r2*%3]
- lea r1, [r %+ p1+r2*%3]
- call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
- lea r0, [r%6+r2*%3+%3*2]
- lea r1, [r %+ p1+r2*%3+%3*2]
-%if UNIX64 == 0 ; fall through to function
- call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
- RET
-%endif
-%endif
-%endmacro
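MCAxA_OP stitches a 2Nx2N block out of four calls to the NxN stub: %3*2 is N pixels expressed in bytes (sizeof(pixel) == 2 at 10-bit) and r2 is the byte stride, so the four call sites land on the four quadrants. A rough C equivalent of the expansion, with illustrative names only:

static void mc_2Nx2N_sketch(void (*mc_NxN)(uint8_t *dst, uint8_t *src, ptrdiff_t stride),
                            uint8_t *dst, uint8_t *src, ptrdiff_t stride, int N)
{
    mc_NxN(dst,                  src,                  stride); /* top-left     */
    mc_NxN(dst + 2*N,            src + 2*N,            stride); /* top-right    */
    mc_NxN(dst + N*stride,       src + N*stride,       stride); /* bottom-left  */
    mc_NxN(dst + N*stride + 2*N, src + N*stride + 2*N, stride); /* bottom-right */
}
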
-
-;put/avg, mc, 4/8, ...
-%macro cglobal_mc 6
-%assign i %3*2
-%if ARCH_X86_32 || cpuflag(sse2)
-MCAxA_OP %1, %2, %3, i, %4,%5,%6
-%endif
-
-cglobal %1_h264_qpel%3_%2_10, %4,%5,%6
-%if UNIX64 == 0 ; no prologue or epilogue for UNIX64
- call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
- RET
-%endif
-
-stub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-%macro COPY4 0
- movu m0, [r1 ]
- OP_MOV [r0 ], m0
- movu m0, [r1+r2 ]
- OP_MOV [r0+r2 ], m0
- movu m0, [r1+r2*2]
- OP_MOV [r0+r2*2], m0
- movu m0, [r1+r3 ]
- OP_MOV [r0+r3 ], m0
-%endmacro
-
-%macro MC00 1
-INIT_MMX mmxext
-cglobal_mc %1, mc00, 4, 3,4,0
- lea r3, [r2*3]
- COPY4
- ret
-
-INIT_XMM sse2
-cglobal %1_h264_qpel8_mc00_10, 3,4
- lea r3, [r2*3]
- COPY4
- lea r0, [r0+r2*4]
- lea r1, [r1+r2*4]
- COPY4
- RET
-
-cglobal %1_h264_qpel16_mc00_10, 3,4
- mov r3d, 8
-.loop:
- movu m0, [r1 ]
- movu m1, [r1 +16]
- OP_MOV [r0 ], m0
- OP_MOV [r0 +16], m1
- movu m0, [r1+r2 ]
- movu m1, [r1+r2+16]
- OP_MOV [r0+r2 ], m0
- OP_MOV [r0+r2+16], m1
- lea r0, [r0+r2*2]
- lea r1, [r1+r2*2]
- dec r3d
- jg .loop
- REP_RET
-%endmacro
-
-%define OP_MOV mova
-MC00 put
-
-%define OP_MOV AVG_MOV
-MC00 avg
-
-;-----------------------------------------------------------------------------
-; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-%macro MC_CACHE 1
-%define OP_MOV mova
-INIT_MMX mmxext
-%1 put, 4
-INIT_XMM sse2, cache64
-%1 put, 8
-INIT_XMM ssse3, cache64
-%1 put, 8
-INIT_XMM sse2
-%1 put, 8
-
-%define OP_MOV AVG_MOV
-INIT_MMX mmxext
-%1 avg, 4
-INIT_XMM sse2, cache64
-%1 avg, 8
-INIT_XMM ssse3, cache64
-%1 avg, 8
-INIT_XMM sse2
-%1 avg, 8
-%endmacro
-
-%macro MC20 2
-cglobal_mc %1, mc20, %2, 3,4,9
- mov r3d, %2
- mova m1, [pw_pixel_max]
-%if num_mmregs > 8
- mova m8, [pw_16]
- %define p16 m8
-%else
- %define p16 [pw_16]
-%endif
-.nextrow:
-%if %0 == 4
- movu m2, [r1-4]
- movu m3, [r1-2]
- movu m4, [r1+0]
- ADDW m2, [r1+6], m5
- ADDW m3, [r1+4], m5
- ADDW m4, [r1+2], m5
-%else ; movu is slow on these processors
-%if mmsize==16
- movu m2, [r1-4]
- movu m0, [r1+6]
- mova m6, m0
- psrldq m0, 6
-
- paddw m6, m2
- PALIGNR m3, m0, m2, 2, m5
- PALIGNR m7, m0, m2, 8, m5
- paddw m3, m7
- PALIGNR m4, m0, m2, 4, m5
- PALIGNR m7, m0, m2, 6, m5
- paddw m4, m7
- SWAP 2, 6
-%else
- movu m2, [r1-4]
- movu m6, [r1+4]
- PALIGNR m3, m6, m2, 2, m5
- paddw m3, m6
- PALIGNR m4, m6, m2, 4, m5
- PALIGNR m7, m6, m2, 6, m5
- paddw m4, m7
- paddw m2, [r1+6]
-%endif
-%endif
-
- FILT_H m2, m3, m4, p16
- psraw m2, 1
- pxor m0, m0
- CLIPW m2, m0, m1
- OP_MOV [r0], m2
- add r0, r2
- add r1, r2
- dec r3d
- jg .nextrow
- rep ret
-%endmacro
-
-MC_CACHE MC20
-
-;-----------------------------------------------------------------------------
-; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-%macro MC30 2
-cglobal_mc %1, mc30, %2, 3,5,9
- lea r4, [r1+2]
- jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
-%endmacro
-
-MC_CACHE MC30
-
-;-----------------------------------------------------------------------------
-; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-%macro MC10 2
-cglobal_mc %1, mc10, %2, 3,5,9
- mov r4, r1
-.body:
- mov r3d, %2
- mova m1, [pw_pixel_max]
-%if num_mmregs > 8
- mova m8, [pw_16]
- %define p16 m8
-%else
- %define p16 [pw_16]
-%endif
-.nextrow:
-%if %0 == 4
- movu m2, [r1-4]
- movu m3, [r1-2]
- movu m4, [r1+0]
- ADDW m2, [r1+6], m5
- ADDW m3, [r1+4], m5
- ADDW m4, [r1+2], m5
-%else ; movu is slow on these processors
-%if mmsize==16
- movu m2, [r1-4]
- movu m0, [r1+6]
- mova m6, m0
- psrldq m0, 6
-
- paddw m6, m2
- PALIGNR m3, m0, m2, 2, m5
- PALIGNR m7, m0, m2, 8, m5
- paddw m3, m7
- PALIGNR m4, m0, m2, 4, m5
- PALIGNR m7, m0, m2, 6, m5
- paddw m4, m7
- SWAP 2, 6
-%else
- movu m2, [r1-4]
- movu m6, [r1+4]
- PALIGNR m3, m6, m2, 2, m5
- paddw m3, m6
- PALIGNR m4, m6, m2, 4, m5
- PALIGNR m7, m6, m2, 6, m5
- paddw m4, m7
- paddw m2, [r1+6]
-%endif
-%endif
-
- FILT_H m2, m3, m4, p16
- psraw m2, 1
- pxor m0, m0
- CLIPW m2, m0, m1
- movu m3, [r4]
- pavgw m2, m3
- OP_MOV [r0], m2
- add r0, r2
- add r1, r2
- add r4, r2
- dec r3d
- jg .nextrow
- rep ret
-%endmacro
-
-MC_CACHE MC10
-
-;-----------------------------------------------------------------------------
-; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-%macro V_FILT 10
-v_filt%9_%10_10
- add r4, r2
-.no_addr4:
- FILT_V m0, m1, m2, m3, m4, m5, m6, m7
- add r1, r2
- add r0, r2
- ret
-%endmacro
-
-INIT_MMX mmxext
-RESET_MM_PERMUTATION
-%assign i 0
-%rep 4
-V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
-SWAP 0,1,2,3,4,5
-%assign i i+1
-%endrep
-
-INIT_XMM sse2
-RESET_MM_PERMUTATION
-%assign i 0
-%rep 6
-V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
-SWAP 0,1,2,3,4,5
-%assign i i+1
-%endrep
-
-%macro MC02 2
-cglobal_mc %1, mc02, %2, 3,4,8
- PRELOAD_V
-
- sub r0, r2
-%assign j 0
-%rep %2
- %assign i (j % 6)
- call v_filt%2_ %+ i %+ _10.no_addr4
- OP_MOV [r0], m0
- SWAP 0,1,2,3,4,5
- %assign j j+1
-%endrep
- ret
-%endmacro
-
-MC MC02
-
-;-----------------------------------------------------------------------------
-; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-%macro MC01 2
-cglobal_mc %1, mc01, %2, 3,5,8
- mov r4, r1
-.body:
- PRELOAD_V
-
- sub r4, r2
- sub r0, r2
-%assign j 0
-%rep %2
- %assign i (j % 6)
- call v_filt%2_ %+ i %+ _10
- movu m7, [r4]
- pavgw m0, m7
- OP_MOV [r0], m0
- SWAP 0,1,2,3,4,5
- %assign j j+1
-%endrep
- ret
-%endmacro
-
-MC MC01
-
-;-----------------------------------------------------------------------------
-; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-%macro MC03 2
-cglobal_mc %1, mc03, %2, 3,5,8
- lea r4, [r1+r2]
- jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
-%endmacro
-
-MC MC03
-
-;-----------------------------------------------------------------------------
-; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-%macro H_FILT_AVG 2-3
-h_filt%1_%2_10:
-;FILT_H with fewer registers and averaged with the FILT_V result
-;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used in the next iteration
-;unfortunately I need three registers, so m5 will have to be re-read from memory
- movu m5, [r4-4]
- ADDW m5, [r4+6], m7
- movu m6, [r4-2]
- ADDW m6, [r4+4], m7
- paddw m5, [pw_16]
- psubw m5, m6 ; a-b
- psraw m5, 2 ; (a-b)/4
- psubw m5, m6 ; (a-b)/4-b
- movu m6, [r4+0]
- ADDW m6, [r4+2], m7
- paddw m5, m6 ; (a-b)/4-b+c
- psraw m5, 2 ; ((a-b)/4-b+c)/4
- paddw m5, m6 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- psraw m5, 1
- CLIPW m5, [pb_0], [pw_pixel_max]
-;avg FILT_V, FILT_H
- pavgw m0, m5
-%if %0!=4
- movu m5, [r1+r5]
-%endif
- ret
-%endmacro
-
-INIT_MMX mmxext
-RESET_MM_PERMUTATION
-%assign i 0
-%rep 3
-H_FILT_AVG 4, i
-SWAP 0,1,2,3,4,5
-%assign i i+1
-%endrep
-H_FILT_AVG 4, i, 0
-
-INIT_XMM sse2
-RESET_MM_PERMUTATION
-%assign i 0
-%rep 6
-%if i==1
-H_FILT_AVG 8, i, 0
-%else
-H_FILT_AVG 8, i
-%endif
-SWAP 0,1,2,3,4,5
-%assign i i+1
-%endrep
-
-%macro MC11 2
-; this REALLY needs x86_64
-cglobal_mc %1, mc11, %2, 3,6,8
- mov r4, r1
-.body:
- PRELOAD_V
-
- sub r0, r2
- sub r4, r2
- mov r5, r2
- neg r5
-%assign j 0
-%rep %2
- %assign i (j % 6)
- call v_filt%2_ %+ i %+ _10
- call h_filt%2_ %+ i %+ _10
-%if %2==8 && i==1
- movu m5, [r1+r5]
-%endif
- OP_MOV [r0], m0
- SWAP 0,1,2,3,4,5
- %assign j j+1
-%endrep
- ret
-%endmacro
-
-MC MC11
-
-;-----------------------------------------------------------------------------
-; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-%macro MC31 2
-cglobal_mc %1, mc31, %2, 3,6,8
- mov r4, r1
- add r1, 2
- jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
-%endmacro
-
-MC MC31
-
-;-----------------------------------------------------------------------------
-; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-%macro MC13 2
-cglobal_mc %1, mc13, %2, 3,7,12
- lea r4, [r1+r2]
- jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
-%endmacro
-
-MC MC13
-
-;-----------------------------------------------------------------------------
-; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-%macro MC33 2
-cglobal_mc %1, mc33, %2, 3,6,8
- lea r4, [r1+r2]
- add r1, 2
- jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
-%endmacro
-
-MC MC33
-
-;-----------------------------------------------------------------------------
-; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-%macro FILT_H2 3
- psubw %1, %2 ; a-b
- psubw %2, %3 ; b-c
- psllw %2, 2
- psubw %1, %2 ; a-5*b+4*c
- psllw %3, 4
- paddw %1, %3 ; a-5*b+20*c
-%endmacro
-
-%macro FILT_VNRD 8
- movu %6, [r1]
- paddw %1, %6
- mova %7, %2
- paddw %7, %5
- mova %8, %3
- paddw %8, %4
- FILT_H2 %1, %7, %8
-%endmacro
-
-%macro HV 1
-%if mmsize==16
-%define PAD 12
-%define COUNT 2
-%else
-%define PAD 4
-%define COUNT 3
-%endif
-put_hv%1_10:
- neg r2 ; This actually saves instructions
- lea r1, [r1+r2*2-mmsize+PAD]
- lea r4, [rsp+PAD+gprsize]
- mov r3d, COUNT
-.v_loop:
- movu m0, [r1]
- sub r1, r2
- movu m1, [r1]
- sub r1, r2
- movu m2, [r1]
- sub r1, r2
- movu m3, [r1]
- sub r1, r2
- movu m4, [r1]
- sub r1, r2
-%assign i 0
-%rep %1-1
- FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
- psubw m0, [pad20]
- movu [r4+i*mmsize*3], m0
- sub r1, r2
- SWAP 0,1,2,3,4,5
-%assign i i+1
-%endrep
- FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
- psubw m0, [pad20]
- movu [r4+i*mmsize*3], m0
- add r4, mmsize
- lea r1, [r1+r2*8+mmsize]
-%if %1==8
- lea r1, [r1+r2*4]
-%endif
- dec r3d
- jg .v_loop
- neg r2
- ret
-%endmacro
-
-INIT_MMX mmxext
-HV 4
-INIT_XMM sse2
-HV 8
-
-%macro H_LOOP 1
-%if num_mmregs > 8
- %define s1 m8
- %define s2 m9
- %define s3 m10
- %define d1 m11
-%else
- %define s1 [tap1]
- %define s2 [tap2]
- %define s3 [tap3]
- %define d1 [depad]
-%endif
-h%1_loop_op:
- movu m1, [r1+mmsize-4]
- movu m2, [r1+mmsize-2]
- mova m3, [r1+mmsize+0]
- movu m4, [r1+mmsize+2]
- movu m5, [r1+mmsize+4]
- movu m6, [r1+mmsize+6]
-%if num_mmregs > 8
- pmaddwd m1, s1
- pmaddwd m2, s1
- pmaddwd m3, s2
- pmaddwd m4, s2
- pmaddwd m5, s3
- pmaddwd m6, s3
- paddd m1, d1
- paddd m2, d1
-%else
- mova m0, s1
- pmaddwd m1, m0
- pmaddwd m2, m0
- mova m0, s2
- pmaddwd m3, m0
- pmaddwd m4, m0
- mova m0, s3
- pmaddwd m5, m0
- pmaddwd m6, m0
- mova m0, d1
- paddd m1, m0
- paddd m2, m0
-%endif
- paddd m3, m5
- paddd m4, m6
- paddd m1, m3
- paddd m2, m4
- psrad m1, 10
- psrad m2, 10
- pslld m2, 16
- pand m1, [pd_0f]
- por m1, m2
-%if num_mmregs <= 8
- pxor m0, m0
-%endif
- CLIPW m1, m0, m7
- add r1, mmsize*3
- ret
-%endmacro
-
-INIT_MMX mmxext
-H_LOOP 4
-INIT_XMM sse2
-H_LOOP 8
-
-%macro MC22 2
-cglobal_mc %1, mc22, %2, 3,7,12
-%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
- mov r6, rsp ; backup stack pointer
- and rsp, ~(mmsize-1) ; align stack
- sub rsp, PAD
-
- call put_hv%2_10
-
- mov r3d, %2
- mova m7, [pw_pixel_max]
-%if num_mmregs > 8
- pxor m0, m0
- mova m8, [tap1]
- mova m9, [tap2]
- mova m10, [tap3]
- mova m11, [depad]
-%endif
- mov r1, rsp
-.h_loop:
- call h%2_loop_op
-
- OP_MOV [r0], m1
- add r0, r2
- dec r3d
- jg .h_loop
-
- mov rsp, r6 ; restore stack pointer
- ret
-%endmacro
-
-MC MC22
-
-;-----------------------------------------------------------------------------
-; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-%macro MC12 2
-cglobal_mc %1, mc12, %2, 3,7,12
-%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
- mov r6, rsp ; backup stack pointer
- and rsp, ~(mmsize-1) ; align stack
- sub rsp, PAD
-
- call put_hv%2_10
-
- xor r4d, r4d
-.body:
- mov r3d, %2
- pxor m0, m0
- mova m7, [pw_pixel_max]
-%if num_mmregs > 8
- mova m8, [tap1]
- mova m9, [tap2]
- mova m10, [tap3]
- mova m11, [depad]
-%endif
- mov r1, rsp
-.h_loop:
- call h%2_loop_op
-
- movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
- paddw m3, [depad2]
- psrlw m3, 5
- psubw m3, [unpad]
- CLIPW m3, m0, m7
- pavgw m1, m3
-
- OP_MOV [r0], m1
- add r0, r2
- dec r3d
- jg .h_loop
-
- mov rsp, r6 ; restore stack pointer
- ret
-%endmacro
-
-MC MC12
-
-;-----------------------------------------------------------------------------
-; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-%macro MC32 2
-cglobal_mc %1, mc32, %2, 3,7,12
-%define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel)
- mov r6, rsp ; backup stack pointer
- and rsp, ~(mmsize-1) ; align stack
- sub rsp, PAD
-
- call put_hv%2_10
-
- mov r4d, 2 ; sizeof(pixel)
- jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
-%endmacro
-
-MC MC32
-
-;-----------------------------------------------------------------------------
-; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-%macro H_NRD 1
-put_h%1_10:
- add rsp, gprsize
- mov r3d, %1
- xor r4d, r4d
- mova m6, [pad20]
-.nextrow:
- movu m2, [r5-4]
- movu m3, [r5-2]
- movu m4, [r5+0]
- ADDW m2, [r5+6], m5
- ADDW m3, [r5+4], m5
- ADDW m4, [r5+2], m5
-
- FILT_H2 m2, m3, m4
- psubw m2, m6
- mova [rsp+r4], m2
- add r4d, mmsize*3
- add r5, r2
- dec r3d
- jg .nextrow
- sub rsp, gprsize
- ret
-%endmacro
-
-INIT_MMX mmxext
-H_NRD 4
-INIT_XMM sse2
-H_NRD 8
-
-%macro MC21 2
-cglobal_mc %1, mc21, %2, 3,7,12
- mov r5, r1
-.body:
-%define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel)
- mov r6, rsp ; backup stack pointer
- and rsp, ~(mmsize-1) ; align stack
-
- sub rsp, PAD
- call put_h%2_10
-
- sub rsp, PAD
- call put_hv%2_10
-
- mov r4d, PAD-mmsize ; H buffer
- jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
-%endmacro
-
-MC MC21
-
-;-----------------------------------------------------------------------------
-; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
-;-----------------------------------------------------------------------------
-%macro MC23 2
-cglobal_mc %1, mc23, %2, 3,7,12
- lea r5, [r1+r2]
- jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body
-%endmacro
-
-MC MC23
diff --git a/ffmpeg/libavcodec/x86/h264_qpel_8bit.asm b/ffmpeg/libavcodec/x86/h264_qpel_8bit.asm
deleted file mode 100644
index 2d287ba..0000000
--- a/ffmpeg/libavcodec/x86/h264_qpel_8bit.asm
+++ /dev/null
@@ -1,862 +0,0 @@
-;*****************************************************************************
-;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
-;*****************************************************************************
-;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
-;* Copyright (C) 2012 Daniel Kang
-;*
-;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA 32
-
-cextern pw_16
-cextern pw_5
-cextern pb_0
-
-SECTION .text
-
-
-%macro op_avgh 3
- movh %3, %2
- pavgb %1, %3
- movh %2, %1
-%endmacro
-
-%macro op_avg 2-3
- pavgb %1, %2
- mova %2, %1
-%endmacro
-
-%macro op_puth 2-3
- movh %2, %1
-%endmacro
-
-%macro op_put 2-3
- mova %2, %1
-%endmacro
-
-%macro QPEL4_H_LOWPASS_OP 1
-cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
- movsxdifnidn r2, r2d
- movsxdifnidn r3, r3d
- pxor m7, m7
- mova m4, [pw_5]
- mova m5, [pw_16]
- mov r4d, 4
-.loop:
- movh m1, [r1-1]
- movh m2, [r1+0]
- movh m3, [r1+1]
- movh m0, [r1+2]
- punpcklbw m1, m7
- punpcklbw m2, m7
- punpcklbw m3, m7
- punpcklbw m0, m7
- paddw m1, m0
- paddw m2, m3
- movh m0, [r1-2]
- movh m3, [r1+3]
- punpcklbw m0, m7
- punpcklbw m3, m7
- paddw m0, m3
- psllw m2, 2
- psubw m2, m1
- pmullw m2, m4
- paddw m0, m5
- paddw m0, m2
- psraw m0, 5
- packuswb m0, m0
- op_%1h m0, [r0], m6
- add r0, r2
- add r1, r3
- dec r4d
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-QPEL4_H_LOWPASS_OP put
-QPEL4_H_LOWPASS_OP avg
-
-%macro QPEL8_H_LOWPASS_OP 1
-cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
- movsxdifnidn r2, r2d
- movsxdifnidn r3, r3d
- mov r4d, 8
- pxor m7, m7
- mova m6, [pw_5]
-.loop:
- mova m0, [r1]
- mova m2, [r1+1]
- mova m1, m0
- mova m3, m2
- punpcklbw m0, m7
- punpckhbw m1, m7
- punpcklbw m2, m7
- punpckhbw m3, m7
- paddw m0, m2
- paddw m1, m3
- psllw m0, 2
- psllw m1, 2
- mova m2, [r1-1]
- mova m4, [r1+2]
- mova m3, m2
- mova m5, m4
- punpcklbw m2, m7
- punpckhbw m3, m7
- punpcklbw m4, m7
- punpckhbw m5, m7
- paddw m2, m4
- paddw m5, m3
- psubw m0, m2
- psubw m1, m5
- pmullw m0, m6
- pmullw m1, m6
- movd m2, [r1-2]
- movd m5, [r1+7]
- punpcklbw m2, m7
- punpcklbw m5, m7
- paddw m2, m3
- paddw m4, m5
- mova m5, [pw_16]
- paddw m2, m5
- paddw m4, m5
- paddw m0, m2
- paddw m1, m4
- psraw m0, 5
- psraw m1, 5
- packuswb m0, m1
- op_%1 m0, [r0], m4
- add r0, r2
- add r1, r3
- dec r4d
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-QPEL8_H_LOWPASS_OP put
-QPEL8_H_LOWPASS_OP avg
-
-%macro QPEL8_H_LOWPASS_OP_XMM 1
-cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
- movsxdifnidn r2, r2d
- movsxdifnidn r3, r3d
- mov r4d, 8
- pxor m7, m7
- mova m6, [pw_5]
-.loop:
- movu m1, [r1-2]
- mova m0, m1
- punpckhbw m1, m7
- punpcklbw m0, m7
- mova m2, m1
- mova m3, m1
- mova m4, m1
- mova m5, m1
- palignr m4, m0, 2
- palignr m3, m0, 4
- palignr m2, m0, 6
- palignr m1, m0, 8
- palignr m5, m0, 10
- paddw m0, m5
- paddw m2, m3
- paddw m1, m4
- psllw m2, 2
- psubw m2, m1
- paddw m0, [pw_16]
- pmullw m2, m6
- paddw m2, m0
- psraw m2, 5
- packuswb m2, m2
- op_%1h m2, [r0], m4
- add r1, r3
- add r0, r2
- dec r4d
- jne .loop
- REP_RET
-%endmacro
-
-INIT_XMM ssse3
-QPEL8_H_LOWPASS_OP_XMM put
-QPEL8_H_LOWPASS_OP_XMM avg
-
-
-%macro QPEL4_H_LOWPASS_L2_OP 1
-cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
- movsxdifnidn r3, r3d
- movsxdifnidn r4, r4d
- pxor m7, m7
- mova m4, [pw_5]
- mova m5, [pw_16]
- mov r5d, 4
-.loop:
- movh m1, [r1-1]
- movh m2, [r1+0]
- movh m3, [r1+1]
- movh m0, [r1+2]
- punpcklbw m1, m7
- punpcklbw m2, m7
- punpcklbw m3, m7
- punpcklbw m0, m7
- paddw m1, m0
- paddw m2, m3
- movh m0, [r1-2]
- movh m3, [r1+3]
- punpcklbw m0, m7
- punpcklbw m3, m7
- paddw m0, m3
- psllw m2, 2
- psubw m2, m1
- pmullw m2, m4
- paddw m0, m5
- paddw m0, m2
- movh m3, [r2]
- psraw m0, 5
- packuswb m0, m0
- pavgb m0, m3
- op_%1h m0, [r0], m6
- add r0, r3
- add r1, r3
- add r2, r4
- dec r5d
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-QPEL4_H_LOWPASS_L2_OP put
-QPEL4_H_LOWPASS_L2_OP avg
-
-
-%macro QPEL8_H_LOWPASS_L2_OP 1
-cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
- movsxdifnidn r3, r3d
- movsxdifnidn r4, r4d
- mov r5d, 8
- pxor m7, m7
- mova m6, [pw_5]
-.loop:
- mova m0, [r1]
- mova m2, [r1+1]
- mova m1, m0
- mova m3, m2
- punpcklbw m0, m7
- punpckhbw m1, m7
- punpcklbw m2, m7
- punpckhbw m3, m7
- paddw m0, m2
- paddw m1, m3
- psllw m0, 2
- psllw m1, 2
- mova m2, [r1-1]
- mova m4, [r1+2]
- mova m3, m2
- mova m5, m4
- punpcklbw m2, m7
- punpckhbw m3, m7
- punpcklbw m4, m7
- punpckhbw m5, m7
- paddw m2, m4
- paddw m5, m3
- psubw m0, m2
- psubw m1, m5
- pmullw m0, m6
- pmullw m1, m6
- movd m2, [r1-2]
- movd m5, [r1+7]
- punpcklbw m2, m7
- punpcklbw m5, m7
- paddw m2, m3
- paddw m4, m5
- mova m5, [pw_16]
- paddw m2, m5
- paddw m4, m5
- paddw m0, m2
- paddw m1, m4
- psraw m0, 5
- psraw m1, 5
- mova m4, [r2]
- packuswb m0, m1
- pavgb m0, m4
- op_%1 m0, [r0], m4
- add r0, r3
- add r1, r3
- add r2, r4
- dec r5d
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-QPEL8_H_LOWPASS_L2_OP put
-QPEL8_H_LOWPASS_L2_OP avg
-
-
-%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
-cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
- movsxdifnidn r3, r3d
- movsxdifnidn r4, r4d
- mov r5d, 8
- pxor m7, m7
- mova m6, [pw_5]
-.loop:
- lddqu m1, [r1-2]
- mova m0, m1
- punpckhbw m1, m7
- punpcklbw m0, m7
- mova m2, m1
- mova m3, m1
- mova m4, m1
- mova m5, m1
- palignr m4, m0, 2
- palignr m3, m0, 4
- palignr m2, m0, 6
- palignr m1, m0, 8
- palignr m5, m0, 10
- paddw m0, m5
- paddw m2, m3
- paddw m1, m4
- psllw m2, 2
- movh m3, [r2]
- psubw m2, m1
- paddw m0, [pw_16]
- pmullw m2, m6
- paddw m2, m0
- psraw m2, 5
- packuswb m2, m2
- pavgb m2, m3
- op_%1h m2, [r0], m4
- add r1, r3
- add r0, r3
- add r2, r4
- dec r5d
- jg .loop
- REP_RET
-%endmacro
-
-INIT_XMM ssse3
-QPEL8_H_LOWPASS_L2_OP_XMM put
-QPEL8_H_LOWPASS_L2_OP_XMM avg
-
-
-; All functions that call this are required to have function arguments of
-; dst, src, dstStride, srcStride
-%macro FILT_V 1
- mova m6, m2
- movh m5, [r1]
- paddw m6, m3
- psllw m6, 2
- psubw m6, m1
- psubw m6, m4
- punpcklbw m5, m7
- pmullw m6, [pw_5]
- paddw m0, [pw_16]
- add r1, r3
- paddw m0, m5
- paddw m6, m0
- psraw m6, 5
- packuswb m6, m6
- op_%1h m6, [r0], m0 ; 1
- add r0, r2
- SWAP 0, 1, 2, 3, 4, 5
-%endmacro
-
-%macro QPEL4_V_LOWPASS_OP 1
-cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
- movsxdifnidn r2, r2d
- movsxdifnidn r3, r3d
- sub r1, r3
- sub r1, r3
- pxor m7, m7
- movh m0, [r1]
- movh m1, [r1+r3]
- lea r1, [r1+2*r3]
- movh m2, [r1]
- movh m3, [r1+r3]
- lea r1, [r1+2*r3]
- movh m4, [r1]
- add r1, r3
- punpcklbw m0, m7
- punpcklbw m1, m7
- punpcklbw m2, m7
- punpcklbw m3, m7
- punpcklbw m4, m7
- FILT_V %1
- FILT_V %1
- FILT_V %1
- FILT_V %1
- RET
-%endmacro
-
-INIT_MMX mmxext
-QPEL4_V_LOWPASS_OP put
-QPEL4_V_LOWPASS_OP avg
-
-
-
-%macro QPEL8OR16_V_LOWPASS_OP 1
-%if cpuflag(sse2)
-cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
- movsxdifnidn r2, r2d
- movsxdifnidn r3, r3d
- sub r1, r3
- sub r1, r3
-%else
-cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
- movsxdifnidn r2, r2d
- movsxdifnidn r3, r3d
-%endif
- pxor m7, m7
- movh m0, [r1]
- movh m1, [r1+r3]
- lea r1, [r1+2*r3]
- movh m2, [r1]
- movh m3, [r1+r3]
- lea r1, [r1+2*r3]
- movh m4, [r1]
- add r1, r3
- punpcklbw m0, m7
- punpcklbw m1, m7
- punpcklbw m2, m7
- punpcklbw m3, m7
- punpcklbw m4, m7
- FILT_V %1
- FILT_V %1
- FILT_V %1
- FILT_V %1
- FILT_V %1
- FILT_V %1
- FILT_V %1
- FILT_V %1
- cmp r4d, 16
- jne .end
- FILT_V %1
- FILT_V %1
- FILT_V %1
- FILT_V %1
- FILT_V %1
- FILT_V %1
- FILT_V %1
- FILT_V %1
-.end:
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-QPEL8OR16_V_LOWPASS_OP put
-QPEL8OR16_V_LOWPASS_OP avg
-
-INIT_XMM sse2
-QPEL8OR16_V_LOWPASS_OP put
-QPEL8OR16_V_LOWPASS_OP avg
-
-
-; All functions that use this are required to have args:
-; src, tmp, srcStride
-%macro FILT_HV 1 ; offset
- mova m6, m2
- movh m5, [r0]
- paddw m6, m3
- psllw m6, 2
- paddw m0, [pw_16]
- psubw m6, m1
- psubw m6, m4
- punpcklbw m5, m7
- pmullw m6, [pw_5]
- paddw m0, m5
- add r0, r2
- paddw m6, m0
- mova [r1+%1], m6
- SWAP 0, 1, 2, 3, 4, 5
-%endmacro
-
-%macro QPEL4_HV1_LOWPASS_OP 1
-cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
- movsxdifnidn r2, r2d
- pxor m7, m7
- movh m0, [r0]
- movh m1, [r0+r2]
- lea r0, [r0+2*r2]
- movh m2, [r0]
- movh m3, [r0+r2]
- lea r0, [r0+2*r2]
- movh m4, [r0]
- add r0, r2
- punpcklbw m0, m7
- punpcklbw m1, m7
- punpcklbw m2, m7
- punpcklbw m3, m7
- punpcklbw m4, m7
- FILT_HV 0*24
- FILT_HV 1*24
- FILT_HV 2*24
- FILT_HV 3*24
- RET
-
-cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
- movsxdifnidn r2, r2d
- mov r3d, 4
-.loop:
- mova m0, [r0]
- paddw m0, [r0+10]
- mova m1, [r0+2]
- paddw m1, [r0+8]
- mova m2, [r0+4]
- paddw m2, [r0+6]
- psubw m0, m1
- psraw m0, 2
- psubw m0, m1
- paddsw m0, m2
- psraw m0, 2
- paddw m0, m2
- psraw m0, 6
- packuswb m0, m0
- op_%1h m0, [r1], m7
- add r0, 24
- add r1, r2
- dec r3d
- jnz .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-QPEL4_HV1_LOWPASS_OP put
-QPEL4_HV1_LOWPASS_OP avg
-
-%macro QPEL8OR16_HV1_LOWPASS_OP 1
-cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
- movsxdifnidn r2, r2d
- pxor m7, m7
- movh m0, [r0]
- movh m1, [r0+r2]
- lea r0, [r0+2*r2]
- movh m2, [r0]
- movh m3, [r0+r2]
- lea r0, [r0+2*r2]
- movh m4, [r0]
- add r0, r2
- punpcklbw m0, m7
- punpcklbw m1, m7
- punpcklbw m2, m7
- punpcklbw m3, m7
- punpcklbw m4, m7
- FILT_HV 0*48
- FILT_HV 1*48
- FILT_HV 2*48
- FILT_HV 3*48
- FILT_HV 4*48
- FILT_HV 5*48
- FILT_HV 6*48
- FILT_HV 7*48
- cmp r3d, 16
- jne .end
- FILT_HV 8*48
- FILT_HV 9*48
- FILT_HV 10*48
- FILT_HV 11*48
- FILT_HV 12*48
- FILT_HV 13*48
- FILT_HV 14*48
- FILT_HV 15*48
-.end:
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-QPEL8OR16_HV1_LOWPASS_OP put
-QPEL8OR16_HV1_LOWPASS_OP avg
-
-INIT_XMM sse2
-QPEL8OR16_HV1_LOWPASS_OP put
-
-
-
-%macro QPEL8OR16_HV2_LOWPASS_OP 1
-; unused is to match ssse3 and mmxext args
-cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
- movsxdifnidn r2, r2d
-.loop:
- mova m0, [r1]
- mova m3, [r1+8]
- mova m1, [r1+2]
- mova m4, [r1+10]
- paddw m0, m4
- paddw m1, m3
- paddw m3, [r1+18]
- paddw m4, [r1+16]
- mova m2, [r1+4]
- mova m5, [r1+12]
- paddw m2, [r1+6]
- paddw m5, [r1+14]
- psubw m0, m1
- psubw m3, m4
- psraw m0, 2
- psraw m3, 2
- psubw m0, m1
- psubw m3, m4
- paddsw m0, m2
- paddsw m3, m5
- psraw m0, 2
- psraw m3, 2
- paddw m0, m2
- paddw m3, m5
- psraw m0, 6
- psraw m3, 6
- packuswb m0, m3
- op_%1 m0, [r0], m7
- add r1, 48
- add r0, r2
- dec r4d
- jne .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-QPEL8OR16_HV2_LOWPASS_OP put
-QPEL8OR16_HV2_LOWPASS_OP avg
-
-%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
-cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
- movsxdifnidn r2, r2d
- movsxdifnidn r3, r3d
- cmp r4d, 16
- je .op16
-.loop8:
- mova m1, [r1+16]
- mova m0, [r1]
- mova m2, m1
- mova m3, m1
- mova m4, m1
- mova m5, m1
- palignr m5, m0, 10
- palignr m4, m0, 8
- palignr m3, m0, 6
- palignr m2, m0, 4
- palignr m1, m0, 2
- paddw m0, m5
- paddw m1, m4
- paddw m2, m3
- psubw m0, m1
- psraw m0, 2
- psubw m0, m1
- paddw m0, m2
- psraw m0, 2
- paddw m0, m2
- psraw m0, 6
- packuswb m0, m0
- op_%1h m0, [r0], m7
- add r1, 48
- add r0, r2
- dec r4d
- jne .loop8
- jmp .done
-.op16:
- mova m4, [r1+32]
- mova m5, [r1+16]
- mova m7, [r1]
- mova m3, m4
- mova m2, m4
- mova m1, m4
- mova m0, m4
- palignr m0, m5, 10
- palignr m1, m5, 8
- palignr m2, m5, 6
- palignr m3, m5, 4
- palignr m4, m5, 2
- paddw m0, m5
- paddw m1, m4
- paddw m2, m3
- mova m6, m5
- mova m4, m5
- mova m3, m5
- palignr m4, m7, 8
- palignr m6, m7, 2
- palignr m3, m7, 10
- paddw m4, m6
- mova m6, m5
- palignr m5, m7, 6
- palignr m6, m7, 4
- paddw m3, m7
- paddw m5, m6
- psubw m0, m1
- psubw m3, m4
- psraw m0, 2
- psraw m3, 2
- psubw m0, m1
- psubw m3, m4
- paddw m0, m2
- paddw m3, m5
- psraw m0, 2
- psraw m3, 2
- paddw m0, m2
- paddw m3, m5
- psraw m0, 6
- psraw m3, 6
- packuswb m3, m0
- op_%1 m3, [r0], m7
- add r1, 48
- add r0, r2
- dec r4d
- jne .op16
-.done:
- REP_RET
-%endmacro
-
-INIT_XMM ssse3
-QPEL8OR16_HV2_LOWPASS_OP_XMM put
-QPEL8OR16_HV2_LOWPASS_OP_XMM avg
-
-
-%macro PIXELS4_L2_SHIFT5 1
-cglobal %1_pixels4_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
- movsxdifnidn r3, r3d
- movsxdifnidn r4, r4d
- mova m0, [r1]
- mova m1, [r1+24]
- psraw m0, 5
- psraw m1, 5
- packuswb m0, m0
- packuswb m1, m1
- pavgb m0, [r2]
- pavgb m1, [r2+r4]
- op_%1h m0, [r0], m4
- op_%1h m1, [r0+r3], m5
- lea r2, [r2+r4*2]
- lea r0, [r0+r3*2]
- mova m0, [r1+48]
- mova m1, [r1+72]
- psraw m0, 5
- psraw m1, 5
- packuswb m0, m0
- packuswb m1, m1
- pavgb m0, [r2]
- pavgb m1, [r2+r4]
- op_%1h m0, [r0], m4
- op_%1h m1, [r0+r3], m5
- RET
-%endmacro
-
-INIT_MMX mmxext
-PIXELS4_L2_SHIFT5 put
-PIXELS4_L2_SHIFT5 avg
-
-
-%macro PIXELS8_L2_SHIFT5 1
-cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
- movsxdifnidn r3, r3d
- movsxdifnidn r4, r4d
-.loop:
- mova m0, [r1]
- mova m1, [r1+8]
- mova m2, [r1+48]
- mova m3, [r1+48+8]
- psraw m0, 5
- psraw m1, 5
- psraw m2, 5
- psraw m3, 5
- packuswb m0, m1
- packuswb m2, m3
- pavgb m0, [r2]
- pavgb m2, [r2+r4]
- op_%1 m0, [r0], m4
- op_%1 m2, [r0+r3], m5
- lea r2, [r2+2*r4]
- add r1, 48*2
- lea r0, [r0+2*r3]
- sub r5d, 2
- jne .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-PIXELS8_L2_SHIFT5 put
-PIXELS8_L2_SHIFT5 avg
-
-
-%if ARCH_X86_64
-%macro QPEL16_H_LOWPASS_L2_OP 1
-cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
- movsxdifnidn r3, r3d
- movsxdifnidn r4, r4d
- mov r5d, 16
- pxor m15, m15
- mova m14, [pw_5]
- mova m13, [pw_16]
-.loop:
- lddqu m1, [r1+6]
- lddqu m7, [r1-2]
- mova m0, m1
- punpckhbw m1, m15
- punpcklbw m0, m15
- punpcklbw m7, m15
- mova m2, m1
- mova m6, m0
- mova m3, m1
- mova m8, m0
- mova m4, m1
- mova m9, m0
- mova m12, m0
- mova m11, m1
- palignr m11, m0, 10
- palignr m12, m7, 10
- palignr m4, m0, 2
- palignr m9, m7, 2
- palignr m3, m0, 4
- palignr m8, m7, 4
- palignr m2, m0, 6
- palignr m6, m7, 6
- paddw m11, m0
- palignr m1, m0, 8
- palignr m0, m7, 8
- paddw m7, m12
- paddw m2, m3
- paddw m6, m8
- paddw m1, m4
- paddw m0, m9
- psllw m2, 2
- psllw m6, 2
- psubw m2, m1
- psubw m6, m0
- paddw m11, m13
- paddw m7, m13
- pmullw m2, m14
- pmullw m6, m14
- lddqu m3, [r2]
- paddw m2, m11
- paddw m6, m7
- psraw m2, 5
- psraw m6, 5
- packuswb m6, m2
- pavgb m6, m3
- op_%1 m6, [r0], m11
- add r1, r3
- add r0, r3
- add r2, r4
- dec r5d
- jg .loop
- REP_RET
-%endmacro
-
-INIT_XMM ssse3
-QPEL16_H_LOWPASS_L2_OP put
-QPEL16_H_LOWPASS_L2_OP avg
-%endif
diff --git a/ffmpeg/libavcodec/x86/h264_weight.asm b/ffmpeg/libavcodec/x86/h264_weight.asm
deleted file mode 100644
index 4759a06..0000000
--- a/ffmpeg/libavcodec/x86/h264_weight.asm
+++ /dev/null
@@ -1,317 +0,0 @@
-;*****************************************************************************
-;* SSE2-optimized weighted prediction code
-;*****************************************************************************
-;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
-;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION .text
-
-;-----------------------------------------------------------------------------
-; biweight pred:
-;
-; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
-; int height, int log2_denom, int weightd,
-; int weights, int offset);
-; and
-; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
-; int log2_denom, int weight, int offset);
-;-----------------------------------------------------------------------------
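-;
-; Roughly, per sample (see WEIGHT_OP and BIWEIGHT_STEPA/STEPB below):
-;   weight:   dst = clip_uint8(((dst*weight + (1<<(log2_denom-1))) >> log2_denom) + offset)
-;   biweight: dst = clip_uint8((dst*weightd + src*weights +
-;                               (((offset+1)|1) << log2_denom)) >> (log2_denom+1))
-;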
-
-%macro WEIGHT_SETUP 0
- add r5, r5
- inc r5
- movd m3, r4d
- movd m5, r5d
- movd m6, r3d
- pslld m5, m6
- psrld m5, 1
-%if mmsize == 16
- pshuflw m3, m3, 0
- pshuflw m5, m5, 0
- punpcklqdq m3, m3
- punpcklqdq m5, m5
-%else
- pshufw m3, m3, 0
- pshufw m5, m5, 0
-%endif
- pxor m7, m7
-%endmacro
-
-%macro WEIGHT_OP 2
- movh m0, [r0+%1]
- movh m1, [r0+%2]
- punpcklbw m0, m7
- punpcklbw m1, m7
- pmullw m0, m3
- pmullw m1, m3
- paddsw m0, m5
- paddsw m1, m5
- psraw m0, m6
- psraw m1, m6
- packuswb m0, m1
-%endmacro
-
-INIT_MMX mmxext
-cglobal h264_weight_16, 6, 6, 0
- WEIGHT_SETUP
-.nextrow:
- WEIGHT_OP 0, 4
- mova [r0 ], m0
- WEIGHT_OP 8, 12
- mova [r0+8], m0
- add r0, r1
- dec r2d
- jnz .nextrow
- REP_RET
-
-%macro WEIGHT_FUNC_MM 2
-cglobal h264_weight_%1, 6, 6, %2
- WEIGHT_SETUP
-.nextrow:
- WEIGHT_OP 0, mmsize/2
- mova [r0], m0
- add r0, r1
- dec r2d
- jnz .nextrow
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-WEIGHT_FUNC_MM 8, 0
-INIT_XMM sse2
-WEIGHT_FUNC_MM 16, 8
-
-%macro WEIGHT_FUNC_HALF_MM 2
-cglobal h264_weight_%1, 6, 6, %2
- WEIGHT_SETUP
- sar r2d, 1
- lea r3, [r1*2]
-.nextrow:
- WEIGHT_OP 0, r1
- movh [r0], m0
-%if mmsize == 16
- movhps [r0+r1], m0
-%else
- psrlq m0, 32
- movh [r0+r1], m0
-%endif
- add r0, r3
- dec r2d
- jnz .nextrow
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-WEIGHT_FUNC_HALF_MM 4, 0
-INIT_XMM sse2
-WEIGHT_FUNC_HALF_MM 8, 8
-
-%macro BIWEIGHT_SETUP 0
-%if ARCH_X86_64
-%define off_regd r7d
-%else
-%define off_regd r3d
-%endif
- mov off_regd, r7m
- add off_regd, 1
- or off_regd, 1
- add r4, 1
- cmp r5, 128
- jne .normal
- sar r5, 1
- sar r6, 1
- sar off_regd, 1
- sub r4, 1
-.normal:
-%if cpuflag(ssse3)
- movd m4, r5d
- movd m0, r6d
-%else
- movd m3, r5d
- movd m4, r6d
-%endif
- movd m5, off_regd
- movd m6, r4d
- pslld m5, m6
- psrld m5, 1
-%if cpuflag(ssse3)
- punpcklbw m4, m0
- pshuflw m4, m4, 0
- pshuflw m5, m5, 0
- punpcklqdq m4, m4
- punpcklqdq m5, m5
-
-%else
-%if mmsize == 16
- pshuflw m3, m3, 0
- pshuflw m4, m4, 0
- pshuflw m5, m5, 0
- punpcklqdq m3, m3
- punpcklqdq m4, m4
- punpcklqdq m5, m5
-%else
- pshufw m3, m3, 0
- pshufw m4, m4, 0
- pshufw m5, m5, 0
-%endif
- pxor m7, m7
-%endif
-%endmacro
-
-%macro BIWEIGHT_STEPA 3
- movh m%1, [r0+%3]
- movh m%2, [r1+%3]
- punpcklbw m%1, m7
- punpcklbw m%2, m7
- pmullw m%1, m3
- pmullw m%2, m4
- paddsw m%1, m%2
-%endmacro
-
-%macro BIWEIGHT_STEPB 0
- paddsw m0, m5
- paddsw m1, m5
- psraw m0, m6
- psraw m1, m6
- packuswb m0, m1
-%endmacro
-
-INIT_MMX mmxext
-cglobal h264_biweight_16, 7, 8, 0
- BIWEIGHT_SETUP
- movifnidn r3d, r3m
-.nextrow:
- BIWEIGHT_STEPA 0, 1, 0
- BIWEIGHT_STEPA 1, 2, 4
- BIWEIGHT_STEPB
- mova [r0], m0
- BIWEIGHT_STEPA 0, 1, 8
- BIWEIGHT_STEPA 1, 2, 12
- BIWEIGHT_STEPB
- mova [r0+8], m0
- add r0, r2
- add r1, r2
- dec r3d
- jnz .nextrow
- REP_RET
-
-%macro BIWEIGHT_FUNC_MM 2
-cglobal h264_biweight_%1, 7, 8, %2
- BIWEIGHT_SETUP
- movifnidn r3d, r3m
-.nextrow:
- BIWEIGHT_STEPA 0, 1, 0
- BIWEIGHT_STEPA 1, 2, mmsize/2
- BIWEIGHT_STEPB
- mova [r0], m0
- add r0, r2
- add r1, r2
- dec r3d
- jnz .nextrow
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-BIWEIGHT_FUNC_MM 8, 0
-INIT_XMM sse2
-BIWEIGHT_FUNC_MM 16, 8
-
-%macro BIWEIGHT_FUNC_HALF_MM 2
-cglobal h264_biweight_%1, 7, 8, %2
- BIWEIGHT_SETUP
- movifnidn r3d, r3m
- sar r3, 1
- lea r4, [r2*2]
-.nextrow:
- BIWEIGHT_STEPA 0, 1, 0
- BIWEIGHT_STEPA 1, 2, r2
- BIWEIGHT_STEPB
- movh [r0], m0
-%if mmsize == 16
- movhps [r0+r2], m0
-%else
- psrlq m0, 32
- movh [r0+r2], m0
-%endif
- add r0, r4
- add r1, r4
- dec r3d
- jnz .nextrow
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-BIWEIGHT_FUNC_HALF_MM 4, 0
-INIT_XMM sse2
-BIWEIGHT_FUNC_HALF_MM 8, 8
-
-%macro BIWEIGHT_SSSE3_OP 0
- pmaddubsw m0, m4
- pmaddubsw m2, m4
- paddsw m0, m5
- paddsw m2, m5
- psraw m0, m6
- psraw m2, m6
- packuswb m0, m2
-%endmacro
-
-INIT_XMM ssse3
-cglobal h264_biweight_16, 7, 8, 8
- BIWEIGHT_SETUP
- movifnidn r3d, r3m
-
-.nextrow:
- movh m0, [r0]
- movh m2, [r0+8]
- movh m3, [r1+8]
- punpcklbw m0, [r1]
- punpcklbw m2, m3
- BIWEIGHT_SSSE3_OP
- mova [r0], m0
- add r0, r2
- add r1, r2
- dec r3d
- jnz .nextrow
- REP_RET
-
-INIT_XMM ssse3
-cglobal h264_biweight_8, 7, 8, 8
- BIWEIGHT_SETUP
- movifnidn r3d, r3m
- sar r3, 1
- lea r4, [r2*2]
-
-.nextrow:
- movh m0, [r0]
- movh m1, [r1]
- movh m2, [r0+r2]
- movh m3, [r1+r2]
- punpcklbw m0, m1
- punpcklbw m2, m3
- BIWEIGHT_SSSE3_OP
- movh [r0], m0
- movhps [r0+r2], m0
- add r0, r4
- add r1, r4
- dec r3d
- jnz .nextrow
- REP_RET
diff --git a/ffmpeg/libavcodec/x86/h264_weight_10bit.asm b/ffmpeg/libavcodec/x86/h264_weight_10bit.asm
deleted file mode 100644
index b7845fd..0000000
--- a/ffmpeg/libavcodec/x86/h264_weight_10bit.asm
+++ /dev/null
@@ -1,282 +0,0 @@
-;*****************************************************************************
-;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
-;*****************************************************************************
-;* Copyright (C) 2005-2011 x264 project
-;*
-;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA 32
-
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-sq_1: dq 1
- dq 0
-
-cextern pw_1
-
-SECTION .text
-
-;-----------------------------------------------------------------------------
-; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
-; int weight, int offset);
-;-----------------------------------------------------------------------------
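-; Roughly, per 10-bit sample (the offset is applied scaled by 4 to match the
-; 8-bit semantics at the higher bit depth):
-;   dst = av_clip(((dst*weight + (1<<(log2_denom-1))) >> log2_denom) + 4*offset, 0, 1023)
-; WEIGHT_SETUP packs weight*2 and (offset<<3)+1 into a single dword so that one
-; pmaddwd applies both the weight and the offset/rounding term.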
-%macro WEIGHT_PROLOGUE 0
-.prologue:
- PROLOGUE 0,6,8
- movifnidn r0, r0mp
- movifnidn r1d, r1m
- movifnidn r2d, r2m
- movifnidn r4d, r4m
- movifnidn r5d, r5m
-%endmacro
-
-%macro WEIGHT_SETUP 0
- mova m0, [pw_1]
- movd m2, r3m
- pslld m0, m2 ; 1<<log2_denom
- SPLATW m0, m0
- shl r5, 19 ; *8, move to upper half of dword
- lea r5, [r5+r4*2+0x10000]
- movd m3, r5d ; weight<<1 | 1+(offset<<(3))
- pshufd m3, m3, 0
- mova m4, [pw_pixel_max]
- paddw m2, [sq_1] ; log2_denom+1
-%if notcpuflag(sse4)
- pxor m7, m7
-%endif
-%endmacro
-
-%macro WEIGHT_OP 1-2
-%if %0==1
- mova m5, [r0+%1]
- punpckhwd m6, m5, m0
- punpcklwd m5, m0
-%else
- movq m5, [r0+%1]
- movq m6, [r0+%2]
- punpcklwd m5, m0
- punpcklwd m6, m0
-%endif
- pmaddwd m5, m3
- pmaddwd m6, m3
- psrad m5, m2
- psrad m6, m2
-%if cpuflag(sse4)
- packusdw m5, m6
- pminsw m5, m4
-%else
- packssdw m5, m6
- CLIPW m5, m7, m4
-%endif
-%endmacro
-
-%macro WEIGHT_FUNC_DBL 0
-cglobal h264_weight_16_10
- WEIGHT_PROLOGUE
- WEIGHT_SETUP
-.nextrow:
- WEIGHT_OP 0
- mova [r0 ], m5
- WEIGHT_OP 16
- mova [r0+16], m5
- add r0, r1
- dec r2d
- jnz .nextrow
- REP_RET
-%endmacro
-
-INIT_XMM sse2
-WEIGHT_FUNC_DBL
-INIT_XMM sse4
-WEIGHT_FUNC_DBL
-
-
-%macro WEIGHT_FUNC_MM 0
-cglobal h264_weight_8_10
- WEIGHT_PROLOGUE
- WEIGHT_SETUP
-.nextrow:
- WEIGHT_OP 0
- mova [r0], m5
- add r0, r1
- dec r2d
- jnz .nextrow
- REP_RET
-%endmacro
-
-INIT_XMM sse2
-WEIGHT_FUNC_MM
-INIT_XMM sse4
-WEIGHT_FUNC_MM
-
-
-%macro WEIGHT_FUNC_HALF_MM 0
-cglobal h264_weight_4_10
- WEIGHT_PROLOGUE
- sar r2d, 1
- WEIGHT_SETUP
- lea r3, [r1*2]
-.nextrow:
- WEIGHT_OP 0, r1
- movh [r0], m5
- movhps [r0+r1], m5
- add r0, r3
- dec r2d
- jnz .nextrow
- REP_RET
-%endmacro
-
-INIT_XMM sse2
-WEIGHT_FUNC_HALF_MM
-INIT_XMM sse4
-WEIGHT_FUNC_HALF_MM
-
-
-;-----------------------------------------------------------------------------
-; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
-; int log2_denom, int weightd, int weights, int offset);
-;-----------------------------------------------------------------------------
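-; Roughly, per 10-bit sample:
-;   dst = av_clip((dst*weightd + src*weights + (((offset<<2)+1) << log2_denom))
-;                 >> (log2_denom+1), 0, 1023)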
-%if ARCH_X86_32
-DECLARE_REG_TMP 3
-%else
-DECLARE_REG_TMP 7
-%endif
-
-%macro BIWEIGHT_PROLOGUE 0
-.prologue:
- PROLOGUE 0,8,8
- movifnidn r0, r0mp
- movifnidn r1, r1mp
- movifnidn r2d, r2m
- movifnidn r5d, r5m
- movifnidn r6d, r6m
- movifnidn t0d, r7m
-%endmacro
-
-%macro BIWEIGHT_SETUP 0
- lea t0, [t0*4+1] ; (offset<<2)+1
- or t0, 1
- shl r6, 16
- or r5, r6
- movd m4, r5d ; weightd | weights
- movd m5, t0d ; ((offset<<2)+1)|1
- movd m6, r4m ; log2_denom
- pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom
- paddd m6, [sq_1]
- pshufd m4, m4, 0
- pshufd m5, m5, 0
- mova m3, [pw_pixel_max]
- movifnidn r3d, r3m
-%if notcpuflag(sse4)
- pxor m7, m7
-%endif
-%endmacro
-
-%macro BIWEIGHT 1-2
-%if %0==1
- mova m0, [r0+%1]
- mova m1, [r1+%1]
- punpckhwd m2, m0, m1
- punpcklwd m0, m1
-%else
- movq m0, [r0+%1]
- movq m1, [r1+%1]
- punpcklwd m0, m1
- movq m2, [r0+%2]
- movq m1, [r1+%2]
- punpcklwd m2, m1
-%endif
- pmaddwd m0, m4
- pmaddwd m2, m4
- paddd m0, m5
- paddd m2, m5
- psrad m0, m6
- psrad m2, m6
-%if cpuflag(sse4)
- packusdw m0, m2
- pminsw m0, m3
-%else
- packssdw m0, m2
- CLIPW m0, m7, m3
-%endif
-%endmacro
-
-%macro BIWEIGHT_FUNC_DBL 0
-cglobal h264_biweight_16_10
- BIWEIGHT_PROLOGUE
- BIWEIGHT_SETUP
-.nextrow:
- BIWEIGHT 0
- mova [r0 ], m0
- BIWEIGHT 16
- mova [r0+16], m0
- add r0, r2
- add r1, r2
- dec r3d
- jnz .nextrow
- REP_RET
-%endmacro
-
-INIT_XMM sse2
-BIWEIGHT_FUNC_DBL
-INIT_XMM sse4
-BIWEIGHT_FUNC_DBL
-
-%macro BIWEIGHT_FUNC 0
-cglobal h264_biweight_8_10
- BIWEIGHT_PROLOGUE
- BIWEIGHT_SETUP
-.nextrow:
- BIWEIGHT 0
- mova [r0], m0
- add r0, r2
- add r1, r2
- dec r3d
- jnz .nextrow
- REP_RET
-%endmacro
-
-INIT_XMM sse2
-BIWEIGHT_FUNC
-INIT_XMM sse4
-BIWEIGHT_FUNC
-
-%macro BIWEIGHT_FUNC_HALF 0
-cglobal h264_biweight_4_10
- BIWEIGHT_PROLOGUE
- BIWEIGHT_SETUP
- sar r3d, 1
- lea r4, [r2*2]
-.nextrow:
- BIWEIGHT 0, r2
- movh [r0 ], m0
- movhps [r0+r2], m0
- add r0, r4
- add r1, r4
- dec r3d
- jnz .nextrow
- REP_RET
-%endmacro
-
-INIT_XMM sse2
-BIWEIGHT_FUNC_HALF
-INIT_XMM sse4
-BIWEIGHT_FUNC_HALF
diff --git a/ffmpeg/libavcodec/x86/h264chroma_init.c b/ffmpeg/libavcodec/x86/h264chroma_init.c
deleted file mode 100644
index 3d8d5b0..0000000
--- a/ffmpeg/libavcodec/x86/h264chroma_init.c
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/h264chroma.h"
-
-void ff_put_h264_chroma_mc8_rnd_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-
-void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-void ff_avg_h264_chroma_mc4_mmxext (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-
-void ff_put_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-void ff_avg_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-
-void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-
-void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-
-#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
-void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
- (uint8_t *dst, uint8_t *src, \
- int stride, int h, int x, int y);
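-/* For example, CHROMA_MC(put, 2, 10, mmxext) declares
- * ff_put_h264_chroma_mc2_10_mmxext(). */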
-
-CHROMA_MC(put, 2, 10, mmxext)
-CHROMA_MC(avg, 2, 10, mmxext)
-CHROMA_MC(put, 4, 10, mmxext)
-CHROMA_MC(avg, 4, 10, mmxext)
-CHROMA_MC(put, 8, 10, sse2)
-CHROMA_MC(avg, 8, 10, sse2)
-CHROMA_MC(put, 8, 10, avx)
-CHROMA_MC(avg, 8, 10, avx)
-
-av_cold void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth)
-{
-#if HAVE_YASM
- int high_bit_depth = bit_depth > 8;
- int cpu_flags = av_get_cpu_flags();
-
- if (EXTERNAL_MMX(cpu_flags) && !high_bit_depth) {
- c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
- c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
- }
-
- if (EXTERNAL_AMD3DNOW(cpu_flags) && !high_bit_depth) {
- c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
- c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
- }
-
- if (EXTERNAL_MMXEXT(cpu_flags) && !high_bit_depth) {
- c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
- c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
- c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
- c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
- }
-
- if (EXTERNAL_MMXEXT(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
- c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
- c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
- c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
- c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
- }
-
- if (EXTERNAL_SSE2(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
- c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
- c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
- }
-
- if (EXTERNAL_SSSE3(cpu_flags) && !high_bit_depth) {
- c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
- c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
- c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
- c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
- }
-
- if (EXTERNAL_AVX(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
- // AVX implies !cache64.
- // TODO: Port cache(32|64) detection from x264.
- c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
- c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
- }
-#endif
-}
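The deleted mc8/mc4/mc2 kernels declared above all implement H.264's bilinear chroma
interpolation from the four neighbouring samples. A scalar sketch of the 8-bit "put"
case, assuming the standard formula and using a hypothetical name (illustrative only,
not code from this tree):

    #include <stdint.h>

    /* x, y are the 1/8-pel fractional offsets (0..7); block_w is 2, 4 or 8. */
    static void put_chroma_mc_ref(uint8_t *dst, const uint8_t *src,
                                  int stride, int block_w, int h, int x, int y)
    {
        const int a = (8 - x) * (8 - y);
        const int b =      x  * (8 - y);
        const int c = (8 - x) *      y;
        const int d =      x  *      y;

        for (int i = 0; i < h; i++) {
            for (int j = 0; j < block_w; j++)
                dst[j] = (a * src[j]          + b * src[j + 1] +
                          c * src[j + stride] + d * src[j + stride + 1] + 32) >> 6;
            dst += stride;
            src += stride;
        }
    }

The "avg" variants additionally average the result with the bytes already present in dst.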
diff --git a/ffmpeg/libavcodec/x86/h264dsp_init.c b/ffmpeg/libavcodec/x86/h264dsp_init.c
deleted file mode 100644
index 30801c4..0000000
--- a/ffmpeg/libavcodec/x86/h264dsp_init.c
+++ /dev/null
@@ -1,371 +0,0 @@
-/*
- * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/h264dsp.h"
-#include "dsputil_x86.h"
-
-/***********************************/
-/* IDCT */
-#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \
-void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \
- int16_t *block, \
- int stride);
-
-IDCT_ADD_FUNC(, 8, mmx)
-IDCT_ADD_FUNC(, 10, sse2)
-IDCT_ADD_FUNC(_dc, 8, mmxext)
-IDCT_ADD_FUNC(_dc, 10, mmxext)
-IDCT_ADD_FUNC(8_dc, 8, mmxext)
-IDCT_ADD_FUNC(8_dc, 10, sse2)
-IDCT_ADD_FUNC(8, 8, mmx)
-IDCT_ADD_FUNC(8, 8, sse2)
-IDCT_ADD_FUNC(8, 10, sse2)
-IDCT_ADD_FUNC(, 10, avx)
-IDCT_ADD_FUNC(8_dc, 10, avx)
-IDCT_ADD_FUNC(8, 10, avx)
-
-
-#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \
-void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
- (uint8_t *dst, const int *block_offset, \
- int16_t *block, int stride, const uint8_t nnzc[6 * 8]);
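-/* For example, IDCT_ADD_REP_FUNC(8, 4, 8, mmx) declares
- * ff_h264_idct8_add4_8_mmx(). */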
-
-IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
-IDCT_ADD_REP_FUNC(8, 4, 8, mmxext)
-IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
-IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
-IDCT_ADD_REP_FUNC(8, 4, 10, avx)
-IDCT_ADD_REP_FUNC(, 16, 8, mmx)
-IDCT_ADD_REP_FUNC(, 16, 8, mmxext)
-IDCT_ADD_REP_FUNC(, 16, 8, sse2)
-IDCT_ADD_REP_FUNC(, 16, 10, sse2)
-IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
-IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
-IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
-IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
-IDCT_ADD_REP_FUNC(, 16, 10, avx)
-IDCT_ADD_REP_FUNC(, 16intra, 10, avx)
-
-
-#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \
-void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
- (uint8_t **dst, const int *block_offset, \
- int16_t *block, int stride, const uint8_t nnzc[6 * 8]);
-
-IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
-IDCT_ADD_REP_FUNC2(, 8, 8, mmxext)
-IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
-IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
-IDCT_ADD_REP_FUNC2(, 8, 10, avx)
-
-void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul);
-void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul);
-
-/***********************************/
-/* deblocking */
-
-void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4], uint8_t nnz[40],
- int8_t ref[2][40],
- int16_t mv[2][40][2],
- int bidir, int edges, int step,
- int mask_mv0, int mask_mv1, int field);
-
-#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
-void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
- int stride, \
- int alpha, \
- int beta, \
- int8_t *tc0);
-#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
-void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
- int stride, \
- int alpha, \
- int beta);
-
-#define LF_FUNCS(type, depth) \
-LF_FUNC(h, chroma, depth, mmxext) \
-LF_IFUNC(h, chroma_intra, depth, mmxext) \
-LF_FUNC(v, chroma, depth, mmxext) \
-LF_IFUNC(v, chroma_intra, depth, mmxext) \
-LF_FUNC(h, luma, depth, mmxext) \
-LF_IFUNC(h, luma_intra, depth, mmxext) \
-LF_FUNC(h, luma, depth, sse2) \
-LF_IFUNC(h, luma_intra, depth, sse2) \
-LF_FUNC(v, luma, depth, sse2) \
-LF_IFUNC(v, luma_intra, depth, sse2) \
-LF_FUNC(h, chroma, depth, sse2) \
-LF_IFUNC(h, chroma_intra, depth, sse2) \
-LF_FUNC(v, chroma, depth, sse2) \
-LF_IFUNC(v, chroma_intra, depth, sse2) \
-LF_FUNC(h, luma, depth, avx) \
-LF_IFUNC(h, luma_intra, depth, avx) \
-LF_FUNC(v, luma, depth, avx) \
-LF_IFUNC(v, luma_intra, depth, avx) \
-LF_FUNC(h, chroma, depth, avx) \
-LF_IFUNC(h, chroma_intra, depth, avx) \
-LF_FUNC(v, chroma, depth, avx) \
-LF_IFUNC(v, chroma_intra, depth, avx)
-
-LF_FUNCS(uint8_t, 8)
-LF_FUNCS(uint16_t, 10)
-
-#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
-LF_FUNC(v8, luma, 8, mmxext)
-static void deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha,
- int beta, int8_t *tc0)
-{
- if ((tc0[0] & tc0[1]) >= 0)
- ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0);
- if ((tc0[2] & tc0[3]) >= 0)
- ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2);
-}
-LF_IFUNC(v8, luma_intra, 8, mmxext)
-static void deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride,
- int alpha, int beta)
-{
- ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta);
- ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta);
-}
-#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
-
-LF_FUNC(v, luma, 10, mmxext)
-LF_IFUNC(v, luma_intra, 10, mmxext)
-
-/***********************************/
-/* weighted prediction */
-
-#define H264_WEIGHT(W, OPT) \
-void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, int stride, \
- int height, int log2_denom, \
- int weight, int offset);
-
-#define H264_BIWEIGHT(W, OPT) \
-void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \
- int stride, int height, \
- int log2_denom, int weightd, \
- int weights, int offset);
-
-#define H264_BIWEIGHT_MMX(W) \
- H264_WEIGHT(W, mmxext) \
- H264_BIWEIGHT(W, mmxext)
-
-#define H264_BIWEIGHT_MMX_SSE(W) \
- H264_BIWEIGHT_MMX(W) \
- H264_WEIGHT(W, sse2) \
- H264_BIWEIGHT(W, sse2) \
- H264_BIWEIGHT(W, ssse3)
-
-H264_BIWEIGHT_MMX_SSE(16)
-H264_BIWEIGHT_MMX_SSE(8)
-H264_BIWEIGHT_MMX(4)
-
-#define H264_WEIGHT_10(W, DEPTH, OPT) \
-void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
- int stride, \
- int height, \
- int log2_denom, \
- int weight, \
- int offset);
-
-#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
-void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
- uint8_t *src, \
- int stride, \
- int height, \
- int log2_denom, \
- int weightd, \
- int weights, \
- int offset);
-
-#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
- H264_WEIGHT_10(W, DEPTH, sse2) \
- H264_WEIGHT_10(W, DEPTH, sse4) \
- H264_BIWEIGHT_10(W, DEPTH, sse2) \
- H264_BIWEIGHT_10(W, DEPTH, sse4)
-
-H264_BIWEIGHT_10_SSE(16, 10)
-H264_BIWEIGHT_10_SSE(8, 10)
-H264_BIWEIGHT_10_SSE(4, 10)
-
-av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
- const int chroma_format_idc)
-{
-#if HAVE_YASM
- int cpu_flags = av_get_cpu_flags();
-
- if (chroma_format_idc == 1 && EXTERNAL_MMXEXT(cpu_flags))
- c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmxext;
-
- if (bit_depth == 8) {
- if (EXTERNAL_MMX(cpu_flags)) {
- c->h264_idct_dc_add =
- c->h264_idct_add = ff_h264_idct_add_8_mmx;
- c->h264_idct8_dc_add =
- c->h264_idct8_add = ff_h264_idct8_add_8_mmx;
-
- c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
- c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
- if (chroma_format_idc == 1)
- c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
- c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
- if (cpu_flags & AV_CPU_FLAG_CMOV)
- c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
- }
- if (EXTERNAL_MMXEXT(cpu_flags)) {
- c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmxext;
- c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
- c->h264_idct_add16 = ff_h264_idct_add16_8_mmxext;
- c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmxext;
- if (chroma_format_idc == 1)
- c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext;
- c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext;
-
- c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_mmxext;
- c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext;
- if (chroma_format_idc == 1) {
- c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmxext;
- c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
- }
-#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
- c->h264_v_loop_filter_luma = deblock_v_luma_8_mmxext;
- c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmxext;
- c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext;
- c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
-#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
- c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext;
- c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext;
- c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;
-
- c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
- c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
- c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;
- }
- if (EXTERNAL_SSE2(cpu_flags)) {
- c->h264_idct8_add = ff_h264_idct8_add_8_sse2;
-
- c->h264_idct_add16 = ff_h264_idct_add16_8_sse2;
- c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2;
- if (chroma_format_idc == 1)
- c->h264_idct_add8 = ff_h264_idct_add8_8_sse2;
- c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2;
- c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2;
-
- c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2;
- c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2;
-
- c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
- c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;
-
- c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
- c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
- c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
- c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
- }
- if (EXTERNAL_SSSE3(cpu_flags)) {
- c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
- c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
- }
- if (EXTERNAL_AVX(cpu_flags)) {
- c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
- c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
- c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
- c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
- }
- } else if (bit_depth == 10) {
- if (EXTERNAL_MMXEXT(cpu_flags)) {
-#if ARCH_X86_32
- c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext;
- c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
- c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext;
- c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext;
- c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
- c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
-#endif /* ARCH_X86_32 */
- c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext;
- }
- if (EXTERNAL_SSE2(cpu_flags)) {
- c->h264_idct_add = ff_h264_idct_add_10_sse2;
- c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;
-
- c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
- if (chroma_format_idc == 1)
- c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
- c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
-#if HAVE_ALIGNED_STACK
- c->h264_idct8_add = ff_h264_idct8_add_10_sse2;
- c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
-#endif /* HAVE_ALIGNED_STACK */
-
- c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
- c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
- c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
-
- c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
- c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
- c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
-
- c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_sse2;
- c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
-#if HAVE_ALIGNED_STACK
- c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
- c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
- c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
- c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
-#endif /* HAVE_ALIGNED_STACK */
- }
- if (EXTERNAL_SSE4(cpu_flags)) {
- c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
- c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
- c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
-
- c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
- c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
- c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
- }
- if (EXTERNAL_AVX(cpu_flags)) {
- c->h264_idct_dc_add =
- c->h264_idct_add = ff_h264_idct_add_10_avx;
- c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;
-
- c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
- if (chroma_format_idc == 1)
- c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
- c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
-#if HAVE_ALIGNED_STACK
- c->h264_idct8_add = ff_h264_idct8_add_10_avx;
- c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx;
-#endif /* HAVE_ALIGNED_STACK */
-
- c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_avx;
- c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
-#if HAVE_ALIGNED_STACK
- c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
- c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
- c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
- c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
-#endif /* HAVE_ALIGNED_STACK */
- }
- }
-#endif
-}
diff --git a/ffmpeg/libavcodec/x86/hpeldsp.asm b/ffmpeg/libavcodec/x86/hpeldsp.asm
deleted file mode 100644
index 4eaba6e..0000000
--- a/ffmpeg/libavcodec/x86/hpeldsp.asm
+++ /dev/null
@@ -1,461 +0,0 @@
-;******************************************************************************
-;*
-;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
-;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
-;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
-;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
-;* Copyright (c) 2013 Daniel Kang
-;*
-;* MMX optimized hpel functions
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-cextern pb_1
-
-SECTION_TEXT
-
-; put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro PUT_PIXELS8_X2 0
-cglobal put_pixels8_x2, 4,5
- lea r4, [r2*2]
-.loop:
- mova m0, [r1]
- mova m1, [r1+r2]
- PAVGB m0, [r1+1]
- PAVGB m1, [r1+r2+1]
- mova [r0], m0
- mova [r0+r2], m1
- add r1, r4
- add r0, r4
- mova m0, [r1]
- mova m1, [r1+r2]
- PAVGB m0, [r1+1]
- PAVGB m1, [r1+r2+1]
- add r1, r4
- mova [r0], m0
- mova [r0+r2], m1
- add r0, r4
- sub r3d, 4
- jne .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-PUT_PIXELS8_X2
-INIT_MMX 3dnow
-PUT_PIXELS8_X2
-
-
-; put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro PUT_PIXELS_16 0
-cglobal put_pixels16_x2, 4,5
- lea r4, [r2*2]
-.loop:
- mova m0, [r1]
- mova m1, [r1+r2]
- mova m2, [r1+8]
- mova m3, [r1+r2+8]
- PAVGB m0, [r1+1]
- PAVGB m1, [r1+r2+1]
- PAVGB m2, [r1+9]
- PAVGB m3, [r1+r2+9]
- mova [r0], m0
- mova [r0+r2], m1
- mova [r0+8], m2
- mova [r0+r2+8], m3
- add r1, r4
- add r0, r4
- mova m0, [r1]
- mova m1, [r1+r2]
- mova m2, [r1+8]
- mova m3, [r1+r2+8]
- PAVGB m0, [r1+1]
- PAVGB m1, [r1+r2+1]
- PAVGB m2, [r1+9]
- PAVGB m3, [r1+r2+9]
- add r1, r4
- mova [r0], m0
- mova [r0+r2], m1
- mova [r0+8], m2
- mova [r0+r2+8], m3
- add r0, r4
- sub r3d, 4
- jne .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-PUT_PIXELS_16
-INIT_MMX 3dnow
-PUT_PIXELS_16
-
-
-; put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro PUT_NO_RND_PIXELS8_X2 0
-cglobal put_no_rnd_pixels8_x2, 4,5
- mova m6, [pb_1]
- lea r4, [r2*2]
-.loop:
- mova m0, [r1]
- mova m2, [r1+r2]
- mova m1, [r1+1]
- mova m3, [r1+r2+1]
- add r1, r4
- psubusb m0, m6
- psubusb m2, m6
- PAVGB m0, m1
- PAVGB m2, m3
- mova [r0], m0
- mova [r0+r2], m2
- mova m0, [r1]
- mova m1, [r1+1]
- mova m2, [r1+r2]
- mova m3, [r1+r2+1]
- add r0, r4
- add r1, r4
- psubusb m0, m6
- psubusb m2, m6
- PAVGB m0, m1
- PAVGB m2, m3
- mova [r0], m0
- mova [r0+r2], m2
- add r0, r4
- sub r3d, 4
- jne .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-PUT_NO_RND_PIXELS8_X2
-INIT_MMX 3dnow
-PUT_NO_RND_PIXELS8_X2
-
-
-; put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro PUT_NO_RND_PIXELS8_X2_EXACT 0
-cglobal put_no_rnd_pixels8_x2_exact, 4,5
- lea r4, [r2*3]
- pcmpeqb m6, m6
-.loop:
- mova m0, [r1]
- mova m2, [r1+r2]
- mova m1, [r1+1]
- mova m3, [r1+r2+1]
- pxor m0, m6
- pxor m2, m6
- pxor m1, m6
- pxor m3, m6
- PAVGB m0, m1
- PAVGB m2, m3
- pxor m0, m6
- pxor m2, m6
- mova [r0], m0
- mova [r0+r2], m2
- mova m0, [r1+r2*2]
- mova m1, [r1+r2*2+1]
- mova m2, [r1+r4]
- mova m3, [r1+r4+1]
- pxor m0, m6
- pxor m1, m6
- pxor m2, m6
- pxor m3, m6
- PAVGB m0, m1
- PAVGB m2, m3
- pxor m0, m6
- pxor m2, m6
- mova [r0+r2*2], m0
- mova [r0+r4], m2
- lea r1, [r1+r2*4]
- lea r0, [r0+r2*4]
- sub r3d, 4
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-PUT_NO_RND_PIXELS8_X2_EXACT
-INIT_MMX 3dnow
-PUT_NO_RND_PIXELS8_X2_EXACT
-
-
-; put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro PUT_PIXELS8_Y2 0
-cglobal put_pixels8_y2, 4,5
- lea r4, [r2*2]
- mova m0, [r1]
- sub r0, r2
-.loop:
- mova m1, [r1+r2]
- mova m2, [r1+r4]
- add r1, r4
- PAVGB m0, m1
- PAVGB m1, m2
- mova [r0+r2], m0
- mova [r0+r4], m1
- mova m1, [r1+r2]
- mova m0, [r1+r4]
- add r0, r4
- add r1, r4
- PAVGB m2, m1
- PAVGB m1, m0
- mova [r0+r2], m2
- mova [r0+r4], m1
- add r0, r4
- sub r3d, 4
- jne .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-PUT_PIXELS8_Y2
-INIT_MMX 3dnow
-PUT_PIXELS8_Y2
-
-
-; put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro PUT_NO_RND_PIXELS8_Y2 0
-cglobal put_no_rnd_pixels8_y2, 4,5
- mova m6, [pb_1]
- lea r4, [r2+r2]
- mova m0, [r1]
- sub r0, r2
-.loop:
- mova m1, [r1+r2]
- mova m2, [r1+r4]
- add r1, r4
- psubusb m1, m6
- PAVGB m0, m1
- PAVGB m1, m2
- mova [r0+r2], m0
- mova [r0+r4], m1
- mova m1, [r1+r2]
- mova m0, [r1+r4]
- add r0, r4
- add r1, r4
- psubusb m1, m6
- PAVGB m2, m1
- PAVGB m1, m0
- mova [r0+r2], m2
- mova [r0+r4], m1
- add r0, r4
- sub r3d, 4
- jne .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-PUT_NO_RND_PIXELS8_Y2
-INIT_MMX 3dnow
-PUT_NO_RND_PIXELS8_Y2
-
-
-; put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0
-cglobal put_no_rnd_pixels8_y2_exact, 4,5
- lea r4, [r2*3]
- mova m0, [r1]
- pcmpeqb m6, m6
- add r1, r2
- pxor m0, m6
-.loop:
- mova m1, [r1]
- mova m2, [r1+r2]
- pxor m1, m6
- pxor m2, m6
- PAVGB m0, m1
- PAVGB m1, m2
- pxor m0, m6
- pxor m1, m6
- mova [r0], m0
- mova [r0+r2], m1
- mova m1, [r1+r2*2]
- mova m0, [r1+r4]
- pxor m1, m6
- pxor m0, m6
- PAVGB m2, m1
- PAVGB m1, m0
- pxor m2, m6
- pxor m1, m6
- mova [r0+r2*2], m2
- mova [r0+r4], m1
- lea r1, [r1+r2*4]
- lea r0, [r0+r2*4]
- sub r3d, 4
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-PUT_NO_RND_PIXELS8_Y2_EXACT
-INIT_MMX 3dnow
-PUT_NO_RND_PIXELS8_Y2_EXACT
-
-
-; avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro AVG_PIXELS8 0
-cglobal avg_pixels8, 4,5
- lea r4, [r2*2]
-.loop:
- mova m0, [r0]
- mova m1, [r0+r2]
- PAVGB m0, [r1]
- PAVGB m1, [r1+r2]
- mova [r0], m0
- mova [r0+r2], m1
- add r1, r4
- add r0, r4
- mova m0, [r0]
- mova m1, [r0+r2]
- PAVGB m0, [r1]
- PAVGB m1, [r1+r2]
- add r1, r4
- mova [r0], m0
- mova [r0+r2], m1
- add r0, r4
- sub r3d, 4
- jne .loop
- REP_RET
-%endmacro
-
-INIT_MMX 3dnow
-AVG_PIXELS8
-
-
-; avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro AVG_PIXELS8_X2 0
-cglobal avg_pixels8_x2, 4,5
- lea r4, [r2*2]
-.loop:
- mova m0, [r1]
- mova m2, [r1+r2]
- PAVGB m0, [r1+1]
- PAVGB m2, [r1+r2+1]
- PAVGB m0, [r0]
- PAVGB m2, [r0+r2]
- add r1, r4
- mova [r0], m0
- mova [r0+r2], m2
- mova m0, [r1]
- mova m2, [r1+r2]
- PAVGB m0, [r1+1]
- PAVGB m2, [r1+r2+1]
- add r0, r4
- add r1, r4
- PAVGB m0, [r0]
- PAVGB m2, [r0+r2]
- mova [r0], m0
- mova [r0+r2], m2
- add r0, r4
- sub r3d, 4
- jne .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-AVG_PIXELS8_X2
-INIT_MMX 3dnow
-AVG_PIXELS8_X2
-
-
-; avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro AVG_PIXELS8_Y2 0
-cglobal avg_pixels8_y2, 4,5
- lea r4, [r2*2]
- mova m0, [r1]
- sub r0, r2
-.loop:
- mova m1, [r1+r2]
- mova m2, [r1+r4]
- add r1, r4
- PAVGB m0, m1
- PAVGB m1, m2
- mova m3, [r0+r2]
- mova m4, [r0+r4]
- PAVGB m0, m3
- PAVGB m1, m4
- mova [r0+r2], m0
- mova [r0+r4], m1
- mova m1, [r1+r2]
- mova m0, [r1+r4]
- PAVGB m2, m1
- PAVGB m1, m0
- add r0, r4
- add r1, r4
- mova m3, [r0+r2]
- mova m4, [r0+r4]
- PAVGB m2, m3
- PAVGB m1, m4
- mova [r0+r2], m2
- mova [r0+r4], m1
- add r0, r4
- sub r3d, 4
- jne .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-AVG_PIXELS8_Y2
-INIT_MMX 3dnow
-AVG_PIXELS8_Y2
-
-
-; avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro AVG_PIXELS8_XY2 0
-cglobal avg_pixels8_xy2, 4,5
- mova m6, [pb_1]
- lea r4, [r2*2]
- mova m0, [r1]
- PAVGB m0, [r1+1]
-.loop:
- mova m2, [r1+r4]
- mova m1, [r1+r2]
- psubusb m2, m6
- PAVGB m1, [r1+r2+1]
- PAVGB m2, [r1+r4+1]
- add r1, r4
- PAVGB m0, m1
- PAVGB m1, m2
- PAVGB m0, [r0]
- PAVGB m1, [r0+r2]
- mova [r0], m0
- mova [r0+r2], m1
- mova m1, [r1+r2]
- mova m0, [r1+r4]
- PAVGB m1, [r1+r2+1]
- PAVGB m0, [r1+r4+1]
- add r0, r4
- add r1, r4
- PAVGB m2, m1
- PAVGB m1, m0
- PAVGB m2, [r0]
- PAVGB m1, [r0+r2]
- mova [r0], m2
- mova [r0+r2], m1
- add r0, r4
- sub r3d, 4
- jne .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-AVG_PIXELS8_XY2
-INIT_MMX 3dnow
-AVG_PIXELS8_XY2
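The x2/y2 kernels above average each sample with its right or lower neighbour: PAVGB
gives the rounded average, while the psubusb/pcmpeqb tricks provide the no-rounding
and bit-exact variants. A scalar sketch of the rounded put_pixels8_x2 case, with
hypothetical names and purely for illustration:

    #include <stddef.h>
    #include <stdint.h>

    /* Rounded half-pel average, i.e. what PAVGB computes per byte. */
    static inline uint8_t avg_rnd(uint8_t a, uint8_t b) { return (a + b + 1) >> 1; }

    static void put_pixels8_x2_ref(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
    {
        for (int i = 0; i < h; i++) {
            for (int j = 0; j < 8; j++)
                block[j] = avg_rnd(pixels[j], pixels[j + 1]);
            block  += line_size;
            pixels += line_size;
        }
    }

The no_rnd variants use the floor average (a + b) >> 1 instead (the _exact versions
compute it bit-exactly, as VP3/Theora requires), and the xy2 kernels do roughly the
same averaging over the 2x2 neighbourhood.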
diff --git a/ffmpeg/libavcodec/x86/hpeldsp_init.c b/ffmpeg/libavcodec/x86/hpeldsp_init.c
deleted file mode 100644
index 8ecf909..0000000
--- a/ffmpeg/libavcodec/x86/hpeldsp_init.c
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * MMX optimized DSP utils
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
- */
-
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/hpeldsp.h"
-#include "dsputil_x86.h"
-
-void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
- const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
- const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
- const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
- const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-
-#define avg_pixels8_mmx ff_avg_pixels8_mmx
-#define avg_pixels8_x2_mmx ff_avg_pixels8_x2_mmx
-#define avg_pixels16_mmx ff_avg_pixels16_mmx
-#define avg_pixels8_xy2_mmx ff_avg_pixels8_xy2_mmx
-#define avg_pixels16_xy2_mmx ff_avg_pixels16_xy2_mmx
-#define put_pixels8_mmx ff_put_pixels8_mmx
-#define put_pixels16_mmx ff_put_pixels16_mmx
-#define put_pixels8_xy2_mmx ff_put_pixels8_xy2_mmx
-#define put_pixels16_xy2_mmx ff_put_pixels16_xy2_mmx
-#define avg_no_rnd_pixels16_mmx ff_avg_pixels16_mmx
-#define put_no_rnd_pixels8_mmx ff_put_pixels8_mmx
-#define put_no_rnd_pixels16_mmx ff_put_pixels16_mmx
-
-#if HAVE_INLINE_ASM
-
-/***********************************/
-/* MMX no rounding */
-#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
-#define SET_RND MOVQ_WONE
-#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
-#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
-#define STATIC static
-
-#include "rnd_template.c"
-#include "hpeldsp_rnd_template.c"
-
-#undef DEF
-#undef SET_RND
-#undef PAVGBP
-#undef PAVGB
-#undef STATIC
-
-PIXELS16(static, avg_no_rnd, , _y2, _mmx)
-PIXELS16(static, put_no_rnd, , _y2, _mmx)
-
-PIXELS16(static, avg_no_rnd, , _xy2, _mmx)
-PIXELS16(static, put_no_rnd, , _xy2, _mmx)
-
-/***********************************/
-/* MMX rounding */
-
-#define DEF(x, y) x ## _ ## y ## _mmx
-#define SET_RND MOVQ_WTWO
-#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
-#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
-
-#include "hpeldsp_rnd_template.c"
-
-#undef DEF
-#undef SET_RND
-#undef PAVGBP
-#undef PAVGB
-
-PIXELS16(static, avg, , _y2, _mmx)
-PIXELS16(static, put, , _y2, _mmx)
-
-#endif /* HAVE_INLINE_ASM */
-
-
-#if HAVE_YASM
-
-#define HPELDSP_AVG_PIXELS16(CPUEXT) \
- PIXELS16(static, put_no_rnd, ff_, _x2, CPUEXT) \
- PIXELS16(static, put, ff_, _y2, CPUEXT) \
- PIXELS16(static, put_no_rnd, ff_, _y2, CPUEXT) \
- PIXELS16(static, avg, ff_, , CPUEXT) \
- PIXELS16(static, avg, ff_, _x2, CPUEXT) \
- PIXELS16(static, avg, ff_, _y2, CPUEXT) \
- PIXELS16(static, avg, ff_, _xy2, CPUEXT)
-
-HPELDSP_AVG_PIXELS16(_3dnow)
-HPELDSP_AVG_PIXELS16(_mmxext)
-
-#endif /* HAVE_YASM */
-
-#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
- do { \
- c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
- c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
- c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
- c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
- } while (0)
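-/* For example, SET_HPEL_FUNCS(put, [0], 16, mmx) assigns put_pixels16_mmx,
- * put_pixels16_x2_mmx, put_pixels16_y2_mmx and put_pixels16_xy2_mmx to
- * c->put_pixels_tab[0][0..3]. */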
-
-static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int cpu_flags)
-{
-#if HAVE_MMX_INLINE
- SET_HPEL_FUNCS(put, [0], 16, mmx);
- SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
- SET_HPEL_FUNCS(avg, [0], 16, mmx);
- SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
- SET_HPEL_FUNCS(put, [1], 8, mmx);
- SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
- SET_HPEL_FUNCS(avg, [1], 8, mmx);
-#endif /* HAVE_MMX_INLINE */
-}
-
-static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
-{
-#if HAVE_MMXEXT_EXTERNAL
- c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
- c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
-
- c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
- c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
- c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
-
- c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
- c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
-
- c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
- c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
- c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
-
- if (!(flags & CODEC_FLAG_BITEXACT)) {
- c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
- c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
- c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
- c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
-
- c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
- c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
- }
-
- if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
- c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
- c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
- }
-#endif /* HAVE_MMXEXT_EXTERNAL */
-}
-
-static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags)
-{
-#if HAVE_AMD3DNOW_EXTERNAL
- c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
- c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
-
- c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
- c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
- c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
-
- c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
- c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
-
- c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
- c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
- c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
-
- if (!(flags & CODEC_FLAG_BITEXACT)){
- c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
- c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
- c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
- c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
-
- c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
- c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
- }
-
- if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
- c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
- c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
- }
-#endif /* HAVE_AMD3DNOW_EXTERNAL */
-}
-
-static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags)
-{
-#if HAVE_SSE2_EXTERNAL
- if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
- // these functions are slower than mmx on AMD, but faster on Intel
- c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
- c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
- c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
- }
-#endif /* HAVE_SSE2_EXTERNAL */
-}
-
-void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (INLINE_MMX(cpu_flags))
- hpeldsp_init_mmx(c, flags, cpu_flags);
-
- if (EXTERNAL_AMD3DNOW(cpu_flags))
- hpeldsp_init_3dnow(c, flags, cpu_flags);
-
- if (EXTERNAL_MMXEXT(cpu_flags))
- hpeldsp_init_mmxext(c, flags, cpu_flags);
-
- if (EXTERNAL_SSE2(cpu_flags))
- hpeldsp_init_sse2(c, flags, cpu_flags);
-}
diff --git a/ffmpeg/libavcodec/x86/hpeldsp_rnd_template.c b/ffmpeg/libavcodec/x86/hpeldsp_rnd_template.c
deleted file mode 100644
index 94e06d8..0000000
--- a/ffmpeg/libavcodec/x86/hpeldsp_rnd_template.c
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * DSP utils mmx functions are compiled twice for rnd/no_rnd
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
- * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
- * and improved by Zdenek Kabelac <kabi@users.sf.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-// put_pixels
-static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r"((x86_reg)line_size)
- :REG_a, "memory");
-}
-
-static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "movq 8(%1), %%mm0 \n\t"
- "movq 9(%1), %%mm1 \n\t"
- "movq 8(%1, %3), %%mm2 \n\t"
- "movq 9(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, 8(%2) \n\t"
- "movq %%mm5, 8(%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "movq 8(%1), %%mm0 \n\t"
- "movq 9(%1), %%mm1 \n\t"
- "movq 8(%1, %3), %%mm2 \n\t"
- "movq 9(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, 8(%2) \n\t"
- "movq %%mm5, 8(%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r"((x86_reg)line_size)
- :REG_a, "memory");
-}
-
-static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "movq (%1), %%mm0 \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"),%%mm2 \n\t"
- PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"),%%mm0 \n\t"
- PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r"((x86_reg)line_size)
- :REG_a, "memory");
-}
-
-static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- JUMPALIGN();
- do {
- __asm__ volatile(
- "movq %1, %%mm0 \n\t"
- "movq 1%1, %%mm1 \n\t"
- "movq %0, %%mm3 \n\t"
- PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
- PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
- "movq %%mm0, %0 \n\t"
- "movq 8%1, %%mm0 \n\t"
- "movq 9%1, %%mm1 \n\t"
- "movq 8%0, %%mm3 \n\t"
- PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
- PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
- "movq %%mm0, 8%0 \n\t"
- :"+m"(*block)
- :"m"(*pixels)
- :"memory");
- pixels += line_size;
- block += line_size;
- } while (--h);
-}
-
-static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "movq (%1), %%mm0 \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm2 \n\t"
- PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
- "movq (%2), %%mm3 \n\t"
- PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
- "movq (%2, %3), %%mm3 \n\t"
- PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
-
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm0 \n\t"
- PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
- "movq (%2), %%mm3 \n\t"
- PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
- "movq (%2, %3), %%mm3 \n\t"
- PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
- "movq %%mm2, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
-
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r"((x86_reg)line_size)
- :REG_a, "memory");
-}
diff --git a/ffmpeg/libavcodec/x86/idct_mmx_xvid.c b/ffmpeg/libavcodec/x86/idct_mmx_xvid.c
deleted file mode 100644
index 4cd6de1..0000000
--- a/ffmpeg/libavcodec/x86/idct_mmx_xvid.c
+++ /dev/null
@@ -1,562 +0,0 @@
-/*
- * XVID MPEG-4 VIDEO CODEC
- * - MMX and XMM forward discrete cosine transform -
- *
- * Copyright(C) 2001 Peter Ross <pross@xvid.org>
- *
- * Originally provided by Intel at AP-922
- * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
- * (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
- * but in a limited edition.
- * New macro implements a column part for precise iDCT
- * The routine precision now satisfies IEEE standard 1180-1990.
- *
- * Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
- * Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
- *
- * http://www.elecard.com/peter/idct.html
- * http://www.linuxvideo.org/mpeg2dec/
- *
- * These examples contain code fragments for first stage iDCT 8x8
- * (for rows) and first stage DCT 8x8 (for columns)
- *
- * conversion to gcc syntax by Michael Niedermayer
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with FFmpeg; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <inttypes.h>
-
-#include "config.h"
-#include "libavcodec/avcodec.h"
-#include "libavutil/mem.h"
-#include "dsputil_x86.h"
-#include "idct_xvid.h"
-
-#if HAVE_MMX_INLINE
-
-//=============================================================================
-// Macros and other preprocessor constants
-//=============================================================================
-
-#define BITS_INV_ACC 5 // 4 or 5 for IEEE
-#define SHIFT_INV_ROW (16 - BITS_INV_ACC) //11
-#define SHIFT_INV_COL (1 + BITS_INV_ACC) //6
-#define RND_INV_ROW (1024 * (6 - BITS_INV_ACC))
-#define RND_INV_COL (16 * (BITS_INV_ACC - 3))
-#define RND_INV_CORR (RND_INV_COL - 1)
-
-#define BITS_FRW_ACC 3 // 2 or 3 for accuracy
-#define SHIFT_FRW_COL BITS_FRW_ACC
-#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17)
-#define RND_FRW_ROW (262144*(BITS_FRW_ACC - 1))
-
-
-//-----------------------------------------------------------------------------
-// Various memory constants (trigonometric values or rounding values)
-//-----------------------------------------------------------------------------
-
-
-DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4*4] = {
-  13036,13036,13036,13036,         // tg(pi/16) * (1 << 16) + 0.5
-  27146,27146,27146,27146,         // tg(2pi/16) * (1 << 16) + 0.5
- -21746,-21746,-21746,-21746,      // (tg(3pi/16) - 1) * (1 << 16) + 0.5
-  23170,23170,23170,23170};        // cos(pi/4) * (1 << 15) + 0.5
-
-DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2*8] = {
- 65536,65536,
- 3597,3597,
- 2260,2260,
- 1203,1203,
- 0,0,
- 120,120,
- 512,512,
- 512,512};
-
-//-----------------------------------------------------------------------------
-//
-// The first stage iDCT 8x8 - inverse DCTs of rows
-//
-//-----------------------------------------------------------------------------
-// The 8-point inverse DCT direct algorithm
-//-----------------------------------------------------------------------------
-//
-// static const short w[32] = {
-// FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16),
-// FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
-// FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16),
-// FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16),
-// FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16),
-// FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
-// FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16),
-// FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) };
-//
-// #define DCT_8_INV_ROW(x, y)
-// {
-// int a0, a1, a2, a3, b0, b1, b2, b3;
-//
-// a0 =x[0]*w[0]+x[2]*w[1]+x[4]*w[2]+x[6]*w[3];
-// a1 =x[0]*w[4]+x[2]*w[5]+x[4]*w[6]+x[6]*w[7];
-// a2 = x[0] * w[ 8] + x[2] * w[ 9] + x[4] * w[10] + x[6] * w[11];
-// a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
-// b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
-// b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
-// b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
-// b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
-//
-// y[0] = SHIFT_ROUND ( a0 + b0 );
-// y[1] = SHIFT_ROUND ( a1 + b1 );
-// y[2] = SHIFT_ROUND ( a2 + b2 );
-// y[3] = SHIFT_ROUND ( a3 + b3 );
-// y[4] = SHIFT_ROUND ( a3 - b3 );
-// y[5] = SHIFT_ROUND ( a2 - b2 );
-// y[6] = SHIFT_ROUND ( a1 - b1 );
-// y[7] = SHIFT_ROUND ( a0 - b0 );
-// }
-//
-//-----------------------------------------------------------------------------
-//
-// In this implementation the outputs of the iDCT-1D are multiplied
-// for rows 0,4 - by cos_4_16,
-// for rows 1,7 - by cos_1_16,
-// for rows 2,6 - by cos_2_16,
-// for rows 3,5 - by cos_3_16
-// and are shifted to the left for better accuracy
-//
-// For the constants used,
-// FIX(float_const) = (short) (float_const * (1<<15) + 0.5)
-//
-//-----------------------------------------------------------------------------
-
-//-----------------------------------------------------------------------------
-// Tables for mmx processors
-//-----------------------------------------------------------------------------
-
-// Table for rows 0,4 - constants are multiplied by cos_4_16
-DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmx)[32*4] = {
- 16384,16384,16384,-16384, // movq-> w06 w04 w02 w00
- 21407,8867,8867,-21407, // w07 w05 w03 w01
- 16384,-16384,16384,16384, // w14 w12 w10 w08
- -8867,21407,-21407,-8867, // w15 w13 w11 w09
- 22725,12873,19266,-22725, // w22 w20 w18 w16
- 19266,4520,-4520,-12873, // w23 w21 w19 w17
- 12873,4520,4520,19266, // w30 w28 w26 w24
- -22725,19266,-12873,-22725, // w31 w29 w27 w25
-// Table for rows 1,7 - constants are multiplied by cos_1_16
- 22725,22725,22725,-22725, // movq-> w06 w04 w02 w00
- 29692,12299,12299,-29692, // w07 w05 w03 w01
- 22725,-22725,22725,22725, // w14 w12 w10 w08
- -12299,29692,-29692,-12299, // w15 w13 w11 w09
- 31521,17855,26722,-31521, // w22 w20 w18 w16
- 26722,6270,-6270,-17855, // w23 w21 w19 w17
- 17855,6270,6270,26722, // w30 w28 w26 w24
- -31521,26722,-17855,-31521, // w31 w29 w27 w25
-// Table for rows 2,6 - constants are multiplied by cos_2_16
- 21407,21407,21407,-21407, // movq-> w06 w04 w02 w00
- 27969,11585,11585,-27969, // w07 w05 w03 w01
- 21407,-21407,21407,21407, // w14 w12 w10 w08
- -11585,27969,-27969,-11585, // w15 w13 w11 w09
- 29692,16819,25172,-29692, // w22 w20 w18 w16
- 25172,5906,-5906,-16819, // w23 w21 w19 w17
- 16819,5906,5906,25172, // w30 w28 w26 w24
- -29692,25172,-16819,-29692, // w31 w29 w27 w25
-// Table for rows 3,5 - constants are multiplied by cos_3_16
- 19266,19266,19266,-19266, // movq-> w06 w04 w02 w00
- 25172,10426,10426,-25172, // w07 w05 w03 w01
- 19266,-19266,19266,19266, // w14 w12 w10 w08
- -10426,25172,-25172,-10426, // w15 w13 w11 w09
- 26722,15137,22654,-26722, // w22 w20 w18 w16
- 22654,5315,-5315,-15137, // w23 w21 w19 w17
- 15137,5315,5315,22654, // w30 w28 w26 w24
- -26722,22654,-15137,-26722, // w31 w29 w27 w25
-};
-//-----------------------------------------------------------------------------
-// Tables for xmm processors
-//-----------------------------------------------------------------------------
-
-// Table for rows 0,4 - constants are multiplied by cos_4_16
-DECLARE_ALIGNED(8, static const int16_t, tab_i_04_xmm)[32*4] = {
- 16384,21407,16384,8867, // movq-> w05 w04 w01 w00
- 16384,8867,-16384,-21407, // w07 w06 w03 w02
- 16384,-8867,16384,-21407, // w13 w12 w09 w08
- -16384,21407,16384,-8867, // w15 w14 w11 w10
- 22725,19266,19266,-4520, // w21 w20 w17 w16
- 12873,4520,-22725,-12873, // w23 w22 w19 w18
- 12873,-22725,4520,-12873, // w29 w28 w25 w24
- 4520,19266,19266,-22725, // w31 w30 w27 w26
-// Table for rows 1,7 - constants are multiplied by cos_1_16
- 22725,29692,22725,12299, // movq-> w05 w04 w01 w00
- 22725,12299,-22725,-29692, // w07 w06 w03 w02
- 22725,-12299,22725,-29692, // w13 w12 w09 w08
- -22725,29692,22725,-12299, // w15 w14 w11 w10
- 31521,26722,26722,-6270, // w21 w20 w17 w16
- 17855,6270,-31521,-17855, // w23 w22 w19 w18
- 17855,-31521,6270,-17855, // w29 w28 w25 w24
- 6270,26722,26722,-31521, // w31 w30 w27 w26
-// Table for rows 2,6 - constants are multiplied by cos_2_16
- 21407,27969,21407,11585, // movq-> w05 w04 w01 w00
- 21407,11585,-21407,-27969, // w07 w06 w03 w02
- 21407,-11585,21407,-27969, // w13 w12 w09 w08
- -21407,27969,21407,-11585, // w15 w14 w11 w10
- 29692,25172,25172,-5906, // w21 w20 w17 w16
- 16819,5906,-29692,-16819, // w23 w22 w19 w18
- 16819,-29692,5906,-16819, // w29 w28 w25 w24
- 5906,25172,25172,-29692, // w31 w30 w27 w26
-// Table for rows 3,5 - constants are multiplied by cos_3_16
- 19266,25172,19266,10426, // movq-> w05 w04 w01 w00
- 19266,10426,-19266,-25172, // w07 w06 w03 w02
- 19266,-10426,19266,-25172, // w13 w12 w09 w08
- -19266,25172,19266,-10426, // w15 w14 w11 w10
- 26722,22654,22654,-5315, // w21 w20 w17 w16
- 15137,5315,-26722,-15137, // w23 w22 w19 w18
- 15137,-26722,5315,-15137, // w29 w28 w25 w24
- 5315,22654,22654,-26722, // w31 w30 w27 w26
-};
-//=============================================================================
-// Helper macros for the code
-//=============================================================================
-
-//-----------------------------------------------------------------------------
-// DCT_8_INV_ROW_MMX(INP, OUT, TABLE, ROUNDER)
-//-----------------------------------------------------------------------------
-
-#define DCT_8_INV_ROW_MMX(A1,A2,A3,A4)\
- "movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\
- "movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\
- "movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\
- "movq " #A3 ",%%mm3 \n\t"/* 3 ; w06 w04 w02 w00*/\
- "punpcklwd %%mm1,%%mm0 \n\t"/* x5 x1 x4 x0*/\
- "movq %%mm0,%%mm5 \n\t"/* 5 ; x5 x1 x4 x0*/\
- "punpckldq %%mm0,%%mm0 \n\t"/* x4 x0 x4 x0*/\
- "movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w05 w03 w01*/\
- "punpckhwd %%mm1,%%mm2 \n\t"/* 1 ; x7 x3 x6 x2*/\
- "pmaddwd %%mm0,%%mm3 \n\t"/* x4*w06+x0*w04 x4*w02+x0*w00*/\
- "movq %%mm2,%%mm6 \n\t"/* 6 ; x7 x3 x6 x2*/\
- "movq 32+" #A3 ",%%mm1 \n\t"/* 1 ; w22 w20 w18 w16*/\
- "punpckldq %%mm2,%%mm2 \n\t"/* x6 x2 x6 x2*/\
- "pmaddwd %%mm2,%%mm4 \n\t"/* x6*w07+x2*w05 x6*w03+x2*w01*/\
- "punpckhdq %%mm5,%%mm5 \n\t"/* x5 x1 x5 x1*/\
- "pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x4*w14+x0*w12 x4*w10+x0*w08*/\
- "punpckhdq %%mm6,%%mm6 \n\t"/* x7 x3 x7 x3*/\
- "movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w21 w19 w17*/\
- "pmaddwd %%mm5,%%mm1 \n\t"/* x5*w22+x1*w20 x5*w18+x1*w16*/\
- "paddd " #A4 ",%%mm3 \n\t"/* +%4*/\
- "pmaddwd %%mm6,%%mm7 \n\t"/* x7*w23+x3*w21 x7*w19+x3*w17*/\
- "pmaddwd 24+" #A3 ",%%mm2 \n\t"/* x6*w15+x2*w13 x6*w11+x2*w09*/\
- "paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\
- "pmaddwd 48+" #A3 ",%%mm5 \n\t"/* x5*w30+x1*w28 x5*w26+x1*w24*/\
- "movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\
- "pmaddwd 56+" #A3 ",%%mm6 \n\t"/* x7*w31+x3*w29 x7*w27+x3*w25*/\
- "paddd %%mm7,%%mm1 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\
- "paddd " #A4 ",%%mm0 \n\t"/* +%4*/\
- "psubd %%mm1,%%mm3 \n\t"/* a1-b1 a0-b0*/\
- "psrad $11,%%mm3 \n\t"/* y6=a1-b1 y7=a0-b0*/\
- "paddd %%mm4,%%mm1 \n\t"/* 4 ; a1+b1 a0+b0*/\
- "paddd %%mm2,%%mm0 \n\t"/* 2 ; a3=sum(even3) a2=sum(even2)*/\
- "psrad $11,%%mm1 \n\t"/* y1=a1+b1 y0=a0+b0*/\
- "paddd %%mm6,%%mm5 \n\t"/* 6 ; b3=sum(odd3) b2=sum(odd2)*/\
- "movq %%mm0,%%mm4 \n\t"/* 4 ; a3 a2*/\
- "paddd %%mm5,%%mm0 \n\t"/* a3+b3 a2+b2*/\
- "psubd %%mm5,%%mm4 \n\t"/* 5 ; a3-b3 a2-b2*/\
- "psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\
- "psrad $11,%%mm4 \n\t"/* y4=a3-b3 y5=a2-b2*/\
- "packssdw %%mm0,%%mm1 \n\t"/* 0 ; y3 y2 y1 y0*/\
- "packssdw %%mm3,%%mm4 \n\t"/* 3 ; y6 y7 y4 y5*/\
- "movq %%mm4,%%mm7 \n\t"/* 7 ; y6 y7 y4 y5*/\
- "psrld $16,%%mm4 \n\t"/* 0 y6 0 y4*/\
- "pslld $16,%%mm7 \n\t"/* y7 0 y5 0*/\
- "movq %%mm1," #A2 " \n\t"/* 1 ; save y3 y2 y1 y0*/\
- "por %%mm4,%%mm7 \n\t"/* 4 ; y7 y6 y5 y4*/\
- "movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\
-
-
-//-----------------------------------------------------------------------------
-// DCT_8_INV_ROW_XMM(INP, OUT, TABLE, ROUNDER)
-//-----------------------------------------------------------------------------
-
-#define DCT_8_INV_ROW_XMM(A1,A2,A3,A4)\
- "movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\
- "movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\
- "movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\
- "movq " #A3 ",%%mm3 \n\t"/* 3 ; w05 w04 w01 w00*/\
- "pshufw $0x88,%%mm0,%%mm0 \n\t"/* x2 x0 x2 x0*/\
- "movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w06 w03 w02*/\
- "movq %%mm1,%%mm5 \n\t"/* 5 ; x7 x6 x5 x4*/\
- "pmaddwd %%mm0,%%mm3 \n\t"/* x2*w05+x0*w04 x2*w01+x0*w00*/\
- "movq 32+" #A3 ",%%mm6 \n\t"/* 6 ; w21 w20 w17 w16*/\
- "pshufw $0x88,%%mm1,%%mm1 \n\t"/* x6 x4 x6 x4*/\
- "pmaddwd %%mm1,%%mm4 \n\t"/* x6*w07+x4*w06 x6*w03+x4*w02*/\
- "movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w22 w19 w18*/\
- "pshufw $0xdd,%%mm2,%%mm2 \n\t"/* x3 x1 x3 x1*/\
- "pmaddwd %%mm2,%%mm6 \n\t"/* x3*w21+x1*w20 x3*w17+x1*w16*/\
- "pshufw $0xdd,%%mm5,%%mm5 \n\t"/* x7 x5 x7 x5*/\
- "pmaddwd %%mm5,%%mm7 \n\t"/* x7*w23+x5*w22 x7*w19+x5*w18*/\
- "paddd " #A4 ",%%mm3 \n\t"/* +%4*/\
- "pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x2*w13+x0*w12 x2*w09+x0*w08*/\
- "paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\
- "pmaddwd 24+" #A3 ",%%mm1 \n\t"/* x6*w15+x4*w14 x6*w11+x4*w10*/\
- "movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\
- "pmaddwd 48+" #A3 ",%%mm2 \n\t"/* x3*w29+x1*w28 x3*w25+x1*w24*/\
- "paddd %%mm7,%%mm6 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\
- "pmaddwd 56+" #A3 ",%%mm5 \n\t"/* x7*w31+x5*w30 x7*w27+x5*w26*/\
- "paddd %%mm6,%%mm3 \n\t"/* a1+b1 a0+b0*/\
- "paddd " #A4 ",%%mm0 \n\t"/* +%4*/\
- "psrad $11,%%mm3 \n\t"/* y1=a1+b1 y0=a0+b0*/\
- "paddd %%mm1,%%mm0 \n\t"/* 1 ; a3=sum(even3) a2=sum(even2)*/\
- "psubd %%mm6,%%mm4 \n\t"/* 6 ; a1-b1 a0-b0*/\
- "movq %%mm0,%%mm7 \n\t"/* 7 ; a3 a2*/\
- "paddd %%mm5,%%mm2 \n\t"/* 5 ; b3=sum(odd3) b2=sum(odd2)*/\
- "paddd %%mm2,%%mm0 \n\t"/* a3+b3 a2+b2*/\
- "psrad $11,%%mm4 \n\t"/* y6=a1-b1 y7=a0-b0*/\
- "psubd %%mm2,%%mm7 \n\t"/* 2 ; a3-b3 a2-b2*/\
- "psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\
- "psrad $11,%%mm7 \n\t"/* y4=a3-b3 y5=a2-b2*/\
- "packssdw %%mm0,%%mm3 \n\t"/* 0 ; y3 y2 y1 y0*/\
- "packssdw %%mm4,%%mm7 \n\t"/* 4 ; y6 y7 y4 y5*/\
- "movq %%mm3, " #A2 " \n\t"/* 3 ; save y3 y2 y1 y0*/\
- "pshufw $0xb1,%%mm7,%%mm7 \n\t"/* y7 y6 y5 y4*/\
- "movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\
-
-
-//-----------------------------------------------------------------------------
-//
-// The first stage DCT 8x8 - forward DCTs of columns
-//
-// The outputs are multiplied
-// for rows 0,4 - by cos_4_16,
-// for rows 1,7 - by cos_1_16,
-// for rows 2,6 - by cos_2_16,
-// for rows 3,5 - by cos_3_16
-// and are shifted to the left for better accuracy
-//
-//-----------------------------------------------------------------------------
-//
-// The 8-point scaled forward DCT algorithm (26a8m)
-//
-//-----------------------------------------------------------------------------
-//
-// #define DCT_8_FRW_COL(x, y)
-//{
-// short t0, t1, t2, t3, t4, t5, t6, t7;
-// short tp03, tm03, tp12, tm12, tp65, tm65;
-// short tp465, tm465, tp765, tm765;
-//
-// t0 = LEFT_SHIFT ( x[0] + x[7] );
-// t1 = LEFT_SHIFT ( x[1] + x[6] );
-// t2 = LEFT_SHIFT ( x[2] + x[5] );
-// t3 = LEFT_SHIFT ( x[3] + x[4] );
-// t4 = LEFT_SHIFT ( x[3] - x[4] );
-// t5 = LEFT_SHIFT ( x[2] - x[5] );
-// t6 = LEFT_SHIFT ( x[1] - x[6] );
-// t7 = LEFT_SHIFT ( x[0] - x[7] );
-//
-// tp03 = t0 + t3;
-// tm03 = t0 - t3;
-// tp12 = t1 + t2;
-// tm12 = t1 - t2;
-//
-// y[0] = tp03 + tp12;
-// y[4] = tp03 - tp12;
-//
-// y[2] = tm03 + tm12 * tg_2_16;
-// y[6] = tm03 * tg_2_16 - tm12;
-//
-// tp65 =(t6 +t5 )*cos_4_16;
-// tm65 =(t6 -t5 )*cos_4_16;
-//
-// tp765 = t7 + tp65;
-// tm765 = t7 - tp65;
-// tp465 = t4 + tm65;
-// tm465 = t4 - tm65;
-//
-// y[1] = tp765 + tp465 * tg_1_16;
-// y[7] = tp765 * tg_1_16 - tp465;
-// y[5] = tm765 * tg_3_16 + tm465;
-// y[3] = tm765 - tm465 * tg_3_16;
-//}
-//
-//-----------------------------------------------------------------------------
-
-//-----------------------------------------------------------------------------
-// DCT_8_INV_COL(INP, OUT) - processes 4 columns at a time
-//-----------------------------------------------------------------------------
-
-#define DCT_8_INV_COL(A1,A2)\
- "movq 2*8(%3),%%mm0\n\t"\
- "movq 16*3+" #A1 ",%%mm3\n\t"\
- "movq %%mm0,%%mm1 \n\t"/* tg_3_16*/\
- "movq 16*5+" #A1 ",%%mm5\n\t"\
- "pmulhw %%mm3,%%mm0 \n\t"/* x3*(tg_3_16-1)*/\
- "movq (%3),%%mm4\n\t"\
- "pmulhw %%mm5,%%mm1 \n\t"/* x5*(tg_3_16-1)*/\
- "movq 16*7+" #A1 ",%%mm7\n\t"\
- "movq %%mm4,%%mm2 \n\t"/* tg_1_16*/\
- "movq 16*1+" #A1 ",%%mm6\n\t"\
- "pmulhw %%mm7,%%mm4 \n\t"/* x7*tg_1_16*/\
- "paddsw %%mm3,%%mm0 \n\t"/* x3*tg_3_16*/\
- "pmulhw %%mm6,%%mm2 \n\t"/* x1*tg_1_16*/\
- "paddsw %%mm3,%%mm1 \n\t"/* x3+x5*(tg_3_16-1)*/\
- "psubsw %%mm5,%%mm0 \n\t"/* x3*tg_3_16-x5 = tm35*/\
- "movq 3*8(%3),%%mm3\n\t"\
- "paddsw %%mm5,%%mm1 \n\t"/* x3+x5*tg_3_16 = tp35*/\
- "paddsw %%mm6,%%mm4 \n\t"/* x1+tg_1_16*x7 = tp17*/\
- "psubsw %%mm7,%%mm2 \n\t"/* x1*tg_1_16-x7 = tm17*/\
- "movq %%mm4,%%mm5 \n\t"/* tp17*/\
- "movq %%mm2,%%mm6 \n\t"/* tm17*/\
- "paddsw %%mm1,%%mm5 \n\t"/* tp17+tp35 = b0*/\
- "psubsw %%mm0,%%mm6 \n\t"/* tm17-tm35 = b3*/\
- "psubsw %%mm1,%%mm4 \n\t"/* tp17-tp35 = t1*/\
- "paddsw %%mm0,%%mm2 \n\t"/* tm17+tm35 = t2*/\
- "movq 1*8(%3),%%mm7\n\t"\
- "movq %%mm4,%%mm1 \n\t"/* t1*/\
- "movq %%mm5,3*16 +" #A2 "\n\t"/* save b0*/\
- "paddsw %%mm2,%%mm1 \n\t"/* t1+t2*/\
- "movq %%mm6,5*16 +" #A2 "\n\t"/* save b3*/\
- "psubsw %%mm2,%%mm4 \n\t"/* t1-t2*/\
- "movq 2*16+" #A1 ",%%mm5\n\t"\
- "movq %%mm7,%%mm0 \n\t"/* tg_2_16*/\
- "movq 6*16+" #A1 ",%%mm6\n\t"\
- "pmulhw %%mm5,%%mm0 \n\t"/* x2*tg_2_16*/\
- "pmulhw %%mm6,%%mm7 \n\t"/* x6*tg_2_16*/\
- "pmulhw %%mm3,%%mm1 \n\t"/* ocos_4_16*(t1+t2) = b1/2*/\
- "movq 0*16+" #A1 ",%%mm2\n\t"\
- "pmulhw %%mm3,%%mm4 \n\t"/* ocos_4_16*(t1-t2) = b2/2*/\
- "psubsw %%mm6,%%mm0 \n\t"/* t2*tg_2_16-x6 = tm26*/\
- "movq %%mm2,%%mm3 \n\t"/* x0*/\
- "movq 4*16+" #A1 ",%%mm6\n\t"\
- "paddsw %%mm5,%%mm7 \n\t"/* x2+x6*tg_2_16 = tp26*/\
- "paddsw %%mm6,%%mm2 \n\t"/* x0+x4 = tp04*/\
- "psubsw %%mm6,%%mm3 \n\t"/* x0-x4 = tm04*/\
- "movq %%mm2,%%mm5 \n\t"/* tp04*/\
- "movq %%mm3,%%mm6 \n\t"/* tm04*/\
- "psubsw %%mm7,%%mm2 \n\t"/* tp04-tp26 = a3*/\
- "paddsw %%mm0,%%mm3 \n\t"/* tm04+tm26 = a1*/\
- "paddsw %%mm1,%%mm1 \n\t"/* b1*/\
- "paddsw %%mm4,%%mm4 \n\t"/* b2*/\
- "paddsw %%mm7,%%mm5 \n\t"/* tp04+tp26 = a0*/\
- "psubsw %%mm0,%%mm6 \n\t"/* tm04-tm26 = a2*/\
- "movq %%mm3,%%mm7 \n\t"/* a1*/\
- "movq %%mm6,%%mm0 \n\t"/* a2*/\
- "paddsw %%mm1,%%mm3 \n\t"/* a1+b1*/\
- "paddsw %%mm4,%%mm6 \n\t"/* a2+b2*/\
- "psraw $6,%%mm3 \n\t"/* dst1*/\
- "psubsw %%mm1,%%mm7 \n\t"/* a1-b1*/\
- "psraw $6,%%mm6 \n\t"/* dst2*/\
- "psubsw %%mm4,%%mm0 \n\t"/* a2-b2*/\
- "movq 3*16+" #A2 ",%%mm1 \n\t"/* load b0*/\
- "psraw $6,%%mm7 \n\t"/* dst6*/\
- "movq %%mm5,%%mm4 \n\t"/* a0*/\
- "psraw $6,%%mm0 \n\t"/* dst5*/\
- "movq %%mm3,1*16+" #A2 "\n\t"\
- "paddsw %%mm1,%%mm5 \n\t"/* a0+b0*/\
- "movq %%mm6,2*16+" #A2 "\n\t"\
- "psubsw %%mm1,%%mm4 \n\t"/* a0-b0*/\
- "movq 5*16+" #A2 ",%%mm3 \n\t"/* load b3*/\
- "psraw $6,%%mm5 \n\t"/* dst0*/\
- "movq %%mm2,%%mm6 \n\t"/* a3*/\
- "psraw $6,%%mm4 \n\t"/* dst7*/\
- "movq %%mm0,5*16+" #A2 "\n\t"\
- "paddsw %%mm3,%%mm2 \n\t"/* a3+b3*/\
- "movq %%mm7,6*16+" #A2 "\n\t"\
- "psubsw %%mm3,%%mm6 \n\t"/* a3-b3*/\
- "movq %%mm5,0*16+" #A2 "\n\t"\
- "psraw $6,%%mm2 \n\t"/* dst3*/\
- "movq %%mm4,7*16+" #A2 "\n\t"\
- "psraw $6,%%mm6 \n\t"/* dst4*/\
- "movq %%mm2,3*16+" #A2 "\n\t"\
- "movq %%mm6,4*16+" #A2 "\n\t"
-
-//=============================================================================
-// Code
-//=============================================================================
-
-//-----------------------------------------------------------------------------
-// void idct_mmx(uint16_t block[64]);
-//-----------------------------------------------------------------------------
-
-
-void ff_idct_xvid_mmx(short *block){
-__asm__ volatile(
- //# Process each row
- DCT_8_INV_ROW_MMX(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
- DCT_8_INV_ROW_MMX(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
- DCT_8_INV_ROW_MMX(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
- DCT_8_INV_ROW_MMX(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
- DCT_8_INV_ROW_MMX(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
- DCT_8_INV_ROW_MMX(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
- DCT_8_INV_ROW_MMX(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
- DCT_8_INV_ROW_MMX(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))
-
- //# Process the columns (4 at a time)
- DCT_8_INV_COL(0(%0), 0(%0))
- DCT_8_INV_COL(8(%0), 8(%0))
- :: "r"(block), "r"(rounder_0), "r"(tab_i_04_mmx), "r"(tg_1_16));
-}
-
-void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, int16_t *block)
-{
- ff_idct_xvid_mmx(block);
- ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, int16_t *block)
-{
- ff_idct_xvid_mmx(block);
- ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_MMX_INLINE */
-
-#if HAVE_MMXEXT_INLINE
-
-//-----------------------------------------------------------------------------
-// void idct_xmm(uint16_t block[64]);
-//-----------------------------------------------------------------------------
-
-
-void ff_idct_xvid_mmxext(short *block)
-{
-__asm__ volatile(
- //# Process each row
- DCT_8_INV_ROW_XMM(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
- DCT_8_INV_ROW_XMM(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
- DCT_8_INV_ROW_XMM(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
- DCT_8_INV_ROW_XMM(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
- DCT_8_INV_ROW_XMM(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
- DCT_8_INV_ROW_XMM(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
- DCT_8_INV_ROW_XMM(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
- DCT_8_INV_ROW_XMM(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))
-
- //# Process the columns (4 at a time)
- DCT_8_INV_COL(0(%0), 0(%0))
- DCT_8_INV_COL(8(%0), 8(%0))
- :: "r"(block), "r"(rounder_0), "r"(tab_i_04_xmm), "r"(tg_1_16));
-}
-
-void ff_idct_xvid_mmxext_put(uint8_t *dest, int line_size, int16_t *block)
-{
- ff_idct_xvid_mmxext(block);
- ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, int16_t *block)
-{
- ff_idct_xvid_mmxext(block);
- ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_MMXEXT_INLINE */
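The commented pseudocode near the top of this file fully determines the row pass that DCT_8_INV_ROW_MMX and DCT_8_INV_ROW_XMM vectorise; the two weight tables hold the same 32 coefficients, merely reordered to match how each variant pairs inputs for pmaddwd. As a reading aid, here is a scalar sketch of that row pass; the function and helper names are inventions of this note, and it assumes the w[32] layout and the SHIFT_INV_ROW = 11 rounding described in the comments.

#include <stdint.h>

/* packssdw-style saturation applied when the 32-bit sums are packed back to 16 bits */
static int16_t clip16(int v)
{
    return v < -32768 ? -32768 : v > 32767 ? 32767 : v;
}

/* 8-point inverse row DCT: a0..a3 from the even inputs, b0..b3 from the odd
 * inputs, outputs rounded and shifted right by SHIFT_INV_ROW (11). */
static void idct_row_sketch(const int16_t *x, int16_t *y,
                            const int16_t *w, int rounder)
{
    int a[4], b[4], i;
    for (i = 0; i < 4; i++) {
        a[i] = x[0] * w[4*i]          + x[2] * w[4*i + 1] +
               x[4] * w[4*i + 2]      + x[6] * w[4*i + 3];
        b[i] = x[1] * w[16 + 4*i]     + x[3] * w[16 + 4*i + 1] +
               x[5] * w[16 + 4*i + 2] + x[7] * w[16 + 4*i + 3];
    }
    for (i = 0; i < 4; i++) {
        y[i]     = clip16((a[i] + b[i] + rounder) >> 11);
        y[7 - i] = clip16((a[i] - b[i] + rounder) >> 11);
    }
}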
diff --git a/ffmpeg/libavcodec/x86/idct_sse2_xvid.c b/ffmpeg/libavcodec/x86/idct_sse2_xvid.c
deleted file mode 100644
index af4790c..0000000
--- a/ffmpeg/libavcodec/x86/idct_sse2_xvid.c
+++ /dev/null
@@ -1,407 +0,0 @@
-/*
- * XVID MPEG-4 VIDEO CODEC
- * - SSE2 inverse discrete cosine transform -
- *
- * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
- *
- * Conversion to gcc syntax with modifications
- * by Alexander Strange <astrange@ithinksw.com>
- *
- * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
- *
- * This file is part of FFmpeg.
- *
- * Vertical pass is an implementation of the scheme:
- * Loeffler C., Ligtenberg A., and Moschytz C.S.:
- * Practical Fast 1D DCT Algorithm with Eleven Multiplications,
- * Proc. ICASSP 1989, 988-991.
- *
- * Horizontal pass is a double 4x4 vector/matrix multiplication,
- * (see also Intel's Application Note 922:
- * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
- * Copyright (C) 1999 Intel Corporation)
- *
- * More details at http://skal.planet-d.net/coding/dct.html
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with FFmpeg; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
-#include "idct_xvid.h"
-#include "dsputil_x86.h"
-
-#if HAVE_SSE2_INLINE
-
-/**
- * @file
- * @brief SSE2 idct compatible with xvidmmx
- */
-
-#define X8(x) x,x,x,x,x,x,x,x
-
-#define ROW_SHIFT 11
-#define COL_SHIFT 6
-
-DECLARE_ASM_CONST(16, int16_t, tan1)[] = {X8(13036)}; // tan( pi/16)
-DECLARE_ASM_CONST(16, int16_t, tan2)[] = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1
-DECLARE_ASM_CONST(16, int16_t, tan3)[] = {X8(43790)}; // tan(3pi/16)-1
-DECLARE_ASM_CONST(16, int16_t, sqrt2)[]= {X8(23170)}; // 0.5/sqrt(2)
-DECLARE_ASM_CONST(8, uint8_t, m127)[] = {X8(127)};
-
-DECLARE_ASM_CONST(16, int16_t, iTab1)[] = {
- 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
- 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
- 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
- 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab2)[] = {
- 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
- 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
- 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
- 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab3)[] = {
- 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
- 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
- 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
- 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab4)[] = {
- 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
- 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
- 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
- 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
-};
-
-DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
- 65536, 65536, 65536, 65536,
- 3597, 3597, 3597, 3597,
- 2260, 2260, 2260, 2260,
- 1203, 1203, 1203, 1203,
- 120, 120, 120, 120,
- 512, 512, 512, 512
-};
-
-// Temporary storage before the column pass
-#define ROW1 "%%xmm6"
-#define ROW3 "%%xmm4"
-#define ROW5 "%%xmm5"
-#define ROW7 "%%xmm7"
-
-#define CLEAR_ODD(r) "pxor "r","r" \n\t"
-#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"
-
-#if ARCH_X86_64
-
-# define ROW0 "%%xmm8"
-# define REG0 ROW0
-# define ROW2 "%%xmm9"
-# define REG2 ROW2
-# define ROW4 "%%xmm10"
-# define REG4 ROW4
-# define ROW6 "%%xmm11"
-# define REG6 ROW6
-# define CLEAR_EVEN(r) CLEAR_ODD(r)
-# define PUT_EVEN(dst) PUT_ODD(dst)
-# define XMMS "%%xmm12"
-# define MOV_32_ONLY "#"
-# define SREG2 REG2
-# define TAN3 "%%xmm13"
-# define TAN1 "%%xmm14"
-
-#else
-
-# define ROW0 "(%0)"
-# define REG0 "%%xmm4"
-# define ROW2 "2*16(%0)"
-# define REG2 "%%xmm4"
-# define ROW4 "4*16(%0)"
-# define REG4 "%%xmm6"
-# define ROW6 "6*16(%0)"
-# define REG6 "%%xmm6"
-# define CLEAR_EVEN(r)
-# define PUT_EVEN(dst) \
- "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
- "movdqa %%xmm2, "dst" \n\t"
-# define XMMS "%%xmm2"
-# define MOV_32_ONLY "movdqa "
-# define SREG2 "%%xmm7"
-# define TAN3 "%%xmm0"
-# define TAN1 "%%xmm2"
-
-#endif
-
-#define ROUND(x) "paddd "MANGLE(x)
-
-#define JZ(reg, to) \
- "testl "reg","reg" \n\t" \
- "jz "to" \n\t"
-
-#define JNZ(reg, to) \
- "testl "reg","reg" \n\t" \
- "jnz "to" \n\t"
-
-#define TEST_ONE_ROW(src, reg, clear) \
- clear \
- "movq "src", %%mm1 \n\t" \
- "por 8+"src", %%mm1 \n\t" \
- "paddusb %%mm0, %%mm1 \n\t" \
- "pmovmskb %%mm1, "reg" \n\t"
-
-#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
- clear1 \
- clear2 \
- "movq "row1", %%mm1 \n\t" \
- "por 8+"row1", %%mm1 \n\t" \
- "movq "row2", %%mm2 \n\t" \
- "por 8+"row2", %%mm2 \n\t" \
- "paddusb %%mm0, %%mm1 \n\t" \
- "paddusb %%mm0, %%mm2 \n\t" \
- "pmovmskb %%mm1, "reg1" \n\t" \
- "pmovmskb %%mm2, "reg2" \n\t"
-
-///IDCT pass on rows.
-#define iMTX_MULT(src, table, rounder, put) \
- "movdqa "src", %%xmm3 \n\t" \
- "movdqa %%xmm3, %%xmm0 \n\t" \
- "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \
- "punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \
- "pmaddwd "table", %%xmm0 \n\t" \
- "pmaddwd 16+"table", %%xmm1 \n\t" \
- "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \
- "punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \
- "pmaddwd 32+"table", %%xmm2 \n\t" \
- "pmaddwd 48+"table", %%xmm3 \n\t" \
- "paddd %%xmm1, %%xmm0 \n\t" \
- "paddd %%xmm3, %%xmm2 \n\t" \
- rounder", %%xmm0 \n\t" \
- "movdqa %%xmm2, %%xmm3 \n\t" \
- "paddd %%xmm0, %%xmm2 \n\t" \
- "psubd %%xmm3, %%xmm0 \n\t" \
- "psrad $11, %%xmm2 \n\t" \
- "psrad $11, %%xmm0 \n\t" \
- "packssdw %%xmm0, %%xmm2 \n\t" \
- put \
- "1: \n\t"
-
-#define iLLM_HEAD \
- "movdqa "MANGLE(tan3)", "TAN3" \n\t" \
- "movdqa "MANGLE(tan1)", "TAN1" \n\t" \
-
-///IDCT pass on columns.
-#define iLLM_PASS(dct) \
- "movdqa "TAN3", %%xmm1 \n\t" \
- "movdqa "TAN1", %%xmm3 \n\t" \
- "pmulhw %%xmm4, "TAN3" \n\t" \
- "pmulhw %%xmm5, %%xmm1 \n\t" \
- "paddsw %%xmm4, "TAN3" \n\t" \
- "paddsw %%xmm5, %%xmm1 \n\t" \
- "psubsw %%xmm5, "TAN3" \n\t" \
- "paddsw %%xmm4, %%xmm1 \n\t" \
- "pmulhw %%xmm7, %%xmm3 \n\t" \
- "pmulhw %%xmm6, "TAN1" \n\t" \
- "paddsw %%xmm6, %%xmm3 \n\t" \
- "psubsw %%xmm7, "TAN1" \n\t" \
- "movdqa %%xmm3, %%xmm7 \n\t" \
- "movdqa "TAN1", %%xmm6 \n\t" \
- "psubsw %%xmm1, %%xmm3 \n\t" \
- "psubsw "TAN3", "TAN1" \n\t" \
- "paddsw %%xmm7, %%xmm1 \n\t" \
- "paddsw %%xmm6, "TAN3" \n\t" \
- "movdqa %%xmm3, %%xmm6 \n\t" \
- "psubsw "TAN3", %%xmm3 \n\t" \
- "paddsw %%xmm6, "TAN3" \n\t" \
- "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
- "pmulhw %%xmm4, %%xmm3 \n\t" \
- "pmulhw %%xmm4, "TAN3" \n\t" \
- "paddsw "TAN3", "TAN3" \n\t" \
- "paddsw %%xmm3, %%xmm3 \n\t" \
- "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
- MOV_32_ONLY ROW2", "REG2" \n\t" \
- MOV_32_ONLY ROW6", "REG6" \n\t" \
- "movdqa %%xmm7, %%xmm5 \n\t" \
- "pmulhw "REG6", %%xmm7 \n\t" \
- "pmulhw "REG2", %%xmm5 \n\t" \
- "paddsw "REG2", %%xmm7 \n\t" \
- "psubsw "REG6", %%xmm5 \n\t" \
- MOV_32_ONLY ROW0", "REG0" \n\t" \
- MOV_32_ONLY ROW4", "REG4" \n\t" \
- MOV_32_ONLY" "TAN1", (%0) \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw "REG4", "REG0" \n\t" \
- "paddsw "XMMS", "REG4" \n\t" \
- "movdqa "REG4", "XMMS" \n\t" \
- "psubsw %%xmm7, "REG4" \n\t" \
- "paddsw "XMMS", %%xmm7 \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw %%xmm5, "REG0" \n\t" \
- "paddsw "XMMS", %%xmm5 \n\t" \
- "movdqa %%xmm5, "XMMS" \n\t" \
- "psubsw "TAN3", %%xmm5 \n\t" \
- "paddsw "XMMS", "TAN3" \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw %%xmm3, "REG0" \n\t" \
- "paddsw "XMMS", %%xmm3 \n\t" \
- MOV_32_ONLY" (%0), "TAN1" \n\t" \
- "psraw $6, %%xmm5 \n\t" \
- "psraw $6, "REG0" \n\t" \
- "psraw $6, "TAN3" \n\t" \
- "psraw $6, %%xmm3 \n\t" \
- "movdqa "TAN3", 1*16("dct") \n\t" \
- "movdqa %%xmm3, 2*16("dct") \n\t" \
- "movdqa "REG0", 5*16("dct") \n\t" \
- "movdqa %%xmm5, 6*16("dct") \n\t" \
- "movdqa %%xmm7, %%xmm0 \n\t" \
- "movdqa "REG4", %%xmm4 \n\t" \
- "psubsw %%xmm1, %%xmm7 \n\t" \
- "psubsw "TAN1", "REG4" \n\t" \
- "paddsw %%xmm0, %%xmm1 \n\t" \
- "paddsw %%xmm4, "TAN1" \n\t" \
- "psraw $6, %%xmm1 \n\t" \
- "psraw $6, %%xmm7 \n\t" \
- "psraw $6, "TAN1" \n\t" \
- "psraw $6, "REG4" \n\t" \
- "movdqa %%xmm1, ("dct") \n\t" \
- "movdqa "TAN1", 3*16("dct") \n\t" \
- "movdqa "REG4", 4*16("dct") \n\t" \
- "movdqa %%xmm7, 7*16("dct") \n\t"
-
-///IDCT pass on columns, assuming rows 4-7 are zero.
-#define iLLM_PASS_SPARSE(dct) \
- "pmulhw %%xmm4, "TAN3" \n\t" \
- "paddsw %%xmm4, "TAN3" \n\t" \
- "movdqa %%xmm6, %%xmm3 \n\t" \
- "pmulhw %%xmm6, "TAN1" \n\t" \
- "movdqa %%xmm4, %%xmm1 \n\t" \
- "psubsw %%xmm1, %%xmm3 \n\t" \
- "paddsw %%xmm6, %%xmm1 \n\t" \
- "movdqa "TAN1", %%xmm6 \n\t" \
- "psubsw "TAN3", "TAN1" \n\t" \
- "paddsw %%xmm6, "TAN3" \n\t" \
- "movdqa %%xmm3, %%xmm6 \n\t" \
- "psubsw "TAN3", %%xmm3 \n\t" \
- "paddsw %%xmm6, "TAN3" \n\t" \
- "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
- "pmulhw %%xmm4, %%xmm3 \n\t" \
- "pmulhw %%xmm4, "TAN3" \n\t" \
- "paddsw "TAN3", "TAN3" \n\t" \
- "paddsw %%xmm3, %%xmm3 \n\t" \
- "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
- MOV_32_ONLY ROW2", "SREG2" \n\t" \
- "pmulhw "SREG2", %%xmm5 \n\t" \
- MOV_32_ONLY ROW0", "REG0" \n\t" \
- "movdqa "REG0", %%xmm6 \n\t" \
- "psubsw "SREG2", %%xmm6 \n\t" \
- "paddsw "REG0", "SREG2" \n\t" \
- MOV_32_ONLY" "TAN1", (%0) \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw %%xmm5, "REG0" \n\t" \
- "paddsw "XMMS", %%xmm5 \n\t" \
- "movdqa %%xmm5, "XMMS" \n\t" \
- "psubsw "TAN3", %%xmm5 \n\t" \
- "paddsw "XMMS", "TAN3" \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw %%xmm3, "REG0" \n\t" \
- "paddsw "XMMS", %%xmm3 \n\t" \
- MOV_32_ONLY" (%0), "TAN1" \n\t" \
- "psraw $6, %%xmm5 \n\t" \
- "psraw $6, "REG0" \n\t" \
- "psraw $6, "TAN3" \n\t" \
- "psraw $6, %%xmm3 \n\t" \
- "movdqa "TAN3", 1*16("dct") \n\t" \
- "movdqa %%xmm3, 2*16("dct") \n\t" \
- "movdqa "REG0", 5*16("dct") \n\t" \
- "movdqa %%xmm5, 6*16("dct") \n\t" \
- "movdqa "SREG2", %%xmm0 \n\t" \
- "movdqa %%xmm6, %%xmm4 \n\t" \
- "psubsw %%xmm1, "SREG2" \n\t" \
- "psubsw "TAN1", %%xmm6 \n\t" \
- "paddsw %%xmm0, %%xmm1 \n\t" \
- "paddsw %%xmm4, "TAN1" \n\t" \
- "psraw $6, %%xmm1 \n\t" \
- "psraw $6, "SREG2" \n\t" \
- "psraw $6, "TAN1" \n\t" \
- "psraw $6, %%xmm6 \n\t" \
- "movdqa %%xmm1, ("dct") \n\t" \
- "movdqa "TAN1", 3*16("dct") \n\t" \
- "movdqa %%xmm6, 4*16("dct") \n\t" \
- "movdqa "SREG2", 7*16("dct") \n\t"
-
-inline void ff_idct_xvid_sse2(short *block)
-{
- __asm__ volatile(
- "movq "MANGLE(m127)", %%mm0 \n\t"
- iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
- iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
- iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
-
- TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
- JZ("%%eax", "1f")
- iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
-
- TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
- TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
- iLLM_HEAD
- ".p2align 4 \n\t"
- JNZ("%%ecx", "2f")
- JNZ("%%eax", "3f")
- JNZ("%%edx", "4f")
- JNZ("%%esi", "5f")
- iLLM_PASS_SPARSE("%0")
- "jmp 6f \n\t"
- "2: \n\t"
- iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
- "3: \n\t"
- iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
- JZ("%%edx", "1f")
- "4: \n\t"
- iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
- JZ("%%esi", "1f")
- "5: \n\t"
- iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
-#if ARCH_X86_32
- iLLM_HEAD
-#endif
- iLLM_PASS("%0")
- "6: \n\t"
- : "+r"(block)
- :
- : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" ,
- "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" ,)
-#if ARCH_X86_64
- XMM_CLOBBERS("%xmm8" , "%xmm9" , "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14",)
-#endif
- "%eax", "%ecx", "%edx", "%esi", "memory"
- );
-}
-
-void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
-{
- ff_idct_xvid_sse2(block);
- ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
-{
- ff_idct_xvid_sse2(block);
- ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_SSE2_INLINE */
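Besides the transform itself, the SSE2 version above puts the TEST_ONE_ROW / TEST_TWO_ROWS machinery to work skipping all-zero coefficient rows: adding 127 to every byte with unsigned saturation sets the top bit of exactly the non-zero bytes, and pmovmskb collapses those bits into a scalar that a plain jz/jnz can test; when rows 4-7 all test zero, the cheaper iLLM_PASS_SPARSE column pass is used. A C rendering of that predicate (illustrative only; row_has_coeffs is a name invented here):

#include <stdint.h>

/* Non-zero iff any byte of the 8x16-bit coefficient row is non-zero,
 * the same predicate TEST_ONE_ROW builds with paddusb m127 + pmovmskb. */
static int row_has_coeffs(const int16_t *row)
{
    const uint8_t *b = (const uint8_t *)row;
    unsigned mask = 0;
    int i;
    for (i = 0; i < 16; i++)
        mask |= (unsigned)(b[i] != 0) << i;  /* saturating +127 sets the MSB of every non-zero byte */
    return mask != 0;
}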
diff --git a/ffmpeg/libavcodec/x86/idct_xvid.h b/ffmpeg/libavcodec/x86/idct_xvid.h
deleted file mode 100644
index 7a2847b..0000000
--- a/ffmpeg/libavcodec/x86/idct_xvid.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * XVID MPEG-4 VIDEO CODEC
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * header for Xvid IDCT functions
- */
-
-#ifndef AVCODEC_X86_IDCT_XVID_H
-#define AVCODEC_X86_IDCT_XVID_H
-
-#include <stdint.h>
-
-void ff_idct_xvid_mmx(short *block);
-void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, int16_t *block);
-void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, int16_t *block);
-
-void ff_idct_xvid_mmxext(short *block);
-void ff_idct_xvid_mmxext_put(uint8_t *dest, int line_size, int16_t *block);
-void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, int16_t *block);
-
-void ff_idct_xvid_sse2(short *block);
-void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block);
-void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block);
-
-#endif /* AVCODEC_X86_IDCT_XVID_H */
diff --git a/ffmpeg/libavcodec/x86/imdct36.asm b/ffmpeg/libavcodec/x86/imdct36.asm
deleted file mode 100644
index d311fbe..0000000
--- a/ffmpeg/libavcodec/x86/imdct36.asm
+++ /dev/null
@@ -1,724 +0,0 @@
-;******************************************************************************
-;* 36 point SSE-optimized IMDCT transform
-;* Copyright (c) 2011 Vitor Sessak
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-align 16
-ps_mask: dd 0, ~0, ~0, ~0
-ps_mask2: dd 0, ~0, 0, ~0
-ps_mask3: dd 0, 0, 0, ~0
-ps_mask4: dd 0, ~0, 0, 0
-
-ps_val1: dd -0.5, -0.5, -0.8660254038, -0.8660254038
-ps_val2: dd 1.0, 1.0, 0.8660254038, 0.8660254038
-ps_val3: dd 0.1736481777, 0.1736481777, 0.3420201433, 0.3420201433
-ps_val4: dd -0.7660444431, -0.7660444431, 0.8660254038, 0.8660254038
-ps_val5: dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
-ps_val6: dd 0.5, 0.5, -0.6427876097, -0.6427876097
-ps_val7: dd 1.0, 1.0, -0.6427876097, -0.6427876097
-
-ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
-ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
-
-ps_cosh: dd 1.0, 0.50190991877167369479, 1.0, 5.73685662283492756461
- dd 1.0, 0.51763809020504152469, 1.0, 1.93185165257813657349
- dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
- dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
- dd 1.0, 0.70710678118654752439, 0.0, 0.0
-
-ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461
- dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349
- dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896
- dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991
- dd 1.0, 0.70710678118654752439, 0.0, 0.0
-
-costabs: times 4 dd 0.98480773
- times 4 dd 0.93969262
- times 4 dd 0.86602539
- times 4 dd -0.76604444
- times 4 dd -0.64278764
- times 4 dd 0.50000000
- times 4 dd -0.50000000
- times 4 dd -0.34202015
- times 4 dd -0.17364818
- times 4 dd 0.50190992
- times 4 dd 0.51763808
- times 4 dd 0.55168896
- times 4 dd 0.61038726
- times 4 dd 0.70710677
- times 4 dd 0.87172341
- times 4 dd 1.18310082
- times 4 dd 1.93185163
- times 4 dd 5.73685646
-
-%define SBLIMIT 32
-SECTION_TEXT
-
-%macro PSHUFD 3
-%if cpuflag(sse2) && notcpuflag(avx)
- pshufd %1, %2, %3
-%else
- shufps %1, %2, %2, %3
-%endif
-%endmacro
-
-; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
-; output %1={x3,x4,y1,y2}
-%macro BUILDINVHIGHLOW 3
-%if cpuflag(avx)
- shufps %1, %2, %3, 0x4e
-%else
- movlhps %1, %3
- movhlps %1, %2
-%endif
-%endmacro
-
-; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
-; output %1={x4,y1,y2,y3}
-%macro ROTLEFT 3
-%if cpuflag(ssse3)
- palignr %1, %3, %2, 12
-%else
- BUILDINVHIGHLOW %1, %2, %3
- shufps %1, %1, %3, 0x99
-%endif
-%endmacro
-
-%macro INVERTHL 2
-%if cpuflag(sse2)
- PSHUFD %1, %2, 0x4e
-%else
- movhlps %1, %2
- movlhps %1, %2
-%endif
-%endmacro
-
-%macro BUTTERF 3
- INVERTHL %2, %1
- xorps %1, [ps_p1p1m1m1]
- addps %1, %2
-%if cpuflag(sse3)
- mulps %1, %1, [ps_cosh_sse3 + %3]
- PSHUFD %2, %1, 0xb1
- addsubps %1, %1, %2
-%else
- mulps %1, [ps_cosh + %3]
- PSHUFD %2, %1, 0xb1
- xorps %1, [ps_p1m1p1m1]
- addps %1, %2
-%endif
-%endmacro
-
-%macro STORE 4
- movhlps %2, %1
- movss [%3 ], %1
- movss [%3 + 2*%4], %2
- shufps %1, %1, 0xb1
- movss [%3 + %4], %1
- movhlps %2, %1
- movss [%3 + 3*%4], %2
-%endmacro
-
-%macro LOAD 4
- movlps %1, [%3 ]
- movhps %1, [%3 + %4]
- movlps %2, [%3 + 2*%4]
- movhps %2, [%3 + 3*%4]
- shufps %1, %2, 0x88
-%endmacro
-
-%macro LOADA64 2
-%if cpuflag(avx)
- movu %1, [%2]
-%else
- movlps %1, [%2]
- movhps %1, [%2 + 8]
-%endif
-%endmacro
-
-%macro DEFINE_IMDCT 0
-cglobal imdct36_float, 4,4,9, out, buf, in, win
-
- ; for(i=17;i>=1;i--) in[i] += in[i-1];
- LOADA64 m0, inq
- LOADA64 m1, inq + 16
-
- ROTLEFT m5, m0, m1
-
- PSHUFD m6, m0, 0x93
- andps m6, m6, [ps_mask]
- addps m0, m0, m6
-
- LOADA64 m2, inq + 32
-
- ROTLEFT m7, m1, m2
-
- addps m1, m1, m5
- LOADA64 m3, inq + 48
-
- ROTLEFT m5, m2, m3
-
- xorps m4, m4, m4
- movlps m4, [inq+64]
- BUILDINVHIGHLOW m6, m3, m4
- shufps m6, m6, m4, 0xa9
-
- addps m4, m4, m6
- addps m2, m2, m7
- addps m3, m3, m5
-
- ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
- movlhps m5, m5, m0
- andps m5, m5, [ps_mask3]
-
- BUILDINVHIGHLOW m7, m0, m1
- andps m7, m7, [ps_mask2]
-
- addps m0, m0, m5
-
- BUILDINVHIGHLOW m6, m1, m2
- andps m6, m6, [ps_mask2]
-
- addps m1, m1, m7
-
- BUILDINVHIGHLOW m7, m2, m3
- andps m7, m7, [ps_mask2]
-
- addps m2, m2, m6
-
- movhlps m6, m6, m3
- andps m6, m6, [ps_mask4]
-
- addps m3, m3, m7
- addps m4, m4, m6
-
- ; Populate tmp[]
- movlhps m6, m1, m5 ; zero out high values
- subps m6, m6, m4
-
- subps m5, m0, m3
-
-%if ARCH_X86_64
- SWAP m5, m8
-%endif
-
- mulps m7, m2, [ps_val1]
-
-%if ARCH_X86_64
- mulps m5, m8, [ps_val2]
-%else
- mulps m5, m5, [ps_val2]
-%endif
- addps m7, m7, m5
-
- mulps m5, m6, [ps_val1]
- subps m7, m7, m5
-
-%if ARCH_X86_64
- SWAP m5, m8
-%else
- subps m5, m0, m3
-%endif
-
- subps m5, m5, m6
- addps m5, m5, m2
-
- shufps m6, m4, m3, 0xe4
- subps m6, m6, m2
- mulps m6, m6, [ps_val3]
-
- addps m4, m4, m1
- mulps m4, m4, [ps_val4]
-
- shufps m1, m1, m0, 0xe4
- addps m1, m1, m2
- mulps m1, m1, [ps_val5]
-
- mulps m3, m3, [ps_val6]
- mulps m0, m0, [ps_val7]
- addps m0, m0, m3
-
- xorps m2, m1, [ps_p1p1m1m1]
- subps m2, m2, m4
- addps m2, m2, m0
-
- addps m3, m4, m0
- subps m3, m3, m6
- xorps m3, m3, [ps_p1p1m1m1]
-
- shufps m0, m0, m4, 0xe4
- subps m0, m0, m1
- addps m0, m0, m6
-
- BUILDINVHIGHLOW m4, m2, m3
- shufps m3, m3, m2, 0x4e
-
-    ; we have tmp = {SWAPLH(m0), SWAPLH(m7), m3, m4, m5}
-
- BUTTERF m0, m1, 0
- BUTTERF m7, m2, 16
- BUTTERF m3, m6, 32
- BUTTERF m4, m1, 48
-
- mulps m5, m5, [ps_cosh + 64]
- PSHUFD m1, m5, 0xe1
- xorps m5, m5, [ps_p1m1p1m1]
- addps m5, m5, m1
-
-    ; permutes:
- ; m0 0 1 2 3 => 2 6 10 14 m1
- ; m7 4 5 6 7 => 3 7 11 15 m2
- ; m3 8 9 10 11 => 17 13 9 5 m3
- ; m4 12 13 14 15 => 16 12 8 4 m5
- ; m5 16 17 xx xx => 0 1 xx xx m0
-
- unpckhps m1, m0, m7
- unpckhps m6, m3, m4
- movhlps m2, m6, m1
- movlhps m1, m1, m6
-
- unpcklps m5, m5, m4
- unpcklps m3, m3, m7
- movhlps m4, m3, m5
- movlhps m5, m5, m3
- SWAP m4, m3
- ; permutation done
-
- PSHUFD m6, m2, 0xb1
- movss m4, [bufq + 4*68]
- movss m7, [bufq + 4*64]
- unpcklps m7, m7, m4
- mulps m6, m6, [winq + 16*4]
- addps m6, m6, m7
- movss [outq + 64*SBLIMIT], m6
- shufps m6, m6, m6, 0xb1
- movss [outq + 68*SBLIMIT], m6
-
- mulps m6, m3, [winq + 4*4]
- LOAD m4, m7, bufq + 4*16, 16
- addps m6, m6, m4
- STORE m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT
-
- shufps m4, m0, m3, 0xb5
- mulps m4, m4, [winq + 8*4]
- LOAD m7, m6, bufq + 4*32, 16
- addps m4, m4, m7
- STORE m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT
-
- shufps m3, m3, m2, 0xb1
- mulps m3, m3, [winq + 12*4]
- LOAD m7, m6, bufq + 4*48, 16
- addps m3, m3, m7
- STORE m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT
-
- mulps m2, m2, [winq]
- LOAD m6, m7, bufq, 16
- addps m2, m2, m6
- STORE m2, m7, outq, 4*SBLIMIT
-
- mulps m4, m1, [winq + 20*4]
- STORE m4, m7, bufq, 16
-
- mulps m3, m5, [winq + 24*4]
- STORE m3, m7, bufq + 4*16, 16
-
- shufps m0, m0, m5, 0xb0
- mulps m0, m0, [winq + 28*4]
- STORE m0, m7, bufq + 4*32, 16
-
- shufps m5, m5, m1, 0xb1
- mulps m5, m5, [winq + 32*4]
- STORE m5, m7, bufq + 4*48, 16
-
- shufps m1, m1, m1, 0xb1
- mulps m1, m1, [winq + 36*4]
- movss [bufq + 4*64], m1
- shufps m1, m1, 0xb1
- movss [bufq + 4*68], m1
- RET
-%endmacro
-
-INIT_XMM sse
-DEFINE_IMDCT
-
-INIT_XMM sse2
-DEFINE_IMDCT
-
-INIT_XMM sse3
-DEFINE_IMDCT
-
-INIT_XMM ssse3
-DEFINE_IMDCT
-
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-DEFINE_IMDCT
-%endif
-
-INIT_XMM sse
-
-%if ARCH_X86_64
-%define SPILL SWAP
-%define UNSPILL SWAP
-%define SPILLED(x) m %+ x
-%else
-%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
-%macro SPILL 2 ; xmm#, mempos
- movaps SPILLED(%2), m%1
-%endmacro
-%macro UNSPILL 2
- movaps m%1, SPILLED(%2)
-%endmacro
-%endif
-
-%macro DEFINE_FOUR_IMDCT 0
-cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
- movlps m0, [inq+64]
- movhps m0, [inq+64 + 72]
- movlps m3, [inq+64 + 2*72]
- movhps m3, [inq+64 + 3*72]
-
- shufps m5, m0, m3, 0xdd
- shufps m0, m0, m3, 0x88
-
- mova m1, [inq+48]
- movu m6, [inq+48 + 72]
- mova m7, [inq+48 + 2*72]
- movu m3, [inq+48 + 3*72]
-
- TRANSPOSE4x4PS 1, 6, 7, 3, 4
-
- addps m4, m6, m7
- mova [tmpq+4*28], m4
-
- addps m7, m3
- addps m6, m1
- addps m3, m0
- addps m0, m5
- addps m0, m7
- addps m7, m6
- mova [tmpq+4*12], m7
- SPILL 3, 12
-
- mova m4, [inq+32]
- movu m5, [inq+32 + 72]
- mova m2, [inq+32 + 2*72]
- movu m7, [inq+32 + 3*72]
-
- TRANSPOSE4x4PS 4, 5, 2, 7, 3
-
- addps m1, m7
- SPILL 1, 11
-
- addps m3, m5, m2
- SPILL 3, 13
-
- addps m7, m2
- addps m5, m4
- addps m6, m7
- mova [tmpq], m6
- addps m7, m5
- mova [tmpq+4*16], m7
-
- mova m2, [inq+16]
- movu m7, [inq+16 + 72]
- mova m1, [inq+16 + 2*72]
- movu m6, [inq+16 + 3*72]
-
- TRANSPOSE4x4PS 2, 7, 1, 6, 3
-
- addps m4, m6
- addps m6, m1
- addps m1, m7
- addps m7, m2
- addps m5, m6
- SPILL 5, 15
- addps m6, m7
- mulps m6, [costabs + 16*2]
- mova [tmpq+4*8], m6
- SPILL 1, 10
- SPILL 0, 14
-
- mova m1, [inq]
- movu m6, [inq + 72]
- mova m3, [inq + 2*72]
- movu m5, [inq + 3*72]
-
- TRANSPOSE4x4PS 1, 6, 3, 5, 0
-
- addps m2, m5
- addps m5, m3
- addps m7, m5
- addps m3, m6
- addps m6, m1
- SPILL 7, 8
- addps m5, m6
- SPILL 6, 9
- addps m6, m4, SPILLED(12)
- subps m6, m2
- UNSPILL 7, 11
- SPILL 5, 11
- subps m5, m1, m7
- mulps m7, [costabs + 16*5]
- addps m7, m1
- mulps m0, m6, [costabs + 16*6]
- addps m0, m5
- mova [tmpq+4*24], m0
- addps m6, m5
- mova [tmpq+4*4], m6
- addps m6, m4, m2
- mulps m6, [costabs + 16*1]
- subps m4, SPILLED(12)
- mulps m4, [costabs + 16*8]
- addps m2, SPILLED(12)
- mulps m2, [costabs + 16*3]
- subps m5, m7, m6
- subps m5, m2
- addps m6, m7
- addps m6, m4
- addps m7, m2
- subps m7, m4
- mova [tmpq+4*20], m7
- mova m2, [tmpq+4*28]
- mova [tmpq+4*28], m5
- UNSPILL 7, 13
- subps m5, m7, m2
- mulps m5, [costabs + 16*7]
- UNSPILL 1, 10
- mulps m1, [costabs + 16*2]
- addps m4, m3, m2
- mulps m4, [costabs + 16*4]
- addps m2, m7
- addps m7, m3
- mulps m7, [costabs]
- subps m3, m2
- mulps m3, [costabs + 16*2]
- addps m2, m7, m5
- addps m2, m1
- SPILL 2, 10
- addps m7, m4
- subps m7, m1
- SPILL 7, 12
- subps m5, m4
- subps m5, m1
- UNSPILL 0, 14
- SPILL 5, 13
- addps m1, m0, SPILLED(15)
- subps m1, SPILLED(8)
- mova m4, [costabs + 16*5]
- mulps m4, [tmpq]
- UNSPILL 2, 9
- addps m4, m2
- subps m2, [tmpq]
- mulps m5, m1, [costabs + 16*6]
- addps m5, m2
- SPILL 5, 9
- addps m2, m1
- SPILL 2, 14
- UNSPILL 5, 15
- subps m7, m5, m0
- addps m5, SPILLED(8)
- mulps m5, [costabs + 16*1]
- mulps m7, [costabs + 16*8]
- addps m0, SPILLED(8)
- mulps m0, [costabs + 16*3]
- subps m2, m4, m5
- subps m2, m0
- SPILL 2, 15
- addps m5, m4
- addps m5, m7
- addps m4, m0
- subps m4, m7
- SPILL 4, 8
- mova m7, [tmpq+4*16]
- mova m2, [tmpq+4*12]
- addps m0, m7, m2
- subps m0, SPILLED(11)
- mulps m0, [costabs + 16*2]
- addps m4, m7, SPILLED(11)
- mulps m4, [costabs]
- subps m7, m2
- mulps m7, [costabs + 16*7]
- addps m2, SPILLED(11)
- mulps m2, [costabs + 16*4]
- addps m1, m7, [tmpq+4*8]
- addps m1, m4
- addps m4, m2
- subps m4, [tmpq+4*8]
- SPILL 4, 11
- subps m7, m2
- subps m7, [tmpq+4*8]
- addps m4, m6, SPILLED(10)
- subps m6, SPILLED(10)
- addps m2, m5, m1
- mulps m2, [costabs + 16*9]
- subps m5, m1
- mulps m5, [costabs + 16*17]
- subps m1, m4, m2
- addps m4, m2
- mulps m2, m1, [winq+4*36]
- addps m2, [bufq+4*36]
- mova [outq+1152], m2
- mulps m1, [winq+4*32]
- addps m1, [bufq+4*32]
- mova [outq+1024], m1
- mulps m1, m4, [winq+4*116]
- mova [bufq+4*36], m1
- mulps m4, [winq+4*112]
- mova [bufq+4*32], m4
- addps m2, m6, m5
- subps m6, m5
- mulps m1, m6, [winq+4*68]
- addps m1, [bufq+4*68]
- mova [outq+2176], m1
- mulps m6, [winq]
- addps m6, [bufq]
- mova [outq], m6
- mulps m1, m2, [winq+4*148]
- mova [bufq+4*68], m1
- mulps m2, [winq+4*80]
- mova [bufq], m2
- addps m5, m3, [tmpq+4*24]
- mova m2, [tmpq+4*24]
- subps m2, m3
- mova m1, SPILLED(9)
- subps m1, m0
- mulps m1, [costabs + 16*10]
- addps m0, SPILLED(9)
- mulps m0, [costabs + 16*16]
- addps m6, m5, m1
- subps m5, m1
- mulps m3, m5, [winq+4*40]
- addps m3, [bufq+4*40]
- mova [outq+1280], m3
- mulps m5, [winq+4*28]
- addps m5, [bufq+4*28]
- mova [outq+896], m5
- mulps m1, m6, [winq+4*120]
- mova [bufq+4*40], m1
- mulps m6, [winq+4*108]
- mova [bufq+4*28], m6
- addps m1, m2, m0
- subps m2, m0
- mulps m5, m2, [winq+4*64]
- addps m5, [bufq+4*64]
- mova [outq+2048], m5
- mulps m2, [winq+4*4]
- addps m2, [bufq+4*4]
- mova [outq+128], m2
- mulps m0, m1, [winq+4*144]
- mova [bufq+4*64], m0
- mulps m1, [winq+4*84]
- mova [bufq+4*4], m1
- mova m1, [tmpq+4*28]
- mova m5, m1
- addps m1, SPILLED(13)
- subps m5, SPILLED(13)
- UNSPILL 3, 15
- addps m2, m7, m3
- mulps m2, [costabs + 16*11]
- subps m3, m7
- mulps m3, [costabs + 16*15]
- addps m0, m2, m1
- subps m1, m2
- SWAP m0, m2
- mulps m6, m1, [winq+4*44]
- addps m6, [bufq+4*44]
- mova [outq+1408], m6
- mulps m1, [winq+4*24]
- addps m1, [bufq+4*24]
- mova [outq+768], m1
- mulps m0, m2, [winq+4*124]
- mova [bufq+4*44], m0
- mulps m2, [winq+4*104]
- mova [bufq+4*24], m2
- addps m0, m5, m3
- subps m5, m3
- mulps m1, m5, [winq+4*60]
- addps m1, [bufq+4*60]
- mova [outq+1920], m1
- mulps m5, [winq+4*8]
- addps m5, [bufq+4*8]
- mova [outq+256], m5
- mulps m1, m0, [winq+4*140]
- mova [bufq+4*60], m1
- mulps m0, [winq+4*88]
- mova [bufq+4*8], m0
- mova m1, [tmpq+4*20]
- addps m1, SPILLED(12)
- mova m2, [tmpq+4*20]
- subps m2, SPILLED(12)
- UNSPILL 7, 8
- subps m0, m7, SPILLED(11)
- addps m7, SPILLED(11)
- mulps m4, m7, [costabs + 16*12]
- mulps m0, [costabs + 16*14]
- addps m5, m1, m4
- subps m1, m4
- mulps m7, m1, [winq+4*48]
- addps m7, [bufq+4*48]
- mova [outq+1536], m7
- mulps m1, [winq+4*20]
- addps m1, [bufq+4*20]
- mova [outq+640], m1
- mulps m1, m5, [winq+4*128]
- mova [bufq+4*48], m1
- mulps m5, [winq+4*100]
- mova [bufq+4*20], m5
- addps m6, m2, m0
- subps m2, m0
- mulps m1, m2, [winq+4*56]
- addps m1, [bufq+4*56]
- mova [outq+1792], m1
- mulps m2, [winq+4*12]
- addps m2, [bufq+4*12]
- mova [outq+384], m2
- mulps m0, m6, [winq+4*136]
- mova [bufq+4*56], m0
- mulps m6, [winq+4*92]
- mova [bufq+4*12], m6
- UNSPILL 0, 14
- mulps m0, [costabs + 16*13]
- mova m3, [tmpq+4*4]
- addps m2, m0, m3
- subps m3, m0
- mulps m0, m3, [winq+4*52]
- addps m0, [bufq+4*52]
- mova [outq+1664], m0
- mulps m3, [winq+4*16]
- addps m3, [bufq+4*16]
- mova [outq+512], m3
- mulps m0, m2, [winq+4*132]
- mova [bufq+4*52], m0
- mulps m2, [winq+4*96]
- mova [bufq+4*16], m2
- RET
-%endmacro
-
-INIT_XMM sse
-DEFINE_FOUR_IMDCT
-
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-DEFINE_FOUR_IMDCT
-%endif
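The two C-style loops quoted in the comments at the start of DEFINE_IMDCT are the input preprocessing step; the shuffles that follow (ROTLEFT, BUILDINVHIGHLOW and the masked adds) reproduce them four lanes at a time. Written out in scalar form, purely for readability:

/* Preprocessing of the 18 IMDCT inputs, as stated in the comments above. */
static void imdct36_preprocess_sketch(float in[18])
{
    int i;
    for (i = 17; i >= 1; i--)     /* in[i] += in[i-1] */
        in[i] += in[i - 1];
    for (i = 17; i >= 3; i -= 2)  /* in[i] += in[i-2], odd indices only */
        in[i] += in[i - 2];
}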
diff --git a/ffmpeg/libavcodec/x86/lpc.c b/ffmpeg/libavcodec/x86/lpc.c
deleted file mode 100644
index 8a74755..0000000
--- a/ffmpeg/libavcodec/x86/lpc.c
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * MMX optimized LPC DSP utils
- * Copyright (c) 2007 Loren Merritt
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/lpc.h"
-
-DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
-DECLARE_ASM_CONST(16, double, pd_2)[2] = { 2.0, 2.0 };
-
-#if HAVE_SSE2_INLINE
-
-static void lpc_apply_welch_window_sse2(const int32_t *data, int len,
- double *w_data)
-{
- double c = 2.0 / (len-1.0);
- int n2 = len>>1;
- x86_reg i = -n2*sizeof(int32_t);
- x86_reg j = n2*sizeof(int32_t);
- __asm__ volatile(
- "movsd %4, %%xmm7 \n\t"
- "movapd "MANGLE(pd_1)", %%xmm6 \n\t"
- "movapd "MANGLE(pd_2)", %%xmm5 \n\t"
- "movlhps %%xmm7, %%xmm7 \n\t"
- "subpd %%xmm5, %%xmm7 \n\t"
- "addsd %%xmm6, %%xmm7 \n\t"
- "test $1, %5 \n\t"
- "jz 2f \n\t"
-#define WELCH(MOVPD, offset)\
- "1: \n\t"\
- "movapd %%xmm7, %%xmm1 \n\t"\
- "mulpd %%xmm1, %%xmm1 \n\t"\
- "movapd %%xmm6, %%xmm0 \n\t"\
- "subpd %%xmm1, %%xmm0 \n\t"\
- "pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\
- "cvtpi2pd (%3,%0), %%xmm2 \n\t"\
- "cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\
- "mulpd %%xmm0, %%xmm2 \n\t"\
- "mulpd %%xmm1, %%xmm3 \n\t"\
- "movapd %%xmm2, (%2,%0,2) \n\t"\
- MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\
- "subpd %%xmm5, %%xmm7 \n\t"\
- "sub $8, %1 \n\t"\
- "add $8, %0 \n\t"\
- "jl 1b \n\t"\
-
- WELCH("movupd", -1)
- "jmp 3f \n\t"
- "2: \n\t"
- WELCH("movapd", -2)
- "3: \n\t"
- :"+&r"(i), "+&r"(j)
- :"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len)
- XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm5", "%xmm6", "%xmm7")
- );
-#undef WELCH
-}
-
-static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
- double *autoc)
-{
- int j;
-
- if((x86_reg)data & 15)
- data++;
-
- for(j=0; j<lag; j+=2){
- x86_reg i = -len*sizeof(double);
- if(j == lag-2) {
- __asm__ volatile(
- "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
- "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
- "movsd "MANGLE(pd_1)", %%xmm2 \n\t"
- "1: \n\t"
- "movapd (%2,%0), %%xmm3 \n\t"
- "movupd -8(%3,%0), %%xmm4 \n\t"
- "movapd (%3,%0), %%xmm5 \n\t"
- "mulpd %%xmm3, %%xmm4 \n\t"
- "mulpd %%xmm3, %%xmm5 \n\t"
- "mulpd -16(%3,%0), %%xmm3 \n\t"
- "addpd %%xmm4, %%xmm1 \n\t"
- "addpd %%xmm5, %%xmm0 \n\t"
- "addpd %%xmm3, %%xmm2 \n\t"
- "add $16, %0 \n\t"
- "jl 1b \n\t"
- "movhlps %%xmm0, %%xmm3 \n\t"
- "movhlps %%xmm1, %%xmm4 \n\t"
- "movhlps %%xmm2, %%xmm5 \n\t"
- "addsd %%xmm3, %%xmm0 \n\t"
- "addsd %%xmm4, %%xmm1 \n\t"
- "addsd %%xmm5, %%xmm2 \n\t"
- "movsd %%xmm0, (%1) \n\t"
- "movsd %%xmm1, 8(%1) \n\t"
- "movsd %%xmm2, 16(%1) \n\t"
- :"+&r"(i)
- :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
- :"memory"
- );
- } else {
- __asm__ volatile(
- "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
- "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
- "1: \n\t"
- "movapd (%3,%0), %%xmm3 \n\t"
- "movupd -8(%4,%0), %%xmm4 \n\t"
- "mulpd %%xmm3, %%xmm4 \n\t"
- "mulpd (%4,%0), %%xmm3 \n\t"
- "addpd %%xmm4, %%xmm1 \n\t"
- "addpd %%xmm3, %%xmm0 \n\t"
- "add $16, %0 \n\t"
- "jl 1b \n\t"
- "movhlps %%xmm0, %%xmm3 \n\t"
- "movhlps %%xmm1, %%xmm4 \n\t"
- "addsd %%xmm3, %%xmm0 \n\t"
- "addsd %%xmm4, %%xmm1 \n\t"
- "movsd %%xmm0, %1 \n\t"
- "movsd %%xmm1, %2 \n\t"
- :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
- :"r"(data+len), "r"(data+len-j)
- );
- }
- }
-}
-
-#endif /* HAVE_SSE2_INLINE */
-
-av_cold void ff_lpc_init_x86(LPCContext *c)
-{
-#if HAVE_SSE2_INLINE
- int cpu_flags = av_get_cpu_flags();
-
- if (HAVE_SSE2_INLINE && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
- c->lpc_apply_welch_window = lpc_apply_welch_window_sse2;
- c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
- }
-#endif /* HAVE_SSE2_INLINE */
-}
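For orientation, each xmm accumulator in lpc_compute_autocorr_sse2 is seeded with the 1.0 from pd_1 and then gathers one lagged product sum; the loads step backwards past data[0] (down to data[-lag] in the final iteration), so the buffer is assumed to carry that much history. Below is a scalar sketch of the per-lag computation, with invented names and the explicit assumption that at least lag zero-valued samples precede data[0]:

/* Scalar equivalent of the per-lag accumulation above (sketch only). */
static void autocorr_sketch(const double *data, int len, int lag, double *autoc)
{
    int i, j;
    for (j = 0; j <= lag; j++) {
        double sum = 1.0;                  /* matches the pd_1 seed of the accumulators */
        for (i = 0; i < len; i++)
            sum += data[i] * data[i - j];  /* i < j reads the zero-padded history */
        autoc[j] = sum;
    }
}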
diff --git a/ffmpeg/libavcodec/x86/mathops.h b/ffmpeg/libavcodec/x86/mathops.h
deleted file mode 100644
index 9c48afe..0000000
--- a/ffmpeg/libavcodec/x86/mathops.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * simple math operations
- * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_MATHOPS_H
-#define AVCODEC_X86_MATHOPS_H
-
-#include "config.h"
-#include "libavutil/common.h"
-
-#if HAVE_INLINE_ASM
-
-#if ARCH_X86_32
-
-#define MULL MULL
-static av_always_inline av_const int MULL(int a, int b, unsigned shift)
-{
- int rt, dummy;
- __asm__ (
- "imull %3 \n\t"
- "shrdl %4, %%edx, %%eax \n\t"
- :"=a"(rt), "=d"(dummy)
- :"a"(a), "rm"(b), "ci"((uint8_t)shift)
- );
- return rt;
-}
-
-#define MULH MULH
-static av_always_inline av_const int MULH(int a, int b)
-{
- int rt, dummy;
- __asm__ (
- "imull %3"
- :"=d"(rt), "=a"(dummy)
- :"a"(a), "rm"(b)
- );
- return rt;
-}
-
-#define MUL64 MUL64
-static av_always_inline av_const int64_t MUL64(int a, int b)
-{
- int64_t rt;
- __asm__ (
- "imull %2"
- :"=A"(rt)
- :"a"(a), "rm"(b)
- );
- return rt;
-}
-
-#endif /* ARCH_X86_32 */
-
-#if HAVE_I686
-/* median of 3 */
-#define mid_pred mid_pred
-static inline av_const int mid_pred(int a, int b, int c)
-{
- int i=b;
- __asm__ (
- "cmp %2, %1 \n\t"
- "cmovg %1, %0 \n\t"
- "cmovg %2, %1 \n\t"
- "cmp %3, %1 \n\t"
- "cmovl %3, %1 \n\t"
- "cmp %1, %0 \n\t"
- "cmovg %1, %0 \n\t"
- :"+&r"(i), "+&r"(a)
- :"r"(b), "r"(c)
- );
- return i;
-}
-
-#define COPY3_IF_LT(x, y, a, b, c, d)\
-__asm__ volatile(\
- "cmpl %0, %3 \n\t"\
- "cmovl %3, %0 \n\t"\
- "cmovl %4, %1 \n\t"\
- "cmovl %5, %2 \n\t"\
- : "+&r" (x), "+&r" (a), "+r" (c)\
- : "r" (y), "r" (b), "r" (d)\
-);
-#endif /* HAVE_I686 */
-
-#define MASK_ABS(mask, level) \
- __asm__ ("cltd \n\t" \
- "xorl %1, %0 \n\t" \
- "subl %1, %0 \n\t" \
- : "+a"(level), "=&d"(mask))
-
-// avoid +32 for shift optimization (gcc should do that ...)
-#define NEG_SSR32 NEG_SSR32
-static inline int32_t NEG_SSR32( int32_t a, int8_t s){
- __asm__ ("sarl %1, %0\n\t"
- : "+r" (a)
- : "ic" ((uint8_t)(-s))
- );
- return a;
-}
-
-#define NEG_USR32 NEG_USR32
-static inline uint32_t NEG_USR32(uint32_t a, int8_t s){
- __asm__ ("shrl %1, %0\n\t"
- : "+r" (a)
- : "ic" ((uint8_t)(-s))
- );
- return a;
-}
-
-#endif /* HAVE_INLINE_ASM */
-#endif /* AVCODEC_X86_MATHOPS_H */
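Portable C equivalents of the inline-asm helpers above, as a sketch: mid_pred is a median of three, MULH keeps the high half of a 32x32 signed multiply, and NEG_SSR32/NEG_USR32 shift right by (32 - s), which the asm expresses as a shift by -s because the CPU masks shift counts to 5 bits. The sketch assumes 1 <= s <= 32 so the C shift count stays in range.

#include <stdint.h>

/* median of three */
static inline int mid_pred_scalar(int a, int b, int c)
{
    if (a > b) { int t = a; a = b; b = t; }  /* ensure a <= b       */
    if (b > c) b = c;                        /* b = min(b, c)       */
    return a > b ? a : b;                    /* median = max(a, b)  */
}

/* MULH: high 32 bits of a 32x32 -> 64 signed multiply */
static inline int mulh_scalar(int a, int b)
{
    return (int)(((int64_t)a * b) >> 32);
}

/* NEG_SSR32 / NEG_USR32: arithmetic / logical right shift by (32 - s) */
static inline int32_t neg_ssr32_scalar(int32_t a, int8_t s)
{
    return a >> (32 - s);
}

static inline uint32_t neg_usr32_scalar(uint32_t a, int8_t s)
{
    return a >> (32 - s);
}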
diff --git a/ffmpeg/libavcodec/x86/mlpdsp.c b/ffmpeg/libavcodec/x86/mlpdsp.c
deleted file mode 100644
index 94849b7..0000000
--- a/ffmpeg/libavcodec/x86/mlpdsp.c
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * MLP DSP functions x86-optimized
- * Copyright (c) 2009 Ramiro Polla
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/mlpdsp.h"
-#include "libavcodec/mlp.h"
-
-#if HAVE_7REGS && HAVE_INLINE_ASM
-
-extern char ff_mlp_firorder_8;
-extern char ff_mlp_firorder_7;
-extern char ff_mlp_firorder_6;
-extern char ff_mlp_firorder_5;
-extern char ff_mlp_firorder_4;
-extern char ff_mlp_firorder_3;
-extern char ff_mlp_firorder_2;
-extern char ff_mlp_firorder_1;
-extern char ff_mlp_firorder_0;
-
-extern char ff_mlp_iirorder_4;
-extern char ff_mlp_iirorder_3;
-extern char ff_mlp_iirorder_2;
-extern char ff_mlp_iirorder_1;
-extern char ff_mlp_iirorder_0;
-
-static const void *firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
- &ff_mlp_firorder_2, &ff_mlp_firorder_3,
- &ff_mlp_firorder_4, &ff_mlp_firorder_5,
- &ff_mlp_firorder_6, &ff_mlp_firorder_7,
- &ff_mlp_firorder_8 };
-static const void *iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
- &ff_mlp_iirorder_2, &ff_mlp_iirorder_3,
- &ff_mlp_iirorder_4 };
-
-#if ARCH_X86_64
-
-#define MLPMUL(label, offset, offs, offc) \
- LABEL_MANGLE(label)": \n\t" \
- "movslq "offset"+"offs"(%0), %%rax\n\t" \
- "movslq "offset"+"offc"(%1), %%rdx\n\t" \
- "imul %%rdx, %%rax\n\t" \
- "add %%rax, %%rsi\n\t"
-
-#define FIRMULREG(label, offset, firc)\
- LABEL_MANGLE(label)": \n\t" \
- "movslq "#offset"(%0), %%rax\n\t" \
- "imul %"#firc", %%rax\n\t" \
- "add %%rax, %%rsi\n\t"
-
-#define CLEAR_ACCUM \
- "xor %%rsi, %%rsi\n\t"
-
-#define SHIFT_ACCUM \
- "shr %%cl, %%rsi\n\t"
-
-#define ACCUM "%%rdx"
-#define RESULT "%%rsi"
-#define RESULT32 "%%esi"
-
-#else /* if ARCH_X86_32 */
-
-#define MLPMUL(label, offset, offs, offc) \
- LABEL_MANGLE(label)": \n\t" \
- "mov "offset"+"offs"(%0), %%eax\n\t" \
- "imull "offset"+"offc"(%1) \n\t" \
- "add %%eax , %%esi\n\t" \
- "adc %%edx , %%ecx\n\t"
-
-#define FIRMULREG(label, offset, firc) \
- MLPMUL(label, #offset, "0", "0")
-
-#define CLEAR_ACCUM \
- "xor %%esi, %%esi\n\t" \
- "xor %%ecx, %%ecx\n\t"
-
-#define SHIFT_ACCUM \
- "mov %%ecx, %%edx\n\t" \
- "mov %%esi, %%eax\n\t" \
- "movzbl %7 , %%ecx\n\t" \
- "shrd %%cl, %%edx, %%eax\n\t" \
-
-#define ACCUM "%%edx"
-#define RESULT "%%eax"
-#define RESULT32 "%%eax"
-
-#endif /* !ARCH_X86_64 */
-
-#define BINC AV_STRINGIFY(4* MAX_CHANNELS)
-#define IOFFS AV_STRINGIFY(4*(MAX_FIR_ORDER + MAX_BLOCKSIZE))
-#define IOFFC AV_STRINGIFY(4* MAX_FIR_ORDER)
-
-#define FIRMUL(label, offset) MLPMUL(label, #offset, "0", "0")
-#define IIRMUL(label, offset) MLPMUL(label, #offset, IOFFS, IOFFC)
-
-static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
- int firorder, int iirorder,
- unsigned int filter_shift, int32_t mask,
- int blocksize, int32_t *sample_buffer)
-{
- const void *firjump = firtable[firorder];
- const void *iirjump = iirtable[iirorder];
-
- blocksize = -blocksize;
-
- __asm__ volatile(
- "1: \n\t"
- CLEAR_ACCUM
- "jmp *%5 \n\t"
- FIRMUL (ff_mlp_firorder_8, 0x1c )
- FIRMUL (ff_mlp_firorder_7, 0x18 )
- FIRMUL (ff_mlp_firorder_6, 0x14 )
- FIRMUL (ff_mlp_firorder_5, 0x10 )
- FIRMUL (ff_mlp_firorder_4, 0x0c )
- FIRMULREG(ff_mlp_firorder_3, 0x08,10)
- FIRMULREG(ff_mlp_firorder_2, 0x04, 9)
- FIRMULREG(ff_mlp_firorder_1, 0x00, 8)
- LABEL_MANGLE(ff_mlp_firorder_0)":\n\t"
- "jmp *%6 \n\t"
- IIRMUL (ff_mlp_iirorder_4, 0x0c )
- IIRMUL (ff_mlp_iirorder_3, 0x08 )
- IIRMUL (ff_mlp_iirorder_2, 0x04 )
- IIRMUL (ff_mlp_iirorder_1, 0x00 )
- LABEL_MANGLE(ff_mlp_iirorder_0)":\n\t"
- SHIFT_ACCUM
- "mov "RESULT" ,"ACCUM" \n\t"
- "add (%2) ,"RESULT" \n\t"
- "and %4 ,"RESULT" \n\t"
- "sub $4 , %0 \n\t"
- "mov "RESULT32", (%0) \n\t"
- "mov "RESULT32", (%2) \n\t"
- "add $"BINC" , %2 \n\t"
- "sub "ACCUM" ,"RESULT" \n\t"
- "mov "RESULT32","IOFFS"(%0) \n\t"
- "incl %3 \n\t"
- "js 1b \n\t"
- : /* 0*/"+r"(state),
- /* 1*/"+r"(coeff),
- /* 2*/"+r"(sample_buffer),
-#if ARCH_X86_64
- /* 3*/"+r"(blocksize)
- : /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump),
- /* 6*/"r"(iirjump) , /* 7*/"c"(filter_shift)
- , /* 8*/"r"((int64_t)coeff[0])
- , /* 9*/"r"((int64_t)coeff[1])
- , /*10*/"r"((int64_t)coeff[2])
- : "rax", "rdx", "rsi"
-#else /* ARCH_X86_32 */
- /* 3*/"+m"(blocksize)
- : /* 4*/"m"( mask), /* 5*/"m"(firjump),
- /* 6*/"m"(iirjump) , /* 7*/"m"(filter_shift)
- : "eax", "edx", "esi", "ecx"
-#endif /* !ARCH_X86_64 */
- );
-}
-
-#endif /* HAVE_7REGS && HAVE_INLINE_ASM */
-
-av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c)
-{
-#if HAVE_7REGS && HAVE_INLINE_ASM
- int cpu_flags = av_get_cpu_flags();
- if (INLINE_MMX(cpu_flags))
- c->mlp_filter_channel = mlp_filter_channel_x86;
-#endif
-}
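The data flow of the jump-table asm is easier to follow in scalar form. Below is a sketch of the per-sample filtering loop, using the MAX_CHANNELS / MAX_FIR_ORDER / MAX_BLOCKSIZE layout constants from libavcodec/mlp.h that the BINC/IOFFS/IOFFC strings encode; it illustrates the buffer layout inferred from those offsets and is not the exact reference implementation.

#include <stdint.h>
#include "libavcodec/mlp.h"   /* MAX_CHANNELS, MAX_FIR_ORDER, MAX_BLOCKSIZE */

static void mlp_filter_channel_scalar(int32_t *state, const int32_t *coeff,
                                      int firorder, int iirorder,
                                      unsigned int filter_shift, int32_t mask,
                                      int blocksize, int32_t *sample_buffer)
{
    /* state points into the middle of a history buffer, so it can be
     * decremented as new results are pushed in front of it. */
    int32_t *firbuf = state;                                  /* FIR history */
    int32_t *iirbuf = state + MAX_FIR_ORDER + MAX_BLOCKSIZE;  /* IIR history */
    const int32_t *fircoeff = coeff;
    const int32_t *iircoeff = coeff + MAX_FIR_ORDER;

    for (int i = 0; i < blocksize; i++) {
        int64_t accum = 0;

        for (int o = 0; o < firorder; o++)
            accum += (int64_t)firbuf[o] * fircoeff[o];
        for (int o = 0; o < iirorder; o++)
            accum += (int64_t)iirbuf[o] * iircoeff[o];

        accum >>= filter_shift;

        int32_t result = ((int32_t)(accum + *sample_buffer)) & mask;

        *--firbuf = result;                   /* push result into FIR history   */
        *--iirbuf = result - (int32_t)accum;  /* push residual into IIR history */
        *sample_buffer = result;
        sample_buffer += MAX_CHANNELS;        /* BINC: next sample, same channel */
    }
}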
diff --git a/ffmpeg/libavcodec/x86/motion_est.c b/ffmpeg/libavcodec/x86/motion_est.c
deleted file mode 100644
index 5f5d93e..0000000
--- a/ffmpeg/libavcodec/x86/motion_est.c
+++ /dev/null
@@ -1,474 +0,0 @@
-/*
- * MMX optimized motion estimation
- * Copyright (c) 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer
- *
- * mostly by Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/avassert.h"
-#include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "dsputil_x86.h"
-
-#if HAVE_INLINE_ASM
-
-DECLARE_ASM_CONST(8, uint64_t, round_tab)[3]={
-0x0000000000000000ULL,
-0x0001000100010001ULL,
-0x0002000200020002ULL,
-};
-
-DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL;
-
-static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
-{
- x86_reg len= -(x86_reg)stride*h;
- __asm__ volatile(
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1, %%"REG_a"), %%mm0 \n\t"
- "movq (%2, %%"REG_a"), %%mm2 \n\t"
- "movq (%2, %%"REG_a"), %%mm4 \n\t"
- "add %3, %%"REG_a" \n\t"
- "psubusb %%mm0, %%mm2 \n\t"
- "psubusb %%mm4, %%mm0 \n\t"
- "movq (%1, %%"REG_a"), %%mm1 \n\t"
- "movq (%2, %%"REG_a"), %%mm3 \n\t"
- "movq (%2, %%"REG_a"), %%mm5 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm5, %%mm1 \n\t"
- "por %%mm2, %%mm0 \n\t"
- "por %%mm1, %%mm3 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm3, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm3 \n\t"
- "punpckhbw %%mm7, %%mm2 \n\t"
- "paddw %%mm1, %%mm0 \n\t"
- "paddw %%mm3, %%mm2 \n\t"
- "paddw %%mm2, %%mm0 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "add %3, %%"REG_a" \n\t"
- " js 1b \n\t"
- : "+a" (len)
- : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride)
- );
-}
-
-static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
- int stride, int h)
-{
- __asm__ volatile(
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "psadbw (%2), %%mm0 \n\t"
- "psadbw (%2, %3), %%mm1 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "paddw %%mm1, %%mm6 \n\t"
- "lea (%1,%3,2), %1 \n\t"
- "lea (%2,%3,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2)
- : "r" ((x86_reg)stride)
- );
-}
-
-static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
-{
- int ret;
- __asm__ volatile(
- "pxor %%xmm2, %%xmm2 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movdqu (%1), %%xmm0 \n\t"
- "movdqu (%1, %4), %%xmm1 \n\t"
- "psadbw (%2), %%xmm0 \n\t"
- "psadbw (%2, %4), %%xmm1 \n\t"
- "paddw %%xmm0, %%xmm2 \n\t"
- "paddw %%xmm1, %%xmm2 \n\t"
- "lea (%1,%4,2), %1 \n\t"
- "lea (%2,%4,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- "movhlps %%xmm2, %%xmm0 \n\t"
- "paddw %%xmm0, %%xmm2 \n\t"
- "movd %%xmm2, %3 \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2), "=r"(ret)
- : "r" ((x86_reg)stride)
- );
- return ret;
-}
-
-static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
- int stride, int h)
-{
- __asm__ volatile(
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "pavgb 1(%1), %%mm0 \n\t"
- "pavgb 1(%1, %3), %%mm1 \n\t"
- "psadbw (%2), %%mm0 \n\t"
- "psadbw (%2, %3), %%mm1 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "paddw %%mm1, %%mm6 \n\t"
- "lea (%1,%3,2), %1 \n\t"
- "lea (%2,%3,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2)
- : "r" ((x86_reg)stride)
- );
-}
-
-static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
- int stride, int h)
-{
- __asm__ volatile(
- "movq (%1), %%mm0 \n\t"
- "add %3, %1 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "pavgb %%mm1, %%mm0 \n\t"
- "pavgb %%mm2, %%mm1 \n\t"
- "psadbw (%2), %%mm0 \n\t"
- "psadbw (%2, %3), %%mm1 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "paddw %%mm1, %%mm6 \n\t"
- "movq %%mm2, %%mm0 \n\t"
- "lea (%1,%3,2), %1 \n\t"
- "lea (%2,%3,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2)
- : "r" ((x86_reg)stride)
- );
-}
-
-static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
- int stride, int h)
-{
- __asm__ volatile(
- "movq "MANGLE(bone)", %%mm5 \n\t"
- "movq (%1), %%mm0 \n\t"
- "pavgb 1(%1), %%mm0 \n\t"
- "add %3, %1 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1), %%mm1 \n\t"
- "movq (%1,%3), %%mm2 \n\t"
- "pavgb 1(%1), %%mm1 \n\t"
- "pavgb 1(%1,%3), %%mm2 \n\t"
- "psubusb %%mm5, %%mm1 \n\t"
- "pavgb %%mm1, %%mm0 \n\t"
- "pavgb %%mm2, %%mm1 \n\t"
- "psadbw (%2), %%mm0 \n\t"
- "psadbw (%2,%3), %%mm1 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "paddw %%mm1, %%mm6 \n\t"
- "movq %%mm2, %%mm0 \n\t"
- "lea (%1,%3,2), %1 \n\t"
- "lea (%2,%3,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2)
- : "r" ((x86_reg)stride)
- );
-}
-
-static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
-{
- x86_reg len= -(x86_reg)stride*h;
- __asm__ volatile(
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1, %%"REG_a"), %%mm0 \n\t"
- "movq (%2, %%"REG_a"), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm2 \n\t"
- "movq (%2, %%"REG_a"), %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "punpckhbw %%mm7, %%mm2 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "paddw %%mm0, %%mm1 \n\t"
- "paddw %%mm2, %%mm3 \n\t"
- "movq (%3, %%"REG_a"), %%mm4 \n\t"
- "movq (%3, %%"REG_a"), %%mm2 \n\t"
- "paddw %%mm5, %%mm1 \n\t"
- "paddw %%mm5, %%mm3 \n\t"
- "psrlw $1, %%mm1 \n\t"
- "psrlw $1, %%mm3 \n\t"
- "packuswb %%mm3, %%mm1 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm2, %%mm1 \n\t"
- "por %%mm4, %%mm1 \n\t"
- "movq %%mm1, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t"
- "paddw %%mm1, %%mm0 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "add %4, %%"REG_a" \n\t"
- " js 1b \n\t"
- : "+a" (len)
- : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride)
- );
-}
-
-static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
-{
- x86_reg len= -(x86_reg)stride*h;
- __asm__ volatile(
- "movq (%1, %%"REG_a"), %%mm0 \n\t"
- "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "paddw %%mm2, %%mm0 \n\t"
- "paddw %%mm3, %%mm1 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%2, %%"REG_a"), %%mm2 \n\t"
- "movq 1(%2, %%"REG_a"), %%mm4 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddw %%mm4, %%mm2 \n\t"
- "paddw %%mm5, %%mm3 \n\t"
- "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
- "paddw %%mm2, %%mm0 \n\t"
- "paddw %%mm3, %%mm1 \n\t"
- "paddw %%mm5, %%mm0 \n\t"
- "paddw %%mm5, %%mm1 \n\t"
- "movq (%3, %%"REG_a"), %%mm4 \n\t"
- "movq (%3, %%"REG_a"), %%mm5 \n\t"
- "psrlw $2, %%mm0 \n\t"
- "psrlw $2, %%mm1 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "psubusb %%mm0, %%mm4 \n\t"
- "psubusb %%mm5, %%mm0 \n\t"
- "por %%mm4, %%mm0 \n\t"
- "movq %%mm0, %%mm4 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpckhbw %%mm7, %%mm4 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "paddw %%mm4, %%mm6 \n\t"
- "movq %%mm2, %%mm0 \n\t"
- "movq %%mm3, %%mm1 \n\t"
- "add %4, %%"REG_a" \n\t"
- " js 1b \n\t"
- : "+a" (len)
- : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride)
- );
-}
-
-static inline int sum_mmx(void)
-{
- int ret;
- __asm__ volatile(
- "movq %%mm6, %%mm0 \n\t"
- "psrlq $32, %%mm6 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "movq %%mm6, %%mm0 \n\t"
- "psrlq $16, %%mm6 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "movd %%mm6, %0 \n\t"
- : "=r" (ret)
- );
- return ret&0xFFFF;
-}
-
-static inline int sum_mmxext(void)
-{
- int ret;
- __asm__ volatile(
- "movd %%mm6, %0 \n\t"
- : "=r" (ret)
- );
- return ret;
-}
-
-static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
-{
- sad8_2_mmx(blk1, blk1+1, blk2, stride, h);
-}
-static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
-{
- sad8_2_mmx(blk1, blk1+stride, blk2, stride, h);
-}
-
-
-#define PIX_SAD(suf)\
-static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
-{\
- av_assert2(h==8);\
- __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
- "pxor %%mm6, %%mm6 \n\t":);\
-\
- sad8_1_ ## suf(blk1, blk2, stride, 8);\
-\
- return sum_ ## suf();\
-}\
-static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
-{\
- av_assert2(h==8);\
- __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
- "pxor %%mm6, %%mm6 \n\t"\
- "movq %0, %%mm5 \n\t"\
- :: "m"(round_tab[1]) \
- );\
-\
- sad8_x2a_ ## suf(blk1, blk2, stride, 8);\
-\
- return sum_ ## suf();\
-}\
-\
-static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
-{\
- av_assert2(h==8);\
- __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
- "pxor %%mm6, %%mm6 \n\t"\
- "movq %0, %%mm5 \n\t"\
- :: "m"(round_tab[1]) \
- );\
-\
- sad8_y2a_ ## suf(blk1, blk2, stride, 8);\
-\
- return sum_ ## suf();\
-}\
-\
-static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
-{\
- av_assert2(h==8);\
- __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
- "pxor %%mm6, %%mm6 \n\t"\
- ::);\
-\
- sad8_4_ ## suf(blk1, blk2, stride, 8);\
-\
- return sum_ ## suf();\
-}\
-\
-static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
-{\
- __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
- "pxor %%mm6, %%mm6 \n\t":);\
-\
- sad8_1_ ## suf(blk1 , blk2 , stride, h);\
- sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
-\
- return sum_ ## suf();\
-}\
-static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
-{\
- __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
- "pxor %%mm6, %%mm6 \n\t"\
- "movq %0, %%mm5 \n\t"\
- :: "m"(round_tab[1]) \
- );\
-\
- sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\
- sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\
-\
- return sum_ ## suf();\
-}\
-static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
-{\
- __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
- "pxor %%mm6, %%mm6 \n\t"\
- "movq %0, %%mm5 \n\t"\
- :: "m"(round_tab[1]) \
- );\
-\
- sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\
- sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\
-\
- return sum_ ## suf();\
-}\
-static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
-{\
- __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
- "pxor %%mm6, %%mm6 \n\t"\
- ::);\
-\
- sad8_4_ ## suf(blk1 , blk2 , stride, h);\
- sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
-\
- return sum_ ## suf();\
-}\
-
-PIX_SAD(mmx)
-PIX_SAD(mmxext)
-
-#endif /* HAVE_INLINE_ASM */
-
-av_cold void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx)
-{
-#if HAVE_INLINE_ASM
- int cpu_flags = av_get_cpu_flags();
-
- if (INLINE_MMX(cpu_flags)) {
- c->pix_abs[0][0] = sad16_mmx;
- c->pix_abs[0][1] = sad16_x2_mmx;
- c->pix_abs[0][2] = sad16_y2_mmx;
- c->pix_abs[0][3] = sad16_xy2_mmx;
- c->pix_abs[1][0] = sad8_mmx;
- c->pix_abs[1][1] = sad8_x2_mmx;
- c->pix_abs[1][2] = sad8_y2_mmx;
- c->pix_abs[1][3] = sad8_xy2_mmx;
-
- c->sad[0]= sad16_mmx;
- c->sad[1]= sad8_mmx;
- }
- if (INLINE_MMXEXT(cpu_flags)) {
- c->pix_abs[0][0] = sad16_mmxext;
- c->pix_abs[1][0] = sad8_mmxext;
-
- c->sad[0] = sad16_mmxext;
- c->sad[1] = sad8_mmxext;
-
- if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
- c->pix_abs[0][1] = sad16_x2_mmxext;
- c->pix_abs[0][2] = sad16_y2_mmxext;
- c->pix_abs[0][3] = sad16_xy2_mmxext;
- c->pix_abs[1][1] = sad8_x2_mmxext;
- c->pix_abs[1][2] = sad8_y2_mmxext;
- c->pix_abs[1][3] = sad8_xy2_mmxext;
- }
- }
- if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
- c->sad[0]= sad16_sse2;
- }
-#endif /* HAVE_INLINE_ASM */
-}
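All of the kernels above accumulate the same scalar quantity: a sum of absolute differences between a candidate block and a reference block, with the x2/y2/xy2 variants averaging two or four neighbouring predictions first. A plain-C sketch of the core SAD for an arbitrary block size:

#include <stdint.h>
#include <stdlib.h>

/* Sum of absolute differences over a w x h block; the MMX/SSE2 code above
 * computes the same value 8 or 16 pixels per instruction. */
static int pix_sad_scalar(const uint8_t *blk1, const uint8_t *blk2,
                          int stride, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            sum += abs(blk1[x] - blk2[x]);
        blk1 += stride;
        blk2 += stride;
    }
    return sum;
}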
diff --git a/ffmpeg/libavcodec/x86/mpeg4qpel.asm b/ffmpeg/libavcodec/x86/mpeg4qpel.asm
deleted file mode 100644
index ca52375..0000000
--- a/ffmpeg/libavcodec/x86/mpeg4qpel.asm
+++ /dev/null
@@ -1,560 +0,0 @@
-;******************************************************************************
-;* mpeg4 qpel
-;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
-;* Copyright (c) 2008 Loren Merritt
-;* Copyright (c) 2013 Daniel Kang
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-cextern pb_1
-cextern pw_3
-cextern pw_15
-cextern pw_16
-cextern pw_20
-
-
-SECTION_TEXT
-
-; put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-%macro PUT_NO_RND_PIXELS8_L2 0
-cglobal put_no_rnd_pixels8_l2, 6,6
- movsxdifnidn r4, r4d
- movsxdifnidn r3, r3d
- pcmpeqb m6, m6
- test r5d, 1
- je .loop
- mova m0, [r1]
- mova m1, [r2]
- add r1, r4
- add r2, 8
- pxor m0, m6
- pxor m1, m6
- PAVGB m0, m1
- pxor m0, m6
- mova [r0], m0
- add r0, r3
- dec r5d
-.loop:
- mova m0, [r1]
- add r1, r4
- mova m1, [r1]
- add r1, r4
- mova m2, [r2]
- mova m3, [r2+8]
- pxor m0, m6
- pxor m1, m6
- pxor m2, m6
- pxor m3, m6
- PAVGB m0, m2
- PAVGB m1, m3
- pxor m0, m6
- pxor m1, m6
- mova [r0], m0
- add r0, r3
- mova [r0], m1
- add r0, r3
- mova m0, [r1]
- add r1, r4
- mova m1, [r1]
- add r1, r4
- mova m2, [r2+16]
- mova m3, [r2+24]
- pxor m0, m6
- pxor m1, m6
- pxor m2, m6
- pxor m3, m6
- PAVGB m0, m2
- PAVGB m1, m3
- pxor m0, m6
- pxor m1, m6
- mova [r0], m0
- add r0, r3
- mova [r0], m1
- add r0, r3
- add r2, 32
- sub r5d, 4
- jne .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-PUT_NO_RND_PIXELS8_L2
-
-
-; put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-%macro PUT_NO_RND_PIXELS16_l2 0
-cglobal put_no_rnd_pixels16_l2, 6,6
- movsxdifnidn r3, r3d
- movsxdifnidn r4, r4d
- pcmpeqb m6, m6
- test r5d, 1
- je .loop
- mova m0, [r1]
- mova m1, [r1+8]
- mova m2, [r2]
- mova m3, [r2+8]
- pxor m0, m6
- pxor m1, m6
- pxor m2, m6
- pxor m3, m6
- PAVGB m0, m2
- PAVGB m1, m3
- pxor m0, m6
- pxor m1, m6
- add r1, r4
- add r2, 16
- mova [r0], m0
- mova [r0+8], m1
- add r0, r3
- dec r5d
-.loop:
- mova m0, [r1]
- mova m1, [r1+8]
- add r1, r4
- mova m2, [r2]
- mova m3, [r2+8]
- pxor m0, m6
- pxor m1, m6
- pxor m2, m6
- pxor m3, m6
- PAVGB m0, m2
- PAVGB m1, m3
- pxor m0, m6
- pxor m1, m6
- mova [r0], m0
- mova [r0+8], m1
- add r0, r3
- mova m0, [r1]
- mova m1, [r1+8]
- add r1, r4
- mova m2, [r2+16]
- mova m3, [r2+24]
- pxor m0, m6
- pxor m1, m6
- pxor m2, m6
- pxor m3, m6
- PAVGB m0, m2
- PAVGB m1, m3
- pxor m0, m6
- pxor m1, m6
- mova [r0], m0
- mova [r0+8], m1
- add r0, r3
- add r2, 32
- sub r5d, 2
- jne .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-PUT_NO_RND_PIXELS16_l2
-INIT_MMX 3dnow
-PUT_NO_RND_PIXELS16_l2
-
-%macro MPEG4_QPEL16_H_LOWPASS 1
-cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16
- movsxdifnidn r2, r2d
- movsxdifnidn r3, r3d
- pxor m7, m7
-.loop:
- mova m0, [r1]
- mova m1, m0
- mova m2, m0
- punpcklbw m0, m7
- punpckhbw m1, m7
- pshufw m5, m0, 0x90
- pshufw m6, m0, 0x41
- mova m3, m2
- mova m4, m2
- psllq m2, 8
- psllq m3, 16
- psllq m4, 24
- punpckhbw m2, m7
- punpckhbw m3, m7
- punpckhbw m4, m7
- paddw m5, m3
- paddw m6, m2
- paddw m5, m5
- psubw m6, m5
- pshufw m5, m0, 6
- pmullw m6, [pw_3]
- paddw m0, m4
- paddw m5, m1
- pmullw m0, [pw_20]
- psubw m0, m5
- paddw m6, [PW_ROUND]
- paddw m0, m6
- psraw m0, 5
- mova [rsp+8], m0
- mova m0, [r1+5]
- mova m5, m0
- mova m6, m0
- psrlq m0, 8
- psrlq m5, 16
- punpcklbw m0, m7
- punpcklbw m5, m7
- paddw m2, m0
- paddw m3, m5
- paddw m2, m2
- psubw m3, m2
- mova m2, m6
- psrlq m6, 24
- punpcklbw m2, m7
- punpcklbw m6, m7
- pmullw m3, [pw_3]
- paddw m1, m2
- paddw m4, m6
- pmullw m1, [pw_20]
- psubw m3, m4
- paddw m1, [PW_ROUND]
- paddw m3, m1
- psraw m3, 5
- mova m1, [rsp+8]
- packuswb m1, m3
- OP_MOV [r0], m1, m4
- mova m1, [r1+9]
- mova m4, m1
- mova m3, m1
- psrlq m1, 8
- psrlq m4, 16
- punpcklbw m1, m7
- punpcklbw m4, m7
- paddw m5, m1
- paddw m0, m4
- paddw m5, m5
- psubw m0, m5
- mova m5, m3
- psrlq m3, 24
- pmullw m0, [pw_3]
- punpcklbw m3, m7
- paddw m2, m3
- psubw m0, m2
- mova m2, m5
- punpcklbw m2, m7
- punpckhbw m5, m7
- paddw m6, m2
- pmullw m6, [pw_20]
- paddw m0, [PW_ROUND]
- paddw m0, m6
- psraw m0, 5
- paddw m3, m5
- pshufw m6, m5, 0xf9
- paddw m6, m4
- pshufw m4, m5, 0xbe
- pshufw m5, m5, 0x6f
- paddw m4, m1
- paddw m5, m2
- paddw m6, m6
- psubw m4, m6
- pmullw m3, [pw_20]
- pmullw m4, [pw_3]
- psubw m3, m5
- paddw m4, [PW_ROUND]
- paddw m4, m3
- psraw m4, 5
- packuswb m0, m4
- OP_MOV [r0+8], m0, m4
- add r1, r3
- add r0, r2
- dec r4d
- jne .loop
- REP_RET
-%endmacro
-
-%macro PUT_OP 2-3
- mova %1, %2
-%endmacro
-
-%macro AVG_OP 2-3
- mova %3, %1
- pavgb %2, %3
- mova %1, %2
-%endmacro
-
-INIT_MMX mmxext
-%define PW_ROUND pw_16
-%define OP_MOV PUT_OP
-MPEG4_QPEL16_H_LOWPASS put
-%define PW_ROUND pw_16
-%define OP_MOV AVG_OP
-MPEG4_QPEL16_H_LOWPASS avg
-%define PW_ROUND pw_15
-%define OP_MOV PUT_OP
-MPEG4_QPEL16_H_LOWPASS put_no_rnd
-
-
-
-%macro MPEG4_QPEL8_H_LOWPASS 1
-cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8
- movsxdifnidn r2, r2d
- movsxdifnidn r3, r3d
- pxor m7, m7
-.loop:
- mova m0, [r1]
- mova m1, m0
- mova m2, m0
- punpcklbw m0, m7
- punpckhbw m1, m7
- pshufw m5, m0, 0x90
- pshufw m6, m0, 0x41
- mova m3, m2
- mova m4, m2
- psllq m2, 8
- psllq m3, 16
- psllq m4, 24
- punpckhbw m2, m7
- punpckhbw m3, m7
- punpckhbw m4, m7
- paddw m5, m3
- paddw m6, m2
- paddw m5, m5
- psubw m6, m5
- pshufw m5, m0, 0x6
- pmullw m6, [pw_3]
- paddw m0, m4
- paddw m5, m1
- pmullw m0, [pw_20]
- psubw m0, m5
- paddw m6, [PW_ROUND]
- paddw m0, m6
- psraw m0, 5
- movh m5, [r1+5]
- punpcklbw m5, m7
- pshufw m6, m5, 0xf9
- paddw m1, m5
- paddw m2, m6
- pshufw m6, m5, 0xbe
- pshufw m5, m5, 0x6f
- paddw m3, m6
- paddw m4, m5
- paddw m2, m2
- psubw m3, m2
- pmullw m1, [pw_20]
- pmullw m3, [pw_3]
- psubw m3, m4
- paddw m1, [PW_ROUND]
- paddw m3, m1
- psraw m3, 5
- packuswb m0, m3
- OP_MOV [r0], m0, m4
- add r1, r3
- add r0, r2
- dec r4d
- jne .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-%define PW_ROUND pw_16
-%define OP_MOV PUT_OP
-MPEG4_QPEL8_H_LOWPASS put
-%define PW_ROUND pw_16
-%define OP_MOV AVG_OP
-MPEG4_QPEL8_H_LOWPASS avg
-%define PW_ROUND pw_15
-%define OP_MOV PUT_OP
-MPEG4_QPEL8_H_LOWPASS put_no_rnd
-
-
-
-%macro QPEL_V_LOW 5
- paddw m0, m1
- mova m4, [pw_20]
- pmullw m4, m0
- mova m0, %4
- mova m5, %1
- paddw m5, m0
- psubw m4, m5
- mova m5, %2
- mova m6, %3
- paddw m5, m3
- paddw m6, m2
- paddw m6, m6
- psubw m5, m6
- pmullw m5, [pw_3]
- paddw m4, [PW_ROUND]
- paddw m5, m4
- psraw m5, 5
- packuswb m5, m5
- OP_MOV %5, m5, m7
- SWAP 0,1,2,3
-%endmacro
-
-%macro MPEG4_QPEL16_V_LOWPASS 1
-cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544
- movsxdifnidn r2, r2d
- movsxdifnidn r3, r3d
-
- mov r4d, 17
- mov r5, rsp
- pxor m7, m7
-.looph:
- mova m0, [r1]
- mova m1, [r1]
- mova m2, [r1+8]
- mova m3, [r1+8]
- punpcklbw m0, m7
- punpckhbw m1, m7
- punpcklbw m2, m7
- punpckhbw m3, m7
- mova [r5], m0
- mova [r5+0x88], m1
- mova [r5+0x110], m2
- mova [r5+0x198], m3
- add r5, 8
- add r1, r3
- dec r4d
- jne .looph
-
-
- ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 14*dstStride
- mov r4d, 4
- mov r1, 4
- neg r2
- lea r1, [r1+r2*8]
- lea r1, [r1+r2*4]
- lea r1, [r1+r2*2]
- neg r2
- mov r5, rsp
-.loopv:
- pxor m7, m7
- mova m0, [r5+ 0x0]
- mova m1, [r5+ 0x8]
- mova m2, [r5+0x10]
- mova m3, [r5+0x18]
- QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
- QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
- lea r0, [r0+r2*2]
- QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
- QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
- lea r0, [r0+r2*2]
- QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
- QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2]
- lea r0, [r0+r2*2]
- QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0]
- QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2]
- lea r0, [r0+r2*2]
- QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0]
- QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2]
- lea r0, [r0+r2*2]
- QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0]
- QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2]
- lea r0, [r0+r2*2]
- QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0]
- QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2]
- lea r0, [r0+r2*2]
- QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0]
- QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2]
-
- add r5, 0x88
- add r0, r1
- dec r4d
- jne .loopv
- REP_RET
-%endmacro
-
-%macro PUT_OPH 2-3
- movh %1, %2
-%endmacro
-
-%macro AVG_OPH 2-3
- movh %3, %1
- pavgb %2, %3
- movh %1, %2
-%endmacro
-
-INIT_MMX mmxext
-%define PW_ROUND pw_16
-%define OP_MOV PUT_OPH
-MPEG4_QPEL16_V_LOWPASS put
-%define PW_ROUND pw_16
-%define OP_MOV AVG_OPH
-MPEG4_QPEL16_V_LOWPASS avg
-%define PW_ROUND pw_15
-%define OP_MOV PUT_OPH
-MPEG4_QPEL16_V_LOWPASS put_no_rnd
-
-
-
-%macro MPEG4_QPEL8_V_LOWPASS 1
-cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288
- movsxdifnidn r2, r2d
- movsxdifnidn r3, r3d
-
- mov r4d, 9
- mov r5, rsp
- pxor m7, m7
-.looph:
- mova m0, [r1]
- mova m1, [r1]
- punpcklbw m0, m7
- punpckhbw m1, m7
- mova [r5], m0
- mova [r5+0x48], m1
- add r5, 8
- add r1, r3
- dec r4d
- jne .looph
-
-
- ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 6*dstStride
- mov r4d, 2
- mov r1, 4
- neg r2
- lea r1, [r1+r2*4]
- lea r1, [r1+r2*2]
- neg r2
- mov r5, rsp
-.loopv:
- pxor m7, m7
- mova m0, [r5+ 0x0]
- mova m1, [r5+ 0x8]
- mova m2, [r5+0x10]
- mova m3, [r5+0x18]
- QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
- QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
- lea r0, [r0+r2*2]
- QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
- QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
- lea r0, [r0+r2*2]
- QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
- QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2]
- lea r0, [r0+r2*2]
- QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0]
- QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2]
-
- add r5, 0x48
- add r0, r1
- dec r4d
- jne .loopv
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-%define PW_ROUND pw_16
-%define OP_MOV PUT_OPH
-MPEG4_QPEL8_V_LOWPASS put
-%define PW_ROUND pw_16
-%define OP_MOV AVG_OPH
-MPEG4_QPEL8_V_LOWPASS avg
-%define PW_ROUND pw_15
-%define OP_MOV PUT_OPH
-MPEG4_QPEL8_V_LOWPASS put_no_rnd
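The pw_20/pw_3 constants and the +16/+15 rounding selected through PW_ROUND correspond to the symmetric 8-tap MPEG-4 half-pel filter (-1, 3, -6, 20, 20, -6, 3, -1) followed by a shift of 5. A scalar sketch of one horizontal row, ignoring the edge mirroring the full routines perform; the tap arrangement is an interpretation of the constants above, not a lifted reference.

#include <stdint.h>

static inline uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* One row of the 8-tap lowpass; rnd selects +16 (normal) or +15 (no_rnd). */
static void qpel_h_lowpass_row(uint8_t *dst, const uint8_t *src,
                               int width, int rnd)
{
    for (int x = 0; x < width; x++) {
        int v = 20 * (src[x]     + src[x + 1])
              -  6 * (src[x - 1] + src[x + 2])
              +  3 * (src[x - 2] + src[x + 3])
              -       (src[x - 3] + src[x + 4]);
        dst[x] = clip_u8((v + (rnd ? 16 : 15)) >> 5);
    }
}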
diff --git a/ffmpeg/libavcodec/x86/mpegvideo.c b/ffmpeg/libavcodec/x86/mpegvideo.c
deleted file mode 100644
index b2ce680..0000000
--- a/ffmpeg/libavcodec/x86/mpegvideo.c
+++ /dev/null
@@ -1,463 +0,0 @@
-/*
- * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
- * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/avcodec.h"
-#include "libavcodec/mpegvideo.h"
-#include "dsputil_x86.h"
-
-#if HAVE_MMX_INLINE
-
-static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
- int16_t *block, int n, int qscale)
-{
- x86_reg level, qmul, qadd, nCoeffs;
-
- qmul = qscale << 1;
-
- av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
-
- if (!s->h263_aic) {
- if (n < 4)
- level = block[0] * s->y_dc_scale;
- else
- level = block[0] * s->c_dc_scale;
- qadd = (qscale - 1) | 1;
- }else{
- qadd = 0;
- level= block[0];
- }
- if(s->ac_pred)
- nCoeffs=63;
- else
- nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
-
-__asm__ volatile(
- "movd %1, %%mm6 \n\t" //qmul
- "packssdw %%mm6, %%mm6 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- "movd %2, %%mm5 \n\t" //qadd
- "pxor %%mm7, %%mm7 \n\t"
- "packssdw %%mm5, %%mm5 \n\t"
- "packssdw %%mm5, %%mm5 \n\t"
- "psubw %%mm5, %%mm7 \n\t"
- "pxor %%mm4, %%mm4 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%0, %3), %%mm0 \n\t"
- "movq 8(%0, %3), %%mm1 \n\t"
-
- "pmullw %%mm6, %%mm0 \n\t"
- "pmullw %%mm6, %%mm1 \n\t"
-
- "movq (%0, %3), %%mm2 \n\t"
- "movq 8(%0, %3), %%mm3 \n\t"
-
- "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
- "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
-
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
-
- "paddw %%mm7, %%mm0 \n\t"
- "paddw %%mm7, %%mm1 \n\t"
-
- "pxor %%mm0, %%mm2 \n\t"
- "pxor %%mm1, %%mm3 \n\t"
-
- "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
- "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
-
- "pandn %%mm2, %%mm0 \n\t"
- "pandn %%mm3, %%mm1 \n\t"
-
- "movq %%mm0, (%0, %3) \n\t"
- "movq %%mm1, 8(%0, %3) \n\t"
-
- "add $16, %3 \n\t"
- "jng 1b \n\t"
- ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
- : "memory"
- );
- block[0]= level;
-}
-
-
-static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
- int16_t *block, int n, int qscale)
-{
- x86_reg qmul, qadd, nCoeffs;
-
- qmul = qscale << 1;
- qadd = (qscale - 1) | 1;
-
- av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
-
- nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
-
-__asm__ volatile(
- "movd %1, %%mm6 \n\t" //qmul
- "packssdw %%mm6, %%mm6 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- "movd %2, %%mm5 \n\t" //qadd
- "pxor %%mm7, %%mm7 \n\t"
- "packssdw %%mm5, %%mm5 \n\t"
- "packssdw %%mm5, %%mm5 \n\t"
- "psubw %%mm5, %%mm7 \n\t"
- "pxor %%mm4, %%mm4 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%0, %3), %%mm0 \n\t"
- "movq 8(%0, %3), %%mm1 \n\t"
-
- "pmullw %%mm6, %%mm0 \n\t"
- "pmullw %%mm6, %%mm1 \n\t"
-
- "movq (%0, %3), %%mm2 \n\t"
- "movq 8(%0, %3), %%mm3 \n\t"
-
- "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
- "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
-
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
-
- "paddw %%mm7, %%mm0 \n\t"
- "paddw %%mm7, %%mm1 \n\t"
-
- "pxor %%mm0, %%mm2 \n\t"
- "pxor %%mm1, %%mm3 \n\t"
-
- "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
- "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
-
- "pandn %%mm2, %%mm0 \n\t"
- "pandn %%mm3, %%mm1 \n\t"
-
- "movq %%mm0, (%0, %3) \n\t"
- "movq %%mm1, 8(%0, %3) \n\t"
-
- "add $16, %3 \n\t"
- "jng 1b \n\t"
- ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
- : "memory"
- );
-}
-
-static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
- int16_t *block, int n, int qscale)
-{
- x86_reg nCoeffs;
- const uint16_t *quant_matrix;
- int block0;
-
- av_assert2(s->block_last_index[n]>=0);
-
- nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
-
- if (n < 4)
- block0 = block[0] * s->y_dc_scale;
- else
- block0 = block[0] * s->c_dc_scale;
- /* XXX: only mpeg1 */
- quant_matrix = s->intra_matrix;
-__asm__ volatile(
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "psrlw $15, %%mm7 \n\t"
- "movd %2, %%mm6 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- "mov %3, %%"REG_a" \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%0, %%"REG_a"), %%mm0 \n\t"
- "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm4 \n\t"
- "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
- "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
- "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
- "pxor %%mm2, %%mm2 \n\t"
- "pxor %%mm3, %%mm3 \n\t"
- "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
- "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
- "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
- "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
- "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
- "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
- "pxor %%mm4, %%mm4 \n\t"
- "pxor %%mm5, %%mm5 \n\t" // FIXME slow
- "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
- "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
- "psraw $3, %%mm0 \n\t"
- "psraw $3, %%mm1 \n\t"
- "psubw %%mm7, %%mm0 \n\t"
- "psubw %%mm7, %%mm1 \n\t"
- "por %%mm7, %%mm0 \n\t"
- "por %%mm7, %%mm1 \n\t"
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
- "psubw %%mm2, %%mm0 \n\t"
- "psubw %%mm3, %%mm1 \n\t"
- "pandn %%mm0, %%mm4 \n\t"
- "pandn %%mm1, %%mm5 \n\t"
- "movq %%mm4, (%0, %%"REG_a") \n\t"
- "movq %%mm5, 8(%0, %%"REG_a") \n\t"
-
- "add $16, %%"REG_a" \n\t"
- "js 1b \n\t"
- ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
- : "%"REG_a, "memory"
- );
- block[0]= block0;
-}
-
-static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
- int16_t *block, int n, int qscale)
-{
- x86_reg nCoeffs;
- const uint16_t *quant_matrix;
-
- av_assert2(s->block_last_index[n]>=0);
-
- nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
-
- quant_matrix = s->inter_matrix;
-__asm__ volatile(
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "psrlw $15, %%mm7 \n\t"
- "movd %2, %%mm6 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- "mov %3, %%"REG_a" \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%0, %%"REG_a"), %%mm0 \n\t"
- "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm4 \n\t"
- "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
- "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
- "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
- "pxor %%mm2, %%mm2 \n\t"
- "pxor %%mm3, %%mm3 \n\t"
- "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
- "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
- "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
- "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
- "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
- "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
- "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1
- "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1
- "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
- "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
- "pxor %%mm4, %%mm4 \n\t"
- "pxor %%mm5, %%mm5 \n\t" // FIXME slow
- "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
- "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
- "psraw $4, %%mm0 \n\t"
- "psraw $4, %%mm1 \n\t"
- "psubw %%mm7, %%mm0 \n\t"
- "psubw %%mm7, %%mm1 \n\t"
- "por %%mm7, %%mm0 \n\t"
- "por %%mm7, %%mm1 \n\t"
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
- "psubw %%mm2, %%mm0 \n\t"
- "psubw %%mm3, %%mm1 \n\t"
- "pandn %%mm0, %%mm4 \n\t"
- "pandn %%mm1, %%mm5 \n\t"
- "movq %%mm4, (%0, %%"REG_a") \n\t"
- "movq %%mm5, 8(%0, %%"REG_a") \n\t"
-
- "add $16, %%"REG_a" \n\t"
- "js 1b \n\t"
- ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
- : "%"REG_a, "memory"
- );
-}
-
-static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
- int16_t *block, int n, int qscale)
-{
- x86_reg nCoeffs;
- const uint16_t *quant_matrix;
- int block0;
-
- av_assert2(s->block_last_index[n]>=0);
-
- if(s->alternate_scan) nCoeffs= 63; //FIXME
- else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
-
- if (n < 4)
- block0 = block[0] * s->y_dc_scale;
- else
- block0 = block[0] * s->c_dc_scale;
- quant_matrix = s->intra_matrix;
-__asm__ volatile(
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "psrlw $15, %%mm7 \n\t"
- "movd %2, %%mm6 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- "mov %3, %%"REG_a" \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%0, %%"REG_a"), %%mm0 \n\t"
- "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm4 \n\t"
- "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
- "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
- "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
- "pxor %%mm2, %%mm2 \n\t"
- "pxor %%mm3, %%mm3 \n\t"
- "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
- "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
- "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
- "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
- "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
- "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
- "pxor %%mm4, %%mm4 \n\t"
- "pxor %%mm5, %%mm5 \n\t" // FIXME slow
- "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
- "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
- "psraw $3, %%mm0 \n\t"
- "psraw $3, %%mm1 \n\t"
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
- "psubw %%mm2, %%mm0 \n\t"
- "psubw %%mm3, %%mm1 \n\t"
- "pandn %%mm0, %%mm4 \n\t"
- "pandn %%mm1, %%mm5 \n\t"
- "movq %%mm4, (%0, %%"REG_a") \n\t"
- "movq %%mm5, 8(%0, %%"REG_a") \n\t"
-
- "add $16, %%"REG_a" \n\t"
- "jng 1b \n\t"
- ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
- : "%"REG_a, "memory"
- );
- block[0]= block0;
- //Note, we do not do mismatch control for intra as errors cannot accumulate
-}
-
-static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
- int16_t *block, int n, int qscale)
-{
- x86_reg nCoeffs;
- const uint16_t *quant_matrix;
-
- av_assert2(s->block_last_index[n]>=0);
-
- if(s->alternate_scan) nCoeffs= 63; //FIXME
- else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
-
- quant_matrix = s->inter_matrix;
-__asm__ volatile(
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "psrlq $48, %%mm7 \n\t"
- "movd %2, %%mm6 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- "mov %3, %%"REG_a" \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%0, %%"REG_a"), %%mm0 \n\t"
- "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm4 \n\t"
- "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
- "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
- "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
- "pxor %%mm2, %%mm2 \n\t"
- "pxor %%mm3, %%mm3 \n\t"
- "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
- "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
- "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
- "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
- "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
- "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
- "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q
- "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q
- "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
- "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
- "pxor %%mm4, %%mm4 \n\t"
- "pxor %%mm5, %%mm5 \n\t" // FIXME slow
- "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
- "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
- "psrlw $4, %%mm0 \n\t"
- "psrlw $4, %%mm1 \n\t"
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
- "psubw %%mm2, %%mm0 \n\t"
- "psubw %%mm3, %%mm1 \n\t"
- "pandn %%mm0, %%mm4 \n\t"
- "pandn %%mm1, %%mm5 \n\t"
- "pxor %%mm4, %%mm7 \n\t"
- "pxor %%mm5, %%mm7 \n\t"
- "movq %%mm4, (%0, %%"REG_a") \n\t"
- "movq %%mm5, 8(%0, %%"REG_a") \n\t"
-
- "add $16, %%"REG_a" \n\t"
- "jng 1b \n\t"
- "movd 124(%0, %3), %%mm0 \n\t"
- "movq %%mm7, %%mm6 \n\t"
- "psrlq $32, %%mm7 \n\t"
- "pxor %%mm6, %%mm7 \n\t"
- "movq %%mm7, %%mm6 \n\t"
- "psrlq $16, %%mm7 \n\t"
- "pxor %%mm6, %%mm7 \n\t"
- "pslld $31, %%mm7 \n\t"
- "psrlq $15, %%mm7 \n\t"
- "pxor %%mm7, %%mm0 \n\t"
- "movd %%mm0, 124(%0, %3) \n\t"
-
- ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
- : "%"REG_a, "memory"
- );
-}
-
-#endif /* HAVE_MMX_INLINE */
-
-av_cold void ff_MPV_common_init_x86(MpegEncContext *s)
-{
-#if HAVE_MMX_INLINE
- int cpu_flags = av_get_cpu_flags();
-
- if (INLINE_MMX(cpu_flags)) {
- s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
- s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
- s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
- s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
- if(!(s->flags & CODEC_FLAG_BITEXACT))
- s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
- s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
- }
-#endif /* HAVE_MMX_INLINE */
-}
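Per coefficient, the H.263 paths above implement the inverse quantiser level' = sign(level) * (|level| * 2*qscale) + sign(level) * qadd with qadd = (qscale - 1) | 1, leaving zero coefficients untouched. A scalar sketch of the inter case, operating directly on the coefficient array:

#include <stdint.h>

static void dct_unquantize_h263_inter_scalar(int16_t *block, int n_coeffs,
                                             int qscale)
{
    int qmul = qscale << 1;
    int qadd = (qscale - 1) | 1;

    for (int i = 0; i <= n_coeffs; i++) {
        int level = block[i];
        if (level) {
            if (level < 0)
                level = level * qmul - qadd;
            else
                level = level * qmul + qadd;
            block[i] = level;
        }
    }
}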
diff --git a/ffmpeg/libavcodec/x86/mpegvideoenc.c b/ffmpeg/libavcodec/x86/mpegvideoenc.c
deleted file mode 100644
index 7dd9959..0000000
--- a/ffmpeg/libavcodec/x86/mpegvideoenc.c
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * The simplest mpeg encoder (well, it was the simplest!)
- * Copyright (c) 2000,2001 Fabrice Bellard
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/avcodec.h"
-#include "libavcodec/dct.h"
-#include "libavcodec/mpegvideo.h"
-#include "dsputil_x86.h"
-
-/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
-DECLARE_ALIGNED(16, static uint16_t, inv_zigzag_direct16)[64];
-
-#if HAVE_MMX_INLINE
-#define COMPILE_TEMPLATE_MMXEXT 0
-#define COMPILE_TEMPLATE_SSE2 0
-#define COMPILE_TEMPLATE_SSSE3 0
-#define RENAME(a) a ## _MMX
-#define RENAMEl(a) a ## _mmx
-#include "mpegvideoenc_template.c"
-#endif /* HAVE_MMX_INLINE */
-
-#if HAVE_MMXEXT_INLINE
-#undef COMPILE_TEMPLATE_SSSE3
-#undef COMPILE_TEMPLATE_SSE2
-#undef COMPILE_TEMPLATE_MMXEXT
-#define COMPILE_TEMPLATE_MMXEXT 1
-#define COMPILE_TEMPLATE_SSE2 0
-#define COMPILE_TEMPLATE_SSSE3 0
-#undef RENAME
-#undef RENAMEl
-#define RENAME(a) a ## _MMXEXT
-#define RENAMEl(a) a ## _mmxext
-#include "mpegvideoenc_template.c"
-#endif /* HAVE_MMXEXT_INLINE */
-
-#if HAVE_SSE2_INLINE
-#undef COMPILE_TEMPLATE_MMXEXT
-#undef COMPILE_TEMPLATE_SSE2
-#undef COMPILE_TEMPLATE_SSSE3
-#define COMPILE_TEMPLATE_MMXEXT 0
-#define COMPILE_TEMPLATE_SSE2 1
-#define COMPILE_TEMPLATE_SSSE3 0
-#undef RENAME
-#undef RENAMEl
-#define RENAME(a) a ## _SSE2
-#define RENAMEl(a) a ## _sse2
-#include "mpegvideoenc_template.c"
-#endif /* HAVE_SSE2_INLINE */
-
-#if HAVE_SSSE3_INLINE
-#undef COMPILE_TEMPLATE_MMXEXT
-#undef COMPILE_TEMPLATE_SSE2
-#undef COMPILE_TEMPLATE_SSSE3
-#define COMPILE_TEMPLATE_MMXEXT 0
-#define COMPILE_TEMPLATE_SSE2 1
-#define COMPILE_TEMPLATE_SSSE3 1
-#undef RENAME
-#undef RENAMEl
-#define RENAME(a) a ## _SSSE3
-#define RENAMEl(a) a ## _sse2
-#include "mpegvideoenc_template.c"
-#endif /* HAVE_SSSE3_INLINE */
-
-#if HAVE_INLINE_ASM
-static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){
- const int intra= s->mb_intra;
- int *sum= s->dct_error_sum[intra];
- uint16_t *offset= s->dct_offset[intra];
-
- s->dct_count[intra]++;
-
- __asm__ volatile(
- "pxor %%mm7, %%mm7 \n\t"
- "1: \n\t"
- "pxor %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "movq (%0), %%mm2 \n\t"
- "movq 8(%0), %%mm3 \n\t"
- "pcmpgtw %%mm2, %%mm0 \n\t"
- "pcmpgtw %%mm3, %%mm1 \n\t"
- "pxor %%mm0, %%mm2 \n\t"
- "pxor %%mm1, %%mm3 \n\t"
- "psubw %%mm0, %%mm2 \n\t"
- "psubw %%mm1, %%mm3 \n\t"
- "movq %%mm2, %%mm4 \n\t"
- "movq %%mm3, %%mm5 \n\t"
- "psubusw (%2), %%mm2 \n\t"
- "psubusw 8(%2), %%mm3 \n\t"
- "pxor %%mm0, %%mm2 \n\t"
- "pxor %%mm1, %%mm3 \n\t"
- "psubw %%mm0, %%mm2 \n\t"
- "psubw %%mm1, %%mm3 \n\t"
- "movq %%mm2, (%0) \n\t"
- "movq %%mm3, 8(%0) \n\t"
- "movq %%mm4, %%mm2 \n\t"
- "movq %%mm5, %%mm3 \n\t"
- "punpcklwd %%mm7, %%mm4 \n\t"
- "punpckhwd %%mm7, %%mm2 \n\t"
- "punpcklwd %%mm7, %%mm5 \n\t"
- "punpckhwd %%mm7, %%mm3 \n\t"
- "paddd (%1), %%mm4 \n\t"
- "paddd 8(%1), %%mm2 \n\t"
- "paddd 16(%1), %%mm5 \n\t"
- "paddd 24(%1), %%mm3 \n\t"
- "movq %%mm4, (%1) \n\t"
- "movq %%mm2, 8(%1) \n\t"
- "movq %%mm5, 16(%1) \n\t"
- "movq %%mm3, 24(%1) \n\t"
- "add $16, %0 \n\t"
- "add $32, %1 \n\t"
- "add $16, %2 \n\t"
- "cmp %3, %0 \n\t"
- " jb 1b \n\t"
- : "+r" (block), "+r" (sum), "+r" (offset)
- : "r"(block+64)
- );
-}
-
-static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){
- const int intra= s->mb_intra;
- int *sum= s->dct_error_sum[intra];
- uint16_t *offset= s->dct_offset[intra];
-
- s->dct_count[intra]++;
-
- __asm__ volatile(
- "pxor %%xmm7, %%xmm7 \n\t"
- "1: \n\t"
- "pxor %%xmm0, %%xmm0 \n\t"
- "pxor %%xmm1, %%xmm1 \n\t"
- "movdqa (%0), %%xmm2 \n\t"
- "movdqa 16(%0), %%xmm3 \n\t"
- "pcmpgtw %%xmm2, %%xmm0 \n\t"
- "pcmpgtw %%xmm3, %%xmm1 \n\t"
- "pxor %%xmm0, %%xmm2 \n\t"
- "pxor %%xmm1, %%xmm3 \n\t"
- "psubw %%xmm0, %%xmm2 \n\t"
- "psubw %%xmm1, %%xmm3 \n\t"
- "movdqa %%xmm2, %%xmm4 \n\t"
- "movdqa %%xmm3, %%xmm5 \n\t"
- "psubusw (%2), %%xmm2 \n\t"
- "psubusw 16(%2), %%xmm3 \n\t"
- "pxor %%xmm0, %%xmm2 \n\t"
- "pxor %%xmm1, %%xmm3 \n\t"
- "psubw %%xmm0, %%xmm2 \n\t"
- "psubw %%xmm1, %%xmm3 \n\t"
- "movdqa %%xmm2, (%0) \n\t"
- "movdqa %%xmm3, 16(%0) \n\t"
- "movdqa %%xmm4, %%xmm6 \n\t"
- "movdqa %%xmm5, %%xmm0 \n\t"
- "punpcklwd %%xmm7, %%xmm4 \n\t"
- "punpckhwd %%xmm7, %%xmm6 \n\t"
- "punpcklwd %%xmm7, %%xmm5 \n\t"
- "punpckhwd %%xmm7, %%xmm0 \n\t"
- "paddd (%1), %%xmm4 \n\t"
- "paddd 16(%1), %%xmm6 \n\t"
- "paddd 32(%1), %%xmm5 \n\t"
- "paddd 48(%1), %%xmm0 \n\t"
- "movdqa %%xmm4, (%1) \n\t"
- "movdqa %%xmm6, 16(%1) \n\t"
- "movdqa %%xmm5, 32(%1) \n\t"
- "movdqa %%xmm0, 48(%1) \n\t"
- "add $32, %0 \n\t"
- "add $64, %1 \n\t"
- "add $32, %2 \n\t"
- "cmp %3, %0 \n\t"
- " jb 1b \n\t"
- : "+r" (block), "+r" (sum), "+r" (offset)
- : "r"(block+64)
- XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7")
- );
-}
-#endif /* HAVE_INLINE_ASM */
-
-av_cold void ff_dct_encode_init_x86(MpegEncContext *s)
-{
- const int dct_algo = s->avctx->dct_algo;
- int i;
-
- for (i = 0; i < 64; i++)
- inv_zigzag_direct16[ff_zigzag_direct[i]] = i + 1;
-
- if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) {
-#if HAVE_MMX_INLINE
- int cpu_flags = av_get_cpu_flags();
- if (INLINE_MMX(cpu_flags)) {
- s->dct_quantize = dct_quantize_MMX;
- s->denoise_dct = denoise_dct_mmx;
- }
-#endif
-#if HAVE_MMXEXT_INLINE
- if (INLINE_MMXEXT(cpu_flags))
- s->dct_quantize = dct_quantize_MMXEXT;
-#endif
-#if HAVE_SSE2_INLINE
- if (INLINE_SSE2(cpu_flags)) {
- s->dct_quantize = dct_quantize_SSE2;
- s->denoise_dct = denoise_dct_sse2;
- }
-#endif
-#if HAVE_SSSE3_INLINE
- if (INLINE_SSSE3(cpu_flags))
- s->dct_quantize = dct_quantize_SSSE3;
-#endif
- }
-}
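In scalar terms, denoise_dct subtracts a per-coefficient offset from each coefficient's magnitude (clamping at zero, like psubusw) while accumulating the original magnitudes into an error sum the encoder later turns into new offsets. A sketch operating on raw arrays rather than MpegEncContext fields:

#include <stdint.h>

static void denoise_dct_scalar(int16_t *block, int *error_sum,
                               const uint16_t *offset)
{
    for (int i = 0; i < 64; i++) {
        int level = block[i];
        if (level) {
            int a = level > 0 ? level : -level;  /* |level|                   */
            error_sum[i] += a;                   /* feeds future offsets      */
            a -= offset[i];                      /* psubusw: saturate at zero */
            if (a < 0)
                a = 0;
            block[i] = level > 0 ? a : -a;       /* restore sign              */
        }
    }
}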
diff --git a/ffmpeg/libavcodec/x86/mpegvideoenc_template.c b/ffmpeg/libavcodec/x86/mpegvideoenc_template.c
deleted file mode 100644
index 0defc40..0000000
--- a/ffmpeg/libavcodec/x86/mpegvideoenc_template.c
+++ /dev/null
@@ -1,364 +0,0 @@
-/*
- * MPEG video MMX templates
- *
- * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#undef MMREG_WIDTH
-#undef MM
-#undef MOVQ
-#undef SPREADW
-#undef PMAXW
-#undef PMAX
-#undef SAVE_SIGN
-#undef RESTORE_SIGN
-
-#if COMPILE_TEMPLATE_SSE2
-#define MMREG_WIDTH "16"
-#define MM "%%xmm"
-#define MOVQ "movdqa"
-#define SPREADW(a) \
- "pshuflw $0, "a", "a" \n\t"\
- "punpcklwd "a", "a" \n\t"
-#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
-#define PMAX(a,b) \
- "movhlps "a", "b" \n\t"\
- PMAXW(b, a)\
- "pshuflw $0x0E, "a", "b" \n\t"\
- PMAXW(b, a)\
- "pshuflw $0x01, "a", "b" \n\t"\
- PMAXW(b, a)
-#else
-#define MMREG_WIDTH "8"
-#define MM "%%mm"
-#define MOVQ "movq"
-#if COMPILE_TEMPLATE_MMXEXT
-#define SPREADW(a) "pshufw $0, "a", "a" \n\t"
-#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
-#define PMAX(a,b) \
- "pshufw $0x0E, "a", "b" \n\t"\
- PMAXW(b, a)\
- "pshufw $0x01, "a", "b" \n\t"\
- PMAXW(b, a)
-#else
-#define SPREADW(a) \
- "punpcklwd "a", "a" \n\t"\
- "punpcklwd "a", "a" \n\t"
-#define PMAXW(a,b) \
- "psubusw "a", "b" \n\t"\
- "paddw "a", "b" \n\t"
-#define PMAX(a,b) \
- "movq "a", "b" \n\t"\
- "psrlq $32, "a" \n\t"\
- PMAXW(b, a)\
- "movq "a", "b" \n\t"\
- "psrlq $16, "a" \n\t"\
- PMAXW(b, a)
-
-#endif
-#endif
-
-#if COMPILE_TEMPLATE_SSSE3
-#define SAVE_SIGN(a,b) \
- "movdqa "b", "a" \n\t"\
- "pabsw "b", "b" \n\t"
-#define RESTORE_SIGN(a,b) \
- "psignw "a", "b" \n\t"
-#else
-#define SAVE_SIGN(a,b) \
- "pxor "a", "a" \n\t"\
- "pcmpgtw "b", "a" \n\t" /* block[i] <= 0 ? 0xFF : 0x00 */\
- "pxor "a", "b" \n\t"\
- "psubw "a", "b" \n\t" /* ABS(block[i]) */
-#define RESTORE_SIGN(a,b) \
- "pxor "a", "b" \n\t"\
- "psubw "a", "b" \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
-#endif
-
-static int RENAME(dct_quantize)(MpegEncContext *s,
- int16_t *block, int n,
- int qscale, int *overflow)
-{
- x86_reg last_non_zero_p1;
- int level=0, q; //=0 is because gcc says uninitialized ...
- const uint16_t *qmat, *bias;
- LOCAL_ALIGNED_16(int16_t, temp_block, [64]);
-
- av_assert2((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
-
- //s->fdct (block);
- RENAMEl(ff_fdct) (block); //cannot be anything else ...
-
- if(s->dct_error_sum)
- s->denoise_dct(s, block);
-
- if (s->mb_intra) {
- int dummy;
- if (n < 4){
- q = s->y_dc_scale;
- bias = s->q_intra_matrix16[qscale][1];
- qmat = s->q_intra_matrix16[qscale][0];
- }else{
- q = s->c_dc_scale;
- bias = s->q_chroma_intra_matrix16[qscale][1];
- qmat = s->q_chroma_intra_matrix16[qscale][0];
- }
- /* note: block[0] is assumed to be positive */
- if (!s->h263_aic) {
- __asm__ volatile (
- "mul %%ecx \n\t"
- : "=d" (level), "=a"(dummy)
- : "a" ((block[0]>>2) + q), "c" (ff_inverse[q<<1])
- );
- } else
- /* For AIC we skip quant/dequant of INTRADC */
- level = (block[0] + 4)>>3;
-
- block[0]=0; //avoid fake overflow
-// temp_block[0] = (block[0] + (q >> 1)) / q;
- last_non_zero_p1 = 1;
- } else {
- last_non_zero_p1 = 0;
- bias = s->q_inter_matrix16[qscale][1];
- qmat = s->q_inter_matrix16[qscale][0];
- }
-
- if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){
-
- __asm__ volatile(
- "movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1
- SPREADW(MM"3")
- "pxor "MM"7, "MM"7 \n\t" // 0
- "pxor "MM"4, "MM"4 \n\t" // 0
- MOVQ" (%2), "MM"5 \n\t" // qmat[0]
- "pxor "MM"6, "MM"6 \n\t"
- "psubw (%3), "MM"6 \n\t" // -bias[0]
- "mov $-128, %%"REG_a" \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i]
- SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
- "psubusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0]
- "pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
- "por "MM"0, "MM"4 \n\t"
- RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
- MOVQ" "MM"0, (%5, %%"REG_a") \n\t"
- "pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
- MOVQ" (%4, %%"REG_a"), "MM"1 \n\t"
- MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0
- "pandn "MM"1, "MM"0 \n\t"
- PMAXW(MM"0", MM"3")
- "add $"MMREG_WIDTH", %%"REG_a" \n\t"
- " js 1b \n\t"
- PMAX(MM"3", MM"0")
- "movd "MM"3, %%"REG_a" \n\t"
- "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
- : "+a" (last_non_zero_p1)
- : "r" (block+64), "r" (qmat), "r" (bias),
- "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
- XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7")
- );
-    }else{ // MPEG-1/2/4 style quant: per-coefficient qmat[] and bias[]
- __asm__ volatile(
- "movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1
- SPREADW(MM"3")
- "pxor "MM"7, "MM"7 \n\t" // 0
- "pxor "MM"4, "MM"4 \n\t" // 0
- "mov $-128, %%"REG_a" \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i]
- SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
-            MOVQ" (%3, %%"REG_a"), "MM"6         \n\t" // bias[i]
-            "paddusw "MM"6, "MM"0                \n\t" // ABS(block[i]) + bias[i]
-            MOVQ" (%2, %%"REG_a"), "MM"5         \n\t" // qmat[i]
-            "pmulhw "MM"5, "MM"0                 \n\t" // (ABS(block[i])*qmat[i] + bias[i]*qmat[i])>>16
-            "por "MM"0, "MM"4                    \n\t"
-            RESTORE_SIGN(MM"1", MM"0")                 // out=((ABS(block[i])*qmat[i] + bias[i]*qmat[i])>>16)*sign(block[i])
- MOVQ" "MM"0, (%5, %%"REG_a") \n\t"
- "pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
- MOVQ" (%4, %%"REG_a"), "MM"1 \n\t"
- MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0
- "pandn "MM"1, "MM"0 \n\t"
- PMAXW(MM"0", MM"3")
- "add $"MMREG_WIDTH", %%"REG_a" \n\t"
- " js 1b \n\t"
- PMAX(MM"3", MM"0")
- "movd "MM"3, %%"REG_a" \n\t"
- "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
- : "+a" (last_non_zero_p1)
- : "r" (block+64), "r" (qmat+64), "r" (bias+64),
- "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
- XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7")
- );
- }
- __asm__ volatile(
- "movd %1, "MM"1 \n\t" // max_qcoeff
- SPREADW(MM"1")
- "psubusw "MM"1, "MM"4 \n\t"
- "packuswb "MM"4, "MM"4 \n\t"
-#if COMPILE_TEMPLATE_SSE2
- "packuswb "MM"4, "MM"4 \n\t"
-#endif
- "movd "MM"4, %0 \n\t" // *overflow
- : "=g" (*overflow)
- : "g" (s->max_qcoeff)
- );
-
- if(s->mb_intra) block[0]= level;
- else block[0]= temp_block[0];
-
- if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){
- if(last_non_zero_p1 <= 1) goto end;
- block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08];
- block[0x20] = temp_block[0x10];
- if(last_non_zero_p1 <= 4) goto end;
- block[0x18] = temp_block[0x09]; block[0x04] = temp_block[0x02];
- block[0x09] = temp_block[0x03];
- if(last_non_zero_p1 <= 7) goto end;
- block[0x14] = temp_block[0x0A]; block[0x28] = temp_block[0x11];
- block[0x12] = temp_block[0x18]; block[0x02] = temp_block[0x20];
- if(last_non_zero_p1 <= 11) goto end;
- block[0x1A] = temp_block[0x19]; block[0x24] = temp_block[0x12];
- block[0x19] = temp_block[0x0B]; block[0x01] = temp_block[0x04];
- block[0x0C] = temp_block[0x05];
- if(last_non_zero_p1 <= 16) goto end;
- block[0x11] = temp_block[0x0C]; block[0x29] = temp_block[0x13];
- block[0x16] = temp_block[0x1A]; block[0x0A] = temp_block[0x21];
- block[0x30] = temp_block[0x28]; block[0x22] = temp_block[0x30];
- block[0x38] = temp_block[0x29]; block[0x06] = temp_block[0x22];
- if(last_non_zero_p1 <= 24) goto end;
- block[0x1B] = temp_block[0x1B]; block[0x21] = temp_block[0x14];
- block[0x1C] = temp_block[0x0D]; block[0x05] = temp_block[0x06];
- block[0x0D] = temp_block[0x07]; block[0x15] = temp_block[0x0E];
- block[0x2C] = temp_block[0x15]; block[0x13] = temp_block[0x1C];
- if(last_non_zero_p1 <= 32) goto end;
- block[0x0B] = temp_block[0x23]; block[0x34] = temp_block[0x2A];
- block[0x2A] = temp_block[0x31]; block[0x32] = temp_block[0x38];
- block[0x3A] = temp_block[0x39]; block[0x26] = temp_block[0x32];
- block[0x39] = temp_block[0x2B]; block[0x03] = temp_block[0x24];
- if(last_non_zero_p1 <= 40) goto end;
- block[0x1E] = temp_block[0x1D]; block[0x25] = temp_block[0x16];
- block[0x1D] = temp_block[0x0F]; block[0x2D] = temp_block[0x17];
- block[0x17] = temp_block[0x1E]; block[0x0E] = temp_block[0x25];
- block[0x31] = temp_block[0x2C]; block[0x2B] = temp_block[0x33];
- if(last_non_zero_p1 <= 48) goto end;
- block[0x36] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
- block[0x23] = temp_block[0x34]; block[0x3C] = temp_block[0x2D];
- block[0x07] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
- block[0x0F] = temp_block[0x27]; block[0x35] = temp_block[0x2E];
- if(last_non_zero_p1 <= 56) goto end;
- block[0x2E] = temp_block[0x35]; block[0x33] = temp_block[0x3C];
- block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36];
- block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37];
- block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
- }else if(s->dsp.idct_permutation_type == FF_LIBMPEG2_IDCT_PERM){
- if(last_non_zero_p1 <= 1) goto end;
- block[0x04] = temp_block[0x01];
- block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
- if(last_non_zero_p1 <= 4) goto end;
- block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02];
- block[0x05] = temp_block[0x03];
- if(last_non_zero_p1 <= 7) goto end;
- block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11];
- block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
- if(last_non_zero_p1 <= 11) goto end;
- block[0x1C] = temp_block[0x19];
- block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B];
- block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05];
- if(last_non_zero_p1 <= 16) goto end;
- block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13];
- block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21];
- block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
- block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22];
- if(last_non_zero_p1 <= 24) goto end;
- block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14];
- block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06];
- block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E];
- block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C];
- if(last_non_zero_p1 <= 32) goto end;
- block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A];
- block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38];
- block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32];
- block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24];
- if(last_non_zero_p1 <= 40) goto end;
- block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16];
- block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
- block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25];
- block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33];
- if(last_non_zero_p1 <= 48) goto end;
- block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B];
- block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D];
- block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
- block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E];
- if(last_non_zero_p1 <= 56) goto end;
- block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C];
- block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36];
- block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
- block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
- }else{
- if(last_non_zero_p1 <= 1) goto end;
- block[0x01] = temp_block[0x01];
- block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
- if(last_non_zero_p1 <= 4) goto end;
- block[0x09] = temp_block[0x09]; block[0x02] = temp_block[0x02];
- block[0x03] = temp_block[0x03];
- if(last_non_zero_p1 <= 7) goto end;
- block[0x0A] = temp_block[0x0A]; block[0x11] = temp_block[0x11];
- block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
- if(last_non_zero_p1 <= 11) goto end;
- block[0x19] = temp_block[0x19];
- block[0x12] = temp_block[0x12]; block[0x0B] = temp_block[0x0B];
- block[0x04] = temp_block[0x04]; block[0x05] = temp_block[0x05];
- if(last_non_zero_p1 <= 16) goto end;
- block[0x0C] = temp_block[0x0C]; block[0x13] = temp_block[0x13];
- block[0x1A] = temp_block[0x1A]; block[0x21] = temp_block[0x21];
- block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
- block[0x29] = temp_block[0x29]; block[0x22] = temp_block[0x22];
- if(last_non_zero_p1 <= 24) goto end;
- block[0x1B] = temp_block[0x1B]; block[0x14] = temp_block[0x14];
- block[0x0D] = temp_block[0x0D]; block[0x06] = temp_block[0x06];
- block[0x07] = temp_block[0x07]; block[0x0E] = temp_block[0x0E];
- block[0x15] = temp_block[0x15]; block[0x1C] = temp_block[0x1C];
- if(last_non_zero_p1 <= 32) goto end;
- block[0x23] = temp_block[0x23]; block[0x2A] = temp_block[0x2A];
- block[0x31] = temp_block[0x31]; block[0x38] = temp_block[0x38];
- block[0x39] = temp_block[0x39]; block[0x32] = temp_block[0x32];
- block[0x2B] = temp_block[0x2B]; block[0x24] = temp_block[0x24];
- if(last_non_zero_p1 <= 40) goto end;
- block[0x1D] = temp_block[0x1D]; block[0x16] = temp_block[0x16];
- block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
- block[0x1E] = temp_block[0x1E]; block[0x25] = temp_block[0x25];
- block[0x2C] = temp_block[0x2C]; block[0x33] = temp_block[0x33];
- if(last_non_zero_p1 <= 48) goto end;
- block[0x3A] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
- block[0x34] = temp_block[0x34]; block[0x2D] = temp_block[0x2D];
- block[0x26] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
- block[0x27] = temp_block[0x27]; block[0x2E] = temp_block[0x2E];
- if(last_non_zero_p1 <= 56) goto end;
- block[0x35] = temp_block[0x35]; block[0x3C] = temp_block[0x3C];
- block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
- block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
- block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
- }
- end:
- return last_non_zero_p1 - 1;
-}
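Each of the three unrolled branches above performs the same scatter, just with a different coefficient permutation baked in. A minimal scalar sketch of that operation (the helper name and the table arguments are illustrative, not FFmpeg API):

    #include <stdint.h>

    /* Copy the quantized coefficients, which sit at their natural positions in
     * temp_block, into block[] at the positions the selected IDCT expects,
     * stopping after the last nonzero coefficient in scan order. zigzag[] is
     * the scan table and idct_perm[] the IDCT's coefficient permutation
     * (identity in the default branch); block[0] has already been written. */
    static void scatter_quantized(int16_t *block, const int16_t *temp_block,
                                  const uint8_t *zigzag, const uint8_t *idct_perm,
                                  int last_non_zero_p1)
    {
        int i;
        for (i = 1; i < last_non_zero_p1; i++) {
            int pos = zigzag[i];            /* natural position of scan index i */
            block[idct_perm[pos]] = temp_block[pos];
        }
    }

The unrolled versions copy in small groups guarded by the last_non_zero_p1 thresholds, avoiding both the loop and the table lookups, which is why each permutation gets its own copy.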
diff --git a/ffmpeg/libavcodec/x86/pngdsp.asm b/ffmpeg/libavcodec/x86/pngdsp.asm
deleted file mode 100644
index 8e23ccf..0000000
--- a/ffmpeg/libavcodec/x86/pngdsp.asm
+++ /dev/null
@@ -1,173 +0,0 @@
-;******************************************************************************
-;* x86 optimizations for PNG decoding
-;*
-;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
-;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-cextern pw_255
-
-SECTION_TEXT
-
-; %1 = nr. of xmm registers used
-%macro ADD_BYTES_FN 1
-cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
-%if ARCH_X86_64
- movsxd waq, wad
-%endif
- xor iq, iq
-
- ; vector loop
- mov wq, waq
- and waq, ~(mmsize*2-1)
- jmp .end_v
-.loop_v:
- mova m0, [src1q+iq]
- mova m1, [src1q+iq+mmsize]
- paddb m0, [src2q+iq]
- paddb m1, [src2q+iq+mmsize]
- mova [dstq+iq ], m0
- mova [dstq+iq+mmsize], m1
- add iq, mmsize*2
-.end_v:
- cmp iq, waq
- jl .loop_v
-
-%if mmsize == 16
-    ; qword loop for the leftover not covered by the xmm loop
- mov waq, wq
- and waq, ~7
- jmp .end_l
-.loop_l:
- movq mm0, [src1q+iq]
- paddb mm0, [src2q+iq]
- movq [dstq+iq ], mm0
- add iq, 8
-.end_l:
- cmp iq, waq
- jl .loop_l
-%endif
-
- ; scalar loop for leftover
- jmp .end_s
-.loop_s:
- mov wab, [src1q+iq]
- add wab, [src2q+iq]
- mov [dstq+iq], wab
- inc iq
-.end_s:
- cmp iq, wq
- jl .loop_s
- REP_RET
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX mmx
-ADD_BYTES_FN 0
-%endif
-
-INIT_XMM sse2
-ADD_BYTES_FN 2
-
-%macro ADD_PAETH_PRED_FN 1
-cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
-%if ARCH_X86_64
- movsxd bppq, bppd
- movsxd wq, wd
-%endif
- lea endq, [dstq+wq-(mmsize/2-1)]
- sub topq, dstq
- sub srcq, dstq
- sub dstq, bppq
- pxor m7, m7
-
- PUSH dstq
- lea cntrq, [bppq-1]
- shr cntrq, 2 + mmsize/16
-.bpp_loop:
- lea dstq, [dstq+cntrq*(mmsize/2)]
- movh m0, [dstq]
- movh m1, [topq+dstq]
- punpcklbw m0, m7
- punpcklbw m1, m7
- add dstq, bppq
-.loop:
- mova m2, m1
- movh m1, [topq+dstq]
- mova m3, m2
- punpcklbw m1, m7
- mova m4, m2
- psubw m3, m1
- psubw m4, m0
- mova m5, m3
- paddw m5, m4
-%if cpuflag(ssse3)
- pabsw m3, m3
- pabsw m4, m4
- pabsw m5, m5
-%else ; !cpuflag(ssse3)
- psubw m7, m5
- pmaxsw m5, m7
- pxor m6, m6
- pxor m7, m7
- psubw m6, m3
- psubw m7, m4
- pmaxsw m3, m6
- pmaxsw m4, m7
- pxor m7, m7
-%endif ; cpuflag(ssse3)
- mova m6, m4
- pminsw m6, m5
- pcmpgtw m3, m6
- pcmpgtw m4, m5
- mova m6, m4
- pand m4, m3
- pandn m6, m3
- pandn m3, m0
- movh m0, [srcq+dstq]
- pand m6, m1
- pand m2, m4
- punpcklbw m0, m7
- paddw m0, m6
- paddw m3, m2
- paddw m0, m3
- pand m0, [pw_255]
- mova m3, m0
- packuswb m3, m3
- movh [dstq], m3
- add dstq, bppq
- cmp dstq, endq
- jle .loop
-
- mov dstq, [rsp]
- dec cntrq
- jge .bpp_loop
- POP dstq
- RET
-%endmacro
-
-INIT_MMX mmxext
-ADD_PAETH_PRED_FN 0
-
-INIT_MMX ssse3
-ADD_PAETH_PRED_FN 0
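ADD_PAETH_PRED_FN vectorizes the standard Paeth predictor from the PNG specification; the scalar form, for reference (a sketch, not the FFmpeg C fallback verbatim):

    #include <stdlib.h>

    /* Pick whichever of left (a), above (b), upper-left (c) is closest to
     * p = a + b - c; ties prefer a, then b. */
    static int paeth_predict(int a, int b, int c)
    {
        int p  = a + b - c;
        int pa = abs(p - a);
        int pb = abs(p - b);
        int pc = abs(p - c);
        if (pa <= pb && pa <= pc)
            return a;
        if (pb <= pc)
            return b;
        return c;
    }

Per byte, add_png_paeth_prediction then computes dst[x] = src[x] + paeth_predict(left, up, upleft) modulo 256, with the neighbours taken from the already reconstructed output row and the row above.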
diff --git a/ffmpeg/libavcodec/x86/pngdsp_init.c b/ffmpeg/libavcodec/x86/pngdsp_init.c
deleted file mode 100644
index 7dca62c..0000000
--- a/ffmpeg/libavcodec/x86/pngdsp_init.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * x86 PNG optimizations.
- * Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/common.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/pngdsp.h"
-
-void ff_add_png_paeth_prediction_mmxext(uint8_t *dst, uint8_t *src,
- uint8_t *top, int w, int bpp);
-void ff_add_png_paeth_prediction_ssse3(uint8_t *dst, uint8_t *src,
- uint8_t *top, int w, int bpp);
-void ff_add_bytes_l2_mmx (uint8_t *dst, uint8_t *src1,
- uint8_t *src2, int w);
-void ff_add_bytes_l2_sse2(uint8_t *dst, uint8_t *src1,
- uint8_t *src2, int w);
-
-av_cold void ff_pngdsp_init_x86(PNGDSPContext *dsp)
-{
- int cpu_flags = av_get_cpu_flags();
-
-#if ARCH_X86_32
- if (EXTERNAL_MMX(cpu_flags))
- dsp->add_bytes_l2 = ff_add_bytes_l2_mmx;
-#endif
- if (EXTERNAL_MMXEXT(cpu_flags))
- dsp->add_paeth_prediction = ff_add_png_paeth_prediction_mmxext;
- if (EXTERNAL_SSE2(cpu_flags))
- dsp->add_bytes_l2 = ff_add_bytes_l2_sse2;
- if (EXTERNAL_SSSE3(cpu_flags))
- dsp->add_paeth_prediction = ff_add_png_paeth_prediction_ssse3;
-}
diff --git a/ffmpeg/libavcodec/x86/proresdsp.asm b/ffmpeg/libavcodec/x86/proresdsp.asm
deleted file mode 100644
index aedacc2..0000000
--- a/ffmpeg/libavcodec/x86/proresdsp.asm
+++ /dev/null
@@ -1,326 +0,0 @@
-;******************************************************************************
-;* x86-SIMD-optimized IDCT for prores
-;* this is identical to "simple" IDCT written by Michael Niedermayer
-;* except for the clip range
-;*
-;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1
-%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1
-%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2
-%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1
-%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1
-%define W6sh2 8867 ; W6 = 35468 = 8867<<2
-%define W7sh2 4520 ; W7 = 18081 = 4520<<2 + 1
-
-%if ARCH_X86_64
-
-SECTION_RODATA
-
-w4_plus_w2: times 4 dw W4sh2, +W2sh2
-w4_min_w2: times 4 dw W4sh2, -W2sh2
-w4_plus_w6: times 4 dw W4sh2, +W6sh2
-w4_min_w6: times 4 dw W4sh2, -W6sh2
-w1_plus_w3: times 4 dw W1sh2, +W3sh2
-w3_min_w1: times 4 dw W3sh2, -W1sh2
-w7_plus_w3: times 4 dw W7sh2, +W3sh2
-w3_min_w7: times 4 dw W3sh2, -W7sh2
-w1_plus_w5: times 4 dw W1sh2, +W5sh2
-w5_min_w1: times 4 dw W5sh2, -W1sh2
-w5_plus_w7: times 4 dw W5sh2, +W7sh2
-w7_min_w5: times 4 dw W7sh2, -W5sh2
-pw_88: times 8 dw 0x2008
-
-cextern pw_1
-cextern pw_4
-cextern pw_512
-cextern pw_1019
-
-section .text align=16
-
-; interleave data while maintaining source
-; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
-%macro SBUTTERFLY3 5
- punpckl%1 m%2, m%4, m%5
- punpckh%1 m%3, m%4, m%5
-%endmacro
-
-; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
-; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
-; %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
-%macro SUMSUB_SHPK 7
- psubd %3, %1, %5 ; { a0 - b0 }[0-3]
- psubd %4, %2, %6 ; { a0 - b0 }[4-7]
- paddd %1, %5 ; { a0 + b0 }[0-3]
- paddd %2, %6 ; { a0 + b0 }[4-7]
- psrad %1, %7
- psrad %2, %7
- psrad %3, %7
- psrad %4, %7
- packssdw %1, %2 ; row[0]
- packssdw %3, %4 ; row[7]
-%endmacro
-
-; %1 = row or col (for rounding variable)
-; %2 = number of bits to shift at the end
-%macro IDCT_1D 2
- ; a0 = (W4 * row[0]) + (1 << (15 - 1));
- ; a1 = a0;
- ; a2 = a0;
- ; a3 = a0;
- ; a0 += W2 * row[2];
- ; a1 += W6 * row[2];
- ; a2 -= W6 * row[2];
- ; a3 -= W2 * row[2];
-%ifidn %1, col
- paddw m10,[pw_88]
-%endif
-%ifidn %1, row
- paddw m10,[pw_1]
-%endif
- SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7]
- pmaddwd m2, m0, [w4_plus_w6]
- pmaddwd m3, m1, [w4_plus_w6]
- pmaddwd m4, m0, [w4_min_w6]
- pmaddwd m5, m1, [w4_min_w6]
- pmaddwd m6, m0, [w4_min_w2]
- pmaddwd m7, m1, [w4_min_w2]
- pmaddwd m0, [w4_plus_w2]
- pmaddwd m1, [w4_plus_w2]
-
- ; a0: -1*row[0]-1*row[2]
- ; a1: -1*row[0]
- ; a2: -1*row[0]
- ; a3: -1*row[0]+1*row[2]
-
- ; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4]
- ; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
- ; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
- ; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4]
- SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
- pmaddwd m10, m8, [w4_plus_w6]
- pmaddwd m11, m9, [w4_plus_w6]
- paddd m0, m10 ; a0[0-3]
- paddd m1, m11 ; a0[4-7]
- pmaddwd m10, m8, [w4_min_w6]
- pmaddwd m11, m9, [w4_min_w6]
- paddd m6, m10 ; a3[0-3]
- paddd m7, m11 ; a3[4-7]
- pmaddwd m10, m8, [w4_min_w2]
- pmaddwd m11, m9, [w4_min_w2]
- pmaddwd m8, [w4_plus_w2]
- pmaddwd m9, [w4_plus_w2]
- psubd m4, m10 ; a2[0-3] intermediate
- psubd m5, m11 ; a2[4-7] intermediate
- psubd m2, m8 ; a1[0-3] intermediate
- psubd m3, m9 ; a1[4-7] intermediate
-
- ; load/store
- mova [r2+ 0], m0
- mova [r2+ 32], m2
- mova [r2+ 64], m4
- mova [r2+ 96], m6
- mova m10,[r2+ 16] ; { row[1] }[0-7]
- mova m8, [r2+ 48] ; { row[3] }[0-7]
- mova m13,[r2+ 80] ; { row[5] }[0-7]
- mova m14,[r2+112] ; { row[7] }[0-7]
- mova [r2+ 16], m1
- mova [r2+ 48], m3
- mova [r2+ 80], m5
- mova [r2+112], m7
-%ifidn %1, row
- pmullw m10,[r3+ 16]
- pmullw m8, [r3+ 48]
- pmullw m13,[r3+ 80]
- pmullw m14,[r3+112]
-%endif
-
- ; b0 = MUL(W1, row[1]);
- ; MAC(b0, W3, row[3]);
- ; b1 = MUL(W3, row[1]);
- ; MAC(b1, -W7, row[3]);
- ; b2 = MUL(W5, row[1]);
- ; MAC(b2, -W1, row[3]);
- ; b3 = MUL(W7, row[1]);
- ; MAC(b3, -W5, row[3]);
- SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7]
- pmaddwd m2, m0, [w3_min_w7]
- pmaddwd m3, m1, [w3_min_w7]
- pmaddwd m4, m0, [w5_min_w1]
- pmaddwd m5, m1, [w5_min_w1]
- pmaddwd m6, m0, [w7_min_w5]
- pmaddwd m7, m1, [w7_min_w5]
- pmaddwd m0, [w1_plus_w3]
- pmaddwd m1, [w1_plus_w3]
-
- ; b0: +1*row[1]+2*row[3]
- ; b1: +2*row[1]-1*row[3]
- ; b2: -1*row[1]-1*row[3]
- ; b3: +1*row[1]+1*row[3]
-
- ; MAC(b0, W5, row[5]);
- ; MAC(b0, W7, row[7]);
- ; MAC(b1, -W1, row[5]);
- ; MAC(b1, -W5, row[7]);
- ; MAC(b2, W7, row[5]);
- ; MAC(b2, W3, row[7]);
- ; MAC(b3, W3, row[5]);
- ; MAC(b3, -W1, row[7]);
- SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]
-
- ; b0: -1*row[5]+1*row[7]
- ; b1: -1*row[5]+1*row[7]
- ; b2: +1*row[5]+2*row[7]
- ; b3: +2*row[5]-1*row[7]
-
- pmaddwd m10, m8, [w1_plus_w5]
- pmaddwd m11, m9, [w1_plus_w5]
- pmaddwd m12, m8, [w5_plus_w7]
- pmaddwd m13, m9, [w5_plus_w7]
- psubd m2, m10 ; b1[0-3]
- psubd m3, m11 ; b1[4-7]
- paddd m0, m12 ; b0[0-3]
- paddd m1, m13 ; b0[4-7]
- pmaddwd m12, m8, [w7_plus_w3]
- pmaddwd m13, m9, [w7_plus_w3]
- pmaddwd m8, [w3_min_w1]
- pmaddwd m9, [w3_min_w1]
- paddd m4, m12 ; b2[0-3]
- paddd m5, m13 ; b2[4-7]
- paddd m6, m8 ; b3[0-3]
- paddd m7, m9 ; b3[4-7]
-
- ; row[0] = (a0 + b0) >> 15;
- ; row[7] = (a0 - b0) >> 15;
- ; row[1] = (a1 + b1) >> 15;
- ; row[6] = (a1 - b1) >> 15;
- ; row[2] = (a2 + b2) >> 15;
- ; row[5] = (a2 - b2) >> 15;
- ; row[3] = (a3 + b3) >> 15;
- ; row[4] = (a3 - b3) >> 15;
- mova m8, [r2+ 0] ; a0[0-3]
- mova m9, [r2+16] ; a0[4-7]
- SUMSUB_SHPK m8, m9, m10, m11, m0, m1, %2
- mova m0, [r2+32] ; a1[0-3]
- mova m1, [r2+48] ; a1[4-7]
- SUMSUB_SHPK m0, m1, m9, m11, m2, m3, %2
- mova m1, [r2+64] ; a2[0-3]
- mova m2, [r2+80] ; a2[4-7]
- SUMSUB_SHPK m1, m2, m11, m3, m4, m5, %2
- mova m2, [r2+96] ; a3[0-3]
- mova m3, [r2+112] ; a3[4-7]
- SUMSUB_SHPK m2, m3, m4, m5, m6, m7, %2
-%endmacro
-
-; void prores_idct_put_10_<opt>(uint8_t *pixels, int stride,
-; int16_t *block, const int16_t *qmat);
-%macro idct_put_fn 1
-cglobal prores_idct_put_10, 4, 4, %1
- movsxd r1, r1d
- pxor m15, m15 ; zero
-
- ; for (i = 0; i < 8; i++)
- ; idctRowCondDC(block + i*8);
- mova m10,[r2+ 0] ; { row[0] }[0-7]
- mova m8, [r2+32] ; { row[2] }[0-7]
- mova m13,[r2+64] ; { row[4] }[0-7]
- mova m12,[r2+96] ; { row[6] }[0-7]
-
- pmullw m10,[r3+ 0]
- pmullw m8, [r3+32]
- pmullw m13,[r3+64]
- pmullw m12,[r3+96]
-
- IDCT_1D row, 15
-
- ; transpose for second part of IDCT
- TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
- mova [r2+ 16], m0
- mova [r2+ 48], m2
- mova [r2+ 80], m11
- mova [r2+112], m10
- SWAP 8, 10
- SWAP 1, 8
- SWAP 4, 13
- SWAP 9, 12
-
- ; for (i = 0; i < 8; i++)
- ; idctSparseColAdd(dest + i, line_size, block + i);
- IDCT_1D col, 18
-
- ; clip/store
- mova m3, [pw_4]
- mova m5, [pw_1019]
- pmaxsw m8, m3
- pmaxsw m0, m3
- pmaxsw m1, m3
- pmaxsw m2, m3
- pmaxsw m4, m3
- pmaxsw m11, m3
- pmaxsw m9, m3
- pmaxsw m10, m3
- pminsw m8, m5
- pminsw m0, m5
- pminsw m1, m5
- pminsw m2, m5
- pminsw m4, m5
- pminsw m11, m5
- pminsw m9, m5
- pminsw m10, m5
-
- lea r2, [r1*3]
- mova [r0 ], m8
- mova [r0+r1 ], m0
- mova [r0+r1*2], m1
- mova [r0+r2 ], m2
- lea r0, [r0+r1*4]
- mova [r0 ], m4
- mova [r0+r1 ], m11
- mova [r0+r1*2], m9
- mova [r0+r2 ], m10
- RET
-%endmacro
-
-%macro SIGNEXTEND 2-3
-%if cpuflag(sse4) ; dstlow, dsthigh
- movhlps %2, %1
- pmovsxwd %1, %1
- pmovsxwd %2, %2
-%elif cpuflag(sse2) ; dstlow, dsthigh, tmp
- pxor %3, %3
- pcmpgtw %3, %1
- mova %2, %1
- punpcklwd %1, %3
- punpckhwd %2, %3
-%endif
-%endmacro
-
-INIT_XMM sse2
-idct_put_fn 16
-INIT_XMM sse4
-idct_put_fn 16
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-idct_put_fn 16
-%endif
-
-%endif
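Collecting the pseudo-code comments scattered through IDCT_1D, one 1-D pass boils down to the following scalar computation (a sketch, not an FFmpeg symbol: W1..W7 are the full-precision constants from the comments above, shift is 15 for the row pass and 18 for the column pass, the rounding is simplified to a plain half-ulp term, and the SIMD code additionally saturates when packing back to 16 bits):

    #include <stdint.h>

    enum { W1 = 90901, W2 = 85627, W3 = 77062, W4 = 65535,
           W5 = 51491, W6 = 35468, W7 = 18081 };

    static void idct_1d(int16_t row[8], int shift)
    {
        int64_t a0, a1, a2, a3, b0, b1, b2, b3;

        a0  = (int64_t)W4 * row[0] + ((int64_t)1 << (shift - 1));
        a1  = a0; a2 = a0; a3 = a0;
        a0 += (int64_t)W2 * row[2];  a1 += (int64_t)W6 * row[2];
        a2 -= (int64_t)W6 * row[2];  a3 -= (int64_t)W2 * row[2];
        a0 += (int64_t)W4 * row[4] + (int64_t)W6 * row[6];
        a1 -= (int64_t)W4 * row[4] + (int64_t)W2 * row[6];
        a2 -= (int64_t)W4 * row[4] - (int64_t)W2 * row[6];
        a3 += (int64_t)W4 * row[4] - (int64_t)W6 * row[6];

        b0  = (int64_t)W1 * row[1] + (int64_t)W3 * row[3]
            + (int64_t)W5 * row[5] + (int64_t)W7 * row[7];
        b1  = (int64_t)W3 * row[1] - (int64_t)W7 * row[3]
            - (int64_t)W1 * row[5] - (int64_t)W5 * row[7];
        b2  = (int64_t)W5 * row[1] - (int64_t)W1 * row[3]
            + (int64_t)W7 * row[5] + (int64_t)W3 * row[7];
        b3  = (int64_t)W7 * row[1] - (int64_t)W5 * row[3]
            + (int64_t)W3 * row[5] - (int64_t)W1 * row[7];

        row[0] = (a0 + b0) >> shift;  row[7] = (a0 - b0) >> shift;
        row[1] = (a1 + b1) >> shift;  row[6] = (a1 - b1) >> shift;
        row[2] = (a2 + b2) >> shift;  row[5] = (a2 - b2) >> shift;
        row[3] = (a3 + b3) >> shift;  row[4] = (a3 - b3) >> shift;
    }

The assembly interleaves the even (a) and odd (b) halves and uses the pw_1 / pw_88 rounding constants so that eight coefficients stay in flight per register.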
diff --git a/ffmpeg/libavcodec/x86/proresdsp_init.c b/ffmpeg/libavcodec/x86/proresdsp_init.c
deleted file mode 100644
index 0273d61..0000000
--- a/ffmpeg/libavcodec/x86/proresdsp_init.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Apple ProRes compatible decoder
- *
- * Copyright (c) 2010-2011 Maxim Poliakovski
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/dsputil.h"
-#include "libavcodec/proresdsp.h"
-
-void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
- int16_t *block, const int16_t *qmat);
-void ff_prores_idct_put_10_sse4(uint16_t *dst, int linesize,
- int16_t *block, const int16_t *qmat);
-void ff_prores_idct_put_10_avx (uint16_t *dst, int linesize,
- int16_t *block, const int16_t *qmat);
-
-av_cold void ff_proresdsp_x86_init(ProresDSPContext *dsp, AVCodecContext *avctx)
-{
-#if ARCH_X86_64
- int cpu_flags = av_get_cpu_flags();
-
- if(avctx->flags & CODEC_FLAG_BITEXACT)
- return;
-
- if (EXTERNAL_SSE2(cpu_flags)) {
- dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
- dsp->idct_put = ff_prores_idct_put_10_sse2;
- }
-
- if (EXTERNAL_SSE4(cpu_flags)) {
- dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
- dsp->idct_put = ff_prores_idct_put_10_sse4;
- }
-
- if (EXTERNAL_AVX(cpu_flags)) {
- dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
- dsp->idct_put = ff_prores_idct_put_10_avx;
- }
-#endif /* ARCH_X86_64 */
-}
diff --git a/ffmpeg/libavcodec/x86/rv34dsp.asm b/ffmpeg/libavcodec/x86/rv34dsp.asm
deleted file mode 100644
index 7732d65..0000000
--- a/ffmpeg/libavcodec/x86/rv34dsp.asm
+++ /dev/null
@@ -1,196 +0,0 @@
-;******************************************************************************
-;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
-;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-pw_row_coeffs: times 4 dw 13
- times 4 dw 17
- times 4 dw 7
-pd_512: times 2 dd 0x200
-pw_col_coeffs: dw 13, 13, 13, -13
- dw 17, 7, 7, -17
- dw 13, -13, 13, 13
- dw -7, 17, -17, -7
-
-SECTION .text
-
-%macro IDCT_DC_NOROUND 1
- imul %1, 13*13*3
- sar %1, 11
-%endmacro
-
-%macro IDCT_DC_ROUND 1
- imul %1, 13*13
- add %1, 0x200
- sar %1, 10
-%endmacro
-
-%macro rv34_idct 1
-cglobal rv34_idct_%1, 1, 2, 0
- movsx r1, word [r0]
- IDCT_DC r1
- movd m0, r1d
- pshufw m0, m0, 0
- movq [r0+ 0], m0
- movq [r0+ 8], m0
- movq [r0+16], m0
- movq [r0+24], m0
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-%define IDCT_DC IDCT_DC_ROUND
-rv34_idct dc
-%define IDCT_DC IDCT_DC_NOROUND
-rv34_idct dc_noround
-
-; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
-INIT_MMX mmx
-cglobal rv34_idct_dc_add, 3, 3
- ; calculate DC
- IDCT_DC_ROUND r2
- pxor m1, m1
- movd m0, r2d
- psubw m1, m0
- packuswb m0, m0
- packuswb m1, m1
- punpcklbw m0, m0
- punpcklbw m1, m1
- punpcklwd m0, m0
- punpcklwd m1, m1
-
- ; add DC
- lea r2, [r0+r1*2]
- movh m2, [r0]
- movh m3, [r0+r1]
- movh m4, [r2]
- movh m5, [r2+r1]
- paddusb m2, m0
- paddusb m3, m0
- paddusb m4, m0
- paddusb m5, m0
- psubusb m2, m1
- psubusb m3, m1
- psubusb m4, m1
- psubusb m5, m1
- movh [r0], m2
- movh [r0+r1], m3
- movh [r2], m4
- movh [r2+r1], m5
- RET
-
-; Load coeffs and perform row transform
-; Output: coeffs in mm[0467], rounder in mm5
-%macro ROW_TRANSFORM 1
- pxor mm7, mm7
- mova mm0, [%1+ 0*8]
- mova mm1, [%1+ 1*8]
- mova mm2, [%1+ 2*8]
- mova mm3, [%1+ 3*8]
- mova [%1+ 0*8], mm7
- mova [%1+ 1*8], mm7
- mova [%1+ 2*8], mm7
- mova [%1+ 3*8], mm7
- mova mm4, mm0
- mova mm6, [pw_row_coeffs+ 0]
- paddsw mm0, mm2 ; b0 + b2
- psubsw mm4, mm2 ; b0 - b2
- pmullw mm0, mm6 ; *13 = z0
- pmullw mm4, mm6 ; *13 = z1
- mova mm5, mm1
- pmullw mm1, [pw_row_coeffs+ 8] ; b1*17
- pmullw mm5, [pw_row_coeffs+16] ; b1* 7
- mova mm7, mm3
- pmullw mm3, [pw_row_coeffs+ 8] ; b3*17
- pmullw mm7, [pw_row_coeffs+16] ; b3* 7
- paddsw mm1, mm7 ; z3 = b1*17 + b3* 7
- psubsw mm5, mm3 ; z2 = b1* 7 - b3*17
- mova mm7, mm0
- mova mm6, mm4
- paddsw mm0, mm1 ; z0 + z3
- psubsw mm7, mm1 ; z0 - z3
- paddsw mm4, mm5 ; z1 + z2
- psubsw mm6, mm5 ; z1 - z2
- mova mm5, [pd_512] ; 0x200
-%endmacro
-
-; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
-%macro COL_TRANSFORM 4
- pshufw mm3, %2, 0xDD ; col. 1,3,1,3
- pshufw %2, %2, 0x88 ; col. 0,2,0,2
- pmaddwd %2, %3 ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
- pmaddwd mm3, %4 ; 17*c1+ 7*c3 | 7*c1-17*c3 = z3 | z2
- paddd %2, mm5
- pshufw mm1, %2, 01001110b ; z1 | z0
- pshufw mm2, mm3, 01001110b ; z2 | z3
- paddd %2, mm3 ; z0+z3 | z1+z2
- psubd mm1, mm2 ; z1-z2 | z0-z3
- movd mm3, %1
- psrad %2, 10
- pxor mm2, mm2
- psrad mm1, 10
- punpcklbw mm3, mm2
- packssdw %2, mm1
- paddw %2, mm3
- packuswb %2, %2
- movd %1, %2
-%endmacro
-INIT_MMX mmxext
-cglobal rv34_idct_add, 3,3,0, d, s, b
- ROW_TRANSFORM bq
- COL_TRANSFORM [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]
- mova mm0, [pw_col_coeffs+ 0]
- COL_TRANSFORM [dq+sq], mm4, mm0, [pw_col_coeffs+ 8]
- mova mm4, [pw_col_coeffs+ 8]
- lea dq, [dq + 2*sq]
- COL_TRANSFORM [dq], mm6, mm0, mm4
- COL_TRANSFORM [dq+sq], mm7, mm0, mm4
- ret
-
-; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
-INIT_XMM sse4
-cglobal rv34_idct_dc_add, 3, 3, 6
- ; load data
- IDCT_DC_ROUND r2
- pxor m1, m1
-
- ; calculate DC
- movd m0, r2d
- lea r2, [r0+r1*2]
- movd m2, [r0]
- movd m3, [r0+r1]
- pshuflw m0, m0, 0
- movd m4, [r2]
- movd m5, [r2+r1]
- punpcklqdq m0, m0
- punpckldq m2, m3
- punpckldq m4, m5
- punpcklbw m2, m1
- punpcklbw m4, m1
- paddw m2, m0
- paddw m4, m0
- packuswb m2, m4
- movd [r0], m2
- pextrd [r0+r1], m2, 1
- pextrd [r2], m2, 2
- pextrd [r2+r1], m2, 3
- RET
diff --git a/ffmpeg/libavcodec/x86/rv34dsp_init.c b/ffmpeg/libavcodec/x86/rv34dsp_init.c
deleted file mode 100644
index 027efe9..0000000
--- a/ffmpeg/libavcodec/x86/rv34dsp_init.c
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * RV30/40 MMX/SSE2 optimizations
- * Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/rv34dsp.h"
-
-void ff_rv34_idct_dc_mmxext(int16_t *block);
-void ff_rv34_idct_dc_noround_mmxext(int16_t *block);
-void ff_rv34_idct_dc_add_mmx(uint8_t *dst, ptrdiff_t stride, int dc);
-void ff_rv34_idct_dc_add_sse4(uint8_t *dst, ptrdiff_t stride, int dc);
-void ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
-
-av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (EXTERNAL_MMX(cpu_flags))
- c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
- if (EXTERNAL_MMXEXT(cpu_flags)) {
- c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmxext;
- c->rv34_idct_add = ff_rv34_idct_add_mmxext;
- }
- if (EXTERNAL_SSE4(cpu_flags))
- c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4;
-}
diff --git a/ffmpeg/libavcodec/x86/rv40dsp.asm b/ffmpeg/libavcodec/x86/rv40dsp.asm
deleted file mode 100644
index 792a54f..0000000
--- a/ffmpeg/libavcodec/x86/rv40dsp.asm
+++ /dev/null
@@ -1,501 +0,0 @@
-;******************************************************************************
-;* MMX/SSE2-optimized functions for the RV40 decoder
-;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
-;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
-;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-align 16
-pw_1024: times 8 dw 1 << (16 - 6) ; pw_1024
-
-sixtap_filter_hb_m: times 8 db 1, -5
- times 8 db 52, 20
- ; multiplied by 2 to have the same shift
- times 8 db 2, -10
- times 8 db 40, 40
- ; back to normal
- times 8 db 1, -5
- times 8 db 20, 52
-
-sixtap_filter_v_m: times 8 dw 1
- times 8 dw -5
- times 8 dw 52
- times 8 dw 20
- ; multiplied by 2 to have the same shift
- times 8 dw 2
- times 8 dw -10
- times 8 dw 40
- times 8 dw 40
- ; back to normal
- times 8 dw 1
- times 8 dw -5
- times 8 dw 20
- times 8 dw 52
-
-%ifdef PIC
-%define sixtap_filter_hw picregq
-%define sixtap_filter_hb picregq
-%define sixtap_filter_v picregq
-%define npicregs 1
-%else
-%define sixtap_filter_hw sixtap_filter_hw_m
-%define sixtap_filter_hb sixtap_filter_hb_m
-%define sixtap_filter_v sixtap_filter_v_m
-%define npicregs 0
-%endif
-
-filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
-
-cextern pw_32
-cextern pw_16
-cextern pw_512
-
-SECTION .text
-
-;-----------------------------------------------------------------------------
-; subpel MC functions:
-;
-; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
-; uint8_t *src, int srcstride,
-; int len, int m);
-;----------------------------------------------------------------------
-%macro LOAD 2
-%if WIN64
- movsxd %1q, %1d
-%endif
-%ifdef PIC
- add %1q, picregq
-%else
- add %1q, %2
-%endif
-%endmacro
-
-%macro STORE 3
-%ifidn %3, avg
- movh %2, [dstq]
-%endif
- packuswb %1, %1
-%ifidn %3, avg
- PAVGB %1, %2
-%endif
- movh [dstq], %1
-%endmacro
-
-%macro FILTER_V 1
-cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
-%ifdef PIC
- lea picregq, [sixtap_filter_v_m]
-%endif
- pxor m7, m7
- LOAD my, sixtap_filter_v
-
- ; read 5 lines
- sub srcq, srcstrideq
- sub srcq, srcstrideq
- movh m0, [srcq]
- movh m1, [srcq+srcstrideq]
- movh m2, [srcq+srcstrideq*2]
- lea srcq, [srcq+srcstrideq*2]
- add srcq, srcstrideq
- movh m3, [srcq]
- movh m4, [srcq+srcstrideq]
- punpcklbw m0, m7
- punpcklbw m1, m7
- punpcklbw m2, m7
- punpcklbw m3, m7
- punpcklbw m4, m7
-
-%ifdef m8
- mova m8, [myq+ 0]
- mova m9, [myq+16]
- mova m10, [myq+32]
- mova m11, [myq+48]
-%define COEFF05 m8
-%define COEFF14 m9
-%define COEFF2 m10
-%define COEFF3 m11
-%else
-%define COEFF05 [myq+ 0]
-%define COEFF14 [myq+16]
-%define COEFF2 [myq+32]
-%define COEFF3 [myq+48]
-%endif
-.nextrow:
- mova m6, m1
- movh m5, [srcq+2*srcstrideq] ; read new row
- paddw m6, m4
- punpcklbw m5, m7
- pmullw m6, COEFF14
- paddw m0, m5
- pmullw m0, COEFF05
- paddw m6, m0
- mova m0, m1
- paddw m6, [pw_32]
- mova m1, m2
- pmullw m2, COEFF2
- paddw m6, m2
- mova m2, m3
- pmullw m3, COEFF3
- paddw m6, m3
-
- ; round/clip/store
- mova m3, m4
- psraw m6, 6
- mova m4, m5
- STORE m6, m5, %1
-
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- REP_RET
-%endmacro
-
-%macro FILTER_H 1
-cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
-%ifdef PIC
- lea picregq, [sixtap_filter_v_m]
-%endif
- pxor m7, m7
- LOAD mx, sixtap_filter_v
- mova m6, [pw_32]
-%ifdef m8
- mova m8, [mxq+ 0]
- mova m9, [mxq+16]
- mova m10, [mxq+32]
- mova m11, [mxq+48]
-%define COEFF05 m8
-%define COEFF14 m9
-%define COEFF2 m10
-%define COEFF3 m11
-%else
-%define COEFF05 [mxq+ 0]
-%define COEFF14 [mxq+16]
-%define COEFF2 [mxq+32]
-%define COEFF3 [mxq+48]
-%endif
-.nextrow:
- movq m0, [srcq-2]
- movq m5, [srcq+3]
- movq m1, [srcq-1]
- movq m4, [srcq+2]
- punpcklbw m0, m7
- punpcklbw m5, m7
- punpcklbw m1, m7
- punpcklbw m4, m7
- movq m2, [srcq-0]
- movq m3, [srcq+1]
- paddw m0, m5
- paddw m1, m4
- punpcklbw m2, m7
- punpcklbw m3, m7
- pmullw m0, COEFF05
- pmullw m1, COEFF14
- pmullw m2, COEFF2
- pmullw m3, COEFF3
- paddw m0, m6
- paddw m1, m2
- paddw m0, m3
- paddw m0, m1
- psraw m0, 6
- STORE m0, m1, %1
-
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- REP_RET
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX mmx
-FILTER_V put
-FILTER_H put
-
-INIT_MMX mmxext
-FILTER_V avg
-FILTER_H avg
-
-INIT_MMX 3dnow
-FILTER_V avg
-FILTER_H avg
-%endif
-
-INIT_XMM sse2
-FILTER_H put
-FILTER_H avg
-FILTER_V put
-FILTER_V avg
-
-%macro FILTER_SSSE3 1
-cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
-%ifdef PIC
- lea picregq, [sixtap_filter_hb_m]
-%endif
-
- ; read 5 lines
- sub srcq, srcstrideq
- LOAD my, sixtap_filter_hb
- sub srcq, srcstrideq
- movh m0, [srcq]
- movh m1, [srcq+srcstrideq]
- movh m2, [srcq+srcstrideq*2]
- lea srcq, [srcq+srcstrideq*2]
- add srcq, srcstrideq
- mova m5, [myq]
- movh m3, [srcq]
- movh m4, [srcq+srcstrideq]
- lea srcq, [srcq+2*srcstrideq]
-
-.nextrow:
- mova m6, m2
- punpcklbw m0, m1
- punpcklbw m6, m3
- pmaddubsw m0, m5
- pmaddubsw m6, [myq+16]
- movh m7, [srcq] ; read new row
- paddw m6, m0
- mova m0, m1
- mova m1, m2
- mova m2, m3
- mova m3, m4
- mova m4, m7
- punpcklbw m7, m3
- pmaddubsw m7, m5
- paddw m6, m7
- pmulhrsw m6, [pw_512]
- STORE m6, m7, %1
-
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- REP_RET
-
-cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
-%ifdef PIC
- lea picregq, [sixtap_filter_hb_m]
-%endif
- mova m3, [filter_h6_shuf2]
- mova m4, [filter_h6_shuf3]
- LOAD mx, sixtap_filter_hb
- mova m5, [mxq] ; set up 6tap filter in bytes
- mova m6, [mxq+16]
- mova m7, [filter_h6_shuf1]
-
-.nextrow:
- movu m0, [srcq-2]
- mova m1, m0
- mova m2, m0
- pshufb m0, m7
- pshufb m1, m3
- pshufb m2, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m6
- pmaddubsw m2, m5
- paddw m0, m1
- paddw m0, m2
- pmulhrsw m0, [pw_512]
- STORE m0, m1, %1
-
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- REP_RET
-%endmacro
-
-INIT_XMM ssse3
-FILTER_SSSE3 put
-FILTER_SSSE3 avg
-
-; %1 = 0 for 14-bit (rnd) weights, 1 for 5-bit (nornd); %2=dst %3=src1 %4=src2 %5=stride (sse2 8x8 case only)
-%macro RV40_WCORE 4-5
- movh m4, [%3 + r6 + 0]
- movh m5, [%4 + r6 + 0]
-%if %0 == 4
-%define OFFSET r6 + mmsize / 2
-%else
- ; 8x8 block and sse2, stride was provided
-%define OFFSET r6
- add r6, r5
-%endif
- movh m6, [%3 + OFFSET]
- movh m7, [%4 + OFFSET]
-
-%if %1 == 0
- ; 14bits weights
- punpcklbw m4, m0
- punpcklbw m5, m0
- punpcklbw m6, m0
- punpcklbw m7, m0
-
- psllw m4, 7
- psllw m5, 7
- psllw m6, 7
- psllw m7, 7
- pmulhw m4, m3
- pmulhw m5, m2
- pmulhw m6, m3
- pmulhw m7, m2
-
- paddw m4, m5
- paddw m6, m7
-%else
- ; 5bits weights
-%if cpuflag(ssse3)
- punpcklbw m4, m5
- punpcklbw m6, m7
-
- pmaddubsw m4, m3
- pmaddubsw m6, m3
-%else
- punpcklbw m4, m0
- punpcklbw m5, m0
- punpcklbw m6, m0
- punpcklbw m7, m0
-
- pmullw m4, m3
- pmullw m5, m2
- pmullw m6, m3
- pmullw m7, m2
- paddw m4, m5
- paddw m6, m7
-%endif
-
-%endif
-
- ; bias and shift down
-%if cpuflag(ssse3)
- pmulhrsw m4, m1
- pmulhrsw m6, m1
-%else
- paddw m4, m1
- paddw m6, m1
- psrlw m4, 5
- psrlw m6, 5
-%endif
-
- packuswb m4, m6
-%if %0 == 5
- ; Only called for 8x8 blocks and sse2
- sub r6, r5
- movh [%2 + r6], m4
- add r6, r5
- movhps [%2 + r6], m4
-%else
- mova [%2 + r6], m4
-%endif
-%endmacro
-
-
-%macro MAIN_LOOP 2
-%if mmsize == 8
- RV40_WCORE %2, r0, r1, r2
-%if %1 == 16
- RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
-%endif
-
- ; Prepare for next loop
- add r6, r5
-%else
-%ifidn %1, 8
- RV40_WCORE %2, r0, r1, r2, r5
- ; Prepare 2 next lines
- add r6, r5
-%else
- RV40_WCORE %2, r0, r1, r2
- ; Prepare single next line
- add r6, r5
-%endif
-%endif
-
-%endmacro
-
-; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
-; %1=size %2=num of xmm regs
-; The weights are FP0.14 fixed-point fractions derived from the pts.
-; For timebases without rounding error (i.e. PAL), the fractions
-; can be simplified, and several operations can be avoided.
-; Therefore, we check here whether they are multiples of 2^9 for
-; those simplifications to occur.
-%macro RV40_WEIGHT 3
-cglobal rv40_weight_func_%1_%2, 6, 7, 8
-%if cpuflag(ssse3)
- mova m1, [pw_1024]
-%else
- mova m1, [pw_16]
-%endif
- pxor m0, m0
- ; Set loop counter and increments
- mov r6, r5
- shl r6, %3
- add r0, r6
- add r1, r6
- add r2, r6
- neg r6
-
- movd m2, r3d
- movd m3, r4d
-%ifidn %1,rnd
-%define RND 0
- SPLATW m2, m2
-%else
-%define RND 1
-%if cpuflag(ssse3)
- punpcklbw m3, m2
-%else
- SPLATW m2, m2
-%endif
-%endif
- SPLATW m3, m3
-
-.loop:
- MAIN_LOOP %2, RND
- jnz .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-RV40_WEIGHT rnd, 8, 3
-RV40_WEIGHT rnd, 16, 4
-RV40_WEIGHT nornd, 8, 3
-RV40_WEIGHT nornd, 16, 4
-
-INIT_XMM sse2
-RV40_WEIGHT rnd, 8, 3
-RV40_WEIGHT rnd, 16, 4
-RV40_WEIGHT nornd, 8, 3
-RV40_WEIGHT nornd, 16, 4
-
-INIT_XMM ssse3
-RV40_WEIGHT rnd, 8, 3
-RV40_WEIGHT rnd, 16, 4
-RV40_WEIGHT nornd, 8, 3
-RV40_WEIGHT nornd, 16, 4
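The RV40_WEIGHT variants all compute a weighted average of two prediction blocks; a scalar sketch of the idea (illustrative only: w1/w2 are the FP0.14 weights mentioned above, and the exact weight-to-source pairing, shift splitting and rounding follow the asm, which this sketch simplifies):

    #include <stddef.h>
    #include <stdint.h>

    static void rv40_weight_sketch(uint8_t *dst, const uint8_t *src1,
                                   const uint8_t *src2, int w1, int w2,
                                   int size, ptrdiff_t stride)
    {
        int x, y;

        for (y = 0; y < size; y++) {
            for (x = 0; x < size; x++)
                dst[x] = (src1[x] * w1 + src2[x] * w2 + (1 << 13)) >> 14;
            dst  += stride;
            src1 += stride;
            src2 += stride;
        }
    }

The 5-bit ("nornd") path corresponds to weights that are multiples of 2^9: pre-shifted they fit in a byte, which is what lets the ssse3 path use pmaddubsw.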
diff --git a/ffmpeg/libavcodec/x86/rv40dsp_init.c b/ffmpeg/libavcodec/x86/rv40dsp_init.c
deleted file mode 100644
index 75ba8ba..0000000
--- a/ffmpeg/libavcodec/x86/rv40dsp_init.c
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * RV40 decoder motion compensation functions x86-optimised
- * Copyright (c) 2008 Konstantin Shishkov
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * RV40 decoder motion compensation functions x86-optimised
- * 2,0 and 0,2 have h264 equivalents.
- * 3,3 is bugged in the rv40 format and maps to _xy2 version
- */
-
-#include "libavcodec/rv34dsp.h"
-#include "libavutil/attributes.h"
-#include "libavutil/mem.h"
-#include "libavutil/x86/cpu.h"
-#include "dsputil_x86.h"
-
-#if HAVE_YASM
-void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-void ff_avg_rv40_chroma_mc8_mmxext(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-void ff_avg_rv40_chroma_mc8_3dnow(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-
-void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-void ff_avg_rv40_chroma_mc4_mmxext(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-
-#define DECLARE_WEIGHT(opt) \
-void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
- int w1, int w2, ptrdiff_t stride); \
-void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
- int w1, int w2, ptrdiff_t stride); \
-void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
- int w1, int w2, ptrdiff_t stride); \
-void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
- int w1, int w2, ptrdiff_t stride);
-DECLARE_WEIGHT(mmxext)
-DECLARE_WEIGHT(sse2)
-DECLARE_WEIGHT(ssse3)
-
-/** @{ */
-/**
- * Define one qpel function.
- * LOOPSIZE must be already set to the number of pixels processed per
- * iteration in the inner loop of the called functions.
- * COFF(x) must be already defined so as to provide the offset into any
- * array of coeffs used by the called function for the qpel position x.
- */
-#define QPEL_FUNC_DECL(OP, SIZE, PH, PV, OPT) \
-static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst, \
- uint8_t *src, \
- ptrdiff_t stride) \
-{ \
- int i; \
- if (PH && PV) { \
- DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)]; \
- uint8_t *tmpptr = tmp + SIZE * 2; \
- src -= stride * 2; \
- \
- for (i = 0; i < SIZE; i += LOOPSIZE) \
- ff_put_rv40_qpel_h ##OPT(tmp + i, SIZE, src + i, stride, \
- SIZE + 5, HCOFF(PH)); \
- for (i = 0; i < SIZE; i += LOOPSIZE) \
- ff_ ##OP ##rv40_qpel_v ##OPT(dst + i, stride, tmpptr + i, \
- SIZE, SIZE, VCOFF(PV)); \
- } else if (PV) { \
- for (i = 0; i < SIZE; i += LOOPSIZE) \
- ff_ ##OP ##rv40_qpel_v ## OPT(dst + i, stride, src + i, \
- stride, SIZE, VCOFF(PV)); \
- } else { \
- for (i = 0; i < SIZE; i += LOOPSIZE) \
- ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i, \
- stride, SIZE, HCOFF(PH)); \
- } \
-};
-
-/** Declare functions for sizes 8 and 16 and given operations
- * and qpel position. */
-#define QPEL_FUNCS_DECL(OP, PH, PV, OPT) \
- QPEL_FUNC_DECL(OP, 8, PH, PV, OPT) \
- QPEL_FUNC_DECL(OP, 16, PH, PV, OPT)
-
-/** Declare all functions for all sizes and qpel positions */
-#define QPEL_MC_DECL(OP, OPT) \
-void ff_ ##OP ##rv40_qpel_h ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
- const uint8_t *src, \
- ptrdiff_t srcStride, \
- int len, int m); \
-void ff_ ##OP ##rv40_qpel_v ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
- const uint8_t *src, \
- ptrdiff_t srcStride, \
- int len, int m); \
-QPEL_FUNCS_DECL(OP, 0, 1, OPT) \
-QPEL_FUNCS_DECL(OP, 0, 3, OPT) \
-QPEL_FUNCS_DECL(OP, 1, 0, OPT) \
-QPEL_FUNCS_DECL(OP, 1, 1, OPT) \
-QPEL_FUNCS_DECL(OP, 1, 2, OPT) \
-QPEL_FUNCS_DECL(OP, 1, 3, OPT) \
-QPEL_FUNCS_DECL(OP, 2, 1, OPT) \
-QPEL_FUNCS_DECL(OP, 2, 2, OPT) \
-QPEL_FUNCS_DECL(OP, 2, 3, OPT) \
-QPEL_FUNCS_DECL(OP, 3, 0, OPT) \
-QPEL_FUNCS_DECL(OP, 3, 1, OPT) \
-QPEL_FUNCS_DECL(OP, 3, 2, OPT)
-/** @} */
-
-#define LOOPSIZE 8
-#define HCOFF(x) (32 * (x - 1))
-#define VCOFF(x) (32 * (x - 1))
-QPEL_MC_DECL(put_, _ssse3)
-QPEL_MC_DECL(avg_, _ssse3)
-
-#undef LOOPSIZE
-#undef HCOFF
-#undef VCOFF
-#define LOOPSIZE 8
-#define HCOFF(x) (64 * (x - 1))
-#define VCOFF(x) (64 * (x - 1))
-QPEL_MC_DECL(put_, _sse2)
-QPEL_MC_DECL(avg_, _sse2)
-
-#if ARCH_X86_32
-#undef LOOPSIZE
-#undef HCOFF
-#undef VCOFF
-#define LOOPSIZE 4
-#define HCOFF(x) (64 * (x - 1))
-#define VCOFF(x) (64 * (x - 1))
-
-QPEL_MC_DECL(put_, _mmx)
-
-#define ff_put_rv40_qpel_h_mmxext ff_put_rv40_qpel_h_mmx
-#define ff_put_rv40_qpel_v_mmxext ff_put_rv40_qpel_v_mmx
-QPEL_MC_DECL(avg_, _mmxext)
-
-#define ff_put_rv40_qpel_h_3dnow ff_put_rv40_qpel_h_mmx
-#define ff_put_rv40_qpel_v_3dnow ff_put_rv40_qpel_v_mmx
-QPEL_MC_DECL(avg_, _3dnow)
-#endif
-
-/** @{ */
-/** Set one function */
-#define QPEL_FUNC_SET(OP, SIZE, PH, PV, OPT) \
- c-> OP ## pixels_tab[2 - SIZE / 8][4 * PV + PH] = OP ## rv40_qpel ##SIZE ## _mc ##PH ##PV ##OPT;
-
-/** Set functions put and avg for sizes 8 and 16 and a given qpel position */
-#define QPEL_FUNCS_SET(OP, PH, PV, OPT) \
- QPEL_FUNC_SET(OP, 8, PH, PV, OPT) \
- QPEL_FUNC_SET(OP, 16, PH, PV, OPT)
-
-/** Set all functions for all sizes and qpel positions */
-#define QPEL_MC_SET(OP, OPT) \
-QPEL_FUNCS_SET (OP, 0, 1, OPT) \
-QPEL_FUNCS_SET (OP, 0, 3, OPT) \
-QPEL_FUNCS_SET (OP, 1, 0, OPT) \
-QPEL_FUNCS_SET (OP, 1, 1, OPT) \
-QPEL_FUNCS_SET (OP, 1, 2, OPT) \
-QPEL_FUNCS_SET (OP, 1, 3, OPT) \
-QPEL_FUNCS_SET (OP, 2, 1, OPT) \
-QPEL_FUNCS_SET (OP, 2, 2, OPT) \
-QPEL_FUNCS_SET (OP, 2, 3, OPT) \
-QPEL_FUNCS_SET (OP, 3, 0, OPT) \
-QPEL_FUNCS_SET (OP, 3, 1, OPT) \
-QPEL_FUNCS_SET (OP, 3, 2, OPT)
-/** @} */
-
-#endif /* HAVE_YASM */
-
-#if HAVE_MMX_INLINE
-static void put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride)
-{
- ff_put_pixels8_xy2_mmx(dst, src, stride, 8);
-}
-static void put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride)
-{
- ff_put_pixels16_xy2_mmx(dst, src, stride, 16);
-}
-static void avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride)
-{
- ff_avg_pixels8_xy2_mmx(dst, src, stride, 8);
-}
-static void avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride)
-{
- ff_avg_pixels16_xy2_mmx(dst, src, stride, 16);
-}
-#endif /* HAVE_MMX_INLINE */
-
-av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
-{
- int cpu_flags = av_get_cpu_flags();
-
-#if HAVE_MMX_INLINE
- if (INLINE_MMX(cpu_flags)) {
- c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_mmx;
- c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_mmx;
- c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_mmx;
- c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_mmx;
- }
-#endif /* HAVE_MMX_INLINE */
-
-#if HAVE_YASM
- if (EXTERNAL_MMX(cpu_flags)) {
- c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
- c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
-#if ARCH_X86_32
- QPEL_MC_SET(put_, _mmx)
-#endif
- }
- if (EXTERNAL_AMD3DNOW(cpu_flags)) {
- c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow;
- c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
-#if ARCH_X86_32
- QPEL_MC_SET(avg_, _3dnow)
-#endif
- }
- if (EXTERNAL_MMXEXT(cpu_flags)) {
- c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmxext;
- c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmxext;
- c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext;
- c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmxext;
- c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmxext;
- c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmxext;
-#if ARCH_X86_32
- QPEL_MC_SET(avg_, _mmxext)
-#endif
- }
- if (EXTERNAL_SSE2(cpu_flags)) {
- c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
- c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
- c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
- c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
- QPEL_MC_SET(put_, _sse2)
- QPEL_MC_SET(avg_, _sse2)
- }
- if (EXTERNAL_SSSE3(cpu_flags)) {
- c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
- c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
- c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
- c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
- QPEL_MC_SET(put_, _ssse3)
- QPEL_MC_SET(avg_, _ssse3)
- }
-#endif /* HAVE_YASM */
-}
diff --git a/ffmpeg/libavcodec/x86/sbrdsp.asm b/ffmpeg/libavcodec/x86/sbrdsp.asm
deleted file mode 100644
index adc13c4..0000000
--- a/ffmpeg/libavcodec/x86/sbrdsp.asm
+++ /dev/null
@@ -1,425 +0,0 @@
-;******************************************************************************
-;* AAC Spectral Band Replication decoding functions
-;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-; mask equivalent for multiply by -1.0 1.0
-ps_mask times 2 dd 1<<31, 0
-ps_mask2 times 2 dd 0, 1<<31
-ps_neg times 4 dd 1<<31
-ps_noise0 times 2 dd 1.0, 0.0,
-ps_noise2 times 2 dd -1.0, 0.0
-ps_noise13 dd 0.0, 1.0, 0.0, -1.0
- dd 0.0, -1.0, 0.0, 1.0
- dd 0.0, 1.0, 0.0, -1.0
-cextern sbr_noise_table
-
-SECTION_TEXT
-
-INIT_XMM sse
-cglobal sbr_sum_square, 2, 3, 6
- mov r2, r1
- xorps m0, m0
- xorps m1, m1
- sar r2, 3
- jz .prepare
-.loop:
- movu m2, [r0 + 0]
- movu m3, [r0 + 16]
- movu m4, [r0 + 32]
- movu m5, [r0 + 48]
- mulps m2, m2
- mulps m3, m3
- mulps m4, m4
- mulps m5, m5
- addps m0, m2
- addps m1, m3
- addps m0, m4
- addps m1, m5
- add r0, 64
- dec r2
- jnz .loop
-.prepare:
- and r1, 7
- sar r1, 1
- jz .end
-; len is a multiple of 2, thus there are at least 4 elements to process
-.endloop:
- movu m2, [r0]
- add r0, 16
- mulps m2, m2
- dec r1
- addps m0, m2
- jnz .endloop
-.end:
- addps m0, m1
- movhlps m2, m0
- addps m0, m2
- movss m1, m0
- shufps m0, m0, 1
- addss m0, m1
-%if ARCH_X86_64 == 0
- movss r0m, m0
- fld dword r0m
-%endif
- RET
-
-%define STEP 40*4*2
-cglobal sbr_hf_g_filt, 5, 6, 5
- lea r1, [r1 + 8*r4] ; offset by ixh elements into X_high
- mov r5, r3
- and r3, 0xFC
- lea r2, [r2 + r3*4]
- lea r0, [r0 + r3*8]
- neg r3
- jz .loop1
-.loop4:
- movlps m0, [r2 + 4*r3 + 0]
- movlps m1, [r2 + 4*r3 + 8]
- movlps m2, [r1 + 0*STEP]
- movlps m3, [r1 + 2*STEP]
- movhps m2, [r1 + 1*STEP]
- movhps m3, [r1 + 3*STEP]
- unpcklps m0, m0
- unpcklps m1, m1
- mulps m0, m2
- mulps m1, m3
- movu [r0 + 8*r3 + 0], m0
- movu [r0 + 8*r3 + 16], m1
- add r1, 4*STEP
- add r3, 4
- jnz .loop4
- and r5, 3 ; number of single element loops
- jz .end
-.loop1: ; elements 0 and 1 can be computed at the same time
- movss m0, [r2]
- movlps m2, [r1]
- unpcklps m0, m0
- mulps m2, m0
- movlps [r0], m2
- add r0, 8
- add r2, 4
- add r1, STEP
- dec r5
- jnz .loop1
-.end:
- RET
-
-; static void sbr_hf_gen_c(float (*X_high)[2], const float (*X_low)[2],
-; const float alpha0[2], const float alpha1[2],
-; float bw, int start, int end)
-;
-cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
- ; load alpha factors
-%define bw m0
-%if ARCH_X86_64 == 0 || WIN64
- movss bw, BWm
-%endif
- movlps m2, [alpha1q]
- movlps m1, [alpha0q]
- shufps bw, bw, 0
- mulps m2, bw ; (a1[0] a1[1])*bw
- mulps m1, bw ; (a0[0] a0[1])*bw = (a2 a3)
- mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)
- mova m3, m1
- mova m4, m2
-
- ; Set pointers
-%if ARCH_X86_64 == 0 || WIN64
- ; start and end 6th and 7th args on stack
- mov r2d, Sm
- mov r3d, Em
-%define start r2q
-%define end r3q
-%else
-; BW does not actually occupy a register, so shift by 1
-%define start BWq
-%define end Sq
-%endif
- sub start, end ; neg num of loops
- lea X_highq, [X_highq + end*2*4]
- lea X_lowq, [X_lowq + end*2*4 - 2*2*4]
- shl start, 3 ; offset from num loops
-
- mova m0, [X_lowq + start]
- shufps m3, m3, q1111
- shufps m4, m4, q1111
- xorps m3, [ps_mask]
- shufps m1, m1, q0000
- shufps m2, m2, q0000
- xorps m4, [ps_mask]
-.loop2:
- movu m7, [X_lowq + start + 8] ; BbCc
- mova m6, m0
- mova m5, m7
- shufps m0, m0, q2301 ; aAbB
- shufps m7, m7, q2301 ; bBcC
- mulps m0, m4
- mulps m7, m3
- mulps m6, m2
- mulps m5, m1
- addps m7, m0
- mova m0, [X_lowq + start +16] ; CcDd
- addps m7, m0
- addps m6, m5
- addps m7, m6
- mova [X_highq + start], m7
- add start, 16
- jnz .loop2
- RET
-
-cglobal sbr_sum64x5, 1,2,4,z
- lea r1q, [zq+ 256]
-.loop:
- mova m0, [zq+ 0]
- mova m2, [zq+ 16]
- mova m1, [zq+ 256]
- mova m3, [zq+ 272]
- addps m0, [zq+ 512]
- addps m2, [zq+ 528]
- addps m1, [zq+ 768]
- addps m3, [zq+ 784]
- addps m0, [zq+1024]
- addps m2, [zq+1040]
- addps m0, m1
- addps m2, m3
- mova [zq], m0
- mova [zq+16], m2
- add zq, 32
- cmp zq, r1q
- jne .loop
- REP_RET
-
-INIT_XMM sse
-cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
- lea r2q, [zq + (64-4)*4]
- mova m3, [ps_neg]
-.loop:
- mova m1, [zq]
- xorps m0, m3, [r2q]
- shufps m0, m0, m0, q0123
- unpcklps m2, m0, m1
- unpckhps m0, m0, m1
- mova [Wq + 0], m2
- mova [Wq + 16], m0
- add Wq, 32
- sub r2q, 16
- add zq, 16
- cmp zq, r2q
- jl .loop
- REP_RET
-
-INIT_XMM sse
-cglobal sbr_neg_odd_64, 1,2,4,z
- lea r1q, [zq+256]
-.loop:
- mova m0, [zq+ 0]
- mova m1, [zq+16]
- mova m2, [zq+32]
- mova m3, [zq+48]
- xorps m0, [ps_mask2]
- xorps m1, [ps_mask2]
- xorps m2, [ps_mask2]
- xorps m3, [ps_mask2]
- mova [zq+ 0], m0
- mova [zq+16], m1
- mova [zq+32], m2
- mova [zq+48], m3
- add zq, 64
- cmp zq, r1q
- jne .loop
- REP_RET
-
-; sbr_qmf_deint_bfly(float *v, const float *src0, const float *src1)
-%macro SBR_QMF_DEINT_BFLY 0
-cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
- mov cq, 64*4-2*mmsize
- lea vrevq, [vq + 64*4]
-.loop:
- mova m0, [src0q+cq]
- mova m1, [src1q]
- mova m4, [src0q+cq+mmsize]
- mova m5, [src1q+mmsize]
-%if cpuflag(sse2)
- pshufd m2, m0, q0123
- pshufd m3, m1, q0123
- pshufd m6, m4, q0123
- pshufd m7, m5, q0123
-%else
- shufps m2, m0, m0, q0123
- shufps m3, m1, m1, q0123
- shufps m6, m4, m4, q0123
- shufps m7, m5, m5, q0123
-%endif
- addps m5, m2
- subps m0, m7
- addps m1, m6
- subps m4, m3
- mova [vrevq], m1
- mova [vrevq+mmsize], m5
- mova [vq+cq], m0
- mova [vq+cq+mmsize], m4
- add src1q, 2*mmsize
- add vrevq, 2*mmsize
- sub cq, 2*mmsize
- jge .loop
- REP_RET
-%endmacro
-
-INIT_XMM sse
-SBR_QMF_DEINT_BFLY
-
-INIT_XMM sse2
-SBR_QMF_DEINT_BFLY
-
-INIT_XMM sse2
-cglobal sbr_qmf_pre_shuffle, 1,4,6,z
-%define OFFSET (32*4-2*mmsize)
- mov r3q, OFFSET
- lea r1q, [zq + (32+1)*4]
- lea r2q, [zq + 64*4]
- mova m5, [ps_neg]
-.loop:
- movu m0, [r1q]
- movu m2, [r1q + mmsize]
- movu m1, [zq + r3q + 4 + mmsize]
- movu m3, [zq + r3q + 4]
-
- pxor m2, m5
- pxor m0, m5
- pshufd m2, m2, q0123
- pshufd m0, m0, q0123
- SBUTTERFLY dq, 2, 3, 4
- SBUTTERFLY dq, 0, 1, 4
- mova [r2q + 2*r3q + 0*mmsize], m2
- mova [r2q + 2*r3q + 1*mmsize], m3
- mova [r2q + 2*r3q + 2*mmsize], m0
- mova [r2q + 2*r3q + 3*mmsize], m1
- add r1q, 2*mmsize
- sub r3q, 2*mmsize
- jge .loop
- movq m2, [zq]
- movq [r2q], m2
- REP_RET
-
-%ifdef PIC
-%define NREGS 1
-%if UNIX64
-%define NOISE_TABLE r6q ; r5q is m_max
-%else
-%define NOISE_TABLE r5q
-%endif
-%else
-%define NREGS 0
-%define NOISE_TABLE sbr_noise_table
-%endif
-
-%macro LOAD_NST 1
-%ifdef PIC
- lea NOISE_TABLE, [%1]
- mova m0, [kxq + NOISE_TABLE]
-%else
- mova m0, [kxq + %1]
-%endif
-%endmacro
-
-INIT_XMM sse2
-; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
-; const float *q_filt, int noise,
-; int kx, int m_max)
-cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
- mova m0, [ps_noise0]
- jmp apply_noise_main
-
-; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
-; const float *q_filt, int noise,
-; int kx, int m_max)
-cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
- and kxq, 1
- shl kxq, 4
- LOAD_NST ps_noise13
- jmp apply_noise_main
-
-; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
-; const float *q_filt, int noise,
-; int kx, int m_max)
-cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
- mova m0, [ps_noise2]
- jmp apply_noise_main
-
-; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
-; const float *q_filt, int noise,
-; int kx, int m_max)
-cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
- and kxq, 1
- shl kxq, 4
- LOAD_NST ps_noise13+16
-
-apply_noise_main:
-%if ARCH_X86_64 == 0 || WIN64
- mov kxd, m_maxm
-%define count kxq
-%else
-%define count m_maxq
-%endif
- dec noiseq
- shl count, 2
-%ifdef PIC
- lea NOISE_TABLE, [sbr_noise_table]
-%endif
- lea Yq, [Yq + 2*count]
- add s_mq, count
- add q_filtq, count
- shl noiseq, 3
- pxor m5, m5
- neg count
-.loop:
- mova m1, [q_filtq + count]
- movu m3, [noiseq + NOISE_TABLE + 1*mmsize]
- movu m4, [noiseq + NOISE_TABLE + 2*mmsize]
- add noiseq, 2*mmsize
- and noiseq, 0x1ff<<3
- punpckhdq m2, m1, m1
- punpckldq m1, m1
- mulps m1, m3 ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
- mulps m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
- mova m3, [s_mq + count]
- ; TODO: replace by a vpermd in AVX2
- punpckhdq m4, m3, m3
- punpckldq m3, m3
- pcmpeqd m6, m3, m5 ; m6 == 0
- pcmpeqd m7, m4, m5 ; m7 == 0
- mulps m3, m0 ; s_m[m] * phi_sign
- mulps m4, m0 ; s_m[m] * phi_sign
- pand m1, m6
- pand m2, m7
- movu m6, [Yq + 2*count]
- movu m7, [Yq + 2*count + mmsize]
- addps m3, m1
- addps m4, m2
- addps m6, m3
- addps m7, m4
- movu [Yq + 2*count], m6
- movu [Yq + 2*count + mmsize], m7
- add count, mmsize
- jl .loop
- RET
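For reference, the sbr_sum_square kernel above accumulates the squared magnitudes of n interleaved complex float pairs, unrolling by eight pairs per iteration and mopping up the remainder in .endloop. A minimal scalar sketch of the same computation, illustrative only and assuming the float (*x)[2] layout declared in sbrdsp_init.c below (this is not the project's reference implementation):

/* Scalar sketch of what ff_sbr_sum_square_sse computes: the sum of
 * re*re + im*im over n complex (re, im) float pairs. */
static float sum_square_sketch(const float (*x)[2], int n)
{
    float sum = 0.0f;
    for (int i = 0; i < n; i++)
        sum += x[i][0] * x[i][0] + x[i][1] * x[i][1];
    return sum;
}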
diff --git a/ffmpeg/libavcodec/x86/sbrdsp_init.c b/ffmpeg/libavcodec/x86/sbrdsp_init.c
deleted file mode 100644
index 2b912d0..0000000
--- a/ffmpeg/libavcodec/x86/sbrdsp_init.c
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * AAC Spectral Band Replication decoding functions
- * Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/sbrdsp.h"
-
-float ff_sbr_sum_square_sse(float (*x)[2], int n);
-void ff_sbr_sum64x5_sse(float *z);
-void ff_sbr_hf_g_filt_sse(float (*Y)[2], const float (*X_high)[40][2],
- const float *g_filt, int m_max, intptr_t ixh);
-void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
- const float alpha0[2], const float alpha1[2],
- float bw, int start, int end);
-void ff_sbr_neg_odd_64_sse(float *z);
-void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z);
-void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1);
-void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1);
-void ff_sbr_qmf_pre_shuffle_sse2(float *z);
-
-void ff_sbr_hf_apply_noise_0_sse2(float (*Y)[2], const float *s_m,
- const float *q_filt, int noise,
- int kx, int m_max);
-void ff_sbr_hf_apply_noise_1_sse2(float (*Y)[2], const float *s_m,
- const float *q_filt, int noise,
- int kx, int m_max);
-void ff_sbr_hf_apply_noise_2_sse2(float (*Y)[2], const float *s_m,
- const float *q_filt, int noise,
- int kx, int m_max);
-void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m,
- const float *q_filt, int noise,
- int kx, int m_max);
-
-av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (EXTERNAL_SSE(cpu_flags)) {
- s->neg_odd_64 = ff_sbr_neg_odd_64_sse;
- s->sum_square = ff_sbr_sum_square_sse;
- s->sum64x5 = ff_sbr_sum64x5_sse;
- s->hf_g_filt = ff_sbr_hf_g_filt_sse;
- s->hf_gen = ff_sbr_hf_gen_sse;
- s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse;
- s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse;
- }
-
- if (EXTERNAL_SSE2(cpu_flags)) {
- s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse2;
- s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_sse2;
- s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse2;
- s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse2;
- s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2;
- s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2;
- }
-}
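The init function above follows the usual libavcodec dispatch pattern: generic code first fills SBRDSPContext with C implementations, then the per-architecture init overrides individual pointers when the CPU reports the matching instruction set (note how qmf_deint_bfly is set in the SSE branch and then replaced by the SSE2 version). A hedged sketch of how those pointers end up being consumed; ff_sbrdsp_init() and energy_of_band() are assumptions for illustration, not code shown in this diff:

#include "libavcodec/sbrdsp.h"

/* Hypothetical helper: shows the dispatch table being used after init.
 * ff_sbrdsp_init() is assumed to install the C defaults and then call
 * ff_sbrdsp_init_x86() on x86 builds. */
static float energy_of_band(float (*x)[2], int n)
{
    SBRDSPContext dsp;
    ff_sbrdsp_init(&dsp);        /* C defaults + per-arch overrides */
    return dsp.sum_square(x, n); /* ff_sbr_sum_square_sse if SSE is present */
}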
diff --git a/ffmpeg/libavcodec/x86/simple_idct.c b/ffmpeg/libavcodec/x86/simple_idct.c
deleted file mode 100644
index c666b1a..0000000
--- a/ffmpeg/libavcodec/x86/simple_idct.c
+++ /dev/null
@@ -1,1167 +0,0 @@
-/*
- * Simple IDCT MMX
- *
- * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "libavcodec/simple_idct.h"
-#include "libavutil/mem.h"
-#include "dsputil_x86.h"
-
-#if HAVE_INLINE_ASM
-
-/*
-23170.475006
-22725.260826
-21406.727617
-19265.545870
-16384.000000
-12872.826198
-8866.956905
-4520.335430
-*/
-#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
-#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-
-#define ROW_SHIFT 11
-#define COL_SHIFT 20 // 6
-
-DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
-DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
-
-DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
- 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
-// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
-// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
- 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
- // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
-// 0, 0, 0, 0,
-// 0, 0, 0, 0,
-
- C4, C4, C4, C4,
- C4, -C4, C4, -C4,
-
- C2, C6, C2, C6,
- C6, -C2, C6, -C2,
-
- C1, C3, C1, C3,
- C5, C7, C5, C7,
-
- C3, -C7, C3, -C7,
--C1, -C5, -C1, -C5,
-
- C5, -C1, C5, -C1,
- C7, C3, C7, C3,
-
- C7, -C5, C7, -C5,
- C3, -C1, C3, -C1
-};
-
-static inline void idct(int16_t *block)
-{
- LOCAL_ALIGNED_8(int64_t, align_tmp, [16]);
- int16_t * const temp= (int16_t*)align_tmp;
-
- __asm__ volatile(
-#if 0 //Alternative, simpler variant
-
-#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- #rounder ", %%mm4 \n\t"\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- #rounder ", %%mm0 \n\t"\
- "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
- "paddd %%mm0, %%mm0 \n\t" \
- "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
- "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
- "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
- "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
- "movq %%mm7, " #dst " \n\t"\
- "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "movq %%mm2, 24+" #dst " \n\t"\
- "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
- "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm0 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
- "movq %%mm2, 8+" #dst " \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
- "movq %%mm4, 16+" #dst " \n\t"\
-
-#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
- "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
- "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
- "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm0 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "movd %%mm7, " #dst " \n\t"\
- "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "movd %%mm0, 16+" #dst " \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "movd %%mm2, 96+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "movd %%mm4, 112+" #dst " \n\t"\
- "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
- "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm5 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "movd %%mm2, 32+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
- "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
- "movd %%mm6, 48+" #dst " \n\t"\
- "movd %%mm4, 64+" #dst " \n\t"\
- "movd %%mm5, 80+" #dst " \n\t"\
-
-
-#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq "MANGLE(wm1010)", %%mm4 \n\t"\
- "pand %%mm0, %%mm4 \n\t"\
- "por %%mm1, %%mm4 \n\t"\
- "por %%mm2, %%mm4 \n\t"\
- "por %%mm3, %%mm4 \n\t"\
- "packssdw %%mm4,%%mm4 \n\t"\
- "movd %%mm4, %%eax \n\t"\
- "orl %%eax, %%eax \n\t"\
- "jz 1f \n\t"\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- #rounder ", %%mm4 \n\t"\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- #rounder ", %%mm0 \n\t"\
- "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
- "paddd %%mm0, %%mm0 \n\t" \
- "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
- "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
- "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
- "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
- "movq %%mm7, " #dst " \n\t"\
- "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "movq %%mm2, 24+" #dst " \n\t"\
- "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
- "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm0 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
- "movq %%mm2, 8+" #dst " \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
- "movq %%mm4, 16+" #dst " \n\t"\
- "jmp 2f \n\t"\
- "1: \n\t"\
- "pslld $16, %%mm0 \n\t"\
- "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
- "psrad $13, %%mm0 \n\t"\
- "packssdw %%mm0, %%mm0 \n\t"\
- "movq %%mm0, " #dst " \n\t"\
- "movq %%mm0, 8+" #dst " \n\t"\
- "movq %%mm0, 16+" #dst " \n\t"\
- "movq %%mm0, 24+" #dst " \n\t"\
- "2: \n\t"
-
-
-//IDCT( src0, src4, src1, src5, dst, rounder, shift)
-ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
-/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
-ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
-ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
-
-DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
-DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
-DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
-
-
-//IDCT( src0, src4, src1, src5, dst, shift)
-COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
-COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
-COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
-COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
-
-#else
-
-#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq "MANGLE(wm1010)", %%mm4 \n\t"\
- "pand %%mm0, %%mm4 \n\t"\
- "por %%mm1, %%mm4 \n\t"\
- "por %%mm2, %%mm4 \n\t"\
- "por %%mm3, %%mm4 \n\t"\
- "packssdw %%mm4,%%mm4 \n\t"\
- "movd %%mm4, %%eax \n\t"\
- "orl %%eax, %%eax \n\t"\
- "jz 1f \n\t"\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- #rounder ", %%mm4 \n\t"\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- #rounder ", %%mm0 \n\t"\
- "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
- "paddd %%mm0, %%mm0 \n\t" \
- "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
- "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
- "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
- "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
- "movq %%mm7, " #dst " \n\t"\
- "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "movq %%mm2, 24+" #dst " \n\t"\
- "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
- "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm0 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
- "movq %%mm2, 8+" #dst " \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
- "movq %%mm4, 16+" #dst " \n\t"\
- "jmp 2f \n\t"\
- "1: \n\t"\
- "pslld $16, %%mm0 \n\t"\
- "paddd "MANGLE(d40000)", %%mm0 \n\t"\
- "psrad $13, %%mm0 \n\t"\
- "packssdw %%mm0, %%mm0 \n\t"\
- "movq %%mm0, " #dst " \n\t"\
- "movq %%mm0, 8+" #dst " \n\t"\
- "movq %%mm0, 16+" #dst " \n\t"\
- "movq %%mm0, 24+" #dst " \n\t"\
- "2: \n\t"
-
-#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq %%mm0, %%mm4 \n\t"\
- "por %%mm1, %%mm4 \n\t"\
- "por %%mm2, %%mm4 \n\t"\
- "por %%mm3, %%mm4 \n\t"\
- "packssdw %%mm4,%%mm4 \n\t"\
- "movd %%mm4, %%eax \n\t"\
- "orl %%eax, %%eax \n\t"\
- "jz " #bt " \n\t"\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- #rounder ", %%mm4 \n\t"\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- #rounder ", %%mm0 \n\t"\
- "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
- "paddd %%mm0, %%mm0 \n\t" \
- "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
- "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
- "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
- "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
- "movq %%mm7, " #dst " \n\t"\
- "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "movq %%mm2, 24+" #dst " \n\t"\
- "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
- "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm0 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
- "movq %%mm2, 8+" #dst " \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
- "movq %%mm4, 16+" #dst " \n\t"\
-
-#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- #rounder ", %%mm4 \n\t"\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- #rounder ", %%mm0 \n\t"\
- "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
- "paddd %%mm0, %%mm0 \n\t" \
- "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
- "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
- "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
- "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
- "movq %%mm7, " #dst " \n\t"\
- "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "movq %%mm2, 24+" #dst " \n\t"\
- "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
- "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm0 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
- "movq %%mm2, 8+" #dst " \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
- "movq %%mm4, 16+" #dst " \n\t"\
-
-//IDCT( src0, src4, src1, src5, dst, rounder, shift)
-DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
-Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
-Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
-Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
-
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
- "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
- "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
- "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm0 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "movd %%mm7, " #dst " \n\t"\
- "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "movd %%mm0, 16+" #dst " \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "movd %%mm2, 96+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "movd %%mm4, 112+" #dst " \n\t"\
- "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
- "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm5 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "movd %%mm2, 32+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
- "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
- "movd %%mm6, 48+" #dst " \n\t"\
- "movd %%mm4, 64+" #dst " \n\t"\
- "movd %%mm5, 80+" #dst " \n\t"
-
-
-//IDCT( src0, src4, src1, src5, dst, shift)
-IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
-IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
-IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
-IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
- "jmp 9f \n\t"
-
- "# .p2align 4 \n\t"\
- "4: \n\t"
-Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
-Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
-
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
- "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
- "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm0 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
- "movd %%mm1, " #dst " \n\t"\
- "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "movd %%mm0, 16+" #dst " \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "movd %%mm2, 96+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "movd %%mm4, 112+" #dst " \n\t"\
- "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm5 \n\t"\
- "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "psrad $" #shift ", %%mm1 \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "movd %%mm2, 32+" #dst " \n\t"\
- "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
- "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
- "movd %%mm6, 48+" #dst " \n\t"\
- "movd %%mm1, 64+" #dst " \n\t"\
- "movd %%mm5, 80+" #dst " \n\t"
-
-//IDCT( src0, src4, src1, src5, dst, shift)
-IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
-IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
-IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
-IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
- "jmp 9f \n\t"
-
- "# .p2align 4 \n\t"\
- "6: \n\t"
-Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
-
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm0 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
- "movd %%mm1, " #dst " \n\t"\
- "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "movd %%mm0, 16+" #dst " \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "movd %%mm2, 96+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "movd %%mm4, 112+" #dst " \n\t"\
- "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm5 \n\t"\
- "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "psrad $" #shift ", %%mm1 \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "movd %%mm2, 32+" #dst " \n\t"\
- "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
- "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
- "movd %%mm6, 48+" #dst " \n\t"\
- "movd %%mm1, 64+" #dst " \n\t"\
- "movd %%mm5, 80+" #dst " \n\t"
-
-
-//IDCT( src0, src4, src1, src5, dst, shift)
-IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
-IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
-IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
-IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
- "jmp 9f \n\t"
-
- "# .p2align 4 \n\t"\
- "2: \n\t"
-Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
-
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
- "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm0 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "movd %%mm7, " #dst " \n\t"\
- "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "movd %%mm0, 16+" #dst " \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "movd %%mm2, 96+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "movd %%mm4, 112+" #dst " \n\t"\
- "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
- "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm5 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "movd %%mm2, 32+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
- "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
- "movd %%mm6, 48+" #dst " \n\t"\
- "movd %%mm4, 64+" #dst " \n\t"\
- "movd %%mm5, 80+" #dst " \n\t"
-
-//IDCT( src0, src4, src1, src5, dst, shift)
-IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
-IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
-IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
-IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
- "jmp 9f \n\t"
-
- "# .p2align 4 \n\t"\
- "3: \n\t"
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 64(%2), %%mm3 \n\t"\
- "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
- "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm0 \n\t"\
- "psrad $" #shift ", %%mm1 \n\t"\
- "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "movd %%mm7, " #dst " \n\t"\
- "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "movd %%mm0, 16+" #dst " \n\t"\
- "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
- "movd %%mm1, 96+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "movd %%mm4, 112+" #dst " \n\t"\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
- "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "psrad $" #shift ", %%mm5 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
- "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "movd %%mm1, 32+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
- "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
- "movd %%mm6, 48+" #dst " \n\t"\
- "movd %%mm4, 64+" #dst " \n\t"\
- "movd %%mm5, 80+" #dst " \n\t"
-
-
-//IDCT( src0, src4, src1, src5, dst, shift)
-IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
-IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
-IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
-IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
- "jmp 9f \n\t"
-
- "# .p2align 4 \n\t"\
- "5: \n\t"
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
- "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
- "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
- "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
- "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
- "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
- "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
- "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
- "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
- "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
- "psrad $" #shift ", %%mm4 \n\t"\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm3 \n\t"\
- "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
- "movq %%mm4, " #dst " \n\t"\
- "psrad $" #shift ", %%mm0 \n\t"\
- "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
- "movq %%mm0, 16+" #dst " \n\t"\
- "movq %%mm0, 96+" #dst " \n\t"\
- "movq %%mm4, 112+" #dst " \n\t"\
- "psrad $" #shift ", %%mm5 \n\t"\
- "psrad $" #shift ", %%mm6 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
- "movq %%mm5, 32+" #dst " \n\t"\
- "psrad $" #shift ", %%mm1 \n\t"\
- "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "movq %%mm6, 48+" #dst " \n\t"\
- "movq %%mm6, 64+" #dst " \n\t"\
- "movq %%mm5, 80+" #dst " \n\t"
-
-
-//IDCT( src0, src4, src1, src5, dst, shift)
-IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
-//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
-IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
-//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
- "jmp 9f \n\t"
-
-
- "# .p2align 4 \n\t"\
- "1: \n\t"
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
- "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
- "movq 64(%2), %%mm1 \n\t"\
- "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm0 \n\t"\
- "psrad $" #shift ", %%mm3 \n\t"\
- "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "movd %%mm7, " #dst " \n\t"\
- "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "movd %%mm0, 16+" #dst " \n\t"\
- "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
- "movd %%mm3, 96+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "movd %%mm4, 112+" #dst " \n\t"\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
- "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm3 \n\t"\
- "psrad $" #shift ", %%mm5 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
- "movd %%mm3, 32+" #dst " \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "movd %%mm6, 48+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
- "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
- "movd %%mm4, 64+" #dst " \n\t"\
- "movd %%mm5, 80+" #dst " \n\t"
-
-
-//IDCT( src0, src4, src1, src5, dst, shift)
-IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
-IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
-IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
-IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
- "jmp 9f \n\t"
-
-
- "# .p2align 4 \n\t"
- "7: \n\t"
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "psrad $" #shift ", %%mm4 \n\t"\
- "psrad $" #shift ", %%mm0 \n\t"\
- "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
- "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
- "movq %%mm4, " #dst " \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
- "movq %%mm0, 16+" #dst " \n\t"\
- "movq %%mm0, 96+" #dst " \n\t"\
- "movq %%mm4, 112+" #dst " \n\t"\
- "movq %%mm0, 32+" #dst " \n\t"\
- "movq %%mm4, 48+" #dst " \n\t"\
- "movq %%mm4, 64+" #dst " \n\t"\
- "movq %%mm0, 80+" #dst " \n\t"
-
-//IDCT( src0, src4, src1, src5, dst, shift)
-IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
-//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
-IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
-//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
-
-
-#endif
-
-/*
-Input
- 00 40 04 44 20 60 24 64
- 10 30 14 34 50 70 54 74
- 01 41 03 43 21 61 23 63
- 11 31 13 33 51 71 53 73
- 02 42 06 46 22 62 26 66
- 12 32 16 36 52 72 56 76
- 05 45 07 47 25 65 27 67
- 15 35 17 37 55 75 57 77
-
-Temp
- 00 04 10 14 20 24 30 34
- 40 44 50 54 60 64 70 74
- 01 03 11 13 21 23 31 33
- 41 43 51 53 61 63 71 73
- 02 06 12 16 22 26 32 36
- 42 46 52 56 62 66 72 76
- 05 07 15 17 25 27 35 37
- 45 47 55 57 65 67 75 77
-*/
-
-"9: \n\t"
- :: "r" (block), "r" (temp), "r" (coeffs)
- : "%eax"
- );
-}
-
-void ff_simple_idct_mmx(int16_t *block)
-{
- idct(block);
-}
-
-//FIXME merge add/put into the idct
-
-void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block)
-{
- idct(block);
- ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block)
-{
- idct(block);
- ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_INLINE_ASM */
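The put/add wrappers above run the IDCT on the coefficient block in place and then hand the result to the clamped put/add pixel routines. A minimal scalar sketch of the "put" half of that pattern (the helper name here is illustrative, not the actual MMX dsputil entry point):

    #include <stdint.h>

    /* Illustrative scalar equivalent of the put step: copy an 8x8 block of
     * 16-bit IDCT output to 8-bit pixels, saturating to 0..255. */
    static void put_pixels_clamped_c(const int16_t *block, uint8_t *dest, int line_size)
    {
        for (int i = 0; i < 8; i++) {
            for (int j = 0; j < 8; j++) {
                int v = block[i * 8 + j];
                dest[j] = v < 0 ? 0 : v > 255 ? 255 : v;
            }
            dest += line_size;
        }
    }

The add variant differs only in accumulating into dest[j] before clamping, which is why the FIXME above suggests merging it into the IDCT itself.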
diff --git a/ffmpeg/libavcodec/x86/snowdsp.c b/ffmpeg/libavcodec/x86/snowdsp.c
deleted file mode 100644
index 735e790..0000000
--- a/ffmpeg/libavcodec/x86/snowdsp.c
+++ /dev/null
@@ -1,902 +0,0 @@
-/*
- * MMX and SSE2 optimized snow DSP utils
- * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavcodec/avcodec.h"
-#include "libavcodec/snow.h"
-#include "libavcodec/snow_dwt.h"
-#include "dsputil_x86.h"
-
-#if HAVE_INLINE_ASM
-
-static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){
- const int w2= (width+1)>>1;
- const int w_l= (width>>1);
- const int w_r= w2 - 1;
- int i;
-
- { // Lift 0
- IDWTELEM * const ref = b + w2 - 1;
- IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
- // (the first time erroneously), we allow the SSE2 code to run an extra pass.
- // The savings in code and time are well worth having to store this value and
- // calculate b[0] correctly afterwards.
-
- i = 0;
- __asm__ volatile(
- "pcmpeqd %%xmm7, %%xmm7 \n\t"
- "pcmpeqd %%xmm3, %%xmm3 \n\t"
- "psllw $1, %%xmm3 \n\t"
- "paddw %%xmm7, %%xmm3 \n\t"
- "psllw $13, %%xmm3 \n\t"
- ::);
- for(; i<w_l-15; i+=16){
- __asm__ volatile(
- "movdqu (%1), %%xmm1 \n\t"
- "movdqu 16(%1), %%xmm5 \n\t"
- "movdqu 2(%1), %%xmm2 \n\t"
- "movdqu 18(%1), %%xmm6 \n\t"
- "paddw %%xmm1, %%xmm2 \n\t"
- "paddw %%xmm5, %%xmm6 \n\t"
- "paddw %%xmm7, %%xmm2 \n\t"
- "paddw %%xmm7, %%xmm6 \n\t"
- "pmulhw %%xmm3, %%xmm2 \n\t"
- "pmulhw %%xmm3, %%xmm6 \n\t"
- "paddw (%0), %%xmm2 \n\t"
- "paddw 16(%0), %%xmm6 \n\t"
- "movdqa %%xmm2, (%0) \n\t"
- "movdqa %%xmm6, 16(%0) \n\t"
- :: "r"(&b[i]), "r"(&ref[i])
- : "memory"
- );
- }
- snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
- b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
- }
-
- { // Lift 1
- IDWTELEM * const dst = b+w2;
-
- i = 0;
- for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
- dst[i] = dst[i] - (b[i] + b[i + 1]);
- }
- for(; i<w_r-15; i+=16){
- __asm__ volatile(
- "movdqu (%1), %%xmm1 \n\t"
- "movdqu 16(%1), %%xmm5 \n\t"
- "movdqu 2(%1), %%xmm2 \n\t"
- "movdqu 18(%1), %%xmm6 \n\t"
- "paddw %%xmm1, %%xmm2 \n\t"
- "paddw %%xmm5, %%xmm6 \n\t"
- "movdqa (%0), %%xmm0 \n\t"
- "movdqa 16(%0), %%xmm4 \n\t"
- "psubw %%xmm2, %%xmm0 \n\t"
- "psubw %%xmm6, %%xmm4 \n\t"
- "movdqa %%xmm0, (%0) \n\t"
- "movdqa %%xmm4, 16(%0) \n\t"
- :: "r"(&dst[i]), "r"(&b[i])
- : "memory"
- );
- }
- snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
- }
-
- { // Lift 2
- IDWTELEM * const ref = b+w2 - 1;
- IDWTELEM b_0 = b[0];
-
- i = 0;
- __asm__ volatile(
- "psllw $15, %%xmm7 \n\t"
- "pcmpeqw %%xmm6, %%xmm6 \n\t"
- "psrlw $13, %%xmm6 \n\t"
- "paddw %%xmm7, %%xmm6 \n\t"
- ::);
- for(; i<w_l-15; i+=16){
- __asm__ volatile(
- "movdqu (%1), %%xmm0 \n\t"
- "movdqu 16(%1), %%xmm4 \n\t"
- "movdqu 2(%1), %%xmm1 \n\t"
- "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
- "paddw %%xmm6, %%xmm0 \n\t"
- "paddw %%xmm6, %%xmm4 \n\t"
- "paddw %%xmm7, %%xmm1 \n\t"
- "paddw %%xmm7, %%xmm5 \n\t"
- "pavgw %%xmm1, %%xmm0 \n\t"
- "pavgw %%xmm5, %%xmm4 \n\t"
- "psubw %%xmm7, %%xmm0 \n\t"
- "psubw %%xmm7, %%xmm4 \n\t"
- "psraw $1, %%xmm0 \n\t"
- "psraw $1, %%xmm4 \n\t"
- "movdqa (%0), %%xmm1 \n\t"
- "movdqa 16(%0), %%xmm5 \n\t"
- "paddw %%xmm1, %%xmm0 \n\t"
- "paddw %%xmm5, %%xmm4 \n\t"
- "psraw $2, %%xmm0 \n\t"
- "psraw $2, %%xmm4 \n\t"
- "paddw %%xmm1, %%xmm0 \n\t"
- "paddw %%xmm5, %%xmm4 \n\t"
- "movdqa %%xmm0, (%0) \n\t"
- "movdqa %%xmm4, 16(%0) \n\t"
- :: "r"(&b[i]), "r"(&ref[i])
- : "memory"
- );
- }
- snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
- b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
- }
-
- { // Lift 3
- IDWTELEM * const src = b+w2;
-
- i = 0;
- for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
- temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
- }
- for(; i<w_r-7; i+=8){
- __asm__ volatile(
- "movdqu 2(%1), %%xmm2 \n\t"
- "movdqu 18(%1), %%xmm6 \n\t"
- "paddw (%1), %%xmm2 \n\t"
- "paddw 16(%1), %%xmm6 \n\t"
- "movdqu (%0), %%xmm0 \n\t"
- "movdqu 16(%0), %%xmm4 \n\t"
- "paddw %%xmm2, %%xmm0 \n\t"
- "paddw %%xmm6, %%xmm4 \n\t"
- "psraw $1, %%xmm2 \n\t"
- "psraw $1, %%xmm6 \n\t"
- "paddw %%xmm0, %%xmm2 \n\t"
- "paddw %%xmm4, %%xmm6 \n\t"
- "movdqa %%xmm2, (%2) \n\t"
- "movdqa %%xmm6, 16(%2) \n\t"
- :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
- : "memory"
- );
- }
- snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
- }
-
- {
- snow_interleave_line_header(&i, width, b, temp);
-
- for (; (i & 0x3E) != 0x3E; i-=2){
- b[i+1] = temp[i>>1];
- b[i] = b[i>>1];
- }
- for (i-=62; i>=0; i-=64){
- __asm__ volatile(
- "movdqa (%1), %%xmm0 \n\t"
- "movdqa 16(%1), %%xmm2 \n\t"
- "movdqa 32(%1), %%xmm4 \n\t"
- "movdqa 48(%1), %%xmm6 \n\t"
- "movdqa (%1), %%xmm1 \n\t"
- "movdqa 16(%1), %%xmm3 \n\t"
- "movdqa 32(%1), %%xmm5 \n\t"
- "movdqa 48(%1), %%xmm7 \n\t"
- "punpcklwd (%2), %%xmm0 \n\t"
- "punpcklwd 16(%2), %%xmm2 \n\t"
- "punpcklwd 32(%2), %%xmm4 \n\t"
- "punpcklwd 48(%2), %%xmm6 \n\t"
- "movdqa %%xmm0, (%0) \n\t"
- "movdqa %%xmm2, 32(%0) \n\t"
- "movdqa %%xmm4, 64(%0) \n\t"
- "movdqa %%xmm6, 96(%0) \n\t"
- "punpckhwd (%2), %%xmm1 \n\t"
- "punpckhwd 16(%2), %%xmm3 \n\t"
- "punpckhwd 32(%2), %%xmm5 \n\t"
- "punpckhwd 48(%2), %%xmm7 \n\t"
- "movdqa %%xmm1, 16(%0) \n\t"
- "movdqa %%xmm3, 48(%0) \n\t"
- "movdqa %%xmm5, 80(%0) \n\t"
- "movdqa %%xmm7, 112(%0) \n\t"
- :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
- : "memory"
- );
- }
- }
-}
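The four "Lift" blocks above (and in the MMX variant that follows) vectorize the same basic neighbour update. A minimal scalar sketch of one lifting step, where mul, off and shift stand in for the W_?M / W_?O / W_?S constants, int16_t stands in for IDWTELEM, and the edge handling done by the *_lead_out helpers is omitted:

    #include <stdint.h>

    /* One lifting step: each coefficient is updated from the sum of its two
     * neighbours in the other band, with a weight, rounding offset and shift. */
    static void lift_step(int16_t *dst, const int16_t *ref, int n,
                          int mul, int off, int shift)
    {
        for (int i = 0; i < n; i++)
            dst[i] -= (mul * (ref[i] + ref[i + 1]) + off) >> shift;
    }

Lift 2 uses the += form with an extra 4*b[i] term, as spelled out by the scalar prologue loops of the vertical compose functions further down.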
-
-static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
- const int w2= (width+1)>>1;
- const int w_l= (width>>1);
- const int w_r= w2 - 1;
- int i;
-
- { // Lift 0
- IDWTELEM * const ref = b + w2 - 1;
-
- i = 1;
- b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
- __asm__ volatile(
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "pcmpeqw %%mm3, %%mm3 \n\t"
- "psllw $1, %%mm3 \n\t"
- "paddw %%mm7, %%mm3 \n\t"
- "psllw $13, %%mm3 \n\t"
- ::);
- for(; i<w_l-7; i+=8){
- __asm__ volatile(
- "movq (%1), %%mm2 \n\t"
- "movq 8(%1), %%mm6 \n\t"
- "paddw 2(%1), %%mm2 \n\t"
- "paddw 10(%1), %%mm6 \n\t"
- "paddw %%mm7, %%mm2 \n\t"
- "paddw %%mm7, %%mm6 \n\t"
- "pmulhw %%mm3, %%mm2 \n\t"
- "pmulhw %%mm3, %%mm6 \n\t"
- "paddw (%0), %%mm2 \n\t"
- "paddw 8(%0), %%mm6 \n\t"
- "movq %%mm2, (%0) \n\t"
- "movq %%mm6, 8(%0) \n\t"
- :: "r"(&b[i]), "r"(&ref[i])
- : "memory"
- );
- }
- snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
- }
-
- { // Lift 1
- IDWTELEM * const dst = b+w2;
-
- i = 0;
- for(; i<w_r-7; i+=8){
- __asm__ volatile(
- "movq (%1), %%mm2 \n\t"
- "movq 8(%1), %%mm6 \n\t"
- "paddw 2(%1), %%mm2 \n\t"
- "paddw 10(%1), %%mm6 \n\t"
- "movq (%0), %%mm0 \n\t"
- "movq 8(%0), %%mm4 \n\t"
- "psubw %%mm2, %%mm0 \n\t"
- "psubw %%mm6, %%mm4 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm4, 8(%0) \n\t"
- :: "r"(&dst[i]), "r"(&b[i])
- : "memory"
- );
- }
- snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
- }
-
- { // Lift 2
- IDWTELEM * const ref = b+w2 - 1;
-
- i = 1;
- b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
- __asm__ volatile(
- "psllw $15, %%mm7 \n\t"
- "pcmpeqw %%mm6, %%mm6 \n\t"
- "psrlw $13, %%mm6 \n\t"
- "paddw %%mm7, %%mm6 \n\t"
- ::);
- for(; i<w_l-7; i+=8){
- __asm__ volatile(
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm4 \n\t"
- "movq 2(%1), %%mm1 \n\t"
- "movq 10(%1), %%mm5 \n\t"
- "paddw %%mm6, %%mm0 \n\t"
- "paddw %%mm6, %%mm4 \n\t"
- "paddw %%mm7, %%mm1 \n\t"
- "paddw %%mm7, %%mm5 \n\t"
- "pavgw %%mm1, %%mm0 \n\t"
- "pavgw %%mm5, %%mm4 \n\t"
- "psubw %%mm7, %%mm0 \n\t"
- "psubw %%mm7, %%mm4 \n\t"
- "psraw $1, %%mm0 \n\t"
- "psraw $1, %%mm4 \n\t"
- "movq (%0), %%mm1 \n\t"
- "movq 8(%0), %%mm5 \n\t"
- "paddw %%mm1, %%mm0 \n\t"
- "paddw %%mm5, %%mm4 \n\t"
- "psraw $2, %%mm0 \n\t"
- "psraw $2, %%mm4 \n\t"
- "paddw %%mm1, %%mm0 \n\t"
- "paddw %%mm5, %%mm4 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm4, 8(%0) \n\t"
- :: "r"(&b[i]), "r"(&ref[i])
- : "memory"
- );
- }
- snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
- }
-
- { // Lift 3
- IDWTELEM * const src = b+w2;
- i = 0;
-
- for(; i<w_r-7; i+=8){
- __asm__ volatile(
- "movq 2(%1), %%mm2 \n\t"
- "movq 10(%1), %%mm6 \n\t"
- "paddw (%1), %%mm2 \n\t"
- "paddw 8(%1), %%mm6 \n\t"
- "movq (%0), %%mm0 \n\t"
- "movq 8(%0), %%mm4 \n\t"
- "paddw %%mm2, %%mm0 \n\t"
- "paddw %%mm6, %%mm4 \n\t"
- "psraw $1, %%mm2 \n\t"
- "psraw $1, %%mm6 \n\t"
- "paddw %%mm0, %%mm2 \n\t"
- "paddw %%mm4, %%mm6 \n\t"
- "movq %%mm2, (%2) \n\t"
- "movq %%mm6, 8(%2) \n\t"
- :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
- : "memory"
- );
- }
- snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
- }
-
- {
- snow_interleave_line_header(&i, width, b, temp);
-
- for (; (i & 0x1E) != 0x1E; i-=2){
- b[i+1] = temp[i>>1];
- b[i] = b[i>>1];
- }
- for (i-=30; i>=0; i-=32){
- __asm__ volatile(
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm2 \n\t"
- "movq 16(%1), %%mm4 \n\t"
- "movq 24(%1), %%mm6 \n\t"
- "movq (%1), %%mm1 \n\t"
- "movq 8(%1), %%mm3 \n\t"
- "movq 16(%1), %%mm5 \n\t"
- "movq 24(%1), %%mm7 \n\t"
- "punpcklwd (%2), %%mm0 \n\t"
- "punpcklwd 8(%2), %%mm2 \n\t"
- "punpcklwd 16(%2), %%mm4 \n\t"
- "punpcklwd 24(%2), %%mm6 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm2, 16(%0) \n\t"
- "movq %%mm4, 32(%0) \n\t"
- "movq %%mm6, 48(%0) \n\t"
- "punpckhwd (%2), %%mm1 \n\t"
- "punpckhwd 8(%2), %%mm3 \n\t"
- "punpckhwd 16(%2), %%mm5 \n\t"
- "punpckhwd 24(%2), %%mm7 \n\t"
- "movq %%mm1, 8(%0) \n\t"
- "movq %%mm3, 24(%0) \n\t"
- "movq %%mm5, 40(%0) \n\t"
- "movq %%mm7, 56(%0) \n\t"
- :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
- : "memory"
- );
- }
- }
-}
-
-#if HAVE_7REGS
-#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
- ""op" ("r",%%"REG_d"), %%"t0" \n\t"\
- ""op" 16("r",%%"REG_d"), %%"t1" \n\t"\
- ""op" 32("r",%%"REG_d"), %%"t2" \n\t"\
- ""op" 48("r",%%"REG_d"), %%"t3" \n\t"
-
-#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
- snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)
-
-#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
- snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)
-
-#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
- "psubw %%"s0", %%"t0" \n\t"\
- "psubw %%"s1", %%"t1" \n\t"\
- "psubw %%"s2", %%"t2" \n\t"\
- "psubw %%"s3", %%"t3" \n\t"
-
-#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
- "movdqa %%"s0", ("w",%%"REG_d") \n\t"\
- "movdqa %%"s1", 16("w",%%"REG_d") \n\t"\
- "movdqa %%"s2", 32("w",%%"REG_d") \n\t"\
- "movdqa %%"s3", 48("w",%%"REG_d") \n\t"
-
-#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
- "psraw $"n", %%"t0" \n\t"\
- "psraw $"n", %%"t1" \n\t"\
- "psraw $"n", %%"t2" \n\t"\
- "psraw $"n", %%"t3" \n\t"
-
-#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
- "paddw %%"s0", %%"t0" \n\t"\
- "paddw %%"s1", %%"t1" \n\t"\
- "paddw %%"s2", %%"t2" \n\t"\
- "paddw %%"s3", %%"t3" \n\t"
-
-#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
- "pmulhw %%"s0", %%"t0" \n\t"\
- "pmulhw %%"s1", %%"t1" \n\t"\
- "pmulhw %%"s2", %%"t2" \n\t"\
- "pmulhw %%"s3", %%"t3" \n\t"
-
-#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
- "movdqa %%"s0", %%"t0" \n\t"\
- "movdqa %%"s1", %%"t1" \n\t"\
- "movdqa %%"s2", %%"t2" \n\t"\
- "movdqa %%"s3", %%"t3" \n\t"
-
-static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
- x86_reg i = width;
-
- while(i & 0x1F)
- {
- i--;
- b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
- b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
- b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
- b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
- }
- i+=i;
-
- __asm__ volatile (
- "jmp 2f \n\t"
- "1: \n\t"
- snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
- snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")
-
-
- "pcmpeqw %%xmm0, %%xmm0 \n\t"
- "pcmpeqw %%xmm2, %%xmm2 \n\t"
- "paddw %%xmm2, %%xmm2 \n\t"
- "paddw %%xmm0, %%xmm2 \n\t"
- "psllw $13, %%xmm2 \n\t"
- snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
- snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
- snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
- snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
- snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
- snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
- snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
- snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
-
- "pcmpeqw %%xmm7, %%xmm7 \n\t"
- "pcmpeqw %%xmm5, %%xmm5 \n\t"
- "psllw $15, %%xmm7 \n\t"
- "psrlw $13, %%xmm5 \n\t"
- "paddw %%xmm7, %%xmm5 \n\t"
- snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
- "movq (%2,%%"REG_d"), %%xmm1 \n\t"
- "movq 8(%2,%%"REG_d"), %%xmm3 \n\t"
- "paddw %%xmm7, %%xmm1 \n\t"
- "paddw %%xmm7, %%xmm3 \n\t"
- "pavgw %%xmm1, %%xmm0 \n\t"
- "pavgw %%xmm3, %%xmm2 \n\t"
- "movq 16(%2,%%"REG_d"), %%xmm1 \n\t"
- "movq 24(%2,%%"REG_d"), %%xmm3 \n\t"
- "paddw %%xmm7, %%xmm1 \n\t"
- "paddw %%xmm7, %%xmm3 \n\t"
- "pavgw %%xmm1, %%xmm4 \n\t"
- "pavgw %%xmm3, %%xmm6 \n\t"
- snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
- snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
- snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
-
- snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
- snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
- snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
- snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
- snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
- snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
- snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
- snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
- snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
-
- "2: \n\t"
- "sub $64, %%"REG_d" \n\t"
- "jge 1b \n\t"
- :"+d"(i)
- :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
-}
-
-#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
- ""op" ("r",%%"REG_d"), %%"t0" \n\t"\
- ""op" 8("r",%%"REG_d"), %%"t1" \n\t"\
- ""op" 16("r",%%"REG_d"), %%"t2" \n\t"\
- ""op" 24("r",%%"REG_d"), %%"t3" \n\t"
-
-#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
- snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)
-
-#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
- snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)
-
-#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
- "movq %%"s0", ("w",%%"REG_d") \n\t"\
- "movq %%"s1", 8("w",%%"REG_d") \n\t"\
- "movq %%"s2", 16("w",%%"REG_d") \n\t"\
- "movq %%"s3", 24("w",%%"REG_d") \n\t"
-
-#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
- "movq %%"s0", %%"t0" \n\t"\
- "movq %%"s1", %%"t1" \n\t"\
- "movq %%"s2", %%"t2" \n\t"\
- "movq %%"s3", %%"t3" \n\t"
-
-
-static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
- x86_reg i = width;
- while(i & 15)
- {
- i--;
- b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
- b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
- b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
- b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
- }
- i+=i;
- __asm__ volatile(
- "jmp 2f \n\t"
- "1: \n\t"
-
- snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
- snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
- "pcmpeqw %%mm0, %%mm0 \n\t"
- "pcmpeqw %%mm2, %%mm2 \n\t"
- "paddw %%mm2, %%mm2 \n\t"
- "paddw %%mm0, %%mm2 \n\t"
- "psllw $13, %%mm2 \n\t"
- snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
- snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
- snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
- snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
- snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
- snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
- snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
- snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "pcmpeqw %%mm5, %%mm5 \n\t"
- "psllw $15, %%mm7 \n\t"
- "psrlw $13, %%mm5 \n\t"
- "paddw %%mm7, %%mm5 \n\t"
- snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
- "movq (%2,%%"REG_d"), %%mm1 \n\t"
- "movq 8(%2,%%"REG_d"), %%mm3 \n\t"
- "paddw %%mm7, %%mm1 \n\t"
- "paddw %%mm7, %%mm3 \n\t"
- "pavgw %%mm1, %%mm0 \n\t"
- "pavgw %%mm3, %%mm2 \n\t"
- "movq 16(%2,%%"REG_d"), %%mm1 \n\t"
- "movq 24(%2,%%"REG_d"), %%mm3 \n\t"
- "paddw %%mm7, %%mm1 \n\t"
- "paddw %%mm7, %%mm3 \n\t"
- "pavgw %%mm1, %%mm4 \n\t"
- "pavgw %%mm3, %%mm6 \n\t"
- snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
- snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
- snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
-
- snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
- snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
- snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
- snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
- snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
- snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
- snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
- snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
- snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
-
- "2: \n\t"
- "sub $32, %%"REG_d" \n\t"
- "jge 1b \n\t"
- :"+d"(i)
- :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
-}
-#endif //HAVE_7REGS
-
-#define snow_inner_add_yblock_sse2_header \
- IDWTELEM * * dst_array = sb->line + src_y;\
- x86_reg tmp;\
- __asm__ volatile(\
- "mov %7, %%"REG_c" \n\t"\
- "mov %6, %2 \n\t"\
- "mov %4, %%"REG_S" \n\t"\
- "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
- "pcmpeqd %%xmm3, %%xmm3 \n\t"\
- "psllw $15, %%xmm3 \n\t"\
- "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
- "1: \n\t"\
- "mov %1, %%"REG_D" \n\t"\
- "mov (%%"REG_D"), %%"REG_D" \n\t"\
- "add %3, %%"REG_D" \n\t"
-
-#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
- "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
- "movq (%%"REG_d"), %%"out_reg1" \n\t"\
- "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
- "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
- "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
- "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
- "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
- "punpcklbw %%xmm7, %%xmm0 \n\t"\
- "punpcklbw %%xmm7, %%xmm4 \n\t"\
- "pmullw %%xmm0, %%"out_reg1" \n\t"\
- "pmullw %%xmm4, %%"out_reg2" \n\t"
-
-#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
- "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
- "movq (%%"REG_d"), %%"out_reg1" \n\t"\
- "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
- "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
- "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
- "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
- "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
- "punpcklbw %%xmm7, %%xmm0 \n\t"\
- "punpcklbw %%xmm7, %%xmm4 \n\t"\
- "pmullw %%xmm0, %%"out_reg1" \n\t"\
- "pmullw %%xmm4, %%"out_reg2" \n\t"
-
-#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
- snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
- "paddusw %%xmm2, %%xmm1 \n\t"\
- "paddusw %%xmm6, %%xmm5 \n\t"
-
-#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
- snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
- "paddusw %%xmm2, %%xmm1 \n\t"\
- "paddusw %%xmm6, %%xmm5 \n\t"
-
-#define snow_inner_add_yblock_sse2_end_common1\
- "add $32, %%"REG_S" \n\t"\
- "add %%"REG_c", %0 \n\t"\
- "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
- "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
- "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
- "add %%"REG_c", (%%"REG_a") \n\t"
-
-#define snow_inner_add_yblock_sse2_end_common2\
- "jnz 1b \n\t"\
- :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
- :\
- "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
- "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
-
-#define snow_inner_add_yblock_sse2_end_8\
- "sal $1, %%"REG_c" \n\t"\
- "add"OPSIZE" $"PTR_SIZE"*2, %1 \n\t"\
- snow_inner_add_yblock_sse2_end_common1\
- "sar $1, %%"REG_c" \n\t"\
- "sub $2, %2 \n\t"\
- snow_inner_add_yblock_sse2_end_common2
-
-#define snow_inner_add_yblock_sse2_end_16\
- "add"OPSIZE" $"PTR_SIZE"*1, %1 \n\t"\
- snow_inner_add_yblock_sse2_end_common1\
- "dec %2 \n\t"\
- snow_inner_add_yblock_sse2_end_common2
-
-static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
- int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
-snow_inner_add_yblock_sse2_header
-snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
-snow_inner_add_yblock_sse2_accum_8("2", "8")
-snow_inner_add_yblock_sse2_accum_8("1", "128")
-snow_inner_add_yblock_sse2_accum_8("0", "136")
-
- "mov %0, %%"REG_d" \n\t"
- "movdqa (%%"REG_D"), %%xmm0 \n\t"
- "movdqa %%xmm1, %%xmm2 \n\t"
-
- "punpckhwd %%xmm7, %%xmm1 \n\t"
- "punpcklwd %%xmm7, %%xmm2 \n\t"
- "paddd %%xmm2, %%xmm0 \n\t"
- "movdqa 16(%%"REG_D"), %%xmm2 \n\t"
- "paddd %%xmm1, %%xmm2 \n\t"
- "paddd %%xmm3, %%xmm0 \n\t"
- "paddd %%xmm3, %%xmm2 \n\t"
-
- "mov %1, %%"REG_D" \n\t"
- "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
- "add %3, %%"REG_D" \n\t"
-
- "movdqa (%%"REG_D"), %%xmm4 \n\t"
- "movdqa %%xmm5, %%xmm6 \n\t"
- "punpckhwd %%xmm7, %%xmm5 \n\t"
- "punpcklwd %%xmm7, %%xmm6 \n\t"
- "paddd %%xmm6, %%xmm4 \n\t"
- "movdqa 16(%%"REG_D"), %%xmm6 \n\t"
- "paddd %%xmm5, %%xmm6 \n\t"
- "paddd %%xmm3, %%xmm4 \n\t"
- "paddd %%xmm3, %%xmm6 \n\t"
-
- "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
- "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */
- "packssdw %%xmm2, %%xmm0 \n\t"
- "packuswb %%xmm7, %%xmm0 \n\t"
- "movq %%xmm0, (%%"REG_d") \n\t"
-
- "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
- "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */
- "packssdw %%xmm6, %%xmm4 \n\t"
- "packuswb %%xmm7, %%xmm4 \n\t"
- "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
-snow_inner_add_yblock_sse2_end_8
-}
-
-static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
- int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
-snow_inner_add_yblock_sse2_header
-snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
-snow_inner_add_yblock_sse2_accum_16("2", "16")
-snow_inner_add_yblock_sse2_accum_16("1", "512")
-snow_inner_add_yblock_sse2_accum_16("0", "528")
-
- "mov %0, %%"REG_d" \n\t"
- "psrlw $4, %%xmm1 \n\t"
- "psrlw $4, %%xmm5 \n\t"
- "paddw (%%"REG_D"), %%xmm1 \n\t"
- "paddw 16(%%"REG_D"), %%xmm5 \n\t"
- "paddw %%xmm3, %%xmm1 \n\t"
- "paddw %%xmm3, %%xmm5 \n\t"
- "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */
- "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */
- "packuswb %%xmm5, %%xmm1 \n\t"
-
- "movdqu %%xmm1, (%%"REG_d") \n\t"
-
-snow_inner_add_yblock_sse2_end_16
-}
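Per pixel, the bw_16 kernel above accumulates four obmc-weighted block contributions, folds them into the slice-buffer line and rounds back to 8 bits. A rough scalar model of that arithmetic, read off the asm (names are illustrative and the saturating paddusw accumulation is not modelled):

    #include <stdint.h>

    static inline uint8_t yblock_pixel(const uint8_t w[4], const uint8_t p[4], int buf)
    {
        int acc = 0;
        for (int i = 0; i < 4; i++)
            acc += w[i] * p[i];                  /* pmullw + paddusw chain      */
        int v = ((acc >> 4) + buf + 8) >> 4;     /* psrlw $4, paddw line, +bias,
                                                  * psraw $4                    */
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;  /* packuswb saturation  */
    }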
-
-#define snow_inner_add_yblock_mmx_header \
- IDWTELEM * * dst_array = sb->line + src_y;\
- x86_reg tmp;\
- __asm__ volatile(\
- "mov %7, %%"REG_c" \n\t"\
- "mov %6, %2 \n\t"\
- "mov %4, %%"REG_S" \n\t"\
- "pxor %%mm7, %%mm7 \n\t" /* 0 */\
- "pcmpeqd %%mm3, %%mm3 \n\t"\
- "psllw $15, %%mm3 \n\t"\
- "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\
- "1: \n\t"\
- "mov %1, %%"REG_D" \n\t"\
- "mov (%%"REG_D"), %%"REG_D" \n\t"\
- "add %3, %%"REG_D" \n\t"
-
-#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
- "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
- "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
- "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
- "punpcklbw %%mm7, %%"out_reg1" \n\t"\
- "punpcklbw %%mm7, %%"out_reg2" \n\t"\
- "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\
- "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpcklbw %%mm7, %%mm4 \n\t"\
- "pmullw %%mm0, %%"out_reg1" \n\t"\
- "pmullw %%mm4, %%"out_reg2" \n\t"
-
-#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
- snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
- "paddusw %%mm2, %%mm1 \n\t"\
- "paddusw %%mm6, %%mm5 \n\t"
-
-#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
- "mov %0, %%"REG_d" \n\t"\
- "psrlw $4, %%mm1 \n\t"\
- "psrlw $4, %%mm5 \n\t"\
- "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\
- "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\
- "paddw %%mm3, %%mm1 \n\t"\
- "paddw %%mm3, %%mm5 \n\t"\
- "psraw $4, %%mm1 \n\t"\
- "psraw $4, %%mm5 \n\t"\
- "packuswb %%mm5, %%mm1 \n\t"\
- "movq %%mm1, "write_offset"(%%"REG_d") \n\t"
-
-#define snow_inner_add_yblock_mmx_end(s_step)\
- "add $"s_step", %%"REG_S" \n\t"\
- "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
- "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
- "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
- "add %%"REG_c", (%%"REG_a") \n\t"\
- "add"OPSIZE " $"PTR_SIZE"*1, %1 \n\t"\
- "add %%"REG_c", %0 \n\t"\
- "dec %2 \n\t"\
- "jnz 1b \n\t"\
- :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
- :\
- "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
- "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
-
-static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
- int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
-snow_inner_add_yblock_mmx_header
-snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
-snow_inner_add_yblock_mmx_accum("2", "8", "0")
-snow_inner_add_yblock_mmx_accum("1", "128", "0")
-snow_inner_add_yblock_mmx_accum("0", "136", "0")
-snow_inner_add_yblock_mmx_mix("0", "0")
-snow_inner_add_yblock_mmx_end("16")
-}
-
-static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
- int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
-snow_inner_add_yblock_mmx_header
-snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
-snow_inner_add_yblock_mmx_accum("2", "16", "0")
-snow_inner_add_yblock_mmx_accum("1", "512", "0")
-snow_inner_add_yblock_mmx_accum("0", "528", "0")
-snow_inner_add_yblock_mmx_mix("0", "0")
-
-snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
-snow_inner_add_yblock_mmx_accum("2", "24", "8")
-snow_inner_add_yblock_mmx_accum("1", "520", "8")
-snow_inner_add_yblock_mmx_accum("0", "536", "8")
-snow_inner_add_yblock_mmx_mix("16", "8")
-snow_inner_add_yblock_mmx_end("32")
-}
-
-static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
- int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
-
- if (b_w == 16)
- inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
- else if (b_w == 8 && obmc_stride == 16) {
- if (!(b_h & 1))
- inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
- else
- inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
- } else
- ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
-}
-
-static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
- int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
- if (b_w == 16)
- inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
- else if (b_w == 8 && obmc_stride == 16)
- inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
- else
- ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-void ff_dwt_init_x86(SnowDWTContext *c)
-{
-#if HAVE_INLINE_ASM
- int mm_flags = av_get_cpu_flags();
-
- if (mm_flags & AV_CPU_FLAG_MMX) {
- if(mm_flags & AV_CPU_FLAG_SSE2 & 0){
- c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
-#if HAVE_7REGS
- c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
-#endif
- c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
- }
- else{
- if (mm_flags & AV_CPU_FLAG_MMXEXT) {
- c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
-#if HAVE_7REGS
- c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
-#endif
- }
- c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
- }
- }
-#endif /* HAVE_INLINE_ASM */
-}
diff --git a/ffmpeg/libavcodec/x86/v210-init.c b/ffmpeg/libavcodec/x86/v210-init.c
deleted file mode 100644
index 02c5eaa..0000000
--- a/ffmpeg/libavcodec/x86/v210-init.c
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/cpu.h"
-#include "libavcodec/v210dec.h"
-
-extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
-extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
-
-extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
-extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
-
-av_cold void v210_x86_init(V210DecContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
-#if HAVE_YASM
- if (s->aligned_input) {
- if (cpu_flags & AV_CPU_FLAG_SSSE3)
- s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3;
-
- if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
- s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
- }
- else {
- if (cpu_flags & AV_CPU_FLAG_SSSE3)
- s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3;
-
- if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
- s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
- }
-#endif
-}
diff --git a/ffmpeg/libavcodec/x86/v210.asm b/ffmpeg/libavcodec/x86/v210.asm
deleted file mode 100644
index 6554a43..0000000
--- a/ffmpeg/libavcodec/x86/v210.asm
+++ /dev/null
@@ -1,88 +0,0 @@
-;******************************************************************************
-;* V210 SIMD unpack
-;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu>
-;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-v210_mask: times 4 dd 0x3ff
-v210_mult: dw 64,4,64,4,64,4,64,4
-v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
-v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
-
-SECTION .text
-
-%macro v210_planar_unpack 2
-
-; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
-cglobal v210_planar_unpack_%1_%2, 5, 5, 7
- movsxdifnidn r4, r4d
- lea r1, [r1+2*r4]
- add r2, r4
- add r3, r4
- neg r4
-
- mova m3, [v210_mult]
- mova m4, [v210_mask]
- mova m5, [v210_luma_shuf]
- mova m6, [v210_chroma_shuf]
-.loop
-%ifidn %1, unaligned
- movu m0, [r0]
-%else
- mova m0, [r0]
-%endif
-
- pmullw m1, m0, m3
- psrld m0, 10
- psrlw m1, 6 ; u0 v0 y1 y2 v1 u2 y4 y5
- pand m0, m4 ; y0 __ u1 __ y3 __ v2 __
-
- shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __
- pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __
- movu [r1+2*r4], m2
-
- shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __
- pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __
- movq [r2+r4], m1
- movhps [r3+r4], m1
-
- add r0, mmsize
- add r4, 6
- jl .loop
-
- REP_RET
-%endmacro
-
-INIT_XMM
-v210_planar_unpack unaligned, ssse3
-%if HAVE_AVX_EXTERNAL
-INIT_AVX
-v210_planar_unpack unaligned, avx
-%endif
-
-INIT_XMM
-v210_planar_unpack aligned, ssse3
-%if HAVE_AVX_EXTERNAL
-INIT_AVX
-v210_planar_unpack aligned, avx
-%endif
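For reference, a scalar sketch of the same v210 unpack, assuming the standard v210 word layout (Cb Y Cr / Y Cb Y / Cr Y Cb / Y Cr Y across each group of four little-endian 32-bit words) and a width that is a multiple of 6; the function name is illustrative and tail handling is omitted:

    #include <stdint.h>

    /* Each 16-byte group carries 6 luma and 3+3 chroma samples, 10 bits each,
     * matching the "add r0, mmsize" / "add r4, 6" steps in the asm above. */
    static void v210_unpack_c(const uint32_t *src, uint16_t *y,
                              uint16_t *u, uint16_t *v, int width)
    {
        for (int x = 0; x < width; x += 6) {
            uint32_t a = *src++, b = *src++, c = *src++, d = *src++;
            *u++ =  a        & 0x3ff; *y++ = (a >> 10) & 0x3ff; *v++ = (a >> 20) & 0x3ff;
            *y++ =  b        & 0x3ff; *u++ = (b >> 10) & 0x3ff; *y++ = (b >> 20) & 0x3ff;
            *v++ =  c        & 0x3ff; *y++ = (c >> 10) & 0x3ff; *u++ = (c >> 20) & 0x3ff;
            *y++ =  d        & 0x3ff; *v++ = (d >> 10) & 0x3ff; *y++ = (d >> 20) & 0x3ff;
        }
    }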
diff --git a/ffmpeg/libavcodec/x86/vc1dsp.asm b/ffmpeg/libavcodec/x86/vc1dsp.asm
deleted file mode 100644
index 546688c..0000000
--- a/ffmpeg/libavcodec/x86/vc1dsp.asm
+++ /dev/null
@@ -1,317 +0,0 @@
-;******************************************************************************
-;* VC1 deblocking optimizations
-;* Copyright (c) 2009 David Conrad
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-cextern pw_4
-cextern pw_5
-
-section .text
-
-; dst_low, dst_high (src), zero
-; zero-extends one vector from 8 to 16 bits
-%macro UNPACK_8TO16 4
- mova m%2, m%3
- punpckh%1 m%3, m%4
- punpckl%1 m%2, m%4
-%endmacro
-
-%macro STORE_4_WORDS 6
-%if cpuflag(sse4)
- pextrw %1, %5, %6+0
- pextrw %2, %5, %6+1
- pextrw %3, %5, %6+2
- pextrw %4, %5, %6+3
-%else
- movd %6d, %5
-%if mmsize==16
- psrldq %5, 4
-%else
- psrlq %5, 32
-%endif
- mov %1, %6w
- shr %6, 16
- mov %2, %6w
- movd %6d, %5
- mov %3, %6w
- shr %6, 16
- mov %4, %6w
-%endif
-%endmacro
-
-; in: p1 p0 q0 q1, clobbers p0
-; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
-%macro VC1_LOOP_FILTER_A0 4
- psubw %1, %4
- psubw %2, %3
- paddw %1, %1
- pmullw %2, [pw_5]
- psubw %1, %2
- paddw %1, [pw_4]
- psraw %1, 3
-%endmacro
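In scalar form, the macro computes exactly the expression given in its header comment:

    /* Scalar form of the a0 term computed by VC1_LOOP_FILTER_A0 above. */
    static inline int vc1_filter_a0(int p1, int p0, int q0, int q1)
    {
        return (2 * (p1 - q1) - 5 * (p0 - q0) + 4) >> 3;
    }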
-
-; in: p0 q0 a0 a1 a2
-; m0 m1 m7 m6 m5
-; %1: size
-; out: m0=p0' m1=q0'
-%macro VC1_FILTER 1
- PABSW m4, m7
- PABSW m3, m6
- PABSW m2, m5
- mova m6, m4
- pminsw m3, m2
- pcmpgtw m6, m3 ; if (a2 < a0 || a1 < a0)
- psubw m3, m4
- pmullw m3, [pw_5] ; 5*(a3 - a0)
- PABSW m2, m3
- psraw m2, 3 ; abs(d/8)
- pxor m7, m3 ; d_sign ^= a0_sign
-
- pxor m5, m5
- movd m3, r2d
-%if %1 > 4
- punpcklbw m3, m3
-%endif
- punpcklbw m3, m5
- pcmpgtw m3, m4 ; if (a0 < pq)
- pand m6, m3
-
- mova m3, m0
- psubw m3, m1
- PABSW m4, m3
- psraw m4, 1
- pxor m3, m7 ; d_sign ^ clip_sign
- psraw m3, 15
- pminsw m2, m4 ; min(d, clip)
- pcmpgtw m4, m5
- pand m6, m4 ; filt3 (C return value)
-
-; each set of 4 pixels is not filtered if the 3rd is not
-%if mmsize==16
- pshuflw m4, m6, 0xaa
-%if %1 > 4
- pshufhw m4, m4, 0xaa
-%endif
-%else
- pshufw m4, m6, 0xaa
-%endif
- pandn m3, m4
- pand m2, m6
- pand m3, m2 ; d final
-
- psraw m7, 15
- pxor m3, m7
- psubw m3, m7
- psubw m0, m3
- paddw m1, m3
- packuswb m0, m0
- packuswb m1, m1
-%endmacro
-
-; 1st param: size of filter
-; 2nd param: mov suffix equivalent to the filter size
-%macro VC1_V_LOOP_FILTER 2
- pxor m5, m5
- mov%2 m6, [r4]
- mov%2 m4, [r4+r1]
- mov%2 m7, [r4+2*r1]
- mov%2 m0, [r4+r3]
- punpcklbw m6, m5
- punpcklbw m4, m5
- punpcklbw m7, m5
- punpcklbw m0, m5
-
- VC1_LOOP_FILTER_A0 m6, m4, m7, m0
- mov%2 m1, [r0]
- mov%2 m2, [r0+r1]
- punpcklbw m1, m5
- punpcklbw m2, m5
- mova m4, m0
- VC1_LOOP_FILTER_A0 m7, m4, m1, m2
- mov%2 m3, [r0+2*r1]
- mov%2 m4, [r0+r3]
- punpcklbw m3, m5
- punpcklbw m4, m5
- mova m5, m1
- VC1_LOOP_FILTER_A0 m5, m2, m3, m4
-
- VC1_FILTER %1
- mov%2 [r4+r3], m0
- mov%2 [r0], m1
-%endmacro
-
-; 1st param: size of filter
-; NOTE: for UNPACK_8TO16, this many 8-bit values sit in half a register
-; 2nd (optional) param: temp register to use for storing words
-%macro VC1_H_LOOP_FILTER 1-2
-%if %1 == 4
- movq m0, [r0 -4]
- movq m1, [r0+ r1-4]
- movq m2, [r0+2*r1-4]
- movq m3, [r0+ r3-4]
- TRANSPOSE4x4B 0, 1, 2, 3, 4
-%else
- movq m0, [r0 -4]
- movq m4, [r0+ r1-4]
- movq m1, [r0+2*r1-4]
- movq m5, [r0+ r3-4]
- movq m2, [r4 -4]
- movq m6, [r4+ r1-4]
- movq m3, [r4+2*r1-4]
- movq m7, [r4+ r3-4]
- punpcklbw m0, m4
- punpcklbw m1, m5
- punpcklbw m2, m6
- punpcklbw m3, m7
- TRANSPOSE4x4W 0, 1, 2, 3, 4
-%endif
- pxor m5, m5
-
- UNPACK_8TO16 bw, 6, 0, 5
- UNPACK_8TO16 bw, 7, 1, 5
- VC1_LOOP_FILTER_A0 m6, m0, m7, m1
- UNPACK_8TO16 bw, 4, 2, 5
- mova m0, m1 ; m0 = p0
- VC1_LOOP_FILTER_A0 m7, m1, m4, m2
- UNPACK_8TO16 bw, 1, 3, 5
- mova m5, m4
- VC1_LOOP_FILTER_A0 m5, m2, m1, m3
- SWAP 1, 4 ; m1 = q0
-
- VC1_FILTER %1
- punpcklbw m0, m1
-%if %0 > 1
- STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
-%if %1 > 4
- psrldq m0, 4
- STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
-%endif
-%else
- STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
- STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
-%endif
-%endmacro
-
-
-%macro START_V_FILTER 0
- mov r4, r0
- lea r3, [4*r1]
- sub r4, r3
- lea r3, [r1+2*r1]
- imul r2, 0x01010101
-%endmacro
-
-%macro START_H_FILTER 1
- lea r3, [r1+2*r1]
-%if %1 > 4
- lea r4, [r0+4*r1]
-%endif
- imul r2, 0x01010101
-%endmacro
-
-%macro VC1_LF 0
-cglobal vc1_v_loop_filter_internal
- VC1_V_LOOP_FILTER 4, d
- ret
-
-cglobal vc1_h_loop_filter_internal
- VC1_H_LOOP_FILTER 4, r4
- ret
-
-; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
-cglobal vc1_v_loop_filter4, 3,5,0
- START_V_FILTER
- call vc1_v_loop_filter_internal
- RET
-
-; void ff_vc1_h_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
-cglobal vc1_h_loop_filter4, 3,5,0
- START_H_FILTER 4
- call vc1_h_loop_filter_internal
- RET
-
-; void ff_vc1_v_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
-cglobal vc1_v_loop_filter8, 3,5,0
- START_V_FILTER
- call vc1_v_loop_filter_internal
- add r4, 4
- add r0, 4
- call vc1_v_loop_filter_internal
- RET
-
-; void ff_vc1_h_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
-cglobal vc1_h_loop_filter8, 3,5,0
- START_H_FILTER 4
- call vc1_h_loop_filter_internal
- lea r0, [r0+4*r1]
- call vc1_h_loop_filter_internal
- RET
-%endmacro
-
-INIT_MMX mmxext
-VC1_LF
-
-INIT_XMM sse2
-; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
-cglobal vc1_v_loop_filter8, 3,5,8
- START_V_FILTER
- VC1_V_LOOP_FILTER 8, q
- RET
-
-; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
-cglobal vc1_h_loop_filter8, 3,6,8
- START_H_FILTER 8
- VC1_H_LOOP_FILTER 8, r5
- RET
-
-INIT_MMX ssse3
-; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
-cglobal vc1_v_loop_filter4, 3,5,0
- START_V_FILTER
- VC1_V_LOOP_FILTER 4, d
- RET
-
-; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
-cglobal vc1_h_loop_filter4, 3,5,0
- START_H_FILTER 4
- VC1_H_LOOP_FILTER 4, r4
- RET
-
-INIT_XMM ssse3
-; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
-cglobal vc1_v_loop_filter8, 3,5,8
- START_V_FILTER
- VC1_V_LOOP_FILTER 8, q
- RET
-
-; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
-cglobal vc1_h_loop_filter8, 3,6,8
- START_H_FILTER 8
- VC1_H_LOOP_FILTER 8, r5
- RET
-
-INIT_XMM sse4
-; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
-cglobal vc1_h_loop_filter8, 3,5,8
- START_H_FILTER 8
- VC1_H_LOOP_FILTER 8
- RET
diff --git a/ffmpeg/libavcodec/x86/vc1dsp.h b/ffmpeg/libavcodec/x86/vc1dsp.h
deleted file mode 100644
index fdd4de1..0000000
--- a/ffmpeg/libavcodec/x86/vc1dsp.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * VC-1 and WMV3 decoder - X86 DSP init functions
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_VC1DSP_H
-#define AVCODEC_X86_VC1DSP_H
-
-#include "libavcodec/vc1dsp.h"
-
-void ff_vc1dsp_init_mmx(VC1DSPContext *dsp);
-void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp);
-
-#endif /* AVCODEC_X86_VC1DSP_H */
diff --git a/ffmpeg/libavcodec/x86/vc1dsp_init.c b/ffmpeg/libavcodec/x86/vc1dsp_init.c
deleted file mode 100644
index 9f18131..0000000
--- a/ffmpeg/libavcodec/x86/vc1dsp_init.c
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * VC-1 and WMV3 - DSP functions MMX-optimized
- * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "libavutil/cpu.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/vc1dsp.h"
-#include "dsputil_x86.h"
-#include "vc1dsp.h"
-#include "config.h"
-
-#define LOOP_FILTER(EXT) \
-void ff_vc1_v_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
-void ff_vc1_h_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
-void ff_vc1_v_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
-void ff_vc1_h_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
-\
-static void vc1_v_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
-{ \
- ff_vc1_v_loop_filter8_ ## EXT(src, stride, pq); \
- ff_vc1_v_loop_filter8_ ## EXT(src+8, stride, pq); \
-} \
-\
-static void vc1_h_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
-{ \
- ff_vc1_h_loop_filter8_ ## EXT(src, stride, pq); \
- ff_vc1_h_loop_filter8_ ## EXT(src+8*stride, stride, pq); \
-}
-
-#if HAVE_YASM
-LOOP_FILTER(mmxext)
-LOOP_FILTER(sse2)
-LOOP_FILTER(ssse3)
-
-void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq);
-
-static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
-{
- ff_vc1_h_loop_filter8_sse4(src, stride, pq);
- ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
-}
-
-static void avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd)
-{
- ff_avg_pixels8_mmxext(dst, src, stride, 8);
-}
-#endif /* HAVE_YASM */
-
-void ff_put_vc1_chroma_mc8_nornd_mmx (uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-void ff_avg_vc1_chroma_mc8_nornd_mmxext(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-void ff_avg_vc1_chroma_mc8_nornd_3dnow(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
- int stride, int h, int x, int y);
-
-
-av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (INLINE_MMX(cpu_flags))
- ff_vc1dsp_init_mmx(dsp);
-
- if (INLINE_MMXEXT(cpu_flags))
- ff_vc1dsp_init_mmxext(dsp);
-
-#define ASSIGN_LF(EXT) \
- dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_ ## EXT; \
- dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_ ## EXT; \
- dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_ ## EXT; \
- dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_ ## EXT; \
- dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_ ## EXT; \
- dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_ ## EXT
-
-#if HAVE_YASM
- if (EXTERNAL_MMX(cpu_flags)) {
- dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx;
- }
- if (EXTERNAL_AMD3DNOW(cpu_flags)) {
- dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_3dnow;
- }
- if (EXTERNAL_MMXEXT(cpu_flags)) {
- ASSIGN_LF(mmxext);
- dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext;
-
- dsp->avg_vc1_mspel_pixels_tab[0] = avg_vc1_mspel_mc00_mmxext;
- }
- if (EXTERNAL_SSE2(cpu_flags)) {
- dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2;
- dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse2;
- dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2;
- dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2;
- }
- if (EXTERNAL_SSSE3(cpu_flags)) {
- ASSIGN_LF(ssse3);
- dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_ssse3;
- dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_ssse3;
- }
- if (EXTERNAL_SSE4(cpu_flags)) {
- dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse4;
- dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse4;
- }
-#endif /* HAVE_YASM */
-}
diff --git a/ffmpeg/libavcodec/x86/vc1dsp_mmx.c b/ffmpeg/libavcodec/x86/vc1dsp_mmx.c
deleted file mode 100644
index 5ceacd3..0000000
--- a/ffmpeg/libavcodec/x86/vc1dsp_mmx.c
+++ /dev/null
@@ -1,757 +0,0 @@
-/*
- * VC-1 and WMV3 - DSP functions MMX-optimized
- * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "libavutil/cpu.h"
-#include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/vc1dsp.h"
-#include "constants.h"
-#include "dsputil_x86.h"
-#include "vc1dsp.h"
-
-#if HAVE_INLINE_ASM
-
-#define OP_PUT(S,D)
-#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
-
-/** Add rounder from mm7 to mm3 and pack result at destination */
-#define NORMALIZE_MMX(SHIFT) \
- "paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \
- "paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \
- "psraw "SHIFT", %%mm3 \n\t" \
- "psraw "SHIFT", %%mm4 \n\t"
-
-#define TRANSFER_DO_PACK(OP) \
- "packuswb %%mm4, %%mm3 \n\t" \
- OP((%2), %%mm3) \
- "movq %%mm3, (%2) \n\t"
-
-#define TRANSFER_DONT_PACK(OP) \
- OP(0(%2), %%mm3) \
- OP(8(%2), %%mm4) \
- "movq %%mm3, 0(%2) \n\t" \
- "movq %%mm4, 8(%2) \n\t"
-
-/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
-#define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t"
-#define DONT_UNPACK(reg)
-
-/** Compute the rounder 32-r or 8-r and unpack it to mm7 */
-#define LOAD_ROUNDER_MMX(ROUND) \
- "movd "ROUND", %%mm7 \n\t" \
- "punpcklwd %%mm7, %%mm7 \n\t" \
- "punpckldq %%mm7, %%mm7 \n\t"
-
-#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
- "paddw %%mm"#R2", %%mm"#R1" \n\t" \
- "movd (%0,%3), %%mm"#R0" \n\t" \
- "pmullw %%mm6, %%mm"#R1" \n\t" \
- "punpcklbw %%mm0, %%mm"#R0" \n\t" \
- "movd (%0,%2), %%mm"#R3" \n\t" \
- "psubw %%mm"#R0", %%mm"#R1" \n\t" \
- "punpcklbw %%mm0, %%mm"#R3" \n\t" \
- "paddw %%mm7, %%mm"#R1" \n\t" \
- "psubw %%mm"#R3", %%mm"#R1" \n\t" \
- "psraw %4, %%mm"#R1" \n\t" \
- "movq %%mm"#R1", "#OFF"(%1) \n\t" \
- "add %2, %0 \n\t"
-
-/** Sacrificing mm6 allows pipelining loads from src */
-static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
- const uint8_t *src, x86_reg stride,
- int rnd, int64_t shift)
-{
- __asm__ volatile(
- "mov $3, %%"REG_c" \n\t"
- LOAD_ROUNDER_MMX("%5")
- "movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
- "1: \n\t"
- "movd (%0), %%mm2 \n\t"
- "add %2, %0 \n\t"
- "movd (%0), %%mm3 \n\t"
- "punpcklbw %%mm0, %%mm2 \n\t"
- "punpcklbw %%mm0, %%mm3 \n\t"
- SHIFT2_LINE( 0, 1, 2, 3, 4)
- SHIFT2_LINE( 24, 2, 3, 4, 1)
- SHIFT2_LINE( 48, 3, 4, 1, 2)
- SHIFT2_LINE( 72, 4, 1, 2, 3)
- SHIFT2_LINE( 96, 1, 2, 3, 4)
- SHIFT2_LINE(120, 2, 3, 4, 1)
- SHIFT2_LINE(144, 3, 4, 1, 2)
- SHIFT2_LINE(168, 4, 1, 2, 3)
- "sub %6, %0 \n\t"
- "add $8, %1 \n\t"
- "dec %%"REG_c" \n\t"
- "jnz 1b \n\t"
- : "+r"(src), "+r"(dst)
- : "r"(stride), "r"(-2*stride),
- "m"(shift), "m"(rnd), "r"(9*stride-4)
- : "%"REG_c, "memory"
- );
-}
-
-/**
- * Data is already unpacked, so some operations can be performed directly
- * from memory.
- */
-#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
-static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
- const int16_t *src, int rnd)\
-{\
- int h = 8;\
-\
- src -= 1;\
- rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
- __asm__ volatile(\
- LOAD_ROUNDER_MMX("%4")\
- "movq "MANGLE(ff_pw_128)", %%mm6\n\t"\
- "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\
- "1: \n\t"\
- "movq 2*0+0(%1), %%mm1 \n\t"\
- "movq 2*0+8(%1), %%mm2 \n\t"\
- "movq 2*1+0(%1), %%mm3 \n\t"\
- "movq 2*1+8(%1), %%mm4 \n\t"\
- "paddw 2*3+0(%1), %%mm1 \n\t"\
- "paddw 2*3+8(%1), %%mm2 \n\t"\
- "paddw 2*2+0(%1), %%mm3 \n\t"\
- "paddw 2*2+8(%1), %%mm4 \n\t"\
- "pmullw %%mm5, %%mm3 \n\t"\
- "pmullw %%mm5, %%mm4 \n\t"\
- "psubw %%mm1, %%mm3 \n\t"\
- "psubw %%mm2, %%mm4 \n\t"\
- NORMALIZE_MMX("$7")\
- /* Remove bias */\
- "paddw %%mm6, %%mm3 \n\t"\
- "paddw %%mm6, %%mm4 \n\t"\
- TRANSFER_DO_PACK(OP)\
- "add $24, %1 \n\t"\
- "add %3, %2 \n\t"\
- "decl %0 \n\t"\
- "jnz 1b \n\t"\
- : "+r"(h), "+r" (src), "+r" (dst)\
- : "r"(stride), "m"(rnd)\
- : "memory"\
- );\
-}
-
-VC1_HOR_16b_SHIFT2(OP_PUT, put_)
-VC1_HOR_16b_SHIFT2(OP_AVG, avg_)
-
-
-/**
- * Purely vertical or horizontal 1/2 shift interpolation.
- * Sacrifice mm6 for the *9 factor.
- */
-#define VC1_SHIFT2(OP, OPNAME)\
-static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
- x86_reg stride, int rnd, x86_reg offset)\
-{\
- rnd = 8-rnd;\
- __asm__ volatile(\
- "mov $8, %%"REG_c" \n\t"\
- LOAD_ROUNDER_MMX("%5")\
- "movq "MANGLE(ff_pw_9)", %%mm6\n\t"\
- "1: \n\t"\
- "movd 0(%0 ), %%mm3 \n\t"\
- "movd 4(%0 ), %%mm4 \n\t"\
- "movd 0(%0,%2), %%mm1 \n\t"\
- "movd 4(%0,%2), %%mm2 \n\t"\
- "add %2, %0 \n\t"\
- "punpcklbw %%mm0, %%mm3 \n\t"\
- "punpcklbw %%mm0, %%mm4 \n\t"\
- "punpcklbw %%mm0, %%mm1 \n\t"\
- "punpcklbw %%mm0, %%mm2 \n\t"\
- "paddw %%mm1, %%mm3 \n\t"\
- "paddw %%mm2, %%mm4 \n\t"\
- "movd 0(%0,%3), %%mm1 \n\t"\
- "movd 4(%0,%3), %%mm2 \n\t"\
- "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\
- "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\
- "punpcklbw %%mm0, %%mm1 \n\t"\
- "punpcklbw %%mm0, %%mm2 \n\t"\
- "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\
- "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\
- "movd 0(%0,%2), %%mm1 \n\t"\
- "movd 4(%0,%2), %%mm2 \n\t"\
- "punpcklbw %%mm0, %%mm1 \n\t"\
- "punpcklbw %%mm0, %%mm2 \n\t"\
- "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\
- "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\
- NORMALIZE_MMX("$4")\
- "packuswb %%mm4, %%mm3 \n\t"\
- OP((%1), %%mm3)\
- "movq %%mm3, (%1) \n\t"\
- "add %6, %0 \n\t"\
- "add %4, %1 \n\t"\
- "dec %%"REG_c" \n\t"\
- "jnz 1b \n\t"\
- : "+r"(src), "+r"(dst)\
- : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
- "g"(stride-offset)\
- : "%"REG_c, "memory"\
- );\
-}
-
-VC1_SHIFT2(OP_PUT, put_)
-VC1_SHIFT2(OP_AVG, avg_)
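Following the -1,9,9,-1 tap comments, the "rnd = 8-rnd" rounder and the NORMALIZE_MMX("$4") step, one output sample of this half-pel filter reduces to the scalar expression below; the 0..255 saturation done by packuswb is left out:

    /* a,b,c,d are four consecutive samples along the filter direction. */
    static inline int vc1_shift2_sample(int a, int b, int c, int d, int rnd)
    {
        return (9 * (b + c) - a - d + 8 - rnd) >> 4;
    }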
-
-/**
- * Core of the 1/4 and 3/4 shift bicubic interpolation.
- *
- * @param UNPACK Macro unpacking arguments from 8 to 16 bits (can be empty).
- * @param MOVQ   "movd 1", or "movq 2" if the data read is already unpacked.
- * @param A1 Address of 1st tap (beware of unpacked/packed).
- * @param A2 Address of 2nd tap
- * @param A3 Address of 3rd tap
- * @param A4 Address of 4th tap
- */
-#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
- MOVQ "*0+"A1", %%mm1 \n\t" \
- MOVQ "*4+"A1", %%mm2 \n\t" \
- UNPACK("%%mm1") \
- UNPACK("%%mm2") \
- "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \
- "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \
- MOVQ "*0+"A2", %%mm3 \n\t" \
- MOVQ "*4+"A2", %%mm4 \n\t" \
- UNPACK("%%mm3") \
- UNPACK("%%mm4") \
- "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
- "pmullw %%mm6, %%mm4 \n\t" /* *18 */ \
- "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
- "psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \
- MOVQ "*0+"A4", %%mm1 \n\t" \
- MOVQ "*4+"A4", %%mm2 \n\t" \
- UNPACK("%%mm1") \
- UNPACK("%%mm2") \
- "psllw $2, %%mm1 \n\t" /* 4* */ \
- "psllw $2, %%mm2 \n\t" /* 4* */ \
- "psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \
- "psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \
- MOVQ "*0+"A3", %%mm1 \n\t" \
- MOVQ "*4+"A3", %%mm2 \n\t" \
- UNPACK("%%mm1") \
- UNPACK("%%mm2") \
- "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
- "pmullw %%mm5, %%mm2 \n\t" /* *53 */ \
- "paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \
- "paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */
-
-/**
- * Macro to build the vertical 16bits version of vc1_put_shift[13].
- * Here, offset=src_stride. Parameters passed A1 to A4 must use
- * %3 (src_stride) and %4 (3*src_stride).
- *
- * @param NAME Either shift1 or shift3
- * @see MSPEL_FILTER13_CORE for information on A1->A4
- */
-#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
-static void \
-vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
- x86_reg src_stride, \
- int rnd, int64_t shift) \
-{ \
- int h = 8; \
- src -= src_stride; \
- __asm__ volatile( \
- LOAD_ROUNDER_MMX("%5") \
- "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \
- "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \
- ".p2align 3 \n\t" \
- "1: \n\t" \
- MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
- NORMALIZE_MMX("%6") \
- TRANSFER_DONT_PACK(OP_PUT) \
- /* Last 3 (in fact 4) bytes on the line */ \
- "movd 8+"A1", %%mm1 \n\t" \
- DO_UNPACK("%%mm1") \
- "movq %%mm1, %%mm3 \n\t" \
- "paddw %%mm1, %%mm1 \n\t" \
- "paddw %%mm3, %%mm1 \n\t" /* 3* */ \
- "movd 8+"A2", %%mm3 \n\t" \
- DO_UNPACK("%%mm3") \
- "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
- "psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \
- "movd 8+"A3", %%mm1 \n\t" \
- DO_UNPACK("%%mm1") \
- "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
- "paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \
- "movd 8+"A4", %%mm1 \n\t" \
- DO_UNPACK("%%mm1") \
- "psllw $2, %%mm1 \n\t" /* 4* */ \
- "psubw %%mm1, %%mm3 \n\t" \
- "paddw %%mm7, %%mm3 \n\t" \
- "psraw %6, %%mm3 \n\t" \
- "movq %%mm3, 16(%2) \n\t" \
- "add %3, %1 \n\t" \
- "add $24, %2 \n\t" \
- "decl %0 \n\t" \
- "jnz 1b \n\t" \
- : "+r"(h), "+r" (src), "+r" (dst) \
- : "r"(src_stride), "r"(3*src_stride), \
- "m"(rnd), "m"(shift) \
- : "memory" \
- ); \
-}
-
-/**
- * Macro to build the horizontal 16bits version of vc1_put_shift[13].
- * Here the data is 16 bits wide, so the offsets are constants and the
- * parameters passed as A1 to A4 are simple addresses.
- *
- * @param NAME Either shift1 or shift3
- * @see MSPEL_FILTER13_CORE for information on A1->A4
- */
-#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
-static void \
-OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
- const int16_t *src, int rnd) \
-{ \
- int h = 8; \
- src -= 1; \
- rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
- __asm__ volatile( \
- LOAD_ROUNDER_MMX("%4") \
- "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
- "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
- ".p2align 3 \n\t" \
- "1: \n\t" \
- MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
- NORMALIZE_MMX("$7") \
- /* Remove bias */ \
- "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \
- "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \
- TRANSFER_DO_PACK(OP) \
- "add $24, %1 \n\t" \
- "add %3, %2 \n\t" \
- "decl %0 \n\t" \
- "jnz 1b \n\t" \
- : "+r"(h), "+r" (src), "+r" (dst) \
- : "r"(stride), "m"(rnd) \
- : "memory" \
- ); \
-}
-
-/**
- * Macro to build the 8bits, any direction, version of vc1_put_shift[13].
- * Here, offset=src_stride. Parameters passed A1 to A4 must use
- * %3 (offset) and %4 (3*offset).
- *
- * @param NAME Either shift1 or shift3
- * @see MSPEL_FILTER13_CORE for information on A1->A4
- */
-#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
-static void \
-OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
- x86_reg stride, int rnd, x86_reg offset) \
-{ \
- int h = 8; \
- src -= offset; \
- rnd = 32-rnd; \
- __asm__ volatile ( \
- LOAD_ROUNDER_MMX("%6") \
- "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
- "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
- ".p2align 3 \n\t" \
- "1: \n\t" \
- MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
- NORMALIZE_MMX("$6") \
- TRANSFER_DO_PACK(OP) \
- "add %5, %1 \n\t" \
- "add %5, %2 \n\t" \
- "decl %0 \n\t" \
- "jnz 1b \n\t" \
- : "+r"(h), "+r" (src), "+r" (dst) \
- : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
- : "memory" \
- ); \
-}
-
-/** 1/4 shift bicubic interpolation */
-MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
-MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
-MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
-MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
-MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)
-
-/** 3/4 shift bicubic interpolation */
-MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
-MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
-MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
-MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
-MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)
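Likewise, the shift1 instantiations above boil down to VC-1's quarter-sample bicubic filter, taps (-4, 53, 18, -3) with a divide by 64; the shift3 variants simply mirror the taps. A rough scalar equivalent of the 8-bit put_ path, again with illustrative names:

#include <stdint.h>

/* Scalar sketch of the 8-bit 1/4-shift filter built by
 * MSPEL_FILTER13_8B(shift1, ...): taps (-4, 53, 18, -3), divide by 64.
 * 'offset' is 1 for horizontal or 'stride' for vertical filtering. */
static void put_vc1_shift1_ref(uint8_t *dst, const uint8_t *src,
                               int stride, int rnd, int offset)
{
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++) {
            int v = -4 * src[x - offset] + 53 * src[x]
                  + 18 * src[x + offset] -  3 * src[x + 2 * offset];
            v = (v + 32 - rnd) >> 6;                 /* rnd = 32 - rnd in asm */
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        src += stride;
        dst += stride;
    }
}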
-
-typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
-typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
-typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);
-
-/**
- * Interpolate fractional pel values by applying proper vertical then
- * horizontal filter.
- *
- * @param dst Destination buffer for interpolated pels.
- * @param src Source buffer.
- * @param stride Stride for both src and dst buffers.
- * @param hmode Horizontal filter (expressed in quarter pixels shift).
- * @param vmode Vertical filter.
- * @param rnd Rounding bias.
- */
-#define VC1_MSPEL_MC(OP)\
-static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
- int hmode, int vmode, int rnd)\
-{\
- static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
- { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
- static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
- { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
- static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
- { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
-\
- __asm__ volatile(\
- "pxor %%mm0, %%mm0 \n\t"\
- ::: "memory"\
- );\
-\
- if (vmode) { /* Vertical filter to apply */\
- if (hmode) { /* Horizontal filter to apply, output to tmp */\
- static const int shift_value[] = { 0, 5, 1, 5 };\
- int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
- int r;\
- DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
-\
- r = (1<<(shift-1)) + rnd-1;\
- vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
-\
- vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
- return;\
- }\
- else { /* No horizontal filter, output 8 lines to dst */\
- vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
- return;\
- }\
- }\
-\
- /* Horizontal mode with no vertical mode */\
- vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
-}
-
-VC1_MSPEL_MC(put_)
-VC1_MSPEL_MC(avg_)
-
-/** Macro to ease the declaration of the bicubic interpolation filter functions */
-#define DECLARE_FUNCTION(a, b) \
-static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, \
- const uint8_t *src, \
- ptrdiff_t stride, \
- int rnd) \
-{ \
- put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
-}\
-static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \
- const uint8_t *src, \
- ptrdiff_t stride, \
- int rnd) \
-{ \
- avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
-}
-
-DECLARE_FUNCTION(0, 1)
-DECLARE_FUNCTION(0, 2)
-DECLARE_FUNCTION(0, 3)
-
-DECLARE_FUNCTION(1, 0)
-DECLARE_FUNCTION(1, 1)
-DECLARE_FUNCTION(1, 2)
-DECLARE_FUNCTION(1, 3)
-
-DECLARE_FUNCTION(2, 0)
-DECLARE_FUNCTION(2, 1)
-DECLARE_FUNCTION(2, 2)
-DECLARE_FUNCTION(2, 3)
-
-DECLARE_FUNCTION(3, 0)
-DECLARE_FUNCTION(3, 1)
-DECLARE_FUNCTION(3, 2)
-DECLARE_FUNCTION(3, 3)
-
-static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
- int16_t *block)
-{
- int dc = block[0];
- dc = (17 * dc + 4) >> 3;
- dc = (17 * dc + 64) >> 7;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- __asm__ volatile(
- "movd %0, %%mm2 \n\t"
- "movd %1, %%mm3 \n\t"
- "movd %2, %%mm4 \n\t"
- "movd %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movd %%mm2, %0 \n\t"
- "movd %%mm3, %1 \n\t"
- "movd %%mm4, %2 \n\t"
- "movd %%mm5, %3 \n\t"
- :"+m"(*(uint32_t*)(dest+0*linesize)),
- "+m"(*(uint32_t*)(dest+1*linesize)),
- "+m"(*(uint32_t*)(dest+2*linesize)),
- "+m"(*(uint32_t*)(dest+3*linesize))
- );
-}
-
-static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
- int16_t *block)
-{
- int dc = block[0];
- dc = (17 * dc + 4) >> 3;
- dc = (12 * dc + 64) >> 7;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- __asm__ volatile(
- "movd %0, %%mm2 \n\t"
- "movd %1, %%mm3 \n\t"
- "movd %2, %%mm4 \n\t"
- "movd %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movd %%mm2, %0 \n\t"
- "movd %%mm3, %1 \n\t"
- "movd %%mm4, %2 \n\t"
- "movd %%mm5, %3 \n\t"
- :"+m"(*(uint32_t*)(dest+0*linesize)),
- "+m"(*(uint32_t*)(dest+1*linesize)),
- "+m"(*(uint32_t*)(dest+2*linesize)),
- "+m"(*(uint32_t*)(dest+3*linesize))
- );
- dest += 4*linesize;
- __asm__ volatile(
- "movd %0, %%mm2 \n\t"
- "movd %1, %%mm3 \n\t"
- "movd %2, %%mm4 \n\t"
- "movd %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movd %%mm2, %0 \n\t"
- "movd %%mm3, %1 \n\t"
- "movd %%mm4, %2 \n\t"
- "movd %%mm5, %3 \n\t"
- :"+m"(*(uint32_t*)(dest+0*linesize)),
- "+m"(*(uint32_t*)(dest+1*linesize)),
- "+m"(*(uint32_t*)(dest+2*linesize)),
- "+m"(*(uint32_t*)(dest+3*linesize))
- );
-}
-
-static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
- int16_t *block)
-{
- int dc = block[0];
- dc = ( 3 * dc + 1) >> 1;
- dc = (17 * dc + 64) >> 7;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- __asm__ volatile(
- "movq %0, %%mm2 \n\t"
- "movq %1, %%mm3 \n\t"
- "movq %2, %%mm4 \n\t"
- "movq %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movq %%mm2, %0 \n\t"
- "movq %%mm3, %1 \n\t"
- "movq %%mm4, %2 \n\t"
- "movq %%mm5, %3 \n\t"
- :"+m"(*(uint32_t*)(dest+0*linesize)),
- "+m"(*(uint32_t*)(dest+1*linesize)),
- "+m"(*(uint32_t*)(dest+2*linesize)),
- "+m"(*(uint32_t*)(dest+3*linesize))
- );
-}
-
-static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
- int16_t *block)
-{
- int dc = block[0];
- dc = (3 * dc + 1) >> 1;
- dc = (3 * dc + 16) >> 5;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- __asm__ volatile(
- "movq %0, %%mm2 \n\t"
- "movq %1, %%mm3 \n\t"
- "movq %2, %%mm4 \n\t"
- "movq %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movq %%mm2, %0 \n\t"
- "movq %%mm3, %1 \n\t"
- "movq %%mm4, %2 \n\t"
- "movq %%mm5, %3 \n\t"
- :"+m"(*(uint32_t*)(dest+0*linesize)),
- "+m"(*(uint32_t*)(dest+1*linesize)),
- "+m"(*(uint32_t*)(dest+2*linesize)),
- "+m"(*(uint32_t*)(dest+3*linesize))
- );
- dest += 4*linesize;
- __asm__ volatile(
- "movq %0, %%mm2 \n\t"
- "movq %1, %%mm3 \n\t"
- "movq %2, %%mm4 \n\t"
- "movq %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movq %%mm2, %0 \n\t"
- "movq %%mm3, %1 \n\t"
- "movq %%mm4, %2 \n\t"
- "movq %%mm5, %3 \n\t"
- :"+m"(*(uint32_t*)(dest+0*linesize)),
- "+m"(*(uint32_t*)(dest+1*linesize)),
- "+m"(*(uint32_t*)(dest+2*linesize)),
- "+m"(*(uint32_t*)(dest+3*linesize))
- );
-}
-
-static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd)
-{
- ff_put_pixels8_mmx(dst, src, stride, 8);
-}
-
-av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
-{
- dsp->put_vc1_mspel_pixels_tab[ 0] = put_vc1_mspel_mc00_mmx;
- dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
- dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
- dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
-
- dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
- dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
- dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
- dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;
-
- dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
- dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
- dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
- dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;
-
- dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
- dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
- dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
- dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
-}
-
-av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
-{
- dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext;
-
- dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext;
-
- dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext;
-
- dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext;
- dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext;
-
- dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
- dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
- dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
- dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
-}
-#endif /* HAVE_INLINE_ASM */
diff --git a/ffmpeg/libavcodec/x86/videodsp.asm b/ffmpeg/libavcodec/x86/videodsp.asm
deleted file mode 100644
index 1ac0257..0000000
--- a/ffmpeg/libavcodec/x86/videodsp.asm
+++ /dev/null
@@ -1,444 +0,0 @@
-;******************************************************************************
-;* Core video DSP functions
-;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION .text
-
-; slow vertical extension loop function. Works with variable width, and
-; does per-line reading/writing of source data
-
-%macro V_COPY_ROW 2 ; type (top/body/bottom), h
-.%1_y_loop: ; do {
- mov wq, r7mp ; initialize w (r7mp = wmp)
-.%1_x_loop: ; do {
- movu m0, [srcq+wq] ; m0 = read($mmsize)
- movu [dstq+wq], m0 ; write(m0, $mmsize)
- add wq, mmsize ; w -= $mmsize
- cmp wq, -mmsize ; } while (w > $mmsize);
- jl .%1_x_loop
- movu m0, [srcq-mmsize] ; m0 = read($mmsize)
- movu [dstq-mmsize], m0 ; write(m0, $mmsize)
-%ifidn %1, body ; if ($type == body) {
- add srcq, src_strideq ; src += src_stride
-%endif ; }
- add dstq, dst_strideq ; dst += dst_stride
- dec %2 ; } while (--$h);
- jnz .%1_y_loop
-%endmacro
-
-%macro vvar_fn 0
-; .----. <- zero
-; | | <- top is copied from first line in body of source
-; |----| <- start_y
-; | | <- body is copied verbatim (line-by-line) from source
-; |----| <- end_y
-; | | <- bottom is copied from last line in body of source
-; '----' <- bh
-%if ARCH_X86_64
-cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
- start_y, end_y, bh, w
-%else ; x86-32
-cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
-%define src_strideq r3mp
-%define dst_strideq r1mp
- mov srcq, r2mp
- mov start_yq, r4mp
- mov end_yq, r5mp
- mov bhq, r6mp
-%endif
- sub bhq, end_yq ; bh -= end_y
- sub end_yq, start_yq ; end_y -= start_y
- add srcq, r7mp ; (r7mp = wmp)
- add dstq, r7mp ; (r7mp = wmp)
- neg r7mp ; (r7mp = wmp)
- test start_yq, start_yq ; if (start_q) {
- jz .body
- V_COPY_ROW top, start_yq ; v_copy_row(top, start_yq)
-.body: ; }
- V_COPY_ROW body, end_yq ; v_copy_row(body, end_yq)
- test bhq, bhq ; if (bh) {
- jz .end
- sub srcq, src_strideq ; src -= src_stride
- V_COPY_ROW bottom, bhq ; v_copy_row(bottom, bh)
-.end: ; }
- RET
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX mmx
-vvar_fn
-%endif
-
-INIT_XMM sse
-vvar_fn
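In C terms, the variable-width vertical extension above amounts to the following sketch (same start_y/end_y/bh contract as the asm; the function name is illustrative):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void emu_edge_vvar_ref(uint8_t *dst, ptrdiff_t dst_stride,
                              const uint8_t *src, ptrdiff_t src_stride,
                              int start_y, int end_y, int bh, int w)
{
    int y;
    bh    -= end_y;                                    /* rows below body   */
    end_y -= start_y;                                  /* body rows in src  */
    for (y = 0; y < start_y; y++, dst += dst_stride)   /* top: repeat first */
        memcpy(dst, src, w);                           /* body line upwards */
    for (y = 0; y < end_y; y++, dst += dst_stride, src += src_stride)
        memcpy(dst, src, w);                           /* body: plain copy  */
    src -= src_stride;                                 /* last body line    */
    for (y = 0; y < bh; y++, dst += dst_stride)        /* bottom: repeat it */
        memcpy(dst, src, w);
}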
-
-%macro hvar_fn 0
-cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
- lea dstq, [dstq+n_wordsq*2]
- neg n_wordsq
- lea start_xq, [start_xq+n_wordsq*2]
-.y_loop: ; do {
- ; FIXME also write a ssse3 version using pshufb
- movzx wd, byte [dstq+start_xq] ; w = read(1)
- imul wd, 0x01010101 ; w *= 0x01010101
- movd m0, wd
- mov wq, n_wordsq ; initialize w
-%if cpuflag(sse2)
- pshufd m0, m0, q0000 ; splat
-%else ; mmx
- punpckldq m0, m0 ; splat
-%endif ; mmx/sse
-.x_loop: ; do {
- movu [dstq+wq*2], m0 ; write($reg, $mmsize)
- add wq, mmsize/2 ; w -= $mmsize/2
- cmp wq, -mmsize/2 ; } while (w > $mmsize/2)
- jl .x_loop
- movu [dstq-mmsize], m0 ; write($reg, $mmsize)
- add dstq, dst_strideq ; dst += dst_stride
- dec hq ; } while (h--)
- jnz .y_loop
- RET
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX mmx
-hvar_fn
-%endif
-
-INIT_XMM sse2
-hvar_fn
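The horizontal variable-width extension reduces to a per-row byte splat; roughly (illustrative sketch only):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sketch of emu_edge_hvar: each row is padded with 2*n_words copies of the
 * pixel found at dst[start_x] (the nearest valid column). */
static void emu_edge_hvar_ref(uint8_t *dst, ptrdiff_t dst_stride,
                              int start_x, int n_words, int h)
{
    while (h--) {
        memset(dst, dst[start_x], 2 * (size_t)n_words);
        dst += dst_stride;
    }
}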
-
-; macro to read/write a horizontal number of pixels (%2) to/from registers
-; on sse, - fills xmm0-15 for consecutive sets of 16 pixels
-; - if (%2 & 8) fills 8 bytes into xmm$next
-; - if (%2 & 4) fills 4 bytes into xmm$next
-; - if (%2 & 3) fills 1, 2 or 4 bytes in eax
-; on mmx, - fills mm0-7 for consecutive sets of 8 pixels
-; - if (%2 & 4) fills 4 bytes into mm$next
-; - if (%2 & 3) fills 1, 2 or 4 bytes in eax
-; writing data out is in the same way
-%macro READ_NUM_BYTES 2
-%assign %%off 0 ; offset in source buffer
-%assign %%mmx_idx 0 ; mmx register index
-%assign %%xmm_idx 0 ; xmm register index
-
-%rep %2/mmsize
-%if mmsize == 16
- movu xmm %+ %%xmm_idx, [srcq+%%off]
-%assign %%xmm_idx %%xmm_idx+1
-%else ; mmx
- movu mm %+ %%mmx_idx, [srcq+%%off]
-%assign %%mmx_idx %%mmx_idx+1
-%endif
-%assign %%off %%off+mmsize
-%endrep ; %2/mmsize
-
-%if mmsize == 16
-%if (%2-%%off) >= 8
-%if %2 > 16 && (%2-%%off) > 8
- movu xmm %+ %%xmm_idx, [srcq+%2-16]
-%assign %%xmm_idx %%xmm_idx+1
-%assign %%off %2
-%else
- movq mm %+ %%mmx_idx, [srcq+%%off]
-%assign %%mmx_idx %%mmx_idx+1
-%assign %%off %%off+8
-%endif
-%endif ; (%2-%%off) >= 8
-%endif
-
-%if (%2-%%off) >= 4
-%if %2 > 8 && (%2-%%off) > 4
- movq mm %+ %%mmx_idx, [srcq+%2-8]
-%assign %%off %2
-%else
- movd mm %+ %%mmx_idx, [srcq+%%off]
-%assign %%off %%off+4
-%endif
-%assign %%mmx_idx %%mmx_idx+1
-%endif ; (%2-%%off) >= 4
-
-%if (%2-%%off) >= 1
-%if %2 >= 4
- movd mm %+ %%mmx_idx, [srcq+%2-4]
-%elif (%2-%%off) == 1
- mov valb, [srcq+%2-1]
-%elif (%2-%%off) == 2
- mov valw, [srcq+%2-2]
-%elifidn %1, body
- mov vald, [srcq+%2-3]
-%else
- movd mm %+ %%mmx_idx, [srcq+%2-3]
-%endif
-%endif ; (%2-%%off) >= 1
-%endmacro ; READ_NUM_BYTES
-
-%macro WRITE_NUM_BYTES 2
-%assign %%off 0 ; offset in destination buffer
-%assign %%mmx_idx 0 ; mmx register index
-%assign %%xmm_idx 0 ; xmm register index
-
-%rep %2/mmsize
-%if mmsize == 16
- movu [dstq+%%off], xmm %+ %%xmm_idx
-%assign %%xmm_idx %%xmm_idx+1
-%else ; mmx
- movu [dstq+%%off], mm %+ %%mmx_idx
-%assign %%mmx_idx %%mmx_idx+1
-%endif
-%assign %%off %%off+mmsize
-%endrep ; %2/mmsize
-
-%if mmsize == 16
-%if (%2-%%off) >= 8
-%if %2 > 16 && (%2-%%off) > 8
- movu [dstq+%2-16], xmm %+ %%xmm_idx
-%assign %%xmm_idx %%xmm_idx+1
-%assign %%off %2
-%else
- movq [dstq+%%off], mm %+ %%mmx_idx
-%assign %%mmx_idx %%mmx_idx+1
-%assign %%off %%off+8
-%endif
-%endif ; (%2-%%off) >= 8
-%endif
-
-%if (%2-%%off) >= 4
-%if %2 > 8 && (%2-%%off) > 4
- movq [dstq+%2-8], mm %+ %%mmx_idx
-%assign %%off %2
-%else
- movd [dstq+%%off], mm %+ %%mmx_idx
-%assign %%off %%off+4
-%endif
-%assign %%mmx_idx %%mmx_idx+1
-%endif ; (%2-%%off) >= 4
-
-%if (%2-%%off) >= 1
-%if %2 >= 4
- movd [dstq+%2-4], mm %+ %%mmx_idx
-%elif (%2-%%off) == 1
- mov [dstq+%2-1], valb
-%elif (%2-%%off) == 2
- mov [dstq+%2-2], valw
-%elifidn %1, body
- mov [dstq+%2-3], valw
- shr vald, 16
- mov [dstq+%2-1], valb
-%else
- movd vald, mm %+ %%mmx_idx
- mov [dstq+%2-3], valw
- shr vald, 16
- mov [dstq+%2-1], valb
-%endif
-%endif ; (%2-%%off) >= 1
-%endmacro ; WRITE_NUM_BYTES
-
-; vertical top/bottom extend and body copy fast loops
-; these are function pointers to set-width line copy functions, i.e.
-; they read a fixed number of pixels into set registers, and write
-; those out into the destination buffer
-%macro VERTICAL_EXTEND 2
-%assign %%n %1
-%rep 1+%2-%1
-%if %%n <= 3
-%if ARCH_X86_64
-cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
- start_y, end_y, val, bh
- mov bhq, r6mp ; r6mp = bhmp
-%else ; x86-32
-cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
- mov dstq, r0mp
- mov srcq, r2mp
- mov start_yq, r4mp
- mov end_yq, r5mp
- mov bhq, r6mp
-%define dst_strideq r1mp
-%define src_strideq r3mp
-%endif ; x86-64/32
-%else
-%if ARCH_X86_64
-cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
- start_y, end_y, bh
-%else ; x86-32
-cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
- mov srcq, r2mp
- mov start_yq, r4mp
- mov end_yq, r5mp
- mov bhq, r6mp
-%define dst_strideq r1mp
-%define src_strideq r3mp
-%endif ; x86-64/32
-%endif
- ; FIXME move this to c wrapper?
- sub bhq, end_yq ; bh -= end_y
- sub end_yq, start_yq ; end_y -= start_y
-
- ; extend pixels above body
- test start_yq, start_yq ; if (start_y) {
- jz .body_loop
- READ_NUM_BYTES top, %%n ; $variable_regs = read($n)
-.top_loop: ; do {
- WRITE_NUM_BYTES top, %%n ; write($variable_regs, $n)
- add dstq, dst_strideq ; dst += linesize
- dec start_yq ; } while (--start_y)
- jnz .top_loop ; }
-
- ; copy body pixels
-.body_loop: ; do {
- READ_NUM_BYTES body, %%n ; $variable_regs = read($n)
- WRITE_NUM_BYTES body, %%n ; write($variable_regs, $n)
- add dstq, dst_strideq ; dst += dst_stride
- add srcq, src_strideq ; src += src_stride
- dec end_yq ; } while (--end_y)
- jnz .body_loop
-
- ; copy bottom pixels
- test bhq, bhq ; if (block_h) {
- jz .end
- sub srcq, src_strideq ; src -= linesize
- READ_NUM_BYTES bottom, %%n ; $variable_regs = read($n)
-.bottom_loop: ; do {
- WRITE_NUM_BYTES bottom, %%n ; write($variable_regs, $n)
- add dstq, dst_strideq ; dst += linesize
- dec bhq ; } while (--bh)
- jnz .bottom_loop ; }
-
-.end:
- RET
-%assign %%n %%n+1
-%endrep ; 1+%2-%1
-%endmacro ; VERTICAL_EXTEND
-
-INIT_MMX mmx
-VERTICAL_EXTEND 1, 15
-%if ARCH_X86_32
-VERTICAL_EXTEND 16, 22
-%endif
-
-INIT_XMM sse
-VERTICAL_EXTEND 16, 22
-
-; left/right (horizontal) fast extend functions
-; these are essentially identical to the vertical extend ones above,
-; just split into left/right variants because the number of pixels to extend
-; generally differs on the two sides.
-
-%macro READ_V_PIXEL 2
- movzx vald, byte %2
- imul vald, 0x01010101
-%if %1 >= 8
- movd m0, vald
-%if mmsize == 16
- pshufd m0, m0, q0000
-%else
- punpckldq m0, m0
-%endif ; mmsize == 16
-%endif ; %1 >= 8
-%endmacro ; READ_V_PIXEL
-
-%macro WRITE_V_PIXEL 2
-%assign %%off 0
-
-%if %1 >= 8
-
-%rep %1/mmsize
- movu [%2+%%off], m0
-%assign %%off %%off+mmsize
-%endrep ; %1/mmsize
-
-%if mmsize == 16
-%if %1-%%off >= 8
-%if %1 > 16 && %1-%%off > 8
- movu [%2+%1-16], m0
-%assign %%off %1
-%else
- movq [%2+%%off], m0
-%assign %%off %%off+8
-%endif
-%endif ; %1-%%off >= 8
-%endif ; mmsize == 16
-
-%if %1-%%off >= 4
-%if %1 > 8 && %1-%%off > 4
- movq [%2+%1-8], m0
-%assign %%off %1
-%else
- movd [%2+%%off], m0
-%assign %%off %%off+4
-%endif
-%endif ; %1-%%off >= 4
-
-%else ; %1 < 8
-
-%rep %1/4
- mov [%2+%%off], vald
-%assign %%off %%off+4
-%endrep ; %1/4
-
-%endif ; %1 >=/< 8
-
-%if %1-%%off == 2
- mov [%2+%%off], valw
-%endif ; %1-%%off == 2
-%endmacro ; WRITE_V_PIXEL
-
-%macro H_EXTEND 2
-%assign %%n %1
-%rep 1+(%2-%1)/2
-cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
-.loop_y: ; do {
- READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = read($n)
- WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n)
- add dstq, dst_strideq ; dst += dst_stride
- dec bhq ; } while (--bh)
- jnz .loop_y
- RET
-%assign %%n %%n+2
-%endrep ; 1+(%2-%1)/2
-%endmacro ; H_EXTEND
-
-INIT_MMX mmx
-H_EXTEND 2, 14
-%if ARCH_X86_32
-H_EXTEND 16, 22
-%endif
-
-INIT_XMM sse2
-H_EXTEND 16, 22
-
-%macro PREFETCH_FN 1
-cglobal prefetch, 3, 3, 0, buf, stride, h
-.loop:
- %1 [bufq]
- add bufq, strideq
- dec hd
- jg .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-PREFETCH_FN prefetcht0
-%if ARCH_X86_32
-INIT_MMX 3dnow
-PREFETCH_FN prefetch
-%endif
diff --git a/ffmpeg/libavcodec/x86/videodsp_init.c b/ffmpeg/libavcodec/x86/videodsp_init.c
deleted file mode 100644
index 2013a93..0000000
--- a/ffmpeg/libavcodec/x86/videodsp_init.c
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright (C) 2002-2012 Michael Niedermayer
- * Copyright (C) 2012 Ronald S. Bultje
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavutil/avassert.h"
-#include "libavutil/common.h"
-#include "libavutil/cpu.h"
-#include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/videodsp.h"
-
-#if HAVE_YASM
-typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride,
- const uint8_t *src, x86_reg src_stride,
- x86_reg start_y, x86_reg end_y, x86_reg bh);
-typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
- const uint8_t *src, x86_reg src_stride,
- x86_reg start_y, x86_reg end_y, x86_reg bh,
- x86_reg w);
-
-extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix16_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix17_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix18_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix19_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx;
-#if ARCH_X86_32
-static emu_edge_vfix_func *vfixtbl_mmx[22] = {
- &ff_emu_edge_vfix1_mmx, &ff_emu_edge_vfix2_mmx, &ff_emu_edge_vfix3_mmx,
- &ff_emu_edge_vfix4_mmx, &ff_emu_edge_vfix5_mmx, &ff_emu_edge_vfix6_mmx,
- &ff_emu_edge_vfix7_mmx, &ff_emu_edge_vfix8_mmx, &ff_emu_edge_vfix9_mmx,
- &ff_emu_edge_vfix10_mmx, &ff_emu_edge_vfix11_mmx, &ff_emu_edge_vfix12_mmx,
- &ff_emu_edge_vfix13_mmx, &ff_emu_edge_vfix14_mmx, &ff_emu_edge_vfix15_mmx,
- &ff_emu_edge_vfix16_mmx, &ff_emu_edge_vfix17_mmx, &ff_emu_edge_vfix18_mmx,
- &ff_emu_edge_vfix19_mmx, &ff_emu_edge_vfix20_mmx, &ff_emu_edge_vfix21_mmx,
- &ff_emu_edge_vfix22_mmx
-};
-#endif
-extern emu_edge_vvar_func ff_emu_edge_vvar_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix16_sse;
-extern emu_edge_vfix_func ff_emu_edge_vfix17_sse;
-extern emu_edge_vfix_func ff_emu_edge_vfix18_sse;
-extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
-extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
-extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
-extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
-static emu_edge_vfix_func *vfixtbl_sse[22] = {
- ff_emu_edge_vfix1_mmx, ff_emu_edge_vfix2_mmx, ff_emu_edge_vfix3_mmx,
- ff_emu_edge_vfix4_mmx, ff_emu_edge_vfix5_mmx, ff_emu_edge_vfix6_mmx,
- ff_emu_edge_vfix7_mmx, ff_emu_edge_vfix8_mmx, ff_emu_edge_vfix9_mmx,
- ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx,
- ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx,
- ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse,
- ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse,
- ff_emu_edge_vfix22_sse
-};
-extern emu_edge_vvar_func ff_emu_edge_vvar_sse;
-
-typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride,
- x86_reg start_x, x86_reg bh);
-typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride,
- x86_reg start_x, x86_reg n_words, x86_reg bh);
-
-extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix16_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx;
-#if ARCH_X86_32
-static emu_edge_hfix_func *hfixtbl_mmx[11] = {
- ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
- ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
- ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_mmx, ff_emu_edge_hfix18_mmx,
- ff_emu_edge_hfix20_mmx, ff_emu_edge_hfix22_mmx
-};
-#endif
-extern emu_edge_hvar_func ff_emu_edge_hvar_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
-extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
-extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
-extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
-static emu_edge_hfix_func *hfixtbl_sse2[11] = {
- ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
- ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
- ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
- ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
-};
-extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
-
-static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride,
- ptrdiff_t src_stride,
- x86_reg block_w, x86_reg block_h,
- x86_reg src_x, x86_reg src_y,
- x86_reg w, x86_reg h,
- emu_edge_vfix_func **vfix_tbl,
- emu_edge_vvar_func *v_extend_var,
- emu_edge_hfix_func **hfix_tbl,
- emu_edge_hvar_func *h_extend_var)
-{
- x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p;
-
- if (!w || !h)
- return;
-
- if (src_y >= h) {
- src -= src_y*src_stride;
- src_y_add = h - 1;
- src_y = h - 1;
- } else if (src_y <= -block_h) {
- src -= src_y*src_stride;
- src_y_add = 1 - block_h;
- src_y = 1 - block_h;
- }
- if (src_x >= w) {
- src += w - 1 - src_x;
- src_x = w - 1;
- } else if (src_x <= -block_w) {
- src += 1 - block_w - src_x;
- src_x = 1 - block_w;
- }
-
- start_y = FFMAX(0, -src_y);
- start_x = FFMAX(0, -src_x);
- end_y = FFMIN(block_h, h-src_y);
- end_x = FFMIN(block_w, w-src_x);
- av_assert2(start_x < end_x && block_w > 0);
- av_assert2(start_y < end_y && block_h > 0);
-
- // fill in the to-be-copied part plus all above/below
- src += (src_y_add + start_y) * src_stride + start_x;
- w = end_x - start_x;
- if (w <= 22) {
- vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride,
- start_y, end_y, block_h);
- } else {
- v_extend_var(dst + start_x, dst_stride, src, src_stride,
- start_y, end_y, block_h, w);
- }
-
- // fill left
- if (start_x) {
- if (start_x <= 22) {
- hfix_tbl[(start_x - 1) >> 1](dst, dst_stride, start_x, block_h);
- } else {
- h_extend_var(dst, dst_stride,
- start_x, (start_x + 1) >> 1, block_h);
- }
- }
-
- // fill right
- p = block_w - end_x;
- if (p) {
- if (p <= 22) {
- hfix_tbl[(p - 1) >> 1](dst + end_x - (p & 1), dst_stride,
- -!(p & 1), block_h);
- } else {
- h_extend_var(dst + end_x - (p & 1), dst_stride,
- -!(p & 1), (p + 1) >> 1, block_h);
- }
- }
-}
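Functionally, all of the fixed- and variable-width kernels above combine into the usual edge-emulation semantics: every output pixel is the source pixel at coordinates clamped into the picture. A naive per-pixel reference, under the assumption (as in the clamping code above) that src already points at the possibly out-of-picture top-left sample of the block; names are illustrative:

#include <stddef.h>
#include <stdint.h>

static int clip_int(int v, int lo, int hi)
{
    return v < lo ? lo : v > hi ? hi : v;
}

/* Naive reference for the edge emulation: every destination pixel is the
 * source pixel at coordinates clamped into the w x h picture. */
static void emulated_edge_mc_ref(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t dst_stride, ptrdiff_t src_stride,
                                 int block_w, int block_h,
                                 int src_x, int src_y, int w, int h)
{
    for (int y = 0; y < block_h; y++)
        for (int x = 0; x < block_w; x++)
            dst[y * dst_stride + x] =
                src[(clip_int(src_y + y, 0, h - 1) - src_y) * src_stride +
                    (clip_int(src_x + x, 0, w - 1) - src_x)];
}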
-
-#if ARCH_X86_32
-static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
- ptrdiff_t buf_stride,
- ptrdiff_t src_stride,
- int block_w, int block_h,
- int src_x, int src_y, int w, int h)
-{
- emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
- src_x, src_y, w, h, vfixtbl_mmx, &ff_emu_edge_vvar_mmx,
- hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
-}
-
-static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
- ptrdiff_t buf_stride,
- ptrdiff_t src_stride,
- int block_w, int block_h,
- int src_x, int src_y, int w, int h)
-{
- emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
- src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
- hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
-}
-#endif
-
-static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src,
- ptrdiff_t buf_stride,
- ptrdiff_t src_stride,
- int block_w, int block_h,
- int src_x, int src_y, int w, int h)
-{
- emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
- src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
- hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
-}
-#endif /* HAVE_YASM */
-
-void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
-void ff_prefetch_3dnow(uint8_t *buf, ptrdiff_t stride, int h);
-
-av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
-{
-#if HAVE_YASM
- int cpu_flags = av_get_cpu_flags();
-
-#if ARCH_X86_32
- if (EXTERNAL_MMX(cpu_flags) && bpc <= 8) {
- ctx->emulated_edge_mc = emulated_edge_mc_mmx;
- }
- if (EXTERNAL_AMD3DNOW(cpu_flags)) {
- ctx->prefetch = ff_prefetch_3dnow;
- }
-#endif /* ARCH_X86_32 */
- if (EXTERNAL_MMXEXT(cpu_flags)) {
- ctx->prefetch = ff_prefetch_mmxext;
- }
-#if ARCH_X86_32
- if (EXTERNAL_SSE(cpu_flags) && bpc <= 8) {
- ctx->emulated_edge_mc = emulated_edge_mc_sse;
- }
-#endif /* ARCH_X86_32 */
- if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) {
- ctx->emulated_edge_mc = emulated_edge_mc_sse2;
- }
-#endif /* HAVE_YASM */
-}
diff --git a/ffmpeg/libavcodec/x86/vorbisdsp.asm b/ffmpeg/libavcodec/x86/vorbisdsp.asm
deleted file mode 100644
index b25d838..0000000
--- a/ffmpeg/libavcodec/x86/vorbisdsp.asm
+++ /dev/null
@@ -1,83 +0,0 @@
-;******************************************************************************
-;* Vorbis x86 optimizations
-;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-pdw_80000000: times 4 dd 0x80000000
-
-SECTION .text
-
-%if ARCH_X86_32
-INIT_MMX 3dnow
-cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size
- pxor m7, m7
- lea magq, [magq+block_sizeq*4]
- lea angq, [angq+block_sizeq*4]
- neg block_sizeq
-.loop:
- mova m0, [magq+block_sizeq*4]
- mova m1, [angq+block_sizeq*4]
- mova m2, m0
- mova m3, m1
- pfcmpge m2, m7 ; m <= 0.0
- pfcmpge m3, m7 ; a <= 0.0
- pslld m2, 31 ; keep only the sign bit
- pxor m1, m2
- mova m4, m3
- pand m3, m1
- pandn m4, m1
- pfadd m3, m0 ; a = m + ((a < 0) & (a ^ sign(m)))
- pfsub m0, m4 ; m = m + ((a > 0) & (a ^ sign(m)))
- mova [angq+block_sizeq*4], m3
- mova [magq+block_sizeq*4], m0
- add block_sizeq, 2
- jl .loop
- femms
- RET
-%endif
-
-INIT_XMM sse
-cglobal vorbis_inverse_coupling, 3, 4, 6, mag, ang, block_size, cntr
- mova m5, [pdw_80000000]
- xor cntrq, cntrq
-align 16
-.loop:
- mova m0, [magq+cntrq*4]
- mova m1, [angq+cntrq*4]
- xorps m2, m2
- xorps m3, m3
- cmpleps m2, m0 ; m <= 0.0
- cmpleps m3, m1 ; a <= 0.0
- andps m2, m5 ; keep only the sign bit
- xorps m1, m2
- mova m4, m3
- andps m3, m1
- andnps m4, m1
- addps m3, m0 ; a = m + ((a < 0) & (a ^ sign(m)))
- subps m0, m4 ; m = m + ((a > 0) & (a ^ sign(m)))
- mova [angq+cntrq*4], m3
- mova [magq+cntrq*4], m0
- add cntrq, 4
- cmp cntrq, block_sizeq
- jl .loop
- RET
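The branchless sign tricks above correspond to the scalar inverse-coupling logic; a sketch of that reference behaviour (not taken from this file, matching the formulas quoted in the comments such as "a = m + ((a < 0) & (a ^ sign(m)))"):

#include <stdint.h>

/* Scalar sketch of Vorbis inverse channel coupling: fold the angle channel
 * back around the magnitude channel. */
static void vorbis_inverse_coupling_ref(float *mag, float *ang, intptr_t n)
{
    for (intptr_t i = 0; i < n; i++) {
        if (mag[i] > 0.0f) {
            if (ang[i] > 0.0f) {
                ang[i] = mag[i] - ang[i];
            } else {
                float t = ang[i];
                ang[i]  = mag[i];
                mag[i] += t;
            }
        } else {
            if (ang[i] > 0.0f) {
                ang[i] += mag[i];
            } else {
                float t = ang[i];
                ang[i]  = mag[i];
                mag[i] -= t;
            }
        }
    }
}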
diff --git a/ffmpeg/libavcodec/x86/vorbisdsp_init.c b/ffmpeg/libavcodec/x86/vorbisdsp_init.c
deleted file mode 100644
index 284a528..0000000
--- a/ffmpeg/libavcodec/x86/vorbisdsp_init.c
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/vorbisdsp.h"
-
-void ff_vorbis_inverse_coupling_3dnow(float *mag, float *ang,
- intptr_t blocksize);
-void ff_vorbis_inverse_coupling_sse(float *mag, float *ang,
- intptr_t blocksize);
-
-av_cold void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp)
-{
-#if HAVE_YASM
- int cpu_flags = av_get_cpu_flags();
-
-#if ARCH_X86_32
- if (EXTERNAL_AMD3DNOW(cpu_flags))
- dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_3dnow;
-#endif /* ARCH_X86_32 */
- if (EXTERNAL_SSE(cpu_flags))
- dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_sse;
-#endif /* HAVE_YASM */
-}
diff --git a/ffmpeg/libavcodec/x86/vp3dsp.asm b/ffmpeg/libavcodec/x86/vp3dsp.asm
deleted file mode 100644
index 24496ae..0000000
--- a/ffmpeg/libavcodec/x86/vp3dsp.asm
+++ /dev/null
@@ -1,709 +0,0 @@
-;******************************************************************************
-;* MMX/SSE2-optimized functions for the VP3 decoder
-;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-; MMX-optimized functions cribbed from the original VP3 source code.
-
-SECTION_RODATA
-
-vp3_idct_data: times 8 dw 64277
- times 8 dw 60547
- times 8 dw 54491
- times 8 dw 46341
- times 8 dw 36410
- times 8 dw 25080
- times 8 dw 12785
-
-pb_7: times 8 db 0x07
-pb_1F: times 8 db 0x1f
-pb_81: times 8 db 0x81
-
-cextern pb_1
-cextern pb_3
-cextern pb_80
-
-cextern pw_8
-
-SECTION .text
-
-; this is off by one or two for some cases when filter_limit is greater than 63
-; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
-; out: p1 in mm4, p2 in mm3
-%macro VP3_LOOP_FILTER 0
- movq m7, m6
- pand m6, [pb_7] ; p0&7
- psrlw m7, 3
- pand m7, [pb_1F] ; p0>>3
- movq m3, m2 ; p2
- pxor m2, m4
- pand m2, [pb_1] ; (p2^p1)&1
- movq m5, m2
- paddb m2, m2
- paddb m2, m5 ; 3*(p2^p1)&1
- paddb m2, m6 ; extra bits lost in shifts
- pcmpeqb m0, m0
- pxor m1, m0 ; 255 - p3
- pavgb m1, m2 ; (256 - p3 + extrabits) >> 1
- pxor m0, m4 ; 255 - p1
- pavgb m0, m3 ; (256 + p2-p1) >> 1
- paddb m1, [pb_3]
- pavgb m1, m0 ; 128+2+( p2-p1 - p3) >> 2
- pavgb m1, m0 ; 128+1+(3*(p2-p1) - p3) >> 3
- paddusb m7, m1 ; d+128+1
- movq m6, [pb_81]
- psubusb m6, m7
- psubusb m7, [pb_81]
-
- movq m5, [r2+516] ; flim
- pminub m6, m5
- pminub m7, m5
- movq m0, m6
- movq m1, m7
- paddb m6, m6
- paddb m7, m7
- pminub m6, m5
- pminub m7, m5
- psubb m6, m0
- psubb m7, m1
- paddusb m4, m7
- psubusb m4, m6
- psubusb m3, m7
- paddusb m3, m6
-%endmacro
-
-%macro STORE_4_WORDS 1
- movd r2d, %1
- mov [r0 -1], r2w
- psrlq %1, 32
- shr r2, 16
- mov [r0+r1 -1], r2w
- movd r2d, %1
- mov [r0+r1*2-1], r2w
- shr r2, 16
- mov [r0+r3 -1], r2w
-%endmacro
-
-INIT_MMX mmxext
-cglobal vp3_v_loop_filter, 3, 4
-%if ARCH_X86_64
- movsxd r1, r1d
-%endif
- mov r3, r1
- neg r1
- movq m6, [r0+r1*2]
- movq m4, [r0+r1 ]
- movq m2, [r0 ]
- movq m1, [r0+r3 ]
-
- VP3_LOOP_FILTER
-
- movq [r0+r1], m4
- movq [r0 ], m3
- RET
-
-cglobal vp3_h_loop_filter, 3, 4
-%if ARCH_X86_64
- movsxd r1, r1d
-%endif
- lea r3, [r1*3]
-
- movd m6, [r0 -2]
- movd m4, [r0+r1 -2]
- movd m2, [r0+r1*2-2]
- movd m1, [r0+r3 -2]
- lea r0, [r0+r1*4 ]
- punpcklbw m6, [r0 -2]
- punpcklbw m4, [r0+r1 -2]
- punpcklbw m2, [r0+r1*2-2]
- punpcklbw m1, [r0+r3 -2]
- sub r0, r3
- sub r0, r1
-
- TRANSPOSE4x4B 6, 4, 2, 1, 0
- VP3_LOOP_FILTER
- SBUTTERFLY bw, 4, 3, 5
-
- STORE_4_WORDS m4
- lea r0, [r0+r1*4 ]
- STORE_4_WORDS m3
- RET
-
-; from original comments: The Macro does IDct on 4 1-D Dcts
-%macro BeginIDCT 0
- movq m2, I(3)
- movq m6, C(3)
- movq m4, m2
- movq m7, J(5)
- pmulhw m4, m6 ; r4 = c3*i3 - i3
- movq m1, C(5)
- pmulhw m6, m7 ; r6 = c3*i5 - i5
- movq m5, m1
- pmulhw m1, m2 ; r1 = c5*i3 - i3
- movq m3, I(1)
- pmulhw m5, m7 ; r5 = c5*i5 - i5
- movq m0, C(1)
- paddw m4, m2 ; r4 = c3*i3
- paddw m6, m7 ; r6 = c3*i5
- paddw m2, m1 ; r2 = c5*i3
- movq m1, J(7)
- paddw m7, m5 ; r7 = c5*i5
- movq m5, m0 ; r5 = c1
- pmulhw m0, m3 ; r0 = c1*i1 - i1
- paddsw m4, m7 ; r4 = C = c3*i3 + c5*i5
- pmulhw m5, m1 ; r5 = c1*i7 - i7
- movq m7, C(7)
- psubsw m6, m2 ; r6 = D = c3*i5 - c5*i3
- paddw m0, m3 ; r0 = c1*i1
- pmulhw m3, m7 ; r3 = c7*i1
- movq m2, I(2)
- pmulhw m7, m1 ; r7 = c7*i7
- paddw m5, m1 ; r5 = c1*i7
- movq m1, m2 ; r1 = i2
- pmulhw m2, C(2) ; r2 = c2*i2 - i2
- psubsw m3, m5 ; r3 = B = c7*i1 - c1*i7
- movq m5, J(6)
- paddsw m0, m7 ; r0 = A = c1*i1 + c7*i7
- movq m7, m5 ; r7 = i6
- psubsw m0, m4 ; r0 = A - C
- pmulhw m5, C(2) ; r5 = c2*i6 - i6
- paddw m2, m1 ; r2 = c2*i2
- pmulhw m1, C(6) ; r1 = c6*i2
- paddsw m4, m4 ; r4 = C + C
- paddsw m4, m0 ; r4 = C. = A + C
- psubsw m3, m6 ; r3 = B - D
- paddw m5, m7 ; r5 = c2*i6
- paddsw m6, m6 ; r6 = D + D
- pmulhw m7, C(6) ; r7 = c6*i6
- paddsw m6, m3 ; r6 = D. = B + D
- movq I(1), m4 ; save C. at I(1)
- psubsw m1, m5 ; r1 = H = c6*i2 - c2*i6
- movq m4, C(4)
- movq m5, m3 ; r5 = B - D
- pmulhw m3, m4 ; r3 = (c4 - 1) * (B - D)
- paddsw m7, m2 ; r7 = G = c2*i2 + c6*i6
- movq I(2), m6 ; save D. at I(2)
- movq m2, m0 ; r2 = A - C
- movq m6, I(0)
- pmulhw m0, m4 ; r0 = (c4 - 1) * (A - C)
- paddw m5, m3 ; r5 = B. = c4 * (B - D)
- movq m3, J(4)
- psubsw m5, m1 ; r5 = B.. = B. - H
- paddw m2, m0 ; r0 = A. = c4 * (A - C)
- psubsw m6, m3 ; r6 = i0 - i4
- movq m0, m6
- pmulhw m6, m4 ; r6 = (c4 - 1) * (i0 - i4)
- paddsw m3, m3 ; r3 = i4 + i4
- paddsw m1, m1 ; r1 = H + H
- paddsw m3, m0 ; r3 = i0 + i4
- paddsw m1, m5 ; r1 = H. = B + H
- pmulhw m4, m3 ; r4 = (c4 - 1) * (i0 + i4)
- paddsw m6, m0 ; r6 = F = c4 * (i0 - i4)
- psubsw m6, m2 ; r6 = F. = F - A.
- paddsw m2, m2 ; r2 = A. + A.
- movq m0, I(1) ; r0 = C.
- paddsw m2, m6 ; r2 = A.. = F + A.
- paddw m4, m3 ; r4 = E = c4 * (i0 + i4)
- psubsw m2, m1 ; r2 = R2 = A.. - H.
-%endmacro
-
-; RowIDCT gets ready to transpose
-%macro RowIDCT 0
- BeginIDCT
- movq m3, I(2) ; r3 = D.
- psubsw m4, m7 ; r4 = E. = E - G
- paddsw m1, m1 ; r1 = H. + H.
- paddsw m7, m7 ; r7 = G + G
- paddsw m1, m2 ; r1 = R1 = A.. + H.
- paddsw m7, m4 ; r7 = G. = E + G
- psubsw m4, m3 ; r4 = R4 = E. - D.
- paddsw m3, m3
- psubsw m6, m5 ; r6 = R6 = F. - B..
- paddsw m5, m5
- paddsw m3, m4 ; r3 = R3 = E. + D.
- paddsw m5, m6 ; r5 = R5 = F. + B..
- psubsw m7, m0 ; r7 = R7 = G. - C.
- paddsw m0, m0
- movq I(1), m1 ; save R1
- paddsw m0, m7 ; r0 = R0 = G. + C.
-%endmacro
-
-; Column IDCT normalizes and stores final results
-%macro ColumnIDCT 0
- BeginIDCT
- paddsw m2, OC_8 ; adjust R2 (and R1) for shift
- paddsw m1, m1 ; r1 = H. + H.
- paddsw m1, m2 ; r1 = R1 = A.. + H.
- psraw m2, 4 ; r2 = NR2
- psubsw m4, m7 ; r4 = E. = E - G
- psraw m1, 4 ; r1 = NR1
- movq m3, I(2) ; r3 = D.
- paddsw m7, m7 ; r7 = G + G
- movq I(2), m2 ; store NR2 at I2
- paddsw m7, m4 ; r7 = G. = E + G
- movq I(1), m1 ; store NR1 at I1
- psubsw m4, m3 ; r4 = R4 = E. - D.
- paddsw m4, OC_8 ; adjust R4 (and R3) for shift
- paddsw m3, m3 ; r3 = D. + D.
- paddsw m3, m4 ; r3 = R3 = E. + D.
- psraw m4, 4 ; r4 = NR4
- psubsw m6, m5 ; r6 = R6 = F. - B..
- psraw m3, 4 ; r3 = NR3
- paddsw m6, OC_8 ; adjust R6 (and R5) for shift
- paddsw m5, m5 ; r5 = B.. + B..
- paddsw m5, m6 ; r5 = R5 = F. + B..
- psraw m6, 4 ; r6 = NR6
- movq J(4), m4 ; store NR4 at J4
- psraw m5, 4 ; r5 = NR5
- movq I(3), m3 ; store NR3 at I3
- psubsw m7, m0 ; r7 = R7 = G. - C.
- paddsw m7, OC_8 ; adjust R7 (and R0) for shift
- paddsw m0, m0 ; r0 = C. + C.
- paddsw m0, m7 ; r0 = R0 = G. + C.
- psraw m7, 4 ; r7 = NR7
- movq J(6), m6 ; store NR6 at J6
- psraw m0, 4 ; r0 = NR0
- movq J(5), m5 ; store NR5 at J5
- movq J(7), m7 ; store NR7 at J7
- movq I(0), m0 ; store NR0 at I0
-%endmacro
-
-; Following macro does two 4x4 transposes in place.
-;
-; At entry (we assume):
-;
-; r0 = a3 a2 a1 a0
-; I(1) = b3 b2 b1 b0
-; r2 = c3 c2 c1 c0
-; r3 = d3 d2 d1 d0
-;
-; r4 = e3 e2 e1 e0
-; r5 = f3 f2 f1 f0
-; r6 = g3 g2 g1 g0
-; r7 = h3 h2 h1 h0
-;
-; At exit, we have:
-;
-; I(0) = d0 c0 b0 a0
-; I(1) = d1 c1 b1 a1
-; I(2) = d2 c2 b2 a2
-; I(3) = d3 c3 b3 a3
-;
-; J(4) = h0 g0 f0 e0
-; J(5) = h1 g1 f1 e1
-; J(6) = h2 g2 f2 e2
-; J(7) = h3 g3 f3 e3
-;
-; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
-; J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
-;
-; Since r1 is free at entry, we calculate the Js first.
-%macro Transpose 0
- movq m1, m4 ; r1 = e3 e2 e1 e0
- punpcklwd m4, m5 ; r4 = f1 e1 f0 e0
- movq I(0), m0 ; save a3 a2 a1 a0
- punpckhwd m1, m5 ; r1 = f3 e3 f2 e2
- movq m0, m6 ; r0 = g3 g2 g1 g0
- punpcklwd m6, m7 ; r6 = h1 g1 h0 g0
- movq m5, m4 ; r5 = f1 e1 f0 e0
- punpckldq m4, m6 ; r4 = h0 g0 f0 e0 = R4
- punpckhdq m5, m6 ; r5 = h1 g1 f1 e1 = R5
- movq m6, m1 ; r6 = f3 e3 f2 e2
- movq J(4), m4
- punpckhwd m0, m7 ; r0 = h3 g3 h2 g2
- movq J(5), m5
- punpckhdq m6, m0 ; r6 = h3 g3 f3 e3 = R7
- movq m4, I(0) ; r4 = a3 a2 a1 a0
- punpckldq m1, m0 ; r1 = h2 g2 f2 e2 = R6
- movq m5, I(1) ; r5 = b3 b2 b1 b0
- movq m0, m4 ; r0 = a3 a2 a1 a0
- movq J(7), m6
- punpcklwd m0, m5 ; r0 = b1 a1 b0 a0
- movq J(6), m1
- punpckhwd m4, m5 ; r4 = b3 a3 b2 a2
- movq m5, m2 ; r5 = c3 c2 c1 c0
- punpcklwd m2, m3 ; r2 = d1 c1 d0 c0
- movq m1, m0 ; r1 = b1 a1 b0 a0
- punpckldq m0, m2 ; r0 = d0 c0 b0 a0 = R0
- punpckhdq m1, m2 ; r1 = d1 c1 b1 a1 = R1
- movq m2, m4 ; r2 = b3 a3 b2 a2
- movq I(0), m0
- punpckhwd m5, m3 ; r5 = d3 c3 d2 c2
- movq I(1), m1
- punpckhdq m4, m5 ; r4 = d3 c3 b3 a3 = R3
- punpckldq m2, m5 ; r2 = d2 c2 b2 a2 = R2
- movq I(3), m4
- movq I(2), m2
-%endmacro
-
-%macro VP3_1D_IDCT_SSE2 0
- movdqa m2, I(3) ; xmm2 = i3
- movdqa m6, C(3) ; xmm6 = c3
- movdqa m4, m2 ; xmm4 = i3
- movdqa m7, I(5) ; xmm7 = i5
- pmulhw m4, m6 ; xmm4 = c3 * i3 - i3
- movdqa m1, C(5) ; xmm1 = c5
- pmulhw m6, m7 ; xmm6 = c3 * i5 - i5
- movdqa m5, m1 ; xmm5 = c5
- pmulhw m1, m2 ; xmm1 = c5 * i3 - i3
- movdqa m3, I(1) ; xmm3 = i1
- pmulhw m5, m7 ; xmm5 = c5 * i5 - i5
- movdqa m0, C(1) ; xmm0 = c1
- paddw m4, m2 ; xmm4 = c3 * i3
- paddw m6, m7 ; xmm6 = c3 * i5
- paddw m2, m1 ; xmm2 = c5 * i3
- movdqa m1, I(7) ; xmm1 = i7
- paddw m7, m5 ; xmm7 = c5 * i5
- movdqa m5, m0 ; xmm5 = c1
- pmulhw m0, m3 ; xmm0 = c1 * i1 - i1
- paddsw m4, m7 ; xmm4 = c3 * i3 + c5 * i5 = C
- pmulhw m5, m1 ; xmm5 = c1 * i7 - i7
- movdqa m7, C(7) ; xmm7 = c7
- psubsw m6, m2 ; xmm6 = c3 * i5 - c5 * i3 = D
- paddw m0, m3 ; xmm0 = c1 * i1
- pmulhw m3, m7 ; xmm3 = c7 * i1
- movdqa m2, I(2) ; xmm2 = i2
- pmulhw m7, m1 ; xmm7 = c7 * i7
- paddw m5, m1 ; xmm5 = c1 * i7
- movdqa m1, m2 ; xmm1 = i2
- pmulhw m2, C(2) ; xmm2 = i2 * c2 -i2
- psubsw m3, m5 ; xmm3 = c7 * i1 - c1 * i7 = B
- movdqa m5, I(6) ; xmm5 = i6
- paddsw m0, m7 ; xmm0 = c1 * i1 + c7 * i7 = A
- movdqa m7, m5 ; xmm7 = i6
- psubsw m0, m4 ; xmm0 = A - C
- pmulhw m5, C(2) ; xmm5 = c2 * i6 - i6
- paddw m2, m1 ; xmm2 = i2 * c2
- pmulhw m1, C(6) ; xmm1 = c6 * i2
- paddsw m4, m4 ; xmm4 = C + C
- paddsw m4, m0 ; xmm4 = A + C = C.
- psubsw m3, m6 ; xmm3 = B - D
- paddw m5, m7 ; xmm5 = c2 * i6
- paddsw m6, m6 ; xmm6 = D + D
- pmulhw m7, C(6) ; xmm7 = c6 * i6
- paddsw m6, m3 ; xmm6 = B + D = D.
- movdqa I(1), m4 ; Save C. at I(1)
- psubsw m1, m5 ; xmm1 = c6 * i2 - c2 * i6 = H
- movdqa m4, C(4) ; xmm4 = C4
- movdqa m5, m3 ; xmm5 = B - D
- pmulhw m3, m4 ; xmm3 = ( c4 -1 ) * ( B - D )
- paddsw m7, m2 ; xmm7 = c2 * i2 + c6 * i6 = G
- movdqa I(2), m6 ; save D. at I(2)
- movdqa m2, m0 ; xmm2 = A - C
- movdqa m6, I(0) ; xmm6 = i0
- pmulhw m0, m4 ; xmm0 = ( c4 - 1 ) * ( A - C ) = A.
- paddw m5, m3 ; xmm5 = c4 * ( B - D ) = B.
- movdqa m3, I(4) ; xmm3 = i4
- psubsw m5, m1 ; xmm5 = B. - H = B..
- paddw m2, m0 ; xmm2 = c4 * ( A - C) = A.
- psubsw m6, m3 ; xmm6 = i0 - i4
- movdqa m0, m6 ; xmm0 = i0 - i4
- pmulhw m6, m4 ; xmm6 = (c4 - 1) * (i0 - i4) = F
- paddsw m3, m3 ; xmm3 = i4 + i4
- paddsw m1, m1 ; xmm1 = H + H
- paddsw m3, m0 ; xmm3 = i0 + i4
- paddsw m1, m5 ; xmm1 = B. + H = H.
- pmulhw m4, m3 ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
- paddw m6, m0 ; xmm6 = c4 * ( i0 - i4 )
- psubsw m6, m2 ; xmm6 = F - A. = F.
- paddsw m2, m2 ; xmm2 = A. + A.
- movdqa m0, I(1) ; Load C. from I(1)
- paddsw m2, m6 ; xmm2 = F + A. = A..
- paddw m4, m3 ; xmm4 = c4 * ( i0 + i4 ) = E
- psubsw m2, m1 ; xmm2 = A.. - H. = R2
- ADD(m2) ; Adjust R2 and R1 before shifting
- paddsw m1, m1 ; xmm1 = H. + H.
- paddsw m1, m2 ; xmm1 = A.. + H. = R1
- SHIFT(m2) ; xmm2 = op2
- psubsw m4, m7 ; xmm4 = E - G = E.
- SHIFT(m1) ; xmm1 = op1
- movdqa m3, I(2) ; Load D. from I(2)
- paddsw m7, m7 ; xmm7 = G + G
- paddsw m7, m4 ; xmm7 = E + G = G.
- psubsw m4, m3 ; xmm4 = E. - D. = R4
- ADD(m4) ; Adjust R4 and R3 before shifting
- paddsw m3, m3 ; xmm3 = D. + D.
- paddsw m3, m4 ; xmm3 = E. + D. = R3
- SHIFT(m4) ; xmm4 = op4
- psubsw m6, m5 ; xmm6 = F. - B..= R6
- SHIFT(m3) ; xmm3 = op3
- ADD(m6) ; Adjust R6 and R5 before shifting
- paddsw m5, m5 ; xmm5 = B.. + B..
- paddsw m5, m6 ; xmm5 = F. + B.. = R5
- SHIFT(m6) ; xmm6 = op6
- SHIFT(m5) ; xmm5 = op5
- psubsw m7, m0 ; xmm7 = G. - C. = R7
- ADD(m7) ; Adjust R7 and R0 before shifting
- paddsw m0, m0 ; xmm0 = C. + C.
- paddsw m0, m7 ; xmm0 = G. + C.
- SHIFT(m7) ; xmm7 = op7
- SHIFT(m0) ; xmm0 = op0
-%endmacro
-
-%macro PUT_BLOCK 8
- movdqa O(0), m%1
- movdqa O(1), m%2
- movdqa O(2), m%3
- movdqa O(3), m%4
- movdqa O(4), m%5
- movdqa O(5), m%6
- movdqa O(6), m%7
- movdqa O(7), m%8
-%endmacro
-
-%macro VP3_IDCT 1
-%if mmsize == 16
-%define I(x) [%1+16*x]
-%define O(x) [%1+16*x]
-%define C(x) [vp3_idct_data+16*(x-1)]
-%define SHIFT(x)
-%define ADD(x)
- VP3_1D_IDCT_SSE2
-%if ARCH_X86_64
- TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
-%else
- TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]
-%endif
- PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
-
-%define SHIFT(x) psraw x, 4
-%define ADD(x) paddsw x, [pw_8]
- VP3_1D_IDCT_SSE2
- PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
-%else ; mmsize == 8
- ; eax = quantized input
- ; ebx = dequantizer matrix
- ; ecx = IDCT constants
- ; M(I) = ecx + MaskOffset(0) + I * 8
- ; C(I) = ecx + CosineOffset(32) + (I-1) * 8
- ; edx = output
- ; r0..r7 = mm0..mm7
-%define OC_8 [pw_8]
-%define C(x) [vp3_idct_data+16*(x-1)]
-
- ; at this point, function has completed dequantization + dezigzag +
- ; partial transposition; now do the idct itself
-%define I(x) [%1+16*x]
-%define J(x) [%1+16*x]
- RowIDCT
- Transpose
-
-%define I(x) [%1+16*x+8]
-%define J(x) [%1+16*x+8]
- RowIDCT
- Transpose
-
-%define I(x) [%1+16* x]
-%define J(x) [%1+16*(x-4)+8]
- ColumnIDCT
-
-%define I(x) [%1+16* x +64]
-%define J(x) [%1+16*(x-4)+72]
- ColumnIDCT
-%endif ; mmsize == 16/8
-%endmacro
-
-%macro vp3_idct_funcs 0
-cglobal vp3_idct_put, 3, 4, 9
- VP3_IDCT r2
-
- movsxdifnidn r1, r1d
- mova m4, [pb_80]
- lea r3, [r1*3]
-%assign %%i 0
-%rep 16/mmsize
- mova m0, [r2+mmsize*0+%%i]
- mova m1, [r2+mmsize*2+%%i]
- mova m2, [r2+mmsize*4+%%i]
- mova m3, [r2+mmsize*6+%%i]
-%if mmsize == 8
- packsswb m0, [r2+mmsize*8+%%i]
- packsswb m1, [r2+mmsize*10+%%i]
- packsswb m2, [r2+mmsize*12+%%i]
- packsswb m3, [r2+mmsize*14+%%i]
-%else
- packsswb m0, [r2+mmsize*1+%%i]
- packsswb m1, [r2+mmsize*3+%%i]
- packsswb m2, [r2+mmsize*5+%%i]
- packsswb m3, [r2+mmsize*7+%%i]
-%endif
- paddb m0, m4
- paddb m1, m4
- paddb m2, m4
- paddb m3, m4
- movq [r0 ], m0
-%if mmsize == 8
- movq [r0+r1 ], m1
- movq [r0+r1*2], m2
- movq [r0+r3 ], m3
-%else
- movhps [r0+r1 ], m0
- movq [r0+r1*2], m1
- movhps [r0+r3 ], m1
-%endif
-%if %%i == 0
- lea r0, [r0+r1*4]
-%endif
-%if mmsize == 16
- movq [r0 ], m2
- movhps [r0+r1 ], m2
- movq [r0+r1*2], m3
- movhps [r0+r3 ], m3
-%endif
-%assign %%i %%i+8
-%endrep
-
- pxor m0, m0
-%assign %%offset 0
-%rep 128/mmsize
- mova [r2+%%offset], m0
-%assign %%offset %%offset+mmsize
-%endrep
- RET
-
-cglobal vp3_idct_add, 3, 4, 9
- VP3_IDCT r2
-
- movsxdifnidn r1, r1d
- lea r3, [r1*3]
- pxor m4, m4
-%if mmsize == 16
-%assign %%i 0
-%rep 2
- movq m0, [r0]
- movq m1, [r0+r1]
- movq m2, [r0+r1*2]
- movq m3, [r0+r3]
- punpcklbw m0, m4
- punpcklbw m1, m4
- punpcklbw m2, m4
- punpcklbw m3, m4
- paddsw m0, [r2+ 0+%%i]
- paddsw m1, [r2+16+%%i]
- paddsw m2, [r2+32+%%i]
- paddsw m3, [r2+48+%%i]
- packuswb m0, m1
- packuswb m2, m3
- movq [r0 ], m0
- movhps [r0+r1 ], m0
- movq [r0+r1*2], m2
- movhps [r0+r3 ], m2
-%if %%i == 0
- lea r0, [r0+r1*4]
-%endif
-%assign %%i %%i+64
-%endrep
-%else
-%assign %%i 0
-%rep 2
- movq m0, [r0]
- movq m1, [r0+r1]
- movq m2, [r0+r1*2]
- movq m3, [r0+r3]
- movq m5, m0
- movq m6, m1
- movq m7, m2
- punpcklbw m0, m4
- punpcklbw m1, m4
- punpcklbw m2, m4
- punpckhbw m5, m4
- punpckhbw m6, m4
- punpckhbw m7, m4
- paddsw m0, [r2+ 0+%%i]
- paddsw m1, [r2+16+%%i]
- paddsw m2, [r2+32+%%i]
- paddsw m5, [r2+64+%%i]
- paddsw m6, [r2+80+%%i]
- paddsw m7, [r2+96+%%i]
- packuswb m0, m5
- movq m5, m3
- punpcklbw m3, m4
- punpckhbw m5, m4
- packuswb m1, m6
- paddsw m3, [r2+48+%%i]
- paddsw m5, [r2+112+%%i]
- packuswb m2, m7
- packuswb m3, m5
- movq [r0 ], m0
- movq [r0+r1 ], m1
- movq [r0+r1*2], m2
- movq [r0+r3 ], m3
-%if %%i == 0
- lea r0, [r0+r1*4]
-%endif
-%assign %%i %%i+8
-%endrep
-%endif
-%assign %%i 0
-%rep 128/mmsize
- mova [r2+%%i], m4
-%assign %%i %%i+mmsize
-%endrep
- RET
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX mmx
-vp3_idct_funcs
-%endif
-
-INIT_XMM sse2
-vp3_idct_funcs
-
-%macro DC_ADD 0
- movq m2, [r0 ]
- movq m3, [r0+r1 ]
- paddusb m2, m0
- movq m4, [r0+r1*2]
- paddusb m3, m0
- movq m5, [r0+r2 ]
- paddusb m4, m0
- paddusb m5, m0
- psubusb m2, m1
- psubusb m3, m1
- movq [r0 ], m2
- psubusb m4, m1
- movq [r0+r1 ], m3
- psubusb m5, m1
- movq [r0+r1*2], m4
- movq [r0+r2 ], m5
-%endmacro
-
-INIT_MMX mmxext
-cglobal vp3_idct_dc_add, 3, 4
-%if ARCH_X86_64
- movsxd r1, r1d
-%endif
- movsx r3, word [r2]
- mov word [r2], 0
- lea r2, [r1*3]
- add r3, 15
- sar r3, 5
- movd m0, r3d
- pshufw m0, m0, 0x0
- pxor m1, m1
- psubw m1, m0
- packuswb m0, m0
- packuswb m1, m1
- DC_ADD
- lea r0, [r0+r1*4]
- DC_ADD
- RET
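For reference, the mmxext routine above reduces to the following scalar behaviour (an illustrative sketch only; the 8x8 geometry and unsigned clip follow the DC_ADD macro and its movq stores):

    static void vp3_idct_dc_add_ref(uint8_t *dest, int line_size, int16_t *block)
    {
        int dc = (block[0] + 15) >> 5;      /* add r3, 15 ; sar r3, 5 */
        block[0] = 0;
        for (int y = 0; y < 8; y++) {
            for (int x = 0; x < 8; x++) {
                int v = dest[x] + dc;
                dest[x] = v < 0 ? 0 : v > 255 ? 255 : v;   /* paddusb/psubusb pair */
            }
            dest += line_size;
        }
    }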
diff --git a/ffmpeg/libavcodec/x86/vp3dsp_init.c b/ffmpeg/libavcodec/x86/vp3dsp_init.c
deleted file mode 100644
index 1f02a6f..0000000
--- a/ffmpeg/libavcodec/x86/vp3dsp_init.c
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2009 David Conrad <lessen42@gmail.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavcodec/avcodec.h"
-#include "libavcodec/dsputil.h"
-#include "libavcodec/vp3dsp.h"
-#include "config.h"
-
-void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block);
-void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block);
-
-void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, int16_t *block);
-void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, int16_t *block);
-
-void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, int line_size,
- int16_t *block);
-
-void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride,
- int *bounding_values);
-void ff_vp3_h_loop_filter_mmxext(uint8_t *src, int stride,
- int *bounding_values);
-
-#if HAVE_MMX_INLINE
-
-#define MOVQ_BFE(regd) \
- __asm__ volatile ( \
- "pcmpeqd %%"#regd", %%"#regd" \n\t" \
- "paddb %%"#regd", %%"#regd" \n\t" ::)
-
-#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
- "movq "#rega", "#regr" \n\t" \
- "movq "#regc", "#regp" \n\t" \
- "pand "#regb", "#regr" \n\t" \
- "pand "#regd", "#regp" \n\t" \
- "pxor "#rega", "#regb" \n\t" \
- "pxor "#regc", "#regd" \n\t" \
- "pand %%mm6, "#regb" \n\t" \
- "pand %%mm6, "#regd" \n\t" \
- "psrlq $1, "#regb" \n\t" \
- "psrlq $1, "#regd" \n\t" \
- "paddb "#regb", "#regr" \n\t" \
- "paddb "#regd", "#regp" \n\t"
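MOVQ_BFE fills the chosen register with 0xFE bytes, so the macro above is a truncating (no-rounding) byte average built on the identity (a + b) >> 1 == (a & b) + (((a ^ b) & 0xFE) >> 1). A scalar sketch of one byte lane:

    /* per-byte no-rounding average implemented by PAVGBP_MMX_NO_RND (illustrative) */
    static inline uint8_t avg_no_rnd(uint8_t a, uint8_t b)
    {
        return (a & b) + (((a ^ b) & 0xFE) >> 1);   /* == (a + b) >> 1, rounded down */
    }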
-
-static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, ptrdiff_t stride, int h)
-{
-// START_TIMER
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%2), %%mm1 \n\t"
- "movq (%1,%4), %%mm2 \n\t"
- "movq (%2,%4), %%mm3 \n\t"
- PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%3) \n\t"
- "movq %%mm5, (%3,%4) \n\t"
-
- "movq (%1,%4,2), %%mm0 \n\t"
- "movq (%2,%4,2), %%mm1 \n\t"
- "movq (%1,%5), %%mm2 \n\t"
- "movq (%2,%5), %%mm3 \n\t"
- "lea (%1,%4,4), %1 \n\t"
- "lea (%2,%4,4), %2 \n\t"
- PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%3,%4,2) \n\t"
- "movq %%mm5, (%3,%5) \n\t"
- "lea (%3,%4,4), %3 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
- :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
- :"memory");
-// STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
-}
-#endif /* HAVE_MMX_INLINE */
-
-av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
-{
- int cpu_flags = av_get_cpu_flags();
-
-#if HAVE_MMX_INLINE
- c->put_no_rnd_pixels_l2 = put_vp_no_rnd_pixels8_l2_mmx;
-#endif /* HAVE_MMX_INLINE */
-
-#if ARCH_X86_32
- if (EXTERNAL_MMX(cpu_flags)) {
- c->idct_put = ff_vp3_idct_put_mmx;
- c->idct_add = ff_vp3_idct_add_mmx;
- }
-#endif
-
- if (EXTERNAL_MMXEXT(cpu_flags)) {
- c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
-
- if (!(flags & CODEC_FLAG_BITEXACT)) {
- c->v_loop_filter = ff_vp3_v_loop_filter_mmxext;
- c->h_loop_filter = ff_vp3_h_loop_filter_mmxext;
- }
- }
-
- if (EXTERNAL_SSE2(cpu_flags)) {
- c->idct_put = ff_vp3_idct_put_sse2;
- c->idct_add = ff_vp3_idct_add_sse2;
- }
-}
diff --git a/ffmpeg/libavcodec/x86/vp56_arith.h b/ffmpeg/libavcodec/x86/vp56_arith.h
deleted file mode 100644
index e71dbf8..0000000
--- a/ffmpeg/libavcodec/x86/vp56_arith.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * VP5 and VP6 compatible video decoder (arith decoder)
- *
- * Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org>
- * Copyright (C) 2010 Eli Friedman
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_VP56_ARITH_H
-#define AVCODEC_X86_VP56_ARITH_H
-
-#if HAVE_INLINE_ASM && HAVE_FAST_CMOV
-#define vp56_rac_get_prob vp56_rac_get_prob
-static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob)
-{
- unsigned int code_word = vp56_rac_renorm(c);
- unsigned int high = c->high;
- unsigned int low = 1 + (((high - 1) * prob) >> 8);
- unsigned int low_shift = low << 16;
- int bit = 0;
-
- __asm__(
- "subl %4, %1 \n\t"
- "subl %3, %2 \n\t"
- "leal (%2, %3), %3 \n\t"
- "setae %b0 \n\t"
- "cmovb %4, %1 \n\t"
- "cmovb %3, %2 \n\t"
- : "+q"(bit), "+r"(high), "+r"(code_word), "+r"(low_shift)
- : "r"(low)
- );
-
- c->high = high;
- c->code_word = code_word;
- return bit;
-}
-#endif
-
-#endif /* AVCODEC_X86_VP56_ARITH_H */
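A portable sketch of the same update, for comparison with the setae/cmovb sequence above (it leans on the same VP56RangeCoder fields and vp56_rac_renorm() helper the inline asm does):

    static av_always_inline int vp56_rac_get_prob_ref(VP56RangeCoder *c, uint8_t prob)
    {
        unsigned int code_word = vp56_rac_renorm(c);
        unsigned int low       = 1 + (((c->high - 1) * prob) >> 8);
        unsigned int low_shift = low << 16;
        int bit = code_word >= low_shift;

        c->high      = bit ? c->high - low         : low;
        c->code_word = bit ? code_word - low_shift : code_word;
        return bit;
    }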
diff --git a/ffmpeg/libavcodec/x86/vp8dsp.asm b/ffmpeg/libavcodec/x86/vp8dsp.asm
deleted file mode 100644
index 85c7e99..0000000
--- a/ffmpeg/libavcodec/x86/vp8dsp.asm
+++ /dev/null
@@ -1,1225 +0,0 @@
-;******************************************************************************
-;* VP8 MMXEXT optimizations
-;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
-;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-fourtap_filter_hw_m: times 4 dw -6, 123
- times 4 dw 12, -1
- times 4 dw -9, 93
- times 4 dw 50, -6
- times 4 dw -6, 50
- times 4 dw 93, -9
- times 4 dw -1, 12
- times 4 dw 123, -6
-
-sixtap_filter_hw_m: times 4 dw 2, -11
- times 4 dw 108, 36
- times 4 dw -8, 1
- times 4 dw 3, -16
- times 4 dw 77, 77
- times 4 dw -16, 3
- times 4 dw 1, -8
- times 4 dw 36, 108
- times 4 dw -11, 2
-
-fourtap_filter_hb_m: times 8 db -6, 123
- times 8 db 12, -1
- times 8 db -9, 93
- times 8 db 50, -6
- times 8 db -6, 50
- times 8 db 93, -9
- times 8 db -1, 12
- times 8 db 123, -6
-
-sixtap_filter_hb_m: times 8 db 2, 1
- times 8 db -11, 108
- times 8 db 36, -8
- times 8 db 3, 3
- times 8 db -16, 77
- times 8 db 77, -16
- times 8 db 1, 2
- times 8 db -8, 36
- times 8 db 108, -11
-
-fourtap_filter_v_m: times 8 dw -6
- times 8 dw 123
- times 8 dw 12
- times 8 dw -1
- times 8 dw -9
- times 8 dw 93
- times 8 dw 50
- times 8 dw -6
- times 8 dw -6
- times 8 dw 50
- times 8 dw 93
- times 8 dw -9
- times 8 dw -1
- times 8 dw 12
- times 8 dw 123
- times 8 dw -6
-
-sixtap_filter_v_m: times 8 dw 2
- times 8 dw -11
- times 8 dw 108
- times 8 dw 36
- times 8 dw -8
- times 8 dw 1
- times 8 dw 3
- times 8 dw -16
- times 8 dw 77
- times 8 dw 77
- times 8 dw -16
- times 8 dw 3
- times 8 dw 1
- times 8 dw -8
- times 8 dw 36
- times 8 dw 108
- times 8 dw -11
- times 8 dw 2
-
-bilinear_filter_vw_m: times 8 dw 1
- times 8 dw 2
- times 8 dw 3
- times 8 dw 4
- times 8 dw 5
- times 8 dw 6
- times 8 dw 7
-
-bilinear_filter_vb_m: times 8 db 7, 1
- times 8 db 6, 2
- times 8 db 5, 3
- times 8 db 4, 4
- times 8 db 3, 5
- times 8 db 2, 6
- times 8 db 1, 7
-
-%ifdef PIC
-%define fourtap_filter_hw picregq
-%define sixtap_filter_hw picregq
-%define fourtap_filter_hb picregq
-%define sixtap_filter_hb picregq
-%define fourtap_filter_v picregq
-%define sixtap_filter_v picregq
-%define bilinear_filter_vw picregq
-%define bilinear_filter_vb picregq
-%define npicregs 1
-%else
-%define fourtap_filter_hw fourtap_filter_hw_m
-%define sixtap_filter_hw sixtap_filter_hw_m
-%define fourtap_filter_hb fourtap_filter_hb_m
-%define sixtap_filter_hb sixtap_filter_hb_m
-%define fourtap_filter_v fourtap_filter_v_m
-%define sixtap_filter_v sixtap_filter_v_m
-%define bilinear_filter_vw bilinear_filter_vw_m
-%define bilinear_filter_vb bilinear_filter_vb_m
-%define npicregs 0
-%endif
-
-filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-
-filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
-filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
-filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
-
-pw_256: times 8 dw 256
-pw_20091: times 4 dw 20091
-pw_17734: times 4 dw 17734
-
-cextern pw_3
-cextern pw_4
-cextern pw_64
-
-SECTION .text
-
-;-----------------------------------------------------------------------------
-; subpel MC functions:
-;
-; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
-; uint8_t *src, int srcstride,
-; int height, int mx, int my);
-;-----------------------------------------------------------------------------
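Each of the filter tables above stores one VP8 subpel kernel per fractional position. As a point of reference, the 6-tap horizontal routines below amount to the following per-row scalar loop (illustrative; filter[] stands for the six taps of whichever table row is selected, and the rounding matches the +64, >>7 / pmulhrsw pw_256 used in the asm):

    static void epel_h6_ref(uint8_t *dst, const uint8_t *src,
                            const int16_t filter[6], int width)
    {
        for (int x = 0; x < width; x++) {
            int sum = 64;                               /* rounding bias */
            for (int i = 0; i < 6; i++)
                sum += filter[i] * src[x + i - 2];
            sum >>= 7;
            dst[x] = sum < 0 ? 0 : sum > 255 ? 255 : sum;  /* packuswb clip */
        }
    }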
-
-%macro FILTER_SSSE3 1
-cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
- lea mxd, [mxq*3]
- mova m3, [filter_h6_shuf2]
- mova m4, [filter_h6_shuf3]
-%ifdef PIC
- lea picregq, [sixtap_filter_hb_m]
-%endif
- mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
- mova m6, [sixtap_filter_hb+mxq*8-32]
- mova m7, [sixtap_filter_hb+mxq*8-16]
-
-.nextrow:
- movu m0, [srcq-2]
- mova m1, m0
- mova m2, m0
-%if mmsize == 8
-; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
-; shuffle with a memory operand
- punpcklbw m0, [srcq+3]
-%else
- pshufb m0, [filter_h6_shuf1]
-%endif
- pshufb m1, m3
- pshufb m2, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m6
- pmaddubsw m2, m7
- paddsw m0, m1
- paddsw m0, m2
- pmulhrsw m0, [pw_256]
- packuswb m0, m0
- movh [dstq], m0 ; store
-
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- REP_RET
-
-cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
- shl mxd, 4
- mova m2, [pw_256]
- mova m3, [filter_h2_shuf]
- mova m4, [filter_h4_shuf]
-%ifdef PIC
- lea picregq, [fourtap_filter_hb_m]
-%endif
- mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
- mova m6, [fourtap_filter_hb+mxq]
-
-.nextrow:
- movu m0, [srcq-1]
- mova m1, m0
- pshufb m0, m3
- pshufb m1, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m6
- paddsw m0, m1
- pmulhrsw m0, m2
- packuswb m0, m0
- movh [dstq], m0 ; store
-
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- REP_RET
-
-cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
- shl myd, 4
-%ifdef PIC
- lea picregq, [fourtap_filter_hb_m]
-%endif
- mova m5, [fourtap_filter_hb+myq-16]
- mova m6, [fourtap_filter_hb+myq]
- mova m7, [pw_256]
-
- ; read 3 lines
- sub srcq, srcstrideq
- movh m0, [srcq]
- movh m1, [srcq+ srcstrideq]
- movh m2, [srcq+2*srcstrideq]
- add srcq, srcstrideq
-
-.nextrow:
- movh m3, [srcq+2*srcstrideq] ; read new row
- mova m4, m0
- mova m0, m1
- punpcklbw m4, m1
- mova m1, m2
- punpcklbw m2, m3
- pmaddubsw m4, m5
- pmaddubsw m2, m6
- paddsw m4, m2
- mova m2, m3
- pmulhrsw m4, m7
- packuswb m4, m4
- movh [dstq], m4
-
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- REP_RET
-
-cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
- lea myd, [myq*3]
-%ifdef PIC
- lea picregq, [sixtap_filter_hb_m]
-%endif
- lea myq, [sixtap_filter_hb+myq*8]
-
- ; read 5 lines
- sub srcq, srcstrideq
- sub srcq, srcstrideq
- movh m0, [srcq]
- movh m1, [srcq+srcstrideq]
- movh m2, [srcq+srcstrideq*2]
- lea srcq, [srcq+srcstrideq*2]
- add srcq, srcstrideq
- movh m3, [srcq]
- movh m4, [srcq+srcstrideq]
-
-.nextrow:
- movh m5, [srcq+2*srcstrideq] ; read new row
- mova m6, m0
- punpcklbw m6, m5
- mova m0, m1
- punpcklbw m1, m2
- mova m7, m3
- punpcklbw m7, m4
- pmaddubsw m6, [myq-48]
- pmaddubsw m1, [myq-32]
- pmaddubsw m7, [myq-16]
- paddsw m6, m1
- paddsw m6, m7
- mova m1, m2
- mova m2, m3
- pmulhrsw m6, [pw_256]
- mova m3, m4
- packuswb m6, m6
- mova m4, m5
- movh [dstq], m6
-
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- REP_RET
-%endmacro
-
-INIT_MMX ssse3
-FILTER_SSSE3 4
-INIT_XMM ssse3
-FILTER_SSSE3 8
-
-; 4x4 block, H-only 4-tap filter
-INIT_MMX mmxext
-cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
- shl mxd, 4
-%ifdef PIC
- lea picregq, [fourtap_filter_hw_m]
-%endif
- movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
- movq mm5, [fourtap_filter_hw+mxq]
- movq mm7, [pw_64]
- pxor mm6, mm6
-
-.nextrow:
- movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels
-
- ; first set of 2 pixels
- movq mm2, mm1 ; byte ABCD..
- punpcklbw mm1, mm6 ; byte->word ABCD
- pshufw mm0, mm2, 9 ; byte CDEF..
- punpcklbw mm0, mm6 ; byte->word CDEF
- pshufw mm3, mm1, 0x94 ; word ABBC
- pshufw mm1, mm0, 0x94 ; word CDDE
- pmaddwd mm3, mm4 ; multiply 2px with F0/F1
- movq mm0, mm1 ; backup for second set of pixels
- pmaddwd mm1, mm5 ; multiply 2px with F2/F3
- paddd mm3, mm1 ; finish 1st 2px
-
- ; second set of 2 pixels, use backup of above
- punpckhbw mm2, mm6 ; byte->word EFGH
- pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
- pshufw mm1, mm2, 0x94 ; word EFFG
- pmaddwd mm1, mm5 ; multiply 2px with F2/F3
- paddd mm0, mm1 ; finish 2nd 2px
-
- ; merge two sets of 2 pixels into one set of 4, round/clip/store
- packssdw mm3, mm0 ; merge dword->word (4px)
- paddsw mm3, mm7 ; rounding
- psraw mm3, 7
- packuswb mm3, mm6 ; clip and word->bytes
- movd [dstq], mm3 ; store
-
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- REP_RET
-
-; 4x4 block, H-only 6-tap filter
-INIT_MMX mmxext
-cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
- lea mxd, [mxq*3]
-%ifdef PIC
- lea picregq, [sixtap_filter_hw_m]
-%endif
- movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
- movq mm5, [sixtap_filter_hw+mxq*8-32]
- movq mm6, [sixtap_filter_hw+mxq*8-16]
- movq mm7, [pw_64]
- pxor mm3, mm3
-
-.nextrow:
- movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels
-
- ; first set of 2 pixels
- movq mm2, mm1 ; byte ABCD..
- punpcklbw mm1, mm3 ; byte->word ABCD
- pshufw mm0, mm2, 0x9 ; byte CDEF..
- punpckhbw mm2, mm3 ; byte->word EFGH
- punpcklbw mm0, mm3 ; byte->word CDEF
- pshufw mm1, mm1, 0x94 ; word ABBC
- pshufw mm2, mm2, 0x94 ; word EFFG
- pmaddwd mm1, mm4 ; multiply 2px with F0/F1
- pshufw mm3, mm0, 0x94 ; word CDDE
- movq mm0, mm3 ; backup for second set of pixels
- pmaddwd mm3, mm5 ; multiply 2px with F2/F3
- paddd mm1, mm3 ; add to 1st 2px cache
- movq mm3, mm2 ; backup for second set of pixels
- pmaddwd mm2, mm6 ; multiply 2px with F4/F5
- paddd mm1, mm2 ; finish 1st 2px
-
- ; second set of 2 pixels, use backup of above
- movd mm2, [srcq+3] ; byte FGHI (prevent overreads)
- pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
- pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
- paddd mm0, mm3 ; add to 2nd 2px cache
- pxor mm3, mm3
- punpcklbw mm2, mm3 ; byte->word FGHI
- pshufw mm2, mm2, 0xE9 ; word GHHI
- pmaddwd mm2, mm6 ; multiply 2px with F4/F5
- paddd mm0, mm2 ; finish 2nd 2px
-
- ; merge two sets of 2 pixels into one set of 4, round/clip/store
- packssdw mm1, mm0 ; merge dword->word (4px)
- paddsw mm1, mm7 ; rounding
- psraw mm1, 7
- packuswb mm1, mm3 ; clip and word->bytes
- movd [dstq], mm1 ; store
-
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- REP_RET
-
-INIT_XMM sse2
-cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
- shl mxd, 5
-%ifdef PIC
- lea picregq, [fourtap_filter_v_m]
-%endif
- lea mxq, [fourtap_filter_v+mxq-32]
- pxor m7, m7
- mova m4, [pw_64]
- mova m5, [mxq+ 0]
- mova m6, [mxq+16]
-%ifdef m8
- mova m8, [mxq+32]
- mova m9, [mxq+48]
-%endif
-.nextrow:
- movq m0, [srcq-1]
- movq m1, [srcq-0]
- movq m2, [srcq+1]
- movq m3, [srcq+2]
- punpcklbw m0, m7
- punpcklbw m1, m7
- punpcklbw m2, m7
- punpcklbw m3, m7
- pmullw m0, m5
- pmullw m1, m6
-%ifdef m8
- pmullw m2, m8
- pmullw m3, m9
-%else
- pmullw m2, [mxq+32]
- pmullw m3, [mxq+48]
-%endif
- paddsw m0, m1
- paddsw m2, m3
- paddsw m0, m2
- paddsw m0, m4
- psraw m0, 7
- packuswb m0, m7
- movh [dstq], m0 ; store
-
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- REP_RET
-
-INIT_XMM sse2
-cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
- lea mxd, [mxq*3]
- shl mxd, 4
-%ifdef PIC
- lea picregq, [sixtap_filter_v_m]
-%endif
- lea mxq, [sixtap_filter_v+mxq-96]
- pxor m7, m7
- mova m6, [pw_64]
-%ifdef m8
- mova m8, [mxq+ 0]
- mova m9, [mxq+16]
- mova m10, [mxq+32]
- mova m11, [mxq+48]
- mova m12, [mxq+64]
- mova m13, [mxq+80]
-%endif
-.nextrow:
- movq m0, [srcq-2]
- movq m1, [srcq-1]
- movq m2, [srcq-0]
- movq m3, [srcq+1]
- movq m4, [srcq+2]
- movq m5, [srcq+3]
- punpcklbw m0, m7
- punpcklbw m1, m7
- punpcklbw m2, m7
- punpcklbw m3, m7
- punpcklbw m4, m7
- punpcklbw m5, m7
-%ifdef m8
- pmullw m0, m8
- pmullw m1, m9
- pmullw m2, m10
- pmullw m3, m11
- pmullw m4, m12
- pmullw m5, m13
-%else
- pmullw m0, [mxq+ 0]
- pmullw m1, [mxq+16]
- pmullw m2, [mxq+32]
- pmullw m3, [mxq+48]
- pmullw m4, [mxq+64]
- pmullw m5, [mxq+80]
-%endif
- paddsw m1, m4
- paddsw m0, m5
- paddsw m1, m2
- paddsw m0, m3
- paddsw m0, m1
- paddsw m0, m6
- psraw m0, 7
- packuswb m0, m7
- movh [dstq], m0 ; store
-
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- REP_RET
-
-%macro FILTER_V 1
-; 4x4 block, V-only 4-tap filter
-cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
- shl myd, 5
-%ifdef PIC
- lea picregq, [fourtap_filter_v_m]
-%endif
- lea myq, [fourtap_filter_v+myq-32]
- mova m6, [pw_64]
- pxor m7, m7
- mova m5, [myq+48]
-
- ; read 3 lines
- sub srcq, srcstrideq
- movh m0, [srcq]
- movh m1, [srcq+ srcstrideq]
- movh m2, [srcq+2*srcstrideq]
- add srcq, srcstrideq
- punpcklbw m0, m7
- punpcklbw m1, m7
- punpcklbw m2, m7
-
-.nextrow:
- ; first calculate negative taps (to prevent losing positive overflows)
- movh m4, [srcq+2*srcstrideq] ; read new row
- punpcklbw m4, m7
- mova m3, m4
- pmullw m0, [myq+0]
- pmullw m4, m5
- paddsw m4, m0
-
- ; then calculate positive taps
- mova m0, m1
- pmullw m1, [myq+16]
- paddsw m4, m1
- mova m1, m2
- pmullw m2, [myq+32]
- paddsw m4, m2
- mova m2, m3
-
- ; round/clip/store
- paddsw m4, m6
- psraw m4, 7
- packuswb m4, m7
- movh [dstq], m4
-
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- REP_RET
-
-
-; 4x4 block, V-only 6-tap filter
-cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
- shl myd, 4
- lea myq, [myq*3]
-%ifdef PIC
- lea picregq, [sixtap_filter_v_m]
-%endif
- lea myq, [sixtap_filter_v+myq-96]
- pxor m7, m7
-
- ; read 5 lines
- sub srcq, srcstrideq
- sub srcq, srcstrideq
- movh m0, [srcq]
- movh m1, [srcq+srcstrideq]
- movh m2, [srcq+srcstrideq*2]
- lea srcq, [srcq+srcstrideq*2]
- add srcq, srcstrideq
- movh m3, [srcq]
- movh m4, [srcq+srcstrideq]
- punpcklbw m0, m7
- punpcklbw m1, m7
- punpcklbw m2, m7
- punpcklbw m3, m7
- punpcklbw m4, m7
-
-.nextrow:
- ; first calculate negative taps (to prevent losing positive overflows)
- mova m5, m1
- pmullw m5, [myq+16]
- mova m6, m4
- pmullw m6, [myq+64]
- paddsw m6, m5
-
- ; then calculate positive taps
- movh m5, [srcq+2*srcstrideq] ; read new row
- punpcklbw m5, m7
- pmullw m0, [myq+0]
- paddsw m6, m0
- mova m0, m1
- mova m1, m2
- pmullw m2, [myq+32]
- paddsw m6, m2
- mova m2, m3
- pmullw m3, [myq+48]
- paddsw m6, m3
- mova m3, m4
- mova m4, m5
- pmullw m5, [myq+80]
- paddsw m6, m5
-
- ; round/clip/store
- paddsw m6, [pw_64]
- psraw m6, 7
- packuswb m6, m7
- movh [dstq], m6
-
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-FILTER_V 4
-INIT_XMM sse2
-FILTER_V 8
-
-%macro FILTER_BILINEAR 1
-cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
- shl myd, 4
-%ifdef PIC
- lea picregq, [bilinear_filter_vw_m]
-%endif
- pxor m6, m6
- mova m5, [bilinear_filter_vw+myq-1*16]
- neg myq
- mova m4, [bilinear_filter_vw+myq+7*16]
-.nextrow:
- movh m0, [srcq+srcstrideq*0]
- movh m1, [srcq+srcstrideq*1]
- movh m3, [srcq+srcstrideq*2]
- punpcklbw m0, m6
- punpcklbw m1, m6
- punpcklbw m3, m6
- mova m2, m1
- pmullw m0, m4
- pmullw m1, m5
- pmullw m2, m4
- pmullw m3, m5
- paddsw m0, m1
- paddsw m2, m3
- psraw m0, 2
- psraw m2, 2
- pavgw m0, m6
- pavgw m2, m6
-%if mmsize == 8
- packuswb m0, m0
- packuswb m2, m2
- movh [dstq+dststrideq*0], m0
- movh [dstq+dststrideq*1], m2
-%else
- packuswb m0, m2
- movh [dstq+dststrideq*0], m0
- movhps [dstq+dststrideq*1], m0
-%endif
-
- lea dstq, [dstq+dststrideq*2]
- lea srcq, [srcq+srcstrideq*2]
- sub heightd, 2
- jg .nextrow
- REP_RET
-
-cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
- shl mxd, 4
-%ifdef PIC
- lea picregq, [bilinear_filter_vw_m]
-%endif
- pxor m6, m6
- mova m5, [bilinear_filter_vw+mxq-1*16]
- neg mxq
- mova m4, [bilinear_filter_vw+mxq+7*16]
-.nextrow:
- movh m0, [srcq+srcstrideq*0+0]
- movh m1, [srcq+srcstrideq*0+1]
- movh m2, [srcq+srcstrideq*1+0]
- movh m3, [srcq+srcstrideq*1+1]
- punpcklbw m0, m6
- punpcklbw m1, m6
- punpcklbw m2, m6
- punpcklbw m3, m6
- pmullw m0, m4
- pmullw m1, m5
- pmullw m2, m4
- pmullw m3, m5
- paddsw m0, m1
- paddsw m2, m3
- psraw m0, 2
- psraw m2, 2
- pavgw m0, m6
- pavgw m2, m6
-%if mmsize == 8
- packuswb m0, m0
- packuswb m2, m2
- movh [dstq+dststrideq*0], m0
- movh [dstq+dststrideq*1], m2
-%else
- packuswb m0, m2
- movh [dstq+dststrideq*0], m0
- movhps [dstq+dststrideq*1], m0
-%endif
-
- lea dstq, [dstq+dststrideq*2]
- lea srcq, [srcq+srcstrideq*2]
- sub heightd, 2
- jg .nextrow
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-FILTER_BILINEAR 4
-INIT_XMM sse2
-FILTER_BILINEAR 8
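The pmullw / psraw 2 / pavgw sequence in the macro above is equivalent to VP8's bilinear interpolation, (a*(8-f) + b*f + 4) >> 3, with f = my for the _v variant (two source rows) and f = mx for the _h variant (two adjacent columns); the ssse3 byte-filter version below computes the same thing with pmaddubsw. A scalar sketch of the vertical case:

    static void bilin_v_ref(uint8_t *dst, const uint8_t *row0, const uint8_t *row1,
                            int f, int width)           /* f in 1..7 */
    {
        for (int x = 0; x < width; x++)
            dst[x] = (row0[x] * (8 - f) + row1[x] * f + 4) >> 3;
    }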
-
-%macro FILTER_BILINEAR_SSSE3 1
-cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
- shl myd, 4
-%ifdef PIC
- lea picregq, [bilinear_filter_vb_m]
-%endif
- pxor m4, m4
- mova m3, [bilinear_filter_vb+myq-16]
-.nextrow:
- movh m0, [srcq+srcstrideq*0]
- movh m1, [srcq+srcstrideq*1]
- movh m2, [srcq+srcstrideq*2]
- punpcklbw m0, m1
- punpcklbw m1, m2
- pmaddubsw m0, m3
- pmaddubsw m1, m3
- psraw m0, 2
- psraw m1, 2
- pavgw m0, m4
- pavgw m1, m4
-%if mmsize==8
- packuswb m0, m0
- packuswb m1, m1
- movh [dstq+dststrideq*0], m0
- movh [dstq+dststrideq*1], m1
-%else
- packuswb m0, m1
- movh [dstq+dststrideq*0], m0
- movhps [dstq+dststrideq*1], m0
-%endif
-
- lea dstq, [dstq+dststrideq*2]
- lea srcq, [srcq+srcstrideq*2]
- sub heightd, 2
- jg .nextrow
- REP_RET
-
-cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
- shl mxd, 4
-%ifdef PIC
- lea picregq, [bilinear_filter_vb_m]
-%endif
- pxor m4, m4
- mova m2, [filter_h2_shuf]
- mova m3, [bilinear_filter_vb+mxq-16]
-.nextrow:
- movu m0, [srcq+srcstrideq*0]
- movu m1, [srcq+srcstrideq*1]
- pshufb m0, m2
- pshufb m1, m2
- pmaddubsw m0, m3
- pmaddubsw m1, m3
- psraw m0, 2
- psraw m1, 2
- pavgw m0, m4
- pavgw m1, m4
-%if mmsize==8
- packuswb m0, m0
- packuswb m1, m1
- movh [dstq+dststrideq*0], m0
- movh [dstq+dststrideq*1], m1
-%else
- packuswb m0, m1
- movh [dstq+dststrideq*0], m0
- movhps [dstq+dststrideq*1], m0
-%endif
-
- lea dstq, [dstq+dststrideq*2]
- lea srcq, [srcq+srcstrideq*2]
- sub heightd, 2
- jg .nextrow
- REP_RET
-%endmacro
-
-INIT_MMX ssse3
-FILTER_BILINEAR_SSSE3 4
-INIT_XMM ssse3
-FILTER_BILINEAR_SSSE3 8
-
-INIT_MMX mmx
-cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
-.nextrow:
- movq mm0, [srcq+srcstrideq*0]
- movq mm1, [srcq+srcstrideq*1]
- lea srcq, [srcq+srcstrideq*2]
- movq [dstq+dststrideq*0], mm0
- movq [dstq+dststrideq*1], mm1
- lea dstq, [dstq+dststrideq*2]
- sub heightd, 2
- jg .nextrow
- REP_RET
-
-%if ARCH_X86_32
-INIT_MMX mmx
-cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
-.nextrow:
- movq mm0, [srcq+srcstrideq*0+0]
- movq mm1, [srcq+srcstrideq*0+8]
- movq mm2, [srcq+srcstrideq*1+0]
- movq mm3, [srcq+srcstrideq*1+8]
- lea srcq, [srcq+srcstrideq*2]
- movq [dstq+dststrideq*0+0], mm0
- movq [dstq+dststrideq*0+8], mm1
- movq [dstq+dststrideq*1+0], mm2
- movq [dstq+dststrideq*1+8], mm3
- lea dstq, [dstq+dststrideq*2]
- sub heightd, 2
- jg .nextrow
- REP_RET
-%endif
-
-INIT_XMM sse
-cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
-.nextrow:
- movups xmm0, [srcq+srcstrideq*0]
- movups xmm1, [srcq+srcstrideq*1]
- lea srcq, [srcq+srcstrideq*2]
- movaps [dstq+dststrideq*0], xmm0
- movaps [dstq+dststrideq*1], xmm1
- lea dstq, [dstq+dststrideq*2]
- sub heightd, 2
- jg .nextrow
- REP_RET
-
-;-----------------------------------------------------------------------------
-; void vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
-;-----------------------------------------------------------------------------
-
-%macro ADD_DC 4
- %4 m2, [dst1q+%3]
- %4 m3, [dst1q+strideq+%3]
- %4 m4, [dst2q+%3]
- %4 m5, [dst2q+strideq+%3]
- paddusb m2, %1
- paddusb m3, %1
- paddusb m4, %1
- paddusb m5, %1
- psubusb m2, %2
- psubusb m3, %2
- psubusb m4, %2
- psubusb m5, %2
- %4 [dst1q+%3], m2
- %4 [dst1q+strideq+%3], m3
- %4 [dst2q+%3], m4
- %4 [dst2q+strideq+%3], m5
-%endmacro
-
-INIT_MMX mmx
-cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
- ; load data
- movd m0, [blockq]
-
- ; calculate DC
- paddw m0, [pw_4]
- pxor m1, m1
- psraw m0, 3
- movd [blockq], m1
- psubw m1, m0
- packuswb m0, m0
- packuswb m1, m1
- punpcklbw m0, m0
- punpcklbw m1, m1
- punpcklwd m0, m0
- punpcklwd m1, m1
-
- ; add DC
- DEFINE_ARGS dst1, dst2, stride
- lea dst2q, [dst1q+strideq*2]
- ADD_DC m0, m1, 0, movh
- RET
-
-INIT_XMM sse4
-cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
- ; load data
- movd m0, [blockq]
- pxor m1, m1
-
- ; calculate DC
- paddw m0, [pw_4]
- movd [blockq], m1
- DEFINE_ARGS dst1, dst2, stride
- lea dst2q, [dst1q+strideq*2]
- movd m2, [dst1q]
- movd m3, [dst1q+strideq]
- movd m4, [dst2q]
- movd m5, [dst2q+strideq]
- psraw m0, 3
- pshuflw m0, m0, 0
- punpcklqdq m0, m0
- punpckldq m2, m3
- punpckldq m4, m5
- punpcklbw m2, m1
- punpcklbw m4, m1
- paddw m2, m0
- paddw m4, m0
- packuswb m2, m4
- movd [dst1q], m2
- pextrd [dst1q+strideq], m2, 1
- pextrd [dst2q], m2, 2
- pextrd [dst2q+strideq], m2, 3
- RET
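Scalar sketch of the DC-only path implemented twice above (mmx and sse4): the DC is rounded as (block[0] + 4) >> 3, then added to the 4x4 block with a clip to 0..255, which is what the paddw pw_4 / psraw 3 and the saturating packs/adds do:

    static void vp8_idct_dc_add_ref(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
    {
        int dc = (block[0] + 4) >> 3;
        block[0] = 0;
        for (int y = 0; y < 4; y++) {
            for (int x = 0; x < 4; x++) {
                int v = dst[x] + dc;
                dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
            }
            dst += stride;
        }
    }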
-
-;-----------------------------------------------------------------------------
-; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
-;-----------------------------------------------------------------------------
-
-%if ARCH_X86_32
-INIT_MMX mmx
-cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
- ; load data
- movd m0, [blockq+32*0] ; A
- movd m1, [blockq+32*2] ; C
- punpcklwd m0, [blockq+32*1] ; A B
- punpcklwd m1, [blockq+32*3] ; C D
- punpckldq m0, m1 ; A B C D
- pxor m6, m6
-
- ; calculate DC
- paddw m0, [pw_4]
- movd [blockq+32*0], m6
- movd [blockq+32*1], m6
- movd [blockq+32*2], m6
- movd [blockq+32*3], m6
- psraw m0, 3
- psubw m6, m0
- packuswb m0, m0
- packuswb m6, m6
- punpcklbw m0, m0 ; AABBCCDD
- punpcklbw m6, m6 ; AABBCCDD
- movq m1, m0
- movq m7, m6
- punpcklbw m0, m0 ; AAAABBBB
- punpckhbw m1, m1 ; CCCCDDDD
- punpcklbw m6, m6 ; AAAABBBB
- punpckhbw m7, m7 ; CCCCDDDD
-
- ; add DC
- DEFINE_ARGS dst1, dst2, stride
- lea dst2q, [dst1q+strideq*2]
- ADD_DC m0, m6, 0, mova
- ADD_DC m1, m7, 8, mova
- RET
-%endif
-
-INIT_XMM sse2
-cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
- ; load data
- movd m0, [blockq+32*0] ; A
- movd m1, [blockq+32*2] ; C
- punpcklwd m0, [blockq+32*1] ; A B
- punpcklwd m1, [blockq+32*3] ; C D
- punpckldq m0, m1 ; A B C D
- pxor m1, m1
-
- ; calculate DC
- paddw m0, [pw_4]
- movd [blockq+32*0], m1
- movd [blockq+32*1], m1
- movd [blockq+32*2], m1
- movd [blockq+32*3], m1
- psraw m0, 3
- psubw m1, m0
- packuswb m0, m0
- packuswb m1, m1
- punpcklbw m0, m0
- punpcklbw m1, m1
- punpcklbw m0, m0
- punpcklbw m1, m1
-
- ; add DC
- DEFINE_ARGS dst1, dst2, stride
- lea dst2q, [dst1q+strideq*2]
- ADD_DC m0, m1, 0, mova
- RET
-
-;-----------------------------------------------------------------------------
-; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmx
-cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
- ; load data
- movd m0, [blockq+32*0] ; A
- movd m1, [blockq+32*2] ; C
- punpcklwd m0, [blockq+32*1] ; A B
- punpcklwd m1, [blockq+32*3] ; C D
- punpckldq m0, m1 ; A B C D
- pxor m6, m6
-
- ; calculate DC
- paddw m0, [pw_4]
- movd [blockq+32*0], m6
- movd [blockq+32*1], m6
- movd [blockq+32*2], m6
- movd [blockq+32*3], m6
- psraw m0, 3
- psubw m6, m0
- packuswb m0, m0
- packuswb m6, m6
- punpcklbw m0, m0 ; AABBCCDD
- punpcklbw m6, m6 ; AABBCCDD
- movq m1, m0
- movq m7, m6
- punpcklbw m0, m0 ; AAAABBBB
- punpckhbw m1, m1 ; CCCCDDDD
- punpcklbw m6, m6 ; AAAABBBB
- punpckhbw m7, m7 ; CCCCDDDD
-
- ; add DC
- DEFINE_ARGS dst1, dst2, stride
- lea dst2q, [dst1q+strideq*2]
- ADD_DC m0, m6, 0, mova
- lea dst1q, [dst1q+strideq*4]
- lea dst2q, [dst2q+strideq*4]
- ADD_DC m1, m7, 0, mova
- RET
-
-;-----------------------------------------------------------------------------
-; void vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
-;-----------------------------------------------------------------------------
-
-; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
-; this macro assumes that m6/m7 have words for 20091/17734 loaded
-%macro VP8_MULTIPLY_SUMSUB 4
- mova %3, %1
- mova %4, %2
- pmulhw %3, m6 ;20091(1)
- pmulhw %4, m6 ;20091(2)
- paddw %3, %1
- paddw %4, %2
- paddw %1, %1
- paddw %2, %2
- pmulhw %1, m7 ;35468(1)
- pmulhw %2, m7 ;35468(2)
- psubw %1, %4
- paddw %2, %3
-%endmacro
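The two fixed-point products used above come from the VP8 inverse DCT constants: 20091 ~ (cos(pi/8)*sqrt(2) - 1) * 65536 and 35468 ~ sin(pi/8)*sqrt(2) * 65536. Because 35468 does not fit a signed pmulhw operand, the macro doubles the input and multiplies by 17734 instead. In scalar form (a sketch of the macro's arithmetic):

    static inline int mul_20091(int x) { return x + ((x * 20091) >> 16); }
    static inline int mul_35468(int x) { return ((2 * x) * 17734) >> 16; }  /* == (x * 35468) >> 16 */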
-
-; calculate x0=%1+%3; x1=%1-%3
-; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
-; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
-; %5/%6 are temporary registers
-; we assume m6/m7 have constant words 20091/17734 loaded in them
-%macro VP8_IDCT_TRANSFORM4x4_1D 6
- SUMSUB_BA w, %3, %1, %5 ;t0, t1
- VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
- SUMSUB_BA w, %4, %3, %5 ;tmp0, tmp3
- SUMSUB_BA w, %2, %1, %5 ;tmp1, tmp2
- SWAP %4, %1
- SWAP %4, %3
-%endmacro
-
-%macro VP8_IDCT_ADD 0
-cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
- ; load block data
- movq m0, [blockq+ 0]
- movq m1, [blockq+ 8]
- movq m2, [blockq+16]
- movq m3, [blockq+24]
- movq m6, [pw_20091]
- movq m7, [pw_17734]
-%if cpuflag(sse)
- xorps xmm0, xmm0
- movaps [blockq+ 0], xmm0
- movaps [blockq+16], xmm0
-%else
- pxor m4, m4
- movq [blockq+ 0], m4
- movq [blockq+ 8], m4
- movq [blockq+16], m4
- movq [blockq+24], m4
-%endif
-
- ; actual IDCT
- VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
- TRANSPOSE4x4W 0, 1, 2, 3, 4
- paddw m0, [pw_4]
- VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
- TRANSPOSE4x4W 0, 1, 2, 3, 4
-
- ; store
- pxor m4, m4
- DEFINE_ARGS dst1, dst2, stride
- lea dst2q, [dst1q+2*strideq]
- STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
- STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
-
- RET
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX mmx
-VP8_IDCT_ADD
-%endif
-INIT_MMX sse
-VP8_IDCT_ADD
-
-;-----------------------------------------------------------------------------
-; void vp8_luma_dc_wht_<opt>(int16_t block[4][4][16], int16_t dc[16])
-;-----------------------------------------------------------------------------
-
-%macro SCATTER_WHT 3
- movd dc1d, m%1
- movd dc2d, m%2
- mov [blockq+2*16*(0+%3)], dc1w
- mov [blockq+2*16*(1+%3)], dc2w
- shr dc1d, 16
- shr dc2d, 16
- psrlq m%1, 32
- psrlq m%2, 32
- mov [blockq+2*16*(4+%3)], dc1w
- mov [blockq+2*16*(5+%3)], dc2w
- movd dc1d, m%1
- movd dc2d, m%2
- mov [blockq+2*16*(8+%3)], dc1w
- mov [blockq+2*16*(9+%3)], dc2w
- shr dc1d, 16
- shr dc2d, 16
- mov [blockq+2*16*(12+%3)], dc1w
- mov [blockq+2*16*(13+%3)], dc2w
-%endmacro
-
-%macro HADAMARD4_1D 4
- SUMSUB_BADC w, %2, %1, %4, %3
- SUMSUB_BADC w, %4, %2, %3, %1
- SWAP %1, %4, %3
-%endmacro
-
-%macro VP8_DC_WHT 0
-cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
- movq m0, [dc1q]
- movq m1, [dc1q+8]
- movq m2, [dc1q+16]
- movq m3, [dc1q+24]
-%if cpuflag(sse)
- xorps xmm0, xmm0
- movaps [dc1q+ 0], xmm0
- movaps [dc1q+16], xmm0
-%else
- pxor m4, m4
- movq [dc1q+ 0], m4
- movq [dc1q+ 8], m4
- movq [dc1q+16], m4
- movq [dc1q+24], m4
-%endif
- HADAMARD4_1D 0, 1, 2, 3
- TRANSPOSE4x4W 0, 1, 2, 3, 4
- paddw m0, [pw_3]
- HADAMARD4_1D 0, 1, 2, 3
- psraw m0, 3
- psraw m1, 3
- psraw m2, 3
- psraw m3, 3
- SCATTER_WHT 0, 1, 0
- SCATTER_WHT 2, 3, 2
- RET
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX mmx
-VP8_DC_WHT
-%endif
-INIT_MMX sse
-VP8_DC_WHT
diff --git a/ffmpeg/libavcodec/x86/vp8dsp_init.c b/ffmpeg/libavcodec/x86/vp8dsp_init.c
deleted file mode 100644
index dca00f5..0000000
--- a/ffmpeg/libavcodec/x86/vp8dsp_init.c
+++ /dev/null
@@ -1,441 +0,0 @@
-/*
- * VP8 DSP functions x86-optimized
- * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
- * Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/cpu.h"
-#include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/vp8dsp.h"
-
-#if HAVE_YASM
-
-/*
- * MC functions
- */
-void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-
-void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_epel8_h6_sse2 (uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_epel8_v4_sse2 (uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_epel8_v6_sse2 (uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-
-void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-
-void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-
-void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-
-
-void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride,
- uint8_t *src, ptrdiff_t srcstride,
- int height, int mx, int my);
-
-#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
-static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
- uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
- ptrdiff_t srcstride, int height, int mx, int my) \
-{ \
- ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
- dst, dststride, src, srcstride, height, mx, my); \
- ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
- dst + 8, dststride, src + 8, srcstride, height, mx, my); \
-}
-#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
-static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
- uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
- ptrdiff_t srcstride, int height, int mx, int my) \
-{ \
- ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
- dst, dststride, src, srcstride, height, mx, my); \
- ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
- dst + 4, dststride, src + 4, srcstride, height, mx, my); \
-}
-
-#if ARCH_X86_32
-TAP_W8 (mmxext, epel, h4)
-TAP_W8 (mmxext, epel, h6)
-TAP_W16(mmxext, epel, h6)
-TAP_W8 (mmxext, epel, v4)
-TAP_W8 (mmxext, epel, v6)
-TAP_W16(mmxext, epel, v6)
-TAP_W8 (mmxext, bilinear, h)
-TAP_W16(mmxext, bilinear, h)
-TAP_W8 (mmxext, bilinear, v)
-TAP_W16(mmxext, bilinear, v)
-#endif
-
-TAP_W16(sse2, epel, h6)
-TAP_W16(sse2, epel, v6)
-TAP_W16(sse2, bilinear, h)
-TAP_W16(sse2, bilinear, v)
-
-TAP_W16(ssse3, epel, h6)
-TAP_W16(ssse3, epel, v6)
-TAP_W16(ssse3, bilinear, h)
-TAP_W16(ssse3, bilinear, v)
-
-#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
-static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
- uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
- ptrdiff_t srcstride, int height, int mx, int my) \
-{ \
- DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
- uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
- src -= srcstride * (TAPNUMY / 2 - 1); \
- ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
- tmp, SIZE, src, srcstride, height + TAPNUMY - 1, mx, my); \
- ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
- dst, dststride, tmpptr, SIZE, height, mx, my); \
-}
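Each HVTAP instance chains the 1-D kernels: a horizontal pass into an aligned scratch buffer that is TAPNUMY - 1 rows taller than the destination, followed by a vertical pass over that buffer. For example, HVTAP(sse2, 16, 6, 6, 8, 16) expands to roughly this (spelled out for illustration):

    static void ff_put_vp8_epel8_h6v6_sse2(uint8_t *dst, ptrdiff_t dststride,
                                           uint8_t *src, ptrdiff_t srcstride,
                                           int height, int mx, int my)
    {
        DECLARE_ALIGNED(16, uint8_t, tmp)[8 * (16 + 6 - 1)];
        uint8_t *tmpptr = tmp + 8 * 2;      /* first output row of the vertical pass */
        src -= srcstride * 2;               /* 2 extra rows the 6-tap needs above */
        ff_put_vp8_epel8_h6_sse2(tmp, 8, src, srcstride, height + 5, mx, my);
        ff_put_vp8_epel8_v6_sse2(dst, dststride, tmpptr, 8, height, mx, my);
    }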
-
-#if ARCH_X86_32
-#define HVTAPMMX(x, y) \
-HVTAP(mmxext, 8, x, y, 4, 8) \
-HVTAP(mmxext, 8, x, y, 8, 16)
-
-HVTAP(mmxext, 8, 6, 6, 16, 16)
-#else
-#define HVTAPMMX(x, y) \
-HVTAP(mmxext, 8, x, y, 4, 8)
-#endif
-
-HVTAPMMX(4, 4)
-HVTAPMMX(4, 6)
-HVTAPMMX(6, 4)
-HVTAPMMX(6, 6)
-
-#define HVTAPSSE2(x, y, w) \
-HVTAP(sse2, 16, x, y, w, 16) \
-HVTAP(ssse3, 16, x, y, w, 16)
-
-HVTAPSSE2(4, 4, 8)
-HVTAPSSE2(4, 6, 8)
-HVTAPSSE2(6, 4, 8)
-HVTAPSSE2(6, 6, 8)
-HVTAPSSE2(6, 6, 16)
-
-HVTAP(ssse3, 16, 4, 4, 4, 8)
-HVTAP(ssse3, 16, 4, 6, 4, 8)
-HVTAP(ssse3, 16, 6, 4, 4, 8)
-HVTAP(ssse3, 16, 6, 6, 4, 8)
-
-#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
-static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
- uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
- ptrdiff_t srcstride, int height, int mx, int my) \
-{ \
- DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \
- ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
- tmp, SIZE, src, srcstride, height + 1, mx, my); \
- ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
- dst, dststride, tmp, SIZE, height, mx, my); \
-}
-
-HVBILIN(mmxext, 8, 4, 8)
-#if ARCH_X86_32
-HVBILIN(mmxext, 8, 8, 16)
-HVBILIN(mmxext, 8, 16, 16)
-#endif
-HVBILIN(sse2, 8, 8, 16)
-HVBILIN(sse2, 8, 16, 16)
-HVBILIN(ssse3, 8, 4, 8)
-HVBILIN(ssse3, 8, 8, 16)
-HVBILIN(ssse3, 8, 16, 16)
-
-void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16],
- ptrdiff_t stride);
-void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16],
- ptrdiff_t stride);
-void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16],
- ptrdiff_t stride);
-void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16],
- ptrdiff_t stride);
-void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16],
- ptrdiff_t stride);
-void ff_vp8_luma_dc_wht_mmx(int16_t block[4][4][16], int16_t dc[16]);
-void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]);
-void ff_vp8_idct_add_mmx(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
-void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
-
-#define DECLARE_LOOP_FILTER(NAME) \
-void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \
- ptrdiff_t stride, \
- int flim); \
-void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \
- ptrdiff_t stride, \
- int flim); \
-void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
- ptrdiff_t stride, \
- int e, int i, int hvt); \
-void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
- ptrdiff_t stride, \
- int e, int i, int hvt); \
-void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
- uint8_t *dstV, \
- ptrdiff_t s, \
- int e, int i, int hvt); \
-void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
- uint8_t *dstV, \
- ptrdiff_t s, \
- int e, int i, int hvt); \
-void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
- ptrdiff_t stride, \
- int e, int i, int hvt); \
-void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
- ptrdiff_t stride, \
- int e, int i, int hvt); \
-void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
- uint8_t *dstV, \
- ptrdiff_t s, \
- int e, int i, int hvt); \
-void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
- uint8_t *dstV, \
- ptrdiff_t s, \
- int e, int i, int hvt);
-
-DECLARE_LOOP_FILTER(mmx)
-DECLARE_LOOP_FILTER(mmxext)
-DECLARE_LOOP_FILTER(sse2)
-DECLARE_LOOP_FILTER(ssse3)
-DECLARE_LOOP_FILTER(sse4)
-
-#endif /* HAVE_YASM */
-
-#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
- c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
- c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
- c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT
-
-#define VP8_MC_FUNC(IDX, SIZE, OPT) \
- c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
- c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
- c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
- c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
- c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
- VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)
-
-#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
- c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
- c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
- c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
- c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
- c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
- c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
- c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
- c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT
-
-
-av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
-{
-#if HAVE_YASM
- int cpu_flags = av_get_cpu_flags();
-
- if (EXTERNAL_MMX(cpu_flags)) {
- c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
- c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
-#if ARCH_X86_32
- c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
- c->vp8_idct_add = ff_vp8_idct_add_mmx;
- c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
- c->put_vp8_epel_pixels_tab[0][0][0] =
- c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
-#endif
- c->put_vp8_epel_pixels_tab[1][0][0] =
- c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
-
-#if ARCH_X86_32
- c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
- c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
-
- c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
- c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
- c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
- c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;
-
- c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx;
- c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx;
- c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx;
- c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx;
-#endif
- }
-
- /* note that 4-tap width=16 functions are missing because w=16
- * is only used for luma, and luma is always a copy or sixtap. */
- if (EXTERNAL_MMXEXT(cpu_flags)) {
- VP8_MC_FUNC(2, 4, mmxext);
- VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
-#if ARCH_X86_32
- VP8_LUMA_MC_FUNC(0, 16, mmxext);
- VP8_MC_FUNC(1, 8, mmxext);
- VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
- VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
-
- c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
- c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
-
- c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
- c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
- c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
- c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;
-
- c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
- c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
- c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
- c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
-#endif
- }
-
- if (EXTERNAL_SSE(cpu_flags)) {
- c->vp8_idct_add = ff_vp8_idct_add_sse;
- c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
- c->put_vp8_epel_pixels_tab[0][0][0] =
- c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
- }
-
- if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
- VP8_LUMA_MC_FUNC(0, 16, sse2);
- VP8_MC_FUNC(1, 8, sse2);
- VP8_BILINEAR_MC_FUNC(0, 16, sse2);
- VP8_BILINEAR_MC_FUNC(1, 8, sse2);
-
- c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
-
- c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
- c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
-
- c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
- c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
- }
-
- if (EXTERNAL_SSE2(cpu_flags)) {
- c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
-
- c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
-
- c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
- c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
-
- c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
- c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
- }
-
- if (EXTERNAL_SSSE3(cpu_flags)) {
- VP8_LUMA_MC_FUNC(0, 16, ssse3);
- VP8_MC_FUNC(1, 8, ssse3);
- VP8_MC_FUNC(2, 4, ssse3);
- VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
- VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
- VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
-
- c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
- c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
-
- c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
- c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
- c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
- c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
-
- c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
- c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
- c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
- c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
- }
-
- if (EXTERNAL_SSE4(cpu_flags)) {
- c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
-
- c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
- c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
- c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
- }
-#endif /* HAVE_YASM */
-}
diff --git a/ffmpeg/libavcodec/x86/w64xmmtest.c b/ffmpeg/libavcodec/x86/w64xmmtest.c
deleted file mode 100644
index 25e833f..0000000
--- a/ffmpeg/libavcodec/x86/w64xmmtest.c
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * check XMM registers for clobbers on Win64
- * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/avcodec.h"
-#include "libavutil/x86/w64xmmtest.h"
-
-wrap(avcodec_open2(AVCodecContext *avctx,
- AVCodec *codec,
- AVDictionary **options))
-{
- testxmmclobbers(avcodec_open2, avctx, codec, options);
-}
-
-wrap(avcodec_decode_audio4(AVCodecContext *avctx,
- AVFrame *frame,
- int *got_frame_ptr,
- AVPacket *avpkt))
-{
- testxmmclobbers(avcodec_decode_audio4, avctx, frame,
- got_frame_ptr, avpkt);
-}
-
-wrap(avcodec_decode_video2(AVCodecContext *avctx,
- AVFrame *picture,
- int *got_picture_ptr,
- AVPacket *avpkt))
-{
- testxmmclobbers(avcodec_decode_video2, avctx, picture,
- got_picture_ptr, avpkt);
-}
-
-wrap(avcodec_decode_subtitle2(AVCodecContext *avctx,
- AVSubtitle *sub,
- int *got_sub_ptr,
- AVPacket *avpkt))
-{
- testxmmclobbers(avcodec_decode_subtitle2, avctx, sub,
- got_sub_ptr, avpkt);
-}
-
-wrap(avcodec_encode_audio2(AVCodecContext *avctx,
- AVPacket *avpkt,
- const AVFrame *frame,
- int *got_packet_ptr))
-{
- testxmmclobbers(avcodec_encode_audio2, avctx, avpkt, frame,
- got_packet_ptr);
-}
-
-wrap(avcodec_encode_video(AVCodecContext *avctx,
- uint8_t *buf, int buf_size,
- const AVFrame *pict))
-{
- testxmmclobbers(avcodec_encode_video, avctx, buf, buf_size, pict);
-}
-
-wrap(avcodec_encode_subtitle(AVCodecContext *avctx,
- uint8_t *buf, int buf_size,
- const AVSubtitle *sub))
-{
- testxmmclobbers(avcodec_encode_subtitle, avctx, buf, buf_size, sub);
-}
-
-wrap(avcodec_encode_video2(AVCodecContext *avctx, AVPacket *avpkt,
- const AVFrame *frame, int *got_packet_ptr))
-{
- testxmmclobbers(avcodec_encode_video2, avctx, avpkt, frame, got_packet_ptr);
-}