summaryrefslogtreecommitdiff
path: root/ffmpeg/libavfilter/x86
diff options
context:
space:
mode:
authorTim Redfern <tim@eclectronics.org>2013-09-05 17:57:22 +0100
committerTim Redfern <tim@eclectronics.org>2013-09-05 17:57:22 +0100
commit8992cb1d0d07edc33d274f6d7924ecdf6f83d994 (patch)
tree3a2c86846b7eec8137c1507e623fc7018f13d453 /ffmpeg/libavfilter/x86
parent741fb4b9e135cfb161a749db88713229038577bb (diff)
making act segmenter
Diffstat (limited to 'ffmpeg/libavfilter/x86')
-rw-r--r--ffmpeg/libavfilter/x86/Makefile8
-rw-r--r--ffmpeg/libavfilter/x86/af_volume.asm140
-rw-r--r--ffmpeg/libavfilter/x86/af_volume_init.c59
-rw-r--r--ffmpeg/libavfilter/x86/vf_gradfun.c217
-rw-r--r--ffmpeg/libavfilter/x86/vf_hqdn3d.asm106
-rw-r--r--ffmpeg/libavfilter/x86/vf_hqdn3d_init.c41
-rw-r--r--ffmpeg/libavfilter/x86/vf_yadif.asm252
-rw-r--r--ffmpeg/libavfilter/x86/vf_yadif_init.c100
-rw-r--r--ffmpeg/libavfilter/x86/yadif-10.asm282
-rw-r--r--ffmpeg/libavfilter/x86/yadif-16.asm347
10 files changed, 1552 insertions, 0 deletions
diff --git a/ffmpeg/libavfilter/x86/Makefile b/ffmpeg/libavfilter/x86/Makefile
new file mode 100644
index 0000000..cd97347
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/Makefile
@@ -0,0 +1,8 @@
+OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o
+OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o
+OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o
+OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o
+
+YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o
+YASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o
+YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o
diff --git a/ffmpeg/libavfilter/x86/af_volume.asm b/ffmpeg/libavfilter/x86/af_volume.asm
new file mode 100644
index 0000000..f4cbcbc
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/af_volume.asm
@@ -0,0 +1,140 @@
+;*****************************************************************************
+;* x86-optimized functions for volume filter
+;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pd_1_256: times 4 dq 0x3F70000000000000
+pd_int32_max: times 4 dq 0x41DFFFFFFFC00000
+pw_1: times 8 dw 1
+pw_128: times 8 dw 128
+pq_128: times 2 dq 128
+
+SECTION_TEXT
+
+;------------------------------------------------------------------------------
+; void ff_scale_samples_s16(uint8_t *dst, const uint8_t *src, int len,
+; int volume)
+;------------------------------------------------------------------------------
+
+INIT_XMM sse2
+cglobal scale_samples_s16, 4,4,4, dst, src, len, volume
+ movd m0, volumem
+ pshuflw m0, m0, 0
+ punpcklwd m0, [pw_1]
+ mova m1, [pw_128]
+ lea lenq, [lend*2-mmsize]
+.loop:
+ ; dst[i] = av_clip_int16((src[i] * volume + 128) >> 8);
+ mova m2, [srcq+lenq]
+ punpcklwd m3, m2, m1
+ punpckhwd m2, m1
+ pmaddwd m3, m0
+ pmaddwd m2, m0
+ psrad m3, 8
+ psrad m2, 8
+ packssdw m3, m2
+ mova [dstq+lenq], m3
+ sub lenq, mmsize
+ jge .loop
+ REP_RET
+
+;------------------------------------------------------------------------------
+; void ff_scale_samples_s32(uint8_t *dst, const uint8_t *src, int len,
+; int volume)
+;------------------------------------------------------------------------------
+
+%macro SCALE_SAMPLES_S32 0
+cglobal scale_samples_s32, 4,4,4, dst, src, len, volume
+%if ARCH_X86_32 && cpuflag(avx)
+ vbroadcastss xmm2, volumem
+%else
+ movd xmm2, volumed
+ pshufd xmm2, xmm2, 0
+%endif
+ CVTDQ2PD m2, xmm2
+ mulpd m2, m2, [pd_1_256]
+ mova m3, [pd_int32_max]
+ lea lenq, [lend*4-mmsize]
+.loop:
+ CVTDQ2PD m0, [srcq+lenq ]
+ CVTDQ2PD m1, [srcq+lenq+mmsize/2]
+ mulpd m0, m0, m2
+ mulpd m1, m1, m2
+ minpd m0, m0, m3
+ minpd m1, m1, m3
+ cvtpd2dq xmm0, m0
+ cvtpd2dq xmm1, m1
+%if cpuflag(avx)
+ vmovdqa [dstq+lenq ], xmm0
+ vmovdqa [dstq+lenq+mmsize/2], xmm1
+%else
+ movq [dstq+lenq ], xmm0
+ movq [dstq+lenq+mmsize/2], xmm1
+%endif
+ sub lenq, mmsize
+ jge .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+%define CVTDQ2PD cvtdq2pd
+SCALE_SAMPLES_S32
+%if HAVE_AVX_EXTERNAL
+%define CVTDQ2PD vcvtdq2pd
+INIT_YMM avx
+SCALE_SAMPLES_S32
+%endif
+%undef CVTDQ2PD
+
+; NOTE: This is not bit-identical with the C version because it clips to
+; [-INT_MAX, INT_MAX] instead of [INT_MIN, INT_MAX]
+
+INIT_XMM ssse3, atom
+cglobal scale_samples_s32, 4,4,8, dst, src, len, volume
+ movd m4, volumem
+ pshufd m4, m4, 0
+ mova m5, [pq_128]
+ pxor m6, m6
+ lea lenq, [lend*4-mmsize]
+.loop:
+ ; src[i] = av_clipl_int32((src[i] * volume + 128) >> 8);
+ mova m7, [srcq+lenq]
+ pabsd m3, m7
+ pshufd m0, m3, q0100
+ pshufd m1, m3, q0302
+ pmuludq m0, m4
+ pmuludq m1, m4
+ paddq m0, m5
+ paddq m1, m5
+ psrlq m0, 7
+ psrlq m1, 7
+ shufps m2, m0, m1, q3131
+ shufps m0, m0, m1, q2020
+ pcmpgtd m2, m6
+ por m0, m2
+ psrld m0, 1
+ psignd m0, m7
+ mova [dstq+lenq], m0
+ sub lenq, mmsize
+ jge .loop
+ REP_RET
diff --git a/ffmpeg/libavfilter/x86/af_volume_init.c b/ffmpeg/libavfilter/x86/af_volume_init.c
new file mode 100644
index 0000000..beee8ca
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/af_volume_init.c
@@ -0,0 +1,59 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/cpu.h"
+#include "libavutil/samplefmt.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/af_volume.h"
+
+void ff_scale_samples_s16_sse2(uint8_t *dst, const uint8_t *src, int len,
+ int volume);
+
+void ff_scale_samples_s32_sse2(uint8_t *dst, const uint8_t *src, int len,
+ int volume);
+void ff_scale_samples_s32_ssse3_atom(uint8_t *dst, const uint8_t *src, int len,
+ int volume);
+void ff_scale_samples_s32_avx(uint8_t *dst, const uint8_t *src, int len,
+ int volume);
+
+void ff_volume_init_x86(VolumeContext *vol)
+{
+ int mm_flags = av_get_cpu_flags();
+ enum AVSampleFormat sample_fmt = av_get_packed_sample_fmt(vol->sample_fmt);
+
+ if (sample_fmt == AV_SAMPLE_FMT_S16) {
+ if (EXTERNAL_SSE2(mm_flags) && vol->volume_i < 32768) {
+ vol->scale_samples = ff_scale_samples_s16_sse2;
+ vol->samples_align = 8;
+ }
+ } else if (sample_fmt == AV_SAMPLE_FMT_S32) {
+ if (EXTERNAL_SSE2(mm_flags)) {
+ vol->scale_samples = ff_scale_samples_s32_sse2;
+ vol->samples_align = 4;
+ }
+ if (EXTERNAL_SSSE3(mm_flags) && mm_flags & AV_CPU_FLAG_ATOM) {
+ vol->scale_samples = ff_scale_samples_s32_ssse3_atom;
+ vol->samples_align = 4;
+ }
+ if (EXTERNAL_AVX(mm_flags)) {
+ vol->scale_samples = ff_scale_samples_s32_avx;
+ vol->samples_align = 8;
+ }
+ }
+}
diff --git a/ffmpeg/libavfilter/x86/vf_gradfun.c b/ffmpeg/libavfilter/x86/vf_gradfun.c
new file mode 100644
index 0000000..214e764
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/vf_gradfun.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (C) 2009 Loren Merritt <lorenm@u.washington.edu>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavfilter/gradfun.h"
+
+#if HAVE_INLINE_ASM
+
+DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F};
+DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
+
+#if HAVE_MMXEXT_INLINE
+static void gradfun_filter_line_mmxext(uint8_t *dst, const uint8_t *src, const uint16_t *dc,
+ int width, int thresh,
+ const uint16_t *dithers)
+{
+ intptr_t x;
+ if (width & 3) {
+ x = width & ~3;
+ ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
+ width = x;
+ }
+ x = -width;
+ __asm__ volatile(
+ "movd %4, %%mm5 \n"
+ "pxor %%mm7, %%mm7 \n"
+ "pshufw $0, %%mm5, %%mm5 \n"
+ "movq %6, %%mm6 \n"
+ "movq (%5), %%mm3 \n"
+ "movq 8(%5), %%mm4 \n"
+
+ "1: \n"
+ "movd (%2,%0), %%mm0 \n"
+ "movd (%3,%0), %%mm1 \n"
+ "punpcklbw %%mm7, %%mm0 \n"
+ "punpcklwd %%mm1, %%mm1 \n"
+ "psllw $7, %%mm0 \n"
+ "pxor %%mm2, %%mm2 \n"
+ "psubw %%mm0, %%mm1 \n" // delta = dc - pix
+ "psubw %%mm1, %%mm2 \n"
+ "pmaxsw %%mm1, %%mm2 \n"
+ "pmulhuw %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
+ "psubw %%mm6, %%mm2 \n"
+ "pminsw %%mm7, %%mm2 \n" // m = -max(0, 127-m)
+ "pmullw %%mm2, %%mm2 \n"
+ "paddw %%mm3, %%mm0 \n" // pix += dither
+ "psllw $2, %%mm1 \n" // m = m*m*delta >> 14
+ "pmulhw %%mm2, %%mm1 \n"
+ "paddw %%mm1, %%mm0 \n" // pix += m
+ "psraw $7, %%mm0 \n"
+ "packuswb %%mm0, %%mm0 \n"
+ "movd %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
+ "add $4, %0 \n"
+ "jnl 2f \n"
+
+ "movd (%2,%0), %%mm0 \n"
+ "movd (%3,%0), %%mm1 \n"
+ "punpcklbw %%mm7, %%mm0 \n"
+ "punpcklwd %%mm1, %%mm1 \n"
+ "psllw $7, %%mm0 \n"
+ "pxor %%mm2, %%mm2 \n"
+ "psubw %%mm0, %%mm1 \n" // delta = dc - pix
+ "psubw %%mm1, %%mm2 \n"
+ "pmaxsw %%mm1, %%mm2 \n"
+ "pmulhuw %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
+ "psubw %%mm6, %%mm2 \n"
+ "pminsw %%mm7, %%mm2 \n" // m = -max(0, 127-m)
+ "pmullw %%mm2, %%mm2 \n"
+ "paddw %%mm4, %%mm0 \n" // pix += dither
+ "psllw $2, %%mm1 \n" // m = m*m*delta >> 14
+ "pmulhw %%mm2, %%mm1 \n"
+ "paddw %%mm1, %%mm0 \n" // pix += m
+ "psraw $7, %%mm0 \n"
+ "packuswb %%mm0, %%mm0 \n"
+ "movd %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
+ "add $4, %0 \n"
+ "jl 1b \n"
+
+ "2: \n"
+ "emms \n"
+ :"+r"(x)
+ :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
+ "rm"(thresh), "r"(dithers), "m"(*pw_7f)
+ :"memory"
+ );
+}
+#endif
+
+#if HAVE_SSSE3_INLINE
+static void gradfun_filter_line_ssse3(uint8_t *dst, const uint8_t *src, const uint16_t *dc, int width, int thresh, const uint16_t *dithers)
+{
+ intptr_t x;
+ if (width & 7) {
+ // could be 10% faster if I somehow eliminated this
+ x = width & ~7;
+ ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
+ width = x;
+ }
+ x = -width;
+ __asm__ volatile(
+ "movd %4, %%xmm5 \n"
+ "pxor %%xmm7, %%xmm7 \n"
+ "pshuflw $0,%%xmm5, %%xmm5 \n"
+ "movdqa %6, %%xmm6 \n"
+ "punpcklqdq %%xmm5, %%xmm5 \n"
+ "movdqa %5, %%xmm4 \n"
+ "1: \n"
+ "movq (%2,%0), %%xmm0 \n"
+ "movq (%3,%0), %%xmm1 \n"
+ "punpcklbw %%xmm7, %%xmm0 \n"
+ "punpcklwd %%xmm1, %%xmm1 \n"
+ "psllw $7, %%xmm0 \n"
+ "psubw %%xmm0, %%xmm1 \n" // delta = dc - pix
+ "pabsw %%xmm1, %%xmm2 \n"
+ "pmulhuw %%xmm5, %%xmm2 \n" // m = abs(delta) * thresh >> 16
+ "psubw %%xmm6, %%xmm2 \n"
+ "pminsw %%xmm7, %%xmm2 \n" // m = -max(0, 127-m)
+ "pmullw %%xmm2, %%xmm2 \n"
+ "psllw $2, %%xmm1 \n"
+ "paddw %%xmm4, %%xmm0 \n" // pix += dither
+ "pmulhw %%xmm2, %%xmm1 \n" // m = m*m*delta >> 14
+ "paddw %%xmm1, %%xmm0 \n" // pix += m
+ "psraw $7, %%xmm0 \n"
+ "packuswb %%xmm0, %%xmm0 \n"
+ "movq %%xmm0, (%1,%0) \n" // dst = clip(pix>>7)
+ "add $8, %0 \n"
+ "jl 1b \n"
+ :"+&r"(x)
+ :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
+ "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
+ :"memory"
+ );
+}
+#endif /* HAVE_SSSE3_INLINE */
+
+#if HAVE_SSE2_INLINE
+static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, const uint16_t *buf1, const uint8_t *src, int src_linesize, int width)
+{
+#define BLURV(load)\
+ intptr_t x = -2*width;\
+ __asm__ volatile(\
+ "movdqa %6, %%xmm7 \n"\
+ "1: \n"\
+ load" (%4,%0), %%xmm0 \n"\
+ load" (%5,%0), %%xmm1 \n"\
+ "movdqa %%xmm0, %%xmm2 \n"\
+ "movdqa %%xmm1, %%xmm3 \n"\
+ "psrlw $8, %%xmm0 \n"\
+ "psrlw $8, %%xmm1 \n"\
+ "pand %%xmm7, %%xmm2 \n"\
+ "pand %%xmm7, %%xmm3 \n"\
+ "paddw %%xmm1, %%xmm0 \n"\
+ "paddw %%xmm3, %%xmm2 \n"\
+ "paddw %%xmm2, %%xmm0 \n"\
+ "paddw (%2,%0), %%xmm0 \n"\
+ "movdqa (%1,%0), %%xmm1 \n"\
+ "movdqa %%xmm0, (%1,%0) \n"\
+ "psubw %%xmm1, %%xmm0 \n"\
+ "movdqa %%xmm0, (%3,%0) \n"\
+ "add $16, %0 \n"\
+ "jl 1b \n"\
+ :"+&r"(x)\
+ :"r"(buf+width),\
+ "r"(buf1+width),\
+ "r"(dc+width),\
+ "r"(src+width*2),\
+ "r"(src+width*2+src_linesize),\
+ "m"(*pw_ff)\
+ :"memory"\
+ );
+ if (((intptr_t) src | src_linesize) & 15) {
+ BLURV("movdqu");
+ } else {
+ BLURV("movdqa");
+ }
+}
+#endif /* HAVE_SSE2_INLINE */
+
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void ff_gradfun_init_x86(GradFunContext *gf)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_MMXEXT_INLINE
+ if (cpu_flags & AV_CPU_FLAG_MMXEXT)
+ gf->filter_line = gradfun_filter_line_mmxext;
+#endif
+#if HAVE_SSSE3_INLINE
+ if (cpu_flags & AV_CPU_FLAG_SSSE3)
+ gf->filter_line = gradfun_filter_line_ssse3;
+#endif
+#if HAVE_SSE2_INLINE
+ if (cpu_flags & AV_CPU_FLAG_SSE2)
+ gf->blur_line = gradfun_blur_line_sse2;
+#endif
+}
diff --git a/ffmpeg/libavfilter/x86/vf_hqdn3d.asm b/ffmpeg/libavfilter/x86/vf_hqdn3d.asm
new file mode 100644
index 0000000..961127e
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/vf_hqdn3d.asm
@@ -0,0 +1,106 @@
+;******************************************************************************
+;* Copyright (c) 2012 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro LOWPASS 3 ; prevsample, cursample, lut
+ sub %1q, %2q
+%if lut_bits != 8
+ sar %1q, 8-lut_bits
+%endif
+ movsx %1d, word [%3q+%1q*2]
+ add %1d, %2d
+%endmacro
+
+%macro LOAD 3 ; dstreg, x, bitdepth
+%if %3 == 8
+ movzx %1, byte [srcq+%2]
+%else
+ movzx %1, word [srcq+(%2)*2]
+%endif
+%if %3 != 16
+ shl %1, 16-%3
+ add %1, (1<<(15-%3))-1
+%endif
+%endmacro
+
+%macro HQDN3D_ROW 1 ; bitdepth
+%if ARCH_X86_64
+cglobal hqdn3d_row_%1_x86, 7,10,0, src, dst, lineant, frameant, width, spatial, temporal, pixelant, t0, t1
+%else
+cglobal hqdn3d_row_%1_x86, 7,7,0, src, dst, lineant, frameant, width, spatial, temporal
+%endif
+ %assign bytedepth (%1+7)>>3
+ %assign lut_bits 4+4*(%1/16)
+ dec widthq
+ lea srcq, [srcq+widthq*bytedepth]
+ lea dstq, [dstq+widthq*bytedepth]
+ lea frameantq, [frameantq+widthq*2]
+ lea lineantq, [lineantq+widthq*2]
+ neg widthq
+ %define xq widthq
+%if ARCH_X86_32
+ mov dstmp, dstq
+ mov srcmp, srcq
+ mov frameantmp, frameantq
+ mov lineantmp, lineantq
+ %define dstq r0
+ %define frameantq r0
+ %define lineantq r0
+ %define pixelantq r1
+ %define pixelantd r1d
+ DECLARE_REG_TMP 2,3
+%endif
+ LOAD pixelantd, xq, %1
+ALIGN 16
+.loop:
+ movifnidn srcq, srcmp
+ LOAD t0d, xq+1, %1 ; skip on the last iteration to avoid overread
+.loop2:
+ movifnidn lineantq, lineantmp
+ movzx t1d, word [lineantq+xq*2]
+ LOWPASS t1, pixelant, spatial
+ mov [lineantq+xq*2], t1w
+ LOWPASS pixelant, t0, spatial
+ movifnidn frameantq, frameantmp
+ movzx t0d, word [frameantq+xq*2]
+ LOWPASS t0, t1, temporal
+ mov [frameantq+xq*2], t0w
+ movifnidn dstq, dstmp
+%if %1 != 16
+    shr     t0d, 16-%1 ; could eliminate this by storing from t0h, but only with some constraints on register allocation
+%endif
+%if %1 == 8
+ mov [dstq+xq], t0b
+%else
+ mov [dstq+xq*2], t0w
+%endif
+ inc xq
+ jl .loop
+ je .loop2
+ REP_RET
+%endmacro ; HQDN3D_ROW
+
+HQDN3D_ROW 8
+HQDN3D_ROW 9
+HQDN3D_ROW 10
+HQDN3D_ROW 16
diff --git a/ffmpeg/libavfilter/x86/vf_hqdn3d_init.c b/ffmpeg/libavfilter/x86/vf_hqdn3d_init.c
new file mode 100644
index 0000000..4abb878
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/vf_hqdn3d_init.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2012 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavfilter/vf_hqdn3d.h"
+#include "config.h"
+
+void ff_hqdn3d_row_8_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal);
+void ff_hqdn3d_row_9_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal);
+void ff_hqdn3d_row_10_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal);
+void ff_hqdn3d_row_16_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal);
+
+av_cold void ff_hqdn3d_init_x86(HQDN3DContext *hqdn3d)
+{
+#if HAVE_YASM
+ hqdn3d->denoise_row[ 8] = ff_hqdn3d_row_8_x86;
+ hqdn3d->denoise_row[ 9] = ff_hqdn3d_row_9_x86;
+ hqdn3d->denoise_row[10] = ff_hqdn3d_row_10_x86;
+ hqdn3d->denoise_row[16] = ff_hqdn3d_row_16_x86;
+#endif
+}
diff --git a/ffmpeg/libavfilter/x86/vf_yadif.asm b/ffmpeg/libavfilter/x86/vf_yadif.asm
new file mode 100644
index 0000000..ebc505c
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/vf_yadif.asm
@@ -0,0 +1,252 @@
+;*****************************************************************************
+;* x86-optimized functions for yadif filter
+;*
+;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_1: times 16 db 1
+pw_1: times 8 dw 1
+
+SECTION .text
+
+%macro CHECK 2
+ movu m2, [curq+t1+%1]
+ movu m3, [curq+t0+%2]
+ mova m4, m2
+ mova m5, m2
+ pxor m4, m3
+ pavgb m5, m3
+ pand m4, [pb_1]
+ psubusb m5, m4
+%if mmsize == 16
+ psrldq m5, 1
+%else
+ psrlq m5, 8
+%endif
+ punpcklbw m5, m7
+ mova m4, m2
+ psubusb m2, m3
+ psubusb m3, m4
+ pmaxub m2, m3
+ mova m3, m2
+ mova m4, m2
+%if mmsize == 16
+ psrldq m3, 1
+ psrldq m4, 2
+%else
+ psrlq m3, 8
+ psrlq m4, 16
+%endif
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ punpcklbw m4, m7
+ paddw m2, m3
+ paddw m2, m4
+%endmacro
+
+%macro CHECK1 0
+ mova m3, m0
+ pcmpgtw m3, m2
+ pminsw m0, m2
+ mova m6, m3
+ pand m5, m3
+ pandn m3, m1
+ por m3, m5
+ mova m1, m3
+%endmacro
+
+%macro CHECK2 0
+ paddw m6, [pw_1]
+ psllw m6, 14
+ paddsw m2, m6
+ mova m3, m0
+ pcmpgtw m3, m2
+ pminsw m0, m2
+ pand m5, m3
+ pandn m3, m1
+ por m3, m5
+ mova m1, m3
+%endmacro
+
+%macro LOAD 2
+ movh %1, %2
+ punpcklbw %1, m7
+%endmacro
+
+%macro FILTER 3
+.loop%1:
+ pxor m7, m7
+ LOAD m0, [curq+t1]
+ LOAD m1, [curq+t0]
+ LOAD m2, [%2]
+ LOAD m3, [%3]
+ mova m4, m3
+ paddw m3, m2
+ psraw m3, 1
+ mova [rsp+ 0], m0
+ mova [rsp+16], m3
+ mova [rsp+32], m1
+ psubw m2, m4
+ ABS1 m2, m4
+ LOAD m3, [prevq+t1]
+ LOAD m4, [prevq+t0]
+ psubw m3, m0
+ psubw m4, m1
+ ABS1 m3, m5
+ ABS1 m4, m5
+ paddw m3, m4
+ psrlw m2, 1
+ psrlw m3, 1
+ pmaxsw m2, m3
+ LOAD m3, [nextq+t1]
+ LOAD m4, [nextq+t0]
+ psubw m3, m0
+ psubw m4, m1
+ ABS1 m3, m5
+ ABS1 m4, m5
+ paddw m3, m4
+ psrlw m3, 1
+ pmaxsw m2, m3
+ mova [rsp+48], m2
+
+ paddw m1, m0
+ paddw m0, m0
+ psubw m0, m1
+ psrlw m1, 1
+ ABS1 m0, m2
+
+ movu m2, [curq+t1-1]
+ movu m3, [curq+t0-1]
+ mova m4, m2
+ psubusb m2, m3
+ psubusb m3, m4
+ pmaxub m2, m3
+%if mmsize == 16
+ mova m3, m2
+ psrldq m3, 2
+%else
+ pshufw m3, m2, q0021
+%endif
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ paddw m0, m2
+ paddw m0, m3
+ psubw m0, [pw_1]
+
+ CHECK -2, 0
+ CHECK1
+ CHECK -3, 1
+ CHECK2
+ CHECK 0, -2
+ CHECK1
+ CHECK 1, -3
+ CHECK2
+
+ mova m6, [rsp+48]
+ cmp DWORD r8m, 2
+ jge .end%1
+ LOAD m2, [%2+t1*2]
+ LOAD m4, [%3+t1*2]
+ LOAD m3, [%2+t0*2]
+ LOAD m5, [%3+t0*2]
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 1
+ psrlw m3, 1
+ mova m4, [rsp+ 0]
+ mova m5, [rsp+16]
+ mova m7, [rsp+32]
+ psubw m2, m4
+ psubw m3, m7
+ mova m0, m5
+ psubw m5, m4
+ psubw m0, m7
+ mova m4, m2
+ pminsw m2, m3
+ pmaxsw m3, m4
+ pmaxsw m2, m5
+ pminsw m3, m5
+ pmaxsw m2, m0
+ pminsw m3, m0
+ pxor m4, m4
+ pmaxsw m6, m3
+ psubw m4, m2
+ pmaxsw m6, m4
+
+.end%1:
+ mova m2, [rsp+16]
+ mova m3, m2
+ psubw m2, m6
+ paddw m3, m6
+ pmaxsw m1, m2
+ pminsw m1, m3
+ packuswb m1, m1
+
+ movh [dstq], m1
+ add dstq, mmsize/2
+ add prevq, mmsize/2
+ add curq, mmsize/2
+ add nextq, mmsize/2
+ sub DWORD r4m, mmsize/2
+ jg .loop%1
+%endmacro
+
+%macro YADIF 0
+%if ARCH_X86_32
+cglobal yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \
+ mrefs, parity, mode
+%else
+cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \
+ mrefs, parity, mode
+%endif
+%if ARCH_X86_32
+ mov r4, r5mp
+ mov r5, r6mp
+ DECLARE_REG_TMP 4,5
+%else
+ movsxd r5, DWORD r5m
+ movsxd r6, DWORD r6m
+ DECLARE_REG_TMP 5,6
+%endif
+
+ cmp DWORD paritym, 0
+ je .parity0
+ FILTER 1, prevq, curq
+ jmp .ret
+
+.parity0:
+ FILTER 0, curq, nextq
+
+.ret:
+ RET
+%endmacro
+
+INIT_XMM ssse3
+YADIF
+INIT_XMM sse2
+YADIF
+%if ARCH_X86_32
+INIT_MMX mmxext
+YADIF
+%endif
diff --git a/ffmpeg/libavfilter/x86/vf_yadif_init.c b/ffmpeg/libavfilter/x86/vf_yadif_init.c
new file mode 100644
index 0000000..58f2fc6
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/vf_yadif_init.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/x86/dsputil_mmx.h"
+#include "libavfilter/yadif.h"
+
+void ff_yadif_filter_line_mmxext(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+void ff_yadif_filter_line_sse2(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+
+void ff_yadif_filter_line_16bit_mmxext(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+void ff_yadif_filter_line_16bit_sse2(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+void ff_yadif_filter_line_16bit_ssse3(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+void ff_yadif_filter_line_16bit_sse4(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+
+void ff_yadif_filter_line_10bit_mmxext(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+void ff_yadif_filter_line_10bit_sse2(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+void ff_yadif_filter_line_10bit_ssse3(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+
+av_cold void ff_yadif_init_x86(YADIFContext *yadif)
+{
+ int cpu_flags = av_get_cpu_flags();
+ int bit_depth = (!yadif->csp) ? 8
+ : yadif->csp->comp[0].depth_minus1 + 1;
+
+#if HAVE_YASM
+ if (bit_depth >= 15) {
+#if ARCH_X86_32
+ if (EXTERNAL_MMXEXT(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_16bit_mmxext;
+#endif /* ARCH_X86_32 */
+ if (EXTERNAL_SSE2(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_16bit_sse2;
+ if (EXTERNAL_SSSE3(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_16bit_ssse3;
+ if (EXTERNAL_SSE4(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_16bit_sse4;
+ } else if ( bit_depth >= 9 && bit_depth <= 14) {
+#if ARCH_X86_32
+ if (EXTERNAL_MMXEXT(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_10bit_mmxext;
+#endif /* ARCH_X86_32 */
+ if (EXTERNAL_SSE2(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_10bit_sse2;
+ if (EXTERNAL_SSSE3(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_10bit_ssse3;
+ } else {
+#if ARCH_X86_32
+ if (EXTERNAL_MMXEXT(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_mmxext;
+#endif /* ARCH_X86_32 */
+ if (EXTERNAL_SSE2(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_sse2;
+ if (EXTERNAL_SSSE3(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_ssse3;
+ }
+#endif /* HAVE_YASM */
+}
diff --git a/ffmpeg/libavfilter/x86/yadif-10.asm b/ffmpeg/libavfilter/x86/yadif-10.asm
new file mode 100644
index 0000000..d586deb
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/yadif-10.asm
@@ -0,0 +1,282 @@
+;*****************************************************************************
+;* x86-optimized functions for yadif filter
+;*
+;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
+;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_1: times 8 dw 1
+
+SECTION .text
+
+%macro PABS 2
+%if cpuflag(ssse3)
+ pabsw %1, %1
+%else
+ pxor %2, %2
+ pcmpgtw %2, %1
+ pxor %1, %2
+ psubw %1, %2
+%endif
+%endmacro
+
+%macro PMAXUW 2
+%if cpuflag(sse4)
+ pmaxuw %1, %2
+%else
+ psubusw %1, %2
+ paddusw %1, %2
+%endif
+%endmacro
+
+%macro CHECK 2
+ movu m2, [curq+t1+%1*2]
+ movu m3, [curq+t0+%2*2]
+ mova m4, m2
+ mova m5, m2
+ pxor m4, m3
+ pavgw m5, m3
+ pand m4, [pw_1]
+ psubusw m5, m4
+%if mmsize == 16
+ psrldq m5, 2
+%else
+ psrlq m5, 16
+%endif
+ mova m4, m2
+ psubusw m2, m3
+ psubusw m3, m4
+ PMAXUW m2, m3
+ mova m3, m2
+ mova m4, m2
+%if mmsize == 16
+ psrldq m3, 2
+ psrldq m4, 4
+%else
+ psrlq m3, 16
+ psrlq m4, 32
+%endif
+ paddw m2, m3
+ paddw m2, m4
+%endmacro
+
+%macro CHECK1 0
+ mova m3, m0
+ pcmpgtw m3, m2
+ pminsw m0, m2
+ mova m6, m3
+ pand m5, m3
+ pandn m3, m1
+ por m3, m5
+ mova m1, m3
+%endmacro
+
+; %macro CHECK2 0
+; paddw m6, [pw_1]
+; psllw m6, 14
+; paddsw m2, m6
+; mova m3, m0
+; pcmpgtw m3, m2
+; pminsw m0, m2
+; pand m5, m3
+; pandn m3, m1
+; por m3, m5
+; mova m1, m3
+; %endmacro
+
+; This version of CHECK2 is required for 14-bit samples. The left-shift trick
+; in the old code is not large enough to correctly select pixels or scores.
+
+%macro CHECK2 0
+ mova m3, m0
+ pcmpgtw m0, m2
+ pand m0, m6
+ mova m6, m0
+ pand m5, m6
+ pand m2, m0
+ pandn m6, m1
+ pandn m0, m3
+ por m6, m5
+ por m0, m2
+ mova m1, m6
+%endmacro
+
+%macro LOAD 2
+ movu %1, %2
+%endmacro
+
+%macro FILTER 3
+.loop%1:
+ pxor m7, m7
+ LOAD m0, [curq+t1]
+ LOAD m1, [curq+t0]
+ LOAD m2, [%2]
+ LOAD m3, [%3]
+ mova m4, m3
+ paddw m3, m2
+ psraw m3, 1
+ mova [rsp+ 0], m0
+ mova [rsp+16], m3
+ mova [rsp+32], m1
+ psubw m2, m4
+ PABS m2, m4
+ LOAD m3, [prevq+t1]
+ LOAD m4, [prevq+t0]
+ psubw m3, m0
+ psubw m4, m1
+ PABS m3, m5
+ PABS m4, m5
+ paddw m3, m4
+ psrlw m2, 1
+ psrlw m3, 1
+ pmaxsw m2, m3
+ LOAD m3, [nextq+t1]
+ LOAD m4, [nextq+t0]
+ psubw m3, m0
+ psubw m4, m1
+ PABS m3, m5
+ PABS m4, m5
+ paddw m3, m4
+ psrlw m3, 1
+ pmaxsw m2, m3
+ mova [rsp+48], m2
+
+ paddw m1, m0
+ paddw m0, m0
+ psubw m0, m1
+ psrlw m1, 1
+ PABS m0, m2
+
+ movu m2, [curq+t1-1*2]
+ movu m3, [curq+t0-1*2]
+ mova m4, m2
+ psubusw m2, m3
+ psubusw m3, m4
+ PMAXUW m2, m3
+%if mmsize == 16
+ mova m3, m2
+ psrldq m3, 4
+%else
+ mova m3, m2
+ psrlq m3, 32
+%endif
+ paddw m0, m2
+ paddw m0, m3
+ psubw m0, [pw_1]
+
+ CHECK -2, 0
+ CHECK1
+ CHECK -3, 1
+ CHECK2
+ CHECK 0, -2
+ CHECK1
+ CHECK 1, -3
+ CHECK2
+
+ mova m6, [rsp+48]
+ cmp DWORD r8m, 2
+ jge .end%1
+ LOAD m2, [%2+t1*2]
+ LOAD m4, [%3+t1*2]
+ LOAD m3, [%2+t0*2]
+ LOAD m5, [%3+t0*2]
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 1
+ psrlw m3, 1
+ mova m4, [rsp+ 0]
+ mova m5, [rsp+16]
+ mova m7, [rsp+32]
+ psubw m2, m4
+ psubw m3, m7
+ mova m0, m5
+ psubw m5, m4
+ psubw m0, m7
+ mova m4, m2
+ pminsw m2, m3
+ pmaxsw m3, m4
+ pmaxsw m2, m5
+ pminsw m3, m5
+ pmaxsw m2, m0
+ pminsw m3, m0
+ pxor m4, m4
+ pmaxsw m6, m3
+ psubw m4, m2
+ pmaxsw m6, m4
+
+.end%1:
+ mova m2, [rsp+16]
+ mova m3, m2
+ psubw m2, m6
+ paddw m3, m6
+ pmaxsw m1, m2
+ pminsw m1, m3
+
+ movu [dstq], m1
+ add dstq, mmsize-4
+ add prevq, mmsize-4
+ add curq, mmsize-4
+ add nextq, mmsize-4
+ sub DWORD r4m, mmsize/2-2
+ jg .loop%1
+%endmacro
+
+%macro YADIF 0
+%if ARCH_X86_32
+cglobal yadif_filter_line_10bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
+ prefs, mrefs, parity, mode
+%else
+cglobal yadif_filter_line_10bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
+ prefs, mrefs, parity, mode
+%endif
+%if ARCH_X86_32
+ mov r4, r5mp
+ mov r5, r6mp
+ DECLARE_REG_TMP 4,5
+%else
+ movsxd r5, DWORD r5m
+ movsxd r6, DWORD r6m
+ DECLARE_REG_TMP 5,6
+%endif
+
+ cmp DWORD paritym, 0
+ je .parity0
+ FILTER 1, prevq, curq
+ jmp .ret
+
+.parity0:
+ FILTER 0, curq, nextq
+
+.ret:
+ RET
+%endmacro
+
+INIT_XMM ssse3
+YADIF
+INIT_XMM sse2
+YADIF
+%if ARCH_X86_32
+INIT_MMX mmxext
+YADIF
+%endif
diff --git a/ffmpeg/libavfilter/x86/yadif-16.asm b/ffmpeg/libavfilter/x86/yadif-16.asm
new file mode 100644
index 0000000..a2e6006
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/yadif-16.asm
@@ -0,0 +1,347 @@
+;*****************************************************************************
+;* x86-optimized functions for yadif filter
+;*
+;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
+;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+; Rounding constants, plus the 0x8000 bias pair used by PACK to emulate
+; unsigned dword->word saturation on pre-SSE4 hardware.
+pw_1: times 8 dw 1
+pw_8000: times 8 dw 0x8000
+pd_1: times 4 dd 1
+pd_8000: times 4 dd 0x8000
+
+SECTION .text
+
+; Shift %1 right by one 16-bit pixel (MMX lacks psrldq, so use psrlq there).
+%macro PIXSHIFT1 1
+%if cpuflag(sse2)
+ psrldq %1, 2
+%else
+ psrlq %1, 16
+%endif
+%endmacro
+
+; Shift %1 right by two 16-bit pixels (MMX lacks psrldq, so use psrlq there).
+%macro PIXSHIFT2 1
+%if cpuflag(sse2)
+ psrldq %1, 4
+%else
+ psrlq %1, 32
+%endif
+%endmacro
+
+; PABS dst, tmp: dst = |dst| per signed dword; tmp is clobbered pre-SSSE3.
+%macro PABS 2
+%if cpuflag(ssse3)
+ pabsd %1, %1
+%else
+ pxor %2, %2
+ pcmpgtd %2, %1 ; tmp = (0 > dst) ? -1 : 0, i.e. sign mask
+ pxor %1, %2 ; conditional one's complement...
+ psubd %1, %2 ; ...plus 1 where negative = two's-complement abs
+%endif
+%endmacro
+
+; Pack the signed dwords of %1 into words with unsigned saturation.
+; Pre-SSE4 has no packusdw: bias into signed range, packssdw, then unbias.
+%macro PACK 1
+%if cpuflag(sse4)
+ packusdw %1, %1
+%else
+ psubd %1, [pd_8000]
+ packssdw %1, %1
+ paddw %1, [pw_8000]
+%endif
+%endmacro
+
+; PMINSD dst, src, tmp: per-dword signed minimum; tmp is clobbered pre-SSE4.
+%macro PMINSD 3
+%if cpuflag(sse4)
+ pminsd %1, %2
+%else
+ mova %3, %2
+ pcmpgtd %3, %1 ; mask = (src > dst)
+ pand %1, %3 ; keep dst lanes where dst is smaller
+ pandn %3, %2 ; keep src lanes elsewhere
+ por %1, %3
+%endif
+%endmacro
+
+; PMAXSD dst, src, tmp: per-dword signed maximum; tmp is clobbered pre-SSE4.
+%macro PMAXSD 3
+%if cpuflag(sse4)
+ pmaxsd %1, %2
+%else
+ mova %3, %1
+ pcmpgtd %3, %2 ; mask = (dst > src)
+ pand %1, %3 ; keep dst lanes where dst is larger
+ pandn %3, %2 ; keep src lanes elsewhere
+ por %1, %3
+%endif
+%endmacro
+
+; PMAXUW dst, src: per-word unsigned maximum.
+; Pre-SSE4 identity: (dst -us src) +us src == max(dst, src) for unsigned words.
+%macro PMAXUW 2
+%if cpuflag(sse4)
+ pmaxuw %1, %2
+%else
+ psubusw %1, %2
+ paddusw %1, %2
+%endif
+%endmacro
+
+; CHECK mrefs_off, prefs_off:
+; Score one candidate edge direction from cur[mrefs + %1] (line above)
+; and cur[prefs + %2] (line below). Leaves:
+; m2 = score: sum of the unsigned |above - below| of 3 adjacent pixels,
+; widened to dwords
+; m5 = candidate spatial prediction (floor average of the pair, shifted
+; to align with the centre pixel, widened to dwords)
+; Requires m7 == 0; clobbers m3, m4.
+%macro CHECK 2
+ movu m2, [curq+t1+%1*2]
+ movu m3, [curq+t0+%2*2]
+ mova m4, m2
+ mova m5, m2
+ pxor m4, m3
+ pavgw m5, m3 ; pavgw rounds the average up...
+ pand m4, [pw_1]
+ psubusw m5, m4 ; ...subtract (a^b)&1 to get floor((a+b)/2)
+%if mmsize == 16
+ psrldq m5, 2
+%else
+ psrlq m5, 16
+%endif
+ punpcklwd m5, m7 ; widen prediction words to dwords
+ mova m4, m2
+ psubusw m2, m3
+ psubusw m3, m4
+ PMAXUW m2, m3 ; unsigned |above - below| per word
+ mova m3, m2
+ mova m4, m2
+%if mmsize == 16
+ psrldq m3, 2
+ psrldq m4, 4
+%else
+ psrlq m3, 16
+ psrlq m4, 32
+%endif
+ punpcklwd m2, m7 ; diff at pixel offset 0
+ punpcklwd m3, m7 ; diff at pixel offset +1
+ punpcklwd m4, m7 ; diff at pixel offset +2
+ paddd m2, m3
+ paddd m2, m4 ; 3-pixel score
+%endmacro
+
+%macro CHECK1 0
+ mova m3, m0
+ pcmpgtd m3, m2
+ PMINSD m0, m2, m6
+ mova m6, m3
+ pand m5, m3
+ pandn m3, m1
+ por m3, m5
+ mova m1, m3
+%endmacro
+
+%macro CHECK2 0
+ paddd m6, [pd_1]
+ pslld m6, 30
+ paddd m2, m6
+ mova m3, m0
+ pcmpgtd m3, m2
+ PMINSD m0, m2, m4
+ pand m5, m3
+ pandn m3, m1
+ por m3, m5
+ mova m1, m3
+%endmacro
+
+; This version of CHECK2 has 3 fewer instructions on sets older than SSE4 but I
+; am not sure whether it is any faster. A rewrite or refactor of the filter
+; code should make it possible to eliminate the move intruction at the end. It
+; exists to satisfy the expectation that the "score" values are in m1.
+
+; %macro CHECK2 0
+; mova m3, m0
+; pcmpgtd m0, m2
+; pand m0, m6
+; mova m6, m0
+; pand m5, m6
+; pand m2, m0
+; pandn m6, m1
+; pandn m0, m3
+; por m6, m5
+; por m0, m2
+; mova m1, m6
+; %endmacro
+
+; LOAD dst, mem: load 4 (xmm) / 2 (mmx) 16-bit pixels and zero-extend them
+; to dwords. Requires m7 == 0.
+%macro LOAD 2
+ movh %1, %2
+ punpcklwd %1, m7
+%endmacro
+
+; FILTER label_suffix, field_a, field_b:
+; Deinterlace one line, mmsize/4 pixels per iteration. All 16-bit pixels
+; are widened to dwords (LOAD with m7 == 0) so sums/differences cannot
+; overflow. %2/%3 are the two temporally adjacent fields chosen by parity.
+; Using the usual yadif naming: c = cur[mrefs] (line above),
+; e = cur[prefs] (line below), d = temporal average at this line.
+; Stack spills: [rsp+0] = c, [rsp+16] = d, [rsp+32] = e,
+; [rsp+48] = temporal diff.
+%macro FILTER 3
+.loop%1:
+ pxor m7, m7 ; zero register for LOAD/punpcklwd
+ LOAD m0, [curq+t1] ; c
+ LOAD m1, [curq+t0] ; e
+ LOAD m2, [%2] ; same line, earlier field
+ LOAD m3, [%3] ; same line, later field
+ mova m4, m3
+ paddd m3, m2
+ psrad m3, 1 ; d = (earlier + later) / 2
+ mova [rsp+ 0], m0
+ mova [rsp+16], m3
+ mova [rsp+32], m1
+ psubd m2, m4
+ PABS m2, m4 ; |earlier - later|
+ LOAD m3, [prevq+t1]
+ LOAD m4, [prevq+t0]
+ psubd m3, m0
+ psubd m4, m1
+ PABS m3, m5
+ PABS m4, m5
+ paddd m3, m4 ; |prev_above - c| + |prev_below - e|
+ psrld m2, 1
+ psrld m3, 1
+ PMAXSD m2, m3, m6
+ LOAD m3, [nextq+t1]
+ LOAD m4, [nextq+t0]
+ psubd m3, m0
+ psubd m4, m1
+ PABS m3, m5
+ PABS m4, m5
+ paddd m3, m4 ; |next_above - c| + |next_below - e|
+ psrld m3, 1
+ PMAXSD m2, m3, m6 ; temporal diff = max of the three halved sums
+ mova [rsp+48], m2
+
+ paddd m1, m0
+ paddd m0, m0
+ psubd m0, m1 ; m0 = c - e
+ psrld m1, 1 ; m1 = (c + e) / 2, initial (vertical) prediction
+ PABS m0, m2 ; centre term of the spatial score: |c - e|
+
+ movu m2, [curq+t1-1*2] ; one pixel left of c (unaligned)
+ movu m3, [curq+t0-1*2] ; one pixel left of e
+ mova m4, m2
+ psubusw m2, m3
+ psubusw m3, m4
+ PMAXUW m2, m3 ; unsigned |above - below| per word
+%if mmsize == 16
+ mova m3, m2
+ psrldq m3, 4
+%else
+ mova m3, m2
+ psrlq m3, 32
+%endif
+ punpcklwd m2, m7 ; diff at pixel -1
+ punpcklwd m3, m7 ; diff at pixel +1
+ paddd m0, m2
+ paddd m0, m3 ; 3-pixel vertical score
+ psubd m0, [pd_1] ; -1 bias on the initial score
+
+ CHECK -2, 0 ; probe diagonal directions on one side...
+ CHECK1
+ CHECK -3, 1 ; ...continuing only where the last step improved
+ CHECK2
+ CHECK 0, -2 ; then the other side
+ CHECK1
+ CHECK 1, -3
+ CHECK2
+
+ mova m6, [rsp+48] ; reload temporal diff
+ cmp DWORD r8m, 2 ; mode >= 2 skips the spatial interlacing check
+ jge .end%1
+ LOAD m2, [%2+t1*2] ; temporal pixels two lines above (-> b)
+ LOAD m4, [%3+t1*2]
+ LOAD m3, [%2+t0*2] ; temporal pixels two lines below (-> f)
+ LOAD m5, [%3+t0*2]
+ paddd m2, m4
+ paddd m3, m5
+ psrld m2, 1 ; b
+ psrld m3, 1 ; f
+ mova m4, [rsp+ 0] ; c
+ mova m5, [rsp+16] ; d
+ mova m7, [rsp+32] ; e (m7 no longer needed as zero past this point)
+ psubd m2, m4 ; b - c
+ psubd m3, m7 ; f - e
+ mova m0, m5
+ psubd m5, m4 ; d - c
+ psubd m0, m7 ; d - e
+ ; Widen the temporal diff with max/min combinations of b-c, f-e, d-c,
+ ; d-e so the clamp below cannot flatten genuine vertical detail
+ ; (spatial interlacing check).
+ mova m4, m2
+ PMINSD m2, m3, m7
+ PMAXSD m3, m4, m7
+ PMAXSD m2, m5, m7
+ PMINSD m3, m5, m7
+ PMAXSD m2, m0, m7
+ PMINSD m3, m0, m7
+ pxor m4, m4
+ PMAXSD m6, m3, m7
+ psubd m4, m2 ; negate the combined minimum
+ PMAXSD m6, m4, m7 ; final diff
+
+.end%1:
+ mova m2, [rsp+16] ; d
+ mova m3, m2
+ psubd m2, m6 ; d - diff
+ paddd m3, m6 ; d + diff
+ PMAXSD m1, m2, m7 ; clamp prediction into [d - diff, d + diff]
+ PMINSD m1, m3, m7
+ PACK m1 ; dwords back to saturated 16-bit pixels
+
+ movh [dstq], m1
+ add dstq, mmsize/2 ; advance mmsize/4 pixels of 2 bytes each
+ add prevq, mmsize/2
+ add curq, mmsize/2
+ add nextq, mmsize/2
+ sub DWORD r4m, mmsize/4 ; w lives in memory (its register may be reused)
+ jg .loop%1
+%endmacro
+
+; Instantiate yadif_filter_line_16bit for the current instruction set.
+; Args (via cglobal): dst, prev, cur, next (pointers), w (pixel count),
+; prefs/mrefs (byte offsets to the lines below/above), parity, mode.
+; 80 bytes of stack are reserved for the FILTER macro's spills.
+%macro YADIF 0
+%if ARCH_X86_32
+cglobal yadif_filter_line_16bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
+ prefs, mrefs, parity, mode
+%else
+cglobal yadif_filter_line_16bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
+ prefs, mrefs, parity, mode
+%endif
+%if ARCH_X86_32
+ mov r4, r5mp ; x86_32: fetch prefs from the stack (r4 reused; w re-read via r4m in the loop)
+ mov r5, r6mp ; x86_32: fetch mrefs from the stack
+ DECLARE_REG_TMP 4,5 ; t0 = prefs (line below), t1 = mrefs (line above)
+%else
+ movsxd r5, DWORD r5m ; sign-extend the 32-bit prefs argument
+ movsxd r6, DWORD r6m ; sign-extend the 32-bit mrefs argument
+ DECLARE_REG_TMP 5,6 ; t0 = prefs, t1 = mrefs
+%endif
+
+ cmp DWORD paritym, 0
+ je .parity0
+ FILTER 1, prevq, curq ; parity 1: temporal neighbours are prev/cur
+ jmp .ret
+
+.parity0:
+ FILTER 0, curq, nextq ; parity 0: temporal neighbours are cur/next
+
+.ret:
+ RET
+%endmacro
+
+; Emit one copy of the 16-bit filter per supported instruction set.
+INIT_XMM sse4
+YADIF
+INIT_XMM ssse3
+YADIF
+INIT_XMM sse2
+YADIF
+%if ARCH_X86_32
+INIT_MMX mmxext ; MMX fallback is only built for 32-bit targets
+YADIF
+%endif