making act segmenter

author: Tim Redfern <tim@eclectronics.org> 2013-09-05 17:57:22 +0100
committer: Tim Redfern <tim@eclectronics.org> 2013-09-05 17:57:22 +0100
commit: 8992cb1d0d07edc33d274f6d7924ecdf6f83d994 (patch)
tree: 3a2c86846b7eec8137c1507e623fc7018f13d453 /ffmpeg/libavfilter/x86
parent: 741fb4b9e135cfb161a749db88713229038577bb (diff)
10 files changed, 1552 insertions, 0 deletions
diff --git a/ffmpeg/libavfilter/x86/Makefile b/ffmpeg/libavfilter/x86/Makefile
new file mode 100644
index 0000000..cd97347
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/Makefile
@@ -0,0 +1,8 @@
+OBJS-$(CONFIG_GRADFUN_FILTER)                += x86/vf_gradfun.o
+OBJS-$(CONFIG_HQDN3D_FILTER)                 += x86/vf_hqdn3d_init.o
+OBJS-$(CONFIG_VOLUME_FILTER)                 += x86/af_volume_init.o
+OBJS-$(CONFIG_YADIF_FILTER)                  += x86/vf_yadif_init.o
+
+YASM-OBJS-$(CONFIG_HQDN3D_FILTER)            += x86/vf_hqdn3d.o
+YASM-OBJS-$(CONFIG_VOLUME_FILTER)            += x86/af_volume.o
+YASM-OBJS-$(CONFIG_YADIF_FILTER)             += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o
diff --git a/ffmpeg/libavfilter/x86/af_volume.asm b/ffmpeg/libavfilter/x86/af_volume.asm
new file mode 100644
index 0000000..f4cbcbc
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/af_volume.asm
@@ -0,0 +1,140 @@
+;*****************************************************************************
+;* x86-optimized functions for volume filter
+;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pd_1_256:     times 4 dq 0x3F70000000000000 ; 1.0/256 as an IEEE-754 double, 4 copies
+pd_int32_max: times 4 dq 0x41DFFFFFFFC00000 ; 2147483647.0 (INT32_MAX) as a double, 4 copies
+pw_1:         times 8 dw 1                  ; word 1: multiplies the rounding term in the s16 pmaddwd
+pw_128:       times 8 dw 128                ; word 128: rounding term added before the >> 8
+pq_128:       times 2 dq 128                ; qword 128: rounding term for the 64-bit products
+
+SECTION_TEXT
+
+;------------------------------------------------------------------------------
+; void ff_scale_samples_s16(uint8_t *dst, const uint8_t *src, int len,
+;                           int volume)
+;------------------------------------------------------------------------------
+
+INIT_XMM sse2
+cglobal scale_samples_s16, 4,4,4, dst, src, len, volume
+    movd        m0, volumem             ; low dword of m0 = volume
+    pshuflw     m0, m0, 0               ; low word of volume copied into words 0-3 (only 16 bits of volume are used)
+    punpcklwd   m0, [pw_1]              ; m0 = {volume, 1} repeated as word pairs
+    mova        m1, [pw_128]            ; m1 = 128 in every word (rounding term)
+    lea       lenq, [lend*2-mmsize]     ; byte offset of the last vector; the loop walks towards offset 0
+.loop:
+    ; dst[i] = av_clip_int16((src[i] * volume + 128) >> 8);
+    mova        m2, [srcq+lenq]         ; aligned load of mmsize bytes of int16 samples
+    punpcklwd   m3, m2, m1              ; low half  -> {sample, 128} word pairs
+    punpckhwd   m2, m1                  ; high half -> {sample, 128} word pairs
+    pmaddwd     m3, m0                  ; sample*volume + 128*1, as 32-bit
+    pmaddwd     m2, m0
+    psrad       m3, 8                   ; arithmetic >> 8
+    psrad       m2, 8
+    packssdw    m3, m2                  ; back to int16 with signed saturation (the clip)
+    mova  [dstq+lenq], m3
+    sub       lenq, mmsize              ; step back one vector
+    jge .loop                           ; keep going while the offset is still >= 0
+    REP_RET
+
+;------------------------------------------------------------------------------
+; void ff_scale_samples_s32(uint8_t *dst, const uint8_t *src, int len,
+;                           int volume)
+;------------------------------------------------------------------------------
+
+%macro SCALE_SAMPLES_S32 0 ; emits scale_samples_s32 for the active INIT_* target; CVTDQ2PD must be %define'd by the caller
+cglobal scale_samples_s32, 4,4,4, dst, src, len, volume
+%if ARCH_X86_32 && cpuflag(avx)
+    vbroadcastss   xmm2, volumem        ; broadcast volume from its memory operand
+%else
+    movd           xmm2, volumed
+    pshufd         xmm2, xmm2, 0        ; volume copied into all four dwords
+%endif
+    CVTDQ2PD         m2, xmm2           ; volume as packed doubles
+    mulpd            m2, m2, [pd_1_256] ; m2 = volume / 256.0
+    mova             m3, [pd_int32_max] ; m3 = upper clip bound
+    lea            lenq, [lend*4-mmsize] ; byte offset of the last block; the loop walks towards offset 0
+.loop:
+    CVTDQ2PD         m0, [srcq+lenq         ] ; first half of the block, int32 -> double
+    CVTDQ2PD         m1, [srcq+lenq+mmsize/2] ; second half of the block
+    mulpd            m0, m0, m2         ; sample * volume/256
+    mulpd            m1, m1, m2
+    minpd            m0, m0, m3         ; clamp from above only; no explicit lower clamp is applied here
+    minpd            m1, m1, m3
+    cvtpd2dq       xmm0, m0             ; double -> int32
+    cvtpd2dq       xmm1, m1
+%if cpuflag(avx)
+    vmovdqa [dstq+lenq         ], xmm0  ; avx build: 16 bytes (4 results) per store
+    vmovdqa [dstq+lenq+mmsize/2], xmm1
+%else
+    movq    [dstq+lenq         ], xmm0  ; sse2 build: 8 bytes (2 results) per store
+    movq    [dstq+lenq+mmsize/2], xmm1
+%endif
+    sub            lenq, mmsize         ; step back one block
+    jge .loop                           ; keep going while the offset is still >= 0
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+%define CVTDQ2PD cvtdq2pd
+SCALE_SAMPLES_S32 ; sse2 build, using the legacy-encoded conversion
+%if HAVE_AVX_EXTERNAL
+%define CVTDQ2PD vcvtdq2pd
+INIT_YMM avx
+SCALE_SAMPLES_S32 ; avx build: same body with ymm registers and the VEX-encoded conversion
+%endif
+%undef CVTDQ2PD
+
+; NOTE: This is not bit-identical with the C version because it clips to
+;       [-INT_MAX, INT_MAX] instead of [INT_MIN, INT_MAX]
+
+INIT_XMM ssse3, atom
+cglobal scale_samples_s32, 4,4,8, dst, src, len, volume
+    movd        m4, volumem
+    pshufd      m4, m4, 0               ; volume copied into all four dwords
+    mova        m5, [pq_128]            ; rounding term for the 64-bit products
+    pxor        m6, m6                  ; m6 = 0, used by the overflow compare
+    lea       lenq, [lend*4-mmsize]     ; byte offset of the last vector; the loop walks towards offset 0
+.loop:
+    ; dst[i] = av_clipl_int32((src[i] * volume + 128) >> 8);
+    mova        m7, [srcq+lenq]         ; keep the signed samples for psignd below
+    pabsd       m3, m7                  ; work on magnitudes
+    pshufd      m0, m3, q0100           ; samples 0 and 1 into dwords 0 and 2 (the lanes pmuludq reads)
+    pshufd      m1, m3, q0302           ; samples 2 and 3 into dwords 0 and 2
+    pmuludq     m0, m4                  ; unsigned 32x32 -> 64-bit products
+    pmuludq     m1, m4
+    paddq       m0, m5                  ; + 128
+    paddq       m1, m5
+    psrlq       m0, 7                   ; 7 of the 8 shift bits; the last one follows the overflow merge
+    psrlq       m1, 7
+    shufps      m2, m0, m1, q3131       ; m2 = high dwords of the four 64-bit results
+    shufps      m0, m0, m1, q2020       ; m0 = low dwords of the four 64-bit results
+    pcmpgtd     m2, m6                  ; all-ones where the high dword is > 0, i.e. the result does not fit
+    por         m0, m2                  ; force those lanes to 0xFFFFFFFF
+    psrld       m0, 1                   ; final >> 1; forced lanes become 0x7FFFFFFF
+    psignd      m0, m7                  ; reapply the sign of the source sample
+    mova  [dstq+lenq], m0
+    sub       lenq, mmsize              ; step back one vector
+    jge .loop                           ; keep going while the offset is still >= 0
+    REP_RET
diff --git a/ffmpeg/libavfilter/x86/af_volume_init.c b/ffmpeg/libavfilter/x86/af_volume_init.c
new file mode 100644
index 0000000..beee8ca
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/af_volume_init.c
@@ -0,0 +1,59 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/cpu.h"
+#include "libavutil/samplefmt.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/af_volume.h"
+
+void ff_scale_samples_s16_sse2(uint8_t *dst, const uint8_t *src, int len,
+                               int volume);
+
+void ff_scale_samples_s32_sse2(uint8_t *dst, const uint8_t *src, int len,
+                               int volume);
+void ff_scale_samples_s32_ssse3_atom(uint8_t *dst, const uint8_t *src, int len,
+                                     int volume);
+void ff_scale_samples_s32_avx(uint8_t *dst, const uint8_t *src, int len,
+                              int volume);
+
+void ff_volume_init_x86(VolumeContext *vol)
+{ /* Install an x86 scale_samples kernel and its sample alignment for the S16/S32 formats. */
+    int mm_flags = av_get_cpu_flags();
+    enum AVSampleFormat sample_fmt = av_get_packed_sample_fmt(vol->sample_fmt);
+
+    if (sample_fmt == AV_SAMPLE_FMT_S16) {
+        if (EXTERNAL_SSE2(mm_flags) && vol->volume_i < 32768) { /* the s16 kernel multiplies by a signed 16-bit word (pmaddwd) */
+            vol->scale_samples = ff_scale_samples_s16_sse2;
+            vol->samples_align = 8; /* 8 int16 samples per 16-byte vector */
+        }
+    } else if (sample_fmt == AV_SAMPLE_FMT_S32) {
+        if (EXTERNAL_SSE2(mm_flags)) { /* the checks below run in order, so the last match wins */
+            vol->scale_samples = ff_scale_samples_s32_sse2;
+            vol->samples_align = 4;
+        }
+        if (EXTERNAL_SSSE3(mm_flags) && mm_flags & AV_CPU_FLAG_ATOM) { /* integer-only variant, chosen only when the Atom flag is set */
+            vol->scale_samples = ff_scale_samples_s32_ssse3_atom;
+            vol->samples_align = 4;
+        }
+        if (EXTERNAL_AVX(mm_flags)) {
+            vol->scale_samples = ff_scale_samples_s32_avx;
+            vol->samples_align = 8; /* the ymm loop consumes 32 bytes (8 int32 samples) per iteration */
+        }
+    }
+}
diff --git a/ffmpeg/libavfilter/x86/vf_gradfun.c b/ffmpeg/libavfilter/x86/vf_gradfun.c
new file mode 100644
index 0000000..214e764
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/vf_gradfun.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (C) 2009 Loren Merritt <lorenm@u.washington.edu>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavfilter/gradfun.h"
+
+#if HAVE_INLINE_ASM
+
+DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F};
+DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
+
+#if HAVE_MMXEXT_INLINE
+static void gradfun_filter_line_mmxext(uint8_t *dst, const uint8_t *src, const uint16_t *dc,
+                                       int width, int thresh,
+                                       const uint16_t *dithers)
+{ // MMXEXT line filter: 4 pixels per step, alternating between the two halves of the 8-entry dither row
+    intptr_t x;
+    if (width & 3) { // hand the 1-3 trailing pixels to the C version
+        x = width & ~3;
+        ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
+        width = x;
+    }
+    x = -width; // negative offset counting up to 0; the pointer operands are biased by +width
+    __asm__ volatile( // NOTE(review): the loop body runs once before any test, even if width is 0 here - confirm callers pass width >= 4
+        "movd          %4, %%mm5 \n" // mm5 = thresh
+        "pxor       %%mm7, %%mm7 \n" // mm7 = 0
+        "pshufw $0, %%mm5, %%mm5 \n" // thresh in all 4 words
+        "movq          %6, %%mm6 \n" // mm6 = pw_7f
+        "movq          (%5), %%mm3 \n" // mm3 = dithers[0..3]
+        "movq         8(%5), %%mm4 \n" // mm4 = dithers[4..7]
+
+        "1: \n"
+        "movd     (%2,%0), %%mm0 \n" // 4 source pixels
+        "movd     (%3,%0), %%mm1 \n" // 2 dc words
+        "punpcklbw  %%mm7, %%mm0 \n" // pixels widened to words
+        "punpcklwd  %%mm1, %%mm1 \n" // each dc word duplicated: one dc value per 2 pixels
+        "psllw         $7, %%mm0 \n" // pix <<= 7
+        "pxor       %%mm2, %%mm2 \n"
+        "psubw      %%mm0, %%mm1 \n" // delta = dc - pix
+        "psubw      %%mm1, %%mm2 \n" // mm2 = -delta
+        "pmaxsw     %%mm1, %%mm2 \n" // mm2 = abs(delta)
+        "pmulhuw    %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
+        "psubw      %%mm6, %%mm2 \n"
+        "pminsw     %%mm7, %%mm2 \n" // m = -max(0, 127-m)
+        "pmullw     %%mm2, %%mm2 \n" // m = m*m
+        "paddw      %%mm3, %%mm0 \n" // pix += dither
+        "psllw         $2, %%mm1 \n" // m = m*m*delta >> 14
+        "pmulhw     %%mm2, %%mm1 \n"
+        "paddw      %%mm1, %%mm0 \n" // pix += m
+        "psraw         $7, %%mm0 \n"
+        "packuswb   %%mm0, %%mm0 \n"
+        "movd       %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
+        "add           $4, %0 \n"
+        "jnl 2f \n" // offset reached 0: done
+
+        "movd     (%2,%0), %%mm0 \n" // same steps for the next 4 pixels, using the second dither half
+        "movd     (%3,%0), %%mm1 \n"
+        "punpcklbw  %%mm7, %%mm0 \n"
+        "punpcklwd  %%mm1, %%mm1 \n"
+        "psllw         $7, %%mm0 \n"
+        "pxor       %%mm2, %%mm2 \n"
+        "psubw      %%mm0, %%mm1 \n" // delta = dc - pix
+        "psubw      %%mm1, %%mm2 \n"
+        "pmaxsw     %%mm1, %%mm2 \n"
+        "pmulhuw    %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
+        "psubw      %%mm6, %%mm2 \n"
+        "pminsw     %%mm7, %%mm2 \n" // m = -max(0, 127-m)
+        "pmullw     %%mm2, %%mm2 \n"
+        "paddw      %%mm4, %%mm0 \n" // pix += dither
+        "psllw         $2, %%mm1 \n" // m = m*m*delta >> 14
+        "pmulhw     %%mm2, %%mm1 \n"
+        "paddw      %%mm1, %%mm0 \n" // pix += m
+        "psraw         $7, %%mm0 \n"
+        "packuswb   %%mm0, %%mm0 \n"
+        "movd       %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
+        "add           $4, %0 \n"
+        "jl 1b \n" // offset still negative: next 8 pixels
+
+        "2: \n"
+        "emms \n" // leave MMX state before returning to C code
+        :"+r"(x) // %0: running byte offset
+        :"r"(dst+width), "r"(src+width), "r"(dc+width/2), // %1, %2, %3: end-biased pointers
+         "rm"(thresh), "r"(dithers), "m"(*pw_7f) // %4, %5, %6
+        :"memory"
+    );
+}
+#endif
+
+#if HAVE_SSSE3_INLINE
+static void gradfun_filter_line_ssse3(uint8_t *dst, const uint8_t *src, const uint16_t *dc, int width, int thresh, const uint16_t *dithers)
+{ // SSSE3 line filter: 8 pixels per loop iteration
+    intptr_t x;
+    if (width & 7) {
+        // could be 10% faster if I somehow eliminated this
+        x = width & ~7; // the 1-7 trailing pixels go to the C version
+        ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
+        width = x;
+    }
+    x = -width; // negative offset counting up to 0; the pointer operands are biased by +width
+    __asm__ volatile( // NOTE(review): the loop body runs once before any test, even if width is 0 here - confirm callers pass width >= 8
+        "movd           %4, %%xmm5 \n" // xmm5 = thresh
+        "pxor       %%xmm7, %%xmm7 \n" // xmm7 = 0
+        "pshuflw $0,%%xmm5, %%xmm5 \n" // thresh in the low 4 words
+        "movdqa         %6, %%xmm6 \n" // xmm6 = pw_7f
+        "punpcklqdq %%xmm5, %%xmm5 \n" // thresh in all 8 words
+        "movdqa         %5, %%xmm4 \n" // aligned 16-byte load of the dither row
+        "1: \n"
+        "movq      (%2,%0), %%xmm0 \n" // 8 source pixels
+        "movq      (%3,%0), %%xmm1 \n" // 4 dc words
+        "punpcklbw  %%xmm7, %%xmm0 \n" // pixels widened to words
+        "punpcklwd  %%xmm1, %%xmm1 \n" // each dc word duplicated: one dc value per 2 pixels
+        "psllw          $7, %%xmm0 \n" // pix <<= 7
+        "psubw      %%xmm0, %%xmm1 \n" // delta = dc - pix
+        "pabsw      %%xmm1, %%xmm2 \n" // abs(delta)
+        "pmulhuw    %%xmm5, %%xmm2 \n" // m = abs(delta) * thresh >> 16
+        "psubw      %%xmm6, %%xmm2 \n"
+        "pminsw     %%xmm7, %%xmm2 \n" // m = -max(0, 127-m)
+        "pmullw     %%xmm2, %%xmm2 \n" // m = m*m
+        "psllw          $2, %%xmm1 \n"
+        "paddw      %%xmm4, %%xmm0 \n" // pix += dither
+        "pmulhw     %%xmm2, %%xmm1 \n" // m = m*m*delta >> 14
+        "paddw      %%xmm1, %%xmm0 \n" // pix += m
+        "psraw          $7, %%xmm0 \n"
+        "packuswb   %%xmm0, %%xmm0 \n"
+        "movq       %%xmm0, (%1,%0) \n" // dst = clip(pix>>7)
+        "add            $8, %0 \n"
+        "jl 1b \n" // offset still negative: next 8 pixels
+        :"+&r"(x) // %0: running byte offset
+        :"r"(dst+width), "r"(src+width), "r"(dc+width/2), // %1, %2, %3: end-biased pointers
+         "rm"(thresh), "m"(*dithers), "m"(*pw_7f) // %4, %5, %6
+        :"memory"
+    );
+}
+#endif /* HAVE_SSSE3_INLINE */
+
+#if HAVE_SSE2_INLINE
+static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, const uint16_t *buf1, const uint8_t *src, int src_linesize, int width)
+{ /* Per output word: sum a 2x2 block of src bytes, add buf1[], store the total in buf[] and write (total - previous buf[]) to dc[]. */
+#define BLURV(load)\
+    intptr_t x = -2*width;\
+    __asm__ volatile(\
+        "movdqa %6, %%xmm7 \n" /* xmm7 = pw_ff, the low-byte mask */\
+        "1: \n"\
+        load"   (%4,%0), %%xmm0 \n" /* 16 bytes of the first source row */\
+        load"   (%5,%0), %%xmm1 \n" /* 16 bytes of the row src_linesize below */\
+        "movdqa  %%xmm0, %%xmm2 \n"\
+        "movdqa  %%xmm1, %%xmm3 \n"\
+        "psrlw       $8, %%xmm0 \n" /* odd bytes as words */\
+        "psrlw       $8, %%xmm1 \n"\
+        "pand    %%xmm7, %%xmm2 \n" /* even bytes as words */\
+        "pand    %%xmm7, %%xmm3 \n"\
+        "paddw   %%xmm1, %%xmm0 \n"\
+        "paddw   %%xmm3, %%xmm2 \n"\
+        "paddw   %%xmm2, %%xmm0 \n" /* xmm0 = 2x2 sums */\
+        "paddw  (%2,%0), %%xmm0 \n" /* + buf1[] */\
+        "movdqa (%1,%0), %%xmm1 \n" /* previous buf[] contents */\
+        "movdqa  %%xmm0, (%1,%0) \n" /* buf[] = new total */\
+        "psubw   %%xmm1, %%xmm0 \n"\
+        "movdqa  %%xmm0, (%3,%0) \n" /* dc[] = new total - previous buf[] */\
+        "add        $16, %0 \n"\
+        "jl 1b \n"\
+        :"+&r"(x)\
+        :"r"(buf+width),\
+         "r"(buf1+width),\
+         "r"(dc+width),\
+         "r"(src+width*2),\
+         "r"(src+width*2+src_linesize),\
+         "m"(*pw_ff)\
+        :"memory"\
+    );
+    if (((intptr_t) src | src_linesize) & 15) { /* pointer or stride not a multiple of 16: unaligned source loads */
+        BLURV("movdqu");
+    } else { /* both rows are 16-byte aligned */
+        BLURV("movdqa");
+    }
+}
+#endif /* HAVE_SSE2_INLINE */
+
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void ff_gradfun_init_x86(GradFunContext *gf)
+{ /* Install the inline-asm line filters that were compiled in and that the CPU reports support for. */
+    int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_MMXEXT_INLINE
+    if (cpu_flags & AV_CPU_FLAG_MMXEXT)
+        gf->filter_line = gradfun_filter_line_mmxext;
+#endif
+#if HAVE_SSSE3_INLINE
+    if (cpu_flags & AV_CPU_FLAG_SSSE3) /* checked after MMXEXT, so it replaces that choice when both apply */
+        gf->filter_line = gradfun_filter_line_ssse3;
+#endif
+#if HAVE_SSE2_INLINE
+    if (cpu_flags & AV_CPU_FLAG_SSE2) /* blur_line has only this one x86 implementation here */
+        gf->blur_line = gradfun_blur_line_sse2;
+#endif
+}
diff --git a/ffmpeg/libavfilter/x86/vf_hqdn3d.asm b/ffmpeg/libavfilter/x86/vf_hqdn3d.asm
new file mode 100644
index 0000000..961127e
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/vf_hqdn3d.asm
@@ -0,0 +1,106 @@
+;******************************************************************************
+;* Copyright (c) 2012 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro LOWPASS 3 ; prevsample, cursample, lut -- result replaces prevsample; uses the lut_bits value assigned by the caller
+    sub    %1q, %2q                     ; prev - cur
+%if lut_bits != 8
+    sar    %1q, 8-lut_bits              ; drop low bits of the difference before indexing
+%endif
+    movsx  %1d, word [%3q+%1q*2]        ; signed 16-bit table entry for that difference (the index can be negative)
+    add    %1d, %2d                     ; result = cur + table entry
+%endmacro
+
+%macro LOAD 3 ; dstreg, x, bitdepth -- reads sample x from srcq and scales it to 16 bits
+%if %3 == 8
+    movzx  %1, byte [srcq+%2]           ; one byte per sample
+%else
+    movzx  %1, word [srcq+(%2)*2]       ; two bytes per sample
+%endif
+%if %3 != 16
+    shl    %1, 16-%3                    ; move the sample into the top bits of a 16-bit value
+    add    %1, (1<<(15-%3))-1           ; add one less than half of an input step
+%endif
+%endmacro
+
+%macro HQDN3D_ROW 1 ; bitdepth -- emits hqdn3d_row_<bitdepth>_x86, which filters one row of samples
+%if ARCH_X86_64
+cglobal hqdn3d_row_%1_x86, 7,10,0, src, dst, lineant, frameant, width, spatial, temporal, pixelant, t0, t1
+%else
+cglobal hqdn3d_row_%1_x86, 7,7,0, src, dst, lineant, frameant, width, spatial, temporal
+%endif
+    %assign bytedepth (%1+7)>>3 ; bytes per sample: 1 for 8-bit, 2 for the deeper formats
+    %assign lut_bits 4+4*(%1/16) ; 4 for depths below 16, 8 for 16-bit
+    dec    widthq                       ; index of the last sample
+    lea    srcq, [srcq+widthq*bytedepth] ; point every array at its last element
+    lea    dstq, [dstq+widthq*bytedepth]
+    lea    frameantq, [frameantq+widthq*2]
+    lea    lineantq,  [lineantq+widthq*2]
+    neg    widthq                       ; index now runs from -(width-1) up to 0
+    %define xq widthq
+%if ARCH_X86_32
+    mov    dstmp, dstq                  ; x86_32: store the adjusted pointers back into their argument slots
+    mov    srcmp, srcq
+    mov    frameantmp, frameantq
+    mov    lineantmp,  lineantq         ; the defines below make dst/frameant/lineant share r0 with src
+    %define dstq r0
+    %define frameantq r0
+    %define lineantq  r0
+    %define pixelantq r1
+    %define pixelantd r1d
+    DECLARE_REG_TMP 2,3
+%endif
+    LOAD   pixelantd, xq, %1            ; prime pixelant with the first sample of the row
+ALIGN 16
+.loop:
+    movifnidn srcq, srcmp
+    LOAD      t0d, xq+1, %1 ; skip on the last iteration to avoid overread
+.loop2:
+    movifnidn lineantq, lineantmp
+    movzx     t1d, word [lineantq+xq*2] ; t1 = lineant[x]
+    LOWPASS   t1, pixelant, spatial     ; t1 = pixelant filtered against lineant[x]
+    mov       [lineantq+xq*2], t1w      ; lineant[x] = t1
+    LOWPASS   pixelant, t0, spatial     ; pixelant = look-ahead sample filtered against the old pixelant
+    movifnidn frameantq, frameantmp
+    movzx     t0d, word [frameantq+xq*2] ; t0 = frameant[x]
+    LOWPASS   t0, t1, temporal          ; t0 = t1 filtered against frameant[x]
+    mov       [frameantq+xq*2], t0w     ; frameant[x] = t0
+    movifnidn dstq, dstmp
+%if %1 != 16
+    shr    t0d, 16-%1 ; could eliminate this by storing from t0h, but only with some constraints on register allocation
+%endif
+%if %1 == 8
+    mov    [dstq+xq], t0b
+%else
+    mov    [dstq+xq*2], t0w
+%endif
+    inc    xq
+    jl .loop                            ; more samples after the next one: take the look-ahead load
+    je .loop2                           ; next sample is the last: skip the look-ahead load
+    REP_RET
+%endmacro ; HQDN3D_ROW
+
+HQDN3D_ROW 8
+HQDN3D_ROW 9
+HQDN3D_ROW 10
+HQDN3D_ROW 16
diff --git a/ffmpeg/libavfilter/x86/vf_hqdn3d_init.c b/ffmpeg/libavfilter/x86/vf_hqdn3d_init.c
new file mode 100644
index 0000000..4abb878
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/vf_hqdn3d_init.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2012 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavfilter/vf_hqdn3d.h"
+#include "config.h"
+
+void ff_hqdn3d_row_8_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal);
+void ff_hqdn3d_row_9_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal);
+void ff_hqdn3d_row_10_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal);
+void ff_hqdn3d_row_16_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal);
+
+av_cold void ff_hqdn3d_init_x86(HQDN3DContext *hqdn3d)
+{ /* Point the per-bit-depth row functions at the assembly versions; no CPU-flag test is made here. */
+#if HAVE_YASM
+    hqdn3d->denoise_row[ 8] = ff_hqdn3d_row_8_x86; /* the table index is the bit depth */
+    hqdn3d->denoise_row[ 9] = ff_hqdn3d_row_9_x86;
+    hqdn3d->denoise_row[10] = ff_hqdn3d_row_10_x86;
+    hqdn3d->denoise_row[16] = ff_hqdn3d_row_16_x86;
+#endif
+}
diff --git a/ffmpeg/libavfilter/x86/vf_yadif.asm b/ffmpeg/libavfilter/x86/vf_yadif.asm
new file mode 100644
index 0000000..ebc505c
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/vf_yadif.asm
@@ -0,0 +1,252 @@
+;*****************************************************************************
+;* x86-optimized functions for yadif filter
+;*
+;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_1: times 16 db 1
+pw_1: times  8 dw 1
+
+SECTION .text
+
+%macro CHECK 2 ; byte offsets on the t1 and t0 lines; expects m7 = 0; leaves candidate value in m5 and its score in m2
+    movu      m2, [curq+t1+%1]
+    movu      m3, [curq+t0+%2]
+    mova      m4, m2
+    mova      m5, m2
+    pxor      m4, m3
+    pavgb     m5, m3                    ; (a + b + 1) >> 1
+    pand      m4, [pb_1]                ; 1 where a + b is odd
+    psubusb   m5, m4                    ; turn the rounded-up average into (a + b) >> 1
+%if mmsize == 16
+    psrldq    m5, 1                     ; discard byte 0 so the averages line up with the centre of the score window
+%else
+    psrlq     m5, 8
+%endif
+    punpcklbw m5, m7                    ; candidate values as words
+    mova      m4, m2
+    psubusb   m2, m3
+    psubusb   m3, m4
+    pmaxub    m2, m3                    ; m2 = |a - b| per byte
+    mova      m3, m2
+    mova      m4, m2
+%if mmsize == 16
+    psrldq    m3, 1                     ; the same differences one byte further along
+    psrldq    m4, 2                     ; and two bytes further along
+%else
+    psrlq     m3, 8
+    psrlq     m4, 16
+%endif
+    punpcklbw m2, m7
+    punpcklbw m3, m7
+    punpcklbw m4, m7
+    paddw     m2, m3
+    paddw     m2, m4                    ; score = sum of three neighbouring absolute differences
+%endmacro
+
+%macro CHECK1 0 ; keep the candidate from CHECK (m5, score m2) where it beats the best score in m0; the win mask is left in m6
+    mova    m3, m0
+    pcmpgtw m3, m2                      ; mask: best score > new score
+    pminsw  m0, m2                      ; best score = min(best, new)
+    mova    m6, m3                      ; remember the mask for CHECK2
+    pand    m5, m3                      ; new value where it won
+    pandn   m3, m1                      ; old value elsewhere
+    por     m3, m5
+    mova    m1, m3                      ; m1 = updated prediction
+%endmacro
+
+%macro CHECK2 0 ; like CHECK1, but the new candidate is penalised in lanes where the preceding CHECK1 did not win
+    paddw   m6, [pw_1]                  ; mask -1 -> 0, mask 0 -> 1
+    psllw   m6, 14                      ; 0 or 16384
+    paddsw  m2, m6                      ; saturating add of the penalty to the new score
+    mova    m3, m0
+    pcmpgtw m3, m2                      ; mask: best score > penalised new score
+    pminsw  m0, m2
+    pand    m5, m3                      ; new value where it won
+    pandn   m3, m1                      ; old value elsewhere
+    por     m3, m5
+    mova    m1, m3                      ; m1 = updated prediction
+%endmacro
+
+%macro LOAD 2 ; dstreg, memory operand -- loads mmsize/2 bytes and widens them to words; expects m7 = 0
+    movh      %1, %2
+    punpcklbw %1, m7                    ; interleave with zero bytes: unsigned bytes -> words
+%endmacro
+
+%macro FILTER 3 ; label suffix, first and second temporal-neighbour pointers; filters mmsize/2 pixels per iteration
+.loop%1:
+    pxor         m7, m7                 ; zero for the byte -> word unpacks
+    LOAD         m0, [curq+t1]          ; c = cur[t1]
+    LOAD         m1, [curq+t0]          ; e = cur[t0]
+    LOAD         m2, [%2]
+    LOAD         m3, [%3]
+    mova         m4, m3
+    paddw        m3, m2
+    psraw        m3, 1                  ; d = average of the two temporal neighbours
+    mova   [rsp+ 0], m0                 ; spill c
+    mova   [rsp+16], m3                 ; spill d
+    mova   [rsp+32], m1                 ; spill e
+    psubw        m2, m4
+    ABS1         m2, m4                 ; |difference of the two temporal neighbours|
+    LOAD         m3, [prevq+t1]
+    LOAD         m4, [prevq+t0]
+    psubw        m3, m0
+    psubw        m4, m1
+    ABS1         m3, m5
+    ABS1         m4, m5
+    paddw        m3, m4                 ; |prev[t1] - c| + |prev[t0] - e|
+    psrlw        m2, 1
+    psrlw        m3, 1
+    pmaxsw       m2, m3
+    LOAD         m3, [nextq+t1]
+    LOAD         m4, [nextq+t0]
+    psubw        m3, m0
+    psubw        m4, m1
+    ABS1         m3, m5
+    ABS1         m4, m5
+    paddw        m3, m4                 ; |next[t1] - c| + |next[t0] - e|
+    psrlw        m3, 1
+    pmaxsw       m2, m3                 ; diff = max of the three halved differences
+    mova   [rsp+48], m2                 ; spill diff
+
+    paddw        m1, m0                 ; c + e
+    paddw        m0, m0
+    psubw        m0, m1                 ; 2c - (c + e) = c - e
+    psrlw        m1, 1                  ; m1 = initial prediction (c + e) >> 1
+    ABS1         m0, m2                 ; m0 = |c - e|
+
+    movu         m2, [curq+t1-1]
+    movu         m3, [curq+t0-1]
+    mova         m4, m2
+    psubusb      m2, m3
+    psubusb      m3, m4
+    pmaxub       m2, m3                 ; byte-wise |cur[t1+k] - cur[t0+k]| starting at k = -1
+%if mmsize == 16
+    mova         m3, m2
+    psrldq       m3, 2                  ; the same differences starting at k = +1
+%else
+    pshufw       m3, m2, q0021
+%endif
+    punpcklbw    m2, m7
+    punpcklbw    m3, m7
+    paddw        m0, m2
+    paddw        m0, m3
+    psubw        m0, [pw_1]             ; initial score = sum of the three differences, minus 1
+
+    CHECK -2, 0                         ; score window centred at -1 on the t1 line, +1 on the t0 line
+    CHECK1
+    CHECK -3, 1                         ; window centres -2 / +2
+    CHECK2
+    CHECK 0, -2                         ; window centres +1 / -1
+    CHECK1
+    CHECK 1, -3                         ; window centres +2 / -2
+    CHECK2
+
+    mova         m6, [rsp+48]           ; m6 = diff
+    cmp   DWORD r8m, 2                  ; r8 is the mode argument
+    jge .end%1                          ; mode >= 2: skip the widening of diff below
+    LOAD         m2, [%2+t1*2]
+    LOAD         m4, [%3+t1*2]
+    LOAD         m3, [%2+t0*2]
+    LOAD         m5, [%3+t0*2]
+    paddw        m2, m4
+    paddw        m3, m5
+    psrlw        m2, 1                  ; b = temporal average two t1 steps away
+    psrlw        m3, 1                  ; f = temporal average two t0 steps away
+    mova         m4, [rsp+ 0]           ; c
+    mova         m5, [rsp+16]           ; d
+    mova         m7, [rsp+32]           ; e (m7 is no longer zero from here on)
+    psubw        m2, m4                 ; b - c
+    psubw        m3, m7                 ; f - e
+    mova         m0, m5
+    psubw        m5, m4                 ; d - c
+    psubw        m0, m7                 ; d - e
+    mova         m4, m2
+    pminsw       m2, m3                 ; min(b - c, f - e)
+    pmaxsw       m3, m4                 ; max(b - c, f - e)
+    pmaxsw       m2, m5
+    pminsw       m3, m5
+    pmaxsw       m2, m0                 ; m2 = max(d - e, d - c, min(b - c, f - e))
+    pminsw       m3, m0                 ; m3 = min(d - e, d - c, max(b - c, f - e))
+    pxor         m4, m4
+    pmaxsw       m6, m3
+    psubw        m4, m2
+    pmaxsw       m6, m4                 ; diff = max(diff, m3, -m2)
+
+.end%1:
+    mova         m2, [rsp+16]           ; d
+    mova         m3, m2
+    psubw        m2, m6                 ; d - diff
+    paddw        m3, m6                 ; d + diff
+    pmaxsw       m1, m2
+    pminsw       m1, m3                 ; clamp the prediction to [d - diff, d + diff]
+    packuswb     m1, m1                 ; back to bytes with unsigned saturation
+
+    movh     [dstq], m1                 ; store mmsize/2 output pixels
+    add        dstq, mmsize/2
+    add       prevq, mmsize/2
+    add        curq, mmsize/2
+    add       nextq, mmsize/2
+    sub   DWORD r4m, mmsize/2           ; r4 is the w argument: pixels remaining
+    jg .loop%1
+%endmacro
+
+%macro YADIF 0 ; emits yadif_filter_line for the active INIT_* target
+%if ARCH_X86_32
+cglobal yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \
+                                        mrefs, parity, mode
+%else
+cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \
+                                        mrefs, parity, mode
+%endif
+%if ARCH_X86_32
+    mov            r4, r5mp             ; r4 = prefs
+    mov            r5, r6mp             ; r5 = mrefs
+    DECLARE_REG_TMP 4,5
+%else
+    movsxd         r5, DWORD r5m        ; r5 = prefs, sign-extended for use in addresses
+    movsxd         r6, DWORD r6m        ; r6 = mrefs, sign-extended for use in addresses
+    DECLARE_REG_TMP 5,6
+%endif
+
+    cmp DWORD paritym, 0
+    je .parity0
+    FILTER 1, prevq, curq               ; parity != 0: temporal neighbours are prev and cur
+    jmp .ret
+
+.parity0:
+    FILTER 0, curq, nextq               ; parity == 0: temporal neighbours are cur and next
+
+.ret:
+    RET
+%endmacro
+
+INIT_XMM ssse3
+YADIF
+INIT_XMM sse2
+YADIF
+%if ARCH_X86_32
+INIT_MMX mmxext
+YADIF
+%endif
diff --git a/ffmpeg/libavfilter/x86/vf_yadif_init.c b/ffmpeg/libavfilter/x86/vf_yadif_init.c
new file mode 100644
index 0000000..58f2fc6
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/vf_yadif_init.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/x86/dsputil_mmx.h"
+#include "libavfilter/yadif.h"
+
+void ff_yadif_filter_line_mmxext(void *dst, void *prev, void *cur,
+                                 void *next, int w, int prefs,
+                                 int mrefs, int parity, int mode);
+void ff_yadif_filter_line_sse2(void *dst, void *prev, void *cur,
+                               void *next, int w, int prefs,
+                               int mrefs, int parity, int mode);
+void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cur,
+                                void *next, int w, int prefs,
+                                int mrefs, int parity, int mode);
+
+void ff_yadif_filter_line_16bit_mmxext(void *dst, void *prev, void *cur,
+                                       void *next, int w, int prefs,
+                                       int mrefs, int parity, int mode);
+void ff_yadif_filter_line_16bit_sse2(void *dst, void *prev, void *cur,
+                                     void *next, int w, int prefs,
+                                     int mrefs, int parity, int mode);
+void ff_yadif_filter_line_16bit_ssse3(void *dst, void *prev, void *cur,
+                                      void *next, int w, int prefs,
+                                      int mrefs, int parity, int mode);
+void ff_yadif_filter_line_16bit_sse4(void *dst, void *prev, void *cur,
+                                     void *next, int w, int prefs,
+                                     int mrefs, int parity, int mode);
+
+void ff_yadif_filter_line_10bit_mmxext(void *dst, void *prev, void *cur,
+                                       void *next, int w, int prefs,
+                                       int mrefs, int parity, int mode);
+void ff_yadif_filter_line_10bit_sse2(void *dst, void *prev, void *cur,
+                                     void *next, int w, int prefs,
+                                     int mrefs, int parity, int mode);
+void ff_yadif_filter_line_10bit_ssse3(void *dst, void *prev, void *cur,
+                                      void *next, int w, int prefs,
+                                      int mrefs, int parity, int mode);
+
+av_cold void ff_yadif_init_x86(YADIFContext *yadif)
+{ /* Pick filter_line by bit depth, then by CPU flags; within each depth class the last matching check wins. */
+    int cpu_flags = av_get_cpu_flags();
+    int bit_depth = (!yadif->csp) ? 8 /* no pixel format descriptor: treated as 8-bit */
+                                  : yadif->csp->comp[0].depth_minus1 + 1;
+
+#if HAVE_YASM
+    if (bit_depth >= 15) { /* 15 bits and up use the "16bit" kernels */
+#if ARCH_X86_32
+        if (EXTERNAL_MMXEXT(cpu_flags)) /* the MMX variants are only referenced on 32-bit builds */
+            yadif->filter_line = ff_yadif_filter_line_16bit_mmxext;
+#endif /* ARCH_X86_32 */
+        if (EXTERNAL_SSE2(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_16bit_sse2;
+        if (EXTERNAL_SSSE3(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_16bit_ssse3;
+        if (EXTERNAL_SSE4(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_16bit_sse4;
+    } else if ( bit_depth >= 9 && bit_depth <= 14) { /* 9 to 14 bits use the "10bit" kernels */
+#if ARCH_X86_32
+        if (EXTERNAL_MMXEXT(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_10bit_mmxext;
+#endif /* ARCH_X86_32 */
+        if (EXTERNAL_SSE2(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_10bit_sse2;
+        if (EXTERNAL_SSSE3(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_10bit_ssse3;
+    } else { /* 8 bits or fewer use the byte kernels */
+#if ARCH_X86_32
+        if (EXTERNAL_MMXEXT(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_mmxext;
+#endif /* ARCH_X86_32 */
+        if (EXTERNAL_SSE2(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_sse2;
+        if (EXTERNAL_SSSE3(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_ssse3;
+    }
+#endif /* HAVE_YASM */
+}
diff --git a/ffmpeg/libavfilter/x86/yadif-10.asm b/ffmpeg/libavfilter/x86/yadif-10.asm
new file mode 100644
index 0000000..d586deb
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/yadif-10.asm
@@ -0,0 +1,282 @@
+;*****************************************************************************
+;* x86-optimized functions for yadif filter
+;*
+;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
+;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_1: times 8 dw 1 ; eight words of 1: parity mask in CHECK, and the -1 bias in FILTER
+
+SECTION .text
+
+%macro PABS 2 ; %1 = |%1| per signed word; %2 is scratch, written only on the non-SSSE3 path
+%if cpuflag(ssse3)
+    pabsw %1, %1
+%else
+    pxor    %2, %2 ; %2 = 0
+    pcmpgtw %2, %1 ; %2 = all-ones in lanes where %1 < 0
+    pxor    %1, %2 ; bitwise-invert the negative lanes
+    psubw   %1, %2 ; subtract -1 there (adds 1), completing the negation
+%endif
+%endmacro
+
+%macro PMAXUW 2 ; %1 = max(%1, %2) per unsigned word; %2 is not modified
+%if cpuflag(sse4)
+    pmaxuw %1, %2
+%else
+    psubusw %1, %2 ; %1 = %1 - %2, saturating at 0
+    paddusw %1, %2 ; adding %2 back gives the larger of the two inputs
+%endif
+%endmacro
+
+%macro CHECK 2 ; %1/%2 = pixel offsets on the t1/t0 rows of cur; out: m2 = 3-lane sum of |a-b|, m5 = (a+b)>>1 moved down one lane; clobbers m3, m4
+    movu      m2, [curq+t1+%1*2] ; a: t1 row, offset by %1 pixels (2 bytes each)
+    movu      m3, [curq+t0+%2*2] ; b: t0 row, offset by %2 pixels
+    mova      m4, m2
+    mova      m5, m2
+    pxor      m4, m3 ; a ^ b
+    pavgw     m5, m3 ; (a + b + 1) >> 1
+    pand      m4, [pw_1] ; 1 in lanes where a + b is odd
+    psubusw   m5, m4 ; remove pavgw's round-up: m5 = (a + b) >> 1
+%if mmsize == 16
+    psrldq    m5, 2 ; lane i now holds the average from lane i+1
+%else
+    psrlq     m5, 16 ; same one-word shift for 8-byte registers
+%endif
+    mova      m4, m2
+    psubusw   m2, m3 ; max(a - b, 0)
+    psubusw   m3, m4 ; max(b - a, 0)
+    PMAXUW    m2, m3 ; m2 = |a - b|
+    mova      m3, m2
+    mova      m4, m2
+%if mmsize == 16
+    psrldq    m3, 2 ; |a - b| from lane i+1
+    psrldq    m4, 4 ; |a - b| from lane i+2
+%else
+    psrlq     m3, 16 ; |a - b| from lane i+1
+    psrlq     m4, 32 ; |a - b| from lane i+2
+%endif
+    paddw     m2, m3
+    paddw     m2, m4 ; lane i = |a-b| summed over lanes i, i+1, i+2
+%endmacro
+
+%macro CHECK1 0 ; per lane: where m0 > m2, take m2 into m0 and m5 into m1; leaves the selection mask in m6; clobbers m3, m5
+    mova    m3, m0
+    pcmpgtw m3, m2 ; mask = all-ones where m0 > m2 (signed words)
+    pminsw  m0, m2 ; m0 = min(m0, m2)
+    mova    m6, m3 ; keep the mask for the CHECK2 that follows
+    pand    m5, m3 ; m5 in selected lanes
+    pandn   m3, m1 ; m1 in the remaining lanes
+    por     m3, m5 ; mask ? m5 : m1
+    mova    m1, m3
+%endmacro
+
+; %macro CHECK2 0
+;     paddw   m6, [pw_1]
+;     psllw   m6, 14
+;     paddsw  m2, m6
+;     mova    m3, m0
+;     pcmpgtw m3, m2
+;     pminsw  m0, m2
+;     pand    m5, m3
+;     pandn   m3, m1
+;     por     m3, m5
+;     mova    m1, m3
+; %endmacro
+
+; This version of CHECK2 is required for 14-bit samples.  The left-shift trick
+; in the old code is not large enough to correctly select pixels or scores.
+
+%macro CHECK2 0 ; like CHECK1, but a lane is replaced only if the mask left in m6 is also set there; clobbers m2, m3, m5, m6
+    mova    m3, m0 ; keep the old m0
+    pcmpgtw m0, m2 ; all-ones where old m0 > m2 (signed words)
+    pand    m0, m6 ; ...restricted to lanes flagged in m6
+    mova    m6, m0 ; combined mask, second copy
+    pand    m5, m6 ; m5 in selected lanes
+    pand    m2, m0 ; m2 in selected lanes
+    pandn   m6, m1 ; m1 in the remaining lanes
+    pandn   m0, m3 ; old m0 in the remaining lanes
+    por     m6, m5 ; mask ? m5 : m1
+    por     m0, m2 ; mask ? m2 : old m0
+    mova    m1, m6
+%endmacro
+
+%macro LOAD 2 ; %1 = unaligned full-register load from memory operand %2
+    movu      %1, %2
+%endmacro
+
+%macro FILTER 3 ; %1 = label suffix, %2/%3 = the two pointers averaged for the temporal prediction; filters mmsize/2-2 pixels per pass until w <= 0
+.loop%1:
+    pxor         m7, m7 ; m7 = 0; nothing in this macro reads it before it is reloaded below
+    LOAD         m0, [curq+t1] ; c = cur[t1]
+    LOAD         m1, [curq+t0] ; e = cur[t0]
+    LOAD         m2, [%2]
+    LOAD         m3, [%3]
+    mova         m4, m3
+    paddw        m3, m2
+    psraw        m3, 1 ; d = (%2[0] + %3[0]) >> 1
+    mova   [rsp+ 0], m0 ; spill c
+    mova   [rsp+16], m3 ; spill d
+    mova   [rsp+32], m1 ; spill e
+    psubw        m2, m4
+    PABS         m2, m4 ; |%2[0] - %3[0]|
+    LOAD         m3, [prevq+t1]
+    LOAD         m4, [prevq+t0]
+    psubw        m3, m0
+    psubw        m4, m1
+    PABS         m3, m5 ; |prev[t1] - c|
+    PABS         m4, m5 ; |prev[t0] - e|
+    paddw        m3, m4
+    psrlw        m2, 1 ; first difference term, halved
+    psrlw        m3, 1 ; prev-based difference term, halved
+    pmaxsw       m2, m3
+    LOAD         m3, [nextq+t1]
+    LOAD         m4, [nextq+t0]
+    psubw        m3, m0
+    psubw        m4, m1
+    PABS         m3, m5 ; |next[t1] - c|
+    PABS         m4, m5 ; |next[t0] - e|
+    paddw        m3, m4
+    psrlw        m3, 1 ; next-based difference term, halved
+    pmaxsw       m2, m3 ; diff = max of the three terms
+    mova   [rsp+48], m2 ; spill diff
+
+    paddw        m1, m0 ; c + e
+    paddw        m0, m0 ; 2c
+    psubw        m0, m1 ; c - e
+    psrlw        m1, 1 ; m1 = (c + e) >> 1, the initial prediction
+    PABS         m0, m2 ; |c - e|
+
+    movu         m2, [curq+t1-1*2] ; t1 row starting one pixel to the left
+    movu         m3, [curq+t0-1*2] ; t0 row starting one pixel to the left
+    mova         m4, m2
+    psubusw      m2, m3
+    psubusw      m3, m4
+    PMAXUW       m2, m3 ; |difference| of the two rows at offset -1
+%if mmsize == 16
+    mova         m3, m2
+    psrldq       m3, 4 ; the same |difference| two lanes on, i.e. at offset +1
+%else
+    mova         m3, m2
+    psrlq        m3, 32 ; the same |difference| two lanes on, i.e. at offset +1
+%endif
+    paddw        m0, m2
+    paddw        m0, m3
+    psubw        m0, [pw_1] ; m0 = |c-e| + |diff at -1| + |diff at +1| - 1
+
+    CHECK -2, 0
+    CHECK1 ; take the candidate where it beats m0; mask -> m6
+    CHECK -3, 1
+    CHECK2 ; considered only where the CHECK1 above replaced
+    CHECK 0, -2
+    CHECK1
+    CHECK 1, -3
+    CHECK2
+
+    mova         m6, [rsp+48] ; m6 = diff
+    cmp   DWORD r8m, 2 ; r8m is the mode argument
+    jge .end%1 ; mode >= 2: skip the extra bound on diff
+    LOAD         m2, [%2+t1*2]
+    LOAD         m4, [%3+t1*2]
+    LOAD         m3, [%2+t0*2]
+    LOAD         m5, [%3+t0*2]
+    paddw        m2, m4
+    paddw        m3, m5
+    psrlw        m2, 1 ; b = (%2[2*t1] + %3[2*t1]) >> 1
+    psrlw        m3, 1 ; f = (%2[2*t0] + %3[2*t0]) >> 1
+    mova         m4, [rsp+ 0] ; c
+    mova         m5, [rsp+16] ; d
+    mova         m7, [rsp+32] ; e
+    psubw        m2, m4 ; b - c
+    psubw        m3, m7 ; f - e
+    mova         m0, m5
+    psubw        m5, m4 ; d - c
+    psubw        m0, m7 ; d - e
+    mova         m4, m2
+    pminsw       m2, m3 ; min(b-c, f-e)
+    pmaxsw       m3, m4 ; max(b-c, f-e)
+    pmaxsw       m2, m5
+    pminsw       m3, m5
+    pmaxsw       m2, m0 ; m2 = max(d-e, d-c, min(b-c, f-e))
+    pminsw       m3, m0 ; m3 = min(d-e, d-c, max(b-c, f-e))
+    pxor         m4, m4
+    pmaxsw       m6, m3 ; diff = max(diff, m3)
+    psubw        m4, m2 ; -m2
+    pmaxsw       m6, m4 ; diff = max(diff, -m2)
+
+.end%1:
+    mova         m2, [rsp+16]
+    mova         m3, m2
+    psubw        m2, m6 ; d - diff
+    paddw        m3, m6 ; d + diff
+    pmaxsw       m1, m2
+    pminsw       m1, m3 ; clamp the prediction to [d - diff, d + diff]
+
+    movu     [dstq], m1 ; stores a full register (mmsize/2 pixels)
+    add        dstq, mmsize-4 ; ...but advances two pixels fewer, so the top two lanes get rewritten by the next pass
+    add       prevq, mmsize-4
+    add        curq, mmsize-4
+    add       nextq, mmsize-4
+    sub   DWORD r4m, mmsize/2-2 ; w -= pixels advanced (r4m is the w argument)
+    jg .loop%1 ; repeat while w > 0 (signed)
+%endmacro
+
+%macro YADIF 0 ; emits yadif_filter_line_10bit for the instruction set chosen by the preceding INIT_* line
+%if ARCH_X86_32
+cglobal yadif_filter_line_10bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
+                                              prefs, mrefs, parity, mode
+%else
+cglobal yadif_filter_line_10bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
+                                              prefs, mrefs, parity, mode
+%endif
+%if ARCH_X86_32
+    mov            r4, r5mp ; prefs
+    mov            r5, r6mp ; mrefs
+    DECLARE_REG_TMP 4,5 ; t0 = r4 (prefs), t1 = r5 (mrefs)
+%else
+    movsxd         r5, DWORD r5m ; prefs, sign-extended from 32 bits
+    movsxd         r6, DWORD r6m ; mrefs, sign-extended from 32 bits
+    DECLARE_REG_TMP 5,6 ; t0 = r5 (prefs), t1 = r6 (mrefs)
+%endif
+
+    cmp DWORD paritym, 0
+    je .parity0
+    FILTER 1, prevq, curq ; parity != 0: temporal average of prev and cur
+    jmp .ret
+
+.parity0:
+    FILTER 0, curq, nextq ; parity == 0: temporal average of cur and next
+
+.ret:
+    RET
+%endmacro
+
+INIT_XMM ssse3 ; 16-byte registers; PABS uses pabsw
+YADIF
+INIT_XMM sse2 ; 16-byte registers; PABS takes its four-instruction fallback
+YADIF
+%if ARCH_X86_32
+INIT_MMX mmxext ; 8-byte registers; assembled for 32-bit x86 only
+YADIF
+%endif
diff --git a/ffmpeg/libavfilter/x86/yadif-16.asm b/ffmpeg/libavfilter/x86/yadif-16.asm
new file mode 100644
index 0000000..a2e6006
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/yadif-16.asm
@@ -0,0 +1,347 @@
+;*****************************************************************************
+;* x86-optimized functions for yadif filter
+;*
+;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
+;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_1:    times 8 dw 1 ; words of 1: parity mask in CHECK
+pw_8000: times 8 dw 0x8000 ; word bias added back in PACK's pre-SSE4 path
+pd_1:    times 4 dd 1 ; dwords of 1: used by CHECK2 and FILTER
+pd_8000: times 4 dd 0x8000 ; dword bias subtracted in PACK's pre-SSE4 path
+
+SECTION .text
+
+%macro PIXSHIFT1 1 ; shift %1 right by one 16-bit pixel; no use of this macro appears in this file
+%if cpuflag(sse2)
+    psrldq %1, 2 ; byte-granular shift of the whole register
+%else
+    psrlq %1, 16 ; bit-granular shift of the 64-bit register
+%endif
+%endmacro
+
+%macro PIXSHIFT2 1 ; shift %1 right by two 16-bit pixels; no use of this macro appears in this file
+%if cpuflag(sse2)
+    psrldq %1, 4 ; byte-granular shift of the whole register
+%else
+    psrlq %1, 32 ; bit-granular shift of the 64-bit register
+%endif
+%endmacro
+
+%macro PABS 2 ; %1 = |%1| per signed dword; %2 is scratch, written only on the non-SSSE3 path
+%if cpuflag(ssse3)
+    pabsd %1, %1
+%else
+    pxor    %2, %2 ; %2 = 0
+    pcmpgtd %2, %1 ; %2 = all-ones in lanes where %1 < 0
+    pxor    %1, %2 ; bitwise-invert the negative lanes
+    psubd   %1, %2 ; subtract -1 there (adds 1), completing the negation
+%endif
+%endmacro
+
+%macro PACK 1 ; narrow the dwords of %1 to unsigned words (result duplicated in both halves of %1)
+%if cpuflag(sse4)
+    packusdw %1, %1
+%else
+    psubd    %1, [pd_8000] ; bias by -0x8000 so the unsigned 16-bit range maps onto the signed one
+    packssdw %1, %1 ; signed-saturating narrow
+    paddw    %1, [pw_8000] ; undo the bias on the packed words
+%endif
+%endmacro
+
+%macro PMINSD 3 ; %1 = min(%1, %2) per signed dword; %3 is scratch, written only on the pre-SSE4 path
+%if cpuflag(sse4)
+    pminsd %1, %2
+%else
+    mova    %3, %2
+    pcmpgtd %3, %1 ; mask = all-ones where %2 > %1
+    pand    %1, %3 ; keep %1 where it is the smaller
+    pandn   %3, %2 ; take %2 elsewhere
+    por     %1, %3
+%endif
+%endmacro
+
+%macro PMAXSD 3 ; %1 = max(%1, %2) per signed dword; %3 is scratch, written only on the pre-SSE4 path
+%if cpuflag(sse4)
+    pmaxsd %1, %2
+%else
+    mova    %3, %1
+    pcmpgtd %3, %2 ; mask = all-ones where %1 > %2
+    pand    %1, %3 ; keep %1 where it is the larger
+    pandn   %3, %2 ; take %2 elsewhere
+    por     %1, %3
+%endif
+%endmacro
+
+%macro PMAXUW 2 ; %1 = max(%1, %2) per unsigned word; %2 is not modified
+%if cpuflag(sse4)
+    pmaxuw %1, %2
+%else
+    psubusw %1, %2 ; %1 = %1 - %2, saturating at 0
+    paddusw %1, %2 ; adding %2 back gives the larger of the two inputs
+%endif
+%endmacro
+
+%macro CHECK 2 ; %1/%2 = pixel offsets on the t1/t0 rows of cur; out (dwords): m2 = 3-lane sum of |a-b|, m5 = (a+b)>>1 from one lane up; needs m7 = 0; clobbers m3, m4
+    movu      m2, [curq+t1+%1*2] ; a: t1 row, offset by %1 pixels (2 bytes each)
+    movu      m3, [curq+t0+%2*2] ; b: t0 row, offset by %2 pixels
+    mova      m4, m2
+    mova      m5, m2
+    pxor      m4, m3 ; a ^ b
+    pavgw     m5, m3 ; (a + b + 1) >> 1
+    pand      m4, [pw_1] ; 1 in lanes where a + b is odd
+    psubusw   m5, m4 ; remove pavgw's round-up: m5 = (a + b) >> 1
+%if mmsize == 16
+    psrldq    m5, 2 ; word lane i now holds the average from lane i+1
+%else
+    psrlq     m5, 16 ; same one-word shift for 8-byte registers
+%endif
+    punpcklwd m5, m7 ; zero-extend the low words to dwords
+    mova      m4, m2
+    psubusw   m2, m3 ; max(a - b, 0)
+    psubusw   m3, m4 ; max(b - a, 0)
+    PMAXUW    m2, m3 ; m2 = |a - b| as words
+    mova      m3, m2
+    mova      m4, m2
+%if mmsize == 16
+    psrldq    m3, 2 ; |a - b| from lane i+1
+    psrldq    m4, 4 ; |a - b| from lane i+2
+%else
+    psrlq     m3, 16 ; |a - b| from lane i+1
+    psrlq     m4, 32 ; |a - b| from lane i+2
+%endif
+    punpcklwd m2, m7 ; widen all three to dwords before summing
+    punpcklwd m3, m7
+    punpcklwd m4, m7
+    paddd     m2, m3
+    paddd     m2, m4 ; dword lane i = |a-b| summed over word lanes i, i+1, i+2
+%endmacro
+
+%macro CHECK1 0 ; per lane: where m0 > m2, take m2 into m0 and m5 into m1; leaves the selection mask in m6; clobbers m3, m5
+    mova    m3, m0
+    pcmpgtd m3, m2 ; mask = all-ones where m0 > m2 (signed dwords)
+    PMINSD  m0, m2, m6 ; m0 = min(m0, m2); m6 is only scratch here
+    mova    m6, m3 ; keep the mask for the CHECK2 that follows
+    pand    m5, m3 ; m5 in selected lanes
+    pandn   m3, m1 ; m1 in the remaining lanes
+    por     m3, m5 ; mask ? m5 : m1
+    mova    m1, m3
+%endmacro
+
+%macro CHECK2 0 ; like CHECK1, but m2 is penalised in lanes where the mask left in m6 is clear; clobbers m2..m6
+    paddd   m6, [pd_1] ; mask -1/0 becomes 0/1
+    pslld   m6, 30 ; becomes 0 / 0x40000000
+    paddd   m2, m6 ; large penalty where the mask was clear, so m0 > m2 is expected to fail there
+    mova    m3, m0
+    pcmpgtd m3, m2 ; mask = all-ones where m0 > m2 (signed dwords)
+    PMINSD  m0, m2, m4 ; m0 = min(m0, m2); m4 is scratch
+    pand    m5, m3 ; m5 in selected lanes
+    pandn   m3, m1 ; m1 in the remaining lanes
+    por     m3, m5 ; mask ? m5 : m1
+    mova    m1, m3
+%endmacro
+
+; This version of CHECK2 has 3 fewer instructions on sets older than SSE4 but I
+; am not sure whether it is any faster.  A rewrite or refactor of the filter
+; code should make it possible to eliminate the move instruction at the end.  It
+; exists to satisfy the expectation that the "score" values are in m1.
+
+; %macro CHECK2 0
+;     mova    m3, m0
+;     pcmpgtd m0, m2
+;     pand    m0, m6
+;     mova    m6, m0
+;     pand    m5, m6
+;     pand    m2, m0
+;     pandn   m6, m1
+;     pandn   m0, m3
+;     por     m6, m5
+;     por     m0, m2
+;     mova    m1, m6
+; %endmacro
+
+%macro LOAD 2 ; %1 = half a register of words from %2, zero-extended to dwords; needs m7 = 0
+    movh      %1, %2 ; load mmsize/2 bytes into the low half
+    punpcklwd %1, m7 ; interleave with zero words -> dwords
+%endmacro
+
+%macro FILTER 3 ; %1 = label suffix, %2/%3 = the two pointers averaged for the temporal prediction; filters mmsize/4 pixels per pass (dword math) until w <= 0
+.loop%1:
+    pxor         m7, m7 ; m7 = 0, needed by LOAD and CHECK for widening
+    LOAD         m0, [curq+t1] ; c = cur[t1]
+    LOAD         m1, [curq+t0] ; e = cur[t0]
+    LOAD         m2, [%2]
+    LOAD         m3, [%3]
+    mova         m4, m3
+    paddd        m3, m2
+    psrad        m3, 1 ; d = (%2[0] + %3[0]) >> 1
+    mova   [rsp+ 0], m0 ; spill c
+    mova   [rsp+16], m3 ; spill d
+    mova   [rsp+32], m1 ; spill e
+    psubd        m2, m4
+    PABS         m2, m4 ; |%2[0] - %3[0]|
+    LOAD         m3, [prevq+t1]
+    LOAD         m4, [prevq+t0]
+    psubd        m3, m0
+    psubd        m4, m1
+    PABS         m3, m5 ; |prev[t1] - c|
+    PABS         m4, m5 ; |prev[t0] - e|
+    paddd        m3, m4
+    psrld        m2, 1 ; first difference term, halved
+    psrld        m3, 1 ; prev-based difference term, halved
+    PMAXSD       m2, m3, m6
+    LOAD         m3, [nextq+t1]
+    LOAD         m4, [nextq+t0]
+    psubd        m3, m0
+    psubd        m4, m1
+    PABS         m3, m5 ; |next[t1] - c|
+    PABS         m4, m5 ; |next[t0] - e|
+    paddd        m3, m4
+    psrld        m3, 1 ; next-based difference term, halved
+    PMAXSD       m2, m3, m6 ; diff = max of the three terms
+    mova   [rsp+48], m2 ; spill diff
+
+    paddd        m1, m0 ; c + e
+    paddd        m0, m0 ; 2c
+    psubd        m0, m1 ; c - e
+    psrld        m1, 1 ; m1 = (c + e) >> 1, the initial prediction
+    PABS         m0, m2 ; |c - e|
+
+    movu         m2, [curq+t1-1*2] ; t1 row starting one pixel to the left (still words)
+    movu         m3, [curq+t0-1*2] ; t0 row starting one pixel to the left
+    mova         m4, m2
+    psubusw      m2, m3
+    psubusw      m3, m4
+    PMAXUW       m2, m3 ; |difference| of the two rows at offset -1
+%if mmsize == 16
+    mova         m3, m2
+    psrldq       m3, 4 ; the same |difference| two lanes on, i.e. at offset +1
+%else
+    mova         m3, m2
+    psrlq        m3, 32 ; the same |difference| two lanes on, i.e. at offset +1
+%endif
+    punpcklwd    m2, m7 ; widen both to dwords
+    punpcklwd    m3, m7
+    paddd        m0, m2
+    paddd        m0, m3
+    psubd        m0, [pd_1] ; m0 = |c-e| + |diff at -1| + |diff at +1| - 1
+
+    CHECK -2, 0
+    CHECK1 ; take the candidate where it beats m0; mask -> m6
+    CHECK -3, 1
+    CHECK2 ; penalised unless the CHECK1 above replaced
+    CHECK 0, -2
+    CHECK1
+    CHECK 1, -3
+    CHECK2
+
+    mova         m6, [rsp+48] ; m6 = diff
+    cmp   DWORD r8m, 2 ; r8m is the mode argument
+    jge .end%1 ; mode >= 2: skip the extra bound on diff
+    LOAD         m2, [%2+t1*2]
+    LOAD         m4, [%3+t1*2]
+    LOAD         m3, [%2+t0*2]
+    LOAD         m5, [%3+t0*2]
+    paddd        m2, m4
+    paddd        m3, m5
+    psrld        m2, 1 ; b = (%2[2*t1] + %3[2*t1]) >> 1
+    psrld        m3, 1 ; f = (%2[2*t0] + %3[2*t0]) >> 1
+    mova         m4, [rsp+ 0] ; c
+    mova         m5, [rsp+16] ; d
+    mova         m7, [rsp+32] ; e; m7 is no longer zero (no LOAD follows before the next pxor)
+    psubd        m2, m4 ; b - c
+    psubd        m3, m7 ; f - e
+    mova         m0, m5
+    psubd        m5, m4 ; d - c
+    psubd        m0, m7 ; d - e
+    mova         m4, m2
+    PMINSD       m2, m3, m7 ; min(b-c, f-e); m7 is scratch from here on
+    PMAXSD       m3, m4, m7 ; max(b-c, f-e)
+    PMAXSD       m2, m5, m7
+    PMINSD       m3, m5, m7
+    PMAXSD       m2, m0, m7 ; m2 = max(d-e, d-c, min(b-c, f-e))
+    PMINSD       m3, m0, m7 ; m3 = min(d-e, d-c, max(b-c, f-e))
+    pxor         m4, m4
+    PMAXSD       m6, m3, m7 ; diff = max(diff, m3)
+    psubd        m4, m2 ; -m2
+    PMAXSD       m6, m4, m7 ; diff = max(diff, -m2)
+
+.end%1:
+    mova         m2, [rsp+16]
+    mova         m3, m2
+    psubd        m2, m6 ; d - diff
+    paddd        m3, m6 ; d + diff
+    PMAXSD       m1, m2, m7
+    PMINSD       m1, m3, m7 ; clamp the prediction to [d - diff, d + diff]
+    PACK         m1 ; dwords back to 16-bit pixels
+
+    movh     [dstq], m1 ; store mmsize/2 bytes = mmsize/4 pixels
+    add        dstq, mmsize/2
+    add       prevq, mmsize/2
+    add        curq, mmsize/2
+    add       nextq, mmsize/2
+    sub   DWORD r4m, mmsize/4 ; w -= pixels written (r4m is the w argument)
+    jg .loop%1 ; repeat while w > 0 (signed)
+%endmacro
+
+%macro YADIF 0 ; emits yadif_filter_line_16bit for the instruction set chosen by the preceding INIT_* line
+%if ARCH_X86_32
+cglobal yadif_filter_line_16bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
+                                              prefs, mrefs, parity, mode
+%else
+cglobal yadif_filter_line_16bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
+                                              prefs, mrefs, parity, mode
+%endif
+%if ARCH_X86_32
+    mov            r4, r5mp ; prefs
+    mov            r5, r6mp ; mrefs
+    DECLARE_REG_TMP 4,5 ; t0 = r4 (prefs), t1 = r5 (mrefs)
+%else
+    movsxd         r5, DWORD r5m ; prefs, sign-extended from 32 bits
+    movsxd         r6, DWORD r6m ; mrefs, sign-extended from 32 bits
+    DECLARE_REG_TMP 5,6 ; t0 = r5 (prefs), t1 = r6 (mrefs)
+%endif
+
+    cmp DWORD paritym, 0
+    je .parity0
+    FILTER 1, prevq, curq ; parity != 0: temporal average of prev and cur
+    jmp .ret
+
+.parity0:
+    FILTER 0, curq, nextq ; parity == 0: temporal average of cur and next
+
+.ret:
+    RET
+%endmacro
+
+INIT_XMM sse4 ; 16-byte registers; native pminsd/pmaxsd/pmaxuw/packusdw
+YADIF
+INIT_XMM ssse3 ; 16-byte registers; pabsd available, SSE4 helpers emulated
+YADIF
+INIT_XMM sse2 ; 16-byte registers; every helper macro takes its fallback path
+YADIF
+%if ARCH_X86_32
+INIT_MMX mmxext ; 8-byte registers; assembled for 32-bit x86 only
+YADIF
+%endif
author	Tim Redfern <tim@eclectronics.org>	2013-09-05 17:57:22 +0100
committer	Tim Redfern <tim@eclectronics.org>	2013-09-05 17:57:22 +0100
commit	8992cb1d0d07edc33d274f6d7924ecdf6f83d994 (patch)
tree	3a2c86846b7eec8137c1507e623fc7018f13d453 /ffmpeg/libavfilter/x86
parent	741fb4b9e135cfb161a749db88713229038577bb (diff)