diff options
| author | Tim Redfern <tim@eclectronics.org> | 2013-09-05 17:57:22 +0100 |
|---|---|---|
| committer | Tim Redfern <tim@eclectronics.org> | 2013-09-05 17:57:22 +0100 |
| commit | 8992cb1d0d07edc33d274f6d7924ecdf6f83d994 (patch) | |
| tree | 3a2c86846b7eec8137c1507e623fc7018f13d453 /ffmpeg/libavfilter/x86 | |
| parent | 741fb4b9e135cfb161a749db88713229038577bb (diff) | |
making act segmenter
Diffstat (limited to 'ffmpeg/libavfilter/x86')
| -rw-r--r-- | ffmpeg/libavfilter/x86/Makefile | 8 | ||||
| -rw-r--r-- | ffmpeg/libavfilter/x86/af_volume.asm | 140 | ||||
| -rw-r--r-- | ffmpeg/libavfilter/x86/af_volume_init.c | 59 | ||||
| -rw-r--r-- | ffmpeg/libavfilter/x86/vf_gradfun.c | 217 | ||||
| -rw-r--r-- | ffmpeg/libavfilter/x86/vf_hqdn3d.asm | 106 | ||||
| -rw-r--r-- | ffmpeg/libavfilter/x86/vf_hqdn3d_init.c | 41 | ||||
| -rw-r--r-- | ffmpeg/libavfilter/x86/vf_yadif.asm | 252 | ||||
| -rw-r--r-- | ffmpeg/libavfilter/x86/vf_yadif_init.c | 100 | ||||
| -rw-r--r-- | ffmpeg/libavfilter/x86/yadif-10.asm | 282 | ||||
| -rw-r--r-- | ffmpeg/libavfilter/x86/yadif-16.asm | 347 |
10 files changed, 1552 insertions, 0 deletions
diff --git a/ffmpeg/libavfilter/x86/Makefile b/ffmpeg/libavfilter/x86/Makefile new file mode 100644 index 0000000..cd97347 --- /dev/null +++ b/ffmpeg/libavfilter/x86/Makefile @@ -0,0 +1,8 @@ +OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o +OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o +OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o +OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o + +YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o +YASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o +YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o diff --git a/ffmpeg/libavfilter/x86/af_volume.asm b/ffmpeg/libavfilter/x86/af_volume.asm new file mode 100644 index 0000000..f4cbcbc --- /dev/null +++ b/ffmpeg/libavfilter/x86/af_volume.asm @@ -0,0 +1,140 @@ +;***************************************************************************** +;* x86-optimized functions for volume filter +;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +pd_1_256: times 4 dq 0x3F70000000000000 +pd_int32_max: times 4 dq 0x41DFFFFFFFC00000 +pw_1: times 8 dw 1 +pw_128: times 8 dw 128 +pq_128: times 2 dq 128 + +SECTION_TEXT + +;------------------------------------------------------------------------------ +; void ff_scale_samples_s16(uint8_t *dst, const uint8_t *src, int len, +; int volume) +;------------------------------------------------------------------------------ + +INIT_XMM sse2 +cglobal scale_samples_s16, 4,4,4, dst, src, len, volume + movd m0, volumem + pshuflw m0, m0, 0 + punpcklwd m0, [pw_1] + mova m1, [pw_128] + lea lenq, [lend*2-mmsize] +.loop: + ; dst[i] = av_clip_int16((src[i] * volume + 128) >> 8); + mova m2, [srcq+lenq] + punpcklwd m3, m2, m1 + punpckhwd m2, m1 + pmaddwd m3, m0 + pmaddwd m2, m0 + psrad m3, 8 + psrad m2, 8 + packssdw m3, m2 + mova [dstq+lenq], m3 + sub lenq, mmsize + jge .loop + REP_RET + +;------------------------------------------------------------------------------ +; void ff_scale_samples_s32(uint8_t *dst, const uint8_t *src, int len, +; int volume) +;------------------------------------------------------------------------------ + +%macro SCALE_SAMPLES_S32 0 +cglobal scale_samples_s32, 4,4,4, dst, src, len, volume +%if ARCH_X86_32 && cpuflag(avx) + vbroadcastss xmm2, volumem +%else + movd xmm2, volumed + pshufd xmm2, xmm2, 0 +%endif + CVTDQ2PD m2, xmm2 + mulpd m2, m2, [pd_1_256] + mova m3, [pd_int32_max] + lea lenq, [lend*4-mmsize] +.loop: + CVTDQ2PD m0, [srcq+lenq ] + CVTDQ2PD m1, [srcq+lenq+mmsize/2] + mulpd m0, m0, m2 + mulpd m1, m1, m2 + minpd m0, m0, m3 + minpd m1, m1, m3 + cvtpd2dq xmm0, m0 + cvtpd2dq xmm1, m1 +%if 
cpuflag(avx) + vmovdqa [dstq+lenq ], xmm0 + vmovdqa [dstq+lenq+mmsize/2], xmm1 +%else + movq [dstq+lenq ], xmm0 + movq [dstq+lenq+mmsize/2], xmm1 +%endif + sub lenq, mmsize + jge .loop + REP_RET +%endmacro + +INIT_XMM sse2 +%define CVTDQ2PD cvtdq2pd +SCALE_SAMPLES_S32 +%if HAVE_AVX_EXTERNAL +%define CVTDQ2PD vcvtdq2pd +INIT_YMM avx +SCALE_SAMPLES_S32 +%endif +%undef CVTDQ2PD + +; NOTE: This is not bit-identical with the C version because it clips to +; [-INT_MAX, INT_MAX] instead of [INT_MIN, INT_MAX] + +INIT_XMM ssse3, atom +cglobal scale_samples_s32, 4,4,8, dst, src, len, volume + movd m4, volumem + pshufd m4, m4, 0 + mova m5, [pq_128] + pxor m6, m6 + lea lenq, [lend*4-mmsize] +.loop: + ; src[i] = av_clipl_int32((src[i] * volume + 128) >> 8); + mova m7, [srcq+lenq] + pabsd m3, m7 + pshufd m0, m3, q0100 + pshufd m1, m3, q0302 + pmuludq m0, m4 + pmuludq m1, m4 + paddq m0, m5 + paddq m1, m5 + psrlq m0, 7 + psrlq m1, 7 + shufps m2, m0, m1, q3131 + shufps m0, m0, m1, q2020 + pcmpgtd m2, m6 + por m0, m2 + psrld m0, 1 + psignd m0, m7 + mova [dstq+lenq], m0 + sub lenq, mmsize + jge .loop + REP_RET diff --git a/ffmpeg/libavfilter/x86/af_volume_init.c b/ffmpeg/libavfilter/x86/af_volume_init.c new file mode 100644 index 0000000..beee8ca --- /dev/null +++ b/ffmpeg/libavfilter/x86/af_volume_init.c @@ -0,0 +1,59 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/cpu.h" +#include "libavutil/samplefmt.h" +#include "libavutil/x86/cpu.h" +#include "libavfilter/af_volume.h" + +void ff_scale_samples_s16_sse2(uint8_t *dst, const uint8_t *src, int len, + int volume); + +void ff_scale_samples_s32_sse2(uint8_t *dst, const uint8_t *src, int len, + int volume); +void ff_scale_samples_s32_ssse3_atom(uint8_t *dst, const uint8_t *src, int len, + int volume); +void ff_scale_samples_s32_avx(uint8_t *dst, const uint8_t *src, int len, + int volume); + +void ff_volume_init_x86(VolumeContext *vol) +{ + int mm_flags = av_get_cpu_flags(); + enum AVSampleFormat sample_fmt = av_get_packed_sample_fmt(vol->sample_fmt); + + if (sample_fmt == AV_SAMPLE_FMT_S16) { + if (EXTERNAL_SSE2(mm_flags) && vol->volume_i < 32768) { + vol->scale_samples = ff_scale_samples_s16_sse2; + vol->samples_align = 8; + } + } else if (sample_fmt == AV_SAMPLE_FMT_S32) { + if (EXTERNAL_SSE2(mm_flags)) { + vol->scale_samples = ff_scale_samples_s32_sse2; + vol->samples_align = 4; + } + if (EXTERNAL_SSSE3(mm_flags) && mm_flags & AV_CPU_FLAG_ATOM) { + vol->scale_samples = ff_scale_samples_s32_ssse3_atom; + vol->samples_align = 4; + } + if (EXTERNAL_AVX(mm_flags)) { + vol->scale_samples = ff_scale_samples_s32_avx; + vol->samples_align = 8; + } + } +} diff --git a/ffmpeg/libavfilter/x86/vf_gradfun.c b/ffmpeg/libavfilter/x86/vf_gradfun.c new file mode 100644 index 0000000..214e764 --- /dev/null +++ b/ffmpeg/libavfilter/x86/vf_gradfun.c @@ -0,0 +1,217 @@ +/* + * Copyright (C) 2009 Loren Merritt <lorenm@u.washignton.edu> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/mem.h" +#include "libavutil/x86/asm.h" +#include "libavfilter/gradfun.h" + +#if HAVE_INLINE_ASM + +DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F}; +DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF}; + +#if HAVE_MMXEXT_INLINE +static void gradfun_filter_line_mmxext(uint8_t *dst, const uint8_t *src, const uint16_t *dc, + int width, int thresh, + const uint16_t *dithers) +{ + intptr_t x; + if (width & 3) { + x = width & ~3; + ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers); + width = x; + } + x = -width; + __asm__ volatile( + "movd %4, %%mm5 \n" + "pxor %%mm7, %%mm7 \n" + "pshufw $0, %%mm5, %%mm5 \n" + "movq %6, %%mm6 \n" + "movq (%5), %%mm3 \n" + "movq 8(%5), %%mm4 \n" + + "1: \n" + "movd (%2,%0), %%mm0 \n" + "movd (%3,%0), %%mm1 \n" + "punpcklbw %%mm7, %%mm0 \n" + "punpcklwd %%mm1, %%mm1 \n" + "psllw $7, %%mm0 \n" + "pxor %%mm2, %%mm2 \n" + "psubw %%mm0, %%mm1 \n" // delta = dc - pix + "psubw %%mm1, %%mm2 \n" + "pmaxsw %%mm1, %%mm2 \n" + "pmulhuw %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16 + "psubw %%mm6, %%mm2 \n" + 
"pminsw %%mm7, %%mm2 \n" // m = -max(0, 127-m) + "pmullw %%mm2, %%mm2 \n" + "paddw %%mm3, %%mm0 \n" // pix += dither + "psllw $2, %%mm1 \n" // m = m*m*delta >> 14 + "pmulhw %%mm2, %%mm1 \n" + "paddw %%mm1, %%mm0 \n" // pix += m + "psraw $7, %%mm0 \n" + "packuswb %%mm0, %%mm0 \n" + "movd %%mm0, (%1,%0) \n" // dst = clip(pix>>7) + "add $4, %0 \n" + "jnl 2f \n" + + "movd (%2,%0), %%mm0 \n" + "movd (%3,%0), %%mm1 \n" + "punpcklbw %%mm7, %%mm0 \n" + "punpcklwd %%mm1, %%mm1 \n" + "psllw $7, %%mm0 \n" + "pxor %%mm2, %%mm2 \n" + "psubw %%mm0, %%mm1 \n" // delta = dc - pix + "psubw %%mm1, %%mm2 \n" + "pmaxsw %%mm1, %%mm2 \n" + "pmulhuw %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16 + "psubw %%mm6, %%mm2 \n" + "pminsw %%mm7, %%mm2 \n" // m = -max(0, 127-m) + "pmullw %%mm2, %%mm2 \n" + "paddw %%mm4, %%mm0 \n" // pix += dither + "psllw $2, %%mm1 \n" // m = m*m*delta >> 14 + "pmulhw %%mm2, %%mm1 \n" + "paddw %%mm1, %%mm0 \n" // pix += m + "psraw $7, %%mm0 \n" + "packuswb %%mm0, %%mm0 \n" + "movd %%mm0, (%1,%0) \n" // dst = clip(pix>>7) + "add $4, %0 \n" + "jl 1b \n" + + "2: \n" + "emms \n" + :"+r"(x) + :"r"(dst+width), "r"(src+width), "r"(dc+width/2), + "rm"(thresh), "r"(dithers), "m"(*pw_7f) + :"memory" + ); +} +#endif + +#if HAVE_SSSE3_INLINE +static void gradfun_filter_line_ssse3(uint8_t *dst, const uint8_t *src, const uint16_t *dc, int width, int thresh, const uint16_t *dithers) +{ + intptr_t x; + if (width & 7) { + // could be 10% faster if I somehow eliminated this + x = width & ~7; + ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers); + width = x; + } + x = -width; + __asm__ volatile( + "movd %4, %%xmm5 \n" + "pxor %%xmm7, %%xmm7 \n" + "pshuflw $0,%%xmm5, %%xmm5 \n" + "movdqa %6, %%xmm6 \n" + "punpcklqdq %%xmm5, %%xmm5 \n" + "movdqa %5, %%xmm4 \n" + "1: \n" + "movq (%2,%0), %%xmm0 \n" + "movq (%3,%0), %%xmm1 \n" + "punpcklbw %%xmm7, %%xmm0 \n" + "punpcklwd %%xmm1, %%xmm1 \n" + "psllw $7, %%xmm0 \n" + "psubw %%xmm0, %%xmm1 \n" // delta = 
dc - pix + "pabsw %%xmm1, %%xmm2 \n" + "pmulhuw %%xmm5, %%xmm2 \n" // m = abs(delta) * thresh >> 16 + "psubw %%xmm6, %%xmm2 \n" + "pminsw %%xmm7, %%xmm2 \n" // m = -max(0, 127-m) + "pmullw %%xmm2, %%xmm2 \n" + "psllw $2, %%xmm1 \n" + "paddw %%xmm4, %%xmm0 \n" // pix += dither + "pmulhw %%xmm2, %%xmm1 \n" // m = m*m*delta >> 14 + "paddw %%xmm1, %%xmm0 \n" // pix += m + "psraw $7, %%xmm0 \n" + "packuswb %%xmm0, %%xmm0 \n" + "movq %%xmm0, (%1,%0) \n" // dst = clip(pix>>7) + "add $8, %0 \n" + "jl 1b \n" + :"+&r"(x) + :"r"(dst+width), "r"(src+width), "r"(dc+width/2), + "rm"(thresh), "m"(*dithers), "m"(*pw_7f) + :"memory" + ); +} +#endif /* HAVE_SSSE3_INLINE */ + +#if HAVE_SSE2_INLINE +static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, const uint16_t *buf1, const uint8_t *src, int src_linesize, int width) +{ +#define BLURV(load)\ + intptr_t x = -2*width;\ + __asm__ volatile(\ + "movdqa %6, %%xmm7 \n"\ + "1: \n"\ + load" (%4,%0), %%xmm0 \n"\ + load" (%5,%0), %%xmm1 \n"\ + "movdqa %%xmm0, %%xmm2 \n"\ + "movdqa %%xmm1, %%xmm3 \n"\ + "psrlw $8, %%xmm0 \n"\ + "psrlw $8, %%xmm1 \n"\ + "pand %%xmm7, %%xmm2 \n"\ + "pand %%xmm7, %%xmm3 \n"\ + "paddw %%xmm1, %%xmm0 \n"\ + "paddw %%xmm3, %%xmm2 \n"\ + "paddw %%xmm2, %%xmm0 \n"\ + "paddw (%2,%0), %%xmm0 \n"\ + "movdqa (%1,%0), %%xmm1 \n"\ + "movdqa %%xmm0, (%1,%0) \n"\ + "psubw %%xmm1, %%xmm0 \n"\ + "movdqa %%xmm0, (%3,%0) \n"\ + "add $16, %0 \n"\ + "jl 1b \n"\ + :"+&r"(x)\ + :"r"(buf+width),\ + "r"(buf1+width),\ + "r"(dc+width),\ + "r"(src+width*2),\ + "r"(src+width*2+src_linesize),\ + "m"(*pw_ff)\ + :"memory"\ + ); + if (((intptr_t) src | src_linesize) & 15) { + BLURV("movdqu"); + } else { + BLURV("movdqa"); + } +} +#endif /* HAVE_SSE2_INLINE */ + +#endif /* HAVE_INLINE_ASM */ + +av_cold void ff_gradfun_init_x86(GradFunContext *gf) +{ + int cpu_flags = av_get_cpu_flags(); + +#if HAVE_MMXEXT_INLINE + if (cpu_flags & AV_CPU_FLAG_MMXEXT) + gf->filter_line = gradfun_filter_line_mmxext; +#endif +#if HAVE_SSSE3_INLINE + if 
(cpu_flags & AV_CPU_FLAG_SSSE3) + gf->filter_line = gradfun_filter_line_ssse3; +#endif +#if HAVE_SSE2_INLINE + if (cpu_flags & AV_CPU_FLAG_SSE2) + gf->blur_line = gradfun_blur_line_sse2; +#endif +} diff --git a/ffmpeg/libavfilter/x86/vf_hqdn3d.asm b/ffmpeg/libavfilter/x86/vf_hqdn3d.asm new file mode 100644 index 0000000..961127e --- /dev/null +++ b/ffmpeg/libavfilter/x86/vf_hqdn3d.asm @@ -0,0 +1,106 @@ +;****************************************************************************** +;* Copyright (c) 2012 Loren Merritt +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +%macro LOWPASS 3 ; prevsample, cursample, lut + sub %1q, %2q +%if lut_bits != 8 + sar %1q, 8-lut_bits +%endif + movsx %1d, word [%3q+%1q*2] + add %1d, %2d +%endmacro + +%macro LOAD 3 ; dstreg, x, bitdepth +%if %3 == 8 + movzx %1, byte [srcq+%2] +%else + movzx %1, word [srcq+(%2)*2] +%endif +%if %3 != 16 + shl %1, 16-%3 + add %1, (1<<(15-%3))-1 +%endif +%endmacro + +%macro HQDN3D_ROW 1 ; bitdepth +%if ARCH_X86_64 +cglobal hqdn3d_row_%1_x86, 7,10,0, src, dst, lineant, frameant, width, spatial, temporal, pixelant, t0, t1 +%else +cglobal hqdn3d_row_%1_x86, 7,7,0, src, dst, lineant, frameant, width, spatial, temporal +%endif + %assign bytedepth (%1+7)>>3 + %assign lut_bits 4+4*(%1/16) + dec widthq + lea srcq, [srcq+widthq*bytedepth] + lea dstq, [dstq+widthq*bytedepth] + lea frameantq, [frameantq+widthq*2] + lea lineantq, [lineantq+widthq*2] + neg widthq + %define xq widthq +%if ARCH_X86_32 + mov dstmp, dstq + mov srcmp, srcq + mov frameantmp, frameantq + mov lineantmp, lineantq + %define dstq r0 + %define frameantq r0 + %define lineantq r0 + %define pixelantq r1 + %define pixelantd r1d + DECLARE_REG_TMP 2,3 +%endif + LOAD pixelantd, xq, %1 +ALIGN 16 +.loop: + movifnidn srcq, srcmp + LOAD t0d, xq+1, %1 ; skip on the last iteration to avoid overread +.loop2: + movifnidn lineantq, lineantmp + movzx t1d, word [lineantq+xq*2] + LOWPASS t1, pixelant, spatial + mov [lineantq+xq*2], t1w + LOWPASS pixelant, t0, spatial + movifnidn frameantq, frameantmp + movzx t0d, word [frameantq+xq*2] + LOWPASS t0, t1, temporal + mov [frameantq+xq*2], t0w + movifnidn dstq, dstmp +%if %1 != 16 + shr t0d, 16-%1 ; could eliminate this by 
storing from t0h, but only with some contraints on register allocation +%endif +%if %1 == 8 + mov [dstq+xq], t0b +%else + mov [dstq+xq*2], t0w +%endif + inc xq + jl .loop + je .loop2 + REP_RET +%endmacro ; HQDN3D_ROW + +HQDN3D_ROW 8 +HQDN3D_ROW 9 +HQDN3D_ROW 10 +HQDN3D_ROW 16 diff --git a/ffmpeg/libavfilter/x86/vf_hqdn3d_init.c b/ffmpeg/libavfilter/x86/vf_hqdn3d_init.c new file mode 100644 index 0000000..4abb878 --- /dev/null +++ b/ffmpeg/libavfilter/x86/vf_hqdn3d_init.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2012 Loren Merritt + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include <stddef.h> +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavfilter/vf_hqdn3d.h" +#include "config.h" + +void ff_hqdn3d_row_8_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal); +void ff_hqdn3d_row_9_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal); +void ff_hqdn3d_row_10_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal); +void ff_hqdn3d_row_16_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal); + +av_cold void ff_hqdn3d_init_x86(HQDN3DContext *hqdn3d) +{ +#if HAVE_YASM + hqdn3d->denoise_row[ 8] = ff_hqdn3d_row_8_x86; + hqdn3d->denoise_row[ 9] = ff_hqdn3d_row_9_x86; + hqdn3d->denoise_row[10] = ff_hqdn3d_row_10_x86; + hqdn3d->denoise_row[16] = ff_hqdn3d_row_16_x86; +#endif +} diff --git a/ffmpeg/libavfilter/x86/vf_yadif.asm b/ffmpeg/libavfilter/x86/vf_yadif.asm new file mode 100644 index 0000000..ebc505c --- /dev/null +++ b/ffmpeg/libavfilter/x86/vf_yadif.asm @@ -0,0 +1,252 @@ +;***************************************************************************** +;* x86-optimized functions for yadif filter +;* +;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at> +;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License along +;* with FFmpeg; if not, write to the Free Software Foundation, Inc., +;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pb_1: times 16 db 1 +pw_1: times 8 dw 1 + +SECTION .text + +%macro CHECK 2 + movu m2, [curq+t1+%1] + movu m3, [curq+t0+%2] + mova m4, m2 + mova m5, m2 + pxor m4, m3 + pavgb m5, m3 + pand m4, [pb_1] + psubusb m5, m4 +%if mmsize == 16 + psrldq m5, 1 +%else + psrlq m5, 8 +%endif + punpcklbw m5, m7 + mova m4, m2 + psubusb m2, m3 + psubusb m3, m4 + pmaxub m2, m3 + mova m3, m2 + mova m4, m2 +%if mmsize == 16 + psrldq m3, 1 + psrldq m4, 2 +%else + psrlq m3, 8 + psrlq m4, 16 +%endif + punpcklbw m2, m7 + punpcklbw m3, m7 + punpcklbw m4, m7 + paddw m2, m3 + paddw m2, m4 +%endmacro + +%macro CHECK1 0 + mova m3, m0 + pcmpgtw m3, m2 + pminsw m0, m2 + mova m6, m3 + pand m5, m3 + pandn m3, m1 + por m3, m5 + mova m1, m3 +%endmacro + +%macro CHECK2 0 + paddw m6, [pw_1] + psllw m6, 14 + paddsw m2, m6 + mova m3, m0 + pcmpgtw m3, m2 + pminsw m0, m2 + pand m5, m3 + pandn m3, m1 + por m3, m5 + mova m1, m3 +%endmacro + +%macro LOAD 2 + movh %1, %2 + punpcklbw %1, m7 +%endmacro + +%macro FILTER 3 +.loop%1: + pxor m7, m7 + LOAD m0, [curq+t1] + LOAD m1, [curq+t0] + LOAD m2, [%2] + LOAD m3, [%3] + mova m4, m3 + paddw m3, m2 + psraw m3, 1 + mova [rsp+ 0], m0 + mova [rsp+16], m3 + mova [rsp+32], m1 + psubw m2, m4 + ABS1 m2, m4 + LOAD m3, [prevq+t1] + LOAD m4, [prevq+t0] + psubw m3, m0 + psubw m4, m1 + ABS1 m3, m5 + ABS1 m4, m5 + paddw m3, m4 + psrlw m2, 1 + psrlw m3, 1 + pmaxsw m2, m3 + LOAD m3, [nextq+t1] + LOAD m4, [nextq+t0] + psubw m3, m0 + psubw m4, m1 + ABS1 m3, m5 + ABS1 m4, m5 + paddw m3, m4 + psrlw m3, 1 + pmaxsw m2, m3 + mova [rsp+48], m2 + + paddw m1, m0 + paddw m0, m0 + psubw 
m0, m1 + psrlw m1, 1 + ABS1 m0, m2 + + movu m2, [curq+t1-1] + movu m3, [curq+t0-1] + mova m4, m2 + psubusb m2, m3 + psubusb m3, m4 + pmaxub m2, m3 +%if mmsize == 16 + mova m3, m2 + psrldq m3, 2 +%else + pshufw m3, m2, q0021 +%endif + punpcklbw m2, m7 + punpcklbw m3, m7 + paddw m0, m2 + paddw m0, m3 + psubw m0, [pw_1] + + CHECK -2, 0 + CHECK1 + CHECK -3, 1 + CHECK2 + CHECK 0, -2 + CHECK1 + CHECK 1, -3 + CHECK2 + + mova m6, [rsp+48] + cmp DWORD r8m, 2 + jge .end%1 + LOAD m2, [%2+t1*2] + LOAD m4, [%3+t1*2] + LOAD m3, [%2+t0*2] + LOAD m5, [%3+t0*2] + paddw m2, m4 + paddw m3, m5 + psrlw m2, 1 + psrlw m3, 1 + mova m4, [rsp+ 0] + mova m5, [rsp+16] + mova m7, [rsp+32] + psubw m2, m4 + psubw m3, m7 + mova m0, m5 + psubw m5, m4 + psubw m0, m7 + mova m4, m2 + pminsw m2, m3 + pmaxsw m3, m4 + pmaxsw m2, m5 + pminsw m3, m5 + pmaxsw m2, m0 + pminsw m3, m0 + pxor m4, m4 + pmaxsw m6, m3 + psubw m4, m2 + pmaxsw m6, m4 + +.end%1: + mova m2, [rsp+16] + mova m3, m2 + psubw m2, m6 + paddw m3, m6 + pmaxsw m1, m2 + pminsw m1, m3 + packuswb m1, m1 + + movh [dstq], m1 + add dstq, mmsize/2 + add prevq, mmsize/2 + add curq, mmsize/2 + add nextq, mmsize/2 + sub DWORD r4m, mmsize/2 + jg .loop%1 +%endmacro + +%macro YADIF 0 +%if ARCH_X86_32 +cglobal yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \ + mrefs, parity, mode +%else +cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \ + mrefs, parity, mode +%endif +%if ARCH_X86_32 + mov r4, r5mp + mov r5, r6mp + DECLARE_REG_TMP 4,5 +%else + movsxd r5, DWORD r5m + movsxd r6, DWORD r6m + DECLARE_REG_TMP 5,6 +%endif + + cmp DWORD paritym, 0 + je .parity0 + FILTER 1, prevq, curq + jmp .ret + +.parity0: + FILTER 0, curq, nextq + +.ret: + RET +%endmacro + +INIT_XMM ssse3 +YADIF +INIT_XMM sse2 +YADIF +%if ARCH_X86_32 +INIT_MMX mmxext +YADIF +%endif diff --git a/ffmpeg/libavfilter/x86/vf_yadif_init.c b/ffmpeg/libavfilter/x86/vf_yadif_init.c new file mode 100644 index 0000000..58f2fc6 --- /dev/null +++ 
b/ffmpeg/libavfilter/x86/vf_yadif_init.c @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/mem.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/x86/dsputil_mmx.h" +#include "libavfilter/yadif.h" + +void ff_yadif_filter_line_mmxext(void *dst, void *prev, void *cur, + void *next, int w, int prefs, + int mrefs, int parity, int mode); +void ff_yadif_filter_line_sse2(void *dst, void *prev, void *cur, + void *next, int w, int prefs, + int mrefs, int parity, int mode); +void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cur, + void *next, int w, int prefs, + int mrefs, int parity, int mode); + +void ff_yadif_filter_line_16bit_mmxext(void *dst, void *prev, void *cur, + void *next, int w, int prefs, + int mrefs, int parity, int mode); +void ff_yadif_filter_line_16bit_sse2(void *dst, void *prev, void *cur, + void *next, int w, int prefs, + int mrefs, int parity, int mode); +void ff_yadif_filter_line_16bit_ssse3(void *dst, void *prev, void *cur, + void *next, int w, int prefs, + int mrefs, int parity, int mode); +void ff_yadif_filter_line_16bit_sse4(void *dst, void 
*prev, void *cur, + void *next, int w, int prefs, + int mrefs, int parity, int mode); + +void ff_yadif_filter_line_10bit_mmxext(void *dst, void *prev, void *cur, + void *next, int w, int prefs, + int mrefs, int parity, int mode); +void ff_yadif_filter_line_10bit_sse2(void *dst, void *prev, void *cur, + void *next, int w, int prefs, + int mrefs, int parity, int mode); +void ff_yadif_filter_line_10bit_ssse3(void *dst, void *prev, void *cur, + void *next, int w, int prefs, + int mrefs, int parity, int mode); + +av_cold void ff_yadif_init_x86(YADIFContext *yadif) +{ + int cpu_flags = av_get_cpu_flags(); + int bit_depth = (!yadif->csp) ? 8 + : yadif->csp->comp[0].depth_minus1 + 1; + +#if HAVE_YASM + if (bit_depth >= 15) { +#if ARCH_X86_32 + if (EXTERNAL_MMXEXT(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_16bit_mmxext; +#endif /* ARCH_X86_32 */ + if (EXTERNAL_SSE2(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_16bit_sse2; + if (EXTERNAL_SSSE3(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_16bit_ssse3; + if (EXTERNAL_SSE4(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_16bit_sse4; + } else if ( bit_depth >= 9 && bit_depth <= 14) { +#if ARCH_X86_32 + if (EXTERNAL_MMXEXT(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_10bit_mmxext; +#endif /* ARCH_X86_32 */ + if (EXTERNAL_SSE2(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_10bit_sse2; + if (EXTERNAL_SSSE3(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_10bit_ssse3; + } else { +#if ARCH_X86_32 + if (EXTERNAL_MMXEXT(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_mmxext; +#endif /* ARCH_X86_32 */ + if (EXTERNAL_SSE2(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_sse2; + if (EXTERNAL_SSSE3(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_ssse3; + } +#endif /* HAVE_YASM */ +} diff --git a/ffmpeg/libavfilter/x86/yadif-10.asm b/ffmpeg/libavfilter/x86/yadif-10.asm new file mode 100644 index 0000000..d586deb --- /dev/null +++ 
b/ffmpeg/libavfilter/x86/yadif-10.asm @@ -0,0 +1,282 @@ +;***************************************************************************** +;* x86-optimized functions for yadif filter +;* +;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at> +;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com> +;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License along +;* with FFmpeg; if not, write to the Free Software Foundation, Inc., +;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pw_1: times 8 dw 1 + +SECTION .text + +%macro PABS 2 +%if cpuflag(ssse3) + pabsw %1, %1 +%else + pxor %2, %2 + pcmpgtw %2, %1 + pxor %1, %2 + psubw %1, %2 +%endif +%endmacro + +%macro PMAXUW 2 +%if cpuflag(sse4) + pmaxuw %1, %2 +%else + psubusw %1, %2 + paddusw %1, %2 +%endif +%endmacro + +%macro CHECK 2 + movu m2, [curq+t1+%1*2] + movu m3, [curq+t0+%2*2] + mova m4, m2 + mova m5, m2 + pxor m4, m3 + pavgw m5, m3 + pand m4, [pw_1] + psubusw m5, m4 +%if mmsize == 16 + psrldq m5, 2 +%else + psrlq m5, 16 +%endif + mova m4, m2 + psubusw m2, m3 + psubusw m3, m4 + PMAXUW m2, m3 + mova m3, m2 + mova m4, m2 +%if mmsize == 16 + psrldq m3, 2 + psrldq m4, 4 +%else + psrlq m3, 16 + psrlq m4, 32 +%endif + paddw m2, m3 + paddw m2, m4 +%endmacro + +%macro CHECK1 0 + mova m3, m0 + pcmpgtw m3, m2 + pminsw m0, m2 + mova m6, m3 + pand m5, m3 + pandn m3, m1 + por m3, m5 + mova m1, m3 +%endmacro + +; %macro CHECK2 0 +; paddw m6, [pw_1] +; psllw m6, 14 +; paddsw m2, m6 +; mova m3, m0 +; pcmpgtw m3, m2 +; pminsw m0, m2 +; pand m5, m3 +; pandn m3, m1 +; por m3, m5 +; mova m1, m3 +; %endmacro + +; This version of CHECK2 is required for 14-bit samples. The left-shift trick +; in the old code is not large enough to correctly select pixels or scores. 
+ +%macro CHECK2 0 + mova m3, m0 + pcmpgtw m0, m2 + pand m0, m6 + mova m6, m0 + pand m5, m6 + pand m2, m0 + pandn m6, m1 + pandn m0, m3 + por m6, m5 + por m0, m2 + mova m1, m6 +%endmacro + +%macro LOAD 2 + movu %1, %2 +%endmacro + +%macro FILTER 3 +.loop%1: + pxor m7, m7 + LOAD m0, [curq+t1] + LOAD m1, [curq+t0] + LOAD m2, [%2] + LOAD m3, [%3] + mova m4, m3 + paddw m3, m2 + psraw m3, 1 + mova [rsp+ 0], m0 + mova [rsp+16], m3 + mova [rsp+32], m1 + psubw m2, m4 + PABS m2, m4 + LOAD m3, [prevq+t1] + LOAD m4, [prevq+t0] + psubw m3, m0 + psubw m4, m1 + PABS m3, m5 + PABS m4, m5 + paddw m3, m4 + psrlw m2, 1 + psrlw m3, 1 + pmaxsw m2, m3 + LOAD m3, [nextq+t1] + LOAD m4, [nextq+t0] + psubw m3, m0 + psubw m4, m1 + PABS m3, m5 + PABS m4, m5 + paddw m3, m4 + psrlw m3, 1 + pmaxsw m2, m3 + mova [rsp+48], m2 + + paddw m1, m0 + paddw m0, m0 + psubw m0, m1 + psrlw m1, 1 + PABS m0, m2 + + movu m2, [curq+t1-1*2] + movu m3, [curq+t0-1*2] + mova m4, m2 + psubusw m2, m3 + psubusw m3, m4 + PMAXUW m2, m3 +%if mmsize == 16 + mova m3, m2 + psrldq m3, 4 +%else + mova m3, m2 + psrlq m3, 32 +%endif + paddw m0, m2 + paddw m0, m3 + psubw m0, [pw_1] + + CHECK -2, 0 + CHECK1 + CHECK -3, 1 + CHECK2 + CHECK 0, -2 + CHECK1 + CHECK 1, -3 + CHECK2 + + mova m6, [rsp+48] + cmp DWORD r8m, 2 + jge .end%1 + LOAD m2, [%2+t1*2] + LOAD m4, [%3+t1*2] + LOAD m3, [%2+t0*2] + LOAD m5, [%3+t0*2] + paddw m2, m4 + paddw m3, m5 + psrlw m2, 1 + psrlw m3, 1 + mova m4, [rsp+ 0] + mova m5, [rsp+16] + mova m7, [rsp+32] + psubw m2, m4 + psubw m3, m7 + mova m0, m5 + psubw m5, m4 + psubw m0, m7 + mova m4, m2 + pminsw m2, m3 + pmaxsw m3, m4 + pmaxsw m2, m5 + pminsw m3, m5 + pmaxsw m2, m0 + pminsw m3, m0 + pxor m4, m4 + pmaxsw m6, m3 + psubw m4, m2 + pmaxsw m6, m4 + +.end%1: + mova m2, [rsp+16] + mova m3, m2 + psubw m2, m6 + paddw m3, m6 + pmaxsw m1, m2 + pminsw m1, m3 + + movu [dstq], m1 + add dstq, mmsize-4 + add prevq, mmsize-4 + add curq, mmsize-4 + add nextq, mmsize-4 + sub DWORD r4m, mmsize/2-2 + jg .loop%1 +%endmacro + 
+; YADIF -- emit yadif_filter_line_10bit for the instruction set selected by
+; the preceding INIT_* line.
+; The only differences between the two architectures are the number of
+; general-purpose registers requested from cglobal (6 vs 7) and how the prefs
+; and mrefs arguments are brought into the temporaries t0 and t1.
+%macro YADIF 0
+%if ARCH_X86_32
+cglobal yadif_filter_line_10bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
+                                              prefs, mrefs, parity, mode
+    mov            r4, r5mp             ; t0 = prefs
+    mov            r5, r6mp             ; t1 = mrefs
+    DECLARE_REG_TMP 4,5
+%else
+cglobal yadif_filter_line_10bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
+                                              prefs, mrefs, parity, mode
+    movsxd         r5, DWORD r5m        ; t0 = prefs, sign-extended to 64 bits
+    movsxd         r6, DWORD r6m        ; t1 = mrefs, sign-extended to 64 bits
+    DECLARE_REG_TMP 5,6
+%endif
+
+    ; parity picks which pair of lines FILTER averages.
+    cmp DWORD paritym, 0
+    je .parity_zero
+    FILTER 1, prevq, curq
+    jmp .done
+
+.parity_zero:
+    FILTER 0, curq, nextq
+
+.done:
+    RET
+%endmacro
+
+; One copy of the function per supported instruction set; the MMX variant is
+; only built for 32-bit x86.
+INIT_XMM ssse3
+YADIF
+INIT_XMM sse2
+YADIF
+%if ARCH_X86_32
+INIT_MMX mmxext
+YADIF
+%endif
diff --git a/ffmpeg/libavfilter/x86/yadif-16.asm b/ffmpeg/libavfilter/x86/yadif-16.asm
new file mode 100644
index 0000000..a2e6006
--- /dev/null
+++ b/ffmpeg/libavfilter/x86/yadif-16.asm
@@ -0,0 +1,347 @@
+;*****************************************************************************
+;* x86-optimized functions for yadif filter
+;*
+;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
+;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+; Per-lane constants: word/dword 1 and the 0x8000 bias used by PACK.
+pw_1:    times 8 dw 1
+pw_8000: times 8 dw 0x8000
+pd_1:    times 4 dd 1
+pd_8000: times 4 dd 0x8000
+
+SECTION .text
+
+; PIXSHIFT1 reg -- shift the register right by one 16-bit pixel (2 bytes).
+%macro PIXSHIFT1 1
+%if cpuflag(sse2)
+    psrldq %1, 2
+%else
+    psrlq  %1, 16
+%endif
+%endmacro
+
+; PIXSHIFT2 reg -- shift the register right by two 16-bit pixels (4 bytes).
+%macro PIXSHIFT2 1
+%if cpuflag(sse2)
+    psrldq %1, 4
+%else
+    psrlq  %1, 32
+%endif
+%endmacro
+
+; PABS reg, tmp -- per-dword absolute value of reg.
+; tmp is clobbered only on the pre-SSSE3 path.
+%macro PABS 2
+%if cpuflag(ssse3)
+    pabsd   %1, %1
+%else
+    pxor    %2, %2
+    pcmpgtd %2, %1                  ; all-ones in the negative lanes
+    pxor    %1, %2
+    psubd   %1, %2                  ; (x ^ mask) - mask
+%endif
+%endmacro
+
+; PACK reg -- narrow dwords to words with unsigned saturation.
+; Without SSE4 the values are biased by 0x8000 so that the signed pack can be
+; used, and the bias is added back afterwards.
+%macro PACK 1
+%if cpuflag(sse4)
+    packusdw %1, %1
+%else
+    psubd    %1, [pd_8000]
+    packssdw %1, %1
+    paddw    %1, [pw_8000]
+%endif
+%endmacro
+
+; PMINSD dst, src, tmp -- dst = per-dword signed minimum of dst and src.
+; tmp is clobbered only on the pre-SSE4 path.
+%macro PMINSD 3
+%if cpuflag(sse4)
+    pminsd  %1, %2
+%else
+    mova    %3, %2
+    pcmpgtd %3, %1                  ; mask = src > dst
+    pand    %1, %3                  ; keep dst where it is the smaller one
+    pandn   %3, %2                  ; take src everywhere else
+    por     %1, %3
+%endif
+%endmacro
+
+; PMAXSD dst, src, tmp -- dst = per-dword signed maximum of dst and src.
+; tmp is clobbered only on the pre-SSE4 path.
+%macro PMAXSD 3
+%if cpuflag(sse4)
+    pmaxsd  %1, %2
+%else
+    mova    %3, %1
+    pcmpgtd %3, %2                  ; mask = dst > src
+    pand    %1, %3                  ; keep dst where it is the larger one
+    pandn   %3, %2                  ; take src everywhere else
+    por     %1, %3
+%endif
+%endmacro
+
+; PMAXUW dst, src -- dst = per-word unsigned maximum of dst and src.
+%macro PMAXUW 2
+%if cpuflag(sse4)
+    pmaxuw  %1, %2
+%else
+    psubusw %1, %2                  ; saturating (dst - src) ...
+    paddusw %1, %2                  ; ... + src = max(dst, src)
+%endif
+%endmacro
+
+; CHECK j, k -- evaluate one diagonal candidate.
+;   %1 = pixel offset applied to the line at curq+t1
+;   %2 = pixel offset applied to the line at curq+t0
+;   in:  m7 = 0 (used to widen words to dwords)
+;   out: m5 = candidate prediction: average of the two lines, rounded down,
+;             taken one pixel to the right
+;        m2 = candidate score: sum of three horizontally adjacent absolute
+;             differences between the two lines
+;   clobbers: m3, m4
+%macro CHECK 2
+    movu      m2, [curq+t1+%1*2]
+    movu      m3, [curq+t0+%2*2]
+    mova      m4, m2
+    mova      m5, m2
+    pxor      m4, m3
+    pavgw     m5, m3                ; average, rounded up
+    pand      m4, [pw_1]
+    psubusw   m5, m4                ; remove the rounding where the sum is odd
+    PIXSHIFT1 m5
+    punpcklwd m5, m7
+    mova      m4, m2
+    psubusw   m2, m3
+    psubusw   m3, m4
+    PMAXUW    m2, m3                ; per-word absolute difference
+    mova      m3, m2
+    mova      m4, m2
+    PIXSHIFT1 m3
+    PIXSHIFT2 m4
+    punpcklwd m2, m7
+    punpcklwd m3, m7
+    punpcklwd m4, m7
+    paddd     m2, m3
+    paddd     m2, m4
+%endmacro
+
+; CHECK1 -- accept the candidate from CHECK where it beats the best score.
+;   in:  m0 = best score, m1 = best prediction, m2/m5 = candidate score/pred
+;   out: m0/m1 updated, m6 = mask of the lanes in which the candidate won
+;   clobbers: m3, m5
+%macro CHECK1 0
+    mova    m3, m0
+    pcmpgtd m3, m2                  ; mask = best score > candidate score
+    PMINSD  m0, m2, m6
+    mova    m6, m3
+    pand    m5, m3
+    pandn   m3, m1
+    por     m3, m5
+    mova    m1, m3
+%endmacro
+
+; CHECK2 -- like CHECK1, but the candidate can only win in lanes where the
+; preceding CHECK1 candidate won.
+;   in:  as CHECK1, plus m6 = mask left by CHECK1 (-1 = won, 0 = lost)
+;   out: m0/m1 updated
+;   clobbers: m2, m3, m4 (pre-SSE4 only), m5, m6
+%macro CHECK2 0
+    paddd   m6, [pd_1]              ; 0 where CHECK1 won, 1 where it lost
+    pslld   m6, 30                  ; turn that into a penalty of 0 or 1 << 30
+    paddd   m2, m6
+    mova    m3, m0
+    pcmpgtd m3, m2
+    PMINSD  m0, m2, m4
+    pand    m5, m3
+    pandn   m3, m1
+    por     m3, m5
+    mova    m1, m3
+%endmacro
+
+; This version of CHECK2 has 3 fewer instructions on sets older than SSE4 but I
+; am not sure whether it is any faster.  A rewrite or refactor of the filter
+; code should make it possible to eliminate the move instruction at the end.  It
+; exists to satisfy the expectation that the "score" values are in m1.
+
+; %macro CHECK2 0
+;     mova    m3, m0
+;     pcmpgtd m0, m2
+;     pand    m0, m6
+;     mova    m6, m0
+;     pand    m5, m6
+;     pand    m2, m0
+;     pandn   m6, m1
+;     pandn   m0, m3
+;     por     m6, m5
+;     por     m0, m2
+;     mova    m1, m6
+; %endmacro
+
+; LOAD dst, src -- load half a register of 16-bit pixels and widen them to
+; dwords.  Requires m7 = 0.
+%macro LOAD 2
+    movh      %1, %2
+    punpcklwd %1, m7
+%endmacro
+
+; FILTER suffix, a, b -- inner loop of yadif_filter_line_16bit.
+;   %1     = suffix that keeps the .loop/.end labels of each expansion unique
+;   %2, %3 = pointer registers of the two lines averaged into the reference
+;            value d (YADIF passes prevq/curq or curq/nextq)
+;   t0, t1 = offsets set up by YADIF via DECLARE_REG_TMP
+;            (t0 is loaded from the prefs argument, t1 from mrefs)
+; All arithmetic is done on dwords; mmsize/4 pixels are produced per iteration.
+; Stack scratch:
+;   [rsp+ 0] = c = cur[t1]      [rsp+16] = d = (a + b) >> 1
+;   [rsp+32] = e = cur[t0]      [rsp+48] = diff (largest temporal difference)
+%macro FILTER 3
+.loop%1:
+    pxor      m7, m7                ; zero for LOAD/CHECK; reused as scratch later
+    LOAD      m0, [curq+t1]         ; c
+    LOAD      m1, [curq+t0]         ; e
+    LOAD      m2, [%2]              ; a
+    LOAD      m3, [%3]              ; b
+    mova      m4, m3
+    paddd     m3, m2
+    psrad     m3, 1                 ; d = (a + b) >> 1
+    mova      [rsp+ 0], m0
+    mova      [rsp+16], m3
+    mova      [rsp+32], m1
+    psubd     m2, m4
+    PABS      m2, m4                ; |a - b|
+    LOAD      m3, [prevq+t1]
+    LOAD      m4, [prevq+t0]
+    psubd     m3, m0
+    psubd     m4, m1
+    PABS      m3, m5
+    PABS      m4, m5
+    paddd     m3, m4                ; |prev[t1] - c| + |prev[t0] - e|
+    psrld     m2, 1
+    psrld     m3, 1
+    PMAXSD    m2, m3, m6
+    LOAD      m3, [nextq+t1]
+    LOAD      m4, [nextq+t0]
+    psubd     m3, m0
+    psubd     m4, m1
+    PABS      m3, m5
+    PABS      m4, m5
+    paddd     m3, m4                ; |next[t1] - c| + |next[t0] - e|
+    psrld     m3, 1
+    PMAXSD    m2, m3, m6
+    mova      [rsp+48], m2          ; diff = max of the three halved differences
+
+    paddd     m1, m0                ; c + e
+    paddd     m0, m0                ; 2c
+    psubd     m0, m1                ; c - e
+    psrld     m1, 1                 ; m1 = initial prediction (c + e) >> 1
+    PABS      m0, m2                ; m0 = |c - e|
+
+    ; Add the absolute differences of the left and right neighbour columns.
+    movu      m2, [curq+t1-1*2]
+    movu      m3, [curq+t0-1*2]
+    mova      m4, m2
+    psubusw   m2, m3
+    psubusw   m3, m4
+    PMAXUW    m2, m3                ; |cur[t1-1] - cur[t0-1]| per word
+    mova      m3, m2
+    PIXSHIFT2 m3                    ; same differences, two pixels further right
+    punpcklwd m2, m7
+    punpcklwd m3, m7
+    paddd     m0, m2
+    paddd     m0, m3
+    psubd     m0, [pd_1]            ; m0 = initial score
+
+    ; Try the diagonal candidates; m0/m1 keep the best score/prediction.
+    CHECK -2, 0
+    CHECK1
+    CHECK -3, 1
+    CHECK2
+    CHECK 0, -2
+    CHECK1
+    CHECK 1, -3
+    CHECK2
+
+    mova      m6, [rsp+48]          ; diff
+    cmp       DWORD modem, 2        ; mode >= 2: skip the widening of diff below
+    jge .end%1
+    LOAD      m2, [%2+t1*2]
+    LOAD      m4, [%3+t1*2]
+    LOAD      m3, [%2+t0*2]
+    LOAD      m5, [%3+t0*2]
+    paddd     m2, m4
+    paddd     m3, m5
+    psrld     m2, 1                 ; b' = (a[2*t1] + b[2*t1]) >> 1
+    psrld     m3, 1                 ; f' = (a[2*t0] + b[2*t0]) >> 1
+    mova      m4, [rsp+ 0]          ; c
+    mova      m5, [rsp+16]          ; d
+    mova      m7, [rsp+32]          ; e
+    psubd     m2, m4                ; b' - c
+    psubd     m3, m7                ; f' - e
+    mova      m0, m5
+    psubd     m5, m4                ; d - c
+    psubd     m0, m7                ; d - e; m7 is free to be scratch from here on
+    mova      m4, m2
+    PMINSD    m2, m3, m7            ; min(b' - c, f' - e)
+    PMAXSD    m3, m4, m7            ; max(b' - c, f' - e)
+    PMAXSD    m2, m5, m7
+    PMINSD    m3, m5, m7
+    PMAXSD    m2, m0, m7            ; hi = max(d - e, d - c, min(b' - c, f' - e))
+    PMINSD    m3, m0, m7            ; lo = min(d - e, d - c, max(b' - c, f' - e))
+    pxor      m4, m4
+    PMAXSD    m6, m3, m7
+    psubd     m4, m2
+    PMAXSD    m6, m4, m7            ; diff = max(diff, lo, -hi)
+
+.end%1:
+    ; Clamp the prediction to [d - diff, d + diff], narrow it and store it.
+    mova      m2, [rsp+16]
+    mova      m3, m2
+    psubd     m2, m6
+    paddd     m3, m6
+    PMAXSD    m1, m2, m7
+    PMINSD    m1, m3, m7
+    PACK      m1
+
+    movh      [dstq], m1
+    add       dstq, mmsize/2
+    add       prevq, mmsize/2
+    add       curq, mmsize/2
+    add       nextq, mmsize/2
+    sub       DWORD wm, mmsize/4    ; remaining width, counted in pixels
+    jg .loop%1
+%endmacro
+
+; YADIF -- emit yadif_filter_line_16bit for the instruction set selected by
+; the preceding INIT_* line.  The two architectures differ only in the number
+; of general-purpose registers requested (6 vs 7) and in how prefs/mrefs are
+; brought into the temporaries t0/t1.
+%macro YADIF 0
+%if ARCH_X86_32
+cglobal yadif_filter_line_16bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
+                                              prefs, mrefs, parity, mode
+    mov            r4, r5mp             ; t0 = prefs
+    mov            r5, r6mp             ; t1 = mrefs
+    DECLARE_REG_TMP 4,5
+%else
+cglobal yadif_filter_line_16bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
+                                              prefs, mrefs, parity, mode
+    movsxd         r5, DWORD r5m        ; t0 = prefs, sign-extended to 64 bits
+    movsxd         r6, DWORD r6m        ; t1 = mrefs, sign-extended to 64 bits
+    DECLARE_REG_TMP 5,6
+%endif
+
+    ; parity picks which pair of lines FILTER averages.
+    cmp DWORD paritym, 0
+    je .parity0
+    FILTER 1, prevq, curq
+    jmp .ret
+
+.parity0:
+    FILTER 0, curq, nextq
+
+.ret:
+    RET
+%endmacro
+
+; One copy of the function per supported instruction set; the MMX variant is
+; only built for 32-bit x86.
+INIT_XMM sse4
+YADIF
+INIT_XMM ssse3
+YADIF
+INIT_XMM sse2
+YADIF
+%if ARCH_X86_32
+INIT_MMX mmxext
+YADIF
+%endif
|
