diff options
Diffstat (limited to 'ffmpeg/libavutil/x86')
| -rw-r--r-- | ffmpeg/libavutil/x86/Makefile | 8 | ||||
| -rw-r--r-- | ffmpeg/libavutil/x86/asm.h | 112 | ||||
| -rw-r--r-- | ffmpeg/libavutil/x86/bswap.h | 61 | ||||
| -rw-r--r-- | ffmpeg/libavutil/x86/cpu.c | 210 | ||||
| -rw-r--r-- | ffmpeg/libavutil/x86/cpu.h | 75 | ||||
| -rw-r--r-- | ffmpeg/libavutil/x86/cpuid.asm | 91 | ||||
| -rw-r--r-- | ffmpeg/libavutil/x86/emms.asm | 30 | ||||
| -rw-r--r-- | ffmpeg/libavutil/x86/emms.h | 47 | ||||
| -rw-r--r-- | ffmpeg/libavutil/x86/float_dsp.asm | 290 | ||||
| -rw-r--r-- | ffmpeg/libavutil/x86/float_dsp_init.c | 156 | ||||
| -rw-r--r-- | ffmpeg/libavutil/x86/intreadwrite.h | 97 | ||||
| -rw-r--r-- | ffmpeg/libavutil/x86/timer.h | 44 | ||||
| -rw-r--r-- | ffmpeg/libavutil/x86/w64xmmtest.h | 73 | ||||
| -rw-r--r-- | ffmpeg/libavutil/x86/x86inc.asm | 1480 | ||||
| -rw-r--r-- | ffmpeg/libavutil/x86/x86util.asm | 680 |
15 files changed, 0 insertions, 3454 deletions
diff --git a/ffmpeg/libavutil/x86/Makefile b/ffmpeg/libavutil/x86/Makefile deleted file mode 100644 index 1e19082..0000000 --- a/ffmpeg/libavutil/x86/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -OBJS += x86/cpu.o \ - x86/float_dsp_init.o \ - x86/lls_init.o \ - -YASM-OBJS += x86/cpuid.o \ - x86/emms.o \ - x86/float_dsp.o \ - x86/lls.o \ diff --git a/ffmpeg/libavutil/x86/asm.h b/ffmpeg/libavutil/x86/asm.h deleted file mode 100644 index 70ccac7..0000000 --- a/ffmpeg/libavutil/x86/asm.h +++ /dev/null @@ -1,112 +0,0 @@ -/* - * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVUTIL_X86_ASM_H -#define AVUTIL_X86_ASM_H - -#include <stdint.h> -#include "config.h" - -typedef struct xmm_reg { uint64_t a, b; } xmm_reg; - -#if ARCH_X86_64 -# define OPSIZE "q" -# define REG_a "rax" -# define REG_b "rbx" -# define REG_c "rcx" -# define REG_d "rdx" -# define REG_D "rdi" -# define REG_S "rsi" -# define PTR_SIZE "8" -typedef int64_t x86_reg; - -# define REG_SP "rsp" -# define REG_BP "rbp" -# define REGBP rbp -# define REGa rax -# define REGb rbx -# define REGc rcx -# define REGd rdx -# define REGSP rsp - -#elif ARCH_X86_32 - -# define OPSIZE "l" -# define REG_a "eax" -# define REG_b "ebx" -# define REG_c "ecx" -# define REG_d "edx" -# define REG_D "edi" -# define REG_S "esi" -# define PTR_SIZE "4" -typedef int32_t x86_reg; - -# define REG_SP "esp" -# define REG_BP "ebp" -# define REGBP ebp -# define REGa eax -# define REGb ebx -# define REGc ecx -# define REGd edx -# define REGSP esp -#else -typedef int x86_reg; -#endif - -#define HAVE_7REGS (ARCH_X86_64 || (HAVE_EBX_AVAILABLE && HAVE_EBP_AVAILABLE)) -#define HAVE_6REGS (ARCH_X86_64 || (HAVE_EBX_AVAILABLE || HAVE_EBP_AVAILABLE)) - -#if ARCH_X86_64 && defined(PIC) -# define BROKEN_RELOCATIONS 1 -#endif - -/* - * If gcc is not set to support sse (-msse) it will not accept xmm registers - * in the clobber list for inline asm. XMM_CLOBBERS takes a list of xmm - * registers to be marked as clobbered and evaluates to nothing if they are - * not supported, or to the list itself if they are supported. Since a clobber - * list may not be empty, XMM_CLOBBERS_ONLY should be used if the xmm - * registers are the only in the clobber list. - * For example a list with "eax" and "xmm0" as clobbers should become: - * : XMM_CLOBBERS("xmm0",) "eax" - * and a list with only "xmm0" should become: - * XMM_CLOBBERS_ONLY("xmm0") - */ -#if HAVE_XMM_CLOBBERS -# define XMM_CLOBBERS(...) __VA_ARGS__ -# define XMM_CLOBBERS_ONLY(...) : __VA_ARGS__ -#else -# define XMM_CLOBBERS(...) -# define XMM_CLOBBERS_ONLY(...) -#endif - -/* Use to export labels from asm. */ -#define LABEL_MANGLE(a) EXTERN_PREFIX #a - -// Use rip-relative addressing if compiling PIC code on x86-64. -#if ARCH_X86_64 && defined(PIC) -# define LOCAL_MANGLE(a) #a "(%%rip)" -#else -# define LOCAL_MANGLE(a) #a -#endif - -#define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a) - -#endif /* AVUTIL_X86_ASM_H */ diff --git a/ffmpeg/libavutil/x86/bswap.h b/ffmpeg/libavutil/x86/bswap.h deleted file mode 100644 index 08e2a62..0000000 --- a/ffmpeg/libavutil/x86/bswap.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * byte swapping routines - */ - -#ifndef AVUTIL_X86_BSWAP_H -#define AVUTIL_X86_BSWAP_H - -#include <stdint.h> -#include "config.h" -#include "libavutil/attributes.h" - -#if HAVE_INLINE_ASM - -#if !AV_GCC_VERSION_AT_LEAST(4,1) -#define av_bswap16 av_bswap16 -static av_always_inline av_const unsigned av_bswap16(unsigned x) -{ - __asm__("rorw $8, %w0" : "+r"(x)); - return x; -} -#endif /* !AV_GCC_VERSION_AT_LEAST(4,1) */ - -#if !AV_GCC_VERSION_AT_LEAST(4,5) -#define av_bswap32 av_bswap32 -static av_always_inline av_const uint32_t av_bswap32(uint32_t x) -{ - __asm__("bswap %0" : "+r" (x)); - return x; -} - -#if ARCH_X86_64 -#define av_bswap64 av_bswap64 -static inline uint64_t av_const av_bswap64(uint64_t x) -{ - __asm__("bswap %0": "=r" (x) : "0" (x)); - return x; -} -#endif -#endif /* !AV_GCC_VERSION_AT_LEAST(4,5) */ - -#endif /* HAVE_INLINE_ASM */ -#endif /* AVUTIL_X86_BSWAP_H */ diff --git a/ffmpeg/libavutil/x86/cpu.c b/ffmpeg/libavutil/x86/cpu.c deleted file mode 100644 index 18049ea..0000000 --- a/ffmpeg/libavutil/x86/cpu.c +++ /dev/null @@ -1,210 +0,0 @@ -/* - * CPU detection code, extracted from mmx.h - * (c)1997-99 by H. Dietz and R. Fisher - * Converted to C and improved by Fabrice Bellard. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdlib.h> -#include <string.h> - -#include "libavutil/x86/asm.h" -#include "libavutil/x86/cpu.h" -#include "libavutil/cpu.h" -#include "libavutil/cpu_internal.h" - -#if HAVE_YASM - -#define cpuid(index, eax, ebx, ecx, edx) \ - ff_cpu_cpuid(index, &eax, &ebx, &ecx, &edx) - -#define xgetbv(index, eax, edx) \ - ff_cpu_xgetbv(index, &eax, &edx) - -#elif HAVE_INLINE_ASM - -/* ebx saving is necessary for PIC. gcc seems unable to see it alone */ -#define cpuid(index, eax, ebx, ecx, edx) \ - __asm__ volatile ( \ - "mov %%"REG_b", %%"REG_S" \n\t" \ - "cpuid \n\t" \ - "xchg %%"REG_b", %%"REG_S \ - : "=a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx) \ - : "0" (index)) - -#define xgetbv(index, eax, edx) \ - __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (index)) - -#define get_eflags(x) \ - __asm__ volatile ("pushfl \n" \ - "pop %0 \n" \ - : "=r"(x)) - -#define set_eflags(x) \ - __asm__ volatile ("push %0 \n" \ - "popfl \n" \ - :: "r"(x)) - -#endif /* HAVE_INLINE_ASM */ - -#if ARCH_X86_64 - -#define cpuid_test() 1 - -#elif HAVE_YASM - -#define cpuid_test ff_cpu_cpuid_test - -#elif HAVE_INLINE_ASM - -static int cpuid_test(void) -{ - x86_reg a, c; - - /* Check if CPUID is supported by attempting to toggle the ID bit in - * the EFLAGS register. */ - get_eflags(a); - set_eflags(a ^ 0x200000); - get_eflags(c); - - return a != c; -} -#endif - -/* Function to test if multimedia instructions are supported... */ -int ff_get_cpu_flags_x86(void) -{ - int rval = 0; - -#ifdef cpuid - - int eax, ebx, ecx, edx; - int max_std_level, max_ext_level, std_caps = 0, ext_caps = 0; - int family = 0, model = 0; - union { int i[3]; char c[12]; } vendor; - - if (!cpuid_test()) - return 0; /* CPUID not supported */ - - cpuid(0, max_std_level, vendor.i[0], vendor.i[2], vendor.i[1]); - - if (max_std_level >= 1) { - cpuid(1, eax, ebx, ecx, std_caps); - family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); - model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0); - if (std_caps & (1 << 15)) - rval |= AV_CPU_FLAG_CMOV; - if (std_caps & (1 << 23)) - rval |= AV_CPU_FLAG_MMX; - if (std_caps & (1 << 25)) - rval |= AV_CPU_FLAG_MMXEXT; -#if HAVE_SSE - if (std_caps & (1 << 25)) - rval |= AV_CPU_FLAG_SSE; - if (std_caps & (1 << 26)) - rval |= AV_CPU_FLAG_SSE2; - if (ecx & 1) - rval |= AV_CPU_FLAG_SSE3; - if (ecx & 0x00000200 ) - rval |= AV_CPU_FLAG_SSSE3; - if (ecx & 0x00080000 ) - rval |= AV_CPU_FLAG_SSE4; - if (ecx & 0x00100000 ) - rval |= AV_CPU_FLAG_SSE42; -#if HAVE_AVX - /* Check OXSAVE and AVX bits */ - if ((ecx & 0x18000000) == 0x18000000) { - /* Check for OS support */ - xgetbv(0, eax, edx); - if ((eax & 0x6) == 0x6) - rval |= AV_CPU_FLAG_AVX; - } -#if HAVE_AVX2 - if (max_std_level >= 7) { - cpuid(7, eax, ebx, ecx, edx); - if (ebx&0x00000020) - rval |= AV_CPU_FLAG_AVX2; - /* TODO: BMI1/2 */ - } -#endif /* HAVE_AVX2 */ -#endif /* HAVE_AVX */ -#endif /* HAVE_SSE */ - } - - cpuid(0x80000000, max_ext_level, ebx, ecx, edx); - - if (max_ext_level >= 0x80000001) { - cpuid(0x80000001, eax, ebx, ecx, ext_caps); - if (ext_caps & (1U << 31)) - rval |= AV_CPU_FLAG_3DNOW; - if (ext_caps & (1 << 30)) - rval |= AV_CPU_FLAG_3DNOWEXT; - if (ext_caps & (1 << 23)) - rval |= AV_CPU_FLAG_MMX; - if (ext_caps & (1 << 22)) - rval |= AV_CPU_FLAG_MMXEXT; - - /* Allow for selectively disabling SSE2 functions on AMD processors - with SSE2 support but not SSE4a. This includes Athlon64, some - Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster - than SSE2 often enough to utilize this special-case flag. - AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case - so that SSE2 is used unless explicitly disabled by checking - AV_CPU_FLAG_SSE2SLOW. */ - if (!strncmp(vendor.c, "AuthenticAMD", 12) && - rval & AV_CPU_FLAG_SSE2 && !(ecx & 0x00000040)) { - rval |= AV_CPU_FLAG_SSE2SLOW; - } - - /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be - * used unless the OS has AVX support. */ - if (rval & AV_CPU_FLAG_AVX) { - if (ecx & 0x00000800) - rval |= AV_CPU_FLAG_XOP; - if (ecx & 0x00010000) - rval |= AV_CPU_FLAG_FMA4; - } - } - - if (!strncmp(vendor.c, "GenuineIntel", 12)) { - if (family == 6 && (model == 9 || model == 13 || model == 14)) { - /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and - * 6/14 (core1 "yonah") theoretically support sse2, but it's - * usually slower than mmx, so let's just pretend they don't. - * AV_CPU_FLAG_SSE2 is disabled and AV_CPU_FLAG_SSE2SLOW is - * enabled so that SSE2 is not used unless explicitly enabled - * by checking AV_CPU_FLAG_SSE2SLOW. The same situation - * applies for AV_CPU_FLAG_SSE3 and AV_CPU_FLAG_SSE3SLOW. */ - if (rval & AV_CPU_FLAG_SSE2) - rval ^= AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE2; - if (rval & AV_CPU_FLAG_SSE3) - rval ^= AV_CPU_FLAG_SSE3SLOW | AV_CPU_FLAG_SSE3; - } - /* The Atom processor has SSSE3 support, which is useful in many cases, - * but sometimes the SSSE3 version is slower than the SSE2 equivalent - * on the Atom, but is generally faster on other processors supporting - * SSSE3. This flag allows for selectively disabling certain SSSE3 - * functions on the Atom. */ - if (family == 6 && model == 28) - rval |= AV_CPU_FLAG_ATOM; - } - -#endif /* cpuid */ - - return rval; -} diff --git a/ffmpeg/libavutil/x86/cpu.h b/ffmpeg/libavutil/x86/cpu.h deleted file mode 100644 index 3724357..0000000 --- a/ffmpeg/libavutil/x86/cpu.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVUTIL_X86_CPU_H -#define AVUTIL_X86_CPU_H - -#include "config.h" -#include "libavutil/cpu.h" -#include "libavutil/cpu_internal.h" - -#define AV_CPU_FLAG_AMD3DNOW AV_CPU_FLAG_3DNOW -#define AV_CPU_FLAG_AMD3DNOWEXT AV_CPU_FLAG_3DNOWEXT - -#define X86_AMD3DNOW(flags) CPUEXT(flags, AMD3DNOW) -#define X86_AMD3DNOWEXT(flags) CPUEXT(flags, AMD3DNOWEXT) -#define X86_MMX(flags) CPUEXT(flags, MMX) -#define X86_MMXEXT(flags) CPUEXT(flags, MMXEXT) -#define X86_SSE(flags) CPUEXT(flags, SSE) -#define X86_SSE2(flags) CPUEXT(flags, SSE2) -#define X86_SSE3(flags) CPUEXT(flags, SSE3) -#define X86_SSSE3(flags) CPUEXT(flags, SSSE3) -#define X86_SSE4(flags) CPUEXT(flags, SSE4) -#define X86_SSE42(flags) CPUEXT(flags, SSE42) -#define X86_AVX(flags) CPUEXT(flags, AVX) -#define X86_FMA4(flags) CPUEXT(flags, FMA4) -#define X86_AVX2(flags) CPUEXT(flags, AVX2) - -#define EXTERNAL_AMD3DNOW(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOW) -#define EXTERNAL_AMD3DNOWEXT(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOWEXT) -#define EXTERNAL_MMX(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, MMX) -#define EXTERNAL_MMXEXT(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, MMXEXT) -#define EXTERNAL_SSE(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE) -#define EXTERNAL_SSE2(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE2) -#define EXTERNAL_SSE3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE3) -#define EXTERNAL_SSSE3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSSE3) -#define EXTERNAL_SSE4(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE4) -#define EXTERNAL_SSE42(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE42) -#define EXTERNAL_AVX(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX) -#define EXTERNAL_FMA4(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA4) -#define EXTERNAL_AVX2(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX2) - -#define INLINE_AMD3DNOW(flags) CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOW) -#define INLINE_AMD3DNOWEXT(flags) CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOWEXT) -#define INLINE_MMX(flags) CPUEXT_SUFFIX(flags, _INLINE, MMX) -#define INLINE_MMXEXT(flags) CPUEXT_SUFFIX(flags, _INLINE, MMXEXT) -#define INLINE_SSE(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE) -#define INLINE_SSE2(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE2) -#define INLINE_SSE3(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE3) -#define INLINE_SSSE3(flags) CPUEXT_SUFFIX(flags, _INLINE, SSSE3) -#define INLINE_SSE4(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE4) -#define INLINE_SSE42(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE42) -#define INLINE_AVX(flags) CPUEXT_SUFFIX(flags, _INLINE, AVX) -#define INLINE_FMA4(flags) CPUEXT_SUFFIX(flags, _INLINE, FMA4) -#define INLINE_AVX2(flags) CPUEXT_SUFFIX(flags, _INLINE, AVX2) - -void ff_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx); -void ff_cpu_xgetbv(int op, int *eax, int *edx); -int ff_cpu_cpuid_test(void); - -#endif /* AVUTIL_X86_CPU_H */ diff --git a/ffmpeg/libavutil/x86/cpuid.asm b/ffmpeg/libavutil/x86/cpuid.asm deleted file mode 100644 index 56876a8..0000000 --- a/ffmpeg/libavutil/x86/cpuid.asm +++ /dev/null @@ -1,91 +0,0 @@ -;***************************************************************************** -;* Copyright (C) 2005-2010 x264 project -;* -;* Authors: Loren Merritt <lorenm@u.washington.edu> -;* Jason Garrett-Glaser <darkshikari@gmail.com> -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "x86util.asm" - -SECTION .text - -;----------------------------------------------------------------------------- -; void ff_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx) -;----------------------------------------------------------------------------- -cglobal cpu_cpuid, 5,7 - push rbx - push r4 - push r3 - push r2 - push r1 - mov eax, r0d - xor ecx, ecx - cpuid - pop r4 - mov [r4], eax - pop r4 - mov [r4], ebx - pop r4 - mov [r4], ecx - pop r4 - mov [r4], edx - pop rbx - RET - -;----------------------------------------------------------------------------- -; void ff_cpu_xgetbv(int op, int *eax, int *edx) -;----------------------------------------------------------------------------- -cglobal cpu_xgetbv, 3,7 - push r2 - push r1 - mov ecx, r0d - xgetbv - pop r4 - mov [r4], eax - pop r4 - mov [r4], edx - RET - -%if ARCH_X86_64 == 0 -;----------------------------------------------------------------------------- -; int ff_cpu_cpuid_test(void) -; return 0 if unsupported -;----------------------------------------------------------------------------- -cglobal cpu_cpuid_test - pushfd - push ebx - push ebp - push esi - push edi - pushfd - pop eax - mov ebx, eax - xor eax, 0x200000 - push eax - popfd - pushfd - pop eax - xor eax, ebx - pop edi - pop esi - pop ebp - pop ebx - popfd - ret -%endif diff --git a/ffmpeg/libavutil/x86/emms.asm b/ffmpeg/libavutil/x86/emms.asm deleted file mode 100644 index 0aad34a..0000000 --- a/ffmpeg/libavutil/x86/emms.asm +++ /dev/null @@ -1,30 +0,0 @@ -;***************************************************************************** -;* Copyright (C) 2013 Martin Storsjo -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "x86util.asm" - -SECTION .text - -;----------------------------------------------------------------------------- -; void avpriv_emms_yasm(void) -;----------------------------------------------------------------------------- -cvisible emms_yasm, 0, 0 - emms - RET diff --git a/ffmpeg/libavutil/x86/emms.h b/ffmpeg/libavutil/x86/emms.h deleted file mode 100644 index a529b6b..0000000 --- a/ffmpeg/libavutil/x86/emms.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVUTIL_X86_EMMS_H -#define AVUTIL_X86_EMMS_H - -#include "config.h" -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" - -void avpriv_emms_yasm(void); - -#if HAVE_MMX_INLINE -# define emms_c emms_c -/** - * Empty mmx state. - * this must be called between any dsp function and float/double code. - * for example sin(); dsp->idct_put(); emms_c(); cos() - */ -static av_always_inline void emms_c(void) -{ - if(av_get_cpu_flags() & AV_CPU_FLAG_MMX) - __asm__ volatile ("emms" ::: "memory"); -} -#elif HAVE_MMX && HAVE_MM_EMPTY -# include <mmintrin.h> -# define emms_c _mm_empty -#elif HAVE_MMX_EXTERNAL -# define emms_c avpriv_emms_yasm -#endif /* HAVE_MMX_INLINE */ - -#endif /* AVUTIL_X86_EMMS_H */ diff --git a/ffmpeg/libavutil/x86/float_dsp.asm b/ffmpeg/libavutil/x86/float_dsp.asm deleted file mode 100644 index 49d4876..0000000 --- a/ffmpeg/libavutil/x86/float_dsp.asm +++ /dev/null @@ -1,290 +0,0 @@ -;***************************************************************************** -;* x86-optimized Float DSP functions -;* -;* Copyright 2006 Loren Merritt -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "x86util.asm" - -SECTION .text - -;----------------------------------------------------------------------------- -; void vector_fmul(float *dst, const float *src0, const float *src1, int len) -;----------------------------------------------------------------------------- -%macro VECTOR_FMUL 0 -cglobal vector_fmul, 4,4,2, dst, src0, src1, len - lea lenq, [lend*4 - 2*mmsize] -ALIGN 16 -.loop: - mova m0, [src0q + lenq] - mova m1, [src0q + lenq + mmsize] - mulps m0, m0, [src1q + lenq] - mulps m1, m1, [src1q + lenq + mmsize] - mova [dstq + lenq], m0 - mova [dstq + lenq + mmsize], m1 - - sub lenq, 2*mmsize - jge .loop - REP_RET -%endmacro - -INIT_XMM sse -VECTOR_FMUL -%if HAVE_AVX_EXTERNAL -INIT_YMM avx -VECTOR_FMUL -%endif - -;------------------------------------------------------------------------------ -; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len) -;------------------------------------------------------------------------------ - -%macro VECTOR_FMAC_SCALAR 0 -%if UNIX64 -cglobal vector_fmac_scalar, 3,3,3, dst, src, len -%else -cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len -%endif -%if ARCH_X86_32 - VBROADCASTSS m0, mulm -%else -%if WIN64 - mova xmm0, xmm2 -%endif - shufps xmm0, xmm0, 0 -%if cpuflag(avx) - vinsertf128 m0, m0, xmm0, 1 -%endif -%endif - lea lenq, [lend*4-2*mmsize] -.loop: - mulps m1, m0, [srcq+lenq ] - mulps m2, m0, [srcq+lenq+mmsize] - addps m1, m1, [dstq+lenq ] - addps m2, m2, [dstq+lenq+mmsize] - mova [dstq+lenq ], m1 - mova [dstq+lenq+mmsize], m2 - sub lenq, 2*mmsize - jge .loop - REP_RET -%endmacro - -INIT_XMM sse -VECTOR_FMAC_SCALAR -%if HAVE_AVX_EXTERNAL -INIT_YMM avx -VECTOR_FMAC_SCALAR -%endif - -;------------------------------------------------------------------------------ -; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len) -;------------------------------------------------------------------------------ - -%macro VECTOR_FMUL_SCALAR 0 -%if UNIX64 -cglobal vector_fmul_scalar, 3,3,2, dst, src, len -%else -cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len -%endif -%if ARCH_X86_32 - movss m0, mulm -%elif WIN64 - SWAP 0, 2 -%endif - shufps m0, m0, 0 - lea lenq, [lend*4-mmsize] -.loop: - mova m1, [srcq+lenq] - mulps m1, m0 - mova [dstq+lenq], m1 - sub lenq, mmsize - jge .loop - REP_RET -%endmacro - -INIT_XMM sse -VECTOR_FMUL_SCALAR - -;------------------------------------------------------------------------------ -; void ff_vector_dmul_scalar(double *dst, const double *src, double mul, -; int len) -;------------------------------------------------------------------------------ - -%macro VECTOR_DMUL_SCALAR 0 -%if ARCH_X86_32 -cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr - mov lenq, lenaddrm -%elif UNIX64 -cglobal vector_dmul_scalar, 3,3,3, dst, src, len -%else -cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len -%endif -%if ARCH_X86_32 - VBROADCASTSD m0, mulm -%else -%if WIN64 - movlhps xmm2, xmm2 -%if cpuflag(avx) - vinsertf128 ymm2, ymm2, xmm2, 1 -%endif - SWAP 0, 2 -%else - movlhps xmm0, xmm0 -%if cpuflag(avx) - vinsertf128 ymm0, ymm0, xmm0, 1 -%endif -%endif -%endif - lea lenq, [lend*8-2*mmsize] -.loop: - mulpd m1, m0, [srcq+lenq ] - mulpd m2, m0, [srcq+lenq+mmsize] - mova [dstq+lenq ], m1 - mova [dstq+lenq+mmsize], m2 - sub lenq, 2*mmsize - jge .loop - REP_RET -%endmacro - -INIT_XMM sse2 -VECTOR_DMUL_SCALAR -%if HAVE_AVX_EXTERNAL -INIT_YMM avx -VECTOR_DMUL_SCALAR -%endif - -;----------------------------------------------------------------------------- -; vector_fmul_add(float *dst, const float *src0, const float *src1, -; const float *src2, int len) -;----------------------------------------------------------------------------- -%macro VECTOR_FMUL_ADD 0 -cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len - lea lenq, [lend*4 - 2*mmsize] -ALIGN 16 -.loop: - mova m0, [src0q + lenq] - mova m1, [src0q + lenq + mmsize] - mulps m0, m0, [src1q + lenq] - mulps m1, m1, [src1q + lenq + mmsize] - addps m0, m0, [src2q + lenq] - addps m1, m1, [src2q + lenq + mmsize] - mova [dstq + lenq], m0 - mova [dstq + lenq + mmsize], m1 - - sub lenq, 2*mmsize - jge .loop - REP_RET -%endmacro - -INIT_XMM sse -VECTOR_FMUL_ADD -%if HAVE_AVX_EXTERNAL -INIT_YMM avx -VECTOR_FMUL_ADD -%endif - -;----------------------------------------------------------------------------- -; void vector_fmul_reverse(float *dst, const float *src0, const float *src1, -; int len) -;----------------------------------------------------------------------------- -%macro VECTOR_FMUL_REVERSE 0 -cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len - lea lenq, [lend*4 - 2*mmsize] -ALIGN 16 -.loop: -%if cpuflag(avx) - vmovaps xmm0, [src1q + 16] - vinsertf128 m0, m0, [src1q], 1 - vshufps m0, m0, m0, q0123 - vmovaps xmm1, [src1q + mmsize + 16] - vinsertf128 m1, m1, [src1q + mmsize], 1 - vshufps m1, m1, m1, q0123 -%else - mova m0, [src1q] - mova m1, [src1q + mmsize] - shufps m0, m0, q0123 - shufps m1, m1, q0123 -%endif - mulps m0, m0, [src0q + lenq + mmsize] - mulps m1, m1, [src0q + lenq] - mova [dstq + lenq + mmsize], m0 - mova [dstq + lenq], m1 - add src1q, 2*mmsize - sub lenq, 2*mmsize - jge .loop - REP_RET -%endmacro - -INIT_XMM sse -VECTOR_FMUL_REVERSE -%if HAVE_AVX_EXTERNAL -INIT_YMM avx -VECTOR_FMUL_REVERSE -%endif - -; float scalarproduct_float_sse(const float *v1, const float *v2, int len) -INIT_XMM sse -cglobal scalarproduct_float, 3,3,2, v1, v2, offset - neg offsetq - shl offsetq, 2 - sub v1q, offsetq - sub v2q, offsetq - xorps xmm0, xmm0 -.loop: - movaps xmm1, [v1q+offsetq] - mulps xmm1, [v2q+offsetq] - addps xmm0, xmm1 - add offsetq, 16 - js .loop - movhlps xmm1, xmm0 - addps xmm0, xmm1 - movss xmm1, xmm0 - shufps xmm0, xmm0, 1 - addss xmm0, xmm1 -%if ARCH_X86_64 == 0 - movss r0m, xmm0 - fld dword r0m -%endif - RET - -;----------------------------------------------------------------------------- -; void ff_butterflies_float(float *src0, float *src1, int len); -;----------------------------------------------------------------------------- -INIT_XMM sse -cglobal butterflies_float, 3,3,3, src0, src1, len -%if ARCH_X86_64 - movsxd lenq, lend -%endif - test lenq, lenq - jz .end - shl lenq, 2 - add src0q, lenq - add src1q, lenq - neg lenq -.loop: - mova m0, [src0q + lenq] - mova m1, [src1q + lenq] - subps m2, m0, m1 - addps m0, m0, m1 - mova [src1q + lenq], m2 - mova [src0q + lenq], m0 - add lenq, mmsize - jl .loop -.end: - REP_RET diff --git a/ffmpeg/libavutil/x86/float_dsp_init.c b/ffmpeg/libavutil/x86/float_dsp_init.c deleted file mode 100644 index 97f7b7c..0000000 --- a/ffmpeg/libavutil/x86/float_dsp_init.c +++ /dev/null @@ -1,156 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/float_dsp.h" -#include "cpu.h" -#include "asm.h" - -void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1, - int len); -void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1, - int len); - -void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul, - int len); -void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul, - int len); - -void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul, - int len); - -void ff_vector_dmul_scalar_sse2(double *dst, const double *src, - double mul, int len); -void ff_vector_dmul_scalar_avx(double *dst, const double *src, - double mul, int len); - -void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1, - const float *src2, int len); -void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1, - const float *src2, int len); - -void ff_vector_fmul_reverse_sse(float *dst, const float *src0, - const float *src1, int len); -void ff_vector_fmul_reverse_avx(float *dst, const float *src0, - const float *src1, int len); - -float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); - -void ff_butterflies_float_sse(float *src0, float *src1, int len); - -#if HAVE_6REGS && HAVE_INLINE_ASM -static void vector_fmul_window_3dnowext(float *dst, const float *src0, - const float *src1, const float *win, - int len) -{ - x86_reg i = -len * 4; - x86_reg j = len * 4 - 8; - __asm__ volatile ( - "1: \n" - "pswapd (%5, %1), %%mm1 \n" - "movq (%5, %0), %%mm0 \n" - "pswapd (%4, %1), %%mm5 \n" - "movq (%3, %0), %%mm4 \n" - "movq %%mm0, %%mm2 \n" - "movq %%mm1, %%mm3 \n" - "pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i] - "pfmul %%mm5, %%mm3 \n" // src1[j] * win[len + j] - "pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j] - "pfmul %%mm5, %%mm0 \n" // src1[j] * win[len + i] - "pfadd %%mm3, %%mm2 \n" - "pfsub %%mm0, %%mm1 \n" - "pswapd %%mm2, %%mm2 \n" - "movq %%mm1, (%2, %0) \n" - "movq %%mm2, (%2, %1) \n" - "sub $8, %1 \n" - "add $8, %0 \n" - "jl 1b \n" - "femms \n" - : "+r"(i), "+r"(j) - : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len) - ); -} - -static void vector_fmul_window_sse(float *dst, const float *src0, - const float *src1, const float *win, int len) -{ - x86_reg i = -len * 4; - x86_reg j = len * 4 - 16; - __asm__ volatile ( - "1: \n" - "movaps (%5, %1), %%xmm1 \n" - "movaps (%5, %0), %%xmm0 \n" - "movaps (%4, %1), %%xmm5 \n" - "movaps (%3, %0), %%xmm4 \n" - "shufps $0x1b, %%xmm1, %%xmm1 \n" - "shufps $0x1b, %%xmm5, %%xmm5 \n" - "movaps %%xmm0, %%xmm2 \n" - "movaps %%xmm1, %%xmm3 \n" - "mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i] - "mulps %%xmm5, %%xmm3 \n" // src1[j] * win[len + j] - "mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j] - "mulps %%xmm5, %%xmm0 \n" // src1[j] * win[len + i] - "addps %%xmm3, %%xmm2 \n" - "subps %%xmm0, %%xmm1 \n" - "shufps $0x1b, %%xmm2, %%xmm2 \n" - "movaps %%xmm1, (%2, %0) \n" - "movaps %%xmm2, (%2, %1) \n" - "sub $16, %1 \n" - "add $16, %0 \n" - "jl 1b \n" - : "+r"(i), "+r"(j) - : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len) - ); -} -#endif /* HAVE_6REGS && HAVE_INLINE_ASM */ - -av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) -{ - int cpu_flags = av_get_cpu_flags(); - -#if HAVE_6REGS && HAVE_INLINE_ASM - if (INLINE_AMD3DNOWEXT(cpu_flags)) { - fdsp->vector_fmul_window = vector_fmul_window_3dnowext; - } - if (INLINE_SSE(cpu_flags)) { - fdsp->vector_fmul_window = vector_fmul_window_sse; - } -#endif - if (EXTERNAL_SSE(cpu_flags)) { - fdsp->vector_fmul = ff_vector_fmul_sse; - fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse; - fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse; - fdsp->vector_fmul_add = ff_vector_fmul_add_sse; - fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse; - fdsp->scalarproduct_float = ff_scalarproduct_float_sse; - fdsp->butterflies_float = ff_butterflies_float_sse; - } - if (EXTERNAL_SSE2(cpu_flags)) { - fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2; - } - if (EXTERNAL_AVX(cpu_flags)) { - fdsp->vector_fmul = ff_vector_fmul_avx; - fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx; - fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx; - fdsp->vector_fmul_add = ff_vector_fmul_add_avx; - fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx; - } -} diff --git a/ffmpeg/libavutil/x86/intreadwrite.h b/ffmpeg/libavutil/x86/intreadwrite.h deleted file mode 100644 index 4061d19..0000000 --- a/ffmpeg/libavutil/x86/intreadwrite.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2010 Alexander Strange <astrange@ithinksw.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVUTIL_X86_INTREADWRITE_H -#define AVUTIL_X86_INTREADWRITE_H - -#include <stdint.h> -#include "config.h" -#include "libavutil/attributes.h" - -#if HAVE_MMX - -#if !HAVE_FAST_64BIT && defined(__MMX__) - -#define AV_COPY64 AV_COPY64 -static av_always_inline void AV_COPY64(void *d, const void *s) -{ - __asm__("movq %1, %%mm0 \n\t" - "movq %%mm0, %0 \n\t" - : "=m"(*(uint64_t*)d) - : "m" (*(const uint64_t*)s) - : "mm0"); -} - -#define AV_SWAP64 AV_SWAP64 -static av_always_inline void AV_SWAP64(void *a, void *b) -{ - __asm__("movq %1, %%mm0 \n\t" - "movq %0, %%mm1 \n\t" - "movq %%mm0, %0 \n\t" - "movq %%mm1, %1 \n\t" - : "+m"(*(uint64_t*)a), "+m"(*(uint64_t*)b) - ::"mm0", "mm1"); -} - -#define AV_ZERO64 AV_ZERO64 -static av_always_inline void AV_ZERO64(void *d) -{ - __asm__("pxor %%mm0, %%mm0 \n\t" - "movq %%mm0, %0 \n\t" - : "=m"(*(uint64_t*)d) - :: "mm0"); -} - -#endif /* !HAVE_FAST_64BIT && defined(__MMX__) */ - -#ifdef __SSE__ - -#define AV_COPY128 AV_COPY128 -static av_always_inline void AV_COPY128(void *d, const void *s) -{ - struct v {uint64_t v[2];}; - - __asm__("movaps %1, %%xmm0 \n\t" - "movaps %%xmm0, %0 \n\t" - : "=m"(*(struct v*)d) - : "m" (*(const struct v*)s) - : "xmm0"); -} - -#endif /* __SSE__ */ - -#ifdef __SSE2__ - -#define AV_ZERO128 AV_ZERO128 -static av_always_inline void AV_ZERO128(void *d) -{ - struct v {uint64_t v[2];}; - - __asm__("pxor %%xmm0, %%xmm0 \n\t" - "movdqa %%xmm0, %0 \n\t" - : "=m"(*(struct v*)d) - :: "xmm0"); -} - -#endif /* __SSE2__ */ - -#endif /* HAVE_MMX */ - -#endif /* AVUTIL_X86_INTREADWRITE_H */ diff --git a/ffmpeg/libavutil/x86/timer.h b/ffmpeg/libavutil/x86/timer.h deleted file mode 100644 index d812d46..0000000 --- a/ffmpeg/libavutil/x86/timer.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVUTIL_X86_TIMER_H -#define AVUTIL_X86_TIMER_H - -#include <stdint.h> - -#if HAVE_INLINE_ASM - -#define AV_READ_TIME read_time - -static inline uint64_t read_time(void) -{ - uint32_t a, d; - __asm__ volatile("rdtsc" : "=a" (a), "=d" (d)); - return ((uint64_t)d << 32) + a; -} - -#elif HAVE_RDTSC - -#include <intrin.h> -#define AV_READ_TIME __rdtsc - -#endif /* HAVE_INLINE_ASM */ - -#endif /* AVUTIL_X86_TIMER_H */ diff --git a/ffmpeg/libavutil/x86/w64xmmtest.h b/ffmpeg/libavutil/x86/w64xmmtest.h deleted file mode 100644 index 9df499f..0000000 --- a/ffmpeg/libavutil/x86/w64xmmtest.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * check XMM registers for clobbers on Win64 - * Copyright (c) 2008 Ramiro Polla <ramiro.polla@gmail.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <inttypes.h> -#include <stdint.h> -#include <stdlib.h> -#include <stdarg.h> -#include <string.h> - -#include "libavutil/bswap.h" - -#define storexmmregs(mem) \ - __asm__ volatile( \ - "movups %%xmm6 , 0x00(%0)\n\t" \ - "movups %%xmm7 , 0x10(%0)\n\t" \ - "movups %%xmm8 , 0x20(%0)\n\t" \ - "movups %%xmm9 , 0x30(%0)\n\t" \ - "movups %%xmm10, 0x40(%0)\n\t" \ - "movups %%xmm11, 0x50(%0)\n\t" \ - "movups %%xmm12, 0x60(%0)\n\t" \ - "movups %%xmm13, 0x70(%0)\n\t" \ - "movups %%xmm14, 0x80(%0)\n\t" \ - "movups %%xmm15, 0x90(%0)\n\t" \ - :: "r"(mem) : "memory") - -#define testxmmclobbers(func, ctx, ...) \ - uint64_t xmm[2][10][2]; \ - int ret; \ - storexmmregs(xmm[0]); \ - ret = __real_ ## func(ctx, __VA_ARGS__); \ - storexmmregs(xmm[1]); \ - if (memcmp(xmm[0], xmm[1], sizeof(xmm[0]))) { \ - int i; \ - av_log(ctx, AV_LOG_ERROR, \ - "XMM REGS CLOBBERED IN %s!\n", #func); \ - for (i = 0; i < 10; i ++) \ - if (xmm[0][i][0] != xmm[1][i][0] || \ - xmm[0][i][1] != xmm[1][i][1]) { \ - av_log(ctx, AV_LOG_ERROR, \ - "xmm%-2d = %016"PRIx64"%016"PRIx64"\n", \ - 6 + i, av_bswap64(xmm[0][i][0]), \ - av_bswap64(xmm[0][i][1])); \ - av_log(ctx, AV_LOG_ERROR, \ - " -> %016"PRIx64"%016"PRIx64"\n", \ - av_bswap64(xmm[1][i][0]), \ - av_bswap64(xmm[1][i][1])); \ - } \ - abort(); \ - } \ - return ret - -#define wrap(func) \ -int __real_ ## func; \ -int __wrap_ ## func; \ -int __wrap_ ## func diff --git a/ffmpeg/libavutil/x86/x86inc.asm b/ffmpeg/libavutil/x86/x86inc.asm deleted file mode 100644 index 420e293..0000000 --- a/ffmpeg/libavutil/x86/x86inc.asm +++ /dev/null @@ -1,1480 +0,0 @@ -;***************************************************************************** -;* x86inc.asm: x264asm abstraction layer -;***************************************************************************** -;* Copyright (C) 2005-2013 x264 project -;* -;* Authors: Loren Merritt <lorenm@u.washington.edu> -;* Anton Mitrofanov <BugMaster@narod.ru> -;* Jason Garrett-Glaser <darkshikari@gmail.com> -;* Henrik Gramner <henrik@gramner.com> -;* -;* Permission to use, copy, modify, and/or distribute this software for any -;* purpose with or without fee is hereby granted, provided that the above -;* copyright notice and this permission notice appear in all copies. -;* -;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -;***************************************************************************** - -; This is a header file for the x264ASM assembly language, which uses -; NASM/YASM syntax combined with a large number of macros to provide easy -; abstraction between different calling conventions (x86_32, win64, linux64). -; It also has various other useful features to simplify writing the kind of -; DSP functions that are most often used in x264. - -; Unlike the rest of x264, this file is available under an ISC license, as it -; has significant usefulness outside of x264 and we want it to be available -; to the largest audience possible. Of course, if you modify it for your own -; purposes to add a new feature, we strongly encourage contributing a patch -; as this feature might be useful for others as well. Send patches or ideas -; to x264-devel@videolan.org . - -%ifndef private_prefix - %define private_prefix x264 -%endif - -%ifndef public_prefix - %define public_prefix private_prefix -%endif - -%define WIN64 0 -%define UNIX64 0 -%if ARCH_X86_64 - %ifidn __OUTPUT_FORMAT__,win32 - %define WIN64 1 - %elifidn __OUTPUT_FORMAT__,win64 - %define WIN64 1 - %elifidn __OUTPUT_FORMAT__,x64 - %define WIN64 1 - %else - %define UNIX64 1 - %endif -%endif - -%ifdef PREFIX - %define mangle(x) _ %+ x -%else - %define mangle(x) x -%endif - -; aout does not support align= -; NOTE: This section is out of sync with x264, in order to -; keep supporting OS/2. -%macro SECTION_RODATA 0-1 16 - %ifidn __OUTPUT_FORMAT__,aout - section .text - %else - SECTION .rodata align=%1 - %endif -%endmacro - -%macro SECTION_TEXT 0-1 16 - %ifidn __OUTPUT_FORMAT__,aout - SECTION .text - %else - SECTION .text align=%1 - %endif -%endmacro - -%if WIN64 - %define PIC -%elif ARCH_X86_64 == 0 -; x86_32 doesn't require PIC. -; Some distros prefer shared objects to be PIC, but nothing breaks if -; the code contains a few textrels, so we'll skip that complexity. - %undef PIC -%endif -%ifdef PIC - default rel -%endif - -%macro CPUNOP 1 - %if HAVE_CPUNOP - CPU %1 - %endif -%endmacro - -; Always use long nops (reduces 0x90 spam in disassembly on x86_32) -CPUNOP amdnop - -; Macros to eliminate most code duplication between x86_32 and x86_64: -; Currently this works only for leaf functions which load all their arguments -; into registers at the start, and make no other use of the stack. Luckily that -; covers most of x264's asm. - -; PROLOGUE: -; %1 = number of arguments. loads them from stack if needed. -; %2 = number of registers used. pushes callee-saved regs if needed. -; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. -; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x, -; MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes), -; and an extra register will be allocated to hold the original stack -; pointer (to not invalidate r0m etc.). To prevent the use of an extra -; register as stack pointer, request a negative stack size. -; %4+/%5+ = list of names to define to registers -; PROLOGUE can also be invoked by adding the same options to cglobal - -; e.g. -; cglobal foo, 2,3,0, dst, src, tmp -; declares a function (foo), taking two args (dst and src) and one local variable (tmp) - -; TODO Some functions can use some args directly from the stack. If they're the -; last args then you can just not declare them, but if they're in the middle -; we need more flexible macro. - -; RET: -; Pops anything that was pushed by PROLOGUE, and returns. - -; REP_RET: -; Use this instead of RET if it's a branch target. - -; registers: -; rN and rNq are the native-size register holding function argument N -; rNd, rNw, rNb are dword, word, and byte size -; rNh is the high 8 bits of the word size -; rNm is the original location of arg N (a register or on the stack), dword -; rNmp is native size - -%macro DECLARE_REG 2-3 - %define r%1q %2 - %define r%1d %2d - %define r%1w %2w - %define r%1b %2b - %define r%1h %2h - %define %2q %2 - %if %0 == 2 - %define r%1m %2d - %define r%1mp %2 - %elif ARCH_X86_64 ; memory - %define r%1m [rstk + stack_offset + %3] - %define r%1mp qword r %+ %1 %+ m - %else - %define r%1m [rstk + stack_offset + %3] - %define r%1mp dword r %+ %1 %+ m - %endif - %define r%1 %2 -%endmacro - -%macro DECLARE_REG_SIZE 3 - %define r%1q r%1 - %define e%1q r%1 - %define r%1d e%1 - %define e%1d e%1 - %define r%1w %1 - %define e%1w %1 - %define r%1h %3 - %define e%1h %3 - %define r%1b %2 - %define e%1b %2 -%if ARCH_X86_64 == 0 - %define r%1 e%1 -%endif -%endmacro - -DECLARE_REG_SIZE ax, al, ah -DECLARE_REG_SIZE bx, bl, bh -DECLARE_REG_SIZE cx, cl, ch -DECLARE_REG_SIZE dx, dl, dh -DECLARE_REG_SIZE si, sil, null -DECLARE_REG_SIZE di, dil, null -DECLARE_REG_SIZE bp, bpl, null - -; t# defines for when per-arch register allocation is more complex than just function arguments - -%macro DECLARE_REG_TMP 1-* - %assign %%i 0 - %rep %0 - CAT_XDEFINE t, %%i, r%1 - %assign %%i %%i+1 - %rotate 1 - %endrep -%endmacro - -%macro DECLARE_REG_TMP_SIZE 0-* - %rep %0 - %define t%1q t%1 %+ q - %define t%1d t%1 %+ d - %define t%1w t%1 %+ w - %define t%1h t%1 %+ h - %define t%1b t%1 %+ b - %rotate 1 - %endrep -%endmacro - -DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 - -%if ARCH_X86_64 - %define gprsize 8 -%else - %define gprsize 4 -%endif - -%macro PUSH 1 - push %1 - %ifidn rstk, rsp - %assign stack_offset stack_offset+gprsize - %endif -%endmacro - -%macro POP 1 - pop %1 - %ifidn rstk, rsp - %assign stack_offset stack_offset-gprsize - %endif -%endmacro - -%macro PUSH_IF_USED 1-* - %rep %0 - %if %1 < regs_used - PUSH r%1 - %endif - %rotate 1 - %endrep -%endmacro - -%macro POP_IF_USED 1-* - %rep %0 - %if %1 < regs_used - pop r%1 - %endif - %rotate 1 - %endrep -%endmacro - -%macro LOAD_IF_USED 1-* - %rep %0 - %if %1 < num_args - mov r%1, r %+ %1 %+ mp - %endif - %rotate 1 - %endrep -%endmacro - -%macro SUB 2 - sub %1, %2 - %ifidn %1, rstk - %assign stack_offset stack_offset+(%2) - %endif -%endmacro - -%macro ADD 2 - add %1, %2 - %ifidn %1, rstk - %assign stack_offset stack_offset-(%2) - %endif -%endmacro - -%macro movifnidn 2 - %ifnidn %1, %2 - mov %1, %2 - %endif -%endmacro - -%macro movsxdifnidn 2 - %ifnidn %1, %2 - movsxd %1, %2 - %endif -%endmacro - -%macro ASSERT 1 - %if (%1) == 0 - %error assert failed - %endif -%endmacro - -%macro DEFINE_ARGS 0-* - %ifdef n_arg_names - %assign %%i 0 - %rep n_arg_names - CAT_UNDEF arg_name %+ %%i, q - CAT_UNDEF arg_name %+ %%i, d - CAT_UNDEF arg_name %+ %%i, w - CAT_UNDEF arg_name %+ %%i, h - CAT_UNDEF arg_name %+ %%i, b - CAT_UNDEF arg_name %+ %%i, m - CAT_UNDEF arg_name %+ %%i, mp - CAT_UNDEF arg_name, %%i - %assign %%i %%i+1 - %endrep - %endif - - %xdefine %%stack_offset stack_offset - %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine - %assign %%i 0 - %rep %0 - %xdefine %1q r %+ %%i %+ q - %xdefine %1d r %+ %%i %+ d - %xdefine %1w r %+ %%i %+ w - %xdefine %1h r %+ %%i %+ h - %xdefine %1b r %+ %%i %+ b - %xdefine %1m r %+ %%i %+ m - %xdefine %1mp r %+ %%i %+ mp - CAT_XDEFINE arg_name, %%i, %1 - %assign %%i %%i+1 - %rotate 1 - %endrep - %xdefine stack_offset %%stack_offset - %assign n_arg_names %0 -%endmacro - -%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) - %ifnum %1 - %if %1 != 0 - %assign %%stack_alignment ((mmsize + 15) & ~15) - %assign stack_size %1 - %if stack_size < 0 - %assign stack_size -stack_size - %endif - %assign stack_size_padded stack_size - %if WIN64 - %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space - %if mmsize != 8 - %assign xmm_regs_used %2 - %if xmm_regs_used > 8 - %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16 - %endif - %endif - %endif - %if mmsize <= 16 && HAVE_ALIGNED_STACK - %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1)) - SUB rsp, stack_size_padded - %else - %assign %%reg_num (regs_used - 1) - %xdefine rstk r %+ %%reg_num - ; align stack, and save original stack location directly above - ; it, i.e. in [rsp+stack_size_padded], so we can restore the - ; stack in a single instruction (i.e. mov rsp, rstk or mov - ; rsp, [rsp+stack_size_padded]) - mov rstk, rsp - %if %1 < 0 ; need to store rsp on stack - sub rsp, gprsize+stack_size_padded - and rsp, ~(%%stack_alignment-1) - %xdefine rstkm [rsp+stack_size_padded] - mov rstkm, rstk - %else ; can keep rsp in rstk during whole function - sub rsp, stack_size_padded - and rsp, ~(%%stack_alignment-1) - %xdefine rstkm rstk - %endif - %endif - WIN64_PUSH_XMM - %endif - %endif -%endmacro - -%macro SETUP_STACK_POINTER 1 - %ifnum %1 - %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32) - %if %1 > 0 - %assign regs_used (regs_used + 1) - %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2 - %warning "Stack pointer will overwrite register argument" - %endif - %endif - %endif -%endmacro - -%macro DEFINE_ARGS_INTERNAL 3+ - %ifnum %2 - DEFINE_ARGS %3 - %elif %1 == 4 - DEFINE_ARGS %2 - %elif %1 > 4 - DEFINE_ARGS %2, %3 - %endif -%endmacro - -%if WIN64 ; Windows x64 ;================================================= - -DECLARE_REG 0, rcx -DECLARE_REG 1, rdx -DECLARE_REG 2, R8 -DECLARE_REG 3, R9 -DECLARE_REG 4, R10, 40 -DECLARE_REG 5, R11, 48 -DECLARE_REG 6, rax, 56 -DECLARE_REG 7, rdi, 64 -DECLARE_REG 8, rsi, 72 -DECLARE_REG 9, rbx, 80 -DECLARE_REG 10, rbp, 88 -DECLARE_REG 11, R12, 96 -DECLARE_REG 12, R13, 104 -DECLARE_REG 13, R14, 112 -DECLARE_REG 14, R15, 120 - -%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... - %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 15 - PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 - ALLOC_STACK %4, %3 - %if mmsize != 8 && stack_size == 0 - WIN64_SPILL_XMM %3 - %endif - LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%macro WIN64_PUSH_XMM 0 - ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. - %if xmm_regs_used > 6 - movaps [rstk + stack_offset + 8], xmm6 - %endif - %if xmm_regs_used > 7 - movaps [rstk + stack_offset + 24], xmm7 - %endif - %if xmm_regs_used > 8 - %assign %%i 8 - %rep xmm_regs_used-8 - movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i - %assign %%i %%i+1 - %endrep - %endif -%endmacro - -%macro WIN64_SPILL_XMM 1 - %assign xmm_regs_used %1 - ASSERT xmm_regs_used <= 16 - %if xmm_regs_used > 8 - %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32 - SUB rsp, stack_size_padded - %endif - WIN64_PUSH_XMM -%endmacro - -%macro WIN64_RESTORE_XMM_INTERNAL 1 - %assign %%pad_size 0 - %if xmm_regs_used > 8 - %assign %%i xmm_regs_used - %rep xmm_regs_used-8 - %assign %%i %%i-1 - movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] - %endrep - %endif - %if stack_size_padded > 0 - %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0) - mov rsp, rstkm - %else - add %1, stack_size_padded - %assign %%pad_size stack_size_padded - %endif - %endif - %if xmm_regs_used > 7 - movaps xmm7, [%1 + stack_offset - %%pad_size + 24] - %endif - %if xmm_regs_used > 6 - movaps xmm6, [%1 + stack_offset - %%pad_size + 8] - %endif -%endmacro - -%macro WIN64_RESTORE_XMM 1 - WIN64_RESTORE_XMM_INTERNAL %1 - %assign stack_offset (stack_offset-stack_size_padded) - %assign xmm_regs_used 0 -%endmacro - -%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 - -%macro RET 0 - WIN64_RESTORE_XMM_INTERNAL rsp - POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 -%if mmsize == 32 - vzeroupper -%endif - AUTO_REP_RET -%endmacro - -%elif ARCH_X86_64 ; *nix x64 ;============================================= - -DECLARE_REG 0, rdi -DECLARE_REG 1, rsi -DECLARE_REG 2, rdx -DECLARE_REG 3, rcx -DECLARE_REG 4, R8 -DECLARE_REG 5, R9 -DECLARE_REG 6, rax, 8 -DECLARE_REG 7, R10, 16 -DECLARE_REG 8, R11, 24 -DECLARE_REG 9, rbx, 32 -DECLARE_REG 10, rbp, 40 -DECLARE_REG 11, R12, 48 -DECLARE_REG 12, R13, 56 -DECLARE_REG 13, R14, 64 -DECLARE_REG 14, R15, 72 - -%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... - %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 15 - PUSH_IF_USED 9, 10, 11, 12, 13, 14 - ALLOC_STACK %4 - LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 - -%macro RET 0 -%if stack_size_padded > 0 -%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 - mov rsp, rstkm -%else - add rsp, stack_size_padded -%endif -%endif - POP_IF_USED 14, 13, 12, 11, 10, 9 -%if mmsize == 32 - vzeroupper -%endif - AUTO_REP_RET -%endmacro - -%else ; X86_32 ;============================================================== - -DECLARE_REG 0, eax, 4 -DECLARE_REG 1, ecx, 8 -DECLARE_REG 2, edx, 12 -DECLARE_REG 3, ebx, 16 -DECLARE_REG 4, esi, 20 -DECLARE_REG 5, edi, 24 -DECLARE_REG 6, ebp, 28 -%define rsp esp - -%macro DECLARE_ARG 1-* - %rep %0 - %define r%1m [rstk + stack_offset + 4*%1 + 4] - %define r%1mp dword r%1m - %rotate 1 - %endrep -%endmacro - -DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 - -%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... - %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - %if num_args > 7 - %assign num_args 7 - %endif - %if regs_used > 7 - %assign regs_used 7 - %endif - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 7 - PUSH_IF_USED 3, 4, 5, 6 - ALLOC_STACK %4 - LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 - -%macro RET 0 -%if stack_size_padded > 0 -%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 - mov rsp, rstkm -%else - add rsp, stack_size_padded -%endif -%endif - POP_IF_USED 6, 5, 4, 3 -%if mmsize == 32 - vzeroupper -%endif - AUTO_REP_RET -%endmacro - -%endif ;====================================================================== - -%if WIN64 == 0 -%macro WIN64_SPILL_XMM 1 -%endmacro -%macro WIN64_RESTORE_XMM 1 -%endmacro -%macro WIN64_PUSH_XMM 0 -%endmacro -%endif - -; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either -; a branch or a branch target. So switch to a 2-byte form of ret in that case. -; We can automatically detect "follows a branch", but not a branch target. -; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) -%macro REP_RET 0 - %if has_epilogue - RET - %else - rep ret - %endif -%endmacro - -%define last_branch_adr $$ -%macro AUTO_REP_RET 0 - %ifndef cpuflags - times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr. - %elif notcpuflag(ssse3) - times ((last_branch_adr-$)>>31)+1 rep - %endif - ret -%endmacro - -%macro BRANCH_INSTR 0-* - %rep %0 - %macro %1 1-2 %1 - %2 %1 - %%branch_instr: - %xdefine last_branch_adr %%branch_instr - %endmacro - %rotate 1 - %endrep -%endmacro - -BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp - -%macro TAIL_CALL 2 ; callee, is_nonadjacent - %if has_epilogue - call %1 - RET - %elif %2 - jmp %1 - %endif -%endmacro - -;============================================================================= -; arch-independent part -;============================================================================= - -%assign function_align 16 - -; Begin a function. -; Applies any symbol mangling needed for C linkage, and sets up a define such that -; subsequent uses of the function name automatically refer to the mangled version. -; Appends cpuflags to the function name if cpuflags has been specified. -; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX -; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). -%macro cglobal 1-2+ "" ; name, [PROLOGUE args] - cglobal_internal 1, %1 %+ SUFFIX, %2 -%endmacro -%macro cvisible 1-2+ "" ; name, [PROLOGUE args] - cglobal_internal 0, %1 %+ SUFFIX, %2 -%endmacro -%macro cglobal_internal 2-3+ - %if %1 - %xdefine %%FUNCTION_PREFIX private_prefix - %xdefine %%VISIBILITY hidden - %else - %xdefine %%FUNCTION_PREFIX public_prefix - %xdefine %%VISIBILITY - %endif - %ifndef cglobaled_%2 - %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) - %xdefine %2.skip_prologue %2 %+ .skip_prologue - CAT_XDEFINE cglobaled_, %2, 1 - %endif - %xdefine current_function %2 - %ifidn __OUTPUT_FORMAT__,elf - global %2:function %%VISIBILITY - %else - global %2 - %endif - align function_align - %2: - RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer - %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required - %assign stack_offset 0 ; stack pointer offset relative to the return address - %assign stack_size 0 ; amount of stack space that can be freely used inside a function - %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding - %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 - %ifnidn %3, "" - PROLOGUE %3 - %endif -%endmacro - -%macro cextern 1 - %xdefine %1 mangle(private_prefix %+ _ %+ %1) - CAT_XDEFINE cglobaled_, %1, 1 - extern %1 -%endmacro - -; like cextern, but without the prefix -%macro cextern_naked 1 - %xdefine %1 mangle(%1) - CAT_XDEFINE cglobaled_, %1, 1 - extern %1 -%endmacro - -%macro const 1-2+ - %xdefine %1 mangle(private_prefix %+ _ %+ %1) - %ifidn __OUTPUT_FORMAT__,elf - global %1:data hidden - %else - global %1 - %endif - %1: %2 -%endmacro - -; This is needed for ELF, otherwise the GNU linker assumes the stack is -; executable by default. -%ifidn __OUTPUT_FORMAT__,elf -SECTION .note.GNU-stack noalloc noexec nowrite progbits -%endif - -; cpuflags - -%assign cpuflags_mmx (1<<0) -%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx -%assign cpuflags_3dnow (1<<2) | cpuflags_mmx -%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow -%assign cpuflags_sse (1<<4) | cpuflags_mmx2 -%assign cpuflags_sse2 (1<<5) | cpuflags_sse -%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 -%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 -%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 -%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 -%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 -%assign cpuflags_avx (1<<11)| cpuflags_sse42 -%assign cpuflags_xop (1<<12)| cpuflags_avx -%assign cpuflags_fma4 (1<<13)| cpuflags_avx -%assign cpuflags_avx2 (1<<14)| cpuflags_avx -%assign cpuflags_fma3 (1<<15)| cpuflags_avx - -%assign cpuflags_cache32 (1<<16) -%assign cpuflags_cache64 (1<<17) -%assign cpuflags_slowctz (1<<18) -%assign cpuflags_lzcnt (1<<19) -%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant -%assign cpuflags_atom (1<<21) -%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt -%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 - -%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) -%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) - -; Takes up to 2 cpuflags from the above list. -; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. -; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. -%macro INIT_CPUFLAGS 0-2 - CPUNOP amdnop - %if %0 >= 1 - %xdefine cpuname %1 - %assign cpuflags cpuflags_%1 - %if %0 >= 2 - %xdefine cpuname %1_%2 - %assign cpuflags cpuflags | cpuflags_%2 - %endif - %xdefine SUFFIX _ %+ cpuname - %if cpuflag(avx) - %assign avx_enabled 1 - %endif - %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) - %define mova movaps - %define movu movups - %define movnta movntps - %endif - %if cpuflag(aligned) - %define movu mova - %elifidn %1, sse3 - %define movu lddqu - %endif - %if notcpuflag(sse2) - CPUNOP basicnop - %endif - %else - %xdefine SUFFIX - %undef cpuname - %undef cpuflags - %endif -%endmacro - -; Merge mmx and sse* -; m# is a simd regsiter of the currently selected size -; xm# is the corresponding xmmreg (if selcted xmm or ymm size), or mmreg (if selected mmx) -; ym# is the corresponding ymmreg (if selcted xmm or ymm size), or mmreg (if selected mmx) -; (All 3 remain in sync through SWAP.) - -%macro CAT_XDEFINE 3 - %xdefine %1%2 %3 -%endmacro - -%macro CAT_UNDEF 2 - %undef %1%2 -%endmacro - -%macro INIT_MMX 0-1+ - %assign avx_enabled 0 - %define RESET_MM_PERMUTATION INIT_MMX %1 - %define mmsize 8 - %define num_mmregs 8 - %define mova movq - %define movu movq - %define movh movd - %define movnta movntq - %assign %%i 0 - %rep 8 - CAT_XDEFINE m, %%i, mm %+ %%i - CAT_XDEFINE nmm, %%i, %%i - %assign %%i %%i+1 - %endrep - %rep 8 - CAT_UNDEF m, %%i - CAT_UNDEF nmm, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -%macro INIT_XMM 0-1+ - %assign avx_enabled 0 - %define RESET_MM_PERMUTATION INIT_XMM %1 - %define mmsize 16 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif - %define mova movdqa - %define movu movdqu - %define movh movq - %define movnta movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, xmm %+ %%i - CAT_XDEFINE nxmm, %%i, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -; FIXME: INIT_AVX can be replaced by INIT_XMM avx -%macro INIT_AVX 0 - INIT_XMM - %assign avx_enabled 1 - %define PALIGNR PALIGNR_SSSE3 - %define RESET_MM_PERMUTATION INIT_AVX -%endmacro - -%macro INIT_YMM 0-1+ - %assign avx_enabled 1 - %define RESET_MM_PERMUTATION INIT_YMM %1 - %define mmsize 32 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif - %define mova movdqa - %define movu movdqu - %undef movh - %define movnta movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, ymm %+ %%i - CAT_XDEFINE nymm, %%i, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -INIT_XMM - -%macro DECLARE_MMCAST 1 - %define mmmm%1 mm%1 - %define mmxmm%1 mm%1 - %define mmymm%1 mm%1 - %define xmmmm%1 mm%1 - %define xmmxmm%1 xmm%1 - %define xmmymm%1 xmm%1 - %define ymmmm%1 mm%1 - %define ymmxmm%1 ymm%1 - %define ymmymm%1 ymm%1 - %define xm%1 xmm %+ m%1 - %define ym%1 ymm %+ m%1 -%endmacro - -%assign i 0 -%rep 16 - DECLARE_MMCAST i -%assign i i+1 -%endrep - -; I often want to use macros that permute their arguments. e.g. there's no -; efficient way to implement butterfly or transpose or dct without swapping some -; arguments. -; -; I would like to not have to manually keep track of the permutations: -; If I insert a permutation in the middle of a function, it should automatically -; change everything that follows. For more complex macros I may also have multiple -; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. -; -; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that -; permutes its arguments. It's equivalent to exchanging the contents of the -; registers, except that this way you exchange the register names instead, so it -; doesn't cost any cycles. - -%macro PERMUTE 2-* ; takes a list of pairs to swap -%rep %0/2 - %xdefine %%tmp%2 m%2 - %rotate 2 -%endrep -%rep %0/2 - %xdefine m%1 %%tmp%2 - CAT_XDEFINE n, m%1, %1 - %rotate 2 -%endrep -%endmacro - -%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) -%ifnum %1 ; SWAP 0, 1, ... - SWAP_INTERNAL_NUM %1, %2 -%else ; SWAP m0, m1, ... - SWAP_INTERNAL_NAME %1, %2 -%endif -%endmacro - -%macro SWAP_INTERNAL_NUM 2-* - %rep %0-1 - %xdefine %%tmp m%1 - %xdefine m%1 m%2 - %xdefine m%2 %%tmp - CAT_XDEFINE n, m%1, %1 - CAT_XDEFINE n, m%2, %2 - %rotate 1 - %endrep -%endmacro - -%macro SWAP_INTERNAL_NAME 2-* - %xdefine %%args n %+ %1 - %rep %0-1 - %xdefine %%args %%args, n %+ %2 - %rotate 1 - %endrep - SWAP_INTERNAL_NUM %%args -%endmacro - -; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later -; calls to that function will automatically load the permutation, so values can -; be returned in mmregs. -%macro SAVE_MM_PERMUTATION 0-1 - %if %0 - %xdefine %%f %1_m - %else - %xdefine %%f current_function %+ _m - %endif - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE %%f, %%i, m %+ %%i - %assign %%i %%i+1 - %endrep -%endmacro - -%macro LOAD_MM_PERMUTATION 1 ; name to load from - %ifdef %1_m0 - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, %1_m %+ %%i - CAT_XDEFINE n, m %+ %%i, %%i - %assign %%i %%i+1 - %endrep - %endif -%endmacro - -; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't -%macro call 1 - call_internal %1 %+ SUFFIX, %1 -%endmacro -%macro call_internal 2 - %xdefine %%i %2 - %ifndef cglobaled_%2 - %ifdef cglobaled_%1 - %xdefine %%i %1 - %endif - %endif - call %%i - LOAD_MM_PERMUTATION %%i -%endmacro - -; Substitutions that reduce instruction size but are functionally equivalent -%macro add 2 - %ifnum %2 - %if %2==128 - sub %1, -128 - %else - add %1, %2 - %endif - %else - add %1, %2 - %endif -%endmacro - -%macro sub 2 - %ifnum %2 - %if %2==128 - add %1, -128 - %else - sub %1, %2 - %endif - %else - sub %1, %2 - %endif -%endmacro - -;============================================================================= -; AVX abstraction layer -;============================================================================= - -%assign i 0 -%rep 16 - %if i < 8 - CAT_XDEFINE sizeofmm, i, 8 - %endif - CAT_XDEFINE sizeofxmm, i, 16 - CAT_XDEFINE sizeofymm, i, 32 -%assign i i+1 -%endrep -%undef i - -%macro CHECK_AVX_INSTR_EMU 3-* - %xdefine %%opcode %1 - %xdefine %%dst %2 - %rep %0-2 - %ifidn %%dst, %3 - %error non-avx emulation of ``%%opcode'' is not supported - %endif - %rotate 1 - %endrep -%endmacro - -;%1 == instruction -;%2 == 1 if float, 0 if int -;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise -;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not -;%5+: operands -%macro RUN_AVX_INSTR 5-8+ - %ifnum sizeof%6 - %assign %%sizeofreg sizeof%6 - %elifnum sizeof%5 - %assign %%sizeofreg sizeof%5 - %else - %assign %%sizeofreg mmsize - %endif - %assign %%emulate_avx 0 - %if avx_enabled && %%sizeofreg >= 16 - %xdefine %%instr v%1 - %else - %xdefine %%instr %1 - %if %0 >= 7+%3 - %assign %%emulate_avx 1 - %endif - %endif - - %if %%emulate_avx - %xdefine %%src1 %6 - %xdefine %%src2 %7 - %ifnidn %5, %6 - %if %0 >= 8 - CHECK_AVX_INSTR_EMU {%1 %5, %6, %7, %8}, %5, %7, %8 - %else - CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7 - %endif - %if %4 && %3 == 0 - %ifnid %7 - ; 3-operand AVX instructions with a memory arg can only have it in src2, - ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). - ; So, if the instruction is commutative with a memory arg, swap them. - %xdefine %%src1 %7 - %xdefine %%src2 %6 - %endif - %endif - %if %%sizeofreg == 8 - MOVQ %5, %%src1 - %elif %2 - MOVAPS %5, %%src1 - %else - MOVDQA %5, %%src1 - %endif - %endif - %if %0 >= 8 - %1 %5, %%src2, %8 - %else - %1 %5, %%src2 - %endif - %elif %0 >= 8 - %%instr %5, %6, %7, %8 - %elif %0 == 7 - %%instr %5, %6, %7 - %elif %0 == 6 - %%instr %5, %6 - %else - %%instr %5 - %endif -%endmacro - -;%1 == instruction -;%2 == 1 if float, 0 if int -;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise -;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not -%macro AVX_INSTR 1-4 0, 1, 0 - %macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4 - %ifidn %2, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1 - %elifidn %3, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2 - %elifidn %4, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3 - %elifidn %5, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4 - %else - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4, %5 - %endif - %endmacro -%endmacro - -; Instructions with both VEX and non-VEX encodings -; Non-destructive instructions are written without parameters -AVX_INSTR addpd, 1, 0, 1 -AVX_INSTR addps, 1, 0, 1 -AVX_INSTR addsd, 1, 0, 1 -AVX_INSTR addss, 1, 0, 1 -AVX_INSTR addsubpd, 1, 0, 0 -AVX_INSTR addsubps, 1, 0, 0 -AVX_INSTR aesdec, 0, 0, 0 -AVX_INSTR aesdeclast, 0, 0, 0 -AVX_INSTR aesenc, 0, 0, 0 -AVX_INSTR aesenclast, 0, 0, 0 -AVX_INSTR aesimc -AVX_INSTR aeskeygenassist -AVX_INSTR andnpd, 1, 0, 0 -AVX_INSTR andnps, 1, 0, 0 -AVX_INSTR andpd, 1, 0, 1 -AVX_INSTR andps, 1, 0, 1 -AVX_INSTR blendpd, 1, 0, 0 -AVX_INSTR blendps, 1, 0, 0 -AVX_INSTR blendvpd, 1, 0, 0 -AVX_INSTR blendvps, 1, 0, 0 -AVX_INSTR cmppd, 1, 1, 0 -AVX_INSTR cmpps, 1, 1, 0 -AVX_INSTR cmpsd, 1, 1, 0 -AVX_INSTR cmpss, 1, 1, 0 -AVX_INSTR comisd -AVX_INSTR comiss -AVX_INSTR cvtdq2pd -AVX_INSTR cvtdq2ps -AVX_INSTR cvtpd2dq -AVX_INSTR cvtpd2ps -AVX_INSTR cvtps2dq -AVX_INSTR cvtps2pd -AVX_INSTR cvtsd2si -AVX_INSTR cvtsd2ss -AVX_INSTR cvtsi2sd -AVX_INSTR cvtsi2ss -AVX_INSTR cvtss2sd -AVX_INSTR cvtss2si -AVX_INSTR cvttpd2dq -AVX_INSTR cvttps2dq -AVX_INSTR cvttsd2si -AVX_INSTR cvttss2si -AVX_INSTR divpd, 1, 0, 0 -AVX_INSTR divps, 1, 0, 0 -AVX_INSTR divsd, 1, 0, 0 -AVX_INSTR divss, 1, 0, 0 -AVX_INSTR dppd, 1, 1, 0 -AVX_INSTR dpps, 1, 1, 0 -AVX_INSTR extractps -AVX_INSTR haddpd, 1, 0, 0 -AVX_INSTR haddps, 1, 0, 0 -AVX_INSTR hsubpd, 1, 0, 0 -AVX_INSTR hsubps, 1, 0, 0 -AVX_INSTR insertps, 1, 1, 0 -AVX_INSTR lddqu -AVX_INSTR ldmxcsr -AVX_INSTR maskmovdqu -AVX_INSTR maxpd, 1, 0, 1 -AVX_INSTR maxps, 1, 0, 1 -AVX_INSTR maxsd, 1, 0, 1 -AVX_INSTR maxss, 1, 0, 1 -AVX_INSTR minpd, 1, 0, 1 -AVX_INSTR minps, 1, 0, 1 -AVX_INSTR minsd, 1, 0, 1 -AVX_INSTR minss, 1, 0, 1 -AVX_INSTR movapd -AVX_INSTR movaps -AVX_INSTR movd -AVX_INSTR movddup -AVX_INSTR movdqa -AVX_INSTR movdqu -AVX_INSTR movhlps, 1, 0, 0 -AVX_INSTR movhpd, 1, 0, 0 -AVX_INSTR movhps, 1, 0, 0 -AVX_INSTR movlhps, 1, 0, 0 -AVX_INSTR movlpd, 1, 0, 0 -AVX_INSTR movlps, 1, 0, 0 -AVX_INSTR movmskpd -AVX_INSTR movmskps -AVX_INSTR movntdq -AVX_INSTR movntdqa -AVX_INSTR movntpd -AVX_INSTR movntps -AVX_INSTR movq -AVX_INSTR movsd, 1, 0, 0 -AVX_INSTR movshdup -AVX_INSTR movsldup -AVX_INSTR movss, 1, 0, 0 -AVX_INSTR movupd -AVX_INSTR movups -AVX_INSTR mpsadbw, 0, 1, 0 -AVX_INSTR mulpd, 1, 0, 1 -AVX_INSTR mulps, 1, 0, 1 -AVX_INSTR mulsd, 1, 0, 1 -AVX_INSTR mulss, 1, 0, 1 -AVX_INSTR orpd, 1, 0, 1 -AVX_INSTR orps, 1, 0, 1 -AVX_INSTR pabsb -AVX_INSTR pabsd -AVX_INSTR pabsw -AVX_INSTR packsswb, 0, 0, 0 -AVX_INSTR packssdw, 0, 0, 0 -AVX_INSTR packuswb, 0, 0, 0 -AVX_INSTR packusdw, 0, 0, 0 -AVX_INSTR paddb, 0, 0, 1 -AVX_INSTR paddw, 0, 0, 1 -AVX_INSTR paddd, 0, 0, 1 -AVX_INSTR paddq, 0, 0, 1 -AVX_INSTR paddsb, 0, 0, 1 -AVX_INSTR paddsw, 0, 0, 1 -AVX_INSTR paddusb, 0, 0, 1 -AVX_INSTR paddusw, 0, 0, 1 -AVX_INSTR palignr, 0, 1, 0 -AVX_INSTR pand, 0, 0, 1 -AVX_INSTR pandn, 0, 0, 0 -AVX_INSTR pavgb, 0, 0, 1 -AVX_INSTR pavgw, 0, 0, 1 -AVX_INSTR pblendvb, 0, 0, 0 -AVX_INSTR pblendw, 0, 1, 0 -AVX_INSTR pclmulqdq, 0, 1, 0 -AVX_INSTR pcmpestri -AVX_INSTR pcmpestrm -AVX_INSTR pcmpistri -AVX_INSTR pcmpistrm -AVX_INSTR pcmpeqb, 0, 0, 1 -AVX_INSTR pcmpeqw, 0, 0, 1 -AVX_INSTR pcmpeqd, 0, 0, 1 -AVX_INSTR pcmpeqq, 0, 0, 1 -AVX_INSTR pcmpgtb, 0, 0, 0 -AVX_INSTR pcmpgtw, 0, 0, 0 -AVX_INSTR pcmpgtd, 0, 0, 0 -AVX_INSTR pcmpgtq, 0, 0, 0 -AVX_INSTR pextrb -AVX_INSTR pextrd -AVX_INSTR pextrq -AVX_INSTR pextrw -AVX_INSTR phaddw, 0, 0, 0 -AVX_INSTR phaddd, 0, 0, 0 -AVX_INSTR phaddsw, 0, 0, 0 -AVX_INSTR phminposuw -AVX_INSTR phsubw, 0, 0, 0 -AVX_INSTR phsubd, 0, 0, 0 -AVX_INSTR phsubsw, 0, 0, 0 -AVX_INSTR pinsrb, 0, 1, 0 -AVX_INSTR pinsrd, 0, 1, 0 -AVX_INSTR pinsrq, 0, 1, 0 -AVX_INSTR pinsrw, 0, 1, 0 -AVX_INSTR pmaddwd, 0, 0, 1 -AVX_INSTR pmaddubsw, 0, 0, 0 -AVX_INSTR pmaxsb, 0, 0, 1 -AVX_INSTR pmaxsw, 0, 0, 1 -AVX_INSTR pmaxsd, 0, 0, 1 -AVX_INSTR pmaxub, 0, 0, 1 -AVX_INSTR pmaxuw, 0, 0, 1 -AVX_INSTR pmaxud, 0, 0, 1 -AVX_INSTR pminsb, 0, 0, 1 -AVX_INSTR pminsw, 0, 0, 1 -AVX_INSTR pminsd, 0, 0, 1 -AVX_INSTR pminub, 0, 0, 1 -AVX_INSTR pminuw, 0, 0, 1 -AVX_INSTR pminud, 0, 0, 1 -AVX_INSTR pmovmskb -AVX_INSTR pmovsxbw -AVX_INSTR pmovsxbd -AVX_INSTR pmovsxbq -AVX_INSTR pmovsxwd -AVX_INSTR pmovsxwq -AVX_INSTR pmovsxdq -AVX_INSTR pmovzxbw -AVX_INSTR pmovzxbd -AVX_INSTR pmovzxbq -AVX_INSTR pmovzxwd -AVX_INSTR pmovzxwq -AVX_INSTR pmovzxdq -AVX_INSTR pmuldq, 0, 0, 1 -AVX_INSTR pmulhrsw, 0, 0, 1 -AVX_INSTR pmulhuw, 0, 0, 1 -AVX_INSTR pmulhw, 0, 0, 1 -AVX_INSTR pmullw, 0, 0, 1 -AVX_INSTR pmulld, 0, 0, 1 -AVX_INSTR pmuludq, 0, 0, 1 -AVX_INSTR por, 0, 0, 1 -AVX_INSTR psadbw, 0, 0, 1 -AVX_INSTR pshufb, 0, 0, 0 -AVX_INSTR pshufd -AVX_INSTR pshufhw -AVX_INSTR pshuflw -AVX_INSTR psignb, 0, 0, 0 -AVX_INSTR psignw, 0, 0, 0 -AVX_INSTR psignd, 0, 0, 0 -AVX_INSTR psllw, 0, 0, 0 -AVX_INSTR pslld, 0, 0, 0 -AVX_INSTR psllq, 0, 0, 0 -AVX_INSTR pslldq, 0, 0, 0 -AVX_INSTR psraw, 0, 0, 0 -AVX_INSTR psrad, 0, 0, 0 -AVX_INSTR psrlw, 0, 0, 0 -AVX_INSTR psrld, 0, 0, 0 -AVX_INSTR psrlq, 0, 0, 0 -AVX_INSTR psrldq, 0, 0, 0 -AVX_INSTR psubb, 0, 0, 0 -AVX_INSTR psubw, 0, 0, 0 -AVX_INSTR psubd, 0, 0, 0 -AVX_INSTR psubq, 0, 0, 0 -AVX_INSTR psubsb, 0, 0, 0 -AVX_INSTR psubsw, 0, 0, 0 -AVX_INSTR psubusb, 0, 0, 0 -AVX_INSTR psubusw, 0, 0, 0 -AVX_INSTR ptest -AVX_INSTR punpckhbw, 0, 0, 0 -AVX_INSTR punpckhwd, 0, 0, 0 -AVX_INSTR punpckhdq, 0, 0, 0 -AVX_INSTR punpckhqdq, 0, 0, 0 -AVX_INSTR punpcklbw, 0, 0, 0 -AVX_INSTR punpcklwd, 0, 0, 0 -AVX_INSTR punpckldq, 0, 0, 0 -AVX_INSTR punpcklqdq, 0, 0, 0 -AVX_INSTR pxor, 0, 0, 1 -AVX_INSTR rcpps, 1, 0, 0 -AVX_INSTR rcpss, 1, 0, 0 -AVX_INSTR roundpd -AVX_INSTR roundps -AVX_INSTR roundsd -AVX_INSTR roundss -AVX_INSTR rsqrtps, 1, 0, 0 -AVX_INSTR rsqrtss, 1, 0, 0 -AVX_INSTR shufpd, 1, 1, 0 -AVX_INSTR shufps, 1, 1, 0 -AVX_INSTR sqrtpd, 1, 0, 0 -AVX_INSTR sqrtps, 1, 0, 0 -AVX_INSTR sqrtsd, 1, 0, 0 -AVX_INSTR sqrtss, 1, 0, 0 -AVX_INSTR stmxcsr -AVX_INSTR subpd, 1, 0, 0 -AVX_INSTR subps, 1, 0, 0 -AVX_INSTR subsd, 1, 0, 0 -AVX_INSTR subss, 1, 0, 0 -AVX_INSTR ucomisd -AVX_INSTR ucomiss -AVX_INSTR unpckhpd, 1, 0, 0 -AVX_INSTR unpckhps, 1, 0, 0 -AVX_INSTR unpcklpd, 1, 0, 0 -AVX_INSTR unpcklps, 1, 0, 0 -AVX_INSTR xorpd, 1, 0, 1 -AVX_INSTR xorps, 1, 0, 1 - -; 3DNow instructions, for sharing code between AVX, SSE and 3DN -AVX_INSTR pfadd, 1, 0, 1 -AVX_INSTR pfsub, 1, 0, 0 -AVX_INSTR pfmul, 1, 0, 1 - -; base-4 constants for shuffles -%assign i 0 -%rep 256 - %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) - %if j < 10 - CAT_XDEFINE q000, j, i - %elif j < 100 - CAT_XDEFINE q00, j, i - %elif j < 1000 - CAT_XDEFINE q0, j, i - %else - CAT_XDEFINE q, j, i - %endif -%assign i i+1 -%endrep -%undef i -%undef j - -%macro FMA_INSTR 3 - %macro %1 4-7 %1, %2, %3 - %if cpuflag(xop) - v%5 %1, %2, %3, %4 - %else - %6 %1, %2, %3 - %7 %1, %4 - %endif - %endmacro -%endmacro - -FMA_INSTR pmacsdd, pmulld, paddd -FMA_INSTR pmacsww, pmullw, paddw -FMA_INSTR pmadcswd, pmaddwd, paddd - -; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf. -; This lets us use tzcnt without bumping the yasm version requirement yet. -%define tzcnt rep bsf - -; convert FMA4 to FMA3 if possible -%macro FMA4_INSTR 4 - %macro %1 4-8 %1, %2, %3, %4 - %if cpuflag(fma4) - v%5 %1, %2, %3, %4 - %elifidn %1, %2 - v%6 %1, %4, %3 ; %1 = %1 * %3 + %4 - %elifidn %1, %3 - v%7 %1, %2, %4 ; %1 = %2 * %1 + %4 - %elifidn %1, %4 - v%8 %1, %2, %3 ; %1 = %2 * %3 + %1 - %else - %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported - %endif - %endmacro -%endmacro - -FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd -FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps -FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd -FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss - -FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd -FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps -FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd -FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps - -FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd -FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps -FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd -FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss - -FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd -FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps -FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd -FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss - -FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd -FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps -FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd -FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss - -; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug -%if ARCH_X86_64 == 0 -%macro vpbroadcastq 2 -%if sizeof%1 == 16 - movddup %1, %2 -%else - vbroadcastsd %1, %2 -%endif -%endmacro -%endif diff --git a/ffmpeg/libavutil/x86/x86util.asm b/ffmpeg/libavutil/x86/x86util.asm deleted file mode 100644 index 59e5df2..0000000 --- a/ffmpeg/libavutil/x86/x86util.asm +++ /dev/null @@ -1,680 +0,0 @@ -;***************************************************************************** -;* x86util.asm -;***************************************************************************** -;* Copyright (C) 2008-2010 x264 project -;* -;* Authors: Loren Merritt <lorenm@u.washington.edu> -;* Holger Lubitz <holger@lubitz.org> -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%define private_prefix ff -%define public_prefix avpriv -%define cpuflags_mmxext cpuflags_mmx2 - -%include "libavutil/x86/x86inc.asm" - -%macro SBUTTERFLY 4 -%if avx_enabled == 0 - mova m%4, m%2 - punpckl%1 m%2, m%3 - punpckh%1 m%4, m%3 -%else - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 -%endif - SWAP %3, %4 -%endmacro - -%macro SBUTTERFLY2 4 - punpckl%1 m%4, m%2, m%3 - punpckh%1 m%2, m%2, m%3 - SWAP %2, %4, %3 -%endmacro - -%macro SBUTTERFLYPS 3 - unpcklps m%3, m%1, m%2 - unpckhps m%1, m%1, m%2 - SWAP %1, %3, %2 -%endmacro - -%macro TRANSPOSE4x4B 5 - SBUTTERFLY bw, %1, %2, %5 - SBUTTERFLY bw, %3, %4, %5 - SBUTTERFLY wd, %1, %3, %5 - SBUTTERFLY wd, %2, %4, %5 - SWAP %2, %3 -%endmacro - -%macro TRANSPOSE4x4W 5 - SBUTTERFLY wd, %1, %2, %5 - SBUTTERFLY wd, %3, %4, %5 - SBUTTERFLY dq, %1, %3, %5 - SBUTTERFLY dq, %2, %4, %5 - SWAP %2, %3 -%endmacro - -%macro TRANSPOSE2x4x4W 5 - SBUTTERFLY wd, %1, %2, %5 - SBUTTERFLY wd, %3, %4, %5 - SBUTTERFLY dq, %1, %3, %5 - SBUTTERFLY dq, %2, %4, %5 - SBUTTERFLY qdq, %1, %2, %5 - SBUTTERFLY qdq, %3, %4, %5 -%endmacro - -%macro TRANSPOSE4x4D 5 - SBUTTERFLY dq, %1, %2, %5 - SBUTTERFLY dq, %3, %4, %5 - SBUTTERFLY qdq, %1, %3, %5 - SBUTTERFLY qdq, %2, %4, %5 - SWAP %2, %3 -%endmacro - -; identical behavior to TRANSPOSE4x4D, but using SSE1 float ops -%macro TRANSPOSE4x4PS 5 - SBUTTERFLYPS %1, %2, %5 - SBUTTERFLYPS %3, %4, %5 - movlhps m%5, m%1, m%3 - movhlps m%3, m%1 - SWAP %5, %1 - movlhps m%5, m%2, m%4 - movhlps m%4, m%2 - SWAP %5, %2, %3 -%endmacro - -%macro TRANSPOSE8x8W 9-11 -%if ARCH_X86_64 - SBUTTERFLY wd, %1, %2, %9 - SBUTTERFLY wd, %3, %4, %9 - SBUTTERFLY wd, %5, %6, %9 - SBUTTERFLY wd, %7, %8, %9 - SBUTTERFLY dq, %1, %3, %9 - SBUTTERFLY dq, %2, %4, %9 - SBUTTERFLY dq, %5, %7, %9 - SBUTTERFLY dq, %6, %8, %9 - SBUTTERFLY qdq, %1, %5, %9 - SBUTTERFLY qdq, %2, %6, %9 - SBUTTERFLY qdq, %3, %7, %9 - SBUTTERFLY qdq, %4, %8, %9 - SWAP %2, %5 - SWAP %4, %7 -%else -; in: m0..m7, unless %11 in which case m6 is in %9 -; out: m0..m7, unless %11 in which case m4 is in %10 -; spills into %9 and %10 -%if %0<11 - movdqa %9, m%7 -%endif - SBUTTERFLY wd, %1, %2, %7 - movdqa %10, m%2 - movdqa m%7, %9 - SBUTTERFLY wd, %3, %4, %2 - SBUTTERFLY wd, %5, %6, %2 - SBUTTERFLY wd, %7, %8, %2 - SBUTTERFLY dq, %1, %3, %2 - movdqa %9, m%3 - movdqa m%2, %10 - SBUTTERFLY dq, %2, %4, %3 - SBUTTERFLY dq, %5, %7, %3 - SBUTTERFLY dq, %6, %8, %3 - SBUTTERFLY qdq, %1, %5, %3 - SBUTTERFLY qdq, %2, %6, %3 - movdqa %10, m%2 - movdqa m%3, %9 - SBUTTERFLY qdq, %3, %7, %2 - SBUTTERFLY qdq, %4, %8, %2 - SWAP %2, %5 - SWAP %4, %7 -%if %0<11 - movdqa m%5, %10 -%endif -%endif -%endmacro - -; PABSW macro assumes %1 != %2, while ABS1/2 macros work in-place -%macro PABSW 2 -%if cpuflag(ssse3) - pabsw %1, %2 -%elif cpuflag(mmxext) - pxor %1, %1 - psubw %1, %2 - pmaxsw %1, %2 -%else - pxor %1, %1 - pcmpgtw %1, %2 - pxor %2, %1 - psubw %2, %1 - SWAP %1, %2 -%endif -%endmacro - -%macro PSIGNW_MMX 2 - pxor %1, %2 - psubw %1, %2 -%endmacro - -%macro PSIGNW_SSSE3 2 - psignw %1, %2 -%endmacro - -%macro ABS1 2 -%if cpuflag(ssse3) - pabsw %1, %1 -%elif cpuflag(mmxext) ; a, tmp - pxor %2, %2 - psubw %2, %1 - pmaxsw %1, %2 -%else ; a, tmp - pxor %2, %2 - pcmpgtw %2, %1 - pxor %1, %2 - psubw %1, %2 -%endif -%endmacro - -%macro ABS2 4 -%if cpuflag(ssse3) - pabsw %1, %1 - pabsw %2, %2 -%elif cpuflag(mmxext) ; a, b, tmp0, tmp1 - pxor %3, %3 - pxor %4, %4 - psubw %3, %1 - psubw %4, %2 - pmaxsw %1, %3 - pmaxsw %2, %4 -%else ; a, b, tmp0, tmp1 - pxor %3, %3 - pxor %4, %4 - pcmpgtw %3, %1 - pcmpgtw %4, %2 - pxor %1, %3 - pxor %2, %4 - psubw %1, %3 - psubw %2, %4 -%endif -%endmacro - -%macro ABSB 2 ; source mmreg, temp mmreg (unused for ssse3) -%if cpuflag(ssse3) - pabsb %1, %1 -%else - pxor %2, %2 - psubb %2, %1 - pminub %1, %2 -%endif -%endmacro - -%macro ABSB2 4 ; src1, src2, tmp1, tmp2 (tmp1/2 unused for SSSE3) -%if cpuflag(ssse3) - pabsb %1, %1 - pabsb %2, %2 -%else - pxor %3, %3 - pxor %4, %4 - psubb %3, %1 - psubb %4, %2 - pminub %1, %3 - pminub %2, %4 -%endif -%endmacro - -%macro ABSD2_MMX 4 - pxor %3, %3 - pxor %4, %4 - pcmpgtd %3, %1 - pcmpgtd %4, %2 - pxor %1, %3 - pxor %2, %4 - psubd %1, %3 - psubd %2, %4 -%endmacro - -%macro ABS4 6 - ABS2 %1, %2, %5, %6 - ABS2 %3, %4, %5, %6 -%endmacro - -%macro SPLATB_LOAD 3 -%if cpuflag(ssse3) - movd %1, [%2-3] - pshufb %1, %3 -%else - movd %1, [%2-3] ;to avoid crossing a cacheline - punpcklbw %1, %1 - SPLATW %1, %1, 3 -%endif -%endmacro - -%macro SPLATB_REG 3 -%if cpuflag(ssse3) - movd %1, %2d - pshufb %1, %3 -%else - movd %1, %2d - punpcklbw %1, %1 - SPLATW %1, %1, 0 -%endif -%endmacro - -%macro PALIGNR 4-5 -%if cpuflag(ssse3) -%if %0==5 - palignr %1, %2, %3, %4 -%else - palignr %1, %2, %3 -%endif -%elif cpuflag(mmx) ; [dst,] src1, src2, imm, tmp - %define %%dst %1 -%if %0==5 -%ifnidn %1, %2 - mova %%dst, %2 -%endif - %rotate 1 -%endif -%ifnidn %4, %2 - mova %4, %2 -%endif -%if mmsize==8 - psllq %%dst, (8-%3)*8 - psrlq %4, %3*8 -%else - pslldq %%dst, 16-%3 - psrldq %4, %3 -%endif - por %%dst, %4 -%endif -%endmacro - -%macro PAVGB 2 -%if cpuflag(mmxext) - pavgb %1, %2 -%elif cpuflag(3dnow) - pavgusb %1, %2 -%endif -%endmacro - -%macro PSHUFLW 1+ - %if mmsize == 8 - pshufw %1 - %else - pshuflw %1 - %endif -%endmacro - -%macro PSWAPD 2 -%if cpuflag(mmxext) - pshufw %1, %2, q1032 -%elif cpuflag(3dnowext) - pswapd %1, %2 -%elif cpuflag(3dnow) - movq %1, %2 - psrlq %1, 32 - punpckldq %1, %2 -%endif -%endmacro - -%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from -%ifnum %5 - pand m%3, m%5, m%4 ; src .. y6 .. y4 - pand m%1, m%5, m%2 ; dst .. y6 .. y4 -%else - mova m%1, %5 - pand m%3, m%1, m%4 ; src .. y6 .. y4 - pand m%1, m%1, m%2 ; dst .. y6 .. y4 -%endif - psrlw m%2, 8 ; dst .. y7 .. y5 - psrlw m%4, 8 ; src .. y7 .. y5 -%endmacro - -%macro SUMSUB_BA 3-4 -%if %0==3 - padd%1 m%2, m%3 - padd%1 m%3, m%3 - psub%1 m%3, m%2 -%else -%if avx_enabled == 0 - mova m%4, m%2 - padd%1 m%2, m%3 - psub%1 m%3, m%4 -%else - padd%1 m%4, m%2, m%3 - psub%1 m%3, m%2 - SWAP %2, %4 -%endif -%endif -%endmacro - -%macro SUMSUB_BADC 5-6 -%if %0==6 - SUMSUB_BA %1, %2, %3, %6 - SUMSUB_BA %1, %4, %5, %6 -%else - padd%1 m%2, m%3 - padd%1 m%4, m%5 - padd%1 m%3, m%3 - padd%1 m%5, m%5 - psub%1 m%3, m%2 - psub%1 m%5, m%4 -%endif -%endmacro - -%macro SUMSUB2_AB 4 -%ifnum %3 - psub%1 m%4, m%2, m%3 - psub%1 m%4, m%3 - padd%1 m%2, m%2 - padd%1 m%2, m%3 -%else - mova m%4, m%2 - padd%1 m%2, m%2 - padd%1 m%2, %3 - psub%1 m%4, %3 - psub%1 m%4, %3 -%endif -%endmacro - -%macro SUMSUB2_BA 4 -%if avx_enabled == 0 - mova m%4, m%2 - padd%1 m%2, m%3 - padd%1 m%2, m%3 - psub%1 m%3, m%4 - psub%1 m%3, m%4 -%else - padd%1 m%4, m%2, m%3 - padd%1 m%4, m%3 - psub%1 m%3, m%2 - psub%1 m%3, m%2 - SWAP %2, %4 -%endif -%endmacro - -%macro SUMSUBD2_AB 5 -%ifnum %4 - psra%1 m%5, m%2, 1 ; %3: %3>>1 - psra%1 m%4, m%3, 1 ; %2: %2>>1 - padd%1 m%4, m%2 ; %3: %3>>1+%2 - psub%1 m%5, m%3 ; %2: %2>>1-%3 - SWAP %2, %5 - SWAP %3, %4 -%else - mova %5, m%2 - mova %4, m%3 - psra%1 m%3, 1 ; %3: %3>>1 - psra%1 m%2, 1 ; %2: %2>>1 - padd%1 m%3, %5 ; %3: %3>>1+%2 - psub%1 m%2, %4 ; %2: %2>>1-%3 -%endif -%endmacro - -%macro DCT4_1D 5 -%ifnum %5 - SUMSUB_BADC w, %4, %1, %3, %2, %5 - SUMSUB_BA w, %3, %4, %5 - SUMSUB2_AB w, %1, %2, %5 - SWAP %1, %3, %4, %5, %2 -%else - SUMSUB_BADC w, %4, %1, %3, %2 - SUMSUB_BA w, %3, %4 - mova [%5], m%2 - SUMSUB2_AB w, %1, [%5], %2 - SWAP %1, %3, %4, %2 -%endif -%endmacro - -%macro IDCT4_1D 6-7 -%ifnum %6 - SUMSUBD2_AB %1, %3, %5, %7, %6 - ; %3: %3>>1-%5 %5: %3+%5>>1 - SUMSUB_BA %1, %4, %2, %7 - ; %4: %2+%4 %2: %2-%4 - SUMSUB_BADC %1, %5, %4, %3, %2, %7 - ; %5: %2+%4 + (%3+%5>>1) - ; %4: %2+%4 - (%3+%5>>1) - ; %3: %2-%4 + (%3>>1-%5) - ; %2: %2-%4 - (%3>>1-%5) -%else -%ifidn %1, w - SUMSUBD2_AB %1, %3, %5, [%6], [%6+16] -%else - SUMSUBD2_AB %1, %3, %5, [%6], [%6+32] -%endif - SUMSUB_BA %1, %4, %2 - SUMSUB_BADC %1, %5, %4, %3, %2 -%endif - SWAP %2, %5, %4 - ; %2: %2+%4 + (%3+%5>>1) row0 - ; %3: %2-%4 + (%3>>1-%5) row1 - ; %4: %2-%4 - (%3>>1-%5) row2 - ; %5: %2+%4 - (%3+%5>>1) row3 -%endmacro - - -%macro LOAD_DIFF 5 -%ifidn %3, none - movh %1, %4 - movh %2, %5 - punpcklbw %1, %2 - punpcklbw %2, %2 - psubw %1, %2 -%else - movh %1, %4 - punpcklbw %1, %3 - movh %2, %5 - punpcklbw %2, %3 - psubw %1, %2 -%endif -%endmacro - -%macro STORE_DCT 6 - movq [%5+%6+ 0], m%1 - movq [%5+%6+ 8], m%2 - movq [%5+%6+16], m%3 - movq [%5+%6+24], m%4 - movhps [%5+%6+32], m%1 - movhps [%5+%6+40], m%2 - movhps [%5+%6+48], m%3 - movhps [%5+%6+56], m%4 -%endmacro - -%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment? - LOAD_DIFF m%1, m%5, m%7, [%8], [%9] - LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3] - LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3] - LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5] -%if %10 - lea %8, [%8+4*r1] - lea %9, [%9+4*r3] -%endif -%endmacro - -%macro DIFFx2 6-7 - movh %3, %5 - punpcklbw %3, %4 - psraw %1, 6 - paddsw %1, %3 - movh %3, %6 - punpcklbw %3, %4 - psraw %2, 6 - paddsw %2, %3 - packuswb %2, %1 -%endmacro - -%macro STORE_DIFF 4 - movh %2, %4 - punpcklbw %2, %3 - psraw %1, 6 - paddsw %1, %2 - packuswb %1, %1 - movh %4, %1 -%endmacro - -%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride - movh %3, [%7] - movh %4, [%7+%8] - psraw %1, %6 - psraw %2, %6 - punpcklbw %3, %5 - punpcklbw %4, %5 - paddw %3, %1 - paddw %4, %2 - packuswb %3, %5 - packuswb %4, %5 - movh [%7], %3 - movh [%7+%8], %4 -%endmacro - -%macro PMINUB 3 ; dst, src, ignored -%if cpuflag(mmxext) - pminub %1, %2 -%else ; dst, src, tmp - mova %3, %1 - psubusb %3, %2 - psubb %1, %3 -%endif -%endmacro - -%macro SPLATW 2-3 0 -%if mmsize == 16 - pshuflw %1, %2, (%3)*0x55 - punpcklqdq %1, %1 -%elif cpuflag(mmxext) - pshufw %1, %2, (%3)*0x55 -%else - %ifnidn %1, %2 - mova %1, %2 - %endif - %if %3 & 2 - punpckhwd %1, %1 - %else - punpcklwd %1, %1 - %endif - %if %3 & 1 - punpckhwd %1, %1 - %else - punpcklwd %1, %1 - %endif -%endif -%endmacro - -%macro SPLATD 1 -%if mmsize == 8 - punpckldq %1, %1 -%elif cpuflag(sse2) - pshufd %1, %1, 0 -%elif cpuflag(sse) - shufps %1, %1, 0 -%endif -%endmacro - -%macro CLIPW 3 ;(dst, min, max) - pmaxsw %1, %2 - pminsw %1, %3 -%endmacro - -%macro PMINSD_MMX 3 ; dst, src, tmp - mova %3, %2 - pcmpgtd %3, %1 - pxor %1, %2 - pand %1, %3 - pxor %1, %2 -%endmacro - -%macro PMAXSD_MMX 3 ; dst, src, tmp - mova %3, %1 - pcmpgtd %3, %2 - pand %1, %3 - pandn %3, %2 - por %1, %3 -%endmacro - -%macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp - PMINSD_MMX %1, %3, %4 - PMAXSD_MMX %1, %2, %4 -%endmacro - -%macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused - cvtdq2ps %1, %1 - minps %1, %3 - maxps %1, %2 - cvtps2dq %1, %1 -%endmacro - -%macro CLIPD_SSE41 3-4 ; src/dst, min, max, unused - pminsd %1, %3 - pmaxsd %1, %2 -%endmacro - -%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32 -%if cpuflag(avx) - vbroadcastss %1, %2 -%else ; sse - movss %1, %2 - shufps %1, %1, 0 -%endif -%endmacro - -%macro VBROADCASTSD 2 ; dst xmm/ymm, src m64 -%if cpuflag(avx) && mmsize == 32 - vbroadcastsd %1, %2 -%elif cpuflag(sse3) - movddup %1, %2 -%else ; sse2 - movsd %1, %2 - movlhps %1, %1 -%endif -%endmacro - -%macro SHUFFLE_MASK_W 8 - %rep 8 - %if %1>=0x80 - db %1, %1 - %else - db %1*2 - db %1*2+1 - %endif - %rotate 1 - %endrep -%endmacro - -%macro PMOVSXWD 2; dst, src -%if cpuflag(sse4) - pmovsxwd %1, %2 -%else - %ifnidn %1, %2 - mova %1, %2 - %endif - punpcklwd %1, %1 - psrad %1, 16 -%endif -%endmacro - -; Wrapper for non-FMA version of fmaddps -%macro FMULADD_PS 5 - %if cpuflag(fma3) || cpuflag(fma4) - fmaddps %1, %2, %3, %4 - %elifidn %1, %4 - mulps %5, %2, %3 - addps %1, %4, %5 - %else - mulps %1, %2, %3 - addps %1, %4 - %endif -%endmacro |
