diff options
Diffstat (limited to 'ffmpeg/libavcodec/x86/imdct36.asm')
| -rw-r--r-- | ffmpeg/libavcodec/x86/imdct36.asm | 724 |
1 files changed, 0 insertions, 724 deletions
diff --git a/ffmpeg/libavcodec/x86/imdct36.asm b/ffmpeg/libavcodec/x86/imdct36.asm
deleted file mode 100644
index d311fbe..0000000
--- a/ffmpeg/libavcodec/x86/imdct36.asm
+++ /dev/null
@@ -1,724 +0,0 @@
-;******************************************************************************
-;* 36 point SSE-optimized IMDCT transform
-;* Copyright (c) 2011 Vitor Sessak
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-align 16                                ; tables below are used directly as SSE memory operands
-ps_mask:  dd 0, ~0, ~0, ~0              ; andps mask: clears lane 0, keeps lanes 1-3
-ps_mask2: dd 0, ~0, 0, ~0               ; andps mask: keeps lanes 1 and 3
-ps_mask3: dd 0, 0, 0, ~0                ; andps mask: keeps lane 3 only
-ps_mask4: dd 0, ~0, 0, 0                ; andps mask: keeps lane 1 only
-; Multiplier vectors for the "Populate tmp[]" stage of imdct36_float (one value per lane pair).
-ps_val1: dd -0.5, -0.5, -0.8660254038, -0.8660254038
-ps_val2: dd 1.0, 1.0, 0.8660254038, 0.8660254038
-ps_val3: dd 0.1736481777, 0.1736481777, 0.3420201433, 0.3420201433
-ps_val4: dd -0.7660444431, -0.7660444431, 0.8660254038, 0.8660254038
-ps_val5: dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
-ps_val6: dd 0.5, 0.5, -0.6427876097, -0.6427876097
-ps_val7: dd 1.0, 1.0, -0.6427876097, -0.6427876097
-; xorps sign masks: 0x80000000 is the float sign bit, so the marked lanes are negated.
-ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000    ; negates lanes 2 and 3
-ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000    ; negates lanes 1 and 3
-; Scale rows for BUTTERF (non-SSE3 path); a row is selected by byte offset 0/16/32/48, +64 is used for m5.
-ps_cosh: dd 1.0, 0.50190991877167369479, 1.0, 5.73685662283492756461
-         dd 1.0, 0.51763809020504152469, 1.0, 1.93185165257813657349
-         dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
-         dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
-         dd 1.0, 0.70710678118654752439, 0.0, 0.0
-; Same rows with lanes 1 and 3 of the first four rows negated, for the addsubps path of BUTTERF.
-ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461
-              dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349
-              dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896
-              dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991
-              dd 1.0, 0.70710678118654752439, 0.0, 0.0
-; Constants for four_imdct36_float, each replicated into all four lanes; row k is read as [costabs + 16*k].
-costabs: times 4 dd 0.98480773          ; row 0
-         times 4 dd 0.93969262          ; row 1
-         times 4 dd 0.86602539          ; row 2
-         times 4 dd -0.76604444         ; row 3
-         times 4 dd -0.64278764         ; row 4
-         times 4 dd 0.50000000          ; row 5
-         times 4 dd -0.50000000         ; row 6
-         times 4 dd -0.34202015         ; row 7
-         times 4 dd -0.17364818         ; row 8
-         times 4 dd 0.50190992          ; row 9
-         times 4 dd 0.51763808          ; row 10
-         times 4 dd 0.55168896          ; row 11
-         times 4 dd 0.61038726          ; row 12
-         times 4 dd 0.70710677          ; row 13
-         times 4 dd 0.87172341          ; row 14
-         times 4 dd 1.18310082          ; row 15
-         times 4 dd 1.93185163          ; row 16
-         times 4 dd 5.73685646          ; row 17
-
-%define SBLIMIT 32
-SECTION_TEXT
-; PSHUFD dst, src, imm8: 4-lane shuffle of a single source; pshufd on SSE2+ without AVX, else shufps src,src.
-%macro PSHUFD 3
-%if cpuflag(sse2) && notcpuflag(avx)
-    pshufd %1, %2, %3
-%else
-    shufps %1, %2, %2, %3
-%endif
-%endmacro
-; BUILDINVHIGHLOW dst, x, y: the non-AVX path writes dst half by half (movlhps, then movhlps).
-; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
-; output %1={x3,x4,y1,y2}
-%macro BUILDINVHIGHLOW 3
-%if cpuflag(avx)
-    shufps %1, %2, %3, 0x4e
-%else
-    movlhps %1, %3
-    movhlps %1, %2
-%endif
-%endmacro
-; ROTLEFT dst, x, y: a single palignr on SSSE3+, otherwise BUILDINVHIGHLOW followed by a shufps.
-; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
-; output %1={x4,y1,y2,y3}
-%macro ROTLEFT 3
-%if cpuflag(ssse3)
-    palignr  %1, %3, %2, 12
-%else
-    BUILDINVHIGHLOW %1, %2, %3
-    shufps  %1, %1, %3, 0x99
-%endif
-%endmacro
-; INVERTHL dst, src: dst = src with its low and high 64-bit halves exchanged.
-%macro INVERTHL 2
-%if cpuflag(sse2)
-    PSHUFD  %1, %2, 0x4e
-%else
-    movhlps %1, %2
-    movlhps %1, %2
-%endif
-%endmacro
-; BUTTERF io, scratch, offset: in-place two-stage butterfly on %1; %3 = byte offset of the scale row; %2 is clobbered.
-%macro BUTTERF 3
-    INVERTHL %2, %1                     ; %2 = %1 with halves swapped
-    xorps    %1, [ps_p1p1m1m1]          ; negate the high half of %1
-    addps    %1, %2                     ; low half = low + high, high half = low - high
-%if cpuflag(sse3)
-    mulps    %1, %1, [ps_cosh_sse3 + %3]
-    PSHUFD   %2, %1, 0xb1               ; swap the two lanes inside each pair
-    addsubps %1, %1, %2                 ; subtract in even lanes, add in odd lanes
-%else
-    mulps    %1, [ps_cosh + %3]
-    PSHUFD   %2, %1, 0xb1               ; swap the two lanes inside each pair
-    xorps    %1, [ps_p1m1p1m1]          ; negate odd lanes, then add the swapped copy
-    addps    %1, %2
-%endif
-%endmacro
-; STORE src, scratch, base, stride: writes lane i of %1 to [%3 + i*%4], i = 0..3; %1 ends pair-swapped, %2 is clobbered.
-%macro STORE 4
-    movhlps %2, %1
-    movss   [%3       ], %1
-    movss   [%3 + 2*%4], %2
-    shufps  %1, %1, 0xb1
-    movss   [%3 +   %4], %1
-    movhlps %2, %1
-    movss   [%3 + 3*%4], %2
-%endmacro
-; LOAD dst, scratch, base, stride: dst lane i = first float at [%3 + i*%4]; 8 bytes are read at each address.
-%macro LOAD 4
-    movlps  %1, [%3       ]
-    movhps  %1, [%3 +   %4]
-    movlps  %2, [%3 + 2*%4]
-    movhps  %2, [%3 + 3*%4]
-    shufps  %1, %2, 0x88                ; keep lanes 0 and 2 of each register
-%endmacro
-; LOADA64 dst, addr: 16-byte load; without AVX it is done as two 8-byte halves (movlps + movhps).
-%macro LOADA64 2
-%if cpuflag(avx)
-   movu     %1, [%2]
-%else
-   movlps   %1, [%2]
-   movhps   %1, [%2 + 8]
-%endif
-%endmacro
-; DEFINE_IMDCT: emits imdct36_float for the instruction set selected by the preceding INIT_XMM.
-%macro DEFINE_IMDCT 0
-cglobal imdct36_float, 4,4,9, out, buf, in, win ; named args: outq, bufq, inq, winq
-
-    ; for(i=17;i>=1;i--) in[i] += in[i-1];
-    LOADA64 m0, inq                     ; 16 bytes at in+0
-    LOADA64 m1, inq + 16
-
-    ROTLEFT m5, m0, m1
-
-    PSHUFD  m6, m0, 0x93                ; m0 rotated up by one lane
-    andps   m6, m6, [ps_mask]           ; lane 0 cleared: nothing precedes the first element
-    addps   m0, m0, m6
-
-    LOADA64 m2, inq + 32
-
-    ROTLEFT m7, m1, m2
-
-    addps   m1, m1, m5
-    LOADA64 m3, inq + 48
-
-    ROTLEFT m5, m2, m3
-
-    xorps   m4, m4, m4
-    movlps  m4, [inq+64]                ; last 8 bytes of the input; high half of m4 stays zero
-    BUILDINVHIGHLOW m6, m3, m4
-    shufps  m6, m6, m4, 0xa9
-
-    addps   m4, m4, m6
-    addps   m2, m2, m7
-    addps   m3, m3, m5
-
-    ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
-    movlhps m5, m5, m0
-    andps   m5, m5, [ps_mask3]
-
-    BUILDINVHIGHLOW m7, m0, m1
-    andps   m7, m7, [ps_mask2]
-
-    addps   m0, m0, m5
-
-    BUILDINVHIGHLOW m6, m1, m2
-    andps   m6, m6, [ps_mask2]
-
-    addps  m1, m1, m7
-
-    BUILDINVHIGHLOW m7, m2, m3
-    andps   m7, m7, [ps_mask2]
-
-    addps   m2, m2, m6
-
-    movhlps m6, m6, m3
-    andps   m6, m6, [ps_mask4]
-
-    addps  m3, m3, m7
-    addps  m4, m4, m6
-
-    ; Populate tmp[]
-    movlhps m6, m1, m5 ; zero out high values
-    subps   m6, m6, m4
-
-    subps  m5, m0, m3
-
-%if ARCH_X86_64                         ; x86-64: park m0-m3 in m8 so it need not be recomputed below
-    SWAP   m5, m8
-%endif
-
-    mulps  m7, m2, [ps_val1]
-
-%if ARCH_X86_64
-    mulps  m5, m8, [ps_val2]
-%else
-    mulps  m5, m5, [ps_val2]
-%endif
-    addps  m7, m7, m5
-
-    mulps  m5, m6, [ps_val1]
-    subps  m7, m7, m5
-
-%if ARCH_X86_64
-    SWAP   m5, m8                       ; bring the saved m0-m3 back as m5
-%else
-    subps  m5, m0, m3                   ; 32-bit build has no m8: recompute m0-m3
-%endif
-
-    subps  m5, m5, m6
-    addps  m5, m5, m2
-
-    shufps m6, m4, m3, 0xe4
-    subps  m6, m6, m2
-    mulps  m6, m6, [ps_val3]
-
-    addps  m4, m4, m1
-    mulps  m4, m4, [ps_val4]
-
-    shufps m1, m1, m0, 0xe4
-    addps  m1, m1, m2
-    mulps  m1, m1, [ps_val5]
-
-    mulps  m3, m3, [ps_val6]
-    mulps  m0, m0, [ps_val7]
-    addps  m0, m0, m3
-
-    xorps  m2, m1, [ps_p1p1m1m1]
-    subps  m2, m2, m4
-    addps  m2, m2, m0
-
-    addps  m3, m4, m0
-    subps  m3, m3, m6
-    xorps  m3, m3, [ps_p1p1m1m1]
-
-    shufps m0, m0, m4, 0xe4
-    subps  m0, m0, m1
-    addps  m0, m0, m6
-
-    BUILDINVHIGHLOW m4, m2, m3
-    shufps  m3, m3, m2, 0x4e
-
-    ; we have tmp = {SwAPLH(m0), SwAPLH(m7), m3, m4, m5}
-
-    BUTTERF  m0, m1, 0
-    BUTTERF  m7, m2, 16
-    BUTTERF  m3, m6, 32
-    BUTTERF  m4, m1, 48
-    ; m5: same tail as the non-SSE3 BUTTERF path, but only lanes 0 and 1 are swapped (0xe1).
-    mulps   m5, m5, [ps_cosh + 64]
-    PSHUFD  m1, m5, 0xe1
-    xorps   m5, m5, [ps_p1m1p1m1]
-    addps   m5, m5, m1
-
-    ; permutes:
-    ; m0    0  1  2  3     =>     2  6 10 14   m1
-    ; m7    4  5  6  7     =>     3  7 11 15   m2
-    ; m3    8  9 10 11     =>    17 13  9  5   m3
-    ; m4   12 13 14 15     =>    16 12  8  4   m5
-    ; m5   16 17 xx xx     =>     0  1 xx xx   m0
-
-    unpckhps m1, m0, m7
-    unpckhps m6, m3, m4
-    movhlps  m2, m6, m1
-    movlhps  m1, m1, m6
-
-    unpcklps m5, m5, m4
-    unpcklps m3, m3, m7
-    movhlps  m4, m3, m5
-    movlhps  m5, m5, m3
-    SWAP      m4, m3
-    ; permutation done
-    ; Output stage: out = value * win + buf; out is written with a byte stride of 4*SBLIMIT, buf is read every 16 bytes.
-    PSHUFD  m6, m2, 0xb1
-    movss   m4, [bufq + 4*68]
-    movss   m7, [bufq + 4*64]
-    unpcklps  m7, m7, m4
-    mulps   m6, m6, [winq + 16*4]
-    addps   m6, m6, m7
-    movss   [outq + 64*SBLIMIT], m6
-    shufps  m6, m6, m6, 0xb1
-    movss   [outq + 68*SBLIMIT], m6
-
-    mulps   m6, m3, [winq + 4*4]
-    LOAD    m4, m7, bufq + 4*16, 16
-    addps   m6, m6, m4
-    STORE   m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT
-
-    shufps  m4, m0, m3, 0xb5
-    mulps   m4, m4, [winq + 8*4]
-    LOAD    m7, m6, bufq + 4*32, 16
-    addps   m4, m4, m7
-    STORE   m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT
-
-    shufps  m3, m3, m2, 0xb1
-    mulps   m3, m3, [winq + 12*4]
-    LOAD    m7, m6, bufq + 4*48, 16
-    addps   m3, m3, m7
-    STORE   m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT
-
-    mulps   m2, m2, [winq]
-    LOAD    m6, m7, bufq, 16
-    addps   m2, m2, m6
-    STORE   m2, m7, outq, 4*SBLIMIT
-    ; Write-back stage: the remaining values are multiplied by win[20..] and stored into buf every 16 bytes.
-    mulps    m4, m1, [winq + 20*4]
-    STORE    m4, m7, bufq, 16
-
-    mulps    m3, m5, [winq + 24*4]
-    STORE    m3, m7, bufq + 4*16, 16
-
-    shufps   m0, m0, m5, 0xb0
-    mulps    m0, m0, [winq + 28*4]
-    STORE    m0, m7, bufq + 4*32, 16
-
-    shufps   m5, m5, m1, 0xb1
-    mulps    m5, m5, [winq + 32*4]
-    STORE    m5, m7, bufq + 4*48, 16
-
-    shufps   m1, m1, m1, 0xb1
-    mulps    m1, m1, [winq + 36*4]
-    movss    [bufq + 4*64], m1
-    shufps   m1, m1, 0xb1
-    movss    [bufq + 4*68], m1
-    RET
-%endmacro
-; Instantiate imdct36_float once per instruction set; the cpuflag() tests inside the macros pick the code path.
-INIT_XMM sse
-DEFINE_IMDCT
-
-INIT_XMM sse2
-DEFINE_IMDCT
-
-INIT_XMM sse3
-DEFINE_IMDCT
-
-INIT_XMM ssse3
-DEFINE_IMDCT
-
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-DEFINE_IMDCT
-%endif
-
-INIT_XMM sse
-; SPILL n, slot / UNSPILL n, slot: on x86-64 both are SWAPs with m8..m15; otherwise movaps to/from memory based at tmpq.
-%if ARCH_X86_64
-%define SPILL SWAP
-%define UNSPILL SWAP
-%define SPILLED(x) m %+ x
-%else                                   ; 32-bit: slot x (8..15) is at tmpq + 32*4 + (x-8)*16, above the [tmpq+4*0..4*28] scratch rows
-%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
-%macro SPILL 2 ; xmm#, mempos
-    movaps SPILLED(%2), m%1
-%endmacro
-%macro UNSPILL 2 ; xmm#, mempos
-    movaps m%1, SPILLED(%2)
-%endmacro
-%endif
-; DEFINE_FOUR_IMDCT: emits four_imdct36_float, which reads four input blocks 72 bytes apart and handles one block per lane.
-%macro DEFINE_FOUR_IMDCT 0
-cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp ; named args: outq, bufq, inq, winq, tmpq
-    movlps  m0, [inq+64]                ; last 8 bytes (offset 64..71) of each of the four blocks
-    movhps  m0, [inq+64 +   72]
-    movlps  m3, [inq+64 + 2*72]
-    movhps  m3, [inq+64 + 3*72]
-
-    shufps  m5, m0, m3, 0xdd            ; m5 = second float of each 8-byte pair, one per block
-    shufps  m0, m0, m3, 0x88            ; m0 = first float of each 8-byte pair, one per block
-
-    mova    m1, [inq+48]                ; mova for blocks 0 and 2 (offsets 0 and 144)
-    movu    m6, [inq+48 +   72]         ; movu for blocks 1 and 3: 72 and 216 are not multiples of 16
-    mova    m7, [inq+48 + 2*72]
-    movu    m3, [inq+48 + 3*72]
-
-    TRANSPOSE4x4PS 1, 6, 7, 3, 4        ; macro not defined in this file (presumably from x86util.asm)
-
-    addps   m4, m6, m7
-    mova    [tmpq+4*28], m4
-
-    addps    m7, m3
-    addps    m6, m1
-    addps    m3, m0
-    addps    m0, m5
-    addps    m0, m7
-    addps    m7, m6
-    mova    [tmpq+4*12], m7
-    SPILL   3, 12
-
-    mova    m4, [inq+32]
-    movu    m5, [inq+32 +   72]
-    mova    m2, [inq+32 + 2*72]
-    movu    m7, [inq+32 + 3*72]
-
-    TRANSPOSE4x4PS 4, 5, 2, 7, 3
-
-    addps   m1, m7
-    SPILL   1, 11
-
-    addps   m3, m5, m2
-    SPILL   3, 13
-
-    addps    m7, m2
-    addps    m5, m4
-    addps    m6, m7
-    mova    [tmpq], m6
-    addps   m7, m5
-    mova    [tmpq+4*16], m7
-
-    mova    m2, [inq+16]
-    movu    m7, [inq+16 +   72]
-    mova    m1, [inq+16 + 2*72]
-    movu    m6, [inq+16 + 3*72]
-
-    TRANSPOSE4x4PS 2, 7, 1, 6, 3
-
-    addps   m4, m6
-    addps   m6, m1
-    addps   m1, m7
-    addps   m7, m2
-    addps   m5, m6
-    SPILL   5, 15
-    addps   m6, m7
-    mulps   m6, [costabs + 16*2]
-    mova    [tmpq+4*8], m6
-    SPILL   1, 10
-    SPILL   0, 14
-
-    mova    m1, [inq]
-    movu    m6, [inq +   72]
-    mova    m3, [inq + 2*72]
-    movu    m5, [inq + 3*72]
-
-    TRANSPOSE4x4PS 1, 6, 3, 5, 0
-
-    addps    m2, m5
-    addps    m5, m3
-    addps    m7, m5
-    addps    m3, m6
-    addps    m6, m1
-    SPILL    7, 8
-    addps    m5, m6
-    SPILL    6, 9
-    addps    m6, m4, SPILLED(12)
-    subps    m6, m2
-    UNSPILL  7, 11
-    SPILL    5, 11
-    subps    m5, m1, m7
-    mulps    m7, [costabs + 16*5]
-    addps    m7, m1
-    mulps    m0, m6, [costabs + 16*6]
-    addps    m0, m5
-    mova     [tmpq+4*24], m0
-    addps    m6, m5
-    mova     [tmpq+4*4], m6
-    addps    m6, m4, m2
-    mulps    m6, [costabs + 16*1]
-    subps    m4, SPILLED(12)
-    mulps    m4, [costabs + 16*8]
-    addps    m2, SPILLED(12)
-    mulps    m2, [costabs + 16*3]
-    subps    m5, m7, m6
-    subps    m5, m2
-    addps    m6, m7
-    addps    m6, m4
-    addps    m7, m2
-    subps    m7, m4
-    mova     [tmpq+4*20], m7
-    mova     m2, [tmpq+4*28]
-    mova     [tmpq+4*28], m5
-    UNSPILL  7, 13
-    subps    m5, m7, m2
-    mulps    m5, [costabs + 16*7]
-    UNSPILL  1, 10
-    mulps    m1, [costabs + 16*2]
-    addps    m4, m3, m2
-    mulps    m4, [costabs + 16*4]
-    addps    m2, m7
-    addps    m7, m3
-    mulps    m7, [costabs]
-    subps    m3, m2
-    mulps    m3, [costabs + 16*2]
-    addps    m2, m7, m5
-    addps    m2, m1
-    SPILL    2, 10
-    addps    m7, m4
-    subps    m7, m1
-    SPILL    7, 12
-    subps    m5, m4
-    subps    m5, m1
-    UNSPILL  0, 14
-    SPILL    5, 13
-    addps    m1, m0, SPILLED(15)
-    subps    m1, SPILLED(8)
-    mova     m4, [costabs + 16*5]
-    mulps    m4, [tmpq]
-    UNSPILL  2, 9
-    addps    m4, m2
-    subps    m2, [tmpq]
-    mulps    m5, m1, [costabs + 16*6]
-    addps    m5, m2
-    SPILL    5, 9
-    addps    m2, m1
-    SPILL    2, 14
-    UNSPILL  5, 15
-    subps    m7, m5, m0
-    addps    m5, SPILLED(8)
-    mulps    m5, [costabs + 16*1]
-    mulps    m7, [costabs + 16*8]
-    addps    m0, SPILLED(8)
-    mulps    m0, [costabs + 16*3]
-    subps    m2, m4, m5
-    subps    m2, m0
-    SPILL    2, 15
-    addps    m5, m4
-    addps    m5, m7
-    addps    m4, m0
-    subps    m4, m7
-    SPILL    4, 8
-    mova     m7, [tmpq+4*16]
-    mova     m2, [tmpq+4*12]
-    addps    m0, m7, m2
-    subps    m0, SPILLED(11)
-    mulps    m0, [costabs + 16*2]
-    addps    m4, m7, SPILLED(11)
-    mulps    m4, [costabs]
-    subps    m7, m2
-    mulps    m7, [costabs + 16*7]
-    addps    m2, SPILLED(11)
-    mulps    m2, [costabs + 16*4]
-    addps    m1, m7, [tmpq+4*8]
-    addps    m1, m4
-    addps    m4, m2
-    subps    m4, [tmpq+4*8]
-    SPILL    4, 11
-    subps    m7, m2
-    subps    m7, [tmpq+4*8]
-    addps    m4, m6, SPILLED(10)
-    subps    m6, SPILLED(10)
-    addps    m2, m5, m1
-    mulps    m2, [costabs + 16*9]
-    subps    m5, m1
-    mulps    m5, [costabs + 16*17]
-    subps    m1, m4, m2
-    addps    m4, m2
-    mulps    m2, m1, [winq+4*36]        ; from here on: out row = value * win + buf, then buf row = value * win
-    addps    m2, [bufq+4*36]
-    mova     [outq+1152], m2            ; out offsets are multiples of 128 = 4*SBLIMIT bytes; 1152 = 9*128
-    mulps    m1, [winq+4*32]
-    addps    m1, [bufq+4*32]
-    mova     [outq+1024], m1            ; 8*128
-    mulps    m1, m4, [winq+4*116]
-    mova     [bufq+4*36], m1
-    mulps    m4, [winq+4*112]
-    mova     [bufq+4*32], m4
-    addps    m2, m6, m5
-    subps    m6, m5
-    mulps    m1, m6, [winq+4*68]
-    addps    m1, [bufq+4*68]
-    mova     [outq+2176], m1            ; 17*128
-    mulps    m6, [winq]
-    addps    m6, [bufq]
-    mova     [outq], m6                 ; 0*128
-    mulps    m1, m2, [winq+4*148]
-    mova     [bufq+4*68], m1
-    mulps    m2, [winq+4*80]
-    mova     [bufq], m2
-    addps    m5, m3, [tmpq+4*24]
-    mova     m2, [tmpq+4*24]
-    subps    m2, m3
-    mova     m1, SPILLED(9)
-    subps    m1, m0
-    mulps    m1, [costabs + 16*10]
-    addps    m0, SPILLED(9)
-    mulps    m0, [costabs + 16*16]
-    addps    m6, m5, m1
-    subps    m5, m1
-    mulps    m3, m5, [winq+4*40]
-    addps    m3, [bufq+4*40]
-    mova     [outq+1280], m3            ; 10*128
-    mulps    m5, [winq+4*28]
-    addps    m5, [bufq+4*28]
-    mova     [outq+896], m5             ; 7*128
-    mulps    m1, m6, [winq+4*120]
-    mova     [bufq+4*40], m1
-    mulps    m6, [winq+4*108]
-    mova     [bufq+4*28], m6
-    addps    m1, m2, m0
-    subps    m2, m0
-    mulps    m5, m2, [winq+4*64]
-    addps    m5, [bufq+4*64]
-    mova     [outq+2048], m5            ; 16*128
-    mulps    m2, [winq+4*4]
-    addps    m2, [bufq+4*4]
-    mova     [outq+128], m2             ; 1*128
-    mulps    m0, m1, [winq+4*144]
-    mova     [bufq+4*64], m0
-    mulps    m1, [winq+4*84]
-    mova     [bufq+4*4], m1
-    mova     m1, [tmpq+4*28]
-    mova     m5, m1
-    addps    m1, SPILLED(13)
-    subps    m5, SPILLED(13)
-    UNSPILL  3, 15
-    addps    m2, m7, m3
-    mulps    m2, [costabs + 16*11]
-    subps    m3, m7
-    mulps    m3, [costabs + 16*15]
-    addps    m0, m2, m1
-    subps    m1, m2
-    SWAP     m0, m2
-    mulps    m6, m1, [winq+4*44]
-    addps    m6, [bufq+4*44]
-    mova     [outq+1408], m6            ; 11*128
-    mulps    m1, [winq+4*24]
-    addps    m1, [bufq+4*24]
-    mova     [outq+768], m1             ; 6*128
-    mulps    m0, m2, [winq+4*124]
-    mova     [bufq+4*44], m0
-    mulps    m2, [winq+4*104]
-    mova     [bufq+4*24], m2
-    addps    m0, m5, m3
-    subps    m5, m3
-    mulps    m1, m5, [winq+4*60]
-    addps    m1, [bufq+4*60]
-    mova     [outq+1920], m1            ; 15*128
-    mulps    m5, [winq+4*8]
-    addps    m5, [bufq+4*8]
-    mova     [outq+256], m5             ; 2*128
-    mulps    m1, m0, [winq+4*140]
-    mova     [bufq+4*60], m1
-    mulps    m0, [winq+4*88]
-    mova     [bufq+4*8], m0
-    mova     m1, [tmpq+4*20]
-    addps    m1, SPILLED(12)
-    mova     m2, [tmpq+4*20]
-    subps    m2, SPILLED(12)
-    UNSPILL  7, 8
-    subps    m0, m7, SPILLED(11)
-    addps    m7, SPILLED(11)
-    mulps    m4, m7, [costabs + 16*12]
-    mulps    m0, [costabs + 16*14]
-    addps    m5, m1, m4
-    subps    m1, m4
-    mulps    m7, m1, [winq+4*48]
-    addps    m7, [bufq+4*48]
-    mova     [outq+1536], m7            ; 12*128
-    mulps    m1, [winq+4*20]
-    addps    m1, [bufq+4*20]
-    mova     [outq+640], m1             ; 5*128
-    mulps    m1, m5, [winq+4*128]
-    mova     [bufq+4*48], m1
-    mulps    m5, [winq+4*100]
-    mova     [bufq+4*20], m5
-    addps    m6, m2, m0
-    subps    m2, m0
-    mulps    m1, m2, [winq+4*56]
-    addps    m1, [bufq+4*56]
-    mova     [outq+1792], m1            ; 14*128
-    mulps    m2, [winq+4*12]
-    addps    m2, [bufq+4*12]
-    mova     [outq+384], m2             ; 3*128
-    mulps    m0, m6, [winq+4*136]
-    mova     [bufq+4*56], m0
-    mulps    m6, [winq+4*92]
-    mova     [bufq+4*12], m6
-    UNSPILL  0, 14
-    mulps    m0, [costabs + 16*13]
-    mova     m3, [tmpq+4*4]
-    addps    m2, m0, m3
-    subps    m3, m0
-    mulps    m0, m3, [winq+4*52]
-    addps    m0, [bufq+4*52]
-    mova     [outq+1664], m0            ; 13*128
-    mulps    m3, [winq+4*16]
-    addps    m3, [bufq+4*16]
-    mova     [outq+512], m3             ; 4*128
-    mulps    m0, m2, [winq+4*132]
-    mova     [bufq+4*52], m0
-    mulps    m2, [winq+4*96]
-    mova     [bufq+4*16], m2
-    RET
-%endmacro
-; Instantiate four_imdct36_float for SSE and, when the assembler supports it, AVX.
-INIT_XMM sse
-DEFINE_FOUR_IMDCT
-
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-DEFINE_FOUR_IMDCT
-%endif |
