Diffstat (limited to 'ffmpeg/libswscale/x86')
-rw-r--r--  ffmpeg/libswscale/x86/Makefile            |   11
-rw-r--r--  ffmpeg/libswscale/x86/input.asm           |  696
-rw-r--r--  ffmpeg/libswscale/x86/output.asm          |  413
-rw-r--r--  ffmpeg/libswscale/x86/rgb2rgb.c           |  160
-rw-r--r--  ffmpeg/libswscale/x86/rgb2rgb_template.c  | 2533
-rw-r--r--  ffmpeg/libswscale/x86/scale.asm           |  431
-rw-r--r--  ffmpeg/libswscale/x86/swscale.c           |  580
-rw-r--r--  ffmpeg/libswscale/x86/swscale_template.c  | 1717
-rw-r--r--  ffmpeg/libswscale/x86/w64xmmtest.c        |   31
-rw-r--r--  ffmpeg/libswscale/x86/yuv2rgb.c           |  118
-rw-r--r--  ffmpeg/libswscale/x86/yuv2rgb_template.c  |  451
11 files changed, 0 insertions, 7141 deletions
diff --git a/ffmpeg/libswscale/x86/Makefile b/ffmpeg/libswscale/x86/Makefile deleted file mode 100644 index e767a5c..0000000 --- a/ffmpeg/libswscale/x86/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS) - -OBJS += x86/rgb2rgb.o \ - x86/swscale.o \ - x86/yuv2rgb.o \ - -OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o - -YASM-OBJS += x86/input.o \ - x86/output.o \ - x86/scale.o \ diff --git a/ffmpeg/libswscale/x86/input.asm b/ffmpeg/libswscale/x86/input.asm deleted file mode 100644 index 0c4f30e..0000000 --- a/ffmpeg/libswscale/x86/input.asm +++ /dev/null @@ -1,696 +0,0 @@ -;****************************************************************************** -;* x86-optimized input routines; does shuffling of packed -;* YUV formats into individual planes, and converts RGB -;* into YUV planes also. -;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -%define RY 0x20DE -%define GY 0x4087 -%define BY 0x0C88 -%define RU 0xECFF -%define GU 0xDAC8 -%define BU 0x3838 -%define RV 0x3838 -%define GV 0xD0E3 -%define BV 0xF6E4 - -rgb_Yrnd: times 4 dd 0x80100 ; 16.5 << 15 -rgb_UVrnd: times 4 dd 0x400100 ; 128.5 << 15 -%define bgr_Ycoeff_12x4 16*4 + 16* 0 + tableq -%define bgr_Ycoeff_3x56 16*4 + 16* 1 + tableq -%define rgb_Ycoeff_12x4 16*4 + 16* 2 + tableq -%define rgb_Ycoeff_3x56 16*4 + 16* 3 + tableq -%define bgr_Ucoeff_12x4 16*4 + 16* 4 + tableq -%define bgr_Ucoeff_3x56 16*4 + 16* 5 + tableq -%define rgb_Ucoeff_12x4 16*4 + 16* 6 + tableq -%define rgb_Ucoeff_3x56 16*4 + 16* 7 + tableq -%define bgr_Vcoeff_12x4 16*4 + 16* 8 + tableq -%define bgr_Vcoeff_3x56 16*4 + 16* 9 + tableq -%define rgb_Vcoeff_12x4 16*4 + 16*10 + tableq -%define rgb_Vcoeff_3x56 16*4 + 16*11 + tableq - -%define rgba_Ycoeff_rb 16*4 + 16*12 + tableq -%define rgba_Ycoeff_br 16*4 + 16*13 + tableq -%define rgba_Ycoeff_ga 16*4 + 16*14 + tableq -%define rgba_Ycoeff_ag 16*4 + 16*15 + tableq -%define rgba_Ucoeff_rb 16*4 + 16*16 + tableq -%define rgba_Ucoeff_br 16*4 + 16*17 + tableq -%define rgba_Ucoeff_ga 16*4 + 16*18 + tableq -%define rgba_Ucoeff_ag 16*4 + 16*19 + tableq -%define rgba_Vcoeff_rb 16*4 + 16*20 + tableq -%define rgba_Vcoeff_br 16*4 + 16*21 + tableq -%define rgba_Vcoeff_ga 16*4 + 16*22 + tableq -%define rgba_Vcoeff_ag 16*4 + 16*23 + tableq - -; bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY -; bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY -; rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY -; rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY -; bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU -; bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU -; rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU -; rgb_Ucoeff_3x56: times 2 
dw BU, 0, GU, BU -; bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV -; bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV -; rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV -; rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV - -; rgba_Ycoeff_rb: times 4 dw RY, BY -; rgba_Ycoeff_br: times 4 dw BY, RY -; rgba_Ycoeff_ga: times 4 dw GY, 0 -; rgba_Ycoeff_ag: times 4 dw 0, GY -; rgba_Ucoeff_rb: times 4 dw RU, BU -; rgba_Ucoeff_br: times 4 dw BU, RU -; rgba_Ucoeff_ga: times 4 dw GU, 0 -; rgba_Ucoeff_ag: times 4 dw 0, GU -; rgba_Vcoeff_rb: times 4 dw RV, BV -; rgba_Vcoeff_br: times 4 dw BV, RV -; rgba_Vcoeff_ga: times 4 dw GV, 0 -; rgba_Vcoeff_ag: times 4 dw 0, GV - -shuf_rgb_12x4: db 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, \ - 6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80 -shuf_rgb_3x56: db 2, 0x80, 3, 0x80, 4, 0x80, 5, 0x80, \ - 8, 0x80, 9, 0x80, 10, 0x80, 11, 0x80 - -SECTION .text - -;----------------------------------------------------------------------------- -; RGB to Y/UV. -; -; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w); -; and -; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, -; const uint8_t *unused, int w); -;----------------------------------------------------------------------------- - -; %1 = nr. of XMM registers -; %2 = rgb or bgr -%macro RGB24_TO_Y_FN 2-3 -cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table -%if mmsize == 8 - mova m5, [%2_Ycoeff_12x4] - mova m6, [%2_Ycoeff_3x56] -%define coeff1 m5 -%define coeff2 m6 -%elif ARCH_X86_64 - mova m8, [%2_Ycoeff_12x4] - mova m9, [%2_Ycoeff_3x56] -%define coeff1 m8 -%define coeff2 m9 -%else ; x86-32 && mmsize == 16 -%define coeff1 [%2_Ycoeff_12x4] -%define coeff2 [%2_Ycoeff_3x56] -%endif ; x86-32/64 && mmsize == 8/16 -%if (ARCH_X86_64 || mmsize == 8) && %0 == 3 - jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToY %+ SUFFIX).body -%else ; (ARCH_X86_64 && %0 == 3) || mmsize == 8 -.body: -%if cpuflag(ssse3) - mova m7, [shuf_rgb_12x4] -%define shuf_rgb1 m7 -%if ARCH_X86_64 - mova m10, [shuf_rgb_3x56] -%define shuf_rgb2 m10 -%else ; x86-32 -%define shuf_rgb2 [shuf_rgb_3x56] -%endif ; x86-32/64 -%endif ; cpuflag(ssse3) -%if ARCH_X86_64 - movsxd wq, wd -%endif - add wq, wq - add dstq, wq - neg wq -%if notcpuflag(ssse3) - pxor m7, m7 -%endif ; !cpuflag(ssse3) - mova m4, [rgb_Yrnd] -.loop: -%if cpuflag(ssse3) - movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3] - movu m2, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7] - pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 } - pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 } - pshufb m3, m2, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 } - pshufb m2, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 } -%else ; !cpuflag(ssse3) - movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 } - movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 } - movd m2, [srcq+6] ; (byte) { B2, G2, R2, B3 } - movd m3, [srcq+8] ; (byte) { R2, B3, G3, R3 } -%if mmsize == 16 ; i.e. 
sse2 - punpckldq m0, m2 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 } - punpckldq m1, m3 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 } - movd m2, [srcq+12] ; (byte) { B4, G4, R4, B5 } - movd m3, [srcq+14] ; (byte) { R4, B5, G5, R5 } - movd m5, [srcq+18] ; (byte) { B6, G6, R6, B7 } - movd m6, [srcq+20] ; (byte) { R6, B7, G7, R7 } - punpckldq m2, m5 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 } - punpckldq m3, m6 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 } -%endif ; mmsize == 16 - punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 } - punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 } - punpcklbw m2, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 } - punpcklbw m3, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 } -%endif ; cpuflag(ssse3) - add srcq, 3 * mmsize / 2 - pmaddwd m0, coeff1 ; (dword) { B0*BY + G0*GY, B1*BY, B2*BY + G2*GY, B3*BY } - pmaddwd m1, coeff2 ; (dword) { R0*RY, G1+GY + R1*RY, R2*RY, G3+GY + R3*RY } - pmaddwd m2, coeff1 ; (dword) { B4*BY + G4*GY, B5*BY, B6*BY + G6*GY, B7*BY } - pmaddwd m3, coeff2 ; (dword) { R4*RY, G5+GY + R5*RY, R6*RY, G7+GY + R7*RY } - paddd m0, m1 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[0-3] - paddd m2, m3 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7] - paddd m0, m4 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] } - paddd m2, m4 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] } - psrad m0, 9 - psrad m2, 9 - packssdw m0, m2 ; (word) { Y[0-7] } - mova [dstq+wq], m0 - add wq, mmsize - jl .loop - REP_RET -%endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8 -%endmacro - -; %1 = nr. of XMM registers -; %2 = rgb or bgr -%macro RGB24_TO_UV_FN 2-3 -cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table -%if ARCH_X86_64 - mova m8, [%2_Ucoeff_12x4] - mova m9, [%2_Ucoeff_3x56] - mova m10, [%2_Vcoeff_12x4] - mova m11, [%2_Vcoeff_3x56] -%define coeffU1 m8 -%define coeffU2 m9 -%define coeffV1 m10 -%define coeffV2 m11 -%else ; x86-32 -%define coeffU1 [%2_Ucoeff_12x4] -%define coeffU2 [%2_Ucoeff_3x56] -%define coeffV1 [%2_Vcoeff_12x4] -%define coeffV2 [%2_Vcoeff_3x56] -%endif ; x86-32/64 -%if ARCH_X86_64 && %0 == 3 - jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToUV %+ SUFFIX).body -%else ; ARCH_X86_64 && %0 == 3 -.body: -%if cpuflag(ssse3) - mova m7, [shuf_rgb_12x4] -%define shuf_rgb1 m7 -%if ARCH_X86_64 - mova m12, [shuf_rgb_3x56] -%define shuf_rgb2 m12 -%else ; x86-32 -%define shuf_rgb2 [shuf_rgb_3x56] -%endif ; x86-32/64 -%endif ; cpuflag(ssse3) -%if ARCH_X86_64 - movsxd wq, dword r5m -%else ; x86-32 - mov wq, r5m -%endif - add wq, wq - add dstUq, wq - add dstVq, wq - neg wq - mova m6, [rgb_UVrnd] -%if notcpuflag(ssse3) - pxor m7, m7 -%endif -.loop: -%if cpuflag(ssse3) - movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3] - movu m4, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7] - pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 } - pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 } -%else ; !cpuflag(ssse3) - movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 } - movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 } - movd m4, [srcq+6] ; (byte) { B2, G2, R2, B3 } - movd m5, [srcq+8] ; (byte) { R2, B3, G3, R3 } -%if mmsize == 16 - punpckldq m0, m4 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 } - punpckldq m1, m5 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 } - movd m4, [srcq+12] ; (byte) { B4, G4, R4, B5 } - movd m5, [srcq+14] ; (byte) { R4, B5, G5, R5 } -%endif ; mmsize == 16 - punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 } - punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 } -%endif ; cpuflag(ssse3) - pmaddwd m2, m0, coeffV1 ; (dword) { B0*BV + G0*GV, B1*BV, 
B2*BV + G2*GV, B3*BV } - pmaddwd m3, m1, coeffV2 ; (dword) { R0*BV, G1*GV + R1*BV, R2*BV, G3*GV + R3*BV } - pmaddwd m0, coeffU1 ; (dword) { B0*BU + G0*GU, B1*BU, B2*BU + G2*GU, B3*BU } - pmaddwd m1, coeffU2 ; (dword) { R0*BU, G1*GU + R1*BU, R2*BU, G3*GU + R3*BU } - paddd m0, m1 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[0-3] - paddd m2, m3 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[0-3] -%if cpuflag(ssse3) - pshufb m5, m4, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 } - pshufb m4, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 } -%else ; !cpuflag(ssse3) -%if mmsize == 16 - movd m1, [srcq+18] ; (byte) { B6, G6, R6, B7 } - movd m3, [srcq+20] ; (byte) { R6, B7, G7, R7 } - punpckldq m4, m1 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 } - punpckldq m5, m3 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 } -%endif ; mmsize == 16 && !cpuflag(ssse3) - punpcklbw m4, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 } - punpcklbw m5, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 } -%endif ; cpuflag(ssse3) - add srcq, 3 * mmsize / 2 - pmaddwd m1, m4, coeffU1 ; (dword) { B4*BU + G4*GU, B5*BU, B6*BU + G6*GU, B7*BU } - pmaddwd m3, m5, coeffU2 ; (dword) { R4*BU, G5*GU + R5*BU, R6*BU, G7*GU + R7*BU } - pmaddwd m4, coeffV1 ; (dword) { B4*BV + G4*GV, B5*BV, B6*BV + G6*GV, B7*BV } - pmaddwd m5, coeffV2 ; (dword) { R4*BV, G5*GV + R5*BV, R6*BV, G7*GV + R7*BV } - paddd m1, m3 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[4-7] - paddd m4, m5 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[4-7] - paddd m0, m6 ; += rgb_UVrnd, i.e. (dword) { U[0-3] } - paddd m2, m6 ; += rgb_UVrnd, i.e. (dword) { V[0-3] } - paddd m1, m6 ; += rgb_UVrnd, i.e. (dword) { U[4-7] } - paddd m4, m6 ; += rgb_UVrnd, i.e. (dword) { V[4-7] } - psrad m0, 9 - psrad m2, 9 - psrad m1, 9 - psrad m4, 9 - packssdw m0, m1 ; (word) { U[0-7] } - packssdw m2, m4 ; (word) { V[0-7] } -%if mmsize == 8 - mova [dstUq+wq], m0 - mova [dstVq+wq], m2 -%else ; mmsize == 16 - mova [dstUq+wq], m0 - mova [dstVq+wq], m2 -%endif ; mmsize == 8/16 - add wq, mmsize - jl .loop - REP_RET -%endif ; ARCH_X86_64 && %0 == 3 -%endmacro - -; %1 = nr. of XMM registers for rgb-to-Y func -; %2 = nr. of XMM registers for rgb-to-UV func -%macro RGB24_FUNCS 2 -RGB24_TO_Y_FN %1, rgb -RGB24_TO_Y_FN %1, bgr, rgb -RGB24_TO_UV_FN %2, rgb -RGB24_TO_UV_FN %2, bgr, rgb -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmx -RGB24_FUNCS 0, 0 -%endif - -INIT_XMM sse2 -RGB24_FUNCS 10, 12 - -INIT_XMM ssse3 -RGB24_FUNCS 11, 13 - -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -RGB24_FUNCS 11, 13 -%endif - -; %1 = nr. 
of XMM registers -; %2-5 = rgba, bgra, argb or abgr (in individual characters) -%macro RGB32_TO_Y_FN 5-6 -cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table - mova m5, [rgba_Ycoeff_%2%4] - mova m6, [rgba_Ycoeff_%3%5] -%if %0 == 6 - jmp mangle(private_prefix %+ _ %+ %6 %+ ToY %+ SUFFIX).body -%else ; %0 == 6 -.body: -%if ARCH_X86_64 - movsxd wq, wd -%endif - lea srcq, [srcq+wq*4] - add wq, wq - add dstq, wq - neg wq - mova m4, [rgb_Yrnd] - pcmpeqb m7, m7 - psrlw m7, 8 ; (word) { 0x00ff } x4 -.loop: - ; FIXME check alignment and use mova - movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] - movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] - DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7] - pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3] - pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3] - pmaddwd m3, m5 ; (dword) { Bx*BY + Rx*RY }[4-7] - pmaddwd m2, m6 ; (dword) { Gx*GY }[4-7] - paddd m0, m4 ; += rgb_Yrnd - paddd m2, m4 ; += rgb_Yrnd - paddd m0, m1 ; (dword) { Y[0-3] } - paddd m2, m3 ; (dword) { Y[4-7] } - psrad m0, 9 - psrad m2, 9 - packssdw m0, m2 ; (word) { Y[0-7] } - mova [dstq+wq], m0 - add wq, mmsize - jl .loop - REP_RET -%endif ; %0 == 3 -%endmacro - -; %1 = nr. of XMM registers -; %2-5 = rgba, bgra, argb or abgr (in individual characters) -%macro RGB32_TO_UV_FN 5-6 -cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table -%if ARCH_X86_64 - mova m8, [rgba_Ucoeff_%2%4] - mova m9, [rgba_Ucoeff_%3%5] - mova m10, [rgba_Vcoeff_%2%4] - mova m11, [rgba_Vcoeff_%3%5] -%define coeffU1 m8 -%define coeffU2 m9 -%define coeffV1 m10 -%define coeffV2 m11 -%else ; x86-32 -%define coeffU1 [rgba_Ucoeff_%2%4] -%define coeffU2 [rgba_Ucoeff_%3%5] -%define coeffV1 [rgba_Vcoeff_%2%4] -%define coeffV2 [rgba_Vcoeff_%3%5] -%endif ; x86-64/32 -%if ARCH_X86_64 && %0 == 6 - jmp mangle(private_prefix %+ _ %+ %6 %+ ToUV %+ SUFFIX).body -%else ; ARCH_X86_64 && %0 == 6 -.body: -%if ARCH_X86_64 - movsxd wq, dword r5m -%else ; x86-32 - mov wq, r5m -%endif - add wq, wq - add dstUq, wq - add dstVq, wq - lea srcq, [srcq+wq*2] - neg wq - pcmpeqb m7, m7 - psrlw m7, 8 ; (word) { 0x00ff } x4 - mova m6, [rgb_UVrnd] -.loop: - ; FIXME check alignment and use mova - movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] - movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] - DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7] - pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3] - pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3] - pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3] - pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3] - paddd m3, m6 ; += rgb_UVrnd - paddd m1, m6 ; += rgb_UVrnd - paddd m2, m3 ; (dword) { V[0-3] } - paddd m0, m1 ; (dword) { U[0-3] } - pmaddwd m3, m5, coeffV1 ; (dword) { Bx*BV + Rx*RV }[4-7] - pmaddwd m1, m4, coeffV2 ; (dword) { Gx*GV }[4-7] - pmaddwd m5, coeffU1 ; (dword) { Bx*BU + Rx*RU }[4-7] - pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7] - paddd m3, m6 ; += rgb_UVrnd - paddd m5, m6 ; += rgb_UVrnd - psrad m0, 9 - paddd m1, m3 ; (dword) { V[4-7] } - paddd m4, m5 ; (dword) { U[4-7] } - psrad m2, 9 - psrad m4, 9 - psrad m1, 9 - packssdw m0, m4 ; (word) { U[0-7] } - packssdw m2, m1 ; (word) { V[0-7] } -%if mmsize == 8 - mova [dstUq+wq], m0 - mova [dstVq+wq], m2 -%else ; mmsize == 16 - mova [dstUq+wq], m0 - mova [dstVq+wq], m2 -%endif ; mmsize == 8/16 - add wq, mmsize - jl .loop - REP_RET -%endif ; ARCH_X86_64 && %0 == 3 -%endmacro - -; %1 = nr. of XMM registers for rgb-to-Y func -; %2 = nr. 
of XMM registers for rgb-to-UV func -%macro RGB32_FUNCS 2 -RGB32_TO_Y_FN %1, r, g, b, a -RGB32_TO_Y_FN %1, b, g, r, a, rgba -RGB32_TO_Y_FN %1, a, r, g, b, rgba -RGB32_TO_Y_FN %1, a, b, g, r, rgba - -RGB32_TO_UV_FN %2, r, g, b, a -RGB32_TO_UV_FN %2, b, g, r, a, rgba -RGB32_TO_UV_FN %2, a, r, g, b, rgba -RGB32_TO_UV_FN %2, a, b, g, r, rgba -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmx -RGB32_FUNCS 0, 0 -%endif - -INIT_XMM sse2 -RGB32_FUNCS 8, 12 - -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -RGB32_FUNCS 8, 12 -%endif - -;----------------------------------------------------------------------------- -; YUYV/UYVY/NV12/NV21 packed pixel shuffling. -; -; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w); -; and -; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, -; const uint8_t *unused, int w); -;----------------------------------------------------------------------------- - -; %1 = a (aligned) or u (unaligned) -; %2 = yuyv or uyvy -%macro LOOP_YUYV_TO_Y 2 -.loop_%1: - mov%1 m0, [srcq+wq*2] ; (byte) { Y0, U0, Y1, V0, ... } - mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { Y8, U4, Y9, V4, ... } -%ifidn %2, yuyv - pand m0, m2 ; (word) { Y0, Y1, ..., Y7 } - pand m1, m2 ; (word) { Y8, Y9, ..., Y15 } -%else ; uyvy - psrlw m0, 8 ; (word) { Y0, Y1, ..., Y7 } - psrlw m1, 8 ; (word) { Y8, Y9, ..., Y15 } -%endif ; yuyv/uyvy - packuswb m0, m1 ; (byte) { Y0, ..., Y15 } - mova [dstq+wq], m0 - add wq, mmsize - jl .loop_%1 - REP_RET -%endmacro - -; %1 = nr. of XMM registers -; %2 = yuyv or uyvy -; %3 = if specified, it means that unaligned and aligned code in loop -; will be the same (i.e. YUYV+AVX), and thus we don't need to -; split the loop in an aligned and unaligned case -%macro YUYV_TO_Y_FN 2-3 -cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w -%if ARCH_X86_64 - movsxd wq, wd -%endif - add dstq, wq -%if mmsize == 16 - test srcq, 15 -%endif - lea srcq, [srcq+wq*2] -%ifidn %2, yuyv - pcmpeqb m2, m2 ; (byte) { 0xff } x 16 - psrlw m2, 8 ; (word) { 0x00ff } x 8 -%endif ; yuyv -%if mmsize == 16 - jnz .loop_u_start - neg wq - LOOP_YUYV_TO_Y a, %2 -.loop_u_start: - neg wq - LOOP_YUYV_TO_Y u, %2 -%else ; mmsize == 8 - neg wq - LOOP_YUYV_TO_Y a, %2 -%endif ; mmsize == 8/16 -%endmacro - -; %1 = a (aligned) or u (unaligned) -; %2 = yuyv or uyvy -%macro LOOP_YUYV_TO_UV 2 -.loop_%1: -%ifidn %2, yuyv - mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... } - mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... } - psrlw m0, 8 ; (word) { U0, V0, ..., U3, V3 } - psrlw m1, 8 ; (word) { U4, V4, ..., U7, V7 } -%else ; uyvy -%if cpuflag(avx) - vpand m0, m2, [srcq+wq*4] ; (word) { U0, V0, ..., U3, V3 } - vpand m1, m2, [srcq+wq*4+mmsize] ; (word) { U4, V4, ..., U7, V7 } -%else - mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... } - mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... } - pand m0, m2 ; (word) { U0, V0, ..., U3, V3 } - pand m1, m2 ; (word) { U4, V4, ..., U7, V7 } -%endif -%endif ; yuyv/uyvy - packuswb m0, m1 ; (byte) { U0, V0, ..., U7, V7 } - pand m1, m0, m2 ; (word) { U0, U1, ..., U7 } - psrlw m0, 8 ; (word) { V0, V1, ..., V7 } -%if mmsize == 16 - packuswb m1, m0 ; (byte) { U0, ... U7, V1, ... V7 } - movh [dstUq+wq], m1 - movhps [dstVq+wq], m1 -%else ; mmsize == 8 - packuswb m1, m1 ; (byte) { U0, ... U3 } - packuswb m0, m0 ; (byte) { V0, ... V3 } - movh [dstUq+wq], m1 - movh [dstVq+wq], m0 -%endif ; mmsize == 8/16 - add wq, mmsize / 2 - jl .loop_%1 - REP_RET -%endmacro - -; %1 = nr. 
of XMM registers -; %2 = yuyv or uyvy -; %3 = if specified, it means that unaligned and aligned code in loop -; will be the same (i.e. UYVY+AVX), and thus we don't need to -; split the loop in an aligned and unaligned case -%macro YUYV_TO_UV_FN 2-3 -cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w -%if ARCH_X86_64 - movsxd wq, dword r5m -%else ; x86-32 - mov wq, r5m -%endif - add dstUq, wq - add dstVq, wq -%if mmsize == 16 && %0 == 2 - test srcq, 15 -%endif - lea srcq, [srcq+wq*4] - pcmpeqb m2, m2 ; (byte) { 0xff } x 16 - psrlw m2, 8 ; (word) { 0x00ff } x 8 - ; NOTE: if uyvy+avx, u/a are identical -%if mmsize == 16 && %0 == 2 - jnz .loop_u_start - neg wq - LOOP_YUYV_TO_UV a, %2 -.loop_u_start: - neg wq - LOOP_YUYV_TO_UV u, %2 -%else ; mmsize == 8 - neg wq - LOOP_YUYV_TO_UV a, %2 -%endif ; mmsize == 8/16 -%endmacro - -; %1 = a (aligned) or u (unaligned) -; %2 = nv12 or nv21 -%macro LOOP_NVXX_TO_UV 2 -.loop_%1: - mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... } - mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... } - pand m2, m0, m5 ; (word) { U0, U1, ..., U7 } - pand m3, m1, m5 ; (word) { U8, U9, ..., U15 } - psrlw m0, 8 ; (word) { V0, V1, ..., V7 } - psrlw m1, 8 ; (word) { V8, V9, ..., V15 } - packuswb m2, m3 ; (byte) { U0, ..., U15 } - packuswb m0, m1 ; (byte) { V0, ..., V15 } -%ifidn %2, nv12 - mova [dstUq+wq], m2 - mova [dstVq+wq], m0 -%else ; nv21 - mova [dstVq+wq], m2 - mova [dstUq+wq], m0 -%endif ; nv12/21 - add wq, mmsize - jl .loop_%1 - REP_RET -%endmacro - -; %1 = nr. of XMM registers -; %2 = nv12 or nv21 -%macro NVXX_TO_UV_FN 2 -cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w -%if ARCH_X86_64 - movsxd wq, dword r5m -%else ; x86-32 - mov wq, r5m -%endif - add dstUq, wq - add dstVq, wq -%if mmsize == 16 - test srcq, 15 -%endif - lea srcq, [srcq+wq*2] - pcmpeqb m5, m5 ; (byte) { 0xff } x 16 - psrlw m5, 8 ; (word) { 0x00ff } x 8 -%if mmsize == 16 - jnz .loop_u_start - neg wq - LOOP_NVXX_TO_UV a, %2 -.loop_u_start: - neg wq - LOOP_NVXX_TO_UV u, %2 -%else ; mmsize == 8 - neg wq - LOOP_NVXX_TO_UV a, %2 -%endif ; mmsize == 8/16 -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmx -YUYV_TO_Y_FN 0, yuyv -YUYV_TO_Y_FN 0, uyvy -YUYV_TO_UV_FN 0, yuyv -YUYV_TO_UV_FN 0, uyvy -NVXX_TO_UV_FN 0, nv12 -NVXX_TO_UV_FN 0, nv21 -%endif - -INIT_XMM sse2 -YUYV_TO_Y_FN 3, yuyv -YUYV_TO_Y_FN 2, uyvy -YUYV_TO_UV_FN 3, yuyv -YUYV_TO_UV_FN 3, uyvy -NVXX_TO_UV_FN 5, nv12 -NVXX_TO_UV_FN 5, nv21 - -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but -; that's not faster in practice -YUYV_TO_UV_FN 3, yuyv -YUYV_TO_UV_FN 3, uyvy, 1 -NVXX_TO_UV_FN 5, nv12 -NVXX_TO_UV_FN 5, nv21 -%endif diff --git a/ffmpeg/libswscale/x86/output.asm b/ffmpeg/libswscale/x86/output.asm deleted file mode 100644 index 9ea4af9..0000000 --- a/ffmpeg/libswscale/x86/output.asm +++ /dev/null @@ -1,413 +0,0 @@ -;****************************************************************************** -;* x86-optimized vertical line scaling functions -;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> -;* Kieran Kunhya <kieran@kunhya.com> -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. 
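
[Editor's note on the input.asm deletion above: the <fmt>24ToY loops reduce to the scalar arithmetic sketched below. This is only an illustration, not code from the file; the coefficients and the rounding constant are taken from the defines at the top of input.asm, while the function name, the int16_t output type and the R,G,B byte order are assumptions for plain RGB24 (the BGR24 variant reads the bytes in the opposite order).]

    #include <stdint.h>

    #define RY 0x20DE
    #define GY 0x4087
    #define BY 0x0C88

    /* Scalar sketch of the rgb24ToY path: weight the three channels, add the
     * combined bias/rounding constant rgb_Yrnd (0x80100) and shift right by 9,
     * matching the pmaddwd/paddd/psrad sequence in the SIMD loop. */
    static void rgb24_to_y_ref(int16_t *dst, const uint8_t *src, int w)
    {
        for (int i = 0; i < w; i++) {
            int r = src[3 * i + 0];
            int g = src[3 * i + 1];
            int b = src[3 * i + 2];
            dst[i] = (RY * r + GY * g + BY * b + 0x80100) >> 9;
        }
    }
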
-;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -minshort: times 8 dw 0x8000 -yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000 -yuv2yuvX_10_start: times 4 dd 0x10000 -yuv2yuvX_9_start: times 4 dd 0x20000 -yuv2yuvX_10_upper: times 8 dw 0x3ff -yuv2yuvX_9_upper: times 8 dw 0x1ff -pd_4: times 4 dd 4 -pd_4min0x40000:times 4 dd 4 - (0x40000) -pw_16: times 8 dw 16 -pw_32: times 8 dw 32 -pw_512: times 8 dw 512 -pw_1024: times 8 dw 1024 - -SECTION .text - -;----------------------------------------------------------------------------- -; vertical line scaling -; -; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW, -; const uint8_t *dither, int offset) -; and -; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize, -; const int16_t **src, uint8_t *dst, int dstW, -; const uint8_t *dither, int offset) -; -; Scale one or $filterSize lines of source data to generate one line of output -; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in -; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple -; of 2. $offset is either 0 or 3. $dither holds 8 values. -;----------------------------------------------------------------------------- - -%macro yuv2planeX_fn 3 - -%if ARCH_X86_32 -%define cntr_reg fltsizeq -%define movsx mov -%else -%define cntr_reg r7 -%define movsx movsxd -%endif - -cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset -%if %1 == 8 || %1 == 9 || %1 == 10 - pxor m6, m6 -%endif ; %1 == 8/9/10 - -%if %1 == 8 -%if ARCH_X86_32 -%assign pad 0x2c - (stack_offset & 15) - SUB rsp, pad -%define m_dith m7 -%else ; x86-64 -%define m_dith m9 -%endif ; x86-32 - - ; create registers holding dither - movq m_dith, [ditherq] ; dither - test offsetd, offsetd - jz .no_rot -%if mmsize == 16 - punpcklqdq m_dith, m_dith -%endif ; mmsize == 16 - PALIGNR m_dith, m_dith, 3, m0 -.no_rot: -%if mmsize == 16 - punpcklbw m_dith, m6 -%if ARCH_X86_64 - punpcklwd m8, m_dith, m6 - pslld m8, 12 -%else ; x86-32 - punpcklwd m5, m_dith, m6 - pslld m5, 12 -%endif ; x86-32/64 - punpckhwd m_dith, m6 - pslld m_dith, 12 -%if ARCH_X86_32 - mova [rsp+ 0], m5 - mova [rsp+16], m_dith -%endif -%else ; mmsize == 8 - punpcklbw m5, m_dith, m6 - punpckhbw m_dith, m6 - punpcklwd m4, m5, m6 - punpckhwd m5, m6 - punpcklwd m3, m_dith, m6 - punpckhwd m_dith, m6 - pslld m4, 12 - pslld m5, 12 - pslld m3, 12 - pslld m_dith, 12 - mova [rsp+ 0], m4 - mova [rsp+ 8], m5 - mova [rsp+16], m3 - mova [rsp+24], m_dith -%endif ; mmsize == 8/16 -%endif ; %1 == 8 - - xor r5, r5 - -.pixelloop: -%assign %%i 0 - ; the rep here is for the 8bit output mmx case, where dither covers - ; 8 pixels but we can only handle 2 pixels per register, and thus 4 - ; pixels per iteration. In order to not have to keep track of where - ; we are w.r.t. dithering, we unroll the mmx/8bit loop x2. 
-%if %1 == 8 -%assign %%repcnt 16/mmsize -%else -%assign %%repcnt 1 -%endif - -%rep %%repcnt - -%if %1 == 8 -%if ARCH_X86_32 - mova m2, [rsp+mmsize*(0+%%i)] - mova m1, [rsp+mmsize*(1+%%i)] -%else ; x86-64 - mova m2, m8 - mova m1, m_dith -%endif ; x86-32/64 -%else ; %1 == 9/10/16 - mova m1, [yuv2yuvX_%1_start] - mova m2, m1 -%endif ; %1 == 8/9/10/16 - movsx cntr_reg, fltsizem -.filterloop_ %+ %%i: - ; input pixels - mov r6, [srcq+gprsize*cntr_reg-2*gprsize] -%if %1 == 16 - mova m3, [r6+r5*4] - mova m5, [r6+r5*4+mmsize] -%else ; %1 == 8/9/10 - mova m3, [r6+r5*2] -%endif ; %1 == 8/9/10/16 - mov r6, [srcq+gprsize*cntr_reg-gprsize] -%if %1 == 16 - mova m4, [r6+r5*4] - mova m6, [r6+r5*4+mmsize] -%else ; %1 == 8/9/10 - mova m4, [r6+r5*2] -%endif ; %1 == 8/9/10/16 - - ; coefficients - movd m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1] -%if %1 == 16 - pshuflw m7, m0, 0 ; coeff[0] - pshuflw m0, m0, 0x55 ; coeff[1] - pmovsxwd m7, m7 ; word -> dword - pmovsxwd m0, m0 ; word -> dword - - pmulld m3, m7 - pmulld m5, m7 - pmulld m4, m0 - pmulld m6, m0 - - paddd m2, m3 - paddd m1, m5 - paddd m2, m4 - paddd m1, m6 -%else ; %1 == 10/9/8 - punpcklwd m5, m3, m4 - punpckhwd m3, m4 - SPLATD m0 - - pmaddwd m5, m0 - pmaddwd m3, m0 - - paddd m2, m5 - paddd m1, m3 -%endif ; %1 == 8/9/10/16 - - sub cntr_reg, 2 - jg .filterloop_ %+ %%i - -%if %1 == 16 - psrad m2, 31 - %1 - psrad m1, 31 - %1 -%else ; %1 == 10/9/8 - psrad m2, 27 - %1 - psrad m1, 27 - %1 -%endif ; %1 == 8/9/10/16 - -%if %1 == 8 - packssdw m2, m1 - packuswb m2, m2 - movh [dstq+r5*1], m2 -%else ; %1 == 9/10/16 -%if %1 == 16 - packssdw m2, m1 - paddw m2, [minshort] -%else ; %1 == 9/10 -%if cpuflag(sse4) - packusdw m2, m1 -%else ; mmxext/sse2 - packssdw m2, m1 - pmaxsw m2, m6 -%endif ; mmxext/sse2/sse4/avx - pminsw m2, [yuv2yuvX_%1_upper] -%endif ; %1 == 9/10/16 - mova [dstq+r5*2], m2 -%endif ; %1 == 8/9/10/16 - - add r5, mmsize/2 - sub wd, mmsize/2 - -%assign %%i %%i+2 -%endrep - jg .pixelloop - -%if %1 == 8 -%if ARCH_X86_32 - ADD rsp, pad - RET -%else ; x86-64 - REP_RET -%endif ; x86-32/64 -%else ; %1 == 9/10/16 - REP_RET -%endif ; %1 == 8/9/10/16 -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmxext -yuv2planeX_fn 8, 0, 7 -yuv2planeX_fn 9, 0, 5 -yuv2planeX_fn 10, 0, 5 -%endif - -INIT_XMM sse2 -yuv2planeX_fn 8, 10, 7 -yuv2planeX_fn 9, 7, 5 -yuv2planeX_fn 10, 7, 5 - -INIT_XMM sse4 -yuv2planeX_fn 8, 10, 7 -yuv2planeX_fn 9, 7, 5 -yuv2planeX_fn 10, 7, 5 -yuv2planeX_fn 16, 8, 5 - -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -yuv2planeX_fn 8, 10, 7 -yuv2planeX_fn 9, 7, 5 -yuv2planeX_fn 10, 7, 5 -%endif - -; %1=outout-bpc, %2=alignment (u/a) -%macro yuv2plane1_mainloop 2 -.loop_%2: -%if %1 == 8 - paddsw m0, m2, [srcq+wq*2+mmsize*0] - paddsw m1, m3, [srcq+wq*2+mmsize*1] - psraw m0, 7 - psraw m1, 7 - packuswb m0, m1 - mov%2 [dstq+wq], m0 -%elif %1 == 16 - paddd m0, m4, [srcq+wq*4+mmsize*0] - paddd m1, m4, [srcq+wq*4+mmsize*1] - paddd m2, m4, [srcq+wq*4+mmsize*2] - paddd m3, m4, [srcq+wq*4+mmsize*3] - psrad m0, 3 - psrad m1, 3 - psrad m2, 3 - psrad m3, 3 -%if cpuflag(sse4) ; avx/sse4 - packusdw m0, m1 - packusdw m2, m3 -%else ; mmx/sse2 - packssdw m0, m1 - packssdw m2, m3 - paddw m0, m5 - paddw m2, m5 -%endif ; mmx/sse2/sse4/avx - mov%2 [dstq+wq*2+mmsize*0], m0 - mov%2 [dstq+wq*2+mmsize*1], m2 -%else ; %1 == 9/10 - paddsw m0, m2, [srcq+wq*2+mmsize*0] - paddsw m1, m2, [srcq+wq*2+mmsize*1] - psraw m0, 15 - %1 - psraw m1, 15 - %1 - pmaxsw m0, m4 - pmaxsw m1, m4 - pminsw m0, m3 - pminsw m1, m3 - mov%2 [dstq+wq*2+mmsize*0], m0 - mov%2 [dstq+wq*2+mmsize*1], m1 -%endif - add wq, mmsize 
- jl .loop_%2 -%endmacro - -%macro yuv2plane1_fn 3 -cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset - movsxdifnidn wq, wd - add wq, mmsize - 1 - and wq, ~(mmsize - 1) -%if %1 == 8 - add dstq, wq -%else ; %1 != 8 - lea dstq, [dstq+wq*2] -%endif ; %1 == 8 -%if %1 == 16 - lea srcq, [srcq+wq*4] -%else ; %1 != 16 - lea srcq, [srcq+wq*2] -%endif ; %1 == 16 - neg wq - -%if %1 == 8 - pxor m4, m4 ; zero - - ; create registers holding dither - movq m3, [ditherq] ; dither - test offsetd, offsetd - jz .no_rot -%if mmsize == 16 - punpcklqdq m3, m3 -%endif ; mmsize == 16 - PALIGNR m3, m3, 3, m2 -.no_rot: -%if mmsize == 8 - mova m2, m3 - punpckhbw m3, m4 ; byte->word - punpcklbw m2, m4 ; byte->word -%else - punpcklbw m3, m4 - mova m2, m3 -%endif -%elif %1 == 9 - pxor m4, m4 - mova m3, [pw_512] - mova m2, [pw_32] -%elif %1 == 10 - pxor m4, m4 - mova m3, [pw_1024] - mova m2, [pw_16] -%else ; %1 == 16 -%if cpuflag(sse4) ; sse4/avx - mova m4, [pd_4] -%else ; mmx/sse2 - mova m4, [pd_4min0x40000] - mova m5, [minshort] -%endif ; mmx/sse2/sse4/avx -%endif ; %1 == .. - - ; actual pixel scaling -%if mmsize == 8 - yuv2plane1_mainloop %1, a -%else ; mmsize == 16 - test dstq, 15 - jnz .unaligned - yuv2plane1_mainloop %1, a - REP_RET -.unaligned: - yuv2plane1_mainloop %1, u -%endif ; mmsize == 8/16 - REP_RET -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmx -yuv2plane1_fn 8, 0, 5 -yuv2plane1_fn 16, 0, 3 - -INIT_MMX mmxext -yuv2plane1_fn 9, 0, 3 -yuv2plane1_fn 10, 0, 3 -%endif - -INIT_XMM sse2 -yuv2plane1_fn 8, 5, 5 -yuv2plane1_fn 9, 5, 3 -yuv2plane1_fn 10, 5, 3 -yuv2plane1_fn 16, 6, 3 - -INIT_XMM sse4 -yuv2plane1_fn 16, 5, 3 - -%if HAVE_AVX_EXTERNAL -INIT_XMM avx -yuv2plane1_fn 8, 5, 5 -yuv2plane1_fn 9, 5, 3 -yuv2plane1_fn 10, 5, 3 -yuv2plane1_fn 16, 5, 3 -%endif diff --git a/ffmpeg/libswscale/x86/rgb2rgb.c b/ffmpeg/libswscale/x86/rgb2rgb.c deleted file mode 100644 index 8cc99c6..0000000 --- a/ffmpeg/libswscale/x86/rgb2rgb.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - * software RGB to RGB converter - * pluralize by software PAL8 to RGB converter - * software YUV to YUV converter - * software YUV to RGB converter - * Written by Nick Kurshev. - * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
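
[Editor's note on the output.asm deletion above: its header comment spells out the yuv2planeX contract; a scalar equivalent of the 8-bit path, mirroring the rotated dither and the 27-8 = 19 bit shift visible in the asm, would look roughly like the sketch below. It is not the file's own C code; av_clip_uint8() is the libavutil clipping helper.]

    #include <stdint.h>
    #include "libavutil/common.h"   /* av_clip_uint8() */

    /* Scalar sketch of yuv2planeX_8: start from the dither value scaled to the
     * accumulator's precision, add filterSize weighted source lines, then shift
     * down and clip to 8 bits. offset is 0 or 3 and rotates the 8 dither values. */
    static void yuv2planeX_8_ref(const int16_t *filter, int filterSize,
                                 const int16_t **src, uint8_t *dst, int dstW,
                                 const uint8_t *dither, int offset)
    {
        for (int i = 0; i < dstW; i++) {
            int val = dither[(i + offset) & 7] << 12;
            for (int j = 0; j < filterSize; j++)
                val += src[j][i] * filter[j];
            dst[i] = av_clip_uint8(val >> 19);
        }
    }
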
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdint.h> - -#include "config.h" -#include "libavutil/attributes.h" -#include "libavutil/x86/asm.h" -#include "libavutil/x86/cpu.h" -#include "libavutil/cpu.h" -#include "libavutil/bswap.h" -#include "libswscale/rgb2rgb.h" -#include "libswscale/swscale.h" -#include "libswscale/swscale_internal.h" - -#if HAVE_INLINE_ASM - -DECLARE_ASM_CONST(8, uint64_t, mmx_ff) = 0x00000000000000FFULL; -DECLARE_ASM_CONST(8, uint64_t, mmx_null) = 0x0000000000000000ULL; -DECLARE_ASM_CONST(8, uint64_t, mmx_one) = 0xFFFFFFFFFFFFFFFFULL; -DECLARE_ASM_CONST(8, uint64_t, mask32b) = 0x000000FF000000FFULL; -DECLARE_ASM_CONST(8, uint64_t, mask32g) = 0x0000FF000000FF00ULL; -DECLARE_ASM_CONST(8, uint64_t, mask32r) = 0x00FF000000FF0000ULL; -DECLARE_ASM_CONST(8, uint64_t, mask32a) = 0xFF000000FF000000ULL; -DECLARE_ASM_CONST(8, uint64_t, mask32) = 0x00FFFFFF00FFFFFFULL; -DECLARE_ASM_CONST(8, uint64_t, mask3216br) = 0x00F800F800F800F8ULL; -DECLARE_ASM_CONST(8, uint64_t, mask3216g) = 0x0000FC000000FC00ULL; -DECLARE_ASM_CONST(8, uint64_t, mask3215g) = 0x0000F8000000F800ULL; -DECLARE_ASM_CONST(8, uint64_t, mul3216) = 0x2000000420000004ULL; -DECLARE_ASM_CONST(8, uint64_t, mul3215) = 0x2000000820000008ULL; -DECLARE_ASM_CONST(8, uint64_t, mask24b) = 0x00FF0000FF0000FFULL; -DECLARE_ASM_CONST(8, uint64_t, mask24g) = 0xFF0000FF0000FF00ULL; -DECLARE_ASM_CONST(8, uint64_t, mask24r) = 0x0000FF0000FF0000ULL; -DECLARE_ASM_CONST(8, uint64_t, mask24l) = 0x0000000000FFFFFFULL; -DECLARE_ASM_CONST(8, uint64_t, mask24h) = 0x0000FFFFFF000000ULL; -DECLARE_ASM_CONST(8, uint64_t, mask24hh) = 0xffff000000000000ULL; -DECLARE_ASM_CONST(8, uint64_t, mask24hhh) = 0xffffffff00000000ULL; -DECLARE_ASM_CONST(8, uint64_t, mask24hhhh) = 0xffffffffffff0000ULL; -DECLARE_ASM_CONST(8, uint64_t, mask15b) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */ -DECLARE_ASM_CONST(8, uint64_t, mask15rg) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */ -DECLARE_ASM_CONST(8, uint64_t, mask15s) = 0xFFE0FFE0FFE0FFE0ULL; -DECLARE_ASM_CONST(8, uint64_t, mask15g) = 0x03E003E003E003E0ULL; -DECLARE_ASM_CONST(8, uint64_t, mask15r) = 0x7C007C007C007C00ULL; -#define mask16b mask15b -DECLARE_ASM_CONST(8, uint64_t, mask16g) = 0x07E007E007E007E0ULL; -DECLARE_ASM_CONST(8, uint64_t, mask16r) = 0xF800F800F800F800ULL; -DECLARE_ASM_CONST(8, uint64_t, red_16mask) = 0x0000f8000000f800ULL; -DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL; -DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL; -DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL; -DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL; -DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL; -DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL; -DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL; -DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL; - -#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5)) -#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5)) -#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5)) -#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5)) -#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5)) -#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5)) -#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5)) -#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5)) -#define RU 
((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5)) - -// Note: We have C, MMX, MMXEXT, 3DNOW versions, there is no 3DNOW + MMXEXT one. - -#define COMPILE_TEMPLATE_MMXEXT 0 -#define COMPILE_TEMPLATE_AMD3DNOW 0 -#define COMPILE_TEMPLATE_SSE2 0 -#define COMPILE_TEMPLATE_AVX 0 - -//MMX versions -#undef RENAME -#define RENAME(a) a ## _mmx -#include "rgb2rgb_template.c" - -// MMXEXT versions -#undef RENAME -#undef COMPILE_TEMPLATE_MMXEXT -#define COMPILE_TEMPLATE_MMXEXT 1 -#define RENAME(a) a ## _mmxext -#include "rgb2rgb_template.c" - -//SSE2 versions -#undef RENAME -#undef COMPILE_TEMPLATE_SSE2 -#define COMPILE_TEMPLATE_SSE2 1 -#define RENAME(a) a ## _sse2 -#include "rgb2rgb_template.c" - -//AVX versions -#undef RENAME -#undef COMPILE_TEMPLATE_AVX -#define COMPILE_TEMPLATE_AVX 1 -#define RENAME(a) a ## _avx -#include "rgb2rgb_template.c" - -//3DNOW versions -#undef RENAME -#undef COMPILE_TEMPLATE_MMXEXT -#undef COMPILE_TEMPLATE_SSE2 -#undef COMPILE_TEMPLATE_AVX -#undef COMPILE_TEMPLATE_AMD3DNOW -#define COMPILE_TEMPLATE_MMXEXT 0 -#define COMPILE_TEMPLATE_SSE2 0 -#define COMPILE_TEMPLATE_AVX 0 -#define COMPILE_TEMPLATE_AMD3DNOW 1 -#define RENAME(a) a ## _3dnow -#include "rgb2rgb_template.c" - -/* - RGB15->RGB16 original by Strepto/Astral - ported to gcc & bugfixed : A'rpi - MMXEXT, 3DNOW optimization by Nick Kurshev - 32-bit C version, and and&add trick by Michael Niedermayer -*/ - -#endif /* HAVE_INLINE_ASM */ - -av_cold void rgb2rgb_init_x86(void) -{ -#if HAVE_INLINE_ASM - int cpu_flags = av_get_cpu_flags(); - - if (INLINE_MMX(cpu_flags)) - rgb2rgb_init_mmx(); - if (INLINE_AMD3DNOW(cpu_flags)) - rgb2rgb_init_3dnow(); - if (INLINE_MMXEXT(cpu_flags)) - rgb2rgb_init_mmxext(); - if (INLINE_SSE2(cpu_flags)) - rgb2rgb_init_sse2(); - if (INLINE_AVX(cpu_flags)) - rgb2rgb_init_avx(); -#endif /* HAVE_INLINE_ASM */ -} diff --git a/ffmpeg/libswscale/x86/rgb2rgb_template.c b/ffmpeg/libswscale/x86/rgb2rgb_template.c deleted file mode 100644 index d58219b..0000000 --- a/ffmpeg/libswscale/x86/rgb2rgb_template.c +++ /dev/null @@ -1,2533 +0,0 @@ -/* - * software RGB to RGB converter - * pluralize by software PAL8 to RGB converter - * software YUV to YUV converter - * software YUV to RGB converter - * Written by Nick Kurshev. - * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) - * lot of big-endian byte order fixes by Alex Beregszaszi - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
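
[Editor's note on the rgb2rgb.c deletion above: the RENAME/COMPILE_TEMPLATE_* passes are FFmpeg's usual single-source, multi-ISA template trick. In miniature, and with a hypothetical function name, the mechanism is the token pasting shown below; rgb2rgb.c repeats it with _mmx, _mmxext, _sse2, _avx and _3dnow suffixes by re-including rgb2rgb_template.c, and rgb2rgb_init_x86() installs whichever variants av_get_cpu_flags() allows.]

    #include <stdint.h>

    /* The body below would live in its own template file (inlined here only for
     * illustration); RENAME(x) pastes an ISA suffix onto each function name. */
    #define RENAME(a) a ## _sse2
    static void RENAME(copy_row)(uint8_t *dst, const uint8_t *src, int n)
    {
        for (int i = 0; i < n; i++)
            dst[i] = src[i];            /* this expansion defines copy_row_sse2() */
    }
    #undef RENAME
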
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stddef.h> - -#include "libavutil/attributes.h" - -#undef PREFETCH -#undef MOVNTQ -#undef EMMS -#undef SFENCE -#undef PAVGB - -#if COMPILE_TEMPLATE_AMD3DNOW -#define PREFETCH "prefetch" -#define PAVGB "pavgusb" -#elif COMPILE_TEMPLATE_MMXEXT -#define PREFETCH "prefetchnta" -#define PAVGB "pavgb" -#else -#define PREFETCH " # nop" -#endif - -#if COMPILE_TEMPLATE_AMD3DNOW -/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */ -#define EMMS "femms" -#else -#define EMMS "emms" -#endif - -#if COMPILE_TEMPLATE_MMXEXT -#define MOVNTQ "movntq" -#define SFENCE "sfence" -#else -#define MOVNTQ "movq" -#define SFENCE " # nop" -#endif - -#if !COMPILE_TEMPLATE_SSE2 - -#if !COMPILE_TEMPLATE_AMD3DNOW - -static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size) -{ - uint8_t *dest = dst; - const uint8_t *s = src; - const uint8_t *end; - const uint8_t *mm_end; - end = s + src_size; - __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); - mm_end = end - 23; - __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory"); - while (s < mm_end) { - __asm__ volatile( - PREFETCH" 32(%1) \n\t" - "movd (%1), %%mm0 \n\t" - "punpckldq 3(%1), %%mm0 \n\t" - "movd 6(%1), %%mm1 \n\t" - "punpckldq 9(%1), %%mm1 \n\t" - "movd 12(%1), %%mm2 \n\t" - "punpckldq 15(%1), %%mm2 \n\t" - "movd 18(%1), %%mm3 \n\t" - "punpckldq 21(%1), %%mm3 \n\t" - "por %%mm7, %%mm0 \n\t" - "por %%mm7, %%mm1 \n\t" - "por %%mm7, %%mm2 \n\t" - "por %%mm7, %%mm3 \n\t" - MOVNTQ" %%mm0, (%0) \n\t" - MOVNTQ" %%mm1, 8(%0) \n\t" - MOVNTQ" %%mm2, 16(%0) \n\t" - MOVNTQ" %%mm3, 24(%0)" - :: "r"(dest), "r"(s) - :"memory"); - dest += 32; - s += 24; - } - __asm__ volatile(SFENCE:::"memory"); - __asm__ volatile(EMMS:::"memory"); - while (s < end) { - *dest++ = *s++; - *dest++ = *s++; - *dest++ = *s++; - *dest++ = 255; - } -} - -#define STORE_BGR24_MMX \ - "psrlq $8, %%mm2 \n\t" \ - "psrlq $8, %%mm3 \n\t" \ - "psrlq $8, %%mm6 \n\t" \ - "psrlq $8, %%mm7 \n\t" \ - "pand "MANGLE(mask24l)", %%mm0\n\t" \ - "pand "MANGLE(mask24l)", %%mm1\n\t" \ - "pand "MANGLE(mask24l)", %%mm4\n\t" \ - "pand "MANGLE(mask24l)", %%mm5\n\t" \ - "pand "MANGLE(mask24h)", %%mm2\n\t" \ - "pand "MANGLE(mask24h)", %%mm3\n\t" \ - "pand "MANGLE(mask24h)", %%mm6\n\t" \ - "pand "MANGLE(mask24h)", %%mm7\n\t" \ - "por %%mm2, %%mm0 \n\t" \ - "por %%mm3, %%mm1 \n\t" \ - "por %%mm6, %%mm4 \n\t" \ - "por %%mm7, %%mm5 \n\t" \ - \ - "movq %%mm1, %%mm2 \n\t" \ - "movq %%mm4, %%mm3 \n\t" \ - "psllq $48, %%mm2 \n\t" \ - "psllq $32, %%mm3 \n\t" \ - "por %%mm2, %%mm0 \n\t" \ - "psrlq $16, %%mm1 \n\t" \ - "psrlq $32, %%mm4 \n\t" \ - "psllq $16, %%mm5 \n\t" \ - "por %%mm3, %%mm1 \n\t" \ - "por %%mm5, %%mm4 \n\t" \ - \ - MOVNTQ" %%mm0, (%0) \n\t" \ - MOVNTQ" %%mm1, 8(%0) \n\t" \ - MOVNTQ" %%mm4, 16(%0)" - - -static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size) -{ - uint8_t *dest = dst; - const uint8_t *s = src; - const uint8_t *end; - const uint8_t *mm_end; - end = s + src_size; - __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); - mm_end = end - 31; - while (s < mm_end) { - __asm__ volatile( - PREFETCH" 32(%1) \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "movq 16(%1), %%mm4 \n\t" - "movq 24(%1), %%mm5 \n\t" - "movq %%mm0, %%mm2 \n\t" - "movq %%mm1, %%mm3 \n\t" - "movq %%mm4, %%mm6 
\n\t" - "movq %%mm5, %%mm7 \n\t" - STORE_BGR24_MMX - :: "r"(dest), "r"(s) - :"memory"); - dest += 24; - s += 32; - } - __asm__ volatile(SFENCE:::"memory"); - __asm__ volatile(EMMS:::"memory"); - while (s < end) { - *dest++ = *s++; - *dest++ = *s++; - *dest++ = *s++; - s++; - } -} - -/* - original by Strepto/Astral - ported to gcc & bugfixed: A'rpi - MMXEXT, 3DNOW optimization by Nick Kurshev - 32-bit C version, and and&add trick by Michael Niedermayer -*/ -static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size) -{ - register const uint8_t* s=src; - register uint8_t* d=dst; - register const uint8_t *end; - const uint8_t *mm_end; - end = s + src_size; - __asm__ volatile(PREFETCH" %0"::"m"(*s)); - __asm__ volatile("movq %0, %%mm4"::"m"(mask15s)); - mm_end = end - 15; - while (s<mm_end) { - __asm__ volatile( - PREFETCH" 32(%1) \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "pand %%mm4, %%mm0 \n\t" - "pand %%mm4, %%mm2 \n\t" - "paddw %%mm1, %%mm0 \n\t" - "paddw %%mm3, %%mm2 \n\t" - MOVNTQ" %%mm0, (%0) \n\t" - MOVNTQ" %%mm2, 8(%0)" - :: "r"(d), "r"(s) - ); - d+=16; - s+=16; - } - __asm__ volatile(SFENCE:::"memory"); - __asm__ volatile(EMMS:::"memory"); - mm_end = end - 3; - while (s < mm_end) { - register unsigned x= *((const uint32_t *)s); - *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0); - d+=4; - s+=4; - } - if (s < end) { - register unsigned short x= *((const uint16_t *)s); - *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0); - } -} - -static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size) -{ - register const uint8_t* s=src; - register uint8_t* d=dst; - register const uint8_t *end; - const uint8_t *mm_end; - end = s + src_size; - __asm__ volatile(PREFETCH" %0"::"m"(*s)); - __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg)); - __asm__ volatile("movq %0, %%mm6"::"m"(mask15b)); - mm_end = end - 15; - while (s<mm_end) { - __asm__ volatile( - PREFETCH" 32(%1) \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "psrlq $1, %%mm0 \n\t" - "psrlq $1, %%mm2 \n\t" - "pand %%mm7, %%mm0 \n\t" - "pand %%mm7, %%mm2 \n\t" - "pand %%mm6, %%mm1 \n\t" - "pand %%mm6, %%mm3 \n\t" - "por %%mm1, %%mm0 \n\t" - "por %%mm3, %%mm2 \n\t" - MOVNTQ" %%mm0, (%0) \n\t" - MOVNTQ" %%mm2, 8(%0)" - :: "r"(d), "r"(s) - ); - d+=16; - s+=16; - } - __asm__ volatile(SFENCE:::"memory"); - __asm__ volatile(EMMS:::"memory"); - mm_end = end - 3; - while (s < mm_end) { - register uint32_t x= *((const uint32_t*)s); - *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F); - s+=4; - d+=4; - } - if (s < end) { - register uint16_t x= *((const uint16_t*)s); - *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F); - } -} - -static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size) -{ - const uint8_t *s = src; - const uint8_t *end; - const uint8_t *mm_end; - uint16_t *d = (uint16_t *)dst; - end = s + src_size; - mm_end = end - 15; - __asm__ volatile( - "movq %3, %%mm5 \n\t" - "movq %4, %%mm6 \n\t" - "movq %5, %%mm7 \n\t" - "jmp 2f \n\t" - ".p2align 4 \n\t" - "1: \n\t" - PREFETCH" 32(%1) \n\t" - "movd (%1), %%mm0 \n\t" - "movd 4(%1), %%mm3 \n\t" - "punpckldq 8(%1), %%mm0 \n\t" - "punpckldq 12(%1), %%mm3 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm3, %%mm4 \n\t" - "pand %%mm6, %%mm0 \n\t" - "pand %%mm6, %%mm3 \n\t" - "pmaddwd %%mm7, %%mm0 \n\t" - "pmaddwd %%mm7, %%mm3 \n\t" - "pand %%mm5, %%mm1 \n\t" - "pand %%mm5, %%mm4 \n\t" - "por %%mm1, 
%%mm0 \n\t" - "por %%mm4, %%mm3 \n\t" - "psrld $5, %%mm0 \n\t" - "pslld $11, %%mm3 \n\t" - "por %%mm3, %%mm0 \n\t" - MOVNTQ" %%mm0, (%0) \n\t" - "add $16, %1 \n\t" - "add $8, %0 \n\t" - "2: \n\t" - "cmp %2, %1 \n\t" - " jb 1b \n\t" - : "+r" (d), "+r"(s) - : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) - ); - __asm__ volatile(SFENCE:::"memory"); - __asm__ volatile(EMMS:::"memory"); - while (s < end) { - register int rgb = *(const uint32_t*)s; s += 4; - *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8); - } -} - -static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size) -{ - const uint8_t *s = src; - const uint8_t *end; - const uint8_t *mm_end; - uint16_t *d = (uint16_t *)dst; - end = s + src_size; - __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); - __asm__ volatile( - "movq %0, %%mm7 \n\t" - "movq %1, %%mm6 \n\t" - ::"m"(red_16mask),"m"(green_16mask)); - mm_end = end - 15; - while (s < mm_end) { - __asm__ volatile( - PREFETCH" 32(%1) \n\t" - "movd (%1), %%mm0 \n\t" - "movd 4(%1), %%mm3 \n\t" - "punpckldq 8(%1), %%mm0 \n\t" - "punpckldq 12(%1), %%mm3 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "movq %%mm3, %%mm4 \n\t" - "movq %%mm3, %%mm5 \n\t" - "psllq $8, %%mm0 \n\t" - "psllq $8, %%mm3 \n\t" - "pand %%mm7, %%mm0 \n\t" - "pand %%mm7, %%mm3 \n\t" - "psrlq $5, %%mm1 \n\t" - "psrlq $5, %%mm4 \n\t" - "pand %%mm6, %%mm1 \n\t" - "pand %%mm6, %%mm4 \n\t" - "psrlq $19, %%mm2 \n\t" - "psrlq $19, %%mm5 \n\t" - "pand %2, %%mm2 \n\t" - "pand %2, %%mm5 \n\t" - "por %%mm1, %%mm0 \n\t" - "por %%mm4, %%mm3 \n\t" - "por %%mm2, %%mm0 \n\t" - "por %%mm5, %%mm3 \n\t" - "psllq $16, %%mm3 \n\t" - "por %%mm3, %%mm0 \n\t" - MOVNTQ" %%mm0, (%0) \n\t" - :: "r"(d),"r"(s),"m"(blue_16mask):"memory"); - d += 4; - s += 16; - } - __asm__ volatile(SFENCE:::"memory"); - __asm__ volatile(EMMS:::"memory"); - while (s < end) { - register int rgb = *(const uint32_t*)s; s += 4; - *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19); - } -} - -static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size) -{ - const uint8_t *s = src; - const uint8_t *end; - const uint8_t *mm_end; - uint16_t *d = (uint16_t *)dst; - end = s + src_size; - mm_end = end - 15; - __asm__ volatile( - "movq %3, %%mm5 \n\t" - "movq %4, %%mm6 \n\t" - "movq %5, %%mm7 \n\t" - "jmp 2f \n\t" - ".p2align 4 \n\t" - "1: \n\t" - PREFETCH" 32(%1) \n\t" - "movd (%1), %%mm0 \n\t" - "movd 4(%1), %%mm3 \n\t" - "punpckldq 8(%1), %%mm0 \n\t" - "punpckldq 12(%1), %%mm3 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm3, %%mm4 \n\t" - "pand %%mm6, %%mm0 \n\t" - "pand %%mm6, %%mm3 \n\t" - "pmaddwd %%mm7, %%mm0 \n\t" - "pmaddwd %%mm7, %%mm3 \n\t" - "pand %%mm5, %%mm1 \n\t" - "pand %%mm5, %%mm4 \n\t" - "por %%mm1, %%mm0 \n\t" - "por %%mm4, %%mm3 \n\t" - "psrld $6, %%mm0 \n\t" - "pslld $10, %%mm3 \n\t" - "por %%mm3, %%mm0 \n\t" - MOVNTQ" %%mm0, (%0) \n\t" - "add $16, %1 \n\t" - "add $8, %0 \n\t" - "2: \n\t" - "cmp %2, %1 \n\t" - " jb 1b \n\t" - : "+r" (d), "+r"(s) - : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) - ); - __asm__ volatile(SFENCE:::"memory"); - __asm__ volatile(EMMS:::"memory"); - while (s < end) { - register int rgb = *(const uint32_t*)s; s += 4; - *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9); - } -} - -static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size) -{ - const uint8_t *s = src; - const uint8_t *end; - const uint8_t *mm_end; - uint16_t *d = (uint16_t *)dst; - end = s + 
src_size; - __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); - __asm__ volatile( - "movq %0, %%mm7 \n\t" - "movq %1, %%mm6 \n\t" - ::"m"(red_15mask),"m"(green_15mask)); - mm_end = end - 15; - while (s < mm_end) { - __asm__ volatile( - PREFETCH" 32(%1) \n\t" - "movd (%1), %%mm0 \n\t" - "movd 4(%1), %%mm3 \n\t" - "punpckldq 8(%1), %%mm0 \n\t" - "punpckldq 12(%1), %%mm3 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "movq %%mm3, %%mm4 \n\t" - "movq %%mm3, %%mm5 \n\t" - "psllq $7, %%mm0 \n\t" - "psllq $7, %%mm3 \n\t" - "pand %%mm7, %%mm0 \n\t" - "pand %%mm7, %%mm3 \n\t" - "psrlq $6, %%mm1 \n\t" - "psrlq $6, %%mm4 \n\t" - "pand %%mm6, %%mm1 \n\t" - "pand %%mm6, %%mm4 \n\t" - "psrlq $19, %%mm2 \n\t" - "psrlq $19, %%mm5 \n\t" - "pand %2, %%mm2 \n\t" - "pand %2, %%mm5 \n\t" - "por %%mm1, %%mm0 \n\t" - "por %%mm4, %%mm3 \n\t" - "por %%mm2, %%mm0 \n\t" - "por %%mm5, %%mm3 \n\t" - "psllq $16, %%mm3 \n\t" - "por %%mm3, %%mm0 \n\t" - MOVNTQ" %%mm0, (%0) \n\t" - ::"r"(d),"r"(s),"m"(blue_15mask):"memory"); - d += 4; - s += 16; - } - __asm__ volatile(SFENCE:::"memory"); - __asm__ volatile(EMMS:::"memory"); - while (s < end) { - register int rgb = *(const uint32_t*)s; s += 4; - *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19); - } -} - -static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size) -{ - const uint8_t *s = src; - const uint8_t *end; - const uint8_t *mm_end; - uint16_t *d = (uint16_t *)dst; - end = s + src_size; - __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); - __asm__ volatile( - "movq %0, %%mm7 \n\t" - "movq %1, %%mm6 \n\t" - ::"m"(red_16mask),"m"(green_16mask)); - mm_end = end - 11; - while (s < mm_end) { - __asm__ volatile( - PREFETCH" 32(%1) \n\t" - "movd (%1), %%mm0 \n\t" - "movd 3(%1), %%mm3 \n\t" - "punpckldq 6(%1), %%mm0 \n\t" - "punpckldq 9(%1), %%mm3 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "movq %%mm3, %%mm4 \n\t" - "movq %%mm3, %%mm5 \n\t" - "psrlq $3, %%mm0 \n\t" - "psrlq $3, %%mm3 \n\t" - "pand %2, %%mm0 \n\t" - "pand %2, %%mm3 \n\t" - "psrlq $5, %%mm1 \n\t" - "psrlq $5, %%mm4 \n\t" - "pand %%mm6, %%mm1 \n\t" - "pand %%mm6, %%mm4 \n\t" - "psrlq $8, %%mm2 \n\t" - "psrlq $8, %%mm5 \n\t" - "pand %%mm7, %%mm2 \n\t" - "pand %%mm7, %%mm5 \n\t" - "por %%mm1, %%mm0 \n\t" - "por %%mm4, %%mm3 \n\t" - "por %%mm2, %%mm0 \n\t" - "por %%mm5, %%mm3 \n\t" - "psllq $16, %%mm3 \n\t" - "por %%mm3, %%mm0 \n\t" - MOVNTQ" %%mm0, (%0) \n\t" - ::"r"(d),"r"(s),"m"(blue_16mask):"memory"); - d += 4; - s += 12; - } - __asm__ volatile(SFENCE:::"memory"); - __asm__ volatile(EMMS:::"memory"); - while (s < end) { - const int b = *s++; - const int g = *s++; - const int r = *s++; - *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); - } -} - -static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size) -{ - const uint8_t *s = src; - const uint8_t *end; - const uint8_t *mm_end; - uint16_t *d = (uint16_t *)dst; - end = s + src_size; - __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); - __asm__ volatile( - "movq %0, %%mm7 \n\t" - "movq %1, %%mm6 \n\t" - ::"m"(red_16mask),"m"(green_16mask)); - mm_end = end - 15; - while (s < mm_end) { - __asm__ volatile( - PREFETCH" 32(%1) \n\t" - "movd (%1), %%mm0 \n\t" - "movd 3(%1), %%mm3 \n\t" - "punpckldq 6(%1), %%mm0 \n\t" - "punpckldq 9(%1), %%mm3 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "movq %%mm3, %%mm4 \n\t" - "movq %%mm3, %%mm5 \n\t" - "psllq $8, %%mm0 \n\t" - "psllq $8, %%mm3 \n\t" - "pand %%mm7, %%mm0 \n\t" - "pand %%mm7, %%mm3 \n\t" - 
"psrlq $5, %%mm1 \n\t" - "psrlq $5, %%mm4 \n\t" - "pand %%mm6, %%mm1 \n\t" - "pand %%mm6, %%mm4 \n\t" - "psrlq $19, %%mm2 \n\t" - "psrlq $19, %%mm5 \n\t" - "pand %2, %%mm2 \n\t" - "pand %2, %%mm5 \n\t" - "por %%mm1, %%mm0 \n\t" - "por %%mm4, %%mm3 \n\t" - "por %%mm2, %%mm0 \n\t" - "por %%mm5, %%mm3 \n\t" - "psllq $16, %%mm3 \n\t" - "por %%mm3, %%mm0 \n\t" - MOVNTQ" %%mm0, (%0) \n\t" - ::"r"(d),"r"(s),"m"(blue_16mask):"memory"); - d += 4; - s += 12; - } - __asm__ volatile(SFENCE:::"memory"); - __asm__ volatile(EMMS:::"memory"); - while (s < end) { - const int r = *s++; - const int g = *s++; - const int b = *s++; - *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); - } -} - -static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size) -{ - const uint8_t *s = src; - const uint8_t *end; - const uint8_t *mm_end; - uint16_t *d = (uint16_t *)dst; - end = s + src_size; - __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); - __asm__ volatile( - "movq %0, %%mm7 \n\t" - "movq %1, %%mm6 \n\t" - ::"m"(red_15mask),"m"(green_15mask)); - mm_end = end - 11; - while (s < mm_end) { - __asm__ volatile( - PREFETCH" 32(%1) \n\t" - "movd (%1), %%mm0 \n\t" - "movd 3(%1), %%mm3 \n\t" - "punpckldq 6(%1), %%mm0 \n\t" - "punpckldq 9(%1), %%mm3 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "movq %%mm3, %%mm4 \n\t" - "movq %%mm3, %%mm5 \n\t" - "psrlq $3, %%mm0 \n\t" - "psrlq $3, %%mm3 \n\t" - "pand %2, %%mm0 \n\t" - "pand %2, %%mm3 \n\t" - "psrlq $6, %%mm1 \n\t" - "psrlq $6, %%mm4 \n\t" - "pand %%mm6, %%mm1 \n\t" - "pand %%mm6, %%mm4 \n\t" - "psrlq $9, %%mm2 \n\t" - "psrlq $9, %%mm5 \n\t" - "pand %%mm7, %%mm2 \n\t" - "pand %%mm7, %%mm5 \n\t" - "por %%mm1, %%mm0 \n\t" - "por %%mm4, %%mm3 \n\t" - "por %%mm2, %%mm0 \n\t" - "por %%mm5, %%mm3 \n\t" - "psllq $16, %%mm3 \n\t" - "por %%mm3, %%mm0 \n\t" - MOVNTQ" %%mm0, (%0) \n\t" - ::"r"(d),"r"(s),"m"(blue_15mask):"memory"); - d += 4; - s += 12; - } - __asm__ volatile(SFENCE:::"memory"); - __asm__ volatile(EMMS:::"memory"); - while (s < end) { - const int b = *s++; - const int g = *s++; - const int r = *s++; - *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); - } -} - -static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size) -{ - const uint8_t *s = src; - const uint8_t *end; - const uint8_t *mm_end; - uint16_t *d = (uint16_t *)dst; - end = s + src_size; - __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); - __asm__ volatile( - "movq %0, %%mm7 \n\t" - "movq %1, %%mm6 \n\t" - ::"m"(red_15mask),"m"(green_15mask)); - mm_end = end - 15; - while (s < mm_end) { - __asm__ volatile( - PREFETCH" 32(%1) \n\t" - "movd (%1), %%mm0 \n\t" - "movd 3(%1), %%mm3 \n\t" - "punpckldq 6(%1), %%mm0 \n\t" - "punpckldq 9(%1), %%mm3 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "movq %%mm3, %%mm4 \n\t" - "movq %%mm3, %%mm5 \n\t" - "psllq $7, %%mm0 \n\t" - "psllq $7, %%mm3 \n\t" - "pand %%mm7, %%mm0 \n\t" - "pand %%mm7, %%mm3 \n\t" - "psrlq $6, %%mm1 \n\t" - "psrlq $6, %%mm4 \n\t" - "pand %%mm6, %%mm1 \n\t" - "pand %%mm6, %%mm4 \n\t" - "psrlq $19, %%mm2 \n\t" - "psrlq $19, %%mm5 \n\t" - "pand %2, %%mm2 \n\t" - "pand %2, %%mm5 \n\t" - "por %%mm1, %%mm0 \n\t" - "por %%mm4, %%mm3 \n\t" - "por %%mm2, %%mm0 \n\t" - "por %%mm5, %%mm3 \n\t" - "psllq $16, %%mm3 \n\t" - "por %%mm3, %%mm0 \n\t" - MOVNTQ" %%mm0, (%0) \n\t" - ::"r"(d),"r"(s),"m"(blue_15mask):"memory"); - d += 4; - s += 12; - } - __asm__ volatile(SFENCE:::"memory"); - __asm__ volatile(EMMS:::"memory"); - while (s < end) { - const int r = *s++; - const int g = *s++; - 
const int b = *s++; - *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); - } -} - -static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size) -{ - const uint16_t *end; - const uint16_t *mm_end; - uint8_t *d = dst; - const uint16_t *s = (const uint16_t*)src; - end = s + src_size/2; - __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); - mm_end = end - 7; - while (s < mm_end) { - __asm__ volatile( - PREFETCH" 32(%1) \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1), %%mm1 \n\t" - "movq (%1), %%mm2 \n\t" - "pand %2, %%mm0 \n\t" - "pand %3, %%mm1 \n\t" - "pand %4, %%mm2 \n\t" - "psllq $5, %%mm0 \n\t" - "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" - "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t" - "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" - "movq %%mm0, %%mm3 \n\t" - "movq %%mm1, %%mm4 \n\t" - "movq %%mm2, %%mm5 \n\t" - "punpcklwd %5, %%mm0 \n\t" - "punpcklwd %5, %%mm1 \n\t" - "punpcklwd %5, %%mm2 \n\t" - "punpckhwd %5, %%mm3 \n\t" - "punpckhwd %5, %%mm4 \n\t" - "punpckhwd %5, %%mm5 \n\t" - "psllq $8, %%mm1 \n\t" - "psllq $16, %%mm2 \n\t" - "por %%mm1, %%mm0 \n\t" - "por %%mm2, %%mm0 \n\t" - "psllq $8, %%mm4 \n\t" - "psllq $16, %%mm5 \n\t" - "por %%mm4, %%mm3 \n\t" - "por %%mm5, %%mm3 \n\t" - - "movq %%mm0, %%mm6 \n\t" - "movq %%mm3, %%mm7 \n\t" - - "movq 8(%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "movq 8(%1), %%mm2 \n\t" - "pand %2, %%mm0 \n\t" - "pand %3, %%mm1 \n\t" - "pand %4, %%mm2 \n\t" - "psllq $5, %%mm0 \n\t" - "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" - "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t" - "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" - "movq %%mm0, %%mm3 \n\t" - "movq %%mm1, %%mm4 \n\t" - "movq %%mm2, %%mm5 \n\t" - "punpcklwd %5, %%mm0 \n\t" - "punpcklwd %5, %%mm1 \n\t" - "punpcklwd %5, %%mm2 \n\t" - "punpckhwd %5, %%mm3 \n\t" - "punpckhwd %5, %%mm4 \n\t" - "punpckhwd %5, %%mm5 \n\t" - "psllq $8, %%mm1 \n\t" - "psllq $16, %%mm2 \n\t" - "por %%mm1, %%mm0 \n\t" - "por %%mm2, %%mm0 \n\t" - "psllq $8, %%mm4 \n\t" - "psllq $16, %%mm5 \n\t" - "por %%mm4, %%mm3 \n\t" - "por %%mm5, %%mm3 \n\t" - - :"=m"(*d) - :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) - :"memory"); - /* borrowed 32 to 24 */ - __asm__ volatile( - "movq %%mm0, %%mm4 \n\t" - "movq %%mm3, %%mm5 \n\t" - "movq %%mm6, %%mm0 \n\t" - "movq %%mm7, %%mm1 \n\t" - - "movq %%mm4, %%mm6 \n\t" - "movq %%mm5, %%mm7 \n\t" - "movq %%mm0, %%mm2 \n\t" - "movq %%mm1, %%mm3 \n\t" - - STORE_BGR24_MMX - - :: "r"(d), "m"(*s) - :"memory"); - d += 24; - s += 8; - } - __asm__ volatile(SFENCE:::"memory"); - __asm__ volatile(EMMS:::"memory"); - while (s < end) { - register uint16_t bgr; - bgr = *s++; - *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); - *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7); - *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); - } -} - -static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size) -{ - const uint16_t *end; - const uint16_t *mm_end; - uint8_t *d = (uint8_t *)dst; - const uint16_t *s = (const uint16_t *)src; - end = s + src_size/2; - __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); - mm_end = end - 7; - while (s < mm_end) { - __asm__ volatile( - PREFETCH" 32(%1) \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1), %%mm1 \n\t" - "movq (%1), %%mm2 \n\t" - "pand %2, %%mm0 \n\t" - "pand %3, %%mm1 \n\t" - "pand %4, %%mm2 \n\t" - "psllq $5, %%mm0 \n\t" - "psrlq $1, %%mm2 \n\t" - "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" - "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" - "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" - "movq %%mm0, %%mm3 \n\t" - "movq %%mm1, %%mm4 \n\t" - "movq %%mm2, %%mm5 \n\t" - 
"punpcklwd %5, %%mm0 \n\t" - "punpcklwd %5, %%mm1 \n\t" - "punpcklwd %5, %%mm2 \n\t" - "punpckhwd %5, %%mm3 \n\t" - "punpckhwd %5, %%mm4 \n\t" - "punpckhwd %5, %%mm5 \n\t" - "psllq $8, %%mm1 \n\t" - "psllq $16, %%mm2 \n\t" - "por %%mm1, %%mm0 \n\t" - "por %%mm2, %%mm0 \n\t" - "psllq $8, %%mm4 \n\t" - "psllq $16, %%mm5 \n\t" - "por %%mm4, %%mm3 \n\t" - "por %%mm5, %%mm3 \n\t" - - "movq %%mm0, %%mm6 \n\t" - "movq %%mm3, %%mm7 \n\t" - - "movq 8(%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "movq 8(%1), %%mm2 \n\t" - "pand %2, %%mm0 \n\t" - "pand %3, %%mm1 \n\t" - "pand %4, %%mm2 \n\t" - "psllq $5, %%mm0 \n\t" - "psrlq $1, %%mm2 \n\t" - "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" - "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" - "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" - "movq %%mm0, %%mm3 \n\t" - "movq %%mm1, %%mm4 \n\t" - "movq %%mm2, %%mm5 \n\t" - "punpcklwd %5, %%mm0 \n\t" - "punpcklwd %5, %%mm1 \n\t" - "punpcklwd %5, %%mm2 \n\t" - "punpckhwd %5, %%mm3 \n\t" - "punpckhwd %5, %%mm4 \n\t" - "punpckhwd %5, %%mm5 \n\t" - "psllq $8, %%mm1 \n\t" - "psllq $16, %%mm2 \n\t" - "por %%mm1, %%mm0 \n\t" - "por %%mm2, %%mm0 \n\t" - "psllq $8, %%mm4 \n\t" - "psllq $16, %%mm5 \n\t" - "por %%mm4, %%mm3 \n\t" - "por %%mm5, %%mm3 \n\t" - :"=m"(*d) - :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) - :"memory"); - /* borrowed 32 to 24 */ - __asm__ volatile( - "movq %%mm0, %%mm4 \n\t" - "movq %%mm3, %%mm5 \n\t" - "movq %%mm6, %%mm0 \n\t" - "movq %%mm7, %%mm1 \n\t" - - "movq %%mm4, %%mm6 \n\t" - "movq %%mm5, %%mm7 \n\t" - "movq %%mm0, %%mm2 \n\t" - "movq %%mm1, %%mm3 \n\t" - - STORE_BGR24_MMX - - :: "r"(d), "m"(*s) - :"memory"); - d += 24; - s += 8; - } - __asm__ volatile(SFENCE:::"memory"); - __asm__ volatile(EMMS:::"memory"); - while (s < end) { - register uint16_t bgr; - bgr = *s++; - *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); - *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9); - *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); - } -} - -/* - * mm0 = 00 B3 00 B2 00 B1 00 B0 - * mm1 = 00 G3 00 G2 00 G1 00 G0 - * mm2 = 00 R3 00 R2 00 R1 00 R0 - * mm6 = FF FF FF FF FF FF FF FF - * mm7 = 00 00 00 00 00 00 00 00 - */ -#define PACK_RGB32 \ - "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \ - "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \ - "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \ - "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \ - "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \ - "movq %%mm0, %%mm3 \n\t" \ - "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \ - "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \ - MOVNTQ" %%mm0, (%0) \n\t" \ - MOVNTQ" %%mm3, 8(%0) \n\t" \ - -static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size) -{ - const uint16_t *end; - const uint16_t *mm_end; - uint8_t *d = dst; - const uint16_t *s = (const uint16_t *)src; - end = s + src_size/2; - __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); - __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); - __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); - mm_end = end - 3; - while (s < mm_end) { - __asm__ volatile( - PREFETCH" 32(%1) \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1), %%mm1 \n\t" - "movq (%1), %%mm2 \n\t" - "pand %2, %%mm0 \n\t" - "pand %3, %%mm1 \n\t" - "pand %4, %%mm2 \n\t" - "psllq $5, %%mm0 \n\t" - "pmulhw %5, %%mm0 \n\t" - "pmulhw %5, %%mm1 \n\t" - "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" - PACK_RGB32 - ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid) - :"memory"); - d += 16; - s += 
4; - } - __asm__ volatile(SFENCE:::"memory"); - __asm__ volatile(EMMS:::"memory"); - while (s < end) { - register uint16_t bgr; - bgr = *s++; - *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); - *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7); - *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); - *d++ = 255; - } -} - -static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size) -{ - const uint16_t *end; - const uint16_t *mm_end; - uint8_t *d = dst; - const uint16_t *s = (const uint16_t*)src; - end = s + src_size/2; - __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); - __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); - __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); - mm_end = end - 3; - while (s < mm_end) { - __asm__ volatile( - PREFETCH" 32(%1) \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1), %%mm1 \n\t" - "movq (%1), %%mm2 \n\t" - "pand %2, %%mm0 \n\t" - "pand %3, %%mm1 \n\t" - "pand %4, %%mm2 \n\t" - "psllq $5, %%mm0 \n\t" - "psrlq $1, %%mm2 \n\t" - "pmulhw %5, %%mm0 \n\t" - "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" - "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" - PACK_RGB32 - ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid) - :"memory"); - d += 16; - s += 4; - } - __asm__ volatile(SFENCE:::"memory"); - __asm__ volatile(EMMS:::"memory"); - while (s < end) { - register uint16_t bgr; - bgr = *s++; - *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); - *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9); - *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); - *d++ = 255; - } -} - -static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size) -{ - x86_reg idx = 15 - src_size; - const uint8_t *s = src-idx; - uint8_t *d = dst-idx; - __asm__ volatile( - "test %0, %0 \n\t" - "jns 2f \n\t" - PREFETCH" (%1, %0) \n\t" - "movq %3, %%mm7 \n\t" - "pxor %4, %%mm7 \n\t" - "movq %%mm7, %%mm6 \n\t" - "pxor %5, %%mm7 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - PREFETCH" 32(%1, %0) \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq 8(%1, %0), %%mm1 \n\t" -# if COMPILE_TEMPLATE_MMXEXT - "pshufw $177, %%mm0, %%mm3 \n\t" - "pshufw $177, %%mm1, %%mm5 \n\t" - "pand %%mm7, %%mm0 \n\t" - "pand %%mm6, %%mm3 \n\t" - "pand %%mm7, %%mm1 \n\t" - "pand %%mm6, %%mm5 \n\t" - "por %%mm3, %%mm0 \n\t" - "por %%mm5, %%mm1 \n\t" -# else - "movq %%mm0, %%mm2 \n\t" - "movq %%mm1, %%mm4 \n\t" - "pand %%mm7, %%mm0 \n\t" - "pand %%mm6, %%mm2 \n\t" - "pand %%mm7, %%mm1 \n\t" - "pand %%mm6, %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "pslld $16, %%mm2 \n\t" - "psrld $16, %%mm3 \n\t" - "pslld $16, %%mm4 \n\t" - "psrld $16, %%mm5 \n\t" - "por %%mm2, %%mm0 \n\t" - "por %%mm4, %%mm1 \n\t" - "por %%mm3, %%mm0 \n\t" - "por %%mm5, %%mm1 \n\t" -# endif - MOVNTQ" %%mm0, (%2, %0) \n\t" - MOVNTQ" %%mm1, 8(%2, %0) \n\t" - "add $16, %0 \n\t" - "js 1b \n\t" - SFENCE" \n\t" - EMMS" \n\t" - "2: \n\t" - : "+&r"(idx) - : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one) - : "memory"); - for (; idx<15; idx+=4) { - register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00; - v &= 0xff00ff; - *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16); - } -} - -static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size) -{ - unsigned i; - x86_reg mmx_size= 23 - src_size; - __asm__ volatile ( - "test %%"REG_a", %%"REG_a" \n\t" - "jns 2f \n\t" - "movq "MANGLE(mask24r)", %%mm5 \n\t" - "movq "MANGLE(mask24g)", %%mm6 \n\t" - "movq "MANGLE(mask24b)", %%mm7 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - PREFETCH" 32(%1, %%"REG_a") \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" // 
BGR BGR BG - "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG - "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B - "psllq $16, %%mm0 \n\t" // 00 BGR BGR - "pand %%mm5, %%mm0 \n\t" - "pand %%mm6, %%mm1 \n\t" - "pand %%mm7, %%mm2 \n\t" - "por %%mm0, %%mm1 \n\t" - "por %%mm2, %%mm1 \n\t" - "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG - MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG - "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B - "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR - "pand %%mm7, %%mm0 \n\t" - "pand %%mm5, %%mm1 \n\t" - "pand %%mm6, %%mm2 \n\t" - "por %%mm0, %%mm1 \n\t" - "por %%mm2, %%mm1 \n\t" - "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B - MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R - "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR - "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG - "pand %%mm6, %%mm0 \n\t" - "pand %%mm7, %%mm1 \n\t" - "pand %%mm5, %%mm2 \n\t" - "por %%mm0, %%mm1 \n\t" - "por %%mm2, %%mm1 \n\t" - MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t" - "add $24, %%"REG_a" \n\t" - " js 1b \n\t" - "2: \n\t" - : "+a" (mmx_size) - : "r" (src-mmx_size), "r"(dst-mmx_size) - ); - - __asm__ volatile(SFENCE:::"memory"); - __asm__ volatile(EMMS:::"memory"); - - if (mmx_size==23) return; //finished, was multiple of 8 - - src+= src_size; - dst+= src_size; - src_size= 23-mmx_size; - src-= src_size; - dst-= src_size; - for (i=0; i<src_size; i+=3) { - register uint8_t x; - x = src[i + 2]; - dst[i + 1] = src[i + 1]; - dst[i + 2] = src[i + 0]; - dst[i + 0] = x; - } -} - -static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, - int width, int height, - int lumStride, int chromStride, int dstStride, int vertLumPerChroma) -{ - int y; - const x86_reg chromWidth= width>>1; - for (y=0; y<height; y++) { - //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) - __asm__ volatile( - "xor %%"REG_a", %%"REG_a" \n\t" - ".p2align 4 \n\t" - "1: \n\t" - PREFETCH" 32(%1, %%"REG_a", 2) \n\t" - PREFETCH" 32(%2, %%"REG_a") \n\t" - PREFETCH" 32(%3, %%"REG_a") \n\t" - "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) - "movq %%mm0, %%mm2 \n\t" // U(0) - "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) - "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) - "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) - - "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) - "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) - "movq %%mm3, %%mm4 \n\t" // Y(0) - "movq %%mm5, %%mm6 \n\t" // Y(8) - "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) - "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) - "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) - "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) - - MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t" - MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t" - MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t" - MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t" - - "add $8, %%"REG_a" \n\t" - "cmp %4, %%"REG_a" \n\t" - " jb 1b \n\t" - ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) - : "%"REG_a - ); - if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) { - usrc += chromStride; - vsrc += chromStride; - } - ysrc += lumStride; - dst += dstStride; - } - __asm__(EMMS" \n\t" - SFENCE" \n\t" - :::"memory"); -} - -/** - * Height should be a multiple of 2 and width should be a multiple of 16. - * (If this is a problem for anyone then tell me, and I will fix it.) 
- */ -static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, - int width, int height, - int lumStride, int chromStride, int dstStride) -{ - //FIXME interpolate chroma - RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); -} - -static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, - int width, int height, - int lumStride, int chromStride, int dstStride, int vertLumPerChroma) -{ - int y; - const x86_reg chromWidth= width>>1; - for (y=0; y<height; y++) { - //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) - __asm__ volatile( - "xor %%"REG_a", %%"REG_a" \n\t" - ".p2align 4 \n\t" - "1: \n\t" - PREFETCH" 32(%1, %%"REG_a", 2) \n\t" - PREFETCH" 32(%2, %%"REG_a") \n\t" - PREFETCH" 32(%3, %%"REG_a") \n\t" - "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) - "movq %%mm0, %%mm2 \n\t" // U(0) - "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) - "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) - "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) - - "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) - "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) - "movq %%mm0, %%mm4 \n\t" // Y(0) - "movq %%mm2, %%mm6 \n\t" // Y(8) - "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0) - "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4) - "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8) - "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12) - - MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t" - MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t" - MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t" - MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t" - - "add $8, %%"REG_a" \n\t" - "cmp %4, %%"REG_a" \n\t" - " jb 1b \n\t" - ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) - : "%"REG_a - ); - if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) { - usrc += chromStride; - vsrc += chromStride; - } - ysrc += lumStride; - dst += dstStride; - } - __asm__(EMMS" \n\t" - SFENCE" \n\t" - :::"memory"); -} - -/** - * Height should be a multiple of 2 and width should be a multiple of 16 - * (If this is a problem for anyone then tell me, and I will fix it.) - */ -static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, - int width, int height, - int lumStride, int chromStride, int dstStride) -{ - //FIXME interpolate chroma - RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); -} - -/** - * Width should be a multiple of 16. - */ -static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, - int width, int height, - int lumStride, int chromStride, int dstStride) -{ - RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); -} - -/** - * Width should be a multiple of 16. - */ -static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, - int width, int height, - int lumStride, int chromStride, int dstStride) -{ - RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); -} - -/** - * Height should be a multiple of 2 and width should be a multiple of 16. - * (If this is a problem for anyone then tell me, and I will fix it.) 
- */ -static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, - int width, int height, - int lumStride, int chromStride, int srcStride) -{ - int y; - const x86_reg chromWidth= width>>1; - for (y=0; y<height; y+=2) { - __asm__ volatile( - "xor %%"REG_a", %%"REG_a" \n\t" - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... - ".p2align 4 \n\t" - "1: \n\t" - PREFETCH" 64(%0, %%"REG_a", 4) \n\t" - "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) - "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) - "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) - "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) - "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) - "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) - "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) - "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) - "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) - "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) - - MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t" - - "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8) - "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12) - "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) - "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) - "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) - "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) - "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) - "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) - "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) - "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) - - MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t" - - "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) - "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) - "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) - "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) - "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) - "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) - "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) - "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) - - MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" - MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" - - "add $8, %%"REG_a" \n\t" - "cmp %4, %%"REG_a" \n\t" - " jb 1b \n\t" - ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) - : "memory", "%"REG_a - ); - - ydst += lumStride; - src += srcStride; - - __asm__ volatile( - "xor %%"REG_a", %%"REG_a" \n\t" - ".p2align 4 \n\t" - "1: \n\t" - PREFETCH" 64(%0, %%"REG_a", 4) \n\t" - "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) - "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) - "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) - "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) - "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) - "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) - "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) - "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) - "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) - "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) - - MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t" - MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t" - - "add $8, %%"REG_a" \n\t" - "cmp %4, %%"REG_a" \n\t" - " jb 1b \n\t" - - ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) - : "memory", "%"REG_a - ); - udst += chromStride; - vdst += chromStride; - ydst += lumStride; - src += srcStride; - } - __asm__ volatile(EMMS" \n\t" - SFENCE" \n\t" - :::"memory"); -} -#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ - -#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW -static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride) -{ - int x,y; - - dst[0]= src[0]; - - // first line - for (x=0; x<srcWidth-1; x++) { - dst[2*x+1]= (3*src[x] + src[x+1])>>2; - dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; - } - 
dst[2*srcWidth-1]= src[srcWidth-1]; - - dst+= dstStride; - - for (y=1; y<srcHeight; y++) { - const x86_reg mmxSize= srcWidth&~15; - __asm__ volatile( - "mov %4, %%"REG_a" \n\t" - "movq "MANGLE(mmx_ff)", %%mm0 \n\t" - "movq (%0, %%"REG_a"), %%mm4 \n\t" - "movq %%mm4, %%mm2 \n\t" - "psllq $8, %%mm4 \n\t" - "pand %%mm0, %%mm2 \n\t" - "por %%mm2, %%mm4 \n\t" - "movq (%1, %%"REG_a"), %%mm5 \n\t" - "movq %%mm5, %%mm3 \n\t" - "psllq $8, %%mm5 \n\t" - "pand %%mm0, %%mm3 \n\t" - "por %%mm3, %%mm5 \n\t" - "1: \n\t" - "movq (%0, %%"REG_a"), %%mm0 \n\t" - "movq (%1, %%"REG_a"), %%mm1 \n\t" - "movq 1(%0, %%"REG_a"), %%mm2 \n\t" - "movq 1(%1, %%"REG_a"), %%mm3 \n\t" - PAVGB" %%mm0, %%mm5 \n\t" - PAVGB" %%mm0, %%mm3 \n\t" - PAVGB" %%mm0, %%mm5 \n\t" - PAVGB" %%mm0, %%mm3 \n\t" - PAVGB" %%mm1, %%mm4 \n\t" - PAVGB" %%mm1, %%mm2 \n\t" - PAVGB" %%mm1, %%mm4 \n\t" - PAVGB" %%mm1, %%mm2 \n\t" - "movq %%mm5, %%mm7 \n\t" - "movq %%mm4, %%mm6 \n\t" - "punpcklbw %%mm3, %%mm5 \n\t" - "punpckhbw %%mm3, %%mm7 \n\t" - "punpcklbw %%mm2, %%mm4 \n\t" - "punpckhbw %%mm2, %%mm6 \n\t" - MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t" - MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t" - MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t" - MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t" - "add $8, %%"REG_a" \n\t" - "movq -1(%0, %%"REG_a"), %%mm4 \n\t" - "movq -1(%1, %%"REG_a"), %%mm5 \n\t" - " js 1b \n\t" - :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), - "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), - "g" (-mmxSize) - : "%"REG_a - ); - - for (x=mmxSize-1; x<srcWidth-1; x++) { - dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2; - dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2; - dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2; - dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2; - } - dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2; - dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2; - - dst+=dstStride*2; - src+=srcStride; - } - - // last line - dst[0]= src[0]; - - for (x=0; x<srcWidth-1; x++) { - dst[2*x+1]= (3*src[x] + src[x+1])>>2; - dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; - } - dst[2*srcWidth-1]= src[srcWidth-1]; - - __asm__ volatile(EMMS" \n\t" - SFENCE" \n\t" - :::"memory"); -} -#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */ - -#if !COMPILE_TEMPLATE_AMD3DNOW -/** - * Height should be a multiple of 2 and width should be a multiple of 16. - * (If this is a problem for anyone then tell me, and I will fix it.) - * Chrominance data is only taken from every second line, others are ignored. - * FIXME: Write HQ version. - */ -static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, - int width, int height, - int lumStride, int chromStride, int srcStride) -{ - int y; - const x86_reg chromWidth= width>>1; - for (y=0; y<height; y+=2) { - __asm__ volatile( - "xor %%"REG_a", %%"REG_a" \n\t" - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... 
- ".p2align 4 \n\t" - "1: \n\t" - PREFETCH" 64(%0, %%"REG_a", 4) \n\t" - "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0) - "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4) - "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0) - "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4) - "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0) - "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4) - "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0) - "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4) - "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) - "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) - - MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t" - - "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8) - "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12) - "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8) - "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12) - "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8) - "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12) - "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8) - "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12) - "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) - "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) - - MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t" - - "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) - "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) - "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) - "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) - "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) - "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) - "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) - "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) - - MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" - MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" - - "add $8, %%"REG_a" \n\t" - "cmp %4, %%"REG_a" \n\t" - " jb 1b \n\t" - ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) - : "memory", "%"REG_a - ); - - ydst += lumStride; - src += srcStride; - - __asm__ volatile( - "xor %%"REG_a", %%"REG_a" \n\t" - ".p2align 4 \n\t" - "1: \n\t" - PREFETCH" 64(%0, %%"REG_a", 4) \n\t" - "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) - "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) - "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) - "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) - "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0) - "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4) - "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8) - "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12) - "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) - "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) - - MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t" - MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t" - - "add $8, %%"REG_a" \n\t" - "cmp %4, %%"REG_a" \n\t" - " jb 1b \n\t" - - ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) - : "memory", "%"REG_a - ); - udst += chromStride; - vdst += chromStride; - ydst += lumStride; - src += srcStride; - } - __asm__ volatile(EMMS" \n\t" - SFENCE" \n\t" - :::"memory"); -} -#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ - -/** - * Height should be a multiple of 2 and width should be a multiple of 2. - * (If this is a problem for anyone then tell me, and I will fix it.) - * Chrominance data is only taken from every second line, - * others are ignored in the C version. - * FIXME: Write HQ version. 
- */ -#if HAVE_7REGS -static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, - int width, int height, - int lumStride, int chromStride, int srcStride, - int32_t *rgb2yuv) -{ -#define BGR2Y_IDX "16*4+16*32" -#define BGR2U_IDX "16*4+16*33" -#define BGR2V_IDX "16*4+16*34" - int y; - const x86_reg chromWidth= width>>1; - for (y=0; y<height-2; y+=2) { - int i; - for (i=0; i<2; i++) { - __asm__ volatile( - "mov %2, %%"REG_a" \n\t" - "movq "BGR2Y_IDX"(%3), %%mm6 \n\t" - "movq "MANGLE(ff_w1111)", %%mm5 \n\t" - "pxor %%mm7, %%mm7 \n\t" - "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" - ".p2align 4 \n\t" - "1: \n\t" - PREFETCH" 64(%0, %%"REG_d") \n\t" - "movd (%0, %%"REG_d"), %%mm0 \n\t" - "movd 3(%0, %%"REG_d"), %%mm1 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "movd 6(%0, %%"REG_d"), %%mm2 \n\t" - "movd 9(%0, %%"REG_d"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "pmaddwd %%mm6, %%mm0 \n\t" - "pmaddwd %%mm6, %%mm1 \n\t" - "pmaddwd %%mm6, %%mm2 \n\t" - "pmaddwd %%mm6, %%mm3 \n\t" - "psrad $8, %%mm0 \n\t" - "psrad $8, %%mm1 \n\t" - "psrad $8, %%mm2 \n\t" - "psrad $8, %%mm3 \n\t" - "packssdw %%mm1, %%mm0 \n\t" - "packssdw %%mm3, %%mm2 \n\t" - "pmaddwd %%mm5, %%mm0 \n\t" - "pmaddwd %%mm5, %%mm2 \n\t" - "packssdw %%mm2, %%mm0 \n\t" - "psraw $7, %%mm0 \n\t" - - "movd 12(%0, %%"REG_d"), %%mm4 \n\t" - "movd 15(%0, %%"REG_d"), %%mm1 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "movd 18(%0, %%"REG_d"), %%mm2 \n\t" - "movd 21(%0, %%"REG_d"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "pmaddwd %%mm6, %%mm4 \n\t" - "pmaddwd %%mm6, %%mm1 \n\t" - "pmaddwd %%mm6, %%mm2 \n\t" - "pmaddwd %%mm6, %%mm3 \n\t" - "psrad $8, %%mm4 \n\t" - "psrad $8, %%mm1 \n\t" - "psrad $8, %%mm2 \n\t" - "psrad $8, %%mm3 \n\t" - "packssdw %%mm1, %%mm4 \n\t" - "packssdw %%mm3, %%mm2 \n\t" - "pmaddwd %%mm5, %%mm4 \n\t" - "pmaddwd %%mm5, %%mm2 \n\t" - "add $24, %%"REG_d" \n\t" - "packssdw %%mm2, %%mm4 \n\t" - "psraw $7, %%mm4 \n\t" - - "packuswb %%mm4, %%mm0 \n\t" - "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t" - - MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t" - "add $8, %%"REG_a" \n\t" - " js 1b \n\t" - : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv) - : "%"REG_a, "%"REG_d - ); - ydst += lumStride; - src += srcStride; - } - src -= srcStride*2; - __asm__ volatile( - "mov %4, %%"REG_a" \n\t" - "movq "MANGLE(ff_w1111)", %%mm5 \n\t" - "movq "BGR2U_IDX"(%5), %%mm6 \n\t" - "pxor %%mm7, %%mm7 \n\t" - "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" - "add %%"REG_d", %%"REG_d" \n\t" - ".p2align 4 \n\t" - "1: \n\t" - PREFETCH" 64(%0, %%"REG_d") \n\t" - PREFETCH" 64(%1, %%"REG_d") \n\t" -#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW - "movq (%0, %%"REG_d"), %%mm0 \n\t" - "movq (%1, %%"REG_d"), %%mm1 \n\t" - "movq 6(%0, %%"REG_d"), %%mm2 \n\t" - "movq 6(%1, %%"REG_d"), %%mm3 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "psrlq $24, %%mm0 \n\t" - "psrlq $24, %%mm2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" -#else - "movd (%0, %%"REG_d"), %%mm0 \n\t" - "movd (%1, %%"REG_d"), %%mm1 \n\t" - "movd 3(%0, %%"REG_d"), %%mm2 \n\t" - "movd 3(%1, %%"REG_d"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "paddw 
%%mm1, %%mm0 \n\t" - "paddw %%mm3, %%mm2 \n\t" - "paddw %%mm2, %%mm0 \n\t" - "movd 6(%0, %%"REG_d"), %%mm4 \n\t" - "movd 6(%1, %%"REG_d"), %%mm1 \n\t" - "movd 9(%0, %%"REG_d"), %%mm2 \n\t" - "movd 9(%1, %%"REG_d"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "paddw %%mm1, %%mm4 \n\t" - "paddw %%mm3, %%mm2 \n\t" - "paddw %%mm4, %%mm2 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm2 \n\t" -#endif - "movq "BGR2V_IDX"(%5), %%mm1 \n\t" - "movq "BGR2V_IDX"(%5), %%mm3 \n\t" - - "pmaddwd %%mm0, %%mm1 \n\t" - "pmaddwd %%mm2, %%mm3 \n\t" - "pmaddwd %%mm6, %%mm0 \n\t" - "pmaddwd %%mm6, %%mm2 \n\t" - "psrad $8, %%mm0 \n\t" - "psrad $8, %%mm1 \n\t" - "psrad $8, %%mm2 \n\t" - "psrad $8, %%mm3 \n\t" - "packssdw %%mm2, %%mm0 \n\t" - "packssdw %%mm3, %%mm1 \n\t" - "pmaddwd %%mm5, %%mm0 \n\t" - "pmaddwd %%mm5, %%mm1 \n\t" - "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 - "psraw $7, %%mm0 \n\t" - -#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW - "movq 12(%0, %%"REG_d"), %%mm4 \n\t" - "movq 12(%1, %%"REG_d"), %%mm1 \n\t" - "movq 18(%0, %%"REG_d"), %%mm2 \n\t" - "movq 18(%1, %%"REG_d"), %%mm3 \n\t" - PAVGB" %%mm1, %%mm4 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "movq %%mm4, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "psrlq $24, %%mm4 \n\t" - "psrlq $24, %%mm2 \n\t" - PAVGB" %%mm1, %%mm4 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" -#else - "movd 12(%0, %%"REG_d"), %%mm4 \n\t" - "movd 12(%1, %%"REG_d"), %%mm1 \n\t" - "movd 15(%0, %%"REG_d"), %%mm2 \n\t" - "movd 15(%1, %%"REG_d"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "paddw %%mm1, %%mm4 \n\t" - "paddw %%mm3, %%mm2 \n\t" - "paddw %%mm2, %%mm4 \n\t" - "movd 18(%0, %%"REG_d"), %%mm5 \n\t" - "movd 18(%1, %%"REG_d"), %%mm1 \n\t" - "movd 21(%0, %%"REG_d"), %%mm2 \n\t" - "movd 21(%1, %%"REG_d"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "paddw %%mm1, %%mm5 \n\t" - "paddw %%mm3, %%mm2 \n\t" - "paddw %%mm5, %%mm2 \n\t" - "movq "MANGLE(ff_w1111)", %%mm5 \n\t" - "psrlw $2, %%mm4 \n\t" - "psrlw $2, %%mm2 \n\t" -#endif - "movq "BGR2V_IDX"(%5), %%mm1 \n\t" - "movq "BGR2V_IDX"(%5), %%mm3 \n\t" - - "pmaddwd %%mm4, %%mm1 \n\t" - "pmaddwd %%mm2, %%mm3 \n\t" - "pmaddwd %%mm6, %%mm4 \n\t" - "pmaddwd %%mm6, %%mm2 \n\t" - "psrad $8, %%mm4 \n\t" - "psrad $8, %%mm1 \n\t" - "psrad $8, %%mm2 \n\t" - "psrad $8, %%mm3 \n\t" - "packssdw %%mm2, %%mm4 \n\t" - "packssdw %%mm3, %%mm1 \n\t" - "pmaddwd %%mm5, %%mm4 \n\t" - "pmaddwd %%mm5, %%mm1 \n\t" - "add $24, %%"REG_d" \n\t" - "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 - "psraw $7, %%mm4 \n\t" - - "movq %%mm0, %%mm1 \n\t" - "punpckldq %%mm4, %%mm0 \n\t" - "punpckhdq %%mm4, %%mm1 \n\t" - "packsswb %%mm1, %%mm0 \n\t" - "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t" - "movd %%mm0, (%2, %%"REG_a") \n\t" - "punpckhdq %%mm0, %%mm0 \n\t" - "movd %%mm0, (%3, %%"REG_a") \n\t" - "add $4, %%"REG_a" \n\t" - " js 1b \n\t" - : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv) - : "%"REG_a, "%"REG_d - ); - - udst += chromStride; - vdst += chromStride; - src += srcStride*2; - } - - __asm__ volatile(EMMS" \n\t" - SFENCE" \n\t" - :::"memory"); - - ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, 
chromStride, srcStride, rgb2yuv); -} -#endif /* HAVE_7REGS */ -#endif /* !COMPILE_TEMPLATE_SSE2 */ - -#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX -static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest, - int width, int height, int src1Stride, - int src2Stride, int dstStride) -{ - int h; - - for (h=0; h < height; h++) { - int w; - -#if COMPILE_TEMPLATE_SSE2 - __asm__( - "xor %%"REG_a", %%"REG_a" \n\t" - "1: \n\t" - PREFETCH" 64(%1, %%"REG_a") \n\t" - PREFETCH" 64(%2, %%"REG_a") \n\t" - "movdqa (%1, %%"REG_a"), %%xmm0 \n\t" - "movdqa (%1, %%"REG_a"), %%xmm1 \n\t" - "movdqa (%2, %%"REG_a"), %%xmm2 \n\t" - "punpcklbw %%xmm2, %%xmm0 \n\t" - "punpckhbw %%xmm2, %%xmm1 \n\t" - "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t" - "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t" - "add $16, %%"REG_a" \n\t" - "cmp %3, %%"REG_a" \n\t" - " jb 1b \n\t" - ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) - : "memory", "%"REG_a"" - ); -#else - __asm__( - "xor %%"REG_a", %%"REG_a" \n\t" - "1: \n\t" - PREFETCH" 64(%1, %%"REG_a") \n\t" - PREFETCH" 64(%2, %%"REG_a") \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq 8(%1, %%"REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq (%2, %%"REG_a"), %%mm4 \n\t" - "movq 8(%2, %%"REG_a"), %%mm5 \n\t" - "punpcklbw %%mm4, %%mm0 \n\t" - "punpckhbw %%mm4, %%mm1 \n\t" - "punpcklbw %%mm5, %%mm2 \n\t" - "punpckhbw %%mm5, %%mm3 \n\t" - MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t" - MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t" - MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t" - MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t" - "add $16, %%"REG_a" \n\t" - "cmp %3, %%"REG_a" \n\t" - " jb 1b \n\t" - ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) - : "memory", "%"REG_a - ); -#endif - for (w= (width&(~15)); w < width; w++) { - dest[2*w+0] = src1[w]; - dest[2*w+1] = src2[w]; - } - dest += dstStride; - src1 += src1Stride; - src2 += src2Stride; - } - __asm__( - EMMS" \n\t" - SFENCE" \n\t" - ::: "memory" - ); -} -#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX*/ - -#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL -#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM -void RENAME(ff_nv12ToUV)(uint8_t *dstU, uint8_t *dstV, - const uint8_t *unused0, - const uint8_t *src1, - const uint8_t *src2, - int w, uint32_t *unused); -static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, - int width, int height, int srcStride, - int dst1Stride, int dst2Stride) -{ - int h; - - for (h=0; h < height; h++) { - RENAME(ff_nv12ToUV)(dst1, dst2, NULL, src, NULL, width, NULL); - src += srcStride; - dst1 += dst1Stride; - dst2 += dst2Stride; - } - __asm__( - EMMS" \n\t" - SFENCE" \n\t" - ::: "memory" - ); -} -#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ -#endif /* !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL */ - -#if !COMPILE_TEMPLATE_SSE2 -#if !COMPILE_TEMPLATE_AMD3DNOW -static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, - uint8_t *dst1, uint8_t *dst2, - int width, int height, - int srcStride1, int srcStride2, - int dstStride1, int dstStride2) -{ - x86_reg x, y; - int w,h; - w=width/2; h=height/2; - __asm__ volatile( - PREFETCH" %0 \n\t" - PREFETCH" %1 \n\t" - ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory"); - for (y=0;y<h;y++) { - const uint8_t* s1=src1+srcStride1*(y>>1); - uint8_t* d=dst1+dstStride1*y; - x=0; - for (;x<w-31;x+=32) { - __asm__ volatile( - PREFETCH" 32(%1,%2) 
\n\t" - "movq (%1,%2), %%mm0 \n\t" - "movq 8(%1,%2), %%mm2 \n\t" - "movq 16(%1,%2), %%mm4 \n\t" - "movq 24(%1,%2), %%mm6 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "movq %%mm6, %%mm7 \n\t" - "punpcklbw %%mm0, %%mm0 \n\t" - "punpckhbw %%mm1, %%mm1 \n\t" - "punpcklbw %%mm2, %%mm2 \n\t" - "punpckhbw %%mm3, %%mm3 \n\t" - "punpcklbw %%mm4, %%mm4 \n\t" - "punpckhbw %%mm5, %%mm5 \n\t" - "punpcklbw %%mm6, %%mm6 \n\t" - "punpckhbw %%mm7, %%mm7 \n\t" - MOVNTQ" %%mm0, (%0,%2,2) \n\t" - MOVNTQ" %%mm1, 8(%0,%2,2) \n\t" - MOVNTQ" %%mm2, 16(%0,%2,2) \n\t" - MOVNTQ" %%mm3, 24(%0,%2,2) \n\t" - MOVNTQ" %%mm4, 32(%0,%2,2) \n\t" - MOVNTQ" %%mm5, 40(%0,%2,2) \n\t" - MOVNTQ" %%mm6, 48(%0,%2,2) \n\t" - MOVNTQ" %%mm7, 56(%0,%2,2)" - :: "r"(d), "r"(s1), "r"(x) - :"memory"); - } - for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x]; - } - for (y=0;y<h;y++) { - const uint8_t* s2=src2+srcStride2*(y>>1); - uint8_t* d=dst2+dstStride2*y; - x=0; - for (;x<w-31;x+=32) { - __asm__ volatile( - PREFETCH" 32(%1,%2) \n\t" - "movq (%1,%2), %%mm0 \n\t" - "movq 8(%1,%2), %%mm2 \n\t" - "movq 16(%1,%2), %%mm4 \n\t" - "movq 24(%1,%2), %%mm6 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "movq %%mm6, %%mm7 \n\t" - "punpcklbw %%mm0, %%mm0 \n\t" - "punpckhbw %%mm1, %%mm1 \n\t" - "punpcklbw %%mm2, %%mm2 \n\t" - "punpckhbw %%mm3, %%mm3 \n\t" - "punpcklbw %%mm4, %%mm4 \n\t" - "punpckhbw %%mm5, %%mm5 \n\t" - "punpcklbw %%mm6, %%mm6 \n\t" - "punpckhbw %%mm7, %%mm7 \n\t" - MOVNTQ" %%mm0, (%0,%2,2) \n\t" - MOVNTQ" %%mm1, 8(%0,%2,2) \n\t" - MOVNTQ" %%mm2, 16(%0,%2,2) \n\t" - MOVNTQ" %%mm3, 24(%0,%2,2) \n\t" - MOVNTQ" %%mm4, 32(%0,%2,2) \n\t" - MOVNTQ" %%mm5, 40(%0,%2,2) \n\t" - MOVNTQ" %%mm6, 48(%0,%2,2) \n\t" - MOVNTQ" %%mm7, 56(%0,%2,2)" - :: "r"(d), "r"(s2), "r"(x) - :"memory"); - } - for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x]; - } - __asm__( - EMMS" \n\t" - SFENCE" \n\t" - ::: "memory" - ); -} - -static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, - uint8_t *dst, - int width, int height, - int srcStride1, int srcStride2, - int srcStride3, int dstStride) -{ - x86_reg x; - int y,w,h; - w=width/2; h=height; - for (y=0;y<h;y++) { - const uint8_t* yp=src1+srcStride1*y; - const uint8_t* up=src2+srcStride2*(y>>2); - const uint8_t* vp=src3+srcStride3*(y>>2); - uint8_t* d=dst+dstStride*y; - x=0; - for (;x<w-7;x+=8) { - __asm__ volatile( - PREFETCH" 32(%1, %0) \n\t" - PREFETCH" 32(%2, %0) \n\t" - PREFETCH" 32(%3, %0) \n\t" - "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ - "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */ - "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */ - "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ - "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */ - "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */ - "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */ - "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */ - "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */ - "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */ - - "movq %%mm1, %%mm6 \n\t" - "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/ - "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/ - "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/ - MOVNTQ" %%mm0, (%4, %0, 8) \n\t" - MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t" - - "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/ - "movq 8(%1, %0, 4), %%mm0 \n\t" - "movq %%mm0, %%mm3 \n\t" - "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/ - "punpckhbw %%mm6, %%mm3 \n\t" /* Y 
U3 Y V3 Y U3 Y V3*/ - MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t" - MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t" - - "movq %%mm4, %%mm6 \n\t" - "movq 16(%1, %0, 4), %%mm0 \n\t" - "movq %%mm0, %%mm3 \n\t" - "punpcklbw %%mm5, %%mm4 \n\t" - "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/ - "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/ - MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t" - MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t" - - "punpckhbw %%mm5, %%mm6 \n\t" - "movq 24(%1, %0, 4), %%mm0 \n\t" - "movq %%mm0, %%mm3 \n\t" - "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/ - "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/ - MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t" - MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t" - - : "+r" (x) - : "r"(yp), "r" (up), "r"(vp), "r"(d) - :"memory"); - } - for (; x<w; x++) { - const int x2 = x<<2; - d[8*x+0] = yp[x2]; - d[8*x+1] = up[x]; - d[8*x+2] = yp[x2+1]; - d[8*x+3] = vp[x]; - d[8*x+4] = yp[x2+2]; - d[8*x+5] = up[x]; - d[8*x+6] = yp[x2+3]; - d[8*x+7] = vp[x]; - } - } - __asm__( - EMMS" \n\t" - SFENCE" \n\t" - ::: "memory" - ); -} -#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ - -static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count) -{ - dst += count; - src += 2*count; - count= - count; - - if(count <= -16) { - count += 15; - __asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $8, %%mm7 \n\t" - "1: \n\t" - "movq -30(%1, %0, 2), %%mm0 \n\t" - "movq -22(%1, %0, 2), %%mm1 \n\t" - "movq -14(%1, %0, 2), %%mm2 \n\t" - "movq -6(%1, %0, 2), %%mm3 \n\t" - "pand %%mm7, %%mm0 \n\t" - "pand %%mm7, %%mm1 \n\t" - "pand %%mm7, %%mm2 \n\t" - "pand %%mm7, %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - MOVNTQ" %%mm0,-15(%2, %0) \n\t" - MOVNTQ" %%mm2,- 7(%2, %0) \n\t" - "add $16, %0 \n\t" - " js 1b \n\t" - : "+r"(count) - : "r"(src), "r"(dst) - ); - count -= 15; - } - while(count<0) { - dst[count]= src[2*count]; - count++; - } -} - -#if !COMPILE_TEMPLATE_AMD3DNOW -static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count) -{ - dst0+= count; - dst1+= count; - src += 4*count; - count= - count; - if(count <= -8) { - count += 7; - __asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $8, %%mm7 \n\t" - "1: \n\t" - "movq -28(%1, %0, 4), %%mm0 \n\t" - "movq -20(%1, %0, 4), %%mm1 \n\t" - "movq -12(%1, %0, 4), %%mm2 \n\t" - "movq -4(%1, %0, 4), %%mm3 \n\t" - "pand %%mm7, %%mm0 \n\t" - "pand %%mm7, %%mm1 \n\t" - "pand %%mm7, %%mm2 \n\t" - "pand %%mm7, %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "psrlw $8, %%mm0 \n\t" - "psrlw $8, %%mm2 \n\t" - "pand %%mm7, %%mm1 \n\t" - "pand %%mm7, %%mm3 \n\t" - "packuswb %%mm2, %%mm0 \n\t" - "packuswb %%mm3, %%mm1 \n\t" - MOVNTQ" %%mm0,- 7(%3, %0) \n\t" - MOVNTQ" %%mm1,- 7(%2, %0) \n\t" - "add $8, %0 \n\t" - " js 1b \n\t" - : "+r"(count) - : "r"(src), "r"(dst0), "r"(dst1) - ); - count -= 7; - } - while(count<0) { - dst0[count]= src[4*count+0]; - dst1[count]= src[4*count+2]; - count++; - } -} -#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ - -static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count) -{ - dst0 += count; - dst1 += count; - src0 += 4*count; - src1 += 4*count; - count= - count; -#ifdef PAVGB - if(count <= -8) { - count += 7; - __asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $8, %%mm7 \n\t" - "1: \n\t" - "movq -28(%1, %0, 4), %%mm0 \n\t" - "movq -20(%1, %0, 4), %%mm1 \n\t" - "movq -12(%1, %0, 4), %%mm2 \n\t" - "movq -4(%1, 
%0, 4), %%mm3 \n\t" - PAVGB" -28(%2, %0, 4), %%mm0 \n\t" - PAVGB" -20(%2, %0, 4), %%mm1 \n\t" - PAVGB" -12(%2, %0, 4), %%mm2 \n\t" - PAVGB" - 4(%2, %0, 4), %%mm3 \n\t" - "pand %%mm7, %%mm0 \n\t" - "pand %%mm7, %%mm1 \n\t" - "pand %%mm7, %%mm2 \n\t" - "pand %%mm7, %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "psrlw $8, %%mm0 \n\t" - "psrlw $8, %%mm2 \n\t" - "pand %%mm7, %%mm1 \n\t" - "pand %%mm7, %%mm3 \n\t" - "packuswb %%mm2, %%mm0 \n\t" - "packuswb %%mm3, %%mm1 \n\t" - MOVNTQ" %%mm0,- 7(%4, %0) \n\t" - MOVNTQ" %%mm1,- 7(%3, %0) \n\t" - "add $8, %0 \n\t" - " js 1b \n\t" - : "+r"(count) - : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1) - ); - count -= 7; - } -#endif - while(count<0) { - dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1; - dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1; - count++; - } -} - -#if !COMPILE_TEMPLATE_AMD3DNOW -static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count) -{ - dst0+= count; - dst1+= count; - src += 4*count; - count= - count; - if(count <= -8) { - count += 7; - __asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $8, %%mm7 \n\t" - "1: \n\t" - "movq -28(%1, %0, 4), %%mm0 \n\t" - "movq -20(%1, %0, 4), %%mm1 \n\t" - "movq -12(%1, %0, 4), %%mm2 \n\t" - "movq -4(%1, %0, 4), %%mm3 \n\t" - "psrlw $8, %%mm0 \n\t" - "psrlw $8, %%mm1 \n\t" - "psrlw $8, %%mm2 \n\t" - "psrlw $8, %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "psrlw $8, %%mm0 \n\t" - "psrlw $8, %%mm2 \n\t" - "pand %%mm7, %%mm1 \n\t" - "pand %%mm7, %%mm3 \n\t" - "packuswb %%mm2, %%mm0 \n\t" - "packuswb %%mm3, %%mm1 \n\t" - MOVNTQ" %%mm0,- 7(%3, %0) \n\t" - MOVNTQ" %%mm1,- 7(%2, %0) \n\t" - "add $8, %0 \n\t" - " js 1b \n\t" - : "+r"(count) - : "r"(src), "r"(dst0), "r"(dst1) - ); - count -= 7; - } - src++; - while(count<0) { - dst0[count]= src[4*count+0]; - dst1[count]= src[4*count+2]; - count++; - } -} -#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ - -static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count) -{ - dst0 += count; - dst1 += count; - src0 += 4*count; - src1 += 4*count; - count= - count; -#ifdef PAVGB - if(count <= -8) { - count += 7; - __asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $8, %%mm7 \n\t" - "1: \n\t" - "movq -28(%1, %0, 4), %%mm0 \n\t" - "movq -20(%1, %0, 4), %%mm1 \n\t" - "movq -12(%1, %0, 4), %%mm2 \n\t" - "movq -4(%1, %0, 4), %%mm3 \n\t" - PAVGB" -28(%2, %0, 4), %%mm0 \n\t" - PAVGB" -20(%2, %0, 4), %%mm1 \n\t" - PAVGB" -12(%2, %0, 4), %%mm2 \n\t" - PAVGB" - 4(%2, %0, 4), %%mm3 \n\t" - "psrlw $8, %%mm0 \n\t" - "psrlw $8, %%mm1 \n\t" - "psrlw $8, %%mm2 \n\t" - "psrlw $8, %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "psrlw $8, %%mm0 \n\t" - "psrlw $8, %%mm2 \n\t" - "pand %%mm7, %%mm1 \n\t" - "pand %%mm7, %%mm3 \n\t" - "packuswb %%mm2, %%mm0 \n\t" - "packuswb %%mm3, %%mm1 \n\t" - MOVNTQ" %%mm0,- 7(%4, %0) \n\t" - MOVNTQ" %%mm1,- 7(%3, %0) \n\t" - "add $8, %0 \n\t" - " js 1b \n\t" - : "+r"(count) - : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1) - ); - count -= 7; - } -#endif - src0++; - src1++; - while(count<0) { - dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1; - dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1; - count++; - } -} - -static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const 
uint8_t *src, - int width, int height, - int lumStride, int chromStride, int srcStride) -{ - int y; - const int chromWidth = FF_CEIL_RSHIFT(width, 1); - - for (y=0; y<height; y++) { - RENAME(extract_even)(src, ydst, width); - if(y&1) { - RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth); - udst+= chromStride; - vdst+= chromStride; - } - - src += srcStride; - ydst+= lumStride; - } - __asm__( - EMMS" \n\t" - SFENCE" \n\t" - ::: "memory" - ); -} - -#if !COMPILE_TEMPLATE_AMD3DNOW -static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, - int width, int height, - int lumStride, int chromStride, int srcStride) -{ - int y; - const int chromWidth = FF_CEIL_RSHIFT(width, 1); - - for (y=0; y<height; y++) { - RENAME(extract_even)(src, ydst, width); - RENAME(extract_odd2)(src, udst, vdst, chromWidth); - - src += srcStride; - ydst+= lumStride; - udst+= chromStride; - vdst+= chromStride; - } - __asm__( - EMMS" \n\t" - SFENCE" \n\t" - ::: "memory" - ); -} -#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ - -static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, - int width, int height, - int lumStride, int chromStride, int srcStride) -{ - int y; - const int chromWidth = FF_CEIL_RSHIFT(width, 1); - - for (y=0; y<height; y++) { - RENAME(extract_even)(src+1, ydst, width); - if(y&1) { - RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth); - udst+= chromStride; - vdst+= chromStride; - } - - src += srcStride; - ydst+= lumStride; - } - __asm__( - EMMS" \n\t" - SFENCE" \n\t" - ::: "memory" - ); -} - -#if !COMPILE_TEMPLATE_AMD3DNOW -static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, - int width, int height, - int lumStride, int chromStride, int srcStride) -{ - int y; - const int chromWidth = FF_CEIL_RSHIFT(width, 1); - - for (y=0; y<height; y++) { - RENAME(extract_even)(src+1, ydst, width); - RENAME(extract_even2)(src, udst, vdst, chromWidth); - - src += srcStride; - ydst+= lumStride; - udst+= chromStride; - vdst+= chromStride; - } - __asm__( - EMMS" \n\t" - SFENCE" \n\t" - ::: "memory" - ); -} -#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ -#endif /* !COMPILE_TEMPLATE_SSE2 */ - -static av_cold void RENAME(rgb2rgb_init)(void) -{ -#if !COMPILE_TEMPLATE_SSE2 -#if !COMPILE_TEMPLATE_AMD3DNOW - rgb15to16 = RENAME(rgb15to16); - rgb15tobgr24 = RENAME(rgb15tobgr24); - rgb15to32 = RENAME(rgb15to32); - rgb16tobgr24 = RENAME(rgb16tobgr24); - rgb16to32 = RENAME(rgb16to32); - rgb16to15 = RENAME(rgb16to15); - rgb24tobgr16 = RENAME(rgb24tobgr16); - rgb24tobgr15 = RENAME(rgb24tobgr15); - rgb24tobgr32 = RENAME(rgb24tobgr32); - rgb32to16 = RENAME(rgb32to16); - rgb32to15 = RENAME(rgb32to15); - rgb32tobgr24 = RENAME(rgb32tobgr24); - rgb24to15 = RENAME(rgb24to15); - rgb24to16 = RENAME(rgb24to16); - rgb24tobgr24 = RENAME(rgb24tobgr24); - shuffle_bytes_2103 = RENAME(shuffle_bytes_2103); - rgb32tobgr16 = RENAME(rgb32tobgr16); - rgb32tobgr15 = RENAME(rgb32tobgr15); - yv12toyuy2 = RENAME(yv12toyuy2); - yv12touyvy = RENAME(yv12touyvy); - yuv422ptoyuy2 = RENAME(yuv422ptoyuy2); - yuv422ptouyvy = RENAME(yuv422ptouyvy); - yuy2toyv12 = RENAME(yuy2toyv12); - vu9_to_vu12 = RENAME(vu9_to_vu12); - yvu9_to_yuy2 = RENAME(yvu9_to_yuy2); - uyvytoyuv422 = RENAME(uyvytoyuv422); - yuyvtoyuv422 = RENAME(yuyvtoyuv422); -#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ - -#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW - planar2x = RENAME(planar2x); -#endif /* COMPILE_TEMPLATE_MMXEXT || 
COMPILE_TEMPLATE_AMD3DNOW */ -#if HAVE_7REGS - ff_rgb24toyv12 = RENAME(rgb24toyv12); -#endif /* HAVE_7REGS */ - - yuyvtoyuv420 = RENAME(yuyvtoyuv420); - uyvytoyuv420 = RENAME(uyvytoyuv420); -#endif /* !COMPILE_TEMPLATE_SSE2 */ - -#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX - interleaveBytes = RENAME(interleaveBytes); -#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX*/ -#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL -#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM - deinterleaveBytes = RENAME(deinterleaveBytes); -#endif -#endif -} diff --git a/ffmpeg/libswscale/x86/scale.asm b/ffmpeg/libswscale/x86/scale.asm deleted file mode 100644 index 940f357..0000000 --- a/ffmpeg/libswscale/x86/scale.asm +++ /dev/null @@ -1,431 +0,0 @@ -;****************************************************************************** -;* x86-optimized horizontal line scaling functions -;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -max_19bit_int: times 4 dd 0x7ffff -max_19bit_flt: times 4 dd 524287.0 -minshort: times 8 dw 0x8000 -unicoeff: times 4 dd 0x20000000 - -SECTION .text - -;----------------------------------------------------------------------------- -; horizontal line scaling -; -; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt> -; (SwsContext *c, int{16,32}_t *dst, -; int dstW, const uint{8,16}_t *src, -; const int16_t *filter, -; const int32_t *filterPos, int filterSize); -; -; Scale one horizontal line. Input is either 8-bits width or 16-bits width -; ($source_width can be either 8, 9, 10 or 16, difference is whether we have to -; downscale before multiplying). Filter is 14-bits. Output is either 15bits -; (in int16_t) or 19bits (in int32_t), as given in $intermediate_nbits. Each -; output pixel is generated from $filterSize input pixels, the position of -; the first pixel is given in filterPos[nOutputPixel]. 
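A plain-C sketch of what this hscale family computes, for orientation, using the 8-bit-input / 15-bit-output case; the function name hscale8to15_ref is invented for the illustration and is not part of the deleted file, and the >>7 shift simply follows from the 14-bit coefficients and 15-bit output described above (14 + 8 - 15 = 7):

#include <stdint.h>

/* Sketch only: each output pixel is the dot product of filterSize source
 * pixels, starting at filterPos[i], with that pixel's 14-bit coefficients. */
static void hscale8to15_ref(int16_t *dst, int dstW, const uint8_t *src,
                            const int16_t *filter, const int32_t *filterPos,
                            int filterSize)
{
    for (int i = 0; i < dstW; i++) {
        int val = 0;
        for (int j = 0; j < filterSize; j++)
            val += (int)src[filterPos[i] + j] * filter[filterSize * i + j];
        val >>= 7;                                    /* 14 + 8 - 15 = 7 */
        dst[i] = val > 32767 ? 32767 : (int16_t)val;  /* the SIMD store relies on packssdw saturation here */
    }
}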
-;----------------------------------------------------------------------------- - -; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm -%macro SCALE_FUNC 6 -%ifnidn %3, X -cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1 -%else -cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize -%endif -%if ARCH_X86_64 - movsxd wq, wd -%define mov32 movsxd -%else ; x86-32 -%define mov32 mov -%endif ; x86-64 -%if %2 == 19 -%if mmsize == 8 ; mmx - mova m2, [max_19bit_int] -%elif cpuflag(sse4) - mova m2, [max_19bit_int] -%else ; ssse3/sse2 - mova m2, [max_19bit_flt] -%endif ; mmx/sse2/ssse3/sse4 -%endif ; %2 == 19 -%if %1 == 16 - mova m6, [minshort] - mova m7, [unicoeff] -%elif %1 == 8 - pxor m3, m3 -%endif ; %1 == 8/16 - -%if %1 == 8 -%define movlh movd -%define movbh movh -%define srcmul 1 -%else ; %1 == 9-16 -%define movlh movq -%define movbh movu -%define srcmul 2 -%endif ; %1 == 8/9-16 - -%ifnidn %3, X - - ; setup loop -%if %3 == 8 - shl wq, 1 ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter -%define wshr 1 -%else ; %3 == 4 -%define wshr 0 -%endif ; %3 == 8 - lea filterq, [filterq+wq*8] -%if %2 == 15 - lea dstq, [dstq+wq*(2>>wshr)] -%else ; %2 == 19 - lea dstq, [dstq+wq*(4>>wshr)] -%endif ; %2 == 15/19 - lea fltposq, [fltposq+wq*(4>>wshr)] - neg wq - -.loop: -%if %3 == 4 ; filterSize == 4 scaling - ; load 2x4 or 4x4 source pixels into m0/m1 - mov32 pos0q, dword [fltposq+wq*4+ 0] ; filterPos[0] - mov32 pos1q, dword [fltposq+wq*4+ 4] ; filterPos[1] - movlh m0, [srcq+pos0q*srcmul] ; src[filterPos[0] + {0,1,2,3}] -%if mmsize == 8 - movlh m1, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}] -%else ; mmsize == 16 -%if %1 > 8 - movhps m0, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}] -%else ; %1 == 8 - movd m4, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}] -%endif - mov32 pos0q, dword [fltposq+wq*4+ 8] ; filterPos[2] - mov32 pos1q, dword [fltposq+wq*4+12] ; filterPos[3] - movlh m1, [srcq+pos0q*srcmul] ; src[filterPos[2] + {0,1,2,3}] -%if %1 > 8 - movhps m1, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}] -%else ; %1 == 8 - movd m5, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}] - punpckldq m0, m4 - punpckldq m1, m5 -%endif ; %1 == 8 -%endif ; mmsize == 8/16 -%if %1 == 8 - punpcklbw m0, m3 ; byte -> word - punpcklbw m1, m3 ; byte -> word -%endif ; %1 == 8 - - ; multiply with filter coefficients -%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll - ; add back 0x8000 * sum(coeffs) after the horizontal add - psubw m0, m6 - psubw m1, m6 -%endif ; %1 == 16 - pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}] - pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}] - - ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix) -%if mmsize == 8 ; mmx - movq m4, m0 - punpckldq m0, m1 - punpckhdq m4, m1 - paddd m0, m4 -%elif notcpuflag(ssse3) ; sse2 - mova m4, m0 - shufps m0, m1, 10001000b - shufps m4, m1, 11011101b - paddd m0, m4 -%else ; ssse3/sse4 - phaddd m0, m1 ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}], - ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}], - ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}], - ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}] -%endif ; mmx/sse2/ssse3/sse4 -%else ; %3 == 8, i.e. 
filterSize == 8 scaling - ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5 - mov32 pos0q, dword [fltposq+wq*2+0] ; filterPos[0] - mov32 pos1q, dword [fltposq+wq*2+4] ; filterPos[1] - movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}] -%if mmsize == 8 - movbh m1, [srcq+(pos0q+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}] - movbh m4, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3}] - movbh m5, [srcq+(pos1q+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}] -%else ; mmsize == 16 - movbh m1, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}] - mov32 pos0q, dword [fltposq+wq*2+8] ; filterPos[2] - mov32 pos1q, dword [fltposq+wq*2+12] ; filterPos[3] - movbh m4, [srcq+ pos0q *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}] - movbh m5, [srcq+ pos1q *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}] -%endif ; mmsize == 8/16 -%if %1 == 8 - punpcklbw m0, m3 ; byte -> word - punpcklbw m1, m3 ; byte -> word - punpcklbw m4, m3 ; byte -> word - punpcklbw m5, m3 ; byte -> word -%endif ; %1 == 8 - - ; multiply -%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll - ; add back 0x8000 * sum(coeffs) after the horizontal add - psubw m0, m6 - psubw m1, m6 - psubw m4, m6 - psubw m5, m6 -%endif ; %1 == 16 - pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}] - pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}] - pmaddwd m4, [filterq+wq*8+mmsize*2] ; *= filter[{16,17,..,22,23}] - pmaddwd m5, [filterq+wq*8+mmsize*3] ; *= filter[{24,25,..,30,31}] - - ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix) -%if mmsize == 8 - paddd m0, m1 - paddd m4, m5 - movq m1, m0 - punpckldq m0, m4 - punpckhdq m1, m4 - paddd m0, m1 -%elif notcpuflag(ssse3) ; sse2 -%if %1 == 8 -%define mex m6 -%else -%define mex m3 -%endif - ; emulate horizontal add as transpose + vertical add - mova mex, m0 - punpckldq m0, m1 - punpckhdq mex, m1 - paddd m0, mex - mova m1, m4 - punpckldq m4, m5 - punpckhdq m1, m5 - paddd m4, m1 - mova m1, m0 - punpcklqdq m0, m4 - punpckhqdq m1, m4 - paddd m0, m1 -%else ; ssse3/sse4 - ; FIXME if we rearrange the filter in pairs of 4, we can - ; load pixels likewise and use 2 x paddd + phaddd instead - ; of 3 x phaddd here, faster on older cpus - phaddd m0, m1 - phaddd m4, m5 - phaddd m0, m4 ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}], - ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}], - ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}], - ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}] -%endif ; mmx/sse2/ssse3/sse4 -%endif ; %3 == 4/8 - -%else ; %3 == X, i.e. any filterSize scaling - -%ifidn %4, X4 -%define dlt 4 -%else ; %4 == X || %4 == X8 -%define dlt 0 -%endif ; %4 ==/!= X4 -%if ARCH_X86_64 -%define srcq r8 -%define pos1q r7 -%define srcendq r9 - movsxd fltsizeq, fltsized ; filterSize - lea srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4] -%else ; x86-32 -%define srcq srcmemq -%define pos1q dstq -%define srcendq r6m - lea pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4] - mov srcendq, pos0q -%endif ; x86-32/64 - lea fltposq, [fltposq+wq*4] -%if %2 == 15 - lea dstq, [dstq+wq*2] -%else ; %2 == 19 - lea dstq, [dstq+wq*4] -%endif ; %2 == 15/19 - movifnidn dstmp, dstq - neg wq - -.loop: - mov32 pos0q, dword [fltposq+wq*4+0] ; filterPos[0] - mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1] - ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)? 
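; The run-time-filterSize inner loop below accumulates mmsize/2 source pixels
; per iteration into m4/m5 (one accumulator per output pixel, two output
; pixels per outer iteration), then adds horizontally, shifts the sums by
; 14 + source_bits - intermediate_bits and clips before storing, matching the
; fixed 4- and 8-tap paths above.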
- pxor m4, m4 - pxor m5, m5 - mov srcq, srcmemmp - -.innerloop: - ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5 - movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}] - movbh m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}] -%if %1 == 8 - punpcklbw m0, m3 - punpcklbw m1, m3 -%endif ; %1 == 8 - - ; multiply -%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll - ; add back 0x8000 * sum(coeffs) after the horizontal add - psubw m0, m6 - psubw m1, m6 -%endif ; %1 == 16 - pmaddwd m0, [filterq] ; filter[{0,1,2,3(,4,5,6,7)}] - pmaddwd m1, [filterq+(fltsizeq+dlt)*2]; filter[filtersize+{0,1,2,3(,4,5,6,7)}] - paddd m4, m0 - paddd m5, m1 - add filterq, mmsize - add srcq, srcmul*mmsize/2 - cmp srcq, srcendq ; while (src += 4) < &src[filterSize] - jl .innerloop - -%ifidn %4, X4 - mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1] - movlh m0, [srcq+ pos0q *srcmul] ; split last 4 srcpx of dstpx[0] - sub pos1q, fltsizeq ; and first 4 srcpx of dstpx[1] -%if %1 > 8 - movhps m0, [srcq+(pos1q+dlt)*srcmul] -%else ; %1 == 8 - movd m1, [srcq+(pos1q+dlt)*srcmul] - punpckldq m0, m1 -%endif ; %1 == 8 -%if %1 == 8 - punpcklbw m0, m3 -%endif ; %1 == 8 -%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll - ; add back 0x8000 * sum(coeffs) after the horizontal add - psubw m0, m6 -%endif ; %1 == 16 - pmaddwd m0, [filterq] -%endif ; %4 == X4 - - lea filterq, [filterq+(fltsizeq+dlt)*2] - -%if mmsize == 8 ; mmx - movq m0, m4 - punpckldq m4, m5 - punpckhdq m0, m5 - paddd m0, m4 -%else ; mmsize == 16 -%if notcpuflag(ssse3) ; sse2 - mova m1, m4 - punpcklqdq m4, m5 - punpckhqdq m1, m5 - paddd m4, m1 -%else ; ssse3/sse4 - phaddd m4, m5 -%endif ; sse2/ssse3/sse4 -%ifidn %4, X4 - paddd m4, m0 -%endif ; %3 == X4 -%if notcpuflag(ssse3) ; sse2 - pshufd m4, m4, 11011000b - movhlps m0, m4 - paddd m0, m4 -%else ; ssse3/sse4 - phaddd m4, m4 - SWAP 0, 4 -%endif ; sse2/ssse3/sse4 -%endif ; mmsize == 8/16 -%endif ; %3 ==/!= X - -%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned - paddd m0, m7 -%endif ; %1 == 16 - - ; clip, store - psrad m0, 14 + %1 - %2 -%ifidn %3, X - movifnidn dstq, dstmp -%endif ; %3 == X -%if %2 == 15 - packssdw m0, m0 -%ifnidn %3, X - movh [dstq+wq*(2>>wshr)], m0 -%else ; %3 == X - movd [dstq+wq*2], m0 -%endif ; %3 ==/!= X -%else ; %2 == 19 -%if mmsize == 8 - PMINSD_MMX m0, m2, m4 -%elif cpuflag(sse4) - pminsd m0, m2 -%else ; sse2/ssse3 - cvtdq2ps m0, m0 - minps m0, m2 - cvtps2dq m0, m0 -%endif ; mmx/sse2/ssse3/sse4 -%ifnidn %3, X - mova [dstq+wq*(4>>wshr)], m0 -%else ; %3 == X - movq [dstq+wq*4], m0 -%endif ; %3 ==/!= X -%endif ; %2 == 15/19 -%ifnidn %3, X - add wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels) - ; per iteration. 
see "shl wq,1" above as for why we do this -%else ; %3 == X - add wq, 2 -%endif ; %3 ==/!= X - jl .loop - REP_RET -%endmacro - -; SCALE_FUNCS source_width, intermediate_nbits, n_xmm -%macro SCALE_FUNCS 3 -SCALE_FUNC %1, %2, 4, 4, 6, %3 -SCALE_FUNC %1, %2, 8, 8, 6, %3 -%if mmsize == 8 -SCALE_FUNC %1, %2, X, X, 7, %3 -%else -SCALE_FUNC %1, %2, X, X4, 7, %3 -SCALE_FUNC %1, %2, X, X8, 7, %3 -%endif -%endmacro - -; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args -%macro SCALE_FUNCS2 3 -%if notcpuflag(sse4) -SCALE_FUNCS 8, 15, %1 -SCALE_FUNCS 9, 15, %2 -SCALE_FUNCS 10, 15, %2 -SCALE_FUNCS 12, 15, %2 -SCALE_FUNCS 14, 15, %2 -SCALE_FUNCS 16, 15, %3 -%endif ; !sse4 -SCALE_FUNCS 8, 19, %1 -SCALE_FUNCS 9, 19, %2 -SCALE_FUNCS 10, 19, %2 -SCALE_FUNCS 12, 19, %2 -SCALE_FUNCS 14, 19, %2 -SCALE_FUNCS 16, 19, %3 -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmx -SCALE_FUNCS2 0, 0, 0 -%endif -INIT_XMM sse2 -SCALE_FUNCS2 6, 7, 8 -INIT_XMM ssse3 -SCALE_FUNCS2 6, 6, 8 -INIT_XMM sse4 -SCALE_FUNCS2 6, 6, 8 diff --git a/ffmpeg/libswscale/x86/swscale.c b/ffmpeg/libswscale/x86/swscale.c deleted file mode 100644 index 2f7e4f7..0000000 --- a/ffmpeg/libswscale/x86/swscale.c +++ /dev/null @@ -1,580 +0,0 @@ -/* - * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <inttypes.h> -#include "config.h" -#include "libswscale/swscale.h" -#include "libswscale/swscale_internal.h" -#include "libavutil/attributes.h" -#include "libavutil/avassert.h" -#include "libavutil/intreadwrite.h" -#include "libavutil/x86/asm.h" -#include "libavutil/x86/cpu.h" -#include "libavutil/cpu.h" -#include "libavutil/pixdesc.h" - -#if HAVE_INLINE_ASM - -#define DITHER1XBPP - -DECLARE_ASM_CONST(8, uint64_t, bF8)= 0xF8F8F8F8F8F8F8F8LL; -DECLARE_ASM_CONST(8, uint64_t, bFC)= 0xFCFCFCFCFCFCFCFCLL; -DECLARE_ASM_CONST(8, uint64_t, w10)= 0x0010001000100010LL; -DECLARE_ASM_CONST(8, uint64_t, w02)= 0x0002000200020002LL; - -const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = { - 0x0103010301030103LL, - 0x0200020002000200LL,}; - -const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = { - 0x0602060206020602LL, - 0x0004000400040004LL,}; - -DECLARE_ASM_CONST(8, uint64_t, b16Mask)= 0x001F001F001F001FLL; -DECLARE_ASM_CONST(8, uint64_t, g16Mask)= 0x07E007E007E007E0LL; -DECLARE_ASM_CONST(8, uint64_t, r16Mask)= 0xF800F800F800F800LL; -DECLARE_ASM_CONST(8, uint64_t, b15Mask)= 0x001F001F001F001FLL; -DECLARE_ASM_CONST(8, uint64_t, g15Mask)= 0x03E003E003E003E0LL; -DECLARE_ASM_CONST(8, uint64_t, r15Mask)= 0x7C007C007C007C00LL; - -DECLARE_ALIGNED(8, const uint64_t, ff_M24A) = 0x00FF0000FF0000FFLL; -DECLARE_ALIGNED(8, const uint64_t, ff_M24B) = 0xFF0000FF0000FF00LL; -DECLARE_ALIGNED(8, const uint64_t, ff_M24C) = 0x0000FF0000FF0000LL; - -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000020E540830C8BULL; -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000ED0FDAC23831ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00003831D0E6F6EAULL; - -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL; - - -//MMX versions -#if HAVE_MMX_INLINE -#undef RENAME -#define COMPILE_TEMPLATE_MMXEXT 0 -#define RENAME(a) a ## _mmx -#include "swscale_template.c" -#endif - -// MMXEXT versions -#if HAVE_MMXEXT_INLINE -#undef RENAME -#undef COMPILE_TEMPLATE_MMXEXT -#define COMPILE_TEMPLATE_MMXEXT 1 -#define RENAME(a) a ## _mmxext -#include "swscale_template.c" -#endif - -void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex, - int lastInLumBuf, int lastInChrBuf) -{ - const int dstH= c->dstH; - const int flags= c->flags; - int16_t **lumPixBuf= c->lumPixBuf; - int16_t **chrUPixBuf= c->chrUPixBuf; - int16_t **alpPixBuf= c->alpPixBuf; - const int vLumBufSize= c->vLumBufSize; - const int vChrBufSize= c->vChrBufSize; - int32_t *vLumFilterPos= c->vLumFilterPos; - int32_t *vChrFilterPos= c->vChrFilterPos; - int16_t *vLumFilter= c->vLumFilter; - int16_t *vChrFilter= c->vChrFilter; - int32_t *lumMmxFilter= c->lumMmxFilter; - int32_t *chrMmxFilter= c->chrMmxFilter; - int32_t av_unused *alpMmxFilter= c->alpMmxFilter; - const int vLumFilterSize= c->vLumFilterSize; - const int vChrFilterSize= c->vChrFilterSize; - const int chrDstY= dstY>>c->chrDstVSubSample; - const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input - const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input - - c->blueDither= ff_dither8[dstY&1]; - if (c->dstFormat == AV_PIX_FMT_RGB555 || 
c->dstFormat == AV_PIX_FMT_BGR555) - c->greenDither= ff_dither8[dstY&1]; - else - c->greenDither= ff_dither4[dstY&1]; - c->redDither= ff_dither8[(dstY+1)&1]; - if (dstY < dstH - 2) { - const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; - const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; - const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; - int i; - - if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) { - const int16_t **tmpY = (const int16_t **) lumPixBuf + 2 * vLumBufSize; - int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize); - for (i = 0; i < neg; i++) - tmpY[i] = lumSrcPtr[neg]; - for ( ; i < end; i++) - tmpY[i] = lumSrcPtr[i]; - for ( ; i < vLumFilterSize; i++) - tmpY[i] = tmpY[i-1]; - lumSrcPtr = tmpY; - - if (alpSrcPtr) { - const int16_t **tmpA = (const int16_t **) alpPixBuf + 2 * vLumBufSize; - for (i = 0; i < neg; i++) - tmpA[i] = alpSrcPtr[neg]; - for ( ; i < end; i++) - tmpA[i] = alpSrcPtr[i]; - for ( ; i < vLumFilterSize; i++) - tmpA[i] = tmpA[i - 1]; - alpSrcPtr = tmpA; - } - } - if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) { - const int16_t **tmpU = (const int16_t **) chrUPixBuf + 2 * vChrBufSize; - int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize); - for (i = 0; i < neg; i++) { - tmpU[i] = chrUSrcPtr[neg]; - } - for ( ; i < end; i++) { - tmpU[i] = chrUSrcPtr[i]; - } - for ( ; i < vChrFilterSize; i++) { - tmpU[i] = tmpU[i - 1]; - } - chrUSrcPtr = tmpU; - } - - if (flags & SWS_ACCURATE_RND) { - int s= APCK_SIZE / 8; - for (i=0; i<vLumFilterSize; i+=2) { - *(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ]; - *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)]; - lumMmxFilter[s*i+APCK_COEF/4 ]= - lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ] - + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0); - if (CONFIG_SWSCALE_ALPHA && alpPixBuf) { - *(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ]; - *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)]; - alpMmxFilter[s*i+APCK_COEF/4 ]= - alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ]; - } - } - for (i=0; i<vChrFilterSize; i+=2) { - *(const void**)&chrMmxFilter[s*i ]= chrUSrcPtr[i ]; - *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrUSrcPtr[i+(vChrFilterSize>1)]; - chrMmxFilter[s*i+APCK_COEF/4 ]= - chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ] - + (vChrFilterSize>1 ? 
vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0); - } - } else { - for (i=0; i<vLumFilterSize; i++) { - *(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i]; - lumMmxFilter[4*i+2]= - lumMmxFilter[4*i+3]= - ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U; - if (CONFIG_SWSCALE_ALPHA && alpPixBuf) { - *(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i]; - alpMmxFilter[4*i+2]= - alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2]; - } - } - for (i=0; i<vChrFilterSize; i++) { - *(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i]; - chrMmxFilter[4*i+2]= - chrMmxFilter[4*i+3]= - ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001U; - } - } - } -} - -#if HAVE_MMXEXT -static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, - const int16_t **src, uint8_t *dest, int dstW, - const uint8_t *dither, int offset) -{ - if(((int)dest) & 15){ - return yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); - } - if (offset) { - __asm__ volatile("movq (%0), %%xmm3\n\t" - "movdqa %%xmm3, %%xmm4\n\t" - "psrlq $24, %%xmm3\n\t" - "psllq $40, %%xmm4\n\t" - "por %%xmm4, %%xmm3\n\t" - :: "r"(dither) - ); - } else { - __asm__ volatile("movq (%0), %%xmm3\n\t" - :: "r"(dither) - ); - } - filterSize--; - __asm__ volatile( - "pxor %%xmm0, %%xmm0\n\t" - "punpcklbw %%xmm0, %%xmm3\n\t" - "movd %0, %%xmm1\n\t" - "punpcklwd %%xmm1, %%xmm1\n\t" - "punpckldq %%xmm1, %%xmm1\n\t" - "punpcklqdq %%xmm1, %%xmm1\n\t" - "psllw $3, %%xmm1\n\t" - "paddw %%xmm1, %%xmm3\n\t" - "psraw $4, %%xmm3\n\t" - ::"m"(filterSize) - ); - __asm__ volatile( - "movdqa %%xmm3, %%xmm4\n\t" - "movdqa %%xmm3, %%xmm7\n\t" - "movl %3, %%ecx\n\t" - "mov %0, %%"REG_d" \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - ".p2align 4 \n\t" /* FIXME Unroll? */\ - "1: \n\t"\ - "movddup 8(%%"REG_d"), %%xmm0 \n\t" /* filterCoeff */\ - "movdqa (%%"REG_S", %%"REG_c", 2), %%xmm2 \n\t" /* srcData */\ - "movdqa 16(%%"REG_S", %%"REG_c", 2), %%xmm5 \n\t" /* srcData */\ - "add $16, %%"REG_d" \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - "test %%"REG_S", %%"REG_S" \n\t"\ - "pmulhw %%xmm0, %%xmm2 \n\t"\ - "pmulhw %%xmm0, %%xmm5 \n\t"\ - "paddw %%xmm2, %%xmm3 \n\t"\ - "paddw %%xmm5, %%xmm4 \n\t"\ - " jnz 1b \n\t"\ - "psraw $3, %%xmm3 \n\t"\ - "psraw $3, %%xmm4 \n\t"\ - "packuswb %%xmm4, %%xmm3 \n\t" - "movntdq %%xmm3, (%1, %%"REG_c")\n\t" - "add $16, %%"REG_c" \n\t"\ - "cmp %2, %%"REG_c" \n\t"\ - "movdqa %%xmm7, %%xmm3\n\t" - "movdqa %%xmm7, %%xmm4\n\t" - "mov %0, %%"REG_d" \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - "jb 1b \n\t"\ - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset) - : "%"REG_d, "%"REG_S, "%"REG_c - ); -} -#endif - -#endif /* HAVE_INLINE_ASM */ - -#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \ -void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \ - SwsContext *c, int16_t *data, \ - int dstW, const uint8_t *src, \ - const int16_t *filter, \ - const int32_t *filterPos, int filterSize) - -#define SCALE_FUNCS(filter_n, opt) \ - SCALE_FUNC(filter_n, 8, 15, opt); \ - SCALE_FUNC(filter_n, 9, 15, opt); \ - SCALE_FUNC(filter_n, 10, 15, opt); \ - SCALE_FUNC(filter_n, 12, 15, opt); \ - SCALE_FUNC(filter_n, 14, 15, opt); \ - SCALE_FUNC(filter_n, 16, 15, opt); \ - SCALE_FUNC(filter_n, 8, 19, opt); \ - SCALE_FUNC(filter_n, 9, 19, opt); \ - SCALE_FUNC(filter_n, 10, 19, opt); \ - SCALE_FUNC(filter_n, 12, 19, opt); \ - SCALE_FUNC(filter_n, 14, 19, opt); \ - SCALE_FUNC(filter_n, 16, 19, opt) - -#define SCALE_FUNCS_MMX(opt) \ - SCALE_FUNCS(4, opt); \ - SCALE_FUNCS(8, opt); \ - SCALE_FUNCS(X, opt) - 
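The SCALE_FUNC/SCALE_FUNCS macros above only declare prototypes for the external assembly routines in scale.asm. As a rough scalar model of the contract those prototypes describe (illustrative only; the function name and argument order are hypothetical, and the signed-bias correction used by the 16-bit input paths is omitted), an 8-bit-to-15-bit horizontal scale could look like:

#include <stdint.h>

/* Each output pixel is a FIR over filterSize source pixels starting at
 * filterPos[i], using 14-bit filter coefficients; the sum is shifted by
 * 14 + 8 - 15 = 7 and saturated to int16_t, mirroring the psrad/packssdw
 * sequence in scale.asm. */
static void hscale8to15_sketch(int16_t *dst, int dstW, const uint8_t *src,
                               const int16_t *filter, const int32_t *filterPos,
                               int filterSize)
{
    for (int i = 0; i < dstW; i++) {
        int val = 0;
        for (int j = 0; j < filterSize; j++)
            val += src[filterPos[i] + j] * filter[filterSize * i + j];
        val >>= 7;                      /* 14 + 8 - 15 */
        if (val >  0x7FFF) val =  0x7FFF;
        if (val < -0x8000) val = -0x8000;
        dst[i] = (int16_t)val;
    }
}

The wider-input and 19-bit-output variants in scale.asm differ only in the load width, the shift amount (14 + source_bits - intermediate_bits) and the clip constant (0x7ffff for 19-bit output).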
-#define SCALE_FUNCS_SSE(opt) \ - SCALE_FUNCS(4, opt); \ - SCALE_FUNCS(8, opt); \ - SCALE_FUNCS(X4, opt); \ - SCALE_FUNCS(X8, opt) - -#if ARCH_X86_32 -SCALE_FUNCS_MMX(mmx); -#endif -SCALE_FUNCS_SSE(sse2); -SCALE_FUNCS_SSE(ssse3); -SCALE_FUNCS_SSE(sse4); - -#define VSCALEX_FUNC(size, opt) \ -void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \ - const int16_t **src, uint8_t *dest, int dstW, \ - const uint8_t *dither, int offset) -#define VSCALEX_FUNCS(opt) \ - VSCALEX_FUNC(8, opt); \ - VSCALEX_FUNC(9, opt); \ - VSCALEX_FUNC(10, opt) - -#if ARCH_X86_32 -VSCALEX_FUNCS(mmxext); -#endif -VSCALEX_FUNCS(sse2); -VSCALEX_FUNCS(sse4); -VSCALEX_FUNC(16, sse4); -VSCALEX_FUNCS(avx); - -#define VSCALE_FUNC(size, opt) \ -void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int dstW, \ - const uint8_t *dither, int offset) -#define VSCALE_FUNCS(opt1, opt2) \ - VSCALE_FUNC(8, opt1); \ - VSCALE_FUNC(9, opt2); \ - VSCALE_FUNC(10, opt2); \ - VSCALE_FUNC(16, opt1) - -#if ARCH_X86_32 -VSCALE_FUNCS(mmx, mmxext); -#endif -VSCALE_FUNCS(sse2, sse2); -VSCALE_FUNC(16, sse4); -VSCALE_FUNCS(avx, avx); - -#define INPUT_Y_FUNC(fmt, opt) \ -void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \ - const uint8_t *unused1, const uint8_t *unused2, \ - int w, uint32_t *unused) -#define INPUT_UV_FUNC(fmt, opt) \ -void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \ - const uint8_t *unused0, \ - const uint8_t *src1, \ - const uint8_t *src2, \ - int w, uint32_t *unused) -#define INPUT_FUNC(fmt, opt) \ - INPUT_Y_FUNC(fmt, opt); \ - INPUT_UV_FUNC(fmt, opt) -#define INPUT_FUNCS(opt) \ - INPUT_FUNC(uyvy, opt); \ - INPUT_FUNC(yuyv, opt); \ - INPUT_UV_FUNC(nv12, opt); \ - INPUT_UV_FUNC(nv21, opt); \ - INPUT_FUNC(rgba, opt); \ - INPUT_FUNC(bgra, opt); \ - INPUT_FUNC(argb, opt); \ - INPUT_FUNC(abgr, opt); \ - INPUT_FUNC(rgb24, opt); \ - INPUT_FUNC(bgr24, opt) - -#if ARCH_X86_32 -INPUT_FUNCS(mmx); -#endif -INPUT_FUNCS(sse2); -INPUT_FUNCS(ssse3); -INPUT_FUNCS(avx); - -av_cold void ff_sws_init_swscale_x86(SwsContext *c) -{ - int cpu_flags = av_get_cpu_flags(); - -#if HAVE_MMX_INLINE - if (cpu_flags & AV_CPU_FLAG_MMX) - sws_init_swscale_mmx(c); -#endif -#if HAVE_MMXEXT_INLINE - if (cpu_flags & AV_CPU_FLAG_MMXEXT) - sws_init_swscale_mmxext(c); - if (cpu_flags & AV_CPU_FLAG_SSE3){ - if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND)) - c->yuv2planeX = yuv2yuvX_sse3; - } -#endif - -#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \ - if (c->srcBpc == 8) { \ - hscalefn = c->dstBpc <= 14 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \ - ff_hscale8to19_ ## filtersize ## _ ## opt1; \ - } else if (c->srcBpc == 9) { \ - hscalefn = c->dstBpc <= 14 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \ - ff_hscale9to19_ ## filtersize ## _ ## opt1; \ - } else if (c->srcBpc == 10) { \ - hscalefn = c->dstBpc <= 14 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \ - ff_hscale10to19_ ## filtersize ## _ ## opt1; \ - } else if (c->srcBpc == 12) { \ - hscalefn = c->dstBpc <= 14 ? ff_hscale12to15_ ## filtersize ## _ ## opt2 : \ - ff_hscale12to19_ ## filtersize ## _ ## opt1; \ - } else if (c->srcBpc == 14 || ((c->srcFormat==AV_PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth_minus1<15)) { \ - hscalefn = c->dstBpc <= 14 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \ - ff_hscale14to19_ ## filtersize ## _ ## opt1; \ - } else { /* c->srcBpc == 16 */ \ - av_assert0(c->srcBpc == 16);\ - hscalefn = c->dstBpc <= 14 ? 
ff_hscale16to15_ ## filtersize ## _ ## opt2 : \ - ff_hscale16to19_ ## filtersize ## _ ## opt1; \ - } \ -} while (0) -#define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \ - switch (filtersize) { \ - case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \ - case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \ - default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \ - } -#define ASSIGN_VSCALEX_FUNC(vscalefn, opt, do_16_case, condition_8bit) \ -switch(c->dstBpc){ \ - case 16: do_16_case; break; \ - case 10: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_10_ ## opt; break; \ - case 9: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_ ## opt; break; \ - default: if (condition_8bit) /*vscalefn = ff_yuv2planeX_8_ ## opt;*/ break; \ - } -#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \ - switch(c->dstBpc){ \ - case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \ - case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \ - case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \ - case 8: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \ - default: av_assert0(c->dstBpc>8); \ - } -#define case_rgb(x, X, opt) \ - case AV_PIX_FMT_ ## X: \ - c->lumToYV12 = ff_ ## x ## ToY_ ## opt; \ - if (!c->chrSrcHSubSample) \ - c->chrToYV12 = ff_ ## x ## ToUV_ ## opt; \ - break -#if ARCH_X86_32 - if (EXTERNAL_MMX(cpu_flags)) { - ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx); - ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx); - ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmxext, cpu_flags & AV_CPU_FLAG_MMXEXT); - - switch (c->srcFormat) { - case AV_PIX_FMT_Y400A: - c->lumToYV12 = ff_yuyvToY_mmx; - if (c->alpPixBuf) - c->alpToYV12 = ff_uyvyToY_mmx; - break; - case AV_PIX_FMT_YUYV422: - c->lumToYV12 = ff_yuyvToY_mmx; - c->chrToYV12 = ff_yuyvToUV_mmx; - break; - case AV_PIX_FMT_UYVY422: - c->lumToYV12 = ff_uyvyToY_mmx; - c->chrToYV12 = ff_uyvyToUV_mmx; - break; - case AV_PIX_FMT_NV12: - c->chrToYV12 = ff_nv12ToUV_mmx; - break; - case AV_PIX_FMT_NV21: - c->chrToYV12 = ff_nv21ToUV_mmx; - break; - case_rgb(rgb24, RGB24, mmx); - case_rgb(bgr24, BGR24, mmx); - case_rgb(bgra, BGRA, mmx); - case_rgb(rgba, RGBA, mmx); - case_rgb(abgr, ABGR, mmx); - case_rgb(argb, ARGB, mmx); - default: - break; - } - } - if (EXTERNAL_MMXEXT(cpu_flags)) { - ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmxext, , 1); - } -#endif /* ARCH_X86_32 */ -#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \ - switch (filtersize) { \ - case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \ - case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \ - default: if (filtersize & 4) ASSIGN_SCALE_FUNC2(hscalefn, X4, opt1, opt2); \ - else ASSIGN_SCALE_FUNC2(hscalefn, X8, opt1, opt2); \ - break; \ - } - if (EXTERNAL_SSE2(cpu_flags)) { - ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2); - ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2); - ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, , - HAVE_ALIGNED_STACK || ARCH_X86_64); - ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1); - - switch (c->srcFormat) { - case AV_PIX_FMT_Y400A: - c->lumToYV12 = ff_yuyvToY_sse2; - if (c->alpPixBuf) - c->alpToYV12 = ff_uyvyToY_sse2; - break; - case AV_PIX_FMT_YUYV422: - c->lumToYV12 = ff_yuyvToY_sse2; - c->chrToYV12 = ff_yuyvToUV_sse2; - break; - case AV_PIX_FMT_UYVY422: - c->lumToYV12 = ff_uyvyToY_sse2; - c->chrToYV12 = ff_uyvyToUV_sse2; - break; - case AV_PIX_FMT_NV12: - 
c->chrToYV12 = ff_nv12ToUV_sse2; - break; - case AV_PIX_FMT_NV21: - c->chrToYV12 = ff_nv21ToUV_sse2; - break; - case_rgb(rgb24, RGB24, sse2); - case_rgb(bgr24, BGR24, sse2); - case_rgb(bgra, BGRA, sse2); - case_rgb(rgba, RGBA, sse2); - case_rgb(abgr, ABGR, sse2); - case_rgb(argb, ARGB, sse2); - default: - break; - } - } - if (EXTERNAL_SSSE3(cpu_flags)) { - ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3); - ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, ssse3, ssse3); - switch (c->srcFormat) { - case_rgb(rgb24, RGB24, ssse3); - case_rgb(bgr24, BGR24, ssse3); - default: - break; - } - } - if (EXTERNAL_SSE4(cpu_flags)) { - /* Xto15 don't need special sse4 functions */ - ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3); - ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3); - ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse4, - if (!isBE(c->dstFormat)) c->yuv2planeX = ff_yuv2planeX_16_sse4, - HAVE_ALIGNED_STACK || ARCH_X86_64); - if (c->dstBpc == 16 && !isBE(c->dstFormat)) - c->yuv2plane1 = ff_yuv2plane1_16_sse4; - } - - if (EXTERNAL_AVX(cpu_flags)) { - ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, , - HAVE_ALIGNED_STACK || ARCH_X86_64); - ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1); - - switch (c->srcFormat) { - case AV_PIX_FMT_YUYV422: - c->chrToYV12 = ff_yuyvToUV_avx; - break; - case AV_PIX_FMT_UYVY422: - c->chrToYV12 = ff_uyvyToUV_avx; - break; - case AV_PIX_FMT_NV12: - c->chrToYV12 = ff_nv12ToUV_avx; - break; - case AV_PIX_FMT_NV21: - c->chrToYV12 = ff_nv21ToUV_avx; - break; - case_rgb(rgb24, RGB24, avx); - case_rgb(bgr24, BGR24, avx); - case_rgb(bgra, BGRA, avx); - case_rgb(rgba, RGBA, avx); - case_rgb(abgr, ABGR, avx); - case_rgb(argb, ARGB, avx); - default: - break; - } - } -} diff --git a/ffmpeg/libswscale/x86/swscale_template.c b/ffmpeg/libswscale/x86/swscale_template.c deleted file mode 100644 index c7a1bb4..0000000 --- a/ffmpeg/libswscale/x86/swscale_template.c +++ /dev/null @@ -1,1717 +0,0 @@ -/* - * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#undef REAL_MOVNTQ -#undef MOVNTQ -#undef MOVNTQ2 -#undef PREFETCH - -#if COMPILE_TEMPLATE_MMXEXT -#define PREFETCH "prefetchnta" -#else -#define PREFETCH " # nop" -#endif - -#if COMPILE_TEMPLATE_MMXEXT -#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" -#define MOVNTQ2 "movntq " -#else -#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" -#define MOVNTQ2 "movq " -#endif -#define MOVNTQ(a,b) REAL_MOVNTQ(a,b) - -#if !COMPILE_TEMPLATE_MMXEXT -static av_always_inline void -dither_8to16(const uint8_t *srcDither, int rot) -{ - if (rot) { - __asm__ volatile("pxor %%mm0, %%mm0\n\t" - "movq (%0), %%mm3\n\t" - "movq %%mm3, %%mm4\n\t" - "psrlq $24, %%mm3\n\t" - "psllq $40, %%mm4\n\t" - "por %%mm4, %%mm3\n\t" - "movq %%mm3, %%mm4\n\t" - "punpcklbw %%mm0, %%mm3\n\t" - "punpckhbw %%mm0, %%mm4\n\t" - :: "r"(srcDither) - ); - } else { - __asm__ volatile("pxor %%mm0, %%mm0\n\t" - "movq (%0), %%mm3\n\t" - "movq %%mm3, %%mm4\n\t" - "punpcklbw %%mm0, %%mm3\n\t" - "punpckhbw %%mm0, %%mm4\n\t" - :: "r"(srcDither) - ); - } -} -#endif - -static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize, - const int16_t **src, uint8_t *dest, int dstW, - const uint8_t *dither, int offset) -{ - dither_8to16(dither, offset); - filterSize--; - __asm__ volatile( - "movd %0, %%mm1\n\t" - "punpcklwd %%mm1, %%mm1\n\t" - "punpckldq %%mm1, %%mm1\n\t" - "psllw $3, %%mm1\n\t" - "paddw %%mm1, %%mm3\n\t" - "paddw %%mm1, %%mm4\n\t" - "psraw $4, %%mm3\n\t" - "psraw $4, %%mm4\n\t" - ::"m"(filterSize) - ); - - __asm__ volatile(\ - "movq %%mm3, %%mm6\n\t" - "movq %%mm4, %%mm7\n\t" - "movl %3, %%ecx\n\t" - "mov %0, %%"REG_d" \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - ".p2align 4 \n\t" /* FIXME Unroll? 
*/\ - "1: \n\t"\ - "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ - "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\ - "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\ - "add $16, %%"REG_d" \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - "test %%"REG_S", %%"REG_S" \n\t"\ - "pmulhw %%mm0, %%mm2 \n\t"\ - "pmulhw %%mm0, %%mm5 \n\t"\ - "paddw %%mm2, %%mm3 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - " jnz 1b \n\t"\ - "psraw $3, %%mm3 \n\t"\ - "psraw $3, %%mm4 \n\t"\ - "packuswb %%mm4, %%mm3 \n\t" - MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t" - "add $8, %%"REG_c" \n\t"\ - "cmp %2, %%"REG_c" \n\t"\ - "movq %%mm6, %%mm3\n\t" - "movq %%mm7, %%mm4\n\t" - "mov %0, %%"REG_d" \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - "jb 1b \n\t"\ - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset) - : "%"REG_d, "%"REG_S, "%"REG_c - ); -} - -#define YSCALEYUV2PACKEDX_UV \ - __asm__ volatile(\ - "xor %%"REG_a", %%"REG_a" \n\t"\ - ".p2align 4 \n\t"\ - "nop \n\t"\ - "1: \n\t"\ - "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ - "movq %%mm3, %%mm4 \n\t"\ - ".p2align 4 \n\t"\ - "2: \n\t"\ - "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ - "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\ - "add %6, %%"REG_S" \n\t" \ - "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\ - "add $16, %%"REG_d" \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - "pmulhw %%mm0, %%mm2 \n\t"\ - "pmulhw %%mm0, %%mm5 \n\t"\ - "paddw %%mm2, %%mm3 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "test %%"REG_S", %%"REG_S" \n\t"\ - " jnz 2b \n\t"\ - -#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \ - "lea "offset"(%0), %%"REG_d" \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\ - "movq "#dst1", "#dst2" \n\t"\ - ".p2align 4 \n\t"\ - "2: \n\t"\ - "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\ - "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\ - "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\ - "add $16, %%"REG_d" \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - "pmulhw "#coeff", "#src1" \n\t"\ - "pmulhw "#coeff", "#src2" \n\t"\ - "paddw "#src1", "#dst1" \n\t"\ - "paddw "#src2", "#dst2" \n\t"\ - "test %%"REG_S", %%"REG_S" \n\t"\ - " jnz 2b \n\t"\ - -#define YSCALEYUV2PACKEDX \ - YSCALEYUV2PACKEDX_UV \ - YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \ - -#define YSCALEYUV2PACKEDX_END \ - :: "r" (&c->redDither), \ - "m" (dummy), "m" (dummy), "m" (dummy),\ - "r" (dest), "m" (dstW_reg), "m"(uv_off) \ - : "%"REG_a, "%"REG_d, "%"REG_S \ - ); - -#define YSCALEYUV2PACKEDX_ACCURATE_UV \ - __asm__ volatile(\ - "xor %%"REG_a", %%"REG_a" \n\t"\ - ".p2align 4 \n\t"\ - "nop \n\t"\ - "1: \n\t"\ - "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - "pxor %%mm4, %%mm4 \n\t"\ - "pxor %%mm5, %%mm5 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ - ".p2align 4 \n\t"\ - "2: \n\t"\ - "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\ - "add %6, %%"REG_S" \n\t" \ - "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\ - "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ - "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\ - "movq %%mm0, %%mm3 \n\t"\ - "punpcklwd %%mm1, %%mm0 \n\t"\ - "punpckhwd %%mm1, %%mm3 \n\t"\ - "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\ - "pmaddwd %%mm1, %%mm0 \n\t"\ - "pmaddwd %%mm1, %%mm3 \n\t"\ - "paddd %%mm0, 
%%mm4 \n\t"\ - "paddd %%mm3, %%mm5 \n\t"\ - "add %6, %%"REG_S" \n\t" \ - "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\ - "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ - "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ - "test %%"REG_S", %%"REG_S" \n\t"\ - "movq %%mm2, %%mm0 \n\t"\ - "punpcklwd %%mm3, %%mm2 \n\t"\ - "punpckhwd %%mm3, %%mm0 \n\t"\ - "pmaddwd %%mm1, %%mm2 \n\t"\ - "pmaddwd %%mm1, %%mm0 \n\t"\ - "paddd %%mm2, %%mm6 \n\t"\ - "paddd %%mm0, %%mm7 \n\t"\ - " jnz 2b \n\t"\ - "psrad $16, %%mm4 \n\t"\ - "psrad $16, %%mm5 \n\t"\ - "psrad $16, %%mm6 \n\t"\ - "psrad $16, %%mm7 \n\t"\ - "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ - "packssdw %%mm5, %%mm4 \n\t"\ - "packssdw %%mm7, %%mm6 \n\t"\ - "paddw %%mm0, %%mm4 \n\t"\ - "paddw %%mm0, %%mm6 \n\t"\ - "movq %%mm4, "U_TEMP"(%0) \n\t"\ - "movq %%mm6, "V_TEMP"(%0) \n\t"\ - -#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \ - "lea "offset"(%0), %%"REG_d" \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - "pxor %%mm1, %%mm1 \n\t"\ - "pxor %%mm5, %%mm5 \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - ".p2align 4 \n\t"\ - "2: \n\t"\ - "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\ - "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\ - "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ - "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\ - "movq %%mm0, %%mm3 \n\t"\ - "punpcklwd %%mm4, %%mm0 \n\t"\ - "punpckhwd %%mm4, %%mm3 \n\t"\ - "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\ - "pmaddwd %%mm4, %%mm0 \n\t"\ - "pmaddwd %%mm4, %%mm3 \n\t"\ - "paddd %%mm0, %%mm1 \n\t"\ - "paddd %%mm3, %%mm5 \n\t"\ - "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\ - "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ - "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ - "test %%"REG_S", %%"REG_S" \n\t"\ - "movq %%mm2, %%mm0 \n\t"\ - "punpcklwd %%mm3, %%mm2 \n\t"\ - "punpckhwd %%mm3, %%mm0 \n\t"\ - "pmaddwd %%mm4, %%mm2 \n\t"\ - "pmaddwd %%mm4, %%mm0 \n\t"\ - "paddd %%mm2, %%mm7 \n\t"\ - "paddd %%mm0, %%mm6 \n\t"\ - " jnz 2b \n\t"\ - "psrad $16, %%mm1 \n\t"\ - "psrad $16, %%mm5 \n\t"\ - "psrad $16, %%mm7 \n\t"\ - "psrad $16, %%mm6 \n\t"\ - "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ - "packssdw %%mm5, %%mm1 \n\t"\ - "packssdw %%mm6, %%mm7 \n\t"\ - "paddw %%mm0, %%mm1 \n\t"\ - "paddw %%mm0, %%mm7 \n\t"\ - "movq "U_TEMP"(%0), %%mm3 \n\t"\ - "movq "V_TEMP"(%0), %%mm4 \n\t"\ - -#define YSCALEYUV2PACKEDX_ACCURATE \ - YSCALEYUV2PACKEDX_ACCURATE_UV \ - YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET) - -#define YSCALEYUV2RGBX \ - "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\ - "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\ - "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ - "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ - "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ - "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ - /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ - "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ - "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ - "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\ - "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\ - "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ - "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ - /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ - "paddw %%mm3, %%mm4 \n\t"\ - "movq %%mm2, %%mm0 \n\t"\ - "movq %%mm5, %%mm6 \n\t"\ - "movq %%mm4, %%mm3 \n\t"\ - "punpcklwd %%mm2, %%mm2 \n\t"\ - "punpcklwd %%mm5, %%mm5 \n\t"\ - "punpcklwd %%mm4, %%mm4 \n\t"\ - "paddw %%mm1, %%mm2 \n\t"\ - "paddw %%mm1, %%mm5 \n\t"\ - "paddw %%mm1, %%mm4 \n\t"\ - "punpckhwd %%mm0, %%mm0 \n\t"\ - "punpckhwd %%mm6, 
%%mm6 \n\t"\ - "punpckhwd %%mm3, %%mm3 \n\t"\ - "paddw %%mm7, %%mm0 \n\t"\ - "paddw %%mm7, %%mm6 \n\t"\ - "paddw %%mm7, %%mm3 \n\t"\ - /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ - "packuswb %%mm0, %%mm2 \n\t"\ - "packuswb %%mm6, %%mm5 \n\t"\ - "packuswb %%mm3, %%mm4 \n\t"\ - -#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \ - "movq "#b", "#q2" \n\t" /* B */\ - "movq "#r", "#t" \n\t" /* R */\ - "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\ - "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\ - "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\ - "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\ - "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\ - "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\ - "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\ - "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\ - "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\ - "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\ -\ - MOVNTQ( q0, (dst, index, 4))\ - MOVNTQ( b, 8(dst, index, 4))\ - MOVNTQ( q2, 16(dst, index, 4))\ - MOVNTQ( q3, 24(dst, index, 4))\ -\ - "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ - " jb 1b \n\t" -#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) - -static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, - const int16_t **lumSrc, int lumFilterSize, - const int16_t *chrFilter, const int16_t **chrUSrc, - const int16_t **chrVSrc, - int chrFilterSize, const int16_t **alpSrc, - uint8_t *dest, int dstW, int dstY) -{ - x86_reg dummy=0; - x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_offx2; - - if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { - YSCALEYUV2PACKEDX_ACCURATE - YSCALEYUV2RGBX - "movq %%mm2, "U_TEMP"(%0) \n\t" - "movq %%mm4, "V_TEMP"(%0) \n\t" - "movq %%mm5, "Y_TEMP"(%0) \n\t" - YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET) - "movq "Y_TEMP"(%0), %%mm5 \n\t" - "psraw $3, %%mm1 \n\t" - "psraw $3, %%mm7 \n\t" - "packuswb %%mm7, %%mm1 \n\t" - WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6) - YSCALEYUV2PACKEDX_END - } else { - YSCALEYUV2PACKEDX_ACCURATE - YSCALEYUV2RGBX - "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) - YSCALEYUV2PACKEDX_END - } -} - -static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, - const int16_t **lumSrc, int lumFilterSize, - const int16_t *chrFilter, const int16_t **chrUSrc, - const int16_t **chrVSrc, - int chrFilterSize, const int16_t **alpSrc, - uint8_t *dest, int dstW, int dstY) -{ - x86_reg dummy=0; - x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_offx2; - - if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { - YSCALEYUV2PACKEDX - YSCALEYUV2RGBX - YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7) - "psraw $3, %%mm1 \n\t" - "psraw $3, %%mm7 \n\t" - "packuswb %%mm7, %%mm1 \n\t" - WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) - YSCALEYUV2PACKEDX_END - } else { - YSCALEYUV2PACKEDX - YSCALEYUV2RGBX - "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) - YSCALEYUV2PACKEDX_END - } -} - -#define REAL_WRITERGB16(dst, dstw, index) \ - "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ - "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\ - "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ - "psrlq $3, %%mm2 \n\t"\ -\ - "movq %%mm2, %%mm1 \n\t"\ - "movq %%mm4, %%mm3 \n\t"\ -\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm5, %%mm2 \n\t"\ - "punpckhbw 
%%mm7, %%mm4 \n\t"\ - "punpckhbw %%mm5, %%mm1 \n\t"\ -\ - "psllq $3, %%mm3 \n\t"\ - "psllq $3, %%mm4 \n\t"\ -\ - "por %%mm3, %%mm2 \n\t"\ - "por %%mm4, %%mm1 \n\t"\ -\ - MOVNTQ(%%mm2, (dst, index, 2))\ - MOVNTQ(%%mm1, 8(dst, index, 2))\ -\ - "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ - " jb 1b \n\t" -#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index) - -static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, - const int16_t **lumSrc, int lumFilterSize, - const int16_t *chrFilter, const int16_t **chrUSrc, - const int16_t **chrVSrc, - int chrFilterSize, const int16_t **alpSrc, - uint8_t *dest, int dstW, int dstY) -{ - x86_reg dummy=0; - x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_offx2; - - YSCALEYUV2PACKEDX_ACCURATE - YSCALEYUV2RGBX - "pxor %%mm7, %%mm7 \n\t" - /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ -#ifdef DITHER1XBPP - "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" - "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" - "paddusb "RED_DITHER"(%0), %%mm5\n\t" -#endif - WRITERGB16(%4, %5, %%REGa) - YSCALEYUV2PACKEDX_END -} - -static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, - const int16_t **lumSrc, int lumFilterSize, - const int16_t *chrFilter, const int16_t **chrUSrc, - const int16_t **chrVSrc, - int chrFilterSize, const int16_t **alpSrc, - uint8_t *dest, int dstW, int dstY) -{ - x86_reg dummy=0; - x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_offx2; - - YSCALEYUV2PACKEDX - YSCALEYUV2RGBX - "pxor %%mm7, %%mm7 \n\t" - /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ -#ifdef DITHER1XBPP - "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" - "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" - "paddusb "RED_DITHER"(%0), %%mm5 \n\t" -#endif - WRITERGB16(%4, %5, %%REGa) - YSCALEYUV2PACKEDX_END -} - -#define REAL_WRITERGB15(dst, dstw, index) \ - "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ - "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\ - "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ - "psrlq $3, %%mm2 \n\t"\ - "psrlq $1, %%mm5 \n\t"\ -\ - "movq %%mm2, %%mm1 \n\t"\ - "movq %%mm4, %%mm3 \n\t"\ -\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm5, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm4 \n\t"\ - "punpckhbw %%mm5, %%mm1 \n\t"\ -\ - "psllq $2, %%mm3 \n\t"\ - "psllq $2, %%mm4 \n\t"\ -\ - "por %%mm3, %%mm2 \n\t"\ - "por %%mm4, %%mm1 \n\t"\ -\ - MOVNTQ(%%mm2, (dst, index, 2))\ - MOVNTQ(%%mm1, 8(dst, index, 2))\ -\ - "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ - " jb 1b \n\t" -#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) - -static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, - const int16_t **lumSrc, int lumFilterSize, - const int16_t *chrFilter, const int16_t **chrUSrc, - const int16_t **chrVSrc, - int chrFilterSize, const int16_t **alpSrc, - uint8_t *dest, int dstW, int dstY) -{ - x86_reg dummy=0; - x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_offx2; - - YSCALEYUV2PACKEDX_ACCURATE - YSCALEYUV2RGBX - "pxor %%mm7, %%mm7 \n\t" - /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ -#ifdef DITHER1XBPP - "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" - "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" - "paddusb "RED_DITHER"(%0), %%mm5\n\t" -#endif - WRITERGB15(%4, %5, %%REGa) - YSCALEYUV2PACKEDX_END -} - -static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, - const int16_t **lumSrc, int lumFilterSize, - const int16_t *chrFilter, const int16_t **chrUSrc, - const int16_t **chrVSrc, - int chrFilterSize, const int16_t **alpSrc, - uint8_t *dest, int dstW, int dstY) -{ - x86_reg dummy=0; - x86_reg dstW_reg = dstW; - x86_reg 
uv_off = c->uv_offx2; - - YSCALEYUV2PACKEDX - YSCALEYUV2RGBX - "pxor %%mm7, %%mm7 \n\t" - /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ -#ifdef DITHER1XBPP - "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" - "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" - "paddusb "RED_DITHER"(%0), %%mm5 \n\t" -#endif - WRITERGB15(%4, %5, %%REGa) - YSCALEYUV2PACKEDX_END -} - -#define WRITEBGR24MMX(dst, dstw, index) \ - /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ - "movq %%mm2, %%mm1 \n\t" /* B */\ - "movq %%mm5, %%mm6 \n\t" /* R */\ - "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ - "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ - "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ - "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ - "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ - "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ - "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ - "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ - "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ - "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ -\ - "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ - "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\ - "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\ - "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\ -\ - "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\ - "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\ - "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\ - "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\ -\ - "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\ - "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\ - "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\ - "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\ -\ - "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\ - "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\ - "psllq $40, %%mm2 \n\t" /* GB000000 1 */\ - "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ - MOVNTQ(%%mm0, (dst))\ -\ - "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\ - "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\ - "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\ - "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\ - MOVNTQ(%%mm6, 8(dst))\ -\ - "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\ - "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\ - "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\ - MOVNTQ(%%mm5, 16(dst))\ -\ - "add $24, "#dst" \n\t"\ -\ - "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ - " jb 1b \n\t" - -#define WRITEBGR24MMXEXT(dst, dstw, index) \ - /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ - "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\ - "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\ - "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\ - "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\ - "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\ -\ - "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\ - "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\ - "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\ -\ - "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\ - "por %%mm1, %%mm6 \n\t"\ - "por %%mm3, %%mm6 \n\t"\ - MOVNTQ(%%mm6, (dst))\ -\ - "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\ - "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\ - "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\ - "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\ -\ - "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\ - "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\ - "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\ -\ - "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\ - "por %%mm3, %%mm6 \n\t"\ - MOVNTQ(%%mm6, 8(dst))\ -\ - "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\ - "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\ - "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 
*/\ -\ - "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\ - "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\ - "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\ -\ - "por %%mm1, %%mm3 \n\t"\ - "por %%mm3, %%mm6 \n\t"\ - MOVNTQ(%%mm6, 16(dst))\ -\ - "add $24, "#dst" \n\t"\ -\ - "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ - " jb 1b \n\t" - -#if COMPILE_TEMPLATE_MMXEXT -#undef WRITEBGR24 -#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index) -#else -#undef WRITEBGR24 -#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) -#endif - -static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, - const int16_t **lumSrc, int lumFilterSize, - const int16_t *chrFilter, const int16_t **chrUSrc, - const int16_t **chrVSrc, - int chrFilterSize, const int16_t **alpSrc, - uint8_t *dest, int dstW, int dstY) -{ - x86_reg dummy=0; - x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_offx2; - - YSCALEYUV2PACKEDX_ACCURATE - YSCALEYUV2RGBX - "pxor %%mm7, %%mm7 \n\t" - "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize - "add %4, %%"REG_c" \n\t" - WRITEBGR24(%%REGc, %5, %%REGa) - :: "r" (&c->redDither), - "m" (dummy), "m" (dummy), "m" (dummy), - "r" (dest), "m" (dstW_reg), "m"(uv_off) - : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S - ); -} - -static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, - const int16_t **lumSrc, int lumFilterSize, - const int16_t *chrFilter, const int16_t **chrUSrc, - const int16_t **chrVSrc, - int chrFilterSize, const int16_t **alpSrc, - uint8_t *dest, int dstW, int dstY) -{ - x86_reg dummy=0; - x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_offx2; - - YSCALEYUV2PACKEDX - YSCALEYUV2RGBX - "pxor %%mm7, %%mm7 \n\t" - "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize - "add %4, %%"REG_c" \n\t" - WRITEBGR24(%%REGc, %5, %%REGa) - :: "r" (&c->redDither), - "m" (dummy), "m" (dummy), "m" (dummy), - "r" (dest), "m" (dstW_reg), "m"(uv_off) - : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S - ); -} - -#define REAL_WRITEYUY2(dst, dstw, index) \ - "packuswb %%mm3, %%mm3 \n\t"\ - "packuswb %%mm4, %%mm4 \n\t"\ - "packuswb %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm4, %%mm3 \n\t"\ - "movq %%mm1, %%mm7 \n\t"\ - "punpcklbw %%mm3, %%mm1 \n\t"\ - "punpckhbw %%mm3, %%mm7 \n\t"\ -\ - MOVNTQ(%%mm1, (dst, index, 2))\ - MOVNTQ(%%mm7, 8(dst, index, 2))\ -\ - "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ - " jb 1b \n\t" -#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) - -static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter, - const int16_t **lumSrc, int lumFilterSize, - const int16_t *chrFilter, const int16_t **chrUSrc, - const int16_t **chrVSrc, - int chrFilterSize, const int16_t **alpSrc, - uint8_t *dest, int dstW, int dstY) -{ - x86_reg dummy=0; - x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_offx2; - - YSCALEYUV2PACKEDX_ACCURATE - /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ - "psraw $3, %%mm3 \n\t" - "psraw $3, %%mm4 \n\t" - "psraw $3, %%mm1 \n\t" - "psraw $3, %%mm7 \n\t" - WRITEYUY2(%4, %5, %%REGa) - YSCALEYUV2PACKEDX_END -} - -static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, - const int16_t **lumSrc, int lumFilterSize, - const int16_t *chrFilter, const int16_t **chrUSrc, - const int16_t **chrVSrc, - int chrFilterSize, const int16_t **alpSrc, - uint8_t *dest, int dstW, int dstY) -{ - x86_reg dummy=0; - x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_offx2; - - YSCALEYUV2PACKEDX - /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ - "psraw $3, %%mm3 \n\t" - "psraw 
$3, %%mm4 \n\t" - "psraw $3, %%mm1 \n\t" - "psraw $3, %%mm7 \n\t" - WRITEYUY2(%4, %5, %%REGa) - YSCALEYUV2PACKEDX_END -} - -#define REAL_YSCALEYUV2RGB_UV(index, c) \ - "xor "#index", "#index" \n\t"\ - ".p2align 4 \n\t"\ - "1: \n\t"\ - "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ - "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ - "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ - "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ - "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ - "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ - "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ - "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ - "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ - "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ - "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ - "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ - "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ - "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ - "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ - "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ - "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ - "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ - "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ - "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ - "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ - /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ - -#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \ - "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ - "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ - "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ - "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ - "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ - "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ - "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ - "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ - "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ - "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ - "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ - "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ - -#define REAL_YSCALEYUV2RGB_COEFF(c) \ - "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ - "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ - "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ - "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ - "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ - "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ - /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ - "paddw %%mm3, %%mm4 \n\t"\ - "movq %%mm2, %%mm0 \n\t"\ - "movq %%mm5, %%mm6 \n\t"\ - "movq %%mm4, %%mm3 \n\t"\ - "punpcklwd %%mm2, %%mm2 \n\t"\ - "punpcklwd %%mm5, %%mm5 \n\t"\ - "punpcklwd %%mm4, %%mm4 \n\t"\ - "paddw %%mm1, %%mm2 \n\t"\ - "paddw %%mm1, %%mm5 \n\t"\ - "paddw %%mm1, %%mm4 \n\t"\ - "punpckhwd %%mm0, %%mm0 \n\t"\ - "punpckhwd %%mm6, %%mm6 \n\t"\ - "punpckhwd %%mm3, %%mm3 \n\t"\ - "paddw %%mm7, %%mm0 \n\t"\ - "paddw %%mm7, %%mm6 \n\t"\ - "paddw %%mm7, %%mm3 \n\t"\ - /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ - "packuswb %%mm0, %%mm2 \n\t"\ - "packuswb %%mm6, %%mm5 \n\t"\ - "packuswb %%mm3, %%mm4 \n\t"\ - -#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) - -#define YSCALEYUV2RGB(index, c) \ - REAL_YSCALEYUV2RGB_UV(index, c) \ - 
REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \ - REAL_YSCALEYUV2RGB_COEFF(c) - -/** - * vertical bilinear scale YV12 to RGB - */ -static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2], - const int16_t *ubuf[2], const int16_t *vbuf[2], - const int16_t *abuf[2], uint8_t *dest, - int dstW, int yalpha, int uvalpha, int y) -{ - const int16_t *buf0 = buf[0], *buf1 = buf[1], - *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; - - if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { - const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1]; -#if ARCH_X86_64 - __asm__ volatile( - YSCALEYUV2RGB(%%r8, %5) - YSCALEYUV2RGB_YA(%%r8, %5, %6, %7) - "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ - "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ - "packuswb %%mm7, %%mm1 \n\t" - WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) - :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest), - "a" (&c->redDither), - "r" (abuf0), "r" (abuf1) - : "%r8" - ); -#else - c->u_temp=(intptr_t)abuf0; - c->v_temp=(intptr_t)abuf1; - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB(%%REGBP, %5) - "push %0 \n\t" - "push %1 \n\t" - "mov "U_TEMP"(%5), %0 \n\t" - "mov "V_TEMP"(%5), %1 \n\t" - YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1) - "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ - "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ - "packuswb %%mm7, %%mm1 \n\t" - "pop %1 \n\t" - "pop %0 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), - "a" (&c->redDither) - ); -#endif - } else { - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB(%%REGBP, %5) - "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), - "a" (&c->redDither) - ); - } -} - -static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2], - const int16_t *ubuf[2], const int16_t *vbuf[2], - const int16_t *abuf[2], uint8_t *dest, - int dstW, int yalpha, int uvalpha, int y) -{ - const int16_t *buf0 = buf[0], *buf1 = buf[1], - *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; - - //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB(%%REGBP, %5) - "pxor %%mm7, %%mm7 \n\t" - WRITEBGR24(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), - "a" (&c->redDither) - ); -} - -static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2], - const int16_t *ubuf[2], const int16_t *vbuf[2], - const int16_t *abuf[2], uint8_t *dest, - int dstW, int yalpha, int uvalpha, int y) -{ - const int16_t *buf0 = buf[0], *buf1 = buf[1], - *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; - - //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB(%%REGBP, %5) - "pxor %%mm7, %%mm7 \n\t" - /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 
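    /* With DITHER1XBPP, the per-line ordered-dither bytes prepared in
       updateMMXDitherTables() (blueDither/greenDither/redDither) are added
       with unsigned saturation before the pixels are packed down to
       15-bit RGB. */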
-#ifdef DITHER1XBPP - "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" - "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" - "paddusb "RED_DITHER"(%5), %%mm5 \n\t" -#endif - WRITERGB15(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), - "a" (&c->redDither) - ); -} - -static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2], - const int16_t *ubuf[2], const int16_t *vbuf[2], - const int16_t *abuf[2], uint8_t *dest, - int dstW, int yalpha, int uvalpha, int y) -{ - const int16_t *buf0 = buf[0], *buf1 = buf[1], - *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; - - //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB(%%REGBP, %5) - "pxor %%mm7, %%mm7 \n\t" - /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ -#ifdef DITHER1XBPP - "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" - "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" - "paddusb "RED_DITHER"(%5), %%mm5 \n\t" -#endif - WRITERGB16(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), - "a" (&c->redDither) - ); -} - -#define REAL_YSCALEYUV2PACKED(index, c) \ - "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ - "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\ - "psraw $3, %%mm0 \n\t"\ - "psraw $3, %%mm1 \n\t"\ - "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\ - "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\ - "xor "#index", "#index" \n\t"\ - ".p2align 4 \n\t"\ - "1: \n\t"\ - "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ - "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ - "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ - "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ - "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ - "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ - "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ - "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ - "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ - "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ - "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ - "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ - "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ - "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ - "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ - "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ - "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ - "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ - "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ - "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ - "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ - "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ - "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ - "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ - "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ - "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ - "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ - -#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) - -static void RENAME(yuv2yuyv422_2)(SwsContext 
*c, const int16_t *buf[2], - const int16_t *ubuf[2], const int16_t *vbuf[2], - const int16_t *abuf[2], uint8_t *dest, - int dstW, int yalpha, int uvalpha, int y) -{ - const int16_t *buf0 = buf[0], *buf1 = buf[1], - *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; - - //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2PACKED(%%REGBP, %5) - WRITEYUY2(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), - "a" (&c->redDither) - ); -} - -#define REAL_YSCALEYUV2RGB1(index, c) \ - "xor "#index", "#index" \n\t"\ - ".p2align 4 \n\t"\ - "1: \n\t"\ - "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ - "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ - "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ - "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ - "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ - "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ - "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ - "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ - "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ - "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ - "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ - "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ - /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ - "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ - "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ - "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ - "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ - "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ - "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ - "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ - "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ - "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ - "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ - /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ - "paddw %%mm3, %%mm4 \n\t"\ - "movq %%mm2, %%mm0 \n\t"\ - "movq %%mm5, %%mm6 \n\t"\ - "movq %%mm4, %%mm3 \n\t"\ - "punpcklwd %%mm2, %%mm2 \n\t"\ - "punpcklwd %%mm5, %%mm5 \n\t"\ - "punpcklwd %%mm4, %%mm4 \n\t"\ - "paddw %%mm1, %%mm2 \n\t"\ - "paddw %%mm1, %%mm5 \n\t"\ - "paddw %%mm1, %%mm4 \n\t"\ - "punpckhwd %%mm0, %%mm0 \n\t"\ - "punpckhwd %%mm6, %%mm6 \n\t"\ - "punpckhwd %%mm3, %%mm3 \n\t"\ - "paddw %%mm7, %%mm0 \n\t"\ - "paddw %%mm7, %%mm6 \n\t"\ - "paddw %%mm7, %%mm3 \n\t"\ - /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ - "packuswb %%mm0, %%mm2 \n\t"\ - "packuswb %%mm6, %%mm5 \n\t"\ - "packuswb %%mm3, %%mm4 \n\t"\ - -#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) - -// do vertical chrominance interpolation -#define REAL_YSCALEYUV2RGB1b(index, c) \ - "xor "#index", "#index" \n\t"\ - ".p2align 4 \n\t"\ - "1: \n\t"\ - "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ - "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ - "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ - "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ - "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ - "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ - "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ - "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ - "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ - "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\ - "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ - "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ - "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ - "movq 
%%mm4, %%mm5 \n\t" /* (V-128)8*/\ - "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ - "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ - /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ - "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ - "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ - "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ - "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ - "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ - "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ - "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ - "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ - "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ - "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ - /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ - "paddw %%mm3, %%mm4 \n\t"\ - "movq %%mm2, %%mm0 \n\t"\ - "movq %%mm5, %%mm6 \n\t"\ - "movq %%mm4, %%mm3 \n\t"\ - "punpcklwd %%mm2, %%mm2 \n\t"\ - "punpcklwd %%mm5, %%mm5 \n\t"\ - "punpcklwd %%mm4, %%mm4 \n\t"\ - "paddw %%mm1, %%mm2 \n\t"\ - "paddw %%mm1, %%mm5 \n\t"\ - "paddw %%mm1, %%mm4 \n\t"\ - "punpckhwd %%mm0, %%mm0 \n\t"\ - "punpckhwd %%mm6, %%mm6 \n\t"\ - "punpckhwd %%mm3, %%mm3 \n\t"\ - "paddw %%mm7, %%mm0 \n\t"\ - "paddw %%mm7, %%mm6 \n\t"\ - "paddw %%mm7, %%mm3 \n\t"\ - /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ - "packuswb %%mm0, %%mm2 \n\t"\ - "packuswb %%mm6, %%mm5 \n\t"\ - "packuswb %%mm3, %%mm4 \n\t"\ - -#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) - -#define REAL_YSCALEYUV2RGB1_ALPHA(index) \ - "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\ - "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\ - "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\ - "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\ - "packuswb %%mm1, %%mm7 \n\t" -#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index) - -/** - * YV12 to RGB without scaling or interpolating - */ -static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0, - const int16_t *ubuf[2], const int16_t *vbuf[2], - const int16_t *abuf0, uint8_t *dest, - int dstW, int uvalpha, int y) -{ - const int16_t *ubuf0 = ubuf[0]; - const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 - - if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster - const int16_t *ubuf1 = ubuf[0]; - if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1(%%REGBP, %5) - YSCALEYUV2RGB1_ALPHA(%%REGBP) - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), - "a" (&c->redDither) - ); - } else { - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1(%%REGBP, %5) - "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), - "a" (&c->redDither) - ); - } - } else { - const int16_t *ubuf1 = ubuf[1]; - if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1b(%%REGBP, %5) - YSCALEYUV2RGB1_ALPHA(%%REGBP) - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, 
%%mm3, %%mm6) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), - "a" (&c->redDither) - ); - } else { - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1b(%%REGBP, %5) - "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), - "a" (&c->redDither) - ); - } - } -} - -static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0, - const int16_t *ubuf[2], const int16_t *vbuf[2], - const int16_t *abuf0, uint8_t *dest, - int dstW, int uvalpha, int y) -{ - const int16_t *ubuf0 = ubuf[0]; - const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 - - if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster - const int16_t *ubuf1 = ubuf[0]; - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1(%%REGBP, %5) - "pxor %%mm7, %%mm7 \n\t" - WRITEBGR24(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), - "a" (&c->redDither) - ); - } else { - const int16_t *ubuf1 = ubuf[1]; - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1b(%%REGBP, %5) - "pxor %%mm7, %%mm7 \n\t" - WRITEBGR24(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), - "a" (&c->redDither) - ); - } -} - -static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0, - const int16_t *ubuf[2], const int16_t *vbuf[2], - const int16_t *abuf0, uint8_t *dest, - int dstW, int uvalpha, int y) -{ - const int16_t *ubuf0 = ubuf[0]; - const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 - - if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster - const int16_t *ubuf1 = ubuf[0]; - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1(%%REGBP, %5) - "pxor %%mm7, %%mm7 \n\t" - /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ -#ifdef DITHER1XBPP - "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" - "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" - "paddusb "RED_DITHER"(%5), %%mm5 \n\t" -#endif - WRITERGB15(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), - "a" (&c->redDither) - ); - } else { - const int16_t *ubuf1 = ubuf[1]; - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1b(%%REGBP, %5) - "pxor %%mm7, %%mm7 \n\t" - /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ -#ifdef DITHER1XBPP - "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" - "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" - "paddusb "RED_DITHER"(%5), %%mm5 \n\t" -#endif - WRITERGB15(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), - "a" (&c->redDither) - ); - } -} - -static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0, - const int16_t 
*ubuf[2], const int16_t *vbuf[2], - const int16_t *abuf0, uint8_t *dest, - int dstW, int uvalpha, int y) -{ - const int16_t *ubuf0 = ubuf[0]; - const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 - - if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster - const int16_t *ubuf1 = ubuf[0]; - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1(%%REGBP, %5) - "pxor %%mm7, %%mm7 \n\t" - /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ -#ifdef DITHER1XBPP - "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" - "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" - "paddusb "RED_DITHER"(%5), %%mm5 \n\t" -#endif - WRITERGB16(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), - "a" (&c->redDither) - ); - } else { - const int16_t *ubuf1 = ubuf[1]; - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1b(%%REGBP, %5) - "pxor %%mm7, %%mm7 \n\t" - /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ -#ifdef DITHER1XBPP - "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" - "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" - "paddusb "RED_DITHER"(%5), %%mm5 \n\t" -#endif - WRITERGB16(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), - "a" (&c->redDither) - ); - } -} - -#define REAL_YSCALEYUV2PACKED1(index, c) \ - "xor "#index", "#index" \n\t"\ - ".p2align 4 \n\t"\ - "1: \n\t"\ - "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ - "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ - "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ - "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ - "psraw $7, %%mm3 \n\t" \ - "psraw $7, %%mm4 \n\t" \ - "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ - "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ - "psraw $7, %%mm1 \n\t" \ - "psraw $7, %%mm7 \n\t" \ - -#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) - -#define REAL_YSCALEYUV2PACKED1b(index, c) \ - "xor "#index", "#index" \n\t"\ - ".p2align 4 \n\t"\ - "1: \n\t"\ - "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ - "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ - "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ - "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ - "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ - "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ - "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ - "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ - "psrlw $8, %%mm3 \n\t" \ - "psrlw $8, %%mm4 \n\t" \ - "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ - "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ - "psraw $7, %%mm1 \n\t" \ - "psraw $7, %%mm7 \n\t" -#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) - -static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0, - const int16_t *ubuf[2], const int16_t *vbuf[2], - const int16_t *abuf0, uint8_t *dest, - int dstW, int uvalpha, int y) -{ - const int16_t *ubuf0 = ubuf[0]; - const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 - - if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster - const int16_t *ubuf1 = ubuf[0]; - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - 
YSCALEYUV2PACKED1(%%REGBP, %5) - WRITEYUY2(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), - "a" (&c->redDither) - ); - } else { - const int16_t *ubuf1 = ubuf[1]; - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2PACKED1b(%%REGBP, %5) - WRITEYUY2(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), - "a" (&c->redDither) - ); - } -} - -#if COMPILE_TEMPLATE_MMXEXT -static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst, - int dstWidth, const uint8_t *src, - int srcW, int xInc) -{ - int32_t *filterPos = c->hLumFilterPos; - int16_t *filter = c->hLumFilter; - void *mmxextFilterCode = c->lumMmxextFilterCode; - int i; -#if defined(PIC) - uint64_t ebxsave; -#endif -#if ARCH_X86_64 - uint64_t retsave; -#endif - - __asm__ volatile( -#if defined(PIC) - "mov %%"REG_b", %5 \n\t" -#if ARCH_X86_64 - "mov -8(%%rsp), %%"REG_a" \n\t" - "mov %%"REG_a", %6 \n\t" -#endif -#else -#if ARCH_X86_64 - "mov -8(%%rsp), %%"REG_a" \n\t" - "mov %%"REG_a", %5 \n\t" -#endif -#endif - "pxor %%mm7, %%mm7 \n\t" - "mov %0, %%"REG_c" \n\t" - "mov %1, %%"REG_D" \n\t" - "mov %2, %%"REG_d" \n\t" - "mov %3, %%"REG_b" \n\t" - "xor %%"REG_a", %%"REG_a" \n\t" // i - PREFETCH" (%%"REG_c") \n\t" - PREFETCH" 32(%%"REG_c") \n\t" - PREFETCH" 64(%%"REG_c") \n\t" - -#if ARCH_X86_64 -#define CALL_MMXEXT_FILTER_CODE \ - "movl (%%"REG_b"), %%esi \n\t"\ - "call *%4 \n\t"\ - "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ - "add %%"REG_S", %%"REG_c" \n\t"\ - "add %%"REG_a", %%"REG_D" \n\t"\ - "xor %%"REG_a", %%"REG_a" \n\t"\ - -#else -#define CALL_MMXEXT_FILTER_CODE \ - "movl (%%"REG_b"), %%esi \n\t"\ - "call *%4 \n\t"\ - "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ - "add %%"REG_a", %%"REG_D" \n\t"\ - "xor %%"REG_a", %%"REG_a" \n\t"\ - -#endif /* ARCH_X86_64 */ - - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - -#if defined(PIC) - "mov %5, %%"REG_b" \n\t" -#if ARCH_X86_64 - "mov %6, %%"REG_a" \n\t" - "mov %%"REG_a", -8(%%rsp) \n\t" -#endif -#else -#if ARCH_X86_64 - "mov %5, %%"REG_a" \n\t" - "mov %%"REG_a", -8(%%rsp) \n\t" -#endif -#endif - :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos), - "m" (mmxextFilterCode) -#if defined(PIC) - ,"m" (ebxsave) -#endif -#if ARCH_X86_64 - ,"m"(retsave) -#endif - : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D -#if !defined(PIC) - ,"%"REG_b -#endif - ); - - for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) - dst[i] = src[srcW-1]*128; -} - -static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2, - int dstWidth, const uint8_t *src1, - const uint8_t *src2, int srcW, int xInc) -{ - int32_t *filterPos = c->hChrFilterPos; - int16_t *filter = c->hChrFilter; - void *mmxextFilterCode = c->chrMmxextFilterCode; - int i; -#if defined(PIC) - DECLARE_ALIGNED(8, uint64_t, ebxsave); -#endif -#if ARCH_X86_64 - DECLARE_ALIGNED(8, uint64_t, retsave); -#endif - - __asm__ volatile( -#if defined(PIC) - "mov %%"REG_b", %7 \n\t" -#if ARCH_X86_64 - "mov -8(%%rsp), %%"REG_a" \n\t" - "mov %%"REG_a", %8 \n\t" -#endif -#else -#if ARCH_X86_64 - "mov -8(%%rsp), %%"REG_a" \n\t" - "mov %%"REG_a", %7 \n\t" -#endif -#endif - "pxor %%mm7, %%mm7 \n\t" - "mov %0, 
%%"REG_c" \n\t" - "mov %1, %%"REG_D" \n\t" - "mov %2, %%"REG_d" \n\t" - "mov %3, %%"REG_b" \n\t" - "xor %%"REG_a", %%"REG_a" \n\t" // i - PREFETCH" (%%"REG_c") \n\t" - PREFETCH" 32(%%"REG_c") \n\t" - PREFETCH" 64(%%"REG_c") \n\t" - - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - "xor %%"REG_a", %%"REG_a" \n\t" // i - "mov %5, %%"REG_c" \n\t" // src - "mov %6, %%"REG_D" \n\t" // buf2 - PREFETCH" (%%"REG_c") \n\t" - PREFETCH" 32(%%"REG_c") \n\t" - PREFETCH" 64(%%"REG_c") \n\t" - - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - -#if defined(PIC) - "mov %7, %%"REG_b" \n\t" -#if ARCH_X86_64 - "mov %8, %%"REG_a" \n\t" - "mov %%"REG_a", -8(%%rsp) \n\t" -#endif -#else -#if ARCH_X86_64 - "mov %7, %%"REG_a" \n\t" - "mov %%"REG_a", -8(%%rsp) \n\t" -#endif -#endif - :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos), - "m" (mmxextFilterCode), "m" (src2), "m"(dst2) -#if defined(PIC) - ,"m" (ebxsave) -#endif -#if ARCH_X86_64 - ,"m"(retsave) -#endif - : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D -#if !defined(PIC) - ,"%"REG_b -#endif - ); - - for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) { - dst1[i] = src1[srcW-1]*128; - dst2[i] = src2[srcW-1]*128; - } -} -#endif /* COMPILE_TEMPLATE_MMXEXT */ - -static av_cold void RENAME(sws_init_swscale)(SwsContext *c) -{ - enum AVPixelFormat dstFormat = c->dstFormat; - - c->use_mmx_vfilter= 0; - if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12 - && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) { - if (c->flags & SWS_ACCURATE_RND) { - if (!(c->flags & SWS_FULL_CHR_H_INT)) { - switch (c->dstFormat) { - case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break; - case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break; - case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break; - case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break; - case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break; - default: break; - } - } - } else { - c->use_mmx_vfilter= 1; - c->yuv2planeX = RENAME(yuv2yuvX ); - if (!(c->flags & SWS_FULL_CHR_H_INT)) { - switch (c->dstFormat) { - case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break; - case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break; - case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break; - case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break; - case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break; - default: break; - } - } - } - if (!(c->flags & SWS_FULL_CHR_H_INT)) { - switch (c->dstFormat) { - case AV_PIX_FMT_RGB32: - c->yuv2packed1 = RENAME(yuv2rgb32_1); - c->yuv2packed2 = RENAME(yuv2rgb32_2); - break; - case AV_PIX_FMT_BGR24: - c->yuv2packed1 = RENAME(yuv2bgr24_1); - c->yuv2packed2 = RENAME(yuv2bgr24_2); - break; - case AV_PIX_FMT_RGB555: - c->yuv2packed1 = RENAME(yuv2rgb555_1); - c->yuv2packed2 = RENAME(yuv2rgb555_2); - break; - case AV_PIX_FMT_RGB565: - c->yuv2packed1 = RENAME(yuv2rgb565_1); - c->yuv2packed2 = RENAME(yuv2rgb565_2); - break; - case AV_PIX_FMT_YUYV422: - c->yuv2packed1 = RENAME(yuv2yuyv422_1); - c->yuv2packed2 = RENAME(yuv2yuyv422_2); - break; - default: - break; - } - } - } - - if (c->srcBpc == 8 && c->dstBpc <= 14) { - // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one). 
-#if COMPILE_TEMPLATE_MMXEXT - if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) { - c->hyscale_fast = RENAME(hyscale_fast); - c->hcscale_fast = RENAME(hcscale_fast); - } else { -#endif /* COMPILE_TEMPLATE_MMXEXT */ - c->hyscale_fast = NULL; - c->hcscale_fast = NULL; -#if COMPILE_TEMPLATE_MMXEXT - } -#endif /* COMPILE_TEMPLATE_MMXEXT */ - } -} diff --git a/ffmpeg/libswscale/x86/w64xmmtest.c b/ffmpeg/libswscale/x86/w64xmmtest.c deleted file mode 100644 index 88143d9..0000000 --- a/ffmpeg/libswscale/x86/w64xmmtest.c +++ /dev/null @@ -1,31 +0,0 @@ -/* - * check XMM registers for clobbers on Win64 - * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/x86/w64xmmtest.h" -#include "libswscale/swscale.h" - -wrap(sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[], - const int srcStride[], int srcSliceY, int srcSliceH, - uint8_t *const dst[], const int dstStride[])) -{ - testxmmclobbers(sws_scale, c, srcSlice, srcStride, srcSliceY, - srcSliceH, dst, dstStride); -} diff --git a/ffmpeg/libswscale/x86/yuv2rgb.c b/ffmpeg/libswscale/x86/yuv2rgb.c deleted file mode 100644 index e4315ef..0000000 --- a/ffmpeg/libswscale/x86/yuv2rgb.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * software YUV to RGB converter - * - * Copyright (C) 2009 Konstantin Shishkov - * - * MMX/MMXEXT template stuff (needed for fast movntq support), - * 1,4,8bpp support and context / deglobalize stuff - * by Michael Niedermayer (michaelni@gmx.at) - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdio.h> -#include <stdlib.h> -#include <inttypes.h> - -#include "config.h" -#include "libswscale/rgb2rgb.h" -#include "libswscale/swscale.h" -#include "libswscale/swscale_internal.h" -#include "libavutil/attributes.h" -#include "libavutil/x86/asm.h" -#include "libavutil/cpu.h" - -#if HAVE_INLINE_ASM - -#define DITHER1XBPP // only for MMX - -/* hope these constant values are cache line aligned */ -DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw) = 0x00ff00ff00ff00ffULL; -DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL; -DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL; -DECLARE_ASM_CONST(8, uint64_t, pb_e0) = 0xe0e0e0e0e0e0e0e0ULL; -DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL; -DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL; - -//MMX versions -#if HAVE_MMX_INLINE -#undef RENAME -#undef COMPILE_TEMPLATE_MMXEXT -#define COMPILE_TEMPLATE_MMXEXT 0 -#define RENAME(a) a ## _mmx -#include "yuv2rgb_template.c" -#endif /* HAVE_MMX_INLINE */ - -// MMXEXT versions -#if HAVE_MMXEXT_INLINE -#undef RENAME -#undef COMPILE_TEMPLATE_MMXEXT -#define COMPILE_TEMPLATE_MMXEXT 1 -#define RENAME(a) a ## _mmxext -#include "yuv2rgb_template.c" -#endif /* HAVE_MMXEXT_INLINE */ - -#endif /* HAVE_INLINE_ASM */ - -av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c) -{ -#if HAVE_MMX_INLINE - int cpu_flags = av_get_cpu_flags(); - -#if HAVE_MMXEXT_INLINE - if (cpu_flags & AV_CPU_FLAG_MMXEXT) { - switch (c->dstFormat) { - case AV_PIX_FMT_RGB24: - return yuv420_rgb24_mmxext; - case AV_PIX_FMT_BGR24: - return yuv420_bgr24_mmxext; - } - } -#endif - - if (cpu_flags & AV_CPU_FLAG_MMX) { - switch (c->dstFormat) { - case AV_PIX_FMT_RGB32: - if (c->srcFormat == AV_PIX_FMT_YUVA420P) { -#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA - return yuva420_rgb32_mmx; -#endif - break; - } else - return yuv420_rgb32_mmx; - case AV_PIX_FMT_BGR32: - if (c->srcFormat == AV_PIX_FMT_YUVA420P) { -#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA - return yuva420_bgr32_mmx; -#endif - break; - } else - return yuv420_bgr32_mmx; - case AV_PIX_FMT_RGB24: - return yuv420_rgb24_mmx; - case AV_PIX_FMT_BGR24: - return yuv420_bgr24_mmx; - case AV_PIX_FMT_RGB565: - return yuv420_rgb16_mmx; - case AV_PIX_FMT_RGB555: - return yuv420_rgb15_mmx; - } - } -#endif /* HAVE_MMX_INLINE */ - - return NULL; -} diff --git a/ffmpeg/libswscale/x86/yuv2rgb_template.c b/ffmpeg/libswscale/x86/yuv2rgb_template.c deleted file mode 100644 index c879102..0000000 --- a/ffmpeg/libswscale/x86/yuv2rgb_template.c +++ /dev/null @@ -1,451 +0,0 @@ -/* - * software YUV to RGB converter - * - * Copyright (C) 2001-2007 Michael Niedermayer - * (c) 2010 Konstantin Shishkov - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#undef MOVNTQ -#undef EMMS -#undef SFENCE - -#if COMPILE_TEMPLATE_MMXEXT -#define MOVNTQ "movntq" -#define SFENCE "sfence" -#else -#define MOVNTQ "movq" -#define SFENCE " # nop" -#endif - -#define REG_BLUE "0" -#define REG_RED "1" -#define REG_GREEN "2" -#define REG_ALPHA "3" - -#define YUV2RGB_LOOP(depth) \ - h_size = (c->dstW + 7) & ~7; \ - if (h_size * depth > FFABS(dstStride[0])) \ - h_size -= 8; \ - \ - vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \ - \ - __asm__ volatile ("pxor %mm4, %mm4\n\t"); \ - for (y = 0; y < srcSliceH; y++) { \ - uint8_t *image = dst[0] + (y + srcSliceY) * dstStride[0]; \ - const uint8_t *py = src[0] + y * srcStride[0]; \ - const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \ - const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \ - x86_reg index = -h_size / 2; \ - -#define YUV2RGB_INITIAL_LOAD \ - __asm__ volatile ( \ - "movq (%5, %0, 2), %%mm6\n\t" \ - "movd (%2, %0), %%mm0\n\t" \ - "movd (%3, %0), %%mm1\n\t" \ - "1: \n\t" \ - -/* YUV2RGB core - * Conversion is performed in usual way: - * R = Y' * Ycoef + Vred * V' - * G = Y' * Ycoef + Vgreen * V' + Ugreen * U' - * B = Y' * Ycoef + Ublue * U' - * - * where X' = X * 8 - Xoffset (multiplication is performed to increase - * precision a bit). - * Since it operates in YUV420 colorspace, Y component is additionally - * split into Y1 and Y2 for even and odd pixels. - * - * Input: - * mm0 - U (4 elems), mm1 - V (4 elems), mm6 - Y (8 elems), mm4 - zero register - * Output: - * mm1 - R, mm2 - G, mm0 - B - */ -#define YUV2RGB \ - /* convert Y, U, V into Y1', Y2', U', V' */ \ - "movq %%mm6, %%mm7\n\t" \ - "punpcklbw %%mm4, %%mm0\n\t" \ - "punpcklbw %%mm4, %%mm1\n\t" \ - "pand "MANGLE(mmx_00ffw)", %%mm6\n\t" \ - "psrlw $8, %%mm7\n\t" \ - "psllw $3, %%mm0\n\t" \ - "psllw $3, %%mm1\n\t" \ - "psllw $3, %%mm6\n\t" \ - "psllw $3, %%mm7\n\t" \ - "psubsw "U_OFFSET"(%4), %%mm0\n\t" \ - "psubsw "V_OFFSET"(%4), %%mm1\n\t" \ - "psubw "Y_OFFSET"(%4), %%mm6\n\t" \ - "psubw "Y_OFFSET"(%4), %%mm7\n\t" \ -\ - /* multiply by coefficients */ \ - "movq %%mm0, %%mm2\n\t" \ - "movq %%mm1, %%mm3\n\t" \ - "pmulhw "UG_COEFF"(%4), %%mm2\n\t" \ - "pmulhw "VG_COEFF"(%4), %%mm3\n\t" \ - "pmulhw "Y_COEFF" (%4), %%mm6\n\t" \ - "pmulhw "Y_COEFF" (%4), %%mm7\n\t" \ - "pmulhw "UB_COEFF"(%4), %%mm0\n\t" \ - "pmulhw "VR_COEFF"(%4), %%mm1\n\t" \ - "paddsw %%mm3, %%mm2\n\t" \ - /* now: mm0 = UB, mm1 = VR, mm2 = CG */ \ - /* mm6 = Y1, mm7 = Y2 */ \ -\ - /* produce RGB */ \ - "movq %%mm7, %%mm3\n\t" \ - "movq %%mm7, %%mm5\n\t" \ - "paddsw %%mm0, %%mm3\n\t" \ - "paddsw %%mm1, %%mm5\n\t" \ - "paddsw %%mm2, %%mm7\n\t" \ - "paddsw %%mm6, %%mm0\n\t" \ - "paddsw %%mm6, %%mm1\n\t" \ - "paddsw %%mm6, %%mm2\n\t" \ - -#define RGB_PACK_INTERLEAVE \ - /* pack and interleave even/odd pixels */ \ - "packuswb %%mm1, %%mm0\n\t" \ - "packuswb %%mm5, %%mm3\n\t" \ - "packuswb %%mm2, %%mm2\n\t" \ - "movq %%mm0, %%mm1\n\n" \ - "packuswb %%mm7, %%mm7\n\t" \ - "punpcklbw %%mm3, %%mm0\n\t" \ - "punpckhbw %%mm3, %%mm1\n\t" \ - "punpcklbw %%mm7, %%mm2\n\t" \ - -#define YUV2RGB_ENDLOOP(depth) \ - "movq 8 (%5, %0, 2), %%mm6\n\t" \ - "movd 4 (%3, %0), %%mm1\n\t" \ - "movd 4 (%2, %0), %%mm0\n\t" \ - "add $"AV_STRINGIFY(depth * 8)", %1\n\t" \ - "add $4, %0\n\t" \ - "js 1b\n\t" \ - -#define YUV2RGB_OPERANDS \ - : "+r" (index), "+r" (image) \ - : "r" 
(pu - index), "r" (pv - index), "r"(&c->redDither), \ - "r" (py - 2*index) \ - : "memory" \ - ); \ - } \ - -#define YUV2RGB_OPERANDS_ALPHA \ - : "+r" (index), "+r" (image) \ - : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \ - "r" (py - 2*index), "r" (pa - 2*index) \ - : "memory" \ - ); \ - } \ - -#define YUV2RGB_ENDFUNC \ - __asm__ volatile (SFENCE"\n\t" \ - "emms \n\t"); \ - return srcSliceH; \ - -#define IF0(x) -#define IF1(x) x - -#define RGB_PACK16(gmask, is15) \ - "pand "MANGLE(mmx_redmask)", %%mm0\n\t" \ - "pand "MANGLE(mmx_redmask)", %%mm1\n\t" \ - "movq %%mm2, %%mm3\n\t" \ - "psllw $"AV_STRINGIFY(3-is15)", %%mm2\n\t" \ - "psrlw $"AV_STRINGIFY(5+is15)", %%mm3\n\t" \ - "psrlw $3, %%mm0\n\t" \ - IF##is15("psrlw $1, %%mm1\n\t") \ - "pand "MANGLE(pb_e0)", %%mm2\n\t" \ - "pand "MANGLE(gmask)", %%mm3\n\t" \ - "por %%mm2, %%mm0\n\t" \ - "por %%mm3, %%mm1\n\t" \ - "movq %%mm0, %%mm2\n\t" \ - "punpcklbw %%mm1, %%mm0\n\t" \ - "punpckhbw %%mm1, %%mm2\n\t" \ - MOVNTQ " %%mm0, (%1)\n\t" \ - MOVNTQ " %%mm2, 8(%1)\n\t" \ - -#define DITHER_RGB \ - "paddusb "BLUE_DITHER"(%4), %%mm0\n\t" \ - "paddusb "GREEN_DITHER"(%4), %%mm2\n\t" \ - "paddusb "RED_DITHER"(%4), %%mm1\n\t" \ - -#if !COMPILE_TEMPLATE_MMXEXT -static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[], - int srcStride[], - int srcSliceY, int srcSliceH, - uint8_t *dst[], int dstStride[]) -{ - int y, h_size, vshift; - - YUV2RGB_LOOP(2) - -#ifdef DITHER1XBPP - c->blueDither = ff_dither8[y & 1]; - c->greenDither = ff_dither8[y & 1]; - c->redDither = ff_dither8[(y + 1) & 1]; -#endif - - YUV2RGB_INITIAL_LOAD - YUV2RGB - RGB_PACK_INTERLEAVE -#ifdef DITHER1XBPP - DITHER_RGB -#endif - RGB_PACK16(pb_03, 1) - - YUV2RGB_ENDLOOP(2) - YUV2RGB_OPERANDS - YUV2RGB_ENDFUNC -} - -static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[], - int srcStride[], - int srcSliceY, int srcSliceH, - uint8_t *dst[], int dstStride[]) -{ - int y, h_size, vshift; - - YUV2RGB_LOOP(2) - -#ifdef DITHER1XBPP - c->blueDither = ff_dither8[y & 1]; - c->greenDither = ff_dither4[y & 1]; - c->redDither = ff_dither8[(y + 1) & 1]; -#endif - - YUV2RGB_INITIAL_LOAD - YUV2RGB - RGB_PACK_INTERLEAVE -#ifdef DITHER1XBPP - DITHER_RGB -#endif - RGB_PACK16(pb_07, 0) - - YUV2RGB_ENDLOOP(2) - YUV2RGB_OPERANDS - YUV2RGB_ENDFUNC -} -#endif /* !COMPILE_TEMPLATE_MMXEXT */ - -#define RGB_PACK24(blue, red)\ - "packuswb %%mm3, %%mm0 \n" /* R0 R2 R4 R6 R1 R3 R5 R7 */\ - "packuswb %%mm5, %%mm1 \n" /* B0 B2 B4 B6 B1 B3 B5 B7 */\ - "packuswb %%mm7, %%mm2 \n" /* G0 G2 G4 G6 G1 G3 G5 G7 */\ - "movq %%mm"red", %%mm3 \n"\ - "movq %%mm"blue", %%mm6 \n"\ - "psrlq $32, %%mm"red" \n" /* R1 R3 R5 R7 */\ - "punpcklbw %%mm2, %%mm3 \n" /* R0 G0 R2 G2 R4 G4 R6 G6 */\ - "punpcklbw %%mm"red", %%mm6 \n" /* B0 R1 B2 R3 B4 R5 B6 R7 */\ - "movq %%mm3, %%mm5 \n"\ - "punpckhbw %%mm"blue", %%mm2 \n" /* G1 B1 G3 B3 G5 B5 G7 B7 */\ - "punpcklwd %%mm6, %%mm3 \n" /* R0 G0 B0 R1 R2 G2 B2 R3 */\ - "punpckhwd %%mm6, %%mm5 \n" /* R4 G4 B4 R5 R6 G6 B6 R7 */\ - RGB_PACK24_B - -#if COMPILE_TEMPLATE_MMXEXT -DECLARE_ASM_CONST(8, int16_t, mask1101[4]) = {-1,-1, 0,-1}; -DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0}; -DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0}; -DECLARE_ASM_CONST(8, int16_t, mask1001[4]) = {-1, 0, 0,-1}; -DECLARE_ASM_CONST(8, int16_t, mask0100[4]) = { 0,-1, 0, 0}; -#undef RGB_PACK24_B -#define RGB_PACK24_B\ - "pshufw $0xc6, %%mm2, %%mm1 \n"\ - "pshufw $0x84, %%mm3, %%mm6 \n"\ - "pshufw $0x38, %%mm5, %%mm7 \n"\ - "pand "MANGLE(mask1101)", %%mm6 \n" /* 
R0 G0 B0 R1 -- -- R2 G2 */\ - "movq %%mm1, %%mm0 \n"\ - "pand "MANGLE(mask0110)", %%mm7 \n" /* -- -- R6 G6 B6 R7 -- -- */\ - "movq %%mm1, %%mm2 \n"\ - "pand "MANGLE(mask0100)", %%mm1 \n" /* -- -- G3 B3 -- -- -- -- */\ - "psrlq $48, %%mm3 \n" /* B2 R3 -- -- -- -- -- -- */\ - "pand "MANGLE(mask0010)", %%mm0 \n" /* -- -- -- -- G1 B1 -- -- */\ - "psllq $32, %%mm5 \n" /* -- -- -- -- R4 G4 B4 R5 */\ - "pand "MANGLE(mask1001)", %%mm2 \n" /* G5 B5 -- -- -- -- G7 B7 */\ - "por %%mm3, %%mm1 \n"\ - "por %%mm6, %%mm0 \n"\ - "por %%mm5, %%mm1 \n"\ - "por %%mm7, %%mm2 \n"\ - MOVNTQ" %%mm0, (%1) \n"\ - MOVNTQ" %%mm1, 8(%1) \n"\ - MOVNTQ" %%mm2, 16(%1) \n"\ - -#else -#undef RGB_PACK24_B -#define RGB_PACK24_B\ - "movd %%mm3, (%1) \n" /* R0 G0 B0 R1 */\ - "movd %%mm2, 4(%1) \n" /* G1 B1 */\ - "psrlq $32, %%mm3 \n"\ - "psrlq $16, %%mm2 \n"\ - "movd %%mm3, 6(%1) \n" /* R2 G2 B2 R3 */\ - "movd %%mm2, 10(%1) \n" /* G3 B3 */\ - "psrlq $16, %%mm2 \n"\ - "movd %%mm5, 12(%1) \n" /* R4 G4 B4 R5 */\ - "movd %%mm2, 16(%1) \n" /* G5 B5 */\ - "psrlq $32, %%mm5 \n"\ - "movd %%mm2, 20(%1) \n" /* -- -- G7 B7 */\ - "movd %%mm5, 18(%1) \n" /* R6 G6 B6 R7 */\ - -#endif - -static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[], - int srcStride[], - int srcSliceY, int srcSliceH, - uint8_t *dst[], int dstStride[]) -{ - int y, h_size, vshift; - - YUV2RGB_LOOP(3) - - YUV2RGB_INITIAL_LOAD - YUV2RGB - RGB_PACK24(REG_BLUE, REG_RED) - - YUV2RGB_ENDLOOP(3) - YUV2RGB_OPERANDS - YUV2RGB_ENDFUNC -} - -static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[], - int srcStride[], - int srcSliceY, int srcSliceH, - uint8_t *dst[], int dstStride[]) -{ - int y, h_size, vshift; - - YUV2RGB_LOOP(3) - - YUV2RGB_INITIAL_LOAD - YUV2RGB - RGB_PACK24(REG_RED, REG_BLUE) - - YUV2RGB_ENDLOOP(3) - YUV2RGB_OPERANDS - YUV2RGB_ENDFUNC -} - - -#define SET_EMPTY_ALPHA \ - "pcmpeqd %%mm"REG_ALPHA", %%mm"REG_ALPHA"\n\t" /* set alpha to 0xFF */ \ - -#define LOAD_ALPHA \ - "movq (%6, %0, 2), %%mm"REG_ALPHA"\n\t" \ - -#define RGB_PACK32(red, green, blue, alpha) \ - "movq %%mm"blue", %%mm5\n\t" \ - "movq %%mm"red", %%mm6\n\t" \ - "punpckhbw %%mm"green", %%mm5\n\t" \ - "punpcklbw %%mm"green", %%mm"blue"\n\t" \ - "punpckhbw %%mm"alpha", %%mm6\n\t" \ - "punpcklbw %%mm"alpha", %%mm"red"\n\t" \ - "movq %%mm"blue", %%mm"green"\n\t" \ - "movq %%mm5, %%mm"alpha"\n\t" \ - "punpcklwd %%mm"red", %%mm"blue"\n\t" \ - "punpckhwd %%mm"red", %%mm"green"\n\t" \ - "punpcklwd %%mm6, %%mm5\n\t" \ - "punpckhwd %%mm6, %%mm"alpha"\n\t" \ - MOVNTQ " %%mm"blue", 0(%1)\n\t" \ - MOVNTQ " %%mm"green", 8(%1)\n\t" \ - MOVNTQ " %%mm5, 16(%1)\n\t" \ - MOVNTQ " %%mm"alpha", 24(%1)\n\t" \ - -#if !COMPILE_TEMPLATE_MMXEXT -static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[], - int srcStride[], - int srcSliceY, int srcSliceH, - uint8_t *dst[], int dstStride[]) -{ - int y, h_size, vshift; - - YUV2RGB_LOOP(4) - - YUV2RGB_INITIAL_LOAD - YUV2RGB - RGB_PACK_INTERLEAVE - SET_EMPTY_ALPHA - RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA) - - YUV2RGB_ENDLOOP(4) - YUV2RGB_OPERANDS - YUV2RGB_ENDFUNC -} - -#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA -static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[], - int srcStride[], - int srcSliceY, int srcSliceH, - uint8_t *dst[], int dstStride[]) -{ - int y, h_size, vshift; - - YUV2RGB_LOOP(4) - - const uint8_t *pa = src[3] + y * srcStride[3]; - YUV2RGB_INITIAL_LOAD - YUV2RGB - RGB_PACK_INTERLEAVE - LOAD_ALPHA - RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA) - - YUV2RGB_ENDLOOP(4) - 
YUV2RGB_OPERANDS_ALPHA - YUV2RGB_ENDFUNC -} -#endif - -static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[], - int srcStride[], - int srcSliceY, int srcSliceH, - uint8_t *dst[], int dstStride[]) -{ - int y, h_size, vshift; - - YUV2RGB_LOOP(4) - - YUV2RGB_INITIAL_LOAD - YUV2RGB - RGB_PACK_INTERLEAVE - SET_EMPTY_ALPHA - RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA) - - YUV2RGB_ENDLOOP(4) - YUV2RGB_OPERANDS - YUV2RGB_ENDFUNC -} - -#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA -static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[], - int srcStride[], - int srcSliceY, int srcSliceH, - uint8_t *dst[], int dstStride[]) -{ - int y, h_size, vshift; - - YUV2RGB_LOOP(4) - - const uint8_t *pa = src[3] + y * srcStride[3]; - YUV2RGB_INITIAL_LOAD - YUV2RGB - RGB_PACK_INTERLEAVE - LOAD_ALPHA - RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA) - - YUV2RGB_ENDLOOP(4) - YUV2RGB_OPERANDS_ALPHA - YUV2RGB_ENDFUNC -} -#endif - -#endif /* !COMPILE_TEMPLATE_MMXEXT */ |
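The removed swscale_template.c output routines come in two flavours: the yuv2*_2() functions blend two source lines with a fixed-point weight before conversion, while the yuv2*_1() functions read a single line. The inline-asm comments spell out the blend arithmetic; as a reading aid, a minimal scalar sketch of that blend follows. The function and parameter names are illustrative placeholders, not identifiers from the deleted file.

/* Scalar equivalent of the two-line vertical blend in the removed
 * yuv2*_2() functions: pmulhw keeps the high 16 bits of the product and
 * the kept line is pre-shifted right by 4 for headroom, exactly as the
 * asm comments describe.  Names are illustrative, not from the file. */
static inline int vblend15(int line0, int line1, int alpha)
{
    /* line1 >> 4  plus  ((line0 - line1) * alpha) >> 16 */
    return (line1 >> 4) + (((line0 - line1) * alpha) >> 16);
}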
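The YUV2RGB core comment in the removed yuv2rgb_template.c gives the conversion equations (R = Y'*Ycoef + Vred*V', and so on, with X' = X*8 - Xoffset). A minimal scalar sketch of that arithmetic is given below; the struct, field and function names are placeholders standing in for the coefficient tables that lived in SwsContext and are not reproduced here.

/* Scalar sketch of the conversion the removed MMX template performed:
 * shift the sample left by 3, subtract the offset, multiply by the
 * coefficient and keep the high 16 bits (the pmulhw pattern).  All names
 * and the coefficient struct are illustrative placeholders. */
typedef struct {
    int y_coeff, y_offset;     /* luma scale and offset            */
    int u_blue, u_green;       /* U' contributions to B and G      */
    int v_red,  v_green;       /* V' contributions to R and G      */
    int u_offset, v_offset;    /* offsets removed from U' and V'   */
} YuvCoeffs;

static inline unsigned char clip8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (unsigned char)v;
}

/* Convert two horizontally adjacent luma samples sharing one chroma pair. */
static void yuv2rgb_pair(const YuvCoeffs *k, int y0, int y1, int u, int v,
                         unsigned char rgb[2][3])
{
    int up = (u << 3) - k->u_offset;                 /* U'          */
    int vp = (v << 3) - k->v_offset;                 /* V'          */
    int ub = (up * k->u_blue)  >> 16;                /* Ublue  * U' */
    int ug = (up * k->u_green) >> 16;                /* Ugreen * U' */
    int vr = (vp * k->v_red)   >> 16;                /* Vred   * V' */
    int vg = (vp * k->v_green) >> 16;                /* Vgreen * V' */
    int ys[2] = { y0, y1 };

    for (int i = 0; i < 2; i++) {
        int yp = (((ys[i] << 3) - k->y_offset) * k->y_coeff) >> 16; /* Y'*Ycoef */
        rgb[i][0] = clip8(yp + vr);                  /* R           */
        rgb[i][1] = clip8(yp + ug + vg);             /* G           */
        rgb[i][2] = clip8(yp + ub);                  /* B           */
    }
}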
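For the 15/16-bit outputs, the removed RGB_PACK16 macro (and the WRITERGB15/WRITERGB16 paths in swscale_template.c) masked the 8-bit channels down to 5-6-5 or 5-5-5 bits, optionally adding the per-row ff_dither8/ff_dither4 patterns first (the DITHER1XBPP blocks) to reduce banding. The per-pixel equivalent of the packing is a plain shift-and-or; the helper names below are illustrative, not from the deleted files.

/* Scalar equivalent of the 5-6-5 / 5-5-5 packing the removed MMX code did
 * with the 0xf8/0xfc channel masks.  Helper names are illustrative. */
static inline unsigned short pack_rgb565(unsigned char r, unsigned char g,
                                         unsigned char b)
{
    return (unsigned short)(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
}

static inline unsigned short pack_rgb555(unsigned char r, unsigned char g,
                                         unsigned char b)
{
    return (unsigned short)(((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3));
}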
