Diffstat (limited to 'ffmpeg1/libswscale/x86')
-rw-r--r--  ffmpeg1/libswscale/x86/Makefile            |   11
-rw-r--r--  ffmpeg1/libswscale/x86/input.asm           |  670
-rw-r--r--  ffmpeg1/libswscale/x86/output.asm          |  413
-rw-r--r--  ffmpeg1/libswscale/x86/rgb2rgb.c           |  149
-rw-r--r--  ffmpeg1/libswscale/x86/rgb2rgb_template.c  | 2498
-rw-r--r--  ffmpeg1/libswscale/x86/scale.asm           |  431
-rw-r--r--  ffmpeg1/libswscale/x86/swscale.c           |  585
-rw-r--r--  ffmpeg1/libswscale/x86/swscale_template.c  | 1717
-rw-r--r--  ffmpeg1/libswscale/x86/w64xmmtest.c        |   31
-rw-r--r--  ffmpeg1/libswscale/x86/yuv2rgb.c           |  113
-rw-r--r--  ffmpeg1/libswscale/x86/yuv2rgb_template.c  |  451
11 files changed, 7069 insertions, 0 deletions
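For orientation before the diff body: the input.asm routines added below convert packed RGB into planar Y/U/V using 15-bit fixed-point BT.601 limited-range coefficients, a combined offset-plus-rounding constant, and a final arithmetic shift by 9. The following is a rough scalar sketch of the per-pixel luma math the SIMD code implements; the coefficient values and shift come from the patch, while the function name, the int16_t view of the destination, and the plain loop are illustrative only (in the patch the destination is declared uint8_t * but holds 16-bit samples).

/* Illustrative scalar equivalent of the rgb24ToY path in input.asm below.
 * Constants are those defined in the patch; everything else is a sketch. */
#include <stdint.h>

#define RY 0x20DE          /* ~0.257 * 32768 */
#define GY 0x4087          /* ~0.504 * 32768 */
#define BY 0x0C88          /* ~0.098 * 32768 */
#define RGB_YRND 0x80100   /* (16 << 15) + (1 << 8): luma offset plus rounding for the >> 9 */

static void rgb24_to_y_ref(int16_t *dst, const uint8_t *src, int width)
{
    for (int i = 0; i < width; i++) {
        int r = src[3 * i + 0];
        int g = src[3 * i + 1];
        int b = src[3 * i + 2];
        /* Result keeps 6 fractional bits, the intermediate format the
         * vertical scaler in output.asm consumes. */
        dst[i] = (RY * r + GY * g + BY * b + RGB_YRND) >> 9;
    }
}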
diff --git a/ffmpeg1/libswscale/x86/Makefile b/ffmpeg1/libswscale/x86/Makefile new file mode 100644 index 0000000..7d219b4 --- /dev/null +++ b/ffmpeg1/libswscale/x86/Makefile @@ -0,0 +1,11 @@ +$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS) + +OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o + +MMX-OBJS += x86/rgb2rgb.o \ + x86/swscale.o \ + x86/yuv2rgb.o \ + +YASM-OBJS += x86/input.o \ + x86/output.o \ + x86/scale.o \ diff --git a/ffmpeg1/libswscale/x86/input.asm b/ffmpeg1/libswscale/x86/input.asm new file mode 100644 index 0000000..9d5a871 --- /dev/null +++ b/ffmpeg1/libswscale/x86/input.asm @@ -0,0 +1,670 @@ +;****************************************************************************** +;* x86-optimized input routines; does shuffling of packed +;* YUV formats into individual planes, and converts RGB +;* into YUV planes also. +;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +%define RY 0x20DE +%define GY 0x4087 +%define BY 0x0C88 +%define RU 0xECFF +%define GU 0xDAC8 +%define BU 0x3838 +%define RV 0x3838 +%define GV 0xD0E3 +%define BV 0xF6E4 + +rgb_Yrnd: times 4 dd 0x80100 ; 16.5 << 15 +rgb_UVrnd: times 4 dd 0x400100 ; 128.5 << 15 +bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY +bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY +rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY +rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY +bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU +bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU +rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU +rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU +bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV +bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV +rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV +rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV + +rgba_Ycoeff_rb: times 4 dw RY, BY +rgba_Ycoeff_br: times 4 dw BY, RY +rgba_Ycoeff_ga: times 4 dw GY, 0 +rgba_Ycoeff_ag: times 4 dw 0, GY +rgba_Ucoeff_rb: times 4 dw RU, BU +rgba_Ucoeff_br: times 4 dw BU, RU +rgba_Ucoeff_ga: times 4 dw GU, 0 +rgba_Ucoeff_ag: times 4 dw 0, GU +rgba_Vcoeff_rb: times 4 dw RV, BV +rgba_Vcoeff_br: times 4 dw BV, RV +rgba_Vcoeff_ga: times 4 dw GV, 0 +rgba_Vcoeff_ag: times 4 dw 0, GV + +shuf_rgb_12x4: db 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, \ + 6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80 +shuf_rgb_3x56: db 2, 0x80, 3, 0x80, 4, 0x80, 5, 0x80, \ + 8, 0x80, 9, 0x80, 10, 0x80, 11, 0x80 + +SECTION .text + +;----------------------------------------------------------------------------- +; RGB to Y/UV. 
+; +; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w); +; and +; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, +; const uint8_t *unused, int w); +;----------------------------------------------------------------------------- + +; %1 = nr. of XMM registers +; %2 = rgb or bgr +%macro RGB24_TO_Y_FN 2-3 +cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, u3 +%if mmsize == 8 + mova m5, [%2_Ycoeff_12x4] + mova m6, [%2_Ycoeff_3x56] +%define coeff1 m5 +%define coeff2 m6 +%elif ARCH_X86_64 + mova m8, [%2_Ycoeff_12x4] + mova m9, [%2_Ycoeff_3x56] +%define coeff1 m8 +%define coeff2 m9 +%else ; x86-32 && mmsize == 16 +%define coeff1 [%2_Ycoeff_12x4] +%define coeff2 [%2_Ycoeff_3x56] +%endif ; x86-32/64 && mmsize == 8/16 +%if (ARCH_X86_64 || mmsize == 8) && %0 == 3 + jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToY %+ SUFFIX).body +%else ; (ARCH_X86_64 && %0 == 3) || mmsize == 8 +.body: +%if cpuflag(ssse3) + mova m7, [shuf_rgb_12x4] +%define shuf_rgb1 m7 +%if ARCH_X86_64 + mova m10, [shuf_rgb_3x56] +%define shuf_rgb2 m10 +%else ; x86-32 +%define shuf_rgb2 [shuf_rgb_3x56] +%endif ; x86-32/64 +%endif ; cpuflag(ssse3) +%if ARCH_X86_64 + movsxd wq, wd +%endif + add wq, wq + add dstq, wq + neg wq +%if notcpuflag(ssse3) + pxor m7, m7 +%endif ; !cpuflag(ssse3) + mova m4, [rgb_Yrnd] +.loop: +%if cpuflag(ssse3) + movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3] + movu m2, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7] + pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 } + pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 } + pshufb m3, m2, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 } + pshufb m2, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 } +%else ; !cpuflag(ssse3) + movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 } + movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 } + movd m2, [srcq+6] ; (byte) { B2, G2, R2, B3 } + movd m3, [srcq+8] ; (byte) { R2, B3, G3, R3 } +%if mmsize == 16 ; i.e. sse2 + punpckldq m0, m2 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 } + punpckldq m1, m3 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 } + movd m2, [srcq+12] ; (byte) { B4, G4, R4, B5 } + movd m3, [srcq+14] ; (byte) { R4, B5, G5, R5 } + movd m5, [srcq+18] ; (byte) { B6, G6, R6, B7 } + movd m6, [srcq+20] ; (byte) { R6, B7, G7, R7 } + punpckldq m2, m5 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 } + punpckldq m3, m6 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 } +%endif ; mmsize == 16 + punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 } + punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 } + punpcklbw m2, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 } + punpcklbw m3, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 } +%endif ; cpuflag(ssse3) + add srcq, 3 * mmsize / 2 + pmaddwd m0, coeff1 ; (dword) { B0*BY + G0*GY, B1*BY, B2*BY + G2*GY, B3*BY } + pmaddwd m1, coeff2 ; (dword) { R0*RY, G1+GY + R1*RY, R2*RY, G3+GY + R3*RY } + pmaddwd m2, coeff1 ; (dword) { B4*BY + G4*GY, B5*BY, B6*BY + G6*GY, B7*BY } + pmaddwd m3, coeff2 ; (dword) { R4*RY, G5+GY + R5*RY, R6*RY, G7+GY + R7*RY } + paddd m0, m1 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[0-3] + paddd m2, m3 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7] + paddd m0, m4 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] } + paddd m2, m4 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] } + psrad m0, 9 + psrad m2, 9 + packssdw m0, m2 ; (word) { Y[0-7] } + mova [dstq+wq], m0 + add wq, mmsize + jl .loop + REP_RET +%endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8 +%endmacro + +; %1 = nr. 
of XMM registers +; %2 = rgb or bgr +%macro RGB24_TO_UV_FN 2-3 +cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3 +%if ARCH_X86_64 + mova m8, [%2_Ucoeff_12x4] + mova m9, [%2_Ucoeff_3x56] + mova m10, [%2_Vcoeff_12x4] + mova m11, [%2_Vcoeff_3x56] +%define coeffU1 m8 +%define coeffU2 m9 +%define coeffV1 m10 +%define coeffV2 m11 +%else ; x86-32 +%define coeffU1 [%2_Ucoeff_12x4] +%define coeffU2 [%2_Ucoeff_3x56] +%define coeffV1 [%2_Vcoeff_12x4] +%define coeffV2 [%2_Vcoeff_3x56] +%endif ; x86-32/64 +%if ARCH_X86_64 && %0 == 3 + jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToUV %+ SUFFIX).body +%else ; ARCH_X86_64 && %0 == 3 +.body: +%if cpuflag(ssse3) + mova m7, [shuf_rgb_12x4] +%define shuf_rgb1 m7 +%if ARCH_X86_64 + mova m12, [shuf_rgb_3x56] +%define shuf_rgb2 m12 +%else ; x86-32 +%define shuf_rgb2 [shuf_rgb_3x56] +%endif ; x86-32/64 +%endif ; cpuflag(ssse3) +%if ARCH_X86_64 + movsxd wq, dword r5m +%else ; x86-32 + mov wq, r5m +%endif + add wq, wq + add dstUq, wq + add dstVq, wq + neg wq + mova m6, [rgb_UVrnd] +%if notcpuflag(ssse3) + pxor m7, m7 +%endif +.loop: +%if cpuflag(ssse3) + movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3] + movu m4, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7] + pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 } + pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 } +%else ; !cpuflag(ssse3) + movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 } + movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 } + movd m4, [srcq+6] ; (byte) { B2, G2, R2, B3 } + movd m5, [srcq+8] ; (byte) { R2, B3, G3, R3 } +%if mmsize == 16 + punpckldq m0, m4 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 } + punpckldq m1, m5 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 } + movd m4, [srcq+12] ; (byte) { B4, G4, R4, B5 } + movd m5, [srcq+14] ; (byte) { R4, B5, G5, R5 } +%endif ; mmsize == 16 + punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 } + punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 } +%endif ; cpuflag(ssse3) + pmaddwd m2, m0, coeffV1 ; (dword) { B0*BV + G0*GV, B1*BV, B2*BV + G2*GV, B3*BV } + pmaddwd m3, m1, coeffV2 ; (dword) { R0*BV, G1*GV + R1*BV, R2*BV, G3*GV + R3*BV } + pmaddwd m0, coeffU1 ; (dword) { B0*BU + G0*GU, B1*BU, B2*BU + G2*GU, B3*BU } + pmaddwd m1, coeffU2 ; (dword) { R0*BU, G1*GU + R1*BU, R2*BU, G3*GU + R3*BU } + paddd m0, m1 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[0-3] + paddd m2, m3 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[0-3] +%if cpuflag(ssse3) + pshufb m5, m4, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 } + pshufb m4, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 } +%else ; !cpuflag(ssse3) +%if mmsize == 16 + movd m1, [srcq+18] ; (byte) { B6, G6, R6, B7 } + movd m3, [srcq+20] ; (byte) { R6, B7, G7, R7 } + punpckldq m4, m1 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 } + punpckldq m5, m3 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 } +%endif ; mmsize == 16 && !cpuflag(ssse3) + punpcklbw m4, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 } + punpcklbw m5, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 } +%endif ; cpuflag(ssse3) + add srcq, 3 * mmsize / 2 + pmaddwd m1, m4, coeffU1 ; (dword) { B4*BU + G4*GU, B5*BU, B6*BU + G6*GU, B7*BU } + pmaddwd m3, m5, coeffU2 ; (dword) { R4*BU, G5*GU + R5*BU, R6*BU, G7*GU + R7*BU } + pmaddwd m4, coeffV1 ; (dword) { B4*BV + G4*GV, B5*BV, B6*BV + G6*GV, B7*BV } + pmaddwd m5, coeffV2 ; (dword) { R4*BV, G5*GV + R5*BV, R6*BV, G7*GV + R7*BV } + paddd m1, m3 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[4-7] + paddd m4, m5 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[4-7] + paddd m0, m6 ; += rgb_UVrnd, i.e. 
(dword) { U[0-3] } + paddd m2, m6 ; += rgb_UVrnd, i.e. (dword) { V[0-3] } + paddd m1, m6 ; += rgb_UVrnd, i.e. (dword) { U[4-7] } + paddd m4, m6 ; += rgb_UVrnd, i.e. (dword) { V[4-7] } + psrad m0, 9 + psrad m2, 9 + psrad m1, 9 + psrad m4, 9 + packssdw m0, m1 ; (word) { U[0-7] } + packssdw m2, m4 ; (word) { V[0-7] } +%if mmsize == 8 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 +%else ; mmsize == 16 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 +%endif ; mmsize == 8/16 + add wq, mmsize + jl .loop + REP_RET +%endif ; ARCH_X86_64 && %0 == 3 +%endmacro + +; %1 = nr. of XMM registers for rgb-to-Y func +; %2 = nr. of XMM registers for rgb-to-UV func +%macro RGB24_FUNCS 2 +RGB24_TO_Y_FN %1, rgb +RGB24_TO_Y_FN %1, bgr, rgb +RGB24_TO_UV_FN %2, rgb +RGB24_TO_UV_FN %2, bgr, rgb +%endmacro + +%if ARCH_X86_32 +INIT_MMX mmx +RGB24_FUNCS 0, 0 +%endif + +INIT_XMM sse2 +RGB24_FUNCS 10, 12 + +INIT_XMM ssse3 +RGB24_FUNCS 11, 13 + +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +RGB24_FUNCS 11, 13 +%endif + +; %1 = nr. of XMM registers +; %2-5 = rgba, bgra, argb or abgr (in individual characters) +%macro RGB32_TO_Y_FN 5-6 +cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, u3 + mova m5, [rgba_Ycoeff_%2%4] + mova m6, [rgba_Ycoeff_%3%5] +%if %0 == 6 + jmp mangle(private_prefix %+ _ %+ %6 %+ ToY %+ SUFFIX).body +%else ; %0 == 6 +.body: +%if ARCH_X86_64 + movsxd wq, wd +%endif + lea srcq, [srcq+wq*4] + add wq, wq + add dstq, wq + neg wq + mova m4, [rgb_Yrnd] + pcmpeqb m7, m7 + psrlw m7, 8 ; (word) { 0x00ff } x4 +.loop: + ; FIXME check alignment and use mova + movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] + movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] + DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7] + pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3] + pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3] + pmaddwd m3, m5 ; (dword) { Bx*BY + Rx*RY }[4-7] + pmaddwd m2, m6 ; (dword) { Gx*GY }[4-7] + paddd m0, m4 ; += rgb_Yrnd + paddd m2, m4 ; += rgb_Yrnd + paddd m0, m1 ; (dword) { Y[0-3] } + paddd m2, m3 ; (dword) { Y[4-7] } + psrad m0, 9 + psrad m2, 9 + packssdw m0, m2 ; (word) { Y[0-7] } + mova [dstq+wq], m0 + add wq, mmsize + jl .loop + REP_RET +%endif ; %0 == 3 +%endmacro + +; %1 = nr. 
of XMM registers +; %2-5 = rgba, bgra, argb or abgr (in individual characters) +%macro RGB32_TO_UV_FN 5-6 +cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3 +%if ARCH_X86_64 + mova m8, [rgba_Ucoeff_%2%4] + mova m9, [rgba_Ucoeff_%3%5] + mova m10, [rgba_Vcoeff_%2%4] + mova m11, [rgba_Vcoeff_%3%5] +%define coeffU1 m8 +%define coeffU2 m9 +%define coeffV1 m10 +%define coeffV2 m11 +%else ; x86-32 +%define coeffU1 [rgba_Ucoeff_%2%4] +%define coeffU2 [rgba_Ucoeff_%3%5] +%define coeffV1 [rgba_Vcoeff_%2%4] +%define coeffV2 [rgba_Vcoeff_%3%5] +%endif ; x86-64/32 +%if ARCH_X86_64 && %0 == 6 + jmp mangle(private_prefix %+ _ %+ %6 %+ ToUV %+ SUFFIX).body +%else ; ARCH_X86_64 && %0 == 6 +.body: +%if ARCH_X86_64 + movsxd wq, dword r5m +%else ; x86-32 + mov wq, r5m +%endif + add wq, wq + add dstUq, wq + add dstVq, wq + lea srcq, [srcq+wq*2] + neg wq + pcmpeqb m7, m7 + psrlw m7, 8 ; (word) { 0x00ff } x4 + mova m6, [rgb_UVrnd] +.loop: + ; FIXME check alignment and use mova + movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] + movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] + DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7] + pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3] + pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3] + pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3] + pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3] + paddd m3, m6 ; += rgb_UVrnd + paddd m1, m6 ; += rgb_UVrnd + paddd m2, m3 ; (dword) { V[0-3] } + paddd m0, m1 ; (dword) { U[0-3] } + pmaddwd m3, m5, coeffV1 ; (dword) { Bx*BV + Rx*RV }[4-7] + pmaddwd m1, m4, coeffV2 ; (dword) { Gx*GV }[4-7] + pmaddwd m5, coeffU1 ; (dword) { Bx*BU + Rx*RU }[4-7] + pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7] + paddd m3, m6 ; += rgb_UVrnd + paddd m5, m6 ; += rgb_UVrnd + psrad m0, 9 + paddd m1, m3 ; (dword) { V[4-7] } + paddd m4, m5 ; (dword) { U[4-7] } + psrad m2, 9 + psrad m4, 9 + psrad m1, 9 + packssdw m0, m4 ; (word) { U[0-7] } + packssdw m2, m1 ; (word) { V[0-7] } +%if mmsize == 8 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 +%else ; mmsize == 16 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 +%endif ; mmsize == 8/16 + add wq, mmsize + jl .loop + REP_RET +%endif ; ARCH_X86_64 && %0 == 3 +%endmacro + +; %1 = nr. of XMM registers for rgb-to-Y func +; %2 = nr. of XMM registers for rgb-to-UV func +%macro RGB32_FUNCS 2 +RGB32_TO_Y_FN %1, r, g, b, a +RGB32_TO_Y_FN %1, b, g, r, a, rgba +RGB32_TO_Y_FN %1, a, r, g, b, rgba +RGB32_TO_Y_FN %1, a, b, g, r, rgba + +RGB32_TO_UV_FN %2, r, g, b, a +RGB32_TO_UV_FN %2, b, g, r, a, rgba +RGB32_TO_UV_FN %2, a, r, g, b, rgba +RGB32_TO_UV_FN %2, a, b, g, r, rgba +%endmacro + +%if ARCH_X86_32 +INIT_MMX mmx +RGB32_FUNCS 0, 0 +%endif + +INIT_XMM sse2 +RGB32_FUNCS 8, 12 + +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +RGB32_FUNCS 8, 12 +%endif + +;----------------------------------------------------------------------------- +; YUYV/UYVY/NV12/NV21 packed pixel shuffling. +; +; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w); +; and +; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, +; const uint8_t *unused, int w); +;----------------------------------------------------------------------------- + +; %1 = a (aligned) or u (unaligned) +; %2 = yuyv or uyvy +%macro LOOP_YUYV_TO_Y 2 +.loop_%1: + mov%1 m0, [srcq+wq*2] ; (byte) { Y0, U0, Y1, V0, ... } + mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { Y8, U4, Y9, V4, ... 
} +%ifidn %2, yuyv + pand m0, m2 ; (word) { Y0, Y1, ..., Y7 } + pand m1, m2 ; (word) { Y8, Y9, ..., Y15 } +%else ; uyvy + psrlw m0, 8 ; (word) { Y0, Y1, ..., Y7 } + psrlw m1, 8 ; (word) { Y8, Y9, ..., Y15 } +%endif ; yuyv/uyvy + packuswb m0, m1 ; (byte) { Y0, ..., Y15 } + mova [dstq+wq], m0 + add wq, mmsize + jl .loop_%1 + REP_RET +%endmacro + +; %1 = nr. of XMM registers +; %2 = yuyv or uyvy +; %3 = if specified, it means that unaligned and aligned code in loop +; will be the same (i.e. YUYV+AVX), and thus we don't need to +; split the loop in an aligned and unaligned case +%macro YUYV_TO_Y_FN 2-3 +cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w +%if ARCH_X86_64 + movsxd wq, wd +%endif + add dstq, wq +%if mmsize == 16 + test srcq, 15 +%endif + lea srcq, [srcq+wq*2] +%ifidn %2, yuyv + pcmpeqb m2, m2 ; (byte) { 0xff } x 16 + psrlw m2, 8 ; (word) { 0x00ff } x 8 +%endif ; yuyv +%if mmsize == 16 + jnz .loop_u_start + neg wq + LOOP_YUYV_TO_Y a, %2 +.loop_u_start: + neg wq + LOOP_YUYV_TO_Y u, %2 +%else ; mmsize == 8 + neg wq + LOOP_YUYV_TO_Y a, %2 +%endif ; mmsize == 8/16 +%endmacro + +; %1 = a (aligned) or u (unaligned) +; %2 = yuyv or uyvy +%macro LOOP_YUYV_TO_UV 2 +.loop_%1: +%ifidn %2, yuyv + mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... } + mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... } + psrlw m0, 8 ; (word) { U0, V0, ..., U3, V3 } + psrlw m1, 8 ; (word) { U4, V4, ..., U7, V7 } +%else ; uyvy +%if cpuflag(avx) + vpand m0, m2, [srcq+wq*4] ; (word) { U0, V0, ..., U3, V3 } + vpand m1, m2, [srcq+wq*4+mmsize] ; (word) { U4, V4, ..., U7, V7 } +%else + mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... } + mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... } + pand m0, m2 ; (word) { U0, V0, ..., U3, V3 } + pand m1, m2 ; (word) { U4, V4, ..., U7, V7 } +%endif +%endif ; yuyv/uyvy + packuswb m0, m1 ; (byte) { U0, V0, ..., U7, V7 } + pand m1, m0, m2 ; (word) { U0, U1, ..., U7 } + psrlw m0, 8 ; (word) { V0, V1, ..., V7 } +%if mmsize == 16 + packuswb m1, m0 ; (byte) { U0, ... U7, V1, ... V7 } + movh [dstUq+wq], m1 + movhps [dstVq+wq], m1 +%else ; mmsize == 8 + packuswb m1, m1 ; (byte) { U0, ... U3 } + packuswb m0, m0 ; (byte) { V0, ... V3 } + movh [dstUq+wq], m1 + movh [dstVq+wq], m0 +%endif ; mmsize == 8/16 + add wq, mmsize / 2 + jl .loop_%1 + REP_RET +%endmacro + +; %1 = nr. of XMM registers +; %2 = yuyv or uyvy +; %3 = if specified, it means that unaligned and aligned code in loop +; will be the same (i.e. UYVY+AVX), and thus we don't need to +; split the loop in an aligned and unaligned case +%macro YUYV_TO_UV_FN 2-3 +cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w +%if ARCH_X86_64 + movsxd wq, dword r5m +%else ; x86-32 + mov wq, r5m +%endif + add dstUq, wq + add dstVq, wq +%if mmsize == 16 && %0 == 2 + test srcq, 15 +%endif + lea srcq, [srcq+wq*4] + pcmpeqb m2, m2 ; (byte) { 0xff } x 16 + psrlw m2, 8 ; (word) { 0x00ff } x 8 + ; NOTE: if uyvy+avx, u/a are identical +%if mmsize == 16 && %0 == 2 + jnz .loop_u_start + neg wq + LOOP_YUYV_TO_UV a, %2 +.loop_u_start: + neg wq + LOOP_YUYV_TO_UV u, %2 +%else ; mmsize == 8 + neg wq + LOOP_YUYV_TO_UV a, %2 +%endif ; mmsize == 8/16 +%endmacro + +; %1 = a (aligned) or u (unaligned) +; %2 = nv12 or nv21 +%macro LOOP_NVXX_TO_UV 2 +.loop_%1: + mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... } + mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... 
} + pand m2, m0, m5 ; (word) { U0, U1, ..., U7 } + pand m3, m1, m5 ; (word) { U8, U9, ..., U15 } + psrlw m0, 8 ; (word) { V0, V1, ..., V7 } + psrlw m1, 8 ; (word) { V8, V9, ..., V15 } + packuswb m2, m3 ; (byte) { U0, ..., U15 } + packuswb m0, m1 ; (byte) { V0, ..., V15 } +%ifidn %2, nv12 + mova [dstUq+wq], m2 + mova [dstVq+wq], m0 +%else ; nv21 + mova [dstVq+wq], m2 + mova [dstUq+wq], m0 +%endif ; nv12/21 + add wq, mmsize + jl .loop_%1 + REP_RET +%endmacro + +; %1 = nr. of XMM registers +; %2 = nv12 or nv21 +%macro NVXX_TO_UV_FN 2 +cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w +%if ARCH_X86_64 + movsxd wq, dword r5m +%else ; x86-32 + mov wq, r5m +%endif + add dstUq, wq + add dstVq, wq +%if mmsize == 16 + test srcq, 15 +%endif + lea srcq, [srcq+wq*2] + pcmpeqb m5, m5 ; (byte) { 0xff } x 16 + psrlw m5, 8 ; (word) { 0x00ff } x 8 +%if mmsize == 16 + jnz .loop_u_start + neg wq + LOOP_NVXX_TO_UV a, %2 +.loop_u_start: + neg wq + LOOP_NVXX_TO_UV u, %2 +%else ; mmsize == 8 + neg wq + LOOP_NVXX_TO_UV a, %2 +%endif ; mmsize == 8/16 +%endmacro + +%if ARCH_X86_32 +INIT_MMX mmx +YUYV_TO_Y_FN 0, yuyv +YUYV_TO_Y_FN 0, uyvy +YUYV_TO_UV_FN 0, yuyv +YUYV_TO_UV_FN 0, uyvy +NVXX_TO_UV_FN 0, nv12 +NVXX_TO_UV_FN 0, nv21 +%endif + +INIT_XMM sse2 +YUYV_TO_Y_FN 3, yuyv +YUYV_TO_Y_FN 2, uyvy +YUYV_TO_UV_FN 3, yuyv +YUYV_TO_UV_FN 3, uyvy +NVXX_TO_UV_FN 5, nv12 +NVXX_TO_UV_FN 5, nv21 + +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but +; that's not faster in practice +YUYV_TO_UV_FN 3, yuyv +YUYV_TO_UV_FN 3, uyvy, 1 +NVXX_TO_UV_FN 5, nv12 +NVXX_TO_UV_FN 5, nv21 +%endif diff --git a/ffmpeg1/libswscale/x86/output.asm b/ffmpeg1/libswscale/x86/output.asm new file mode 100644 index 0000000..f9add35 --- /dev/null +++ b/ffmpeg1/libswscale/x86/output.asm @@ -0,0 +1,413 @@ +;****************************************************************************** +;* x86-optimized vertical line scaling functions +;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> +;* Kieran Kunhya <kieran@kunhya.com> +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +minshort: times 8 dw 0x8000 +yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000 +yuv2yuvX_10_start: times 4 dd 0x10000 +yuv2yuvX_9_start: times 4 dd 0x20000 +yuv2yuvX_10_upper: times 8 dw 0x3ff +yuv2yuvX_9_upper: times 8 dw 0x1ff +pd_4: times 4 dd 4 +pd_4min0x40000:times 4 dd 4 - (0x40000) +pw_16: times 8 dw 16 +pw_32: times 8 dw 32 +pw_512: times 8 dw 512 +pw_1024: times 8 dw 1024 + +SECTION .text + +;----------------------------------------------------------------------------- +; vertical line scaling +; +; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW, +; const uint8_t *dither, int offset) +; and +; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize, +; const int16_t **src, uint8_t *dst, int dstW, +; const uint8_t *dither, int offset) +; +; Scale one or $filterSize lines of source data to generate one line of output +; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in +; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple +; of 2. $offset is either 0 or 3. $dither holds 8 values. +;----------------------------------------------------------------------------- + +%macro yuv2planeX_fn 3 + +%if ARCH_X86_32 +%define cntr_reg fltsizeq +%define movsx mov +%else +%define cntr_reg r7 +%define movsx movsxd +%endif + +cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset +%if %1 == 8 || %1 == 9 || %1 == 10 + pxor m6, m6 +%endif ; %1 == 8/9/10 + +%if %1 == 8 +%if ARCH_X86_32 +%assign pad 0x2c - (stack_offset & 15) + SUB rsp, pad +%define m_dith m7 +%else ; x86-64 +%define m_dith m9 +%endif ; x86-32 + + ; create registers holding dither + movq m_dith, [ditherq] ; dither + test offsetd, offsetd + jz .no_rot +%if mmsize == 16 + punpcklqdq m_dith, m_dith +%endif ; mmsize == 16 + PALIGNR m_dith, m_dith, 3, m0 +.no_rot: +%if mmsize == 16 + punpcklbw m_dith, m6 +%if ARCH_X86_64 + punpcklwd m8, m_dith, m6 + pslld m8, 12 +%else ; x86-32 + punpcklwd m5, m_dith, m6 + pslld m5, 12 +%endif ; x86-32/64 + punpckhwd m_dith, m6 + pslld m_dith, 12 +%if ARCH_X86_32 + mova [rsp+ 0], m5 + mova [rsp+16], m_dith +%endif +%else ; mmsize == 8 + punpcklbw m5, m_dith, m6 + punpckhbw m_dith, m6 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpcklwd m3, m_dith, m6 + punpckhwd m_dith, m6 + pslld m4, 12 + pslld m5, 12 + pslld m3, 12 + pslld m_dith, 12 + mova [rsp+ 0], m4 + mova [rsp+ 8], m5 + mova [rsp+16], m3 + mova [rsp+24], m_dith +%endif ; mmsize == 8/16 +%endif ; %1 == 8 + + xor r5, r5 + +.pixelloop: +%assign %%i 0 + ; the rep here is for the 8bit output mmx case, where dither covers + ; 8 pixels but we can only handle 2 pixels per register, and thus 4 + ; pixels per iteration. In order to not have to keep track of where + ; we are w.r.t. dithering, we unroll the mmx/8bit loop x2. 
+%if %1 == 8 +%assign %%repcnt 16/mmsize +%else +%assign %%repcnt 1 +%endif + +%rep %%repcnt + +%if %1 == 8 +%if ARCH_X86_32 + mova m2, [rsp+mmsize*(0+%%i)] + mova m1, [rsp+mmsize*(1+%%i)] +%else ; x86-64 + mova m2, m8 + mova m1, m_dith +%endif ; x86-32/64 +%else ; %1 == 9/10/16 + mova m1, [yuv2yuvX_%1_start] + mova m2, m1 +%endif ; %1 == 8/9/10/16 + movsx cntr_reg, fltsizem +.filterloop_ %+ %%i: + ; input pixels + mov r6, [srcq+gprsize*cntr_reg-2*gprsize] +%if %1 == 16 + mova m3, [r6+r5*4] + mova m5, [r6+r5*4+mmsize] +%else ; %1 == 8/9/10 + mova m3, [r6+r5*2] +%endif ; %1 == 8/9/10/16 + mov r6, [srcq+gprsize*cntr_reg-gprsize] +%if %1 == 16 + mova m4, [r6+r5*4] + mova m6, [r6+r5*4+mmsize] +%else ; %1 == 8/9/10 + mova m4, [r6+r5*2] +%endif ; %1 == 8/9/10/16 + + ; coefficients + movd m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1] +%if %1 == 16 + pshuflw m7, m0, 0 ; coeff[0] + pshuflw m0, m0, 0x55 ; coeff[1] + pmovsxwd m7, m7 ; word -> dword + pmovsxwd m0, m0 ; word -> dword + + pmulld m3, m7 + pmulld m5, m7 + pmulld m4, m0 + pmulld m6, m0 + + paddd m2, m3 + paddd m1, m5 + paddd m2, m4 + paddd m1, m6 +%else ; %1 == 10/9/8 + punpcklwd m5, m3, m4 + punpckhwd m3, m4 + SPLATD m0 + + pmaddwd m5, m0 + pmaddwd m3, m0 + + paddd m2, m5 + paddd m1, m3 +%endif ; %1 == 8/9/10/16 + + sub cntr_reg, 2 + jg .filterloop_ %+ %%i + +%if %1 == 16 + psrad m2, 31 - %1 + psrad m1, 31 - %1 +%else ; %1 == 10/9/8 + psrad m2, 27 - %1 + psrad m1, 27 - %1 +%endif ; %1 == 8/9/10/16 + +%if %1 == 8 + packssdw m2, m1 + packuswb m2, m2 + movh [dstq+r5*1], m2 +%else ; %1 == 9/10/16 +%if %1 == 16 + packssdw m2, m1 + paddw m2, [minshort] +%else ; %1 == 9/10 +%if cpuflag(sse4) + packusdw m2, m1 +%else ; mmxext/sse2 + packssdw m2, m1 + pmaxsw m2, m6 +%endif ; mmxext/sse2/sse4/avx + pminsw m2, [yuv2yuvX_%1_upper] +%endif ; %1 == 9/10/16 + mova [dstq+r5*2], m2 +%endif ; %1 == 8/9/10/16 + + add r5, mmsize/2 + sub wd, mmsize/2 + +%assign %%i %%i+2 +%endrep + jg .pixelloop + +%if %1 == 8 +%if ARCH_X86_32 + ADD rsp, pad + RET +%else ; x86-64 + REP_RET +%endif ; x86-32/64 +%else ; %1 == 9/10/16 + REP_RET +%endif ; %1 == 8/9/10/16 +%endmacro + +%if ARCH_X86_32 +INIT_MMX mmxext +yuv2planeX_fn 8, 0, 7 +yuv2planeX_fn 9, 0, 5 +yuv2planeX_fn 10, 0, 5 +%endif + +INIT_XMM sse2 +yuv2planeX_fn 8, 10, 7 +yuv2planeX_fn 9, 7, 5 +yuv2planeX_fn 10, 7, 5 + +INIT_XMM sse4 +yuv2planeX_fn 8, 10, 7 +yuv2planeX_fn 9, 7, 5 +yuv2planeX_fn 10, 7, 5 +yuv2planeX_fn 16, 8, 5 + +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +yuv2planeX_fn 8, 10, 7 +yuv2planeX_fn 9, 7, 5 +yuv2planeX_fn 10, 7, 5 +%endif + +; %1=outout-bpc, %2=alignment (u/a) +%macro yuv2plane1_mainloop 2 +.loop_%2: +%if %1 == 8 + paddsw m0, m2, [srcq+wq*2+mmsize*0] + paddsw m1, m3, [srcq+wq*2+mmsize*1] + psraw m0, 7 + psraw m1, 7 + packuswb m0, m1 + mov%2 [dstq+wq], m0 +%elif %1 == 16 + paddd m0, m4, [srcq+wq*4+mmsize*0] + paddd m1, m4, [srcq+wq*4+mmsize*1] + paddd m2, m4, [srcq+wq*4+mmsize*2] + paddd m3, m4, [srcq+wq*4+mmsize*3] + psrad m0, 3 + psrad m1, 3 + psrad m2, 3 + psrad m3, 3 +%if cpuflag(sse4) ; avx/sse4 + packusdw m0, m1 + packusdw m2, m3 +%else ; mmx/sse2 + packssdw m0, m1 + packssdw m2, m3 + paddw m0, m5 + paddw m2, m5 +%endif ; mmx/sse2/sse4/avx + mov%2 [dstq+wq*2+mmsize*0], m0 + mov%2 [dstq+wq*2+mmsize*1], m2 +%else ; %1 == 9/10 + paddsw m0, m2, [srcq+wq*2+mmsize*0] + paddsw m1, m2, [srcq+wq*2+mmsize*1] + psraw m0, 15 - %1 + psraw m1, 15 - %1 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m3 + pminsw m1, m3 + mov%2 [dstq+wq*2+mmsize*0], m0 + mov%2 [dstq+wq*2+mmsize*1], m1 +%endif + add wq, mmsize 
+ jl .loop_%2 +%endmacro + +%macro yuv2plane1_fn 3 +cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset + movsxdifnidn wq, wd + add wq, mmsize - 1 + and wq, ~(mmsize - 1) +%if %1 == 8 + add dstq, wq +%else ; %1 != 8 + lea dstq, [dstq+wq*2] +%endif ; %1 == 8 +%if %1 == 16 + lea srcq, [srcq+wq*4] +%else ; %1 != 16 + lea srcq, [srcq+wq*2] +%endif ; %1 == 16 + neg wq + +%if %1 == 8 + pxor m4, m4 ; zero + + ; create registers holding dither + movq m3, [ditherq] ; dither + test offsetd, offsetd + jz .no_rot +%if mmsize == 16 + punpcklqdq m3, m3 +%endif ; mmsize == 16 + PALIGNR m3, m3, 3, m2 +.no_rot: +%if mmsize == 8 + mova m2, m3 + punpckhbw m3, m4 ; byte->word + punpcklbw m2, m4 ; byte->word +%else + punpcklbw m3, m4 + mova m2, m3 +%endif +%elif %1 == 9 + pxor m4, m4 + mova m3, [pw_512] + mova m2, [pw_32] +%elif %1 == 10 + pxor m4, m4 + mova m3, [pw_1024] + mova m2, [pw_16] +%else ; %1 == 16 +%if cpuflag(sse4) ; sse4/avx + mova m4, [pd_4] +%else ; mmx/sse2 + mova m4, [pd_4min0x40000] + mova m5, [minshort] +%endif ; mmx/sse2/sse4/avx +%endif ; %1 == .. + + ; actual pixel scaling +%if mmsize == 8 + yuv2plane1_mainloop %1, a +%else ; mmsize == 16 + test dstq, 15 + jnz .unaligned + yuv2plane1_mainloop %1, a + REP_RET +.unaligned: + yuv2plane1_mainloop %1, u +%endif ; mmsize == 8/16 + REP_RET +%endmacro + +%if ARCH_X86_32 +INIT_MMX mmx +yuv2plane1_fn 8, 0, 5 +yuv2plane1_fn 16, 0, 3 + +INIT_MMX mmxext +yuv2plane1_fn 9, 0, 3 +yuv2plane1_fn 10, 0, 3 +%endif + +INIT_XMM sse2 +yuv2plane1_fn 8, 5, 5 +yuv2plane1_fn 9, 5, 3 +yuv2plane1_fn 10, 5, 3 +yuv2plane1_fn 16, 6, 3 + +INIT_XMM sse4 +yuv2plane1_fn 16, 5, 3 + +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +yuv2plane1_fn 8, 5, 5 +yuv2plane1_fn 9, 5, 3 +yuv2plane1_fn 10, 5, 3 +yuv2plane1_fn 16, 5, 3 +%endif diff --git a/ffmpeg1/libswscale/x86/rgb2rgb.c b/ffmpeg1/libswscale/x86/rgb2rgb.c new file mode 100644 index 0000000..1e20176 --- /dev/null +++ b/ffmpeg1/libswscale/x86/rgb2rgb.c @@ -0,0 +1,149 @@ +/* + * software RGB to RGB converter + * pluralize by software PAL8 to RGB converter + * software YUV to YUV converter + * software YUV to RGB converter + * Written by Nick Kurshev. + * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavutil/cpu.h" +#include "libavutil/bswap.h" +#include "libswscale/rgb2rgb.h" +#include "libswscale/swscale.h" +#include "libswscale/swscale_internal.h" + +#if HAVE_INLINE_ASM + +DECLARE_ASM_CONST(8, uint64_t, mmx_ff) = 0x00000000000000FFULL; +DECLARE_ASM_CONST(8, uint64_t, mmx_null) = 0x0000000000000000ULL; +DECLARE_ASM_CONST(8, uint64_t, mmx_one) = 0xFFFFFFFFFFFFFFFFULL; +DECLARE_ASM_CONST(8, uint64_t, mask32b) = 0x000000FF000000FFULL; +DECLARE_ASM_CONST(8, uint64_t, mask32g) = 0x0000FF000000FF00ULL; +DECLARE_ASM_CONST(8, uint64_t, mask32r) = 0x00FF000000FF0000ULL; +DECLARE_ASM_CONST(8, uint64_t, mask32a) = 0xFF000000FF000000ULL; +DECLARE_ASM_CONST(8, uint64_t, mask32) = 0x00FFFFFF00FFFFFFULL; +DECLARE_ASM_CONST(8, uint64_t, mask3216br) = 0x00F800F800F800F8ULL; +DECLARE_ASM_CONST(8, uint64_t, mask3216g) = 0x0000FC000000FC00ULL; +DECLARE_ASM_CONST(8, uint64_t, mask3215g) = 0x0000F8000000F800ULL; +DECLARE_ASM_CONST(8, uint64_t, mul3216) = 0x2000000420000004ULL; +DECLARE_ASM_CONST(8, uint64_t, mul3215) = 0x2000000820000008ULL; +DECLARE_ASM_CONST(8, uint64_t, mask24b) = 0x00FF0000FF0000FFULL; +DECLARE_ASM_CONST(8, uint64_t, mask24g) = 0xFF0000FF0000FF00ULL; +DECLARE_ASM_CONST(8, uint64_t, mask24r) = 0x0000FF0000FF0000ULL; +DECLARE_ASM_CONST(8, uint64_t, mask24l) = 0x0000000000FFFFFFULL; +DECLARE_ASM_CONST(8, uint64_t, mask24h) = 0x0000FFFFFF000000ULL; +DECLARE_ASM_CONST(8, uint64_t, mask24hh) = 0xffff000000000000ULL; +DECLARE_ASM_CONST(8, uint64_t, mask24hhh) = 0xffffffff00000000ULL; +DECLARE_ASM_CONST(8, uint64_t, mask24hhhh) = 0xffffffffffff0000ULL; +DECLARE_ASM_CONST(8, uint64_t, mask15b) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */ +DECLARE_ASM_CONST(8, uint64_t, mask15rg) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */ +DECLARE_ASM_CONST(8, uint64_t, mask15s) = 0xFFE0FFE0FFE0FFE0ULL; +DECLARE_ASM_CONST(8, uint64_t, mask15g) = 0x03E003E003E003E0ULL; +DECLARE_ASM_CONST(8, uint64_t, mask15r) = 0x7C007C007C007C00ULL; +#define mask16b mask15b +DECLARE_ASM_CONST(8, uint64_t, mask16g) = 0x07E007E007E007E0ULL; +DECLARE_ASM_CONST(8, uint64_t, mask16r) = 0xF800F800F800F800ULL; +DECLARE_ASM_CONST(8, uint64_t, red_16mask) = 0x0000f8000000f800ULL; +DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL; +DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL; +DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL; +DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL; +DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL; +DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL; +DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL; +DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL; + +#define RGB2YUV_SHIFT 8 +#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5)) +#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5)) +#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5)) +#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5)) +#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5)) +#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5)) +#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5)) +#define RV ((int)( 
0.439*(1<<RGB2YUV_SHIFT)+0.5)) +#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5)) + +// Note: We have C, MMX, MMXEXT, 3DNOW versions, there is no 3DNOW + MMXEXT one. + +#define COMPILE_TEMPLATE_MMXEXT 0 +#define COMPILE_TEMPLATE_AMD3DNOW 0 +#define COMPILE_TEMPLATE_SSE2 0 + +//MMX versions +#undef RENAME +#define RENAME(a) a ## _MMX +#include "rgb2rgb_template.c" + +// MMXEXT versions +#undef RENAME +#undef COMPILE_TEMPLATE_MMXEXT +#define COMPILE_TEMPLATE_MMXEXT 1 +#define RENAME(a) a ## _MMXEXT +#include "rgb2rgb_template.c" + +//SSE2 versions +#undef RENAME +#undef COMPILE_TEMPLATE_SSE2 +#define COMPILE_TEMPLATE_SSE2 1 +#define RENAME(a) a ## _SSE2 +#include "rgb2rgb_template.c" + +//3DNOW versions +#undef RENAME +#undef COMPILE_TEMPLATE_MMXEXT +#undef COMPILE_TEMPLATE_SSE2 +#undef COMPILE_TEMPLATE_AMD3DNOW +#define COMPILE_TEMPLATE_MMXEXT 0 +#define COMPILE_TEMPLATE_SSE2 0 +#define COMPILE_TEMPLATE_AMD3DNOW 1 +#define RENAME(a) a ## _3DNOW +#include "rgb2rgb_template.c" + +/* + RGB15->RGB16 original by Strepto/Astral + ported to gcc & bugfixed : A'rpi + MMXEXT, 3DNOW optimization by Nick Kurshev + 32-bit C version, and and&add trick by Michael Niedermayer +*/ + +#endif /* HAVE_INLINE_ASM */ + +av_cold void rgb2rgb_init_x86(void) +{ +#if HAVE_INLINE_ASM + int cpu_flags = av_get_cpu_flags(); + + if (INLINE_MMX(cpu_flags)) + rgb2rgb_init_MMX(); + if (INLINE_AMD3DNOW(cpu_flags)) + rgb2rgb_init_3DNOW(); + if (INLINE_MMXEXT(cpu_flags)) + rgb2rgb_init_MMXEXT(); + if (INLINE_SSE2(cpu_flags)) + rgb2rgb_init_SSE2(); +#endif /* HAVE_INLINE_ASM */ +} diff --git a/ffmpeg1/libswscale/x86/rgb2rgb_template.c b/ffmpeg1/libswscale/x86/rgb2rgb_template.c new file mode 100644 index 0000000..d802ab4 --- /dev/null +++ b/ffmpeg1/libswscale/x86/rgb2rgb_template.c @@ -0,0 +1,2498 @@ +/* + * software RGB to RGB converter + * pluralize by software PAL8 to RGB converter + * software YUV to YUV converter + * software YUV to RGB converter + * Written by Nick Kurshev. + * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) + * lot of big-endian byte order fixes by Alex Beregszaszi + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stddef.h> + +#undef PREFETCH +#undef MOVNTQ +#undef EMMS +#undef SFENCE +#undef PAVGB + +#if COMPILE_TEMPLATE_AMD3DNOW +#define PREFETCH "prefetch" +#define PAVGB "pavgusb" +#elif COMPILE_TEMPLATE_MMXEXT +#define PREFETCH "prefetchnta" +#define PAVGB "pavgb" +#else +#define PREFETCH " # nop" +#endif + +#if COMPILE_TEMPLATE_AMD3DNOW +/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. 
*/ +#define EMMS "femms" +#else +#define EMMS "emms" +#endif + +#if COMPILE_TEMPLATE_MMXEXT +#define MOVNTQ "movntq" +#define SFENCE "sfence" +#else +#define MOVNTQ "movq" +#define SFENCE " # nop" +#endif + +#if !COMPILE_TEMPLATE_SSE2 + +#if !COMPILE_TEMPLATE_AMD3DNOW + +static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size) +{ + uint8_t *dest = dst; + const uint8_t *s = src; + const uint8_t *end; + const uint8_t *mm_end; + end = s + src_size; + __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); + mm_end = end - 23; + __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory"); + while (s < mm_end) { + __asm__ volatile( + PREFETCH" 32(%1) \n\t" + "movd (%1), %%mm0 \n\t" + "punpckldq 3(%1), %%mm0 \n\t" + "movd 6(%1), %%mm1 \n\t" + "punpckldq 9(%1), %%mm1 \n\t" + "movd 12(%1), %%mm2 \n\t" + "punpckldq 15(%1), %%mm2 \n\t" + "movd 18(%1), %%mm3 \n\t" + "punpckldq 21(%1), %%mm3 \n\t" + "por %%mm7, %%mm0 \n\t" + "por %%mm7, %%mm1 \n\t" + "por %%mm7, %%mm2 \n\t" + "por %%mm7, %%mm3 \n\t" + MOVNTQ" %%mm0, (%0) \n\t" + MOVNTQ" %%mm1, 8(%0) \n\t" + MOVNTQ" %%mm2, 16(%0) \n\t" + MOVNTQ" %%mm3, 24(%0)" + :: "r"(dest), "r"(s) + :"memory"); + dest += 32; + s += 24; + } + __asm__ volatile(SFENCE:::"memory"); + __asm__ volatile(EMMS:::"memory"); + while (s < end) { + *dest++ = *s++; + *dest++ = *s++; + *dest++ = *s++; + *dest++ = 255; + } +} + +#define STORE_BGR24_MMX \ + "psrlq $8, %%mm2 \n\t" \ + "psrlq $8, %%mm3 \n\t" \ + "psrlq $8, %%mm6 \n\t" \ + "psrlq $8, %%mm7 \n\t" \ + "pand "MANGLE(mask24l)", %%mm0\n\t" \ + "pand "MANGLE(mask24l)", %%mm1\n\t" \ + "pand "MANGLE(mask24l)", %%mm4\n\t" \ + "pand "MANGLE(mask24l)", %%mm5\n\t" \ + "pand "MANGLE(mask24h)", %%mm2\n\t" \ + "pand "MANGLE(mask24h)", %%mm3\n\t" \ + "pand "MANGLE(mask24h)", %%mm6\n\t" \ + "pand "MANGLE(mask24h)", %%mm7\n\t" \ + "por %%mm2, %%mm0 \n\t" \ + "por %%mm3, %%mm1 \n\t" \ + "por %%mm6, %%mm4 \n\t" \ + "por %%mm7, %%mm5 \n\t" \ + \ + "movq %%mm1, %%mm2 \n\t" \ + "movq %%mm4, %%mm3 \n\t" \ + "psllq $48, %%mm2 \n\t" \ + "psllq $32, %%mm3 \n\t" \ + "por %%mm2, %%mm0 \n\t" \ + "psrlq $16, %%mm1 \n\t" \ + "psrlq $32, %%mm4 \n\t" \ + "psllq $16, %%mm5 \n\t" \ + "por %%mm3, %%mm1 \n\t" \ + "por %%mm5, %%mm4 \n\t" \ + \ + MOVNTQ" %%mm0, (%0) \n\t" \ + MOVNTQ" %%mm1, 8(%0) \n\t" \ + MOVNTQ" %%mm4, 16(%0)" + + +static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size) +{ + uint8_t *dest = dst; + const uint8_t *s = src; + const uint8_t *end; + const uint8_t *mm_end; + end = s + src_size; + __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); + mm_end = end - 31; + while (s < mm_end) { + __asm__ volatile( + PREFETCH" 32(%1) \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "movq 16(%1), %%mm4 \n\t" + "movq 24(%1), %%mm5 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm1, %%mm3 \n\t" + "movq %%mm4, %%mm6 \n\t" + "movq %%mm5, %%mm7 \n\t" + STORE_BGR24_MMX + :: "r"(dest), "r"(s) + :"memory"); + dest += 24; + s += 32; + } + __asm__ volatile(SFENCE:::"memory"); + __asm__ volatile(EMMS:::"memory"); + while (s < end) { + *dest++ = *s++; + *dest++ = *s++; + *dest++ = *s++; + s++; + } +} + +/* + original by Strepto/Astral + ported to gcc & bugfixed: A'rpi + MMXEXT, 3DNOW optimization by Nick Kurshev + 32-bit C version, and and&add trick by Michael Niedermayer +*/ +static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size) +{ + register const uint8_t* s=src; + register uint8_t* d=dst; + register const uint8_t *end; + const uint8_t *mm_end; + end = s + src_size; + 
__asm__ volatile(PREFETCH" %0"::"m"(*s)); + __asm__ volatile("movq %0, %%mm4"::"m"(mask15s)); + mm_end = end - 15; + while (s<mm_end) { + __asm__ volatile( + PREFETCH" 32(%1) \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "pand %%mm4, %%mm0 \n\t" + "pand %%mm4, %%mm2 \n\t" + "paddw %%mm1, %%mm0 \n\t" + "paddw %%mm3, %%mm2 \n\t" + MOVNTQ" %%mm0, (%0) \n\t" + MOVNTQ" %%mm2, 8(%0)" + :: "r"(d), "r"(s) + ); + d+=16; + s+=16; + } + __asm__ volatile(SFENCE:::"memory"); + __asm__ volatile(EMMS:::"memory"); + mm_end = end - 3; + while (s < mm_end) { + register unsigned x= *((const uint32_t *)s); + *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0); + d+=4; + s+=4; + } + if (s < end) { + register unsigned short x= *((const uint16_t *)s); + *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0); + } +} + +static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size) +{ + register const uint8_t* s=src; + register uint8_t* d=dst; + register const uint8_t *end; + const uint8_t *mm_end; + end = s + src_size; + __asm__ volatile(PREFETCH" %0"::"m"(*s)); + __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg)); + __asm__ volatile("movq %0, %%mm6"::"m"(mask15b)); + mm_end = end - 15; + while (s<mm_end) { + __asm__ volatile( + PREFETCH" 32(%1) \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "psrlq $1, %%mm0 \n\t" + "psrlq $1, %%mm2 \n\t" + "pand %%mm7, %%mm0 \n\t" + "pand %%mm7, %%mm2 \n\t" + "pand %%mm6, %%mm1 \n\t" + "pand %%mm6, %%mm3 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm3, %%mm2 \n\t" + MOVNTQ" %%mm0, (%0) \n\t" + MOVNTQ" %%mm2, 8(%0)" + :: "r"(d), "r"(s) + ); + d+=16; + s+=16; + } + __asm__ volatile(SFENCE:::"memory"); + __asm__ volatile(EMMS:::"memory"); + mm_end = end - 3; + while (s < mm_end) { + register uint32_t x= *((const uint32_t*)s); + *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F); + s+=4; + d+=4; + } + if (s < end) { + register uint16_t x= *((const uint16_t*)s); + *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F); + } +} + +static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size) +{ + const uint8_t *s = src; + const uint8_t *end; + const uint8_t *mm_end; + uint16_t *d = (uint16_t *)dst; + end = s + src_size; + mm_end = end - 15; + __asm__ volatile( + "movq %3, %%mm5 \n\t" + "movq %4, %%mm6 \n\t" + "movq %5, %%mm7 \n\t" + "jmp 2f \n\t" + ".p2align 4 \n\t" + "1: \n\t" + PREFETCH" 32(%1) \n\t" + "movd (%1), %%mm0 \n\t" + "movd 4(%1), %%mm3 \n\t" + "punpckldq 8(%1), %%mm0 \n\t" + "punpckldq 12(%1), %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm3, %%mm4 \n\t" + "pand %%mm6, %%mm0 \n\t" + "pand %%mm6, %%mm3 \n\t" + "pmaddwd %%mm7, %%mm0 \n\t" + "pmaddwd %%mm7, %%mm3 \n\t" + "pand %%mm5, %%mm1 \n\t" + "pand %%mm5, %%mm4 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "psrld $5, %%mm0 \n\t" + "pslld $11, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, (%0) \n\t" + "add $16, %1 \n\t" + "add $8, %0 \n\t" + "2: \n\t" + "cmp %2, %1 \n\t" + " jb 1b \n\t" + : "+r" (d), "+r"(s) + : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) + ); + __asm__ volatile(SFENCE:::"memory"); + __asm__ volatile(EMMS:::"memory"); + while (s < end) { + register int rgb = *(const uint32_t*)s; s += 4; + *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8); + } +} + +static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size) +{ + const uint8_t *s = src; + const uint8_t *end; 
+ const uint8_t *mm_end; + uint16_t *d = (uint16_t *)dst; + end = s + src_size; + __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); + __asm__ volatile( + "movq %0, %%mm7 \n\t" + "movq %1, %%mm6 \n\t" + ::"m"(red_16mask),"m"(green_16mask)); + mm_end = end - 15; + while (s < mm_end) { + __asm__ volatile( + PREFETCH" 32(%1) \n\t" + "movd (%1), %%mm0 \n\t" + "movd 4(%1), %%mm3 \n\t" + "punpckldq 8(%1), %%mm0 \n\t" + "punpckldq 12(%1), %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm3, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "psllq $8, %%mm0 \n\t" + "psllq $8, %%mm3 \n\t" + "pand %%mm7, %%mm0 \n\t" + "pand %%mm7, %%mm3 \n\t" + "psrlq $5, %%mm1 \n\t" + "psrlq $5, %%mm4 \n\t" + "pand %%mm6, %%mm1 \n\t" + "pand %%mm6, %%mm4 \n\t" + "psrlq $19, %%mm2 \n\t" + "psrlq $19, %%mm5 \n\t" + "pand %2, %%mm2 \n\t" + "pand %2, %%mm5 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm5, %%mm3 \n\t" + "psllq $16, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, (%0) \n\t" + :: "r"(d),"r"(s),"m"(blue_16mask):"memory"); + d += 4; + s += 16; + } + __asm__ volatile(SFENCE:::"memory"); + __asm__ volatile(EMMS:::"memory"); + while (s < end) { + register int rgb = *(const uint32_t*)s; s += 4; + *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19); + } +} + +static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size) +{ + const uint8_t *s = src; + const uint8_t *end; + const uint8_t *mm_end; + uint16_t *d = (uint16_t *)dst; + end = s + src_size; + mm_end = end - 15; + __asm__ volatile( + "movq %3, %%mm5 \n\t" + "movq %4, %%mm6 \n\t" + "movq %5, %%mm7 \n\t" + "jmp 2f \n\t" + ".p2align 4 \n\t" + "1: \n\t" + PREFETCH" 32(%1) \n\t" + "movd (%1), %%mm0 \n\t" + "movd 4(%1), %%mm3 \n\t" + "punpckldq 8(%1), %%mm0 \n\t" + "punpckldq 12(%1), %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm3, %%mm4 \n\t" + "pand %%mm6, %%mm0 \n\t" + "pand %%mm6, %%mm3 \n\t" + "pmaddwd %%mm7, %%mm0 \n\t" + "pmaddwd %%mm7, %%mm3 \n\t" + "pand %%mm5, %%mm1 \n\t" + "pand %%mm5, %%mm4 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "psrld $6, %%mm0 \n\t" + "pslld $10, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, (%0) \n\t" + "add $16, %1 \n\t" + "add $8, %0 \n\t" + "2: \n\t" + "cmp %2, %1 \n\t" + " jb 1b \n\t" + : "+r" (d), "+r"(s) + : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) + ); + __asm__ volatile(SFENCE:::"memory"); + __asm__ volatile(EMMS:::"memory"); + while (s < end) { + register int rgb = *(const uint32_t*)s; s += 4; + *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9); + } +} + +static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size) +{ + const uint8_t *s = src; + const uint8_t *end; + const uint8_t *mm_end; + uint16_t *d = (uint16_t *)dst; + end = s + src_size; + __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); + __asm__ volatile( + "movq %0, %%mm7 \n\t" + "movq %1, %%mm6 \n\t" + ::"m"(red_15mask),"m"(green_15mask)); + mm_end = end - 15; + while (s < mm_end) { + __asm__ volatile( + PREFETCH" 32(%1) \n\t" + "movd (%1), %%mm0 \n\t" + "movd 4(%1), %%mm3 \n\t" + "punpckldq 8(%1), %%mm0 \n\t" + "punpckldq 12(%1), %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm3, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "psllq $7, %%mm0 \n\t" + "psllq $7, %%mm3 \n\t" + "pand %%mm7, %%mm0 \n\t" + "pand %%mm7, %%mm3 \n\t" + "psrlq $6, %%mm1 \n\t" + "psrlq $6, %%mm4 \n\t" + "pand %%mm6, %%mm1 \n\t" + "pand %%mm6, %%mm4 
\n\t" + "psrlq $19, %%mm2 \n\t" + "psrlq $19, %%mm5 \n\t" + "pand %2, %%mm2 \n\t" + "pand %2, %%mm5 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm5, %%mm3 \n\t" + "psllq $16, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, (%0) \n\t" + ::"r"(d),"r"(s),"m"(blue_15mask):"memory"); + d += 4; + s += 16; + } + __asm__ volatile(SFENCE:::"memory"); + __asm__ volatile(EMMS:::"memory"); + while (s < end) { + register int rgb = *(const uint32_t*)s; s += 4; + *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19); + } +} + +static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size) +{ + const uint8_t *s = src; + const uint8_t *end; + const uint8_t *mm_end; + uint16_t *d = (uint16_t *)dst; + end = s + src_size; + __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); + __asm__ volatile( + "movq %0, %%mm7 \n\t" + "movq %1, %%mm6 \n\t" + ::"m"(red_16mask),"m"(green_16mask)); + mm_end = end - 11; + while (s < mm_end) { + __asm__ volatile( + PREFETCH" 32(%1) \n\t" + "movd (%1), %%mm0 \n\t" + "movd 3(%1), %%mm3 \n\t" + "punpckldq 6(%1), %%mm0 \n\t" + "punpckldq 9(%1), %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm3, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "psrlq $3, %%mm0 \n\t" + "psrlq $3, %%mm3 \n\t" + "pand %2, %%mm0 \n\t" + "pand %2, %%mm3 \n\t" + "psrlq $5, %%mm1 \n\t" + "psrlq $5, %%mm4 \n\t" + "pand %%mm6, %%mm1 \n\t" + "pand %%mm6, %%mm4 \n\t" + "psrlq $8, %%mm2 \n\t" + "psrlq $8, %%mm5 \n\t" + "pand %%mm7, %%mm2 \n\t" + "pand %%mm7, %%mm5 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm5, %%mm3 \n\t" + "psllq $16, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, (%0) \n\t" + ::"r"(d),"r"(s),"m"(blue_16mask):"memory"); + d += 4; + s += 12; + } + __asm__ volatile(SFENCE:::"memory"); + __asm__ volatile(EMMS:::"memory"); + while (s < end) { + const int b = *s++; + const int g = *s++; + const int r = *s++; + *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); + } +} + +static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size) +{ + const uint8_t *s = src; + const uint8_t *end; + const uint8_t *mm_end; + uint16_t *d = (uint16_t *)dst; + end = s + src_size; + __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); + __asm__ volatile( + "movq %0, %%mm7 \n\t" + "movq %1, %%mm6 \n\t" + ::"m"(red_16mask),"m"(green_16mask)); + mm_end = end - 15; + while (s < mm_end) { + __asm__ volatile( + PREFETCH" 32(%1) \n\t" + "movd (%1), %%mm0 \n\t" + "movd 3(%1), %%mm3 \n\t" + "punpckldq 6(%1), %%mm0 \n\t" + "punpckldq 9(%1), %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm3, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "psllq $8, %%mm0 \n\t" + "psllq $8, %%mm3 \n\t" + "pand %%mm7, %%mm0 \n\t" + "pand %%mm7, %%mm3 \n\t" + "psrlq $5, %%mm1 \n\t" + "psrlq $5, %%mm4 \n\t" + "pand %%mm6, %%mm1 \n\t" + "pand %%mm6, %%mm4 \n\t" + "psrlq $19, %%mm2 \n\t" + "psrlq $19, %%mm5 \n\t" + "pand %2, %%mm2 \n\t" + "pand %2, %%mm5 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm5, %%mm3 \n\t" + "psllq $16, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, (%0) \n\t" + ::"r"(d),"r"(s),"m"(blue_16mask):"memory"); + d += 4; + s += 12; + } + __asm__ volatile(SFENCE:::"memory"); + __asm__ volatile(EMMS:::"memory"); + while (s < end) { + const int r = *s++; + const int g = *s++; + const int b = *s++; + *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); + } +} + +static inline 
void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size) +{ + const uint8_t *s = src; + const uint8_t *end; + const uint8_t *mm_end; + uint16_t *d = (uint16_t *)dst; + end = s + src_size; + __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); + __asm__ volatile( + "movq %0, %%mm7 \n\t" + "movq %1, %%mm6 \n\t" + ::"m"(red_15mask),"m"(green_15mask)); + mm_end = end - 11; + while (s < mm_end) { + __asm__ volatile( + PREFETCH" 32(%1) \n\t" + "movd (%1), %%mm0 \n\t" + "movd 3(%1), %%mm3 \n\t" + "punpckldq 6(%1), %%mm0 \n\t" + "punpckldq 9(%1), %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm3, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "psrlq $3, %%mm0 \n\t" + "psrlq $3, %%mm3 \n\t" + "pand %2, %%mm0 \n\t" + "pand %2, %%mm3 \n\t" + "psrlq $6, %%mm1 \n\t" + "psrlq $6, %%mm4 \n\t" + "pand %%mm6, %%mm1 \n\t" + "pand %%mm6, %%mm4 \n\t" + "psrlq $9, %%mm2 \n\t" + "psrlq $9, %%mm5 \n\t" + "pand %%mm7, %%mm2 \n\t" + "pand %%mm7, %%mm5 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm5, %%mm3 \n\t" + "psllq $16, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, (%0) \n\t" + ::"r"(d),"r"(s),"m"(blue_15mask):"memory"); + d += 4; + s += 12; + } + __asm__ volatile(SFENCE:::"memory"); + __asm__ volatile(EMMS:::"memory"); + while (s < end) { + const int b = *s++; + const int g = *s++; + const int r = *s++; + *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); + } +} + +static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size) +{ + const uint8_t *s = src; + const uint8_t *end; + const uint8_t *mm_end; + uint16_t *d = (uint16_t *)dst; + end = s + src_size; + __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); + __asm__ volatile( + "movq %0, %%mm7 \n\t" + "movq %1, %%mm6 \n\t" + ::"m"(red_15mask),"m"(green_15mask)); + mm_end = end - 15; + while (s < mm_end) { + __asm__ volatile( + PREFETCH" 32(%1) \n\t" + "movd (%1), %%mm0 \n\t" + "movd 3(%1), %%mm3 \n\t" + "punpckldq 6(%1), %%mm0 \n\t" + "punpckldq 9(%1), %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm3, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "psllq $7, %%mm0 \n\t" + "psllq $7, %%mm3 \n\t" + "pand %%mm7, %%mm0 \n\t" + "pand %%mm7, %%mm3 \n\t" + "psrlq $6, %%mm1 \n\t" + "psrlq $6, %%mm4 \n\t" + "pand %%mm6, %%mm1 \n\t" + "pand %%mm6, %%mm4 \n\t" + "psrlq $19, %%mm2 \n\t" + "psrlq $19, %%mm5 \n\t" + "pand %2, %%mm2 \n\t" + "pand %2, %%mm5 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm5, %%mm3 \n\t" + "psllq $16, %%mm3 \n\t" + "por %%mm3, %%mm0 \n\t" + MOVNTQ" %%mm0, (%0) \n\t" + ::"r"(d),"r"(s),"m"(blue_15mask):"memory"); + d += 4; + s += 12; + } + __asm__ volatile(SFENCE:::"memory"); + __asm__ volatile(EMMS:::"memory"); + while (s < end) { + const int r = *s++; + const int g = *s++; + const int b = *s++; + *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); + } +} + +static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size) +{ + const uint16_t *end; + const uint16_t *mm_end; + uint8_t *d = dst; + const uint16_t *s = (const uint16_t*)src; + end = s + src_size/2; + __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); + mm_end = end - 7; + while (s < mm_end) { + __asm__ volatile( + PREFETCH" 32(%1) \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1), %%mm1 \n\t" + "movq (%1), %%mm2 \n\t" + "pand %2, %%mm0 \n\t" + "pand %3, %%mm1 \n\t" + "pand %4, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" + "pmulhw 
"MANGLE(mul15_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" + "movq %%mm0, %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "movq %%mm2, %%mm5 \n\t" + "punpcklwd %5, %%mm0 \n\t" + "punpcklwd %5, %%mm1 \n\t" + "punpcklwd %5, %%mm2 \n\t" + "punpckhwd %5, %%mm3 \n\t" + "punpckhwd %5, %%mm4 \n\t" + "punpckhwd %5, %%mm5 \n\t" + "psllq $8, %%mm1 \n\t" + "psllq $16, %%mm2 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm2, %%mm0 \n\t" + "psllq $8, %%mm4 \n\t" + "psllq $16, %%mm5 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm5, %%mm3 \n\t" + + "movq %%mm0, %%mm6 \n\t" + "movq %%mm3, %%mm7 \n\t" + + "movq 8(%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "movq 8(%1), %%mm2 \n\t" + "pand %2, %%mm0 \n\t" + "pand %3, %%mm1 \n\t" + "pand %4, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" + "movq %%mm0, %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "movq %%mm2, %%mm5 \n\t" + "punpcklwd %5, %%mm0 \n\t" + "punpcklwd %5, %%mm1 \n\t" + "punpcklwd %5, %%mm2 \n\t" + "punpckhwd %5, %%mm3 \n\t" + "punpckhwd %5, %%mm4 \n\t" + "punpckhwd %5, %%mm5 \n\t" + "psllq $8, %%mm1 \n\t" + "psllq $16, %%mm2 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm2, %%mm0 \n\t" + "psllq $8, %%mm4 \n\t" + "psllq $16, %%mm5 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm5, %%mm3 \n\t" + + :"=m"(*d) + :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) + :"memory"); + /* borrowed 32 to 24 */ + __asm__ volatile( + "movq %%mm0, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "movq %%mm6, %%mm0 \n\t" + "movq %%mm7, %%mm1 \n\t" + + "movq %%mm4, %%mm6 \n\t" + "movq %%mm5, %%mm7 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm1, %%mm3 \n\t" + + STORE_BGR24_MMX + + :: "r"(d), "m"(*s) + :"memory"); + d += 24; + s += 8; + } + __asm__ volatile(SFENCE:::"memory"); + __asm__ volatile(EMMS:::"memory"); + while (s < end) { + register uint16_t bgr; + bgr = *s++; + *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); + *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7); + *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); + } +} + +static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size) +{ + const uint16_t *end; + const uint16_t *mm_end; + uint8_t *d = (uint8_t *)dst; + const uint16_t *s = (const uint16_t *)src; + end = s + src_size/2; + __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); + mm_end = end - 7; + while (s < mm_end) { + __asm__ volatile( + PREFETCH" 32(%1) \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1), %%mm1 \n\t" + "movq (%1), %%mm2 \n\t" + "pand %2, %%mm0 \n\t" + "pand %3, %%mm1 \n\t" + "pand %4, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "psrlq $1, %%mm2 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" + "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" + "movq %%mm0, %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "movq %%mm2, %%mm5 \n\t" + "punpcklwd %5, %%mm0 \n\t" + "punpcklwd %5, %%mm1 \n\t" + "punpcklwd %5, %%mm2 \n\t" + "punpckhwd %5, %%mm3 \n\t" + "punpckhwd %5, %%mm4 \n\t" + "punpckhwd %5, %%mm5 \n\t" + "psllq $8, %%mm1 \n\t" + "psllq $16, %%mm2 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm2, %%mm0 \n\t" + "psllq $8, %%mm4 \n\t" + "psllq $16, %%mm5 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm5, %%mm3 \n\t" + + "movq %%mm0, %%mm6 \n\t" + "movq %%mm3, %%mm7 \n\t" + + "movq 8(%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "movq 8(%1), %%mm2 \n\t" + "pand %2, %%mm0 \n\t" + "pand %3, %%mm1 \n\t" + "pand %4, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "psrlq $1, %%mm2 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm0 
\n\t" + "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" + "movq %%mm0, %%mm3 \n\t" + "movq %%mm1, %%mm4 \n\t" + "movq %%mm2, %%mm5 \n\t" + "punpcklwd %5, %%mm0 \n\t" + "punpcklwd %5, %%mm1 \n\t" + "punpcklwd %5, %%mm2 \n\t" + "punpckhwd %5, %%mm3 \n\t" + "punpckhwd %5, %%mm4 \n\t" + "punpckhwd %5, %%mm5 \n\t" + "psllq $8, %%mm1 \n\t" + "psllq $16, %%mm2 \n\t" + "por %%mm1, %%mm0 \n\t" + "por %%mm2, %%mm0 \n\t" + "psllq $8, %%mm4 \n\t" + "psllq $16, %%mm5 \n\t" + "por %%mm4, %%mm3 \n\t" + "por %%mm5, %%mm3 \n\t" + :"=m"(*d) + :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) + :"memory"); + /* borrowed 32 to 24 */ + __asm__ volatile( + "movq %%mm0, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "movq %%mm6, %%mm0 \n\t" + "movq %%mm7, %%mm1 \n\t" + + "movq %%mm4, %%mm6 \n\t" + "movq %%mm5, %%mm7 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm1, %%mm3 \n\t" + + STORE_BGR24_MMX + + :: "r"(d), "m"(*s) + :"memory"); + d += 24; + s += 8; + } + __asm__ volatile(SFENCE:::"memory"); + __asm__ volatile(EMMS:::"memory"); + while (s < end) { + register uint16_t bgr; + bgr = *s++; + *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); + *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9); + *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); + } +} + +/* + * mm0 = 00 B3 00 B2 00 B1 00 B0 + * mm1 = 00 G3 00 G2 00 G1 00 G0 + * mm2 = 00 R3 00 R2 00 R1 00 R0 + * mm6 = FF FF FF FF FF FF FF FF + * mm7 = 00 00 00 00 00 00 00 00 + */ +#define PACK_RGB32 \ + "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \ + "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \ + "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \ + "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \ + "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \ + "movq %%mm0, %%mm3 \n\t" \ + "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \ + "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \ + MOVNTQ" %%mm0, (%0) \n\t" \ + MOVNTQ" %%mm3, 8(%0) \n\t" \ + +static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size) +{ + const uint16_t *end; + const uint16_t *mm_end; + uint8_t *d = dst; + const uint16_t *s = (const uint16_t *)src; + end = s + src_size/2; + __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); + __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); + __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); + mm_end = end - 3; + while (s < mm_end) { + __asm__ volatile( + PREFETCH" 32(%1) \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1), %%mm1 \n\t" + "movq (%1), %%mm2 \n\t" + "pand %2, %%mm0 \n\t" + "pand %3, %%mm1 \n\t" + "pand %4, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "pmulhw %5, %%mm0 \n\t" + "pmulhw %5, %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" + PACK_RGB32 + ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid) + :"memory"); + d += 16; + s += 4; + } + __asm__ volatile(SFENCE:::"memory"); + __asm__ volatile(EMMS:::"memory"); + while (s < end) { + register uint16_t bgr; + bgr = *s++; + *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); + *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7); + *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); + *d++ = 255; + } +} + +static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size) +{ + const uint16_t *end; + const uint16_t *mm_end; + uint8_t *d = dst; + const uint16_t *s = (const uint16_t*)src; + end = s + src_size/2; + __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); + __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); + __asm__ volatile("pcmpeqd 
%%mm6,%%mm6 \n\t":::"memory"); + mm_end = end - 3; + while (s < mm_end) { + __asm__ volatile( + PREFETCH" 32(%1) \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1), %%mm1 \n\t" + "movq (%1), %%mm2 \n\t" + "pand %2, %%mm0 \n\t" + "pand %3, %%mm1 \n\t" + "pand %4, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "psrlq $1, %%mm2 \n\t" + "pmulhw %5, %%mm0 \n\t" + "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" + PACK_RGB32 + ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid) + :"memory"); + d += 16; + s += 4; + } + __asm__ volatile(SFENCE:::"memory"); + __asm__ volatile(EMMS:::"memory"); + while (s < end) { + register uint16_t bgr; + bgr = *s++; + *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); + *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9); + *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); + *d++ = 255; + } +} + +static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size) +{ + x86_reg idx = 15 - src_size; + const uint8_t *s = src-idx; + uint8_t *d = dst-idx; + __asm__ volatile( + "test %0, %0 \n\t" + "jns 2f \n\t" + PREFETCH" (%1, %0) \n\t" + "movq %3, %%mm7 \n\t" + "pxor %4, %%mm7 \n\t" + "movq %%mm7, %%mm6 \n\t" + "pxor %5, %%mm7 \n\t" + ".p2align 4 \n\t" + "1: \n\t" + PREFETCH" 32(%1, %0) \n\t" + "movq (%1, %0), %%mm0 \n\t" + "movq 8(%1, %0), %%mm1 \n\t" +# if COMPILE_TEMPLATE_MMXEXT + "pshufw $177, %%mm0, %%mm3 \n\t" + "pshufw $177, %%mm1, %%mm5 \n\t" + "pand %%mm7, %%mm0 \n\t" + "pand %%mm6, %%mm3 \n\t" + "pand %%mm7, %%mm1 \n\t" + "pand %%mm6, %%mm5 \n\t" + "por %%mm3, %%mm0 \n\t" + "por %%mm5, %%mm1 \n\t" +# else + "movq %%mm0, %%mm2 \n\t" + "movq %%mm1, %%mm4 \n\t" + "pand %%mm7, %%mm0 \n\t" + "pand %%mm6, %%mm2 \n\t" + "pand %%mm7, %%mm1 \n\t" + "pand %%mm6, %%mm4 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq %%mm4, %%mm5 \n\t" + "pslld $16, %%mm2 \n\t" + "psrld $16, %%mm3 \n\t" + "pslld $16, %%mm4 \n\t" + "psrld $16, %%mm5 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm4, %%mm1 \n\t" + "por %%mm3, %%mm0 \n\t" + "por %%mm5, %%mm1 \n\t" +# endif + MOVNTQ" %%mm0, (%2, %0) \n\t" + MOVNTQ" %%mm1, 8(%2, %0) \n\t" + "add $16, %0 \n\t" + "js 1b \n\t" + SFENCE" \n\t" + EMMS" \n\t" + "2: \n\t" + : "+&r"(idx) + : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one) + : "memory"); + for (; idx<15; idx+=4) { + register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00; + v &= 0xff00ff; + *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16); + } +} + +static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size) +{ + unsigned i; + x86_reg mmx_size= 23 - src_size; + __asm__ volatile ( + "test %%"REG_a", %%"REG_a" \n\t" + "jns 2f \n\t" + "movq "MANGLE(mask24r)", %%mm5 \n\t" + "movq "MANGLE(mask24g)", %%mm6 \n\t" + "movq "MANGLE(mask24b)", %%mm7 \n\t" + ".p2align 4 \n\t" + "1: \n\t" + PREFETCH" 32(%1, %%"REG_a") \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG + "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG + "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B + "psllq $16, %%mm0 \n\t" // 00 BGR BGR + "pand %%mm5, %%mm0 \n\t" + "pand %%mm6, %%mm1 \n\t" + "pand %%mm7, %%mm2 \n\t" + "por %%mm0, %%mm1 \n\t" + "por %%mm2, %%mm1 \n\t" + "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG + MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG + "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B + "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR + "pand %%mm7, %%mm0 \n\t" + "pand %%mm5, %%mm1 \n\t" + "pand %%mm6, %%mm2 \n\t" + "por %%mm0, %%mm1 \n\t" + "por %%mm2, %%mm1 \n\t" + "movq 14(%1, %%"REG_a"), %%mm0 
\n\t" // R BGR BGR B + MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R + "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR + "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG + "pand %%mm6, %%mm0 \n\t" + "pand %%mm7, %%mm1 \n\t" + "pand %%mm5, %%mm2 \n\t" + "por %%mm0, %%mm1 \n\t" + "por %%mm2, %%mm1 \n\t" + MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t" + "add $24, %%"REG_a" \n\t" + " js 1b \n\t" + "2: \n\t" + : "+a" (mmx_size) + : "r" (src-mmx_size), "r"(dst-mmx_size) + ); + + __asm__ volatile(SFENCE:::"memory"); + __asm__ volatile(EMMS:::"memory"); + + if (mmx_size==23) return; //finished, was multiple of 8 + + src+= src_size; + dst+= src_size; + src_size= 23-mmx_size; + src-= src_size; + dst-= src_size; + for (i=0; i<src_size; i+=3) { + register uint8_t x; + x = src[i + 2]; + dst[i + 1] = src[i + 1]; + dst[i + 2] = src[i + 0]; + dst[i + 0] = x; + } +} + +static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, + int width, int height, + int lumStride, int chromStride, int dstStride, int vertLumPerChroma) +{ + int y; + const x86_reg chromWidth= width>>1; + for (y=0; y<height; y++) { + //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) + __asm__ volatile( + "xor %%"REG_a", %%"REG_a" \n\t" + ".p2align 4 \n\t" + "1: \n\t" + PREFETCH" 32(%1, %%"REG_a", 2) \n\t" + PREFETCH" 32(%2, %%"REG_a") \n\t" + PREFETCH" 32(%3, %%"REG_a") \n\t" + "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) + "movq %%mm0, %%mm2 \n\t" // U(0) + "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) + "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) + "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) + + "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) + "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) + "movq %%mm3, %%mm4 \n\t" // Y(0) + "movq %%mm5, %%mm6 \n\t" // Y(8) + "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) + "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) + "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) + "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) + + MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t" + MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t" + MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t" + MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t" + + "add $8, %%"REG_a" \n\t" + "cmp %4, %%"REG_a" \n\t" + " jb 1b \n\t" + ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) + : "%"REG_a + ); + if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) { + usrc += chromStride; + vsrc += chromStride; + } + ysrc += lumStride; + dst += dstStride; + } + __asm__(EMMS" \n\t" + SFENCE" \n\t" + :::"memory"); +} + +/** + * Height should be a multiple of 2 and width should be a multiple of 16. + * (If this is a problem for anyone then tell me, and I will fix it.) 
+ */ +static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, + int width, int height, + int lumStride, int chromStride, int dstStride) +{ + //FIXME interpolate chroma + RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); +} + +static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, + int width, int height, + int lumStride, int chromStride, int dstStride, int vertLumPerChroma) +{ + int y; + const x86_reg chromWidth= width>>1; + for (y=0; y<height; y++) { + //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) + __asm__ volatile( + "xor %%"REG_a", %%"REG_a" \n\t" + ".p2align 4 \n\t" + "1: \n\t" + PREFETCH" 32(%1, %%"REG_a", 2) \n\t" + PREFETCH" 32(%2, %%"REG_a") \n\t" + PREFETCH" 32(%3, %%"REG_a") \n\t" + "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) + "movq %%mm0, %%mm2 \n\t" // U(0) + "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) + "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) + "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) + + "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) + "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) + "movq %%mm0, %%mm4 \n\t" // Y(0) + "movq %%mm2, %%mm6 \n\t" // Y(8) + "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0) + "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4) + "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8) + "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12) + + MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t" + MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t" + MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t" + MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t" + + "add $8, %%"REG_a" \n\t" + "cmp %4, %%"REG_a" \n\t" + " jb 1b \n\t" + ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) + : "%"REG_a + ); + if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) { + usrc += chromStride; + vsrc += chromStride; + } + ysrc += lumStride; + dst += dstStride; + } + __asm__(EMMS" \n\t" + SFENCE" \n\t" + :::"memory"); +} + +/** + * Height should be a multiple of 2 and width should be a multiple of 16 + * (If this is a problem for anyone then tell me, and I will fix it.) + */ +static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, + int width, int height, + int lumStride, int chromStride, int dstStride) +{ + //FIXME interpolate chroma + RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); +} + +/** + * Width should be a multiple of 16. + */ +static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, + int width, int height, + int lumStride, int chromStride, int dstStride) +{ + RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); +} + +/** + * Width should be a multiple of 16. + */ +static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, + int width, int height, + int lumStride, int chromStride, int dstStride) +{ + RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); +} + +/** + * Height should be a multiple of 2 and width should be a multiple of 16. + * (If this is a problem for anyone then tell me, and I will fix it.) 
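+ *
+ * Roughly the scalar equivalent, per pair of lines (illustrative only):
+ *   ydst[x]   = src[2*x];            // every luma sample
+ *   udst[x/2] = src[4*(x/2) + 1];    // chroma taken from even lines only
+ *   vdst[x/2] = src[4*(x/2) + 3];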
+ */ +static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + int width, int height, + int lumStride, int chromStride, int srcStride) +{ + int y; + const x86_reg chromWidth= width>>1; + for (y=0; y<height; y+=2) { + __asm__ volatile( + "xor %%"REG_a", %%"REG_a" \n\t" + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... + ".p2align 4 \n\t" + "1: \n\t" + PREFETCH" 64(%0, %%"REG_a", 4) \n\t" + "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) + "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) + "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) + "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) + "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) + "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) + "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) + "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) + "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) + "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) + + MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t" + + "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8) + "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12) + "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) + "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) + "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) + "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) + "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) + "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) + "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) + "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) + + MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t" + + "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) + "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) + "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) + "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) + "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) + "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) + "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) + "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) + + MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" + MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" + + "add $8, %%"REG_a" \n\t" + "cmp %4, %%"REG_a" \n\t" + " jb 1b \n\t" + ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) + : "memory", "%"REG_a + ); + + ydst += lumStride; + src += srcStride; + + __asm__ volatile( + "xor %%"REG_a", %%"REG_a" \n\t" + ".p2align 4 \n\t" + "1: \n\t" + PREFETCH" 64(%0, %%"REG_a", 4) \n\t" + "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) + "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) + "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) + "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) + "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) + "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) + "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) + "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) + "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) + "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) + + MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t" + MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t" + + "add $8, %%"REG_a" \n\t" + "cmp %4, %%"REG_a" \n\t" + " jb 1b \n\t" + + ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) + : "memory", "%"REG_a + ); + udst += chromStride; + vdst += chromStride; + ydst += lumStride; + src += srcStride; + } + __asm__ volatile(EMMS" \n\t" + SFENCE" \n\t" + :::"memory"); +} +#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ + +#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW +static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride) +{ + int x,y; + + dst[0]= src[0]; + + // first line + for (x=0; x<srcWidth-1; x++) { + dst[2*x+1]= (3*src[x] + src[x+1])>>2; + dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; + } + 
dst[2*srcWidth-1]= src[srcWidth-1]; + + dst+= dstStride; + + for (y=1; y<srcHeight; y++) { + const x86_reg mmxSize= srcWidth&~15; + __asm__ volatile( + "mov %4, %%"REG_a" \n\t" + "movq "MANGLE(mmx_ff)", %%mm0 \n\t" + "movq (%0, %%"REG_a"), %%mm4 \n\t" + "movq %%mm4, %%mm2 \n\t" + "psllq $8, %%mm4 \n\t" + "pand %%mm0, %%mm2 \n\t" + "por %%mm2, %%mm4 \n\t" + "movq (%1, %%"REG_a"), %%mm5 \n\t" + "movq %%mm5, %%mm3 \n\t" + "psllq $8, %%mm5 \n\t" + "pand %%mm0, %%mm3 \n\t" + "por %%mm3, %%mm5 \n\t" + "1: \n\t" + "movq (%0, %%"REG_a"), %%mm0 \n\t" + "movq (%1, %%"REG_a"), %%mm1 \n\t" + "movq 1(%0, %%"REG_a"), %%mm2 \n\t" + "movq 1(%1, %%"REG_a"), %%mm3 \n\t" + PAVGB" %%mm0, %%mm5 \n\t" + PAVGB" %%mm0, %%mm3 \n\t" + PAVGB" %%mm0, %%mm5 \n\t" + PAVGB" %%mm0, %%mm3 \n\t" + PAVGB" %%mm1, %%mm4 \n\t" + PAVGB" %%mm1, %%mm2 \n\t" + PAVGB" %%mm1, %%mm4 \n\t" + PAVGB" %%mm1, %%mm2 \n\t" + "movq %%mm5, %%mm7 \n\t" + "movq %%mm4, %%mm6 \n\t" + "punpcklbw %%mm3, %%mm5 \n\t" + "punpckhbw %%mm3, %%mm7 \n\t" + "punpcklbw %%mm2, %%mm4 \n\t" + "punpckhbw %%mm2, %%mm6 \n\t" + MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t" + MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t" + MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t" + MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t" + "add $8, %%"REG_a" \n\t" + "movq -1(%0, %%"REG_a"), %%mm4 \n\t" + "movq -1(%1, %%"REG_a"), %%mm5 \n\t" + " js 1b \n\t" + :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), + "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), + "g" (-mmxSize) + : "%"REG_a + ); + + for (x=mmxSize-1; x<srcWidth-1; x++) { + dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2; + dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2; + dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2; + dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2; + } + dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2; + dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2; + + dst+=dstStride*2; + src+=srcStride; + } + + // last line + dst[0]= src[0]; + + for (x=0; x<srcWidth-1; x++) { + dst[2*x+1]= (3*src[x] + src[x+1])>>2; + dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; + } + dst[2*srcWidth-1]= src[srcWidth-1]; + + __asm__ volatile(EMMS" \n\t" + SFENCE" \n\t" + :::"memory"); +} +#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */ + +#if !COMPILE_TEMPLATE_AMD3DNOW +/** + * Height should be a multiple of 2 and width should be a multiple of 16. + * (If this is a problem for anyone then tell me, and I will fix it.) + * Chrominance data is only taken from every second line, others are ignored. + * FIXME: Write HQ version. + */ +static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + int width, int height, + int lumStride, int chromStride, int srcStride) +{ + int y; + const x86_reg chromWidth= width>>1; + for (y=0; y<height; y+=2) { + __asm__ volatile( + "xor %%"REG_a", %%"REG_a" \n\t" + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... 
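+        // Note: UYVY keeps chroma in the even bytes of each pair, so below the
+        // 0x00FF mask extracts U/V and the 8-bit shift extracts Y (the reverse
+        // of the YUY2 case above).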
+ ".p2align 4 \n\t" + "1: \n\t" + PREFETCH" 64(%0, %%"REG_a", 4) \n\t" + "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0) + "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4) + "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0) + "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4) + "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0) + "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4) + "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0) + "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4) + "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) + "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) + + MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t" + + "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8) + "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12) + "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8) + "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12) + "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8) + "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12) + "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8) + "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12) + "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) + "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) + + MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t" + + "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) + "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) + "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) + "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) + "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) + "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) + "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) + "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) + + MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" + MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" + + "add $8, %%"REG_a" \n\t" + "cmp %4, %%"REG_a" \n\t" + " jb 1b \n\t" + ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) + : "memory", "%"REG_a + ); + + ydst += lumStride; + src += srcStride; + + __asm__ volatile( + "xor %%"REG_a", %%"REG_a" \n\t" + ".p2align 4 \n\t" + "1: \n\t" + PREFETCH" 64(%0, %%"REG_a", 4) \n\t" + "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) + "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) + "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) + "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) + "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0) + "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4) + "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8) + "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12) + "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) + "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) + + MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t" + MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t" + + "add $8, %%"REG_a" \n\t" + "cmp %4, %%"REG_a" \n\t" + " jb 1b \n\t" + + ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) + : "memory", "%"REG_a + ); + udst += chromStride; + vdst += chromStride; + ydst += lumStride; + src += srcStride; + } + __asm__ volatile(EMMS" \n\t" + SFENCE" \n\t" + :::"memory"); +} +#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ + +/** + * Height should be a multiple of 2 and width should be a multiple of 2. + * (If this is a problem for anyone then tell me, and I will fix it.) + * Chrominance data is only taken from every second line, + * others are ignored in the C version. + * FIXME: Write HQ version. 
+ */ +static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + int width, int height, + int lumStride, int chromStride, int srcStride) +{ + int y; + const x86_reg chromWidth= width>>1; + for (y=0; y<height-2; y+=2) { + int i; + for (i=0; i<2; i++) { + __asm__ volatile( + "mov %2, %%"REG_a" \n\t" + "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t" + "movq "MANGLE(ff_w1111)", %%mm5 \n\t" + "pxor %%mm7, %%mm7 \n\t" + "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" + ".p2align 4 \n\t" + "1: \n\t" + PREFETCH" 64(%0, %%"REG_d") \n\t" + "movd (%0, %%"REG_d"), %%mm0 \n\t" + "movd 3(%0, %%"REG_d"), %%mm1 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "movd 6(%0, %%"REG_d"), %%mm2 \n\t" + "movd 9(%0, %%"REG_d"), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "pmaddwd %%mm6, %%mm0 \n\t" + "pmaddwd %%mm6, %%mm1 \n\t" + "pmaddwd %%mm6, %%mm2 \n\t" + "pmaddwd %%mm6, %%mm3 \n\t" +#ifndef FAST_BGR2YV12 + "psrad $8, %%mm0 \n\t" + "psrad $8, %%mm1 \n\t" + "psrad $8, %%mm2 \n\t" + "psrad $8, %%mm3 \n\t" +#endif + "packssdw %%mm1, %%mm0 \n\t" + "packssdw %%mm3, %%mm2 \n\t" + "pmaddwd %%mm5, %%mm0 \n\t" + "pmaddwd %%mm5, %%mm2 \n\t" + "packssdw %%mm2, %%mm0 \n\t" + "psraw $7, %%mm0 \n\t" + + "movd 12(%0, %%"REG_d"), %%mm4 \n\t" + "movd 15(%0, %%"REG_d"), %%mm1 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "movd 18(%0, %%"REG_d"), %%mm2 \n\t" + "movd 21(%0, %%"REG_d"), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "pmaddwd %%mm6, %%mm4 \n\t" + "pmaddwd %%mm6, %%mm1 \n\t" + "pmaddwd %%mm6, %%mm2 \n\t" + "pmaddwd %%mm6, %%mm3 \n\t" +#ifndef FAST_BGR2YV12 + "psrad $8, %%mm4 \n\t" + "psrad $8, %%mm1 \n\t" + "psrad $8, %%mm2 \n\t" + "psrad $8, %%mm3 \n\t" +#endif + "packssdw %%mm1, %%mm4 \n\t" + "packssdw %%mm3, %%mm2 \n\t" + "pmaddwd %%mm5, %%mm4 \n\t" + "pmaddwd %%mm5, %%mm2 \n\t" + "add $24, %%"REG_d" \n\t" + "packssdw %%mm2, %%mm4 \n\t" + "psraw $7, %%mm4 \n\t" + + "packuswb %%mm4, %%mm0 \n\t" + "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t" + + MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t" + "add $8, %%"REG_a" \n\t" + " js 1b \n\t" + : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width) + : "%"REG_a, "%"REG_d + ); + ydst += lumStride; + src += srcStride; + } + src -= srcStride*2; + __asm__ volatile( + "mov %4, %%"REG_a" \n\t" + "movq "MANGLE(ff_w1111)", %%mm5 \n\t" + "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t" + "pxor %%mm7, %%mm7 \n\t" + "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" + "add %%"REG_d", %%"REG_d" \n\t" + ".p2align 4 \n\t" + "1: \n\t" + PREFETCH" 64(%0, %%"REG_d") \n\t" + PREFETCH" 64(%1, %%"REG_d") \n\t" +#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW + "movq (%0, %%"REG_d"), %%mm0 \n\t" + "movq (%1, %%"REG_d"), %%mm1 \n\t" + "movq 6(%0, %%"REG_d"), %%mm2 \n\t" + "movq 6(%1, %%"REG_d"), %%mm3 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm3, %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "psrlq $24, %%mm0 \n\t" + "psrlq $24, %%mm2 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm3, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" +#else + "movd (%0, %%"REG_d"), %%mm0 \n\t" + "movd (%1, %%"REG_d"), %%mm1 \n\t" + "movd 3(%0, %%"REG_d"), %%mm2 \n\t" + "movd 3(%1, %%"REG_d"), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "paddw %%mm1, %%mm0 \n\t" + "paddw %%mm3, %%mm2 \n\t" + "paddw %%mm2, %%mm0 
\n\t" + "movd 6(%0, %%"REG_d"), %%mm4 \n\t" + "movd 6(%1, %%"REG_d"), %%mm1 \n\t" + "movd 9(%0, %%"REG_d"), %%mm2 \n\t" + "movd 9(%1, %%"REG_d"), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "paddw %%mm1, %%mm4 \n\t" + "paddw %%mm3, %%mm2 \n\t" + "paddw %%mm4, %%mm2 \n\t" + "psrlw $2, %%mm0 \n\t" + "psrlw $2, %%mm2 \n\t" +#endif + "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" + "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" + + "pmaddwd %%mm0, %%mm1 \n\t" + "pmaddwd %%mm2, %%mm3 \n\t" + "pmaddwd %%mm6, %%mm0 \n\t" + "pmaddwd %%mm6, %%mm2 \n\t" +#ifndef FAST_BGR2YV12 + "psrad $8, %%mm0 \n\t" + "psrad $8, %%mm1 \n\t" + "psrad $8, %%mm2 \n\t" + "psrad $8, %%mm3 \n\t" +#endif + "packssdw %%mm2, %%mm0 \n\t" + "packssdw %%mm3, %%mm1 \n\t" + "pmaddwd %%mm5, %%mm0 \n\t" + "pmaddwd %%mm5, %%mm1 \n\t" + "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 + "psraw $7, %%mm0 \n\t" + +#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW + "movq 12(%0, %%"REG_d"), %%mm4 \n\t" + "movq 12(%1, %%"REG_d"), %%mm1 \n\t" + "movq 18(%0, %%"REG_d"), %%mm2 \n\t" + "movq 18(%1, %%"REG_d"), %%mm3 \n\t" + PAVGB" %%mm1, %%mm4 \n\t" + PAVGB" %%mm3, %%mm2 \n\t" + "movq %%mm4, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "psrlq $24, %%mm4 \n\t" + "psrlq $24, %%mm2 \n\t" + PAVGB" %%mm1, %%mm4 \n\t" + PAVGB" %%mm3, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" +#else + "movd 12(%0, %%"REG_d"), %%mm4 \n\t" + "movd 12(%1, %%"REG_d"), %%mm1 \n\t" + "movd 15(%0, %%"REG_d"), %%mm2 \n\t" + "movd 15(%1, %%"REG_d"), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "paddw %%mm1, %%mm4 \n\t" + "paddw %%mm3, %%mm2 \n\t" + "paddw %%mm2, %%mm4 \n\t" + "movd 18(%0, %%"REG_d"), %%mm5 \n\t" + "movd 18(%1, %%"REG_d"), %%mm1 \n\t" + "movd 21(%0, %%"REG_d"), %%mm2 \n\t" + "movd 21(%1, %%"REG_d"), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "paddw %%mm1, %%mm5 \n\t" + "paddw %%mm3, %%mm2 \n\t" + "paddw %%mm5, %%mm2 \n\t" + "movq "MANGLE(ff_w1111)", %%mm5 \n\t" + "psrlw $2, %%mm4 \n\t" + "psrlw $2, %%mm2 \n\t" +#endif + "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" + "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" + + "pmaddwd %%mm4, %%mm1 \n\t" + "pmaddwd %%mm2, %%mm3 \n\t" + "pmaddwd %%mm6, %%mm4 \n\t" + "pmaddwd %%mm6, %%mm2 \n\t" +#ifndef FAST_BGR2YV12 + "psrad $8, %%mm4 \n\t" + "psrad $8, %%mm1 \n\t" + "psrad $8, %%mm2 \n\t" + "psrad $8, %%mm3 \n\t" +#endif + "packssdw %%mm2, %%mm4 \n\t" + "packssdw %%mm3, %%mm1 \n\t" + "pmaddwd %%mm5, %%mm4 \n\t" + "pmaddwd %%mm5, %%mm1 \n\t" + "add $24, %%"REG_d" \n\t" + "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 + "psraw $7, %%mm4 \n\t" + + "movq %%mm0, %%mm1 \n\t" + "punpckldq %%mm4, %%mm0 \n\t" + "punpckhdq %%mm4, %%mm1 \n\t" + "packsswb %%mm1, %%mm0 \n\t" + "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t" + "movd %%mm0, (%2, %%"REG_a") \n\t" + "punpckhdq %%mm0, %%mm0 \n\t" + "movd %%mm0, (%3, %%"REG_a") \n\t" + "add $4, %%"REG_a" \n\t" + " js 1b \n\t" + : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth) + : "%"REG_a, "%"REG_d + ); + + udst += chromStride; + vdst += chromStride; + src += srcStride*2; + } + + __asm__ volatile(EMMS" \n\t" + SFENCE" \n\t" + :::"memory"); + + rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, 
chromStride, srcStride); +} +#endif /* !COMPILE_TEMPLATE_SSE2 */ + +#if !COMPILE_TEMPLATE_AMD3DNOW +static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest, + int width, int height, int src1Stride, + int src2Stride, int dstStride) +{ + int h; + + for (h=0; h < height; h++) { + int w; + +#if COMPILE_TEMPLATE_SSE2 + __asm__( + "xor %%"REG_a", %%"REG_a" \n\t" + "1: \n\t" + PREFETCH" 64(%1, %%"REG_a") \n\t" + PREFETCH" 64(%2, %%"REG_a") \n\t" + "movdqa (%1, %%"REG_a"), %%xmm0 \n\t" + "movdqa (%1, %%"REG_a"), %%xmm1 \n\t" + "movdqa (%2, %%"REG_a"), %%xmm2 \n\t" + "punpcklbw %%xmm2, %%xmm0 \n\t" + "punpckhbw %%xmm2, %%xmm1 \n\t" + "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t" + "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t" + "add $16, %%"REG_a" \n\t" + "cmp %3, %%"REG_a" \n\t" + " jb 1b \n\t" + ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) + : "memory", "%"REG_a"" + ); +#else + __asm__( + "xor %%"REG_a", %%"REG_a" \n\t" + "1: \n\t" + PREFETCH" 64(%1, %%"REG_a") \n\t" + PREFETCH" 64(%2, %%"REG_a") \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq 8(%1, %%"REG_a"), %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq (%2, %%"REG_a"), %%mm4 \n\t" + "movq 8(%2, %%"REG_a"), %%mm5 \n\t" + "punpcklbw %%mm4, %%mm0 \n\t" + "punpckhbw %%mm4, %%mm1 \n\t" + "punpcklbw %%mm5, %%mm2 \n\t" + "punpckhbw %%mm5, %%mm3 \n\t" + MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t" + MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t" + MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t" + MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t" + "add $16, %%"REG_a" \n\t" + "cmp %3, %%"REG_a" \n\t" + " jb 1b \n\t" + ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) + : "memory", "%"REG_a + ); +#endif + for (w= (width&(~15)); w < width; w++) { + dest[2*w+0] = src1[w]; + dest[2*w+1] = src2[w]; + } + dest += dstStride; + src1 += src1Stride; + src2 += src2Stride; + } + __asm__( + EMMS" \n\t" + SFENCE" \n\t" + ::: "memory" + ); +} +#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ + +#if !COMPILE_TEMPLATE_SSE2 +#if !COMPILE_TEMPLATE_AMD3DNOW +static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, + uint8_t *dst1, uint8_t *dst2, + int width, int height, + int srcStride1, int srcStride2, + int dstStride1, int dstStride2) +{ + x86_reg x, y; + int w,h; + w=width/2; h=height/2; + __asm__ volatile( + PREFETCH" %0 \n\t" + PREFETCH" %1 \n\t" + ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory"); + for (y=0;y<h;y++) { + const uint8_t* s1=src1+srcStride1*(y>>1); + uint8_t* d=dst1+dstStride1*y; + x=0; + for (;x<w-31;x+=32) { + __asm__ volatile( + PREFETCH" 32(%1,%2) \n\t" + "movq (%1,%2), %%mm0 \n\t" + "movq 8(%1,%2), %%mm2 \n\t" + "movq 16(%1,%2), %%mm4 \n\t" + "movq 24(%1,%2), %%mm6 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq %%mm4, %%mm5 \n\t" + "movq %%mm6, %%mm7 \n\t" + "punpcklbw %%mm0, %%mm0 \n\t" + "punpckhbw %%mm1, %%mm1 \n\t" + "punpcklbw %%mm2, %%mm2 \n\t" + "punpckhbw %%mm3, %%mm3 \n\t" + "punpcklbw %%mm4, %%mm4 \n\t" + "punpckhbw %%mm5, %%mm5 \n\t" + "punpcklbw %%mm6, %%mm6 \n\t" + "punpckhbw %%mm7, %%mm7 \n\t" + MOVNTQ" %%mm0, (%0,%2,2) \n\t" + MOVNTQ" %%mm1, 8(%0,%2,2) \n\t" + MOVNTQ" %%mm2, 16(%0,%2,2) \n\t" + MOVNTQ" %%mm3, 24(%0,%2,2) \n\t" + MOVNTQ" %%mm4, 32(%0,%2,2) \n\t" + MOVNTQ" %%mm5, 40(%0,%2,2) \n\t" + MOVNTQ" %%mm6, 48(%0,%2,2) \n\t" + MOVNTQ" %%mm7, 56(%0,%2,2)" + :: "r"(d), "r"(s1), "r"(x) + :"memory"); + } + for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x]; + } + for (y=0;y<h;y++) { + const uint8_t* s2=src2+srcStride2*(y>>1); + uint8_t* 
d=dst2+dstStride2*y; + x=0; + for (;x<w-31;x+=32) { + __asm__ volatile( + PREFETCH" 32(%1,%2) \n\t" + "movq (%1,%2), %%mm0 \n\t" + "movq 8(%1,%2), %%mm2 \n\t" + "movq 16(%1,%2), %%mm4 \n\t" + "movq 24(%1,%2), %%mm6 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq %%mm4, %%mm5 \n\t" + "movq %%mm6, %%mm7 \n\t" + "punpcklbw %%mm0, %%mm0 \n\t" + "punpckhbw %%mm1, %%mm1 \n\t" + "punpcklbw %%mm2, %%mm2 \n\t" + "punpckhbw %%mm3, %%mm3 \n\t" + "punpcklbw %%mm4, %%mm4 \n\t" + "punpckhbw %%mm5, %%mm5 \n\t" + "punpcklbw %%mm6, %%mm6 \n\t" + "punpckhbw %%mm7, %%mm7 \n\t" + MOVNTQ" %%mm0, (%0,%2,2) \n\t" + MOVNTQ" %%mm1, 8(%0,%2,2) \n\t" + MOVNTQ" %%mm2, 16(%0,%2,2) \n\t" + MOVNTQ" %%mm3, 24(%0,%2,2) \n\t" + MOVNTQ" %%mm4, 32(%0,%2,2) \n\t" + MOVNTQ" %%mm5, 40(%0,%2,2) \n\t" + MOVNTQ" %%mm6, 48(%0,%2,2) \n\t" + MOVNTQ" %%mm7, 56(%0,%2,2)" + :: "r"(d), "r"(s2), "r"(x) + :"memory"); + } + for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x]; + } + __asm__( + EMMS" \n\t" + SFENCE" \n\t" + ::: "memory" + ); +} + +static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, + uint8_t *dst, + int width, int height, + int srcStride1, int srcStride2, + int srcStride3, int dstStride) +{ + x86_reg x; + int y,w,h; + w=width/2; h=height; + for (y=0;y<h;y++) { + const uint8_t* yp=src1+srcStride1*y; + const uint8_t* up=src2+srcStride2*(y>>2); + const uint8_t* vp=src3+srcStride3*(y>>2); + uint8_t* d=dst+dstStride*y; + x=0; + for (;x<w-7;x+=8) { + __asm__ volatile( + PREFETCH" 32(%1, %0) \n\t" + PREFETCH" 32(%2, %0) \n\t" + PREFETCH" 32(%3, %0) \n\t" + "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ + "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */ + "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */ + "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ + "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */ + "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */ + "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */ + "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */ + "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */ + "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */ + + "movq %%mm1, %%mm6 \n\t" + "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/ + "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/ + "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/ + MOVNTQ" %%mm0, (%4, %0, 8) \n\t" + MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t" + + "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/ + "movq 8(%1, %0, 4), %%mm0 \n\t" + "movq %%mm0, %%mm3 \n\t" + "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/ + "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/ + MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t" + MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t" + + "movq %%mm4, %%mm6 \n\t" + "movq 16(%1, %0, 4), %%mm0 \n\t" + "movq %%mm0, %%mm3 \n\t" + "punpcklbw %%mm5, %%mm4 \n\t" + "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/ + "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/ + MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t" + MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t" + + "punpckhbw %%mm5, %%mm6 \n\t" + "movq 24(%1, %0, 4), %%mm0 \n\t" + "movq %%mm0, %%mm3 \n\t" + "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/ + "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/ + MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t" + MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t" + + : "+r" (x) + : "r"(yp), "r" (up), "r"(vp), "r"(d) + :"memory"); + } + for (; x<w; x++) { + const int x2 = x<<2; + d[8*x+0] = yp[x2]; + d[8*x+1] = up[x]; + d[8*x+2] = yp[x2+1]; + d[8*x+3] = vp[x]; + d[8*x+4] = yp[x2+2]; + d[8*x+5] = up[x]; 
+ d[8*x+6] = yp[x2+3]; + d[8*x+7] = vp[x]; + } + } + __asm__( + EMMS" \n\t" + SFENCE" \n\t" + ::: "memory" + ); +} +#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ + +static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count) +{ + dst += count; + src += 2*count; + count= - count; + + if(count <= -16) { + count += 15; + __asm__ volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $8, %%mm7 \n\t" + "1: \n\t" + "movq -30(%1, %0, 2), %%mm0 \n\t" + "movq -22(%1, %0, 2), %%mm1 \n\t" + "movq -14(%1, %0, 2), %%mm2 \n\t" + "movq -6(%1, %0, 2), %%mm3 \n\t" + "pand %%mm7, %%mm0 \n\t" + "pand %%mm7, %%mm1 \n\t" + "pand %%mm7, %%mm2 \n\t" + "pand %%mm7, %%mm3 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + MOVNTQ" %%mm0,-15(%2, %0) \n\t" + MOVNTQ" %%mm2,- 7(%2, %0) \n\t" + "add $16, %0 \n\t" + " js 1b \n\t" + : "+r"(count) + : "r"(src), "r"(dst) + ); + count -= 15; + } + while(count<0) { + dst[count]= src[2*count]; + count++; + } +} + +#if !COMPILE_TEMPLATE_AMD3DNOW +static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count) +{ + dst0+= count; + dst1+= count; + src += 4*count; + count= - count; + if(count <= -8) { + count += 7; + __asm__ volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $8, %%mm7 \n\t" + "1: \n\t" + "movq -28(%1, %0, 4), %%mm0 \n\t" + "movq -20(%1, %0, 4), %%mm1 \n\t" + "movq -12(%1, %0, 4), %%mm2 \n\t" + "movq -4(%1, %0, 4), %%mm3 \n\t" + "pand %%mm7, %%mm0 \n\t" + "pand %%mm7, %%mm1 \n\t" + "pand %%mm7, %%mm2 \n\t" + "pand %%mm7, %%mm3 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "psrlw $8, %%mm0 \n\t" + "psrlw $8, %%mm2 \n\t" + "pand %%mm7, %%mm1 \n\t" + "pand %%mm7, %%mm3 \n\t" + "packuswb %%mm2, %%mm0 \n\t" + "packuswb %%mm3, %%mm1 \n\t" + MOVNTQ" %%mm0,- 7(%3, %0) \n\t" + MOVNTQ" %%mm1,- 7(%2, %0) \n\t" + "add $8, %0 \n\t" + " js 1b \n\t" + : "+r"(count) + : "r"(src), "r"(dst0), "r"(dst1) + ); + count -= 7; + } + while(count<0) { + dst0[count]= src[4*count+0]; + dst1[count]= src[4*count+2]; + count++; + } +} +#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ + +static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count) +{ + dst0 += count; + dst1 += count; + src0 += 4*count; + src1 += 4*count; + count= - count; +#ifdef PAVGB + if(count <= -8) { + count += 7; + __asm__ volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $8, %%mm7 \n\t" + "1: \n\t" + "movq -28(%1, %0, 4), %%mm0 \n\t" + "movq -20(%1, %0, 4), %%mm1 \n\t" + "movq -12(%1, %0, 4), %%mm2 \n\t" + "movq -4(%1, %0, 4), %%mm3 \n\t" + PAVGB" -28(%2, %0, 4), %%mm0 \n\t" + PAVGB" -20(%2, %0, 4), %%mm1 \n\t" + PAVGB" -12(%2, %0, 4), %%mm2 \n\t" + PAVGB" - 4(%2, %0, 4), %%mm3 \n\t" + "pand %%mm7, %%mm0 \n\t" + "pand %%mm7, %%mm1 \n\t" + "pand %%mm7, %%mm2 \n\t" + "pand %%mm7, %%mm3 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "psrlw $8, %%mm0 \n\t" + "psrlw $8, %%mm2 \n\t" + "pand %%mm7, %%mm1 \n\t" + "pand %%mm7, %%mm3 \n\t" + "packuswb %%mm2, %%mm0 \n\t" + "packuswb %%mm3, %%mm1 \n\t" + MOVNTQ" %%mm0,- 7(%4, %0) \n\t" + MOVNTQ" %%mm1,- 7(%3, %0) \n\t" + "add $8, %0 \n\t" + " js 1b \n\t" + : "+r"(count) + : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1) + ); + count -= 7; + } +#endif + while(count<0) { + dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1; + dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1; + count++; + } +} + +#if !COMPILE_TEMPLATE_AMD3DNOW 
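+/*
+ * Rough scalar equivalent of extract_odd2() (illustrative): de-interleave the
+ * chroma bytes of a packed YUYV line into separate planes, i.e.
+ *   dst0[i] = src[4*i + 1];   // U
+ *   dst1[i] = src[4*i + 3];   // V
+ */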
+static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count) +{ + dst0+= count; + dst1+= count; + src += 4*count; + count= - count; + if(count <= -8) { + count += 7; + __asm__ volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $8, %%mm7 \n\t" + "1: \n\t" + "movq -28(%1, %0, 4), %%mm0 \n\t" + "movq -20(%1, %0, 4), %%mm1 \n\t" + "movq -12(%1, %0, 4), %%mm2 \n\t" + "movq -4(%1, %0, 4), %%mm3 \n\t" + "psrlw $8, %%mm0 \n\t" + "psrlw $8, %%mm1 \n\t" + "psrlw $8, %%mm2 \n\t" + "psrlw $8, %%mm3 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "psrlw $8, %%mm0 \n\t" + "psrlw $8, %%mm2 \n\t" + "pand %%mm7, %%mm1 \n\t" + "pand %%mm7, %%mm3 \n\t" + "packuswb %%mm2, %%mm0 \n\t" + "packuswb %%mm3, %%mm1 \n\t" + MOVNTQ" %%mm0,- 7(%3, %0) \n\t" + MOVNTQ" %%mm1,- 7(%2, %0) \n\t" + "add $8, %0 \n\t" + " js 1b \n\t" + : "+r"(count) + : "r"(src), "r"(dst0), "r"(dst1) + ); + count -= 7; + } + src++; + while(count<0) { + dst0[count]= src[4*count+0]; + dst1[count]= src[4*count+2]; + count++; + } +} +#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ + +static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count) +{ + dst0 += count; + dst1 += count; + src0 += 4*count; + src1 += 4*count; + count= - count; +#ifdef PAVGB + if(count <= -8) { + count += 7; + __asm__ volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $8, %%mm7 \n\t" + "1: \n\t" + "movq -28(%1, %0, 4), %%mm0 \n\t" + "movq -20(%1, %0, 4), %%mm1 \n\t" + "movq -12(%1, %0, 4), %%mm2 \n\t" + "movq -4(%1, %0, 4), %%mm3 \n\t" + PAVGB" -28(%2, %0, 4), %%mm0 \n\t" + PAVGB" -20(%2, %0, 4), %%mm1 \n\t" + PAVGB" -12(%2, %0, 4), %%mm2 \n\t" + PAVGB" - 4(%2, %0, 4), %%mm3 \n\t" + "psrlw $8, %%mm0 \n\t" + "psrlw $8, %%mm1 \n\t" + "psrlw $8, %%mm2 \n\t" + "psrlw $8, %%mm3 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "psrlw $8, %%mm0 \n\t" + "psrlw $8, %%mm2 \n\t" + "pand %%mm7, %%mm1 \n\t" + "pand %%mm7, %%mm3 \n\t" + "packuswb %%mm2, %%mm0 \n\t" + "packuswb %%mm3, %%mm1 \n\t" + MOVNTQ" %%mm0,- 7(%4, %0) \n\t" + MOVNTQ" %%mm1,- 7(%3, %0) \n\t" + "add $8, %0 \n\t" + " js 1b \n\t" + : "+r"(count) + : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1) + ); + count -= 7; + } +#endif + src0++; + src1++; + while(count<0) { + dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1; + dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1; + count++; + } +} + +static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, + int width, int height, + int lumStride, int chromStride, int srcStride) +{ + int y; + const int chromWidth= -((-width)>>1); + + for (y=0; y<height; y++) { + RENAME(extract_even)(src, ydst, width); + if(y&1) { + RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth); + udst+= chromStride; + vdst+= chromStride; + } + + src += srcStride; + ydst+= lumStride; + } + __asm__( + EMMS" \n\t" + SFENCE" \n\t" + ::: "memory" + ); +} + +#if !COMPILE_TEMPLATE_AMD3DNOW +static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, + int width, int height, + int lumStride, int chromStride, int srcStride) +{ + int y; + const int chromWidth= -((-width)>>1); + + for (y=0; y<height; y++) { + RENAME(extract_even)(src, ydst, width); + RENAME(extract_odd2)(src, udst, vdst, chromWidth); + + src += srcStride; + ydst+= lumStride; + udst+= chromStride; + vdst+= chromStride; + } + 
__asm__( + EMMS" \n\t" + SFENCE" \n\t" + ::: "memory" + ); +} +#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ + +static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, + int width, int height, + int lumStride, int chromStride, int srcStride) +{ + int y; + const int chromWidth= -((-width)>>1); + + for (y=0; y<height; y++) { + RENAME(extract_even)(src+1, ydst, width); + if(y&1) { + RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth); + udst+= chromStride; + vdst+= chromStride; + } + + src += srcStride; + ydst+= lumStride; + } + __asm__( + EMMS" \n\t" + SFENCE" \n\t" + ::: "memory" + ); +} + +#if !COMPILE_TEMPLATE_AMD3DNOW +static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, + int width, int height, + int lumStride, int chromStride, int srcStride) +{ + int y; + const int chromWidth= -((-width)>>1); + + for (y=0; y<height; y++) { + RENAME(extract_even)(src+1, ydst, width); + RENAME(extract_even2)(src, udst, vdst, chromWidth); + + src += srcStride; + ydst+= lumStride; + udst+= chromStride; + vdst+= chromStride; + } + __asm__( + EMMS" \n\t" + SFENCE" \n\t" + ::: "memory" + ); +} +#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ +#endif /* !COMPILE_TEMPLATE_SSE2 */ + +static inline void RENAME(rgb2rgb_init)(void) +{ +#if !COMPILE_TEMPLATE_SSE2 +#if !COMPILE_TEMPLATE_AMD3DNOW + rgb15to16 = RENAME(rgb15to16); + rgb15tobgr24 = RENAME(rgb15tobgr24); + rgb15to32 = RENAME(rgb15to32); + rgb16tobgr24 = RENAME(rgb16tobgr24); + rgb16to32 = RENAME(rgb16to32); + rgb16to15 = RENAME(rgb16to15); + rgb24tobgr16 = RENAME(rgb24tobgr16); + rgb24tobgr15 = RENAME(rgb24tobgr15); + rgb24tobgr32 = RENAME(rgb24tobgr32); + rgb32to16 = RENAME(rgb32to16); + rgb32to15 = RENAME(rgb32to15); + rgb32tobgr24 = RENAME(rgb32tobgr24); + rgb24to15 = RENAME(rgb24to15); + rgb24to16 = RENAME(rgb24to16); + rgb24tobgr24 = RENAME(rgb24tobgr24); + shuffle_bytes_2103 = RENAME(shuffle_bytes_2103); + rgb32tobgr16 = RENAME(rgb32tobgr16); + rgb32tobgr15 = RENAME(rgb32tobgr15); + yv12toyuy2 = RENAME(yv12toyuy2); + yv12touyvy = RENAME(yv12touyvy); + yuv422ptoyuy2 = RENAME(yuv422ptoyuy2); + yuv422ptouyvy = RENAME(yuv422ptouyvy); + yuy2toyv12 = RENAME(yuy2toyv12); + vu9_to_vu12 = RENAME(vu9_to_vu12); + yvu9_to_yuy2 = RENAME(yvu9_to_yuy2); + uyvytoyuv422 = RENAME(uyvytoyuv422); + yuyvtoyuv422 = RENAME(yuyvtoyuv422); +#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ + +#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW + planar2x = RENAME(planar2x); +#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */ + rgb24toyv12 = RENAME(rgb24toyv12); + + yuyvtoyuv420 = RENAME(yuyvtoyuv420); + uyvytoyuv420 = RENAME(uyvytoyuv420); +#endif /* !COMPILE_TEMPLATE_SSE2 */ + +#if !COMPILE_TEMPLATE_AMD3DNOW + interleaveBytes = RENAME(interleaveBytes); +#endif /* !COMPILE_TEMPLATE_AMD3DNOW */ +} diff --git a/ffmpeg1/libswscale/x86/scale.asm b/ffmpeg1/libswscale/x86/scale.asm new file mode 100644 index 0000000..c6dafde --- /dev/null +++ b/ffmpeg1/libswscale/x86/scale.asm @@ -0,0 +1,431 @@ +;****************************************************************************** +;* x86-optimized horizontal line scaling functions +;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. 
+;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +max_19bit_int: times 4 dd 0x7ffff +max_19bit_flt: times 4 dd 524287.0 +minshort: times 8 dw 0x8000 +unicoeff: times 4 dd 0x20000000 + +SECTION .text + +;----------------------------------------------------------------------------- +; horizontal line scaling +; +; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt> +; (SwsContext *c, int{16,32}_t *dst, +; int dstW, const uint{8,16}_t *src, +; const int16_t *filter, +; const int32_t *filterPos, int filterSize); +; +; Scale one horizontal line. Input is either 8-bits width or 16-bits width +; ($source_width can be either 8, 9, 10 or 16, difference is whether we have to +; downscale before multiplying). Filter is 14-bits. Output is either 15bits +; (in int16_t) or 19bits (in int32_t), as given in $intermediate_nbits. Each +; output pixel is generated from $filterSize input pixels, the position of +; the first pixel is given in filterPos[nOutputPixel]. +;----------------------------------------------------------------------------- + +; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm +%macro SCALE_FUNC 6 +%ifnidn %3, X +cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1 +%else +cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize +%endif +%if ARCH_X86_64 + movsxd wq, wd +%define mov32 movsxd +%else ; x86-32 +%define mov32 mov +%endif ; x86-64 +%if %2 == 19 +%if mmsize == 8 ; mmx + mova m2, [max_19bit_int] +%elif cpuflag(sse4) + mova m2, [max_19bit_int] +%else ; ssse3/sse2 + mova m2, [max_19bit_flt] +%endif ; mmx/sse2/ssse3/sse4 +%endif ; %2 == 19 +%if %1 == 16 + mova m6, [minshort] + mova m7, [unicoeff] +%elif %1 == 8 + pxor m3, m3 +%endif ; %1 == 8/16 + +%if %1 == 8 +%define movlh movd +%define movbh movh +%define srcmul 1 +%else ; %1 == 9-16 +%define movlh movq +%define movbh movu +%define srcmul 2 +%endif ; %1 == 8/9-16 + +%ifnidn %3, X + + ; setup loop +%if %3 == 8 + shl wq, 1 ; this allows *16 (i.e. 
now *8) in lea instructions for the 8-tap filter +%define wshr 1 +%else ; %3 == 4 +%define wshr 0 +%endif ; %3 == 8 + lea filterq, [filterq+wq*8] +%if %2 == 15 + lea dstq, [dstq+wq*(2>>wshr)] +%else ; %2 == 19 + lea dstq, [dstq+wq*(4>>wshr)] +%endif ; %2 == 15/19 + lea fltposq, [fltposq+wq*(4>>wshr)] + neg wq + +.loop: +%if %3 == 4 ; filterSize == 4 scaling + ; load 2x4 or 4x4 source pixels into m0/m1 + mov32 pos0q, dword [fltposq+wq*4+ 0] ; filterPos[0] + mov32 pos1q, dword [fltposq+wq*4+ 4] ; filterPos[1] + movlh m0, [srcq+pos0q*srcmul] ; src[filterPos[0] + {0,1,2,3}] +%if mmsize == 8 + movlh m1, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}] +%else ; mmsize == 16 +%if %1 > 8 + movhps m0, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}] +%else ; %1 == 8 + movd m4, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}] +%endif + mov32 pos0q, dword [fltposq+wq*4+ 8] ; filterPos[2] + mov32 pos1q, dword [fltposq+wq*4+12] ; filterPos[3] + movlh m1, [srcq+pos0q*srcmul] ; src[filterPos[2] + {0,1,2,3}] +%if %1 > 8 + movhps m1, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}] +%else ; %1 == 8 + movd m5, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}] + punpckldq m0, m4 + punpckldq m1, m5 +%endif ; %1 == 8 +%endif ; mmsize == 8/16 +%if %1 == 8 + punpcklbw m0, m3 ; byte -> word + punpcklbw m1, m3 ; byte -> word +%endif ; %1 == 8 + + ; multiply with filter coefficients +%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll + ; add back 0x8000 * sum(coeffs) after the horizontal add + psubw m0, m6 + psubw m1, m6 +%endif ; %1 == 16 + pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}] + pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}] + + ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix) +%if mmsize == 8 ; mmx + movq m4, m0 + punpckldq m0, m1 + punpckhdq m4, m1 + paddd m0, m4 +%elif notcpuflag(ssse3) ; sse2 + mova m4, m0 + shufps m0, m1, 10001000b + shufps m4, m1, 11011101b + paddd m0, m4 +%else ; ssse3/sse4 + phaddd m0, m1 ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}], + ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}], + ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}], + ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}] +%endif ; mmx/sse2/ssse3/sse4 +%else ; %3 == 8, i.e. 
filterSize == 8 scaling + ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5 + mov32 pos0q, dword [fltposq+wq*2+0] ; filterPos[0] + mov32 pos1q, dword [fltposq+wq*2+4] ; filterPos[1] + movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}] +%if mmsize == 8 + movbh m1, [srcq+(pos0q+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}] + movbh m4, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3}] + movbh m5, [srcq+(pos1q+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}] +%else ; mmsize == 16 + movbh m1, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}] + mov32 pos0q, dword [fltposq+wq*2+8] ; filterPos[2] + mov32 pos1q, dword [fltposq+wq*2+12] ; filterPos[3] + movbh m4, [srcq+ pos0q *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}] + movbh m5, [srcq+ pos1q *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}] +%endif ; mmsize == 8/16 +%if %1 == 8 + punpcklbw m0, m3 ; byte -> word + punpcklbw m1, m3 ; byte -> word + punpcklbw m4, m3 ; byte -> word + punpcklbw m5, m3 ; byte -> word +%endif ; %1 == 8 + + ; multiply +%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll + ; add back 0x8000 * sum(coeffs) after the horizontal add + psubw m0, m6 + psubw m1, m6 + psubw m4, m6 + psubw m5, m6 +%endif ; %1 == 16 + pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}] + pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}] + pmaddwd m4, [filterq+wq*8+mmsize*2] ; *= filter[{16,17,..,22,23}] + pmaddwd m5, [filterq+wq*8+mmsize*3] ; *= filter[{24,25,..,30,31}] + + ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix) +%if mmsize == 8 + paddd m0, m1 + paddd m4, m5 + movq m1, m0 + punpckldq m0, m4 + punpckhdq m1, m4 + paddd m0, m1 +%elif notcpuflag(ssse3) ; sse2 +%if %1 == 8 +%define mex m6 +%else +%define mex m3 +%endif + ; emulate horizontal add as transpose + vertical add + mova mex, m0 + punpckldq m0, m1 + punpckhdq mex, m1 + paddd m0, mex + mova m1, m4 + punpckldq m4, m5 + punpckhdq m1, m5 + paddd m4, m1 + mova m1, m0 + punpcklqdq m0, m4 + punpckhqdq m1, m4 + paddd m0, m1 +%else ; ssse3/sse4 + ; FIXME if we rearrange the filter in pairs of 4, we can + ; load pixels likewise and use 2 x paddd + phaddd instead + ; of 3 x phaddd here, faster on older cpus + phaddd m0, m1 + phaddd m4, m5 + phaddd m0, m4 ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}], + ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}], + ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}], + ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}] +%endif ; mmx/sse2/ssse3/sse4 +%endif ; %3 == 4/8 + +%else ; %3 == X, i.e. any filterSize scaling + +%ifidn %4, X4 +%define dlt 4 +%else ; %4 == X || %4 == X8 +%define dlt 0 +%endif ; %4 ==/!= X4 +%if ARCH_X86_64 +%define srcq r8 +%define pos1q r7 +%define srcendq r9 + movsxd fltsizeq, fltsized ; filterSize + lea srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4] +%else ; x86-32 +%define srcq srcmemq +%define pos1q dstq +%define srcendq r6m + lea pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4] + mov srcendq, pos0q +%endif ; x86-32/64 + lea fltposq, [fltposq+wq*4] +%if %2 == 15 + lea dstq, [dstq+wq*2] +%else ; %2 == 19 + lea dstq, [dstq+wq*4] +%endif ; %2 == 15/19 + movifnidn dstmp, dstq + neg wq + +.loop: + mov32 pos0q, dword [fltposq+wq*4+0] ; filterPos[0] + mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1] + ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)? 
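+    ; In C terms, each output pixel produced below is roughly (reference
+    ; sketch for the 8-bit input / 15-bit output case only):
+    ;     int sum = 0;
+    ;     for (j = 0; j < filterSize; j++)
+    ;         sum += src[filterPos[i] + j] * filter[i * filterSize + j];
+    ;     dst[i] = av_clip_int16(sum >> 7);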
+ pxor m4, m4 + pxor m5, m5 + mov srcq, srcmemmp + +.innerloop: + ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5 + movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}] + movbh m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}] +%if %1 == 8 + punpcklbw m0, m3 + punpcklbw m1, m3 +%endif ; %1 == 8 + + ; multiply +%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll + ; add back 0x8000 * sum(coeffs) after the horizontal add + psubw m0, m6 + psubw m1, m6 +%endif ; %1 == 16 + pmaddwd m0, [filterq] ; filter[{0,1,2,3(,4,5,6,7)}] + pmaddwd m1, [filterq+(fltsizeq+dlt)*2]; filter[filtersize+{0,1,2,3(,4,5,6,7)}] + paddd m4, m0 + paddd m5, m1 + add filterq, mmsize + add srcq, srcmul*mmsize/2 + cmp srcq, srcendq ; while (src += 4) < &src[filterSize] + jl .innerloop + +%ifidn %4, X4 + mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1] + movlh m0, [srcq+ pos0q *srcmul] ; split last 4 srcpx of dstpx[0] + sub pos1q, fltsizeq ; and first 4 srcpx of dstpx[1] +%if %1 > 8 + movhps m0, [srcq+(pos1q+dlt)*srcmul] +%else ; %1 == 8 + movd m1, [srcq+(pos1q+dlt)*srcmul] + punpckldq m0, m1 +%endif ; %1 == 8 +%if %1 == 8 + punpcklbw m0, m3 +%endif ; %1 == 8 +%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll + ; add back 0x8000 * sum(coeffs) after the horizontal add + psubw m0, m6 +%endif ; %1 == 16 + pmaddwd m0, [filterq] +%endif ; %4 == X4 + + lea filterq, [filterq+(fltsizeq+dlt)*2] + +%if mmsize == 8 ; mmx + movq m0, m4 + punpckldq m4, m5 + punpckhdq m0, m5 + paddd m0, m4 +%else ; mmsize == 16 +%if notcpuflag(ssse3) ; sse2 + mova m1, m4 + punpcklqdq m4, m5 + punpckhqdq m1, m5 + paddd m4, m1 +%else ; ssse3/sse4 + phaddd m4, m5 +%endif ; sse2/ssse3/sse4 +%ifidn %4, X4 + paddd m4, m0 +%endif ; %3 == X4 +%if notcpuflag(ssse3) ; sse2 + pshufd m4, m4, 11011000b + movhlps m0, m4 + paddd m0, m4 +%else ; ssse3/sse4 + phaddd m4, m4 + SWAP 0, 4 +%endif ; sse2/ssse3/sse4 +%endif ; mmsize == 8/16 +%endif ; %3 ==/!= X + +%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned + paddd m0, m7 +%endif ; %1 == 16 + + ; clip, store + psrad m0, 14 + %1 - %2 +%ifidn %3, X + movifnidn dstq, dstmp +%endif ; %3 == X +%if %2 == 15 + packssdw m0, m0 +%ifnidn %3, X + movh [dstq+wq*(2>>wshr)], m0 +%else ; %3 == X + movd [dstq+wq*2], m0 +%endif ; %3 ==/!= X +%else ; %2 == 19 +%if mmsize == 8 + PMINSD_MMX m0, m2, m4 +%elif cpuflag(sse4) + pminsd m0, m2 +%else ; sse2/ssse3 + cvtdq2ps m0, m0 + minps m0, m2 + cvtps2dq m0, m0 +%endif ; mmx/sse2/ssse3/sse4 +%ifnidn %3, X + mova [dstq+wq*(4>>wshr)], m0 +%else ; %3 == X + movq [dstq+wq*4], m0 +%endif ; %3 ==/!= X +%endif ; %2 == 15/19 +%ifnidn %3, X + add wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels) + ; per iteration. 
see "shl wq,1" above as for why we do this +%else ; %3 == X + add wq, 2 +%endif ; %3 ==/!= X + jl .loop + REP_RET +%endmacro + +; SCALE_FUNCS source_width, intermediate_nbits, n_xmm +%macro SCALE_FUNCS 3 +SCALE_FUNC %1, %2, 4, 4, 6, %3 +SCALE_FUNC %1, %2, 8, 8, 6, %3 +%if mmsize == 8 +SCALE_FUNC %1, %2, X, X, 7, %3 +%else +SCALE_FUNC %1, %2, X, X4, 7, %3 +SCALE_FUNC %1, %2, X, X8, 7, %3 +%endif +%endmacro + +; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args +%macro SCALE_FUNCS2 3 +%if notcpuflag(sse4) +SCALE_FUNCS 8, 15, %1 +SCALE_FUNCS 9, 15, %2 +SCALE_FUNCS 10, 15, %2 +SCALE_FUNCS 12, 15, %2 +SCALE_FUNCS 14, 15, %2 +SCALE_FUNCS 16, 15, %3 +%endif ; !sse4 +SCALE_FUNCS 8, 19, %1 +SCALE_FUNCS 9, 19, %2 +SCALE_FUNCS 10, 19, %2 +SCALE_FUNCS 12, 19, %2 +SCALE_FUNCS 14, 19, %2 +SCALE_FUNCS 16, 19, %3 +%endmacro + +%if ARCH_X86_32 +INIT_MMX mmx +SCALE_FUNCS2 0, 0, 0 +%endif +INIT_XMM sse2 +SCALE_FUNCS2 6, 7, 8 +INIT_XMM ssse3 +SCALE_FUNCS2 6, 6, 8 +INIT_XMM sse4 +SCALE_FUNCS2 6, 6, 8 diff --git a/ffmpeg1/libswscale/x86/swscale.c b/ffmpeg1/libswscale/x86/swscale.c new file mode 100644 index 0000000..2f67b1b --- /dev/null +++ b/ffmpeg1/libswscale/x86/swscale.c @@ -0,0 +1,585 @@ +/* + * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <inttypes.h> +#include "config.h" +#include "libswscale/swscale.h" +#include "libswscale/swscale_internal.h" +#include "libavutil/attributes.h" +#include "libavutil/avassert.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavutil/cpu.h" +#include "libavutil/pixdesc.h" + +#if HAVE_INLINE_ASM + +#define DITHER1XBPP + +DECLARE_ASM_CONST(8, uint64_t, bF8)= 0xF8F8F8F8F8F8F8F8LL; +DECLARE_ASM_CONST(8, uint64_t, bFC)= 0xFCFCFCFCFCFCFCFCLL; +DECLARE_ASM_CONST(8, uint64_t, w10)= 0x0010001000100010LL; +DECLARE_ASM_CONST(8, uint64_t, w02)= 0x0002000200020002LL; + +const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = { + 0x0103010301030103LL, + 0x0200020002000200LL,}; + +const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = { + 0x0602060206020602LL, + 0x0004000400040004LL,}; + +DECLARE_ASM_CONST(8, uint64_t, b16Mask)= 0x001F001F001F001FLL; +DECLARE_ASM_CONST(8, uint64_t, g16Mask)= 0x07E007E007E007E0LL; +DECLARE_ASM_CONST(8, uint64_t, r16Mask)= 0xF800F800F800F800LL; +DECLARE_ASM_CONST(8, uint64_t, b15Mask)= 0x001F001F001F001FLL; +DECLARE_ASM_CONST(8, uint64_t, g15Mask)= 0x03E003E003E003E0LL; +DECLARE_ASM_CONST(8, uint64_t, r15Mask)= 0x7C007C007C007C00LL; + +DECLARE_ALIGNED(8, const uint64_t, ff_M24A) = 0x00FF0000FF0000FFLL; +DECLARE_ALIGNED(8, const uint64_t, ff_M24B) = 0xFF0000FF0000FF00LL; +DECLARE_ALIGNED(8, const uint64_t, ff_M24C) = 0x0000FF0000FF0000LL; + +#ifdef FAST_BGR2YV12 +DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000000210041000DULL; +DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000FFEEFFDC0038ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00000038FFD2FFF8ULL; +#else +DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000020E540830C8BULL; +DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000ED0FDAC23831ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00003831D0E6F6EAULL; +#endif /* FAST_BGR2YV12 */ +DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL; + + +//MMX versions +#if HAVE_MMX_INLINE +#undef RENAME +#define COMPILE_TEMPLATE_MMXEXT 0 +#define RENAME(a) a ## _MMX +#include "swscale_template.c" +#endif + +// MMXEXT versions +#if HAVE_MMXEXT_INLINE +#undef RENAME +#undef COMPILE_TEMPLATE_MMXEXT +#define COMPILE_TEMPLATE_MMXEXT 1 +#define RENAME(a) a ## _MMXEXT +#include "swscale_template.c" +#endif + +void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex, + int lastInLumBuf, int lastInChrBuf) +{ + const int dstH= c->dstH; + const int flags= c->flags; + int16_t **lumPixBuf= c->lumPixBuf; + int16_t **chrUPixBuf= c->chrUPixBuf; + int16_t **alpPixBuf= c->alpPixBuf; + const int vLumBufSize= c->vLumBufSize; + const int vChrBufSize= c->vChrBufSize; + int32_t *vLumFilterPos= c->vLumFilterPos; + int32_t *vChrFilterPos= c->vChrFilterPos; + int16_t *vLumFilter= c->vLumFilter; + int16_t *vChrFilter= c->vChrFilter; + int32_t *lumMmxFilter= c->lumMmxFilter; + int32_t *chrMmxFilter= c->chrMmxFilter; + int32_t av_unused *alpMmxFilter= c->alpMmxFilter; + const int vLumFilterSize= c->vLumFilterSize; + const int vChrFilterSize= c->vChrFilterSize; + const 
int chrDstY= dstY>>c->chrDstVSubSample; + const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input + const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input + + c->blueDither= ff_dither8[dstY&1]; + if (c->dstFormat == AV_PIX_FMT_RGB555 || c->dstFormat == AV_PIX_FMT_BGR555) + c->greenDither= ff_dither8[dstY&1]; + else + c->greenDither= ff_dither4[dstY&1]; + c->redDither= ff_dither8[(dstY+1)&1]; + if (dstY < dstH - 2) { + const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; + const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; + const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; + int i; + + if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) { + const int16_t **tmpY = (const int16_t **) lumPixBuf + 2 * vLumBufSize; + int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize); + for (i = 0; i < neg; i++) + tmpY[i] = lumSrcPtr[neg]; + for ( ; i < end; i++) + tmpY[i] = lumSrcPtr[i]; + for ( ; i < vLumFilterSize; i++) + tmpY[i] = tmpY[i-1]; + lumSrcPtr = tmpY; + + if (alpSrcPtr) { + const int16_t **tmpA = (const int16_t **) alpPixBuf + 2 * vLumBufSize; + for (i = 0; i < neg; i++) + tmpA[i] = alpSrcPtr[neg]; + for ( ; i < end; i++) + tmpA[i] = alpSrcPtr[i]; + for ( ; i < vLumFilterSize; i++) + tmpA[i] = tmpA[i - 1]; + alpSrcPtr = tmpA; + } + } + if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) { + const int16_t **tmpU = (const int16_t **) chrUPixBuf + 2 * vChrBufSize; + int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize); + for (i = 0; i < neg; i++) { + tmpU[i] = chrUSrcPtr[neg]; + } + for ( ; i < end; i++) { + tmpU[i] = chrUSrcPtr[i]; + } + for ( ; i < vChrFilterSize; i++) { + tmpU[i] = tmpU[i - 1]; + } + chrUSrcPtr = tmpU; + } + + if (flags & SWS_ACCURATE_RND) { + int s= APCK_SIZE / 8; + for (i=0; i<vLumFilterSize; i+=2) { + *(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ]; + *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)]; + lumMmxFilter[s*i+APCK_COEF/4 ]= + lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ] + + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0); + if (CONFIG_SWSCALE_ALPHA && alpPixBuf) { + *(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ]; + *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)]; + alpMmxFilter[s*i+APCK_COEF/4 ]= + alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ]; + } + } + for (i=0; i<vChrFilterSize; i+=2) { + *(const void**)&chrMmxFilter[s*i ]= chrUSrcPtr[i ]; + *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrUSrcPtr[i+(vChrFilterSize>1)]; + chrMmxFilter[s*i+APCK_COEF/4 ]= + chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ] + + (vChrFilterSize>1 ? 
vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0); + } + } else { + for (i=0; i<vLumFilterSize; i++) { + *(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i]; + lumMmxFilter[4*i+2]= + lumMmxFilter[4*i+3]= + ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U; + if (CONFIG_SWSCALE_ALPHA && alpPixBuf) { + *(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i]; + alpMmxFilter[4*i+2]= + alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2]; + } + } + for (i=0; i<vChrFilterSize; i++) { + *(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i]; + chrMmxFilter[4*i+2]= + chrMmxFilter[4*i+3]= + ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001U; + } + } + } +} + +#if HAVE_MMXEXT +static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, + const int16_t **src, uint8_t *dest, int dstW, + const uint8_t *dither, int offset) +{ + if(((int)dest) & 15){ + return yuv2yuvX_MMXEXT(filter, filterSize, src, dest, dstW, dither, offset); + } + if (offset) { + __asm__ volatile("movq (%0), %%xmm3\n\t" + "movdqa %%xmm3, %%xmm4\n\t" + "psrlq $24, %%xmm3\n\t" + "psllq $40, %%xmm4\n\t" + "por %%xmm4, %%xmm3\n\t" + :: "r"(dither) + ); + } else { + __asm__ volatile("movq (%0), %%xmm3\n\t" + :: "r"(dither) + ); + } + filterSize--; + __asm__ volatile( + "pxor %%xmm0, %%xmm0\n\t" + "punpcklbw %%xmm0, %%xmm3\n\t" + "movd %0, %%xmm1\n\t" + "punpcklwd %%xmm1, %%xmm1\n\t" + "punpckldq %%xmm1, %%xmm1\n\t" + "punpcklqdq %%xmm1, %%xmm1\n\t" + "psllw $3, %%xmm1\n\t" + "paddw %%xmm1, %%xmm3\n\t" + "psraw $4, %%xmm3\n\t" + ::"m"(filterSize) + ); + __asm__ volatile( + "movdqa %%xmm3, %%xmm4\n\t" + "movdqa %%xmm3, %%xmm7\n\t" + "movl %3, %%ecx\n\t" + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + ".p2align 4 \n\t" /* FIXME Unroll? */\ + "1: \n\t"\ + "movddup 8(%%"REG_d"), %%xmm0 \n\t" /* filterCoeff */\ + "movdqa (%%"REG_S", %%"REG_c", 2), %%xmm2 \n\t" /* srcData */\ + "movdqa 16(%%"REG_S", %%"REG_c", 2), %%xmm5 \n\t" /* srcData */\ + "add $16, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "test %%"REG_S", %%"REG_S" \n\t"\ + "pmulhw %%xmm0, %%xmm2 \n\t"\ + "pmulhw %%xmm0, %%xmm5 \n\t"\ + "paddw %%xmm2, %%xmm3 \n\t"\ + "paddw %%xmm5, %%xmm4 \n\t"\ + " jnz 1b \n\t"\ + "psraw $3, %%xmm3 \n\t"\ + "psraw $3, %%xmm4 \n\t"\ + "packuswb %%xmm4, %%xmm3 \n\t" + "movntdq %%xmm3, (%1, %%"REG_c")\n\t" + "add $16, %%"REG_c" \n\t"\ + "cmp %2, %%"REG_c" \n\t"\ + "movdqa %%xmm7, %%xmm3\n\t" + "movdqa %%xmm7, %%xmm4\n\t" + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "jb 1b \n\t"\ + :: "g" (filter), + "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset) + : "%"REG_d, "%"REG_S, "%"REG_c + ); +} +#endif + +#endif /* HAVE_INLINE_ASM */ + +#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \ +extern void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \ + SwsContext *c, int16_t *data, \ + int dstW, const uint8_t *src, \ + const int16_t *filter, \ + const int32_t *filterPos, int filterSize) + +#define SCALE_FUNCS(filter_n, opt) \ + SCALE_FUNC(filter_n, 8, 15, opt); \ + SCALE_FUNC(filter_n, 9, 15, opt); \ + SCALE_FUNC(filter_n, 10, 15, opt); \ + SCALE_FUNC(filter_n, 12, 15, opt); \ + SCALE_FUNC(filter_n, 14, 15, opt); \ + SCALE_FUNC(filter_n, 16, 15, opt); \ + SCALE_FUNC(filter_n, 8, 19, opt); \ + SCALE_FUNC(filter_n, 9, 19, opt); \ + SCALE_FUNC(filter_n, 10, 19, opt); \ + SCALE_FUNC(filter_n, 12, 19, opt); \ + SCALE_FUNC(filter_n, 14, 19, opt); \ + SCALE_FUNC(filter_n, 16, 19, opt) + +#define SCALE_FUNCS_MMX(opt) \ + SCALE_FUNCS(4, opt); \ + SCALE_FUNCS(8, opt); \ + SCALE_FUNCS(X, 
opt) + +#define SCALE_FUNCS_SSE(opt) \ + SCALE_FUNCS(4, opt); \ + SCALE_FUNCS(8, opt); \ + SCALE_FUNCS(X4, opt); \ + SCALE_FUNCS(X8, opt) + +#if ARCH_X86_32 +SCALE_FUNCS_MMX(mmx); +#endif +SCALE_FUNCS_SSE(sse2); +SCALE_FUNCS_SSE(ssse3); +SCALE_FUNCS_SSE(sse4); + +#define VSCALEX_FUNC(size, opt) \ +extern void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \ + const int16_t **src, uint8_t *dest, int dstW, \ + const uint8_t *dither, int offset) +#define VSCALEX_FUNCS(opt) \ + VSCALEX_FUNC(8, opt); \ + VSCALEX_FUNC(9, opt); \ + VSCALEX_FUNC(10, opt) + +#if ARCH_X86_32 +VSCALEX_FUNCS(mmxext); +#endif +VSCALEX_FUNCS(sse2); +VSCALEX_FUNCS(sse4); +VSCALEX_FUNC(16, sse4); +VSCALEX_FUNCS(avx); + +#define VSCALE_FUNC(size, opt) \ +extern void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int dstW, \ + const uint8_t *dither, int offset) +#define VSCALE_FUNCS(opt1, opt2) \ + VSCALE_FUNC(8, opt1); \ + VSCALE_FUNC(9, opt2); \ + VSCALE_FUNC(10, opt2); \ + VSCALE_FUNC(16, opt1) + +#if ARCH_X86_32 +VSCALE_FUNCS(mmx, mmxext); +#endif +VSCALE_FUNCS(sse2, sse2); +VSCALE_FUNC(16, sse4); +VSCALE_FUNCS(avx, avx); + +#define INPUT_Y_FUNC(fmt, opt) \ +extern void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \ + const uint8_t *unused1, const uint8_t *unused2, \ + int w, uint32_t *unused) +#define INPUT_UV_FUNC(fmt, opt) \ +extern void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \ + const uint8_t *unused0, \ + const uint8_t *src1, \ + const uint8_t *src2, \ + int w, uint32_t *unused) +#define INPUT_FUNC(fmt, opt) \ + INPUT_Y_FUNC(fmt, opt); \ + INPUT_UV_FUNC(fmt, opt) +#define INPUT_FUNCS(opt) \ + INPUT_FUNC(uyvy, opt); \ + INPUT_FUNC(yuyv, opt); \ + INPUT_UV_FUNC(nv12, opt); \ + INPUT_UV_FUNC(nv21, opt); \ + INPUT_FUNC(rgba, opt); \ + INPUT_FUNC(bgra, opt); \ + INPUT_FUNC(argb, opt); \ + INPUT_FUNC(abgr, opt); \ + INPUT_FUNC(rgb24, opt); \ + INPUT_FUNC(bgr24, opt) + +#if ARCH_X86_32 +INPUT_FUNCS(mmx); +#endif +INPUT_FUNCS(sse2); +INPUT_FUNCS(ssse3); +INPUT_FUNCS(avx); + +av_cold void ff_sws_init_swScale_mmx(SwsContext *c) +{ + int cpu_flags = av_get_cpu_flags(); + +#if HAVE_INLINE_ASM + if (cpu_flags & AV_CPU_FLAG_MMX) + sws_init_swScale_MMX(c); +#if HAVE_MMXEXT_INLINE + if (cpu_flags & AV_CPU_FLAG_MMXEXT) + sws_init_swScale_MMXEXT(c); + if (cpu_flags & AV_CPU_FLAG_SSE3){ + if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND)) + c->yuv2planeX = yuv2yuvX_sse3; + } +#endif +#endif /* HAVE_INLINE_ASM */ + +#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \ + if (c->srcBpc == 8) { \ + hscalefn = c->dstBpc <= 14 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \ + ff_hscale8to19_ ## filtersize ## _ ## opt1; \ + } else if (c->srcBpc == 9) { \ + hscalefn = c->dstBpc <= 14 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \ + ff_hscale9to19_ ## filtersize ## _ ## opt1; \ + } else if (c->srcBpc == 10) { \ + hscalefn = c->dstBpc <= 14 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \ + ff_hscale10to19_ ## filtersize ## _ ## opt1; \ + } else if (c->srcBpc == 12) { \ + hscalefn = c->dstBpc <= 14 ? ff_hscale12to15_ ## filtersize ## _ ## opt2 : \ + ff_hscale12to19_ ## filtersize ## _ ## opt1; \ + } else if (c->srcBpc == 14 || ((c->srcFormat==AV_PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth_minus1<15)) { \ + hscalefn = c->dstBpc <= 14 ? 
ff_hscale14to15_ ## filtersize ## _ ## opt2 : \ + ff_hscale14to19_ ## filtersize ## _ ## opt1; \ + } else { /* c->srcBpc == 16 */ \ + av_assert0(c->srcBpc == 16);\ + hscalefn = c->dstBpc <= 14 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \ + ff_hscale16to19_ ## filtersize ## _ ## opt1; \ + } \ +} while (0) +#define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \ + switch (filtersize) { \ + case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \ + case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \ + default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \ + } +#define ASSIGN_VSCALEX_FUNC(vscalefn, opt, do_16_case, condition_8bit) \ +switch(c->dstBpc){ \ + case 16: do_16_case; break; \ + case 10: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_10_ ## opt; break; \ + case 9: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_ ## opt; break; \ + default: if (condition_8bit) /*vscalefn = ff_yuv2planeX_8_ ## opt;*/ break; \ + } +#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \ + switch(c->dstBpc){ \ + case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \ + case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \ + case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \ + case 8: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \ + default: av_assert0(c->dstBpc>8); \ + } +#define case_rgb(x, X, opt) \ + case AV_PIX_FMT_ ## X: \ + c->lumToYV12 = ff_ ## x ## ToY_ ## opt; \ + if (!c->chrSrcHSubSample) \ + c->chrToYV12 = ff_ ## x ## ToUV_ ## opt; \ + break +#if ARCH_X86_32 + if (EXTERNAL_MMX(cpu_flags)) { + ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx); + ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx); + ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmxext, cpu_flags & AV_CPU_FLAG_MMXEXT); + + switch (c->srcFormat) { + case AV_PIX_FMT_Y400A: + c->lumToYV12 = ff_yuyvToY_mmx; + if (c->alpPixBuf) + c->alpToYV12 = ff_uyvyToY_mmx; + break; + case AV_PIX_FMT_YUYV422: + c->lumToYV12 = ff_yuyvToY_mmx; + c->chrToYV12 = ff_yuyvToUV_mmx; + break; + case AV_PIX_FMT_UYVY422: + c->lumToYV12 = ff_uyvyToY_mmx; + c->chrToYV12 = ff_uyvyToUV_mmx; + break; + case AV_PIX_FMT_NV12: + c->chrToYV12 = ff_nv12ToUV_mmx; + break; + case AV_PIX_FMT_NV21: + c->chrToYV12 = ff_nv21ToUV_mmx; + break; + case_rgb(rgb24, RGB24, mmx); + case_rgb(bgr24, BGR24, mmx); + case_rgb(bgra, BGRA, mmx); + case_rgb(rgba, RGBA, mmx); + case_rgb(abgr, ABGR, mmx); + case_rgb(argb, ARGB, mmx); + default: + break; + } + } + if (EXTERNAL_MMXEXT(cpu_flags)) { + ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmxext, , 1); + } +#endif /* ARCH_X86_32 */ +#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \ + switch (filtersize) { \ + case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \ + case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \ + default: if (filtersize & 4) ASSIGN_SCALE_FUNC2(hscalefn, X4, opt1, opt2); \ + else ASSIGN_SCALE_FUNC2(hscalefn, X8, opt1, opt2); \ + break; \ + } + if (EXTERNAL_SSE2(cpu_flags)) { + ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2); + ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2); + ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, , + HAVE_ALIGNED_STACK || ARCH_X86_64); + ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1); + + switch (c->srcFormat) { + case AV_PIX_FMT_Y400A: + c->lumToYV12 = ff_yuyvToY_sse2; + if (c->alpPixBuf) + c->alpToYV12 = ff_uyvyToY_sse2; + break; + case AV_PIX_FMT_YUYV422: + c->lumToYV12 = 
ff_yuyvToY_sse2; + c->chrToYV12 = ff_yuyvToUV_sse2; + break; + case AV_PIX_FMT_UYVY422: + c->lumToYV12 = ff_uyvyToY_sse2; + c->chrToYV12 = ff_uyvyToUV_sse2; + break; + case AV_PIX_FMT_NV12: + c->chrToYV12 = ff_nv12ToUV_sse2; + break; + case AV_PIX_FMT_NV21: + c->chrToYV12 = ff_nv21ToUV_sse2; + break; + case_rgb(rgb24, RGB24, sse2); + case_rgb(bgr24, BGR24, sse2); + case_rgb(bgra, BGRA, sse2); + case_rgb(rgba, RGBA, sse2); + case_rgb(abgr, ABGR, sse2); + case_rgb(argb, ARGB, sse2); + default: + break; + } + } + if (EXTERNAL_SSSE3(cpu_flags)) { + ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3); + ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, ssse3, ssse3); + switch (c->srcFormat) { + case_rgb(rgb24, RGB24, ssse3); + case_rgb(bgr24, BGR24, ssse3); + default: + break; + } + } + if (EXTERNAL_SSE4(cpu_flags)) { + /* Xto15 don't need special sse4 functions */ + ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3); + ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3); + ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse4, + if (!isBE(c->dstFormat)) c->yuv2planeX = ff_yuv2planeX_16_sse4, + HAVE_ALIGNED_STACK || ARCH_X86_64); + if (c->dstBpc == 16 && !isBE(c->dstFormat)) + c->yuv2plane1 = ff_yuv2plane1_16_sse4; + } + + if (EXTERNAL_AVX(cpu_flags)) { + ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, , + HAVE_ALIGNED_STACK || ARCH_X86_64); + ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1); + + switch (c->srcFormat) { + case AV_PIX_FMT_YUYV422: + c->chrToYV12 = ff_yuyvToUV_avx; + break; + case AV_PIX_FMT_UYVY422: + c->chrToYV12 = ff_uyvyToUV_avx; + break; + case AV_PIX_FMT_NV12: + c->chrToYV12 = ff_nv12ToUV_avx; + break; + case AV_PIX_FMT_NV21: + c->chrToYV12 = ff_nv21ToUV_avx; + break; + case_rgb(rgb24, RGB24, avx); + case_rgb(bgr24, BGR24, avx); + case_rgb(bgra, BGRA, avx); + case_rgb(rgba, RGBA, avx); + case_rgb(abgr, ABGR, avx); + case_rgb(argb, ARGB, avx); + default: + break; + } + } +} diff --git a/ffmpeg1/libswscale/x86/swscale_template.c b/ffmpeg1/libswscale/x86/swscale_template.c new file mode 100644 index 0000000..f2567c1 --- /dev/null +++ b/ffmpeg1/libswscale/x86/swscale_template.c @@ -0,0 +1,1717 @@ +/* + * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#undef REAL_MOVNTQ +#undef MOVNTQ +#undef MOVNTQ2 +#undef PREFETCH + +#if COMPILE_TEMPLATE_MMXEXT +#define PREFETCH "prefetchnta" +#else +#define PREFETCH " # nop" +#endif + +#if COMPILE_TEMPLATE_MMXEXT +#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" +#define MOVNTQ2 "movntq " +#else +#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" +#define MOVNTQ2 "movq " +#endif +#define MOVNTQ(a,b) REAL_MOVNTQ(a,b) + +#if !COMPILE_TEMPLATE_MMXEXT +static av_always_inline void +dither_8to16(const uint8_t *srcDither, int rot) +{ + if (rot) { + __asm__ volatile("pxor %%mm0, %%mm0\n\t" + "movq (%0), %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "psrlq $24, %%mm3\n\t" + "psllq $40, %%mm4\n\t" + "por %%mm4, %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "punpcklbw %%mm0, %%mm3\n\t" + "punpckhbw %%mm0, %%mm4\n\t" + :: "r"(srcDither) + ); + } else { + __asm__ volatile("pxor %%mm0, %%mm0\n\t" + "movq (%0), %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "punpcklbw %%mm0, %%mm3\n\t" + "punpckhbw %%mm0, %%mm4\n\t" + :: "r"(srcDither) + ); + } +} +#endif + +static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize, + const int16_t **src, uint8_t *dest, int dstW, + const uint8_t *dither, int offset) +{ + dither_8to16(dither, offset); + filterSize--; + __asm__ volatile( + "movd %0, %%mm1\n\t" + "punpcklwd %%mm1, %%mm1\n\t" + "punpckldq %%mm1, %%mm1\n\t" + "psllw $3, %%mm1\n\t" + "paddw %%mm1, %%mm3\n\t" + "paddw %%mm1, %%mm4\n\t" + "psraw $4, %%mm3\n\t" + "psraw $4, %%mm4\n\t" + ::"m"(filterSize) + ); + + __asm__ volatile(\ + "movq %%mm3, %%mm6\n\t" + "movq %%mm4, %%mm7\n\t" + "movl %3, %%ecx\n\t" + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + ".p2align 4 \n\t" /* FIXME Unroll? 
*/\ + "1: \n\t"\ + "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ + "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\ + "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\ + "add $16, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "test %%"REG_S", %%"REG_S" \n\t"\ + "pmulhw %%mm0, %%mm2 \n\t"\ + "pmulhw %%mm0, %%mm5 \n\t"\ + "paddw %%mm2, %%mm3 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + " jnz 1b \n\t"\ + "psraw $3, %%mm3 \n\t"\ + "psraw $3, %%mm4 \n\t"\ + "packuswb %%mm4, %%mm3 \n\t" + MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t" + "add $8, %%"REG_c" \n\t"\ + "cmp %2, %%"REG_c" \n\t"\ + "movq %%mm6, %%mm3\n\t" + "movq %%mm7, %%mm4\n\t" + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "jb 1b \n\t"\ + :: "g" (filter), + "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset) + : "%"REG_d, "%"REG_S, "%"REG_c + ); +} + +#define YSCALEYUV2PACKEDX_UV \ + __asm__ volatile(\ + "xor %%"REG_a", %%"REG_a" \n\t"\ + ".p2align 4 \n\t"\ + "nop \n\t"\ + "1: \n\t"\ + "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ + "movq %%mm3, %%mm4 \n\t"\ + ".p2align 4 \n\t"\ + "2: \n\t"\ + "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ + "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\ + "add %6, %%"REG_S" \n\t" \ + "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\ + "add $16, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "pmulhw %%mm0, %%mm2 \n\t"\ + "pmulhw %%mm0, %%mm5 \n\t"\ + "paddw %%mm2, %%mm3 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + "test %%"REG_S", %%"REG_S" \n\t"\ + " jnz 2b \n\t"\ + +#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \ + "lea "offset"(%0), %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\ + "movq "#dst1", "#dst2" \n\t"\ + ".p2align 4 \n\t"\ + "2: \n\t"\ + "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\ + "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\ + "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\ + "add $16, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "pmulhw "#coeff", "#src1" \n\t"\ + "pmulhw "#coeff", "#src2" \n\t"\ + "paddw "#src1", "#dst1" \n\t"\ + "paddw "#src2", "#dst2" \n\t"\ + "test %%"REG_S", %%"REG_S" \n\t"\ + " jnz 2b \n\t"\ + +#define YSCALEYUV2PACKEDX \ + YSCALEYUV2PACKEDX_UV \ + YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \ + +#define YSCALEYUV2PACKEDX_END \ + :: "r" (&c->redDither), \ + "m" (dummy), "m" (dummy), "m" (dummy),\ + "r" (dest), "m" (dstW_reg), "m"(uv_off) \ + : "%"REG_a, "%"REG_d, "%"REG_S \ + ); + +#define YSCALEYUV2PACKEDX_ACCURATE_UV \ + __asm__ volatile(\ + "xor %%"REG_a", %%"REG_a" \n\t"\ + ".p2align 4 \n\t"\ + "nop \n\t"\ + "1: \n\t"\ + "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "pxor %%mm4, %%mm4 \n\t"\ + "pxor %%mm5, %%mm5 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + "pxor %%mm7, %%mm7 \n\t"\ + ".p2align 4 \n\t"\ + "2: \n\t"\ + "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\ + "add %6, %%"REG_S" \n\t" \ + "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\ + "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ + "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\ + "movq %%mm0, %%mm3 \n\t"\ + "punpcklwd %%mm1, %%mm0 \n\t"\ + "punpckhwd %%mm1, %%mm3 \n\t"\ + "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\ + "pmaddwd %%mm1, %%mm0 \n\t"\ + "pmaddwd %%mm1, %%mm3 \n\t"\ + "paddd %%mm0, 
%%mm4 \n\t"\ + "paddd %%mm3, %%mm5 \n\t"\ + "add %6, %%"REG_S" \n\t" \ + "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\ + "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ + "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ + "test %%"REG_S", %%"REG_S" \n\t"\ + "movq %%mm2, %%mm0 \n\t"\ + "punpcklwd %%mm3, %%mm2 \n\t"\ + "punpckhwd %%mm3, %%mm0 \n\t"\ + "pmaddwd %%mm1, %%mm2 \n\t"\ + "pmaddwd %%mm1, %%mm0 \n\t"\ + "paddd %%mm2, %%mm6 \n\t"\ + "paddd %%mm0, %%mm7 \n\t"\ + " jnz 2b \n\t"\ + "psrad $16, %%mm4 \n\t"\ + "psrad $16, %%mm5 \n\t"\ + "psrad $16, %%mm6 \n\t"\ + "psrad $16, %%mm7 \n\t"\ + "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ + "packssdw %%mm5, %%mm4 \n\t"\ + "packssdw %%mm7, %%mm6 \n\t"\ + "paddw %%mm0, %%mm4 \n\t"\ + "paddw %%mm0, %%mm6 \n\t"\ + "movq %%mm4, "U_TEMP"(%0) \n\t"\ + "movq %%mm6, "V_TEMP"(%0) \n\t"\ + +#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \ + "lea "offset"(%0), %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "pxor %%mm1, %%mm1 \n\t"\ + "pxor %%mm5, %%mm5 \n\t"\ + "pxor %%mm7, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + ".p2align 4 \n\t"\ + "2: \n\t"\ + "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\ + "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\ + "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ + "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\ + "movq %%mm0, %%mm3 \n\t"\ + "punpcklwd %%mm4, %%mm0 \n\t"\ + "punpckhwd %%mm4, %%mm3 \n\t"\ + "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\ + "pmaddwd %%mm4, %%mm0 \n\t"\ + "pmaddwd %%mm4, %%mm3 \n\t"\ + "paddd %%mm0, %%mm1 \n\t"\ + "paddd %%mm3, %%mm5 \n\t"\ + "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\ + "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ + "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ + "test %%"REG_S", %%"REG_S" \n\t"\ + "movq %%mm2, %%mm0 \n\t"\ + "punpcklwd %%mm3, %%mm2 \n\t"\ + "punpckhwd %%mm3, %%mm0 \n\t"\ + "pmaddwd %%mm4, %%mm2 \n\t"\ + "pmaddwd %%mm4, %%mm0 \n\t"\ + "paddd %%mm2, %%mm7 \n\t"\ + "paddd %%mm0, %%mm6 \n\t"\ + " jnz 2b \n\t"\ + "psrad $16, %%mm1 \n\t"\ + "psrad $16, %%mm5 \n\t"\ + "psrad $16, %%mm7 \n\t"\ + "psrad $16, %%mm6 \n\t"\ + "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ + "packssdw %%mm5, %%mm1 \n\t"\ + "packssdw %%mm6, %%mm7 \n\t"\ + "paddw %%mm0, %%mm1 \n\t"\ + "paddw %%mm0, %%mm7 \n\t"\ + "movq "U_TEMP"(%0), %%mm3 \n\t"\ + "movq "V_TEMP"(%0), %%mm4 \n\t"\ + +#define YSCALEYUV2PACKEDX_ACCURATE \ + YSCALEYUV2PACKEDX_ACCURATE_UV \ + YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET) + +#define YSCALEYUV2RGBX \ + "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\ + "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\ + "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ + "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ + "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ + "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ + /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ + "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ + "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ + "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\ + "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\ + "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ + "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ + /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ + "paddw %%mm3, %%mm4 \n\t"\ + "movq %%mm2, %%mm0 \n\t"\ + "movq %%mm5, %%mm6 \n\t"\ + "movq %%mm4, %%mm3 \n\t"\ + "punpcklwd %%mm2, %%mm2 \n\t"\ + "punpcklwd %%mm5, %%mm5 \n\t"\ + "punpcklwd %%mm4, %%mm4 \n\t"\ + "paddw %%mm1, %%mm2 \n\t"\ + "paddw %%mm1, %%mm5 \n\t"\ + "paddw %%mm1, %%mm4 \n\t"\ + "punpckhwd %%mm0, %%mm0 \n\t"\ + "punpckhwd %%mm6, 
%%mm6 \n\t"\ + "punpckhwd %%mm3, %%mm3 \n\t"\ + "paddw %%mm7, %%mm0 \n\t"\ + "paddw %%mm7, %%mm6 \n\t"\ + "paddw %%mm7, %%mm3 \n\t"\ + /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ + "packuswb %%mm0, %%mm2 \n\t"\ + "packuswb %%mm6, %%mm5 \n\t"\ + "packuswb %%mm3, %%mm4 \n\t"\ + +#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \ + "movq "#b", "#q2" \n\t" /* B */\ + "movq "#r", "#t" \n\t" /* R */\ + "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\ + "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\ + "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\ + "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\ + "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\ + "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\ + "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\ + "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\ + "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\ + "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\ +\ + MOVNTQ( q0, (dst, index, 4))\ + MOVNTQ( b, 8(dst, index, 4))\ + MOVNTQ( q2, 16(dst, index, 4))\ + MOVNTQ( q3, 24(dst, index, 4))\ +\ + "add $8, "#index" \n\t"\ + "cmp "#dstw", "#index" \n\t"\ + " jb 1b \n\t" +#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) + +static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, + const int16_t **lumSrc, int lumFilterSize, + const int16_t *chrFilter, const int16_t **chrUSrc, + const int16_t **chrVSrc, + int chrFilterSize, const int16_t **alpSrc, + uint8_t *dest, int dstW, int dstY) +{ + x86_reg dummy=0; + x86_reg dstW_reg = dstW; + x86_reg uv_off = c->uv_offx2; + + if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { + YSCALEYUV2PACKEDX_ACCURATE + YSCALEYUV2RGBX + "movq %%mm2, "U_TEMP"(%0) \n\t" + "movq %%mm4, "V_TEMP"(%0) \n\t" + "movq %%mm5, "Y_TEMP"(%0) \n\t" + YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET) + "movq "Y_TEMP"(%0), %%mm5 \n\t" + "psraw $3, %%mm1 \n\t" + "psraw $3, %%mm7 \n\t" + "packuswb %%mm7, %%mm1 \n\t" + WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6) + YSCALEYUV2PACKEDX_END + } else { + YSCALEYUV2PACKEDX_ACCURATE + YSCALEYUV2RGBX + "pcmpeqd %%mm7, %%mm7 \n\t" + WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + YSCALEYUV2PACKEDX_END + } +} + +static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, + const int16_t **lumSrc, int lumFilterSize, + const int16_t *chrFilter, const int16_t **chrUSrc, + const int16_t **chrVSrc, + int chrFilterSize, const int16_t **alpSrc, + uint8_t *dest, int dstW, int dstY) +{ + x86_reg dummy=0; + x86_reg dstW_reg = dstW; + x86_reg uv_off = c->uv_offx2; + + if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { + YSCALEYUV2PACKEDX + YSCALEYUV2RGBX + YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7) + "psraw $3, %%mm1 \n\t" + "psraw $3, %%mm7 \n\t" + "packuswb %%mm7, %%mm1 \n\t" + WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) + YSCALEYUV2PACKEDX_END + } else { + YSCALEYUV2PACKEDX + YSCALEYUV2RGBX + "pcmpeqd %%mm7, %%mm7 \n\t" + WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + YSCALEYUV2PACKEDX_END + } +} + +#define REAL_WRITERGB16(dst, dstw, index) \ + "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ + "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\ + "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ + "psrlq $3, %%mm2 \n\t"\ +\ + "movq %%mm2, %%mm1 \n\t"\ + "movq %%mm4, %%mm3 \n\t"\ +\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm5, %%mm2 \n\t"\ + "punpckhbw 
%%mm7, %%mm4 \n\t"\ + "punpckhbw %%mm5, %%mm1 \n\t"\ +\ + "psllq $3, %%mm3 \n\t"\ + "psllq $3, %%mm4 \n\t"\ +\ + "por %%mm3, %%mm2 \n\t"\ + "por %%mm4, %%mm1 \n\t"\ +\ + MOVNTQ(%%mm2, (dst, index, 2))\ + MOVNTQ(%%mm1, 8(dst, index, 2))\ +\ + "add $8, "#index" \n\t"\ + "cmp "#dstw", "#index" \n\t"\ + " jb 1b \n\t" +#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index) + +static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, + const int16_t **lumSrc, int lumFilterSize, + const int16_t *chrFilter, const int16_t **chrUSrc, + const int16_t **chrVSrc, + int chrFilterSize, const int16_t **alpSrc, + uint8_t *dest, int dstW, int dstY) +{ + x86_reg dummy=0; + x86_reg dstW_reg = dstW; + x86_reg uv_off = c->uv_offx2; + + YSCALEYUV2PACKEDX_ACCURATE + YSCALEYUV2RGBX + "pxor %%mm7, %%mm7 \n\t" + /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ +#ifdef DITHER1XBPP + "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" + "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" + "paddusb "RED_DITHER"(%0), %%mm5\n\t" +#endif + WRITERGB16(%4, %5, %%REGa) + YSCALEYUV2PACKEDX_END +} + +static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, + const int16_t **lumSrc, int lumFilterSize, + const int16_t *chrFilter, const int16_t **chrUSrc, + const int16_t **chrVSrc, + int chrFilterSize, const int16_t **alpSrc, + uint8_t *dest, int dstW, int dstY) +{ + x86_reg dummy=0; + x86_reg dstW_reg = dstW; + x86_reg uv_off = c->uv_offx2; + + YSCALEYUV2PACKEDX + YSCALEYUV2RGBX + "pxor %%mm7, %%mm7 \n\t" + /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ +#ifdef DITHER1XBPP + "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" + "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" + "paddusb "RED_DITHER"(%0), %%mm5 \n\t" +#endif + WRITERGB16(%4, %5, %%REGa) + YSCALEYUV2PACKEDX_END +} + +#define REAL_WRITERGB15(dst, dstw, index) \ + "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ + "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\ + "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ + "psrlq $3, %%mm2 \n\t"\ + "psrlq $1, %%mm5 \n\t"\ +\ + "movq %%mm2, %%mm1 \n\t"\ + "movq %%mm4, %%mm3 \n\t"\ +\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm5, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm4 \n\t"\ + "punpckhbw %%mm5, %%mm1 \n\t"\ +\ + "psllq $2, %%mm3 \n\t"\ + "psllq $2, %%mm4 \n\t"\ +\ + "por %%mm3, %%mm2 \n\t"\ + "por %%mm4, %%mm1 \n\t"\ +\ + MOVNTQ(%%mm2, (dst, index, 2))\ + MOVNTQ(%%mm1, 8(dst, index, 2))\ +\ + "add $8, "#index" \n\t"\ + "cmp "#dstw", "#index" \n\t"\ + " jb 1b \n\t" +#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) + +static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, + const int16_t **lumSrc, int lumFilterSize, + const int16_t *chrFilter, const int16_t **chrUSrc, + const int16_t **chrVSrc, + int chrFilterSize, const int16_t **alpSrc, + uint8_t *dest, int dstW, int dstY) +{ + x86_reg dummy=0; + x86_reg dstW_reg = dstW; + x86_reg uv_off = c->uv_offx2; + + YSCALEYUV2PACKEDX_ACCURATE + YSCALEYUV2RGBX + "pxor %%mm7, %%mm7 \n\t" + /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ +#ifdef DITHER1XBPP + "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" + "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" + "paddusb "RED_DITHER"(%0), %%mm5\n\t" +#endif + WRITERGB15(%4, %5, %%REGa) + YSCALEYUV2PACKEDX_END +} + +static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, + const int16_t **lumSrc, int lumFilterSize, + const int16_t *chrFilter, const int16_t **chrUSrc, + const int16_t **chrVSrc, + int chrFilterSize, const int16_t **alpSrc, + uint8_t *dest, int dstW, int dstY) +{ + x86_reg dummy=0; + x86_reg dstW_reg = dstW; + x86_reg 
uv_off = c->uv_offx2; + + YSCALEYUV2PACKEDX + YSCALEYUV2RGBX + "pxor %%mm7, %%mm7 \n\t" + /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ +#ifdef DITHER1XBPP + "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" + "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" + "paddusb "RED_DITHER"(%0), %%mm5 \n\t" +#endif + WRITERGB15(%4, %5, %%REGa) + YSCALEYUV2PACKEDX_END +} + +#define WRITEBGR24MMX(dst, dstw, index) \ + /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ + "movq %%mm2, %%mm1 \n\t" /* B */\ + "movq %%mm5, %%mm6 \n\t" /* R */\ + "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ + "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ + "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ + "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ + "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ + "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ + "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ + "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ + "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ + "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ +\ + "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ + "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\ + "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\ + "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\ +\ + "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\ + "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\ + "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\ + "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\ +\ + "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\ + "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\ + "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\ + "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\ +\ + "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\ + "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\ + "psllq $40, %%mm2 \n\t" /* GB000000 1 */\ + "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ + MOVNTQ(%%mm0, (dst))\ +\ + "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\ + "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\ + "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\ + "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\ + MOVNTQ(%%mm6, 8(dst))\ +\ + "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\ + "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\ + "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\ + MOVNTQ(%%mm5, 16(dst))\ +\ + "add $24, "#dst" \n\t"\ +\ + "add $8, "#index" \n\t"\ + "cmp "#dstw", "#index" \n\t"\ + " jb 1b \n\t" + +#define WRITEBGR24MMXEXT(dst, dstw, index) \ + /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ + "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\ + "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\ + "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\ + "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\ + "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\ +\ + "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\ + "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\ + "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\ +\ + "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\ + "por %%mm1, %%mm6 \n\t"\ + "por %%mm3, %%mm6 \n\t"\ + MOVNTQ(%%mm6, (dst))\ +\ + "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\ + "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\ + "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\ + "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\ +\ + "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\ + "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\ + "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\ +\ + "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\ + "por %%mm3, %%mm6 \n\t"\ + MOVNTQ(%%mm6, 8(dst))\ +\ + "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\ + "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\ + "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 
*/\ +\ + "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\ + "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\ + "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\ +\ + "por %%mm1, %%mm3 \n\t"\ + "por %%mm3, %%mm6 \n\t"\ + MOVNTQ(%%mm6, 16(dst))\ +\ + "add $24, "#dst" \n\t"\ +\ + "add $8, "#index" \n\t"\ + "cmp "#dstw", "#index" \n\t"\ + " jb 1b \n\t" + +#if COMPILE_TEMPLATE_MMXEXT +#undef WRITEBGR24 +#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index) +#else +#undef WRITEBGR24 +#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) +#endif + +static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, + const int16_t **lumSrc, int lumFilterSize, + const int16_t *chrFilter, const int16_t **chrUSrc, + const int16_t **chrVSrc, + int chrFilterSize, const int16_t **alpSrc, + uint8_t *dest, int dstW, int dstY) +{ + x86_reg dummy=0; + x86_reg dstW_reg = dstW; + x86_reg uv_off = c->uv_offx2; + + YSCALEYUV2PACKEDX_ACCURATE + YSCALEYUV2RGBX + "pxor %%mm7, %%mm7 \n\t" + "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize + "add %4, %%"REG_c" \n\t" + WRITEBGR24(%%REGc, %5, %%REGa) + :: "r" (&c->redDither), + "m" (dummy), "m" (dummy), "m" (dummy), + "r" (dest), "m" (dstW_reg), "m"(uv_off) + : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S + ); +} + +static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, + const int16_t **lumSrc, int lumFilterSize, + const int16_t *chrFilter, const int16_t **chrUSrc, + const int16_t **chrVSrc, + int chrFilterSize, const int16_t **alpSrc, + uint8_t *dest, int dstW, int dstY) +{ + x86_reg dummy=0; + x86_reg dstW_reg = dstW; + x86_reg uv_off = c->uv_offx2; + + YSCALEYUV2PACKEDX + YSCALEYUV2RGBX + "pxor %%mm7, %%mm7 \n\t" + "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize + "add %4, %%"REG_c" \n\t" + WRITEBGR24(%%REGc, %5, %%REGa) + :: "r" (&c->redDither), + "m" (dummy), "m" (dummy), "m" (dummy), + "r" (dest), "m" (dstW_reg), "m"(uv_off) + : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S + ); +} + +#define REAL_WRITEYUY2(dst, dstw, index) \ + "packuswb %%mm3, %%mm3 \n\t"\ + "packuswb %%mm4, %%mm4 \n\t"\ + "packuswb %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm4, %%mm3 \n\t"\ + "movq %%mm1, %%mm7 \n\t"\ + "punpcklbw %%mm3, %%mm1 \n\t"\ + "punpckhbw %%mm3, %%mm7 \n\t"\ +\ + MOVNTQ(%%mm1, (dst, index, 2))\ + MOVNTQ(%%mm7, 8(dst, index, 2))\ +\ + "add $8, "#index" \n\t"\ + "cmp "#dstw", "#index" \n\t"\ + " jb 1b \n\t" +#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) + +static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter, + const int16_t **lumSrc, int lumFilterSize, + const int16_t *chrFilter, const int16_t **chrUSrc, + const int16_t **chrVSrc, + int chrFilterSize, const int16_t **alpSrc, + uint8_t *dest, int dstW, int dstY) +{ + x86_reg dummy=0; + x86_reg dstW_reg = dstW; + x86_reg uv_off = c->uv_offx2; + + YSCALEYUV2PACKEDX_ACCURATE + /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ + "psraw $3, %%mm3 \n\t" + "psraw $3, %%mm4 \n\t" + "psraw $3, %%mm1 \n\t" + "psraw $3, %%mm7 \n\t" + WRITEYUY2(%4, %5, %%REGa) + YSCALEYUV2PACKEDX_END +} + +static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, + const int16_t **lumSrc, int lumFilterSize, + const int16_t *chrFilter, const int16_t **chrUSrc, + const int16_t **chrVSrc, + int chrFilterSize, const int16_t **alpSrc, + uint8_t *dest, int dstW, int dstY) +{ + x86_reg dummy=0; + x86_reg dstW_reg = dstW; + x86_reg uv_off = c->uv_offx2; + + YSCALEYUV2PACKEDX + /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ + "psraw $3, %%mm3 \n\t" + "psraw 
$3, %%mm4 \n\t" + "psraw $3, %%mm1 \n\t" + "psraw $3, %%mm7 \n\t" + WRITEYUY2(%4, %5, %%REGa) + YSCALEYUV2PACKEDX_END +} + +#define REAL_YSCALEYUV2RGB_UV(index, c) \ + "xor "#index", "#index" \n\t"\ + ".p2align 4 \n\t"\ + "1: \n\t"\ + "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ + "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ + "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ + "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ + "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ + "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ + "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ + "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ + "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ + "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ + "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ + "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ + "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ + "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ + "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ + "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ + "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ + "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ + "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ + "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ + "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ + /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ + +#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \ + "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ + "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ + "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ + "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ + "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ + "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ + "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ + "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ + "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ + "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ + "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ + "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ + +#define REAL_YSCALEYUV2RGB_COEFF(c) \ + "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ + "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ + "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ + "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ + "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ + "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ + /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ + "paddw %%mm3, %%mm4 \n\t"\ + "movq %%mm2, %%mm0 \n\t"\ + "movq %%mm5, %%mm6 \n\t"\ + "movq %%mm4, %%mm3 \n\t"\ + "punpcklwd %%mm2, %%mm2 \n\t"\ + "punpcklwd %%mm5, %%mm5 \n\t"\ + "punpcklwd %%mm4, %%mm4 \n\t"\ + "paddw %%mm1, %%mm2 \n\t"\ + "paddw %%mm1, %%mm5 \n\t"\ + "paddw %%mm1, %%mm4 \n\t"\ + "punpckhwd %%mm0, %%mm0 \n\t"\ + "punpckhwd %%mm6, %%mm6 \n\t"\ + "punpckhwd %%mm3, %%mm3 \n\t"\ + "paddw %%mm7, %%mm0 \n\t"\ + "paddw %%mm7, %%mm6 \n\t"\ + "paddw %%mm7, %%mm3 \n\t"\ + /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ + "packuswb %%mm0, %%mm2 \n\t"\ + "packuswb %%mm6, %%mm5 \n\t"\ + "packuswb %%mm3, %%mm4 \n\t"\ + +#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) + +#define YSCALEYUV2RGB(index, c) \ + REAL_YSCALEYUV2RGB_UV(index, c) \ + 
REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \ + REAL_YSCALEYUV2RGB_COEFF(c) + +/** + * vertical bilinear scale YV12 to RGB + */ +static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2], + const int16_t *ubuf[2], const int16_t *vbuf[2], + const int16_t *abuf[2], uint8_t *dest, + int dstW, int yalpha, int uvalpha, int y) +{ + const int16_t *buf0 = buf[0], *buf1 = buf[1], + *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; + + if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { + const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1]; +#if ARCH_X86_64 + __asm__ volatile( + YSCALEYUV2RGB(%%r8, %5) + YSCALEYUV2RGB_YA(%%r8, %5, %6, %7) + "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ + "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ + "packuswb %%mm7, %%mm1 \n\t" + WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) + :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest), + "a" (&c->redDither), + "r" (abuf0), "r" (abuf1) + : "%r8" + ); +#else + c->u_temp=(intptr_t)abuf0; + c->v_temp=(intptr_t)abuf1; + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB(%%REGBP, %5) + "push %0 \n\t" + "push %1 \n\t" + "mov "U_TEMP"(%5), %0 \n\t" + "mov "V_TEMP"(%5), %1 \n\t" + YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1) + "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ + "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ + "packuswb %%mm7, %%mm1 \n\t" + "pop %1 \n\t" + "pop %0 \n\t" + WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), + "a" (&c->redDither) + ); +#endif + } else { + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB(%%REGBP, %5) + "pcmpeqd %%mm7, %%mm7 \n\t" + WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), + "a" (&c->redDither) + ); + } +} + +static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2], + const int16_t *ubuf[2], const int16_t *vbuf[2], + const int16_t *abuf[2], uint8_t *dest, + int dstW, int yalpha, int uvalpha, int y) +{ + const int16_t *buf0 = buf[0], *buf1 = buf[1], + *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; + + //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB(%%REGBP, %5) + "pxor %%mm7, %%mm7 \n\t" + WRITEBGR24(%%REGb, 8280(%5), %%REGBP) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), + "a" (&c->redDither) + ); +} + +static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2], + const int16_t *ubuf[2], const int16_t *vbuf[2], + const int16_t *abuf[2], uint8_t *dest, + int dstW, int yalpha, int uvalpha, int y) +{ + const int16_t *buf0 = buf[0], *buf1 = buf[1], + *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; + + //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB(%%REGBP, %5) + "pxor %%mm7, %%mm7 \n\t" + /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 
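+    /* With DITHER1XBPP, add the per-channel ordered-dither bytes chosen per
+       output line in updateMMXDitherTables() (saturating byte adds) before
+       the pixels are packed down to 15-bit RGB by WRITERGB15. */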
+#ifdef DITHER1XBPP + "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" + "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" + "paddusb "RED_DITHER"(%5), %%mm5 \n\t" +#endif + WRITERGB15(%%REGb, 8280(%5), %%REGBP) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), + "a" (&c->redDither) + ); +} + +static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2], + const int16_t *ubuf[2], const int16_t *vbuf[2], + const int16_t *abuf[2], uint8_t *dest, + int dstW, int yalpha, int uvalpha, int y) +{ + const int16_t *buf0 = buf[0], *buf1 = buf[1], + *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; + + //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB(%%REGBP, %5) + "pxor %%mm7, %%mm7 \n\t" + /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ +#ifdef DITHER1XBPP + "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" + "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" + "paddusb "RED_DITHER"(%5), %%mm5 \n\t" +#endif + WRITERGB16(%%REGb, 8280(%5), %%REGBP) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), + "a" (&c->redDither) + ); +} + +#define REAL_YSCALEYUV2PACKED(index, c) \ + "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ + "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\ + "psraw $3, %%mm0 \n\t"\ + "psraw $3, %%mm1 \n\t"\ + "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\ + "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\ + "xor "#index", "#index" \n\t"\ + ".p2align 4 \n\t"\ + "1: \n\t"\ + "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ + "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ + "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ + "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ + "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ + "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ + "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ + "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ + "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ + "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ + "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ + "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ + "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ + "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ + "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ + "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ + "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ + "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ + "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ + "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ + "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ + "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ + "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ + "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ + "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ + "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ + "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ + +#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) + +static void RENAME(yuv2yuyv422_2)(SwsContext 
*c, const int16_t *buf[2], + const int16_t *ubuf[2], const int16_t *vbuf[2], + const int16_t *abuf[2], uint8_t *dest, + int dstW, int yalpha, int uvalpha, int y) +{ + const int16_t *buf0 = buf[0], *buf1 = buf[1], + *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; + + //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2PACKED(%%REGBP, %5) + WRITEYUY2(%%REGb, 8280(%5), %%REGBP) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), + "a" (&c->redDither) + ); +} + +#define REAL_YSCALEYUV2RGB1(index, c) \ + "xor "#index", "#index" \n\t"\ + ".p2align 4 \n\t"\ + "1: \n\t"\ + "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ + "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ + "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ + "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ + "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ + "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ + "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ + "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ + "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ + "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ + "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ + "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ + /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ + "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ + "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ + "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ + "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ + "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ + "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ + "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ + "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ + "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ + "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ + /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ + "paddw %%mm3, %%mm4 \n\t"\ + "movq %%mm2, %%mm0 \n\t"\ + "movq %%mm5, %%mm6 \n\t"\ + "movq %%mm4, %%mm3 \n\t"\ + "punpcklwd %%mm2, %%mm2 \n\t"\ + "punpcklwd %%mm5, %%mm5 \n\t"\ + "punpcklwd %%mm4, %%mm4 \n\t"\ + "paddw %%mm1, %%mm2 \n\t"\ + "paddw %%mm1, %%mm5 \n\t"\ + "paddw %%mm1, %%mm4 \n\t"\ + "punpckhwd %%mm0, %%mm0 \n\t"\ + "punpckhwd %%mm6, %%mm6 \n\t"\ + "punpckhwd %%mm3, %%mm3 \n\t"\ + "paddw %%mm7, %%mm0 \n\t"\ + "paddw %%mm7, %%mm6 \n\t"\ + "paddw %%mm7, %%mm3 \n\t"\ + /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ + "packuswb %%mm0, %%mm2 \n\t"\ + "packuswb %%mm6, %%mm5 \n\t"\ + "packuswb %%mm3, %%mm4 \n\t"\ + +#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) + +// do vertical chrominance interpolation +#define REAL_YSCALEYUV2RGB1b(index, c) \ + "xor "#index", "#index" \n\t"\ + ".p2align 4 \n\t"\ + "1: \n\t"\ + "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ + "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ + "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ + "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ + "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ + "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ + "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ + "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ + "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ + "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\ + "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ + "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ + "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ + "movq 
%%mm4, %%mm5 \n\t" /* (V-128)8*/\ + "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ + "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ + /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ + "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ + "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ + "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ + "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ + "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ + "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ + "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ + "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ + "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ + "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ + /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ + "paddw %%mm3, %%mm4 \n\t"\ + "movq %%mm2, %%mm0 \n\t"\ + "movq %%mm5, %%mm6 \n\t"\ + "movq %%mm4, %%mm3 \n\t"\ + "punpcklwd %%mm2, %%mm2 \n\t"\ + "punpcklwd %%mm5, %%mm5 \n\t"\ + "punpcklwd %%mm4, %%mm4 \n\t"\ + "paddw %%mm1, %%mm2 \n\t"\ + "paddw %%mm1, %%mm5 \n\t"\ + "paddw %%mm1, %%mm4 \n\t"\ + "punpckhwd %%mm0, %%mm0 \n\t"\ + "punpckhwd %%mm6, %%mm6 \n\t"\ + "punpckhwd %%mm3, %%mm3 \n\t"\ + "paddw %%mm7, %%mm0 \n\t"\ + "paddw %%mm7, %%mm6 \n\t"\ + "paddw %%mm7, %%mm3 \n\t"\ + /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ + "packuswb %%mm0, %%mm2 \n\t"\ + "packuswb %%mm6, %%mm5 \n\t"\ + "packuswb %%mm3, %%mm4 \n\t"\ + +#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) + +#define REAL_YSCALEYUV2RGB1_ALPHA(index) \ + "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\ + "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\ + "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\ + "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\ + "packuswb %%mm1, %%mm7 \n\t" +#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index) + +/** + * YV12 to RGB without scaling or interpolating + */ +static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0, + const int16_t *ubuf[2], const int16_t *vbuf[2], + const int16_t *abuf0, uint8_t *dest, + int dstW, int uvalpha, int y) +{ + const int16_t *ubuf0 = ubuf[0]; + const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 + + if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster + const int16_t *ubuf1 = ubuf[0]; + if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1(%%REGBP, %5) + YSCALEYUV2RGB1_ALPHA(%%REGBP) + WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), + "a" (&c->redDither) + ); + } else { + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1(%%REGBP, %5) + "pcmpeqd %%mm7, %%mm7 \n\t" + WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), + "a" (&c->redDither) + ); + } + } else { + const int16_t *ubuf1 = ubuf[1]; + if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1b(%%REGBP, %5) + YSCALEYUV2RGB1_ALPHA(%%REGBP) + WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, 
%%mm3, %%mm6) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), + "a" (&c->redDither) + ); + } else { + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1b(%%REGBP, %5) + "pcmpeqd %%mm7, %%mm7 \n\t" + WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), + "a" (&c->redDither) + ); + } + } +} + +static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0, + const int16_t *ubuf[2], const int16_t *vbuf[2], + const int16_t *abuf0, uint8_t *dest, + int dstW, int uvalpha, int y) +{ + const int16_t *ubuf0 = ubuf[0]; + const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 + + if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster + const int16_t *ubuf1 = ubuf[0]; + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1(%%REGBP, %5) + "pxor %%mm7, %%mm7 \n\t" + WRITEBGR24(%%REGb, 8280(%5), %%REGBP) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), + "a" (&c->redDither) + ); + } else { + const int16_t *ubuf1 = ubuf[1]; + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1b(%%REGBP, %5) + "pxor %%mm7, %%mm7 \n\t" + WRITEBGR24(%%REGb, 8280(%5), %%REGBP) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), + "a" (&c->redDither) + ); + } +} + +static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0, + const int16_t *ubuf[2], const int16_t *vbuf[2], + const int16_t *abuf0, uint8_t *dest, + int dstW, int uvalpha, int y) +{ + const int16_t *ubuf0 = ubuf[0]; + const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 + + if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster + const int16_t *ubuf1 = ubuf[0]; + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1(%%REGBP, %5) + "pxor %%mm7, %%mm7 \n\t" + /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ +#ifdef DITHER1XBPP + "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" + "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" + "paddusb "RED_DITHER"(%5), %%mm5 \n\t" +#endif + WRITERGB15(%%REGb, 8280(%5), %%REGBP) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), + "a" (&c->redDither) + ); + } else { + const int16_t *ubuf1 = ubuf[1]; + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1b(%%REGBP, %5) + "pxor %%mm7, %%mm7 \n\t" + /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ +#ifdef DITHER1XBPP + "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" + "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" + "paddusb "RED_DITHER"(%5), %%mm5 \n\t" +#endif + WRITERGB15(%%REGb, 8280(%5), %%REGBP) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), + "a" (&c->redDither) + ); + } +} + +static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0, + const int16_t 
*ubuf[2], const int16_t *vbuf[2], + const int16_t *abuf0, uint8_t *dest, + int dstW, int uvalpha, int y) +{ + const int16_t *ubuf0 = ubuf[0]; + const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 + + if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster + const int16_t *ubuf1 = ubuf[0]; + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1(%%REGBP, %5) + "pxor %%mm7, %%mm7 \n\t" + /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ +#ifdef DITHER1XBPP + "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" + "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" + "paddusb "RED_DITHER"(%5), %%mm5 \n\t" +#endif + WRITERGB16(%%REGb, 8280(%5), %%REGBP) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), + "a" (&c->redDither) + ); + } else { + const int16_t *ubuf1 = ubuf[1]; + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1b(%%REGBP, %5) + "pxor %%mm7, %%mm7 \n\t" + /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ +#ifdef DITHER1XBPP + "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" + "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" + "paddusb "RED_DITHER"(%5), %%mm5 \n\t" +#endif + WRITERGB16(%%REGb, 8280(%5), %%REGBP) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), + "a" (&c->redDither) + ); + } +} + +#define REAL_YSCALEYUV2PACKED1(index, c) \ + "xor "#index", "#index" \n\t"\ + ".p2align 4 \n\t"\ + "1: \n\t"\ + "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ + "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ + "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ + "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ + "psraw $7, %%mm3 \n\t" \ + "psraw $7, %%mm4 \n\t" \ + "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ + "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ + "psraw $7, %%mm1 \n\t" \ + "psraw $7, %%mm7 \n\t" \ + +#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) + +#define REAL_YSCALEYUV2PACKED1b(index, c) \ + "xor "#index", "#index" \n\t"\ + ".p2align 4 \n\t"\ + "1: \n\t"\ + "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ + "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ + "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ + "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ + "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ + "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ + "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ + "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ + "psrlw $8, %%mm3 \n\t" \ + "psrlw $8, %%mm4 \n\t" \ + "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ + "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ + "psraw $7, %%mm1 \n\t" \ + "psraw $7, %%mm7 \n\t" +#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) + +static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0, + const int16_t *ubuf[2], const int16_t *vbuf[2], + const int16_t *abuf0, uint8_t *dest, + int dstW, int uvalpha, int y) +{ + const int16_t *ubuf0 = ubuf[0]; + const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 + + if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster + const int16_t *ubuf1 = ubuf[0]; + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + 
YSCALEYUV2PACKED1(%%REGBP, %5) + WRITEYUY2(%%REGb, 8280(%5), %%REGBP) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), + "a" (&c->redDither) + ); + } else { + const int16_t *ubuf1 = ubuf[1]; + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2PACKED1b(%%REGBP, %5) + WRITEYUY2(%%REGb, 8280(%5), %%REGBP) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), + "a" (&c->redDither) + ); + } +} + +#if COMPILE_TEMPLATE_MMXEXT +static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst, + int dstWidth, const uint8_t *src, + int srcW, int xInc) +{ + int32_t *filterPos = c->hLumFilterPos; + int16_t *filter = c->hLumFilter; + void *mmxextFilterCode = c->lumMmxextFilterCode; + int i; +#if defined(PIC) + uint64_t ebxsave; +#endif +#if ARCH_X86_64 + uint64_t retsave; +#endif + + __asm__ volatile( +#if defined(PIC) + "mov %%"REG_b", %5 \n\t" +#if ARCH_X86_64 + "mov -8(%%rsp), %%"REG_a" \n\t" + "mov %%"REG_a", %6 \n\t" +#endif +#else +#if ARCH_X86_64 + "mov -8(%%rsp), %%"REG_a" \n\t" + "mov %%"REG_a", %5 \n\t" +#endif +#endif + "pxor %%mm7, %%mm7 \n\t" + "mov %0, %%"REG_c" \n\t" + "mov %1, %%"REG_D" \n\t" + "mov %2, %%"REG_d" \n\t" + "mov %3, %%"REG_b" \n\t" + "xor %%"REG_a", %%"REG_a" \n\t" // i + PREFETCH" (%%"REG_c") \n\t" + PREFETCH" 32(%%"REG_c") \n\t" + PREFETCH" 64(%%"REG_c") \n\t" + +#if ARCH_X86_64 +#define CALL_MMXEXT_FILTER_CODE \ + "movl (%%"REG_b"), %%esi \n\t"\ + "call *%4 \n\t"\ + "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ + "add %%"REG_S", %%"REG_c" \n\t"\ + "add %%"REG_a", %%"REG_D" \n\t"\ + "xor %%"REG_a", %%"REG_a" \n\t"\ + +#else +#define CALL_MMXEXT_FILTER_CODE \ + "movl (%%"REG_b"), %%esi \n\t"\ + "call *%4 \n\t"\ + "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ + "add %%"REG_a", %%"REG_D" \n\t"\ + "xor %%"REG_a", %%"REG_a" \n\t"\ + +#endif /* ARCH_X86_64 */ + + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + +#if defined(PIC) + "mov %5, %%"REG_b" \n\t" +#if ARCH_X86_64 + "mov %6, %%"REG_a" \n\t" + "mov %%"REG_a", -8(%%rsp) \n\t" +#endif +#else +#if ARCH_X86_64 + "mov %5, %%"REG_a" \n\t" + "mov %%"REG_a", -8(%%rsp) \n\t" +#endif +#endif + :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos), + "m" (mmxextFilterCode) +#if defined(PIC) + ,"m" (ebxsave) +#endif +#if ARCH_X86_64 + ,"m"(retsave) +#endif + : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D +#if !defined(PIC) + ,"%"REG_b +#endif + ); + + for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) + dst[i] = src[srcW-1]*128; +} + +static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2, + int dstWidth, const uint8_t *src1, + const uint8_t *src2, int srcW, int xInc) +{ + int32_t *filterPos = c->hChrFilterPos; + int16_t *filter = c->hChrFilter; + void *mmxextFilterCode = c->chrMmxextFilterCode; + int i; +#if defined(PIC) + DECLARE_ALIGNED(8, uint64_t, ebxsave); +#endif +#if ARCH_X86_64 + DECLARE_ALIGNED(8, uint64_t, retsave); +#endif + + __asm__ volatile( +#if defined(PIC) + "mov %%"REG_b", %7 \n\t" +#if ARCH_X86_64 + "mov -8(%%rsp), %%"REG_a" \n\t" + "mov %%"REG_a", %8 \n\t" +#endif +#else +#if ARCH_X86_64 + "mov -8(%%rsp), %%"REG_a" \n\t" + "mov %%"REG_a", %7 \n\t" +#endif +#endif + "pxor %%mm7, %%mm7 \n\t" + "mov %0, 
%%"REG_c" \n\t" + "mov %1, %%"REG_D" \n\t" + "mov %2, %%"REG_d" \n\t" + "mov %3, %%"REG_b" \n\t" + "xor %%"REG_a", %%"REG_a" \n\t" // i + PREFETCH" (%%"REG_c") \n\t" + PREFETCH" 32(%%"REG_c") \n\t" + PREFETCH" 64(%%"REG_c") \n\t" + + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + "xor %%"REG_a", %%"REG_a" \n\t" // i + "mov %5, %%"REG_c" \n\t" // src + "mov %6, %%"REG_D" \n\t" // buf2 + PREFETCH" (%%"REG_c") \n\t" + PREFETCH" 32(%%"REG_c") \n\t" + PREFETCH" 64(%%"REG_c") \n\t" + + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + +#if defined(PIC) + "mov %7, %%"REG_b" \n\t" +#if ARCH_X86_64 + "mov %8, %%"REG_a" \n\t" + "mov %%"REG_a", -8(%%rsp) \n\t" +#endif +#else +#if ARCH_X86_64 + "mov %7, %%"REG_a" \n\t" + "mov %%"REG_a", -8(%%rsp) \n\t" +#endif +#endif + :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos), + "m" (mmxextFilterCode), "m" (src2), "m"(dst2) +#if defined(PIC) + ,"m" (ebxsave) +#endif +#if ARCH_X86_64 + ,"m"(retsave) +#endif + : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D +#if !defined(PIC) + ,"%"REG_b +#endif + ); + + for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) { + dst1[i] = src1[srcW-1]*128; + dst2[i] = src2[srcW-1]*128; + } +} +#endif /* COMPILE_TEMPLATE_MMXEXT */ + +static av_cold void RENAME(sws_init_swScale)(SwsContext *c) +{ + enum AVPixelFormat dstFormat = c->dstFormat; + + c->use_mmx_vfilter= 0; + if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12 + && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) { + if (c->flags & SWS_ACCURATE_RND) { + if (!(c->flags & SWS_FULL_CHR_H_INT)) { + switch (c->dstFormat) { + case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break; + case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break; + case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break; + case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break; + case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break; + default: break; + } + } + } else { + c->use_mmx_vfilter= 1; + c->yuv2planeX = RENAME(yuv2yuvX ); + if (!(c->flags & SWS_FULL_CHR_H_INT)) { + switch (c->dstFormat) { + case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break; + case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break; + case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break; + case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break; + case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break; + default: break; + } + } + } + if (!(c->flags & SWS_FULL_CHR_H_INT)) { + switch (c->dstFormat) { + case AV_PIX_FMT_RGB32: + c->yuv2packed1 = RENAME(yuv2rgb32_1); + c->yuv2packed2 = RENAME(yuv2rgb32_2); + break; + case AV_PIX_FMT_BGR24: + c->yuv2packed1 = RENAME(yuv2bgr24_1); + c->yuv2packed2 = RENAME(yuv2bgr24_2); + break; + case AV_PIX_FMT_RGB555: + c->yuv2packed1 = RENAME(yuv2rgb555_1); + c->yuv2packed2 = RENAME(yuv2rgb555_2); + break; + case AV_PIX_FMT_RGB565: + c->yuv2packed1 = RENAME(yuv2rgb565_1); + c->yuv2packed2 = RENAME(yuv2rgb565_2); + break; + case AV_PIX_FMT_YUYV422: + c->yuv2packed1 = RENAME(yuv2yuyv422_1); + c->yuv2packed2 = RENAME(yuv2yuyv422_2); + break; + default: + break; + } + } + } + + if (c->srcBpc == 8 && c->dstBpc <= 14) { + // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one). 
+#if COMPILE_TEMPLATE_MMXEXT + if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) { + c->hyscale_fast = RENAME(hyscale_fast); + c->hcscale_fast = RENAME(hcscale_fast); + } else { +#endif /* COMPILE_TEMPLATE_MMXEXT */ + c->hyscale_fast = NULL; + c->hcscale_fast = NULL; +#if COMPILE_TEMPLATE_MMXEXT + } +#endif /* COMPILE_TEMPLATE_MMXEXT */ + } +} diff --git a/ffmpeg1/libswscale/x86/w64xmmtest.c b/ffmpeg1/libswscale/x86/w64xmmtest.c new file mode 100644 index 0000000..dd9a2a4 --- /dev/null +++ b/ffmpeg1/libswscale/x86/w64xmmtest.c @@ -0,0 +1,31 @@ +/* + * check XMM registers for clobbers on Win64 + * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86/w64xmmtest.h" +#include "libswscale/swscale.h" + +wrap(sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[], + const int srcStride[], int srcSliceY, int srcSliceH, + uint8_t *const dst[], const int dstStride[])) +{ + testxmmclobbers(sws_scale, c, srcSlice, srcStride, srcSliceY, + srcSliceH, dst, dstStride); +} diff --git a/ffmpeg1/libswscale/x86/yuv2rgb.c b/ffmpeg1/libswscale/x86/yuv2rgb.c new file mode 100644 index 0000000..3938e6b --- /dev/null +++ b/ffmpeg1/libswscale/x86/yuv2rgb.c @@ -0,0 +1,113 @@ +/* + * software YUV to RGB converter + * + * Copyright (C) 2009 Konstantin Shishkov + * + * MMX/MMXEXT template stuff (needed for fast movntq support), + * 1,4,8bpp support and context / deglobalize stuff + * by Michael Niedermayer (michaelni@gmx.at) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdio.h> +#include <stdlib.h> +#include <inttypes.h> +#include <assert.h> + +#include "config.h" +#include "libswscale/rgb2rgb.h" +#include "libswscale/swscale.h" +#include "libswscale/swscale_internal.h" +#include "libavutil/attributes.h" +#include "libavutil/x86/asm.h" +#include "libavutil/cpu.h" + +#if HAVE_INLINE_ASM + +#define DITHER1XBPP // only for MMX + +/* hope these constant values are cache line aligned */ +DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw) = 0x00ff00ff00ff00ffULL; +DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL; +DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL; +DECLARE_ASM_CONST(8, uint64_t, pb_e0) = 0xe0e0e0e0e0e0e0e0ULL; +DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL; +DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL; + +//MMX versions +#if HAVE_MMX_INLINE +#undef RENAME +#undef COMPILE_TEMPLATE_MMXEXT +#define COMPILE_TEMPLATE_MMXEXT 0 +#define RENAME(a) a ## _MMX +#include "yuv2rgb_template.c" +#endif /* HAVE_MMX_INLINE */ + +// MMXEXT versions +#if HAVE_MMXEXT_INLINE +#undef RENAME +#undef COMPILE_TEMPLATE_MMXEXT +#define COMPILE_TEMPLATE_MMXEXT 1 +#define RENAME(a) a ## _MMXEXT +#include "yuv2rgb_template.c" +#endif /* HAVE_MMXEXT_INLINE */ + +#endif /* HAVE_INLINE_ASM */ + +av_cold SwsFunc ff_yuv2rgb_init_mmx(SwsContext *c) +{ +#if HAVE_INLINE_ASM + int cpu_flags = av_get_cpu_flags(); + +#if HAVE_MMXEXT_INLINE + if (cpu_flags & AV_CPU_FLAG_MMXEXT) { + switch (c->dstFormat) { + case AV_PIX_FMT_RGB24: + return yuv420_rgb24_MMXEXT; + case AV_PIX_FMT_BGR24: + return yuv420_bgr24_MMXEXT; + } + } +#endif + + if (cpu_flags & AV_CPU_FLAG_MMX) { + switch (c->dstFormat) { + case AV_PIX_FMT_RGB32: + if (c->srcFormat == AV_PIX_FMT_YUVA420P) { +#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA + return yuva420_rgb32_MMX; +#endif + break; + } else return yuv420_rgb32_MMX; + case AV_PIX_FMT_BGR32: + if (c->srcFormat == AV_PIX_FMT_YUVA420P) { +#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA + return yuva420_bgr32_MMX; +#endif + break; + } else return yuv420_bgr32_MMX; + case AV_PIX_FMT_RGB24: return yuv420_rgb24_MMX; + case AV_PIX_FMT_BGR24: return yuv420_bgr24_MMX; + case AV_PIX_FMT_RGB565: return yuv420_rgb16_MMX; + case AV_PIX_FMT_RGB555: return yuv420_rgb15_MMX; + } + } +#endif /* HAVE_INLINE_ASM */ + + return NULL; +} diff --git a/ffmpeg1/libswscale/x86/yuv2rgb_template.c b/ffmpeg1/libswscale/x86/yuv2rgb_template.c new file mode 100644 index 0000000..c879102 --- /dev/null +++ b/ffmpeg1/libswscale/x86/yuv2rgb_template.c @@ -0,0 +1,451 @@ +/* + * software YUV to RGB converter + * + * Copyright (C) 2001-2007 Michael Niedermayer + * (c) 2010 Konstantin Shishkov + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#undef MOVNTQ +#undef EMMS +#undef SFENCE + +#if COMPILE_TEMPLATE_MMXEXT +#define MOVNTQ "movntq" +#define SFENCE "sfence" +#else +#define MOVNTQ "movq" +#define SFENCE " # nop" +#endif + +#define REG_BLUE "0" +#define REG_RED "1" +#define REG_GREEN "2" +#define REG_ALPHA "3" + +#define YUV2RGB_LOOP(depth) \ + h_size = (c->dstW + 7) & ~7; \ + if (h_size * depth > FFABS(dstStride[0])) \ + h_size -= 8; \ + \ + vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \ + \ + __asm__ volatile ("pxor %mm4, %mm4\n\t"); \ + for (y = 0; y < srcSliceH; y++) { \ + uint8_t *image = dst[0] + (y + srcSliceY) * dstStride[0]; \ + const uint8_t *py = src[0] + y * srcStride[0]; \ + const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \ + const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \ + x86_reg index = -h_size / 2; \ + +#define YUV2RGB_INITIAL_LOAD \ + __asm__ volatile ( \ + "movq (%5, %0, 2), %%mm6\n\t" \ + "movd (%2, %0), %%mm0\n\t" \ + "movd (%3, %0), %%mm1\n\t" \ + "1: \n\t" \ + +/* YUV2RGB core + * Conversion is performed in usual way: + * R = Y' * Ycoef + Vred * V' + * G = Y' * Ycoef + Vgreen * V' + Ugreen * U' + * B = Y' * Ycoef + Ublue * U' + * + * where X' = X * 8 - Xoffset (multiplication is performed to increase + * precision a bit). + * Since it operates in YUV420 colorspace, Y component is additionally + * split into Y1 and Y2 for even and odd pixels. + * + * Input: + * mm0 - U (4 elems), mm1 - V (4 elems), mm6 - Y (8 elems), mm4 - zero register + * Output: + * mm1 - R, mm2 - G, mm0 - B + */ +#define YUV2RGB \ + /* convert Y, U, V into Y1', Y2', U', V' */ \ + "movq %%mm6, %%mm7\n\t" \ + "punpcklbw %%mm4, %%mm0\n\t" \ + "punpcklbw %%mm4, %%mm1\n\t" \ + "pand "MANGLE(mmx_00ffw)", %%mm6\n\t" \ + "psrlw $8, %%mm7\n\t" \ + "psllw $3, %%mm0\n\t" \ + "psllw $3, %%mm1\n\t" \ + "psllw $3, %%mm6\n\t" \ + "psllw $3, %%mm7\n\t" \ + "psubsw "U_OFFSET"(%4), %%mm0\n\t" \ + "psubsw "V_OFFSET"(%4), %%mm1\n\t" \ + "psubw "Y_OFFSET"(%4), %%mm6\n\t" \ + "psubw "Y_OFFSET"(%4), %%mm7\n\t" \ +\ + /* multiply by coefficients */ \ + "movq %%mm0, %%mm2\n\t" \ + "movq %%mm1, %%mm3\n\t" \ + "pmulhw "UG_COEFF"(%4), %%mm2\n\t" \ + "pmulhw "VG_COEFF"(%4), %%mm3\n\t" \ + "pmulhw "Y_COEFF" (%4), %%mm6\n\t" \ + "pmulhw "Y_COEFF" (%4), %%mm7\n\t" \ + "pmulhw "UB_COEFF"(%4), %%mm0\n\t" \ + "pmulhw "VR_COEFF"(%4), %%mm1\n\t" \ + "paddsw %%mm3, %%mm2\n\t" \ + /* now: mm0 = UB, mm1 = VR, mm2 = CG */ \ + /* mm6 = Y1, mm7 = Y2 */ \ +\ + /* produce RGB */ \ + "movq %%mm7, %%mm3\n\t" \ + "movq %%mm7, %%mm5\n\t" \ + "paddsw %%mm0, %%mm3\n\t" \ + "paddsw %%mm1, %%mm5\n\t" \ + "paddsw %%mm2, %%mm7\n\t" \ + "paddsw %%mm6, %%mm0\n\t" \ + "paddsw %%mm6, %%mm1\n\t" \ + "paddsw %%mm6, %%mm2\n\t" \ + +#define RGB_PACK_INTERLEAVE \ + /* pack and interleave even/odd pixels */ \ + "packuswb %%mm1, %%mm0\n\t" \ + "packuswb %%mm5, %%mm3\n\t" \ + "packuswb %%mm2, %%mm2\n\t" \ + "movq %%mm0, %%mm1\n\n" \ + "packuswb %%mm7, %%mm7\n\t" \ + "punpcklbw %%mm3, %%mm0\n\t" \ + "punpckhbw %%mm3, %%mm1\n\t" \ + "punpcklbw %%mm7, %%mm2\n\t" \ + +#define YUV2RGB_ENDLOOP(depth) \ + "movq 8 (%5, %0, 2), %%mm6\n\t" \ + "movd 4 (%3, %0), %%mm1\n\t" \ + "movd 4 (%2, %0), %%mm0\n\t" \ + "add $"AV_STRINGIFY(depth * 8)", %1\n\t" \ + "add $4, %0\n\t" \ + "js 1b\n\t" \ + +#define YUV2RGB_OPERANDS \ + : "+r" (index), "+r" (image) \ + : "r" 
(pu - index), "r" (pv - index), "r"(&c->redDither), \ + "r" (py - 2*index) \ + : "memory" \ + ); \ + } \ + +#define YUV2RGB_OPERANDS_ALPHA \ + : "+r" (index), "+r" (image) \ + : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \ + "r" (py - 2*index), "r" (pa - 2*index) \ + : "memory" \ + ); \ + } \ + +#define YUV2RGB_ENDFUNC \ + __asm__ volatile (SFENCE"\n\t" \ + "emms \n\t"); \ + return srcSliceH; \ + +#define IF0(x) +#define IF1(x) x + +#define RGB_PACK16(gmask, is15) \ + "pand "MANGLE(mmx_redmask)", %%mm0\n\t" \ + "pand "MANGLE(mmx_redmask)", %%mm1\n\t" \ + "movq %%mm2, %%mm3\n\t" \ + "psllw $"AV_STRINGIFY(3-is15)", %%mm2\n\t" \ + "psrlw $"AV_STRINGIFY(5+is15)", %%mm3\n\t" \ + "psrlw $3, %%mm0\n\t" \ + IF##is15("psrlw $1, %%mm1\n\t") \ + "pand "MANGLE(pb_e0)", %%mm2\n\t" \ + "pand "MANGLE(gmask)", %%mm3\n\t" \ + "por %%mm2, %%mm0\n\t" \ + "por %%mm3, %%mm1\n\t" \ + "movq %%mm0, %%mm2\n\t" \ + "punpcklbw %%mm1, %%mm0\n\t" \ + "punpckhbw %%mm1, %%mm2\n\t" \ + MOVNTQ " %%mm0, (%1)\n\t" \ + MOVNTQ " %%mm2, 8(%1)\n\t" \ + +#define DITHER_RGB \ + "paddusb "BLUE_DITHER"(%4), %%mm0\n\t" \ + "paddusb "GREEN_DITHER"(%4), %%mm2\n\t" \ + "paddusb "RED_DITHER"(%4), %%mm1\n\t" \ + +#if !COMPILE_TEMPLATE_MMXEXT +static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[], + int srcStride[], + int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) +{ + int y, h_size, vshift; + + YUV2RGB_LOOP(2) + +#ifdef DITHER1XBPP + c->blueDither = ff_dither8[y & 1]; + c->greenDither = ff_dither8[y & 1]; + c->redDither = ff_dither8[(y + 1) & 1]; +#endif + + YUV2RGB_INITIAL_LOAD + YUV2RGB + RGB_PACK_INTERLEAVE +#ifdef DITHER1XBPP + DITHER_RGB +#endif + RGB_PACK16(pb_03, 1) + + YUV2RGB_ENDLOOP(2) + YUV2RGB_OPERANDS + YUV2RGB_ENDFUNC +} + +static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[], + int srcStride[], + int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) +{ + int y, h_size, vshift; + + YUV2RGB_LOOP(2) + +#ifdef DITHER1XBPP + c->blueDither = ff_dither8[y & 1]; + c->greenDither = ff_dither4[y & 1]; + c->redDither = ff_dither8[(y + 1) & 1]; +#endif + + YUV2RGB_INITIAL_LOAD + YUV2RGB + RGB_PACK_INTERLEAVE +#ifdef DITHER1XBPP + DITHER_RGB +#endif + RGB_PACK16(pb_07, 0) + + YUV2RGB_ENDLOOP(2) + YUV2RGB_OPERANDS + YUV2RGB_ENDFUNC +} +#endif /* !COMPILE_TEMPLATE_MMXEXT */ + +#define RGB_PACK24(blue, red)\ + "packuswb %%mm3, %%mm0 \n" /* R0 R2 R4 R6 R1 R3 R5 R7 */\ + "packuswb %%mm5, %%mm1 \n" /* B0 B2 B4 B6 B1 B3 B5 B7 */\ + "packuswb %%mm7, %%mm2 \n" /* G0 G2 G4 G6 G1 G3 G5 G7 */\ + "movq %%mm"red", %%mm3 \n"\ + "movq %%mm"blue", %%mm6 \n"\ + "psrlq $32, %%mm"red" \n" /* R1 R3 R5 R7 */\ + "punpcklbw %%mm2, %%mm3 \n" /* R0 G0 R2 G2 R4 G4 R6 G6 */\ + "punpcklbw %%mm"red", %%mm6 \n" /* B0 R1 B2 R3 B4 R5 B6 R7 */\ + "movq %%mm3, %%mm5 \n"\ + "punpckhbw %%mm"blue", %%mm2 \n" /* G1 B1 G3 B3 G5 B5 G7 B7 */\ + "punpcklwd %%mm6, %%mm3 \n" /* R0 G0 B0 R1 R2 G2 B2 R3 */\ + "punpckhwd %%mm6, %%mm5 \n" /* R4 G4 B4 R5 R6 G6 B6 R7 */\ + RGB_PACK24_B + +#if COMPILE_TEMPLATE_MMXEXT +DECLARE_ASM_CONST(8, int16_t, mask1101[4]) = {-1,-1, 0,-1}; +DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0}; +DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0}; +DECLARE_ASM_CONST(8, int16_t, mask1001[4]) = {-1, 0, 0,-1}; +DECLARE_ASM_CONST(8, int16_t, mask0100[4]) = { 0,-1, 0, 0}; +#undef RGB_PACK24_B +#define RGB_PACK24_B\ + "pshufw $0xc6, %%mm2, %%mm1 \n"\ + "pshufw $0x84, %%mm3, %%mm6 \n"\ + "pshufw $0x38, %%mm5, %%mm7 \n"\ + "pand "MANGLE(mask1101)", %%mm6 \n" /* 
R0 G0 B0 R1 -- -- R2 G2 */\ + "movq %%mm1, %%mm0 \n"\ + "pand "MANGLE(mask0110)", %%mm7 \n" /* -- -- R6 G6 B6 R7 -- -- */\ + "movq %%mm1, %%mm2 \n"\ + "pand "MANGLE(mask0100)", %%mm1 \n" /* -- -- G3 B3 -- -- -- -- */\ + "psrlq $48, %%mm3 \n" /* B2 R3 -- -- -- -- -- -- */\ + "pand "MANGLE(mask0010)", %%mm0 \n" /* -- -- -- -- G1 B1 -- -- */\ + "psllq $32, %%mm5 \n" /* -- -- -- -- R4 G4 B4 R5 */\ + "pand "MANGLE(mask1001)", %%mm2 \n" /* G5 B5 -- -- -- -- G7 B7 */\ + "por %%mm3, %%mm1 \n"\ + "por %%mm6, %%mm0 \n"\ + "por %%mm5, %%mm1 \n"\ + "por %%mm7, %%mm2 \n"\ + MOVNTQ" %%mm0, (%1) \n"\ + MOVNTQ" %%mm1, 8(%1) \n"\ + MOVNTQ" %%mm2, 16(%1) \n"\ + +#else +#undef RGB_PACK24_B +#define RGB_PACK24_B\ + "movd %%mm3, (%1) \n" /* R0 G0 B0 R1 */\ + "movd %%mm2, 4(%1) \n" /* G1 B1 */\ + "psrlq $32, %%mm3 \n"\ + "psrlq $16, %%mm2 \n"\ + "movd %%mm3, 6(%1) \n" /* R2 G2 B2 R3 */\ + "movd %%mm2, 10(%1) \n" /* G3 B3 */\ + "psrlq $16, %%mm2 \n"\ + "movd %%mm5, 12(%1) \n" /* R4 G4 B4 R5 */\ + "movd %%mm2, 16(%1) \n" /* G5 B5 */\ + "psrlq $32, %%mm5 \n"\ + "movd %%mm2, 20(%1) \n" /* -- -- G7 B7 */\ + "movd %%mm5, 18(%1) \n" /* R6 G6 B6 R7 */\ + +#endif + +static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[], + int srcStride[], + int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) +{ + int y, h_size, vshift; + + YUV2RGB_LOOP(3) + + YUV2RGB_INITIAL_LOAD + YUV2RGB + RGB_PACK24(REG_BLUE, REG_RED) + + YUV2RGB_ENDLOOP(3) + YUV2RGB_OPERANDS + YUV2RGB_ENDFUNC +} + +static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[], + int srcStride[], + int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) +{ + int y, h_size, vshift; + + YUV2RGB_LOOP(3) + + YUV2RGB_INITIAL_LOAD + YUV2RGB + RGB_PACK24(REG_RED, REG_BLUE) + + YUV2RGB_ENDLOOP(3) + YUV2RGB_OPERANDS + YUV2RGB_ENDFUNC +} + + +#define SET_EMPTY_ALPHA \ + "pcmpeqd %%mm"REG_ALPHA", %%mm"REG_ALPHA"\n\t" /* set alpha to 0xFF */ \ + +#define LOAD_ALPHA \ + "movq (%6, %0, 2), %%mm"REG_ALPHA"\n\t" \ + +#define RGB_PACK32(red, green, blue, alpha) \ + "movq %%mm"blue", %%mm5\n\t" \ + "movq %%mm"red", %%mm6\n\t" \ + "punpckhbw %%mm"green", %%mm5\n\t" \ + "punpcklbw %%mm"green", %%mm"blue"\n\t" \ + "punpckhbw %%mm"alpha", %%mm6\n\t" \ + "punpcklbw %%mm"alpha", %%mm"red"\n\t" \ + "movq %%mm"blue", %%mm"green"\n\t" \ + "movq %%mm5, %%mm"alpha"\n\t" \ + "punpcklwd %%mm"red", %%mm"blue"\n\t" \ + "punpckhwd %%mm"red", %%mm"green"\n\t" \ + "punpcklwd %%mm6, %%mm5\n\t" \ + "punpckhwd %%mm6, %%mm"alpha"\n\t" \ + MOVNTQ " %%mm"blue", 0(%1)\n\t" \ + MOVNTQ " %%mm"green", 8(%1)\n\t" \ + MOVNTQ " %%mm5, 16(%1)\n\t" \ + MOVNTQ " %%mm"alpha", 24(%1)\n\t" \ + +#if !COMPILE_TEMPLATE_MMXEXT +static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[], + int srcStride[], + int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) +{ + int y, h_size, vshift; + + YUV2RGB_LOOP(4) + + YUV2RGB_INITIAL_LOAD + YUV2RGB + RGB_PACK_INTERLEAVE + SET_EMPTY_ALPHA + RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA) + + YUV2RGB_ENDLOOP(4) + YUV2RGB_OPERANDS + YUV2RGB_ENDFUNC +} + +#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA +static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[], + int srcStride[], + int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) +{ + int y, h_size, vshift; + + YUV2RGB_LOOP(4) + + const uint8_t *pa = src[3] + y * srcStride[3]; + YUV2RGB_INITIAL_LOAD + YUV2RGB + RGB_PACK_INTERLEAVE + LOAD_ALPHA + RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA) + + YUV2RGB_ENDLOOP(4) + 
YUV2RGB_OPERANDS_ALPHA
+ YUV2RGB_ENDFUNC
+}
+#endif
+
+static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
+ int srcStride[],
+ int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ int y, h_size, vshift;
+
+ YUV2RGB_LOOP(4)
+
+ YUV2RGB_INITIAL_LOAD
+ YUV2RGB
+ RGB_PACK_INTERLEAVE
+ SET_EMPTY_ALPHA
+ RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
+
+ YUV2RGB_ENDLOOP(4)
+ YUV2RGB_OPERANDS
+ YUV2RGB_ENDFUNC
+}
+
+#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
+static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
+ int srcStride[],
+ int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ int y, h_size, vshift;
+
+ YUV2RGB_LOOP(4)
+
+ const uint8_t *pa = src[3] + y * srcStride[3];
+ YUV2RGB_INITIAL_LOAD
+ YUV2RGB
+ RGB_PACK_INTERLEAVE
+ LOAD_ALPHA
+ RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
+
+ YUV2RGB_ENDLOOP(4)
+ YUV2RGB_OPERANDS_ALPHA
+ YUV2RGB_ENDFUNC
+}
+#endif
+
+#endif /* !COMPILE_TEMPLATE_MMXEXT */
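
The YUV2RGB macro block in yuv2rgb_template.c above documents the conversion as R = Y'*Ycoef + Vred*V', G = Y'*Ycoef + Vgreen*V' + Ugreen*U', B = Y'*Ycoef + Ublue*U'. The scalar sketch below illustrates the same per-pixel arithmetic outside of MMX; it assumes common BT.601 integer coefficients in 8.8 fixed point (298, 409, 208, 100, 516) rather than the coefficient tables libswscale actually keeps in SwsContext, and the helper names are illustrative only.

#include <stdint.h>

/* Clamp an intermediate result to the 0..255 output range,
 * the scalar counterpart of the packuswb saturation used above. */
static uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Convert one Y/U/V sample triple to 8-bit R/G/B.
 * Assumed BT.601 coefficients, 8.8 fixed point, +128 for rounding. */
static void yuv2rgb_pixel(uint8_t y, uint8_t u, uint8_t v,
                          uint8_t *r, uint8_t *g, uint8_t *b)
{
    int luma = (y - 16) * 298;   /* Y' * Ycoef   */
    int cb   = u - 128;          /* U' = U - 128 */
    int cr   = v - 128;          /* V' = V - 128 */

    *r = clip_u8((luma            + 409 * cr + 128) >> 8);
    *g = clip_u8((luma - 100 * cb - 208 * cr + 128) >> 8);
    *b = clip_u8((luma + 516 * cb            + 128) >> 8);
}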
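
The WRITERGB15/WRITERGB16 stores and the RGB_PACK16 macro further reduce those 8-bit components to 5-5-5 or 5-6-5 before writing, which is what the mmx_redmask (0xf8) and mmx_grnmask (0xfc) constants and the shift counts above are for. A minimal scalar sketch of that packing follows; the helper name and the plain truncating (non-dithered) rounding are assumptions for illustration.

#include <stdint.h>

/* Pack 8-bit R/G/B into one RGB565 word, or RGB555 when is15 is non-zero,
 * mirroring the 0xf8/0xfc masking and shifting done by RGB_PACK16. */
static uint16_t pack_rgb16(uint8_t r, uint8_t g, uint8_t b, int is15)
{
    if (is15)
        return (uint16_t)(((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3));
    return (uint16_t)(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
}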
