Diffstat (limited to 'ffmpeg1/libswscale/x86')
-rw-r--r--  ffmpeg1/libswscale/x86/Makefile            |   11
-rw-r--r--  ffmpeg1/libswscale/x86/input.asm           |  670
-rw-r--r--  ffmpeg1/libswscale/x86/output.asm          |  413
-rw-r--r--  ffmpeg1/libswscale/x86/rgb2rgb.c           |  149
-rw-r--r--  ffmpeg1/libswscale/x86/rgb2rgb_template.c  | 2498
-rw-r--r--  ffmpeg1/libswscale/x86/scale.asm           |  431
-rw-r--r--  ffmpeg1/libswscale/x86/swscale.c           |  585
-rw-r--r--  ffmpeg1/libswscale/x86/swscale_template.c  | 1717
-rw-r--r--  ffmpeg1/libswscale/x86/w64xmmtest.c        |   31
-rw-r--r--  ffmpeg1/libswscale/x86/yuv2rgb.c           |  113
-rw-r--r--  ffmpeg1/libswscale/x86/yuv2rgb_template.c  |  451
11 files changed, 0 insertions, 7069 deletions
diff --git a/ffmpeg1/libswscale/x86/Makefile b/ffmpeg1/libswscale/x86/Makefile
deleted file mode 100644
index 7d219b4..0000000
--- a/ffmpeg1/libswscale/x86/Makefile
+++ /dev/null
@@ -1,11 +0,0 @@
-$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
-
-OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
-
-MMX-OBJS += x86/rgb2rgb.o \
- x86/swscale.o \
- x86/yuv2rgb.o \
-
-YASM-OBJS += x86/input.o \
- x86/output.o \
- x86/scale.o \
diff --git a/ffmpeg1/libswscale/x86/input.asm b/ffmpeg1/libswscale/x86/input.asm
deleted file mode 100644
index 9d5a871..0000000
--- a/ffmpeg1/libswscale/x86/input.asm
+++ /dev/null
@@ -1,670 +0,0 @@
-;******************************************************************************
-;* x86-optimized input routines; does shuffling of packed
-;* YUV formats into individual planes, and converts RGB
-;* into YUV planes also.
-;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
-;*
-;* This file is part of Libav.
-;*
-;* Libav is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* Libav is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-%define RY 0x20DE
-%define GY 0x4087
-%define BY 0x0C88
-%define RU 0xECFF
-%define GU 0xDAC8
-%define BU 0x3838
-%define RV 0x3838
-%define GV 0xD0E3
-%define BV 0xF6E4
-
-rgb_Yrnd: times 4 dd 0x80100 ; (16 << 15) + (1 << 8): +16 offset, rounding for the >> 9
-rgb_UVrnd: times 4 dd 0x400100 ; (128 << 15) + (1 << 8): +128 offset, rounding for the >> 9
-bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY
-bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY
-rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
-rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY
-bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU
-bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU
-rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU
-rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU
-bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV
-bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
-rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV
-rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV
-
-rgba_Ycoeff_rb: times 4 dw RY, BY
-rgba_Ycoeff_br: times 4 dw BY, RY
-rgba_Ycoeff_ga: times 4 dw GY, 0
-rgba_Ycoeff_ag: times 4 dw 0, GY
-rgba_Ucoeff_rb: times 4 dw RU, BU
-rgba_Ucoeff_br: times 4 dw BU, RU
-rgba_Ucoeff_ga: times 4 dw GU, 0
-rgba_Ucoeff_ag: times 4 dw 0, GU
-rgba_Vcoeff_rb: times 4 dw RV, BV
-rgba_Vcoeff_br: times 4 dw BV, RV
-rgba_Vcoeff_ga: times 4 dw GV, 0
-rgba_Vcoeff_ag: times 4 dw 0, GV
-
-shuf_rgb_12x4: db 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, \
- 6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80
-shuf_rgb_3x56: db 2, 0x80, 3, 0x80, 4, 0x80, 5, 0x80, \
- 8, 0x80, 9, 0x80, 10, 0x80, 11, 0x80
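For reference, a C model of the pshufb masks above (an illustrative sketch, not part of the deleted file): mask bytes with the high bit (0x80) set produce zero, so a single pshufb both gathers the packed RGB24 bytes and zero-extends them into 16-bit lanes.

#include <stdint.h>

/* C model of pshufb with shuf_rgb_12x4: indices with bit 7 set yield 0,
 * so one shuffle gathers bytes and zero-extends them to words. */
static void shuf_rgb_12x4_ref(uint8_t dst[16], const uint8_t src[16])
{
    static const uint8_t mask[16] = { 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80,
                                      6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80 };
    for (int i = 0; i < 16; i++)
        dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i]];
    /* on little-endian, dst is now (word) { B0, G0, R0, B1, B2, G2, R2, B3 } */
}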
-
-SECTION .text
-
-;-----------------------------------------------------------------------------
-; RGB to Y/UV.
-;
-; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
-; and
-; void <fmt>ToUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
-; const uint8_t *unused, int w);
-;-----------------------------------------------------------------------------
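As a scalar reference for the ToY loop below, the per-pixel arithmetic reduces to the following sketch (illustrative only, using the Q15 coefficients and rgb_Yrnd constant from the RODATA section; not code from the deleted file):

#include <stdint.h>

/* Scalar model of bgr24ToY: the pmaddwd/paddd/psrad 9 sequence per pixel. */
static void bgr24_to_y_ref(int16_t *dst, const uint8_t *src, int w)
{
    const int BY = 0x0C88, GY = 0x4087, RY = 0x20DE; /* Q15 coefficients */
    const int rnd = 0x80100;                         /* rgb_Yrnd */
    for (int i = 0; i < w; i++) {
        int b = src[3 * i + 0], g = src[3 * i + 1], r = src[3 * i + 2];
        dst[i] = (BY * b + GY * g + RY * r + rnd) >> 9; /* word store, as in the asm */
    }
}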
-
-; %1 = nr. of XMM registers
-; %2 = rgb or bgr
-%macro RGB24_TO_Y_FN 2-3
-cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, u3
-%if mmsize == 8
- mova m5, [%2_Ycoeff_12x4]
- mova m6, [%2_Ycoeff_3x56]
-%define coeff1 m5
-%define coeff2 m6
-%elif ARCH_X86_64
- mova m8, [%2_Ycoeff_12x4]
- mova m9, [%2_Ycoeff_3x56]
-%define coeff1 m8
-%define coeff2 m9
-%else ; x86-32 && mmsize == 16
-%define coeff1 [%2_Ycoeff_12x4]
-%define coeff2 [%2_Ycoeff_3x56]
-%endif ; x86-32/64 && mmsize == 8/16
-%if (ARCH_X86_64 || mmsize == 8) && %0 == 3
- jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToY %+ SUFFIX).body
-%else ; (ARCH_X86_64 || mmsize == 8) && %0 == 3
-.body:
-%if cpuflag(ssse3)
- mova m7, [shuf_rgb_12x4]
-%define shuf_rgb1 m7
-%if ARCH_X86_64
- mova m10, [shuf_rgb_3x56]
-%define shuf_rgb2 m10
-%else ; x86-32
-%define shuf_rgb2 [shuf_rgb_3x56]
-%endif ; x86-32/64
-%endif ; cpuflag(ssse3)
-%if ARCH_X86_64
- movsxd wq, wd
-%endif
- add wq, wq
- add dstq, wq
- neg wq
-%if notcpuflag(ssse3)
- pxor m7, m7
-%endif ; !cpuflag(ssse3)
- mova m4, [rgb_Yrnd]
-.loop:
-%if cpuflag(ssse3)
- movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3]
- movu m2, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7]
- pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
- pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
- pshufb m3, m2, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
- pshufb m2, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
-%else ; !cpuflag(ssse3)
- movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 }
- movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
- movd m2, [srcq+6] ; (byte) { B2, G2, R2, B3 }
- movd m3, [srcq+8] ; (byte) { R2, B3, G3, R3 }
-%if mmsize == 16 ; i.e. sse2
- punpckldq m0, m2 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
- punpckldq m1, m3 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
- movd m2, [srcq+12] ; (byte) { B4, G4, R4, B5 }
- movd m3, [srcq+14] ; (byte) { R4, B5, G5, R5 }
- movd m5, [srcq+18] ; (byte) { B6, G6, R6, B7 }
- movd m6, [srcq+20] ; (byte) { R6, B7, G7, R7 }
- punpckldq m2, m5 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
- punpckldq m3, m6 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
-%endif ; mmsize == 16
- punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
- punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
- punpcklbw m2, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
- punpcklbw m3, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
-%endif ; cpuflag(ssse3)
- add srcq, 3 * mmsize / 2
- pmaddwd m0, coeff1 ; (dword) { B0*BY + G0*GY, B1*BY, B2*BY + G2*GY, B3*BY }
- pmaddwd m1, coeff2 ; (dword) { R0*RY, G1*GY + R1*RY, R2*RY, G3*GY + R3*RY }
- pmaddwd m2, coeff1 ; (dword) { B4*BY + G4*GY, B5*BY, B6*BY + G6*GY, B7*BY }
- pmaddwd m3, coeff2 ; (dword) { R4*RY, G5*GY + R5*RY, R6*RY, G7*GY + R7*RY }
- paddd m0, m1 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[0-3]
- paddd m2, m3 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7]
- paddd m0, m4 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] }
- paddd m2, m4 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] }
- psrad m0, 9
- psrad m2, 9
- packssdw m0, m2 ; (word) { Y[0-7] }
- mova [dstq+wq], m0
- add wq, mmsize
- jl .loop
- REP_RET
-%endif ; (ARCH_X86_64 || mmsize == 8) && %0 == 3
-%endmacro
-
-; %1 = nr. of XMM registers
-; %2 = rgb or bgr
-%macro RGB24_TO_UV_FN 2-3
-cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3
-%if ARCH_X86_64
- mova m8, [%2_Ucoeff_12x4]
- mova m9, [%2_Ucoeff_3x56]
- mova m10, [%2_Vcoeff_12x4]
- mova m11, [%2_Vcoeff_3x56]
-%define coeffU1 m8
-%define coeffU2 m9
-%define coeffV1 m10
-%define coeffV2 m11
-%else ; x86-32
-%define coeffU1 [%2_Ucoeff_12x4]
-%define coeffU2 [%2_Ucoeff_3x56]
-%define coeffV1 [%2_Vcoeff_12x4]
-%define coeffV2 [%2_Vcoeff_3x56]
-%endif ; x86-32/64
-%if ARCH_X86_64 && %0 == 3
- jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToUV %+ SUFFIX).body
-%else ; ARCH_X86_64 && %0 == 3
-.body:
-%if cpuflag(ssse3)
- mova m7, [shuf_rgb_12x4]
-%define shuf_rgb1 m7
-%if ARCH_X86_64
- mova m12, [shuf_rgb_3x56]
-%define shuf_rgb2 m12
-%else ; x86-32
-%define shuf_rgb2 [shuf_rgb_3x56]
-%endif ; x86-32/64
-%endif ; cpuflag(ssse3)
-%if ARCH_X86_64
- movsxd wq, dword r5m
-%else ; x86-32
- mov wq, r5m
-%endif
- add wq, wq
- add dstUq, wq
- add dstVq, wq
- neg wq
- mova m6, [rgb_UVrnd]
-%if notcpuflag(ssse3)
- pxor m7, m7
-%endif
-.loop:
-%if cpuflag(ssse3)
- movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3]
- movu m4, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7]
- pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
- pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
-%else ; !cpuflag(ssse3)
- movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 }
- movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
- movd m4, [srcq+6] ; (byte) { B2, G2, R2, B3 }
- movd m5, [srcq+8] ; (byte) { R2, B3, G3, R3 }
-%if mmsize == 16
- punpckldq m0, m4 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
- punpckldq m1, m5 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
- movd m4, [srcq+12] ; (byte) { B4, G4, R4, B5 }
- movd m5, [srcq+14] ; (byte) { R4, B5, G5, R5 }
-%endif ; mmsize == 16
- punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
- punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
-%endif ; cpuflag(ssse3)
- pmaddwd m2, m0, coeffV1 ; (dword) { B0*BV + G0*GV, B1*BV, B2*BV + G2*GV, B3*BV }
- pmaddwd m3, m1, coeffV2 ; (dword) { R0*RV, G1*GV + R1*RV, R2*RV, G3*GV + R3*RV }
- pmaddwd m0, coeffU1 ; (dword) { B0*BU + G0*GU, B1*BU, B2*BU + G2*GU, B3*BU }
- pmaddwd m1, coeffU2 ; (dword) { R0*RU, G1*GU + R1*RU, R2*RU, G3*GU + R3*RU }
- paddd m0, m1 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[0-3]
- paddd m2, m3 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[0-3]
-%if cpuflag(ssse3)
- pshufb m5, m4, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
- pshufb m4, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
-%else ; !cpuflag(ssse3)
-%if mmsize == 16
- movd m1, [srcq+18] ; (byte) { B6, G6, R6, B7 }
- movd m3, [srcq+20] ; (byte) { R6, B7, G7, R7 }
- punpckldq m4, m1 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
- punpckldq m5, m3 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
-%endif ; mmsize == 16 && !cpuflag(ssse3)
- punpcklbw m4, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
- punpcklbw m5, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
-%endif ; cpuflag(ssse3)
- add srcq, 3 * mmsize / 2
- pmaddwd m1, m4, coeffU1 ; (dword) { B4*BU + G4*GU, B5*BU, B6*BU + G6*GU, B7*BU }
- pmaddwd m3, m5, coeffU2 ; (dword) { R4*RU, G5*GU + R5*RU, R6*RU, G7*GU + R7*RU }
- pmaddwd m4, coeffV1 ; (dword) { B4*BV + G4*GV, B5*BV, B6*BV + G6*GV, B7*BV }
- pmaddwd m5, coeffV2 ; (dword) { R4*RV, G5*GV + R5*RV, R6*RV, G7*GV + R7*RV }
- paddd m1, m3 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[4-7]
- paddd m4, m5 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[4-7]
- paddd m0, m6 ; += rgb_UVrnd, i.e. (dword) { U[0-3] }
- paddd m2, m6 ; += rgb_UVrnd, i.e. (dword) { V[0-3] }
- paddd m1, m6 ; += rgb_UVrnd, i.e. (dword) { U[4-7] }
- paddd m4, m6 ; += rgb_UVrnd, i.e. (dword) { V[4-7] }
- psrad m0, 9
- psrad m2, 9
- psrad m1, 9
- psrad m4, 9
- packssdw m0, m1 ; (word) { U[0-7] }
- packssdw m2, m4 ; (word) { V[0-7] }
-%if mmsize == 8
- mova [dstUq+wq], m0
- mova [dstVq+wq], m2
-%else ; mmsize == 16
- mova [dstUq+wq], m0
- mova [dstVq+wq], m2
-%endif ; mmsize == 8/16
- add wq, mmsize
- jl .loop
- REP_RET
-%endif ; ARCH_X86_64 && %0 == 3
-%endmacro
-
-; %1 = nr. of XMM registers for rgb-to-Y func
-; %2 = nr. of XMM registers for rgb-to-UV func
-%macro RGB24_FUNCS 2
-RGB24_TO_Y_FN %1, rgb
-RGB24_TO_Y_FN %1, bgr, rgb
-RGB24_TO_UV_FN %2, rgb
-RGB24_TO_UV_FN %2, bgr, rgb
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX mmx
-RGB24_FUNCS 0, 0
-%endif
-
-INIT_XMM sse2
-RGB24_FUNCS 10, 12
-
-INIT_XMM ssse3
-RGB24_FUNCS 11, 13
-
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-RGB24_FUNCS 11, 13
-%endif
-
-; %1 = nr. of XMM registers
-; %2-5 = rgba, bgra, argb or abgr (in individual characters)
-%macro RGB32_TO_Y_FN 5-6
-cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, u3
- mova m5, [rgba_Ycoeff_%2%4]
- mova m6, [rgba_Ycoeff_%3%5]
-%if %0 == 6
- jmp mangle(private_prefix %+ _ %+ %6 %+ ToY %+ SUFFIX).body
-%else ; %0 == 6
-.body:
-%if ARCH_X86_64
- movsxd wq, wd
-%endif
- lea srcq, [srcq+wq*4]
- add wq, wq
- add dstq, wq
- neg wq
- mova m4, [rgb_Yrnd]
- pcmpeqb m7, m7
- psrlw m7, 8 ; (word) { 0x00ff } x4
-.loop:
- ; FIXME check alignment and use mova
- movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
- movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
- DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
- pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3]
- pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3]
- pmaddwd m3, m5 ; (dword) { Bx*BY + Rx*RY }[4-7]
- pmaddwd m2, m6 ; (dword) { Gx*GY }[4-7]
- paddd m0, m4 ; += rgb_Yrnd
- paddd m2, m4 ; += rgb_Yrnd
- paddd m0, m1 ; (dword) { Y[0-3] }
- paddd m2, m3 ; (dword) { Y[4-7] }
- psrad m0, 9
- psrad m2, 9
- packssdw m0, m2 ; (word) { Y[0-7] }
- mova [dstq+wq], m0
- add wq, mmsize
- jl .loop
- REP_RET
-%endif ; %0 == 6
-%endmacro
-
-; %1 = nr. of XMM registers
-; %2-5 = rgba, bgra, argb or abgr (in individual characters)
-%macro RGB32_TO_UV_FN 5-6
-cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3
-%if ARCH_X86_64
- mova m8, [rgba_Ucoeff_%2%4]
- mova m9, [rgba_Ucoeff_%3%5]
- mova m10, [rgba_Vcoeff_%2%4]
- mova m11, [rgba_Vcoeff_%3%5]
-%define coeffU1 m8
-%define coeffU2 m9
-%define coeffV1 m10
-%define coeffV2 m11
-%else ; x86-32
-%define coeffU1 [rgba_Ucoeff_%2%4]
-%define coeffU2 [rgba_Ucoeff_%3%5]
-%define coeffV1 [rgba_Vcoeff_%2%4]
-%define coeffV2 [rgba_Vcoeff_%3%5]
-%endif ; x86-64/32
-%if ARCH_X86_64 && %0 == 6
- jmp mangle(private_prefix %+ _ %+ %6 %+ ToUV %+ SUFFIX).body
-%else ; ARCH_X86_64 && %0 == 6
-.body:
-%if ARCH_X86_64
- movsxd wq, dword r5m
-%else ; x86-32
- mov wq, r5m
-%endif
- add wq, wq
- add dstUq, wq
- add dstVq, wq
- lea srcq, [srcq+wq*2]
- neg wq
- pcmpeqb m7, m7
- psrlw m7, 8 ; (word) { 0x00ff } x4
- mova m6, [rgb_UVrnd]
-.loop:
- ; FIXME check alignment and use mova
- movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
- movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
- DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
- pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3]
- pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3]
- pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3]
- pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3]
- paddd m3, m6 ; += rgb_UVrnd
- paddd m1, m6 ; += rgb_UVrnd
- paddd m2, m3 ; (dword) { V[0-3] }
- paddd m0, m1 ; (dword) { U[0-3] }
- pmaddwd m3, m5, coeffV1 ; (dword) { Bx*BV + Rx*RV }[4-7]
- pmaddwd m1, m4, coeffV2 ; (dword) { Gx*GV }[4-7]
- pmaddwd m5, coeffU1 ; (dword) { Bx*BU + Rx*RU }[4-7]
- pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7]
- paddd m3, m6 ; += rgb_UVrnd
- paddd m5, m6 ; += rgb_UVrnd
- psrad m0, 9
- paddd m1, m3 ; (dword) { V[4-7] }
- paddd m4, m5 ; (dword) { U[4-7] }
- psrad m2, 9
- psrad m4, 9
- psrad m1, 9
- packssdw m0, m4 ; (word) { U[0-7] }
- packssdw m2, m1 ; (word) { V[0-7] }
-%if mmsize == 8
- mova [dstUq+wq], m0
- mova [dstVq+wq], m2
-%else ; mmsize == 16
- mova [dstUq+wq], m0
- mova [dstVq+wq], m2
-%endif ; mmsize == 8/16
- add wq, mmsize
- jl .loop
- REP_RET
-%endif ; ARCH_X86_64 && %0 == 6
-%endmacro
-
-; %1 = nr. of XMM registers for rgb-to-Y func
-; %2 = nr. of XMM registers for rgb-to-UV func
-%macro RGB32_FUNCS 2
-RGB32_TO_Y_FN %1, r, g, b, a
-RGB32_TO_Y_FN %1, b, g, r, a, rgba
-RGB32_TO_Y_FN %1, a, r, g, b, rgba
-RGB32_TO_Y_FN %1, a, b, g, r, rgba
-
-RGB32_TO_UV_FN %2, r, g, b, a
-RGB32_TO_UV_FN %2, b, g, r, a, rgba
-RGB32_TO_UV_FN %2, a, r, g, b, rgba
-RGB32_TO_UV_FN %2, a, b, g, r, rgba
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX mmx
-RGB32_FUNCS 0, 0
-%endif
-
-INIT_XMM sse2
-RGB32_FUNCS 8, 12
-
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-RGB32_FUNCS 8, 12
-%endif
-
-;-----------------------------------------------------------------------------
-; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
-;
-; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
-; and
-; void <fmt>ToUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
-; const uint8_t *unused, int w);
-;-----------------------------------------------------------------------------
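In scalar terms, the packed-pixel shuffles below are plain byte gathers; YUYV keeps luma at even bytes and interleaves U/V at odd bytes. A sketch (illustrative, not from the deleted file):

#include <stdint.h>

/* Scalar models of yuyvToY and yuyvToUV. */
static void yuyv_to_y_ref(uint8_t *dst, const uint8_t *src, int w)
{
    for (int i = 0; i < w; i++)
        dst[i] = src[2 * i];          /* the asm's pand with 0x00ff words */
}

static void yuyv_to_uv_ref(uint8_t *dstU, uint8_t *dstV,
                           const uint8_t *src, int w)
{
    for (int i = 0; i < w; i++) {     /* w is the chroma width */
        dstU[i] = src[4 * i + 1];     /* psrlw/pand + packuswb in the asm */
        dstV[i] = src[4 * i + 3];
    }
}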
-
-; %1 = a (aligned) or u (unaligned)
-; %2 = yuyv or uyvy
-%macro LOOP_YUYV_TO_Y 2
-.loop_%1:
- mov%1 m0, [srcq+wq*2] ; (byte) { Y0, U0, Y1, V0, ... }
- mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
-%ifidn %2, yuyv
- pand m0, m2 ; (word) { Y0, Y1, ..., Y7 }
- pand m1, m2 ; (word) { Y8, Y9, ..., Y15 }
-%else ; uyvy
- psrlw m0, 8 ; (word) { Y0, Y1, ..., Y7 }
- psrlw m1, 8 ; (word) { Y8, Y9, ..., Y15 }
-%endif ; yuyv/uyvy
- packuswb m0, m1 ; (byte) { Y0, ..., Y15 }
- mova [dstq+wq], m0
- add wq, mmsize
- jl .loop_%1
- REP_RET
-%endmacro
-
-; %1 = nr. of XMM registers
-; %2 = yuyv or uyvy
-; %3 = if specified, it means that unaligned and aligned code in loop
-; will be the same (i.e. YUYV+AVX), and thus we don't need to
-; split the loop in an aligned and unaligned case
-%macro YUYV_TO_Y_FN 2-3
-cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
-%if ARCH_X86_64
- movsxd wq, wd
-%endif
- add dstq, wq
-%if mmsize == 16
- test srcq, 15
-%endif
- lea srcq, [srcq+wq*2]
-%ifidn %2, yuyv
- pcmpeqb m2, m2 ; (byte) { 0xff } x 16
- psrlw m2, 8 ; (word) { 0x00ff } x 8
-%endif ; yuyv
-%if mmsize == 16
- jnz .loop_u_start
- neg wq
- LOOP_YUYV_TO_Y a, %2
-.loop_u_start:
- neg wq
- LOOP_YUYV_TO_Y u, %2
-%else ; mmsize == 8
- neg wq
- LOOP_YUYV_TO_Y a, %2
-%endif ; mmsize == 8/16
-%endmacro
-
-; %1 = a (aligned) or u (unaligned)
-; %2 = yuyv or uyvy
-%macro LOOP_YUYV_TO_UV 2
-.loop_%1:
-%ifidn %2, yuyv
- mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... }
- mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
- psrlw m0, 8 ; (word) { U0, V0, ..., U3, V3 }
- psrlw m1, 8 ; (word) { U4, V4, ..., U7, V7 }
-%else ; uyvy
-%if cpuflag(avx)
- vpand m0, m2, [srcq+wq*4] ; (word) { U0, V0, ..., U3, V3 }
- vpand m1, m2, [srcq+wq*4+mmsize] ; (word) { U4, V4, ..., U7, V7 }
-%else
- mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... }
- mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
- pand m0, m2 ; (word) { U0, V0, ..., U3, V3 }
- pand m1, m2 ; (word) { U4, V4, ..., U7, V7 }
-%endif
-%endif ; yuyv/uyvy
- packuswb m0, m1 ; (byte) { U0, V0, ..., U7, V7 }
- pand m1, m0, m2 ; (word) { U0, U1, ..., U7 }
- psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
-%if mmsize == 16
- packuswb m1, m0 ; (byte) { U0, ... U7, V0, ... V7 }
- movh [dstUq+wq], m1
- movhps [dstVq+wq], m1
-%else ; mmsize == 8
- packuswb m1, m1 ; (byte) { U0, ... U3 }
- packuswb m0, m0 ; (byte) { V0, ... V3 }
- movh [dstUq+wq], m1
- movh [dstVq+wq], m0
-%endif ; mmsize == 8/16
- add wq, mmsize / 2
- jl .loop_%1
- REP_RET
-%endmacro
-
-; %1 = nr. of XMM registers
-; %2 = yuyv or uyvy
-; %3 = if specified, it means that unaligned and aligned code in loop
-; will be the same (i.e. UYVY+AVX), and thus we don't need to
-; split the loop in an aligned and unaligned case
-%macro YUYV_TO_UV_FN 2-3
-cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
-%if ARCH_X86_64
- movsxd wq, dword r5m
-%else ; x86-32
- mov wq, r5m
-%endif
- add dstUq, wq
- add dstVq, wq
-%if mmsize == 16 && %0 == 2
- test srcq, 15
-%endif
- lea srcq, [srcq+wq*4]
- pcmpeqb m2, m2 ; (byte) { 0xff } x 16
- psrlw m2, 8 ; (word) { 0x00ff } x 8
- ; NOTE: if uyvy+avx, u/a are identical
-%if mmsize == 16 && %0 == 2
- jnz .loop_u_start
- neg wq
- LOOP_YUYV_TO_UV a, %2
-.loop_u_start:
- neg wq
- LOOP_YUYV_TO_UV u, %2
-%else ; mmsize == 8
- neg wq
- LOOP_YUYV_TO_UV a, %2
-%endif ; mmsize == 8/16
-%endmacro
-
-; %1 = a (aligned) or u (unaligned)
-; %2 = nv12 or nv21
-%macro LOOP_NVXX_TO_UV 2
-.loop_%1:
- mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... }
- mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
- pand m2, m0, m5 ; (word) { U0, U1, ..., U7 }
- pand m3, m1, m5 ; (word) { U8, U9, ..., U15 }
- psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
- psrlw m1, 8 ; (word) { V8, V9, ..., V15 }
- packuswb m2, m3 ; (byte) { U0, ..., U15 }
- packuswb m0, m1 ; (byte) { V0, ..., V15 }
-%ifidn %2, nv12
- mova [dstUq+wq], m2
- mova [dstVq+wq], m0
-%else ; nv21
- mova [dstVq+wq], m2
- mova [dstUq+wq], m0
-%endif ; nv12/21
- add wq, mmsize
- jl .loop_%1
- REP_RET
-%endmacro
-
-; %1 = nr. of XMM registers
-; %2 = nv12 or nv21
-%macro NVXX_TO_UV_FN 2
-cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
-%if ARCH_X86_64
- movsxd wq, dword r5m
-%else ; x86-32
- mov wq, r5m
-%endif
- add dstUq, wq
- add dstVq, wq
-%if mmsize == 16
- test srcq, 15
-%endif
- lea srcq, [srcq+wq*2]
- pcmpeqb m5, m5 ; (byte) { 0xff } x 16
- psrlw m5, 8 ; (word) { 0x00ff } x 8
-%if mmsize == 16
- jnz .loop_u_start
- neg wq
- LOOP_NVXX_TO_UV a, %2
-.loop_u_start:
- neg wq
- LOOP_NVXX_TO_UV u, %2
-%else ; mmsize == 8
- neg wq
- LOOP_NVXX_TO_UV a, %2
-%endif ; mmsize == 8/16
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX mmx
-YUYV_TO_Y_FN 0, yuyv
-YUYV_TO_Y_FN 0, uyvy
-YUYV_TO_UV_FN 0, yuyv
-YUYV_TO_UV_FN 0, uyvy
-NVXX_TO_UV_FN 0, nv12
-NVXX_TO_UV_FN 0, nv21
-%endif
-
-INIT_XMM sse2
-YUYV_TO_Y_FN 3, yuyv
-YUYV_TO_Y_FN 2, uyvy
-YUYV_TO_UV_FN 3, yuyv
-YUYV_TO_UV_FN 3, uyvy
-NVXX_TO_UV_FN 5, nv12
-NVXX_TO_UV_FN 5, nv21
-
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
-; that's not faster in practice
-YUYV_TO_UV_FN 3, yuyv
-YUYV_TO_UV_FN 3, uyvy, 1
-NVXX_TO_UV_FN 5, nv12
-NVXX_TO_UV_FN 5, nv21
-%endif
diff --git a/ffmpeg1/libswscale/x86/output.asm b/ffmpeg1/libswscale/x86/output.asm
deleted file mode 100644
index f9add35..0000000
--- a/ffmpeg1/libswscale/x86/output.asm
+++ /dev/null
@@ -1,413 +0,0 @@
-;******************************************************************************
-;* x86-optimized vertical line scaling functions
-;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
-;* Kieran Kunhya <kieran@kunhya.com>
-;*
-;* This file is part of Libav.
-;*
-;* Libav is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* Libav is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-minshort: times 8 dw 0x8000
-yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000
-yuv2yuvX_10_start: times 4 dd 0x10000
-yuv2yuvX_9_start: times 4 dd 0x20000
-yuv2yuvX_10_upper: times 8 dw 0x3ff
-yuv2yuvX_9_upper: times 8 dw 0x1ff
-pd_4: times 4 dd 4
-pd_4min0x40000: times 4 dd 4 - (0x40000)
-pw_16: times 8 dw 16
-pw_32: times 8 dw 32
-pw_512: times 8 dw 512
-pw_1024: times 8 dw 1024
-
-SECTION .text
-
-;-----------------------------------------------------------------------------
-; vertical line scaling
-;
-; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
-; const uint8_t *dither, int offset)
-; and
-; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
-; const int16_t **src, uint8_t *dst, int dstW,
-; const uint8_t *dither, int offset)
-;
-; Scale one or $filterSize lines of source data to generate one line of output
-; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in
-; int32_t if $output_size is 16. $filter is 12 bits. $filterSize is a multiple
-; of 2. $offset is either 0 or 3. $dither holds 8 values.
-;-----------------------------------------------------------------------------
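For the 8-bit case, the yuv2planeX loop below is equivalent to this scalar sketch (mirroring the dither << 12 accumulator seed and the final psrad by 27 - 8; illustrative, not code from the deleted file):

#include <stdint.h>

/* Scalar model of yuv2planeX_8: per pixel, accumulate filterSize taps
 * seeded with the rotated dither value, then shift and clip to 8 bits. */
static void yuv2planeX_8_ref(const int16_t *filter, int filterSize,
                             const int16_t **src, uint8_t *dst, int dstW,
                             const uint8_t *dither, int offset)
{
    for (int i = 0; i < dstW; i++) {
        int val = dither[(i + offset) & 7] << 12;
        for (int j = 0; j < filterSize; j++)
            val += src[j][i] * filter[j];
        val >>= 19;                               /* 27 - 8 */
        dst[i] = val < 0 ? 0 : val > 255 ? 255 : val;
    }
}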
-
-%macro yuv2planeX_fn 3
-
-%if ARCH_X86_32
-%define cntr_reg fltsizeq
-%define movsx mov
-%else
-%define cntr_reg r7
-%define movsx movsxd
-%endif
-
-cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
-%if %1 == 8 || %1 == 9 || %1 == 10
- pxor m6, m6
-%endif ; %1 == 8/9/10
-
-%if %1 == 8
-%if ARCH_X86_32
-%assign pad 0x2c - (stack_offset & 15)
- SUB rsp, pad
-%define m_dith m7
-%else ; x86-64
-%define m_dith m9
-%endif ; x86-32
-
- ; create registers holding dither
- movq m_dith, [ditherq] ; dither
- test offsetd, offsetd
- jz .no_rot
-%if mmsize == 16
- punpcklqdq m_dith, m_dith
-%endif ; mmsize == 16
- PALIGNR m_dith, m_dith, 3, m0
-.no_rot:
-%if mmsize == 16
- punpcklbw m_dith, m6
-%if ARCH_X86_64
- punpcklwd m8, m_dith, m6
- pslld m8, 12
-%else ; x86-32
- punpcklwd m5, m_dith, m6
- pslld m5, 12
-%endif ; x86-32/64
- punpckhwd m_dith, m6
- pslld m_dith, 12
-%if ARCH_X86_32
- mova [rsp+ 0], m5
- mova [rsp+16], m_dith
-%endif
-%else ; mmsize == 8
- punpcklbw m5, m_dith, m6
- punpckhbw m_dith, m6
- punpcklwd m4, m5, m6
- punpckhwd m5, m6
- punpcklwd m3, m_dith, m6
- punpckhwd m_dith, m6
- pslld m4, 12
- pslld m5, 12
- pslld m3, 12
- pslld m_dith, 12
- mova [rsp+ 0], m4
- mova [rsp+ 8], m5
- mova [rsp+16], m3
- mova [rsp+24], m_dith
-%endif ; mmsize == 8/16
-%endif ; %1 == 8
-
- xor r5, r5
-
-.pixelloop:
-%assign %%i 0
- ; the rep here is for the 8bit output mmx case, where dither covers
- ; 8 pixels but we can only handle 2 pixels per register, and thus 4
- ; pixels per iteration. In order to not have to keep track of where
- ; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
-%if %1 == 8
-%assign %%repcnt 16/mmsize
-%else
-%assign %%repcnt 1
-%endif
-
-%rep %%repcnt
-
-%if %1 == 8
-%if ARCH_X86_32
- mova m2, [rsp+mmsize*(0+%%i)]
- mova m1, [rsp+mmsize*(1+%%i)]
-%else ; x86-64
- mova m2, m8
- mova m1, m_dith
-%endif ; x86-32/64
-%else ; %1 == 9/10/16
- mova m1, [yuv2yuvX_%1_start]
- mova m2, m1
-%endif ; %1 == 8/9/10/16
- movsx cntr_reg, fltsizem
-.filterloop_ %+ %%i:
- ; input pixels
- mov r6, [srcq+gprsize*cntr_reg-2*gprsize]
-%if %1 == 16
- mova m3, [r6+r5*4]
- mova m5, [r6+r5*4+mmsize]
-%else ; %1 == 8/9/10
- mova m3, [r6+r5*2]
-%endif ; %1 == 8/9/10/16
- mov r6, [srcq+gprsize*cntr_reg-gprsize]
-%if %1 == 16
- mova m4, [r6+r5*4]
- mova m6, [r6+r5*4+mmsize]
-%else ; %1 == 8/9/10
- mova m4, [r6+r5*2]
-%endif ; %1 == 8/9/10/16
-
- ; coefficients
- movd m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
-%if %1 == 16
- pshuflw m7, m0, 0 ; coeff[0]
- pshuflw m0, m0, 0x55 ; coeff[1]
- pmovsxwd m7, m7 ; word -> dword
- pmovsxwd m0, m0 ; word -> dword
-
- pmulld m3, m7
- pmulld m5, m7
- pmulld m4, m0
- pmulld m6, m0
-
- paddd m2, m3
- paddd m1, m5
- paddd m2, m4
- paddd m1, m6
-%else ; %1 == 10/9/8
- punpcklwd m5, m3, m4
- punpckhwd m3, m4
- SPLATD m0
-
- pmaddwd m5, m0
- pmaddwd m3, m0
-
- paddd m2, m5
- paddd m1, m3
-%endif ; %1 == 8/9/10/16
-
- sub cntr_reg, 2
- jg .filterloop_ %+ %%i
-
-%if %1 == 16
- psrad m2, 31 - %1
- psrad m1, 31 - %1
-%else ; %1 == 10/9/8
- psrad m2, 27 - %1
- psrad m1, 27 - %1
-%endif ; %1 == 8/9/10/16
-
-%if %1 == 8
- packssdw m2, m1
- packuswb m2, m2
- movh [dstq+r5*1], m2
-%else ; %1 == 9/10/16
-%if %1 == 16
- packssdw m2, m1
- paddw m2, [minshort]
-%else ; %1 == 9/10
-%if cpuflag(sse4)
- packusdw m2, m1
-%else ; mmxext/sse2
- packssdw m2, m1
- pmaxsw m2, m6
-%endif ; mmxext/sse2/sse4/avx
- pminsw m2, [yuv2yuvX_%1_upper]
-%endif ; %1 == 9/10/16
- mova [dstq+r5*2], m2
-%endif ; %1 == 8/9/10/16
-
- add r5, mmsize/2
- sub wd, mmsize/2
-
-%assign %%i %%i+2
-%endrep
- jg .pixelloop
-
-%if %1 == 8
-%if ARCH_X86_32
- ADD rsp, pad
- RET
-%else ; x86-64
- REP_RET
-%endif ; x86-32/64
-%else ; %1 == 9/10/16
- REP_RET
-%endif ; %1 == 8/9/10/16
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX mmxext
-yuv2planeX_fn 8, 0, 7
-yuv2planeX_fn 9, 0, 5
-yuv2planeX_fn 10, 0, 5
-%endif
-
-INIT_XMM sse2
-yuv2planeX_fn 8, 10, 7
-yuv2planeX_fn 9, 7, 5
-yuv2planeX_fn 10, 7, 5
-
-INIT_XMM sse4
-yuv2planeX_fn 8, 10, 7
-yuv2planeX_fn 9, 7, 5
-yuv2planeX_fn 10, 7, 5
-yuv2planeX_fn 16, 8, 5
-
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-yuv2planeX_fn 8, 10, 7
-yuv2planeX_fn 9, 7, 5
-yuv2planeX_fn 10, 7, 5
-%endif
-
-; %1=output-bpc, %2=alignment (u/a)
-%macro yuv2plane1_mainloop 2
-.loop_%2:
-%if %1 == 8
- paddsw m0, m2, [srcq+wq*2+mmsize*0]
- paddsw m1, m3, [srcq+wq*2+mmsize*1]
- psraw m0, 7
- psraw m1, 7
- packuswb m0, m1
- mov%2 [dstq+wq], m0
-%elif %1 == 16
- paddd m0, m4, [srcq+wq*4+mmsize*0]
- paddd m1, m4, [srcq+wq*4+mmsize*1]
- paddd m2, m4, [srcq+wq*4+mmsize*2]
- paddd m3, m4, [srcq+wq*4+mmsize*3]
- psrad m0, 3
- psrad m1, 3
- psrad m2, 3
- psrad m3, 3
-%if cpuflag(sse4) ; avx/sse4
- packusdw m0, m1
- packusdw m2, m3
-%else ; mmx/sse2
- packssdw m0, m1
- packssdw m2, m3
- paddw m0, m5
- paddw m2, m5
-%endif ; mmx/sse2/sse4/avx
- mov%2 [dstq+wq*2+mmsize*0], m0
- mov%2 [dstq+wq*2+mmsize*1], m2
-%else ; %1 == 9/10
- paddsw m0, m2, [srcq+wq*2+mmsize*0]
- paddsw m1, m2, [srcq+wq*2+mmsize*1]
- psraw m0, 15 - %1
- psraw m1, 15 - %1
- pmaxsw m0, m4
- pmaxsw m1, m4
- pminsw m0, m3
- pminsw m1, m3
- mov%2 [dstq+wq*2+mmsize*0], m0
- mov%2 [dstq+wq*2+mmsize*1], m1
-%endif
- add wq, mmsize
- jl .loop_%2
-%endmacro
-
-%macro yuv2plane1_fn 3
-cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
- movsxdifnidn wq, wd
- add wq, mmsize - 1
- and wq, ~(mmsize - 1)
-%if %1 == 8
- add dstq, wq
-%else ; %1 != 8
- lea dstq, [dstq+wq*2]
-%endif ; %1 == 8
-%if %1 == 16
- lea srcq, [srcq+wq*4]
-%else ; %1 != 16
- lea srcq, [srcq+wq*2]
-%endif ; %1 == 16
- neg wq
-
-%if %1 == 8
- pxor m4, m4 ; zero
-
- ; create registers holding dither
- movq m3, [ditherq] ; dither
- test offsetd, offsetd
- jz .no_rot
-%if mmsize == 16
- punpcklqdq m3, m3
-%endif ; mmsize == 16
- PALIGNR m3, m3, 3, m2
-.no_rot:
-%if mmsize == 8
- mova m2, m3
- punpckhbw m3, m4 ; byte->word
- punpcklbw m2, m4 ; byte->word
-%else
- punpcklbw m3, m4
- mova m2, m3
-%endif
-%elif %1 == 9
- pxor m4, m4
- mova m3, [pw_512]
- mova m2, [pw_32]
-%elif %1 == 10
- pxor m4, m4
- mova m3, [pw_1024]
- mova m2, [pw_16]
-%else ; %1 == 16
-%if cpuflag(sse4) ; sse4/avx
- mova m4, [pd_4]
-%else ; mmx/sse2
- mova m4, [pd_4min0x40000]
- mova m5, [minshort]
-%endif ; mmx/sse2/sse4/avx
-%endif ; %1 == ..
-
- ; actual pixel scaling
-%if mmsize == 8
- yuv2plane1_mainloop %1, a
-%else ; mmsize == 16
- test dstq, 15
- jnz .unaligned
- yuv2plane1_mainloop %1, a
- REP_RET
-.unaligned:
- yuv2plane1_mainloop %1, u
-%endif ; mmsize == 8/16
- REP_RET
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX mmx
-yuv2plane1_fn 8, 0, 5
-yuv2plane1_fn 16, 0, 3
-
-INIT_MMX mmxext
-yuv2plane1_fn 9, 0, 3
-yuv2plane1_fn 10, 0, 3
-%endif
-
-INIT_XMM sse2
-yuv2plane1_fn 8, 5, 5
-yuv2plane1_fn 9, 5, 3
-yuv2plane1_fn 10, 5, 3
-yuv2plane1_fn 16, 6, 3
-
-INIT_XMM sse4
-yuv2plane1_fn 16, 5, 3
-
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-yuv2plane1_fn 8, 5, 5
-yuv2plane1_fn 9, 5, 3
-yuv2plane1_fn 10, 5, 3
-yuv2plane1_fn 16, 5, 3
-%endif
diff --git a/ffmpeg1/libswscale/x86/rgb2rgb.c b/ffmpeg1/libswscale/x86/rgb2rgb.c
deleted file mode 100644
index 1e20176..0000000
--- a/ffmpeg1/libswscale/x86/rgb2rgb.c
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * software RGB to RGB converter
- * plus software PAL8 to RGB converter
- * software YUV to YUV converter
- * software YUV to RGB converter
- * Written by Nick Kurshev.
- * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavutil/cpu.h"
-#include "libavutil/bswap.h"
-#include "libswscale/rgb2rgb.h"
-#include "libswscale/swscale.h"
-#include "libswscale/swscale_internal.h"
-
-#if HAVE_INLINE_ASM
-
-DECLARE_ASM_CONST(8, uint64_t, mmx_ff) = 0x00000000000000FFULL;
-DECLARE_ASM_CONST(8, uint64_t, mmx_null) = 0x0000000000000000ULL;
-DECLARE_ASM_CONST(8, uint64_t, mmx_one) = 0xFFFFFFFFFFFFFFFFULL;
-DECLARE_ASM_CONST(8, uint64_t, mask32b) = 0x000000FF000000FFULL;
-DECLARE_ASM_CONST(8, uint64_t, mask32g) = 0x0000FF000000FF00ULL;
-DECLARE_ASM_CONST(8, uint64_t, mask32r) = 0x00FF000000FF0000ULL;
-DECLARE_ASM_CONST(8, uint64_t, mask32a) = 0xFF000000FF000000ULL;
-DECLARE_ASM_CONST(8, uint64_t, mask32) = 0x00FFFFFF00FFFFFFULL;
-DECLARE_ASM_CONST(8, uint64_t, mask3216br) = 0x00F800F800F800F8ULL;
-DECLARE_ASM_CONST(8, uint64_t, mask3216g) = 0x0000FC000000FC00ULL;
-DECLARE_ASM_CONST(8, uint64_t, mask3215g) = 0x0000F8000000F800ULL;
-DECLARE_ASM_CONST(8, uint64_t, mul3216) = 0x2000000420000004ULL;
-DECLARE_ASM_CONST(8, uint64_t, mul3215) = 0x2000000820000008ULL;
-DECLARE_ASM_CONST(8, uint64_t, mask24b) = 0x00FF0000FF0000FFULL;
-DECLARE_ASM_CONST(8, uint64_t, mask24g) = 0xFF0000FF0000FF00ULL;
-DECLARE_ASM_CONST(8, uint64_t, mask24r) = 0x0000FF0000FF0000ULL;
-DECLARE_ASM_CONST(8, uint64_t, mask24l) = 0x0000000000FFFFFFULL;
-DECLARE_ASM_CONST(8, uint64_t, mask24h) = 0x0000FFFFFF000000ULL;
-DECLARE_ASM_CONST(8, uint64_t, mask24hh) = 0xffff000000000000ULL;
-DECLARE_ASM_CONST(8, uint64_t, mask24hhh) = 0xffffffff00000000ULL;
-DECLARE_ASM_CONST(8, uint64_t, mask24hhhh) = 0xffffffffffff0000ULL;
-DECLARE_ASM_CONST(8, uint64_t, mask15b) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
-DECLARE_ASM_CONST(8, uint64_t, mask15rg) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
-DECLARE_ASM_CONST(8, uint64_t, mask15s) = 0xFFE0FFE0FFE0FFE0ULL;
-DECLARE_ASM_CONST(8, uint64_t, mask15g) = 0x03E003E003E003E0ULL;
-DECLARE_ASM_CONST(8, uint64_t, mask15r) = 0x7C007C007C007C00ULL;
-#define mask16b mask15b
-DECLARE_ASM_CONST(8, uint64_t, mask16g) = 0x07E007E007E007E0ULL;
-DECLARE_ASM_CONST(8, uint64_t, mask16r) = 0xF800F800F800F800ULL;
-DECLARE_ASM_CONST(8, uint64_t, red_16mask) = 0x0000f8000000f800ULL;
-DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL;
-DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL;
-DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL;
-DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
-DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL;
-DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL;
-DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL;
-DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL;
-
-#define RGB2YUV_SHIFT 8
-#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
-#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
-#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
-#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
-#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
-#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
-#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
-#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
-#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
-
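Expanded at RGB2YUV_SHIFT == 8, the macros above give the familiar limited-range BT.601 integers (RY = 66, GY = 129, BY = 25), so luma reduces to the classic formula below (a worked sketch; the template's exact rounding may differ):

/* Worked example of the luma coefficients above. */
static inline int rgb_to_y_ref(int r, int g, int b)
{
    return ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16;  /* 16..235 */
}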
-// Note: We have C, MMX, MMXEXT, SSE2 and 3DNOW versions; there is no 3DNOW + MMXEXT one.
-
-#define COMPILE_TEMPLATE_MMXEXT 0
-#define COMPILE_TEMPLATE_AMD3DNOW 0
-#define COMPILE_TEMPLATE_SSE2 0
-
-//MMX versions
-#undef RENAME
-#define RENAME(a) a ## _MMX
-#include "rgb2rgb_template.c"
-
-// MMXEXT versions
-#undef RENAME
-#undef COMPILE_TEMPLATE_MMXEXT
-#define COMPILE_TEMPLATE_MMXEXT 1
-#define RENAME(a) a ## _MMXEXT
-#include "rgb2rgb_template.c"
-
-//SSE2 versions
-#undef RENAME
-#undef COMPILE_TEMPLATE_SSE2
-#define COMPILE_TEMPLATE_SSE2 1
-#define RENAME(a) a ## _SSE2
-#include "rgb2rgb_template.c"
-
-//3DNOW versions
-#undef RENAME
-#undef COMPILE_TEMPLATE_MMXEXT
-#undef COMPILE_TEMPLATE_SSE2
-#undef COMPILE_TEMPLATE_AMD3DNOW
-#define COMPILE_TEMPLATE_MMXEXT 0
-#define COMPILE_TEMPLATE_SSE2 0
-#define COMPILE_TEMPLATE_AMD3DNOW 1
-#define RENAME(a) a ## _3DNOW
-#include "rgb2rgb_template.c"
-
-/*
- RGB15->RGB16 original by Strepto/Astral
- ported to gcc & bugfixed: A'rpi
- MMXEXT, 3DNOW optimization by Nick Kurshev
- 32-bit C version and the and&add trick by Michael Niedermayer
-*/
-
-#endif /* HAVE_INLINE_ASM */
-
-av_cold void rgb2rgb_init_x86(void)
-{
-#if HAVE_INLINE_ASM
- int cpu_flags = av_get_cpu_flags();
-
- if (INLINE_MMX(cpu_flags))
- rgb2rgb_init_MMX();
- if (INLINE_AMD3DNOW(cpu_flags))
- rgb2rgb_init_3DNOW();
- if (INLINE_MMXEXT(cpu_flags))
- rgb2rgb_init_MMXEXT();
- if (INLINE_SSE2(cpu_flags))
- rgb2rgb_init_SSE2();
-#endif /* HAVE_INLINE_ASM */
-}
diff --git a/ffmpeg1/libswscale/x86/rgb2rgb_template.c b/ffmpeg1/libswscale/x86/rgb2rgb_template.c
deleted file mode 100644
index d802ab4..0000000
--- a/ffmpeg1/libswscale/x86/rgb2rgb_template.c
+++ /dev/null
@@ -1,2498 +0,0 @@
-/*
- * software RGB to RGB converter
- * plus software PAL8 to RGB converter
- * software YUV to YUV converter
- * software YUV to RGB converter
- * Written by Nick Kurshev.
- * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
- * lot of big-endian byte order fixes by Alex Beregszaszi
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-
-#undef PREFETCH
-#undef MOVNTQ
-#undef EMMS
-#undef SFENCE
-#undef PAVGB
-
-#if COMPILE_TEMPLATE_AMD3DNOW
-#define PREFETCH "prefetch"
-#define PAVGB "pavgusb"
-#elif COMPILE_TEMPLATE_MMXEXT
-#define PREFETCH "prefetchnta"
-#define PAVGB "pavgb"
-#else
-#define PREFETCH " # nop"
-#endif
-
-#if COMPILE_TEMPLATE_AMD3DNOW
-/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
-#define EMMS "femms"
-#else
-#define EMMS "emms"
-#endif
-
-#if COMPILE_TEMPLATE_MMXEXT
-#define MOVNTQ "movntq"
-#define SFENCE "sfence"
-#else
-#define MOVNTQ "movq"
-#define SFENCE " # nop"
-#endif
-
-#if !COMPILE_TEMPLATE_SSE2
-
-#if !COMPILE_TEMPLATE_AMD3DNOW
-
-static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
-{
- uint8_t *dest = dst;
- const uint8_t *s = src;
- const uint8_t *end;
- const uint8_t *mm_end;
- end = s + src_size;
- __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
- mm_end = end - 23;
- __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
- while (s < mm_end) {
- __asm__ volatile(
- PREFETCH" 32(%1) \n\t"
- "movd (%1), %%mm0 \n\t"
- "punpckldq 3(%1), %%mm0 \n\t"
- "movd 6(%1), %%mm1 \n\t"
- "punpckldq 9(%1), %%mm1 \n\t"
- "movd 12(%1), %%mm2 \n\t"
- "punpckldq 15(%1), %%mm2 \n\t"
- "movd 18(%1), %%mm3 \n\t"
- "punpckldq 21(%1), %%mm3 \n\t"
- "por %%mm7, %%mm0 \n\t"
- "por %%mm7, %%mm1 \n\t"
- "por %%mm7, %%mm2 \n\t"
- "por %%mm7, %%mm3 \n\t"
- MOVNTQ" %%mm0, (%0) \n\t"
- MOVNTQ" %%mm1, 8(%0) \n\t"
- MOVNTQ" %%mm2, 16(%0) \n\t"
- MOVNTQ" %%mm3, 24(%0)"
- :: "r"(dest), "r"(s)
- :"memory");
- dest += 32;
- s += 24;
- }
- __asm__ volatile(SFENCE:::"memory");
- __asm__ volatile(EMMS:::"memory");
- while (s < end) {
- *dest++ = *s++;
- *dest++ = *s++;
- *dest++ = *s++;
- *dest++ = 255;
- }
-}
-
-#define STORE_BGR24_MMX \
- "psrlq $8, %%mm2 \n\t" \
- "psrlq $8, %%mm3 \n\t" \
- "psrlq $8, %%mm6 \n\t" \
- "psrlq $8, %%mm7 \n\t" \
- "pand "MANGLE(mask24l)", %%mm0\n\t" \
- "pand "MANGLE(mask24l)", %%mm1\n\t" \
- "pand "MANGLE(mask24l)", %%mm4\n\t" \
- "pand "MANGLE(mask24l)", %%mm5\n\t" \
- "pand "MANGLE(mask24h)", %%mm2\n\t" \
- "pand "MANGLE(mask24h)", %%mm3\n\t" \
- "pand "MANGLE(mask24h)", %%mm6\n\t" \
- "pand "MANGLE(mask24h)", %%mm7\n\t" \
- "por %%mm2, %%mm0 \n\t" \
- "por %%mm3, %%mm1 \n\t" \
- "por %%mm6, %%mm4 \n\t" \
- "por %%mm7, %%mm5 \n\t" \
- \
- "movq %%mm1, %%mm2 \n\t" \
- "movq %%mm4, %%mm3 \n\t" \
- "psllq $48, %%mm2 \n\t" \
- "psllq $32, %%mm3 \n\t" \
- "por %%mm2, %%mm0 \n\t" \
- "psrlq $16, %%mm1 \n\t" \
- "psrlq $32, %%mm4 \n\t" \
- "psllq $16, %%mm5 \n\t" \
- "por %%mm3, %%mm1 \n\t" \
- "por %%mm5, %%mm4 \n\t" \
- \
- MOVNTQ" %%mm0, (%0) \n\t" \
- MOVNTQ" %%mm1, 8(%0) \n\t" \
- MOVNTQ" %%mm4, 16(%0)"
-
-
-static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
-{
- uint8_t *dest = dst;
- const uint8_t *s = src;
- const uint8_t *end;
- const uint8_t *mm_end;
- end = s + src_size;
- __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
- mm_end = end - 31;
- while (s < mm_end) {
- __asm__ volatile(
- PREFETCH" 32(%1) \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- "movq 16(%1), %%mm4 \n\t"
- "movq 24(%1), %%mm5 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm1, %%mm3 \n\t"
- "movq %%mm4, %%mm6 \n\t"
- "movq %%mm5, %%mm7 \n\t"
- STORE_BGR24_MMX
- :: "r"(dest), "r"(s)
- :"memory");
- dest += 24;
- s += 32;
- }
- __asm__ volatile(SFENCE:::"memory");
- __asm__ volatile(EMMS:::"memory");
- while (s < end) {
- *dest++ = *s++;
- *dest++ = *s++;
- *dest++ = *s++;
- s++;
- }
-}
-
-/*
- original by Strepto/Astral
- ported to gcc & bugfixed: A'rpi
- MMXEXT, 3DNOW optimization by Nick Kurshev
- 32-bit C version and the and&add trick by Michael Niedermayer
-*/
-static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
-{
- register const uint8_t* s=src;
- register uint8_t* d=dst;
- register const uint8_t *end;
- const uint8_t *mm_end;
- end = s + src_size;
- __asm__ volatile(PREFETCH" %0"::"m"(*s));
- __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
- mm_end = end - 15;
- while (s<mm_end) {
- __asm__ volatile(
- PREFETCH" 32(%1) \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "pand %%mm4, %%mm0 \n\t"
- "pand %%mm4, %%mm2 \n\t"
- "paddw %%mm1, %%mm0 \n\t"
- "paddw %%mm3, %%mm2 \n\t"
- MOVNTQ" %%mm0, (%0) \n\t"
- MOVNTQ" %%mm2, 8(%0)"
- :: "r"(d), "r"(s)
- );
- d+=16;
- s+=16;
- }
- __asm__ volatile(SFENCE:::"memory");
- __asm__ volatile(EMMS:::"memory");
- mm_end = end - 3;
- while (s < mm_end) {
- register unsigned x= *((const uint32_t *)s);
- *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
- d+=4;
- s+=4;
- }
- if (s < end) {
- register unsigned short x= *((const uint16_t *)s);
- *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
- }
-}
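The and&add trick credited above is what the C tail loop uses: adding the masked R/G field back to the pixel doubles bits 5..14, shifting red and green left by one so green widens from 5 to 6 bits while blue stays in place. On a single pixel (illustrative):

#include <stdint.h>

/* One-pixel model of the and&add trick in rgb15to16. */
static inline uint16_t rgb15to16_px(uint16_t x)
{
    /* 0rrrrrgggggbbbbb + 0rrrrrggggg00000 = rrrrrggggg0bbbbb */
    return (x & 0x7FFF) + (x & 0x7FE0);   /* e.g. 0x7C1F -> 0xF81F */
}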
-
-static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
-{
- register const uint8_t* s=src;
- register uint8_t* d=dst;
- register const uint8_t *end;
- const uint8_t *mm_end;
- end = s + src_size;
- __asm__ volatile(PREFETCH" %0"::"m"(*s));
- __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
- __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
- mm_end = end - 15;
- while (s<mm_end) {
- __asm__ volatile(
- PREFETCH" 32(%1) \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "psrlq $1, %%mm0 \n\t"
- "psrlq $1, %%mm2 \n\t"
- "pand %%mm7, %%mm0 \n\t"
- "pand %%mm7, %%mm2 \n\t"
- "pand %%mm6, %%mm1 \n\t"
- "pand %%mm6, %%mm3 \n\t"
- "por %%mm1, %%mm0 \n\t"
- "por %%mm3, %%mm2 \n\t"
- MOVNTQ" %%mm0, (%0) \n\t"
- MOVNTQ" %%mm2, 8(%0)"
- :: "r"(d), "r"(s)
- );
- d+=16;
- s+=16;
- }
- __asm__ volatile(SFENCE:::"memory");
- __asm__ volatile(EMMS:::"memory");
- mm_end = end - 3;
- while (s < mm_end) {
- register uint32_t x= *((const uint32_t*)s);
- *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
- s+=4;
- d+=4;
- }
- if (s < end) {
- register uint16_t x= *((const uint16_t*)s);
- *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
- }
-}
-
-static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
-{
- const uint8_t *s = src;
- const uint8_t *end;
- const uint8_t *mm_end;
- uint16_t *d = (uint16_t *)dst;
- end = s + src_size;
- mm_end = end - 15;
- __asm__ volatile(
- "movq %3, %%mm5 \n\t"
- "movq %4, %%mm6 \n\t"
- "movq %5, %%mm7 \n\t"
- "jmp 2f \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- PREFETCH" 32(%1) \n\t"
- "movd (%1), %%mm0 \n\t"
- "movd 4(%1), %%mm3 \n\t"
- "punpckldq 8(%1), %%mm0 \n\t"
- "punpckldq 12(%1), %%mm3 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm3, %%mm4 \n\t"
- "pand %%mm6, %%mm0 \n\t"
- "pand %%mm6, %%mm3 \n\t"
- "pmaddwd %%mm7, %%mm0 \n\t"
- "pmaddwd %%mm7, %%mm3 \n\t"
- "pand %%mm5, %%mm1 \n\t"
- "pand %%mm5, %%mm4 \n\t"
- "por %%mm1, %%mm0 \n\t"
- "por %%mm4, %%mm3 \n\t"
- "psrld $5, %%mm0 \n\t"
- "pslld $11, %%mm3 \n\t"
- "por %%mm3, %%mm0 \n\t"
- MOVNTQ" %%mm0, (%0) \n\t"
- "add $16, %1 \n\t"
- "add $8, %0 \n\t"
- "2: \n\t"
- "cmp %2, %1 \n\t"
- " jb 1b \n\t"
- : "+r" (d), "+r"(s)
- : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
- );
- __asm__ volatile(SFENCE:::"memory");
- __asm__ volatile(EMMS:::"memory");
- while (s < end) {
- register int rgb = *(const uint32_t*)s; s += 4;
- *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
- }
-}
-
-static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
-{
- const uint8_t *s = src;
- const uint8_t *end;
- const uint8_t *mm_end;
- uint16_t *d = (uint16_t *)dst;
- end = s + src_size;
- __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
- __asm__ volatile(
- "movq %0, %%mm7 \n\t"
- "movq %1, %%mm6 \n\t"
- ::"m"(red_16mask),"m"(green_16mask));
- mm_end = end - 15;
- while (s < mm_end) {
- __asm__ volatile(
- PREFETCH" 32(%1) \n\t"
- "movd (%1), %%mm0 \n\t"
- "movd 4(%1), %%mm3 \n\t"
- "punpckldq 8(%1), %%mm0 \n\t"
- "punpckldq 12(%1), %%mm3 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm3, %%mm4 \n\t"
- "movq %%mm3, %%mm5 \n\t"
- "psllq $8, %%mm0 \n\t"
- "psllq $8, %%mm3 \n\t"
- "pand %%mm7, %%mm0 \n\t"
- "pand %%mm7, %%mm3 \n\t"
- "psrlq $5, %%mm1 \n\t"
- "psrlq $5, %%mm4 \n\t"
- "pand %%mm6, %%mm1 \n\t"
- "pand %%mm6, %%mm4 \n\t"
- "psrlq $19, %%mm2 \n\t"
- "psrlq $19, %%mm5 \n\t"
- "pand %2, %%mm2 \n\t"
- "pand %2, %%mm5 \n\t"
- "por %%mm1, %%mm0 \n\t"
- "por %%mm4, %%mm3 \n\t"
- "por %%mm2, %%mm0 \n\t"
- "por %%mm5, %%mm3 \n\t"
- "psllq $16, %%mm3 \n\t"
- "por %%mm3, %%mm0 \n\t"
- MOVNTQ" %%mm0, (%0) \n\t"
- :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
- d += 4;
- s += 16;
- }
- __asm__ volatile(SFENCE:::"memory");
- __asm__ volatile(EMMS:::"memory");
- while (s < end) {
- register int rgb = *(const uint32_t*)s; s += 4;
- *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
- }
-}
-
-static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
-{
- const uint8_t *s = src;
- const uint8_t *end;
- const uint8_t *mm_end;
- uint16_t *d = (uint16_t *)dst;
- end = s + src_size;
- mm_end = end - 15;
- __asm__ volatile(
- "movq %3, %%mm5 \n\t"
- "movq %4, %%mm6 \n\t"
- "movq %5, %%mm7 \n\t"
- "jmp 2f \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- PREFETCH" 32(%1) \n\t"
- "movd (%1), %%mm0 \n\t"
- "movd 4(%1), %%mm3 \n\t"
- "punpckldq 8(%1), %%mm0 \n\t"
- "punpckldq 12(%1), %%mm3 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm3, %%mm4 \n\t"
- "pand %%mm6, %%mm0 \n\t"
- "pand %%mm6, %%mm3 \n\t"
- "pmaddwd %%mm7, %%mm0 \n\t"
- "pmaddwd %%mm7, %%mm3 \n\t"
- "pand %%mm5, %%mm1 \n\t"
- "pand %%mm5, %%mm4 \n\t"
- "por %%mm1, %%mm0 \n\t"
- "por %%mm4, %%mm3 \n\t"
- "psrld $6, %%mm0 \n\t"
- "pslld $10, %%mm3 \n\t"
- "por %%mm3, %%mm0 \n\t"
- MOVNTQ" %%mm0, (%0) \n\t"
- "add $16, %1 \n\t"
- "add $8, %0 \n\t"
- "2: \n\t"
- "cmp %2, %1 \n\t"
- " jb 1b \n\t"
- : "+r" (d), "+r"(s)
- : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
- );
- __asm__ volatile(SFENCE:::"memory");
- __asm__ volatile(EMMS:::"memory");
- while (s < end) {
- register int rgb = *(const uint32_t*)s; s += 4;
- *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
- }
-}
-
-static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
-{
- const uint8_t *s = src;
- const uint8_t *end;
- const uint8_t *mm_end;
- uint16_t *d = (uint16_t *)dst;
- end = s + src_size;
- __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
- __asm__ volatile(
- "movq %0, %%mm7 \n\t"
- "movq %1, %%mm6 \n\t"
- ::"m"(red_15mask),"m"(green_15mask));
- mm_end = end - 15;
- while (s < mm_end) {
- __asm__ volatile(
- PREFETCH" 32(%1) \n\t"
- "movd (%1), %%mm0 \n\t"
- "movd 4(%1), %%mm3 \n\t"
- "punpckldq 8(%1), %%mm0 \n\t"
- "punpckldq 12(%1), %%mm3 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm3, %%mm4 \n\t"
- "movq %%mm3, %%mm5 \n\t"
- "psllq $7, %%mm0 \n\t"
- "psllq $7, %%mm3 \n\t"
- "pand %%mm7, %%mm0 \n\t"
- "pand %%mm7, %%mm3 \n\t"
- "psrlq $6, %%mm1 \n\t"
- "psrlq $6, %%mm4 \n\t"
- "pand %%mm6, %%mm1 \n\t"
- "pand %%mm6, %%mm4 \n\t"
- "psrlq $19, %%mm2 \n\t"
- "psrlq $19, %%mm5 \n\t"
- "pand %2, %%mm2 \n\t"
- "pand %2, %%mm5 \n\t"
- "por %%mm1, %%mm0 \n\t"
- "por %%mm4, %%mm3 \n\t"
- "por %%mm2, %%mm0 \n\t"
- "por %%mm5, %%mm3 \n\t"
- "psllq $16, %%mm3 \n\t"
- "por %%mm3, %%mm0 \n\t"
- MOVNTQ" %%mm0, (%0) \n\t"
- ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
- d += 4;
- s += 16;
- }
- __asm__ volatile(SFENCE:::"memory");
- __asm__ volatile(EMMS:::"memory");
- while (s < end) {
- register int rgb = *(const uint32_t*)s; s += 4;
- *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
- }
-}
-
-static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
-{
- const uint8_t *s = src;
- const uint8_t *end;
- const uint8_t *mm_end;
- uint16_t *d = (uint16_t *)dst;
- end = s + src_size;
- __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
- __asm__ volatile(
- "movq %0, %%mm7 \n\t"
- "movq %1, %%mm6 \n\t"
- ::"m"(red_16mask),"m"(green_16mask));
- mm_end = end - 11;
- while (s < mm_end) {
- __asm__ volatile(
- PREFETCH" 32(%1) \n\t"
- "movd (%1), %%mm0 \n\t"
- "movd 3(%1), %%mm3 \n\t"
- "punpckldq 6(%1), %%mm0 \n\t"
- "punpckldq 9(%1), %%mm3 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm3, %%mm4 \n\t"
- "movq %%mm3, %%mm5 \n\t"
- "psrlq $3, %%mm0 \n\t"
- "psrlq $3, %%mm3 \n\t"
- "pand %2, %%mm0 \n\t"
- "pand %2, %%mm3 \n\t"
- "psrlq $5, %%mm1 \n\t"
- "psrlq $5, %%mm4 \n\t"
- "pand %%mm6, %%mm1 \n\t"
- "pand %%mm6, %%mm4 \n\t"
- "psrlq $8, %%mm2 \n\t"
- "psrlq $8, %%mm5 \n\t"
- "pand %%mm7, %%mm2 \n\t"
- "pand %%mm7, %%mm5 \n\t"
- "por %%mm1, %%mm0 \n\t"
- "por %%mm4, %%mm3 \n\t"
- "por %%mm2, %%mm0 \n\t"
- "por %%mm5, %%mm3 \n\t"
- "psllq $16, %%mm3 \n\t"
- "por %%mm3, %%mm0 \n\t"
- MOVNTQ" %%mm0, (%0) \n\t"
- ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
- d += 4;
- s += 12;
- }
- __asm__ volatile(SFENCE:::"memory");
- __asm__ volatile(EMMS:::"memory");
- while (s < end) {
- const int b = *s++;
- const int g = *s++;
- const int r = *s++;
- *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
- }
-}
-
-static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
-{
- const uint8_t *s = src;
- const uint8_t *end;
- const uint8_t *mm_end;
- uint16_t *d = (uint16_t *)dst;
- end = s + src_size;
- __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
- __asm__ volatile(
- "movq %0, %%mm7 \n\t"
- "movq %1, %%mm6 \n\t"
- ::"m"(red_16mask),"m"(green_16mask));
- mm_end = end - 15;
- while (s < mm_end) {
- __asm__ volatile(
- PREFETCH" 32(%1) \n\t"
- "movd (%1), %%mm0 \n\t"
- "movd 3(%1), %%mm3 \n\t"
- "punpckldq 6(%1), %%mm0 \n\t"
- "punpckldq 9(%1), %%mm3 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm3, %%mm4 \n\t"
- "movq %%mm3, %%mm5 \n\t"
- "psllq $8, %%mm0 \n\t"
- "psllq $8, %%mm3 \n\t"
- "pand %%mm7, %%mm0 \n\t"
- "pand %%mm7, %%mm3 \n\t"
- "psrlq $5, %%mm1 \n\t"
- "psrlq $5, %%mm4 \n\t"
- "pand %%mm6, %%mm1 \n\t"
- "pand %%mm6, %%mm4 \n\t"
- "psrlq $19, %%mm2 \n\t"
- "psrlq $19, %%mm5 \n\t"
- "pand %2, %%mm2 \n\t"
- "pand %2, %%mm5 \n\t"
- "por %%mm1, %%mm0 \n\t"
- "por %%mm4, %%mm3 \n\t"
- "por %%mm2, %%mm0 \n\t"
- "por %%mm5, %%mm3 \n\t"
- "psllq $16, %%mm3 \n\t"
- "por %%mm3, %%mm0 \n\t"
- MOVNTQ" %%mm0, (%0) \n\t"
- ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
- d += 4;
- s += 12;
- }
- __asm__ volatile(SFENCE:::"memory");
- __asm__ volatile(EMMS:::"memory");
- while (s < end) {
- const int r = *s++;
- const int g = *s++;
- const int b = *s++;
- *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
- }
-}
-
-static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
-{
- const uint8_t *s = src;
- const uint8_t *end;
- const uint8_t *mm_end;
- uint16_t *d = (uint16_t *)dst;
- end = s + src_size;
- __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
- __asm__ volatile(
- "movq %0, %%mm7 \n\t"
- "movq %1, %%mm6 \n\t"
- ::"m"(red_15mask),"m"(green_15mask));
- mm_end = end - 11;
- while (s < mm_end) {
- __asm__ volatile(
- PREFETCH" 32(%1) \n\t"
- "movd (%1), %%mm0 \n\t"
- "movd 3(%1), %%mm3 \n\t"
- "punpckldq 6(%1), %%mm0 \n\t"
- "punpckldq 9(%1), %%mm3 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm3, %%mm4 \n\t"
- "movq %%mm3, %%mm5 \n\t"
- "psrlq $3, %%mm0 \n\t"
- "psrlq $3, %%mm3 \n\t"
- "pand %2, %%mm0 \n\t"
- "pand %2, %%mm3 \n\t"
- "psrlq $6, %%mm1 \n\t"
- "psrlq $6, %%mm4 \n\t"
- "pand %%mm6, %%mm1 \n\t"
- "pand %%mm6, %%mm4 \n\t"
- "psrlq $9, %%mm2 \n\t"
- "psrlq $9, %%mm5 \n\t"
- "pand %%mm7, %%mm2 \n\t"
- "pand %%mm7, %%mm5 \n\t"
- "por %%mm1, %%mm0 \n\t"
- "por %%mm4, %%mm3 \n\t"
- "por %%mm2, %%mm0 \n\t"
- "por %%mm5, %%mm3 \n\t"
- "psllq $16, %%mm3 \n\t"
- "por %%mm3, %%mm0 \n\t"
- MOVNTQ" %%mm0, (%0) \n\t"
- ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
- d += 4;
- s += 12;
- }
- __asm__ volatile(SFENCE:::"memory");
- __asm__ volatile(EMMS:::"memory");
- while (s < end) {
- const int b = *s++;
- const int g = *s++;
- const int r = *s++;
- *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
- }
-}
-
-static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
-{
- const uint8_t *s = src;
- const uint8_t *end;
- const uint8_t *mm_end;
- uint16_t *d = (uint16_t *)dst;
- end = s + src_size;
- __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
- __asm__ volatile(
- "movq %0, %%mm7 \n\t"
- "movq %1, %%mm6 \n\t"
- ::"m"(red_15mask),"m"(green_15mask));
- mm_end = end - 15;
- while (s < mm_end) {
- __asm__ volatile(
- PREFETCH" 32(%1) \n\t"
- "movd (%1), %%mm0 \n\t"
- "movd 3(%1), %%mm3 \n\t"
- "punpckldq 6(%1), %%mm0 \n\t"
- "punpckldq 9(%1), %%mm3 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm3, %%mm4 \n\t"
- "movq %%mm3, %%mm5 \n\t"
- "psllq $7, %%mm0 \n\t"
- "psllq $7, %%mm3 \n\t"
- "pand %%mm7, %%mm0 \n\t"
- "pand %%mm7, %%mm3 \n\t"
- "psrlq $6, %%mm1 \n\t"
- "psrlq $6, %%mm4 \n\t"
- "pand %%mm6, %%mm1 \n\t"
- "pand %%mm6, %%mm4 \n\t"
- "psrlq $19, %%mm2 \n\t"
- "psrlq $19, %%mm5 \n\t"
- "pand %2, %%mm2 \n\t"
- "pand %2, %%mm5 \n\t"
- "por %%mm1, %%mm0 \n\t"
- "por %%mm4, %%mm3 \n\t"
- "por %%mm2, %%mm0 \n\t"
- "por %%mm5, %%mm3 \n\t"
- "psllq $16, %%mm3 \n\t"
- "por %%mm3, %%mm0 \n\t"
- MOVNTQ" %%mm0, (%0) \n\t"
- ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
- d += 4;
- s += 12;
- }
- __asm__ volatile(SFENCE:::"memory");
- __asm__ volatile(EMMS:::"memory");
- while (s < end) {
- const int r = *s++;
- const int g = *s++;
- const int b = *s++;
- *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
- }
-}
-
-static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
-{
- const uint16_t *end;
- const uint16_t *mm_end;
- uint8_t *d = dst;
- const uint16_t *s = (const uint16_t*)src;
- end = s + src_size/2;
- __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
- mm_end = end - 7;
- while (s < mm_end) {
- __asm__ volatile(
- PREFETCH" 32(%1) \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1), %%mm1 \n\t"
- "movq (%1), %%mm2 \n\t"
- "pand %2, %%mm0 \n\t"
- "pand %3, %%mm1 \n\t"
- "pand %4, %%mm2 \n\t"
- "psllq $5, %%mm0 \n\t"
- "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
- "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
- "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
- "movq %%mm0, %%mm3 \n\t"
- "movq %%mm1, %%mm4 \n\t"
- "movq %%mm2, %%mm5 \n\t"
- "punpcklwd %5, %%mm0 \n\t"
- "punpcklwd %5, %%mm1 \n\t"
- "punpcklwd %5, %%mm2 \n\t"
- "punpckhwd %5, %%mm3 \n\t"
- "punpckhwd %5, %%mm4 \n\t"
- "punpckhwd %5, %%mm5 \n\t"
- "psllq $8, %%mm1 \n\t"
- "psllq $16, %%mm2 \n\t"
- "por %%mm1, %%mm0 \n\t"
- "por %%mm2, %%mm0 \n\t"
- "psllq $8, %%mm4 \n\t"
- "psllq $16, %%mm5 \n\t"
- "por %%mm4, %%mm3 \n\t"
- "por %%mm5, %%mm3 \n\t"
-
- "movq %%mm0, %%mm6 \n\t"
- "movq %%mm3, %%mm7 \n\t"
-
- "movq 8(%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- "movq 8(%1), %%mm2 \n\t"
- "pand %2, %%mm0 \n\t"
- "pand %3, %%mm1 \n\t"
- "pand %4, %%mm2 \n\t"
- "psllq $5, %%mm0 \n\t"
- "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
- "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
- "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
- "movq %%mm0, %%mm3 \n\t"
- "movq %%mm1, %%mm4 \n\t"
- "movq %%mm2, %%mm5 \n\t"
- "punpcklwd %5, %%mm0 \n\t"
- "punpcklwd %5, %%mm1 \n\t"
- "punpcklwd %5, %%mm2 \n\t"
- "punpckhwd %5, %%mm3 \n\t"
- "punpckhwd %5, %%mm4 \n\t"
- "punpckhwd %5, %%mm5 \n\t"
- "psllq $8, %%mm1 \n\t"
- "psllq $16, %%mm2 \n\t"
- "por %%mm1, %%mm0 \n\t"
- "por %%mm2, %%mm0 \n\t"
- "psllq $8, %%mm4 \n\t"
- "psllq $16, %%mm5 \n\t"
- "por %%mm4, %%mm3 \n\t"
- "por %%mm5, %%mm3 \n\t"
-
- :"=m"(*d)
- :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
- :"memory");
-        /* borrowed from the 32-to-24 conversion */
- __asm__ volatile(
- "movq %%mm0, %%mm4 \n\t"
- "movq %%mm3, %%mm5 \n\t"
- "movq %%mm6, %%mm0 \n\t"
- "movq %%mm7, %%mm1 \n\t"
-
- "movq %%mm4, %%mm6 \n\t"
- "movq %%mm5, %%mm7 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm1, %%mm3 \n\t"
-
- STORE_BGR24_MMX
-
- :: "r"(d), "m"(*s)
- :"memory");
- d += 24;
- s += 8;
- }
- __asm__ volatile(SFENCE:::"memory");
- __asm__ volatile(EMMS:::"memory");
- while (s < end) {
- register uint16_t bgr;
- bgr = *s++;
- *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
- *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
- *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
- }
-}
-
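(The scalar tail above widens each 5-bit field by bit replication: (c<<3)|(c>>2)
maps 0..31 onto the full 0..255 range, e.g. 31 becomes 248|7 = 255 and 0 stays 0.
A minimal, illustrative C helper for the same step, not part of this file:

static inline uint8_t expand5to8(uint16_t c)
{
    c &= 0x1F;                              /* keep the 5-bit channel        */
    return (uint8_t)((c << 3) | (c >> 2));  /* refill low bits from the top  */
}

The 6-bit green channel in the 16-bit variants below is widened the same way,
as (c<<2)|(c>>4).)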
-static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
-{
- const uint16_t *end;
- const uint16_t *mm_end;
- uint8_t *d = (uint8_t *)dst;
- const uint16_t *s = (const uint16_t *)src;
- end = s + src_size/2;
- __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
- mm_end = end - 7;
- while (s < mm_end) {
- __asm__ volatile(
- PREFETCH" 32(%1) \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1), %%mm1 \n\t"
- "movq (%1), %%mm2 \n\t"
- "pand %2, %%mm0 \n\t"
- "pand %3, %%mm1 \n\t"
- "pand %4, %%mm2 \n\t"
- "psllq $5, %%mm0 \n\t"
- "psrlq $1, %%mm2 \n\t"
- "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
- "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
- "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
- "movq %%mm0, %%mm3 \n\t"
- "movq %%mm1, %%mm4 \n\t"
- "movq %%mm2, %%mm5 \n\t"
- "punpcklwd %5, %%mm0 \n\t"
- "punpcklwd %5, %%mm1 \n\t"
- "punpcklwd %5, %%mm2 \n\t"
- "punpckhwd %5, %%mm3 \n\t"
- "punpckhwd %5, %%mm4 \n\t"
- "punpckhwd %5, %%mm5 \n\t"
- "psllq $8, %%mm1 \n\t"
- "psllq $16, %%mm2 \n\t"
- "por %%mm1, %%mm0 \n\t"
- "por %%mm2, %%mm0 \n\t"
- "psllq $8, %%mm4 \n\t"
- "psllq $16, %%mm5 \n\t"
- "por %%mm4, %%mm3 \n\t"
- "por %%mm5, %%mm3 \n\t"
-
- "movq %%mm0, %%mm6 \n\t"
- "movq %%mm3, %%mm7 \n\t"
-
- "movq 8(%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- "movq 8(%1), %%mm2 \n\t"
- "pand %2, %%mm0 \n\t"
- "pand %3, %%mm1 \n\t"
- "pand %4, %%mm2 \n\t"
- "psllq $5, %%mm0 \n\t"
- "psrlq $1, %%mm2 \n\t"
- "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
- "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
- "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
- "movq %%mm0, %%mm3 \n\t"
- "movq %%mm1, %%mm4 \n\t"
- "movq %%mm2, %%mm5 \n\t"
- "punpcklwd %5, %%mm0 \n\t"
- "punpcklwd %5, %%mm1 \n\t"
- "punpcklwd %5, %%mm2 \n\t"
- "punpckhwd %5, %%mm3 \n\t"
- "punpckhwd %5, %%mm4 \n\t"
- "punpckhwd %5, %%mm5 \n\t"
- "psllq $8, %%mm1 \n\t"
- "psllq $16, %%mm2 \n\t"
- "por %%mm1, %%mm0 \n\t"
- "por %%mm2, %%mm0 \n\t"
- "psllq $8, %%mm4 \n\t"
- "psllq $16, %%mm5 \n\t"
- "por %%mm4, %%mm3 \n\t"
- "por %%mm5, %%mm3 \n\t"
- :"=m"(*d)
- :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
- :"memory");
-        /* borrowed from the 32-to-24 conversion */
- __asm__ volatile(
- "movq %%mm0, %%mm4 \n\t"
- "movq %%mm3, %%mm5 \n\t"
- "movq %%mm6, %%mm0 \n\t"
- "movq %%mm7, %%mm1 \n\t"
-
- "movq %%mm4, %%mm6 \n\t"
- "movq %%mm5, %%mm7 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm1, %%mm3 \n\t"
-
- STORE_BGR24_MMX
-
- :: "r"(d), "m"(*s)
- :"memory");
- d += 24;
- s += 8;
- }
- __asm__ volatile(SFENCE:::"memory");
- __asm__ volatile(EMMS:::"memory");
- while (s < end) {
- register uint16_t bgr;
- bgr = *s++;
- *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
- *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
- *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
- }
-}
-
-/*
- * mm0 = 00 B3 00 B2 00 B1 00 B0
- * mm1 = 00 G3 00 G2 00 G1 00 G0
- * mm2 = 00 R3 00 R2 00 R1 00 R0
- * mm6 = FF FF FF FF FF FF FF FF
- * mm7 = 00 00 00 00 00 00 00 00
- */
-#define PACK_RGB32 \
- "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
- "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
- "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
- "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
- "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
- "movq %%mm0, %%mm3 \n\t" \
- "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
- "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
- MOVNTQ" %%mm0, (%0) \n\t" \
- MOVNTQ" %%mm3, 8(%0) \n\t" \
-
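For reference, PACK_RGB32 is a 4-pixel planar-to-packed interleave with the
alpha byte forced to 0xFF (the all-ones mm6). A scalar C sketch of the same
step, with illustrative names only:

static void pack_rgb32_ref(const uint8_t b[4], const uint8_t g[4],
                           const uint8_t r[4], uint8_t *dst)
{
    for (int i = 0; i < 4; i++) {
        dst[4 * i + 0] = b[i];
        dst[4 * i + 1] = g[i];
        dst[4 * i + 2] = r[i];
        dst[4 * i + 3] = 0xFF; /* alpha, from mm6 */
    }
}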
-static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
-{
- const uint16_t *end;
- const uint16_t *mm_end;
- uint8_t *d = dst;
- const uint16_t *s = (const uint16_t *)src;
- end = s + src_size/2;
- __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
- __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
- __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
- mm_end = end - 3;
- while (s < mm_end) {
- __asm__ volatile(
- PREFETCH" 32(%1) \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1), %%mm1 \n\t"
- "movq (%1), %%mm2 \n\t"
- "pand %2, %%mm0 \n\t"
- "pand %3, %%mm1 \n\t"
- "pand %4, %%mm2 \n\t"
- "psllq $5, %%mm0 \n\t"
- "pmulhw %5, %%mm0 \n\t"
- "pmulhw %5, %%mm1 \n\t"
- "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
- PACK_RGB32
- ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
- :"memory");
- d += 16;
- s += 4;
- }
- __asm__ volatile(SFENCE:::"memory");
- __asm__ volatile(EMMS:::"memory");
- while (s < end) {
- register uint16_t bgr;
- bgr = *s++;
- *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
- *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
- *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
- *d++ = 255;
- }
-}
-
-static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
-{
- const uint16_t *end;
- const uint16_t *mm_end;
- uint8_t *d = dst;
- const uint16_t *s = (const uint16_t*)src;
- end = s + src_size/2;
- __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
- __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
- __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
- mm_end = end - 3;
- while (s < mm_end) {
- __asm__ volatile(
- PREFETCH" 32(%1) \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1), %%mm1 \n\t"
- "movq (%1), %%mm2 \n\t"
- "pand %2, %%mm0 \n\t"
- "pand %3, %%mm1 \n\t"
- "pand %4, %%mm2 \n\t"
- "psllq $5, %%mm0 \n\t"
- "psrlq $1, %%mm2 \n\t"
- "pmulhw %5, %%mm0 \n\t"
- "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
- "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
- PACK_RGB32
- ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
- :"memory");
- d += 16;
- s += 4;
- }
- __asm__ volatile(SFENCE:::"memory");
- __asm__ volatile(EMMS:::"memory");
- while (s < end) {
- register uint16_t bgr;
- bgr = *s++;
- *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
- *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
- *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
- *d++ = 255;
- }
-}
-
-static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
-{
- x86_reg idx = 15 - src_size;
- const uint8_t *s = src-idx;
- uint8_t *d = dst-idx;
- __asm__ volatile(
- "test %0, %0 \n\t"
- "jns 2f \n\t"
- PREFETCH" (%1, %0) \n\t"
- "movq %3, %%mm7 \n\t"
- "pxor %4, %%mm7 \n\t"
- "movq %%mm7, %%mm6 \n\t"
- "pxor %5, %%mm7 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- PREFETCH" 32(%1, %0) \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
-# if COMPILE_TEMPLATE_MMXEXT
- "pshufw $177, %%mm0, %%mm3 \n\t"
- "pshufw $177, %%mm1, %%mm5 \n\t"
- "pand %%mm7, %%mm0 \n\t"
- "pand %%mm6, %%mm3 \n\t"
- "pand %%mm7, %%mm1 \n\t"
- "pand %%mm6, %%mm5 \n\t"
- "por %%mm3, %%mm0 \n\t"
- "por %%mm5, %%mm1 \n\t"
-# else
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm1, %%mm4 \n\t"
- "pand %%mm7, %%mm0 \n\t"
- "pand %%mm6, %%mm2 \n\t"
- "pand %%mm7, %%mm1 \n\t"
- "pand %%mm6, %%mm4 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "pslld $16, %%mm2 \n\t"
- "psrld $16, %%mm3 \n\t"
- "pslld $16, %%mm4 \n\t"
- "psrld $16, %%mm5 \n\t"
- "por %%mm2, %%mm0 \n\t"
- "por %%mm4, %%mm1 \n\t"
- "por %%mm3, %%mm0 \n\t"
- "por %%mm5, %%mm1 \n\t"
-# endif
- MOVNTQ" %%mm0, (%2, %0) \n\t"
- MOVNTQ" %%mm1, 8(%2, %0) \n\t"
- "add $16, %0 \n\t"
- "js 1b \n\t"
- SFENCE" \n\t"
- EMMS" \n\t"
- "2: \n\t"
- : "+&r"(idx)
- : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
- : "memory");
- for (; idx<15; idx+=4) {
- register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
- v &= 0xff00ff;
- *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
- }
-}
-
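The scalar tail of shuffle_bytes_2103 swaps bytes 0 and 2 of each 32-bit pixel
while leaving bytes 1 and 3 (G and A) untouched: mask out G/A, then exchange
the two remaining bytes with a pair of 16-bit shifts. The same trick written
out as a standalone helper (sketch, illustrative name):

static uint32_t swap_bytes_0_2(uint32_t v)
{
    uint32_t ga = v & 0xff00ff00;        /* bytes 1 and 3 stay in place */
    uint32_t rb = v & 0x00ff00ff;        /* bytes 0 and 2 trade places  */
    return ga | (rb >> 16) | (rb << 16);
}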
-static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
-{
- unsigned i;
- x86_reg mmx_size= 23 - src_size;
- __asm__ volatile (
- "test %%"REG_a", %%"REG_a" \n\t"
- "jns 2f \n\t"
- "movq "MANGLE(mask24r)", %%mm5 \n\t"
- "movq "MANGLE(mask24g)", %%mm6 \n\t"
- "movq "MANGLE(mask24b)", %%mm7 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- PREFETCH" 32(%1, %%"REG_a") \n\t"
- "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
- "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
- "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
- "psllq $16, %%mm0 \n\t" // 00 BGR BGR
- "pand %%mm5, %%mm0 \n\t"
- "pand %%mm6, %%mm1 \n\t"
- "pand %%mm7, %%mm2 \n\t"
- "por %%mm0, %%mm1 \n\t"
- "por %%mm2, %%mm1 \n\t"
- "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
- MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
- "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
- "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
- "pand %%mm7, %%mm0 \n\t"
- "pand %%mm5, %%mm1 \n\t"
- "pand %%mm6, %%mm2 \n\t"
- "por %%mm0, %%mm1 \n\t"
- "por %%mm2, %%mm1 \n\t"
- "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
- MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
- "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
- "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
- "pand %%mm6, %%mm0 \n\t"
- "pand %%mm7, %%mm1 \n\t"
- "pand %%mm5, %%mm2 \n\t"
- "por %%mm0, %%mm1 \n\t"
- "por %%mm2, %%mm1 \n\t"
- MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
- "add $24, %%"REG_a" \n\t"
- " js 1b \n\t"
- "2: \n\t"
- : "+a" (mmx_size)
- : "r" (src-mmx_size), "r"(dst-mmx_size)
- );
-
- __asm__ volatile(SFENCE:::"memory");
- __asm__ volatile(EMMS:::"memory");
-
-    if (mmx_size==23) return; //finished, src_size was a multiple of 24 bytes (8 pixels)
-
- src+= src_size;
- dst+= src_size;
- src_size= 23-mmx_size;
- src-= src_size;
- dst-= src_size;
- for (i=0; i<src_size; i+=3) {
- register uint8_t x;
- x = src[i + 2];
- dst[i + 1] = src[i + 1];
- dst[i + 2] = src[i + 0];
- dst[i + 0] = x;
- }
-}
-
-static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
- int width, int height,
- int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
-{
- int y;
- const x86_reg chromWidth= width>>1;
- for (y=0; y<height; y++) {
- //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
- __asm__ volatile(
- "xor %%"REG_a", %%"REG_a" \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
- PREFETCH" 32(%2, %%"REG_a") \n\t"
- PREFETCH" 32(%3, %%"REG_a") \n\t"
- "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
- "movq %%mm0, %%mm2 \n\t" // U(0)
- "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
- "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
- "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
-
- "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
- "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
- "movq %%mm3, %%mm4 \n\t" // Y(0)
- "movq %%mm5, %%mm6 \n\t" // Y(8)
- "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
- "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
- "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
- "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
-
- MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
- MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
- MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
- MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
-
- "add $8, %%"REG_a" \n\t"
- "cmp %4, %%"REG_a" \n\t"
- " jb 1b \n\t"
- ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
- : "%"REG_a
- );
- if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
- usrc += chromStride;
- vsrc += chromStride;
- }
- ysrc += lumStride;
- dst += dstStride;
- }
- __asm__(EMMS" \n\t"
- SFENCE" \n\t"
- :::"memory");
-}
-
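Per output line, the asm above performs the plain YUY2 interleave; a scalar
reference for the inner loop (sketch, illustrative only):

for (int i = 0; i < width / 2; i++) {
    dst[4 * i + 0] = ysrc[2 * i];     /* Y0 */
    dst[4 * i + 1] = usrc[i];         /* U  */
    dst[4 * i + 2] = ysrc[2 * i + 1]; /* Y1 */
    dst[4 * i + 3] = vsrc[i];         /* V  */
}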
-/**
- * Height should be a multiple of 2 and width should be a multiple of 16.
- * (If this is a problem for anyone then tell me, and I will fix it.)
- */
-static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
- int width, int height,
- int lumStride, int chromStride, int dstStride)
-{
- //FIXME interpolate chroma
- RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
-}
-
-static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
- int width, int height,
- int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
-{
- int y;
- const x86_reg chromWidth= width>>1;
- for (y=0; y<height; y++) {
- //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
- __asm__ volatile(
- "xor %%"REG_a", %%"REG_a" \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
- PREFETCH" 32(%2, %%"REG_a") \n\t"
- PREFETCH" 32(%3, %%"REG_a") \n\t"
- "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
- "movq %%mm0, %%mm2 \n\t" // U(0)
- "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
- "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
- "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
-
- "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
- "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
-            "movq %%mm0, %%mm4              \n\t" // UVUV UVUV(0)
-            "movq %%mm2, %%mm6              \n\t" // UVUV UVUV(8)
-            "punpcklbw %%mm3, %%mm0         \n\t" // UYVY UYVY(0)
-            "punpckhbw %%mm3, %%mm4         \n\t" // UYVY UYVY(4)
-            "punpcklbw %%mm5, %%mm2         \n\t" // UYVY UYVY(8)
-            "punpckhbw %%mm5, %%mm6         \n\t" // UYVY UYVY(12)
-
- MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
- MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
- MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
- MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
-
- "add $8, %%"REG_a" \n\t"
- "cmp %4, %%"REG_a" \n\t"
- " jb 1b \n\t"
- ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
- : "%"REG_a
- );
- if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
- usrc += chromStride;
- vsrc += chromStride;
- }
- ysrc += lumStride;
- dst += dstStride;
- }
- __asm__(EMMS" \n\t"
- SFENCE" \n\t"
- :::"memory");
-}
-
-/**
- * Height should be a multiple of 2 and width should be a multiple of 16.
- * (If this is a problem for anyone then tell me, and I will fix it.)
- */
-static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
- int width, int height,
- int lumStride, int chromStride, int dstStride)
-{
- //FIXME interpolate chroma
- RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
-}
-
-/**
- * Width should be a multiple of 16.
- */
-static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
- int width, int height,
- int lumStride, int chromStride, int dstStride)
-{
- RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
-}
-
-/**
- * Width should be a multiple of 16.
- */
-static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
- int width, int height,
- int lumStride, int chromStride, int dstStride)
-{
- RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
-}
-
-/**
- * Height should be a multiple of 2 and width should be a multiple of 16.
- * (If this is a problem for anyone then tell me, and I will fix it.)
- */
-static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
- int width, int height,
- int lumStride, int chromStride, int srcStride)
-{
- int y;
- const x86_reg chromWidth= width>>1;
- for (y=0; y<height; y+=2) {
- __asm__ volatile(
- "xor %%"REG_a", %%"REG_a" \n\t"
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
- ".p2align 4 \n\t"
- "1: \n\t"
- PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
- "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
- "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
- "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
- "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
- "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
- "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
- "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
- "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
- "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
- "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
-
- MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
-
- "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
- "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
- "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
- "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
- "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
- "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
- "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
- "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
- "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
- "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
-
- MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
-
- "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
- "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
- "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
- "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
- "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
- "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
- "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
- "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
-
- MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
- MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
-
- "add $8, %%"REG_a" \n\t"
- "cmp %4, %%"REG_a" \n\t"
- " jb 1b \n\t"
- ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
- : "memory", "%"REG_a
- );
-
- ydst += lumStride;
- src += srcStride;
-
- __asm__ volatile(
- "xor %%"REG_a", %%"REG_a" \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
- "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
- "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
- "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
- "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
- "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
- "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
- "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
- "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
- "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
- "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
-
- MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
- MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
-
- "add $8, %%"REG_a" \n\t"
- "cmp %4, %%"REG_a" \n\t"
- " jb 1b \n\t"
-
- ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
- : "memory", "%"REG_a
- );
- udst += chromStride;
- vdst += chromStride;
- ydst += lumStride;
- src += srcStride;
- }
- __asm__ volatile(EMMS" \n\t"
- SFENCE" \n\t"
- :::"memory");
-}
-#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
-
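yuy2toyv12 above is the inverse split: even source bytes form the luma plane,
and the odd bytes are de-interleaved into U and V, with chroma sampled from
every second line only. The per-line scalar shape (sketch):

for (int i = 0; i < chromWidth; i++) {
    ydst[2 * i]     = src[4 * i + 0];
    udst[i]         = src[4 * i + 1];
    ydst[2 * i + 1] = src[4 * i + 2];
    vdst[i]         = src[4 * i + 3];
}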
-#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
-static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
-{
- int x,y;
-
- dst[0]= src[0];
-
- // first line
- for (x=0; x<srcWidth-1; x++) {
- dst[2*x+1]= (3*src[x] + src[x+1])>>2;
- dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
- }
- dst[2*srcWidth-1]= src[srcWidth-1];
-
- dst+= dstStride;
-
- for (y=1; y<srcHeight; y++) {
- const x86_reg mmxSize= srcWidth&~15;
- __asm__ volatile(
- "mov %4, %%"REG_a" \n\t"
- "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
- "movq (%0, %%"REG_a"), %%mm4 \n\t"
- "movq %%mm4, %%mm2 \n\t"
- "psllq $8, %%mm4 \n\t"
- "pand %%mm0, %%mm2 \n\t"
- "por %%mm2, %%mm4 \n\t"
- "movq (%1, %%"REG_a"), %%mm5 \n\t"
- "movq %%mm5, %%mm3 \n\t"
- "psllq $8, %%mm5 \n\t"
- "pand %%mm0, %%mm3 \n\t"
- "por %%mm3, %%mm5 \n\t"
- "1: \n\t"
- "movq (%0, %%"REG_a"), %%mm0 \n\t"
- "movq (%1, %%"REG_a"), %%mm1 \n\t"
- "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
- "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
- PAVGB" %%mm0, %%mm5 \n\t"
- PAVGB" %%mm0, %%mm3 \n\t"
- PAVGB" %%mm0, %%mm5 \n\t"
- PAVGB" %%mm0, %%mm3 \n\t"
- PAVGB" %%mm1, %%mm4 \n\t"
- PAVGB" %%mm1, %%mm2 \n\t"
- PAVGB" %%mm1, %%mm4 \n\t"
- PAVGB" %%mm1, %%mm2 \n\t"
- "movq %%mm5, %%mm7 \n\t"
- "movq %%mm4, %%mm6 \n\t"
- "punpcklbw %%mm3, %%mm5 \n\t"
- "punpckhbw %%mm3, %%mm7 \n\t"
- "punpcklbw %%mm2, %%mm4 \n\t"
- "punpckhbw %%mm2, %%mm6 \n\t"
- MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
- MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
- MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
- MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
- "add $8, %%"REG_a" \n\t"
- "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
- "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
- " js 1b \n\t"
- :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
- "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
- "g" (-mmxSize)
- : "%"REG_a
- );
-
- for (x=mmxSize-1; x<srcWidth-1; x++) {
- dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
- dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
- dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
- dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
- }
- dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
- dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
-
- dst+=dstStride*2;
- src+=srcStride;
- }
-
- // last line
- dst[0]= src[0];
-
- for (x=0; x<srcWidth-1; x++) {
- dst[2*x+1]= (3*src[x] + src[x+1])>>2;
- dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
- }
- dst[2*srcWidth-1]= src[srcWidth-1];
-
- __asm__ volatile(EMMS" \n\t"
- SFENCE" \n\t"
- :::"memory");
-}
-#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
-
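planar2x doubles a plane with fixed 3:1 bilinear weights: every interpolated
sample is (3*near + far) >> 2, applied horizontally and vertically. Worked
example: neighbouring source values 100 and 200 produce

    (3*100 + 200) >> 2 = 125   and   (100 + 3*200) >> 2 = 175,

which is exactly the arithmetic of the first/last-line C loops above; the MMX
body reaches the same 3:1 weighting, up to rounding, via two chained PAVGB
operations per output.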
-#if !COMPILE_TEMPLATE_AMD3DNOW
-/**
- * Height should be a multiple of 2 and width should be a multiple of 16.
- * (If this is a problem for anyone then tell me, and I will fix it.)
- * Chrominance data is only taken from every second line; the others are ignored.
- * FIXME: Write HQ version.
- */
-static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
- int width, int height,
- int lumStride, int chromStride, int srcStride)
-{
- int y;
- const x86_reg chromWidth= width>>1;
- for (y=0; y<height; y+=2) {
- __asm__ volatile(
- "xor %%"REG_a", %%"REG_a" \n\t"
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
- ".p2align 4 \n\t"
- "1: \n\t"
- PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
- "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
- "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
- "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
- "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
- "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
- "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
- "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
- "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
- "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
- "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
-
- MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
-
- "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
- "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
- "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
- "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
- "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
- "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
- "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
- "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
- "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
- "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
-
- MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
-
- "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
- "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
- "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
- "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
- "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
- "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
- "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
- "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
-
- MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
- MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
-
- "add $8, %%"REG_a" \n\t"
- "cmp %4, %%"REG_a" \n\t"
- " jb 1b \n\t"
- ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
- : "memory", "%"REG_a
- );
-
- ydst += lumStride;
- src += srcStride;
-
- __asm__ volatile(
- "xor %%"REG_a", %%"REG_a" \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
-            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
-            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
-            "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(8)
-            "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // UYVY UYVY(12)
- "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
- "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
- "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
- "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
- "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
- "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
-
- MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
- MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
-
- "add $8, %%"REG_a" \n\t"
- "cmp %4, %%"REG_a" \n\t"
- " jb 1b \n\t"
-
- ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
- : "memory", "%"REG_a
- );
- udst += chromStride;
- vdst += chromStride;
- ydst += lumStride;
- src += srcStride;
- }
- __asm__ volatile(EMMS" \n\t"
- SFENCE" \n\t"
- :::"memory");
-}
-#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
-
-/**
- * Height should be a multiple of 2 and width should be a multiple of 2.
- * (If this is a problem for anyone then tell me, and I will fix it.)
- * Chrominance data is only taken from every second line;
- * the others are ignored in the C version.
- * FIXME: Write HQ version.
- */
-static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
- int width, int height,
- int lumStride, int chromStride, int srcStride)
-{
- int y;
- const x86_reg chromWidth= width>>1;
- for (y=0; y<height-2; y+=2) {
- int i;
- for (i=0; i<2; i++) {
- __asm__ volatile(
- "mov %2, %%"REG_a" \n\t"
- "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
- "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
- "pxor %%mm7, %%mm7 \n\t"
- "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- PREFETCH" 64(%0, %%"REG_d") \n\t"
- "movd (%0, %%"REG_d"), %%mm0 \n\t"
- "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
- "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm3 \n\t"
- "pmaddwd %%mm6, %%mm0 \n\t"
- "pmaddwd %%mm6, %%mm1 \n\t"
- "pmaddwd %%mm6, %%mm2 \n\t"
- "pmaddwd %%mm6, %%mm3 \n\t"
-#ifndef FAST_BGR2YV12
- "psrad $8, %%mm0 \n\t"
- "psrad $8, %%mm1 \n\t"
- "psrad $8, %%mm2 \n\t"
- "psrad $8, %%mm3 \n\t"
-#endif
- "packssdw %%mm1, %%mm0 \n\t"
- "packssdw %%mm3, %%mm2 \n\t"
- "pmaddwd %%mm5, %%mm0 \n\t"
- "pmaddwd %%mm5, %%mm2 \n\t"
- "packssdw %%mm2, %%mm0 \n\t"
- "psraw $7, %%mm0 \n\t"
-
- "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
- "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
- "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm3 \n\t"
- "pmaddwd %%mm6, %%mm4 \n\t"
- "pmaddwd %%mm6, %%mm1 \n\t"
- "pmaddwd %%mm6, %%mm2 \n\t"
- "pmaddwd %%mm6, %%mm3 \n\t"
-#ifndef FAST_BGR2YV12
- "psrad $8, %%mm4 \n\t"
- "psrad $8, %%mm1 \n\t"
- "psrad $8, %%mm2 \n\t"
- "psrad $8, %%mm3 \n\t"
-#endif
- "packssdw %%mm1, %%mm4 \n\t"
- "packssdw %%mm3, %%mm2 \n\t"
- "pmaddwd %%mm5, %%mm4 \n\t"
- "pmaddwd %%mm5, %%mm2 \n\t"
- "add $24, %%"REG_d" \n\t"
- "packssdw %%mm2, %%mm4 \n\t"
- "psraw $7, %%mm4 \n\t"
-
- "packuswb %%mm4, %%mm0 \n\t"
- "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
-
- MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
- "add $8, %%"REG_a" \n\t"
- " js 1b \n\t"
- : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
- : "%"REG_a, "%"REG_d
- );
- ydst += lumStride;
- src += srcStride;
- }
- src -= srcStride*2;
- __asm__ volatile(
- "mov %4, %%"REG_a" \n\t"
- "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
- "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
- "pxor %%mm7, %%mm7 \n\t"
- "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
- "add %%"REG_d", %%"REG_d" \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- PREFETCH" 64(%0, %%"REG_d") \n\t"
- PREFETCH" 64(%1, %%"REG_d") \n\t"
-#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
- "movq (%0, %%"REG_d"), %%mm0 \n\t"
- "movq (%1, %%"REG_d"), %%mm1 \n\t"
- "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
- "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "psrlq $24, %%mm0 \n\t"
- "psrlq $24, %%mm2 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
-#else
- "movd (%0, %%"REG_d"), %%mm0 \n\t"
- "movd (%1, %%"REG_d"), %%mm1 \n\t"
- "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
- "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm3 \n\t"
- "paddw %%mm1, %%mm0 \n\t"
- "paddw %%mm3, %%mm2 \n\t"
- "paddw %%mm2, %%mm0 \n\t"
- "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
- "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
- "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
- "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm3 \n\t"
- "paddw %%mm1, %%mm4 \n\t"
- "paddw %%mm3, %%mm2 \n\t"
- "paddw %%mm4, %%mm2 \n\t"
- "psrlw $2, %%mm0 \n\t"
- "psrlw $2, %%mm2 \n\t"
-#endif
- "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
- "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
-
- "pmaddwd %%mm0, %%mm1 \n\t"
- "pmaddwd %%mm2, %%mm3 \n\t"
- "pmaddwd %%mm6, %%mm0 \n\t"
- "pmaddwd %%mm6, %%mm2 \n\t"
-#ifndef FAST_BGR2YV12
- "psrad $8, %%mm0 \n\t"
- "psrad $8, %%mm1 \n\t"
- "psrad $8, %%mm2 \n\t"
- "psrad $8, %%mm3 \n\t"
-#endif
- "packssdw %%mm2, %%mm0 \n\t"
- "packssdw %%mm3, %%mm1 \n\t"
- "pmaddwd %%mm5, %%mm0 \n\t"
- "pmaddwd %%mm5, %%mm1 \n\t"
- "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
- "psraw $7, %%mm0 \n\t"
-
-#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
- "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
- "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
- "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
- "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
- PAVGB" %%mm1, %%mm4 \n\t"
- PAVGB" %%mm3, %%mm2 \n\t"
- "movq %%mm4, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "psrlq $24, %%mm4 \n\t"
- "psrlq $24, %%mm2 \n\t"
- PAVGB" %%mm1, %%mm4 \n\t"
- PAVGB" %%mm3, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
-#else
- "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
- "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
- "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
- "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm3 \n\t"
- "paddw %%mm1, %%mm4 \n\t"
- "paddw %%mm3, %%mm2 \n\t"
- "paddw %%mm2, %%mm4 \n\t"
- "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
- "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
- "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
- "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm3 \n\t"
- "paddw %%mm1, %%mm5 \n\t"
- "paddw %%mm3, %%mm2 \n\t"
- "paddw %%mm5, %%mm2 \n\t"
- "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
- "psrlw $2, %%mm4 \n\t"
- "psrlw $2, %%mm2 \n\t"
-#endif
- "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
- "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
-
- "pmaddwd %%mm4, %%mm1 \n\t"
- "pmaddwd %%mm2, %%mm3 \n\t"
- "pmaddwd %%mm6, %%mm4 \n\t"
- "pmaddwd %%mm6, %%mm2 \n\t"
-#ifndef FAST_BGR2YV12
- "psrad $8, %%mm4 \n\t"
- "psrad $8, %%mm1 \n\t"
- "psrad $8, %%mm2 \n\t"
- "psrad $8, %%mm3 \n\t"
-#endif
- "packssdw %%mm2, %%mm4 \n\t"
- "packssdw %%mm3, %%mm1 \n\t"
- "pmaddwd %%mm5, %%mm4 \n\t"
- "pmaddwd %%mm5, %%mm1 \n\t"
- "add $24, %%"REG_d" \n\t"
- "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
- "psraw $7, %%mm4 \n\t"
-
- "movq %%mm0, %%mm1 \n\t"
- "punpckldq %%mm4, %%mm0 \n\t"
- "punpckhdq %%mm4, %%mm1 \n\t"
- "packsswb %%mm1, %%mm0 \n\t"
- "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
- "movd %%mm0, (%2, %%"REG_a") \n\t"
- "punpckhdq %%mm0, %%mm0 \n\t"
- "movd %%mm0, (%3, %%"REG_a") \n\t"
- "add $4, %%"REG_a" \n\t"
- " js 1b \n\t"
- : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
- : "%"REG_a, "%"REG_d
- );
-
- udst += chromStride;
- vdst += chromStride;
- src += srcStride*2;
- }
-
- __asm__ volatile(EMMS" \n\t"
- SFENCE" \n\t"
- :::"memory");
-
- rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
-}
-#endif /* !COMPILE_TEMPLATE_SSE2 */
-
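The luma/chroma math above is the usual fixed-point RGB-to-Y'CbCr dot product;
the actual coefficient tables (ff_bgr2YCoeff and friends) are defined outside
this file. As a behavioural sketch only, a limited-range BT.601 luma step with
illustrative Q15 weights looks like:

static uint8_t rgb_to_y_ref(int r, int g, int b)
{
    /* roughly 0.257, 0.504, 0.098 in Q15, with rounding and a +16 offset;
     * these values are illustrative, not the tables used by the asm above */
    return (uint8_t)(((8414 * r + 16519 * g + 3208 * b + (1 << 14)) >> 15) + 16);
}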
-#if !COMPILE_TEMPLATE_AMD3DNOW
-static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
- int width, int height, int src1Stride,
- int src2Stride, int dstStride)
-{
- int h;
-
- for (h=0; h < height; h++) {
- int w;
-
-#if COMPILE_TEMPLATE_SSE2
- __asm__(
- "xor %%"REG_a", %%"REG_a" \n\t"
- "1: \n\t"
- PREFETCH" 64(%1, %%"REG_a") \n\t"
- PREFETCH" 64(%2, %%"REG_a") \n\t"
- "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
- "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
- "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
- "punpcklbw %%xmm2, %%xmm0 \n\t"
- "punpckhbw %%xmm2, %%xmm1 \n\t"
- "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
- "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
- "add $16, %%"REG_a" \n\t"
- "cmp %3, %%"REG_a" \n\t"
- " jb 1b \n\t"
- ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
- : "memory", "%"REG_a""
- );
-#else
- __asm__(
- "xor %%"REG_a", %%"REG_a" \n\t"
- "1: \n\t"
- PREFETCH" 64(%1, %%"REG_a") \n\t"
- PREFETCH" 64(%2, %%"REG_a") \n\t"
- "movq (%1, %%"REG_a"), %%mm0 \n\t"
- "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "movq (%2, %%"REG_a"), %%mm4 \n\t"
- "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
- "punpcklbw %%mm4, %%mm0 \n\t"
- "punpckhbw %%mm4, %%mm1 \n\t"
- "punpcklbw %%mm5, %%mm2 \n\t"
- "punpckhbw %%mm5, %%mm3 \n\t"
- MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
- MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
- MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
- MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
- "add $16, %%"REG_a" \n\t"
- "cmp %3, %%"REG_a" \n\t"
- " jb 1b \n\t"
- ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
- : "memory", "%"REG_a
- );
-#endif
- for (w= (width&(~15)); w < width; w++) {
- dest[2*w+0] = src1[w];
- dest[2*w+1] = src2[w];
- }
- dest += dstStride;
- src1 += src1Stride;
- src2 += src2Stride;
- }
- __asm__(
- EMMS" \n\t"
- SFENCE" \n\t"
- ::: "memory"
- );
-}
-#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
-
-#if !COMPILE_TEMPLATE_SSE2
-#if !COMPILE_TEMPLATE_AMD3DNOW
-static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
- uint8_t *dst1, uint8_t *dst2,
- int width, int height,
- int srcStride1, int srcStride2,
- int dstStride1, int dstStride2)
-{
- x86_reg x, y;
- int w,h;
- w=width/2; h=height/2;
- __asm__ volatile(
- PREFETCH" %0 \n\t"
- PREFETCH" %1 \n\t"
- ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
- for (y=0;y<h;y++) {
- const uint8_t* s1=src1+srcStride1*(y>>1);
- uint8_t* d=dst1+dstStride1*y;
- x=0;
- for (;x<w-31;x+=32) {
- __asm__ volatile(
- PREFETCH" 32(%1,%2) \n\t"
- "movq (%1,%2), %%mm0 \n\t"
- "movq 8(%1,%2), %%mm2 \n\t"
- "movq 16(%1,%2), %%mm4 \n\t"
- "movq 24(%1,%2), %%mm6 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "movq %%mm6, %%mm7 \n\t"
- "punpcklbw %%mm0, %%mm0 \n\t"
- "punpckhbw %%mm1, %%mm1 \n\t"
- "punpcklbw %%mm2, %%mm2 \n\t"
- "punpckhbw %%mm3, %%mm3 \n\t"
- "punpcklbw %%mm4, %%mm4 \n\t"
- "punpckhbw %%mm5, %%mm5 \n\t"
- "punpcklbw %%mm6, %%mm6 \n\t"
- "punpckhbw %%mm7, %%mm7 \n\t"
- MOVNTQ" %%mm0, (%0,%2,2) \n\t"
- MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
- MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
- MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
- MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
- MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
- MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
- MOVNTQ" %%mm7, 56(%0,%2,2)"
- :: "r"(d), "r"(s1), "r"(x)
- :"memory");
- }
- for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
- }
- for (y=0;y<h;y++) {
- const uint8_t* s2=src2+srcStride2*(y>>1);
- uint8_t* d=dst2+dstStride2*y;
- x=0;
- for (;x<w-31;x+=32) {
- __asm__ volatile(
- PREFETCH" 32(%1,%2) \n\t"
- "movq (%1,%2), %%mm0 \n\t"
- "movq 8(%1,%2), %%mm2 \n\t"
- "movq 16(%1,%2), %%mm4 \n\t"
- "movq 24(%1,%2), %%mm6 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "movq %%mm6, %%mm7 \n\t"
- "punpcklbw %%mm0, %%mm0 \n\t"
- "punpckhbw %%mm1, %%mm1 \n\t"
- "punpcklbw %%mm2, %%mm2 \n\t"
- "punpckhbw %%mm3, %%mm3 \n\t"
- "punpcklbw %%mm4, %%mm4 \n\t"
- "punpckhbw %%mm5, %%mm5 \n\t"
- "punpcklbw %%mm6, %%mm6 \n\t"
- "punpckhbw %%mm7, %%mm7 \n\t"
- MOVNTQ" %%mm0, (%0,%2,2) \n\t"
- MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
- MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
- MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
- MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
- MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
- MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
- MOVNTQ" %%mm7, 56(%0,%2,2)"
- :: "r"(d), "r"(s2), "r"(x)
- :"memory");
- }
- for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
- }
- __asm__(
- EMMS" \n\t"
- SFENCE" \n\t"
- ::: "memory"
- );
-}
-
-static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
- uint8_t *dst,
- int width, int height,
- int srcStride1, int srcStride2,
- int srcStride3, int dstStride)
-{
- x86_reg x;
- int y,w,h;
- w=width/2; h=height;
- for (y=0;y<h;y++) {
- const uint8_t* yp=src1+srcStride1*y;
- const uint8_t* up=src2+srcStride2*(y>>2);
- const uint8_t* vp=src3+srcStride3*(y>>2);
- uint8_t* d=dst+dstStride*y;
- x=0;
- for (;x<w-7;x+=8) {
- __asm__ volatile(
- PREFETCH" 32(%1, %0) \n\t"
- PREFETCH" 32(%2, %0) \n\t"
- PREFETCH" 32(%3, %0) \n\t"
- "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
- "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
- "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
- "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
- "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
- "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
- "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
- "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
- "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
- "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
-
- "movq %%mm1, %%mm6 \n\t"
- "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
- "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
- "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
- MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
- MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
-
- "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
- "movq 8(%1, %0, 4), %%mm0 \n\t"
- "movq %%mm0, %%mm3 \n\t"
- "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
- "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
- MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
- MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
-
- "movq %%mm4, %%mm6 \n\t"
- "movq 16(%1, %0, 4), %%mm0 \n\t"
- "movq %%mm0, %%mm3 \n\t"
- "punpcklbw %%mm5, %%mm4 \n\t"
- "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
- "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
- MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
- MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
-
- "punpckhbw %%mm5, %%mm6 \n\t"
- "movq 24(%1, %0, 4), %%mm0 \n\t"
- "movq %%mm0, %%mm3 \n\t"
- "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
- "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
- MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
- MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
-
- : "+r" (x)
- : "r"(yp), "r" (up), "r"(vp), "r"(d)
- :"memory");
- }
- for (; x<w; x++) {
- const int x2 = x<<2;
- d[8*x+0] = yp[x2];
- d[8*x+1] = up[x];
- d[8*x+2] = yp[x2+1];
- d[8*x+3] = vp[x];
- d[8*x+4] = yp[x2+2];
- d[8*x+5] = up[x];
- d[8*x+6] = yp[x2+3];
- d[8*x+7] = vp[x];
- }
- }
- __asm__(
- EMMS" \n\t"
- SFENCE" \n\t"
- ::: "memory"
- );
-}
-#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
-
-static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
-{
- dst += count;
- src += 2*count;
- count= - count;
-
- if(count <= -16) {
- count += 15;
- __asm__ volatile(
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "psrlw $8, %%mm7 \n\t"
- "1: \n\t"
- "movq -30(%1, %0, 2), %%mm0 \n\t"
- "movq -22(%1, %0, 2), %%mm1 \n\t"
- "movq -14(%1, %0, 2), %%mm2 \n\t"
- "movq -6(%1, %0, 2), %%mm3 \n\t"
- "pand %%mm7, %%mm0 \n\t"
- "pand %%mm7, %%mm1 \n\t"
- "pand %%mm7, %%mm2 \n\t"
- "pand %%mm7, %%mm3 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- MOVNTQ" %%mm0,-15(%2, %0) \n\t"
- MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
- "add $16, %0 \n\t"
- " js 1b \n\t"
- : "+r"(count)
- : "r"(src), "r"(dst)
- );
- count -= 15;
- }
- while(count<0) {
- dst[count]= src[2*count];
- count++;
- }
-}
-
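extract_even and the helpers below all share one indexing idiom: advance the
pointers past the end of the data, negate count, and let the index run from
-count up to zero so the asm loop can end on a single js after the add. The C
tail above is the scalar form; spelled out as its own function (sketch):

static void extract_even_ref(const uint8_t *src, uint8_t *dst, ptrdiff_t count)
{
    dst += count;       /* point one element past the last output */
    src += 2 * count;
    count = -count;     /* negative index counts up toward zero   */
    while (count < 0) {
        dst[count] = src[2 * count];
        count++;
    }
}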
-#if !COMPILE_TEMPLATE_AMD3DNOW
-static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
-{
- dst0+= count;
- dst1+= count;
- src += 4*count;
- count= - count;
- if(count <= -8) {
- count += 7;
- __asm__ volatile(
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "psrlw $8, %%mm7 \n\t"
- "1: \n\t"
- "movq -28(%1, %0, 4), %%mm0 \n\t"
- "movq -20(%1, %0, 4), %%mm1 \n\t"
- "movq -12(%1, %0, 4), %%mm2 \n\t"
- "movq -4(%1, %0, 4), %%mm3 \n\t"
- "pand %%mm7, %%mm0 \n\t"
- "pand %%mm7, %%mm1 \n\t"
- "pand %%mm7, %%mm2 \n\t"
- "pand %%mm7, %%mm3 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "psrlw $8, %%mm0 \n\t"
- "psrlw $8, %%mm2 \n\t"
- "pand %%mm7, %%mm1 \n\t"
- "pand %%mm7, %%mm3 \n\t"
- "packuswb %%mm2, %%mm0 \n\t"
- "packuswb %%mm3, %%mm1 \n\t"
- MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
- MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
- "add $8, %0 \n\t"
- " js 1b \n\t"
- : "+r"(count)
- : "r"(src), "r"(dst0), "r"(dst1)
- );
- count -= 7;
- }
- while(count<0) {
- dst0[count]= src[4*count+0];
- dst1[count]= src[4*count+2];
- count++;
- }
-}
-#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
-
-static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
-{
- dst0 += count;
- dst1 += count;
- src0 += 4*count;
- src1 += 4*count;
- count= - count;
-#ifdef PAVGB
- if(count <= -8) {
- count += 7;
- __asm__ volatile(
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "psrlw $8, %%mm7 \n\t"
- "1: \n\t"
- "movq -28(%1, %0, 4), %%mm0 \n\t"
- "movq -20(%1, %0, 4), %%mm1 \n\t"
- "movq -12(%1, %0, 4), %%mm2 \n\t"
- "movq -4(%1, %0, 4), %%mm3 \n\t"
- PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
- PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
- PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
- PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
- "pand %%mm7, %%mm0 \n\t"
- "pand %%mm7, %%mm1 \n\t"
- "pand %%mm7, %%mm2 \n\t"
- "pand %%mm7, %%mm3 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "psrlw $8, %%mm0 \n\t"
- "psrlw $8, %%mm2 \n\t"
- "pand %%mm7, %%mm1 \n\t"
- "pand %%mm7, %%mm3 \n\t"
- "packuswb %%mm2, %%mm0 \n\t"
- "packuswb %%mm3, %%mm1 \n\t"
- MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
- MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
- "add $8, %0 \n\t"
- " js 1b \n\t"
- : "+r"(count)
- : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
- );
- count -= 7;
- }
-#endif
- while(count<0) {
- dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
- dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
- count++;
- }
-}
-
-#if !COMPILE_TEMPLATE_AMD3DNOW
-static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
-{
- dst0+= count;
- dst1+= count;
- src += 4*count;
- count= - count;
- if(count <= -8) {
- count += 7;
- __asm__ volatile(
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "psrlw $8, %%mm7 \n\t"
- "1: \n\t"
- "movq -28(%1, %0, 4), %%mm0 \n\t"
- "movq -20(%1, %0, 4), %%mm1 \n\t"
- "movq -12(%1, %0, 4), %%mm2 \n\t"
- "movq -4(%1, %0, 4), %%mm3 \n\t"
- "psrlw $8, %%mm0 \n\t"
- "psrlw $8, %%mm1 \n\t"
- "psrlw $8, %%mm2 \n\t"
- "psrlw $8, %%mm3 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "psrlw $8, %%mm0 \n\t"
- "psrlw $8, %%mm2 \n\t"
- "pand %%mm7, %%mm1 \n\t"
- "pand %%mm7, %%mm3 \n\t"
- "packuswb %%mm2, %%mm0 \n\t"
- "packuswb %%mm3, %%mm1 \n\t"
- MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
- MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
- "add $8, %0 \n\t"
- " js 1b \n\t"
- : "+r"(count)
- : "r"(src), "r"(dst0), "r"(dst1)
- );
- count -= 7;
- }
- src++;
- while(count<0) {
- dst0[count]= src[4*count+0];
- dst1[count]= src[4*count+2];
- count++;
- }
-}
-#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
-
-static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
-{
- dst0 += count;
- dst1 += count;
- src0 += 4*count;
- src1 += 4*count;
- count= - count;
-#ifdef PAVGB
- if(count <= -8) {
- count += 7;
- __asm__ volatile(
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "psrlw $8, %%mm7 \n\t"
- "1: \n\t"
- "movq -28(%1, %0, 4), %%mm0 \n\t"
- "movq -20(%1, %0, 4), %%mm1 \n\t"
- "movq -12(%1, %0, 4), %%mm2 \n\t"
- "movq -4(%1, %0, 4), %%mm3 \n\t"
- PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
- PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
- PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
- PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
- "psrlw $8, %%mm0 \n\t"
- "psrlw $8, %%mm1 \n\t"
- "psrlw $8, %%mm2 \n\t"
- "psrlw $8, %%mm3 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "psrlw $8, %%mm0 \n\t"
- "psrlw $8, %%mm2 \n\t"
- "pand %%mm7, %%mm1 \n\t"
- "pand %%mm7, %%mm3 \n\t"
- "packuswb %%mm2, %%mm0 \n\t"
- "packuswb %%mm3, %%mm1 \n\t"
- MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
- MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
- "add $8, %0 \n\t"
- " js 1b \n\t"
- : "+r"(count)
- : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
- );
- count -= 7;
- }
-#endif
- src0++;
- src1++;
- while(count<0) {
- dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
- dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
- count++;
- }
-}
-
-static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
- int width, int height,
- int lumStride, int chromStride, int srcStride)
-{
- int y;
- const int chromWidth= -((-width)>>1);
-
- for (y=0; y<height; y++) {
- RENAME(extract_even)(src, ydst, width);
- if(y&1) {
- RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
- udst+= chromStride;
- vdst+= chromStride;
- }
-
- src += srcStride;
- ydst+= lumStride;
- }
- __asm__(
- EMMS" \n\t"
- SFENCE" \n\t"
- ::: "memory"
- );
-}
-
-#if !COMPILE_TEMPLATE_AMD3DNOW
-static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
- int width, int height,
- int lumStride, int chromStride, int srcStride)
-{
- int y;
- const int chromWidth= -((-width)>>1);
-
- for (y=0; y<height; y++) {
- RENAME(extract_even)(src, ydst, width);
- RENAME(extract_odd2)(src, udst, vdst, chromWidth);
-
- src += srcStride;
- ydst+= lumStride;
- udst+= chromStride;
- vdst+= chromStride;
- }
- __asm__(
- EMMS" \n\t"
- SFENCE" \n\t"
- ::: "memory"
- );
-}
-#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
-
-static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
- int width, int height,
- int lumStride, int chromStride, int srcStride)
-{
- int y;
- const int chromWidth= -((-width)>>1);
-
- for (y=0; y<height; y++) {
- RENAME(extract_even)(src+1, ydst, width);
- if(y&1) {
- RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
- udst+= chromStride;
- vdst+= chromStride;
- }
-
- src += srcStride;
- ydst+= lumStride;
- }
- __asm__(
- EMMS" \n\t"
- SFENCE" \n\t"
- ::: "memory"
- );
-}
-
-#if !COMPILE_TEMPLATE_AMD3DNOW
-static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
- int width, int height,
- int lumStride, int chromStride, int srcStride)
-{
- int y;
- const int chromWidth= -((-width)>>1);
-
- for (y=0; y<height; y++) {
- RENAME(extract_even)(src+1, ydst, width);
- RENAME(extract_even2)(src, udst, vdst, chromWidth);
-
- src += srcStride;
- ydst+= lumStride;
- udst+= chromStride;
- vdst+= chromStride;
- }
- __asm__(
- EMMS" \n\t"
- SFENCE" \n\t"
- ::: "memory"
- );
-}
-#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
-#endif /* !COMPILE_TEMPLATE_SSE2 */
-
-static inline void RENAME(rgb2rgb_init)(void)
-{
-#if !COMPILE_TEMPLATE_SSE2
-#if !COMPILE_TEMPLATE_AMD3DNOW
- rgb15to16 = RENAME(rgb15to16);
- rgb15tobgr24 = RENAME(rgb15tobgr24);
- rgb15to32 = RENAME(rgb15to32);
- rgb16tobgr24 = RENAME(rgb16tobgr24);
- rgb16to32 = RENAME(rgb16to32);
- rgb16to15 = RENAME(rgb16to15);
- rgb24tobgr16 = RENAME(rgb24tobgr16);
- rgb24tobgr15 = RENAME(rgb24tobgr15);
- rgb24tobgr32 = RENAME(rgb24tobgr32);
- rgb32to16 = RENAME(rgb32to16);
- rgb32to15 = RENAME(rgb32to15);
- rgb32tobgr24 = RENAME(rgb32tobgr24);
- rgb24to15 = RENAME(rgb24to15);
- rgb24to16 = RENAME(rgb24to16);
- rgb24tobgr24 = RENAME(rgb24tobgr24);
- shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
- rgb32tobgr16 = RENAME(rgb32tobgr16);
- rgb32tobgr15 = RENAME(rgb32tobgr15);
- yv12toyuy2 = RENAME(yv12toyuy2);
- yv12touyvy = RENAME(yv12touyvy);
- yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
- yuv422ptouyvy = RENAME(yuv422ptouyvy);
- yuy2toyv12 = RENAME(yuy2toyv12);
- vu9_to_vu12 = RENAME(vu9_to_vu12);
- yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
- uyvytoyuv422 = RENAME(uyvytoyuv422);
- yuyvtoyuv422 = RENAME(yuyvtoyuv422);
-#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
-
-#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
- planar2x = RENAME(planar2x);
-#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
- rgb24toyv12 = RENAME(rgb24toyv12);
-
- yuyvtoyuv420 = RENAME(yuyvtoyuv420);
- uyvytoyuv420 = RENAME(uyvytoyuv420);
-#endif /* !COMPILE_TEMPLATE_SSE2 */
-
-#if !COMPILE_TEMPLATE_AMD3DNOW
- interleaveBytes = RENAME(interleaveBytes);
-#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
-}
diff --git a/ffmpeg1/libswscale/x86/scale.asm b/ffmpeg1/libswscale/x86/scale.asm
deleted file mode 100644
index c6dafde..0000000
--- a/ffmpeg1/libswscale/x86/scale.asm
+++ /dev/null
@@ -1,431 +0,0 @@
-;******************************************************************************
-;* x86-optimized horizontal line scaling functions
-;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
-;*
-;* This file is part of Libav.
-;*
-;* Libav is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* Libav is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-max_19bit_int: times 4 dd 0x7ffff
-max_19bit_flt: times 4 dd 524287.0
-minshort: times 8 dw 0x8000
-unicoeff: times 4 dd 0x20000000
-
-SECTION .text
-
-;-----------------------------------------------------------------------------
-; horizontal line scaling
-;
-; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
-; (SwsContext *c, int{16,32}_t *dst,
-; int dstW, const uint{8,16}_t *src,
-; const int16_t *filter,
-; const int32_t *filterPos, int filterSize);
-;
-; Scale one horizontal line. Input samples are either 8 or 16 bits wide
-; ($source_width can be 8, 9, 10 or 16; the difference is whether we have to
-; downscale before multiplying). The filter is 14-bit. Output is either 15 bits
-; (in int16_t) or 19 bits (in int32_t), as given in $intermediate_nbits. Each
-; output pixel is generated from $filterSize input pixels, the position of
-; the first pixel is given in filterPos[nOutputPixel].
-;-----------------------------------------------------------------------------
-
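As a behavioural reference for the macros below: each output pixel is a
filterSize-tap dot product starting at filterPos[i]. The 8-bit-input,
15-bit-output case reduces to roughly the following C (FFMIN is the libavutil
macro; this is a simplified sketch of the scalar fallback, not this file's code):

for (int i = 0; i < dstW; i++) {
    int val = 0;
    for (int j = 0; j < filterSize; j++)
        val += src[filterPos[i] + j] * filter[filterSize * i + j];
    dst[i] = FFMIN(val >> 7, (1 << 15) - 1); /* 14-bit coeffs: >>7 keeps 15 bits */
}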
-; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
-%macro SCALE_FUNC 6
-%ifnidn %3, X
-cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
-%else
-cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
-%endif
-%if ARCH_X86_64
- movsxd wq, wd
-%define mov32 movsxd
-%else ; x86-32
-%define mov32 mov
-%endif ; x86-64
-%if %2 == 19
-%if mmsize == 8 ; mmx
- mova m2, [max_19bit_int]
-%elif cpuflag(sse4)
- mova m2, [max_19bit_int]
-%else ; ssse3/sse2
- mova m2, [max_19bit_flt]
-%endif ; mmx/sse2/ssse3/sse4
-%endif ; %2 == 19
-%if %1 == 16
- mova m6, [minshort]
- mova m7, [unicoeff]
-%elif %1 == 8
- pxor m3, m3
-%endif ; %1 == 8/16
-
-%if %1 == 8
-%define movlh movd
-%define movbh movh
-%define srcmul 1
-%else ; %1 == 9-16
-%define movlh movq
-%define movbh movu
-%define srcmul 2
-%endif ; %1 == 8/9-16
-
-%ifnidn %3, X
-
- ; setup loop
-%if %3 == 8
- shl wq, 1 ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
-%define wshr 1
-%else ; %3 == 4
-%define wshr 0
-%endif ; %3 == 8
- lea filterq, [filterq+wq*8]
-%if %2 == 15
- lea dstq, [dstq+wq*(2>>wshr)]
-%else ; %2 == 19
- lea dstq, [dstq+wq*(4>>wshr)]
-%endif ; %2 == 15/19
- lea fltposq, [fltposq+wq*(4>>wshr)]
- neg wq
-
-.loop:
-%if %3 == 4 ; filterSize == 4 scaling
- ; load 2x4 or 4x4 source pixels into m0/m1
- mov32 pos0q, dword [fltposq+wq*4+ 0] ; filterPos[0]
- mov32 pos1q, dword [fltposq+wq*4+ 4] ; filterPos[1]
- movlh m0, [srcq+pos0q*srcmul] ; src[filterPos[0] + {0,1,2,3}]
-%if mmsize == 8
- movlh m1, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
-%else ; mmsize == 16
-%if %1 > 8
- movhps m0, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
-%else ; %1 == 8
- movd m4, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
-%endif
- mov32 pos0q, dword [fltposq+wq*4+ 8] ; filterPos[2]
- mov32 pos1q, dword [fltposq+wq*4+12] ; filterPos[3]
- movlh m1, [srcq+pos0q*srcmul] ; src[filterPos[2] + {0,1,2,3}]
-%if %1 > 8
- movhps m1, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}]
-%else ; %1 == 8
- movd m5, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}]
- punpckldq m0, m4
- punpckldq m1, m5
-%endif ; %1 == 8
-%endif ; mmsize == 8/16
-%if %1 == 8
- punpcklbw m0, m3 ; byte -> word
- punpcklbw m1, m3 ; byte -> word
-%endif ; %1 == 8
-
- ; multiply with filter coefficients
-%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
- ; add back 0x8000 * sum(coeffs) after the horizontal add
- psubw m0, m6
- psubw m1, m6
-%endif ; %1 == 16
- pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
- pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
-
- ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
-%if mmsize == 8 ; mmx
- movq m4, m0
- punpckldq m0, m1
- punpckhdq m4, m1
- paddd m0, m4
-%elif notcpuflag(ssse3) ; sse2
- mova m4, m0
- shufps m0, m1, 10001000b
- shufps m4, m1, 11011101b
- paddd m0, m4
-%else ; ssse3/sse4
- phaddd m0, m1 ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
- ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
- ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
- ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
-%endif ; mmx/sse2/ssse3/sse4
-%else ; %3 == 8, i.e. filterSize == 8 scaling
- ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
- mov32 pos0q, dword [fltposq+wq*2+0] ; filterPos[0]
- mov32 pos1q, dword [fltposq+wq*2+4] ; filterPos[1]
- movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
-%if mmsize == 8
- movbh m1, [srcq+(pos0q+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}]
- movbh m4, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3}]
- movbh m5, [srcq+(pos1q+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}]
-%else ; mmsize == 16
- movbh m1, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
- mov32 pos0q, dword [fltposq+wq*2+8] ; filterPos[2]
- mov32 pos1q, dword [fltposq+wq*2+12] ; filterPos[3]
- movbh m4, [srcq+ pos0q *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
- movbh m5, [srcq+ pos1q *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
-%endif ; mmsize == 8/16
-%if %1 == 8
- punpcklbw m0, m3 ; byte -> word
- punpcklbw m1, m3 ; byte -> word
- punpcklbw m4, m3 ; byte -> word
- punpcklbw m5, m3 ; byte -> word
-%endif ; %1 == 8
-
- ; multiply
-%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
- ; add back 0x8000 * sum(coeffs) after the horizontal add
- psubw m0, m6
- psubw m1, m6
- psubw m4, m6
- psubw m5, m6
-%endif ; %1 == 16
- pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
- pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
- pmaddwd m4, [filterq+wq*8+mmsize*2] ; *= filter[{16,17,..,22,23}]
- pmaddwd m5, [filterq+wq*8+mmsize*3] ; *= filter[{24,25,..,30,31}]
-
- ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
-%if mmsize == 8
- paddd m0, m1
- paddd m4, m5
- movq m1, m0
- punpckldq m0, m4
- punpckhdq m1, m4
- paddd m0, m1
-%elif notcpuflag(ssse3) ; sse2
-%if %1 == 8
-%define mex m6
-%else
-%define mex m3
-%endif
- ; emulate horizontal add as transpose + vertical add
- mova mex, m0
- punpckldq m0, m1
- punpckhdq mex, m1
- paddd m0, mex
- mova m1, m4
- punpckldq m4, m5
- punpckhdq m1, m5
- paddd m4, m1
- mova m1, m0
- punpcklqdq m0, m4
- punpckhqdq m1, m4
- paddd m0, m1
-%else ; ssse3/sse4
- ; FIXME if we rearrange the filter in pairs of 4, we can
- ; load pixels likewise and use 2 x paddd + phaddd instead
- ; of 3 x phaddd here, faster on older cpus
- phaddd m0, m1
- phaddd m4, m5
- phaddd m0, m4 ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
- ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
- ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
- ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
-%endif ; mmx/sse2/ssse3/sse4
-%endif ; %3 == 4/8
-
-%else ; %3 == X, i.e. any filterSize scaling
-
-%ifidn %4, X4
-%define dlt 4
-%else ; %4 == X || %4 == X8
-%define dlt 0
-%endif ; %4 ==/!= X4
-%if ARCH_X86_64
-%define srcq r8
-%define pos1q r7
-%define srcendq r9
- movsxd fltsizeq, fltsized ; filterSize
- lea srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
-%else ; x86-32
-%define srcq srcmemq
-%define pos1q dstq
-%define srcendq r6m
- lea pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
- mov srcendq, pos0q
-%endif ; x86-32/64
- lea fltposq, [fltposq+wq*4]
-%if %2 == 15
- lea dstq, [dstq+wq*2]
-%else ; %2 == 19
- lea dstq, [dstq+wq*4]
-%endif ; %2 == 15/19
- movifnidn dstmp, dstq
- neg wq
-
-.loop:
- mov32 pos0q, dword [fltposq+wq*4+0] ; filterPos[0]
- mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1]
- ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
- pxor m4, m4
- pxor m5, m5
- mov srcq, srcmemmp
-
-.innerloop:
- ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
- movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
- movbh m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
-%if %1 == 8
- punpcklbw m0, m3
- punpcklbw m1, m3
-%endif ; %1 == 8
-
- ; multiply
-%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
- ; add back 0x8000 * sum(coeffs) after the horizontal add
- psubw m0, m6
- psubw m1, m6
-%endif ; %1 == 16
- pmaddwd m0, [filterq] ; filter[{0,1,2,3(,4,5,6,7)}]
- pmaddwd m1, [filterq+(fltsizeq+dlt)*2]; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
- paddd m4, m0
- paddd m5, m1
- add filterq, mmsize
- add srcq, srcmul*mmsize/2
- cmp srcq, srcendq ; while ((src += mmsize/2 pixels) < &src[filterSize])
- jl .innerloop
-
-%ifidn %4, X4
- mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1]
- movlh m0, [srcq+ pos0q *srcmul] ; split last 4 srcpx of dstpx[0]
- sub pos1q, fltsizeq ; and first 4 srcpx of dstpx[1]
-%if %1 > 8
- movhps m0, [srcq+(pos1q+dlt)*srcmul]
-%else ; %1 == 8
- movd m1, [srcq+(pos1q+dlt)*srcmul]
- punpckldq m0, m1
-%endif ; %1 == 8
-%if %1 == 8
- punpcklbw m0, m3
-%endif ; %1 == 8
-%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
- ; add back 0x8000 * sum(coeffs) after the horizontal add
- psubw m0, m6
-%endif ; %1 == 16
- pmaddwd m0, [filterq]
-%endif ; %4 == X4
-
- lea filterq, [filterq+(fltsizeq+dlt)*2]
-
-%if mmsize == 8 ; mmx
- movq m0, m4
- punpckldq m4, m5
- punpckhdq m0, m5
- paddd m0, m4
-%else ; mmsize == 16
-%if notcpuflag(ssse3) ; sse2
- mova m1, m4
- punpcklqdq m4, m5
- punpckhqdq m1, m5
- paddd m4, m1
-%else ; ssse3/sse4
- phaddd m4, m5
-%endif ; sse2/ssse3/sse4
-%ifidn %4, X4
- paddd m4, m0
-%endif ; %4 == X4
-%if notcpuflag(ssse3) ; sse2
- pshufd m4, m4, 11011000b
- movhlps m0, m4
- paddd m0, m4
-%else ; ssse3/sse4
- phaddd m4, m4
- SWAP 0, 4
-%endif ; sse2/ssse3/sse4
-%endif ; mmsize == 8/16
-%endif ; %3 ==/!= X
-
-%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
- paddd m0, m7
-%endif ; %1 == 16
-
- ; clip, store
- psrad m0, 14 + %1 - %2
-%ifidn %3, X
- movifnidn dstq, dstmp
-%endif ; %3 == X
-%if %2 == 15
- packssdw m0, m0
-%ifnidn %3, X
- movh [dstq+wq*(2>>wshr)], m0
-%else ; %3 == X
- movd [dstq+wq*2], m0
-%endif ; %3 ==/!= X
-%else ; %2 == 19
-%if mmsize == 8
- PMINSD_MMX m0, m2, m4
-%elif cpuflag(sse4)
- pminsd m0, m2
-%else ; sse2/ssse3
- cvtdq2ps m0, m0
- minps m0, m2
- cvtps2dq m0, m0
-%endif ; mmx/sse2/ssse3/sse4
-%ifnidn %3, X
- mova [dstq+wq*(4>>wshr)], m0
-%else ; %3 == X
- movq [dstq+wq*4], m0
-%endif ; %3 ==/!= X
-%endif ; %2 == 15/19
-%ifnidn %3, X
- add wq, (mmsize<<wshr)/4 ; both the 8-tap and the 4-tap cases really only do 4 pixels (2 for mmx)
- ; per iteration; see the "shl wq, 1" above for why wq is pre-scaled
-%else ; %3 == X
- add wq, 2
-%endif ; %3 ==/!= X
- jl .loop
- REP_RET
-%endmacro
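-
-; rough scalar C model of the horizontal scaling loop above (a sketch for
-; reference, assuming FFMIN from libavutil; sh = 14 + source_width -
-; intermediate_nbits, matching the psrad above, and the clip matches the
-; packssdw/pminsd at the end of the macro):
-;
-;   for (i = 0; i < dstW; i++) {
-;       int32_t acc = 0;
-;       for (j = 0; j < filterSize; j++)
-;           acc += src[filterPos[i] + j] * filter[i * filterSize + j];
-;       dst[i] = FFMIN(acc >> sh, (1 << intermediate_nbits) - 1);
-;   }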
-
-; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
-%macro SCALE_FUNCS 3
-SCALE_FUNC %1, %2, 4, 4, 6, %3
-SCALE_FUNC %1, %2, 8, 8, 6, %3
-%if mmsize == 8
-SCALE_FUNC %1, %2, X, X, 7, %3
-%else
-SCALE_FUNC %1, %2, X, X4, 7, %3
-SCALE_FUNC %1, %2, X, X8, 7, %3
-%endif
-%endmacro
-
-; SCALE_FUNCS2 8_xmm_args, 9to14_xmm_args, 16_xmm_args
-%macro SCALE_FUNCS2 3
-%if notcpuflag(sse4)
-SCALE_FUNCS 8, 15, %1
-SCALE_FUNCS 9, 15, %2
-SCALE_FUNCS 10, 15, %2
-SCALE_FUNCS 12, 15, %2
-SCALE_FUNCS 14, 15, %2
-SCALE_FUNCS 16, 15, %3
-%endif ; !sse4
-SCALE_FUNCS 8, 19, %1
-SCALE_FUNCS 9, 19, %2
-SCALE_FUNCS 10, 19, %2
-SCALE_FUNCS 12, 19, %2
-SCALE_FUNCS 14, 19, %2
-SCALE_FUNCS 16, 19, %3
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX mmx
-SCALE_FUNCS2 0, 0, 0
-%endif
-INIT_XMM sse2
-SCALE_FUNCS2 6, 7, 8
-INIT_XMM ssse3
-SCALE_FUNCS2 6, 6, 8
-INIT_XMM sse4
-SCALE_FUNCS2 6, 6, 8
diff --git a/ffmpeg1/libswscale/x86/swscale.c b/ffmpeg1/libswscale/x86/swscale.c
deleted file mode 100644
index 2f67b1b..0000000
--- a/ffmpeg1/libswscale/x86/swscale.c
+++ /dev/null
@@ -1,585 +0,0 @@
-/*
- * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <inttypes.h>
-#include "config.h"
-#include "libswscale/swscale.h"
-#include "libswscale/swscale_internal.h"
-#include "libavutil/attributes.h"
-#include "libavutil/avassert.h"
-#include "libavutil/intreadwrite.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavutil/cpu.h"
-#include "libavutil/pixdesc.h"
-
-#if HAVE_INLINE_ASM
-
-#define DITHER1XBPP
-
-DECLARE_ASM_CONST(8, uint64_t, bF8)= 0xF8F8F8F8F8F8F8F8LL;
-DECLARE_ASM_CONST(8, uint64_t, bFC)= 0xFCFCFCFCFCFCFCFCLL;
-DECLARE_ASM_CONST(8, uint64_t, w10)= 0x0010001000100010LL;
-DECLARE_ASM_CONST(8, uint64_t, w02)= 0x0002000200020002LL;
-
-const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
- 0x0103010301030103LL,
- 0x0200020002000200LL,};
-
-const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = {
- 0x0602060206020602LL,
- 0x0004000400040004LL,};
-
-DECLARE_ASM_CONST(8, uint64_t, b16Mask)= 0x001F001F001F001FLL;
-DECLARE_ASM_CONST(8, uint64_t, g16Mask)= 0x07E007E007E007E0LL;
-DECLARE_ASM_CONST(8, uint64_t, r16Mask)= 0xF800F800F800F800LL;
-DECLARE_ASM_CONST(8, uint64_t, b15Mask)= 0x001F001F001F001FLL;
-DECLARE_ASM_CONST(8, uint64_t, g15Mask)= 0x03E003E003E003E0LL;
-DECLARE_ASM_CONST(8, uint64_t, r15Mask)= 0x7C007C007C007C00LL;
-
-DECLARE_ALIGNED(8, const uint64_t, ff_M24A) = 0x00FF0000FF0000FFLL;
-DECLARE_ALIGNED(8, const uint64_t, ff_M24B) = 0xFF0000FF0000FF00LL;
-DECLARE_ALIGNED(8, const uint64_t, ff_M24C) = 0x0000FF0000FF0000LL;
-
-#ifdef FAST_BGR2YV12
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000000210041000DULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000FFEEFFDC0038ULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00000038FFD2FFF8ULL;
-#else
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000020E540830C8BULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000ED0FDAC23831ULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00003831D0E6F6EAULL;
-#endif /* FAST_BGR2YV12 */
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL;
-
-
-//MMX versions
-#if HAVE_MMX_INLINE
-#undef RENAME
-#define COMPILE_TEMPLATE_MMXEXT 0
-#define RENAME(a) a ## _MMX
-#include "swscale_template.c"
-#endif
-
-// MMXEXT versions
-#if HAVE_MMXEXT_INLINE
-#undef RENAME
-#undef COMPILE_TEMPLATE_MMXEXT
-#define COMPILE_TEMPLATE_MMXEXT 1
-#define RENAME(a) a ## _MMXEXT
-#include "swscale_template.c"
-#endif
-
-void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
- int lastInLumBuf, int lastInChrBuf)
-{
- const int dstH= c->dstH;
- const int flags= c->flags;
- int16_t **lumPixBuf= c->lumPixBuf;
- int16_t **chrUPixBuf= c->chrUPixBuf;
- int16_t **alpPixBuf= c->alpPixBuf;
- const int vLumBufSize= c->vLumBufSize;
- const int vChrBufSize= c->vChrBufSize;
- int32_t *vLumFilterPos= c->vLumFilterPos;
- int32_t *vChrFilterPos= c->vChrFilterPos;
- int16_t *vLumFilter= c->vLumFilter;
- int16_t *vChrFilter= c->vChrFilter;
- int32_t *lumMmxFilter= c->lumMmxFilter;
- int32_t *chrMmxFilter= c->chrMmxFilter;
- int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
- const int vLumFilterSize= c->vLumFilterSize;
- const int vChrFilterSize= c->vChrFilterSize;
- const int chrDstY= dstY>>c->chrDstVSubSample;
- const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
- const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
-
- c->blueDither= ff_dither8[dstY&1];
- if (c->dstFormat == AV_PIX_FMT_RGB555 || c->dstFormat == AV_PIX_FMT_BGR555)
- c->greenDither= ff_dither8[dstY&1];
- else
- c->greenDither= ff_dither4[dstY&1];
- c->redDither= ff_dither8[(dstY+1)&1];
- if (dstY < dstH - 2) {
- const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
- const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
- const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
- int i;
-
- if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
- const int16_t **tmpY = (const int16_t **) lumPixBuf + 2 * vLumBufSize;
- int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
- for (i = 0; i < neg; i++)
- tmpY[i] = lumSrcPtr[neg];
- for ( ; i < end; i++)
- tmpY[i] = lumSrcPtr[i];
- for ( ; i < vLumFilterSize; i++)
- tmpY[i] = tmpY[i-1];
- lumSrcPtr = tmpY;
-
- if (alpSrcPtr) {
- const int16_t **tmpA = (const int16_t **) alpPixBuf + 2 * vLumBufSize;
- for (i = 0; i < neg; i++)
- tmpA[i] = alpSrcPtr[neg];
- for ( ; i < end; i++)
- tmpA[i] = alpSrcPtr[i];
- for ( ; i < vLumFilterSize; i++)
- tmpA[i] = tmpA[i - 1];
- alpSrcPtr = tmpA;
- }
- }
- if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) {
- const int16_t **tmpU = (const int16_t **) chrUPixBuf + 2 * vChrBufSize;
- int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
- for (i = 0; i < neg; i++) {
- tmpU[i] = chrUSrcPtr[neg];
- }
- for ( ; i < end; i++) {
- tmpU[i] = chrUSrcPtr[i];
- }
- for ( ; i < vChrFilterSize; i++) {
- tmpU[i] = tmpU[i - 1];
- }
- chrUSrcPtr = tmpU;
- }
-
- if (flags & SWS_ACCURATE_RND) {
- int s= APCK_SIZE / 8;
- for (i=0; i<vLumFilterSize; i+=2) {
- *(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
- *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
- lumMmxFilter[s*i+APCK_COEF/4 ]=
- lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
- + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
- if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
- *(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
- *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
- alpMmxFilter[s*i+APCK_COEF/4 ]=
- alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
- }
- }
- for (i=0; i<vChrFilterSize; i+=2) {
- *(const void**)&chrMmxFilter[s*i ]= chrUSrcPtr[i ];
- *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrUSrcPtr[i+(vChrFilterSize>1)];
- chrMmxFilter[s*i+APCK_COEF/4 ]=
- chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
- + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
- }
- } else {
- for (i=0; i<vLumFilterSize; i++) {
- *(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i];
- lumMmxFilter[4*i+2]=
- lumMmxFilter[4*i+3]=
- ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U;
- if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
- *(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
- alpMmxFilter[4*i+2]=
- alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
- }
- }
- for (i=0; i<vChrFilterSize; i++) {
- *(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i];
- chrMmxFilter[4*i+2]=
- chrMmxFilter[4*i+3]=
- ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001U;
- }
- }
- }
-}
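-
-/* Shape of one packed entry written above for the SWS_ACCURATE_RND path
- * (sketch only; the field names are hypothetical, the inline asm walks the
- * block via the APCK_PTR2/APCK_COEF/APCK_SIZE byte offsets, two filter taps
- * at a time):
- *
- *     struct {
- *         const int16_t *src0;  // source line i
- *         const int16_t *src1;  // source line i+1 (line i again if filterSize == 1)
- *         int32_t coef[2];      // both = filter[i] | (filter[i+1] << 16)
- *     };
- */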
-
-#if HAVE_MMXEXT
-static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
- const int16_t **src, uint8_t *dest, int dstW,
- const uint8_t *dither, int offset)
-{
- if ((uintptr_t)dest & 15) {
- /* unaligned destination: the MMXEXT version has no 16-byte alignment requirement */
- yuv2yuvX_MMXEXT(filter, filterSize, src, dest, dstW, dither, offset);
- return;
- }
- if (offset) {
- __asm__ volatile("movq (%0), %%xmm3\n\t"
- "movdqa %%xmm3, %%xmm4\n\t"
- "psrlq $24, %%xmm3\n\t"
- "psllq $40, %%xmm4\n\t"
- "por %%xmm4, %%xmm3\n\t"
- :: "r"(dither)
- );
- } else {
- __asm__ volatile("movq (%0), %%xmm3\n\t"
- :: "r"(dither)
- );
- }
- filterSize--;
- __asm__ volatile(
- "pxor %%xmm0, %%xmm0\n\t"
- "punpcklbw %%xmm0, %%xmm3\n\t"
- "movd %0, %%xmm1\n\t"
- "punpcklwd %%xmm1, %%xmm1\n\t"
- "punpckldq %%xmm1, %%xmm1\n\t"
- "punpcklqdq %%xmm1, %%xmm1\n\t"
- "psllw $3, %%xmm1\n\t"
- "paddw %%xmm1, %%xmm3\n\t"
- "psraw $4, %%xmm3\n\t"
- ::"m"(filterSize)
- );
- __asm__ volatile(
- "movdqa %%xmm3, %%xmm4\n\t"
- "movdqa %%xmm3, %%xmm7\n\t"
- "movl %3, %%ecx\n\t"
- "mov %0, %%"REG_d" \n\t"\
- "mov (%%"REG_d"), %%"REG_S" \n\t"\
- ".p2align 4 \n\t" /* FIXME Unroll? */\
- "1: \n\t"\
- "movddup 8(%%"REG_d"), %%xmm0 \n\t" /* filterCoeff */\
- "movdqa (%%"REG_S", %%"REG_c", 2), %%xmm2 \n\t" /* srcData */\
- "movdqa 16(%%"REG_S", %%"REG_c", 2), %%xmm5 \n\t" /* srcData */\
- "add $16, %%"REG_d" \n\t"\
- "mov (%%"REG_d"), %%"REG_S" \n\t"\
- "test %%"REG_S", %%"REG_S" \n\t"\
- "pmulhw %%xmm0, %%xmm2 \n\t"\
- "pmulhw %%xmm0, %%xmm5 \n\t"\
- "paddw %%xmm2, %%xmm3 \n\t"\
- "paddw %%xmm5, %%xmm4 \n\t"\
- " jnz 1b \n\t"\
- "psraw $3, %%xmm3 \n\t"\
- "psraw $3, %%xmm4 \n\t"\
- "packuswb %%xmm4, %%xmm3 \n\t"
- "movntdq %%xmm3, (%1, %%"REG_c")\n\t"
- "add $16, %%"REG_c" \n\t"\
- "cmp %2, %%"REG_c" \n\t"\
- "movdqa %%xmm7, %%xmm3\n\t"
- "movdqa %%xmm7, %%xmm4\n\t"
- "mov %0, %%"REG_d" \n\t"\
- "mov (%%"REG_d"), %%"REG_S" \n\t"\
- "jb 1b \n\t"\
- :: "g" (filter),
- "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
- : "%"REG_d, "%"REG_S, "%"REG_c
- );
-}
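-
-/* Per-pixel semantics that this (and the template yuv2yuvX versions)
- * approximate, cf. the scalar yuv2planeX_8_c in libswscale/output.c:
- *
- *     int val = dither[(x + offset) & 7] << 12;
- *     for (j = 0; j < filterSize; j++)
- *         val += src[j][x] * filter[j];
- *     dest[x] = av_clip_uint8(val >> 19);
- */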
-#endif
-
-#endif /* HAVE_INLINE_ASM */
-
-#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
-extern void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
- SwsContext *c, int16_t *data, \
- int dstW, const uint8_t *src, \
- const int16_t *filter, \
- const int32_t *filterPos, int filterSize)
-
-#define SCALE_FUNCS(filter_n, opt) \
- SCALE_FUNC(filter_n, 8, 15, opt); \
- SCALE_FUNC(filter_n, 9, 15, opt); \
- SCALE_FUNC(filter_n, 10, 15, opt); \
- SCALE_FUNC(filter_n, 12, 15, opt); \
- SCALE_FUNC(filter_n, 14, 15, opt); \
- SCALE_FUNC(filter_n, 16, 15, opt); \
- SCALE_FUNC(filter_n, 8, 19, opt); \
- SCALE_FUNC(filter_n, 9, 19, opt); \
- SCALE_FUNC(filter_n, 10, 19, opt); \
- SCALE_FUNC(filter_n, 12, 19, opt); \
- SCALE_FUNC(filter_n, 14, 19, opt); \
- SCALE_FUNC(filter_n, 16, 19, opt)
-
-#define SCALE_FUNCS_MMX(opt) \
- SCALE_FUNCS(4, opt); \
- SCALE_FUNCS(8, opt); \
- SCALE_FUNCS(X, opt)
-
-#define SCALE_FUNCS_SSE(opt) \
- SCALE_FUNCS(4, opt); \
- SCALE_FUNCS(8, opt); \
- SCALE_FUNCS(X4, opt); \
- SCALE_FUNCS(X8, opt)
-
-#if ARCH_X86_32
-SCALE_FUNCS_MMX(mmx);
-#endif
-SCALE_FUNCS_SSE(sse2);
-SCALE_FUNCS_SSE(ssse3);
-SCALE_FUNCS_SSE(sse4);
-
-#define VSCALEX_FUNC(size, opt) \
-extern void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \
- const int16_t **src, uint8_t *dest, int dstW, \
- const uint8_t *dither, int offset)
-#define VSCALEX_FUNCS(opt) \
- VSCALEX_FUNC(8, opt); \
- VSCALEX_FUNC(9, opt); \
- VSCALEX_FUNC(10, opt)
-
-#if ARCH_X86_32
-VSCALEX_FUNCS(mmxext);
-#endif
-VSCALEX_FUNCS(sse2);
-VSCALEX_FUNCS(sse4);
-VSCALEX_FUNC(16, sse4);
-VSCALEX_FUNCS(avx);
-
-#define VSCALE_FUNC(size, opt) \
-extern void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int dstW, \
- const uint8_t *dither, int offset)
-#define VSCALE_FUNCS(opt1, opt2) \
- VSCALE_FUNC(8, opt1); \
- VSCALE_FUNC(9, opt2); \
- VSCALE_FUNC(10, opt2); \
- VSCALE_FUNC(16, opt1)
-
-#if ARCH_X86_32
-VSCALE_FUNCS(mmx, mmxext);
-#endif
-VSCALE_FUNCS(sse2, sse2);
-VSCALE_FUNC(16, sse4);
-VSCALE_FUNCS(avx, avx);
-
-#define INPUT_Y_FUNC(fmt, opt) \
-extern void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \
- const uint8_t *unused1, const uint8_t *unused2, \
- int w, uint32_t *unused)
-#define INPUT_UV_FUNC(fmt, opt) \
-extern void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
- const uint8_t *unused0, \
- const uint8_t *src1, \
- const uint8_t *src2, \
- int w, uint32_t *unused)
-#define INPUT_FUNC(fmt, opt) \
- INPUT_Y_FUNC(fmt, opt); \
- INPUT_UV_FUNC(fmt, opt)
-#define INPUT_FUNCS(opt) \
- INPUT_FUNC(uyvy, opt); \
- INPUT_FUNC(yuyv, opt); \
- INPUT_UV_FUNC(nv12, opt); \
- INPUT_UV_FUNC(nv21, opt); \
- INPUT_FUNC(rgba, opt); \
- INPUT_FUNC(bgra, opt); \
- INPUT_FUNC(argb, opt); \
- INPUT_FUNC(abgr, opt); \
- INPUT_FUNC(rgb24, opt); \
- INPUT_FUNC(bgr24, opt)
-
-#if ARCH_X86_32
-INPUT_FUNCS(mmx);
-#endif
-INPUT_FUNCS(sse2);
-INPUT_FUNCS(ssse3);
-INPUT_FUNCS(avx);
-
-av_cold void ff_sws_init_swScale_mmx(SwsContext *c)
-{
- int cpu_flags = av_get_cpu_flags();
-
-#if HAVE_INLINE_ASM
- if (cpu_flags & AV_CPU_FLAG_MMX)
- sws_init_swScale_MMX(c);
-#if HAVE_MMXEXT_INLINE
- if (cpu_flags & AV_CPU_FLAG_MMXEXT)
- sws_init_swScale_MMXEXT(c);
- if (cpu_flags & AV_CPU_FLAG_SSE3) {
- if (c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND))
- c->yuv2planeX = yuv2yuvX_sse3;
- }
-#endif
-#endif /* HAVE_INLINE_ASM */
-
-#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
- if (c->srcBpc == 8) { \
- hscalefn = c->dstBpc <= 14 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
- ff_hscale8to19_ ## filtersize ## _ ## opt1; \
- } else if (c->srcBpc == 9) { \
- hscalefn = c->dstBpc <= 14 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
- ff_hscale9to19_ ## filtersize ## _ ## opt1; \
- } else if (c->srcBpc == 10) { \
- hscalefn = c->dstBpc <= 14 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
- ff_hscale10to19_ ## filtersize ## _ ## opt1; \
- } else if (c->srcBpc == 12) { \
- hscalefn = c->dstBpc <= 14 ? ff_hscale12to15_ ## filtersize ## _ ## opt2 : \
- ff_hscale12to19_ ## filtersize ## _ ## opt1; \
- } else if (c->srcBpc == 14 || ((c->srcFormat==AV_PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth_minus1<15)) { \
- hscalefn = c->dstBpc <= 14 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \
- ff_hscale14to19_ ## filtersize ## _ ## opt1; \
- } else { /* c->srcBpc == 16 */ \
- av_assert0(c->srcBpc == 16);\
- hscalefn = c->dstBpc <= 14 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
- ff_hscale16to19_ ## filtersize ## _ ## opt1; \
- } \
-} while (0)
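-
-/* Illustration: for srcBpc == 8, dstBpc <= 14 and filtersize == 4, the mmx
- * instantiation below reduces to a plain assignment:
- *
- *     c->hyScale = ff_hscale8to15_4_mmx;
- */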
-#define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
- switch (filtersize) { \
- case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
- case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
- default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \
- }
-#define ASSIGN_VSCALEX_FUNC(vscalefn, opt, do_16_case, condition_8bit) \
-switch(c->dstBpc){ \
- case 16: do_16_case; break; \
- case 10: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_10_ ## opt; break; \
- case 9: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_ ## opt; break; \
- default: if (condition_8bit) /*vscalefn = ff_yuv2planeX_8_ ## opt;*/ break; \
- }
-#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \
- switch(c->dstBpc){ \
- case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \
- case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \
- case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \
- case 8: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \
- default: av_assert0(c->dstBpc>8); \
- }
-#define case_rgb(x, X, opt) \
- case AV_PIX_FMT_ ## X: \
- c->lumToYV12 = ff_ ## x ## ToY_ ## opt; \
- if (!c->chrSrcHSubSample) \
- c->chrToYV12 = ff_ ## x ## ToUV_ ## opt; \
- break
-#if ARCH_X86_32
- if (EXTERNAL_MMX(cpu_flags)) {
- ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
- ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
- ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmxext, cpu_flags & AV_CPU_FLAG_MMXEXT);
-
- switch (c->srcFormat) {
- case AV_PIX_FMT_Y400A:
- c->lumToYV12 = ff_yuyvToY_mmx;
- if (c->alpPixBuf)
- c->alpToYV12 = ff_uyvyToY_mmx;
- break;
- case AV_PIX_FMT_YUYV422:
- c->lumToYV12 = ff_yuyvToY_mmx;
- c->chrToYV12 = ff_yuyvToUV_mmx;
- break;
- case AV_PIX_FMT_UYVY422:
- c->lumToYV12 = ff_uyvyToY_mmx;
- c->chrToYV12 = ff_uyvyToUV_mmx;
- break;
- case AV_PIX_FMT_NV12:
- c->chrToYV12 = ff_nv12ToUV_mmx;
- break;
- case AV_PIX_FMT_NV21:
- c->chrToYV12 = ff_nv21ToUV_mmx;
- break;
- case_rgb(rgb24, RGB24, mmx);
- case_rgb(bgr24, BGR24, mmx);
- case_rgb(bgra, BGRA, mmx);
- case_rgb(rgba, RGBA, mmx);
- case_rgb(abgr, ABGR, mmx);
- case_rgb(argb, ARGB, mmx);
- default:
- break;
- }
- }
- if (EXTERNAL_MMXEXT(cpu_flags)) {
- ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmxext, , 1);
- }
-#endif /* ARCH_X86_32 */
-#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
- switch (filtersize) { \
- case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
- case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
- default: if (filtersize & 4) ASSIGN_SCALE_FUNC2(hscalefn, X4, opt1, opt2); \
- else ASSIGN_SCALE_FUNC2(hscalefn, X8, opt1, opt2); \
- break; \
- }
- if (EXTERNAL_SSE2(cpu_flags)) {
- ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2);
- ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
- ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, ,
- HAVE_ALIGNED_STACK || ARCH_X86_64);
- ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1);
-
- switch (c->srcFormat) {
- case AV_PIX_FMT_Y400A:
- c->lumToYV12 = ff_yuyvToY_sse2;
- if (c->alpPixBuf)
- c->alpToYV12 = ff_uyvyToY_sse2;
- break;
- case AV_PIX_FMT_YUYV422:
- c->lumToYV12 = ff_yuyvToY_sse2;
- c->chrToYV12 = ff_yuyvToUV_sse2;
- break;
- case AV_PIX_FMT_UYVY422:
- c->lumToYV12 = ff_uyvyToY_sse2;
- c->chrToYV12 = ff_uyvyToUV_sse2;
- break;
- case AV_PIX_FMT_NV12:
- c->chrToYV12 = ff_nv12ToUV_sse2;
- break;
- case AV_PIX_FMT_NV21:
- c->chrToYV12 = ff_nv21ToUV_sse2;
- break;
- case_rgb(rgb24, RGB24, sse2);
- case_rgb(bgr24, BGR24, sse2);
- case_rgb(bgra, BGRA, sse2);
- case_rgb(rgba, RGBA, sse2);
- case_rgb(abgr, ABGR, sse2);
- case_rgb(argb, ARGB, sse2);
- default:
- break;
- }
- }
- if (EXTERNAL_SSSE3(cpu_flags)) {
- ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
- ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, ssse3, ssse3);
- switch (c->srcFormat) {
- case_rgb(rgb24, RGB24, ssse3);
- case_rgb(bgr24, BGR24, ssse3);
- default:
- break;
- }
- }
- if (EXTERNAL_SSE4(cpu_flags)) {
- /* the Xto15 functions do not need special sse4 versions */
- ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3);
- ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
- ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse4,
- if (!isBE(c->dstFormat)) c->yuv2planeX = ff_yuv2planeX_16_sse4,
- HAVE_ALIGNED_STACK || ARCH_X86_64);
- if (c->dstBpc == 16 && !isBE(c->dstFormat))
- c->yuv2plane1 = ff_yuv2plane1_16_sse4;
- }
-
- if (EXTERNAL_AVX(cpu_flags)) {
- ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, ,
- HAVE_ALIGNED_STACK || ARCH_X86_64);
- ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1);
-
- switch (c->srcFormat) {
- case AV_PIX_FMT_YUYV422:
- c->chrToYV12 = ff_yuyvToUV_avx;
- break;
- case AV_PIX_FMT_UYVY422:
- c->chrToYV12 = ff_uyvyToUV_avx;
- break;
- case AV_PIX_FMT_NV12:
- c->chrToYV12 = ff_nv12ToUV_avx;
- break;
- case AV_PIX_FMT_NV21:
- c->chrToYV12 = ff_nv21ToUV_avx;
- break;
- case_rgb(rgb24, RGB24, avx);
- case_rgb(bgr24, BGR24, avx);
- case_rgb(bgra, BGRA, avx);
- case_rgb(rgba, RGBA, avx);
- case_rgb(abgr, ABGR, avx);
- case_rgb(argb, ARGB, avx);
- default:
- break;
- }
- }
-}
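-
-/* Usage sketch (assumed caller, not part of this file): the generic init
- * path calls this once per context after the C defaults are in place, e.g.
- *
- *     sws_init_swScale_c(c);
- *     if (HAVE_MMX)
- *         ff_sws_init_swScale_mmx(c);
- */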
diff --git a/ffmpeg1/libswscale/x86/swscale_template.c b/ffmpeg1/libswscale/x86/swscale_template.c
deleted file mode 100644
index f2567c1..0000000
--- a/ffmpeg1/libswscale/x86/swscale_template.c
+++ /dev/null
@@ -1,1717 +0,0 @@
-/*
- * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#undef REAL_MOVNTQ
-#undef MOVNTQ
-#undef MOVNTQ2
-#undef PREFETCH
-
-#if COMPILE_TEMPLATE_MMXEXT
-#define PREFETCH "prefetchnta"
-#else
-#define PREFETCH " # nop"
-#endif
-
-#if COMPILE_TEMPLATE_MMXEXT
-#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
-#define MOVNTQ2 "movntq "
-#else
-#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
-#define MOVNTQ2 "movq "
-#endif
-#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
-
-#if !COMPILE_TEMPLATE_MMXEXT
-static av_always_inline void
-dither_8to16(const uint8_t *srcDither, int rot)
-{
- if (rot) {
- __asm__ volatile("pxor %%mm0, %%mm0\n\t"
- "movq (%0), %%mm3\n\t"
- "movq %%mm3, %%mm4\n\t"
- "psrlq $24, %%mm3\n\t"
- "psllq $40, %%mm4\n\t"
- "por %%mm4, %%mm3\n\t"
- "movq %%mm3, %%mm4\n\t"
- "punpcklbw %%mm0, %%mm3\n\t"
- "punpckhbw %%mm0, %%mm4\n\t"
- :: "r"(srcDither)
- );
- } else {
- __asm__ volatile("pxor %%mm0, %%mm0\n\t"
- "movq (%0), %%mm3\n\t"
- "movq %%mm3, %%mm4\n\t"
- "punpcklbw %%mm0, %%mm3\n\t"
- "punpckhbw %%mm0, %%mm4\n\t"
- :: "r"(srcDither)
- );
- }
-}
-#endif
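-
-/* What dither_8to16 leaves in mm3/mm4: the 8-byte dither pattern widened to
- * words, rotated left by 3 bytes when rot is set so that it matches the
- * scalar (x + offset) & 7 indexing for the offsets used here (sketch):
- *
- *     d[i] = dither[rot ? (i + 3) & 7 : i];  // mm3 = d[0..3], mm4 = d[4..7]
- */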
-
-static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
- const int16_t **src, uint8_t *dest, int dstW,
- const uint8_t *dither, int offset)
-{
- dither_8to16(dither, offset);
- filterSize--;
- __asm__ volatile(
- "movd %0, %%mm1\n\t"
- "punpcklwd %%mm1, %%mm1\n\t"
- "punpckldq %%mm1, %%mm1\n\t"
- "psllw $3, %%mm1\n\t"
- "paddw %%mm1, %%mm3\n\t"
- "paddw %%mm1, %%mm4\n\t"
- "psraw $4, %%mm3\n\t"
- "psraw $4, %%mm4\n\t"
- ::"m"(filterSize)
- );
-
- __asm__ volatile(\
- "movq %%mm3, %%mm6\n\t"
- "movq %%mm4, %%mm7\n\t"
- "movl %3, %%ecx\n\t"
- "mov %0, %%"REG_d" \n\t"\
- "mov (%%"REG_d"), %%"REG_S" \n\t"\
- ".p2align 4 \n\t" /* FIXME Unroll? */\
- "1: \n\t"\
- "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
- "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\
- "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\
- "add $16, %%"REG_d" \n\t"\
- "mov (%%"REG_d"), %%"REG_S" \n\t"\
- "test %%"REG_S", %%"REG_S" \n\t"\
- "pmulhw %%mm0, %%mm2 \n\t"\
- "pmulhw %%mm0, %%mm5 \n\t"\
- "paddw %%mm2, %%mm3 \n\t"\
- "paddw %%mm5, %%mm4 \n\t"\
- " jnz 1b \n\t"\
- "psraw $3, %%mm3 \n\t"\
- "psraw $3, %%mm4 \n\t"\
- "packuswb %%mm4, %%mm3 \n\t"
- MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t"
- "add $8, %%"REG_c" \n\t"\
- "cmp %2, %%"REG_c" \n\t"\
- "movq %%mm6, %%mm3\n\t"
- "movq %%mm7, %%mm4\n\t"
- "mov %0, %%"REG_d" \n\t"\
- "mov (%%"REG_d"), %%"REG_S" \n\t"\
- "jb 1b \n\t"\
- :: "g" (filter),
- "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
- : "%"REG_d, "%"REG_S, "%"REG_c
- );
-}
-
-#define YSCALEYUV2PACKEDX_UV \
- __asm__ volatile(\
- "xor %%"REG_a", %%"REG_a" \n\t"\
- ".p2align 4 \n\t"\
- "nop \n\t"\
- "1: \n\t"\
- "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
- "mov (%%"REG_d"), %%"REG_S" \n\t"\
- "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
- "movq %%mm3, %%mm4 \n\t"\
- ".p2align 4 \n\t"\
- "2: \n\t"\
- "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
- "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
- "add %6, %%"REG_S" \n\t" \
- "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
- "add $16, %%"REG_d" \n\t"\
- "mov (%%"REG_d"), %%"REG_S" \n\t"\
- "pmulhw %%mm0, %%mm2 \n\t"\
- "pmulhw %%mm0, %%mm5 \n\t"\
- "paddw %%mm2, %%mm3 \n\t"\
- "paddw %%mm5, %%mm4 \n\t"\
- "test %%"REG_S", %%"REG_S" \n\t"\
- " jnz 2b \n\t"\
-
-#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
- "lea "offset"(%0), %%"REG_d" \n\t"\
- "mov (%%"REG_d"), %%"REG_S" \n\t"\
- "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
- "movq "#dst1", "#dst2" \n\t"\
- ".p2align 4 \n\t"\
- "2: \n\t"\
- "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
- "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
- "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
- "add $16, %%"REG_d" \n\t"\
- "mov (%%"REG_d"), %%"REG_S" \n\t"\
- "pmulhw "#coeff", "#src1" \n\t"\
- "pmulhw "#coeff", "#src2" \n\t"\
- "paddw "#src1", "#dst1" \n\t"\
- "paddw "#src2", "#dst2" \n\t"\
- "test %%"REG_S", %%"REG_S" \n\t"\
- " jnz 2b \n\t"\
-
-#define YSCALEYUV2PACKEDX \
- YSCALEYUV2PACKEDX_UV \
- YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
-
-#define YSCALEYUV2PACKEDX_END \
- :: "r" (&c->redDither), \
- "m" (dummy), "m" (dummy), "m" (dummy),\
- "r" (dest), "m" (dstW_reg), "m"(uv_off) \
- : "%"REG_a, "%"REG_d, "%"REG_S \
- );
-
-#define YSCALEYUV2PACKEDX_ACCURATE_UV \
- __asm__ volatile(\
- "xor %%"REG_a", %%"REG_a" \n\t"\
- ".p2align 4 \n\t"\
- "nop \n\t"\
- "1: \n\t"\
- "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
- "mov (%%"REG_d"), %%"REG_S" \n\t"\
- "pxor %%mm4, %%mm4 \n\t"\
- "pxor %%mm5, %%mm5 \n\t"\
- "pxor %%mm6, %%mm6 \n\t"\
- "pxor %%mm7, %%mm7 \n\t"\
- ".p2align 4 \n\t"\
- "2: \n\t"\
- "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
- "add %6, %%"REG_S" \n\t" \
- "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
- "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
- "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
- "movq %%mm0, %%mm3 \n\t"\
- "punpcklwd %%mm1, %%mm0 \n\t"\
- "punpckhwd %%mm1, %%mm3 \n\t"\
- "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
- "pmaddwd %%mm1, %%mm0 \n\t"\
- "pmaddwd %%mm1, %%mm3 \n\t"\
- "paddd %%mm0, %%mm4 \n\t"\
- "paddd %%mm3, %%mm5 \n\t"\
- "add %6, %%"REG_S" \n\t" \
- "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
- "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
- "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
- "test %%"REG_S", %%"REG_S" \n\t"\
- "movq %%mm2, %%mm0 \n\t"\
- "punpcklwd %%mm3, %%mm2 \n\t"\
- "punpckhwd %%mm3, %%mm0 \n\t"\
- "pmaddwd %%mm1, %%mm2 \n\t"\
- "pmaddwd %%mm1, %%mm0 \n\t"\
- "paddd %%mm2, %%mm6 \n\t"\
- "paddd %%mm0, %%mm7 \n\t"\
- " jnz 2b \n\t"\
- "psrad $16, %%mm4 \n\t"\
- "psrad $16, %%mm5 \n\t"\
- "psrad $16, %%mm6 \n\t"\
- "psrad $16, %%mm7 \n\t"\
- "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
- "packssdw %%mm5, %%mm4 \n\t"\
- "packssdw %%mm7, %%mm6 \n\t"\
- "paddw %%mm0, %%mm4 \n\t"\
- "paddw %%mm0, %%mm6 \n\t"\
- "movq %%mm4, "U_TEMP"(%0) \n\t"\
- "movq %%mm6, "V_TEMP"(%0) \n\t"\
-
-#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
- "lea "offset"(%0), %%"REG_d" \n\t"\
- "mov (%%"REG_d"), %%"REG_S" \n\t"\
- "pxor %%mm1, %%mm1 \n\t"\
- "pxor %%mm5, %%mm5 \n\t"\
- "pxor %%mm7, %%mm7 \n\t"\
- "pxor %%mm6, %%mm6 \n\t"\
- ".p2align 4 \n\t"\
- "2: \n\t"\
- "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
- "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
- "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
- "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
- "movq %%mm0, %%mm3 \n\t"\
- "punpcklwd %%mm4, %%mm0 \n\t"\
- "punpckhwd %%mm4, %%mm3 \n\t"\
- "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
- "pmaddwd %%mm4, %%mm0 \n\t"\
- "pmaddwd %%mm4, %%mm3 \n\t"\
- "paddd %%mm0, %%mm1 \n\t"\
- "paddd %%mm3, %%mm5 \n\t"\
- "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
- "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
- "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
- "test %%"REG_S", %%"REG_S" \n\t"\
- "movq %%mm2, %%mm0 \n\t"\
- "punpcklwd %%mm3, %%mm2 \n\t"\
- "punpckhwd %%mm3, %%mm0 \n\t"\
- "pmaddwd %%mm4, %%mm2 \n\t"\
- "pmaddwd %%mm4, %%mm0 \n\t"\
- "paddd %%mm2, %%mm7 \n\t"\
- "paddd %%mm0, %%mm6 \n\t"\
- " jnz 2b \n\t"\
- "psrad $16, %%mm1 \n\t"\
- "psrad $16, %%mm5 \n\t"\
- "psrad $16, %%mm7 \n\t"\
- "psrad $16, %%mm6 \n\t"\
- "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
- "packssdw %%mm5, %%mm1 \n\t"\
- "packssdw %%mm6, %%mm7 \n\t"\
- "paddw %%mm0, %%mm1 \n\t"\
- "paddw %%mm0, %%mm7 \n\t"\
- "movq "U_TEMP"(%0), %%mm3 \n\t"\
- "movq "V_TEMP"(%0), %%mm4 \n\t"\
-
-#define YSCALEYUV2PACKEDX_ACCURATE \
- YSCALEYUV2PACKEDX_ACCURATE_UV \
- YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
-
-#define YSCALEYUV2RGBX \
- "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
- "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
- "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
- "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
- "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
- "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
- /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
- "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
- "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
- "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
- "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
- "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
- "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
- /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
- "paddw %%mm3, %%mm4 \n\t"\
- "movq %%mm2, %%mm0 \n\t"\
- "movq %%mm5, %%mm6 \n\t"\
- "movq %%mm4, %%mm3 \n\t"\
- "punpcklwd %%mm2, %%mm2 \n\t"\
- "punpcklwd %%mm5, %%mm5 \n\t"\
- "punpcklwd %%mm4, %%mm4 \n\t"\
- "paddw %%mm1, %%mm2 \n\t"\
- "paddw %%mm1, %%mm5 \n\t"\
- "paddw %%mm1, %%mm4 \n\t"\
- "punpckhwd %%mm0, %%mm0 \n\t"\
- "punpckhwd %%mm6, %%mm6 \n\t"\
- "punpckhwd %%mm3, %%mm3 \n\t"\
- "paddw %%mm7, %%mm0 \n\t"\
- "paddw %%mm7, %%mm6 \n\t"\
- "paddw %%mm7, %%mm3 \n\t"\
- /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
- "packuswb %%mm0, %%mm2 \n\t"\
- "packuswb %%mm6, %%mm5 \n\t"\
- "packuswb %%mm3, %%mm4 \n\t"\
-
-#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
- "movq "#b", "#q2" \n\t" /* B */\
- "movq "#r", "#t" \n\t" /* R */\
- "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
- "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
- "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
- "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
- "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
- "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
- "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
- "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
- "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
- "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
-\
- MOVNTQ( q0, (dst, index, 4))\
- MOVNTQ( b, 8(dst, index, 4))\
- MOVNTQ( q2, 16(dst, index, 4))\
- MOVNTQ( q3, 24(dst, index, 4))\
-\
- "add $8, "#index" \n\t"\
- "cmp "#dstw", "#index" \n\t"\
- " jb 1b \n\t"
-#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
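-
-/* WRITEBGR32 interleaves four 8-pixel byte planes into packed 32-bit pixels
- * with two rounds of punpck; in scalar terms (sketch, AV_WL32 from
- * libavutil/intreadwrite.h):
- *
- *     for (i = 0; i < 8; i++)
- *         AV_WL32(dst + 4 * i, b[i] | g[i] << 8 | r[i] << 16 | a[i] << 24);
- */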
-
-static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
- const int16_t **lumSrc, int lumFilterSize,
- const int16_t *chrFilter, const int16_t **chrUSrc,
- const int16_t **chrVSrc,
- int chrFilterSize, const int16_t **alpSrc,
- uint8_t *dest, int dstW, int dstY)
-{
- x86_reg dummy=0;
- x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
-
- if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
- YSCALEYUV2PACKEDX_ACCURATE
- YSCALEYUV2RGBX
- "movq %%mm2, "U_TEMP"(%0) \n\t"
- "movq %%mm4, "V_TEMP"(%0) \n\t"
- "movq %%mm5, "Y_TEMP"(%0) \n\t"
- YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
- "movq "Y_TEMP"(%0), %%mm5 \n\t"
- "psraw $3, %%mm1 \n\t"
- "psraw $3, %%mm7 \n\t"
- "packuswb %%mm7, %%mm1 \n\t"
- WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
- YSCALEYUV2PACKEDX_END
- } else {
- YSCALEYUV2PACKEDX_ACCURATE
- YSCALEYUV2RGBX
- "pcmpeqd %%mm7, %%mm7 \n\t"
- WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
- YSCALEYUV2PACKEDX_END
- }
-}
-
-static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
- const int16_t **lumSrc, int lumFilterSize,
- const int16_t *chrFilter, const int16_t **chrUSrc,
- const int16_t **chrVSrc,
- int chrFilterSize, const int16_t **alpSrc,
- uint8_t *dest, int dstW, int dstY)
-{
- x86_reg dummy=0;
- x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
-
- if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
- YSCALEYUV2PACKEDX
- YSCALEYUV2RGBX
- YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
- "psraw $3, %%mm1 \n\t"
- "psraw $3, %%mm7 \n\t"
- "packuswb %%mm7, %%mm1 \n\t"
- WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
- YSCALEYUV2PACKEDX_END
- } else {
- YSCALEYUV2PACKEDX
- YSCALEYUV2RGBX
- "pcmpeqd %%mm7, %%mm7 \n\t"
- WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
- YSCALEYUV2PACKEDX_END
- }
-}
-
-#define REAL_WRITERGB16(dst, dstw, index) \
- "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
- "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
- "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
- "psrlq $3, %%mm2 \n\t"\
-\
- "movq %%mm2, %%mm1 \n\t"\
- "movq %%mm4, %%mm3 \n\t"\
-\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm5, %%mm2 \n\t"\
- "punpckhbw %%mm7, %%mm4 \n\t"\
- "punpckhbw %%mm5, %%mm1 \n\t"\
-\
- "psllq $3, %%mm3 \n\t"\
- "psllq $3, %%mm4 \n\t"\
-\
- "por %%mm3, %%mm2 \n\t"\
- "por %%mm4, %%mm1 \n\t"\
-\
- MOVNTQ(%%mm2, (dst, index, 2))\
- MOVNTQ(%%mm1, 8(dst, index, 2))\
-\
- "add $8, "#index" \n\t"\
- "cmp "#dstw", "#index" \n\t"\
- " jb 1b \n\t"
-#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
-
-static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
- const int16_t **lumSrc, int lumFilterSize,
- const int16_t *chrFilter, const int16_t **chrUSrc,
- const int16_t **chrVSrc,
- int chrFilterSize, const int16_t **alpSrc,
- uint8_t *dest, int dstW, int dstY)
-{
- x86_reg dummy=0;
- x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
-
- YSCALEYUV2PACKEDX_ACCURATE
- YSCALEYUV2RGBX
- "pxor %%mm7, %%mm7 \n\t"
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
-#ifdef DITHER1XBPP
- "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
- "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
- "paddusb "RED_DITHER"(%0), %%mm5\n\t"
-#endif
- WRITERGB16(%4, %5, %%REGa)
- YSCALEYUV2PACKEDX_END
-}
-
-static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
- const int16_t **lumSrc, int lumFilterSize,
- const int16_t *chrFilter, const int16_t **chrUSrc,
- const int16_t **chrVSrc,
- int chrFilterSize, const int16_t **alpSrc,
- uint8_t *dest, int dstW, int dstY)
-{
- x86_reg dummy=0;
- x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
-
- YSCALEYUV2PACKEDX
- YSCALEYUV2RGBX
- "pxor %%mm7, %%mm7 \n\t"
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
-#ifdef DITHER1XBPP
- "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
- "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
- "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
-#endif
- WRITERGB16(%4, %5, %%REGa)
- YSCALEYUV2PACKEDX_END
-}
-
-#define REAL_WRITERGB15(dst, dstw, index) \
- "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
- "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
- "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
- "psrlq $3, %%mm2 \n\t"\
- "psrlq $1, %%mm5 \n\t"\
-\
- "movq %%mm2, %%mm1 \n\t"\
- "movq %%mm4, %%mm3 \n\t"\
-\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm5, %%mm2 \n\t"\
- "punpckhbw %%mm7, %%mm4 \n\t"\
- "punpckhbw %%mm5, %%mm1 \n\t"\
-\
- "psllq $2, %%mm3 \n\t"\
- "psllq $2, %%mm4 \n\t"\
-\
- "por %%mm3, %%mm2 \n\t"\
- "por %%mm4, %%mm1 \n\t"\
-\
- MOVNTQ(%%mm2, (dst, index, 2))\
- MOVNTQ(%%mm1, 8(dst, index, 2))\
-\
- "add $8, "#index" \n\t"\
- "cmp "#dstw", "#index" \n\t"\
- " jb 1b \n\t"
-#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
-
-static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
- const int16_t **lumSrc, int lumFilterSize,
- const int16_t *chrFilter, const int16_t **chrUSrc,
- const int16_t **chrVSrc,
- int chrFilterSize, const int16_t **alpSrc,
- uint8_t *dest, int dstW, int dstY)
-{
- x86_reg dummy=0;
- x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
-
- YSCALEYUV2PACKEDX_ACCURATE
- YSCALEYUV2RGBX
- "pxor %%mm7, %%mm7 \n\t"
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
-#ifdef DITHER1XBPP
- "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
- "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
- "paddusb "RED_DITHER"(%0), %%mm5\n\t"
-#endif
- WRITERGB15(%4, %5, %%REGa)
- YSCALEYUV2PACKEDX_END
-}
-
-static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
- const int16_t **lumSrc, int lumFilterSize,
- const int16_t *chrFilter, const int16_t **chrUSrc,
- const int16_t **chrVSrc,
- int chrFilterSize, const int16_t **alpSrc,
- uint8_t *dest, int dstW, int dstY)
-{
- x86_reg dummy=0;
- x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
-
- YSCALEYUV2PACKEDX
- YSCALEYUV2RGBX
- "pxor %%mm7, %%mm7 \n\t"
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
-#ifdef DITHER1XBPP
- "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
- "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
- "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
-#endif
- WRITERGB15(%4, %5, %%REGa)
- YSCALEYUV2PACKEDX_END
-}
-
-#define WRITEBGR24MMX(dst, dstw, index) \
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
- "movq %%mm2, %%mm1 \n\t" /* B */\
- "movq %%mm5, %%mm6 \n\t" /* R */\
- "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
- "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
- "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
- "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
- "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
- "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
- "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
- "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
- "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
- "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
-\
- "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
- "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
- "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
- "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
-\
- "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
- "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
- "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
- "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
-\
- "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
- "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
- "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
- "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
-\
- "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
- "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
- "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
- "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
- MOVNTQ(%%mm0, (dst))\
-\
- "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
- "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
- "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
- "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
- MOVNTQ(%%mm6, 8(dst))\
-\
- "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
- "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
- "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
- MOVNTQ(%%mm5, 16(dst))\
-\
- "add $24, "#dst" \n\t"\
-\
- "add $8, "#index" \n\t"\
- "cmp "#dstw", "#index" \n\t"\
- " jb 1b \n\t"
-
-#define WRITEBGR24MMXEXT(dst, dstw, index) \
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
- "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
- "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
- "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
- "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
- "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
-\
- "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
- "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
- "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
-\
- "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
- "por %%mm1, %%mm6 \n\t"\
- "por %%mm3, %%mm6 \n\t"\
- MOVNTQ(%%mm6, (dst))\
-\
- "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
- "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
- "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
- "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
-\
- "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
- "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
- "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
-\
- "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
- "por %%mm3, %%mm6 \n\t"\
- MOVNTQ(%%mm6, 8(dst))\
-\
- "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
- "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
- "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
-\
- "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
- "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
- "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
-\
- "por %%mm1, %%mm3 \n\t"\
- "por %%mm3, %%mm6 \n\t"\
- MOVNTQ(%%mm6, 16(dst))\
-\
- "add $24, "#dst" \n\t"\
-\
- "add $8, "#index" \n\t"\
- "cmp "#dstw", "#index" \n\t"\
- " jb 1b \n\t"
-
-#if COMPILE_TEMPLATE_MMXEXT
-#undef WRITEBGR24
-#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
-#else
-#undef WRITEBGR24
-#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
-#endif
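-
-/* Either variant stores 8 pixels of packed 24-bit output; in scalar terms
- * (sketch, with b/g/r as left in mm2/mm4/mm5 by YSCALEYUV2RGBX):
- *
- *     for (i = 0; i < 8; i++) {
- *         dst[3 * i + 0] = b[i];
- *         dst[3 * i + 1] = g[i];
- *         dst[3 * i + 2] = r[i];
- *     }
- */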
-
-static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
- const int16_t **lumSrc, int lumFilterSize,
- const int16_t *chrFilter, const int16_t **chrUSrc,
- const int16_t **chrVSrc,
- int chrFilterSize, const int16_t **alpSrc,
- uint8_t *dest, int dstW, int dstY)
-{
- x86_reg dummy=0;
- x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
-
- YSCALEYUV2PACKEDX_ACCURATE
- YSCALEYUV2RGBX
- "pxor %%mm7, %%mm7 \n\t"
- "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
- "add %4, %%"REG_c" \n\t"
- WRITEBGR24(%%REGc, %5, %%REGa)
- :: "r" (&c->redDither),
- "m" (dummy), "m" (dummy), "m" (dummy),
- "r" (dest), "m" (dstW_reg), "m"(uv_off)
- : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
- );
-}
-
-static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
- const int16_t **lumSrc, int lumFilterSize,
- const int16_t *chrFilter, const int16_t **chrUSrc,
- const int16_t **chrVSrc,
- int chrFilterSize, const int16_t **alpSrc,
- uint8_t *dest, int dstW, int dstY)
-{
- x86_reg dummy=0;
- x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
-
- YSCALEYUV2PACKEDX
- YSCALEYUV2RGBX
- "pxor %%mm7, %%mm7 \n\t"
- "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
- "add %4, %%"REG_c" \n\t"
- WRITEBGR24(%%REGc, %5, %%REGa)
- :: "r" (&c->redDither),
- "m" (dummy), "m" (dummy), "m" (dummy),
- "r" (dest), "m" (dstW_reg), "m"(uv_off)
- : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
- );
-}
-
-#define REAL_WRITEYUY2(dst, dstw, index) \
- "packuswb %%mm3, %%mm3 \n\t"\
- "packuswb %%mm4, %%mm4 \n\t"\
- "packuswb %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm4, %%mm3 \n\t"\
- "movq %%mm1, %%mm7 \n\t"\
- "punpcklbw %%mm3, %%mm1 \n\t"\
- "punpckhbw %%mm3, %%mm7 \n\t"\
-\
- MOVNTQ(%%mm1, (dst, index, 2))\
- MOVNTQ(%%mm7, 8(dst, index, 2))\
-\
- "add $8, "#index" \n\t"\
- "cmp "#dstw", "#index" \n\t"\
- " jb 1b \n\t"
-#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
-
-static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
- const int16_t **lumSrc, int lumFilterSize,
- const int16_t *chrFilter, const int16_t **chrUSrc,
- const int16_t **chrVSrc,
- int chrFilterSize, const int16_t **alpSrc,
- uint8_t *dest, int dstW, int dstY)
-{
- x86_reg dummy=0;
- x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
-
- YSCALEYUV2PACKEDX_ACCURATE
- /* mm1=Y1, %%mm7=Y2, %%mm3=U, %%mm4=V */
- "psraw $3, %%mm3 \n\t"
- "psraw $3, %%mm4 \n\t"
- "psraw $3, %%mm1 \n\t"
- "psraw $3, %%mm7 \n\t"
- WRITEYUY2(%4, %5, %%REGa)
- YSCALEYUV2PACKEDX_END
-}
-
-static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
- const int16_t **lumSrc, int lumFilterSize,
- const int16_t *chrFilter, const int16_t **chrUSrc,
- const int16_t **chrVSrc,
- int chrFilterSize, const int16_t **alpSrc,
- uint8_t *dest, int dstW, int dstY)
-{
- x86_reg dummy=0;
- x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_offx2;
-
- YSCALEYUV2PACKEDX
- /* mm1=Y1, %%mm7=Y2, %%mm3=U, %%mm4=V */
- "psraw $3, %%mm3 \n\t"
- "psraw $3, %%mm4 \n\t"
- "psraw $3, %%mm1 \n\t"
- "psraw $3, %%mm7 \n\t"
- WRITEYUY2(%4, %5, %%REGa)
- YSCALEYUV2PACKEDX_END
-}
-
-#define REAL_YSCALEYUV2RGB_UV(index, c) \
- "xor "#index", "#index" \n\t"\
- ".p2align 4 \n\t"\
- "1: \n\t"\
- "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
- "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
- "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
- "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
- "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
- "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
- "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
- "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
- "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
- "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
- "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
- "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
- "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
- "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
- "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
- "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
- "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
- "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
- "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
- "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
- "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
- /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
-
-#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
- "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
- "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
- "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
- "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
- "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
- "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
- "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
- "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
- "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
- "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
- "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
- "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
-
-#define REAL_YSCALEYUV2RGB_COEFF(c) \
- "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
- "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
- "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
- "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
- "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
- "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
- /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
- "paddw %%mm3, %%mm4 \n\t"\
- "movq %%mm2, %%mm0 \n\t"\
- "movq %%mm5, %%mm6 \n\t"\
- "movq %%mm4, %%mm3 \n\t"\
- "punpcklwd %%mm2, %%mm2 \n\t"\
- "punpcklwd %%mm5, %%mm5 \n\t"\
- "punpcklwd %%mm4, %%mm4 \n\t"\
- "paddw %%mm1, %%mm2 \n\t"\
- "paddw %%mm1, %%mm5 \n\t"\
- "paddw %%mm1, %%mm4 \n\t"\
- "punpckhwd %%mm0, %%mm0 \n\t"\
- "punpckhwd %%mm6, %%mm6 \n\t"\
- "punpckhwd %%mm3, %%mm3 \n\t"\
- "paddw %%mm7, %%mm0 \n\t"\
- "paddw %%mm7, %%mm6 \n\t"\
- "paddw %%mm7, %%mm3 \n\t"\
- /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
- "packuswb %%mm0, %%mm2 \n\t"\
- "packuswb %%mm6, %%mm5 \n\t"\
- "packuswb %%mm3, %%mm4 \n\t"\
-
-#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
-
-#define YSCALEYUV2RGB(index, c) \
- REAL_YSCALEYUV2RGB_UV(index, c) \
- REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
- REAL_YSCALEYUV2RGB_COEFF(c)
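-
-/* The two-line vertical interpolation done by YSCALEYUV2RGB before the
- * colorspace multiply, in scalar form (sketch, cf. yuv2packed2_c, where
- * yalpha1 = 4096 - yalpha and uvalpha1 = 4096 - uvalpha):
- *
- *     Y = (buf0[i]  * yalpha1  + buf1[i]  * yalpha ) >> 19;
- *     U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha) >> 19;
- */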
-
-/**
- * vertical bilinear scale YV12 to RGB
- */
-static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
- const int16_t *ubuf[2], const int16_t *vbuf[2],
- const int16_t *abuf[2], uint8_t *dest,
- int dstW, int yalpha, int uvalpha, int y)
-{
- const int16_t *buf0 = buf[0], *buf1 = buf[1],
- *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
-
- if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
- const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
-#if ARCH_X86_64
- __asm__ volatile(
- YSCALEYUV2RGB(%%r8, %5)
- YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
- "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
- "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
- "packuswb %%mm7, %%mm1 \n\t"
- WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
- :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
- "a" (&c->redDither),
- "r" (abuf0), "r" (abuf1)
- : "%r8"
- );
-#else
- c->u_temp=(intptr_t)abuf0;
- c->v_temp=(intptr_t)abuf1;
- __asm__ volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB(%%REGBP, %5)
- "push %0 \n\t"
- "push %1 \n\t"
- "mov "U_TEMP"(%5), %0 \n\t"
- "mov "V_TEMP"(%5), %1 \n\t"
- YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
- "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
- "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
- "packuswb %%mm7, %%mm1 \n\t"
- "pop %1 \n\t"
- "pop %0 \n\t"
- WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
- "a" (&c->redDither)
- );
-#endif
- } else {
- __asm__ volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB(%%REGBP, %5)
- "pcmpeqd %%mm7, %%mm7 \n\t"
- WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
- "a" (&c->redDither)
- );
- }
-}
-
-static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
- const int16_t *ubuf[2], const int16_t *vbuf[2],
- const int16_t *abuf[2], uint8_t *dest,
- int dstW, int yalpha, int uvalpha, int y)
-{
- const int16_t *buf0 = buf[0], *buf1 = buf[1],
- *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
-
- //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
- __asm__ volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB(%%REGBP, %5)
- "pxor %%mm7, %%mm7 \n\t"
- WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
- "a" (&c->redDither)
- );
-}
-
-static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
- const int16_t *ubuf[2], const int16_t *vbuf[2],
- const int16_t *abuf[2], uint8_t *dest,
- int dstW, int yalpha, int uvalpha, int y)
-{
- const int16_t *buf0 = buf[0], *buf1 = buf[1],
- *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
-
- //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
- __asm__ volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB(%%REGBP, %5)
- "pxor %%mm7, %%mm7 \n\t"
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
-#ifdef DITHER1XBPP
- "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
- "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
- "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
-#endif
- WRITERGB15(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
- "a" (&c->redDither)
- );
-}
-
-static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
- const int16_t *ubuf[2], const int16_t *vbuf[2],
- const int16_t *abuf[2], uint8_t *dest,
- int dstW, int yalpha, int uvalpha, int y)
-{
- const int16_t *buf0 = buf[0], *buf1 = buf[1],
- *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
-
- //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
- __asm__ volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB(%%REGBP, %5)
- "pxor %%mm7, %%mm7 \n\t"
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
-#ifdef DITHER1XBPP
- "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
- "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
- "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
-#endif
- WRITERGB16(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
- "a" (&c->redDither)
- );
-}
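
For reference, the packing step WRITERGB16 performs after the optional dither above boils down to the classic 5:6:5 truncation; a minimal scalar sketch (the function name is illustrative):

#include <stdint.h>

/* Truncate 8-bit channels to 5 (R), 6 (G), 5 (B) bits and pack
 * them into one 16-bit pixel, R in the top bits. */
static inline uint16_t pack_rgb565_sketch(uint8_t r, uint8_t g, uint8_t b)
{
    return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}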
-
-#define REAL_YSCALEYUV2PACKED(index, c) \
- "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
- "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
- "psraw $3, %%mm0 \n\t"\
- "psraw $3, %%mm1 \n\t"\
- "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
- "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
- "xor "#index", "#index" \n\t"\
- ".p2align 4 \n\t"\
- "1: \n\t"\
- "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
- "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
- "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
- "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
- "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
- "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
- "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
- "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
- "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
- "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
- "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
- "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
- "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
- "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
- "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
- "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
- "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
- "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
- "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
- "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
- "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
- "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
- "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
- "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
- "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
- "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
- "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
-
-#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
-
-static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
- const int16_t *ubuf[2], const int16_t *vbuf[2],
- const int16_t *abuf[2], uint8_t *dest,
- int dstW, int yalpha, int uvalpha, int y)
-{
- const int16_t *buf0 = buf[0], *buf1 = buf[1],
- *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
-
- //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
- __asm__ volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2PACKED(%%REGBP, %5)
- WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
- "a" (&c->redDither)
- );
-}
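
The WRITEYUY2 macro used above interleaves the planes into the packed YUYV layout, where each horizontal pair of luma samples shares one U and one V sample. A scalar sketch of that layout (names illustrative):

#include <stdint.h>

/* Pack planar Y/U/V into YUYV: Y0 U Y1 V per pixel pair. */
static void pack_yuyv_sketch(uint8_t *dst, const uint8_t *y,
                             const uint8_t *u, const uint8_t *v, int pairs)
{
    int i;
    for (i = 0; i < pairs; i++) {
        dst[4 * i + 0] = y[2 * i];
        dst[4 * i + 1] = u[i];
        dst[4 * i + 2] = y[2 * i + 1];
        dst[4 * i + 3] = v[i];
    }
}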
-
-#define REAL_YSCALEYUV2RGB1(index, c) \
- "xor "#index", "#index" \n\t"\
- ".p2align 4 \n\t"\
- "1: \n\t"\
- "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
- "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
- "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
- "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
- "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
- "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
- "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
- "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
- "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
- "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
- "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
- "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
- /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
- "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
- "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
- "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
- "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
- "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
- "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
- "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
- "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
- "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
- "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
- /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
- "paddw %%mm3, %%mm4 \n\t"\
- "movq %%mm2, %%mm0 \n\t"\
- "movq %%mm5, %%mm6 \n\t"\
- "movq %%mm4, %%mm3 \n\t"\
- "punpcklwd %%mm2, %%mm2 \n\t"\
- "punpcklwd %%mm5, %%mm5 \n\t"\
- "punpcklwd %%mm4, %%mm4 \n\t"\
- "paddw %%mm1, %%mm2 \n\t"\
- "paddw %%mm1, %%mm5 \n\t"\
- "paddw %%mm1, %%mm4 \n\t"\
- "punpckhwd %%mm0, %%mm0 \n\t"\
- "punpckhwd %%mm6, %%mm6 \n\t"\
- "punpckhwd %%mm3, %%mm3 \n\t"\
- "paddw %%mm7, %%mm0 \n\t"\
- "paddw %%mm7, %%mm6 \n\t"\
- "paddw %%mm7, %%mm3 \n\t"\
- /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
- "packuswb %%mm0, %%mm2 \n\t"\
- "packuswb %%mm6, %%mm5 \n\t"\
- "packuswb %%mm3, %%mm4 \n\t"\
-
-#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
-
-// do vertical chrominance interpolation
-#define REAL_YSCALEYUV2RGB1b(index, c) \
- "xor "#index", "#index" \n\t"\
- ".p2align 4 \n\t"\
- "1: \n\t"\
- "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
- "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
- "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
- "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
- "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
- "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
- "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
- "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
- "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
- "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
- "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
- "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
- "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
- "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
- "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
- "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
- /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
- "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
- "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
- "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
- "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
- "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
- "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
- "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
- "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
- "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
- "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
- /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
- "paddw %%mm3, %%mm4 \n\t"\
- "movq %%mm2, %%mm0 \n\t"\
- "movq %%mm5, %%mm6 \n\t"\
- "movq %%mm4, %%mm3 \n\t"\
- "punpcklwd %%mm2, %%mm2 \n\t"\
- "punpcklwd %%mm5, %%mm5 \n\t"\
- "punpcklwd %%mm4, %%mm4 \n\t"\
- "paddw %%mm1, %%mm2 \n\t"\
- "paddw %%mm1, %%mm5 \n\t"\
- "paddw %%mm1, %%mm4 \n\t"\
- "punpckhwd %%mm0, %%mm0 \n\t"\
- "punpckhwd %%mm6, %%mm6 \n\t"\
- "punpckhwd %%mm3, %%mm3 \n\t"\
- "paddw %%mm7, %%mm0 \n\t"\
- "paddw %%mm7, %%mm6 \n\t"\
- "paddw %%mm7, %%mm3 \n\t"\
- /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
- "packuswb %%mm0, %%mm2 \n\t"\
- "packuswb %%mm6, %%mm5 \n\t"\
- "packuswb %%mm3, %%mm4 \n\t"\
-
-#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
-
-#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
- "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
- "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
- "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
- "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
- "packuswb %%mm1, %%mm7 \n\t"
-#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
-
-/**
- * YV12 to RGB without scaling or interpolating
- */
-static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
- const int16_t *ubuf[2], const int16_t *vbuf[2],
- const int16_t *abuf0, uint8_t *dest,
- int dstW, int uvalpha, int y)
-{
- const int16_t *ubuf0 = ubuf[0];
- const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
-
-    if (uvalpha < 2048) { // note: not strictly correct (shifts chrominance by 0.5 pixels), but a bit faster
- const int16_t *ubuf1 = ubuf[0];
- if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
- __asm__ volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB1(%%REGBP, %5)
- YSCALEYUV2RGB1_ALPHA(%%REGBP)
- WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
- "a" (&c->redDither)
- );
- } else {
- __asm__ volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB1(%%REGBP, %5)
- "pcmpeqd %%mm7, %%mm7 \n\t"
- WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
- "a" (&c->redDither)
- );
- }
- } else {
- const int16_t *ubuf1 = ubuf[1];
- if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
- __asm__ volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB1b(%%REGBP, %5)
- YSCALEYUV2RGB1_ALPHA(%%REGBP)
- WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
- "a" (&c->redDither)
- );
- } else {
- __asm__ volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB1b(%%REGBP, %5)
- "pcmpeqd %%mm7, %%mm7 \n\t"
- WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
- "a" (&c->redDither)
- );
- }
- }
-}
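
The chroma handling shared by all the *_1 (unscaled) functions is worth spelling out: below the midpoint weight the code reuses chroma row 0 unchanged (cheap, at the cost of the half-pixel shift the comment admits); otherwise the YSCALEYUV2RGB1b variant averages both rows. A scalar sketch, with shifts mirroring the psraw $4 / paddw + psrlw $5 in the macros (names illustrative):

#include <stdint.h>

static inline int16_t chroma_sample_sketch(const int16_t *u0,
                                           const int16_t *u1,
                                           int i, int uvalpha)
{
    if (uvalpha < 2048)
        return (int16_t)(u0[i] >> 4);        /* nearest row only */
    return (int16_t)((u0[i] + u1[i]) >> 5);  /* average of both rows */
}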
-
-static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
- const int16_t *ubuf[2], const int16_t *vbuf[2],
- const int16_t *abuf0, uint8_t *dest,
- int dstW, int uvalpha, int y)
-{
- const int16_t *ubuf0 = ubuf[0];
- const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
-
-    if (uvalpha < 2048) { // note: not strictly correct (shifts chrominance by 0.5 pixels), but a bit faster
- const int16_t *ubuf1 = ubuf[0];
- __asm__ volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB1(%%REGBP, %5)
- "pxor %%mm7, %%mm7 \n\t"
- WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
- "a" (&c->redDither)
- );
- } else {
- const int16_t *ubuf1 = ubuf[1];
- __asm__ volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB1b(%%REGBP, %5)
- "pxor %%mm7, %%mm7 \n\t"
- WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
- "a" (&c->redDither)
- );
- }
-}
-
-static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
- const int16_t *ubuf[2], const int16_t *vbuf[2],
- const int16_t *abuf0, uint8_t *dest,
- int dstW, int uvalpha, int y)
-{
- const int16_t *ubuf0 = ubuf[0];
- const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
-
-    if (uvalpha < 2048) { // note: not strictly correct (shifts chrominance by 0.5 pixels), but a bit faster
- const int16_t *ubuf1 = ubuf[0];
- __asm__ volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB1(%%REGBP, %5)
- "pxor %%mm7, %%mm7 \n\t"
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
-#ifdef DITHER1XBPP
- "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
- "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
- "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
-#endif
- WRITERGB15(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
- "a" (&c->redDither)
- );
- } else {
- const int16_t *ubuf1 = ubuf[1];
- __asm__ volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB1b(%%REGBP, %5)
- "pxor %%mm7, %%mm7 \n\t"
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
-#ifdef DITHER1XBPP
- "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
- "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
- "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
-#endif
- WRITERGB15(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
- "a" (&c->redDither)
- );
- }
-}
-
-static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
- const int16_t *ubuf[2], const int16_t *vbuf[2],
- const int16_t *abuf0, uint8_t *dest,
- int dstW, int uvalpha, int y)
-{
- const int16_t *ubuf0 = ubuf[0];
- const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
-
-    if (uvalpha < 2048) { // note: not strictly correct (shifts chrominance by 0.5 pixels), but a bit faster
- const int16_t *ubuf1 = ubuf[0];
- __asm__ volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB1(%%REGBP, %5)
- "pxor %%mm7, %%mm7 \n\t"
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
-#ifdef DITHER1XBPP
- "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
- "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
- "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
-#endif
- WRITERGB16(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
- "a" (&c->redDither)
- );
- } else {
- const int16_t *ubuf1 = ubuf[1];
- __asm__ volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB1b(%%REGBP, %5)
- "pxor %%mm7, %%mm7 \n\t"
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
-#ifdef DITHER1XBPP
- "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
- "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
- "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
-#endif
- WRITERGB16(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
- "a" (&c->redDither)
- );
- }
-}
-
-#define REAL_YSCALEYUV2PACKED1(index, c) \
- "xor "#index", "#index" \n\t"\
- ".p2align 4 \n\t"\
- "1: \n\t"\
- "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
- "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
- "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
- "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
- "psraw $7, %%mm3 \n\t" \
- "psraw $7, %%mm4 \n\t" \
- "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
- "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
- "psraw $7, %%mm1 \n\t" \
- "psraw $7, %%mm7 \n\t" \
-
-#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
-
-#define REAL_YSCALEYUV2PACKED1b(index, c) \
- "xor "#index", "#index" \n\t"\
- ".p2align 4 \n\t"\
- "1: \n\t"\
- "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
- "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
- "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
- "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
- "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
- "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
- "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
- "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
- "psrlw $8, %%mm3 \n\t" \
- "psrlw $8, %%mm4 \n\t" \
- "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
- "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
- "psraw $7, %%mm1 \n\t" \
- "psraw $7, %%mm7 \n\t"
-#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
-
-static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
- const int16_t *ubuf[2], const int16_t *vbuf[2],
- const int16_t *abuf0, uint8_t *dest,
- int dstW, int uvalpha, int y)
-{
- const int16_t *ubuf0 = ubuf[0];
- const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
-
-    if (uvalpha < 2048) { // note: not strictly correct (shifts chrominance by 0.5 pixels), but a bit faster
- const int16_t *ubuf1 = ubuf[0];
- __asm__ volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2PACKED1(%%REGBP, %5)
- WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
- "a" (&c->redDither)
- );
- } else {
- const int16_t *ubuf1 = ubuf[1];
- __asm__ volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2PACKED1b(%%REGBP, %5)
- WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
- "a" (&c->redDither)
- );
- }
-}
-
-#if COMPILE_TEMPLATE_MMXEXT
-static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
- int dstWidth, const uint8_t *src,
- int srcW, int xInc)
-{
- int32_t *filterPos = c->hLumFilterPos;
- int16_t *filter = c->hLumFilter;
- void *mmxextFilterCode = c->lumMmxextFilterCode;
- int i;
-#if defined(PIC)
- uint64_t ebxsave;
-#endif
-#if ARCH_X86_64
- uint64_t retsave;
-#endif
-
- __asm__ volatile(
-#if defined(PIC)
- "mov %%"REG_b", %5 \n\t"
-#if ARCH_X86_64
- "mov -8(%%rsp), %%"REG_a" \n\t"
- "mov %%"REG_a", %6 \n\t"
-#endif
-#else
-#if ARCH_X86_64
- "mov -8(%%rsp), %%"REG_a" \n\t"
- "mov %%"REG_a", %5 \n\t"
-#endif
-#endif
- "pxor %%mm7, %%mm7 \n\t"
- "mov %0, %%"REG_c" \n\t"
- "mov %1, %%"REG_D" \n\t"
- "mov %2, %%"REG_d" \n\t"
- "mov %3, %%"REG_b" \n\t"
- "xor %%"REG_a", %%"REG_a" \n\t" // i
- PREFETCH" (%%"REG_c") \n\t"
- PREFETCH" 32(%%"REG_c") \n\t"
- PREFETCH" 64(%%"REG_c") \n\t"
-
-#if ARCH_X86_64
-#define CALL_MMXEXT_FILTER_CODE \
- "movl (%%"REG_b"), %%esi \n\t"\
- "call *%4 \n\t"\
- "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
- "add %%"REG_S", %%"REG_c" \n\t"\
- "add %%"REG_a", %%"REG_D" \n\t"\
- "xor %%"REG_a", %%"REG_a" \n\t"\
-
-#else
-#define CALL_MMXEXT_FILTER_CODE \
- "movl (%%"REG_b"), %%esi \n\t"\
- "call *%4 \n\t"\
- "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
- "add %%"REG_a", %%"REG_D" \n\t"\
- "xor %%"REG_a", %%"REG_a" \n\t"\
-
-#endif /* ARCH_X86_64 */
-
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
-
-#if defined(PIC)
- "mov %5, %%"REG_b" \n\t"
-#if ARCH_X86_64
- "mov %6, %%"REG_a" \n\t"
- "mov %%"REG_a", -8(%%rsp) \n\t"
-#endif
-#else
-#if ARCH_X86_64
- "mov %5, %%"REG_a" \n\t"
- "mov %%"REG_a", -8(%%rsp) \n\t"
-#endif
-#endif
- :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
- "m" (mmxextFilterCode)
-#if defined(PIC)
- ,"m" (ebxsave)
-#endif
-#if ARCH_X86_64
- ,"m"(retsave)
-#endif
- : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
-#if !defined(PIC)
- ,"%"REG_b
-#endif
- );
-
- for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
- dst[i] = src[srcW-1]*128;
-}
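
Roughly, the runtime-generated filter code called above computes a fast bilinear blend; the sketch below mirrors swscale's scalar fallback for this path, not the generated code itself. xpos walks the source in 16.16 fixed point, and each output is a 7-bit-weighted mix of two neighbouring pixels, which is why the edge fixup above multiplies by 128 (== 1 << 7). hcscale_fast below does the same over the two chroma planes.

#include <stdint.h>

static void hyscale_fast_sketch(int16_t *dst, int dstWidth,
                                const uint8_t *src, int xInc)
{
    unsigned int xpos = 0;
    int i;
    for (i = 0; i < dstWidth; i++) {
        unsigned int xx     = xpos >> 16;           /* source index   */
        unsigned int xalpha = (xpos & 0xFFFF) >> 9; /* weight, 0..127 */
        dst[i] = (int16_t)((src[xx] << 7) +
                           (src[xx + 1] - src[xx]) * xalpha);
        xpos += xInc;
    }
    /* a trailing fixup loop, like the one after the asm above, must
     * patch outputs that would read past the end of src */
}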
-
-static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
- int dstWidth, const uint8_t *src1,
- const uint8_t *src2, int srcW, int xInc)
-{
- int32_t *filterPos = c->hChrFilterPos;
- int16_t *filter = c->hChrFilter;
- void *mmxextFilterCode = c->chrMmxextFilterCode;
- int i;
-#if defined(PIC)
- DECLARE_ALIGNED(8, uint64_t, ebxsave);
-#endif
-#if ARCH_X86_64
- DECLARE_ALIGNED(8, uint64_t, retsave);
-#endif
-
- __asm__ volatile(
-#if defined(PIC)
- "mov %%"REG_b", %7 \n\t"
-#if ARCH_X86_64
- "mov -8(%%rsp), %%"REG_a" \n\t"
- "mov %%"REG_a", %8 \n\t"
-#endif
-#else
-#if ARCH_X86_64
- "mov -8(%%rsp), %%"REG_a" \n\t"
- "mov %%"REG_a", %7 \n\t"
-#endif
-#endif
- "pxor %%mm7, %%mm7 \n\t"
- "mov %0, %%"REG_c" \n\t"
- "mov %1, %%"REG_D" \n\t"
- "mov %2, %%"REG_d" \n\t"
- "mov %3, %%"REG_b" \n\t"
- "xor %%"REG_a", %%"REG_a" \n\t" // i
- PREFETCH" (%%"REG_c") \n\t"
- PREFETCH" 32(%%"REG_c") \n\t"
- PREFETCH" 64(%%"REG_c") \n\t"
-
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- "xor %%"REG_a", %%"REG_a" \n\t" // i
- "mov %5, %%"REG_c" \n\t" // src
- "mov %6, %%"REG_D" \n\t" // buf2
- PREFETCH" (%%"REG_c") \n\t"
- PREFETCH" 32(%%"REG_c") \n\t"
- PREFETCH" 64(%%"REG_c") \n\t"
-
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
-
-#if defined(PIC)
- "mov %7, %%"REG_b" \n\t"
-#if ARCH_X86_64
- "mov %8, %%"REG_a" \n\t"
- "mov %%"REG_a", -8(%%rsp) \n\t"
-#endif
-#else
-#if ARCH_X86_64
- "mov %7, %%"REG_a" \n\t"
- "mov %%"REG_a", -8(%%rsp) \n\t"
-#endif
-#endif
- :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
- "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
-#if defined(PIC)
- ,"m" (ebxsave)
-#endif
-#if ARCH_X86_64
- ,"m"(retsave)
-#endif
- : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
-#if !defined(PIC)
- ,"%"REG_b
-#endif
- );
-
- for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
- dst1[i] = src1[srcW-1]*128;
- dst2[i] = src2[srcW-1]*128;
- }
-}
-#endif /* COMPILE_TEMPLATE_MMXEXT */
-
-static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
-{
- enum AVPixelFormat dstFormat = c->dstFormat;
-
- c->use_mmx_vfilter= 0;
- if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12
- && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
- if (c->flags & SWS_ACCURATE_RND) {
- if (!(c->flags & SWS_FULL_CHR_H_INT)) {
- switch (c->dstFormat) {
- case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
- case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
- case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
- case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
- case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
- default: break;
- }
- }
- } else {
- c->use_mmx_vfilter= 1;
-            c->yuv2planeX = RENAME(yuv2yuvX);
- if (!(c->flags & SWS_FULL_CHR_H_INT)) {
- switch (c->dstFormat) {
- case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
- case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
- case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
- case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
- case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
- default: break;
- }
- }
- }
- if (!(c->flags & SWS_FULL_CHR_H_INT)) {
- switch (c->dstFormat) {
- case AV_PIX_FMT_RGB32:
- c->yuv2packed1 = RENAME(yuv2rgb32_1);
- c->yuv2packed2 = RENAME(yuv2rgb32_2);
- break;
- case AV_PIX_FMT_BGR24:
- c->yuv2packed1 = RENAME(yuv2bgr24_1);
- c->yuv2packed2 = RENAME(yuv2bgr24_2);
- break;
- case AV_PIX_FMT_RGB555:
- c->yuv2packed1 = RENAME(yuv2rgb555_1);
- c->yuv2packed2 = RENAME(yuv2rgb555_2);
- break;
- case AV_PIX_FMT_RGB565:
- c->yuv2packed1 = RENAME(yuv2rgb565_1);
- c->yuv2packed2 = RENAME(yuv2rgb565_2);
- break;
- case AV_PIX_FMT_YUYV422:
- c->yuv2packed1 = RENAME(yuv2yuyv422_1);
- c->yuv2packed2 = RENAME(yuv2yuyv422_2);
- break;
- default:
- break;
- }
- }
- }
-
- if (c->srcBpc == 8 && c->dstBpc <= 14) {
-        // Use the new MMX scaler when the MMXEXT one cannot be used (it is still faster than the old x86 ASM one).
-#if COMPILE_TEMPLATE_MMXEXT
- if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
- c->hyscale_fast = RENAME(hyscale_fast);
- c->hcscale_fast = RENAME(hcscale_fast);
- } else {
-#endif /* COMPILE_TEMPLATE_MMXEXT */
- c->hyscale_fast = NULL;
- c->hcscale_fast = NULL;
-#if COMPILE_TEMPLATE_MMXEXT
- }
-#endif /* COMPILE_TEMPLATE_MMXEXT */
- }
-}
diff --git a/ffmpeg1/libswscale/x86/w64xmmtest.c b/ffmpeg1/libswscale/x86/w64xmmtest.c
deleted file mode 100644
index dd9a2a4..0000000
--- a/ffmpeg1/libswscale/x86/w64xmmtest.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * check XMM registers for clobbers on Win64
- * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/x86/w64xmmtest.h"
-#include "libswscale/swscale.h"
-
-wrap(sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
- const int srcStride[], int srcSliceY, int srcSliceH,
- uint8_t *const dst[], const int dstStride[]))
-{
- testxmmclobbers(sws_scale, c, srcSlice, srcStride, srcSliceY,
- srcSliceH, dst, dstStride);
-}
diff --git a/ffmpeg1/libswscale/x86/yuv2rgb.c b/ffmpeg1/libswscale/x86/yuv2rgb.c
deleted file mode 100644
index 3938e6b..0000000
--- a/ffmpeg1/libswscale/x86/yuv2rgb.c
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * software YUV to RGB converter
- *
- * Copyright (C) 2009 Konstantin Shishkov
- *
- * MMX/MMXEXT template stuff (needed for fast movntq support),
- * 1,4,8bpp support and context / deglobalize stuff
- * by Michael Niedermayer (michaelni@gmx.at)
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <inttypes.h>
-#include <assert.h>
-
-#include "config.h"
-#include "libswscale/rgb2rgb.h"
-#include "libswscale/swscale.h"
-#include "libswscale/swscale_internal.h"
-#include "libavutil/attributes.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/cpu.h"
-
-#if HAVE_INLINE_ASM
-
-#define DITHER1XBPP // only for MMX
-
-/* hope these constant values are cache line aligned */
-DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw) = 0x00ff00ff00ff00ffULL;
-DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL;
-DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL;
-DECLARE_ASM_CONST(8, uint64_t, pb_e0) = 0xe0e0e0e0e0e0e0e0ULL;
-DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL;
-DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;
-
-//MMX versions
-#if HAVE_MMX_INLINE
-#undef RENAME
-#undef COMPILE_TEMPLATE_MMXEXT
-#define COMPILE_TEMPLATE_MMXEXT 0
-#define RENAME(a) a ## _MMX
-#include "yuv2rgb_template.c"
-#endif /* HAVE_MMX_INLINE */
-
-// MMXEXT versions
-#if HAVE_MMXEXT_INLINE
-#undef RENAME
-#undef COMPILE_TEMPLATE_MMXEXT
-#define COMPILE_TEMPLATE_MMXEXT 1
-#define RENAME(a) a ## _MMXEXT
-#include "yuv2rgb_template.c"
-#endif /* HAVE_MMXEXT_INLINE */
-
-#endif /* HAVE_INLINE_ASM */
-
-av_cold SwsFunc ff_yuv2rgb_init_mmx(SwsContext *c)
-{
-#if HAVE_INLINE_ASM
- int cpu_flags = av_get_cpu_flags();
-
-#if HAVE_MMXEXT_INLINE
- if (cpu_flags & AV_CPU_FLAG_MMXEXT) {
- switch (c->dstFormat) {
- case AV_PIX_FMT_RGB24:
- return yuv420_rgb24_MMXEXT;
- case AV_PIX_FMT_BGR24:
- return yuv420_bgr24_MMXEXT;
- }
- }
-#endif
-
- if (cpu_flags & AV_CPU_FLAG_MMX) {
- switch (c->dstFormat) {
- case AV_PIX_FMT_RGB32:
- if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
-#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
- return yuva420_rgb32_MMX;
-#endif
- break;
- } else return yuv420_rgb32_MMX;
- case AV_PIX_FMT_BGR32:
- if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
-#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
- return yuva420_bgr32_MMX;
-#endif
- break;
- } else return yuv420_bgr32_MMX;
- case AV_PIX_FMT_RGB24: return yuv420_rgb24_MMX;
- case AV_PIX_FMT_BGR24: return yuv420_bgr24_MMX;
- case AV_PIX_FMT_RGB565: return yuv420_rgb16_MMX;
- case AV_PIX_FMT_RGB555: return yuv420_rgb15_MMX;
- }
- }
-#endif /* HAVE_INLINE_ASM */
-
- return NULL;
-}
diff --git a/ffmpeg1/libswscale/x86/yuv2rgb_template.c b/ffmpeg1/libswscale/x86/yuv2rgb_template.c
deleted file mode 100644
index c879102..0000000
--- a/ffmpeg1/libswscale/x86/yuv2rgb_template.c
+++ /dev/null
@@ -1,451 +0,0 @@
-/*
- * software YUV to RGB converter
- *
- * Copyright (C) 2001-2007 Michael Niedermayer
- * (c) 2010 Konstantin Shishkov
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#undef MOVNTQ
-#undef EMMS
-#undef SFENCE
-
-#if COMPILE_TEMPLATE_MMXEXT
-#define MOVNTQ "movntq"
-#define SFENCE "sfence"
-#else
-#define MOVNTQ "movq"
-#define SFENCE " # nop"
-#endif
-
-#define REG_BLUE "0"
-#define REG_RED "1"
-#define REG_GREEN "2"
-#define REG_ALPHA "3"
-
-#define YUV2RGB_LOOP(depth) \
- h_size = (c->dstW + 7) & ~7; \
- if (h_size * depth > FFABS(dstStride[0])) \
- h_size -= 8; \
- \
- vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \
- \
- __asm__ volatile ("pxor %mm4, %mm4\n\t"); \
- for (y = 0; y < srcSliceH; y++) { \
- uint8_t *image = dst[0] + (y + srcSliceY) * dstStride[0]; \
- const uint8_t *py = src[0] + y * srcStride[0]; \
- const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \
- const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \
- x86_reg index = -h_size / 2; \
-
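
The width clamp at the top of YUV2RGB_LOOP deserves a note: the MMX loop emits 8 pixels per iteration, so dstW is rounded up to a multiple of 8, then backed off one group if that would overrun the output stride. As a standalone sketch (name illustrative):

#include <stdlib.h>

static int clamp_h_size_sketch(int dstW, int depth, int dstStride0)
{
    int h_size = (dstW + 7) & ~7;         /* e.g. 100 -> 104 */
    if (h_size * depth > abs(dstStride0))
        h_size -= 8;                      /* stay inside the row */
    return h_size;
}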
-#define YUV2RGB_INITIAL_LOAD \
- __asm__ volatile ( \
- "movq (%5, %0, 2), %%mm6\n\t" \
- "movd (%2, %0), %%mm0\n\t" \
- "movd (%3, %0), %%mm1\n\t" \
- "1: \n\t" \
-
-/* YUV2RGB core
- * Conversion is performed in usual way:
- * R = Y' * Ycoef + Vred * V'
- * G = Y' * Ycoef + Vgreen * V' + Ugreen * U'
- * B = Y' * Ycoef + Ublue * U'
- *
- * where X' = X * 8 - Xoffset (multiplication is performed to increase
- * precision a bit).
- * Since it operates in YUV420 colorspace, Y component is additionally
- * split into Y1 and Y2 for even and odd pixels.
- *
- * Input:
- * mm0 - U (4 elems), mm1 - V (4 elems), mm6 - Y (8 elems), mm4 - zero register
- * Output:
- * mm1 - R, mm2 - G, mm0 - B
- */
-#define YUV2RGB \
- /* convert Y, U, V into Y1', Y2', U', V' */ \
- "movq %%mm6, %%mm7\n\t" \
- "punpcklbw %%mm4, %%mm0\n\t" \
- "punpcklbw %%mm4, %%mm1\n\t" \
- "pand "MANGLE(mmx_00ffw)", %%mm6\n\t" \
- "psrlw $8, %%mm7\n\t" \
- "psllw $3, %%mm0\n\t" \
- "psllw $3, %%mm1\n\t" \
- "psllw $3, %%mm6\n\t" \
- "psllw $3, %%mm7\n\t" \
- "psubsw "U_OFFSET"(%4), %%mm0\n\t" \
- "psubsw "V_OFFSET"(%4), %%mm1\n\t" \
- "psubw "Y_OFFSET"(%4), %%mm6\n\t" \
- "psubw "Y_OFFSET"(%4), %%mm7\n\t" \
-\
- /* multiply by coefficients */ \
- "movq %%mm0, %%mm2\n\t" \
- "movq %%mm1, %%mm3\n\t" \
- "pmulhw "UG_COEFF"(%4), %%mm2\n\t" \
- "pmulhw "VG_COEFF"(%4), %%mm3\n\t" \
- "pmulhw "Y_COEFF" (%4), %%mm6\n\t" \
- "pmulhw "Y_COEFF" (%4), %%mm7\n\t" \
- "pmulhw "UB_COEFF"(%4), %%mm0\n\t" \
- "pmulhw "VR_COEFF"(%4), %%mm1\n\t" \
- "paddsw %%mm3, %%mm2\n\t" \
- /* now: mm0 = UB, mm1 = VR, mm2 = CG */ \
- /* mm6 = Y1, mm7 = Y2 */ \
-\
- /* produce RGB */ \
- "movq %%mm7, %%mm3\n\t" \
- "movq %%mm7, %%mm5\n\t" \
- "paddsw %%mm0, %%mm3\n\t" \
- "paddsw %%mm1, %%mm5\n\t" \
- "paddsw %%mm2, %%mm7\n\t" \
- "paddsw %%mm6, %%mm0\n\t" \
- "paddsw %%mm6, %%mm1\n\t" \
- "paddsw %%mm6, %%mm2\n\t" \
-
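
A scalar rendering of the fixed-point math above, following the formulas in the core comment (pmulhw(a, b) == (a * b) >> 16). The struct is a hypothetical stand-in for the SwsContext coefficient tables the asm reads via %4 (Y_COEFF, U_OFFSET, ...); the saturation the asm gets from psubsw/paddsw is omitted here.

typedef struct {
    int y_off, u_off, v_off;      /* hypothetical stand-ins for  */
    int y_coef, ub, ug, vg, vr;   /* the context's offset tables */
} yuv2rgb_tab_sketch;

static void yuv2rgb_pixel_sketch(int y, int u, int v,
                                 const yuv2rgb_tab_sketch *t,
                                 int *r, int *g, int *b)
{
    int y1 = (((y << 3) - t->y_off) * t->y_coef) >> 16; /* Y' * Ycoef */
    int u1 = (u << 3) - t->u_off;                       /* U'         */
    int v1 = (v << 3) - t->v_off;                       /* V'         */

    *r = y1 + ((v1 * t->vr) >> 16);
    *g = y1 + ((u1 * t->ug) >> 16) + ((v1 * t->vg) >> 16);
    *b = y1 + ((u1 * t->ub) >> 16);
}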
-#define RGB_PACK_INTERLEAVE \
- /* pack and interleave even/odd pixels */ \
- "packuswb %%mm1, %%mm0\n\t" \
- "packuswb %%mm5, %%mm3\n\t" \
- "packuswb %%mm2, %%mm2\n\t" \
- "movq %%mm0, %%mm1\n\n" \
- "packuswb %%mm7, %%mm7\n\t" \
- "punpcklbw %%mm3, %%mm0\n\t" \
- "punpckhbw %%mm3, %%mm1\n\t" \
- "punpcklbw %%mm7, %%mm2\n\t" \
-
-#define YUV2RGB_ENDLOOP(depth) \
- "movq 8 (%5, %0, 2), %%mm6\n\t" \
- "movd 4 (%3, %0), %%mm1\n\t" \
- "movd 4 (%2, %0), %%mm0\n\t" \
- "add $"AV_STRINGIFY(depth * 8)", %1\n\t" \
- "add $4, %0\n\t" \
- "js 1b\n\t" \
-
-#define YUV2RGB_OPERANDS \
- : "+r" (index), "+r" (image) \
- : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
- "r" (py - 2*index) \
- : "memory" \
- ); \
- } \
-
-#define YUV2RGB_OPERANDS_ALPHA \
- : "+r" (index), "+r" (image) \
- : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
- "r" (py - 2*index), "r" (pa - 2*index) \
- : "memory" \
- ); \
- } \
-
-#define YUV2RGB_ENDFUNC \
- __asm__ volatile (SFENCE"\n\t" \
- "emms \n\t"); \
- return srcSliceH; \
-
-#define IF0(x)
-#define IF1(x) x
-
-#define RGB_PACK16(gmask, is15) \
- "pand "MANGLE(mmx_redmask)", %%mm0\n\t" \
- "pand "MANGLE(mmx_redmask)", %%mm1\n\t" \
- "movq %%mm2, %%mm3\n\t" \
- "psllw $"AV_STRINGIFY(3-is15)", %%mm2\n\t" \
- "psrlw $"AV_STRINGIFY(5+is15)", %%mm3\n\t" \
- "psrlw $3, %%mm0\n\t" \
- IF##is15("psrlw $1, %%mm1\n\t") \
- "pand "MANGLE(pb_e0)", %%mm2\n\t" \
- "pand "MANGLE(gmask)", %%mm3\n\t" \
- "por %%mm2, %%mm0\n\t" \
- "por %%mm3, %%mm1\n\t" \
- "movq %%mm0, %%mm2\n\t" \
- "punpcklbw %%mm1, %%mm0\n\t" \
- "punpckhbw %%mm1, %%mm2\n\t" \
- MOVNTQ " %%mm0, (%1)\n\t" \
- MOVNTQ " %%mm2, 8(%1)\n\t" \
-
-#define DITHER_RGB \
- "paddusb "BLUE_DITHER"(%4), %%mm0\n\t" \
- "paddusb "GREEN_DITHER"(%4), %%mm2\n\t" \
- "paddusb "RED_DITHER"(%4), %%mm1\n\t" \
-
-#if !COMPILE_TEMPLATE_MMXEXT
-static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
- int srcStride[],
- int srcSliceY, int srcSliceH,
- uint8_t *dst[], int dstStride[])
-{
- int y, h_size, vshift;
-
- YUV2RGB_LOOP(2)
-
-#ifdef DITHER1XBPP
- c->blueDither = ff_dither8[y & 1];
- c->greenDither = ff_dither8[y & 1];
- c->redDither = ff_dither8[(y + 1) & 1];
-#endif
-
- YUV2RGB_INITIAL_LOAD
- YUV2RGB
- RGB_PACK_INTERLEAVE
-#ifdef DITHER1XBPP
- DITHER_RGB
-#endif
- RGB_PACK16(pb_03, 1)
-
- YUV2RGB_ENDLOOP(2)
- YUV2RGB_OPERANDS
- YUV2RGB_ENDFUNC
-}
-
-static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
- int srcStride[],
- int srcSliceY, int srcSliceH,
- uint8_t *dst[], int dstStride[])
-{
- int y, h_size, vshift;
-
- YUV2RGB_LOOP(2)
-
-#ifdef DITHER1XBPP
- c->blueDither = ff_dither8[y & 1];
- c->greenDither = ff_dither4[y & 1];
- c->redDither = ff_dither8[(y + 1) & 1];
-#endif
-
- YUV2RGB_INITIAL_LOAD
- YUV2RGB
- RGB_PACK_INTERLEAVE
-#ifdef DITHER1XBPP
- DITHER_RGB
-#endif
- RGB_PACK16(pb_07, 0)
-
- YUV2RGB_ENDLOOP(2)
- YUV2RGB_OPERANDS
- YUV2RGB_ENDFUNC
-}
-#endif /* !COMPILE_TEMPLATE_MMXEXT */
-
-#define RGB_PACK24(blue, red)\
- "packuswb %%mm3, %%mm0 \n" /* R0 R2 R4 R6 R1 R3 R5 R7 */\
- "packuswb %%mm5, %%mm1 \n" /* B0 B2 B4 B6 B1 B3 B5 B7 */\
- "packuswb %%mm7, %%mm2 \n" /* G0 G2 G4 G6 G1 G3 G5 G7 */\
- "movq %%mm"red", %%mm3 \n"\
- "movq %%mm"blue", %%mm6 \n"\
- "psrlq $32, %%mm"red" \n" /* R1 R3 R5 R7 */\
- "punpcklbw %%mm2, %%mm3 \n" /* R0 G0 R2 G2 R4 G4 R6 G6 */\
- "punpcklbw %%mm"red", %%mm6 \n" /* B0 R1 B2 R3 B4 R5 B6 R7 */\
- "movq %%mm3, %%mm5 \n"\
- "punpckhbw %%mm"blue", %%mm2 \n" /* G1 B1 G3 B3 G5 B5 G7 B7 */\
- "punpcklwd %%mm6, %%mm3 \n" /* R0 G0 B0 R1 R2 G2 B2 R3 */\
- "punpckhwd %%mm6, %%mm5 \n" /* R4 G4 B4 R5 R6 G6 B6 R7 */\
- RGB_PACK24_B
-
-#if COMPILE_TEMPLATE_MMXEXT
-DECLARE_ASM_CONST(8, int16_t, mask1101[4]) = {-1,-1, 0,-1};
-DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0};
-DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0};
-DECLARE_ASM_CONST(8, int16_t, mask1001[4]) = {-1, 0, 0,-1};
-DECLARE_ASM_CONST(8, int16_t, mask0100[4]) = { 0,-1, 0, 0};
-#undef RGB_PACK24_B
-#define RGB_PACK24_B\
- "pshufw $0xc6, %%mm2, %%mm1 \n"\
- "pshufw $0x84, %%mm3, %%mm6 \n"\
- "pshufw $0x38, %%mm5, %%mm7 \n"\
- "pand "MANGLE(mask1101)", %%mm6 \n" /* R0 G0 B0 R1 -- -- R2 G2 */\
- "movq %%mm1, %%mm0 \n"\
- "pand "MANGLE(mask0110)", %%mm7 \n" /* -- -- R6 G6 B6 R7 -- -- */\
- "movq %%mm1, %%mm2 \n"\
- "pand "MANGLE(mask0100)", %%mm1 \n" /* -- -- G3 B3 -- -- -- -- */\
- "psrlq $48, %%mm3 \n" /* B2 R3 -- -- -- -- -- -- */\
- "pand "MANGLE(mask0010)", %%mm0 \n" /* -- -- -- -- G1 B1 -- -- */\
- "psllq $32, %%mm5 \n" /* -- -- -- -- R4 G4 B4 R5 */\
- "pand "MANGLE(mask1001)", %%mm2 \n" /* G5 B5 -- -- -- -- G7 B7 */\
- "por %%mm3, %%mm1 \n"\
- "por %%mm6, %%mm0 \n"\
- "por %%mm5, %%mm1 \n"\
- "por %%mm7, %%mm2 \n"\
- MOVNTQ" %%mm0, (%1) \n"\
- MOVNTQ" %%mm1, 8(%1) \n"\
- MOVNTQ" %%mm2, 16(%1) \n"\
-
-#else
-#undef RGB_PACK24_B
-#define RGB_PACK24_B\
- "movd %%mm3, (%1) \n" /* R0 G0 B0 R1 */\
- "movd %%mm2, 4(%1) \n" /* G1 B1 */\
- "psrlq $32, %%mm3 \n"\
- "psrlq $16, %%mm2 \n"\
- "movd %%mm3, 6(%1) \n" /* R2 G2 B2 R3 */\
- "movd %%mm2, 10(%1) \n" /* G3 B3 */\
- "psrlq $16, %%mm2 \n"\
- "movd %%mm5, 12(%1) \n" /* R4 G4 B4 R5 */\
- "movd %%mm2, 16(%1) \n" /* G5 B5 */\
- "psrlq $32, %%mm5 \n"\
- "movd %%mm2, 20(%1) \n" /* -- -- G7 B7 */\
- "movd %%mm5, 18(%1) \n" /* R6 G6 B6 R7 */\
-
-#endif
-
-static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
- int srcStride[],
- int srcSliceY, int srcSliceH,
- uint8_t *dst[], int dstStride[])
-{
- int y, h_size, vshift;
-
- YUV2RGB_LOOP(3)
-
- YUV2RGB_INITIAL_LOAD
- YUV2RGB
- RGB_PACK24(REG_BLUE, REG_RED)
-
- YUV2RGB_ENDLOOP(3)
- YUV2RGB_OPERANDS
- YUV2RGB_ENDFUNC
-}
-
-static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
- int srcStride[],
- int srcSliceY, int srcSliceH,
- uint8_t *dst[], int dstStride[])
-{
- int y, h_size, vshift;
-
- YUV2RGB_LOOP(3)
-
- YUV2RGB_INITIAL_LOAD
- YUV2RGB
- RGB_PACK24(REG_RED, REG_BLUE)
-
- YUV2RGB_ENDLOOP(3)
- YUV2RGB_OPERANDS
- YUV2RGB_ENDFUNC
-}
-
-
-#define SET_EMPTY_ALPHA \
- "pcmpeqd %%mm"REG_ALPHA", %%mm"REG_ALPHA"\n\t" /* set alpha to 0xFF */ \
-
-#define LOAD_ALPHA \
- "movq (%6, %0, 2), %%mm"REG_ALPHA"\n\t" \
-
-#define RGB_PACK32(red, green, blue, alpha) \
- "movq %%mm"blue", %%mm5\n\t" \
- "movq %%mm"red", %%mm6\n\t" \
- "punpckhbw %%mm"green", %%mm5\n\t" \
- "punpcklbw %%mm"green", %%mm"blue"\n\t" \
- "punpckhbw %%mm"alpha", %%mm6\n\t" \
- "punpcklbw %%mm"alpha", %%mm"red"\n\t" \
- "movq %%mm"blue", %%mm"green"\n\t" \
- "movq %%mm5, %%mm"alpha"\n\t" \
- "punpcklwd %%mm"red", %%mm"blue"\n\t" \
- "punpckhwd %%mm"red", %%mm"green"\n\t" \
- "punpcklwd %%mm6, %%mm5\n\t" \
- "punpckhwd %%mm6, %%mm"alpha"\n\t" \
- MOVNTQ " %%mm"blue", 0(%1)\n\t" \
- MOVNTQ " %%mm"green", 8(%1)\n\t" \
- MOVNTQ " %%mm5, 16(%1)\n\t" \
- MOVNTQ " %%mm"alpha", 24(%1)\n\t" \
-
-#if !COMPILE_TEMPLATE_MMXEXT
-static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
- int srcStride[],
- int srcSliceY, int srcSliceH,
- uint8_t *dst[], int dstStride[])
-{
- int y, h_size, vshift;
-
- YUV2RGB_LOOP(4)
-
- YUV2RGB_INITIAL_LOAD
- YUV2RGB
- RGB_PACK_INTERLEAVE
- SET_EMPTY_ALPHA
- RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
-
- YUV2RGB_ENDLOOP(4)
- YUV2RGB_OPERANDS
- YUV2RGB_ENDFUNC
-}
-
-#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
-static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
- int srcStride[],
- int srcSliceY, int srcSliceH,
- uint8_t *dst[], int dstStride[])
-{
- int y, h_size, vshift;
-
- YUV2RGB_LOOP(4)
-
- const uint8_t *pa = src[3] + y * srcStride[3];
- YUV2RGB_INITIAL_LOAD
- YUV2RGB
- RGB_PACK_INTERLEAVE
- LOAD_ALPHA
- RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
-
- YUV2RGB_ENDLOOP(4)
- YUV2RGB_OPERANDS_ALPHA
- YUV2RGB_ENDFUNC
-}
-#endif
-
-static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
- int srcStride[],
- int srcSliceY, int srcSliceH,
- uint8_t *dst[], int dstStride[])
-{
- int y, h_size, vshift;
-
- YUV2RGB_LOOP(4)
-
- YUV2RGB_INITIAL_LOAD
- YUV2RGB
- RGB_PACK_INTERLEAVE
- SET_EMPTY_ALPHA
- RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
-
- YUV2RGB_ENDLOOP(4)
- YUV2RGB_OPERANDS
- YUV2RGB_ENDFUNC
-}
-
-#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
-static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
- int srcStride[],
- int srcSliceY, int srcSliceH,
- uint8_t *dst[], int dstStride[])
-{
- int y, h_size, vshift;
-
- YUV2RGB_LOOP(4)
-
- const uint8_t *pa = src[3] + y * srcStride[3];
- YUV2RGB_INITIAL_LOAD
- YUV2RGB
- RGB_PACK_INTERLEAVE
- LOAD_ALPHA
- RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
-
- YUV2RGB_ENDLOOP(4)
- YUV2RGB_OPERANDS_ALPHA
- YUV2RGB_ENDFUNC
-}
-#endif
-
-#endif /* !COMPILE_TEMPLATE_MMXEXT */