11 files changed, 7069 insertions, 0 deletions
diff --git a/ffmpeg/libswscale/x86/Makefile b/ffmpeg/libswscale/x86/Makefile
new file mode 100644
index 0000000..7d219b4
--- /dev/null
+++ b/ffmpeg/libswscale/x86/Makefile
@@ -0,0 +1,11 @@
+$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
+
+OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
+
+MMX-OBJS                        += x86/rgb2rgb.o                        \
+                                   x86/swscale.o                        \
+                                   x86/yuv2rgb.o                        \
+
+YASM-OBJS                       += x86/input.o                          \
+                                   x86/output.o                         \
+                                   x86/scale.o                          \
diff --git a/ffmpeg/libswscale/x86/input.asm b/ffmpeg/libswscale/x86/input.asm
new file mode 100644
index 0000000..9d5a871
--- /dev/null
+++ b/ffmpeg/libswscale/x86/input.asm
@@ -0,0 +1,670 @@
+;******************************************************************************
+;* x86-optimized input routines; does shuffling of packed
+;* YUV formats into individual planes, and converts RGB
+;* into YUV planes also.
+;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+%define RY 0x20DE
+%define GY 0x4087
+%define BY 0x0C88
+%define RU 0xECFF
+%define GU 0xDAC8
+%define BU 0x3838
+%define RV 0x3838
+%define GV 0xD0E3
+%define BV 0xF6E4
+
+rgb_Yrnd:        times 4 dd 0x80100        ; ( 16 << 15) + (1 << 8): bias plus rounding for the final >> 9
+rgb_UVrnd:       times 4 dd 0x400100       ; (128 << 15) + (1 << 8): bias plus rounding for the final >> 9
+bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY  ; multiplies words { B0, G0, R0, B1 } (x2)
+bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY  ; multiplies words { R0, B1, G1, R1 } (x2)
+rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
+rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY
+bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU
+bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU
+rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU
+rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU
+bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV
+bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
+rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV
+rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV
+
+rgba_Ycoeff_rb:  times 4 dw RY, BY
+rgba_Ycoeff_br:  times 4 dw BY, RY
+rgba_Ycoeff_ga:  times 4 dw GY, 0
+rgba_Ycoeff_ag:  times 4 dw 0,  GY
+rgba_Ucoeff_rb:  times 4 dw RU, BU
+rgba_Ucoeff_br:  times 4 dw BU, RU
+rgba_Ucoeff_ga:  times 4 dw GU, 0
+rgba_Ucoeff_ag:  times 4 dw 0,  GU
+rgba_Vcoeff_rb:  times 4 dw RV, BV
+rgba_Vcoeff_br:  times 4 dw BV, RV
+rgba_Vcoeff_ga:  times 4 dw GV, 0
+rgba_Vcoeff_ag:  times 4 dw 0,  GV
+
+shuf_rgb_12x4:   db 0, 0x80, 1, 0x80,  2, 0x80,  3, 0x80, \
+                    6, 0x80, 7, 0x80,  8, 0x80,  9, 0x80 ; 0x80 makes pshufb write 0: bytes 0-3, 6-9 -> words
+shuf_rgb_3x56:   db 2, 0x80, 3, 0x80,  4, 0x80,  5, 0x80, \
+                    8, 0x80, 9, 0x80, 10, 0x80, 11, 0x80 ; 0x80 makes pshufb write 0: bytes 2-5, 8-11 -> words
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; RGB to Y/UV.
+;
+; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, u1, u2, int w, u3);
+; and
+; void <fmt>ToUV_<opt>(uint8_t *dstU, uint8_t *dstV, u1, const uint8_t *src,
+;                      u2, int w, u3);  (u1-u3: arguments this code never reads)
+;-----------------------------------------------------------------------------
+
+; %1 = nr. of XMM registers
+; %2 = rgb or bgr; optional %3 = variant whose .body loop this one jumps into
+%macro RGB24_TO_Y_FN 2-3
+cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, u3
+%if mmsize == 8
+    mova           m5, [%2_Ycoeff_12x4]
+    mova           m6, [%2_Ycoeff_3x56]
+%define coeff1 m5
+%define coeff2 m6
+%elif ARCH_X86_64
+    mova           m8, [%2_Ycoeff_12x4]
+    mova           m9, [%2_Ycoeff_3x56]
+%define coeff1 m8
+%define coeff2 m9
+%else ; x86-32 && mmsize == 16
+%define coeff1 [%2_Ycoeff_12x4]
+%define coeff2 [%2_Ycoeff_3x56]
+%endif ; x86-32/64 && mmsize == 8/16
+%if (ARCH_X86_64 || mmsize == 8) && %0 == 3
+    jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToY %+ SUFFIX).body
+%else ; !((ARCH_X86_64 || mmsize == 8) && %0 == 3)
+.body:
+%if cpuflag(ssse3)
+    mova           m7, [shuf_rgb_12x4]
+%define shuf_rgb1 m7
+%if ARCH_X86_64
+    mova          m10, [shuf_rgb_3x56]
+%define shuf_rgb2 m10
+%else ; x86-32
+%define shuf_rgb2 [shuf_rgb_3x56]
+%endif ; x86-32/64
+%endif ; cpuflag(ssse3)
+%if ARCH_X86_64
+    movsxd         wq, wd
+%endif
+    add            wq, wq                 ; w pixels -> bytes of 16-bit output
+    add          dstq, wq                 ; dst = end of the output line
+    neg            wq                     ; wq counts up from -bytes to 0
+%if notcpuflag(ssse3)
+    pxor           m7, m7                 ; zero, used by punpcklbw to widen bytes to words
+%endif ; !cpuflag(ssse3)
+    mova           m4, [rgb_Yrnd]
+.loop:
+%if cpuflag(ssse3)
+    movu           m0, [srcq+0]           ; (byte) { Bx, Gx, Rx }[0-3]
+    movu           m2, [srcq+12]          ; (byte) { Bx, Gx, Rx }[4-7]
+    pshufb         m1, m0, shuf_rgb2      ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
+    pshufb         m0, shuf_rgb1          ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
+    pshufb         m3, m2, shuf_rgb2      ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
+    pshufb         m2, shuf_rgb1          ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
+%else ; !cpuflag(ssse3)
+    movd           m0, [srcq+0]           ; (byte) { B0, G0, R0, B1 }
+    movd           m1, [srcq+2]           ; (byte) { R0, B1, G1, R1 }
+    movd           m2, [srcq+6]           ; (byte) { B2, G2, R2, B3 }
+    movd           m3, [srcq+8]           ; (byte) { R2, B3, G3, R3 }
+%if mmsize == 16 ; i.e. sse2
+    punpckldq      m0, m2                 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
+    punpckldq      m1, m3                 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
+    movd           m2, [srcq+12]          ; (byte) { B4, G4, R4, B5 }
+    movd           m3, [srcq+14]          ; (byte) { R4, B5, G5, R5 }
+    movd           m5, [srcq+18]          ; (byte) { B6, G6, R6, B7 }
+    movd           m6, [srcq+20]          ; (byte) { R6, B7, G7, R7 }
+    punpckldq      m2, m5                 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
+    punpckldq      m3, m6                 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
+%endif ; mmsize == 16
+    punpcklbw      m0, m7                 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
+    punpcklbw      m1, m7                 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
+    punpcklbw      m2, m7                 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
+    punpcklbw      m3, m7                 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
+%endif ; cpuflag(ssse3)
+    add          srcq, 3 * mmsize / 2     ; 3 bytes for each of the mmsize / 2 pixels consumed
+    pmaddwd        m0, coeff1             ; (dword) { B0*BY + G0*GY, B1*BY, B2*BY + G2*GY, B3*BY }
+    pmaddwd        m1, coeff2             ; (dword) { R0*RY, G1*GY + R1*RY, R2*RY, G3*GY + R3*RY }
+    pmaddwd        m2, coeff1             ; (dword) { B4*BY + G4*GY, B5*BY, B6*BY + G6*GY, B7*BY }
+    pmaddwd        m3, coeff2             ; (dword) { R4*RY, G5*GY + R5*RY, R6*RY, G7*GY + R7*RY }
+    paddd          m0, m1                 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[0-3]
+    paddd          m2, m3                 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7]
+    paddd          m0, m4                 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] }
+    paddd          m2, m4                 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] }
+    psrad          m0, 9
+    psrad          m2, 9
+    packssdw       m0, m2                 ; (word) { Y[0-7] }
+    mova    [dstq+wq], m0
+    add            wq, mmsize
+    jl .loop
+    REP_RET
+%endif ; !((ARCH_X86_64 || mmsize == 8) && %0 == 3)
+%endmacro
+
+; %1 = nr. of XMM registers
+; %2 = rgb or bgr; optional %3 = variant whose .body loop this one jumps into
+%macro RGB24_TO_UV_FN 2-3
+cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3
+%if ARCH_X86_64
+    mova           m8, [%2_Ucoeff_12x4]
+    mova           m9, [%2_Ucoeff_3x56]
+    mova          m10, [%2_Vcoeff_12x4]
+    mova          m11, [%2_Vcoeff_3x56]
+%define coeffU1 m8
+%define coeffU2 m9
+%define coeffV1 m10
+%define coeffV2 m11
+%else ; x86-32
+%define coeffU1 [%2_Ucoeff_12x4]
+%define coeffU2 [%2_Ucoeff_3x56]
+%define coeffV1 [%2_Vcoeff_12x4]
+%define coeffV2 [%2_Vcoeff_3x56]
+%endif ; x86-32/64
+%if ARCH_X86_64 && %0 == 3
+    jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToUV %+ SUFFIX).body
+%else ; !(ARCH_X86_64 && %0 == 3)
+.body:
+%if cpuflag(ssse3)
+    mova           m7, [shuf_rgb_12x4]
+%define shuf_rgb1 m7
+%if ARCH_X86_64
+    mova          m12, [shuf_rgb_3x56]
+%define shuf_rgb2 m12
+%else ; x86-32
+%define shuf_rgb2 [shuf_rgb_3x56]
+%endif ; x86-32/64
+%endif ; cpuflag(ssse3)
+%if ARCH_X86_64
+    movsxd         wq, dword r5m          ; w is argument 5 (0-based); sign-extend the int
+%else ; x86-32
+    mov            wq, r5m
+%endif
+    add            wq, wq                 ; w samples -> bytes of 16-bit output
+    add         dstUq, wq                 ; both destinations point at the end of their line
+    add         dstVq, wq
+    neg            wq                     ; wq counts up from -bytes to 0
+    mova           m6, [rgb_UVrnd]
+%if notcpuflag(ssse3)
+    pxor           m7, m7                 ; zero, used by punpcklbw to widen bytes to words
+%endif
+.loop:
+%if cpuflag(ssse3)
+    movu           m0, [srcq+0]           ; (byte) { Bx, Gx, Rx }[0-3]
+    movu           m4, [srcq+12]          ; (byte) { Bx, Gx, Rx }[4-7]
+    pshufb         m1, m0, shuf_rgb2      ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
+    pshufb         m0, shuf_rgb1          ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
+%else ; !cpuflag(ssse3)
+    movd           m0, [srcq+0]           ; (byte) { B0, G0, R0, B1 }
+    movd           m1, [srcq+2]           ; (byte) { R0, B1, G1, R1 }
+    movd           m4, [srcq+6]           ; (byte) { B2, G2, R2, B3 }
+    movd           m5, [srcq+8]           ; (byte) { R2, B3, G3, R3 }
+%if mmsize == 16
+    punpckldq      m0, m4                 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
+    punpckldq      m1, m5                 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
+    movd           m4, [srcq+12]          ; (byte) { B4, G4, R4, B5 }
+    movd           m5, [srcq+14]          ; (byte) { R4, B5, G5, R5 }
+%endif ; mmsize == 16
+    punpcklbw      m0, m7                 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
+    punpcklbw      m1, m7                 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
+%endif ; cpuflag(ssse3)
+    pmaddwd        m2, m0, coeffV1        ; (dword) { B0*BV + G0*GV, B1*BV, B2*BV + G2*GV, B3*BV }
+    pmaddwd        m3, m1, coeffV2        ; (dword) { R0*RV, G1*GV + R1*RV, R2*RV, G3*GV + R3*RV }
+    pmaddwd        m0, coeffU1            ; (dword) { B0*BU + G0*GU, B1*BU, B2*BU + G2*GU, B3*BU }
+    pmaddwd        m1, coeffU2            ; (dword) { R0*RU, G1*GU + R1*RU, R2*RU, G3*GU + R3*RU }
+    paddd          m0, m1                 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[0-3]
+    paddd          m2, m3                 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[0-3]
+%if cpuflag(ssse3)
+    pshufb         m5, m4, shuf_rgb2      ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
+    pshufb         m4, shuf_rgb1          ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
+%else ; !cpuflag(ssse3)
+%if mmsize == 16
+    movd           m1, [srcq+18]          ; (byte) { B6, G6, R6, B7 }
+    movd           m3, [srcq+20]          ; (byte) { R6, B7, G7, R7 }
+    punpckldq      m4, m1                 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
+    punpckldq      m5, m3                 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
+%endif ; mmsize == 16 && !cpuflag(ssse3)
+    punpcklbw      m4, m7                 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
+    punpcklbw      m5, m7                 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
+%endif ; cpuflag(ssse3)
+    add          srcq, 3 * mmsize / 2     ; 3 bytes for each of the mmsize / 2 pixels consumed
+    pmaddwd        m1, m4, coeffU1        ; (dword) { B4*BU + G4*GU, B5*BU, B6*BU + G6*GU, B7*BU }
+    pmaddwd        m3, m5, coeffU2        ; (dword) { R4*RU, G5*GU + R5*RU, R6*RU, G7*GU + R7*RU }
+    pmaddwd        m4, coeffV1            ; (dword) { B4*BV + G4*GV, B5*BV, B6*BV + G6*GV, B7*BV }
+    pmaddwd        m5, coeffV2            ; (dword) { R4*RV, G5*GV + R5*RV, R6*RV, G7*GV + R7*RV }
+    paddd          m1, m3                 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[4-7]
+    paddd          m4, m5                 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[4-7]
+    paddd          m0, m6                 ; += rgb_UVrnd, i.e. (dword) { U[0-3] }
+    paddd          m2, m6                 ; += rgb_UVrnd, i.e. (dword) { V[0-3] }
+    paddd          m1, m6                 ; += rgb_UVrnd, i.e. (dword) { U[4-7] }
+    paddd          m4, m6                 ; += rgb_UVrnd, i.e. (dword) { V[4-7] }
+    psrad          m0, 9
+    psrad          m2, 9
+    psrad          m1, 9
+    psrad          m4, 9
+    packssdw       m0, m1                 ; (word) { U[0-7] }
+    packssdw       m2, m4                 ; (word) { V[0-7] }
+%if mmsize == 8
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
+%else ; mmsize == 16
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
+%endif ; mmsize == 8/16 (NOTE: both branches emit the same two stores)
+    add            wq, mmsize
+    jl .loop
+    REP_RET
+%endif ; ARCH_X86_64 && %0 == 3
+%endmacro
+
+; %1 = nr. of XMM registers for rgb-to-Y func
+; %2 = nr. of XMM registers for rgb-to-UV func
+%macro RGB24_FUNCS 2
+RGB24_TO_Y_FN %1, rgb
+RGB24_TO_Y_FN %1, bgr, rgb                ; bgr jumps into rgb's loop when coefficients sit in registers
+RGB24_TO_UV_FN %2, rgb
+RGB24_TO_UV_FN %2, bgr, rgb               ; same sharing, x86-64 only for the UV variant
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+RGB24_FUNCS 0, 0
+%endif
+
+INIT_XMM sse2
+RGB24_FUNCS 10, 12                        ; x86-64 keeps coefficients in m8-m9 (Y) / m8-m11 (UV)
+
+INIT_XMM ssse3
+RGB24_FUNCS 11, 13                        ; one more each for the second shuffle mask (m10 / m12)
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+RGB24_FUNCS 11, 13
+%endif
+
+; %1 = nr. of XMM registers; optional %6 = variant whose .body loop is jumped into
+; %2-5 = rgba, bgra, argb or abgr (in individual characters)
+%macro RGB32_TO_Y_FN 5-6
+cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, u3
+    mova           m5, [rgba_Ycoeff_%2%4] ; coefficients for the even bytes (channels %2, %4)
+    mova           m6, [rgba_Ycoeff_%3%5] ; coefficients for the odd bytes (channels %3, %5)
+%if %0 == 6
+    jmp mangle(private_prefix %+ _ %+ %6 %+ ToY %+ SUFFIX).body
+%else ; %0 != 6
+.body:
+%if ARCH_X86_64
+    movsxd         wq, wd
+%endif
+    lea          srcq, [srcq+wq*4]        ; src = end of input line (4 bytes per pixel)
+    add            wq, wq                 ; w pixels -> bytes of 16-bit output
+    add          dstq, wq                 ; dst = end of output line
+    neg            wq                     ; wq counts up from -bytes to 0
+    mova           m4, [rgb_Yrnd]
+    pcmpeqb        m7, m7
+    psrlw          m7, 8                  ; (word) { 0x00ff } x (mmsize / 2)
+.loop:
+    ; FIXME check alignment and use mova
+    movu           m0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
+    movu           m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+    DEINTB          1,  0,  3,  2,  7     ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
+    pmaddwd        m1, m5                 ; (dword) { Bx*BY + Rx*RY }[0-3]
+    pmaddwd        m0, m6                 ; (dword) { Gx*GY }[0-3]
+    pmaddwd        m3, m5                 ; (dword) { Bx*BY + Rx*RY }[4-7]
+    pmaddwd        m2, m6                 ; (dword) { Gx*GY }[4-7]
+    paddd          m0, m4                 ; += rgb_Yrnd
+    paddd          m2, m4                 ; += rgb_Yrnd
+    paddd          m0, m1                 ; (dword) { Y[0-3] }
+    paddd          m2, m3                 ; (dword) { Y[4-7] }
+    psrad          m0, 9
+    psrad          m2, 9
+    packssdw       m0, m2                 ; (word) { Y[0-7] }
+    mova    [dstq+wq], m0
+    add            wq, mmsize
+    jl .loop
+    REP_RET
+%endif ; %0 == 6
+%endmacro
+
+; %1 = nr. of XMM registers; optional %6 = variant whose .body loop is jumped into
+; %2-5 = rgba, bgra, argb or abgr (in individual characters)
+%macro RGB32_TO_UV_FN 5-6
+cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3
+%if ARCH_X86_64
+    mova           m8, [rgba_Ucoeff_%2%4]
+    mova           m9, [rgba_Ucoeff_%3%5]
+    mova          m10, [rgba_Vcoeff_%2%4]
+    mova          m11, [rgba_Vcoeff_%3%5]
+%define coeffU1 m8
+%define coeffU2 m9
+%define coeffV1 m10
+%define coeffV2 m11
+%else ; x86-32
+%define coeffU1 [rgba_Ucoeff_%2%4]
+%define coeffU2 [rgba_Ucoeff_%3%5]
+%define coeffV1 [rgba_Vcoeff_%2%4]
+%define coeffV2 [rgba_Vcoeff_%3%5]
+%endif ; x86-64/32
+%if ARCH_X86_64 && %0 == 6
+    jmp mangle(private_prefix %+ _ %+ %6 %+ ToUV %+ SUFFIX).body
+%else ; !(ARCH_X86_64 && %0 == 6)
+.body:
+%if ARCH_X86_64
+    movsxd         wq, dword r5m          ; w is argument 5 (0-based); sign-extend the int
+%else ; x86-32
+    mov            wq, r5m
+%endif
+    add            wq, wq                 ; w samples -> bytes of 16-bit output
+    add         dstUq, wq
+    add         dstVq, wq
+    lea          srcq, [srcq+wq*2]        ; wq is already 2*w, so this advances 4 bytes per pixel
+    neg            wq                     ; wq counts up from -bytes to 0
+    pcmpeqb        m7, m7
+    psrlw          m7, 8                  ; (word) { 0x00ff } x (mmsize / 2)
+    mova           m6, [rgb_UVrnd]
+.loop:
+    ; FIXME check alignment and use mova
+    movu           m0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
+    movu           m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+    DEINTB          1,  0,  5,  4,  7     ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
+    pmaddwd        m3, m1, coeffV1        ; (dword) { Bx*BV + Rx*RV }[0-3]
+    pmaddwd        m2, m0, coeffV2        ; (dword) { Gx*GV }[0-3]
+    pmaddwd        m1, coeffU1            ; (dword) { Bx*BU + Rx*RU }[0-3]
+    pmaddwd        m0, coeffU2            ; (dword) { Gx*GU }[0-3]
+    paddd          m3, m6                 ; += rgb_UVrnd
+    paddd          m1, m6                 ; += rgb_UVrnd
+    paddd          m2, m3                 ; (dword) { V[0-3] }
+    paddd          m0, m1                 ; (dword) { U[0-3] }
+    pmaddwd        m3, m5, coeffV1        ; (dword) { Bx*BV + Rx*RV }[4-7]
+    pmaddwd        m1, m4, coeffV2        ; (dword) { Gx*GV }[4-7]
+    pmaddwd        m5, coeffU1            ; (dword) { Bx*BU + Rx*RU }[4-7]
+    pmaddwd        m4, coeffU2            ; (dword) { Gx*GU }[4-7]
+    paddd          m3, m6                 ; += rgb_UVrnd
+    paddd          m5, m6                 ; += rgb_UVrnd
+    psrad          m0, 9
+    paddd          m1, m3                 ; (dword) { V[4-7] }
+    paddd          m4, m5                 ; (dword) { U[4-7] }
+    psrad          m2, 9
+    psrad          m4, 9
+    psrad          m1, 9
+    packssdw       m0, m4                 ; (word) { U[0-7] }
+    packssdw       m2, m1                 ; (word) { V[0-7] }
+%if mmsize == 8
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
+%else ; mmsize == 16
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
+%endif ; mmsize == 8/16 (NOTE: both branches emit the same two stores)
+    add            wq, mmsize
+    jl .loop
+    REP_RET
+%endif ; ARCH_X86_64 && %0 == 6
+%endmacro
+
+; %1 = nr. of XMM registers for rgb-to-Y func
+; %2 = nr. of XMM registers for rgb-to-UV func
+%macro RGB32_FUNCS 2
+RGB32_TO_Y_FN %1, r, g, b, a              ; emits the shared .body loop
+RGB32_TO_Y_FN %1, b, g, r, a, rgba        ; loads its own coefficients, then jumps to rgbaToY.body
+RGB32_TO_Y_FN %1, a, r, g, b, rgba
+RGB32_TO_Y_FN %1, a, b, g, r, rgba
+
+RGB32_TO_UV_FN %2, r, g, b, a
+RGB32_TO_UV_FN %2, b, g, r, a, rgba       ; shares rgbaToUV.body on x86-64 only
+RGB32_TO_UV_FN %2, a, r, g, b, rgba
+RGB32_TO_UV_FN %2, a, b, g, r, rgba
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+RGB32_FUNCS 0, 0
+%endif
+
+INIT_XMM sse2
+RGB32_FUNCS 8, 12                         ; Y uses m0-m7; UV on x86-64 also uses m8-m11
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+RGB32_FUNCS 8, 12
+%endif
+
+;-----------------------------------------------------------------------------
+; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
+;
+; void <fmt>ToY_<opt>(uint8_t *dst, unused0, unused1, const uint8_t *src, int w);
+; and
+; void <fmt>ToUV_<opt>(uint8_t *dstU, uint8_t *dstV, unused, const uint8_t *src,
+;                      unused, int w);  (w is read from r5m, i.e. the 6th argument)
+;-----------------------------------------------------------------------------
+
+; %1 = a (aligned) or u (unaligned)
+; %2 = yuyv or uyvy
+%macro LOOP_YUYV_TO_Y 2
+.loop_%1:
+    mov%1          m0, [srcq+wq*2]        ; (byte) { Y0, U0, Y1, V0, ... } (yuyv; uyvy has Y in the odd bytes)
+    mov%1          m1, [srcq+wq*2+mmsize] ; (byte) { Y8, U4, Y9, V4, ... } (yuyv; uyvy has Y in the odd bytes)
+%ifidn %2, yuyv
+    pand           m0, m2                 ; (word) { Y0, Y1, ..., Y7 }
+    pand           m1, m2                 ; (word) { Y8, Y9, ..., Y15 }
+%else ; uyvy
+    psrlw          m0, 8                  ; (word) { Y0, Y1, ..., Y7 }
+    psrlw          m1, 8                  ; (word) { Y8, Y9, ..., Y15 }
+%endif ; yuyv/uyvy
+    packuswb       m0, m1                 ; (byte) { Y0, ..., Y15 }
+    mova    [dstq+wq], m0
+    add            wq, mmsize             ; mmsize luma bytes written per iteration
+    jl .loop_%1
+    REP_RET
+%endmacro
+
+; %1 = nr. of XMM registers
+; %2 = yuyv or uyvy
+; %3 = optional; meant to signal that the aligned and unaligned loops would
+;      be identical (i.e. YUYV+AVX) so the split can be skipped, but the
+;      body below never tests %0/%3 and always splits when mmsize == 16
+%macro YUYV_TO_Y_FN 2-3
+cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
+%if ARCH_X86_64
+    movsxd         wq, wd
+%endif
+    add          dstq, wq                 ; dst = end of output line (1 byte per sample)
+%if mmsize == 16
+    test         srcq, 15                 ; ZF set if src is 16-byte aligned; consumed by jnz below
+%endif
+    lea          srcq, [srcq+wq*2]        ; src = end of input line; lea leaves the flags intact
+%ifidn %2, yuyv
+    pcmpeqb        m2, m2                 ; (byte) { 0xff } x 16
+    psrlw          m2, 8                  ; (word) { 0x00ff } x 8
+%endif ; yuyv
+%if mmsize == 16
+    jnz .loop_u_start
+    neg            wq                     ; wq counts up from -w to 0
+    LOOP_YUYV_TO_Y  a, %2
+.loop_u_start:
+    neg            wq                     ; wq counts up from -w to 0
+    LOOP_YUYV_TO_Y  u, %2
+%else ; mmsize == 8
+    neg            wq                     ; wq counts up from -w to 0
+    LOOP_YUYV_TO_Y  a, %2
+%endif ; mmsize == 8/16
+%endmacro
+
+; %1 = a (aligned) or u (unaligned)
+; %2 = yuyv or uyvy
+%macro LOOP_YUYV_TO_UV 2
+.loop_%1:
+%ifidn %2, yuyv
+    mov%1          m0, [srcq+wq*4]        ; (byte) { Y0, U0, Y1, V0, ... }
+    mov%1          m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
+    psrlw          m0, 8                  ; (word) { U0, V0, ..., U3, V3 }
+    psrlw          m1, 8                  ; (word) { U4, V4, ..., U7, V7 }
+%else ; uyvy
+%if cpuflag(avx) ; no mov%1 in this branch, so the a and u loops are identical
+    vpand          m0, m2, [srcq+wq*4]        ; (word) { U0, V0, ..., U3, V3 }
+    vpand          m1, m2, [srcq+wq*4+mmsize] ; (word) { U4, V4, ..., U7, V7 }
+%else
+    mov%1          m0, [srcq+wq*4]        ; (byte) { U0, Y0, V0, Y1, ... }
+    mov%1          m1, [srcq+wq*4+mmsize] ; (byte) { U4, Y8, V4, Y9, ... }
+    pand           m0, m2                 ; (word) { U0, V0, ..., U3, V3 }
+    pand           m1, m2                 ; (word) { U4, V4, ..., U7, V7 }
+%endif
+%endif ; yuyv/uyvy
+    packuswb       m0, m1                 ; (byte) { U0, V0, ..., U7, V7 }
+    pand           m1, m0, m2             ; (word) { U0, U1, ..., U7 }
+    psrlw          m0, 8                  ; (word) { V0, V1, ..., V7 }
+%if mmsize == 16
+    packuswb       m1, m0                 ; (byte) { U0, ... U7, V0, ... V7 }
+    movh   [dstUq+wq], m1
+    movhps [dstVq+wq], m1
+%else ; mmsize == 8
+    packuswb       m1, m1                 ; (byte) { U0, ... U3 }
+    packuswb       m0, m0                 ; (byte) { V0, ... V3 }
+    movh   [dstUq+wq], m1
+    movh   [dstVq+wq], m0
+%endif ; mmsize == 8/16
+    add            wq, mmsize / 2         ; mmsize / 2 bytes written to each of U and V
+    jl .loop_%1
+    REP_RET
+%endmacro
+
+; %1 = nr. of XMM registers
+; %2 = yuyv or uyvy
+; %3 = if specified, it means that unaligned and aligned code in loop
+;      will be the same (i.e. UYVY+AVX), and thus we don't need to
+;      split the loop in an aligned and unaligned case
+%macro YUYV_TO_UV_FN 2-3
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
+%if ARCH_X86_64
+    movsxd         wq, dword r5m          ; only 4 args are auto-loaded; w comes from argument 5 (0-based)
+%else ; x86-32
+    mov            wq, r5m
+%endif
+    add         dstUq, wq                 ; both destinations point at the end of their line
+    add         dstVq, wq
+%if mmsize == 16 && %0 == 2
+    test         srcq, 15                 ; ZF set if src is 16-byte aligned; consumed by jnz below
+%endif
+    lea          srcq, [srcq+wq*4]        ; 4 source bytes per chroma sample; lea leaves the flags intact
+    pcmpeqb        m2, m2                 ; (byte) { 0xff } x 16
+    psrlw          m2, 8                  ; (word) { 0x00ff } x 8
+    ; NOTE: if uyvy+avx, u/a are identical
+%if mmsize == 16 && %0 == 2
+    jnz .loop_u_start
+    neg            wq
+    LOOP_YUYV_TO_UV a, %2
+.loop_u_start:
+    neg            wq
+    LOOP_YUYV_TO_UV u, %2
+%else ; mmsize == 8, or %3 was given
+    neg            wq
+    LOOP_YUYV_TO_UV a, %2
+%endif ; mmsize == 8/16
+%endmacro
+
+; %1 = a (aligned) or u (unaligned)
+; %2 = nv12 or nv21
+%macro LOOP_NVXX_TO_UV 2
+.loop_%1:
+    mov%1          m0, [srcq+wq*2]        ; (byte) { U0, V0, U1, V1, ... } (nv12 naming; see stores below)
+    mov%1          m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
+    pand           m2, m0, m5             ; (word) { U0, U1, ..., U7 }
+    pand           m3, m1, m5             ; (word) { U8, U9, ..., U15 }
+    psrlw          m0, 8                  ; (word) { V0, V1, ..., V7 }
+    psrlw          m1, 8                  ; (word) { V8, V9, ..., V15 }
+    packuswb       m2, m3                 ; (byte) { U0, ..., U15 }
+    packuswb       m0, m1                 ; (byte) { V0, ..., V15 }
+%ifidn %2, nv12
+    mova   [dstUq+wq], m2
+    mova   [dstVq+wq], m0
+%else ; nv21
+    mova   [dstVq+wq], m2                 ; nv21: the even source bytes go to dstV
+    mova   [dstUq+wq], m0                 ; nv21: the odd source bytes go to dstU
+%endif ; nv12/21
+    add            wq, mmsize
+    jl .loop_%1
+    REP_RET
+%endmacro
+
+; %1 = nr. of XMM registers
+; %2 = nv12 or nv21
+%macro NVXX_TO_UV_FN 2
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
+%if ARCH_X86_64
+    movsxd         wq, dword r5m          ; only 4 args are auto-loaded; w comes from argument 5 (0-based)
+%else ; x86-32
+    mov            wq, r5m
+%endif
+    add         dstUq, wq                 ; both destinations point at the end of their line
+    add         dstVq, wq
+%if mmsize == 16
+    test         srcq, 15                 ; ZF set if src is 16-byte aligned; consumed by jnz below
+%endif
+    lea          srcq, [srcq+wq*2]        ; 2 source bytes per chroma sample; lea leaves the flags intact
+    pcmpeqb        m5, m5                 ; (byte) { 0xff } x 16
+    psrlw          m5, 8                  ; (word) { 0x00ff } x 8
+%if mmsize == 16
+    jnz .loop_u_start
+    neg            wq
+    LOOP_NVXX_TO_UV a, %2
+.loop_u_start:
+    neg            wq
+    LOOP_NVXX_TO_UV u, %2
+%else ; mmsize == 8
+    neg            wq
+    LOOP_NVXX_TO_UV a, %2
+%endif ; mmsize == 8/16
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+YUYV_TO_Y_FN  0, yuyv
+YUYV_TO_Y_FN  0, uyvy
+YUYV_TO_UV_FN 0, yuyv
+YUYV_TO_UV_FN 0, uyvy
+NVXX_TO_UV_FN 0, nv12
+NVXX_TO_UV_FN 0, nv21
+%endif
+
+INIT_XMM sse2
+YUYV_TO_Y_FN  3, yuyv
+YUYV_TO_Y_FN  2, uyvy                     ; uyvy shifts instead of masking, so no m2 mask register
+YUYV_TO_UV_FN 3, yuyv
+YUYV_TO_UV_FN 3, uyvy
+NVXX_TO_UV_FN 5, nv12
+NVXX_TO_UV_FN 5, nv21
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
+; that's not faster in practice
+YUYV_TO_UV_FN 3, yuyv
+YUYV_TO_UV_FN 3, uyvy, 1                  ; %3 given: single loop, no aligned/unaligned split
+NVXX_TO_UV_FN 5, nv12
+NVXX_TO_UV_FN 5, nv21
+%endif
diff --git a/ffmpeg/libswscale/x86/output.asm b/ffmpeg/libswscale/x86/output.asm
new file mode 100644
index 0000000..f9add35
--- /dev/null
+++ b/ffmpeg/libswscale/x86/output.asm
@@ -0,0 +1,413 @@
+;******************************************************************************
+;* x86-optimized vertical line scaling functions
+;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
+;*                    Kieran Kunhya <kieran@kunhya.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+minshort:      times 8 dw 0x8000          ; added after packssdw in the 16-bit path to undo the start bias
+yuv2yuvX_16_start:  times 4 dd 0x4000 - 0x40000000 ; rounding for >> 15, minus a bias that becomes -0x8000 after the shift
+yuv2yuvX_10_start:  times 4 dd 0x10000    ; 1 << 16: rounding for the >> 17 (27 - 10)
+yuv2yuvX_9_start:   times 4 dd 0x20000    ; 1 << 17: rounding for the >> 18 (27 - 9)
+yuv2yuvX_10_upper:  times 8 dw 0x3ff      ; pminsw clamp: largest 10-bit value
+yuv2yuvX_9_upper:   times 8 dw 0x1ff      ; pminsw clamp: largest 9-bit value
+pd_4:          times 4 dd 4
+pd_4min0x40000:times 4 dd 4 - (0x40000)
+pw_16:         times 8 dw 16
+pw_32:         times 8 dw 32
+pw_512:        times 8 dw 512
+pw_1024:       times 8 dw 1024
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; vertical line scaling
+;
+; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
+;                                     const uint8_t *dither, int offset)
+; and
+; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
+;                                     const int16_t **src, uint8_t *dst, int dstW,
+;                                     const uint8_t *dither, int offset)
+;
+; Scale one or $filterSize lines of source data to generate one line of output
+; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in
+; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple
+; of 2. $offset is either 0 or 3. $dither holds 8 values.
+;-----------------------------------------------------------------------------
+
+%macro yuv2planeX_fn 3
+; %1 = output bit depth (8/9/10/16), %2 = nr. of XMM registers, %3 = nr. of arguments to load
+%if ARCH_X86_32
+%define cntr_reg fltsizeq
+%define movsx mov
+%else
+%define cntr_reg r7
+%define movsx movsxd
+%endif
+
+cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
+%if %1 == 8 || %1 == 9 || %1 == 10
+    pxor            m6,  m6              ; zero: unpack partner, and lower clamp for pmaxsw
+%endif ; %1 == 8/9/10
+
+%if %1 == 8
+%if ARCH_X86_32
+%assign pad 0x2c - (stack_offset & 15)
+    SUB             rsp, pad             ; stack scratch for the expanded dither vectors
+%define m_dith m7
+%else ; x86-64
+%define m_dith m9
+%endif ; x86-32
+
+    ; create registers holding dither
+    movq        m_dith, [ditherq]        ; dither
+    test        offsetd, offsetd
+    jz              .no_rot
+%if mmsize == 16
+    punpcklqdq  m_dith,  m_dith          ; duplicate the 8 bytes so PALIGNR wraps around
+%endif ; mmsize == 16
+    PALIGNR     m_dith,  m_dith,  3,  m0 ; rotate the dither bytes by 3 when offset != 0
+.no_rot:
+%if mmsize == 16
+    punpcklbw   m_dith,  m6              ; bytes -> words
+%if ARCH_X86_64
+    punpcklwd       m8,  m_dith,  m6     ; low 4 words -> dwords
+    pslld           m8,  12
+%else ; x86-32
+    punpcklwd       m5,  m_dith,  m6     ; low 4 words -> dwords
+    pslld           m5,  12
+%endif ; x86-32/64
+    punpckhwd   m_dith,  m6              ; high 4 words -> dwords
+    pslld       m_dith,  12
+%if ARCH_X86_32
+    mova      [rsp+ 0],  m5
+    mova      [rsp+16],  m_dith
+%endif
+%else ; mmsize == 8
+    punpcklbw       m5,  m_dith,  m6
+    punpckhbw   m_dith,  m6
+    punpcklwd       m4,  m5,  m6
+    punpckhwd       m5,  m6
+    punpcklwd       m3,  m_dith,  m6
+    punpckhwd   m_dith,  m6
+    pslld           m4,  12
+    pslld           m5,  12
+    pslld           m3,  12
+    pslld       m_dith,  12
+    mova      [rsp+ 0],  m4
+    mova      [rsp+ 8],  m5
+    mova      [rsp+16],  m3
+    mova      [rsp+24],  m_dith
+%endif ; mmsize == 8/16
+%endif ; %1 == 8
+
+    xor             r5,  r5              ; r5 = index of the next output pixel
+
+.pixelloop:
+%assign %%i 0
+    ; the rep here is for the 8bit output mmx case, where dither covers
+    ; 8 pixels but we can only handle 2 pixels per register, and thus 4
+    ; pixels per iteration. In order to not have to keep track of where
+    ; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
+%if %1 == 8
+%assign %%repcnt 16/mmsize
+%else
+%assign %%repcnt 1
+%endif
+
+%rep %%repcnt
+
+%if %1 == 8
+%if ARCH_X86_32
+    mova            m2, [rsp+mmsize*(0+%%i)]
+    mova            m1, [rsp+mmsize*(1+%%i)]
+%else ; x86-64
+    mova            m2,  m8
+    mova            m1,  m_dith
+%endif ; x86-32/64
+%else ; %1 == 9/10/16
+    mova            m1, [yuv2yuvX_%1_start]
+    mova            m2,  m1
+%endif ; %1 == 8/9/10/16
+    movsx     cntr_reg,  fltsizem        ; reload filterSize (movsx is %defined above as mov or movsxd)
+.filterloop_ %+ %%i:
+    ; input pixels
+    mov             r6, [srcq+gprsize*cntr_reg-2*gprsize] ; src[cntr_reg - 2]
+%if %1 == 16
+    mova            m3, [r6+r5*4]
+    mova            m5, [r6+r5*4+mmsize]
+%else ; %1 == 8/9/10
+    mova            m3, [r6+r5*2]
+%endif ; %1 == 8/9/10/16
+    mov             r6, [srcq+gprsize*cntr_reg-gprsize]   ; src[cntr_reg - 1]
+%if %1 == 16
+    mova            m4, [r6+r5*4]
+    mova            m6, [r6+r5*4+mmsize]
+%else ; %1 == 8/9/10
+    mova            m4, [r6+r5*2]
+%endif ; %1 == 8/9/10/16
+
+    ; coefficients
+    movd            m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
+%if %1 == 16
+    pshuflw         m7,  m0,  0          ; coeff[0]
+    pshuflw         m0,  m0,  0x55       ; coeff[1]
+    pmovsxwd        m7,  m7              ; word -> dword
+    pmovsxwd        m0,  m0              ; word -> dword
+
+    pmulld          m3,  m7
+    pmulld          m5,  m7
+    pmulld          m4,  m0
+    pmulld          m6,  m0
+
+    paddd           m2,  m3
+    paddd           m1,  m5
+    paddd           m2,  m4
+    paddd           m1,  m6
+%else ; %1 == 10/9/8
+    punpcklwd       m5,  m3,  m4         ; interleave the two source lines so pmaddwd sums both taps
+    punpckhwd       m3,  m4
+    SPLATD          m0
+
+    pmaddwd         m5,  m0
+    pmaddwd         m3,  m0
+
+    paddd           m2,  m5
+    paddd           m1,  m3
+%endif ; %1 == 8/9/10/16
+
+    sub       cntr_reg,  2               ; two filter taps consumed per iteration
+    jg .filterloop_ %+ %%i
+
+%if %1 == 16
+    psrad           m2,  31 - %1
+    psrad           m1,  31 - %1
+%else ; %1 == 10/9/8
+    psrad           m2,  27 - %1
+    psrad           m1,  27 - %1
+%endif ; %1 == 8/9/10/16
+
+%if %1 == 8
+    packssdw        m2,  m1
+    packuswb        m2,  m2
+    movh   [dstq+r5*1],  m2
+%else ; %1 == 9/10/16
+%if %1 == 16
+    packssdw        m2,  m1
+    paddw           m2, [minshort]       ; undo the bias folded into yuv2yuvX_16_start
+%else ; %1 == 9/10
+%if cpuflag(sse4)
+    packusdw        m2,  m1
+%else ; mmxext/sse2
+    packssdw        m2,  m1
+    pmaxsw          m2,  m6              ; clamp negatives to 0 (m6 is still zero on this path)
+%endif ; mmxext/sse2/sse4/avx
+    pminsw          m2, [yuv2yuvX_%1_upper]
+%endif ; %1 == 9/10/16
+    mova   [dstq+r5*2],  m2
+%endif ; %1 == 8/9/10/16
+
+    add             r5,  mmsize/2
+    sub             wd,  mmsize/2
+
+%assign %%i %%i+2
+%endrep
+    jg .pixelloop                        ; tests the flags of the last "sub wd" above
+
+%if %1 == 8
+%if ARCH_X86_32
+    ADD             rsp, pad
+    RET
+%else ; x86-64
+    REP_RET
+%endif ; x86-32/64
+%else ; %1 == 9/10/16
+    REP_RET
+%endif ; %1 == 8/9/10/16
+%endmacro
+
+%if ARCH_X86_32 ; the MMX variants are only assembled for 32-bit x86
+INIT_MMX mmxext
+yuv2planeX_fn  8,  0, 7 ; %1 = output bit depth; %2/%3 are presumably the xmm-register and argument counts - confirm against the macro header
+yuv2planeX_fn  9,  0, 5
+yuv2planeX_fn 10,  0, 5
+%endif ; ARCH_X86_32
+
+INIT_XMM sse2
+yuv2planeX_fn  8, 10, 7
+yuv2planeX_fn  9,  7, 5
+yuv2planeX_fn 10,  7, 5
+
+INIT_XMM sse4
+yuv2planeX_fn  8, 10, 7
+yuv2planeX_fn  9,  7, 5
+yuv2planeX_fn 10,  7, 5
+yuv2planeX_fn 16,  8, 5 ; the 16-bit path uses pmovsxwd/pmulld and is only instantiated for sse4
+
+%if HAVE_AVX_EXTERNAL ; AVX variants are assembled only when HAVE_AVX_EXTERNAL is set
+INIT_XMM avx
+yuv2planeX_fn  8, 10, 7
+yuv2planeX_fn  9,  7, 5
+yuv2planeX_fn 10,  7, 5
+%endif ; HAVE_AVX_EXTERNAL
+
+; %1=output-bpc, %2=alignment (u/a); wq is a negative sample index that counts up to 0
+%macro yuv2plane1_mainloop 2
+.loop_%2:
+%if %1 == 8
+    paddsw          m0, m2, [srcq+wq*2+mmsize*0] ; 16-bit src words + m2, signed saturation
+    paddsw          m1, m3, [srcq+wq*2+mmsize*1] ; 16-bit src words + m3, signed saturation
+    psraw           m0, 7
+    psraw           m1, 7
+    packuswb        m0, m1 ; words -> bytes with unsigned saturation
+    mov%2    [dstq+wq], m0 ; expands to mova or movu depending on %2
+%elif %1 == 16
+    paddd           m0, m4, [srcq+wq*4+mmsize*0] ; 32-bit src dwords + the constant held in m4
+    paddd           m1, m4, [srcq+wq*4+mmsize*1]
+    paddd           m2, m4, [srcq+wq*4+mmsize*2]
+    paddd           m3, m4, [srcq+wq*4+mmsize*3]
+    psrad           m0, 3
+    psrad           m1, 3
+    psrad           m2, 3
+    psrad           m3, 3
+%if cpuflag(sse4) ; avx/sse4
+    packusdw        m0, m1 ; dwords -> words with unsigned saturation
+    packusdw        m2, m3
+%else ; mmx/sse2
+    packssdw        m0, m1 ; only the signed pack is used on this path ...
+    packssdw        m2, m3
+    paddw           m0, m5 ; ... followed by adding m5 to every word (presumably undoes a bias folded into m4 - confirm against the constants)
+    paddw           m2, m5
+%endif ; mmx/sse2/sse4/avx
+    mov%2    [dstq+wq*2+mmsize*0], m0
+    mov%2    [dstq+wq*2+mmsize*1], m2
+%else ; %1 == 9/10
+    paddsw          m0, m2, [srcq+wq*2+mmsize*0] ; 16-bit src words + m2, signed saturation
+    paddsw          m1, m2, [srcq+wq*2+mmsize*1]
+    psraw           m0, 15 - %1 ; shift by 6 for 9-bit output, 5 for 10-bit output
+    psraw           m1, 15 - %1
+    pmaxsw          m0, m4 ; lower bound: max(x, m4)
+    pmaxsw          m1, m4
+    pminsw          m0, m3 ; upper bound: min(x, m3)
+    pminsw          m1, m3
+    mov%2    [dstq+wq*2+mmsize*0], m0
+    mov%2    [dstq+wq*2+mmsize*1], m1
+%endif
+    add             wq, mmsize ; every branch above consumes mmsize samples per iteration
+    jl .loop_%2 ; keep going while the index is still negative
+%endmacro
+
+%macro yuv2plane1_fn 3 ; %1=output-bpc, %2=xmm register count, %3=argument/GPR count (both handed to cglobal)
+cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
+    movsxdifnidn    wq, wd
+    add             wq, mmsize - 1 ; round w up ...
+    and             wq, ~(mmsize - 1) ; ... to a multiple of mmsize
+%if %1 == 8
+    add           dstq, wq ; dst += w (one byte per output sample)
+%else ; %1 != 8
+    lea           dstq, [dstq+wq*2] ; dst += 2*w (two bytes per output sample)
+%endif ; %1 == 8
+%if %1 == 16
+    lea           srcq, [srcq+wq*4] ; src += 4*w (32-bit input samples)
+%else ; %1 != 16
+    lea           srcq, [srcq+wq*2] ; src += 2*w (16-bit input samples)
+%endif ; %1 == 16
+    neg             wq ; the main loop indexes from -w up to 0 relative to the end pointers
+
+%if %1 == 8
+    pxor            m4, m4               ; zero
+
+    ; create registers holding dither
+    movq            m3, [ditherq]        ; dither (8 bytes)
+    test       offsetd, offsetd
+    jz              .no_rot ; offset == 0: use the dither bytes unrotated
+%if mmsize == 16
+    punpcklqdq      m3, m3 ; duplicate the 8 bytes into the high half so the shift below wraps around
+%endif ; mmsize == 16
+    PALIGNR         m3, m3, 3, m2 ; rotate the dither bytes by 3 (m2 is presumably scratch for the non-SSSE3 fallback - confirm in x86util)
+.no_rot:
+%if mmsize == 8
+    mova            m2, m3
+    punpckhbw       m3, m4               ; byte->word
+    punpcklbw       m2, m4               ; byte->word
+%else
+    punpcklbw       m3, m4 ; byte->word: all 8 dither values fit in one register
+    mova            m2, m3 ; the main loop adds m2 and m3 to consecutive registers, so both carry the same dither
+%endif
+%elif %1 == 9
+    pxor            m4, m4 ; lower bound operand for pmaxsw in the main loop
+    mova            m3, [pw_512] ; upper bound operand for pminsw in the main loop
+    mova            m2, [pw_32] ; added to the source before the shift by 6
+%elif %1 == 10
+    pxor            m4, m4 ; lower bound operand for pmaxsw in the main loop
+    mova            m3, [pw_1024] ; upper bound operand for pminsw in the main loop
+    mova            m2, [pw_16] ; added to the source before the shift by 5
+%else ; %1 == 16
+%if cpuflag(sse4) ; sse4/avx
+    mova            m4, [pd_4] ; added to the source before the shift by 3
+%else ; mmx/sse2
+    mova            m4, [pd_4min0x40000] ; added to the source before the shift by 3 (this path packs with packssdw)
+    mova            m5, [minshort] ; added to every word after packssdw in the main loop
+%endif ; mmx/sse2/sse4/avx
+%endif ; %1 == ..
+
+    ; actual pixel scaling
+%if mmsize == 8
+    yuv2plane1_mainloop %1, a
+%else ; mmsize == 16
+    test          dstq, 15 ; choose aligned or unaligned stores from the low 4 bits of dst
+    jnz .unaligned
+    yuv2plane1_mainloop %1, a
+    REP_RET
+.unaligned:
+    yuv2plane1_mainloop %1, u
+%endif ; mmsize == 8/16
+    REP_RET
+%endmacro
+
+%if ARCH_X86_32 ; the MMX variants are only assembled for 32-bit x86
+INIT_MMX mmx
+yuv2plane1_fn  8, 0, 5 ; arguments: output bit depth, xmm register count, argument count (8-bit needs all 5: dither and offset are read)
+yuv2plane1_fn 16, 0, 3
+
+INIT_MMX mmxext
+yuv2plane1_fn  9, 0, 3
+yuv2plane1_fn 10, 0, 3
+%endif ; ARCH_X86_32
+
+INIT_XMM sse2
+yuv2plane1_fn  8, 5, 5
+yuv2plane1_fn  9, 5, 3
+yuv2plane1_fn 10, 5, 3
+yuv2plane1_fn 16, 6, 3 ; six registers: the non-sse4 16-bit path additionally keeps minshort in m5
+
+INIT_XMM sse4
+yuv2plane1_fn 16, 5, 3 ; only the 16-bit variant differs under sse4 (packusdw path)
+
+%if HAVE_AVX_EXTERNAL ; AVX variants are assembled only when HAVE_AVX_EXTERNAL is set
+INIT_XMM avx
+yuv2plane1_fn  8, 5, 5
+yuv2plane1_fn  9, 5, 3
+yuv2plane1_fn 10, 5, 3
+yuv2plane1_fn 16, 5, 3
+%endif ; HAVE_AVX_EXTERNAL
diff --git a/ffmpeg/libswscale/x86/rgb2rgb.c b/ffmpeg/libswscale/x86/rgb2rgb.c
new file mode 100644
index 0000000..1e20176
--- /dev/null
+++ b/ffmpeg/libswscale/x86/rgb2rgb.c
@@ -0,0 +1,149 @@
+/*
+ * software RGB to RGB converter
+ * pluralize by software PAL8 to RGB converter
+ *              software YUV to YUV converter
+ *              software YUV to RGB converter
+ * Written by Nick Kurshev.
+ * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavutil/cpu.h"
+#include "libavutil/bswap.h"
+#include "libswscale/rgb2rgb.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+
+#if HAVE_INLINE_ASM
+
+DECLARE_ASM_CONST(8, uint64_t, mmx_ff)       = 0x00000000000000FFULL; /* lowest byte only */
+DECLARE_ASM_CONST(8, uint64_t, mmx_null)     = 0x0000000000000000ULL; /* all bits clear */
+DECLARE_ASM_CONST(8, uint64_t, mmx_one)      = 0xFFFFFFFFFFFFFFFFULL; /* all bits set */
+DECLARE_ASM_CONST(8, uint64_t, mask32b)      = 0x000000FF000000FFULL; /* byte 0 of each 32-bit pixel */
+DECLARE_ASM_CONST(8, uint64_t, mask32g)      = 0x0000FF000000FF00ULL; /* byte 1 of each 32-bit pixel */
+DECLARE_ASM_CONST(8, uint64_t, mask32r)      = 0x00FF000000FF0000ULL; /* byte 2 of each 32-bit pixel */
+DECLARE_ASM_CONST(8, uint64_t, mask32a)      = 0xFF000000FF000000ULL; /* byte 3 of each 32-bit pixel */
+DECLARE_ASM_CONST(8, uint64_t, mask32)       = 0x00FFFFFF00FFFFFFULL; /* bytes 0-2 of each 32-bit pixel */
+DECLARE_ASM_CONST(8, uint64_t, mask3216br)   = 0x00F800F800F800F8ULL; /* top 5 bits of bytes 0 and 2 of each 32-bit pixel */
+DECLARE_ASM_CONST(8, uint64_t, mask3216g)    = 0x0000FC000000FC00ULL; /* top 6 bits of byte 1 of each 32-bit pixel */
+DECLARE_ASM_CONST(8, uint64_t, mask3215g)    = 0x0000F8000000F800ULL; /* top 5 bits of byte 1 of each 32-bit pixel */
+DECLARE_ASM_CONST(8, uint64_t, mul3216)      = 0x2000000420000004ULL; /* pmaddwd factors per pixel: 4 for the low word, 0x2000 for the high word */
+DECLARE_ASM_CONST(8, uint64_t, mul3215)      = 0x2000000820000008ULL; /* pmaddwd factors per pixel: 8 for the low word, 0x2000 for the high word */
+DECLARE_ASM_CONST(8, uint64_t, mask24b)      = 0x00FF0000FF0000FFULL;
+DECLARE_ASM_CONST(8, uint64_t, mask24g)      = 0xFF0000FF0000FF00ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask24r)      = 0x0000FF0000FF0000ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask24l)      = 0x0000000000FFFFFFULL; /* low 3 bytes */
+DECLARE_ASM_CONST(8, uint64_t, mask24h)      = 0x0000FFFFFF000000ULL; /* bytes 3-5 */
+DECLARE_ASM_CONST(8, uint64_t, mask24hh)     = 0xffff000000000000ULL; /* top 2 bytes */
+DECLARE_ASM_CONST(8, uint64_t, mask24hhh)    = 0xffffffff00000000ULL; /* top 4 bytes */
+DECLARE_ASM_CONST(8, uint64_t, mask24hhhh)   = 0xffffffffffff0000ULL; /* top 6 bytes */
+DECLARE_ASM_CONST(8, uint64_t, mask15b)      = 0x001F001F001F001FULL; /* 00000000 00011111  xxB */
+DECLARE_ASM_CONST(8, uint64_t, mask15rg)     = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000  RGx */
+DECLARE_ASM_CONST(8, uint64_t, mask15s)      = 0xFFE0FFE0FFE0FFE0ULL; /* everything above the low 5 bits of each 16-bit word */
+DECLARE_ASM_CONST(8, uint64_t, mask15g)      = 0x03E003E003E003E0ULL; /* bits 5-9 of each 16-bit word */
+DECLARE_ASM_CONST(8, uint64_t, mask15r)      = 0x7C007C007C007C00ULL; /* bits 10-14 of each 16-bit word */
+#define mask16b mask15b /* the low 5 bits are selected identically for both 16-bit layouts */
+DECLARE_ASM_CONST(8, uint64_t, mask16g)      = 0x07E007E007E007E0ULL; /* bits 5-10 of each 16-bit word */
+DECLARE_ASM_CONST(8, uint64_t, mask16r)      = 0xF800F800F800F800ULL; /* bits 11-15 of each 16-bit word */
+DECLARE_ASM_CONST(8, uint64_t, red_16mask)   = 0x0000f8000000f800ULL; /* bits 11-15 of each 32-bit lane */
+DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL; /* bits 5-10 of each 32-bit lane */
+DECLARE_ASM_CONST(8, uint64_t, blue_16mask)  = 0x0000001f0000001fULL; /* bits 0-4 of each 32-bit lane */
+DECLARE_ASM_CONST(8, uint64_t, red_15mask)   = 0x00007c0000007c00ULL; /* bits 10-14 of each 32-bit lane */
+DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL; /* bits 5-9 of each 32-bit lane */
+DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL; /* bits 0-4 of each 32-bit lane */
+DECLARE_ASM_CONST(8, uint64_t, mul15_mid)    = 0x4200420042004200ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul15_hi)     = 0x0210021002100210ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul16_mid)    = 0x2080208020802080ULL;
+
+#define RGB2YUV_SHIFT 8 /* fixed-point precision: each coefficient below is scaled by 1<<RGB2YUV_SHIFT */
+#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5)) /* +0.5 before the (int) cast; the cast truncates toward zero */
+#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
+#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
+#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
+#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
+#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
+#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
+#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
+#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
+
+// Note: We have C, MMX, MMXEXT, SSE2 and 3DNOW versions; there is no 3DNOW + MMXEXT one.
+
+#define COMPILE_TEMPLATE_MMXEXT 0
+#define COMPILE_TEMPLATE_AMD3DNOW 0
+#define COMPILE_TEMPLATE_SSE2 0
+
+//MMX versions
+#undef RENAME
+#define RENAME(a) a ## _MMX /* every template function gets this suffix */
+#include "rgb2rgb_template.c"
+
+// MMXEXT versions
+#undef RENAME
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 1
+#define RENAME(a) a ## _MMXEXT
+#include "rgb2rgb_template.c"
+
+//SSE2 versions
+#undef RENAME
+#undef COMPILE_TEMPLATE_SSE2
+#define COMPILE_TEMPLATE_SSE2 1 /* MMXEXT stays enabled from the previous section */
+#define RENAME(a) a ## _SSE2
+#include "rgb2rgb_template.c"
+
+//3DNOW versions
+#undef RENAME
+#undef COMPILE_TEMPLATE_MMXEXT
+#undef COMPILE_TEMPLATE_SSE2
+#undef COMPILE_TEMPLATE_AMD3DNOW
+#define COMPILE_TEMPLATE_MMXEXT 0 /* reset: the 3DNOW build enables neither MMXEXT nor SSE2 */
+#define COMPILE_TEMPLATE_SSE2 0
+#define COMPILE_TEMPLATE_AMD3DNOW 1
+#define RENAME(a) a ## _3DNOW
+#include "rgb2rgb_template.c"
+
+/*
+ RGB15->RGB16 original by Strepto/Astral
+ ported to gcc & bugfixed : A'rpi
+ MMXEXT, 3DNOW optimization by Nick Kurshev
+ 32-bit C version, and and&add trick by Michael Niedermayer
+*/
+
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void rgb2rgb_init_x86(void)
+{
+#if HAVE_INLINE_ASM
+    /* Run the init routine of every variant the CPU reports; the call order below is kept as MMX, 3DNOW, MMXEXT, SSE2. */
+    const int flags = av_get_cpu_flags();
+    if (INLINE_MMX(flags))
+        rgb2rgb_init_MMX();
+    if (INLINE_AMD3DNOW(flags))
+        rgb2rgb_init_3DNOW();
+    if (INLINE_MMXEXT(flags))
+        rgb2rgb_init_MMXEXT();
+    if (INLINE_SSE2(flags))
+        rgb2rgb_init_SSE2();
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/ffmpeg/libswscale/x86/rgb2rgb_template.c b/ffmpeg/libswscale/x86/rgb2rgb_template.c
new file mode 100644
index 0000000..d802ab4
--- /dev/null
+++ b/ffmpeg/libswscale/x86/rgb2rgb_template.c
@@ -0,0 +1,2498 @@
+/*
+ * software RGB to RGB converter
+ * pluralize by software PAL8 to RGB converter
+ *              software YUV to YUV converter
+ *              software YUV to RGB converter
+ * Written by Nick Kurshev.
+ * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
+ * lot of big-endian byte order fixes by Alex Beregszaszi
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+
+#undef PREFETCH /* this template is included once per CPU variant, so drop the previous variant's definitions first */
+#undef MOVNTQ
+#undef EMMS
+#undef SFENCE
+#undef PAVGB
+
+#if COMPILE_TEMPLATE_AMD3DNOW
+#define PREFETCH  "prefetch"
+#define PAVGB     "pavgusb"
+#elif COMPILE_TEMPLATE_MMXEXT
+#define PREFETCH "prefetchnta"
+#define PAVGB     "pavgb"
+#else
+#define PREFETCH  " # nop" /* the string starts with '#', so the assembler sees a comment instead of an instruction; PAVGB stays undefined in this branch */
+#endif
+
+#if COMPILE_TEMPLATE_AMD3DNOW
+/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
+#define EMMS     "femms"
+#else
+#define EMMS     "emms"
+#endif
+
+#if COMPILE_TEMPLATE_MMXEXT
+#define MOVNTQ "movntq" /* non-temporal store, paired with the sfence issued after each conversion loop */
+#define SFENCE "sfence"
+#else
+#define MOVNTQ "movq" /* ordinary store; the fence string becomes an assembler comment */
+#define SFENCE " # nop"
+#endif
+
+#if !COMPILE_TEMPLATE_SSE2
+
+#if !COMPILE_TEMPLATE_AMD3DNOW
+/* Expand packed 3-byte pixels to 4-byte pixels: bytes 0-2 are copied in order, byte 3 is set to 0xFF. src_size is in bytes. */
+static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+    uint8_t *dest = dst;
+    const uint8_t *s = src;
+    const uint8_t *end;
+    const uint8_t *mm_end;
+    end = s + src_size;
+    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
+    mm_end = end - 24; /* the last 4-byte load below covers s[21..24], so 25 readable bytes are needed (end - 23 over-read by one) */
+    __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory"); /* mm7 = 0xFF in byte 3 of both pixels */
+    while (s < mm_end) {
+        __asm__ volatile(
+            PREFETCH"  32(%1)           \n\t"
+            "movd        (%1), %%mm0    \n\t" /* pixel 0 (plus one stray byte in byte 3) */
+            "punpckldq  3(%1), %%mm0    \n\t" /* pixel 1 into the high dword */
+            "movd       6(%1), %%mm1    \n\t"
+            "punpckldq  9(%1), %%mm1    \n\t"
+            "movd      12(%1), %%mm2    \n\t"
+            "punpckldq 15(%1), %%mm2    \n\t"
+            "movd      18(%1), %%mm3    \n\t"
+            "punpckldq 21(%1), %%mm3    \n\t" /* reads s[21..24] */
+            "por        %%mm7, %%mm0    \n\t" /* force byte 3 of every pixel to 0xFF, hiding the stray byte */
+            "por        %%mm7, %%mm1    \n\t"
+            "por        %%mm7, %%mm2    \n\t"
+            "por        %%mm7, %%mm3    \n\t"
+            MOVNTQ"     %%mm0,   (%0)   \n\t"
+            MOVNTQ"     %%mm1,  8(%0)   \n\t"
+            MOVNTQ"     %%mm2, 16(%0)   \n\t"
+            MOVNTQ"     %%mm3, 24(%0)"
+            :: "r"(dest), "r"(s)
+            :"memory");
+        dest += 32; /* 8 pixels written */
+        s += 24;    /* 8 pixels consumed */
+    }
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
+    while (s < end) { /* scalar tail, also handles the final group the vector loop must not touch */
+        *dest++ = *s++;
+        *dest++ = *s++;
+        *dest++ = *s++;
+        *dest++ = 255;
+    }
+}
+/* In: mm0/mm1/mm4/mm5 hold two 32-bit pixels each, mm2/mm3/mm6/mm7 hold copies of them. Drops byte 3 of each pixel and stores 24 bytes at (%0). */
+#define STORE_BGR24_MMX \
+            "psrlq         $8, %%mm2    \n\t" /* copies: move the second pixel down over byte 3 of the first */ \
+            "psrlq         $8, %%mm3    \n\t" \
+            "psrlq         $8, %%mm6    \n\t" \
+            "psrlq         $8, %%mm7    \n\t" \
+            "pand "MANGLE(mask24l)", %%mm0\n\t" /* originals: keep bytes 0-2 (first pixel) */ \
+            "pand "MANGLE(mask24l)", %%mm1\n\t" \
+            "pand "MANGLE(mask24l)", %%mm4\n\t" \
+            "pand "MANGLE(mask24l)", %%mm5\n\t" \
+            "pand "MANGLE(mask24h)", %%mm2\n\t" /* copies: keep bytes 3-5 (second pixel after the shift) */ \
+            "pand "MANGLE(mask24h)", %%mm3\n\t" \
+            "pand "MANGLE(mask24h)", %%mm6\n\t" \
+            "pand "MANGLE(mask24h)", %%mm7\n\t" \
+            "por        %%mm2, %%mm0    \n\t" /* each of mm0/mm1/mm4/mm5 now has 6 packed bytes in its low 48 bits */ \
+            "por        %%mm3, %%mm1    \n\t" \
+            "por        %%mm6, %%mm4    \n\t" \
+            "por        %%mm7, %%mm5    \n\t" \
+ \
+            "movq       %%mm1, %%mm2    \n\t" \
+            "movq       %%mm4, %%mm3    \n\t" \
+            "psllq        $48, %%mm2    \n\t" \
+            "psllq        $32, %%mm3    \n\t" \
+            "por        %%mm2, %%mm0    \n\t" /* qword 0 = 6 bytes of mm0 + first 2 bytes of mm1 */ \
+            "psrlq        $16, %%mm1    \n\t" \
+            "psrlq        $32, %%mm4    \n\t" \
+            "psllq        $16, %%mm5    \n\t" \
+            "por        %%mm3, %%mm1    \n\t" /* qword 1 = last 4 bytes of mm1 + first 4 bytes of mm4 */ \
+            "por        %%mm5, %%mm4    \n\t" /* qword 2 = last 2 bytes of mm4 + 6 bytes of mm5 */ \
+ \
+            MOVNTQ"     %%mm0,   (%0)    \n\t" \
+            MOVNTQ"     %%mm1,  8(%0)    \n\t" \
+            MOVNTQ"     %%mm4, 16(%0)"
+
+/* Pack 4-byte pixels to 3-byte pixels: bytes 0-2 are copied in order, byte 3 is dropped. src_size is in bytes. */
+static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+    uint8_t *dest = dst;
+    const uint8_t *s = src;
+    const uint8_t *end;
+    const uint8_t *mm_end;
+    end = s + src_size;
+    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
+    mm_end = end - 31; /* the loop runs only while at least 32 source bytes remain */
+    while (s < mm_end) {
+        __asm__ volatile(
+            PREFETCH"  32(%1)           \n\t"
+            "movq        (%1), %%mm0    \n\t" /* 8 pixels in mm0, mm1, mm4, mm5 */
+            "movq       8(%1), %%mm1    \n\t"
+            "movq      16(%1), %%mm4    \n\t"
+            "movq      24(%1), %%mm5    \n\t"
+            "movq       %%mm0, %%mm2    \n\t" /* copies for STORE_BGR24_MMX */
+            "movq       %%mm1, %%mm3    \n\t"
+            "movq       %%mm4, %%mm6    \n\t"
+            "movq       %%mm5, %%mm7    \n\t"
+            STORE_BGR24_MMX
+            :: "r"(dest), "r"(s)
+            :"memory");
+        dest += 24; /* 8 pixels written */
+        s += 32;    /* 8 pixels consumed */
+    }
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
+    while (s < end) { /* scalar tail: copy 3 bytes, skip the 4th */
+        *dest++ = *s++;
+        *dest++ = *s++;
+        *dest++ = *s++;
+        s++;
+    }
+}
+
+/* RGB15 -> RGB16: each 16-bit pixel x becomes (x & 0x7FFF) + (x & 0x7FE0), i.e. bits 5-14 move up by one and the low 5 bits stay.
+ original by Strepto/Astral
+ ported to gcc & bugfixed: A'rpi
+ MMXEXT, 3DNOW optimization by Nick Kurshev
+ 32-bit C version, and and&add trick by Michael Niedermayer
+*/
+static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+    register const uint8_t* s=src;
+    register uint8_t* d=dst;
+    register const uint8_t *end;
+    const uint8_t *mm_end;
+    end = s + src_size;
+    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
+    __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s)); /* mm4 = 0xFFE0 in every word */
+    mm_end = end - 15; /* the loop runs only while at least 16 source bytes remain */
+    while (s<mm_end) {
+        __asm__ volatile(
+            PREFETCH" 32(%1)        \n\t"
+            "movq      (%1), %%mm0  \n\t"
+            "movq     8(%1), %%mm2  \n\t"
+            "movq     %%mm0, %%mm1  \n\t"
+            "movq     %%mm2, %%mm3  \n\t"
+            "pand     %%mm4, %%mm0  \n\t" /* x & 0xFFE0 */
+            "pand     %%mm4, %%mm2  \n\t"
+            "paddw    %%mm1, %%mm0  \n\t" /* x + (x & 0xFFE0), wrapping per 16-bit word */
+            "paddw    %%mm3, %%mm2  \n\t"
+            MOVNTQ"   %%mm0,  (%0)  \n\t"
+            MOVNTQ"   %%mm2, 8(%0)"
+            :: "r"(d), "r"(s)
+            : "memory"); /* the asm reads *s and writes *d through plain register operands, so the compiler must be told */
+        d+=16;
+        s+=16;
+    }
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
+    mm_end = end - 3;
+    while (s < mm_end) { /* two pixels at a time */
+        register unsigned x= *((const uint32_t *)s);
+        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
+        d+=4;
+        s+=4;
+    }
+    if (s < end) { /* one leftover pixel */
+        register unsigned short x= *((const uint16_t *)s);
+        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
+    }
+}
+/* RGB16 -> RGB15: each 16-bit pixel x becomes ((x >> 1) & 0x7FE0) | (x & 0x001F). src_size is in bytes. */
+static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+    register const uint8_t* s=src;
+    register uint8_t* d=dst;
+    register const uint8_t *end;
+    const uint8_t *mm_end;
+    end = s + src_size;
+    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
+    __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg)); /* mm7 = 0x7FE0 in every word */
+    __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));  /* mm6 = 0x001F in every word */
+    mm_end = end - 15; /* the loop runs only while at least 16 source bytes remain */
+    while (s<mm_end) {
+        __asm__ volatile(
+            PREFETCH" 32(%1)        \n\t"
+            "movq      (%1), %%mm0  \n\t"
+            "movq     8(%1), %%mm2  \n\t"
+            "movq     %%mm0, %%mm1  \n\t"
+            "movq     %%mm2, %%mm3  \n\t"
+            "psrlq       $1, %%mm0  \n\t" /* 64-bit shift; bits crossing word boundaries are removed by the mask below */
+            "psrlq       $1, %%mm2  \n\t"
+            "pand     %%mm7, %%mm0  \n\t" /* (x >> 1) & 0x7FE0 */
+            "pand     %%mm7, %%mm2  \n\t"
+            "pand     %%mm6, %%mm1  \n\t" /* x & 0x001F */
+            "pand     %%mm6, %%mm3  \n\t"
+            "por      %%mm1, %%mm0  \n\t"
+            "por      %%mm3, %%mm2  \n\t"
+            MOVNTQ"   %%mm0,  (%0)  \n\t"
+            MOVNTQ"   %%mm2, 8(%0)"
+            :: "r"(d), "r"(s)
+            : "memory"); /* the asm reads *s and writes *d through plain register operands, so the compiler must be told */
+        d+=16;
+        s+=16;
+    }
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
+    mm_end = end - 3;
+    while (s < mm_end) { /* two pixels at a time */
+        register uint32_t x= *((const uint32_t*)s);
+        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
+        s+=4;
+        d+=4;
+    }
+    if (s < end) { /* one leftover pixel */
+        register uint16_t x= *((const uint16_t*)s);
+        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
+    }
+}
+/* 32-bit -> 16-bit pixels: d = (byte0 >> 3) + ((byte1 & 0xFC) << 3) + ((byte2 & 0xF8) << 8), byte 3 is ignored. src_size is in bytes. */
+static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+    const uint8_t *s = src;
+    const uint8_t *end;
+    const uint8_t *mm_end;
+    uint16_t *d = (uint16_t *)dst;
+    end = s + src_size;
+    mm_end = end - 15; /* the asm loop runs only while at least 16 source bytes (4 pixels) remain */
+    __asm__ volatile(
+        "movq           %3, %%mm5   \n\t" /* mm5 = mask3216g */
+        "movq           %4, %%mm6   \n\t" /* mm6 = mask3216br */
+        "movq           %5, %%mm7   \n\t" /* mm7 = mul3216 */
+        "jmp 2f                     \n\t" /* test the loop condition before the first iteration */
+        ".p2align        4          \n\t"
+        "1:                         \n\t"
+        PREFETCH"   32(%1)          \n\t"
+        "movd         (%1), %%mm0   \n\t" /* mm0 = pixels 0 and 2 */
+        "movd        4(%1), %%mm3   \n\t" /* mm3 = pixels 1 and 3 */
+        "punpckldq   8(%1), %%mm0   \n\t"
+        "punpckldq  12(%1), %%mm3   \n\t"
+        "movq        %%mm0, %%mm1   \n\t"
+        "movq        %%mm3, %%mm4   \n\t"
+        "pand        %%mm6, %%mm0   \n\t" /* top 5 bits of bytes 0 and 2 */
+        "pand        %%mm6, %%mm3   \n\t"
+        "pmaddwd     %%mm7, %%mm0   \n\t" /* byte0 field * 4 + byte2 field * 0x2000, one dword per pixel */
+        "pmaddwd     %%mm7, %%mm3   \n\t"
+        "pand        %%mm5, %%mm1   \n\t" /* top 6 bits of byte 1 */
+        "pand        %%mm5, %%mm4   \n\t"
+        "por         %%mm1, %%mm0   \n\t"
+        "por         %%mm4, %%mm3   \n\t"
+        "psrld          $5, %%mm0   \n\t" /* pixels 0/2 land in the low word of each dword */
+        "pslld         $11, %%mm3   \n\t" /* pixels 1/3 land in the high word of each dword */
+        "por         %%mm3, %%mm0   \n\t"
+        MOVNTQ"      %%mm0, (%0)    \n\t"
+        "add           $16,  %1     \n\t"
+        "add            $8,  %0     \n\t"
+        "2:                         \n\t"
+        "cmp            %2,  %1     \n\t"
+        " jb            1b          \n\t"
+        : "+r" (d), "+r"(s)
+        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
+        : "memory"); /* the loop reads *s and writes *d, neither of which is listed as a memory operand */
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
+    while (s < end) { /* scalar tail, one pixel per iteration */
+        register int rgb = *(const uint32_t*)s; s += 4;
+        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
+    }
+}
+/* 32-bit -> 16-bit pixels: d = ((byte0 & 0xF8) << 8) + ((byte1 & 0xFC) << 3) + (byte2 >> 3), byte 3 is ignored. src_size is in bytes. */
+static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+    const uint8_t *s = src;
+    const uint8_t *end;
+    const uint8_t *mm_end;
+    uint16_t *d = (uint16_t *)dst;
+    end = s + src_size;
+    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
+    __asm__ volatile(
+        "movq          %0, %%mm7    \n\t" /* mm7 = red_16mask */
+        "movq          %1, %%mm6    \n\t" /* mm6 = green_16mask */
+        ::"m"(red_16mask),"m"(green_16mask));
+    mm_end = end - 15; /* the loop runs only while at least 16 source bytes (4 pixels) remain */
+    while (s < mm_end) {
+        __asm__ volatile(
+            PREFETCH"  32(%1)           \n\t"
+            "movd        (%1), %%mm0    \n\t" /* mm0 = pixels 0 and 2 */
+            "movd       4(%1), %%mm3    \n\t" /* mm3 = pixels 1 and 3 */
+            "punpckldq  8(%1), %%mm0    \n\t"
+            "punpckldq 12(%1), %%mm3    \n\t"
+            "movq       %%mm0, %%mm1    \n\t"
+            "movq       %%mm0, %%mm2    \n\t"
+            "movq       %%mm3, %%mm4    \n\t"
+            "movq       %%mm3, %%mm5    \n\t"
+            "psllq         $8, %%mm0    \n\t" /* byte 0 up into bits 8-15 */
+            "psllq         $8, %%mm3    \n\t"
+            "pand       %%mm7, %%mm0    \n\t"
+            "pand       %%mm7, %%mm3    \n\t"
+            "psrlq         $5, %%mm1    \n\t" /* byte 1 down into bits 3-10 */
+            "psrlq         $5, %%mm4    \n\t"
+            "pand       %%mm6, %%mm1    \n\t"
+            "pand       %%mm6, %%mm4    \n\t"
+            "psrlq        $19, %%mm2    \n\t" /* byte 2 down so its top 5 bits sit in bits 0-4 */
+            "psrlq        $19, %%mm5    \n\t"
+            "pand          %2, %%mm2    \n\t" /* blue_16mask is applied straight from memory */
+            "pand          %2, %%mm5    \n\t"
+            "por        %%mm1, %%mm0    \n\t"
+            "por        %%mm4, %%mm3    \n\t"
+            "por        %%mm2, %%mm0    \n\t"
+            "por        %%mm5, %%mm3    \n\t"
+            "psllq        $16, %%mm3    \n\t" /* pixels 1/3 go to the high word of each dword */
+            "por        %%mm3, %%mm0    \n\t"
+            MOVNTQ"     %%mm0, (%0)     \n\t"
+            :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
+        d += 4;  /* d is uint16_t*: 4 pixels written */
+        s += 16; /* 4 pixels consumed */
+    }
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
+    while (s < end) { /* scalar tail, one pixel per iteration */
+        register int rgb = *(const uint32_t*)s; s += 4;
+        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
+    }
+}
+/* 32-bit -> 15-bit pixels: d = (byte0 >> 3) + ((byte1 & 0xF8) << 2) + ((byte2 & 0xF8) << 7), byte 3 is ignored. src_size is in bytes. */
+static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+    const uint8_t *s = src;
+    const uint8_t *end;
+    const uint8_t *mm_end;
+    uint16_t *d = (uint16_t *)dst;
+    end = s + src_size;
+    mm_end = end - 15; /* the asm loop runs only while at least 16 source bytes (4 pixels) remain */
+    __asm__ volatile(
+        "movq           %3, %%mm5   \n\t" /* mm5 = mask3215g */
+        "movq           %4, %%mm6   \n\t" /* mm6 = mask3216br */
+        "movq           %5, %%mm7   \n\t" /* mm7 = mul3215 */
+        "jmp            2f          \n\t" /* test the loop condition before the first iteration */
+        ".p2align        4          \n\t"
+        "1:                         \n\t"
+        PREFETCH"   32(%1)          \n\t"
+        "movd         (%1), %%mm0   \n\t" /* mm0 = pixels 0 and 2 */
+        "movd        4(%1), %%mm3   \n\t" /* mm3 = pixels 1 and 3 */
+        "punpckldq   8(%1), %%mm0   \n\t"
+        "punpckldq  12(%1), %%mm3   \n\t"
+        "movq        %%mm0, %%mm1   \n\t"
+        "movq        %%mm3, %%mm4   \n\t"
+        "pand        %%mm6, %%mm0   \n\t" /* top 5 bits of bytes 0 and 2 */
+        "pand        %%mm6, %%mm3   \n\t"
+        "pmaddwd     %%mm7, %%mm0   \n\t" /* byte0 field * 8 + byte2 field * 0x2000, one dword per pixel */
+        "pmaddwd     %%mm7, %%mm3   \n\t"
+        "pand        %%mm5, %%mm1   \n\t" /* top 5 bits of byte 1 */
+        "pand        %%mm5, %%mm4   \n\t"
+        "por         %%mm1, %%mm0   \n\t"
+        "por         %%mm4, %%mm3   \n\t"
+        "psrld          $6, %%mm0   \n\t" /* pixels 0/2 land in the low word of each dword */
+        "pslld         $10, %%mm3   \n\t" /* pixels 1/3 land in the high word of each dword */
+        "por         %%mm3, %%mm0   \n\t"
+        MOVNTQ"      %%mm0, (%0)    \n\t"
+        "add           $16,  %1     \n\t"
+        "add            $8,  %0     \n\t"
+        "2:                         \n\t"
+        "cmp            %2,  %1     \n\t"
+        " jb            1b          \n\t"
+        : "+r" (d), "+r"(s)
+        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
+        : "memory"); /* the loop reads *s and writes *d, neither of which is listed as a memory operand */
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
+    while (s < end) { /* scalar tail, one pixel per iteration */
+        register int rgb = *(const uint32_t*)s; s += 4;
+        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
+    }
+}
+/* 32-bit -> 15-bit pixels: d = ((byte0 & 0xF8) << 7) + ((byte1 & 0xF8) << 2) + (byte2 >> 3), byte 3 is ignored. src_size is in bytes. */
+static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+    const uint8_t *s = src;
+    const uint8_t *end;
+    const uint8_t *mm_end;
+    uint16_t *d = (uint16_t *)dst;
+    end = s + src_size;
+    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
+    __asm__ volatile(
+        "movq          %0, %%mm7    \n\t" /* mm7 = red_15mask */
+        "movq          %1, %%mm6    \n\t" /* mm6 = green_15mask */
+        ::"m"(red_15mask),"m"(green_15mask));
+    mm_end = end - 15; /* the loop runs only while at least 16 source bytes (4 pixels) remain */
+    while (s < mm_end) {
+        __asm__ volatile(
+            PREFETCH"  32(%1)           \n\t"
+            "movd        (%1), %%mm0    \n\t" /* mm0 = pixels 0 and 2 */
+            "movd       4(%1), %%mm3    \n\t" /* mm3 = pixels 1 and 3 */
+            "punpckldq  8(%1), %%mm0    \n\t"
+            "punpckldq 12(%1), %%mm3    \n\t"
+            "movq       %%mm0, %%mm1    \n\t"
+            "movq       %%mm0, %%mm2    \n\t"
+            "movq       %%mm3, %%mm4    \n\t"
+            "movq       %%mm3, %%mm5    \n\t"
+            "psllq         $7, %%mm0    \n\t" /* byte 0 up into bits 7-14 */
+            "psllq         $7, %%mm3    \n\t"
+            "pand       %%mm7, %%mm0    \n\t"
+            "pand       %%mm7, %%mm3    \n\t"
+            "psrlq         $6, %%mm1    \n\t" /* byte 1 down into bits 2-9 */
+            "psrlq         $6, %%mm4    \n\t"
+            "pand       %%mm6, %%mm1    \n\t"
+            "pand       %%mm6, %%mm4    \n\t"
+            "psrlq        $19, %%mm2    \n\t" /* byte 2 down so its top 5 bits sit in bits 0-4 */
+            "psrlq        $19, %%mm5    \n\t"
+            "pand          %2, %%mm2    \n\t" /* blue_15mask is applied straight from memory */
+            "pand          %2, %%mm5    \n\t"
+            "por        %%mm1, %%mm0    \n\t"
+            "por        %%mm4, %%mm3    \n\t"
+            "por        %%mm2, %%mm0    \n\t"
+            "por        %%mm5, %%mm3    \n\t"
+            "psllq        $16, %%mm3    \n\t" /* pixels 1/3 go to the high word of each dword */
+            "por        %%mm3, %%mm0    \n\t"
+            MOVNTQ"     %%mm0, (%0)     \n\t"
+            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
+        d += 4;  /* d is uint16_t*: 4 pixels written */
+        s += 16; /* 4 pixels consumed */
+    }
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
+    while (s < end) { /* scalar tail, one pixel per iteration */
+        register int rgb = *(const uint32_t*)s; s += 4;
+        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
+    }
+}
+/* 24-bit -> 16-bit pixels: d = (byte0 >> 3) | ((byte1 & 0xFC) << 3) | ((byte2 & 0xF8) << 8). src_size is in bytes. */
+static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+    const uint8_t *s = src;
+    const uint8_t *end;
+    const uint8_t *mm_end;
+    uint16_t *d = (uint16_t *)dst;
+    end = s + src_size;
+    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
+    __asm__ volatile(
+        "movq         %0, %%mm7     \n\t" /* mm7 = red_16mask */
+        "movq         %1, %%mm6     \n\t" /* mm6 = green_16mask */
+        ::"m"(red_16mask),"m"(green_16mask));
+    mm_end = end - 12; /* the last 4-byte load below covers s[9..12], so 13 readable bytes are needed (end - 11 over-read by one) */
+    while (s < mm_end) {
+        __asm__ volatile(
+            PREFETCH"  32(%1)           \n\t"
+            "movd        (%1), %%mm0    \n\t" /* mm0 = pixels 0 and 2 (each with one stray byte on top) */
+            "movd       3(%1), %%mm3    \n\t" /* mm3 = pixels 1 and 3 */
+            "punpckldq  6(%1), %%mm0    \n\t"
+            "punpckldq  9(%1), %%mm3    \n\t" /* reads s[9..12] */
+            "movq       %%mm0, %%mm1    \n\t"
+            "movq       %%mm0, %%mm2    \n\t"
+            "movq       %%mm3, %%mm4    \n\t"
+            "movq       %%mm3, %%mm5    \n\t"
+            "psrlq         $3, %%mm0    \n\t" /* top 5 bits of byte 0 into bits 0-4 */
+            "psrlq         $3, %%mm3    \n\t"
+            "pand          %2, %%mm0    \n\t" /* blue_16mask is applied straight from memory */
+            "pand          %2, %%mm3    \n\t"
+            "psrlq         $5, %%mm1    \n\t" /* byte 1 down into bits 3-10 */
+            "psrlq         $5, %%mm4    \n\t"
+            "pand       %%mm6, %%mm1    \n\t"
+            "pand       %%mm6, %%mm4    \n\t"
+            "psrlq         $8, %%mm2    \n\t" /* byte 2 down into bits 8-15 */
+            "psrlq         $8, %%mm5    \n\t"
+            "pand       %%mm7, %%mm2    \n\t"
+            "pand       %%mm7, %%mm5    \n\t"
+            "por        %%mm1, %%mm0    \n\t"
+            "por        %%mm4, %%mm3    \n\t"
+            "por        %%mm2, %%mm0    \n\t"
+            "por        %%mm5, %%mm3    \n\t"
+            "psllq        $16, %%mm3    \n\t" /* pixels 1/3 go to the high word of each dword */
+            "por        %%mm3, %%mm0    \n\t"
+            MOVNTQ"     %%mm0, (%0)     \n\t"
+            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
+        d += 4;  /* d is uint16_t*: 4 pixels written */
+        s += 12; /* 4 pixels consumed */
+    }
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
+    while (s < end) { /* scalar tail, also handles the final group the vector loop must not touch */
+        const int b = *s++;
+        const int g = *s++;
+        const int r = *s++;
+        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
+    }
+}
+
+/**
+ * Pack 3-byte pixels into 16-bit 5:6:5 words, first source byte on top.
+ *
+ * Per pixel the first byte supplies output bits 11-15, the second byte
+ * bits 5-10 and the third byte bits 0-4 (see the scalar tail loop).
+ *
+ * @param src      packed input, 3 bytes per pixel
+ * @param dst      output, written as one uint16_t per pixel
+ * @param src_size number of input bytes
+ */
+static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+    const uint8_t *s = src;
+    const uint8_t *end;
+    const uint8_t *mm_end;
+    uint16_t *d = (uint16_t *)dst;
+    end = s + src_size;
+    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
+    /* mm7 = red_16mask, mm6 = green_16mask; both stay live for the whole loop. */
+    __asm__ volatile(
+        "movq         %0, %%mm7     \n\t"
+        "movq         %1, %%mm6     \n\t"
+        ::"m"(red_16mask),"m"(green_16mask));
+    /* The MMX loop is entered only while at least 16 input bytes remain. */
+    mm_end = end - 15;
+    while (s < mm_end) {
+        /* 4 pixels per iteration: 12 input bytes in, one 8-byte store out. */
+        __asm__ volatile(
+            PREFETCH"  32(%1)           \n\t"
+            "movd        (%1), %%mm0    \n\t" // pixel 0 -> low dword of mm0
+            "movd       3(%1), %%mm3    \n\t" // pixel 1 -> low dword of mm3
+            "punpckldq  6(%1), %%mm0    \n\t" // pixel 2 -> high dword of mm0
+            "punpckldq  9(%1), %%mm3    \n\t" // pixel 3 -> high dword of mm3
+            "movq       %%mm0, %%mm1    \n\t"
+            "movq       %%mm0, %%mm2    \n\t"
+            "movq       %%mm3, %%mm4    \n\t"
+            "movq       %%mm3, %%mm5    \n\t"
+            "psllq         $8, %%mm0    \n\t" // first byte -> top field
+            "psllq         $8, %%mm3    \n\t"
+            "pand       %%mm7, %%mm0    \n\t"
+            "pand       %%mm7, %%mm3    \n\t"
+            "psrlq         $5, %%mm1    \n\t" // second byte -> middle field
+            "psrlq         $5, %%mm4    \n\t"
+            "pand       %%mm6, %%mm1    \n\t"
+            "pand       %%mm6, %%mm4    \n\t"
+            "psrlq        $19, %%mm2    \n\t" // third byte -> low field
+            "psrlq        $19, %%mm5    \n\t"
+            "pand          %2, %%mm2    \n\t"
+            "pand          %2, %%mm5    \n\t"
+            "por        %%mm1, %%mm0    \n\t"
+            "por        %%mm4, %%mm3    \n\t"
+            "por        %%mm2, %%mm0    \n\t"
+            "por        %%mm5, %%mm3    \n\t"
+            "psllq        $16, %%mm3    \n\t" // slot pixels 1 and 3 between pixels 0 and 2
+            "por        %%mm3, %%mm0    \n\t"
+            MOVNTQ"     %%mm0, (%0)     \n\t"
+            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
+        d += 4;
+        s += 12;
+    }
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
+    /* Scalar tail: whatever the MMX loop left over, one pixel at a time. */
+    while (s < end) {
+        const int r = *s++;
+        const int g = *s++;
+        const int b = *s++;
+        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
+    }
+}
+
+/**
+ * Pack 3-byte pixels into 15-bit 5:5:5 words, first source byte at the bottom.
+ *
+ * Per pixel the first byte supplies output bits 0-4, the second byte
+ * bits 5-9 and the third byte bits 10-14 (see the scalar tail loop).
+ *
+ * @param src      packed input, 3 bytes per pixel
+ * @param dst      output, written as one uint16_t per pixel
+ * @param src_size number of input bytes
+ */
+static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+    const uint8_t *s = src;
+    const uint8_t *end;
+    const uint8_t *mm_end;
+    uint16_t *d = (uint16_t *)dst;
+    end = s + src_size;
+    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
+    /* mm7 = red_15mask, mm6 = green_15mask; both stay live for the whole loop. */
+    __asm__ volatile(
+        "movq          %0, %%mm7    \n\t"
+        "movq          %1, %%mm6    \n\t"
+        ::"m"(red_15mask),"m"(green_15mask));
+    /*
+     * The load at offset 9 is 4 bytes wide, so one iteration touches input
+     * bytes 0..12. Require 13 readable bytes; a bound of end - 11 would let
+     * the last iteration read one byte past the input. A final group of
+     * exactly 12 bytes is handled by the scalar tail, which computes the
+     * same packing.
+     */
+    mm_end = end - 12;
+    while (s < mm_end) {
+        /* 4 pixels per iteration: 12 input bytes in, one 8-byte store out. */
+        __asm__ volatile(
+            PREFETCH"  32(%1)           \n\t"
+            "movd        (%1), %%mm0    \n\t" // pixel 0 -> low dword of mm0
+            "movd       3(%1), %%mm3    \n\t" // pixel 1 -> low dword of mm3
+            "punpckldq  6(%1), %%mm0    \n\t" // pixel 2 -> high dword of mm0
+            "punpckldq  9(%1), %%mm3    \n\t" // pixel 3 -> high dword of mm3
+            "movq       %%mm0, %%mm1    \n\t"
+            "movq       %%mm0, %%mm2    \n\t"
+            "movq       %%mm3, %%mm4    \n\t"
+            "movq       %%mm3, %%mm5    \n\t"
+            "psrlq         $3, %%mm0    \n\t" // first byte -> low field
+            "psrlq         $3, %%mm3    \n\t"
+            "pand          %2, %%mm0    \n\t"
+            "pand          %2, %%mm3    \n\t"
+            "psrlq         $6, %%mm1    \n\t" // second byte -> middle field
+            "psrlq         $6, %%mm4    \n\t"
+            "pand       %%mm6, %%mm1    \n\t"
+            "pand       %%mm6, %%mm4    \n\t"
+            "psrlq         $9, %%mm2    \n\t" // third byte -> top field
+            "psrlq         $9, %%mm5    \n\t"
+            "pand       %%mm7, %%mm2    \n\t"
+            "pand       %%mm7, %%mm5    \n\t"
+            "por        %%mm1, %%mm0    \n\t"
+            "por        %%mm4, %%mm3    \n\t"
+            "por        %%mm2, %%mm0    \n\t"
+            "por        %%mm5, %%mm3    \n\t"
+            "psllq        $16, %%mm3    \n\t" // slot pixels 1 and 3 between pixels 0 and 2
+            "por        %%mm3, %%mm0    \n\t"
+            MOVNTQ"     %%mm0, (%0)     \n\t"
+            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
+        d += 4;
+        s += 12;
+    }
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
+    /* Scalar tail: whatever the MMX loop left over, one pixel at a time. */
+    while (s < end) {
+        const int b = *s++;
+        const int g = *s++;
+        const int r = *s++;
+        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
+    }
+}
+
+/**
+ * Pack 3-byte pixels into 15-bit 5:5:5 words, first source byte on top.
+ *
+ * Per pixel the first byte supplies output bits 10-14, the second byte
+ * bits 5-9 and the third byte bits 0-4 (see the scalar tail loop).
+ *
+ * @param src      packed input, 3 bytes per pixel
+ * @param dst      output, written as one uint16_t per pixel
+ * @param src_size number of input bytes
+ */
+static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+    const uint8_t *s = src;
+    const uint8_t *end;
+    const uint8_t *mm_end;
+    uint16_t *d = (uint16_t *)dst;
+    end = s + src_size;
+    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
+    /* mm7 = red_15mask, mm6 = green_15mask; both stay live for the whole loop. */
+    __asm__ volatile(
+        "movq         %0, %%mm7     \n\t"
+        "movq         %1, %%mm6     \n\t"
+        ::"m"(red_15mask),"m"(green_15mask));
+    /* The MMX loop is entered only while at least 16 input bytes remain. */
+    mm_end = end - 15;
+    while (s < mm_end) {
+        /* 4 pixels per iteration: 12 input bytes in, one 8-byte store out. */
+        __asm__ volatile(
+            PREFETCH" 32(%1)            \n\t"
+            "movd       (%1), %%mm0     \n\t" // pixel 0 -> low dword of mm0
+            "movd      3(%1), %%mm3     \n\t" // pixel 1 -> low dword of mm3
+            "punpckldq 6(%1), %%mm0     \n\t" // pixel 2 -> high dword of mm0
+            "punpckldq 9(%1), %%mm3     \n\t" // pixel 3 -> high dword of mm3
+            "movq      %%mm0, %%mm1     \n\t"
+            "movq      %%mm0, %%mm2     \n\t"
+            "movq      %%mm3, %%mm4     \n\t"
+            "movq      %%mm3, %%mm5     \n\t"
+            "psllq        $7, %%mm0     \n\t" // first byte -> top field
+            "psllq        $7, %%mm3     \n\t"
+            "pand      %%mm7, %%mm0     \n\t"
+            "pand      %%mm7, %%mm3     \n\t"
+            "psrlq        $6, %%mm1     \n\t" // second byte -> middle field
+            "psrlq        $6, %%mm4     \n\t"
+            "pand      %%mm6, %%mm1     \n\t"
+            "pand      %%mm6, %%mm4     \n\t"
+            "psrlq       $19, %%mm2     \n\t" // third byte -> low field
+            "psrlq       $19, %%mm5     \n\t"
+            "pand         %2, %%mm2     \n\t"
+            "pand         %2, %%mm5     \n\t"
+            "por       %%mm1, %%mm0     \n\t"
+            "por       %%mm4, %%mm3     \n\t"
+            "por       %%mm2, %%mm0     \n\t"
+            "por       %%mm5, %%mm3     \n\t"
+            "psllq       $16, %%mm3     \n\t" // slot pixels 1 and 3 between pixels 0 and 2
+            "por       %%mm3, %%mm0     \n\t"
+            MOVNTQ"    %%mm0, (%0)      \n\t"
+            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
+        d += 4;
+        s += 12;
+    }
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
+    /* Scalar tail: whatever the MMX loop left over, one pixel at a time. */
+    while (s < end) {
+        const int r = *s++;
+        const int g = *s++;
+        const int b = *s++;
+        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
+    }
+}
+
+/**
+ * Expand 15-bit pixels (three 5-bit fields in a uint16_t) to 3 bytes each.
+ *
+ * Per pixel the output bytes are, in order, derived from bits 0-4,
+ * bits 5-9 and bits 10-14 of the input word. Each 5-bit field is widened
+ * to 8 bits by repeating its top 3 bits in the low 3 bits (see the scalar
+ * tail loop).
+ *
+ * @param src      input, read as uint16_t words
+ * @param dst      output, 3 bytes per pixel
+ * @param src_size number of input bytes
+ */
+static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+    const uint16_t *end;
+    const uint16_t *mm_end;
+    uint8_t *d = dst;
+    const uint16_t *s = (const uint16_t*)src;
+    end = s + src_size/2;
+    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
+    /* The MMX loop is entered only while at least 8 input pixels remain. */
+    mm_end = end - 7;
+    while (s < mm_end) {
+        /*
+         * 8 pixels per iteration (16 input bytes, 24 output bytes).
+         * This first statement only computes; operand 0 (*d) is declared
+         * but not referenced by the template. The store happens in the
+         * second statement below.
+         */
+        __asm__ volatile(
+            PREFETCH"  32(%1)           \n\t"
+            /* pixels 0-3: isolate the three fields and widen them */
+            "movq        (%1), %%mm0    \n\t"
+            "movq        (%1), %%mm1    \n\t"
+            "movq        (%1), %%mm2    \n\t"
+            "pand          %2, %%mm0    \n\t"
+            "pand          %3, %%mm1    \n\t"
+            "pand          %4, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul15_mid)", %%mm1    \n\t"
+            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
+            "movq       %%mm0, %%mm3    \n\t"
+            "movq       %%mm1, %%mm4    \n\t"
+            "movq       %%mm2, %%mm5    \n\t"
+            /* interleave each word with %5 (mmx_null, presumably all-zero) */
+            "punpcklwd     %5, %%mm0    \n\t"
+            "punpcklwd     %5, %%mm1    \n\t"
+            "punpcklwd     %5, %%mm2    \n\t"
+            "punpckhwd     %5, %%mm3    \n\t"
+            "punpckhwd     %5, %%mm4    \n\t"
+            "punpckhwd     %5, %%mm5    \n\t"
+            /* merge the three fields into one dword per pixel */
+            "psllq         $8, %%mm1    \n\t"
+            "psllq        $16, %%mm2    \n\t"
+            "por        %%mm1, %%mm0    \n\t"
+            "por        %%mm2, %%mm0    \n\t"
+            "psllq         $8, %%mm4    \n\t"
+            "psllq        $16, %%mm5    \n\t"
+            "por        %%mm4, %%mm3    \n\t"
+            "por        %%mm5, %%mm3    \n\t"
+
+            /* park pixels 0-3 in mm6/mm7 while pixels 4-7 are computed */
+            "movq       %%mm0, %%mm6    \n\t"
+            "movq       %%mm3, %%mm7    \n\t"
+
+            /* pixels 4-7: same sequence on the second 8 input bytes */
+            "movq       8(%1), %%mm0    \n\t"
+            "movq       8(%1), %%mm1    \n\t"
+            "movq       8(%1), %%mm2    \n\t"
+            "pand          %2, %%mm0    \n\t"
+            "pand          %3, %%mm1    \n\t"
+            "pand          %4, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul15_mid)", %%mm1    \n\t"
+            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
+            "movq       %%mm0, %%mm3    \n\t"
+            "movq       %%mm1, %%mm4    \n\t"
+            "movq       %%mm2, %%mm5    \n\t"
+            "punpcklwd     %5, %%mm0    \n\t"
+            "punpcklwd     %5, %%mm1    \n\t"
+            "punpcklwd     %5, %%mm2    \n\t"
+            "punpckhwd     %5, %%mm3    \n\t"
+            "punpckhwd     %5, %%mm4    \n\t"
+            "punpckhwd     %5, %%mm5    \n\t"
+            "psllq         $8, %%mm1    \n\t"
+            "psllq        $16, %%mm2    \n\t"
+            "por        %%mm1, %%mm0    \n\t"
+            "por        %%mm2, %%mm0    \n\t"
+            "psllq         $8, %%mm4    \n\t"
+            "psllq        $16, %%mm5    \n\t"
+            "por        %%mm4, %%mm3    \n\t"
+            "por        %%mm5, %%mm3    \n\t"
+
+            :"=m"(*d)
+            :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
+            :"memory");
+        /* borrowed 32 to 24 */
+        /*
+         * This statement relies on mm0, mm3, mm6 and mm7 still holding the
+         * values left by the statement above. It rearranges them and then
+         * expands STORE_BGR24_MMX, which is defined outside this chunk.
+         */
+        __asm__ volatile(
+            "movq       %%mm0, %%mm4    \n\t"
+            "movq       %%mm3, %%mm5    \n\t"
+            "movq       %%mm6, %%mm0    \n\t"
+            "movq       %%mm7, %%mm1    \n\t"
+
+            "movq       %%mm4, %%mm6    \n\t"
+            "movq       %%mm5, %%mm7    \n\t"
+            "movq       %%mm0, %%mm2    \n\t"
+            "movq       %%mm1, %%mm3    \n\t"
+
+            STORE_BGR24_MMX
+
+            :: "r"(d), "m"(*s)
+            :"memory");
+        d += 24;
+        s += 8;
+    }
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
+    /* Scalar tail: remaining pixels, one at a time. */
+    while (s < end) {
+        register uint16_t bgr;
+        bgr = *s++;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
+    }
+}
+
+/**
+ * Expand 16-bit 5:6:5 pixels (uint16_t) to 3 bytes each.
+ *
+ * Per pixel the output bytes are, in order, derived from bits 0-4,
+ * bits 5-10 and bits 11-15 of the input word. Each field is widened to
+ * 8 bits by repeating its top bits in the vacated low bits (see the
+ * scalar tail loop).
+ *
+ * @param src      input, read as uint16_t words
+ * @param dst      output, 3 bytes per pixel
+ * @param src_size number of input bytes
+ */
+static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+    const uint16_t *end;
+    const uint16_t *mm_end;
+    uint8_t *d = (uint8_t *)dst;
+    const uint16_t *s = (const uint16_t *)src;
+    end = s + src_size/2;
+    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
+    /* The MMX loop is entered only while at least 8 input pixels remain. */
+    mm_end = end - 7;
+    while (s < mm_end) {
+        /*
+         * 8 pixels per iteration (16 input bytes, 24 output bytes).
+         * This first statement only computes; operand 0 (*d) is declared
+         * but not referenced by the template. The store happens in the
+         * second statement below.
+         */
+        __asm__ volatile(
+            PREFETCH"  32(%1)           \n\t"
+            /* pixels 0-3: isolate the three fields and widen them */
+            "movq        (%1), %%mm0    \n\t"
+            "movq        (%1), %%mm1    \n\t"
+            "movq        (%1), %%mm2    \n\t"
+            "pand          %2, %%mm0    \n\t"
+            "pand          %3, %%mm1    \n\t"
+            "pand          %4, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "psrlq         $1, %%mm2    \n\t"
+            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
+            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
+            "movq       %%mm0, %%mm3    \n\t"
+            "movq       %%mm1, %%mm4    \n\t"
+            "movq       %%mm2, %%mm5    \n\t"
+            /* interleave each word with %5 (mmx_null, presumably all-zero) */
+            "punpcklwd     %5, %%mm0    \n\t"
+            "punpcklwd     %5, %%mm1    \n\t"
+            "punpcklwd     %5, %%mm2    \n\t"
+            "punpckhwd     %5, %%mm3    \n\t"
+            "punpckhwd     %5, %%mm4    \n\t"
+            "punpckhwd     %5, %%mm5    \n\t"
+            /* merge the three fields into one dword per pixel */
+            "psllq         $8, %%mm1    \n\t"
+            "psllq        $16, %%mm2    \n\t"
+            "por        %%mm1, %%mm0    \n\t"
+            "por        %%mm2, %%mm0    \n\t"
+            "psllq         $8, %%mm4    \n\t"
+            "psllq        $16, %%mm5    \n\t"
+            "por        %%mm4, %%mm3    \n\t"
+            "por        %%mm5, %%mm3    \n\t"
+
+            /* park pixels 0-3 in mm6/mm7 while pixels 4-7 are computed */
+            "movq       %%mm0, %%mm6    \n\t"
+            "movq       %%mm3, %%mm7    \n\t"
+
+            /* pixels 4-7: same sequence on the second 8 input bytes */
+            "movq       8(%1), %%mm0    \n\t"
+            "movq       8(%1), %%mm1    \n\t"
+            "movq       8(%1), %%mm2    \n\t"
+            "pand          %2, %%mm0    \n\t"
+            "pand          %3, %%mm1    \n\t"
+            "pand          %4, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "psrlq         $1, %%mm2    \n\t"
+            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
+            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
+            "movq       %%mm0, %%mm3    \n\t"
+            "movq       %%mm1, %%mm4    \n\t"
+            "movq       %%mm2, %%mm5    \n\t"
+            "punpcklwd     %5, %%mm0    \n\t"
+            "punpcklwd     %5, %%mm1    \n\t"
+            "punpcklwd     %5, %%mm2    \n\t"
+            "punpckhwd     %5, %%mm3    \n\t"
+            "punpckhwd     %5, %%mm4    \n\t"
+            "punpckhwd     %5, %%mm5    \n\t"
+            "psllq         $8, %%mm1    \n\t"
+            "psllq        $16, %%mm2    \n\t"
+            "por        %%mm1, %%mm0    \n\t"
+            "por        %%mm2, %%mm0    \n\t"
+            "psllq         $8, %%mm4    \n\t"
+            "psllq        $16, %%mm5    \n\t"
+            "por        %%mm4, %%mm3    \n\t"
+            "por        %%mm5, %%mm3    \n\t"
+            :"=m"(*d)
+            :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
+            :"memory");
+        /* borrowed 32 to 24 */
+        /*
+         * This statement relies on mm0, mm3, mm6 and mm7 still holding the
+         * values left by the statement above. It rearranges them and then
+         * expands STORE_BGR24_MMX, which is defined outside this chunk.
+         */
+        __asm__ volatile(
+            "movq       %%mm0, %%mm4    \n\t"
+            "movq       %%mm3, %%mm5    \n\t"
+            "movq       %%mm6, %%mm0    \n\t"
+            "movq       %%mm7, %%mm1    \n\t"
+
+            "movq       %%mm4, %%mm6    \n\t"
+            "movq       %%mm5, %%mm7    \n\t"
+            "movq       %%mm0, %%mm2    \n\t"
+            "movq       %%mm1, %%mm3    \n\t"
+
+            STORE_BGR24_MMX
+
+            :: "r"(d), "m"(*s)
+            :"memory");
+        d += 24;
+        s += 8;
+    }
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
+    /* Scalar tail: remaining pixels, one at a time. */
+    while (s < end) {
+        register uint16_t bgr;
+        bgr = *s++;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
+    }
+}
+
+/*
+ * PACK_RGB32: asm fragment that interleaves the B, G and R values of four
+ * pixels with 0xFF filler bytes and stores the resulting 16 bytes at (%0)
+ * and 8(%0) using MOVNTQ. mm0-mm2 are consumed and mm3 is overwritten as
+ * scratch.
+ *
+ * Required register contents on entry (most significant byte first):
+ * mm0 = 00 B3 00 B2 00 B1 00 B0
+ * mm1 = 00 G3 00 G2 00 G1 00 G0
+ * mm2 = 00 R3 00 R2 00 R1 00 R0
+ * mm6 = FF FF FF FF FF FF FF FF
+ * mm7 = 00 00 00 00 00 00 00 00
+ */
+#define PACK_RGB32 \
+    "packuswb   %%mm7, %%mm0    \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
+    "packuswb   %%mm7, %%mm1    \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
+    "packuswb   %%mm7, %%mm2    \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
+    "punpcklbw  %%mm1, %%mm0    \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
+    "punpcklbw  %%mm6, %%mm2    \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
+    "movq       %%mm0, %%mm3    \n\t"                               \
+    "punpcklwd  %%mm2, %%mm0    \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
+    "punpckhwd  %%mm2, %%mm3    \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
+    MOVNTQ"     %%mm0,  (%0)    \n\t"                               \
+    MOVNTQ"     %%mm3, 8(%0)    \n\t"                               \
+
+/**
+ * Expand 15-bit pixels (three 5-bit fields in a uint16_t) to 4 bytes each.
+ *
+ * Per pixel the output bytes are, in order, derived from bits 0-4,
+ * bits 5-9 and bits 10-14 of the input word, followed by a constant 255.
+ * Each 5-bit field is widened to 8 bits by repeating its top 3 bits in
+ * the low 3 bits (see the scalar tail loop).
+ *
+ * @param src      input, read as uint16_t words
+ * @param dst      output, 4 bytes per pixel
+ * @param src_size number of input bytes
+ */
+static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+    const uint16_t *end;
+    const uint16_t *mm_end;
+    uint8_t *d = dst;
+    const uint16_t *s = (const uint16_t *)src;
+    end = s + src_size/2;
+    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
+    /* PACK_RGB32 expects mm7 = all zero bits and mm6 = all one bits. */
+    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
+    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
+    /* The MMX loop is entered only while at least 4 input pixels remain. */
+    mm_end = end - 3;
+    while (s < mm_end) {
+        /* 4 pixels per iteration: 8 input bytes in, 16 output bytes out. */
+        __asm__ volatile(
+            PREFETCH"  32(%1)           \n\t"
+            "movq        (%1), %%mm0    \n\t"
+            "movq        (%1), %%mm1    \n\t"
+            "movq        (%1), %%mm2    \n\t"
+            "pand          %2, %%mm0    \n\t" // isolate the low field
+            "pand          %3, %%mm1    \n\t" // isolate the middle field
+            "pand          %4, %%mm2    \n\t" // isolate the top field
+            "psllq         $5, %%mm0    \n\t"
+            "pmulhw        %5, %%mm0    \n\t"
+            "pmulhw        %5, %%mm1    \n\t"
+            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
+            PACK_RGB32
+            ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
+            :"memory");
+        d += 16;
+        s += 4;
+    }
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
+    /* Scalar tail: remaining pixels, one at a time. */
+    while (s < end) {
+        register uint16_t bgr;
+        bgr = *s++;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
+        *d++ = 255;
+    }
+}
+
+/**
+ * Expand 16-bit 5:6:5 pixels (uint16_t) to 4 bytes each.
+ *
+ * Per pixel the output bytes are, in order, derived from bits 0-4,
+ * bits 5-10 and bits 11-15 of the input word, followed by a constant 255.
+ * Each field is widened to 8 bits by repeating its top bits in the
+ * vacated low bits (see the scalar tail loop).
+ *
+ * @param src      input, read as uint16_t words
+ * @param dst      output, 4 bytes per pixel
+ * @param src_size number of input bytes
+ */
+static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+    const uint16_t *end;
+    const uint16_t *mm_end;
+    uint8_t *d = dst;
+    const uint16_t *s = (const uint16_t*)src;
+    end = s + src_size/2;
+    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
+    /* PACK_RGB32 expects mm7 = all zero bits and mm6 = all one bits. */
+    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
+    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
+    /* The MMX loop is entered only while at least 4 input pixels remain. */
+    mm_end = end - 3;
+    while (s < mm_end) {
+        /* 4 pixels per iteration: 8 input bytes in, 16 output bytes out. */
+        __asm__ volatile(
+            PREFETCH"  32(%1)           \n\t"
+            "movq        (%1), %%mm0    \n\t"
+            "movq        (%1), %%mm1    \n\t"
+            "movq        (%1), %%mm2    \n\t"
+            "pand          %2, %%mm0    \n\t" // isolate the low field
+            "pand          %3, %%mm1    \n\t" // isolate the middle field
+            "pand          %4, %%mm2    \n\t" // isolate the top field
+            "psllq         $5, %%mm0    \n\t"
+            "psrlq         $1, %%mm2    \n\t"
+            "pmulhw        %5, %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
+            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
+            PACK_RGB32
+            ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
+            :"memory");
+        d += 16;
+        s += 4;
+    }
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
+    /* Scalar tail: remaining pixels, one at a time. */
+    while (s < end) {
+        register uint16_t bgr;
+        bgr = *s++;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
+        *d++ = 255;
+    }
+}
+
+/**
+ * Swap bytes 0 and 2 of every 4-byte group; bytes 1 and 3 stay in place.
+ *
+ * The MMX loop handles 16 bytes per iteration while at least 16 bytes
+ * remain; the scalar loop then handles the rest 4 bytes at a time. The
+ * scalar loop always moves whole 32-bit words, so src_size is expected to
+ * be a multiple of 4.
+ *
+ * @param src      input bytes
+ * @param dst      output bytes, same size as the input
+ * @param src_size number of bytes to process
+ */
+static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+    /* idx counts up from 15 - src_size; s and d are biased so that
+     * s[idx] / d[idx] address the current position in src / dst. */
+    x86_reg idx = 15 - src_size;
+    const uint8_t *s = src-idx;
+    uint8_t *d = dst-idx;
+    __asm__ volatile(
+        "test          %0, %0           \n\t"
+        "jns           2f               \n\t" // fewer than 16 bytes: skip the MMX loop
+        PREFETCH"       (%1, %0)        \n\t"
+        "movq          %3, %%mm7        \n\t"
+        "pxor          %4, %%mm7        \n\t"
+        "movq       %%mm7, %%mm6        \n\t" // mm6 = mask of the bytes that swap places
+        "pxor          %5, %%mm7        \n\t" // mm7 = mask of the bytes that stay put
+        ".p2align       4               \n\t"
+        "1:                             \n\t"
+        PREFETCH"     32(%1, %0)        \n\t"
+        "movq           (%1, %0), %%mm0 \n\t"
+        "movq          8(%1, %0), %%mm1 \n\t"
+# if COMPILE_TEMPLATE_MMXEXT
+        "pshufw      $177, %%mm0, %%mm3 \n\t"
+        "pshufw      $177, %%mm1, %%mm5 \n\t"
+        "pand       %%mm7, %%mm0        \n\t"
+        "pand       %%mm6, %%mm3        \n\t"
+        "pand       %%mm7, %%mm1        \n\t"
+        "pand       %%mm6, %%mm5        \n\t"
+        "por        %%mm3, %%mm0        \n\t"
+        "por        %%mm5, %%mm1        \n\t"
+# else
+        "movq       %%mm0, %%mm2        \n\t"
+        "movq       %%mm1, %%mm4        \n\t"
+        "pand       %%mm7, %%mm0        \n\t"
+        "pand       %%mm6, %%mm2        \n\t"
+        "pand       %%mm7, %%mm1        \n\t"
+        "pand       %%mm6, %%mm4        \n\t"
+        "movq       %%mm2, %%mm3        \n\t"
+        "movq       %%mm4, %%mm5        \n\t"
+        "pslld        $16, %%mm2        \n\t"
+        "psrld        $16, %%mm3        \n\t"
+        "pslld        $16, %%mm4        \n\t"
+        "psrld        $16, %%mm5        \n\t"
+        "por        %%mm2, %%mm0        \n\t"
+        "por        %%mm4, %%mm1        \n\t"
+        "por        %%mm3, %%mm0        \n\t"
+        "por        %%mm5, %%mm1        \n\t"
+# endif
+        MOVNTQ"     %%mm0,  (%2, %0)    \n\t"
+        MOVNTQ"     %%mm1, 8(%2, %0)    \n\t"
+        "add          $16, %0           \n\t"
+        "js            1b               \n\t"
+        SFENCE"                         \n\t"
+        EMMS"                           \n\t"
+        "2:                             \n\t"
+        : "+&r"(idx)
+        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
+        : "memory");
+    /* Scalar tail, one 32-bit word at a time. */
+    for (; idx<15; idx+=4) {
+        /*
+         * Unsigned arithmetic on purpose: with a signed int, v<<16 shifts
+         * set bits out of the type, which is undefined behaviour in C.
+         * The three terms occupy disjoint bits, so the sum cannot carry.
+         */
+        uint32_t v = *(const uint32_t *)&s[idx];
+        const uint32_t g = v & 0xff00ff00;
+        v &= 0xff00ff;
+        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
+    }
+}
+
+/**
+ * Swap the first and third byte of every 3-byte pixel.
+ *
+ * The MMX loop converts 24 bytes (8 pixels) per iteration while at least
+ * 24 bytes remain; the scalar loop converts the remaining bytes.
+ *
+ * NOTE(review): the load at offset 18 is 8 bytes wide (bytes 18..25 of a
+ * 24-byte group), so the last MMX iteration can read up to 2 bytes past
+ * src + src_size. The caller presumably guarantees that padding - confirm.
+ *
+ * @param src      input, 3 bytes per pixel
+ * @param dst      output, 3 bytes per pixel
+ * @param src_size number of input bytes
+ */
+static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+    unsigned i;
+    /* Negative while at least 24 bytes remain; src/dst are biased to match. */
+    x86_reg mmx_size= 23 - src_size;
+    __asm__ volatile (
+        "test             %%"REG_a", %%"REG_a"          \n\t"
+        "jns                     2f                     \n\t" // fewer than 24 bytes: skip the MMX loop
+        "movq     "MANGLE(mask24r)", %%mm5              \n\t"
+        "movq     "MANGLE(mask24g)", %%mm6              \n\t"
+        "movq     "MANGLE(mask24b)", %%mm7              \n\t"
+        ".p2align                 4                     \n\t"
+        "1:                                             \n\t"
+        PREFETCH" 32(%1, %%"REG_a")                     \n\t"
+        "movq       (%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
+        "movq       (%1, %%"REG_a"), %%mm1              \n\t" // BGR BGR BG
+        "movq      2(%1, %%"REG_a"), %%mm2              \n\t" // R BGR BGR B
+        "psllq                  $16, %%mm0              \n\t" // 00 BGR BGR
+        "pand                 %%mm5, %%mm0              \n\t"
+        "pand                 %%mm6, %%mm1              \n\t"
+        "pand                 %%mm7, %%mm2              \n\t"
+        "por                  %%mm0, %%mm1              \n\t"
+        "por                  %%mm2, %%mm1              \n\t"
+        "movq      6(%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
+        MOVNTQ"               %%mm1,   (%2, %%"REG_a")  \n\t" // RGB RGB RG
+        "movq      8(%1, %%"REG_a"), %%mm1              \n\t" // R BGR BGR B
+        "movq     10(%1, %%"REG_a"), %%mm2              \n\t" // GR BGR BGR
+        "pand                 %%mm7, %%mm0              \n\t"
+        "pand                 %%mm5, %%mm1              \n\t"
+        "pand                 %%mm6, %%mm2              \n\t"
+        "por                  %%mm0, %%mm1              \n\t"
+        "por                  %%mm2, %%mm1              \n\t"
+        "movq     14(%1, %%"REG_a"), %%mm0              \n\t" // R BGR BGR B
+        MOVNTQ"               %%mm1,  8(%2, %%"REG_a")  \n\t" // B RGB RGB R
+        "movq     16(%1, %%"REG_a"), %%mm1              \n\t" // GR BGR BGR
+        "movq     18(%1, %%"REG_a"), %%mm2              \n\t" // BGR BGR BG
+        "pand                 %%mm6, %%mm0              \n\t"
+        "pand                 %%mm7, %%mm1              \n\t"
+        "pand                 %%mm5, %%mm2              \n\t"
+        "por                  %%mm0, %%mm1              \n\t"
+        "por                  %%mm2, %%mm1              \n\t"
+        MOVNTQ"               %%mm1, 16(%2, %%"REG_a")  \n\t"
+        "add                    $24, %%"REG_a"          \n\t"
+        " js                     1b                     \n\t"
+        "2:                                             \n\t"
+        : "+a" (mmx_size)
+        : "r" (src-mmx_size), "r"(dst-mmx_size)
+        /* The template reads src and writes dst through pointer registers
+         * only, so the compiler must be told that memory is accessed. */
+        : "memory"
+    );
+
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
+
+    if (mmx_size==23) return; //finished, was multiple of 24
+
+    /* 23 - mmx_size bytes are left; point src/dst at the start of them. */
+    src+= src_size;
+    dst+= src_size;
+    src_size= 23-mmx_size;
+    src-= src_size;
+    dst-= src_size;
+    for (i=0; i<src_size; i+=3) {
+        register uint8_t x;
+        x          = src[i + 2];
+        dst[i + 1] = src[i + 1];
+        dst[i + 2] = src[i + 0];
+        dst[i + 0] = x;
+    }
+}
+
+/**
+ * Interleave planar Y, U and V rows into packed Y U Y V output rows.
+ *
+ * Each inner iteration consumes 16 luma, 8 U and 8 V bytes and stores
+ * 32 output bytes. The inner loop is bottom-tested and advances 8 chroma
+ * samples at a time, so every row processes at least one such group and
+ * width is expected to be a multiple of 16.
+ *
+ * @param ysrc,usrc,vsrc   source planes
+ * @param dst              packed destination
+ * @param width            luma width in pixels; chroma width is width>>1
+ * @param height           number of luma rows
+ * @param lumStride        bytes between luma rows
+ * @param chromStride      bytes between chroma rows
+ * @param dstStride        bytes between destination rows
+ * @param vertLumPerChroma luma rows sharing one chroma row; the row test
+ *                         masks with vertLumPerChroma-1, so this must be a
+ *                         power of two
+ */
+static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+                                           int width, int height,
+                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
+{
+    int y;
+    const x86_reg chromWidth= width>>1;
+    for (y=0; y<height; y++) {
+        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
+        __asm__ volatile(
+            "xor                 %%"REG_a", %%"REG_a"   \n\t" // chroma sample index = 0
+            ".p2align                    4              \n\t"
+            "1:                                         \n\t"
+            PREFETCH"    32(%1, %%"REG_a", 2)           \n\t"
+            PREFETCH"    32(%2, %%"REG_a")              \n\t"
+            PREFETCH"    32(%3, %%"REG_a")              \n\t"
+            "movq          (%2, %%"REG_a"), %%mm0       \n\t" // U(0)
+            "movq                    %%mm0, %%mm2       \n\t" // U(0)
+            "movq          (%3, %%"REG_a"), %%mm1       \n\t" // V(0)
+            "punpcklbw               %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
+            "punpckhbw               %%mm1, %%mm2       \n\t" // UVUV UVUV(8)
+
+            "movq        (%1, %%"REG_a",2), %%mm3       \n\t" // Y(0)
+            "movq       8(%1, %%"REG_a",2), %%mm5       \n\t" // Y(8)
+            "movq                    %%mm3, %%mm4       \n\t" // Y(0)
+            "movq                    %%mm5, %%mm6       \n\t" // Y(8)
+            "punpcklbw               %%mm0, %%mm3       \n\t" // YUYV YUYV(0)
+            "punpckhbw               %%mm0, %%mm4       \n\t" // YUYV YUYV(4)
+            "punpcklbw               %%mm2, %%mm5       \n\t" // YUYV YUYV(8)
+            "punpckhbw               %%mm2, %%mm6       \n\t" // YUYV YUYV(12)
+
+            MOVNTQ"                  %%mm3,   (%0, %%"REG_a", 4)    \n\t"
+            MOVNTQ"                  %%mm4,  8(%0, %%"REG_a", 4)    \n\t"
+            MOVNTQ"                  %%mm5, 16(%0, %%"REG_a", 4)    \n\t"
+            MOVNTQ"                  %%mm6, 24(%0, %%"REG_a", 4)    \n\t"
+
+            "add                        $8, %%"REG_a"   \n\t"
+            "cmp                        %4, %%"REG_a"   \n\t"
+            " jb                        1b              \n\t"
+            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
+            /* "memory": the template stores to dst and loads from the
+             * source planes through pointer registers only. */
+            : "%"REG_a, "memory"
+        );
+        /* Advance chroma only after the last luma row that shares it. */
+        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
+            usrc += chromStride;
+            vsrc += chromStride;
+        }
+        ysrc += lumStride;
+        dst  += dstStride;
+    }
+    __asm__(EMMS"       \n\t"
+            SFENCE"     \n\t"
+            :::"memory");
+}
+
+/**
+ * Planar-to-packed conversion via yuvPlanartoyuy2 with one chroma row
+ * shared by every two luma rows.
+ *
+ * Height should be a multiple of 2 and width should be a multiple of 16.
+ */
+static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+                                      int width, int height,
+                                      int lumStride, int chromStride, int dstStride)
+{
+    /* Chroma rows are reused as-is. FIXME interpolate chroma */
+    const int lumaRowsPerChromaRow = 2;
+
+    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst,
+                            width, height,
+                            lumStride, chromStride, dstStride,
+                            lumaRowsPerChromaRow);
+}
+
+/**
+ * Interleave planar Y, U and V rows into packed U Y V Y output rows.
+ *
+ * Each inner iteration consumes 16 luma, 8 U and 8 V bytes and stores
+ * 32 output bytes. The inner loop is bottom-tested and advances 8 chroma
+ * samples at a time, so every row processes at least one such group and
+ * width is expected to be a multiple of 16.
+ *
+ * @param ysrc,usrc,vsrc   source planes
+ * @param dst              packed destination
+ * @param width            luma width in pixels; chroma width is width>>1
+ * @param height           number of luma rows
+ * @param lumStride        bytes between luma rows
+ * @param chromStride      bytes between chroma rows
+ * @param dstStride        bytes between destination rows
+ * @param vertLumPerChroma luma rows sharing one chroma row; the row test
+ *                         masks with vertLumPerChroma-1, so this must be a
+ *                         power of two
+ */
+static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+                                           int width, int height,
+                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
+{
+    int y;
+    const x86_reg chromWidth= width>>1;
+    for (y=0; y<height; y++) {
+        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
+        __asm__ volatile(
+            "xor                %%"REG_a", %%"REG_a"    \n\t" // chroma sample index = 0
+            ".p2align                   4               \n\t"
+            "1:                                         \n\t"
+            PREFETCH"   32(%1, %%"REG_a", 2)            \n\t"
+            PREFETCH"   32(%2, %%"REG_a")               \n\t"
+            PREFETCH"   32(%3, %%"REG_a")               \n\t"
+            "movq         (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
+            "movq                   %%mm0, %%mm2        \n\t" // U(0)
+            "movq         (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
+            "punpcklbw              %%mm1, %%mm0        \n\t" // UVUV UVUV(0)
+            "punpckhbw              %%mm1, %%mm2        \n\t" // UVUV UVUV(8)
+
+            "movq       (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
+            "movq      8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
+            "movq                   %%mm0, %%mm4        \n\t" // UVUV UVUV(0)
+            "movq                   %%mm2, %%mm6        \n\t" // UVUV UVUV(8)
+            "punpcklbw              %%mm3, %%mm0        \n\t" // UYVY UYVY(0)
+            "punpckhbw              %%mm3, %%mm4        \n\t" // UYVY UYVY(4)
+            "punpcklbw              %%mm5, %%mm2        \n\t" // UYVY UYVY(8)
+            "punpckhbw              %%mm5, %%mm6        \n\t" // UYVY UYVY(12)
+
+            MOVNTQ"                 %%mm0,   (%0, %%"REG_a", 4)     \n\t"
+            MOVNTQ"                 %%mm4,  8(%0, %%"REG_a", 4)     \n\t"
+            MOVNTQ"                 %%mm2, 16(%0, %%"REG_a", 4)     \n\t"
+            MOVNTQ"                 %%mm6, 24(%0, %%"REG_a", 4)     \n\t"
+
+            "add                       $8, %%"REG_a"    \n\t"
+            "cmp                       %4, %%"REG_a"    \n\t"
+            " jb                       1b               \n\t"
+            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
+            /* "memory": the template stores to dst and loads from the
+             * source planes through pointer registers only. */
+            : "%"REG_a, "memory"
+        );
+        /* Advance chroma only after the last luma row that shares it. */
+        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
+            usrc += chromStride;
+            vsrc += chromStride;
+        }
+        ysrc += lumStride;
+        dst += dstStride;
+    }
+    __asm__(EMMS"       \n\t"
+            SFENCE"     \n\t"
+            :::"memory");
+}
+
+/**
+ * Planar-to-packed conversion via yuvPlanartouyvy with one chroma row
+ * shared by every two luma rows.
+ *
+ * Height should be a multiple of 2 and width should be a multiple of 16.
+ */
+static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+                                      int width, int height,
+                                      int lumStride, int chromStride, int dstStride)
+{
+    /* Chroma rows are reused as-is. FIXME interpolate chroma */
+    const int lumaRowsPerChromaRow = 2;
+
+    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst,
+                            width, height,
+                            lumStride, chromStride, dstStride,
+                            lumaRowsPerChromaRow);
+}
+
+/**
+ * Planar-to-packed conversion via yuvPlanartouyvy with a dedicated chroma
+ * row for every luma row.
+ *
+ * Width should be a multiple of 16.
+ */
+static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+                                         int width, int height,
+                                         int lumStride, int chromStride, int dstStride)
+{
+    /* Chroma advances on every row. */
+    const int lumaRowsPerChromaRow = 1;
+
+    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst,
+                            width, height,
+                            lumStride, chromStride, dstStride,
+                            lumaRowsPerChromaRow);
+}
+
+/**
+ * Thin wrapper: forwards every argument unchanged to yuvPlanartoyuy2 and
+ * passes 1 as its final (vertical luma-per-chroma ratio) argument.
+ *
+ * Width should be a multiple of 16.
+ */
+static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+                                         int width, int height,
+                                         int lumStride, int chromStride, int dstStride)
+{
+    /* ratio handed to the generic planar -> YUY2 routine */
+    const int lumLinesPerChromaLine = 1;
+
+    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst,
+                            width, height,
+                            lumStride, chromStride, dstStride,
+                            lumLinesPerChromaLine);
+}
+
+/**
+ * Split packed YUY2 (byte order Y0 U Y1 V) into separate Y, U and V planes.
+ *
+ * Lines are processed in pairs. From the first line of each pair luma and
+ * chroma are extracted; from the second line only luma is extracted, its
+ * chroma bytes are dropped. udst/vdst therefore advance by chromStride once
+ * per pair while ydst advances by lumStride on every line.
+ *
+ * Each asm loop iteration consumes 32 source bytes (16 pixels) and the loop
+ * body always executes at least once; both lines of a pair are always
+ * processed. Hence:
+ * Height should be a multiple of 2 and width should be a multiple of 16.
+ * (If this is a problem for anyone then tell me, and I will fix it.)
+ *
+ * @param src         packed source
+ * @param ydst        luma destination plane
+ * @param udst        U destination plane
+ * @param vdst        V destination plane
+ * @param width       picture width in pixels
+ * @param height      picture height in lines
+ * @param lumStride   byte distance between two lines of ydst
+ * @param chromStride byte distance between two lines of udst / vdst
+ * @param srcStride   byte distance between two lines of src
+ */
+static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                                      int width, int height,
+                                      int lumStride, int chromStride, int srcStride)
+{
+    int y;
+    const x86_reg chromWidth= width>>1;
+    for (y=0; y<height; y+=2) {
+        // first line of the pair: extract Y, U and V
+        __asm__ volatile(
+            "xor                 %%"REG_a", %%"REG_a"   \n\t"
+            "pcmpeqw                 %%mm7, %%mm7       \n\t"
+            "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00...
+            ".p2align                    4              \n\t"
+            "1:                \n\t"
+            PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
+            "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
+            "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
+            "movq                    %%mm0, %%mm2       \n\t" // YUYV YUYV(0)
+            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(4)
+            "psrlw                      $8, %%mm0       \n\t" // U0V0 U0V0(0)
+            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(4)
+            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
+            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
+            "packuswb                %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
+            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(0)
+
+            MOVNTQ"                  %%mm2, (%1, %%"REG_a", 2)  \n\t"
+
+            "movq     16(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
+            "movq     24(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
+            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(8)
+            "movq                    %%mm2, %%mm4       \n\t" // YUYV YUYV(12)
+            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(8)
+            "psrlw                      $8, %%mm2       \n\t" // U0V0 U0V0(12)
+            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(8)
+            "pand                    %%mm7, %%mm4       \n\t" // Y0Y0 Y0Y0(12)
+            "packuswb                %%mm2, %%mm1       \n\t" // UVUV UVUV(8)
+            "packuswb                %%mm4, %%mm3       \n\t" // YYYY YYYY(8)
+
+            MOVNTQ"                  %%mm3, 8(%1, %%"REG_a", 2) \n\t"
+
+            "movq                    %%mm0, %%mm2       \n\t" // UVUV UVUV(0)
+            "movq                    %%mm1, %%mm3       \n\t" // UVUV UVUV(8)
+            "psrlw                      $8, %%mm0       \n\t" // V0V0 V0V0(0)
+            "psrlw                      $8, %%mm1       \n\t" // V0V0 V0V0(8)
+            "pand                    %%mm7, %%mm2       \n\t" // U0U0 U0U0(0)
+            "pand                    %%mm7, %%mm3       \n\t" // U0U0 U0U0(8)
+            "packuswb                %%mm1, %%mm0       \n\t" // VVVV VVVV(0)
+            "packuswb                %%mm3, %%mm2       \n\t" // UUUU UUUU(0)
+
+            MOVNTQ"                  %%mm0, (%3, %%"REG_a")     \n\t"
+            MOVNTQ"                  %%mm2, (%2, %%"REG_a")     \n\t"
+
+            "add                        $8, %%"REG_a"   \n\t"
+            "cmp                        %4, %%"REG_a"   \n\t"
+            " jb                        1b              \n\t"
+            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
+            : "memory", "%"REG_a
+        );
+
+        ydst += lumStride;
+        src  += srcStride;
+
+        // second line of the pair: extract Y only, chroma is discarded
+        __asm__ volatile(
+            "xor                 %%"REG_a", %%"REG_a"   \n\t"
+            // Rebuild the luma mask here instead of relying on %mm7 surviving
+            // from the previous asm statement: the compiler is not told that
+            // the register is live between the two statements.
+            "pcmpeqw                 %%mm7, %%mm7       \n\t"
+            "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00...
+            ".p2align                    4              \n\t"
+            "1:                                         \n\t"
+            PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
+            "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
+            "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
+            "movq     16(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
+            "movq     24(%0, %%"REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
+            "pand                    %%mm7, %%mm0       \n\t" // Y0Y0 Y0Y0(0)
+            "pand                    %%mm7, %%mm1       \n\t" // Y0Y0 Y0Y0(4)
+            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(8)
+            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(12)
+            "packuswb                %%mm1, %%mm0       \n\t" // YYYY YYYY(0)
+            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(8)
+
+            MOVNTQ"                  %%mm0,  (%1, %%"REG_a", 2) \n\t"
+            MOVNTQ"                  %%mm2, 8(%1, %%"REG_a", 2) \n\t"
+
+            "add                        $8, %%"REG_a"   \n\t"
+            "cmp                        %4, %%"REG_a"   \n\t"
+            " jb                        1b              \n\t"
+
+            // udst / vdst are unused here but kept so chromWidth stays operand %4
+            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
+            : "memory", "%"REG_a
+        );
+        udst += chromStride;
+        vdst += chromStride;
+        ydst += lumStride;
+        src  += srcStride;
+    }
+    // leave MMX state and make the (possibly non-temporal) stores visible
+    __asm__ volatile(EMMS"       \n\t"
+                     SFENCE"     \n\t"
+                     :::"memory");
+}
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+
+#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
+/**
+ * Upscale one 8-bit plane by a factor of 2 in both directions.
+ *
+ * The destination receives 2*srcWidth bytes per line and 2*srcHeight lines.
+ * The first and last destination lines are interpolated horizontally only
+ * (weights 3:1 / 1:3 between horizontal neighbours, edge samples copied);
+ * every other pair of destination lines is built from two adjacent source
+ * lines using 3:1 / 1:3 weights between diagonal neighbours.
+ *
+ * The asm loop covers the leading (srcWidth & ~15) source columns; the
+ * remaining columns are done in C. Note that the asm loads 8 bytes at
+ * offsets -1 and +1 around the current column, so it reads a few bytes
+ * past column (srcWidth & ~15) of both source lines.
+ *
+ * @param src       source plane
+ * @param dst       destination plane
+ * @param srcWidth  source width in bytes (expected to be >= 1)
+ * @param srcHeight source height in lines
+ * @param srcStride byte distance between two source lines
+ * @param dstStride byte distance between two destination lines
+ */
+static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
+{
+    int x,y;
+
+    dst[0]= src[0];
+
+    // first line
+    for (x=0; x<srcWidth-1; x++) {
+        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
+        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
+    }
+    dst[2*srcWidth-1]= src[srcWidth-1];
+
+    dst+= dstStride;
+
+    for (y=1; y<srcHeight; y++) {
+        x86_reg mmxSize= srcWidth&~15;
+
+        if (mmxSize) {
+            __asm__ volatile(
+                "mov           %4, %%"REG_a"            \n\t" // index = -mmxSize, counts up to 0
+                "movq        "MANGLE(mmx_ff)", %%mm0    \n\t"
+                // mm4 / mm5 = upper / lower source line shifted by one byte
+                // with the first sample duplicated (left edge replication)
+                "movq         (%0, %%"REG_a"), %%mm4    \n\t"
+                "movq                   %%mm4, %%mm2    \n\t"
+                "psllq                     $8, %%mm4    \n\t"
+                "pand                   %%mm0, %%mm2    \n\t"
+                "por                    %%mm2, %%mm4    \n\t"
+                "movq         (%1, %%"REG_a"), %%mm5    \n\t"
+                "movq                   %%mm5, %%mm3    \n\t"
+                "psllq                     $8, %%mm5    \n\t"
+                "pand                   %%mm0, %%mm3    \n\t"
+                "por                    %%mm3, %%mm5    \n\t"
+                "1:                                     \n\t"
+                "movq         (%0, %%"REG_a"), %%mm0    \n\t" // upper line, x
+                "movq         (%1, %%"REG_a"), %%mm1    \n\t" // lower line, x
+                "movq        1(%0, %%"REG_a"), %%mm2    \n\t" // upper line, x+1
+                "movq        1(%1, %%"REG_a"), %%mm3    \n\t" // lower line, x+1
+                // averaging twice with the same operand gives ~(3*a + b)/4
+                PAVGB"                  %%mm0, %%mm5    \n\t"
+                PAVGB"                  %%mm0, %%mm3    \n\t"
+                PAVGB"                  %%mm0, %%mm5    \n\t"
+                PAVGB"                  %%mm0, %%mm3    \n\t"
+                PAVGB"                  %%mm1, %%mm4    \n\t"
+                PAVGB"                  %%mm1, %%mm2    \n\t"
+                PAVGB"                  %%mm1, %%mm4    \n\t"
+                PAVGB"                  %%mm1, %%mm2    \n\t"
+                "movq                   %%mm5, %%mm7    \n\t"
+                "movq                   %%mm4, %%mm6    \n\t"
+                "punpcklbw              %%mm3, %%mm5    \n\t"
+                "punpckhbw              %%mm3, %%mm7    \n\t"
+                "punpcklbw              %%mm2, %%mm4    \n\t"
+                "punpckhbw              %%mm2, %%mm6    \n\t"
+                MOVNTQ"                 %%mm5,  (%2, %%"REG_a", 2)  \n\t"
+                MOVNTQ"                 %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
+                MOVNTQ"                 %%mm4,  (%3, %%"REG_a", 2)  \n\t"
+                MOVNTQ"                 %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
+                "add                       $8, %%"REG_a"            \n\t"
+                // movq does not touch the flags, so "js" still tests the add
+                "movq       -1(%0, %%"REG_a"), %%mm4    \n\t"
+                "movq       -1(%1, %%"REG_a"), %%mm5    \n\t"
+                " js                       1b                       \n\t"
+                :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
+                   "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
+                   "g" (-mmxSize)
+                // "memory": the asm stores to dst, and the C code below
+                // rewrites bytes that overlap those stores
+                : "memory", "%"REG_a
+            );
+        } else {
+            // Fewer than 16 columns: the asm body would run once anyway and
+            // touch 8 columns, and the tail loop would start at x == -1.
+            // Produce the left-edge column in C and let the tail loop start
+            // at x == 0 instead.
+            mmxSize = 1;
+            dst[0]         = (3*src[0] +   src[srcStride])>>2;
+            dst[dstStride] = (  src[0] + 3*src[srcStride])>>2;
+        }
+
+        for (x=mmxSize-1; x<srcWidth-1; x++) {
+            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
+            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
+            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
+            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
+        }
+        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
+        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
+
+        dst+=dstStride*2;
+        src+=srcStride;
+    }
+
+    // last line
+    dst[0]= src[0];
+
+    for (x=0; x<srcWidth-1; x++) {
+        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
+        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
+    }
+    dst[2*srcWidth-1]= src[srcWidth-1];
+
+    // leave MMX state and make the (possibly non-temporal) stores visible
+    __asm__ volatile(EMMS"       \n\t"
+                     SFENCE"     \n\t"
+                     :::"memory");
+}
+#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
+
+#if !COMPILE_TEMPLATE_AMD3DNOW
+/**
+ * Split packed UYVY (byte order U Y0 V Y1) into separate Y, U and V planes.
+ *
+ * Lines are processed in pairs: the first line of a pair yields luma and
+ * chroma, the second line yields luma only. udst/vdst advance by chromStride
+ * once per pair, ydst advances by lumStride on every line.
+ * Each asm loop iteration consumes 32 source bytes (16 pixels) and the loop
+ * body always executes at least once.
+ *
+ * Height should be a multiple of 2 and width should be a multiple of 16.
+ * (If this is a problem for anyone then tell me, and I will fix it.)
+ * Chrominance data is only taken from every second line, others are ignored.
+ * FIXME: Write HQ version.
+ */
+static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                                      int width, int height,
+                                      int lumStride, int chromStride, int srcStride)
+{
+    int y;
+    const x86_reg chromWidth= width>>1;
+    for (y=0; y<height; y+=2) {
+        __asm__ volatile( // first line of the pair: extract Y, U and V
+            "xor                 %%"REG_a", %%"REG_a"   \n\t" // chroma index = 0
+            "pcmpeqw             %%mm7, %%mm7   \n\t"
+            "psrlw                  $8, %%mm7   \n\t" // FF,00,FF,00...
+            ".p2align                4          \n\t"
+            "1:                                 \n\t"
+            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
+            "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
+            "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
+            "movq                %%mm0, %%mm2   \n\t" // UYVY UYVY(0)
+            "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(4)
+            "pand                %%mm7, %%mm0   \n\t" // U0V0 U0V0(0)
+            "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(4)
+            "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(0)
+            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(4)
+            "packuswb            %%mm1, %%mm0   \n\t" // UVUV UVUV(0)
+            "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(0)
+
+            MOVNTQ"              %%mm2,  (%1, %%"REG_a", 2) \n\t"
+
+            "movq     16(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(8)
+            "movq     24(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(12)
+            "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(8)
+            "movq                %%mm2, %%mm4   \n\t" // UYVY UYVY(12)
+            "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(8)
+            "pand                %%mm7, %%mm2   \n\t" // U0V0 U0V0(12)
+            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(8)
+            "psrlw                  $8, %%mm4   \n\t" // Y0Y0 Y0Y0(12)
+            "packuswb            %%mm2, %%mm1   \n\t" // UVUV UVUV(8)
+            "packuswb            %%mm4, %%mm3   \n\t" // YYYY YYYY(8)
+
+            MOVNTQ"              %%mm3, 8(%1, %%"REG_a", 2) \n\t"
+
+            "movq                %%mm0, %%mm2   \n\t" // UVUV UVUV(0)
+            "movq                %%mm1, %%mm3   \n\t" // UVUV UVUV(8)
+            "psrlw                  $8, %%mm0   \n\t" // V0V0 V0V0(0)
+            "psrlw                  $8, %%mm1   \n\t" // V0V0 V0V0(8)
+            "pand                %%mm7, %%mm2   \n\t" // U0U0 U0U0(0)
+            "pand                %%mm7, %%mm3   \n\t" // U0U0 U0U0(8)
+            "packuswb            %%mm1, %%mm0   \n\t" // VVVV VVVV(0)
+            "packuswb            %%mm3, %%mm2   \n\t" // UUUU UUUU(0)
+
+            MOVNTQ"              %%mm0, (%3, %%"REG_a") \n\t"
+            MOVNTQ"              %%mm2, (%2, %%"REG_a") \n\t"
+
+            "add                    $8, %%"REG_a"   \n\t" // 8 chroma samples = 16 pixels done
+            "cmp                    %4, %%"REG_a"   \n\t"
+            " jb                    1b          \n\t" // unsigned: loop while index < chromWidth
+            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
+            : "memory", "%"REG_a
+        );
+
+        ydst += lumStride;
+        src  += srcStride;
+
+        __asm__ volatile( // second line of the pair: extract Y only, chroma is discarded
+            "xor                 %%"REG_a", %%"REG_a"   \n\t" // chroma index = 0
+            ".p2align                    4              \n\t"
+            "1:                                 \n\t"
+            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
+            "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
+            "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
+            "movq     16(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(8)
+            "movq     24(%0, %%"REG_a", 4), %%mm3   \n\t" // UYVY UYVY(12)
+            "psrlw                  $8, %%mm0   \n\t" // Y0Y0 Y0Y0(0)
+            "psrlw                  $8, %%mm1   \n\t" // Y0Y0 Y0Y0(4)
+            "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(8)
+            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(12)
+            "packuswb            %%mm1, %%mm0   \n\t" // YYYY YYYY(0)
+            "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(8)
+
+            MOVNTQ"              %%mm0,  (%1, %%"REG_a", 2) \n\t"
+            MOVNTQ"              %%mm2, 8(%1, %%"REG_a", 2) \n\t"
+
+            "add                    $8, %%"REG_a"   \n\t"
+            "cmp                    %4, %%"REG_a"   \n\t"
+            " jb                    1b          \n\t"
+
+            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) // %2 / %3 are unused in this statement
+            : "memory", "%"REG_a
+        );
+        udst += chromStride;
+        vdst += chromStride;
+        ydst += lumStride;
+        src  += srcStride;
+    }
+    __asm__ volatile(EMMS"       \n\t" // leave MMX state
+                     SFENCE"     \n\t" // order the MOVNTQ stores issued above
+                     :::"memory");
+}
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+
+/**
+ * Convert packed 24-bit RGB (3 bytes per pixel) to planar Y, U and V with
+ * chroma subsampled by 2 in both directions.
+ *
+ * All lines except the trailing ones are converted with MMX in pairs:
+ * luma is computed for both lines of a pair, chroma is computed once per
+ * pair from the average of each 2x2 pixel group. The loop stops at
+ * y < height-2; the remaining (height - y) lines are handed to
+ * rgb24toyv12_c().
+ *
+ * Height should be a multiple of 2 and width should be a multiple of 2.
+ * (If this is a problem for anyone then tell me, and I will fix it.)
+ * NOTE(review): the MMX luma loop stores 8 luma bytes and the chroma loop
+ * 4+4 chroma bytes per iteration (8 source pixels each), so for widths that
+ * are not a multiple of 8 the last iteration reads and writes past the
+ * nominal end of the line - confirm that callers provide suitable padding.
+ * Chrominance data is only taken from every second line,
+ * others are ignored in the C version.
+ * FIXME: Write HQ version.
+ *
+ * @param src         packed source
+ * @param ydst        luma destination plane
+ * @param udst        U destination plane
+ * @param vdst        V destination plane
+ * @param width       picture width in pixels
+ * @param height      picture height in lines
+ * @param lumStride   byte distance between two lines of ydst
+ * @param chromStride byte distance between two lines of udst / vdst
+ * @param srcStride   byte distance between two lines of src
+ */
+static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                                       int width, int height,
+                                       int lumStride, int chromStride, int srcStride)
+{
+    int y;
+    const x86_reg chromWidth= width>>1;
+    for (y=0; y<height-2; y+=2) {
+        int i;
+        // luma for both lines of the pair
+        for (i=0; i<2; i++) {
+            __asm__ volatile(
+                "mov                        %2, %%"REG_a"   \n\t" // pixel index = -width, counts up to 0
+                "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
+                "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
+                "pxor                    %%mm7, %%mm7       \n\t"
+                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t" // byte index = 3 * pixel index
+                ".p2align                    4              \n\t"
+                "1:                                         \n\t"
+                PREFETCH"    64(%0, %%"REG_d")              \n\t"
+                // pixels 0..3: one 4-byte load per 3-byte pixel
+                "movd          (%0, %%"REG_d"), %%mm0       \n\t"
+                "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
+                "punpcklbw               %%mm7, %%mm0       \n\t"
+                "punpcklbw               %%mm7, %%mm1       \n\t"
+                "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
+                "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
+                "punpcklbw               %%mm7, %%mm2       \n\t"
+                "punpcklbw               %%mm7, %%mm3       \n\t"
+                "pmaddwd                 %%mm6, %%mm0       \n\t"
+                "pmaddwd                 %%mm6, %%mm1       \n\t"
+                "pmaddwd                 %%mm6, %%mm2       \n\t"
+                "pmaddwd                 %%mm6, %%mm3       \n\t"
+#ifndef FAST_BGR2YV12
+                "psrad                      $8, %%mm0       \n\t"
+                "psrad                      $8, %%mm1       \n\t"
+                "psrad                      $8, %%mm2       \n\t"
+                "psrad                      $8, %%mm3       \n\t"
+#endif
+                "packssdw                %%mm1, %%mm0       \n\t"
+                "packssdw                %%mm3, %%mm2       \n\t"
+                "pmaddwd                 %%mm5, %%mm0       \n\t"
+                "pmaddwd                 %%mm5, %%mm2       \n\t"
+                "packssdw                %%mm2, %%mm0       \n\t"
+                "psraw                      $7, %%mm0       \n\t"
+
+                // pixels 4..7
+                "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
+                "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
+                "punpcklbw               %%mm7, %%mm4       \n\t"
+                "punpcklbw               %%mm7, %%mm1       \n\t"
+                "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
+                "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
+                "punpcklbw               %%mm7, %%mm2       \n\t"
+                "punpcklbw               %%mm7, %%mm3       \n\t"
+                "pmaddwd                 %%mm6, %%mm4       \n\t"
+                "pmaddwd                 %%mm6, %%mm1       \n\t"
+                "pmaddwd                 %%mm6, %%mm2       \n\t"
+                "pmaddwd                 %%mm6, %%mm3       \n\t"
+#ifndef FAST_BGR2YV12
+                "psrad                      $8, %%mm4       \n\t"
+                "psrad                      $8, %%mm1       \n\t"
+                "psrad                      $8, %%mm2       \n\t"
+                "psrad                      $8, %%mm3       \n\t"
+#endif
+                "packssdw                %%mm1, %%mm4       \n\t"
+                "packssdw                %%mm3, %%mm2       \n\t"
+                "pmaddwd                 %%mm5, %%mm4       \n\t"
+                "pmaddwd                 %%mm5, %%mm2       \n\t"
+                "add                       $24, %%"REG_d"   \n\t" // 8 pixels * 3 bytes consumed
+                "packssdw                %%mm2, %%mm4       \n\t"
+                "psraw                      $7, %%mm4       \n\t"
+
+                "packuswb                %%mm4, %%mm0       \n\t"
+                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
+
+                MOVNTQ"                  %%mm0, (%1, %%"REG_a") \n\t"
+                "add                        $8,      %%"REG_a"  \n\t"
+                " js                        1b                  \n\t"
+                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
+                // "memory": the asm reads src and stores to ydst through
+                // pointers the compiler cannot see being dereferenced
+                : "memory", "%"REG_a, "%"REG_d
+            );
+            ydst += lumStride;
+            src  += srcStride;
+        }
+        // rewind to the first line of the pair for the chroma pass
+        src -= srcStride*2;
+        __asm__ volatile(
+            "mov                        %4, %%"REG_a"   \n\t" // chroma index = -chromWidth, counts up to 0
+            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
+            "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
+            "pxor                    %%mm7, %%mm7       \n\t"
+            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
+            "add                 %%"REG_d", %%"REG_d"   \n\t" // byte index = 6 * chroma index
+            ".p2align                    4              \n\t"
+            "1:                                         \n\t"
+            PREFETCH"    64(%0, %%"REG_d")              \n\t"
+            PREFETCH"    64(%1, %%"REG_d")              \n\t"
+            // %0 = upper line, %1 = lower line; average each 2x2 pixel group
+#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
+            "movq          (%0, %%"REG_d"), %%mm0       \n\t"
+            "movq          (%1, %%"REG_d"), %%mm1       \n\t"
+            "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
+            "movq         6(%1, %%"REG_d"), %%mm3       \n\t"
+            PAVGB"                   %%mm1, %%mm0       \n\t"
+            PAVGB"                   %%mm3, %%mm2       \n\t"
+            "movq                    %%mm0, %%mm1       \n\t"
+            "movq                    %%mm2, %%mm3       \n\t"
+            "psrlq                     $24, %%mm0       \n\t"
+            "psrlq                     $24, %%mm2       \n\t"
+            PAVGB"                   %%mm1, %%mm0       \n\t"
+            PAVGB"                   %%mm3, %%mm2       \n\t"
+            "punpcklbw               %%mm7, %%mm0       \n\t"
+            "punpcklbw               %%mm7, %%mm2       \n\t"
+#else
+            "movd          (%0, %%"REG_d"), %%mm0       \n\t"
+            "movd          (%1, %%"REG_d"), %%mm1       \n\t"
+            "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
+            "movd         3(%1, %%"REG_d"), %%mm3       \n\t"
+            "punpcklbw               %%mm7, %%mm0       \n\t"
+            "punpcklbw               %%mm7, %%mm1       \n\t"
+            "punpcklbw               %%mm7, %%mm2       \n\t"
+            "punpcklbw               %%mm7, %%mm3       \n\t"
+            "paddw                   %%mm1, %%mm0       \n\t"
+            "paddw                   %%mm3, %%mm2       \n\t"
+            "paddw                   %%mm2, %%mm0       \n\t"
+            "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
+            "movd         6(%1, %%"REG_d"), %%mm1       \n\t"
+            "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
+            "movd         9(%1, %%"REG_d"), %%mm3       \n\t"
+            "punpcklbw               %%mm7, %%mm4       \n\t"
+            "punpcklbw               %%mm7, %%mm1       \n\t"
+            "punpcklbw               %%mm7, %%mm2       \n\t"
+            "punpcklbw               %%mm7, %%mm3       \n\t"
+            "paddw                   %%mm1, %%mm4       \n\t"
+            "paddw                   %%mm3, %%mm2       \n\t"
+            "paddw                   %%mm4, %%mm2       \n\t"
+            "psrlw                      $2, %%mm0       \n\t"
+            "psrlw                      $2, %%mm2       \n\t"
+#endif
+            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
+            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
+
+            "pmaddwd                 %%mm0, %%mm1       \n\t"
+            "pmaddwd                 %%mm2, %%mm3       \n\t"
+            "pmaddwd                 %%mm6, %%mm0       \n\t"
+            "pmaddwd                 %%mm6, %%mm2       \n\t"
+#ifndef FAST_BGR2YV12
+            "psrad                      $8, %%mm0       \n\t"
+            "psrad                      $8, %%mm1       \n\t"
+            "psrad                      $8, %%mm2       \n\t"
+            "psrad                      $8, %%mm3       \n\t"
+#endif
+            "packssdw                %%mm2, %%mm0       \n\t"
+            "packssdw                %%mm3, %%mm1       \n\t"
+            "pmaddwd                 %%mm5, %%mm0       \n\t"
+            "pmaddwd                 %%mm5, %%mm1       \n\t"
+            "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
+            "psraw                      $7, %%mm0       \n\t"
+
+#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
+            "movq        12(%0, %%"REG_d"), %%mm4       \n\t"
+            "movq        12(%1, %%"REG_d"), %%mm1       \n\t"
+            "movq        18(%0, %%"REG_d"), %%mm2       \n\t"
+            "movq        18(%1, %%"REG_d"), %%mm3       \n\t"
+            PAVGB"                   %%mm1, %%mm4       \n\t"
+            PAVGB"                   %%mm3, %%mm2       \n\t"
+            "movq                    %%mm4, %%mm1       \n\t"
+            "movq                    %%mm2, %%mm3       \n\t"
+            "psrlq                     $24, %%mm4       \n\t"
+            "psrlq                     $24, %%mm2       \n\t"
+            PAVGB"                   %%mm1, %%mm4       \n\t"
+            PAVGB"                   %%mm3, %%mm2       \n\t"
+            "punpcklbw               %%mm7, %%mm4       \n\t"
+            "punpcklbw               %%mm7, %%mm2       \n\t"
+#else
+            "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
+            "movd        12(%1, %%"REG_d"), %%mm1       \n\t"
+            "movd        15(%0, %%"REG_d"), %%mm2       \n\t"
+            "movd        15(%1, %%"REG_d"), %%mm3       \n\t"
+            "punpcklbw               %%mm7, %%mm4       \n\t"
+            "punpcklbw               %%mm7, %%mm1       \n\t"
+            "punpcklbw               %%mm7, %%mm2       \n\t"
+            "punpcklbw               %%mm7, %%mm3       \n\t"
+            "paddw                   %%mm1, %%mm4       \n\t"
+            "paddw                   %%mm3, %%mm2       \n\t"
+            "paddw                   %%mm2, %%mm4       \n\t"
+            "movd        18(%0, %%"REG_d"), %%mm5       \n\t" // mm5 is borrowed as scratch here ...
+            "movd        18(%1, %%"REG_d"), %%mm1       \n\t"
+            "movd        21(%0, %%"REG_d"), %%mm2       \n\t"
+            "movd        21(%1, %%"REG_d"), %%mm3       \n\t"
+            "punpcklbw               %%mm7, %%mm5       \n\t"
+            "punpcklbw               %%mm7, %%mm1       \n\t"
+            "punpcklbw               %%mm7, %%mm2       \n\t"
+            "punpcklbw               %%mm7, %%mm3       \n\t"
+            "paddw                   %%mm1, %%mm5       \n\t"
+            "paddw                   %%mm3, %%mm2       \n\t"
+            "paddw                   %%mm5, %%mm2       \n\t"
+            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t" // ... and restored to ff_w1111
+            "psrlw                      $2, %%mm4       \n\t"
+            "psrlw                      $2, %%mm2       \n\t"
+#endif
+            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
+            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
+
+            "pmaddwd                 %%mm4, %%mm1       \n\t"
+            "pmaddwd                 %%mm2, %%mm3       \n\t"
+            "pmaddwd                 %%mm6, %%mm4       \n\t"
+            "pmaddwd                 %%mm6, %%mm2       \n\t"
+#ifndef FAST_BGR2YV12
+            "psrad                      $8, %%mm4       \n\t"
+            "psrad                      $8, %%mm1       \n\t"
+            "psrad                      $8, %%mm2       \n\t"
+            "psrad                      $8, %%mm3       \n\t"
+#endif
+            "packssdw                %%mm2, %%mm4       \n\t"
+            "packssdw                %%mm3, %%mm1       \n\t"
+            "pmaddwd                 %%mm5, %%mm4       \n\t"
+            "pmaddwd                 %%mm5, %%mm1       \n\t"
+            "add                       $24, %%"REG_d"   \n\t" // 8 pixels * 3 bytes consumed
+            "packssdw                %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
+            "psraw                      $7, %%mm4       \n\t"
+
+            "movq                    %%mm0, %%mm1           \n\t"
+            "punpckldq               %%mm4, %%mm0           \n\t"
+            "punpckhdq               %%mm4, %%mm1           \n\t"
+            "packsswb                %%mm1, %%mm0           \n\t"
+            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0         \n\t"
+            "movd                    %%mm0, (%2, %%"REG_a") \n\t" // 4 U samples
+            "punpckhdq               %%mm0, %%mm0           \n\t"
+            "movd                    %%mm0, (%3, %%"REG_a") \n\t" // 4 V samples
+            "add                        $4, %%"REG_a"       \n\t"
+            " js                        1b                  \n\t"
+            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
+            // "memory": the asm reads src and stores to udst / vdst
+            : "memory", "%"REG_a, "%"REG_d
+        );
+
+        udst += chromStride;
+        vdst += chromStride;
+        src  += srcStride*2;
+    }
+
+    // leave MMX state and make the (possibly non-temporal) stores visible
+    __asm__ volatile(EMMS"       \n\t"
+                     SFENCE"     \n\t"
+                     :::"memory");
+
+    // the remaining lines are converted by the C implementation
+    rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
+}
+#endif /* !COMPILE_TEMPLATE_SSE2 */
+
+#if !COMPILE_TEMPLATE_AMD3DNOW
+/**
+ * Interleave two byte planes: for every line,
+ * dest[2*i] = src1[i] and dest[2*i+1] = src2[i] for i in [0, width).
+ *
+ * Whole groups of 16 source bytes are handled by the SIMD loop, the
+ * remainder of each line by plain C. The SIMD loop is skipped when a line is
+ * shorter than 16 bytes (its body would otherwise still run once and
+ * overrun the buffers) and, for the SSE2 variant, when any of the three
+ * line pointers is not 16-byte aligned (movdqa / movntdq fault on unaligned
+ * addresses); such lines are interleaved entirely in C.
+ *
+ * @param src1       first source plane (provides the even destination bytes)
+ * @param src2       second source plane (provides the odd destination bytes)
+ * @param dest       destination, 2*width bytes are written per line
+ * @param width      number of bytes per source line
+ * @param height     number of lines
+ * @param src1Stride byte distance between two lines of src1
+ * @param src2Stride byte distance between two lines of src2
+ * @param dstStride  byte distance between two lines of dest
+ */
+static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
+                                    int width, int height, int src1Stride,
+                                    int src2Stride, int dstStride)
+{
+    int h;
+
+    for (h=0; h < height; h++) {
+        int w = 0; // first source byte that still has to be done in C
+
+        if (width >= 16) {
+#if COMPILE_TEMPLATE_SSE2
+            // alignment is checked per line because the strides may change it
+            if (!(((uintptr_t)src1 | (uintptr_t)src2 | (uintptr_t)dest) & 15)) {
+                __asm__(
+                    "xor              %%"REG_a", %%"REG_a"  \n\t"
+                    "1:                                     \n\t"
+                    PREFETCH" 64(%1, %%"REG_a")             \n\t"
+                    PREFETCH" 64(%2, %%"REG_a")             \n\t"
+                    "movdqa     (%1, %%"REG_a"), %%xmm0     \n\t"
+                    "movdqa     (%1, %%"REG_a"), %%xmm1     \n\t"
+                    "movdqa     (%2, %%"REG_a"), %%xmm2     \n\t"
+                    "punpcklbw           %%xmm2, %%xmm0     \n\t"
+                    "punpckhbw           %%xmm2, %%xmm1     \n\t"
+                    "movntdq             %%xmm0,   (%0, %%"REG_a", 2)   \n\t"
+                    "movntdq             %%xmm1, 16(%0, %%"REG_a", 2)   \n\t"
+                    "add                    $16, %%"REG_a"  \n\t"
+                    "cmp                     %3, %%"REG_a"  \n\t"
+                    " jb                     1b             \n\t" // unsigned: loop while index < width-15
+                    ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
+                    : "memory", "%"REG_a
+                );
+                w = width & ~15;
+            }
+#else
+            __asm__(
+                "xor %%"REG_a", %%"REG_a"               \n\t"
+                "1:                                     \n\t"
+                PREFETCH" 64(%1, %%"REG_a")             \n\t"
+                PREFETCH" 64(%2, %%"REG_a")             \n\t"
+                "movq       (%1, %%"REG_a"), %%mm0      \n\t"
+                "movq      8(%1, %%"REG_a"), %%mm2      \n\t"
+                "movq                 %%mm0, %%mm1      \n\t"
+                "movq                 %%mm2, %%mm3      \n\t"
+                "movq       (%2, %%"REG_a"), %%mm4      \n\t"
+                "movq      8(%2, %%"REG_a"), %%mm5      \n\t"
+                "punpcklbw            %%mm4, %%mm0      \n\t"
+                "punpckhbw            %%mm4, %%mm1      \n\t"
+                "punpcklbw            %%mm5, %%mm2      \n\t"
+                "punpckhbw            %%mm5, %%mm3      \n\t"
+                MOVNTQ"               %%mm0,   (%0, %%"REG_a", 2)   \n\t"
+                MOVNTQ"               %%mm1,  8(%0, %%"REG_a", 2)   \n\t"
+                MOVNTQ"               %%mm2, 16(%0, %%"REG_a", 2)   \n\t"
+                MOVNTQ"               %%mm3, 24(%0, %%"REG_a", 2)   \n\t"
+                "add                    $16, %%"REG_a"  \n\t"
+                "cmp                     %3, %%"REG_a"  \n\t"
+                " jb                     1b             \n\t" // unsigned: loop while index < width-15
+                ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
+                : "memory", "%"REG_a
+            );
+            w = width & ~15;
+#endif
+        }
+        // scalar tail (or the whole line when the SIMD loop was skipped)
+        for (; w < width; w++) {
+            dest[2*w+0] = src1[w];
+            dest[2*w+1] = src2[w];
+        }
+        dest += dstStride;
+        src1 += src1Stride;
+        src2 += src2Stride;
+    }
+    // leave MMX state and make the non-temporal stores visible
+    __asm__(
+            EMMS"       \n\t"
+            SFENCE"     \n\t"
+            ::: "memory"
+            );
+}
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+
+#if !COMPILE_TEMPLATE_SSE2
+#if !COMPILE_TEMPLATE_AMD3DNOW
+static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, /* the two source planes */
+                                       uint8_t *dst1, uint8_t *dst2, /* dst1 receives expanded src1, dst2 receives expanded src2 */
+                                       int width, int height, /* halved below to form the loop bounds w and h */
+                                       int srcStride1, int srcStride2, /* byte step between rows of src1 / src2 */
+                                       int dstStride1, int dstStride2) /* byte step between rows of dst1 / dst2 */
+{ /* Doubles each source plane in both directions: every byte is written twice, every source row feeds two output rows. */
+    x86_reg x, y;
+    int w,h;
+    w=width/2; h=height/2; /* w = source bytes consumed per row, h = output rows produced per plane */
+    __asm__ volatile( /* PREFETCH is a macro defined elsewhere; presumably a cache hint -- here aimed at row 1 of each source plane */
+        PREFETCH" %0    \n\t"
+        PREFETCH" %1    \n\t"
+        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
+    for (y=0;y<h;y++) { /* first plane: src1 -> dst1 */
+        const uint8_t* s1=src1+srcStride1*(y>>1); /* output rows 2k and 2k+1 both read source row k */
+        uint8_t* d=dst1+dstStride1*y;
+        x=0;
+        for (;x<w-31;x+=32) { /* MMX path: 32 source bytes -> 64 output bytes per iteration */
+            __asm__ volatile(
+                PREFETCH"   32(%1,%2)        \n\t"
+                "movq         (%1,%2), %%mm0 \n\t" /* mm0, mm2, mm4, mm6 = s1[x .. x+31] */
+                "movq        8(%1,%2), %%mm2 \n\t"
+                "movq       16(%1,%2), %%mm4 \n\t"
+                "movq       24(%1,%2), %%mm6 \n\t"
+                "movq      %%mm0, %%mm1 \n\t" /* copies, so the low and high halves can be expanded separately */
+                "movq      %%mm2, %%mm3 \n\t"
+                "movq      %%mm4, %%mm5 \n\t"
+                "movq      %%mm6, %%mm7 \n\t"
+                "punpcklbw %%mm0, %%mm0 \n\t" /* interleaving a register with itself duplicates bytes: a b c d -> a a b b c c d d */
+                "punpckhbw %%mm1, %%mm1 \n\t" /* same for the upper four bytes */
+                "punpcklbw %%mm2, %%mm2 \n\t"
+                "punpckhbw %%mm3, %%mm3 \n\t"
+                "punpcklbw %%mm4, %%mm4 \n\t"
+                "punpckhbw %%mm5, %%mm5 \n\t"
+                "punpcklbw %%mm6, %%mm6 \n\t"
+                "punpckhbw %%mm7, %%mm7 \n\t"
+                MOVNTQ"    %%mm0,   (%0,%2,2)  \n\t" /* MOVNTQ is a macro defined elsewhere; presumably a non-temporal 8-byte store -- confirm */
+                MOVNTQ"    %%mm1,  8(%0,%2,2)  \n\t" /* destination index is 2*x: output is twice as wide as input */
+                MOVNTQ"    %%mm2, 16(%0,%2,2)  \n\t"
+                MOVNTQ"    %%mm3, 24(%0,%2,2)  \n\t"
+                MOVNTQ"    %%mm4, 32(%0,%2,2)  \n\t"
+                MOVNTQ"    %%mm5, 40(%0,%2,2)  \n\t"
+                MOVNTQ"    %%mm6, 48(%0,%2,2)  \n\t"
+                MOVNTQ"    %%mm7, 56(%0,%2,2)"
+                :: "r"(d), "r"(s1), "r"(x)
+                :"memory");
+        }
+        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x]; /* scalar tail: same byte duplication for the last (w % 32) bytes */
+    }
+    for (y=0;y<h;y++) { /* second plane: src2 -> dst2, identical processing */
+        const uint8_t* s2=src2+srcStride2*(y>>1); /* output rows 2k and 2k+1 both read source row k */
+        uint8_t* d=dst2+dstStride2*y;
+        x=0;
+        for (;x<w-31;x+=32) { /* MMX path: 32 source bytes -> 64 output bytes per iteration */
+            __asm__ volatile(
+                PREFETCH"   32(%1,%2)        \n\t"
+                "movq         (%1,%2), %%mm0 \n\t" /* mm0, mm2, mm4, mm6 = s2[x .. x+31] */
+                "movq        8(%1,%2), %%mm2 \n\t"
+                "movq       16(%1,%2), %%mm4 \n\t"
+                "movq       24(%1,%2), %%mm6 \n\t"
+                "movq      %%mm0, %%mm1 \n\t" /* copies for the high-half expansion */
+                "movq      %%mm2, %%mm3 \n\t"
+                "movq      %%mm4, %%mm5 \n\t"
+                "movq      %%mm6, %%mm7 \n\t"
+                "punpcklbw %%mm0, %%mm0 \n\t" /* duplicate each byte (low half) */
+                "punpckhbw %%mm1, %%mm1 \n\t" /* duplicate each byte (high half) */
+                "punpcklbw %%mm2, %%mm2 \n\t"
+                "punpckhbw %%mm3, %%mm3 \n\t"
+                "punpcklbw %%mm4, %%mm4 \n\t"
+                "punpckhbw %%mm5, %%mm5 \n\t"
+                "punpcklbw %%mm6, %%mm6 \n\t"
+                "punpckhbw %%mm7, %%mm7 \n\t"
+                MOVNTQ"    %%mm0,   (%0,%2,2)  \n\t" /* store the 64 expanded bytes at d[2*x ..] */
+                MOVNTQ"    %%mm1,  8(%0,%2,2)  \n\t"
+                MOVNTQ"    %%mm2, 16(%0,%2,2)  \n\t"
+                MOVNTQ"    %%mm3, 24(%0,%2,2)  \n\t"
+                MOVNTQ"    %%mm4, 32(%0,%2,2)  \n\t"
+                MOVNTQ"    %%mm5, 40(%0,%2,2)  \n\t"
+                MOVNTQ"    %%mm6, 48(%0,%2,2)  \n\t"
+                MOVNTQ"    %%mm7, 56(%0,%2,2)"
+                :: "r"(d), "r"(s2), "r"(x)
+                :"memory");
+        }
+        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x]; /* scalar tail for the last (w % 32) bytes */
+    }
+    __asm__( /* EMMS/SFENCE are macros defined elsewhere; presumably they leave MMX state and fence the MOVNTQ stores -- confirm */
+            EMMS"       \n\t"
+            SFENCE"     \n\t"
+            ::: "memory"
+        );
+}
+
+static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, /* src1 = luma, src2 = U, src3 = V (see yp/up/vp below) */
+                                        uint8_t *dst, /* packed output, written as Y U Y V byte groups */
+                                        int width, int height,
+                                        int srcStride1, int srcStride2, /* byte step between rows of src1 / src2 */
+                                        int srcStride3, int dstStride) /* byte step between rows of src3 / dst */
+{ /* Interleaves one luma plane with two chroma planes; each chroma sample is shared by 4 luma samples horizontally and 4 rows vertically. */
+    x86_reg x;
+    int y,w,h;
+    w=width/2; h=height; /* NOTE(review): each x consumes 4 luma bytes and emits 8 output bytes, i.e. 2*width in and 4*width out per row -- confirm what `width` counts */
+    for (y=0;y<h;y++) {
+        const uint8_t* yp=src1+srcStride1*y;
+        const uint8_t* up=src2+srcStride2*(y>>2); /* one chroma row serves four consecutive luma rows */
+        const uint8_t* vp=src3+srcStride3*(y>>2);
+        uint8_t* d=dst+dstStride*y;
+        x=0;
+        for (;x<w-7;x+=8) { /* MMX path: 8 U + 8 V + 32 Y bytes -> 64 output bytes per iteration */
+            __asm__ volatile(
+                PREFETCH"   32(%1, %0)          \n\t"
+                PREFETCH"   32(%2, %0)          \n\t"
+                PREFETCH"   32(%3, %0)          \n\t"
+                "movq      (%1, %0, 4), %%mm0   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
+                "movq         (%2, %0), %%mm1   \n\t" /* U0U1U2U3U4U5U6U7 */
+                "movq         (%3, %0), %%mm2   \n\t" /* V0V1V2V3V4V5V6V7 */
+                "movq            %%mm0, %%mm3   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
+                "movq            %%mm1, %%mm4   \n\t" /* U0U1U2U3U4U5U6U7 */
+                "movq            %%mm2, %%mm5   \n\t" /* V0V1V2V3V4V5V6V7 */
+                "punpcklbw       %%mm1, %%mm1   \n\t" /* U0U0 U1U1 U2U2 U3U3 */
+                "punpcklbw       %%mm2, %%mm2   \n\t" /* V0V0 V1V1 V2V2 V3V3 */
+                "punpckhbw       %%mm4, %%mm4   \n\t" /* U4U4 U5U5 U6U6 U7U7 */
+                "punpckhbw       %%mm5, %%mm5   \n\t" /* V4V4 V5V5 V6V6 V7V7 */
+
+                "movq            %%mm1, %%mm6   \n\t" /* keep U0U0..U3U3: its high half is needed for the second luma group */
+                "punpcklbw       %%mm2, %%mm1   \n\t" /* U0V0 U0V0 U1V1 U1V1*/
+                "punpcklbw       %%mm1, %%mm0   \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
+                "punpckhbw       %%mm1, %%mm3   \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
+                MOVNTQ"          %%mm0,  (%4, %0, 8)    \n\t" /* output offset is 8*x: 8 bytes per chroma sample */
+                MOVNTQ"          %%mm3, 8(%4, %0, 8)    \n\t"
+
+                "punpckhbw       %%mm2, %%mm6   \n\t" /* U2V2 U2V2 U3V3 U3V3*/
+                "movq     8(%1, %0, 4), %%mm0   \n\t" /* next 8 luma bytes */
+                "movq            %%mm0, %%mm3   \n\t"
+                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U2 Y V2 Y U2 Y V2*/
+                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U3 Y V3 Y U3 Y V3*/
+                MOVNTQ"          %%mm0, 16(%4, %0, 8)   \n\t"
+                MOVNTQ"          %%mm3, 24(%4, %0, 8)   \n\t"
+
+                "movq            %%mm4, %%mm6   \n\t" /* keep U4U4..U7U7: its high half is needed for the fourth luma group */
+                "movq    16(%1, %0, 4), %%mm0   \n\t" /* third group of 8 luma bytes */
+                "movq            %%mm0, %%mm3   \n\t"
+                "punpcklbw       %%mm5, %%mm4   \n\t" /* U4V4 U4V4 U5V5 U5V5 */
+                "punpcklbw       %%mm4, %%mm0   \n\t" /* Y U4 Y V4 Y U4 Y V4*/
+                "punpckhbw       %%mm4, %%mm3   \n\t" /* Y U5 Y V5 Y U5 Y V5*/
+                MOVNTQ"          %%mm0, 32(%4, %0, 8)   \n\t"
+                MOVNTQ"          %%mm3, 40(%4, %0, 8)   \n\t"
+
+                "punpckhbw       %%mm5, %%mm6   \n\t" /* U6V6 U6V6 U7V7 U7V7 */
+                "movq    24(%1, %0, 4), %%mm0   \n\t" /* fourth group of 8 luma bytes */
+                "movq            %%mm0, %%mm3   \n\t"
+                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U6 Y V6 Y U6 Y V6*/
+                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U7 Y V7 Y U7 Y V7*/
+                MOVNTQ"          %%mm0, 48(%4, %0, 8)   \n\t"
+                MOVNTQ"          %%mm3, 56(%4, %0, 8)   \n\t"
+
+                : "+r" (x) /* declared read-write although the asm only reads it; the C loop advances x */
+                : "r"(yp), "r" (up), "r"(vp), "r"(d)
+                :"memory");
+        }
+        for (; x<w; x++) { /* scalar tail: one chroma pair and four luma bytes per step */
+            const int x2 = x<<2; /* luma index = 4 * chroma index */
+            d[8*x+0] = yp[x2];
+            d[8*x+1] = up[x];
+            d[8*x+2] = yp[x2+1];
+            d[8*x+3] = vp[x];
+            d[8*x+4] = yp[x2+2];
+            d[8*x+5] = up[x];
+            d[8*x+6] = yp[x2+3];
+            d[8*x+7] = vp[x];
+        }
+    }
+    __asm__( /* EMMS/SFENCE are macros defined elsewhere; presumably they leave MMX state and fence the MOVNTQ stores -- confirm */
+            EMMS"       \n\t"
+            SFENCE"     \n\t"
+            ::: "memory"
+        );
+}
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+
+static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
+{ /* dst[i] = src[2*i] for i in [0, count): copies the even-indexed bytes of src. */
+    dst +=   count; /* point both buffers at their ends ... */
+    src += 2*count;
+    count= - count; /* ... and walk a negative index up towards zero */
+
+    if(count <= -16) { /* the MMX loop produces 16 output bytes per iteration */
+        count += 15; /* bias: after "add $16" the sign flag is set only while >= 16 bytes remain */
+        __asm__ volatile(
+            "pcmpeqw       %%mm7, %%mm7        \n\t"
+            "psrlw            $8, %%mm7        \n\t" /* mm7 = 0x00FF in every 16-bit word */
+            "1:                                \n\t"
+            "movq -30(%1, %0, 2), %%mm0        \n\t" /* 32 source bytes; the -30 undoes the 2*15 bias */
+            "movq -22(%1, %0, 2), %%mm1        \n\t"
+            "movq -14(%1, %0, 2), %%mm2        \n\t"
+            "movq  -6(%1, %0, 2), %%mm3        \n\t"
+            "pand          %%mm7, %%mm0        \n\t" /* keep the low (even-indexed) byte of each word */
+            "pand          %%mm7, %%mm1        \n\t"
+            "pand          %%mm7, %%mm2        \n\t"
+            "pand          %%mm7, %%mm3        \n\t"
+            "packuswb      %%mm1, %%mm0        \n\t" /* words -> bytes; values are <= 0xFF so nothing saturates */
+            "packuswb      %%mm3, %%mm2        \n\t"
+            MOVNTQ"        %%mm0,-15(%2, %0)   \n\t" /* the -15 undoes the bias on the output side */
+            MOVNTQ"        %%mm2,- 7(%2, %0)   \n\t"
+            "add             $16, %0           \n\t"
+            " js 1b                            \n\t"
+            : "+r"(count) : "r"(src), "r"(dst)
+            : "memory" /* fix: the asm loads from src[] and stores to dst[] without memory operands, so the compiler must be told */
+        );
+        count -= 15; /* remove the bias; count is now -(bytes still to copy) */
+    }
+    while(count<0) { /* scalar tail (and the whole job when count < 16) */
+        dst[count]= src[2*count];
+        count++;
+    }
+}
+
+#if !COMPILE_TEMPLATE_AMD3DNOW
+static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
+{ /* dst0[i] = src[4*i], dst1[i] = src[4*i+2] for i in [0, count): splits the even-indexed bytes of src into two planes. */
+    dst0+=   count; /* point all buffers at their ends and index them with a negative count */
+    dst1+=   count;
+    src += 4*count;
+    count= - count;
+    if(count <= -8) { /* the MMX loop produces 8 bytes per destination per iteration */
+        count += 7; /* bias: after "add $8" the sign flag is set only while >= 8 outputs remain */
+        __asm__ volatile(
+            "pcmpeqw       %%mm7, %%mm7        \n\t"
+            "psrlw            $8, %%mm7        \n\t" /* mm7 = 0x00FF in every 16-bit word */
+            "1:                                \n\t"
+            "movq -28(%1, %0, 4), %%mm0        \n\t" /* 32 source bytes; the -28 undoes the 4*7 bias */
+            "movq -20(%1, %0, 4), %%mm1        \n\t"
+            "movq -12(%1, %0, 4), %%mm2        \n\t"
+            "movq  -4(%1, %0, 4), %%mm3        \n\t"
+            "pand          %%mm7, %%mm0        \n\t" /* stage 1: keep the even-indexed source bytes */
+            "pand          %%mm7, %%mm1        \n\t"
+            "pand          %%mm7, %%mm2        \n\t"
+            "pand          %%mm7, %%mm3        \n\t"
+            "packuswb      %%mm1, %%mm0        \n\t" /* mm0, mm2 = src[0], src[2], src[4], ... */
+            "packuswb      %%mm3, %%mm2        \n\t"
+            "movq          %%mm0, %%mm1        \n\t"
+            "movq          %%mm2, %%mm3        \n\t"
+            "psrlw            $8, %%mm0        \n\t" /* stage 2: high byte of each word = src[4*i+2] */
+            "psrlw            $8, %%mm2        \n\t"
+            "pand          %%mm7, %%mm1        \n\t" /* stage 2: low byte of each word = src[4*i] */
+            "pand          %%mm7, %%mm3        \n\t"
+            "packuswb      %%mm2, %%mm0        \n\t"
+            "packuswb      %%mm3, %%mm1        \n\t"
+            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t" /* src[4*i+2] -> dst1 */
+            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t" /* src[4*i]   -> dst0 */
+            "add              $8, %0           \n\t"
+            " js 1b                            \n\t"
+            : "+r"(count) : "r"(src), "r"(dst0), "r"(dst1)
+            : "memory" /* fix: the asm loads from src[] and stores to dst0[]/dst1[] without memory operands, so the compiler must be told */
+        );
+        count -= 7; /* remove the bias; count is now -(outputs still to produce) */
+    }
+    while(count<0) { /* scalar tail (and the whole job when count < 8) */
+        dst0[count]= src[4*count+0];
+        dst1[count]= src[4*count+2];
+        count++;
+    }
+}
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+
+static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
+{ /* Averages src0 and src1 bytewise, then dst0[i] = avg[4*i], dst1[i] = avg[4*i+2] for i in [0, count). */
+    dst0 +=   count; /* point all buffers at their ends and index them with a negative count */
+    dst1 +=   count;
+    src0 += 4*count;
+    src1 += 4*count;
+    count= - count;
+#ifdef PAVGB /* the SIMD path exists only when the template provides a packed-average instruction */
+    if(count <= -8) { /* the MMX loop produces 8 bytes per destination per iteration */
+        count += 7; /* bias: after "add $8" the sign flag is set only while >= 8 outputs remain */
+        __asm__ volatile(
+            "pcmpeqw        %%mm7, %%mm7        \n\t"
+            "psrlw             $8, %%mm7        \n\t" /* mm7 = 0x00FF in every 16-bit word */
+            "1:                                \n\t"
+            "movq  -28(%1, %0, 4), %%mm0        \n\t" /* 32 bytes of src0; the -28 undoes the 4*7 bias */
+            "movq  -20(%1, %0, 4), %%mm1        \n\t"
+            "movq  -12(%1, %0, 4), %%mm2        \n\t"
+            "movq   -4(%1, %0, 4), %%mm3        \n\t"
+            PAVGB" -28(%2, %0, 4), %%mm0        \n\t" /* NOTE(review): if PAVGB is pavgb/pavgusb it rounds up, while the scalar tail below truncates -- confirm */
+            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
+            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
+            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
+            "pand           %%mm7, %%mm0        \n\t" /* stage 1: keep the even-indexed averaged bytes */
+            "pand           %%mm7, %%mm1        \n\t"
+            "pand           %%mm7, %%mm2        \n\t"
+            "pand           %%mm7, %%mm3        \n\t"
+            "packuswb       %%mm1, %%mm0        \n\t" /* mm0, mm2 = avg[0], avg[2], avg[4], ... */
+            "packuswb       %%mm3, %%mm2        \n\t"
+            "movq           %%mm0, %%mm1        \n\t"
+            "movq           %%mm2, %%mm3        \n\t"
+            "psrlw             $8, %%mm0        \n\t" /* stage 2: high byte of each word = avg[4*i+2] */
+            "psrlw             $8, %%mm2        \n\t"
+            "pand           %%mm7, %%mm1        \n\t" /* stage 2: low byte of each word = avg[4*i] */
+            "pand           %%mm7, %%mm3        \n\t"
+            "packuswb       %%mm2, %%mm0        \n\t"
+            "packuswb       %%mm3, %%mm1        \n\t"
+            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t" /* avg[4*i+2] -> dst1 */
+            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t" /* avg[4*i]   -> dst0 */
+            "add               $8, %0           \n\t"
+            " js 1b                            \n\t"
+            : "+r"(count) : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
+            : "memory" /* fix: the asm loads from src0[]/src1[] and stores to dst0[]/dst1[] without memory operands, so the compiler must be told */
+        );
+        count -= 7; /* remove the bias; count is now -(outputs still to produce) */
+    }
+#endif
+    while(count<0) { /* scalar tail, or the whole job when PAVGB is undefined or count < 8 */
+        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
+        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
+        count++;
+    }
+}
+
+#if !COMPILE_TEMPLATE_AMD3DNOW
+static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
+{ /* dst0[i] = src[4*i+1], dst1[i] = src[4*i+3] for i in [0, count): splits the odd-indexed bytes of src into two planes. */
+    dst0+=   count; /* point all buffers at their ends and index them with a negative count */
+    dst1+=   count;
+    src += 4*count;
+    count= - count;
+    if(count <= -8) { /* the MMX loop produces 8 bytes per destination per iteration */
+        count += 7; /* bias: after "add $8" the sign flag is set only while >= 8 outputs remain */
+        __asm__ volatile(
+            "pcmpeqw       %%mm7, %%mm7        \n\t"
+            "psrlw            $8, %%mm7        \n\t" /* mm7 = 0x00FF in every 16-bit word */
+            "1:                                \n\t"
+            "movq -28(%1, %0, 4), %%mm0        \n\t" /* 32 source bytes; the -28 undoes the 4*7 bias */
+            "movq -20(%1, %0, 4), %%mm1        \n\t"
+            "movq -12(%1, %0, 4), %%mm2        \n\t"
+            "movq  -4(%1, %0, 4), %%mm3        \n\t"
+            "psrlw            $8, %%mm0        \n\t" /* stage 1: keep the high (odd-indexed) byte of each word */
+            "psrlw            $8, %%mm1        \n\t"
+            "psrlw            $8, %%mm2        \n\t"
+            "psrlw            $8, %%mm3        \n\t"
+            "packuswb      %%mm1, %%mm0        \n\t" /* mm0, mm2 = src[1], src[3], src[5], ... */
+            "packuswb      %%mm3, %%mm2        \n\t"
+            "movq          %%mm0, %%mm1        \n\t"
+            "movq          %%mm2, %%mm3        \n\t"
+            "psrlw            $8, %%mm0        \n\t" /* stage 2: high byte of each word = src[4*i+3] */
+            "psrlw            $8, %%mm2        \n\t"
+            "pand          %%mm7, %%mm1        \n\t" /* stage 2: low byte of each word = src[4*i+1] */
+            "pand          %%mm7, %%mm3        \n\t"
+            "packuswb      %%mm2, %%mm0        \n\t"
+            "packuswb      %%mm3, %%mm1        \n\t"
+            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t" /* src[4*i+3] -> dst1 */
+            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t" /* src[4*i+1] -> dst0 */
+            "add              $8, %0           \n\t"
+            " js 1b                            \n\t"
+            : "+r"(count) : "r"(src), "r"(dst0), "r"(dst1)
+            : "memory" /* fix: the asm loads from src[] and stores to dst0[]/dst1[] without memory operands, so the compiler must be told */
+        );
+        count -= 7; /* remove the bias; count is now -(outputs still to produce) */
+    }
+    src++; /* shift by one so the tail's +0/+2 offsets address the odd bytes */
+    while(count<0) { /* scalar tail (and the whole job when count < 8) */
+        dst0[count]= src[4*count+0];
+        dst1[count]= src[4*count+2];
+        count++;
+    }
+}
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+
+static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
+{ /* Averages src0 and src1 bytewise, then dst0[i] = avg[4*i+1], dst1[i] = avg[4*i+3] for i in [0, count). */
+    dst0 +=   count; /* point all buffers at their ends and index them with a negative count */
+    dst1 +=   count;
+    src0 += 4*count;
+    src1 += 4*count;
+    count= - count;
+#ifdef PAVGB /* the SIMD path exists only when the template provides a packed-average instruction */
+    if(count <= -8) { /* the MMX loop produces 8 bytes per destination per iteration */
+        count += 7; /* bias: after "add $8" the sign flag is set only while >= 8 outputs remain */
+        __asm__ volatile(
+            "pcmpeqw        %%mm7, %%mm7        \n\t"
+            "psrlw             $8, %%mm7        \n\t" /* mm7 = 0x00FF in every 16-bit word */
+            "1:                                \n\t"
+            "movq  -28(%1, %0, 4), %%mm0        \n\t" /* 32 bytes of src0; the -28 undoes the 4*7 bias */
+            "movq  -20(%1, %0, 4), %%mm1        \n\t"
+            "movq  -12(%1, %0, 4), %%mm2        \n\t"
+            "movq   -4(%1, %0, 4), %%mm3        \n\t"
+            PAVGB" -28(%2, %0, 4), %%mm0        \n\t" /* NOTE(review): if PAVGB is pavgb/pavgusb it rounds up, while the scalar tail below truncates -- confirm */
+            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
+            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
+            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
+            "psrlw             $8, %%mm0        \n\t" /* stage 1: keep the high (odd-indexed) byte of each word */
+            "psrlw             $8, %%mm1        \n\t"
+            "psrlw             $8, %%mm2        \n\t"
+            "psrlw             $8, %%mm3        \n\t"
+            "packuswb       %%mm1, %%mm0        \n\t" /* mm0, mm2 = avg[1], avg[3], avg[5], ... */
+            "packuswb       %%mm3, %%mm2        \n\t"
+            "movq           %%mm0, %%mm1        \n\t"
+            "movq           %%mm2, %%mm3        \n\t"
+            "psrlw             $8, %%mm0        \n\t" /* stage 2: high byte of each word = avg[4*i+3] */
+            "psrlw             $8, %%mm2        \n\t"
+            "pand           %%mm7, %%mm1        \n\t" /* stage 2: low byte of each word = avg[4*i+1] */
+            "pand           %%mm7, %%mm3        \n\t"
+            "packuswb       %%mm2, %%mm0        \n\t"
+            "packuswb       %%mm3, %%mm1        \n\t"
+            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t" /* avg[4*i+3] -> dst1 */
+            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t" /* avg[4*i+1] -> dst0 */
+            "add               $8, %0           \n\t"
+            " js 1b                            \n\t"
+            : "+r"(count) : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
+            : "memory" /* fix: the asm loads from src0[]/src1[] and stores to dst0[]/dst1[] without memory operands, so the compiler must be told */
+        );
+        count -= 7; /* remove the bias; count is now -(outputs still to produce) */
+    }
+#endif
+    src0++; /* shift by one so the tail's +0/+2 offsets address the odd bytes */
+    src1++;
+    while(count<0) { /* scalar tail, or the whole job when PAVGB is undefined or count < 8 */
+        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
+        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
+        count++;
+    }
+}
+
+/* Packed YUYV -> planar Y/U/V with vertically halved chroma: luma is taken from
+ * every row, chroma once per row pair as the average of both rows' samples. */
+static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
+                                 int width, int height,
+                                 int lumStride, int chromStride, int srcStride)
+{
+    const int chromWidth = -((-width) >> 1); /* width / 2, rounded up */
+    int row = 0;
+
+    while (row < height) {
+        RENAME(extract_even)(src, ydst, width);         /* Y = even bytes */
+        ydst += lumStride;
+        if (row & 1) {
+            /* U/V = odd bytes, averaged with the row directly above */
+            RENAME(extract_odd2avg)(src - srcStride, src, udst, vdst, chromWidth);
+            udst += chromStride;
+            vdst += chromStride;
+        }
+        src += srcStride;
+        row++;
+    }
+    /* EMMS/SFENCE macros come from elsewhere in the template */
+    __asm__(EMMS"       \n\t" SFENCE"     \n\t" ::: "memory");
+}
+
+#if !COMPILE_TEMPLATE_AMD3DNOW
+/* Packed YUYV -> planar Y/U/V, one chroma row per luma row: Y comes from the
+ * even bytes, U and V from the alternating odd bytes of each source row. */
+static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
+                                 int width, int height,
+                                 int lumStride, int chromStride, int srcStride)
+{
+    const int chromWidth = -((-width) >> 1); /* width / 2, rounded up */
+    int rows_left;
+
+    for (rows_left = height; rows_left > 0; rows_left--) {
+        RENAME(extract_even)(src, ydst, width);
+        RENAME(extract_odd2)(src, udst, vdst, chromWidth);
+
+        ydst += lumStride;
+        udst += chromStride;
+        vdst += chromStride;
+        src  += srcStride;
+    }
+    /* EMMS/SFENCE macros come from elsewhere in the template; they run once,
+     * after all rows have been converted */
+    __asm__(EMMS"       \n\t" SFENCE"     \n\t" ::: "memory");
+}
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+
+/* Packed UYVY -> planar Y/U/V with vertically halved chroma: luma is taken from
+ * every row, chroma once per row pair as the average of both rows' samples. */
+static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
+                                 int width, int height,
+                                 int lumStride, int chromStride, int srcStride)
+{
+    const int chromWidth = -((-width) >> 1); /* width / 2, rounded up */
+    int row = 0;
+
+    while (row < height) {
+        RENAME(extract_even)(src + 1, ydst, width);     /* Y = odd bytes */
+        ydst += lumStride;
+        if (row & 1) {
+            /* U/V = even bytes, averaged with the row directly above */
+            RENAME(extract_even2avg)(src - srcStride, src, udst, vdst, chromWidth);
+            udst += chromStride;
+            vdst += chromStride;
+        }
+        src += srcStride;
+        row++;
+    }
+    /* EMMS/SFENCE macros come from elsewhere in the template */
+    __asm__(EMMS"       \n\t" SFENCE"     \n\t" ::: "memory");
+}
+
+#if !COMPILE_TEMPLATE_AMD3DNOW
+/* Packed UYVY -> planar Y/U/V, one chroma row per luma row: Y comes from the
+ * odd bytes, U and V from the alternating even bytes of each source row. */
+static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
+                                 int width, int height,
+                                 int lumStride, int chromStride, int srcStride)
+{
+    const int chromWidth = -((-width) >> 1); /* width / 2, rounded up */
+    int rows_left;
+
+    for (rows_left = height; rows_left > 0; rows_left--) {
+        RENAME(extract_even)(src + 1, ydst, width);
+        RENAME(extract_even2)(src, udst, vdst, chromWidth);
+
+        ydst += lumStride;
+        udst += chromStride;
+        vdst += chromStride;
+        src  += srcStride;
+    }
+    /* EMMS/SFENCE macros come from elsewhere in the template; they run once,
+     * after all rows have been converted */
+    __asm__(EMMS"       \n\t" SFENCE"     \n\t" ::: "memory");
+}
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+#endif /* !COMPILE_TEMPLATE_SSE2 */
+
+static inline void RENAME(rgb2rgb_init)(void)
+{
+    /* Point the converter function pointers at this template instance's
+     * implementations; each group is only compiled for some CPU variants. */
+#if !COMPILE_TEMPLATE_AMD3DNOW
+    interleaveBytes = RENAME(interleaveBytes);
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+#if !COMPILE_TEMPLATE_SSE2
+    rgb24toyv12  = RENAME(rgb24toyv12);
+    yuyvtoyuv420 = RENAME(yuyvtoyuv420);
+    uyvytoyuv420 = RENAME(uyvytoyuv420);
+#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
+    planar2x = RENAME(planar2x);
+#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
+#if !COMPILE_TEMPLATE_AMD3DNOW
+    rgb15to16 = RENAME(rgb15to16);
+    rgb15to32 = RENAME(rgb15to32);
+    rgb15tobgr24 = RENAME(rgb15tobgr24);
+    rgb16to15 = RENAME(rgb16to15);
+    rgb16to32 = RENAME(rgb16to32);
+    rgb16tobgr24 = RENAME(rgb16tobgr24);
+    rgb24to15 = RENAME(rgb24to15);
+    rgb24to16 = RENAME(rgb24to16);
+    rgb24tobgr15 = RENAME(rgb24tobgr15);
+    rgb24tobgr16 = RENAME(rgb24tobgr16);
+    rgb24tobgr24 = RENAME(rgb24tobgr24);
+    rgb24tobgr32 = RENAME(rgb24tobgr32);
+    rgb32to15 = RENAME(rgb32to15);
+    rgb32to16 = RENAME(rgb32to16);
+    rgb32tobgr15 = RENAME(rgb32tobgr15);
+    rgb32tobgr16 = RENAME(rgb32tobgr16);
+    rgb32tobgr24 = RENAME(rgb32tobgr24);
+    shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
+    /* YUV repacking and resampling helpers */
+    yv12toyuy2 = RENAME(yv12toyuy2);
+    yv12touyvy = RENAME(yv12touyvy);
+    yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
+    yuv422ptouyvy = RENAME(yuv422ptouyvy);
+    yuy2toyv12 = RENAME(yuy2toyv12);
+    vu9_to_vu12 = RENAME(vu9_to_vu12);
+    yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
+    uyvytoyuv422 = RENAME(uyvytoyuv422);
+    yuyvtoyuv422 = RENAME(yuyvtoyuv422);
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+#endif /* !COMPILE_TEMPLATE_SSE2 */
+}
diff --git a/ffmpeg/libswscale/x86/scale.asm b/ffmpeg/libswscale/x86/scale.asm
new file mode 100644
index 0000000..c6dafde
--- /dev/null
+++ b/ffmpeg/libswscale/x86/scale.asm
@@ -0,0 +1,431 @@
+;******************************************************************************
+;* x86-optimized horizontal line scaling functions
+;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+max_19bit_int: times 4 dd 0x7ffff
+max_19bit_flt: times 4 dd 524287.0
+minshort:      times 8 dw 0x8000
+unicoeff:      times 4 dd 0x20000000
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; horizontal line scaling
+;
+; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
+;                               (SwsContext *c, int{16,32}_t *dst,
+;                                int dstW, const uint{8,16}_t *src,
+;                                const int16_t *filter,
+;                                const int32_t *filterPos, int filterSize);
+;
+; Scale one horizontal line. Input samples are stored in either 8 or 16 bits
+; ($source_width can be 8, 9, 10 or 16; the difference is whether we have to
+; downscale before multiplying). The filter is 14-bit. Output is either 15-bit
+; (in int16_t) or 19-bit (in int32_t), as given by $intermediate_nbits. Each
+; output pixel is generated from $filterSize input pixels; the position of
+; the first of them is given by filterPos[nOutputPixel].
+;-----------------------------------------------------------------------------
+
+; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
+%macro SCALE_FUNC 6
+%ifnidn %3, X
+cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
+%else
+cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
+%endif
+%if ARCH_X86_64
+    movsxd        wq, wd
+%define mov32 movsxd
+%else ; x86-32
+%define mov32 mov
+%endif ; x86-64
+%if %2 == 19
+%if mmsize == 8 ; mmx
+    mova          m2, [max_19bit_int]
+%elif cpuflag(sse4)
+    mova          m2, [max_19bit_int]
+%else ; ssse3/sse2
+    mova          m2, [max_19bit_flt]
+%endif ; mmx/sse2/ssse3/sse4
+%endif ; %2 == 19
+%if %1 == 16
+    mova          m6, [minshort]
+    mova          m7, [unicoeff]
+%elif %1 == 8
+    pxor          m3, m3
+%endif ; %1 == 8/16
+
+%if %1 == 8
+%define movlh movd
+%define movbh movh
+%define srcmul 1
+%else ; %1 == 9-16
+%define movlh movq
+%define movbh movu
+%define srcmul 2
+%endif ; %1 == 8/9-16
+
+%ifnidn %3, X
+
+    ; setup loop
+%if %3 == 8
+    shl           wq, 1                         ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
+%define wshr 1
+%else ; %3 == 4
+%define wshr 0
+%endif ; %3 == 8
+    lea      filterq, [filterq+wq*8]
+%if %2 == 15
+    lea         dstq, [dstq+wq*(2>>wshr)]
+%else ; %2 == 19
+    lea         dstq, [dstq+wq*(4>>wshr)]
+%endif ; %2 == 15/19
+    lea      fltposq, [fltposq+wq*(4>>wshr)]
+    neg           wq
+
+.loop:
+%if %3 == 4 ; filterSize == 4 scaling
+    ; load 2x4 or 4x4 source pixels into m0/m1
+    mov32      pos0q, dword [fltposq+wq*4+ 0]   ; filterPos[0]
+    mov32      pos1q, dword [fltposq+wq*4+ 4]   ; filterPos[1]
+    movlh         m0, [srcq+pos0q*srcmul]       ; src[filterPos[0] + {0,1,2,3}]
+%if mmsize == 8
+    movlh         m1, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
+%else ; mmsize == 16
+%if %1 > 8
+    movhps        m0, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
+%else ; %1 == 8
+    movd          m4, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
+%endif
+    mov32      pos0q, dword [fltposq+wq*4+ 8]   ; filterPos[2]
+    mov32      pos1q, dword [fltposq+wq*4+12]   ; filterPos[3]
+    movlh         m1, [srcq+pos0q*srcmul]       ; src[filterPos[2] + {0,1,2,3}]
+%if %1 > 8
+    movhps        m1, [srcq+pos1q*srcmul]       ; src[filterPos[3] + {0,1,2,3}]
+%else ; %1 == 8
+    movd          m5, [srcq+pos1q*srcmul]       ; src[filterPos[3] + {0,1,2,3}]
+    punpckldq     m0, m4
+    punpckldq     m1, m5
+%endif ; %1 == 8
+%endif ; mmsize == 8/16
+%if %1 == 8
+    punpcklbw     m0, m3                        ; byte -> word
+    punpcklbw     m1, m3                        ; byte -> word
+%endif ; %1 == 8
+
+    ; multiply with filter coefficients
+%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
+             ; add back 0x8000 * sum(coeffs) after the horizontal add
+    psubw         m0, m6
+    psubw         m1, m6
+%endif ; %1 == 16
+    pmaddwd       m0, [filterq+wq*8+mmsize*0]   ; *= filter[{0,1,..,6,7}]
+    pmaddwd       m1, [filterq+wq*8+mmsize*1]   ; *= filter[{8,9,..,14,15}]
+
+    ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
+%if mmsize == 8 ; mmx
+    movq          m4, m0
+    punpckldq     m0, m1
+    punpckhdq     m4, m1
+    paddd         m0, m4
+%elif notcpuflag(ssse3) ; sse2
+    mova          m4, m0
+    shufps        m0, m1, 10001000b
+    shufps        m4, m1, 11011101b
+    paddd         m0, m4
+%else ; ssse3/sse4
+    phaddd        m0, m1                        ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
+                                                ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
+                                                ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
+                                                ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
+%endif ; mmx/sse2/ssse3/sse4
+%else ; %3 == 8, i.e. filterSize == 8 scaling
+    ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
+    mov32      pos0q, dword [fltposq+wq*2+0]    ; filterPos[0]
+    mov32      pos1q, dword [fltposq+wq*2+4]    ; filterPos[1]
+    movbh         m0, [srcq+ pos0q   *srcmul]   ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
+%if mmsize == 8
+    movbh         m1, [srcq+(pos0q+4)*srcmul]   ; src[filterPos[0] + {4,5,6,7}]
+    movbh         m4, [srcq+ pos1q   *srcmul]   ; src[filterPos[1] + {0,1,2,3}]
+    movbh         m5, [srcq+(pos1q+4)*srcmul]   ; src[filterPos[1] + {4,5,6,7}]
+%else ; mmsize == 16
+    movbh         m1, [srcq+ pos1q   *srcmul]   ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
+    mov32      pos0q, dword [fltposq+wq*2+8]    ; filterPos[2]
+    mov32      pos1q, dword [fltposq+wq*2+12]   ; filterPos[3]
+    movbh         m4, [srcq+ pos0q   *srcmul]   ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
+    movbh         m5, [srcq+ pos1q   *srcmul]   ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
+%endif ; mmsize == 8/16
+%if %1 == 8
+    punpcklbw     m0, m3                        ; byte -> word
+    punpcklbw     m1, m3                        ; byte -> word
+    punpcklbw     m4, m3                        ; byte -> word
+    punpcklbw     m5, m3                        ; byte -> word
+%endif ; %1 == 8
+
+    ; multiply
+%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
+             ; add back 0x8000 * sum(coeffs) after the horizontal add
+    psubw         m0, m6
+    psubw         m1, m6
+    psubw         m4, m6
+    psubw         m5, m6
+%endif ; %1 == 16
+    pmaddwd       m0, [filterq+wq*8+mmsize*0]   ; *= filter[{0,1,..,6,7}]
+    pmaddwd       m1, [filterq+wq*8+mmsize*1]   ; *= filter[{8,9,..,14,15}]
+    pmaddwd       m4, [filterq+wq*8+mmsize*2]   ; *= filter[{16,17,..,22,23}]
+    pmaddwd       m5, [filterq+wq*8+mmsize*3]   ; *= filter[{24,25,..,30,31}]
+
+    ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
+%if mmsize == 8
+    paddd         m0, m1
+    paddd         m4, m5
+    movq          m1, m0
+    punpckldq     m0, m4
+    punpckhdq     m1, m4
+    paddd         m0, m1
+%elif notcpuflag(ssse3) ; sse2
+%if %1 == 8
+%define mex m6
+%else
+%define mex m3
+%endif
+    ; emulate horizontal add as transpose + vertical add
+    mova         mex, m0
+    punpckldq     m0, m1
+    punpckhdq    mex, m1
+    paddd         m0, mex
+    mova          m1, m4
+    punpckldq     m4, m5
+    punpckhdq     m1, m5
+    paddd         m4, m1
+    mova          m1, m0
+    punpcklqdq    m0, m4
+    punpckhqdq    m1, m4
+    paddd         m0, m1
+%else ; ssse3/sse4
+    ; FIXME if we rearrange the filter in pairs of 4, we can
+    ; load pixels likewise and use 2 x paddd + phaddd instead
+    ; of 3 x phaddd here, faster on older cpus
+    phaddd        m0, m1
+    phaddd        m4, m5
+    phaddd        m0, m4                        ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
+                                                ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
+                                                ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
+                                                ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
+%endif ; mmx/sse2/ssse3/sse4
+%endif ; %3 == 4/8
+
+%else ; %3 == X, i.e. any filterSize scaling
+
+%ifidn %4, X4
+%define dlt 4
+%else ; %4 == X || %4 == X8
+%define dlt 0
+%endif ; %4 ==/!= X4
+%if ARCH_X86_64
+%define srcq    r8
+%define pos1q   r7
+%define srcendq r9
+    movsxd  fltsizeq, fltsized                  ; filterSize
+    lea      srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
+%else ; x86-32
+%define srcq    srcmemq
+%define pos1q   dstq
+%define srcendq r6m
+    lea        pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
+    mov      srcendq, pos0q
+%endif ; x86-32/64
+    lea      fltposq, [fltposq+wq*4]
+%if %2 == 15
+    lea         dstq, [dstq+wq*2]
+%else ; %2 == 19
+    lea         dstq, [dstq+wq*4]
+%endif ; %2 == 15/19
+    movifnidn  dstmp, dstq
+    neg           wq
+
+.loop:
+    mov32      pos0q, dword [fltposq+wq*4+0]    ; filterPos[0]
+    mov32      pos1q, dword [fltposq+wq*4+4]    ; filterPos[1]
+    ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
+    pxor          m4, m4
+    pxor          m5, m5
+    mov         srcq, srcmemmp
+
+.innerloop:
+    ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
+    movbh         m0, [srcq+ pos0q     *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
+    movbh         m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
+%if %1 == 8
+    punpcklbw     m0, m3
+    punpcklbw     m1, m3
+%endif ; %1 == 8
+
+    ; multiply
+%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
+             ; add back 0x8000 * sum(coeffs) after the horizontal add
+    psubw         m0, m6
+    psubw         m1, m6
+%endif ; %1 == 16
+    pmaddwd       m0, [filterq]                 ; filter[{0,1,2,3(,4,5,6,7)}]
+    pmaddwd       m1, [filterq+(fltsizeq+dlt)*2]; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
+    paddd         m4, m0
+    paddd         m5, m1
+    add      filterq, mmsize
+    add         srcq, srcmul*mmsize/2
+    cmp         srcq, srcendq                   ; while (src += 4) < &src[filterSize]
+    jl .innerloop
+
+%ifidn %4, X4
+    mov32      pos1q, dword [fltposq+wq*4+4]    ; filterPos[1]
+    movlh         m0, [srcq+ pos0q     *srcmul] ; split last 4 srcpx of dstpx[0]
+    sub        pos1q, fltsizeq                  ; and first 4 srcpx of dstpx[1]
+%if %1 > 8
+    movhps        m0, [srcq+(pos1q+dlt)*srcmul]
+%else ; %1 == 8
+    movd          m1, [srcq+(pos1q+dlt)*srcmul]
+    punpckldq     m0, m1
+%endif ; %1 == 8
+%if %1 == 8
+    punpcklbw     m0, m3
+%endif ; %1 == 8
+%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
+             ; add back 0x8000 * sum(coeffs) after the horizontal add
+    psubw         m0, m6
+%endif ; %1 == 16
+    pmaddwd       m0, [filterq]
+%endif ; %4 == X4
+
+    lea      filterq, [filterq+(fltsizeq+dlt)*2]
+
+%if mmsize == 8 ; mmx
+    movq          m0, m4
+    punpckldq     m4, m5
+    punpckhdq     m0, m5
+    paddd         m0, m4
+%else ; mmsize == 16
+%if notcpuflag(ssse3) ; sse2
+    mova          m1, m4
+    punpcklqdq    m4, m5
+    punpckhqdq    m1, m5
+    paddd         m4, m1
+%else ; ssse3/sse4
+    phaddd        m4, m5
+%endif ; sse2/ssse3/sse4
+%ifidn %4, X4
+    paddd         m4, m0
+%endif ; %3 == X4
+%if notcpuflag(ssse3) ; sse2
+    pshufd        m4, m4, 11011000b
+    movhlps       m0, m4
+    paddd         m0, m4
+%else ; ssse3/sse4
+    phaddd        m4, m4
+    SWAP           0, 4
+%endif ; sse2/ssse3/sse4
+%endif ; mmsize == 8/16
+%endif ; %3 ==/!= X
+
+%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
+    paddd         m0, m7
+%endif ; %1 == 16
+
+    ; clip, store
+    psrad         m0, 14 + %1 - %2
+%ifidn %3, X
+    movifnidn   dstq, dstmp
+%endif ; %3 == X
+%if %2 == 15
+    packssdw      m0, m0
+%ifnidn %3, X
+    movh [dstq+wq*(2>>wshr)], m0
+%else ; %3 == X
+    movd [dstq+wq*2], m0
+%endif ; %3 ==/!= X
+%else ; %2 == 19
+%if mmsize == 8
+    PMINSD_MMX    m0, m2, m4
+%elif cpuflag(sse4)
+    pminsd        m0, m2
+%else ; sse2/ssse3
+    cvtdq2ps      m0, m0
+    minps         m0, m2
+    cvtps2dq      m0, m0
+%endif ; mmx/sse2/ssse3/sse4
+%ifnidn %3, X
+    mova [dstq+wq*(4>>wshr)], m0
+%else ; %3 == X
+    movq [dstq+wq*4], m0
+%endif ; %3 ==/!= X
+%endif ; %2 == 15/19
+%ifnidn %3, X
+    add           wq, (mmsize<<wshr)/4          ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
+                                                ; per iteration. see "shl wq,1" above as for why we do this
+%else ; %3 == X
+    add           wq, 2
+%endif ; %3 ==/!= X
+    jl .loop
+    REP_RET
+%endmacro
+
+; SCALE_FUNCS source_width, intermediate_nbits, n_xmm -- emits every filter-size variant for one depth pair
+%macro SCALE_FUNCS 3
+SCALE_FUNC %1, %2, 4, 4,  6, %3
+SCALE_FUNC %1, %2, 8, 8,  6, %3
+%if mmsize == 8 ; mmx: a single variant handles every other filter size
+SCALE_FUNC %1, %2, X, X,  7, %3
+%else ; xmm: X4 runs an extra 4-tap tail step after the inner loop (dlt == 4), X8 does not (dlt == 0)
+SCALE_FUNC %1, %2, X, X4, 7, %3
+SCALE_FUNC %1, %2, X, X8, 7, %3
+%endif ; mmsize == 8/16
+%endmacro
+
+; SCALE_FUNCS2 8_xmm_args, 9to14_xmm_args, 16_xmm_args (n_xmm passed on for 8-, 9..14- and 16-bit sources)
+%macro SCALE_FUNCS2 3
+%if notcpuflag(sse4) ; the 15-bit-intermediate variants are not emitted when assembling for sse4
+SCALE_FUNCS  8, 15, %1
+SCALE_FUNCS  9, 15, %2
+SCALE_FUNCS 10, 15, %2
+SCALE_FUNCS 12, 15, %2
+SCALE_FUNCS 14, 15, %2
+SCALE_FUNCS 16, 15, %3
+%endif ; !sse4
+SCALE_FUNCS  8, 19, %1
+SCALE_FUNCS  9, 19, %2
+SCALE_FUNCS 10, 19, %2
+SCALE_FUNCS 12, 19, %2
+SCALE_FUNCS 14, 19, %2
+SCALE_FUNCS 16, 19, %3
+%endmacro
+
+%if ARCH_X86_32 ; the mmx variants are only assembled for 32-bit builds
+INIT_MMX mmx
+SCALE_FUNCS2 0, 0, 0
+%endif ; ARCH_X86_32
+INIT_XMM sse2
+SCALE_FUNCS2 6, 7, 8
+INIT_XMM ssse3
+SCALE_FUNCS2 6, 6, 8
+INIT_XMM sse4
+SCALE_FUNCS2 6, 6, 8
diff --git a/ffmpeg/libswscale/x86/swscale.c b/ffmpeg/libswscale/x86/swscale.c
new file mode 100644
index 0000000..2f67b1b
--- /dev/null
+++ b/ffmpeg/libswscale/x86/swscale.c
@@ -0,0 +1,585 @@
+/*
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <inttypes.h>
+#include "config.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavutil/cpu.h"
+#include "libavutil/pixdesc.h"
+
+#if HAVE_INLINE_ASM
+
+#define DITHER1XBPP
+
+DECLARE_ASM_CONST(8, uint64_t, bF8)=       0xF8F8F8F8F8F8F8F8LL; /* 0xF8 in every byte (top 5 bits set) */
+DECLARE_ASM_CONST(8, uint64_t, bFC)=       0xFCFCFCFCFCFCFCFCLL; /* 0xFC in every byte (top 6 bits set) */
+DECLARE_ASM_CONST(8, uint64_t, w10)=       0x0010001000100010LL; /* 16 in every 16-bit word */
+DECLARE_ASM_CONST(8, uint64_t, w02)=       0x0002000200020002LL; /* 2 in every 16-bit word */
+
+const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = { /* two rows; updateMMXDitherTables() picks one by output-line parity */
+    0x0103010301030103LL,
+    0x0200020002000200LL,};
+
+const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = { /* two rows; updateMMXDitherTables() picks one by output-line parity */
+    0x0602060206020602LL,
+    0x0004000400040004LL,};
+
+DECLARE_ASM_CONST(8, uint64_t, b16Mask)=   0x001F001F001F001FLL; /* bits 0-4 of every 16-bit word */
+DECLARE_ASM_CONST(8, uint64_t, g16Mask)=   0x07E007E007E007E0LL; /* bits 5-10 of every 16-bit word */
+DECLARE_ASM_CONST(8, uint64_t, r16Mask)=   0xF800F800F800F800LL; /* bits 11-15 of every 16-bit word */
+DECLARE_ASM_CONST(8, uint64_t, b15Mask)=   0x001F001F001F001FLL; /* bits 0-4 of every 16-bit word */
+DECLARE_ASM_CONST(8, uint64_t, g15Mask)=   0x03E003E003E003E0LL; /* bits 5-9 of every 16-bit word */
+DECLARE_ASM_CONST(8, uint64_t, r15Mask)=   0x7C007C007C007C00LL; /* bits 10-14 of every 16-bit word */
+
+DECLARE_ALIGNED(8, const uint64_t, ff_M24A)         = 0x00FF0000FF0000FFLL; /* selects bytes 0, 3 and 6 */
+DECLARE_ALIGNED(8, const uint64_t, ff_M24B)         = 0xFF0000FF0000FF00LL; /* selects bytes 1, 4 and 7 */
+DECLARE_ALIGNED(8, const uint64_t, ff_M24C)         = 0x0000FF0000FF0000LL; /* selects bytes 2 and 5 */
+
+#ifdef FAST_BGR2YV12 /* smaller-magnitude coefficient set; each constant holds four 16-bit lanes, top lane zero */
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff)   = 0x000000210041000DULL;
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff)   = 0x0000FFEEFFDC0038ULL;
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff)   = 0x00000038FFD2FFF8ULL;
+#else /* default coefficient set, same lane layout with larger-magnitude values */
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff)   = 0x000020E540830C8BULL;
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff)   = 0x0000ED0FDAC23831ULL;
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff)   = 0x00003831D0E6F6EAULL;
+#endif /* FAST_BGR2YV12 */
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset)  = 0x1010101010101010ULL; /* 16 in every byte */
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL; /* 128 in every byte */
+DECLARE_ALIGNED(8, const uint64_t, ff_w1111)        = 0x0001000100010001ULL; /* 1 in every 16-bit word */
+
+
+//MMX versions
+#if HAVE_MMX_INLINE
+#undef RENAME
+#define COMPILE_TEMPLATE_MMXEXT 0
+#define RENAME(a) a ## _MMX
+#include "swscale_template.c"
+#endif
+
+// MMXEXT versions
+#if HAVE_MMXEXT_INLINE
+#undef RENAME
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 1
+#define RENAME(a) a ## _MMXEXT
+#include "swscale_template.c"
+#endif
+
+/* Refresh the per-line dither constants and, for all but the last two output lines, the packed
+ * lum/chr/alpha vertical filter tables (source-line pointer plus coefficient per tap) for line dstY. */
+void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
+                           int lastInLumBuf, int lastInChrBuf)
+{
+    const int dstH= c->dstH;
+    const int flags= c->flags;
+    int16_t **lumPixBuf= c->lumPixBuf;
+    int16_t **chrUPixBuf= c->chrUPixBuf;
+    int16_t **alpPixBuf= c->alpPixBuf;
+    const int vLumBufSize= c->vLumBufSize;
+    const int vChrBufSize= c->vChrBufSize;
+    int32_t *vLumFilterPos= c->vLumFilterPos;
+    int32_t *vChrFilterPos= c->vChrFilterPos;
+    int16_t *vLumFilter= c->vLumFilter;
+    int16_t *vChrFilter= c->vChrFilter;
+    int32_t *lumMmxFilter= c->lumMmxFilter;
+    int32_t *chrMmxFilter= c->chrMmxFilter;
+    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
+    const int vLumFilterSize= c->vLumFilterSize;
+    const int vChrFilterSize= c->vChrFilterSize;
+    const int chrDstY= dstY>>c->chrDstVSubSample;
+    const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
+    const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
+    /* The dither row alternates with the parity of dstY; red takes the opposite row from blue. */
+    c->blueDither= ff_dither8[dstY&1];
+    if (c->dstFormat == AV_PIX_FMT_RGB555 || c->dstFormat == AV_PIX_FMT_BGR555)
+        c->greenDither= ff_dither8[dstY&1];
+    else
+        c->greenDither= ff_dither4[dstY&1];
+    c->redDither= ff_dither8[(dstY+1)&1];
+    if (dstY < dstH - 2) {
+        const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
+        const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+        const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
+        int i;
+        /* Taps outside [0, srcH): build a clamped pointer list at PixBuf + 2*BufSize (assumes spare room there -- TODO confirm); entries below index `neg` repeat entry `neg`, entries from `end` on repeat the last valid one. */
+        if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
+            const int16_t **tmpY = (const int16_t **) lumPixBuf + 2 * vLumBufSize;
+            int neg = -firstLumSrcY, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
+            for (i = 0; i < neg;            i++)
+                tmpY[i] = lumSrcPtr[neg];
+            for (     ; i < end;            i++)
+                tmpY[i] = lumSrcPtr[i];
+            for (     ; i < vLumFilterSize; i++)
+                tmpY[i] = tmpY[i-1];
+            lumSrcPtr = tmpY;
+
+            if (alpSrcPtr) {
+                const int16_t **tmpA = (const int16_t **) alpPixBuf + 2 * vLumBufSize;
+                for (i = 0; i < neg;            i++)
+                    tmpA[i] = alpSrcPtr[neg];
+                for (     ; i < end;            i++)
+                    tmpA[i] = alpSrcPtr[i];
+                for (     ; i < vLumFilterSize; i++)
+                    tmpA[i] = tmpA[i - 1];
+                alpSrcPtr = tmpA;
+            }
+        }
+        if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) {
+            const int16_t **tmpU = (const int16_t **) chrUPixBuf + 2 * vChrBufSize;
+            int neg = -firstChrSrcY, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
+            for (i = 0; i < neg;            i++)
+                tmpU[i] = chrUSrcPtr[neg];
+            for (     ; i < end;            i++)
+                tmpU[i] = chrUSrcPtr[i];
+            for (     ; i < vChrFilterSize; i++)
+                tmpU[i] = tmpU[i - 1];
+            chrUSrcPtr = tmpU;
+        }
+        /* ACCURATE_RND layout: one entry per tap pair (two pointers, coefficients packed low = tap i, high = tap i+1). The high half is scaled by multiplication because left-shifting a negative value is undefined. */
+        if (flags & SWS_ACCURATE_RND) {
+            int s= APCK_SIZE / 8;
+            for (i=0; i<vLumFilterSize; i+=2) {
+                *(const void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
+                *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
+                lumMmxFilter[s*i+APCK_COEF/4  ]=
+                lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
+                + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1] * (1 << 16) : 0);
+                if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
+                    *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
+                    *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
+                    alpMmxFilter[s*i+APCK_COEF/4  ]=
+                    alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
+                }
+            }
+            for (i=0; i<vChrFilterSize; i+=2) {
+                *(const void**)&chrMmxFilter[s*i              ]= chrUSrcPtr[i  ];
+                *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrUSrcPtr[i+(vChrFilterSize>1)];
+                chrMmxFilter[s*i+APCK_COEF/4  ]=
+                chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
+                + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1] * (1 << 16) : 0);
+            }
+        } else {
+            /* Default layout: four int32 slots per tap; the source pointer is stored at slot 0, slots 2 and 3 each hold the 16-bit coefficient duplicated into both halves. */
+            for (i=0; i<vLumFilterSize; i++) {
+                *(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i];
+                lumMmxFilter[4*i+2]=
+                lumMmxFilter[4*i+3]=
+                ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U;
+                if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
+                    *(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
+                    alpMmxFilter[4*i+2]=
+                    alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
+                }
+            }
+            for (i=0; i<vChrFilterSize; i++) {
+                *(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i];
+                chrMmxFilter[4*i+2]=
+                chrMmxFilter[4*i+3]=
+                ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001U;
+            }
+        }
+    }
+}
+
+#if HAVE_MMXEXT_INLINE
+/* SSE3 vertical filter producing 16 output bytes per pass; dest must be 16-byte aligned (movntdq), otherwise the MMXEXT version runs.
+ * NOTE(review): xmm3 is carried across separate asm statements and no xmm clobbers are declared -- relies on the compiler leaving it alone. */
+static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
+                           const int16_t **src, uint8_t *dest, int dstW,
+                           const uint8_t *dither, int offset)
+{
+    if (((uintptr_t)dest) & 15) {
+        yuv2yuvX_MMXEXT(filter, filterSize, src, dest, dstW, dither, offset);
+        return;
+    }
+    if (offset) {
+        __asm__ volatile("movq       (%0), %%xmm3\n\t"
+                         "movdqa    %%xmm3, %%xmm4\n\t"
+                         "psrlq       $24, %%xmm3\n\t" /* >> 3 bytes ... */
+                         "psllq       $40, %%xmm4\n\t" /* ... | << 5 bytes: rotates the 8 dither bytes by 3 */
+                         "por       %%xmm4, %%xmm3\n\t"
+                         :: "r"(dither));
+    } else {
+        __asm__ volatile("movq       (%0), %%xmm3\n\t"
+                         :: "r"(dither));
+    }
+    filterSize--;
+    __asm__ volatile(
+        "pxor      %%xmm0, %%xmm0\n\t"
+        "punpcklbw %%xmm0, %%xmm3\n\t" /* dither bytes -> eight 16-bit words */
+        "movd          %0, %%xmm1\n\t"
+        "punpcklwd %%xmm1, %%xmm1\n\t"
+        "punpckldq %%xmm1, %%xmm1\n\t"
+        "punpcklqdq %%xmm1, %%xmm1\n\t" /* low word of (filterSize - 1) in every lane */
+        "psllw         $3, %%xmm1\n\t"
+        "paddw     %%xmm1, %%xmm3\n\t"
+        "psraw         $4, %%xmm3\n\t" /* xmm3 = (dither + 8 * (filterSize - 1)) >> 4 */
+        ::"m"(filterSize));
+    __asm__ volatile(
+        "movdqa    %%xmm3, %%xmm4\n\t" /* both accumulators start from the dither term */
+        "movdqa    %%xmm3, %%xmm7\n\t" /* xmm7 keeps it to re-seed every 16-pixel pass */
+        "movl %3, %%ecx\n\t" /* sample index starts at offset; dest was biased by -offset to match */
+        "mov                                 %0, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        ".p2align                             4             \n\t" /* FIXME Unroll? */\
+        "1:                                                 \n\t"\
+        "movddup                  8(%%"REG_d"), %%xmm0      \n\t" /* filterCoeff */\
+        "movdqa              (%%"REG_S", %%"REG_c", 2), %%xmm2      \n\t" /* srcData */\
+        "movdqa            16(%%"REG_S", %%"REG_c", 2), %%xmm5      \n\t" /* srcData */\
+        "add                                $16, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        "test                         %%"REG_S", %%"REG_S"  \n\t"\
+        "pmulhw                           %%xmm0, %%xmm2      \n\t"\
+        "pmulhw                           %%xmm0, %%xmm5      \n\t"\
+        "paddw                            %%xmm2, %%xmm3      \n\t"\
+        "paddw                            %%xmm5, %%xmm4      \n\t"\
+        " jnz                                1b             \n\t" /* next 16-byte table entry until its source pointer is NULL */\
+        "psraw                               $3, %%xmm3      \n\t"\
+        "psraw                               $3, %%xmm4      \n\t"\
+        "packuswb                         %%xmm4, %%xmm3      \n\t"
+        "movntdq                          %%xmm3, (%1, %%"REG_c")\n\t" /* non-temporal store of 16 output bytes */
+        "add                         $16, %%"REG_c"         \n\t"\
+        "cmp                          %2, %%"REG_c"         \n\t"\
+        "movdqa    %%xmm7, %%xmm3\n\t"
+        "movdqa    %%xmm7, %%xmm4\n\t"
+        "mov                                 %0, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        "jb                                  1b             \n\t" /* flags still from the cmp: loop while index < dstW + offset */\
+        :: "g" (filter),
+           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
+        : "%"REG_d, "%"REG_S, "%"REG_c, "memory"
+    );
+}
+#endif
+
+#endif /* HAVE_INLINE_ASM */
+
+#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) /* prototype of ff_hscale<from_bpc>to<to_bpc>_<filter_n>_<opt>() */ \
+extern void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
+                                                SwsContext *c, int16_t *data, \
+                                                int dstW, const uint8_t *src, \
+                                                const int16_t *filter, \
+                                                const int32_t *filterPos, int filterSize)
+
+#define SCALE_FUNCS(filter_n, opt) /* all 12 source-depth x intermediate-depth prototypes for one filter variant */ \
+    SCALE_FUNC(filter_n,  8, 15, opt); \
+    SCALE_FUNC(filter_n,  9, 15, opt); \
+    SCALE_FUNC(filter_n, 10, 15, opt); \
+    SCALE_FUNC(filter_n, 12, 15, opt); \
+    SCALE_FUNC(filter_n, 14, 15, opt); \
+    SCALE_FUNC(filter_n, 16, 15, opt); \
+    SCALE_FUNC(filter_n,  8, 19, opt); \
+    SCALE_FUNC(filter_n,  9, 19, opt); \
+    SCALE_FUNC(filter_n, 10, 19, opt); \
+    SCALE_FUNC(filter_n, 12, 19, opt); \
+    SCALE_FUNC(filter_n, 14, 19, opt); \
+    SCALE_FUNC(filter_n, 16, 19, opt)
+
+#define SCALE_FUNCS_MMX(opt) /* filter variants 4, 8 and X */ \
+    SCALE_FUNCS(4, opt); \
+    SCALE_FUNCS(8, opt); \
+    SCALE_FUNCS(X, opt)
+
+#define SCALE_FUNCS_SSE(opt) /* filter variants 4, 8, X4 and X8 */ \
+    SCALE_FUNCS(4, opt); \
+    SCALE_FUNCS(8, opt); \
+    SCALE_FUNCS(X4, opt); \
+    SCALE_FUNCS(X8, opt)
+
+#if ARCH_X86_32
+SCALE_FUNCS_MMX(mmx);
+#endif
+SCALE_FUNCS_SSE(sse2);
+SCALE_FUNCS_SSE(ssse3);
+SCALE_FUNCS_SSE(sse4); /* the *to15 sse4 prototypes are never referenced: the init code pairs sse4 *to19 with ssse3 *to15 */
+
+#define VSCALEX_FUNC(size, opt) /* prototype of ff_yuv2planeX_<size>_<opt>() */ \
+extern void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \
+                                               const int16_t **src, uint8_t *dest, int dstW, \
+                                               const uint8_t *dither, int offset)
+#define VSCALEX_FUNCS(opt) /* the 8-, 9- and 10-bit prototypes for one instruction set */ \
+    VSCALEX_FUNC(8,  opt); \
+    VSCALEX_FUNC(9,  opt); \
+    VSCALEX_FUNC(10, opt)
+
+#if ARCH_X86_32
+VSCALEX_FUNCS(mmxext);
+#endif
+VSCALEX_FUNCS(sse2);
+VSCALEX_FUNCS(sse4);
+VSCALEX_FUNC(16, sse4); /* a 16-bit variant is declared for sse4 only */
+VSCALEX_FUNCS(avx);
+
+#define VSCALE_FUNC(size, opt) /* prototype of ff_yuv2plane1_<size>_<opt>() */ \
+extern void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int dstW, \
+                                               const uint8_t *dither, int offset)
+#define VSCALE_FUNCS(opt1, opt2) /* 8- and 16-bit prototypes use opt1, 9- and 10-bit ones use opt2 */ \
+    VSCALE_FUNC(8,  opt1); \
+    VSCALE_FUNC(9,  opt2); \
+    VSCALE_FUNC(10, opt2); \
+    VSCALE_FUNC(16, opt1)
+
+#if ARCH_X86_32
+VSCALE_FUNCS(mmx, mmxext);
+#endif
+VSCALE_FUNCS(sse2, sse2);
+VSCALE_FUNC(16, sse4); /* extra 16-bit variant, installed by the init code when SSE4 is available */
+VSCALE_FUNCS(avx, avx);
+
+#define INPUT_Y_FUNC(fmt, opt) /* prototype of ff_<fmt>ToY_<opt>(): one destination plane, one source pointer */ \
+extern void ff_ ## fmt ## ToY_  ## opt(uint8_t *dst, const uint8_t *src, \
+                                       const uint8_t *unused1, const uint8_t *unused2, \
+                                       int w, uint32_t *unused)
+#define INPUT_UV_FUNC(fmt, opt) /* prototype of ff_<fmt>ToUV_<opt>(): two destination planes, two source pointers */ \
+extern void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
+                                       const uint8_t *unused0, \
+                                       const uint8_t *src1, \
+                                       const uint8_t *src2, \
+                                       int w, uint32_t *unused)
+#define INPUT_FUNC(fmt, opt) /* both the ToY and the ToUV prototype of one format */ \
+    INPUT_Y_FUNC(fmt, opt); \
+    INPUT_UV_FUNC(fmt, opt)
+#define INPUT_FUNCS(opt) /* every input-reader prototype of one instruction set; nv12/nv21 get a ToUV prototype only */ \
+    INPUT_FUNC(uyvy, opt); \
+    INPUT_FUNC(yuyv, opt); \
+    INPUT_UV_FUNC(nv12, opt); \
+    INPUT_UV_FUNC(nv21, opt); \
+    INPUT_FUNC(rgba, opt); \
+    INPUT_FUNC(bgra, opt); \
+    INPUT_FUNC(argb, opt); \
+    INPUT_FUNC(abgr, opt); \
+    INPUT_FUNC(rgb24, opt); \
+    INPUT_FUNC(bgr24, opt)
+
+#if ARCH_X86_32
+INPUT_FUNCS(mmx);
+#endif
+INPUT_FUNCS(sse2);
+INPUT_FUNCS(ssse3);
+INPUT_FUNCS(avx);
+
+av_cold void ff_sws_init_swScale_mmx(SwsContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+    /* Install x86 function pointers into c. The blocks below run from the oldest to the newest instruction set, and each may overwrite pointers set by an earlier one. */
+#if HAVE_INLINE_ASM
+    if (cpu_flags & AV_CPU_FLAG_MMX)
+        sws_init_swScale_MMX(c);
+#if HAVE_MMXEXT_INLINE
+    if (cpu_flags & AV_CPU_FLAG_MMXEXT)
+        sws_init_swScale_MMXEXT(c);
+    if (cpu_flags & AV_CPU_FLAG_SSE3){
+        if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND))
+            c->yuv2planeX = yuv2yuvX_sse3; /* inline-asm SSE3 filter; the ASSIGN_VSCALEX_FUNC calls below may replace it for 9/10/16-bit output */
+    }
+#endif
+#endif /* HAVE_INLINE_ASM */
+/* ASSIGN_SCALE_FUNC2: pick the hscale routine by c->srcBpc; dstBpc <= 14 selects the to15 (opt2) entry, anything else the to19 (opt1) entry. */
+#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
+    if (c->srcBpc == 8) { \
+        hscalefn = c->dstBpc <= 14 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale8to19_ ## filtersize ## _ ## opt1; \
+    } else if (c->srcBpc == 9) { \
+        hscalefn = c->dstBpc <= 14 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale9to19_ ## filtersize ## _ ## opt1; \
+    } else if (c->srcBpc == 10) { \
+        hscalefn = c->dstBpc <= 14 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale10to19_ ## filtersize ## _ ## opt1; \
+    } else if (c->srcBpc == 12) { \
+        hscalefn = c->dstBpc <= 14 ? ff_hscale12to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale12to19_ ## filtersize ## _ ## opt1; \
+    } else if (c->srcBpc == 14 || ((c->srcFormat==AV_PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth_minus1<15)) { \
+        hscalefn = c->dstBpc <= 14 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale14to19_ ## filtersize ## _ ## opt1; \
+    } else { /* c->srcBpc == 16 */ \
+        av_assert0(c->srcBpc == 16);\
+        hscalefn = c->dstBpc <= 14 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale16to19_ ## filtersize ## _ ## opt1; \
+    } \
+} while (0)
+#define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) /* sizes 4 and 8 have dedicated routines, everything else uses X */ \
+    switch (filtersize) { \
+    case 4:  ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
+    case 8:  ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
+    default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \
+    }
+#define ASSIGN_VSCALEX_FUNC(vscalefn, opt, do_16_case, condition_8bit) /* the default (8-bit) case assigns nothing: its assignment is commented out */ \
+switch(c->dstBpc){ \
+    case 16:                          do_16_case;                          break; \
+    case 10: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_10_ ## opt; break; \
+    case 9:  if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_  ## opt; break; \
+    default: if (condition_8bit)    /*vscalefn = ff_yuv2planeX_8_  ## opt;*/ break; \
+    }
+#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) /* 9/10/16-bit routines are only installed for non-big-endian destinations */ \
+    switch(c->dstBpc){ \
+    case 16: if (!isBE(c->dstFormat))            vscalefn = ff_yuv2plane1_16_ ## opt1; break; \
+    case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \
+    case 9:  if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_  ## opt2;  break; \
+    case 8:                                      vscalefn = ff_yuv2plane1_8_  ## opt1;  break; \
+    default: av_assert0(c->dstBpc>8); \
+    }
+#define case_rgb(x, X, opt) /* luma reader always; chroma reader only when c->chrSrcHSubSample is zero */ \
+        case AV_PIX_FMT_ ## X: \
+            c->lumToYV12 = ff_ ## x ## ToY_ ## opt; \
+            if (!c->chrSrcHSubSample) \
+                c->chrToYV12 = ff_ ## x ## ToUV_ ## opt; \
+            break
+#if ARCH_X86_32
+    if (EXTERNAL_MMX(cpu_flags)) {
+        ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
+        ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
+        ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmxext, cpu_flags & AV_CPU_FLAG_MMXEXT);
+
+        switch (c->srcFormat) {
+        case AV_PIX_FMT_Y400A: /* luma through the yuyv Y reader, alpha through the uyvy Y reader */
+            c->lumToYV12 = ff_yuyvToY_mmx;
+            if (c->alpPixBuf)
+                c->alpToYV12 = ff_uyvyToY_mmx;
+            break;
+        case AV_PIX_FMT_YUYV422:
+            c->lumToYV12 = ff_yuyvToY_mmx;
+            c->chrToYV12 = ff_yuyvToUV_mmx;
+            break;
+        case AV_PIX_FMT_UYVY422:
+            c->lumToYV12 = ff_uyvyToY_mmx;
+            c->chrToYV12 = ff_uyvyToUV_mmx;
+            break;
+        case AV_PIX_FMT_NV12:
+            c->chrToYV12 = ff_nv12ToUV_mmx;
+            break;
+        case AV_PIX_FMT_NV21:
+            c->chrToYV12 = ff_nv21ToUV_mmx;
+            break;
+        case_rgb(rgb24, RGB24, mmx);
+        case_rgb(bgr24, BGR24, mmx);
+        case_rgb(bgra,  BGRA,  mmx);
+        case_rgb(rgba,  RGBA,  mmx);
+        case_rgb(abgr,  ABGR,  mmx);
+        case_rgb(argb,  ARGB,  mmx);
+        default:
+            break;
+        }
+    }
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmxext, , 1);
+    }
+#endif /* ARCH_X86_32 */
+#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) /* other sizes: X4 when bit 2 of filtersize is set, X8 otherwise */ \
+    switch (filtersize) { \
+    case 4:  ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
+    case 8:  ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
+    default: if (filtersize & 4) ASSIGN_SCALE_FUNC2(hscalefn, X4, opt1, opt2); \
+             else                ASSIGN_SCALE_FUNC2(hscalefn, X8, opt1, opt2); \
+             break; \
+    }
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2);
+        ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
+        ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, ,
+                            HAVE_ALIGNED_STACK || ARCH_X86_64);
+        ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1);
+
+        switch (c->srcFormat) {
+        case AV_PIX_FMT_Y400A: /* luma through the yuyv Y reader, alpha through the uyvy Y reader */
+            c->lumToYV12 = ff_yuyvToY_sse2;
+            if (c->alpPixBuf)
+                c->alpToYV12 = ff_uyvyToY_sse2;
+            break;
+        case AV_PIX_FMT_YUYV422:
+            c->lumToYV12 = ff_yuyvToY_sse2;
+            c->chrToYV12 = ff_yuyvToUV_sse2;
+            break;
+        case AV_PIX_FMT_UYVY422:
+            c->lumToYV12 = ff_uyvyToY_sse2;
+            c->chrToYV12 = ff_uyvyToUV_sse2;
+            break;
+        case AV_PIX_FMT_NV12:
+            c->chrToYV12 = ff_nv12ToUV_sse2;
+            break;
+        case AV_PIX_FMT_NV21:
+            c->chrToYV12 = ff_nv21ToUV_sse2;
+            break;
+        case_rgb(rgb24, RGB24, sse2);
+        case_rgb(bgr24, BGR24, sse2);
+        case_rgb(bgra,  BGRA,  sse2);
+        case_rgb(rgba,  RGBA,  sse2);
+        case_rgb(abgr,  ABGR,  sse2);
+        case_rgb(argb,  ARGB,  sse2);
+        default:
+            break;
+        }
+    }
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
+        ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, ssse3, ssse3);
+        switch (c->srcFormat) { /* only the 24-bit RGB readers get an ssse3 replacement here */
+        case_rgb(rgb24, RGB24, ssse3);
+        case_rgb(bgr24, BGR24, ssse3);
+        default:
+            break;
+        }
+    }
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        /* Xto15 don't need special sse4 functions */
+        ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3);
+        ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
+        ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse4,
+                            if (!isBE(c->dstFormat)) c->yuv2planeX = ff_yuv2planeX_16_sse4,
+                            HAVE_ALIGNED_STACK || ARCH_X86_64);
+        if (c->dstBpc == 16 && !isBE(c->dstFormat))
+            c->yuv2plane1 = ff_yuv2plane1_16_sse4;
+    }
+
+    if (EXTERNAL_AVX(cpu_flags)) {
+        ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, ,
+                            HAVE_ALIGNED_STACK || ARCH_X86_64);
+        ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1);
+
+        switch (c->srcFormat) { /* for the packed-YUV formats only the chroma reader is replaced; the luma reader set earlier stays */
+        case AV_PIX_FMT_YUYV422:
+            c->chrToYV12 = ff_yuyvToUV_avx;
+            break;
+        case AV_PIX_FMT_UYVY422:
+            c->chrToYV12 = ff_uyvyToUV_avx;
+            break;
+        case AV_PIX_FMT_NV12:
+            c->chrToYV12 = ff_nv12ToUV_avx;
+            break;
+        case AV_PIX_FMT_NV21:
+            c->chrToYV12 = ff_nv21ToUV_avx;
+            break;
+        case_rgb(rgb24, RGB24, avx);
+        case_rgb(bgr24, BGR24, avx);
+        case_rgb(bgra,  BGRA,  avx);
+        case_rgb(rgba,  RGBA,  avx);
+        case_rgb(abgr,  ABGR,  avx);
+        case_rgb(argb,  ARGB,  avx);
+        default:
+            break;
+        }
+    }
+}
diff --git a/ffmpeg/libswscale/x86/swscale_template.c b/ffmpeg/libswscale/x86/swscale_template.c
new file mode 100644
index 0000000..f2567c1
--- /dev/null
+++ b/ffmpeg/libswscale/x86/swscale_template.c
@@ -0,0 +1,1717 @@
+/*
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#undef REAL_MOVNTQ /* drop any earlier definitions of the helpers (re)defined below */
+#undef MOVNTQ
+#undef MOVNTQ2
+#undef PREFETCH
+
+#if COMPILE_TEMPLATE_MMXEXT
+#define PREFETCH "prefetchnta" /* MMXEXT build: prefetch mnemonic */
+#else
+#define PREFETCH  " # nop" /* otherwise: '#'-prefixed text takes the place of the prefetch mnemonic */
+#endif
+
+#if COMPILE_TEMPLATE_MMXEXT
+#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" /* MMXEXT build: 64-bit stores are emitted as movntq */
+#define MOVNTQ2 "movntq " /* bare mnemonic; the user appends the operand string itself */
+#else
+#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" /* otherwise: ordinary movq store */
+#define MOVNTQ2 "movq " /* bare mnemonic; the user appends the operand string itself */
+#endif
+#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b) /* extra level so arguments that are macros expand before '#' stringifies them */
+
+#if !COMPILE_TEMPLATE_MMXEXT /* helper is only emitted when COMPILE_TEMPLATE_MMXEXT is 0 */
+static av_always_inline void
+dither_8to16(const uint8_t *srcDither, int rot)
+{ /* Load 8 dither bytes from srcDither and widen them to 16-bit words: mm3 = bytes 0-3, mm4 = bytes 4-7, mm0 = 0. The asm has no output operands; the result lives only in those MMX registers. */
+    if (rot) { /* non-zero rot: rotate the 8 bytes before widening */
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t" /* mm0 = 0, zero source for the unpacks */
+                         "movq       (%0), %%mm3\n\t" /* mm3 = 8 dither bytes */
+                         "movq      %%mm3, %%mm4\n\t"
+                         "psrlq       $24, %%mm3\n\t" /* with the psllq/por below: rotate the qword right by 24 bits (3 bytes) */
+                         "psllq       $40, %%mm4\n\t"
+                         "por       %%mm4, %%mm3\n\t"
+                         "movq      %%mm3, %%mm4\n\t"
+                         "punpcklbw %%mm0, %%mm3\n\t" /* mm3 = low 4 bytes zero-extended to words */
+                         "punpckhbw %%mm0, %%mm4\n\t" /* mm4 = high 4 bytes zero-extended to words */
+                         :: "r"(srcDither)
+                         );
+    } else { /* rot == 0: widen the bytes in their stored order */
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t" /* mm0 = 0, zero source for the unpacks */
+                         "movq       (%0), %%mm3\n\t" /* mm3 = 8 dither bytes */
+                         "movq      %%mm3, %%mm4\n\t"
+                         "punpcklbw %%mm0, %%mm3\n\t" /* mm3 = low 4 bytes zero-extended to words */
+                         "punpckhbw %%mm0, %%mm4\n\t" /* mm4 = high 4 bytes zero-extended to words */
+                         :: "r"(srcDither)
+                         );
+    }
+}
+#endif /* !COMPILE_TEMPLATE_MMXEXT */
+
+static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
+                           const int16_t **src, uint8_t *dest, int dstW,
+                           const uint8_t *dither, int offset)
+{ /* Vertical filter writing 8 output bytes per pass to dest. The asm walks the table at `filter` (16-byte entries: row pointer at +0, coefficient quad at +8, ended by a null pointer); `src` is not referenced in this body. */
+    dither_8to16(dither, offset); /* mm3/mm4 = dither as words (rotated when offset != 0) */
+    filterSize--;
+    __asm__ volatile(
+        "movd %0, %%mm1\n\t" /* mm1 = filterSize - 1 */
+        "punpcklwd %%mm1, %%mm1\n\t" /* with punpckldq: broadcast the low word to all four words */
+        "punpckldq %%mm1, %%mm1\n\t"
+        "psllw        $3, %%mm1\n\t" /* each word = 8 * (filterSize - 1) */
+        "paddw     %%mm1, %%mm3\n\t"
+        "paddw     %%mm1, %%mm4\n\t"
+        "psraw        $4, %%mm3\n\t" /* mm3/mm4 = (dither + 8 * (filterSize - 1)) >> 4: accumulator start values */
+        "psraw        $4, %%mm4\n\t"
+        ::"m"(filterSize)
+     );
+
+    __asm__ volatile(\
+        "movq    %%mm3, %%mm6\n\t" /* keep the start values in mm6/mm7 so every 8-byte pass can reload them */
+        "movq    %%mm4, %%mm7\n\t"
+        "movl %3, %%ecx\n\t" /* ecx = offset: initial output index */
+        "mov                                 %0, %%"REG_d"  \n\t" /* REG_d = filter table */\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t" /* REG_S = first row pointer */\
+        ".p2align                             4             \n\t" /* FIXME Unroll? */\
+        "1:                                                 \n\t" /* one pass per table entry */\
+        "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
+        "movq                (%%"REG_S", %%"REG_c", 2), %%mm2      \n\t" /* srcData */\
+        "movq               8(%%"REG_S", %%"REG_c", 2), %%mm5      \n\t" /* srcData */\
+        "add                                $16, %%"REG_d"  \n\t" /* next table entry */\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t" /* next row pointer */\
+        "test                         %%"REG_S", %%"REG_S"  \n\t" /* null pointer ends the table; the MMX ops below leave the flags alone for jnz */\
+        "pmulhw                           %%mm0, %%mm2      \n\t" /* high 16 bits of sample * coefficient */\
+        "pmulhw                           %%mm0, %%mm5      \n\t"\
+        "paddw                            %%mm2, %%mm3      \n\t" /* accumulate */\
+        "paddw                            %%mm5, %%mm4      \n\t"\
+        " jnz                                1b             \n\t"\
+        "psraw                               $3, %%mm3      \n\t"\
+        "psraw                               $3, %%mm4      \n\t"\
+        "packuswb                         %%mm4, %%mm3      \n\t" /* saturate the 8 words to 8 unsigned bytes */
+        MOVNTQ2 "                         %%mm3, (%1, %%"REG_c")\n\t" /* store at (dest - offset) + index */
+        "add                          $8, %%"REG_c"         \n\t" /* index += 8 */\
+        "cmp                          %2, %%"REG_c"         \n\t" /* compare with dstW + offset; the moves below keep the flags for jb */\
+        "movq    %%mm6, %%mm3\n\t" /* reload accumulator start values */
+        "movq    %%mm7, %%mm4\n\t"
+        "mov                                 %0, %%"REG_d"  \n\t" /* rewind to the first table entry */\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        "jb                                  1b             \n\t" /* unsigned: repeat while index < dstW + offset */\
+        :: "g" (filter),
+           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
+        : "%"REG_d, "%"REG_S, "%"REG_c
+    );
+}
+
+#define YSCALEYUV2PACKEDX_UV /* opens an asm statement (left unterminated here) and vertically filters chroma with pmulhw: result mm3 = U, mm4 = V */ \
+    __asm__ volatile(\
+        "xor                   %%"REG_a", %%"REG_a"     \n\t" /* REG_a = 0: loop index */\
+        ".p2align                      4                \n\t"\
+        "nop                                            \n\t"\
+        "1:                                             \n\t" /* outer loop head; the WRITE* macros jump back here */\
+        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t" /* REG_d = chroma filter table inside the block addressed by %0 */\
+        "mov                 (%%"REG_d"), %%"REG_S"     \n\t" /* REG_S = first row pointer */\
+        "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t" /* both accumulators start from the rounder value */\
+        "movq                      %%mm3, %%mm4         \n\t"\
+        ".p2align                      4                \n\t"\
+        "2:                                             \n\t" /* inner loop: one pass per table entry */\
+        "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
+        "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
+        "add                          %6, %%"REG_S"     \n\t" /* %6 is the uv_off operand: V is read that many bytes after U */ \
+        "movq     (%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
+        "add                         $16, %%"REG_d"     \n\t" /* next 16-byte table entry */\
+        "mov                 (%%"REG_d"), %%"REG_S"     \n\t" /* next row pointer */\
+        "pmulhw                    %%mm0, %%mm2         \n\t"\
+        "pmulhw                    %%mm0, %%mm5         \n\t"\
+        "paddw                     %%mm2, %%mm3         \n\t" /* mm3 += U term */\
+        "paddw                     %%mm5, %%mm4         \n\t" /* mm4 += V term */\
+        "test                  %%"REG_S", %%"REG_S"     \n\t" /* null row pointer ends the table */\
+        " jnz                         2b                \n\t"\
+
+#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) /* pmulhw vertical filter over the table at offset(%0): dst1/dst2 accumulate two consecutive groups of 4 samples; coeff, src1, src2 are scratch */ \
+    "lea                "offset"(%0), %%"REG_d"     \n\t" /* REG_d = filter table */\
+    "mov                 (%%"REG_d"), %%"REG_S"     \n\t" /* REG_S = first row pointer */\
+    "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t" /* both accumulators start from the rounder value */\
+    "movq                    "#dst1", "#dst2"       \n\t"\
+    ".p2align                      4                \n\t"\
+    "2:                                             \n\t" /* one pass per table entry */\
+    "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
+    "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
+    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
+    "add                         $16, %%"REG_d"            \n\t" /* next 16-byte table entry */\
+    "mov                 (%%"REG_d"), %%"REG_S"     \n\t" /* next row pointer */\
+    "pmulhw                 "#coeff", "#src1"       \n\t"\
+    "pmulhw                 "#coeff", "#src2"       \n\t"\
+    "paddw                   "#src1", "#dst1"       \n\t"\
+    "paddw                   "#src2", "#dst2"       \n\t"\
+    "test                  %%"REG_S", %%"REG_S"     \n\t" /* null row pointer ends the table */\
+    " jnz                         2b                \n\t"\
+
+#define YSCALEYUV2PACKEDX /* chroma pass then luma pass; leaves mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2 */ \
+    YSCALEYUV2PACKEDX_UV \
+    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
+
+#define YSCALEYUV2PACKEDX_END /* operand and clobber lists that terminate the asm statement opened by the *_UV macros */ \
+        :: "r" (&c->redDither), /* %0: base address for the *_OFFSET, *_TEMP and *_COEFF displacements */ \
+            "m" (dummy), "m" (dummy), "m" (dummy), /* %1-%3: not referenced by the asm text visible in this part of the file */ \
+            "r" (dest), "m" (dstW_reg), "m"(uv_off) /* %4 = dest, %5 = dstW_reg, %6 = uv_off */ \
+        : "%"REG_a, "%"REG_d, "%"REG_S            \
+    );
+
+#define YSCALEYUV2PACKEDX_ACCURATE_UV /* opens an asm statement (left unterminated here) and filters chroma two rows per pass with pmaddwd into 32-bit sums; results are stored to U_TEMP(%0) and V_TEMP(%0) */ \
+    __asm__ volatile(\
+        "xor %%"REG_a", %%"REG_a"                       \n\t" /* REG_a = 0: loop index */\
+        ".p2align                      4                \n\t"\
+        "nop                                            \n\t"\
+        "1:                                             \n\t" /* outer loop head; the WRITE* macros jump back here */\
+        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t" /* REG_d = chroma filter table inside the block addressed by %0 */\
+        "mov                 (%%"REG_d"), %%"REG_S"     \n\t" /* REG_S = first row pointer */\
+        "pxor                      %%mm4, %%mm4         \n\t" /* mm4/mm5: 32-bit U sums (low/high pair) */\
+        "pxor                      %%mm5, %%mm5         \n\t"\
+        "pxor                      %%mm6, %%mm6         \n\t" /* mm6/mm7: 32-bit V sums (low/high pair) */\
+        "pxor                      %%mm7, %%mm7         \n\t"\
+        ".p2align                      4                \n\t"\
+        "2:                                             \n\t" /* one pass per table entry, i.e. per pair of rows */\
+        "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
+        "add                          %6, %%"REG_S"      \n\t" /* %6 is the uv_off operand: V is read that many bytes after U */ \
+        "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
+        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t" /* REG_S = second row pointer of this entry */\
+        "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
+        "movq                      %%mm0, %%mm3         \n\t"\
+        "punpcklwd                 %%mm1, %%mm0         \n\t" /* interleave row 1 / row 2 words so pmaddwd can apply a coefficient pair */\
+        "punpckhwd                 %%mm1, %%mm3         \n\t"\
+        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
+        "pmaddwd                   %%mm1, %%mm0         \n\t"\
+        "pmaddwd                   %%mm1, %%mm3         \n\t"\
+        "paddd                     %%mm0, %%mm4         \n\t"\
+        "paddd                     %%mm3, %%mm5         \n\t"\
+        "add                          %6, %%"REG_S"      \n\t" /* step from U to V in the second row */ \
+        "movq     (%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
+        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t" /* REG_S = first row pointer of the next entry */\
+        "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t" /* advance to the next entry */\
+        "test                  %%"REG_S", %%"REG_S"     \n\t" /* null pointer ends the table; the MMX ops below leave the flags alone for jnz */\
+        "movq                      %%mm2, %%mm0         \n\t"\
+        "punpcklwd                 %%mm3, %%mm2         \n\t"\
+        "punpckhwd                 %%mm3, %%mm0         \n\t"\
+        "pmaddwd                   %%mm1, %%mm2         \n\t"\
+        "pmaddwd                   %%mm1, %%mm0         \n\t"\
+        "paddd                     %%mm2, %%mm6         \n\t"\
+        "paddd                     %%mm0, %%mm7         \n\t"\
+        " jnz                         2b                \n\t"\
+        "psrad                       $16, %%mm4         \n\t" /* scale the 32-bit sums down by 16 bits */\
+        "psrad                       $16, %%mm5         \n\t"\
+        "psrad                       $16, %%mm6         \n\t"\
+        "psrad                       $16, %%mm7         \n\t"\
+        "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t" /* rounder value */\
+        "packssdw                  %%mm5, %%mm4         \n\t" /* mm4 = 4 U words */\
+        "packssdw                  %%mm7, %%mm6         \n\t" /* mm6 = 4 V words */\
+        "paddw                     %%mm0, %%mm4         \n\t"\
+        "paddw                     %%mm0, %%mm6         \n\t"\
+        "movq                      %%mm4, "U_TEMP"(%0)  \n\t" /* spill U and V; the luma pass needs the registers */\
+        "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
+
+#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) /* pmaddwd vertical filter over the table at offset(%0): leaves the two filtered groups of 4 words in mm1 and mm7, then reloads U_TEMP into mm3 and V_TEMP into mm4 */ \
+    "lea                "offset"(%0), %%"REG_d"     \n\t" /* REG_d = filter table */\
+    "mov                 (%%"REG_d"), %%"REG_S"     \n\t" /* REG_S = first row pointer */\
+    "pxor                      %%mm1, %%mm1         \n\t" /* mm1/mm5: 32-bit sums for the first 4 samples */\
+    "pxor                      %%mm5, %%mm5         \n\t"\
+    "pxor                      %%mm7, %%mm7         \n\t" /* mm7/mm6: 32-bit sums for the next 4 samples */\
+    "pxor                      %%mm6, %%mm6         \n\t"\
+    ".p2align                      4                \n\t"\
+    "2:                                             \n\t" /* one pass per table entry, i.e. per pair of rows */\
+    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
+    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
+    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t" /* REG_S = second row pointer of this entry */\
+    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
+    "movq                      %%mm0, %%mm3         \n\t"\
+    "punpcklwd                 %%mm4, %%mm0         \n\t" /* interleave row 1 / row 2 words so pmaddwd can apply a coefficient pair */\
+    "punpckhwd                 %%mm4, %%mm3         \n\t"\
+    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
+    "pmaddwd                   %%mm4, %%mm0         \n\t"\
+    "pmaddwd                   %%mm4, %%mm3         \n\t"\
+    "paddd                     %%mm0, %%mm1         \n\t"\
+    "paddd                     %%mm3, %%mm5         \n\t"\
+    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
+    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t" /* REG_S = first row pointer of the next entry */\
+    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t" /* advance to the next entry */\
+    "test                  %%"REG_S", %%"REG_S"     \n\t" /* null pointer ends the table; the MMX ops below leave the flags alone for jnz */\
+    "movq                      %%mm2, %%mm0         \n\t"\
+    "punpcklwd                 %%mm3, %%mm2         \n\t"\
+    "punpckhwd                 %%mm3, %%mm0         \n\t"\
+    "pmaddwd                   %%mm4, %%mm2         \n\t"\
+    "pmaddwd                   %%mm4, %%mm0         \n\t"\
+    "paddd                     %%mm2, %%mm7         \n\t"\
+    "paddd                     %%mm0, %%mm6         \n\t"\
+    " jnz                         2b                \n\t"\
+    "psrad                       $16, %%mm1         \n\t" /* scale the 32-bit sums down by 16 bits */\
+    "psrad                       $16, %%mm5         \n\t"\
+    "psrad                       $16, %%mm7         \n\t"\
+    "psrad                       $16, %%mm6         \n\t"\
+    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t" /* rounder value */\
+    "packssdw                  %%mm5, %%mm1         \n\t" /* mm1 = first 4 result words */\
+    "packssdw                  %%mm6, %%mm7         \n\t" /* mm7 = next 4 result words */\
+    "paddw                     %%mm0, %%mm1         \n\t"\
+    "paddw                     %%mm0, %%mm7         \n\t"\
+    "movq               "U_TEMP"(%0), %%mm3         \n\t" /* reload whatever the caller last stored in U_TEMP / V_TEMP */\
+    "movq               "V_TEMP"(%0), %%mm4         \n\t"\
+
+#define YSCALEYUV2PACKEDX_ACCURATE /* pmaddwd chroma pass then luma pass; leaves mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2 */ \
+    YSCALEYUV2PACKEDX_ACCURATE_UV \
+    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
+
+#define YSCALEYUV2RGBX /* YUV to RGB for 8 pixels: in mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2 (words); out mm2 = B, mm4 = G, mm5 = R (8 unsigned bytes each); offsets and coefficients are read relative to %0 */ \
+    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
+    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
+    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
+    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
+    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
+    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
+    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
+    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
+    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
+    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
+    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
+    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
+    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
+    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
+    "paddw           %%mm3, %%mm4       \n\t" /* mm4 = ug + vg */\
+    "movq            %%mm2, %%mm0       \n\t"\
+    "movq            %%mm5, %%mm6       \n\t"\
+    "movq            %%mm4, %%mm3       \n\t"\
+    "punpcklwd       %%mm2, %%mm2       \n\t" /* duplicate each of the low two chroma terms: one per pixel of the Y1 group */\
+    "punpcklwd       %%mm5, %%mm5       \n\t"\
+    "punpcklwd       %%mm4, %%mm4       \n\t"\
+    "paddw           %%mm1, %%mm2       \n\t"\
+    "paddw           %%mm1, %%mm5       \n\t"\
+    "paddw           %%mm1, %%mm4       \n\t"\
+    "punpckhwd       %%mm0, %%mm0       \n\t" /* same with the high two chroma terms for the Y2 group */\
+    "punpckhwd       %%mm6, %%mm6       \n\t"\
+    "punpckhwd       %%mm3, %%mm3       \n\t"\
+    "paddw           %%mm7, %%mm0       \n\t"\
+    "paddw           %%mm7, %%mm6       \n\t"\
+    "paddw           %%mm7, %%mm3       \n\t"\
+    /* mm2=B1, mm0=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
+    "packuswb        %%mm0, %%mm2       \n\t" /* mm2 = 8 B bytes */\
+    "packuswb        %%mm6, %%mm5       \n\t" /* mm5 = 8 R bytes */\
+    "packuswb        %%mm3, %%mm4       \n\t" /* mm4 = 8 G bytes */\
+
+#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) /* interleave 8 bytes each of b, g, r, a into eight 32-bit pixels, store them at dst + 4*index, then index += 8 and loop to label 1 while index < dstw (unsigned); b and r are overwritten, q0/q2/q3/t are scratch */ \
+    "movq       "#b", "#q2"     \n\t" /* B */\
+    "movq       "#r", "#t"      \n\t" /* R */\
+    "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
+    "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
+    "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
+    "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
+    "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
+    "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
+    "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
+    "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
+    "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
+    "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
+\
+    MOVNTQ(   q0,   (dst, index, 4))\
+    MOVNTQ(    b,  8(dst, index, 4))\
+    MOVNTQ(   q2, 16(dst, index, 4))\
+    MOVNTQ(   q3, 24(dst, index, 4))\
+\
+    "add      $8, "#index"      \n\t" /* 8 pixels done */\
+    "cmp "#dstw", "#index"      \n\t"\
+    " jb      1b                \n\t"
+#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) /* extra level so arguments that are macros expand before stringification */
+
+static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
+                                   const int16_t **lumSrc, int lumFilterSize,
+                                   const int16_t *chrFilter, const int16_t **chrUSrc,
+                                   const int16_t **chrVSrc,
+                                   int chrFilterSize, const int16_t **alpSrc,
+                                   uint8_t *dest, int dstW, int dstY)
+{ /* Filter with the pmaddwd-based *_ACCURATE macros, convert to RGB and write 32-bit pixels to dest, 8 per pass, until the index reaches dstW. Only c, dest and dstW are referenced in this body; the asm reads its tables relative to &c->redDither. */
+    x86_reg dummy=0;
+    x86_reg dstW_reg = dstW;
+    x86_reg uv_off = c->uv_offx2;
+
+    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { /* alpha table is filtered as well */
+        YSCALEYUV2PACKEDX_ACCURATE
+        YSCALEYUV2RGBX
+        "movq                      %%mm2, "U_TEMP"(%0)  \n\t" /* spill B, G, R: the alpha pass below uses every MMX register */
+        "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
+        "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
+        YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
+        "movq               "Y_TEMP"(%0), %%mm5         \n\t" /* R back in mm5; the macro above already reloaded B into mm3 and G into mm4 */
+        "psraw                        $3, %%mm1         \n\t"
+        "psraw                        $3, %%mm7         \n\t"
+        "packuswb                  %%mm7, %%mm1         \n\t" /* mm1 = 8 alpha bytes */
+        WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
+        YSCALEYUV2PACKEDX_END
+    } else { /* no alpha input: constant alpha */
+        YSCALEYUV2PACKEDX_ACCURATE
+        YSCALEYUV2RGBX
+        "pcmpeqd %%mm7, %%mm7 \n\t" /* mm7 = all ones: every alpha byte is 0xFF */
+        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+        YSCALEYUV2PACKEDX_END
+    }
+}
+
+static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
+                                const int16_t **lumSrc, int lumFilterSize,
+                                const int16_t *chrFilter, const int16_t **chrUSrc,
+                                const int16_t **chrVSrc,
+                                int chrFilterSize, const int16_t **alpSrc,
+                                uint8_t *dest, int dstW, int dstY)
+{ /* Same output as the _ar variant's path but using the pmulhw-based filter macros: convert to RGB and write 32-bit pixels to dest, 8 per pass, until the index reaches dstW. Only c, dest and dstW are referenced in this body. */
+    x86_reg dummy=0;
+    x86_reg dstW_reg = dstW;
+    x86_reg uv_off = c->uv_offx2;
+
+    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { /* alpha table is filtered as well */
+        YSCALEYUV2PACKEDX
+        YSCALEYUV2RGBX
+        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7) /* alpha sums go to mm1/mm7; B (mm2), G (mm4), R (mm5) are left untouched */
+        "psraw                        $3, %%mm1         \n\t"
+        "psraw                        $3, %%mm7         \n\t"
+        "packuswb                  %%mm7, %%mm1         \n\t" /* mm1 = 8 alpha bytes */
+        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
+        YSCALEYUV2PACKEDX_END
+    } else { /* no alpha input: constant alpha */
+        YSCALEYUV2PACKEDX
+        YSCALEYUV2RGBX
+        "pcmpeqd %%mm7, %%mm7 \n\t" /* mm7 = all ones: every alpha byte is 0xFF */
+        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+        YSCALEYUV2PACKEDX_END
+    }
+}
+
+#define REAL_WRITERGB16(dst, dstw, index) /* pack B (mm2), G (mm4), R (mm5) bytes into eight 16-bit pixels, store 16 bytes at dst + 2*index, then index += 8 and loop to label 1 while index < dstw (unsigned); expects mm7 = 0. Bit layout is 5-6-5 assuming bF8/bFC are per-byte 0xF8/0xFC masks - TODO confirm at their definition */ \
+    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
+    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
+    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
+    "psrlq           $3, %%mm2  \n\t" /* B moves down 3 bits */\
+\
+    "movq         %%mm2, %%mm1  \n\t"\
+    "movq         %%mm4, %%mm3  \n\t"\
+\
+    "punpcklbw    %%mm7, %%mm3  \n\t" /* G widened to words (mm7 is zero) */\
+    "punpcklbw    %%mm5, %%mm2  \n\t" /* words with R in the high byte, B in the low byte */\
+    "punpckhbw    %%mm7, %%mm4  \n\t"\
+    "punpckhbw    %%mm5, %%mm1  \n\t"\
+\
+    "psllq           $3, %%mm3  \n\t" /* G moves up 3 bits, between B and R */\
+    "psllq           $3, %%mm4  \n\t"\
+\
+    "por          %%mm3, %%mm2  \n\t" /* pixels 0-3 */\
+    "por          %%mm4, %%mm1  \n\t" /* pixels 4-7 */\
+\
+    MOVNTQ(%%mm2,  (dst, index, 2))\
+    MOVNTQ(%%mm1, 8(dst, index, 2))\
+\
+    "add             $8, "#index"   \n\t" /* 8 pixels done */\
+    "cmp        "#dstw", "#index"   \n\t"\
+    " jb             1b             \n\t"
+#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index) /* extra level so arguments that are macros expand before stringification */
+
+static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
+                                    const int16_t **lumSrc, int lumFilterSize,
+                                    const int16_t *chrFilter, const int16_t **chrUSrc,
+                                    const int16_t **chrVSrc,
+                                    int chrFilterSize, const int16_t **alpSrc,
+                                    uint8_t *dest, int dstW, int dstY)
+{ /* Filter with the pmaddwd-based *_ACCURATE macros, convert to RGB and write 16-bit pixels to dest via WRITERGB16, 8 per pass, until the index reaches dstW. Only c, dest and dstW are referenced in this body. */
+    x86_reg dummy=0;
+    x86_reg dstW_reg = dstW;
+    x86_reg uv_off = c->uv_offx2;
+
+    YSCALEYUV2PACKEDX_ACCURATE
+    YSCALEYUV2RGBX
+    "pxor %%mm7, %%mm7 \n\t" /* zero register required by WRITERGB16 */
+    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" /* saturating add of the per-channel dither bytes */
+    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
+    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
+#endif /* DITHER1XBPP */
+    WRITERGB16(%4, %5, %%REGa)
+    YSCALEYUV2PACKEDX_END
+}
+
+static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
+                                 const int16_t **lumSrc, int lumFilterSize,
+                                 const int16_t *chrFilter, const int16_t **chrUSrc,
+                                 const int16_t **chrVSrc,
+                                 int chrFilterSize, const int16_t **alpSrc,
+                                 uint8_t *dest, int dstW, int dstY)
+{ /* Filter with the pmulhw-based macros, convert to RGB and write 16-bit pixels to dest via WRITERGB16, 8 per pass, until the index reaches dstW. Only c, dest and dstW are referenced in this body. */
+    x86_reg dummy=0;
+    x86_reg dstW_reg = dstW;
+    x86_reg uv_off = c->uv_offx2;
+
+    YSCALEYUV2PACKEDX
+    YSCALEYUV2RGBX
+    "pxor %%mm7, %%mm7 \n\t" /* zero register required by WRITERGB16 */
+    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+    "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t" /* saturating add of the per-channel dither bytes */
+    "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
+    "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
+#endif /* DITHER1XBPP */
+    WRITERGB16(%4, %5, %%REGa)
+    YSCALEYUV2PACKEDX_END
+}
+
+#define REAL_WRITERGB15(dst, dstw, index) /* pack B (mm2), G (mm4), R (mm5) bytes into eight 16-bit pixels, store 16 bytes at dst + 2*index, then index += 8 and loop to label 1 while index < dstw (unsigned); expects mm7 = 0. Bit layout is 5-5-5 assuming bF8 is a per-byte 0xF8 mask - TODO confirm at its definition */ \
+    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
+    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
+    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
+    "psrlq           $3, %%mm2  \n\t" /* B moves down 3 bits */\
+    "psrlq           $1, %%mm5  \n\t" /* R moves down 1 bit */\
+\
+    "movq         %%mm2, %%mm1  \n\t"\
+    "movq         %%mm4, %%mm3  \n\t"\
+\
+    "punpcklbw    %%mm7, %%mm3  \n\t" /* G widened to words (mm7 is zero) */\
+    "punpcklbw    %%mm5, %%mm2  \n\t" /* words with R in the high byte, B in the low byte */\
+    "punpckhbw    %%mm7, %%mm4  \n\t"\
+    "punpckhbw    %%mm5, %%mm1  \n\t"\
+\
+    "psllq           $2, %%mm3  \n\t" /* G moves up 2 bits, between B and R */\
+    "psllq           $2, %%mm4  \n\t"\
+\
+    "por          %%mm3, %%mm2  \n\t" /* pixels 0-3 */\
+    "por          %%mm4, %%mm1  \n\t" /* pixels 4-7 */\
+\
+    MOVNTQ(%%mm2,  (dst, index, 2))\
+    MOVNTQ(%%mm1, 8(dst, index, 2))\
+\
+    "add             $8, "#index"   \n\t" /* 8 pixels done */\
+    "cmp        "#dstw", "#index"   \n\t"\
+    " jb             1b             \n\t"
+#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index) /* extra level so arguments that are macros expand before stringification */
+
+static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
+                                    const int16_t **lumSrc, int lumFilterSize,
+                                    const int16_t *chrFilter, const int16_t **chrUSrc,
+                                    const int16_t **chrVSrc,
+                                    int chrFilterSize, const int16_t **alpSrc,
+                                    uint8_t *dest, int dstW, int dstY)
+{ /* Filter with the pmaddwd-based *_ACCURATE macros, convert to RGB and write 16-bit pixels to dest via WRITERGB15, 8 per pass, until the index reaches dstW. Only c, dest and dstW are referenced in this body. */
+    x86_reg dummy=0;
+    x86_reg dstW_reg = dstW;
+    x86_reg uv_off = c->uv_offx2;
+
+    YSCALEYUV2PACKEDX_ACCURATE
+    YSCALEYUV2RGBX
+    "pxor %%mm7, %%mm7 \n\t" /* zero register required by WRITERGB15 */
+    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" /* saturating add of the per-channel dither bytes */
+    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
+    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
+#endif /* DITHER1XBPP */
+    WRITERGB15(%4, %5, %%REGa)
+    YSCALEYUV2PACKEDX_END
+}
+
+static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
+                                 const int16_t **lumSrc, int lumFilterSize,
+                                 const int16_t *chrFilter, const int16_t **chrUSrc,
+                                 const int16_t **chrVSrc,
+                                 int chrFilterSize, const int16_t **alpSrc,
+                                 uint8_t *dest, int dstW, int dstY)
+{ /* Filter with the pmulhw-based macros, convert to RGB and write 16-bit pixels to dest via WRITERGB15, 8 per pass, until the index reaches dstW. Only c, dest and dstW are referenced in this body. */
+    x86_reg dummy=0;
+    x86_reg dstW_reg = dstW;
+    x86_reg uv_off = c->uv_offx2;
+
+    YSCALEYUV2PACKEDX
+    YSCALEYUV2RGBX
+    "pxor %%mm7, %%mm7 \n\t" /* zero register required by WRITERGB15 */
+    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+    "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t" /* saturating add of the per-channel dither bytes */
+    "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
+    "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
+#endif /* DITHER1XBPP */
+    WRITERGB15(%4, %5, %%REGa)
+    YSCALEYUV2PACKEDX_END
+}
+
+#define WRITEBGR24MMX(dst, dstw, index) /* shifts and unpacks only: turn 8 B/G/R byte triples into 24 packed bytes stored at dst, then dst += 24, index += 8 and loop to label 1 while index < dstw (unsigned); dst itself is modified */ \
+    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
+    "movq      %%mm2, %%mm1     \n\t" /* B */\
+    "movq      %%mm5, %%mm6     \n\t" /* R */\
+    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
+    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
+    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
+    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
+    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
+    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
+    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
+    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
+    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
+    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
+\
+    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
+    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
+    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
+    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
+\
+    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
+    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
+    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
+    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
+\
+    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
+    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
+    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
+    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
+\
+    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
+    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
+    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
+    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
+    MOVNTQ(%%mm0, (dst))\
+\
+    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
+    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
+    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
+    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
+    MOVNTQ(%%mm6, 8(dst))\
+\
+    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
+    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
+    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
+    MOVNTQ(%%mm5, 16(dst))\
+\
+    "add         $24, "#dst"    \n\t" /* 8 pixels * 3 bytes written */\
+\
+    "add          $8, "#index"  \n\t" /* 8 pixels done */\
+    "cmp     "#dstw", "#index"  \n\t"\
+    " jb          1b            \n\t"
+
+#define WRITEBGR24MMXEXT(dst, dstw, index) /* pshufw/mask variant: turn 8 B/G/R byte triples into 24 packed bytes stored at dst, then dst += 24, index += 8 and loop to label 1 while index < dstw (unsigned); dst itself is modified; mm7 is overwritten with the ff_M24C mask */ \
+    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
+    "movq "MANGLE(ff_M24A)", %%mm0 \n\t" /* byte-select masks kept in mm0 and mm7 for the whole macro */\
+    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
+    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
+    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
+    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
+\
+    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
+    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
+    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
+\
+    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
+    "por    %%mm1, %%mm6        \n\t"\
+    "por    %%mm3, %%mm6        \n\t"\
+    MOVNTQ(%%mm6, (dst))\
+\
+    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
+    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
+    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
+    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
+\
+    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
+    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
+    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
+\
+    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
+    "por    %%mm3, %%mm6        \n\t"\
+    MOVNTQ(%%mm6, 8(dst))\
+\
+    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
+    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
+    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
+\
+    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
+    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
+    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
+\
+    "por    %%mm1, %%mm3        \n\t"\
+    "por    %%mm3, %%mm6        \n\t"\
+    MOVNTQ(%%mm6, 16(dst))\
+\
+    "add      $24, "#dst"       \n\t" /* 8 pixels * 3 bytes written */\
+\
+    "add       $8, "#index"     \n\t" /* 8 pixels done */\
+    "cmp  "#dstw", "#index"     \n\t"\
+    " jb       1b               \n\t"
+
+#if COMPILE_TEMPLATE_MMXEXT /* choose the 24-bit writer that matches the instruction set of this template instance */
+#undef WRITEBGR24
+#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMXEXT(dst, dstw, index) /* pshufw-based writer */
+#else
+#undef WRITEBGR24
+#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index) /* shift/unpack-based writer */
+#endif /* COMPILE_TEMPLATE_MMXEXT */
+
+static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
+                                   const int16_t **lumSrc, int lumFilterSize,
+                                   const int16_t *chrFilter, const int16_t **chrUSrc,
+                                   const int16_t **chrVSrc,
+                                   int chrFilterSize, const int16_t **alpSrc,
+                                   uint8_t *dest, int dstW, int dstY)
+{ /* Filter with the pmaddwd-based *_ACCURATE macros, convert to RGB and write packed 3-byte pixels to dest via WRITEBGR24, 8 per pass, until the index reaches dstW. Only c, dest and dstW are referenced in this body. */
+    x86_reg dummy=0;
+    x86_reg dstW_reg = dstW;
+    x86_reg uv_off = c->uv_offx2;
+
+    YSCALEYUV2PACKEDX_ACCURATE
+    YSCALEYUV2RGBX
+    "pxor %%mm7, %%mm7 \n\t" /* zero register required by the plain-MMX writer */
+    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
+    "add %4, %%"REG_c"                        \n\t" /* REG_c = dest + 3 * index: output address for this pass */
+    WRITEBGR24(%%REGc, %5, %%REGa)
+    :: "r" (&c->redDither), /* same operands as YSCALEYUV2PACKEDX_END; spelled out because REG_c is an extra clobber */
+       "m" (dummy), "m" (dummy), "m" (dummy),
+       "r" (dest), "m" (dstW_reg), "m"(uv_off)
+    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
+    );
+}
+
+static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
+                                const int16_t **lumSrc, int lumFilterSize,
+                                const int16_t *chrFilter, const int16_t **chrUSrc,
+                                const int16_t **chrVSrc,
+                                int chrFilterSize, const int16_t **alpSrc,
+                                uint8_t *dest, int dstW, int dstY)
+{ /* Filter with the pmulhw-based macros, convert to RGB and write packed 3-byte pixels to dest via WRITEBGR24, 8 per pass, until the index reaches dstW. Only c, dest and dstW are referenced in this body. */
+    x86_reg dummy=0;
+    x86_reg dstW_reg = dstW;
+    x86_reg uv_off = c->uv_offx2;
+
+    YSCALEYUV2PACKEDX
+    YSCALEYUV2RGBX
+    "pxor                    %%mm7, %%mm7       \n\t" /* zero register required by the plain-MMX writer */
+    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
+    "add                        %4, %%"REG_c"   \n\t" /* REG_c = dest + 3 * index: output address for this pass */
+    WRITEBGR24(%%REGc, %5, %%REGa)
+    :: "r" (&c->redDither), /* same operands as YSCALEYUV2PACKEDX_END; spelled out because REG_c is an extra clobber */
+       "m" (dummy), "m" (dummy), "m" (dummy),
+       "r" (dest),  "m" (dstW_reg), "m"(uv_off)
+    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
+    );
+}
+
+#define REAL_WRITEYUY2(dst, dstw, index) /* Loop tail: pack 8 luma words (mm1, mm7) and 4+4 chroma words \
+     * (mm3, mm4) to bytes, interleave them as Y c3 Y c4 and store 16 bytes at dst + 2*index; \
+     * then index += 8 and branch back to local label 1 while index < dstw (unsigned). */ \
+    "packuswb  %%mm3, %%mm3     \n\t" /* mm3 words -> unsigned-saturated bytes */\
+    "packuswb  %%mm4, %%mm4     \n\t" /* mm4 words -> unsigned-saturated bytes */\
+    "packuswb  %%mm7, %%mm1     \n\t" /* mm1 = 8 luma bytes (mm1 words low, mm7 words high) */\
+    "punpcklbw %%mm4, %%mm3     \n\t" /* mm3 = chroma bytes interleaved: mm3[0] mm4[0] mm3[1] mm4[1] ... */\
+    "movq      %%mm1, %%mm7     \n\t" /* keep a copy of the luma bytes for the high half */\
+    "punpcklbw %%mm3, %%mm1     \n\t" /* mm1 = luma bytes 0..3 interleaved with chroma */\
+    "punpckhbw %%mm3, %%mm7     \n\t" /* mm7 = luma bytes 4..7 interleaved with chroma */\
+\
+    MOVNTQ(%%mm1, (dst, index, 2))\
+    MOVNTQ(%%mm7, 8(dst, index, 2))\
+\
+    "add          $8, "#index"  \n\t" /* 8 output pixels per iteration */\
+    "cmp     "#dstw", "#index"  \n\t"\
+    " jb          1b            \n\t" /* label 1 is opened by the scaling macro used before this one */
+#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index) /* extra expansion level for the arguments */
+
+static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
+                                     const int16_t **lumSrc, int lumFilterSize,
+                                     const int16_t *chrFilter, const int16_t **chrUSrc,
+                                     const int16_t **chrVSrc,
+                                     int chrFilterSize, const int16_t **alpSrc,
+                                     uint8_t *dest, int dstW, int dstY)
+{   /*
+     * Writes one row of packed 4:2:2 output through WRITEYUY2, using the
+     * result of YSCALEYUV2PACKEDX_ACCURATE. That macro and
+     * YSCALEYUV2PACKEDX_END are defined outside this part of the file.
+     * NOTE(review): they presumably open the asm statement and supply its
+     * operand list, which is where dummy, dstW_reg and uv_off would be
+     * consumed (%4 = destination, %5 = width as used below) - confirm.
+     */
+    x86_reg dummy=0;
+    x86_reg dstW_reg = dstW;
+    x86_reg uv_off = c->uv_offx2;
+
+    YSCALEYUV2PACKEDX_ACCURATE
+    /* WRITEYUY2 consumes mm1/mm7 as luma words and mm3/mm4 as chroma words; scale all four down by 3 bits first */
+    "psraw $3, %%mm3    \n\t"
+    "psraw $3, %%mm4    \n\t"
+    "psraw $3, %%mm1    \n\t"
+    "psraw $3, %%mm7    \n\t"
+    WRITEYUY2(%4, %5, %%REGa)
+    YSCALEYUV2PACKEDX_END
+}
+
+static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
+                                  const int16_t **lumSrc, int lumFilterSize,
+                                  const int16_t *chrFilter, const int16_t **chrUSrc,
+                                  const int16_t **chrVSrc,
+                                  int chrFilterSize, const int16_t **alpSrc,
+                                  uint8_t *dest, int dstW, int dstY)
+{   /*
+     * Same structure as yuv2yuyv422_X_ar, but built on YSCALEYUV2PACKEDX
+     * instead of YSCALEYUV2PACKEDX_ACCURATE: one row of packed 4:2:2 output is
+     * written through WRITEYUY2. YSCALEYUV2PACKEDX and YSCALEYUV2PACKEDX_END
+     * are defined outside this part of the file.
+     * NOTE(review): they presumably open the asm statement and supply its
+     * operand list, which is where dummy, dstW_reg and uv_off would be
+     * consumed (%4 = destination, %5 = width as used below) - confirm.
+     */
+    x86_reg dummy=0;
+    x86_reg dstW_reg = dstW;
+    x86_reg uv_off = c->uv_offx2;
+
+    YSCALEYUV2PACKEDX
+    /* WRITEYUY2 consumes mm1/mm7 as luma words and mm3/mm4 as chroma words; scale all four down by 3 bits first */
+    "psraw $3, %%mm3    \n\t"
+    "psraw $3, %%mm4    \n\t"
+    "psraw $3, %%mm1    \n\t"
+    "psraw $3, %%mm7    \n\t"
+    WRITEYUY2(%4, %5, %%REGa)
+    YSCALEYUV2PACKEDX_END
+}
+
+#define REAL_YSCALEYUV2RGB_UV(index, c) /* Loop head + chroma part of the two-row blend: zeroes index, \
+     * opens local label 1, blends the rows at %2 and %3 for both chroma planes (the second plane is \
+     * reached by temporarily adding UV_OFF_BYTE(c) to index) and starts the green contribution. */ \
+    "xor            "#index", "#index"  \n\t"\
+    ".p2align              4            \n\t"\
+    "1:                                 \n\t"\
+    "movq     (%2, "#index"), %%mm2     \n\t" /* mm2 = uvbuf0[index] */\
+    "movq     (%3, "#index"), %%mm3     \n\t" /* mm3 = uvbuf1[index] */\
+    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* step to the second chroma plane */\
+    "movq     (%2, "#index"), %%mm5     \n\t" /* mm5 = uvbuf0[index + UV_OFF] */\
+    "movq     (%3, "#index"), %%mm4     \n\t" /* mm4 = uvbuf1[index + UV_OFF] */\
+    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* restore index */\
+    "psubw             %%mm3, %%mm2     \n\t" /* mm2 = uvbuf0[index] - uvbuf1[index] */\
+    "psubw             %%mm4, %%mm5     \n\t" /* mm5 = uvbuf0[index + UV_OFF] - uvbuf1[index + UV_OFF] */\
+    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t" /* mm0 = uvalpha1 words read from the context */\
+    "pmulhw            %%mm0, %%mm2     \n\t" /* mm2 = (uvbuf0[index] - uvbuf1[index]) * uvalpha1 >> 16 */\
+    "pmulhw            %%mm0, %%mm5     \n\t" /* mm5 = (uvbuf0[index + UV_OFF] - uvbuf1[index + UV_OFF]) * uvalpha1 >> 16 */\
+    "psraw                $4, %%mm3     \n\t" /* mm3 = uvbuf1[index] >> 4 */\
+    "psraw                $4, %%mm4     \n\t" /* mm4 = uvbuf1[index + UV_OFF] >> 4 */\
+    "paddw             %%mm2, %%mm3     \n\t" /* mm3 = (uvbuf1[index] >> 4) + mm2: blended first-plane sample */\
+    "paddw             %%mm5, %%mm4     \n\t" /* mm4 = (uvbuf1[index + UV_OFF] >> 4) + mm5: blended second-plane sample */\
+    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
+    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
+    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
+    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
+    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
+    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
+    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
+
+#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) /* Blend 8 words from rows b1 and b2 at byte offset 2*index: \
+     * result = (b2 >> 4) + ((b1 - b2) * yalpha1 >> 16), left in mm1 (words 0..3) and mm7 (words 4..7). \
+     * Used for luma and, with other base pointers, for alpha. Clobbers mm0 and mm6. */ \
+    "movq  ("#b1", "#index", 2), %%mm0     \n\t" /* mm0 = b1[index .. index+3] */\
+    "movq  ("#b2", "#index", 2), %%mm1     \n\t" /* mm1 = b2[index .. index+3] */\
+    "movq 8("#b1", "#index", 2), %%mm6     \n\t" /* mm6 = b1[index+4 .. index+7] */\
+    "movq 8("#b2", "#index", 2), %%mm7     \n\t" /* mm7 = b2[index+4 .. index+7] */\
+    "psubw             %%mm1, %%mm0     \n\t" /* mm0 = b1 - b2 (words 0..3) */\
+    "psubw             %%mm7, %%mm6     \n\t" /* mm6 = b1 - b2 (words 4..7) */\
+    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* mm0 = (b1 - b2) * yalpha1 >> 16 */\
+    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* mm6 = (b1 - b2) * yalpha1 >> 16 */\
+    "psraw                $4, %%mm1     \n\t" /* mm1 = b2 >> 4 (words 0..3) */\
+    "psraw                $4, %%mm7     \n\t" /* mm7 = b2 >> 4 (words 4..7) */\
+    "paddw             %%mm0, %%mm1     \n\t" /* mm1 = (b2 >> 4) + mm0: blended words 0..3 */\
+    "paddw             %%mm6, %%mm7     \n\t" /* mm7 = (b2 >> 4) + mm6: blended words 4..7 */\
+
+#define REAL_YSCALEYUV2RGB_COEFF(c) /* Final colour math: expects mm2/mm5 = offset-corrected chroma, \
+     * mm3/mm4 = green products, mm1/mm7 = luma words 0..3 / 4..7. Each chroma word is duplicated so it \
+     * serves two adjacent luma samples. Leaves 8 packed bytes per channel in mm2, mm4 and mm5. */ \
+    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
+    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
+    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
+    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
+    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
+    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
+    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
+    "paddw             %%mm3, %%mm4     \n\t" /* mm4 = ug + vg */\
+    "movq              %%mm2, %%mm0     \n\t" /* copies for the high halves */\
+    "movq              %%mm5, %%mm6     \n\t"\
+    "movq              %%mm4, %%mm3     \n\t"\
+    "punpcklwd         %%mm2, %%mm2     \n\t" /* duplicate chroma words 0,1 -> 0,0,1,1 */\
+    "punpcklwd         %%mm5, %%mm5     \n\t"\
+    "punpcklwd         %%mm4, %%mm4     \n\t"\
+    "paddw             %%mm1, %%mm2     \n\t" /* add luma words 0..3 */\
+    "paddw             %%mm1, %%mm5     \n\t"\
+    "paddw             %%mm1, %%mm4     \n\t"\
+    "punpckhwd         %%mm0, %%mm0     \n\t" /* duplicate chroma words 2,3 -> 2,2,3,3 */\
+    "punpckhwd         %%mm6, %%mm6     \n\t"\
+    "punpckhwd         %%mm3, %%mm3     \n\t"\
+    "paddw             %%mm7, %%mm0     \n\t" /* add luma words 4..7 */\
+    "paddw             %%mm7, %%mm6     \n\t"\
+    "paddw             %%mm7, %%mm3     \n\t"\
+    /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1 = samples 0..3, 2 = samples 4..7) */\
+    "packuswb          %%mm0, %%mm2     \n\t" /* mm2 = 8 B bytes */\
+    "packuswb          %%mm6, %%mm5     \n\t" /* mm5 = 8 R bytes */\
+    "packuswb          %%mm3, %%mm4     \n\t" /* mm4 = 8 G bytes */\
+
+#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) /* extra expansion level for the arguments */
+
+#define YSCALEYUV2RGB(index, c) /* full two-row blend + colour conversion; luma rows are asm operands %0 and %1 */ \
+    REAL_YSCALEYUV2RGB_UV(index, c) \
+    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
+    REAL_YSCALEYUV2RGB_COEFF(c)
+
+/**
+ * vertical bilinear scale YV12 to RGB
+ *
+ * Blends two input rows (buf[0]/buf[1] for luma, ubuf[0]/ubuf[1] for chroma)
+ * with YSCALEYUV2RGB and stores 32-bit pixels through WRITEBGR32 (defined
+ * outside this part of the file). When CONFIG_SWSCALE_ALPHA is set and
+ * c->alpPixBuf is non-NULL, abuf[0]/abuf[1] are blended with
+ * YSCALEYUV2RGB_YA to produce the alpha bytes; otherwise the alpha register
+ * is filled with all-ones bits.
+ *
+ * vbuf, dstW, yalpha, uvalpha and y are not referenced in this body: the
+ * second chroma plane is reached through UV_OFF_BYTE inside the macros, the
+ * blend weights are read from the context, and 8280(%5) is passed as the
+ * width operand (8280 == DSTW_OFFSET according to the note in yuv2bgr24_2).
+ */
+static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
+                                const int16_t *ubuf[2], const int16_t *vbuf[2],
+                                const int16_t *abuf[2], uint8_t *dest,
+                                int dstW, int yalpha, int uvalpha, int y)
+{
+    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
+                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
+
+    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+        const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
+#if ARCH_X86_64
+        /* 64-bit: r8 is the loop index; the alpha rows get their own register operands (%6, %7) */
+        __asm__ volatile(
+            YSCALEYUV2RGB(%%r8, %5)
+            YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
+            "psraw                  $3, %%mm1       \n\t" /* scale blended alpha words 0..3 down by 3 more bits */
+            "psraw                  $3, %%mm7       \n\t" /* scale blended alpha words 4..7 down by 3 more bits */
+            "packuswb            %%mm7, %%mm1       \n\t" /* mm1 = 8 alpha bytes */
+            WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
+            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
+               "a" (&c->redDither),
+               "r" (abuf0), "r" (abuf1)
+            : "%r8"
+        );
+#else
+        /* 32-bit: no spare registers, so the alpha row pointers travel through the context */
+        c->u_temp=(intptr_t)abuf0;
+        c->v_temp=(intptr_t)abuf1;
+        __asm__ volatile(
+            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* save REG_b in the context */
+            "mov        %4, %%"REG_b"               \n\t" /* REG_b = dest */
+            "push %%"REG_BP"                        \n\t" /* REG_BP becomes the loop index */
+            YSCALEYUV2RGB(%%REGBP, %5)
+            "push                   %0              \n\t" /* save the luma row pointers */
+            "push                   %1              \n\t"
+            "mov          "U_TEMP"(%5), %0          \n\t" /* %0 = value stored at U_TEMP (presumably c->u_temp, i.e. abuf0) */
+            "mov          "V_TEMP"(%5), %1          \n\t" /* %1 = value stored at V_TEMP (presumably c->v_temp, i.e. abuf1) */
+            YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
+            "psraw                  $3, %%mm1       \n\t" /* scale blended alpha words 0..3 down by 3 more bits */
+            "psraw                  $3, %%mm7       \n\t" /* scale blended alpha words 4..7 down by 3 more bits */
+            "packuswb            %%mm7, %%mm1       \n\t" /* mm1 = 8 alpha bytes */
+            "pop                    %1              \n\t" /* restore the luma row pointers */
+            "pop                    %0              \n\t"
+            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
+            "pop %%"REG_BP"                         \n\t"
+            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore REG_b */
+            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+               "a" (&c->redDither)
+        );
+#endif
+    } else {
+        /* no alpha plane: same sequence with the alpha register (mm7) set to all ones */
+        __asm__ volatile(
+            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* save REG_b in the context */
+            "mov        %4, %%"REG_b"               \n\t" /* REG_b = dest */
+            "push %%"REG_BP"                        \n\t" /* REG_BP becomes the loop index */
+            YSCALEYUV2RGB(%%REGBP, %5)
+            "pcmpeqd %%mm7, %%mm7                   \n\t" /* mm7 = all bits set */
+            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+            "pop %%"REG_BP"                         \n\t"
+            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore REG_b */
+            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+               "a" (&c->redDither)
+        );
+    }
+}
+
+static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
+                                const int16_t *ubuf[2], const int16_t *vbuf[2],
+                                const int16_t *abuf[2], uint8_t *dest,
+                                int dstW, int yalpha, int uvalpha, int y)
+{   /*
+     * Two-row blend (YSCALEYUV2RGB) followed by a packed 24-bit store through
+     * WRITEBGR24. vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced
+     * in this body; the asm takes the width from 8280(%5) in the context.
+     * No clobber list is declared: REG_b is saved in the context at
+     * ESP_OFFSET and REG_BP on the stack, and both are restored at the end.
+     */
+    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
+                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
+
+    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
+    __asm__ volatile(
+        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* save REG_b in the context */
+        "mov        %4, %%"REG_b"               \n\t" /* REG_b = dest */
+        "push %%"REG_BP"                        \n\t" /* REG_BP becomes the loop index */
+        YSCALEYUV2RGB(%%REGBP, %5)
+        "pxor    %%mm7, %%mm7                   \n\t" /* mm7 = 0 */
+        WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
+        "pop %%"REG_BP"                         \n\t"
+        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore REG_b */
+        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+           "a" (&c->redDither)
+    );
+}
+
+static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
+                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
+                                 const int16_t *abuf[2], uint8_t *dest,
+                                 int dstW, int yalpha, int uvalpha, int y)
+{   /*
+     * Two-row blend (YSCALEYUV2RGB) followed by a store through WRITERGB15
+     * (defined outside this part of the file). When DITHER1XBPP is defined,
+     * the per-channel dither values held in the context are added with
+     * unsigned saturation before the store. vbuf, abuf, dstW, yalpha, uvalpha
+     * and y are not referenced in this body.
+     */
+    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
+                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
+
+    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
+    __asm__ volatile(
+        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* save REG_b in the context */
+        "mov        %4, %%"REG_b"               \n\t" /* REG_b = dest */
+        "push %%"REG_BP"                        \n\t" /* REG_BP becomes the loop index */
+        YSCALEYUV2RGB(%%REGBP, %5)
+        "pxor    %%mm7, %%mm7                   \n\t" /* mm7 = 0 */
+        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+        "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
+        "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
+        "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
+#endif
+        WRITERGB15(%%REGb, 8280(%5), %%REGBP)
+        "pop %%"REG_BP"                         \n\t"
+        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore REG_b */
+        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+           "a" (&c->redDither)
+    );
+}
+
+static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
+                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
+                                 const int16_t *abuf[2], uint8_t *dest,
+                                 int dstW, int yalpha, int uvalpha, int y)
+{   /*
+     * Two-row blend (YSCALEYUV2RGB) followed by a store through WRITERGB16
+     * (defined outside this part of the file). When DITHER1XBPP is defined,
+     * the per-channel dither values held in the context are added with
+     * unsigned saturation before the store. vbuf, abuf, dstW, yalpha, uvalpha
+     * and y are not referenced in this body.
+     */
+    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
+                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
+
+    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
+    __asm__ volatile(
+        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* save REG_b in the context */
+        "mov        %4, %%"REG_b"               \n\t" /* REG_b = dest */
+        "push %%"REG_BP"                        \n\t" /* REG_BP becomes the loop index */
+        YSCALEYUV2RGB(%%REGBP, %5)
+        "pxor    %%mm7, %%mm7                   \n\t" /* mm7 = 0 */
+        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+        "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
+        "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
+        "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
+#endif
+        WRITERGB16(%%REGb, 8280(%5), %%REGBP)
+        "pop %%"REG_BP"                         \n\t"
+        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore REG_b */
+        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+           "a" (&c->redDither)
+    );
+}
+
+#define REAL_YSCALEYUV2PACKED(index, c) /* Two-row blend without colour conversion: leaves blended chroma \
+     * words in mm3 (first plane) and mm4 (second plane) and blended luma words in mm1 / mm7. \
+     * NOTE(review): the prologue shifts the two blend weights right by 3 and writes them back into \
+     * the context, so the stored values change on every execution - confirm callers rely on that. */ \
+    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
+    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
+    "psraw                $3, %%mm0                           \n\t"\
+    "psraw                $3, %%mm1                           \n\t"\
+    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
+    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
+    "xor            "#index", "#index"                        \n\t"\
+    ".p2align              4            \n\t"\
+    "1:                                 \n\t"\
+    "movq     (%2, "#index"), %%mm2     \n\t" /* mm2 = uvbuf0[index] */\
+    "movq     (%3, "#index"), %%mm3     \n\t" /* mm3 = uvbuf1[index] */\
+    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* step to the second chroma plane */\
+    "movq     (%2, "#index"), %%mm5     \n\t" /* mm5 = uvbuf0[index + UV_OFF] */\
+    "movq     (%3, "#index"), %%mm4     \n\t" /* mm4 = uvbuf1[index + UV_OFF] */\
+    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* restore index */\
+    "psubw             %%mm3, %%mm2     \n\t" /* mm2 = uvbuf0[index] - uvbuf1[index] */\
+    "psubw             %%mm4, %%mm5     \n\t" /* mm5 = uvbuf0[index + UV_OFF] - uvbuf1[index + UV_OFF] */\
+    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t" /* mm0 = (already shifted) chroma weight */\
+    "pmulhw            %%mm0, %%mm2     \n\t" /* mm2 = (uvbuf0[index] - uvbuf1[index]) * weight >> 16 */\
+    "pmulhw            %%mm0, %%mm5     \n\t" /* mm5 = (uvbuf0[index + UV_OFF] - uvbuf1[index + UV_OFF]) * weight >> 16 */\
+    "psraw                $7, %%mm3     \n\t" /* mm3 = uvbuf1[index] >> 7 */\
+    "psraw                $7, %%mm4     \n\t" /* mm4 = uvbuf1[index + UV_OFF] >> 7 */\
+    "paddw             %%mm2, %%mm3     \n\t" /* mm3 = blended first-plane chroma */\
+    "paddw             %%mm5, %%mm4     \n\t" /* mm4 = blended second-plane chroma */\
+    "movq  (%0, "#index", 2), %%mm0     \n\t" /* mm0 = buf0[index .. index+3] */\
+    "movq  (%1, "#index", 2), %%mm1     \n\t" /* mm1 = buf1[index .. index+3] */\
+    "movq 8(%0, "#index", 2), %%mm6     \n\t" /* mm6 = buf0[index+4 .. index+7] */\
+    "movq 8(%1, "#index", 2), %%mm7     \n\t" /* mm7 = buf1[index+4 .. index+7] */\
+    "psubw             %%mm1, %%mm0     \n\t" /* mm0 = buf0 - buf1 (words 0..3) */\
+    "psubw             %%mm7, %%mm6     \n\t" /* mm6 = buf0 - buf1 (words 4..7) */\
+    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* mm0 = (buf0 - buf1) * weight >> 16 */\
+    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* mm6 = (buf0 - buf1) * weight >> 16 */\
+    "psraw                $7, %%mm1     \n\t" /* mm1 = buf1 >> 7 (words 0..3) */\
+    "psraw                $7, %%mm7     \n\t" /* mm7 = buf1 >> 7 (words 4..7) */\
+    "paddw             %%mm0, %%mm1     \n\t" /* mm1 = blended luma words 0..3 */\
+    "paddw             %%mm6, %%mm7     \n\t" /* mm7 = blended luma words 4..7 */\
+
+#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c) /* extra expansion level for the arguments */
+
+static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
+                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
+                                  const int16_t *abuf[2], uint8_t *dest,
+                                  int dstW, int yalpha, int uvalpha, int y)
+{   /*
+     * Two-row blend without colour conversion (YSCALEYUV2PACKED) followed by
+     * a packed 4:2:2 store through WRITEYUY2. vbuf, abuf, dstW, yalpha,
+     * uvalpha and y are not referenced in this body; the asm takes the width
+     * from 8280(%5) in the context. REG_b and REG_BP are saved and restored
+     * by hand instead of being declared as clobbers.
+     */
+    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
+                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
+
+    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
+    __asm__ volatile(
+        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* save REG_b in the context */
+        "mov %4, %%"REG_b"                        \n\t" /* REG_b = dest */
+        "push %%"REG_BP"                        \n\t" /* REG_BP becomes the loop index */
+        YSCALEYUV2PACKED(%%REGBP, %5)
+        WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
+        "pop %%"REG_BP"                         \n\t"
+        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore REG_b */
+        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+           "a" (&c->redDither)
+    );
+}
+
+#define REAL_YSCALEYUV2RGB1(index, c) /* Single-row variant: reads chroma only from %2 and luma only from %0 \
+     * (no blending with a second row), then performs the same colour math as the two-row macros. \
+     * Leaves 8 packed bytes per channel in mm2 (B), mm4 (G) and mm5 (R). */ \
+    "xor            "#index", "#index"  \n\t"\
+    ".p2align              4            \n\t"\
+    "1:                                 \n\t"\
+    "movq     (%2, "#index"), %%mm3     \n\t" /* mm3 = uvbuf0[index] */\
+    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* step to the second chroma plane */\
+    "movq     (%2, "#index"), %%mm4     \n\t" /* mm4 = uvbuf0[index + UV_OFF] */\
+    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* restore index */\
+    "psraw                $4, %%mm3     \n\t" /* mm3 = uvbuf0[index] >> 4 */\
+    "psraw                $4, %%mm4     \n\t" /* mm4 = uvbuf0[index + UV_OFF] >> 4 */\
+    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
+    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
+    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
+    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
+    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
+    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
+    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
+    "movq  (%0, "#index", 2), %%mm1     \n\t" /* mm1 = buf0[index .. index+3] */\
+    "movq 8(%0, "#index", 2), %%mm7     \n\t" /* mm7 = buf0[index+4 .. index+7] */\
+    "psraw                $4, %%mm1     \n\t" /* mm1 = buf0 >> 4 (words 0..3) */\
+    "psraw                $4, %%mm7     \n\t" /* mm7 = buf0 >> 4 (words 4..7) */\
+    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
+    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
+    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
+    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
+    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
+    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
+    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
+    "paddw             %%mm3, %%mm4     \n\t" /* mm4 = ug + vg */\
+    "movq              %%mm2, %%mm0     \n\t" /* copies for the high halves */\
+    "movq              %%mm5, %%mm6     \n\t"\
+    "movq              %%mm4, %%mm3     \n\t"\
+    "punpcklwd         %%mm2, %%mm2     \n\t" /* duplicate chroma words 0,1 -> 0,0,1,1 */\
+    "punpcklwd         %%mm5, %%mm5     \n\t"\
+    "punpcklwd         %%mm4, %%mm4     \n\t"\
+    "paddw             %%mm1, %%mm2     \n\t" /* add luma words 0..3 */\
+    "paddw             %%mm1, %%mm5     \n\t"\
+    "paddw             %%mm1, %%mm4     \n\t"\
+    "punpckhwd         %%mm0, %%mm0     \n\t" /* duplicate chroma words 2,3 -> 2,2,3,3 */\
+    "punpckhwd         %%mm6, %%mm6     \n\t"\
+    "punpckhwd         %%mm3, %%mm3     \n\t"\
+    "paddw             %%mm7, %%mm0     \n\t" /* add luma words 4..7 */\
+    "paddw             %%mm7, %%mm6     \n\t"\
+    "paddw             %%mm7, %%mm3     \n\t"\
+    /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1 = samples 0..3, 2 = samples 4..7) */\
+    "packuswb          %%mm0, %%mm2     \n\t" /* mm2 = 8 B bytes */\
+    "packuswb          %%mm6, %%mm5     \n\t" /* mm5 = 8 R bytes */\
+    "packuswb          %%mm3, %%mm4     \n\t" /* mm4 = 8 G bytes */\
+
+#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c) /* extra expansion level for the arguments */
+
+/* do vertical chrominance interpolation: like REAL_YSCALEYUV2RGB1, but each chroma
+ * word is the sum of the two chroma rows (%2 and %3) shifted right by 5 (logical
+ * shift), i.e. an unweighted average of both rows; luma is still read from %0 only. */
+#define REAL_YSCALEYUV2RGB1b(index, c) \
+    "xor            "#index", "#index"  \n\t"\
+    ".p2align              4            \n\t"\
+    "1:                                 \n\t"\
+    "movq     (%2, "#index"), %%mm2     \n\t" /* mm2 = uvbuf0[index] */\
+    "movq     (%3, "#index"), %%mm3     \n\t" /* mm3 = uvbuf1[index] */\
+    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* step to the second chroma plane */\
+    "movq     (%2, "#index"), %%mm5     \n\t" /* mm5 = uvbuf0[index + UV_OFF] */\
+    "movq     (%3, "#index"), %%mm4     \n\t" /* mm4 = uvbuf1[index + UV_OFF] */\
+    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* restore index */\
+    "paddw             %%mm2, %%mm3     \n\t" /* mm3 = uvbuf0[index] + uvbuf1[index] */\
+    "paddw             %%mm5, %%mm4     \n\t" /* mm4 = uvbuf0[index + UV_OFF] + uvbuf1[index + UV_OFF] */\
+    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
+    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
+    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
+    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
+    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
+    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
+    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
+    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
+    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
+    "movq  (%0, "#index", 2), %%mm1     \n\t" /* mm1 = buf0[index .. index+3] */\
+    "movq 8(%0, "#index", 2), %%mm7     \n\t" /* mm7 = buf0[index+4 .. index+7] */\
+    "psraw                $4, %%mm1     \n\t" /* mm1 = buf0 >> 4 (words 0..3) */\
+    "psraw                $4, %%mm7     \n\t" /* mm7 = buf0 >> 4 (words 4..7) */\
+    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
+    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
+    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
+    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
+    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
+    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
+    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
+    "paddw             %%mm3, %%mm4     \n\t" /* mm4 = ug + vg */\
+    "movq              %%mm2, %%mm0     \n\t" /* copies for the high halves */\
+    "movq              %%mm5, %%mm6     \n\t"\
+    "movq              %%mm4, %%mm3     \n\t"\
+    "punpcklwd         %%mm2, %%mm2     \n\t" /* duplicate chroma words 0,1 -> 0,0,1,1 */\
+    "punpcklwd         %%mm5, %%mm5     \n\t"\
+    "punpcklwd         %%mm4, %%mm4     \n\t"\
+    "paddw             %%mm1, %%mm2     \n\t" /* add luma words 0..3 */\
+    "paddw             %%mm1, %%mm5     \n\t"\
+    "paddw             %%mm1, %%mm4     \n\t"\
+    "punpckhwd         %%mm0, %%mm0     \n\t" /* duplicate chroma words 2,3 -> 2,2,3,3 */\
+    "punpckhwd         %%mm6, %%mm6     \n\t"\
+    "punpckhwd         %%mm3, %%mm3     \n\t"\
+    "paddw             %%mm7, %%mm0     \n\t" /* add luma words 4..7 */\
+    "paddw             %%mm7, %%mm6     \n\t"\
+    "paddw             %%mm7, %%mm3     \n\t"\
+    /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1 = samples 0..3, 2 = samples 4..7) */\
+    "packuswb          %%mm0, %%mm2     \n\t" /* mm2 = 8 B bytes */\
+    "packuswb          %%mm6, %%mm5     \n\t" /* mm5 = 8 R bytes */\
+    "packuswb          %%mm3, %%mm4     \n\t" /* mm4 = 8 G bytes */\
+
+#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c) /* extra expansion level for the arguments */
+
+#define REAL_YSCALEYUV2RGB1_ALPHA(index) /* load 8 alpha words from the row at asm operand %1, shift right by 7, pack to 8 bytes in mm7 (clobbers mm1) */ \
+    "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
+    "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
+    "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
+    "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
+    "packuswb          %%mm1, %%mm7     \n\t" /* mm7 = 8 unsigned-saturated alpha bytes */
+#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index) /* extra expansion level for the argument */
+
+/**
+ * YV12 to RGB without scaling or interpolating
+ *
+ * Converts a single luma row (buf0) to 32-bit pixels via WRITEBGR32.
+ * For uvalpha < 2048 only ubuf[0] is read (YSCALEYUV2RGB1); otherwise the two
+ * chroma rows ubuf[0] and ubuf[1] are averaged (YSCALEYUV2RGB1b).
+ * When CONFIG_SWSCALE_ALPHA is set and c->alpPixBuf is non-NULL, abuf0 is
+ * bound to the "d" operand (%1) and read by YSCALEYUV2RGB1_ALPHA; otherwise
+ * the alpha register is set to all-ones bits and %1 carries buf1, which the
+ * single-row macros do not read.
+ * vbuf, dstW and y are not referenced in this body.
+ */
+static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
+                                const int16_t *ubuf[2], const int16_t *vbuf[2],
+                                const int16_t *abuf0, uint8_t *dest,
+                                int dstW, int uvalpha, int y)
+{
+    const int16_t *ubuf0 = ubuf[0];
+    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
+
+    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
+        const int16_t *ubuf1 = ubuf[0]; /* bound to %3, which YSCALEYUV2RGB1 does not read */
+        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+            __asm__ volatile(
+                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* save REG_b in the context */
+                "mov        %4, %%"REG_b"               \n\t" /* REG_b = dest */
+                "push %%"REG_BP"                        \n\t" /* REG_BP becomes the loop index */
+                YSCALEYUV2RGB1(%%REGBP, %5)
+                YSCALEYUV2RGB1_ALPHA(%%REGBP)
+                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+                "pop %%"REG_BP"                         \n\t"
+                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore REG_b */
+                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+                   "a" (&c->redDither)
+            );
+        } else {
+            __asm__ volatile(
+                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* save REG_b in the context */
+                "mov        %4, %%"REG_b"               \n\t" /* REG_b = dest */
+                "push %%"REG_BP"                        \n\t" /* REG_BP becomes the loop index */
+                YSCALEYUV2RGB1(%%REGBP, %5)
+                "pcmpeqd %%mm7, %%mm7                   \n\t" /* mm7 = all bits set */
+                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+                "pop %%"REG_BP"                         \n\t"
+                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore REG_b */
+                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+                   "a" (&c->redDither)
+            );
+        }
+    } else {
+        const int16_t *ubuf1 = ubuf[1]; /* second chroma row, averaged in by YSCALEYUV2RGB1b */
+        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+            __asm__ volatile(
+                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* save REG_b in the context */
+                "mov        %4, %%"REG_b"               \n\t" /* REG_b = dest */
+                "push %%"REG_BP"                        \n\t" /* REG_BP becomes the loop index */
+                YSCALEYUV2RGB1b(%%REGBP, %5)
+                YSCALEYUV2RGB1_ALPHA(%%REGBP)
+                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+                "pop %%"REG_BP"                         \n\t"
+                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore REG_b */
+                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+                   "a" (&c->redDither)
+            );
+        } else {
+            __asm__ volatile(
+                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* save REG_b in the context */
+                "mov        %4, %%"REG_b"               \n\t" /* REG_b = dest */
+                "push %%"REG_BP"                        \n\t" /* REG_BP becomes the loop index */
+                YSCALEYUV2RGB1b(%%REGBP, %5)
+                "pcmpeqd %%mm7, %%mm7                   \n\t" /* mm7 = all bits set */
+                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+                "pop %%"REG_BP"                         \n\t"
+                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore REG_b */
+                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+                   "a" (&c->redDither)
+            );
+        }
+    }
+}
+
+static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
+                                const int16_t *ubuf[2], const int16_t *vbuf[2],
+                                const int16_t *abuf0, uint8_t *dest,
+                                int dstW, int uvalpha, int y)
+{   /*
+     * Single-luma-row conversion to packed 24-bit output through WRITEBGR24.
+     * uvalpha < 2048 reads chroma from ubuf[0] only (YSCALEYUV2RGB1); otherwise
+     * ubuf[0] and ubuf[1] are averaged (YSCALEYUV2RGB1b).
+     * vbuf, abuf0, dstW and y are not referenced in this body.
+     */
+    const int16_t *ubuf0 = ubuf[0];
+    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
+
+    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
+        const int16_t *ubuf1 = ubuf[0]; /* bound to %3, which YSCALEYUV2RGB1 does not read */
+        __asm__ volatile(
+            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* save REG_b in the context */
+            "mov        %4, %%"REG_b"               \n\t" /* REG_b = dest */
+            "push %%"REG_BP"                        \n\t" /* REG_BP becomes the loop index */
+            YSCALEYUV2RGB1(%%REGBP, %5)
+            "pxor    %%mm7, %%mm7                   \n\t" /* mm7 = 0 */
+            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
+            "pop %%"REG_BP"                         \n\t"
+            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore REG_b */
+            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+               "a" (&c->redDither)
+        );
+    } else {
+        const int16_t *ubuf1 = ubuf[1]; /* second chroma row, averaged in by YSCALEYUV2RGB1b */
+        __asm__ volatile(
+            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* save REG_b in the context */
+            "mov        %4, %%"REG_b"               \n\t" /* REG_b = dest */
+            "push %%"REG_BP"                        \n\t" /* REG_BP becomes the loop index */
+            YSCALEYUV2RGB1b(%%REGBP, %5)
+            "pxor    %%mm7, %%mm7                   \n\t" /* mm7 = 0 */
+            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
+            "pop %%"REG_BP"                         \n\t"
+            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore REG_b */
+            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+               "a" (&c->redDither)
+        );
+    }
+}
+
+static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
+                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
+                                 const int16_t *abuf0, uint8_t *dest,
+                                 int dstW, int uvalpha, int y)
+{   /*
+     * Single-luma-row conversion stored through WRITERGB15 (defined outside
+     * this part of the file). uvalpha < 2048 reads chroma from ubuf[0] only
+     * (YSCALEYUV2RGB1); otherwise ubuf[0] and ubuf[1] are averaged
+     * (YSCALEYUV2RGB1b). With DITHER1XBPP defined, the per-channel dither
+     * values held in the context are added with unsigned saturation first.
+     * vbuf, abuf0, dstW and y are not referenced in this body.
+     */
+    const int16_t *ubuf0 = ubuf[0];
+    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
+
+    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
+        const int16_t *ubuf1 = ubuf[0]; /* bound to %3, which YSCALEYUV2RGB1 does not read */
+        __asm__ volatile(
+            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* save REG_b in the context */
+            "mov        %4, %%"REG_b"               \n\t" /* REG_b = dest */
+            "push %%"REG_BP"                        \n\t" /* REG_BP becomes the loop index */
+            YSCALEYUV2RGB1(%%REGBP, %5)
+            "pxor    %%mm7, %%mm7                   \n\t" /* mm7 = 0 */
+            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+            "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
+            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
+            "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
+#endif
+            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
+            "pop %%"REG_BP"                         \n\t"
+            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore REG_b */
+            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+               "a" (&c->redDither)
+        );
+    } else {
+        const int16_t *ubuf1 = ubuf[1]; /* second chroma row, averaged in by YSCALEYUV2RGB1b */
+        __asm__ volatile(
+            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* save REG_b in the context */
+            "mov        %4, %%"REG_b"               \n\t" /* REG_b = dest */
+            "push %%"REG_BP"                        \n\t" /* REG_BP becomes the loop index */
+            YSCALEYUV2RGB1b(%%REGBP, %5)
+            "pxor    %%mm7, %%mm7                   \n\t" /* mm7 = 0 */
+            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+            "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
+            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
+            "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
+#endif
+            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
+            "pop %%"REG_BP"                         \n\t"
+            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore REG_b */
+            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+               "a" (&c->redDither)
+        );
+    }
+}
+
+static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
+                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
+                                 const int16_t *abuf0, uint8_t *dest,
+                                 int dstW, int uvalpha, int y)
+{   /*
+     * Single-luma-row conversion stored through WRITERGB16 (defined outside
+     * this part of the file). uvalpha < 2048 reads chroma from ubuf[0] only
+     * (YSCALEYUV2RGB1); otherwise ubuf[0] and ubuf[1] are averaged
+     * (YSCALEYUV2RGB1b). With DITHER1XBPP defined, the per-channel dither
+     * values held in the context are added with unsigned saturation first.
+     * vbuf, abuf0, dstW and y are not referenced in this body.
+     */
+    const int16_t *ubuf0 = ubuf[0];
+    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
+
+    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
+        const int16_t *ubuf1 = ubuf[0]; /* bound to %3, which YSCALEYUV2RGB1 does not read */
+        __asm__ volatile(
+            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* save REG_b in the context */
+            "mov        %4, %%"REG_b"               \n\t" /* REG_b = dest */
+            "push %%"REG_BP"                        \n\t" /* REG_BP becomes the loop index */
+            YSCALEYUV2RGB1(%%REGBP, %5)
+            "pxor    %%mm7, %%mm7                   \n\t" /* mm7 = 0 */
+            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+            "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
+            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
+            "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
+#endif
+            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
+            "pop %%"REG_BP"                         \n\t"
+            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore REG_b */
+            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+               "a" (&c->redDither)
+        );
+    } else {
+        const int16_t *ubuf1 = ubuf[1]; /* second chroma row, averaged in by YSCALEYUV2RGB1b */
+        __asm__ volatile(
+            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* save REG_b in the context */
+            "mov        %4, %%"REG_b"               \n\t" /* REG_b = dest */
+            "push %%"REG_BP"                        \n\t" /* REG_BP becomes the loop index */
+            YSCALEYUV2RGB1b(%%REGBP, %5)
+            "pxor    %%mm7, %%mm7                   \n\t" /* mm7 = 0 */
+            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+            "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
+            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
+            "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
+#endif
+            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
+            "pop %%"REG_BP"                         \n\t"
+            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore REG_b */
+            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+               "a" (&c->redDither)
+        );
+    }
+}
+
+#define REAL_YSCALEYUV2PACKED1(index, c) /* loop head of the 1-row packed path: chroma is read from %2 only */ \
+    "xor            "#index", "#index"  \n\t" /* index = 0 */\
+    ".p2align              4            \n\t"\
+    "1:                                 \n\t" /* loop label; the backward branch is not part of this macro */\
+    "movq     (%2, "#index"), %%mm3     \n\t" /* 4 words at %2 + index */\
+    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* index += value stored at UV_OFF_BYTE(c) */ \
+    "movq     (%2, "#index"), %%mm4     \n\t" /* 4 words at %2 + index + that offset */\
+    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* restore index */ \
+    "psraw                $7, %%mm3     \n\t" \
+    "psraw                $7, %%mm4     \n\t" \
+    "movq  (%0, "#index", 2), %%mm1     \n\t" /* 4 words at %0 + 2*index */\
+    "movq 8(%0, "#index", 2), %%mm7     \n\t" /* the following 4 words of %0 */\
+    "psraw                $7, %%mm1     \n\t" /* arithmetic shift right by 7, as for the chroma words above */ \
+    "psraw                $7, %%mm7     \n\t" \
+
+#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c) /* forwarding wrapper; presumably lets the arguments expand before # stringification */
+
+#define REAL_YSCALEYUV2PACKED1b(index, c) /* loop head of the 1-row packed path: chroma is the sum of rows %2 and %3 */ \
+    "xor "#index", "#index"             \n\t" /* index = 0 */\
+    ".p2align              4            \n\t"\
+    "1:                                 \n\t" /* loop label; the backward branch is not part of this macro */\
+    "movq     (%2, "#index"), %%mm2     \n\t" /* 4 words at %2 + index */\
+    "movq     (%3, "#index"), %%mm3     \n\t" /* 4 words at %3 + index */\
+    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* index += value stored at UV_OFF_BYTE(c) */ \
+    "movq     (%2, "#index"), %%mm5     \n\t" /* 4 words at %2 + index + that offset */\
+    "movq     (%3, "#index"), %%mm4     \n\t" /* 4 words at %3 + index + that offset */\
+    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* restore index */ \
+    "paddw             %%mm2, %%mm3     \n\t" /* first chroma plane: row %2 + row %3 */\
+    "paddw             %%mm5, %%mm4     \n\t" /* second chroma plane: row %2 + row %3 */\
+    "psrlw                $8, %%mm3     \n\t" /* logical shift of the sums by 8 */ \
+    "psrlw                $8, %%mm4     \n\t" \
+    "movq  (%0, "#index", 2), %%mm1     \n\t" /* 4 words at %0 + 2*index */\
+    "movq 8(%0, "#index", 2), %%mm7     \n\t" /* the following 4 words of %0 */\
+    "psraw                $7, %%mm1     \n\t" \
+    "psraw                $7, %%mm7     \n\t"
+#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c) /* forwarding wrapper; presumably lets the arguments expand before # stringification */
+
+static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
+                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
+                                  const int16_t *abuf0, uint8_t *dest,
+                                  int dstW, int uvalpha, int y)
+{   /*
+     * Emit one YUYV422 row from a single luma row (buf0).
+     *
+     * uvalpha < 2048 selects YSCALEYUV2PACKED1 (chroma read from ubuf[0]
+     * only); otherwise YSCALEYUV2PACKED1b sums the ubuf[0] and ubuf[1] rows.
+     * vbuf, abuf0, dstW and y are not referenced by the C code here.
+     * NOTE(review): 8280(%5) is presumably the dstW field addressed
+     * relative to &c->redDither -- confirm against SwsContext.
+     *
+     * Asm operands: %0 buf0, %1 buf1, %2 ubuf0, %3 ubuf1, %4 dest (memory),
+     * %5 &c->redDither.  REG_b is stashed at ESP_OFFSET(%5) and REG_BP is
+     * pushed; both are restored before the asm statement ends.
+     */
+    const int16_t *ubuf0 = ubuf[0];
+    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
+
+    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
+        const int16_t *ubuf1 = ubuf[0];
+        __asm__ volatile(
+            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* save REG_b relative to &c->redDither */
+            "mov        %4, %%"REG_b"               \n\t" /* REG_b = dest */
+            "push %%"REG_BP"                        \n\t" /* REG_BP becomes the loop index */
+            YSCALEYUV2PACKED1(%%REGBP, %5)
+            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
+            "pop %%"REG_BP"                         \n\t"
+            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore REG_b */
+            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+               "a" (&c->redDither)
+        );
+    } else {
+        const int16_t *ubuf1 = ubuf[1];
+        __asm__ volatile(
+            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t" /* save REG_b relative to &c->redDither */
+            "mov        %4, %%"REG_b"               \n\t" /* REG_b = dest */
+            "push %%"REG_BP"                        \n\t" /* REG_BP becomes the loop index */
+            YSCALEYUV2PACKED1b(%%REGBP, %5)
+            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
+            "pop %%"REG_BP"                         \n\t"
+            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t" /* restore REG_b */
+            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+               "a" (&c->redDither)
+        );
+    }
+}
+
+#if COMPILE_TEMPLATE_MMXEXT
+/**
+ * Fast bilinear horizontal luma scaler that runs the run-time generated
+ * MMXEXT code stored in c->lumMmxextFilterCode.
+ *
+ * @param c        scaler context; hLumFilterPos, hLumFilter and
+ *                 lumMmxextFilterCode are read from it
+ * @param dst      output row of 16-bit samples
+ * @param dstWidth number of output samples
+ * @param src      input row of 8-bit samples
+ * @param srcW     number of input samples
+ * @param xInc     horizontal step; treated as 16.16 fixed point by the
+ *                 tail fix-up loop
+ */
+static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
+                                 int dstWidth, const uint8_t *src,
+                                 int srcW, int xInc)
+{
+    int32_t *filterPos = c->hLumFilterPos;
+    int16_t *filter    = c->hLumFilter;
+    void    *mmxextFilterCode = c->lumMmxextFilterCode;
+    int i;
+#if defined(PIC)
+    uint64_t ebxsave;   /* REG_b is saved by hand instead of being clobbered */
+#endif
+#if ARCH_X86_64
+    uint64_t retsave;   /* copy of the 8 bytes at -8(%rsp), see below */
+#endif
+
+    /*
+     * Operands: %0 src, %1 dst, %2 filter, %3 filterPos, %4 generated code,
+     * then ebxsave (PIC only) and retsave (x86-64 only) as %5/%6.
+     */
+    __asm__ volatile(
+#if defined(PIC)
+        "mov               %%"REG_b", %5        \n\t"
+#if ARCH_X86_64
+        /* preserve the slot just below the stack pointer; presumably because
+         * the "call" below pushes its return address there */
+        "mov               -8(%%rsp), %%"REG_a" \n\t"
+        "mov               %%"REG_a", %6        \n\t"
+#endif
+#else
+#if ARCH_X86_64
+        "mov               -8(%%rsp), %%"REG_a" \n\t"
+        "mov               %%"REG_a", %5        \n\t"
+#endif
+#endif
+        "pxor                  %%mm7, %%mm7     \n\t"
+        "mov                      %0, %%"REG_c" \n\t" /* REG_c = src */
+        "mov                      %1, %%"REG_D" \n\t" /* REG_D = dst */
+        "mov                      %2, %%"REG_d" \n\t" /* REG_d = filter */
+        "mov                      %3, %%"REG_b" \n\t" /* REG_b = filterPos */
+        "xor               %%"REG_a", %%"REG_a" \n\t" // i
+        PREFETCH"        (%%"REG_c")            \n\t"
+        PREFETCH"      32(%%"REG_c")            \n\t"
+        PREFETCH"      64(%%"REG_c")            \n\t"
+
+        /*
+         * One invocation of the generated code: load filterPos[0] into esi,
+         * call the code, then advance src by the filterPos entry at byte
+         * offset REG_a and dst by REG_a, and reset REG_a.
+         * NOTE(review): REG_a is left by the generated code; presumably the
+         * number of bytes it produced -- confirm against the code generator.
+         */
+#if ARCH_X86_64
+#define CALL_MMXEXT_FILTER_CODE \
+        "movl            (%%"REG_b"), %%esi     \n\t"\
+        "call                    *%4            \n\t"\
+        "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
+        "add               %%"REG_S", %%"REG_c" \n\t"\
+        "add               %%"REG_a", %%"REG_D" \n\t"\
+        "xor               %%"REG_a", %%"REG_a" \n\t"\
+
+#else
+#define CALL_MMXEXT_FILTER_CODE \
+        "movl (%%"REG_b"), %%esi        \n\t"\
+        "call         *%4                       \n\t"\
+        "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
+        "add               %%"REG_a", %%"REG_D" \n\t"\
+        "xor               %%"REG_a", %%"REG_a" \n\t"\
+
+#endif /* ARCH_X86_64 */
+
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+
+#if defined(PIC)
+        "mov                      %5, %%"REG_b" \n\t"
+#if ARCH_X86_64
+        "mov                      %6, %%"REG_a" \n\t"
+        "mov               %%"REG_a", -8(%%rsp) \n\t"
+#endif
+#else
+#if ARCH_X86_64
+        "mov                      %5, %%"REG_a" \n\t"
+        "mov               %%"REG_a", -8(%%rsp) \n\t"
+#endif
+#endif
+        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
+           "m" (mmxextFilterCode)
+#if defined(PIC)
+          ,"m" (ebxsave)
+#endif
+#if ARCH_X86_64
+          ,"m"(retsave)
+#endif
+        /* "memory": the asm stores to dst[] and to the save slots, none of
+         * which is described by an output operand; without the clobber the
+         * compiler may move the dst[] stores of the loop below across it. */
+        : "memory",
+          "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
+#if !defined(PIC)
+         ,"%"REG_b
+#endif
+    );
+
+    /* Outputs whose source position (i*xInc)>>16 lies at or beyond the last
+     * input sample are overwritten with that last sample, scaled by 128. */
+    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
+        dst[i] = src[srcW-1]*128;
+}
+
+/**
+ * Fast bilinear horizontal chroma scaler that runs the run-time generated
+ * MMXEXT code stored in c->chrMmxextFilterCode, first on plane 1 and then
+ * on plane 2.
+ *
+ * @param c        scaler context; hChrFilterPos, hChrFilter and
+ *                 chrMmxextFilterCode are read from it
+ * @param dst1     output row for the first chroma plane (16-bit samples)
+ * @param dst2     output row for the second chroma plane (16-bit samples)
+ * @param dstWidth number of output samples per plane
+ * @param src1     input row of the first chroma plane (8-bit samples)
+ * @param src2     input row of the second chroma plane (8-bit samples)
+ * @param srcW     number of input samples per plane
+ * @param xInc     horizontal step; treated as 16.16 fixed point by the
+ *                 tail fix-up loop
+ */
+static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
+                                 int dstWidth, const uint8_t *src1,
+                                 const uint8_t *src2, int srcW, int xInc)
+{
+    int32_t *filterPos = c->hChrFilterPos;
+    int16_t *filter    = c->hChrFilter;
+    void    *mmxextFilterCode = c->chrMmxextFilterCode;
+    int i;
+#if defined(PIC)
+    DECLARE_ALIGNED(8, uint64_t, ebxsave);  /* REG_b is saved by hand instead of being clobbered */
+#endif
+#if ARCH_X86_64
+    DECLARE_ALIGNED(8, uint64_t, retsave);  /* copy of the 8 bytes at -8(%rsp), see below */
+#endif
+
+    /*
+     * Operands: %0 src1, %1 dst1, %2 filter, %3 filterPos, %4 generated
+     * code, %5 src2, %6 dst2, then ebxsave (PIC only) and retsave
+     * (x86-64 only) as %7/%8.
+     */
+    __asm__ volatile(
+#if defined(PIC)
+        "mov          %%"REG_b", %7         \n\t"
+#if ARCH_X86_64
+        /* preserve the slot just below the stack pointer; presumably because
+         * the "call" in CALL_MMXEXT_FILTER_CODE pushes its return address there */
+        "mov          -8(%%rsp), %%"REG_a"  \n\t"
+        "mov          %%"REG_a", %8         \n\t"
+#endif
+#else
+#if ARCH_X86_64
+        "mov          -8(%%rsp), %%"REG_a"  \n\t"
+        "mov          %%"REG_a", %7         \n\t"
+#endif
+#endif
+        "pxor             %%mm7, %%mm7      \n\t"
+        "mov                 %0, %%"REG_c"  \n\t" /* REG_c = src1 */
+        "mov                 %1, %%"REG_D"  \n\t" /* REG_D = dst1 */
+        "mov                 %2, %%"REG_d"  \n\t" /* REG_d = filter */
+        "mov                 %3, %%"REG_b"  \n\t" /* REG_b = filterPos */
+        "xor          %%"REG_a", %%"REG_a"  \n\t" // i
+        PREFETCH"   (%%"REG_c")             \n\t"
+        PREFETCH" 32(%%"REG_c")             \n\t"
+        PREFETCH" 64(%%"REG_c")             \n\t"
+
+        /* first plane: four invocations of the generated code */
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        /* second plane: restart with src2/dst2; filter and filterPos
+         * registers are left as they are */
+        "xor          %%"REG_a", %%"REG_a"  \n\t" // i
+        "mov                 %5, %%"REG_c"  \n\t" // src
+        "mov                 %6, %%"REG_D"  \n\t" // buf2
+        PREFETCH"   (%%"REG_c")             \n\t"
+        PREFETCH" 32(%%"REG_c")             \n\t"
+        PREFETCH" 64(%%"REG_c")             \n\t"
+
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+        CALL_MMXEXT_FILTER_CODE
+
+#if defined(PIC)
+        "mov %7, %%"REG_b"    \n\t"
+#if ARCH_X86_64
+        "mov                 %8, %%"REG_a"  \n\t"
+        "mov          %%"REG_a", -8(%%rsp)  \n\t"
+#endif
+#else
+#if ARCH_X86_64
+        "mov                 %7, %%"REG_a"  \n\t"
+        "mov          %%"REG_a", -8(%%rsp)  \n\t"
+#endif
+#endif
+        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
+           "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
+#if defined(PIC)
+          ,"m" (ebxsave)
+#endif
+#if ARCH_X86_64
+          ,"m"(retsave)
+#endif
+        /* "memory": the asm stores to dst1[]/dst2[] and to the save slots,
+         * none of which is described by an output operand; without the
+         * clobber the compiler may move the stores of the loop below
+         * across it. */
+        : "memory",
+          "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
+#if !defined(PIC)
+         ,"%"REG_b
+#endif
+    );
+
+    /* Outputs whose source position (i*xInc)>>16 lies at or beyond the last
+     * input sample are overwritten with that last sample, scaled by 128. */
+    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
+        dst1[i] = src1[srcW-1]*128;
+        dst2[i] = src2[srcW-1]*128;
+    }
+}
+#endif /* COMPILE_TEMPLATE_MMXEXT */
+
+/**
+ * Install the template's MMX/MMXEXT routines into the scaler context.
+ *
+ * Output stage: skipped entirely for 16-bit, 9/10-bit, NV12 and NV21
+ * destinations and when SWS_BITEXACT is set.  Otherwise the vertical
+ * planar filter is installed unless SWS_ACCURATE_RND is requested, and the
+ * packed writers (X, 1 and 2 row variants) are installed for the supported
+ * destination formats unless SWS_FULL_CHR_H_INT is requested.
+ *
+ * Horizontal stage (8-bit source, destination depth <= 14): the fast
+ * bilinear MMXEXT scalers are installed when available and usable,
+ * otherwise both hooks are reset to NULL.
+ */
+static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
+{
+    const enum AVPixelFormat fmt = c->dstFormat;
+    const int accurate_rnd = c->flags & SWS_ACCURATE_RND;
+    const int full_chr_h   = c->flags & SWS_FULL_CHR_H_INT;
+
+    c->use_mmx_vfilter = 0;
+
+    if (!is16BPS(fmt) && !is9_OR_10BPS(fmt) && fmt != AV_PIX_FMT_NV12
+        && fmt != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
+        if (!accurate_rnd) {
+            c->use_mmx_vfilter = 1;
+            c->yuv2planeX      = RENAME(yuv2yuvX);
+        }
+
+        if (!full_chr_h) {
+            /* the multi-row writer comes in an accurate-rounding ("_ar")
+             * and a plain flavour; the 1 and 2 row writers do not */
+            switch (fmt) {
+            case AV_PIX_FMT_RGB32:
+                c->yuv2packedX = accurate_rnd ? RENAME(yuv2rgb32_X_ar)
+                                              : RENAME(yuv2rgb32_X);
+                c->yuv2packed1 = RENAME(yuv2rgb32_1);
+                c->yuv2packed2 = RENAME(yuv2rgb32_2);
+                break;
+            case AV_PIX_FMT_BGR24:
+                c->yuv2packedX = accurate_rnd ? RENAME(yuv2bgr24_X_ar)
+                                              : RENAME(yuv2bgr24_X);
+                c->yuv2packed1 = RENAME(yuv2bgr24_1);
+                c->yuv2packed2 = RENAME(yuv2bgr24_2);
+                break;
+            case AV_PIX_FMT_RGB555:
+                c->yuv2packedX = accurate_rnd ? RENAME(yuv2rgb555_X_ar)
+                                              : RENAME(yuv2rgb555_X);
+                c->yuv2packed1 = RENAME(yuv2rgb555_1);
+                c->yuv2packed2 = RENAME(yuv2rgb555_2);
+                break;
+            case AV_PIX_FMT_RGB565:
+                c->yuv2packedX = accurate_rnd ? RENAME(yuv2rgb565_X_ar)
+                                              : RENAME(yuv2rgb565_X);
+                c->yuv2packed1 = RENAME(yuv2rgb565_1);
+                c->yuv2packed2 = RENAME(yuv2rgb565_2);
+                break;
+            case AV_PIX_FMT_YUYV422:
+                c->yuv2packedX = accurate_rnd ? RENAME(yuv2yuyv422_X_ar)
+                                              : RENAME(yuv2yuyv422_X);
+                c->yuv2packed1 = RENAME(yuv2yuyv422_1);
+                c->yuv2packed2 = RENAME(yuv2yuyv422_2);
+                break;
+            default:
+                break;
+            }
+        }
+    }
+
+    if (c->srcBpc == 8 && c->dstBpc <= 14) {
+        // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
+        c->hyscale_fast = NULL;
+        c->hcscale_fast = NULL;
+#if COMPILE_TEMPLATE_MMXEXT
+        if ((c->flags & SWS_FAST_BILINEAR) && c->canMMXEXTBeUsed) {
+            c->hyscale_fast = RENAME(hyscale_fast);
+            c->hcscale_fast = RENAME(hcscale_fast);
+        }
+#endif /* COMPILE_TEMPLATE_MMXEXT */
+    }
+}
diff --git a/ffmpeg/libswscale/x86/w64xmmtest.c b/ffmpeg/libswscale/x86/w64xmmtest.c
new file mode 100644
index 0000000..dd9a2a4
--- /dev/null
+++ b/ffmpeg/libswscale/x86/w64xmmtest.c
@@ -0,0 +1,31 @@
+/*
+ * check XMM registers for clobbers on Win64
+ * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/w64xmmtest.h"
+#include "libswscale/swscale.h"
+
+wrap(sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
+               const int srcStride[], int srcSliceY, int srcSliceH,
+               uint8_t *const dst[], const int dstStride[]))
+{   /*
+     * Wrapper around sws_scale() for the Win64 XMM clobber check: every
+     * argument is forwarded unchanged through testxmmclobbers().
+     * wrap() and testxmmclobbers() come from libavutil/x86/w64xmmtest.h.
+     */
+    testxmmclobbers(sws_scale, c, srcSlice, srcStride, srcSliceY,
+                    srcSliceH, dst, dstStride);
+}
diff --git a/ffmpeg/libswscale/x86/yuv2rgb.c b/ffmpeg/libswscale/x86/yuv2rgb.c
new file mode 100644
index 0000000..3938e6b
--- /dev/null
+++ b/ffmpeg/libswscale/x86/yuv2rgb.c
@@ -0,0 +1,113 @@
+/*
+ * software YUV to RGB converter
+ *
+ * Copyright (C) 2009 Konstantin Shishkov
+ *
+ * MMX/MMXEXT template stuff (needed for fast movntq support),
+ * 1,4,8bpp support and context / deglobalize stuff
+ * by Michael Niedermayer (michaelni@gmx.at)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <assert.h>
+
+#include "config.h"
+#include "libswscale/rgb2rgb.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/attributes.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/cpu.h"
+
+#if HAVE_INLINE_ASM
+
+#define DITHER1XBPP // only for MMX
+
+/* hope these constant values are cache line aligned */
+DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw)   = 0x00ff00ff00ff00ffULL; /* low byte of every 16-bit word; YUV2RGB ANDs the luma register with it */
+DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL; /* top 5 bits of every byte; applied to mm0 and mm1 in RGB_PACK16 */
+DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL; /* top 6 bits of every byte */
+DECLARE_ASM_CONST(8, uint64_t, pb_e0) = 0xe0e0e0e0e0e0e0e0ULL; /* top 3 bits of every byte; applied to mm2 in RGB_PACK16 */
+DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL; /* low 2 bits of every byte; RGB_PACK16 gmask for 15-bit output */
+DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL; /* low 3 bits of every byte; RGB_PACK16 gmask for 16-bit output */
+
+//MMX versions
+#if HAVE_MMX_INLINE
+#undef RENAME
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 0
+#define RENAME(a) a ## _MMX
+#include "yuv2rgb_template.c"
+#endif /* HAVE_MMX_INLINE */
+
+// MMXEXT versions
+#if HAVE_MMXEXT_INLINE
+#undef RENAME
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 1
+#define RENAME(a) a ## _MMXEXT
+#include "yuv2rgb_template.c"
+#endif /* HAVE_MMXEXT_INLINE */
+
+#endif /* HAVE_INLINE_ASM */
+
+/**
+ * Pick an MMX/MMXEXT YUV -> RGB converter for the context.
+ *
+ * MMXEXT versions are preferred for RGB24/BGR24 when the CPU reports
+ * MMXEXT; otherwise the plain MMX versions are chosen by destination
+ * format.  For 32-bit output from YUVA420P the alpha-aware converter is
+ * returned only when HAVE_7REGS && CONFIG_SWSCALE_ALPHA.
+ *
+ * @param c scaler context; srcFormat and dstFormat are read
+ * @return the selected converter, or NULL when none applies
+ */
+av_cold SwsFunc ff_yuv2rgb_init_mmx(SwsContext *c)
+{
+#if HAVE_INLINE_ASM
+    const int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_MMXEXT_INLINE
+    if (cpu_flags & AV_CPU_FLAG_MMXEXT) {
+        if (c->dstFormat == AV_PIX_FMT_RGB24)
+            return yuv420_rgb24_MMXEXT;
+        if (c->dstFormat == AV_PIX_FMT_BGR24)
+            return yuv420_bgr24_MMXEXT;
+    }
+#endif
+
+    if (cpu_flags & AV_CPU_FLAG_MMX) {
+        const int with_alpha = c->srcFormat == AV_PIX_FMT_YUVA420P;
+
+        switch (c->dstFormat) {
+        case AV_PIX_FMT_RGB32:
+            if (!with_alpha)
+                return yuv420_rgb32_MMX;
+#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
+            return yuva420_rgb32_MMX;
+#else
+            return NULL;
+#endif
+        case AV_PIX_FMT_BGR32:
+            if (!with_alpha)
+                return yuv420_bgr32_MMX;
+#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
+            return yuva420_bgr32_MMX;
+#else
+            return NULL;
+#endif
+        case AV_PIX_FMT_RGB24:
+            return yuv420_rgb24_MMX;
+        case AV_PIX_FMT_BGR24:
+            return yuv420_bgr24_MMX;
+        case AV_PIX_FMT_RGB565:
+            return yuv420_rgb16_MMX;
+        case AV_PIX_FMT_RGB555:
+            return yuv420_rgb15_MMX;
+        default:
+            break;
+        }
+    }
+#endif /* HAVE_INLINE_ASM */
+
+    return NULL;
+}
diff --git a/ffmpeg/libswscale/x86/yuv2rgb_template.c b/ffmpeg/libswscale/x86/yuv2rgb_template.c
new file mode 100644
index 0000000..c879102
--- /dev/null
+++ b/ffmpeg/libswscale/x86/yuv2rgb_template.c
@@ -0,0 +1,451 @@
+/*
+ * software YUV to RGB converter
+ *
+ * Copyright (C) 2001-2007 Michael Niedermayer
+ *           (c) 2010 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#undef MOVNTQ /* the macros below are redefined for each inclusion of this template */
+#undef EMMS
+#undef SFENCE
+
+#if COMPILE_TEMPLATE_MMXEXT
+#define MOVNTQ "movntq" /* MMXEXT build: stores use movntq ... */
+#define SFENCE "sfence" /* ... and YUV2RGB_ENDFUNC issues sfence */
+#else
+#define MOVNTQ "movq" /* plain MMX build: stores use movq */
+#define SFENCE " # nop" /* expands to an assembler comment, i.e. no instruction */
+#endif
+
+#define REG_BLUE  "0" /* MMX register numbers, pasted after "%%mm" by the pack macros */
+#define REG_RED   "1"
+#define REG_GREEN "2"
+#define REG_ALPHA "3"
+
+#define YUV2RGB_LOOP(depth) /* opens the per-row loop; depth = output bytes per pixel */ \
+    h_size = (c->dstW + 7) & ~7; /* width rounded up to 8 pixels */  \
+    if (h_size * depth > FFABS(dstStride[0])) /* would overrun a row */ \
+        h_size -= 8;                                                 \
+                                                                     \
+    vshift = c->srcFormat != AV_PIX_FMT_YUV422P; /* 0 for YUV422P, else 1 */ \
+                                                                     \
+    __asm__ volatile ("pxor %mm4, %mm4\n\t"); /* mm4 = 0 for YUV2RGB */ \
+    for (y = 0; y < srcSliceH; y++) {                                \
+        uint8_t *image    = dst[0] + (y + srcSliceY) * dstStride[0]; \
+        const uint8_t *py = src[0] +               y * srcStride[0]; \
+        const uint8_t *pu = src[1] +   (y >> vshift) * srcStride[1]; \
+        const uint8_t *pv = src[2] +   (y >> vshift) * srcStride[2]; \
+        x86_reg index = -h_size / 2; /* negative; counts up to zero */ \
+
+#define YUV2RGB_INITIAL_LOAD /* opens the asm statement and loads the first group */ \
+    __asm__ volatile (                \
+        "movq (%5, %0, 2), %%mm6\n\t" /* 8 Y bytes */ \
+        "movd    (%2, %0), %%mm0\n\t" /* 4 U bytes */ \
+        "movd    (%3, %0), %%mm1\n\t" /* 4 V bytes */ \
+        "1: \n\t" /* loop head; YUV2RGB_ENDLOOP jumps back here */ \
+
+/* YUV2RGB core
+ * Conversion is performed in usual way:
+ * R = Y' * Ycoef + Vred * V'
+ * G = Y' * Ycoef + Vgreen * V' + Ugreen * U'
+ * B = Y' * Ycoef               + Ublue * U'
+ *
+ * where X' = X * 8 - Xoffset (multiplication is performed to increase
+ * precision a bit).
+ * Since it operates in YUV420 colorspace, Y component is additionally
+ * split into Y1 and Y2 for even and odd pixels.
+ *
+ * Input:
+ * mm0 - U (4 elems), mm1 - V (4 elems), mm6 - Y (8 elems), mm4 - zero register
+ * %4 - base address for the offset and coefficient tables
+ * Output (16-bit words, not yet packed):
+ * even pixels (Y1): mm1 - R, mm2 - G, mm0 - B
+ * odd pixels  (Y2): mm5 - R, mm7 - G, mm3 - B
+ */
+#define YUV2RGB                                  \
+    /* convert Y, U, V into Y1', Y2', U', V' */  \
+    "movq      %%mm6, %%mm7\n\t"                 \
+    "punpcklbw %%mm4, %%mm0\n\t"                 \
+    "punpcklbw %%mm4, %%mm1\n\t"                 \
+    "pand     "MANGLE(mmx_00ffw)", %%mm6\n\t"    \
+    "psrlw     $8,    %%mm7\n\t"                 \
+    "psllw     $3,    %%mm0\n\t"                 \
+    "psllw     $3,    %%mm1\n\t"                 \
+    "psllw     $3,    %%mm6\n\t"                 \
+    "psllw     $3,    %%mm7\n\t"                 \
+    "psubsw   "U_OFFSET"(%4), %%mm0\n\t"         \
+    "psubsw   "V_OFFSET"(%4), %%mm1\n\t"         \
+    "psubw    "Y_OFFSET"(%4), %%mm6\n\t"         \
+    "psubw    "Y_OFFSET"(%4), %%mm7\n\t"         \
+\
+     /* multiply by coefficients */              \
+    "movq      %%mm0, %%mm2\n\t"                 \
+    "movq      %%mm1, %%mm3\n\t"                 \
+    "pmulhw   "UG_COEFF"(%4), %%mm2\n\t"         \
+    "pmulhw   "VG_COEFF"(%4), %%mm3\n\t"         \
+    "pmulhw   "Y_COEFF" (%4), %%mm6\n\t"         \
+    "pmulhw   "Y_COEFF" (%4), %%mm7\n\t"         \
+    "pmulhw   "UB_COEFF"(%4), %%mm0\n\t"         \
+    "pmulhw   "VR_COEFF"(%4), %%mm1\n\t"         \
+    "paddsw    %%mm3, %%mm2\n\t"                 \
+    /* now: mm0 = UB, mm1 = VR, mm2 = CG */      \
+    /*      mm6 = Y1, mm7 = Y2 */                \
+\
+    /* produce RGB */                            \
+    "movq      %%mm7, %%mm3\n\t"                 \
+    "movq      %%mm7, %%mm5\n\t"                 \
+    "paddsw    %%mm0, %%mm3\n\t"                 \
+    "paddsw    %%mm1, %%mm5\n\t"                 \
+    "paddsw    %%mm2, %%mm7\n\t"                 \
+    "paddsw    %%mm6, %%mm0\n\t"                 \
+    "paddsw    %%mm6, %%mm1\n\t"                 \
+    "paddsw    %%mm6, %%mm2\n\t"                 \
+
+/* Pack the 16-bit results of YUV2RGB to bytes with unsigned saturation and
+ * interleave the even and odd pixels.
+ * Input:  even pixels in mm0 (B), mm1 (R), mm2 (G);
+ *         odd pixels  in mm3 (B), mm5 (R), mm7 (G)
+ * Output: mm0 - B0..B7, mm1 - R0..R7, mm2 - G0..G7 (one byte per pixel)
+ */
+#define RGB_PACK_INTERLEAVE                  \
+    /* pack and interleave even/odd pixels */    \
+    "packuswb  %%mm1, %%mm0\n\t" /* mm0 = even B | even R */ \
+    "packuswb  %%mm5, %%mm3\n\t" /* mm3 = odd B  | odd R  */ \
+    "packuswb  %%mm2, %%mm2\n\t" /* mm2 = even G (twice)  */ \
+    "movq      %%mm0, %%mm1\n\t"                 \
+    "packuswb  %%mm7, %%mm7\n\t" /* mm7 = odd G (twice)   */ \
+    "punpcklbw %%mm3, %%mm0\n\t" /* B: even/odd interleaved */ \
+    "punpckhbw %%mm3, %%mm1\n\t" /* R: even/odd interleaved */ \
+    "punpcklbw %%mm7, %%mm2\n\t" /* G: even/odd interleaved */ \
+
+#define YUV2RGB_ENDLOOP(depth) /* loads the next group and closes the asm loop; depth = output bytes per pixel */ \
+    "movq 8 (%5, %0, 2), %%mm6\n\t" /* next 8 Y bytes */ \
+    "movd 4 (%3, %0),    %%mm1\n\t" /* next 4 V bytes */ \
+    "movd 4 (%2, %0),    %%mm0\n\t" /* next 4 U bytes */ \
+    "add $"AV_STRINGIFY(depth * 8)", %1\n\t" /* image += 8 pixels */ \
+    "add  $4, %0\n\t" /* index += 4 */ \
+    "js   1b\n\t" /* repeat while index is still negative */ \
+
+#define YUV2RGB_OPERANDS /* operand list; closes the asm statement and the row loop. %0 index, %1 image, %2 pu-index, %3 pv-index, %4 &c->redDither, %5 py-2*index */ \
+        : "+r" (index), "+r" (image)                              \
+        : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
+          "r" (py - 2*index) /* biased so that base + index starts at the row start */ \
+        : "memory"                                                \
+        );                                                        \
+    }                                                             \
+
+#define YUV2RGB_OPERANDS_ALPHA /* same operands as YUV2RGB_OPERANDS plus %6 = pa-2*index, the alpha row read by LOAD_ALPHA */ \
+        : "+r" (index), "+r" (image)                              \
+        : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
+          "r" (py - 2*index), "r" (pa - 2*index)                  \
+        : "memory"                                                \
+        );                                                        \
+    }                                                             \
+
+#define YUV2RGB_ENDFUNC /* function epilogue: SFENCE (per template variant), emms, then return the slice height */ \
+    __asm__ volatile (SFENCE"\n\t"               \
+                    "emms    \n\t");             \
+    return srcSliceH;                            \
+
+#define IF0(x) /* drops its argument; selected via IF##is15 in RGB_PACK16 */
+#define IF1(x) x /* keeps its argument */
+
+#define RGB_PACK16(gmask, is15) /* packs mm0 (B), mm1 (R), mm2 (G) bytes into eight 15-bit (is15 = 1) or 16-bit (is15 = 0) pixels and stores 16 bytes at %1 */ \
+    "pand      "MANGLE(mmx_redmask)", %%mm0\n\t" \
+    "pand      "MANGLE(mmx_redmask)", %%mm1\n\t" \
+    "movq      %%mm2,     %%mm3\n\t"             \
+    "psllw   $"AV_STRINGIFY(3-is15)", %%mm2\n\t" \
+    "psrlw   $"AV_STRINGIFY(5+is15)", %%mm3\n\t" \
+    "psrlw     $3,        %%mm0\n\t"             \
+    IF##is15("psrlw  $1,  %%mm1\n\t") /* emitted only when is15 = 1 */ \
+    "pand "MANGLE(pb_e0)", %%mm2\n\t"            \
+    "pand "MANGLE(gmask)", %%mm3\n\t"            \
+    "por       %%mm2,     %%mm0\n\t" /* low bytes of the pixels */ \
+    "por       %%mm3,     %%mm1\n\t" /* high bytes of the pixels */ \
+    "movq      %%mm0,     %%mm2\n\t"             \
+    "punpcklbw %%mm1,     %%mm0\n\t" /* pixels 0-3 */ \
+    "punpckhbw %%mm1,     %%mm2\n\t" /* pixels 4-7 */ \
+    MOVNTQ "   %%mm0,      (%1)\n\t"             \
+    MOVNTQ "   %%mm2,     8(%1)\n\t"             \
+
+#define DITHER_RGB /* saturating byte add of the dither values addressed relative to %4 */ \
+    "paddusb "BLUE_DITHER"(%4),  %%mm0\n\t"      \
+    "paddusb "GREEN_DITHER"(%4), %%mm2\n\t"      \
+    "paddusb "RED_DITHER"(%4),   %%mm1\n\t"      \
+
+#if !COMPILE_TEMPLATE_MMXEXT
+static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
+                                       int srcStride[],
+                                       int srcSliceY, int srcSliceH,
+                                       uint8_t *dst[], int dstStride[])
+{   /*
+     * Convert a slice of planar YUV to 15-bit RGB (RGB_PACK16 with
+     * is15 = 1).  The body is assembled from the YUV2RGB_* macros:
+     * YUV2RGB_LOOP opens the per-row loop, the asm fragments handle 8
+     * pixels per iteration, and YUV2RGB_ENDFUNC returns srcSliceH.
+     * With DITHER1XBPP the dither values stored in the context are
+     * refreshed for every row from ff_dither8, selected by row parity.
+     */
+    int y, h_size, vshift;
+
+    YUV2RGB_LOOP(2)
+
+#ifdef DITHER1XBPP
+        c->blueDither  = ff_dither8[y       & 1];
+        c->greenDither = ff_dither8[y       & 1];
+        c->redDither   = ff_dither8[(y + 1) & 1];
+#endif
+
+        YUV2RGB_INITIAL_LOAD
+        YUV2RGB
+        RGB_PACK_INTERLEAVE
+#ifdef DITHER1XBPP
+        DITHER_RGB
+#endif
+        RGB_PACK16(pb_03, 1)
+
+    YUV2RGB_ENDLOOP(2)
+    YUV2RGB_OPERANDS
+    YUV2RGB_ENDFUNC
+}
+
+static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
+                                       int srcStride[],
+                                       int srcSliceY, int srcSliceH,
+                                       uint8_t *dst[], int dstStride[])
+{   /*
+     * Convert a slice of planar YUV to 16-bit RGB (RGB_PACK16 with
+     * is15 = 0).  The body is assembled from the YUV2RGB_* macros:
+     * YUV2RGB_LOOP opens the per-row loop, the asm fragments handle 8
+     * pixels per iteration, and YUV2RGB_ENDFUNC returns srcSliceH.
+     * With DITHER1XBPP the dither values are refreshed for every row;
+     * green uses ff_dither4, blue and red use ff_dither8.
+     */
+    int y, h_size, vshift;
+
+    YUV2RGB_LOOP(2)
+
+#ifdef DITHER1XBPP
+        c->blueDither  = ff_dither8[y       & 1];
+        c->greenDither = ff_dither4[y       & 1];
+        c->redDither   = ff_dither8[(y + 1) & 1];
+#endif
+
+        YUV2RGB_INITIAL_LOAD
+        YUV2RGB
+        RGB_PACK_INTERLEAVE
+#ifdef DITHER1XBPP
+        DITHER_RGB
+#endif
+        RGB_PACK16(pb_07, 0)
+
+    YUV2RGB_ENDLOOP(2)
+    YUV2RGB_OPERANDS
+    YUV2RGB_ENDFUNC
+}
+#endif /* !COMPILE_TEMPLATE_MMXEXT */
+
+#define RGB_PACK24(blue, red) /* packs the YUV2RGB output into 24-bit pixels; blue and red are MMX register numbers given as strings, and swapping them at the call site swaps the byte order. The R/B labels in the comments below follow the parameter names. */\
+    "packuswb  %%mm3,      %%mm0 \n" /* R0 R2 R4 R6 R1 R3 R5 R7 */\
+    "packuswb  %%mm5,      %%mm1 \n" /* B0 B2 B4 B6 B1 B3 B5 B7 */\
+    "packuswb  %%mm7,      %%mm2 \n" /* G0 G2 G4 G6 G1 G3 G5 G7 */\
+    "movq      %%mm"red",  %%mm3 \n"\
+    "movq      %%mm"blue", %%mm6 \n"\
+    "psrlq     $32,        %%mm"red" \n" /* R1 R3 R5 R7 */\
+    "punpcklbw %%mm2,      %%mm3 \n" /* R0 G0 R2 G2 R4 G4 R6 G6 */\
+    "punpcklbw %%mm"red",  %%mm6 \n" /* B0 R1 B2 R3 B4 R5 B6 R7 */\
+    "movq      %%mm3,      %%mm5 \n"\
+    "punpckhbw %%mm"blue", %%mm2 \n" /* G1 B1 G3 B3 G5 B5 G7 B7 */\
+    "punpcklwd %%mm6,      %%mm3 \n" /* R0 G0 B0 R1 R2 G2 B2 R3 */\
+    "punpckhwd %%mm6,      %%mm5 \n" /* R4 G4 B4 R5 R6 G6 B6 R7 */\
+    RGB_PACK24_B /* store stage, chosen per template variant */
+
+#if COMPILE_TEMPLATE_MMXEXT
+DECLARE_ASM_CONST(8, int16_t, mask1101[4]) = {-1,-1, 0,-1}; /* word-lane masks: -1 keeps the lane, 0 clears it */
+DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0};
+DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0};
+DECLARE_ASM_CONST(8, int16_t, mask1001[4]) = {-1, 0, 0,-1};
+DECLARE_ASM_CONST(8, int16_t, mask0100[4]) = { 0,-1, 0, 0};
+#undef RGB_PACK24_B
+#define RGB_PACK24_B /* MMXEXT store stage: pshufw/pand/por build three full quadwords, written with MOVNTQ */\
+    "pshufw    $0xc6,  %%mm2, %%mm1 \n"\
+    "pshufw    $0x84,  %%mm3, %%mm6 \n"\
+    "pshufw    $0x38,  %%mm5, %%mm7 \n"\
+    "pand "MANGLE(mask1101)", %%mm6 \n" /* R0 G0 B0 R1 -- -- R2 G2 */\
+    "movq      %%mm1,         %%mm0 \n"\
+    "pand "MANGLE(mask0110)", %%mm7 \n" /* -- -- R6 G6 B6 R7 -- -- */\
+    "movq      %%mm1,         %%mm2 \n"\
+    "pand "MANGLE(mask0100)", %%mm1 \n" /* -- -- G3 B3 -- -- -- -- */\
+    "psrlq       $48,         %%mm3 \n" /* B2 R3 -- -- -- -- -- -- */\
+    "pand "MANGLE(mask0010)", %%mm0 \n" /* -- -- -- -- G1 B1 -- -- */\
+    "psllq       $32,         %%mm5 \n" /* -- -- -- -- R4 G4 B4 R5 */\
+    "pand "MANGLE(mask1001)", %%mm2 \n" /* G5 B5 -- -- -- -- G7 B7 */\
+    "por       %%mm3,         %%mm1 \n"\
+    "por       %%mm6,         %%mm0 \n"\
+    "por       %%mm5,         %%mm1 \n"\
+    "por       %%mm7,         %%mm2 \n"\
+    MOVNTQ"    %%mm0,          (%1) \n"\
+    MOVNTQ"    %%mm1,         8(%1) \n"\
+    MOVNTQ"    %%mm2,        16(%1) \n"\
+
+#else
+#undef RGB_PACK24_B
+#define RGB_PACK24_B /* plain MMX store stage: the 24 output bytes are written with 4-byte movd stores that partly overlap */\
+    "movd      %%mm3,       (%1) \n" /* R0 G0 B0 R1 */\
+    "movd      %%mm2,      4(%1) \n" /* G1 B1 */\
+    "psrlq     $32,        %%mm3 \n"\
+    "psrlq     $16,        %%mm2 \n"\
+    "movd      %%mm3,      6(%1) \n" /* R2 G2 B2 R3 */\
+    "movd      %%mm2,     10(%1) \n" /* G3 B3 */\
+    "psrlq     $16,        %%mm2 \n"\
+    "movd      %%mm5,     12(%1) \n" /* R4 G4 B4 R5 */\
+    "movd      %%mm2,     16(%1) \n" /* G5 B5 */\
+    "psrlq     $32,        %%mm5 \n"\
+    "movd      %%mm2,     20(%1) \n" /* -- -- G7 B7 */\
+    "movd      %%mm5,     18(%1) \n" /* R6 G6 B6 R7; overwrites the two filler bytes of the previous store */\
+
+#endif
+
+static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
+                                       int srcStride[],
+                                       int srcSliceY, int srcSliceH,
+                                       uint8_t *dst[], int dstStride[])
+{
+    int y, h_size, vshift; /* not referenced directly below; presumably consumed by the YUV2RGB_* macros -- confirm */
+    /* Judging by the name, converts a YUV 4:2:0 slice to packed 24-bit pixels; the body is assembled from macros defined outside this excerpt. */
+    YUV2RGB_LOOP(3) /* 3: presumably bytes per output pixel (RGB_PACK24_B stores 24 bytes per 8 pixels) -- confirm */
+    /* Asm fragment order: YUV2RGB_INITIAL_LOAD, YUV2RGB, then RGB_PACK24, which interleaves the channels and stores through RGB_PACK24_B. */
+        YUV2RGB_INITIAL_LOAD
+        YUV2RGB
+        RGB_PACK24(REG_BLUE, REG_RED) /* arguments are in the opposite order to yuv420_bgr24 */
+    /* The remaining macros presumably close the loop, supply the asm operands and return -- confirm */
+    YUV2RGB_ENDLOOP(3)
+    YUV2RGB_OPERANDS
+    YUV2RGB_ENDFUNC
+}
+
+static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
+                                       int srcStride[],
+                                       int srcSliceY, int srcSliceH,
+                                       uint8_t *dst[], int dstStride[])
+{
+    int y, h_size, vshift; /* not referenced directly below; presumably consumed by the YUV2RGB_* macros -- confirm */
+    /* Judging by the name, converts a YUV 4:2:0 slice to packed 24-bit pixels; the body is assembled from macros defined outside this excerpt. */
+    YUV2RGB_LOOP(3) /* 3: presumably bytes per output pixel (RGB_PACK24_B stores 24 bytes per 8 pixels) -- confirm */
+    /* Asm fragment order: YUV2RGB_INITIAL_LOAD, YUV2RGB, then RGB_PACK24, which interleaves the channels and stores through RGB_PACK24_B. */
+        YUV2RGB_INITIAL_LOAD
+        YUV2RGB
+        RGB_PACK24(REG_RED, REG_BLUE) /* arguments are in the opposite order to yuv420_rgb24 */
+    /* The remaining macros presumably close the loop, supply the asm operands and return -- confirm */
+    YUV2RGB_ENDLOOP(3)
+    YUV2RGB_OPERANDS
+    YUV2RGB_ENDFUNC
+}
+
+
+#define SET_EMPTY_ALPHA /* asm fragment used by yuv420_rgb32 / yuv420_bgr32 where the yuva420_* variants use LOAD_ALPHA */ \
+    "pcmpeqd   %%mm"REG_ALPHA", %%mm"REG_ALPHA"\n\t" /* set alpha to 0xFF: comparing a register with itself sets every bit */ \
+
+#define LOAD_ALPHA /* asm fragment: load 8 bytes into the REG_ALPHA register */ \
+    "movq      (%6, %0, 2),     %%mm"REG_ALPHA"\n\t" /* address = operand 6 + 2 * operand 0; operands are bound elsewhere, presumably by YUV2RGB_OPERANDS_ALPHA -- confirm */ \
+
+#define RGB_PACK32(red, green, blue, alpha)  /* interleave four 8-byte channel registers into 8 pixels of (blue, green, red, alpha) bytes and store 32 bytes at (%1); Xn below = byte n of argument X; clobbers mm5, mm6 and all four argument registers */ \
+    "movq      %%mm"blue",  %%mm5\n\t"       /* mm5   = B0 .. B7 */ \
+    "movq      %%mm"red",   %%mm6\n\t"       /* mm6   = R0 .. R7 */ \
+    "punpckhbw %%mm"green", %%mm5\n\t"       /* mm5   = B4 G4 B5 G5 B6 G6 B7 G7 */ \
+    "punpcklbw %%mm"green", %%mm"blue"\n\t"  /* blue  = B0 G0 B1 G1 B2 G2 B3 G3 */ \
+    "punpckhbw %%mm"alpha", %%mm6\n\t"       /* mm6   = R4 A4 R5 A5 R6 A6 R7 A7 */ \
+    "punpcklbw %%mm"alpha", %%mm"red"\n\t"   /* red   = R0 A0 R1 A1 R2 A2 R3 A3 */ \
+    "movq      %%mm"blue",  %%mm"green"\n\t" /* green = copy of blue (B0 G0 .. B3 G3); original green no longer needed */ \
+    "movq      %%mm5,       %%mm"alpha"\n\t" /* alpha = copy of mm5 (B4 G4 .. B7 G7); original alpha no longer needed */ \
+    "punpcklwd %%mm"red",   %%mm"blue"\n\t"  /* blue  = B0 G0 R0 A0 B1 G1 R1 A1 */ \
+    "punpckhwd %%mm"red",   %%mm"green"\n\t" /* green = B2 G2 R2 A2 B3 G3 R3 A3 */ \
+    "punpcklwd %%mm6,       %%mm5\n\t"       /* mm5   = B4 G4 R4 A4 B5 G5 R5 A5 */ \
+    "punpckhwd %%mm6,       %%mm"alpha"\n\t" /* alpha = B6 G6 R6 A6 B7 G7 R7 A7 */ \
+    MOVNTQ "   %%mm"blue",   0(%1)\n\t"      /* pixels 0-1 */ \
+    MOVNTQ "   %%mm"green",  8(%1)\n\t"      /* pixels 2-3 */ \
+    MOVNTQ "   %%mm5,       16(%1)\n\t"      /* pixels 4-5 */ \
+    MOVNTQ "   %%mm"alpha", 24(%1)\n\t"      /* pixels 6-7 */ \
+
+#if !COMPILE_TEMPLATE_MMXEXT
+static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
+                                       int srcStride[],
+                                       int srcSliceY, int srcSliceH,
+                                       uint8_t *dst[], int dstStride[])
+{
+    int y, h_size, vshift; /* not referenced directly below; presumably consumed by the YUV2RGB_* macros -- confirm */
+    /* Judging by the name, converts a YUV 4:2:0 slice to packed 32-bit pixels with a constant 0xFF alpha byte; the body is assembled from macros defined outside this excerpt. */
+    YUV2RGB_LOOP(4) /* 4: presumably bytes per output pixel (RGB_PACK32 stores 32 bytes per 8 pixels) -- confirm */
+
+        YUV2RGB_INITIAL_LOAD
+        YUV2RGB
+        RGB_PACK_INTERLEAVE
+        SET_EMPTY_ALPHA /* REG_ALPHA = all 0xFF bytes */
+        RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA) /* bytes of each pixel in memory: REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA */
+    /* The remaining macros presumably close the loop, supply the asm operands and return -- confirm */
+    YUV2RGB_ENDLOOP(4)
+    YUV2RGB_OPERANDS
+    YUV2RGB_ENDFUNC
+}
+
+#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
+static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
+                                        int srcStride[],
+                                        int srcSliceY, int srcSliceH,
+                                        uint8_t *dst[], int dstStride[])
+{
+    int y, h_size, vshift; /* y is read below; h_size and vshift are presumably consumed by the YUV2RGB_* macros -- confirm */
+    /* Same macro sequence as yuv420_rgb32, except that alpha is loaded with LOAD_ALPHA instead of being forced to 0xFF. */
+    YUV2RGB_LOOP(4) /* 4: presumably bytes per output pixel (RGB_PACK32 stores 32 bytes per 8 pixels) -- confirm */
+
+        const uint8_t *pa = src[3] + y * srcStride[3]; /* start of row y in plane 3; presumably the alpha plane read by LOAD_ALPHA through asm operand 6 -- confirm */
+        YUV2RGB_INITIAL_LOAD
+        YUV2RGB
+        RGB_PACK_INTERLEAVE
+        LOAD_ALPHA /* REG_ALPHA = 8 bytes loaded from memory */
+        RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA) /* bytes of each pixel in memory: REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA */
+    /* Uses YUV2RGB_OPERANDS_ALPHA rather than YUV2RGB_OPERANDS, presumably to add the pa operand -- confirm */
+    YUV2RGB_ENDLOOP(4)
+    YUV2RGB_OPERANDS_ALPHA
+    YUV2RGB_ENDFUNC
+}
+#endif
+
+static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
+                                       int srcStride[],
+                                       int srcSliceY, int srcSliceH,
+                                       uint8_t *dst[], int dstStride[])
+{
+    int y, h_size, vshift; /* not referenced directly below; presumably consumed by the YUV2RGB_* macros -- confirm */
+    /* Judging by the name, converts a YUV 4:2:0 slice to packed 32-bit pixels with a constant 0xFF alpha byte; the body is assembled from macros defined outside this excerpt. */
+    YUV2RGB_LOOP(4) /* 4: presumably bytes per output pixel (RGB_PACK32 stores 32 bytes per 8 pixels) -- confirm */
+
+        YUV2RGB_INITIAL_LOAD
+        YUV2RGB
+        RGB_PACK_INTERLEAVE
+        SET_EMPTY_ALPHA /* REG_ALPHA = all 0xFF bytes */
+        RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA) /* red/blue arguments swapped: bytes of each pixel in memory are REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA */
+    /* The remaining macros presumably close the loop, supply the asm operands and return -- confirm */
+    YUV2RGB_ENDLOOP(4)
+    YUV2RGB_OPERANDS
+    YUV2RGB_ENDFUNC
+}
+
+#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
+static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
+                                        int srcStride[],
+                                        int srcSliceY, int srcSliceH,
+                                        uint8_t *dst[], int dstStride[])
+{
+    int y, h_size, vshift; /* y is read below; h_size and vshift are presumably consumed by the YUV2RGB_* macros -- confirm */
+    /* Same macro sequence as yuv420_bgr32, except that alpha is loaded with LOAD_ALPHA instead of being forced to 0xFF. */
+    YUV2RGB_LOOP(4) /* 4: presumably bytes per output pixel (RGB_PACK32 stores 32 bytes per 8 pixels) -- confirm */
+
+        const uint8_t *pa = src[3] + y * srcStride[3]; /* start of row y in plane 3; presumably the alpha plane read by LOAD_ALPHA through asm operand 6 -- confirm */
+        YUV2RGB_INITIAL_LOAD
+        YUV2RGB
+        RGB_PACK_INTERLEAVE
+        LOAD_ALPHA /* REG_ALPHA = 8 bytes loaded from memory */
+        RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA) /* red/blue arguments swapped: bytes of each pixel in memory are REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA */
+    /* Uses YUV2RGB_OPERANDS_ALPHA rather than YUV2RGB_OPERANDS, presumably to add the pa operand -- confirm */
+    YUV2RGB_ENDLOOP(4)
+    YUV2RGB_OPERANDS_ALPHA
+    YUV2RGB_ENDFUNC
+}
+#endif
+
+#endif /* !COMPILE_TEMPLATE_MMXEXT */