Diffstat (limited to 'ffmpeg/libswscale')
-rw-r--r--  ffmpeg/libswscale/Makefile                     19
-rw-r--r--  ffmpeg/libswscale/bfin/Makefile                 3
-rw-r--r--  ffmpeg/libswscale/bfin/internal_bfin.S        613
-rw-r--r--  ffmpeg/libswscale/bfin/swscale_bfin.c          86
-rw-r--r--  ffmpeg/libswscale/bfin/yuv2rgb_bfin.c         202
-rw-r--r--  ffmpeg/libswscale/colorspace-test.c           170
-rw-r--r--  ffmpeg/libswscale/input.c                    1354
-rw-r--r--  ffmpeg/libswscale/libswscale.pc                14
-rw-r--r--  ffmpeg/libswscale/libswscale.v                  4
-rw-r--r--  ffmpeg/libswscale/options.c                    81
-rw-r--r--  ffmpeg/libswscale/output.c                   1722
-rw-r--r--  ffmpeg/libswscale/ppc/Makefile                  3
-rw-r--r--  ffmpeg/libswscale/ppc/swscale_altivec.c       328
-rw-r--r--  ffmpeg/libswscale/ppc/yuv2rgb_altivec.c       854
-rw-r--r--  ffmpeg/libswscale/ppc/yuv2rgb_altivec.h        51
-rw-r--r--  ffmpeg/libswscale/ppc/yuv2yuv_altivec.c       194
-rw-r--r--  ffmpeg/libswscale/rgb2rgb.c                   390
-rw-r--r--  ffmpeg/libswscale/rgb2rgb.h                   166
-rw-r--r--  ffmpeg/libswscale/rgb2rgb_template.c          927
-rw-r--r--  ffmpeg/libswscale/sparc/Makefile                1
-rw-r--r--  ffmpeg/libswscale/sparc/yuv2rgb_vis.c         212
-rw-r--r--  ffmpeg/libswscale/swscale-test.c              415
-rw-r--r--  ffmpeg/libswscale/swscale.c                   974
-rw-r--r--  ffmpeg/libswscale/swscale.h                   355
-rw-r--r--  ffmpeg/libswscale/swscale_internal.h          840
-rw-r--r--  ffmpeg/libswscale/swscale_unscaled.c         1146
-rw-r--r--  ffmpeg/libswscale/utils.c                    1882
-rw-r--r--  ffmpeg/libswscale/version.h                    59
-rw-r--r--  ffmpeg/libswscale/x86/Makefile                 11
-rw-r--r--  ffmpeg/libswscale/x86/input.asm               670
-rw-r--r--  ffmpeg/libswscale/x86/output.asm              413
-rw-r--r--  ffmpeg/libswscale/x86/rgb2rgb.c               149
-rw-r--r--  ffmpeg/libswscale/x86/rgb2rgb_template.c     2498
-rw-r--r--  ffmpeg/libswscale/x86/scale.asm               431
-rw-r--r--  ffmpeg/libswscale/x86/swscale.c               585
-rw-r--r--  ffmpeg/libswscale/x86/swscale_template.c     1717
-rw-r--r--  ffmpeg/libswscale/x86/w64xmmtest.c             31
-rw-r--r--  ffmpeg/libswscale/x86/yuv2rgb.c               113
-rw-r--r--  ffmpeg/libswscale/x86/yuv2rgb_template.c      451
-rw-r--r--  ffmpeg/libswscale/yuv2rgb.c                   927
40 files changed, 21061 insertions, 0 deletions
diff --git a/ffmpeg/libswscale/Makefile b/ffmpeg/libswscale/Makefile
new file mode 100644
index 0000000..dd00f7d
--- /dev/null
+++ b/ffmpeg/libswscale/Makefile
@@ -0,0 +1,19 @@
+include $(SUBDIR)../config.mak
+
+NAME = swscale
+FFLIBS = avutil
+
+HEADERS = swscale.h \
+ version.h \
+
+OBJS = input.o \
+ options.o \
+ output.o \
+ rgb2rgb.o \
+ swscale.o \
+ swscale_unscaled.o \
+ utils.o \
+ yuv2rgb.o \
+
+TESTPROGS = colorspace \
+ swscale \
diff --git a/ffmpeg/libswscale/bfin/Makefile b/ffmpeg/libswscale/bfin/Makefile
new file mode 100644
index 0000000..5f34550
--- /dev/null
+++ b/ffmpeg/libswscale/bfin/Makefile
@@ -0,0 +1,3 @@
+OBJS += bfin/internal_bfin.o \
+ bfin/swscale_bfin.o \
+ bfin/yuv2rgb_bfin.o \
diff --git a/ffmpeg/libswscale/bfin/internal_bfin.S b/ffmpeg/libswscale/bfin/internal_bfin.S
new file mode 100644
index 0000000..eab30aa
--- /dev/null
+++ b/ffmpeg/libswscale/bfin/internal_bfin.S
@@ -0,0 +1,613 @@
+/*
+ * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
+ * April 20, 2007
+ *
+ * Blackfin video color space converter operations
+ * convert I420 YV12 to RGB in various formats
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+/*
+YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
+and converts it to RGB565: R:5 bits, G:6 bits, B:5 bits, packed into shorts.
+
+
+The following calculation is used for the conversion:
+
+ r = clipz((y - oy) * cy + crv * (v - 128))
+ g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
+ b = clipz((y - oy) * cy + cbu * (u - 128))
+
+y, u, v are prescaled by a factor of 4, i.e. left-shifted, to gain precision.
+
+
+New factorization to eliminate the truncation error which was
+occurring due to the byteop3p.
+
+
+1) Use byteop16m to subtract quad bytes; we use this on U8 data, so the
+   offsets need to be renormalized to 8 bits.
+
+2) Scale operands up by a factor of 4, not 8, because Blackfin
+   multiplies include a shift.
+
+3) Compute into the accumulators cy * yx0, cy * yx1.
+
+4) Compute each of the linear equations:
+ r = clipz((y - oy) * cy + crv * (v - 128))
+
+ g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
+
+ b = clipz((y - oy) * cy + cbu * (u - 128))
+
+   Reuse of the accumulators requires that we actually multiply
+   twice: once with an addition and a second time with a subtraction.
+
+   Because of this we need to compute the equations in the order R, B,
+   then G, saving the writes for B in the case of 24/32-bit color
+   formats.
+
+ API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
+ int dW, uint32_t *coeffs);
+
+ A B
+ --- ---
+ i2 = cb i3 = cr
+ i1 = coeff i0 = y
+
+coeffs has the following layout in memory:
+
+uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv;
+
+coeffs is a pointer to oy.
+
+The {rgb} masks are only used by the 565 packing algorithm. Note that data
+replication is used to simplify the internal algorithms for the dual-MAC
+architecture of Blackfin.
+
+All routines are exported with _ff_bfin_ as a symbol prefix.
+
+Rough performance gain compared against -O3:
+
+2779809/1484290 187.28%
+
+which translates to ~33-57 cycles/pel for the reference vs ~17.5
+cycles/pel for the optimized implementations. It is not clear why there
+is such a huge variation in the reference code on Blackfin; it
+presumably comes down to the memory system.
+*/
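
For reference, here is a minimal scalar C sketch of the per-pixel math described in the comment above. It is an editorial illustration only: the 16.16 fixed-point scale, the coefficient values and the [0,255] clip range are assumptions chosen for readability, not the prescaled, replicated values the Blackfin routines actually load from coeffs.

#include <stdint.h>
#include <stdio.h>

/* clipz: clamp a component to the displayable range */
static int clipz(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* Convert one Y/U/V triple to RGB565 using the equations quoted above,
 * with illustrative limited-range BT.601 coefficients in 16.16 fixed point. */
static uint16_t yuv_to_rgb565_pixel(int y, int u, int v)
{
    const int oy  = 16;                                /* luma offset */
    const int cy  = (int)(255.0 / 219.0 * 65536 + 0.5);
    const int crv = (int)(1.596 * 65536 + 0.5);        /* V -> R      */
    const int cbu = (int)(2.017 * 65536 + 0.5);        /* U -> B      */
    const int cgu = -(int)(0.392 * 65536 + 0.5);       /* U -> G      */
    const int cgv = -(int)(0.813 * 65536 + 0.5);       /* V -> G      */

    int yc = (y - oy) * cy;
    int r  = clipz((yc + crv * (v - 128)) >> 16);
    int g  = clipz((yc + cgv * (v - 128) + cgu * (u - 128)) >> 16);
    int b  = clipz((yc + cbu * (u - 128)) >> 16);

    /* pack as rrrrrggggggbbbbb, matching the 565 bit-layout diagram below */
    return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}

int main(void)
{
    /* one sample pixel, just to exercise the math */
    printf("0x%04x\n", yuv_to_rgb565_pixel(128, 90, 200));
    return 0;
}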
+
+#define mL3 .text
+#if defined(__FDPIC__) && CONFIG_SRAM
+#define mL1 .l1.text
+#else
+#define mL1 mL3
+#endif
+#define MEM mL1
+
+#define DEFUN(fname,where,interface) \
+ .section where; \
+ .global _ff_bfin_ ## fname; \
+ .type _ff_bfin_ ## fname, STT_FUNC; \
+ .align 8; \
+ _ff_bfin_ ## fname
+
+#define DEFUN_END(fname) \
+ .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
+
+
+.text
+
+#define COEFF_LEN 11*4
+#define COEFF_REL_CY_OFF 4*4
+
+#define ARG_OUT 20
+#define ARG_W 24
+#define ARG_COEFF 28
+
+DEFUN(yuv2rgb565_line,MEM,
+ (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
+ link 0;
+ [--sp] = (r7:4);
+ p1 = [fp+ARG_OUT];
+ r3 = [fp+ARG_W];
+
+ i0 = r0;
+ i2 = r1;
+ i3 = r2;
+
+ r0 = [fp+ARG_COEFF];
+ i1 = r0;
+ b1 = i1;
+ l1 = COEFF_LEN;
+ m0 = COEFF_REL_CY_OFF;
+ p0 = r3;
+
+ r0 = [i0++]; // 2Y
+ r1.l = w[i2++]; // 2u
+ r1.h = w[i3++]; // 2v
+ p0 = p0>>2;
+
+ lsetup (.L0565, .L1565) lc0 = p0;
+
+ /*
+ uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
+ r0 -- used to load 4ys
+ r1 -- used to load 2us,2vs
+ r4 -- y3,y2
+ r5 -- y1,y0
+ r6 -- u1,u0
+ r7 -- v1,v0
+ */
+ r2=[i1++]; // oy
+.L0565:
+ /*
+ rrrrrrrr gggggggg bbbbbbbb
+ 5432109876543210
+ bbbbb >>3
+ gggggggg <<3
+ rrrrrrrr <<8
+ rrrrrggggggbbbbb
+ */
+ (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
+ (r7,r6) = byteop16m (r1:0, r3:2) (r);
+ r5 = r5 << 2 (v); // y1,y0
+ r4 = r4 << 2 (v); // y3,y2
+ r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
+ r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
+ /* Y' = y*cy */
+ a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
+
+ /* R = Y+ crv*(Cr-128) */
+ r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
+ a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
+ r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
+ r2 = r2 >> 3 (v);
+ r3 = r2 & r5;
+
+ /* B = Y+ cbu*(Cb-128) */
+ r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
+ a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
+ r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
+ r2 = r2 << 8 (v);
+ r2 = r2 & r5;
+ r3 = r3 | r2;
+
+ /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
+ a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
+ r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
+ r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask
+ r2 = r2 << 3 (v);
+ r2 = r2 & r5;
+ r3 = r3 | r2;
+ [p1++]=r3 || r1=[i1++]; // cy
+
+ /* Y' = y*cy */
+
+ a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
+
+ /* R = Y+ crv*(Cr-128) */
+ r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
+ a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
+ r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
+ r2 = r2 >> 3 (v);
+ r3 = r2 & r5;
+
+ /* B = Y+ cbu*(Cb-128) */
+ r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
+ a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
+ r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
+ r2 = r2 << 8 (v);
+ r2 = r2 & r5;
+ r3 = r3 | r2;
+
+ /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
+ a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
+ r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
+ r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++]; // 2Y
+ r2 = r2 << 3 (v) || r1.l = w[i2++]; // 2u
+ r2 = r2 & r5;
+ r3 = r3 | r2;
+ [p1++]=r3 || r1.h = w[i3++]; // 2v
+.L1565: r2=[i1++]; // oy
+
+ l1 = 0;
+
+ (r7:4) = [sp++];
+ unlink;
+ rts;
+DEFUN_END(yuv2rgb565_line)
+
+DEFUN(yuv2rgb555_line,MEM,
+ (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
+ link 0;
+ [--sp] = (r7:4);
+ p1 = [fp+ARG_OUT];
+ r3 = [fp+ARG_W];
+
+ i0 = r0;
+ i2 = r1;
+ i3 = r2;
+
+ r0 = [fp+ARG_COEFF];
+ i1 = r0;
+ b1 = i1;
+ l1 = COEFF_LEN;
+ m0 = COEFF_REL_CY_OFF;
+ p0 = r3;
+
+ r0 = [i0++]; // 2Y
+ r1.l = w[i2++]; // 2u
+ r1.h = w[i3++]; // 2v
+ p0 = p0>>2;
+
+ lsetup (.L0555, .L1555) lc0 = p0;
+
+ /*
+ uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
+ r0 -- used to load 4ys
+ r1 -- used to load 2us,2vs
+ r4 -- y3,y2
+ r5 -- y1,y0
+ r6 -- u1,u0
+ r7 -- v1,v0
+ */
+ r2=[i1++]; // oy
+.L0555:
+ /*
+ rrrrrrrr gggggggg bbbbbbbb
+ 5432109876543210
+ bbbbb >>3
+ gggggggg <<2
+ rrrrrrrr <<7
+ xrrrrrgggggbbbbb
+ */
+
+ (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
+ (r7,r6) = byteop16m (r1:0, r3:2) (r);
+ r5 = r5 << 2 (v); // y1,y0
+ r4 = r4 << 2 (v); // y3,y2
+ r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
+ r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
+ /* Y' = y*cy */
+ a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
+
+ /* R = Y+ crv*(Cr-128) */
+ r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
+ a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
+ r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
+ r2 = r2 >> 3 (v);
+ r3 = r2 & r5;
+
+ /* B = Y+ cbu*(Cb-128) */
+ r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
+ a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
+ r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
+ r2 = r2 << 7 (v);
+ r2 = r2 & r5;
+ r3 = r3 | r2;
+
+ /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
+ a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
+ r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
+ r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask
+ r2 = r2 << 2 (v);
+ r2 = r2 & r5;
+ r3 = r3 | r2;
+ [p1++]=r3 || r1=[i1++]; // cy
+
+ /* Y' = y*cy */
+
+ a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
+
+ /* R = Y+ crv*(Cr-128) */
+ r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
+ a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
+ r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
+ r2 = r2 >> 3 (v);
+ r3 = r2 & r5;
+
+ /* B = Y+ cbu*(Cb-128) */
+ r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
+ a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
+ r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
+ r2 = r2 << 7 (v);
+ r2 = r2 & r5;
+ r3 = r3 | r2;
+
+ /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
+ a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
+ r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
+ r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++]; // 4Y
+ r2 = r2 << 2 (v) || r1.l=w[i2++]; // 2u
+ r2 = r2 & r5;
+ r3 = r3 | r2;
+ [p1++]=r3 || r1.h=w[i3++]; // 2v
+
+.L1555: r2=[i1++]; // oy
+
+ l1 = 0;
+
+ (r7:4) = [sp++];
+ unlink;
+ rts;
+DEFUN_END(yuv2rgb555_line)
+
+DEFUN(yuv2rgb24_line,MEM,
+ (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
+ link 0;
+ [--sp] = (r7:4);
+ p1 = [fp+ARG_OUT];
+ r3 = [fp+ARG_W];
+ p2 = p1;
+ p2 += 3;
+
+ i0 = r0;
+ i2 = r1;
+ i3 = r2;
+
+ r0 = [fp+ARG_COEFF]; // coeff buffer
+ i1 = r0;
+ b1 = i1;
+ l1 = COEFF_LEN;
+ m0 = COEFF_REL_CY_OFF;
+ p0 = r3;
+
+ r0 = [i0++]; // 2Y
+ r1.l = w[i2++]; // 2u
+ r1.h = w[i3++]; // 2v
+ p0 = p0>>2;
+
+ lsetup (.L0888, .L1888) lc0 = p0;
+
+ /*
+ uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
+ r0 -- used to load 4ys
+ r1 -- used to load 2us,2vs
+ r4 -- y3,y2
+ r5 -- y1,y0
+ r6 -- u1,u0
+ r7 -- v1,v0
+ */
+ r2=[i1++]; // oy
+.L0888:
+ (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
+ (r7,r6) = byteop16m (r1:0, r3:2) (r);
+ r5 = r5 << 2 (v); // y1,y0
+ r4 = r4 << 2 (v); // y3,y2
+ r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
+ r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
+
+ /* Y' = y*cy */
+ a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
+
+ /* R = Y+ crv*(Cr-128) */
+ r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
+ a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
+ r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
+ r2=r2>>16 || B[p1++]=r2;
+ B[p2++]=r2;
+
+ /* B = Y+ cbu*(Cb-128) */
+ r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
+ a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
+ r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
+
+ /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
+ a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
+ r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
+ r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask, oy,cy,zero
+
+ r2=r2>>16 || B[p1++]=r2;
+ B[p2++]=r2;
+
+ r3=r3>>16 || B[p1++]=r3;
+ B[p2++]=r3 || r1=[i1++]; // cy
+
+ p1+=3;
+ p2+=3;
+ /* Y' = y*cy */
+ a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
+
+ /* R = Y+ crv*(Cr-128) */
+ r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
+ a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
+ r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
+ r2=r2>>16 || B[p1++]=r2;
+ B[p2++]=r2;
+
+ /* B = Y+ cbu*(Cb-128) */
+ r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
+ a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
+ r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
+
+ /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
+ a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
+ r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
+ r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++]; // gmask
+ r2=r2>>16 || B[p1++]=r2 || r0 = [i0++]; // 4y
+ B[p2++]=r2 || r1.l = w[i2++]; // 2u
+ r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
+ B[p2++]=r3 || r2=[i1++]; // oy
+
+ p1+=3;
+.L1888: p2+=3;
+
+ l1 = 0;
+
+ (r7:4) = [sp++];
+ unlink;
+ rts;
+DEFUN_END(yuv2rgb24_line)
+
+
+
+#define ARG_vdst 20
+#define ARG_width 24
+#define ARG_height 28
+#define ARG_lumStride 32
+#define ARG_chromStride 36
+#define ARG_srcStride 40
+
+DEFUN(uyvytoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride)):
+ link 0;
+ [--sp] = (r7:4,p5:4);
+
+ p0 = r1; // Y top even
+
+ i2 = r2; // *u
+ r2 = [fp + ARG_vdst];
+ i3 = r2; // *v
+
+ r1 = [fp + ARG_srcStride];
+ r2 = r0 + r1;
+ i0 = r0; // uyvy_T even
+ i1 = r2; // uyvy_B odd
+
+ p2 = [fp + ARG_lumStride];
+ p1 = p0 + p2; // Y bot odd
+
+ p5 = [fp + ARG_width];
+ p4 = [fp + ARG_height];
+ r0 = p5;
+ p4 = p4 >> 1;
+ p5 = p5 >> 2;
+
+ r2 = r0 << 1;
+ r1 = r1 << 1;
+ r1 = r1 - r2; // srcStride + (srcStride - 2*width)
+ r1 += -8; // i0,i1 is pre read need to correct
+ m0 = r1;
+
+ r2 = [fp + ARG_chromStride];
+ r0 = r0 >> 1;
+ r2 = r2 - r0;
+ m1 = r2;
+
+ /* I0,I1 - src input line pointers
+ * p0,p1 - luma output line pointers
+ * I2 - dstU
+ * I3 - dstV
+ */
+
+ lsetup (0f, 1f) lc1 = p4; // H/2
+0: r0 = [i0++] || r2 = [i1++];
+ r1 = [i0++] || r3 = [i1++];
+ r4 = byteop1p(r1:0, r3:2);
+ r5 = byteop1p(r1:0, r3:2) (r);
+ lsetup (2f, 3f) lc0 = p5; // W/4
+2: r0 = r0 >> 8(v);
+ r1 = r1 >> 8(v);
+ r2 = r2 >> 8(v);
+ r3 = r3 >> 8(v);
+ r0 = bytepack(r0, r1);
+ r2 = bytepack(r2, r3) || [p0++] = r0; // yyyy
+ r6 = pack(r5.l, r4.l) || [p1++] = r2; // yyyy
+ r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++];
+ r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++];
+ r4 = byteop1p(r1:0, r3:2) || w[i2++] = r6.l; // uu
+3: r5 = byteop1p(r1:0, r3:2) (r) || w[i3++] = r6.h; // vv
+
+ i0 += m0;
+ i1 += m0;
+ i2 += m1;
+ i3 += m1;
+ p0 = p0 + p2;
+1: p1 = p1 + p2;
+
+ (r7:4,p5:4) = [sp++];
+ unlink;
+ rts;
+DEFUN_END(uyvytoyv12)
+
+DEFUN(yuyvtoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride)):
+ link 0;
+ [--sp] = (r7:4,p5:4);
+
+ p0 = r1; // Y top even
+
+ i2 = r2; // *u
+ r2 = [fp + ARG_vdst];
+ i3 = r2; // *v
+
+ r1 = [fp + ARG_srcStride];
+ r2 = r0 + r1;
+
+ i0 = r0; // uyvy_T even
+ i1 = r2; // uyvy_B odd
+
+ p2 = [fp + ARG_lumStride];
+ p1 = p0 + p2; // Y bot odd
+
+ p5 = [fp + ARG_width];
+ p4 = [fp + ARG_height];
+ r0 = p5;
+ p4 = p4 >> 1;
+ p5 = p5 >> 2;
+
+ r2 = r0 << 1;
+ r1 = r1 << 1;
+ r1 = r1 - r2; // srcStride + (srcStride - 2*width)
+ r1 += -8; // i0,i1 is pre read need to correct
+ m0 = r1;
+
+ r2 = [fp + ARG_chromStride];
+ r0 = r0 >> 1;
+ r2 = r2 - r0;
+ m1 = r2;
+
+ /* I0,I1 - src input line pointers
+ * p0,p1 - luma output line pointers
+ * I2 - dstU
+ * I3 - dstV
+ */
+
+ lsetup (0f, 1f) lc1 = p4; // H/2
+0: r0 = [i0++] || r2 = [i1++];
+ r1 = [i0++] || r3 = [i1++];
+ r4 = bytepack(r0, r1);
+ r5 = bytepack(r2, r3);
+ lsetup (2f, 3f) lc0 = p5; // W/4
+2: r0 = r0 >> 8(v) || [p0++] = r4; // yyyy-even
+ r1 = r1 >> 8(v) || [p1++] = r5; // yyyy-odd
+ r2 = r2 >> 8(v);
+ r3 = r3 >> 8(v);
+ r4 = byteop1p(r1:0, r3:2);
+ r5 = byteop1p(r1:0, r3:2) (r);
+ r6 = pack(r5.l, r4.l);
+ r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++];
+ r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++];
+ r4 = bytepack(r0, r1) || w[i2++] = r6.l; // uu
+3: r5 = bytepack(r2, r3) || w[i3++] = r6.h; // vv
+
+ i0 += m0;
+ i1 += m0;
+ i2 += m1;
+ i3 += m1;
+ p0 = p0 + p2;
+1: p1 = p1 + p2;
+
+ (r7:4,p5:4) = [sp++];
+ unlink;
+ rts;
+DEFUN_END(yuyvtoyv12)
diff --git a/ffmpeg/libswscale/bfin/swscale_bfin.c b/ffmpeg/libswscale/bfin/swscale_bfin.c
new file mode 100644
index 0000000..2b93858
--- /dev/null
+++ b/ffmpeg/libswscale/bfin/swscale_bfin.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
+ *
+ * Blackfin software video scaler operations
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libswscale/swscale_internal.h"
+
+#if defined (__FDPIC__) && CONFIG_SRAM
+#define L1CODE __attribute__((l1_text))
+#else
+#define L1CODE
+#endif
+
+int ff_bfin_uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height,
+ int lumStride, int chromStride, int srcStride) L1CODE;
+
+int ff_bfin_yuyvtoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height,
+ int lumStride, int chromStride, int srcStride) L1CODE;
+
+static int uyvytoyv12_unscaled(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ uint8_t *dsty = dst[0] + dstStride[0] * srcSliceY;
+ uint8_t *dstu = dst[1] + dstStride[1] * srcSliceY / 2;
+ uint8_t *dstv = dst[2] + dstStride[2] * srcSliceY / 2;
+ const uint8_t *ip = src[0] + srcStride[0] * srcSliceY;
+ int w = dstStride[0];
+
+ ff_bfin_uyvytoyv12(ip, dsty, dstu, dstv, w, srcSliceH,
+ dstStride[0], dstStride[1], srcStride[0]);
+
+ return srcSliceH;
+}
+
+static int yuyvtoyv12_unscaled(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ uint8_t *dsty = dst[0] + dstStride[0] * srcSliceY;
+ uint8_t *dstu = dst[1] + dstStride[1] * srcSliceY / 2;
+ uint8_t *dstv = dst[2] + dstStride[2] * srcSliceY / 2;
+ const uint8_t *ip = src[0] + srcStride[0] * srcSliceY;
+ int w = dstStride[0];
+
+ ff_bfin_yuyvtoyv12(ip, dsty, dstu, dstv, w, srcSliceH,
+ dstStride[0], dstStride[1], srcStride[0]);
+
+ return srcSliceH;
+}
+
+void ff_bfin_get_unscaled_swscale(SwsContext *c)
+{
+ if (c->dstFormat == AV_PIX_FMT_YUV420P && c->srcFormat == AV_PIX_FMT_UYVY422) {
+ av_log(NULL, AV_LOG_VERBOSE,
+ "selecting Blackfin optimized uyvytoyv12_unscaled\n");
+ c->swScale = uyvytoyv12_unscaled;
+ }
+ if (c->dstFormat == AV_PIX_FMT_YUV420P && c->srcFormat == AV_PIX_FMT_YUYV422) {
+ av_log(NULL, AV_LOG_VERBOSE,
+ "selecting Blackfin optimized yuyvtoyv12_unscaled\n");
+ c->swScale = yuyvtoyv12_unscaled;
+ }
+}
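
The two hooks above only kick in for the UYVY422/YUYV422 -> YUV420P pairs. The sketch below shows how such an unscaled conversion is driven through the public libswscale API; it is an editorial example, not part of the patch, and on non-Blackfin builds the same call simply takes the generic unscaled path. The helper name and the SWS_BILINEAR flag are arbitrary choices.

#include <stdint.h>
#include <libswscale/swscale.h>

/* Convert one UYVY422 frame to YUV420P; dst_data/dst_linesize must describe
 * an already-allocated YUV420P buffer of the same dimensions. */
static int uyvy_to_yuv420p(const uint8_t *uyvy, int uyvy_stride,
                           uint8_t *dst_data[4], int dst_linesize[4],
                           int w, int h)
{
    const uint8_t *src_data[4] = { uyvy };
    int src_linesize[4]        = { uyvy_stride };
    int out_h;

    struct SwsContext *sws = sws_getContext(w, h, AV_PIX_FMT_UYVY422,
                                            w, h, AV_PIX_FMT_YUV420P,
                                            SWS_BILINEAR, NULL, NULL, NULL);
    if (!sws)
        return -1;

    /* same size and a supported format pair, so this goes through the
     * unscaled converters installed by ff_bfin_get_unscaled_swscale()
     * (or their generic equivalents on other platforms) */
    out_h = sws_scale(sws, (const uint8_t * const *)src_data, src_linesize,
                      0, h, dst_data, dst_linesize);

    sws_freeContext(sws);
    return out_h == h ? 0 : -1;
}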
diff --git a/ffmpeg/libswscale/bfin/yuv2rgb_bfin.c b/ffmpeg/libswscale/bfin/yuv2rgb_bfin.c
new file mode 100644
index 0000000..e1b7afa
--- /dev/null
+++ b/ffmpeg/libswscale/bfin/yuv2rgb_bfin.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
+ *
+ * Blackfin video color space converter operations
+ * convert I420 YV12 to RGB in various formats
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/pixdesc.h"
+#include <stdint.h>
+
+#include "config.h"
+#include "libswscale/swscale_internal.h"
+
+#if defined(__FDPIC__) && CONFIG_SRAM
+#define L1CODE __attribute__((l1_text))
+#else
+#define L1CODE
+#endif
+
+void ff_bfin_yuv2rgb555_line(const uint8_t *Y, const uint8_t *U,
+ const uint8_t *V, uint8_t *out,
+ int w, uint32_t *coeffs) L1CODE;
+
+void ff_bfin_yuv2rgb565_line(const uint8_t *Y, const uint8_t *U,
+ const uint8_t *V, uint8_t *out,
+ int w, uint32_t *coeffs) L1CODE;
+
+void ff_bfin_yuv2rgb24_line(const uint8_t *Y, const uint8_t *U,
+ const uint8_t *V, uint8_t *out,
+ int w, uint32_t *coeffs) L1CODE;
+
+typedef void (*ltransform)(const uint8_t *Y, const uint8_t *U, const uint8_t *V,
+ uint8_t *out, int w, uint32_t *coeffs);
+
+static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks)
+{
+ int oy;
+ oy = c->yOffset & 0xffff;
+ oy = oy >> 3; // keep everything U8.0 for offset calculation
+
+ c->oc = 128 * 0x01010101U;
+ c->oy = oy * 0x01010101U;
+
+ /* copy 64bit vector coeffs down to 32bit vector coeffs */
+ c->cy = c->yCoeff;
+ c->zero = 0;
+
+ if (rgb) {
+ c->crv = c->vrCoeff;
+ c->cbu = c->ubCoeff;
+ c->cgu = c->ugCoeff;
+ c->cgv = c->vgCoeff;
+ } else {
+ c->crv = c->ubCoeff;
+ c->cbu = c->vrCoeff;
+ c->cgu = c->vgCoeff;
+ c->cgv = c->ugCoeff;
+ }
+
+ if (masks == 555) {
+ c->rmask = 0x001f * 0x00010001U;
+ c->gmask = 0x03e0 * 0x00010001U;
+ c->bmask = 0x7c00 * 0x00010001U;
+ } else if (masks == 565) {
+ c->rmask = 0x001f * 0x00010001U;
+ c->gmask = 0x07e0 * 0x00010001U;
+ c->bmask = 0xf800 * 0x00010001U;
+ }
+}
+
+static int core_yuv420_rgb(SwsContext *c, const uint8_t **in, int *instrides,
+ int srcSliceY, int srcSliceH, uint8_t **oplanes,
+ int *outstrides, ltransform lcscf,
+ int rgb, int masks)
+{
+ const uint8_t *py, *pu, *pv;
+ uint8_t *op;
+ int w = instrides[0];
+ int h2 = srcSliceH >> 1;
+ int i;
+
+ bfin_prepare_coefficients(c, rgb, masks);
+
+ py = in[0];
+ pu = in[1 + (1 ^ rgb)];
+ pv = in[1 + (0 ^ rgb)];
+
+ op = oplanes[0] + srcSliceY * outstrides[0];
+
+ for (i = 0; i < h2; i++) {
+ lcscf(py, pu, pv, op, w, &c->oy);
+
+ py += instrides[0];
+ op += outstrides[0];
+
+ lcscf(py, pu, pv, op, w, &c->oy);
+
+ py += instrides[0];
+ pu += instrides[1];
+ pv += instrides[2];
+ op += outstrides[0];
+ }
+
+ return srcSliceH;
+}
+
+static int bfin_yuv420_rgb555(SwsContext *c, const uint8_t **in, int *instrides,
+ int srcSliceY, int srcSliceH,
+ uint8_t **oplanes, int *outstrides)
+{
+ return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
+ outstrides, ff_bfin_yuv2rgb555_line, 1, 555);
+}
+
+static int bfin_yuv420_bgr555(SwsContext *c, const uint8_t **in, int *instrides,
+ int srcSliceY, int srcSliceH,
+ uint8_t **oplanes, int *outstrides)
+{
+ return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
+ outstrides, ff_bfin_yuv2rgb555_line, 0, 555);
+}
+
+static int bfin_yuv420_rgb24(SwsContext *c, const uint8_t **in, int *instrides,
+ int srcSliceY, int srcSliceH,
+ uint8_t **oplanes, int *outstrides)
+{
+ return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
+ outstrides, ff_bfin_yuv2rgb24_line, 1, 888);
+}
+
+static int bfin_yuv420_bgr24(SwsContext *c, const uint8_t **in, int *instrides,
+ int srcSliceY, int srcSliceH,
+ uint8_t **oplanes, int *outstrides)
+{
+ return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
+ outstrides, ff_bfin_yuv2rgb24_line, 0, 888);
+}
+
+static int bfin_yuv420_rgb565(SwsContext *c, const uint8_t **in, int *instrides,
+ int srcSliceY, int srcSliceH,
+ uint8_t **oplanes, int *outstrides)
+{
+ return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
+ outstrides, ff_bfin_yuv2rgb565_line, 1, 565);
+}
+
+static int bfin_yuv420_bgr565(SwsContext *c, const uint8_t **in, int *instrides,
+ int srcSliceY, int srcSliceH,
+ uint8_t **oplanes, int *outstrides)
+{
+ return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
+ outstrides, ff_bfin_yuv2rgb565_line, 0, 565);
+}
+
+SwsFunc ff_yuv2rgb_get_func_ptr_bfin(SwsContext *c)
+{
+ SwsFunc f;
+
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_RGB555:
+ f = bfin_yuv420_rgb555;
+ break;
+ case AV_PIX_FMT_BGR555:
+ f = bfin_yuv420_bgr555;
+ break;
+ case AV_PIX_FMT_RGB565:
+ f = bfin_yuv420_rgb565;
+ break;
+ case AV_PIX_FMT_BGR565:
+ f = bfin_yuv420_bgr565;
+ break;
+ case AV_PIX_FMT_RGB24:
+ f = bfin_yuv420_rgb24;
+ break;
+ case AV_PIX_FMT_BGR24:
+ f = bfin_yuv420_bgr24;
+ break;
+ default:
+ return 0;
+ }
+
+ av_log(c, AV_LOG_INFO, "BlackFin accelerated color space converter %s\n",
+ av_get_pix_fmt_name(c->dstFormat));
+
+ return f;
+}
diff --git a/ffmpeg/libswscale/colorspace-test.c b/ffmpeg/libswscale/colorspace-test.c
new file mode 100644
index 0000000..42a915b
--- /dev/null
+++ b/ffmpeg/libswscale/colorspace-test.c
@@ -0,0 +1,170 @@
+/*
+ * Copyright (C) 2002 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <string.h> /* for memset() */
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "swscale.h"
+#include "rgb2rgb.h"
+#include "libavutil/mem.h"
+
+#define SIZE 1000
+#define srcByte 0x55
+#define dstByte 0xBB
+
+#define FUNC(s, d, n) { s, d, #n, n }
+
+int main(int argc, char **argv)
+{
+ int i, funcNum;
+ uint8_t *srcBuffer = av_malloc(SIZE);
+ uint8_t *dstBuffer = av_malloc(SIZE);
+ int failedNum = 0;
+ int passedNum = 0;
+
+ if (!srcBuffer || !dstBuffer)
+ return -1;
+
+ av_log(NULL, AV_LOG_INFO, "memory corruption test ...\n");
+ sws_rgb2rgb_init();
+
+ for (funcNum = 0; ; funcNum++) {
+ struct func_info_s {
+ int src_bpp;
+ int dst_bpp;
+ const char *name;
+ void (*func)(const uint8_t *src, uint8_t *dst, int src_size);
+ } func_info[] = {
+ FUNC(2, 2, rgb12to15),
+ FUNC(2, 2, rgb15to16),
+ FUNC(2, 3, rgb15to24),
+ FUNC(2, 4, rgb15to32),
+ FUNC(2, 3, rgb16to24),
+ FUNC(2, 4, rgb16to32),
+ FUNC(3, 2, rgb24to15),
+ FUNC(3, 2, rgb24to16),
+ FUNC(3, 4, rgb24to32),
+ FUNC(4, 2, rgb32to15),
+ FUNC(4, 2, rgb32to16),
+ FUNC(4, 3, rgb32to24),
+ FUNC(2, 2, rgb16to15),
+ FUNC(2, 2, rgb12tobgr12),
+ FUNC(2, 2, rgb15tobgr15),
+ FUNC(2, 2, rgb15tobgr16),
+ FUNC(2, 3, rgb15tobgr24),
+ FUNC(2, 4, rgb15tobgr32),
+ FUNC(2, 2, rgb16tobgr15),
+ FUNC(2, 2, rgb16tobgr16),
+ FUNC(2, 3, rgb16tobgr24),
+ FUNC(2, 4, rgb16tobgr32),
+ FUNC(3, 2, rgb24tobgr15),
+ FUNC(3, 2, rgb24tobgr16),
+ FUNC(3, 3, rgb24tobgr24),
+ FUNC(3, 4, rgb24tobgr32),
+ FUNC(4, 2, rgb32tobgr15),
+ FUNC(4, 2, rgb32tobgr16),
+ FUNC(4, 3, rgb32tobgr24),
+ FUNC(4, 4, shuffle_bytes_2103), /* rgb32tobgr32 */
+ FUNC(6, 6, rgb48tobgr48_nobswap),
+ FUNC(6, 6, rgb48tobgr48_bswap),
+ FUNC(8, 6, rgb64to48_nobswap),
+ FUNC(8, 6, rgb64to48_bswap),
+ FUNC(8, 6, rgb64tobgr48_nobswap),
+ FUNC(8, 6, rgb64tobgr48_bswap),
+ FUNC(0, 0, NULL)
+ };
+ int width;
+ int failed = 0;
+ int srcBpp = 0;
+ int dstBpp = 0;
+
+ if (!func_info[funcNum].func)
+ break;
+
+ av_log(NULL, AV_LOG_INFO, ".");
+ memset(srcBuffer, srcByte, SIZE);
+
+ for (width = 63; width > 0; width--) {
+ int dstOffset;
+ for (dstOffset = 128; dstOffset < 196; dstOffset += 4) {
+ int srcOffset;
+ memset(dstBuffer, dstByte, SIZE);
+
+ for (srcOffset = 128; srcOffset < 196; srcOffset += 4) {
+ uint8_t *src = srcBuffer + srcOffset;
+ uint8_t *dst = dstBuffer + dstOffset;
+ const char *name = NULL;
+
+ // don't fill the screen with shit ...
+ if (failed)
+ break;
+
+ srcBpp = func_info[funcNum].src_bpp;
+ dstBpp = func_info[funcNum].dst_bpp;
+ name = func_info[funcNum].name;
+
+ func_info[funcNum].func(src, dst, width * srcBpp);
+
+ if (!srcBpp)
+ break;
+
+ for (i = 0; i < SIZE; i++) {
+ if (srcBuffer[i] != srcByte) {
+ av_log(NULL, AV_LOG_INFO,
+ "src damaged at %d w:%d src:%d dst:%d %s\n",
+ i, width, srcOffset, dstOffset, name);
+ failed = 1;
+ break;
+ }
+ }
+ for (i = 0; i < dstOffset; i++) {
+ if (dstBuffer[i] != dstByte) {
+ av_log(NULL, AV_LOG_INFO,
+ "dst damaged at %d w:%d src:%d dst:%d %s\n",
+ i, width, srcOffset, dstOffset, name);
+ failed = 1;
+ break;
+ }
+ }
+ for (i = dstOffset + width * dstBpp; i < SIZE; i++) {
+ if (dstBuffer[i] != dstByte) {
+ av_log(NULL, AV_LOG_INFO,
+ "dst damaged at %d w:%d src:%d dst:%d %s\n",
+ i, width, srcOffset, dstOffset, name);
+ failed = 1;
+ break;
+ }
+ }
+ }
+ }
+ }
+ if (failed)
+ failedNum++;
+ else if (srcBpp)
+ passedNum++;
+ }
+
+ av_log(NULL, AV_LOG_INFO,
+ "\n%d converters passed, %d converters randomly overwrote memory\n",
+ passedNum, failedNum);
+ return failedNum;
+}
diff --git a/ffmpeg/libswscale/input.c b/ffmpeg/libswscale/input.c
new file mode 100644
index 0000000..2def2de
--- /dev/null
+++ b/ffmpeg/libswscale/input.c
@@ -0,0 +1,1354 @@
+/*
+ * Copyright (C) 2001-2012 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "libavutil/avutil.h"
+#include "libavutil/bswap.h"
+#include "libavutil/cpu.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/avassert.h"
+#include "config.h"
+#include "rgb2rgb.h"
+#include "swscale.h"
+#include "swscale_internal.h"
+
+#define RGB2YUV_SHIFT 15
+#define BY ((int)(0.114 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define BV (-(int)(0.081 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define BU ((int)(0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define GY ((int)(0.587 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define GV (-(int)(0.419 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define GU (-(int)(0.331 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define RY ((int)(0.299 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define RV ((int)(0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define RU (-(int)(0.169 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
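
An editorial aside on the constants above: they are the BT.601 weights, prescaled to limited range and quantized to RGB2YUV_SHIFT = 15 fractional bits. For example:

    RY = round(0.299 * 219/255 * 2^15) = 8414
    GY = round(0.587 * 219/255 * 2^15) = 16519
    BY = round(0.114 * 219/255 * 2^15) = 3208

Their sum, 28141, is roughly (219/255) << 15, so full-scale white lands on the 219-step luma range; the 16-step black level is folded into the rounding constants of the conversion functions below.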
+
+#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
+
+#define r ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE) ? b_r : r_b)
+#define b ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE) ? r_b : b_r)
+
+static av_always_inline void
+rgb64ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
+ enum AVPixelFormat origin)
+{
+ int i;
+ for (i = 0; i < width; i++) {
+ unsigned int r_b = input_pixel(&src[i*4+0]);
+ unsigned int g = input_pixel(&src[i*4+1]);
+ unsigned int b_r = input_pixel(&src[i*4+2]);
+
+ dst[i] = (RY*r + GY*g + BY*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+ }
+}
+
+static av_always_inline void
+rgb64ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
+ const uint16_t *src1, const uint16_t *src2,
+ int width, enum AVPixelFormat origin)
+{
+ int i;
+ av_assert1(src1==src2);
+ for (i = 0; i < width; i++) {
+ int r_b = input_pixel(&src1[i*4+0]);
+ int g = input_pixel(&src1[i*4+1]);
+ int b_r = input_pixel(&src1[i*4+2]);
+
+ dstU[i] = (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+ dstV[i] = (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+ }
+}
+
+static av_always_inline void
+rgb64ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
+ const uint16_t *src1, const uint16_t *src2,
+ int width, enum AVPixelFormat origin)
+{
+ int i;
+ av_assert1(src1==src2);
+ for (i = 0; i < width; i++) {
+ int r_b = (input_pixel(&src1[8 * i + 0]) + input_pixel(&src1[8 * i + 4]) + 1) >> 1;
+ int g = (input_pixel(&src1[8 * i + 1]) + input_pixel(&src1[8 * i + 5]) + 1) >> 1;
+ int b_r = (input_pixel(&src1[8 * i + 2]) + input_pixel(&src1[8 * i + 6]) + 1) >> 1;
+
+ dstU[i]= (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+ dstV[i]= (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+ }
+}
+
+#define rgb64funcs(pattern, BE_LE, origin) \
+static void pattern ## 64 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0, const uint8_t *unused1,\
+ int width, uint32_t *unused) \
+{ \
+ const uint16_t *src = (const uint16_t *) _src; \
+ uint16_t *dst = (uint16_t *) _dst; \
+ rgb64ToY_c_template(dst, src, width, origin); \
+} \
+ \
+static void pattern ## 64 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
+ const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
+ int width, uint32_t *unused) \
+{ \
+ const uint16_t *src1 = (const uint16_t *) _src1, \
+ *src2 = (const uint16_t *) _src2; \
+ uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
+ rgb64ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
+} \
+ \
+static void pattern ## 64 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
+ const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
+ int width, uint32_t *unused) \
+{ \
+ const uint16_t *src1 = (const uint16_t *) _src1, \
+ *src2 = (const uint16_t *) _src2; \
+ uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
+ rgb64ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
+}
+
+rgb64funcs(rgb, LE, AV_PIX_FMT_RGBA64LE)
+rgb64funcs(rgb, BE, AV_PIX_FMT_RGBA64BE)
+
+static av_always_inline void rgb48ToY_c_template(uint16_t *dst,
+ const uint16_t *src, int width,
+ enum AVPixelFormat origin)
+{
+ int i;
+ for (i = 0; i < width; i++) {
+ unsigned int r_b = input_pixel(&src[i * 3 + 0]);
+ unsigned int g = input_pixel(&src[i * 3 + 1]);
+ unsigned int b_r = input_pixel(&src[i * 3 + 2]);
+
+ dst[i] = (RY * r + GY * g + BY * b + (0x2001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+ }
+}
+
+static av_always_inline void rgb48ToUV_c_template(uint16_t *dstU,
+ uint16_t *dstV,
+ const uint16_t *src1,
+ const uint16_t *src2,
+ int width,
+ enum AVPixelFormat origin)
+{
+ int i;
+ av_assert1(src1 == src2);
+ for (i = 0; i < width; i++) {
+ int r_b = input_pixel(&src1[i * 3 + 0]);
+ int g = input_pixel(&src1[i * 3 + 1]);
+ int b_r = input_pixel(&src1[i * 3 + 2]);
+
+ dstU[i] = (RU * r + GU * g + BU * b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+ dstV[i] = (RV * r + GV * g + BV * b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+ }
+}
+
+static av_always_inline void rgb48ToUV_half_c_template(uint16_t *dstU,
+ uint16_t *dstV,
+ const uint16_t *src1,
+ const uint16_t *src2,
+ int width,
+ enum AVPixelFormat origin)
+{
+ int i;
+ av_assert1(src1 == src2);
+ for (i = 0; i < width; i++) {
+ int r_b = (input_pixel(&src1[6 * i + 0]) +
+ input_pixel(&src1[6 * i + 3]) + 1) >> 1;
+ int g = (input_pixel(&src1[6 * i + 1]) +
+ input_pixel(&src1[6 * i + 4]) + 1) >> 1;
+ int b_r = (input_pixel(&src1[6 * i + 2]) +
+ input_pixel(&src1[6 * i + 5]) + 1) >> 1;
+
+ dstU[i] = (RU * r + GU * g + BU * b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+ dstV[i] = (RV * r + GV * g + BV * b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+ }
+}
+
+#undef r
+#undef b
+#undef input_pixel
+
+#define rgb48funcs(pattern, BE_LE, origin) \
+static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, \
+ const uint8_t *_src, \
+ const uint8_t *unused0, const uint8_t *unused1,\
+ int width, \
+ uint32_t *unused) \
+{ \
+ const uint16_t *src = (const uint16_t *)_src; \
+ uint16_t *dst = (uint16_t *)_dst; \
+ rgb48ToY_c_template(dst, src, width, origin); \
+} \
+ \
+static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, \
+ uint8_t *_dstV, \
+ const uint8_t *unused0, \
+ const uint8_t *_src1, \
+ const uint8_t *_src2, \
+ int width, \
+ uint32_t *unused) \
+{ \
+ const uint16_t *src1 = (const uint16_t *)_src1, \
+ *src2 = (const uint16_t *)_src2; \
+ uint16_t *dstU = (uint16_t *)_dstU, \
+ *dstV = (uint16_t *)_dstV; \
+ rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
+} \
+ \
+static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, \
+ uint8_t *_dstV, \
+ const uint8_t *unused0, \
+ const uint8_t *_src1, \
+ const uint8_t *_src2, \
+ int width, \
+ uint32_t *unused) \
+{ \
+ const uint16_t *src1 = (const uint16_t *)_src1, \
+ *src2 = (const uint16_t *)_src2; \
+ uint16_t *dstU = (uint16_t *)_dstU, \
+ *dstV = (uint16_t *)_dstV; \
+ rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
+}
+
+rgb48funcs(rgb, LE, AV_PIX_FMT_RGB48LE)
+rgb48funcs(rgb, BE, AV_PIX_FMT_RGB48BE)
+rgb48funcs(bgr, LE, AV_PIX_FMT_BGR48LE)
+rgb48funcs(bgr, BE, AV_PIX_FMT_BGR48BE)
+
+#define input_pixel(i) ((origin == AV_PIX_FMT_RGBA || \
+ origin == AV_PIX_FMT_BGRA || \
+ origin == AV_PIX_FMT_ARGB || \
+ origin == AV_PIX_FMT_ABGR) \
+ ? AV_RN32A(&src[(i) * 4]) \
+ : (isBE(origin) ? AV_RB16(&src[(i) * 2]) \
+ : AV_RL16(&src[(i) * 2])))
+
+static av_always_inline void rgb16_32ToY_c_template(int16_t *dst,
+ const uint8_t *src,
+ int width,
+ enum AVPixelFormat origin,
+ int shr, int shg,
+ int shb, int shp,
+ int maskr, int maskg,
+ int maskb, int rsh,
+ int gsh, int bsh, int S)
+{
+ const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh;
+ const unsigned rnd = (32<<((S)-1)) + (1<<(S-7));
+ int i;
+
+ for (i = 0; i < width; i++) {
+ int px = input_pixel(i) >> shp;
+ int b = (px & maskb) >> shb;
+ int g = (px & maskg) >> shg;
+ int r = (px & maskr) >> shr;
+
+ dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
+ }
+}
+
+static av_always_inline void rgb16_32ToUV_c_template(int16_t *dstU,
+ int16_t *dstV,
+ const uint8_t *src,
+ int width,
+ enum AVPixelFormat origin,
+ int shr, int shg,
+ int shb, int shp,
+ int maskr, int maskg,
+ int maskb, int rsh,
+ int gsh, int bsh, int S)
+{
+ const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
+ rv = RV << rsh, gv = GV << gsh, bv = BV << bsh;
+ const unsigned rnd = (256u<<((S)-1)) + (1<<(S-7));
+ int i;
+
+ for (i = 0; i < width; i++) {
+ int px = input_pixel(i) >> shp;
+ int b = (px & maskb) >> shb;
+ int g = (px & maskg) >> shg;
+ int r = (px & maskr) >> shr;
+
+ dstU[i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6);
+ dstV[i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6);
+ }
+}
+
+static av_always_inline void rgb16_32ToUV_half_c_template(int16_t *dstU,
+ int16_t *dstV,
+ const uint8_t *src,
+ int width,
+ enum AVPixelFormat origin,
+ int shr, int shg,
+ int shb, int shp,
+ int maskr, int maskg,
+ int maskb, int rsh,
+ int gsh, int bsh, int S)
+{
+ const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
+ rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
+ maskgx = ~(maskr | maskb);
+ const unsigned rnd = (256U<<(S)) + (1<<(S-6));
+ int i;
+
+ maskr |= maskr << 1;
+ maskb |= maskb << 1;
+ maskg |= maskg << 1;
+ for (i = 0; i < width; i++) {
+ int px0 = input_pixel(2 * i + 0) >> shp;
+ int px1 = input_pixel(2 * i + 1) >> shp;
+ int b, r, g = (px0 & maskgx) + (px1 & maskgx);
+ int rb = px0 + px1 - g;
+
+ b = (rb & maskb) >> shb;
+ if (shp ||
+ origin == AV_PIX_FMT_BGR565LE || origin == AV_PIX_FMT_BGR565BE ||
+ origin == AV_PIX_FMT_RGB565LE || origin == AV_PIX_FMT_RGB565BE) {
+ g >>= shg;
+ } else {
+ g = (g & maskg) >> shg;
+ }
+ r = (rb & maskr) >> shr;
+
+ dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
+ dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
+ }
+}
+
+#undef input_pixel
+
+#define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
+ maskg, maskb, rsh, gsh, bsh, S) \
+static void name ## ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, \
+ int width, uint32_t *unused) \
+{ \
+ rgb16_32ToY_c_template((int16_t*)dst, src, width, fmt, shr, shg, shb, shp, \
+ maskr, maskg, maskb, rsh, gsh, bsh, S); \
+} \
+ \
+static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
+ const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy, \
+ int width, uint32_t *unused) \
+{ \
+ rgb16_32ToUV_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \
+ shr, shg, shb, shp, \
+ maskr, maskg, maskb, rsh, gsh, bsh, S); \
+} \
+ \
+static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
+ const uint8_t *unused0, const uint8_t *src, \
+ const uint8_t *dummy, \
+ int width, uint32_t *unused) \
+{ \
+ rgb16_32ToUV_half_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \
+ shr, shg, shb, shp, \
+ maskr, maskg, maskb, \
+ rsh, gsh, bsh, S); \
+}
+
+rgb16_32_wrapper(AV_PIX_FMT_BGR32, bgr32, 16, 0, 0, 0, 0xFF0000, 0xFF00, 0x00FF, 8, 0, 8, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_BGR32_1, bgr321, 16, 0, 0, 8, 0xFF0000, 0xFF00, 0x00FF, 8, 0, 8, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_RGB32, rgb32, 0, 0, 16, 0, 0x00FF, 0xFF00, 0xFF0000, 8, 0, 8, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_RGB32_1, rgb321, 0, 0, 16, 8, 0x00FF, 0xFF00, 0xFF0000, 8, 0, 8, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_BGR565LE, bgr16le, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, 11, 5, 0, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_BGR555LE, bgr15le, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, 10, 5, 0, RGB2YUV_SHIFT + 7)
+rgb16_32_wrapper(AV_PIX_FMT_BGR444LE, bgr12le, 0, 0, 0, 0, 0x000F, 0x00F0, 0x0F00, 8, 4, 0, RGB2YUV_SHIFT + 4)
+rgb16_32_wrapper(AV_PIX_FMT_RGB565LE, rgb16le, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, 0, 5, 11, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_RGB555LE, rgb15le, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, 0, 5, 10, RGB2YUV_SHIFT + 7)
+rgb16_32_wrapper(AV_PIX_FMT_RGB444LE, rgb12le, 0, 0, 0, 0, 0x0F00, 0x00F0, 0x000F, 0, 4, 8, RGB2YUV_SHIFT + 4)
+rgb16_32_wrapper(AV_PIX_FMT_BGR565BE, bgr16be, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, 11, 5, 0, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_BGR555BE, bgr15be, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, 10, 5, 0, RGB2YUV_SHIFT + 7)
+rgb16_32_wrapper(AV_PIX_FMT_BGR444BE, bgr12be, 0, 0, 0, 0, 0x000F, 0x00F0, 0x0F00, 8, 4, 0, RGB2YUV_SHIFT + 4)
+rgb16_32_wrapper(AV_PIX_FMT_RGB565BE, rgb16be, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, 0, 5, 11, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_RGB555BE, rgb15be, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, 0, 5, 10, RGB2YUV_SHIFT + 7)
+rgb16_32_wrapper(AV_PIX_FMT_RGB444BE, rgb12be, 0, 0, 0, 0, 0x0F00, 0x00F0, 0x000F, 0, 4, 8, RGB2YUV_SHIFT + 4)
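
A worked reading of one instantiation above (editorial, for orientation): for AV_PIX_FMT_RGB565LE the arguments are shr = shg = shb = shp = 0, maskr = 0xF800, maskg = 0x07E0, maskb = 0x001F, rsh = 0, gsh = 5, bsh = 11 and S = RGB2YUV_SHIFT + 8. The red field is left at bit 11 after masking, so RY * r carries an extra factor of 2^11 = 2^3 (5 vs 8 bits of depth) * 2^8; green (6 bits at bit 5, coefficient GY << 5) and blue (5 bits at bit 0, coefficient BY << 11) pick up the same extra 2^8. That common 2^8 is why S is RGB2YUV_SHIFT + 8 for the 565 formats (and +7 for 555, +4 for 444), and the final >> ((S)-6) leaves the usual 6 extra fractional bits, as in the other ToY/ToUV functions.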
+
+static void gbr24pToUV_half_c(uint8_t *_dstU, uint8_t *_dstV,
+ const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
+ int width, uint32_t *unused)
+{
+ uint16_t *dstU = (uint16_t *)_dstU;
+ uint16_t *dstV = (uint16_t *)_dstV;
+ int i;
+ for (i = 0; i < width; i++) {
+ unsigned int g = gsrc[2*i] + gsrc[2*i+1];
+ unsigned int b = bsrc[2*i] + bsrc[2*i+1];
+ unsigned int r = rsrc[2*i] + rsrc[2*i+1];
+
+ dstU[i] = (RU*r + GU*g + BU*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
+ dstV[i] = (RV*r + GV*g + BV*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
+ }
+}
+
+static void rgba64ToA_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
+ const uint8_t *unused2, int width, uint32_t *unused)
+{
+ int16_t *dst = (int16_t *)_dst;
+ const uint16_t *src = (const uint16_t *)_src;
+ int i;
+ for (i = 0; i < width; i++)
+ dst[i] = src[4 * i + 3];
+}
+
+static void abgrToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
+{
+ int16_t *dst = (int16_t *)_dst;
+ int i;
+ for (i=0; i<width; i++) {
+ dst[i]= src[4*i]<<6;
+ }
+}
+
+static void rgbaToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
+{
+ int16_t *dst = (int16_t *)_dst;
+ int i;
+ for (i=0; i<width; i++) {
+ dst[i]= src[4*i+3]<<6;
+ }
+}
+
+static void palToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
+{
+ int16_t *dst = (int16_t *)_dst;
+ int i;
+ for (i=0; i<width; i++) {
+ int d= src[i];
+
+ dst[i]= (pal[d] >> 24)<<6;
+ }
+}
+
+static void palToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
+{
+ int16_t *dst = (int16_t *)_dst;
+ int i;
+ for (i = 0; i < width; i++) {
+ int d = src[i];
+
+ dst[i] = (pal[d] & 0xFF)<<6;
+ }
+}
+
+static void palToUV_c(uint8_t *_dstU, uint8_t *_dstV,
+ const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+ int width, uint32_t *pal)
+{
+ uint16_t *dstU = (uint16_t *)_dstU;
+ int16_t *dstV = (int16_t *)_dstV;
+ int i;
+ av_assert1(src1 == src2);
+ for (i = 0; i < width; i++) {
+ int p = pal[src1[i]];
+
+ dstU[i] = (uint8_t)(p>> 8)<<6;
+ dstV[i] = (uint8_t)(p>>16)<<6;
+ }
+}
+
+static void monowhite2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
+{
+ int16_t *dst = (int16_t *)_dst;
+ int i, j;
+ width = (width + 7) >> 3;
+ for (i = 0; i < width; i++) {
+ int d = ~src[i];
+ for (j = 0; j < 8; j++)
+ dst[8*i+j]= ((d>>(7-j))&1) * 16383;
+ }
+ if(width&7){
+ int d= ~src[i];
+ for (j = 0; j < (width&7); j++)
+ dst[8*i+j]= ((d>>(7-j))&1) * 16383;
+ }
+}
+
+static void monoblack2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
+{
+ int16_t *dst = (int16_t *)_dst;
+ int i, j;
+ width = (width + 7) >> 3;
+ for (i = 0; i < width; i++) {
+ int d = src[i];
+ for (j = 0; j < 8; j++)
+ dst[8*i+j]= ((d>>(7-j))&1) * 16383;
+ }
+ if(width&7){
+ int d = src[i];
+ for (j = 0; j < (width&7); j++)
+ dst[8*i+j] = ((d>>(7-j))&1) * 16383;
+ }
+}
+
+static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
+ uint32_t *unused)
+{
+ int i;
+ for (i = 0; i < width; i++)
+ dst[i] = src[2 * i];
+}
+
+static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused)
+{
+ int i;
+ for (i = 0; i < width; i++) {
+ dstU[i] = src1[4 * i + 1];
+ dstV[i] = src1[4 * i + 3];
+ }
+ av_assert1(src1 == src2);
+}
+
+static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1, const uint8_t *unused2, int width,
+ uint32_t *unused)
+{
+ int i;
+ const uint16_t *src = (const uint16_t *)_src;
+ uint16_t *dst = (uint16_t *)_dst;
+ for (i = 0; i < width; i++)
+ dst[i] = av_bswap16(src[i]);
+}
+
+static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *_src1,
+ const uint8_t *_src2, int width, uint32_t *unused)
+{
+ int i;
+ const uint16_t *src1 = (const uint16_t *)_src1,
+ *src2 = (const uint16_t *)_src2;
+ uint16_t *dstU = (uint16_t *)_dstU, *dstV = (uint16_t *)_dstV;
+ for (i = 0; i < width; i++) {
+ dstU[i] = av_bswap16(src1[i]);
+ dstV[i] = av_bswap16(src2[i]);
+ }
+}
+
+/* This is almost identical to the previous one, and exists only because
+ * yuy2ToY/UV(dst, src + 1, ...) would have 100% unaligned accesses. */
+static void uyvyToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
+ uint32_t *unused)
+{
+ int i;
+ for (i = 0; i < width; i++)
+ dst[i] = src[2 * i + 1];
+}
+
+static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused)
+{
+ int i;
+ for (i = 0; i < width; i++) {
+ dstU[i] = src1[4 * i + 0];
+ dstV[i] = src1[4 * i + 2];
+ }
+ av_assert1(src1 == src2);
+}
+
+static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
+ const uint8_t *src, int width)
+{
+ int i;
+ for (i = 0; i < width; i++) {
+ dst1[i] = src[2 * i + 0];
+ dst2[i] = src[2 * i + 1];
+ }
+}
+
+static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+ int width, uint32_t *unused)
+{
+ nvXXtoUV_c(dstU, dstV, src1, width);
+}
+
+static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+ int width, uint32_t *unused)
+{
+ nvXXtoUV_c(dstV, dstU, src1, width);
+}
+
+#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
+
+static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *unused)
+{
+ int16_t *dst = (int16_t *)_dst;
+ int i;
+ for (i = 0; i < width; i++) {
+ int b = src[i * 3 + 0];
+ int g = src[i * 3 + 1];
+ int r = src[i * 3 + 2];
+
+ dst[i] = ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
+ }
+}
+
+static void bgr24ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused)
+{
+ int16_t *dstU = (int16_t *)_dstU;
+ int16_t *dstV = (int16_t *)_dstV;
+ int i;
+ for (i = 0; i < width; i++) {
+ int b = src1[3 * i + 0];
+ int g = src1[3 * i + 1];
+ int r = src1[3 * i + 2];
+
+ dstU[i] = (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+ dstV[i] = (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+ }
+ av_assert1(src1 == src2);
+}
+
+static void bgr24ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused)
+{
+ int16_t *dstU = (int16_t *)_dstU;
+ int16_t *dstV = (int16_t *)_dstV;
+ int i;
+ for (i = 0; i < width; i++) {
+ int b = src1[6 * i + 0] + src1[6 * i + 3];
+ int g = src1[6 * i + 1] + src1[6 * i + 4];
+ int r = src1[6 * i + 2] + src1[6 * i + 5];
+
+ dstU[i] = (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
+ dstV[i] = (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
+ }
+ av_assert1(src1 == src2);
+}
+
+static void rgb24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
+ uint32_t *unused)
+{
+ int16_t *dst = (int16_t *)_dst;
+ int i;
+ for (i = 0; i < width; i++) {
+ int r = src[i * 3 + 0];
+ int g = src[i * 3 + 1];
+ int b = src[i * 3 + 2];
+
+ dst[i] = ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
+ }
+}
+
+static void rgb24ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused)
+{
+ int16_t *dstU = (int16_t *)_dstU;
+ int16_t *dstV = (int16_t *)_dstV;
+ int i;
+ av_assert1(src1 == src2);
+ for (i = 0; i < width; i++) {
+ int r = src1[3 * i + 0];
+ int g = src1[3 * i + 1];
+ int b = src1[3 * i + 2];
+
+ dstU[i] = (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+ dstV[i] = (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+ }
+}
+
+static void rgb24ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused)
+{
+ int16_t *dstU = (int16_t *)_dstU;
+ int16_t *dstV = (int16_t *)_dstV;
+ int i;
+ av_assert1(src1 == src2);
+ for (i = 0; i < width; i++) {
+ int r = src1[6 * i + 0] + src1[6 * i + 3];
+ int g = src1[6 * i + 1] + src1[6 * i + 4];
+ int b = src1[6 * i + 2] + src1[6 * i + 5];
+
+ dstU[i] = (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
+ dstV[i] = (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
+ }
+}
+
+static void planar_rgb_to_y(uint8_t *_dst, const uint8_t *src[4], int width)
+{
+ uint16_t *dst = (uint16_t *)_dst;
+ int i;
+ for (i = 0; i < width; i++) {
+ int g = src[0][i];
+ int b = src[1][i];
+ int r = src[2][i];
+
+ dst[i] = (RY*r + GY*g + BY*b + (0x801<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
+ }
+}
+
+static void planar_rgb_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4], int width)
+{
+ uint16_t *dstU = (uint16_t *)_dstU;
+ uint16_t *dstV = (uint16_t *)_dstV;
+ int i;
+ for (i = 0; i < width; i++) {
+ int g = src[0][i];
+ int b = src[1][i];
+ int r = src[2][i];
+
+ dstU[i] = (RU*r + GU*g + BU*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
+ dstV[i] = (RV*r + GV*g + BV*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
+ }
+}
+
+#define rdpx(src) \
+ is_be ? AV_RB16(src) : AV_RL16(src)
+static av_always_inline void planar_rgb16_to_y(uint8_t *_dst, const uint8_t *_src[4],
+ int width, int bpc, int is_be)
+{
+ int i;
+ const uint16_t **src = (const uint16_t **)_src;
+ uint16_t *dst = (uint16_t *)_dst;
+ for (i = 0; i < width; i++) {
+ int g = rdpx(src[0] + i);
+ int b = rdpx(src[1] + i);
+ int r = rdpx(src[2] + i);
+
+ dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT + bpc - 9))) >> (RGB2YUV_SHIFT + bpc - 14));
+ }
+}
+
+static void planar_rgb9le_to_y(uint8_t *dst, const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_y(dst, src, w, 9, 0);
+}
+
+static void planar_rgb9be_to_y(uint8_t *dst, const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_y(dst, src, w, 9, 1);
+}
+
+static void planar_rgb10le_to_y(uint8_t *dst, const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_y(dst, src, w, 10, 0);
+}
+
+static void planar_rgb10be_to_y(uint8_t *dst, const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_y(dst, src, w, 10, 1);
+}
+
+static void planar_rgb12le_to_y(uint8_t *dst, const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_y(dst, src, w, 12, 0);
+}
+
+static void planar_rgb12be_to_y(uint8_t *dst, const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_y(dst, src, w, 12, 1);
+}
+
+static void planar_rgb14le_to_y(uint8_t *dst, const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_y(dst, src, w, 14, 0);
+}
+
+static void planar_rgb14be_to_y(uint8_t *dst, const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_y(dst, src, w, 14, 1);
+}
+
+static void planar_rgb16le_to_y(uint8_t *dst, const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_y(dst, src, w, 16, 0);
+}
+
+static void planar_rgb16be_to_y(uint8_t *dst, const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_y(dst, src, w, 16, 1);
+}
+
+static av_always_inline void planar_rgb16_to_uv(uint8_t *_dstU, uint8_t *_dstV,
+ const uint8_t *_src[4], int width,
+ int bpc, int is_be)
+{
+ int i;
+ const uint16_t **src = (const uint16_t **)_src;
+ uint16_t *dstU = (uint16_t *)_dstU;
+ uint16_t *dstV = (uint16_t *)_dstV;
+ for (i = 0; i < width; i++) {
+ int g = rdpx(src[0] + i);
+ int b = rdpx(src[1] + i);
+ int r = rdpx(src[2] + i);
+
+ dstU[i] = (RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT + bpc - 9))) >> (RGB2YUV_SHIFT + bpc - 14);
+ dstV[i] = (RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT + bpc - 9))) >> (RGB2YUV_SHIFT + bpc - 14);
+ }
+}
+#undef rdpx
+
+static void planar_rgb9le_to_uv(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_uv(dstU, dstV, src, w, 9, 0);
+}
+
+static void planar_rgb9be_to_uv(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_uv(dstU, dstV, src, w, 9, 1);
+}
+
+static void planar_rgb10le_to_uv(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_uv(dstU, dstV, src, w, 10, 0);
+}
+
+static void planar_rgb10be_to_uv(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_uv(dstU, dstV, src, w, 10, 1);
+}
+
+static void planar_rgb12le_to_uv(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_uv(dstU, dstV, src, w, 12, 0);
+}
+
+static void planar_rgb12be_to_uv(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_uv(dstU, dstV, src, w, 12, 1);
+}
+
+static void planar_rgb14le_to_uv(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_uv(dstU, dstV, src, w, 14, 0);
+}
+
+static void planar_rgb14be_to_uv(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_uv(dstU, dstV, src, w, 14, 1);
+}
+
+static void planar_rgb16le_to_uv(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_uv(dstU, dstV, src, w, 16, 0);
+}
+
+static void planar_rgb16be_to_uv(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *src[4], int w)
+{
+ planar_rgb16_to_uv(dstU, dstV, src, w, 16, 1);
+}
+
+av_cold void ff_sws_init_input_funcs(SwsContext *c)
+{
+ enum AVPixelFormat srcFormat = c->srcFormat;
+
+ c->chrToYV12 = NULL;
+ switch (srcFormat) {
+ case AV_PIX_FMT_YUYV422:
+ c->chrToYV12 = yuy2ToUV_c;
+ break;
+ case AV_PIX_FMT_UYVY422:
+ c->chrToYV12 = uyvyToUV_c;
+ break;
+ case AV_PIX_FMT_NV12:
+ c->chrToYV12 = nv12ToUV_c;
+ break;
+ case AV_PIX_FMT_NV21:
+ c->chrToYV12 = nv21ToUV_c;
+ break;
+ case AV_PIX_FMT_RGB8:
+ case AV_PIX_FMT_BGR8:
+ case AV_PIX_FMT_PAL8:
+ case AV_PIX_FMT_BGR4_BYTE:
+ case AV_PIX_FMT_RGB4_BYTE:
+ c->chrToYV12 = palToUV_c;
+ break;
+ case AV_PIX_FMT_GBRP9LE:
+ c->readChrPlanar = planar_rgb9le_to_uv;
+ break;
+ case AV_PIX_FMT_GBRP10LE:
+ c->readChrPlanar = planar_rgb10le_to_uv;
+ break;
+ case AV_PIX_FMT_GBRP12LE:
+ c->readChrPlanar = planar_rgb12le_to_uv;
+ break;
+ case AV_PIX_FMT_GBRP14LE:
+ c->readChrPlanar = planar_rgb14le_to_uv;
+ break;
+ case AV_PIX_FMT_GBRP16LE:
+ c->readChrPlanar = planar_rgb16le_to_uv;
+ break;
+ case AV_PIX_FMT_GBRP9BE:
+ c->readChrPlanar = planar_rgb9be_to_uv;
+ break;
+ case AV_PIX_FMT_GBRP10BE:
+ c->readChrPlanar = planar_rgb10be_to_uv;
+ break;
+ case AV_PIX_FMT_GBRP12BE:
+ c->readChrPlanar = planar_rgb12be_to_uv;
+ break;
+ case AV_PIX_FMT_GBRP14BE:
+ c->readChrPlanar = planar_rgb14be_to_uv;
+ break;
+ case AV_PIX_FMT_GBRP16BE:
+ c->readChrPlanar = planar_rgb16be_to_uv;
+ break;
+ case AV_PIX_FMT_GBRP:
+ c->readChrPlanar = planar_rgb_to_uv;
+ break;
+#if HAVE_BIGENDIAN
+ case AV_PIX_FMT_YUV444P9LE:
+ case AV_PIX_FMT_YUV422P9LE:
+ case AV_PIX_FMT_YUV420P9LE:
+ case AV_PIX_FMT_YUV422P10LE:
+ case AV_PIX_FMT_YUV444P10LE:
+ case AV_PIX_FMT_YUV420P10LE:
+ case AV_PIX_FMT_YUV422P12LE:
+ case AV_PIX_FMT_YUV444P12LE:
+ case AV_PIX_FMT_YUV420P12LE:
+ case AV_PIX_FMT_YUV422P14LE:
+ case AV_PIX_FMT_YUV444P14LE:
+ case AV_PIX_FMT_YUV420P14LE:
+ case AV_PIX_FMT_YUV420P16LE:
+ case AV_PIX_FMT_YUV422P16LE:
+ case AV_PIX_FMT_YUV444P16LE:
+
+ case AV_PIX_FMT_YUVA444P9LE:
+ case AV_PIX_FMT_YUVA422P9LE:
+ case AV_PIX_FMT_YUVA420P9LE:
+ case AV_PIX_FMT_YUVA444P10LE:
+ case AV_PIX_FMT_YUVA422P10LE:
+ case AV_PIX_FMT_YUVA420P10LE:
+ case AV_PIX_FMT_YUVA420P16LE:
+ case AV_PIX_FMT_YUVA422P16LE:
+ case AV_PIX_FMT_YUVA444P16LE:
+ c->chrToYV12 = bswap16UV_c;
+ break;
+#else
+ case AV_PIX_FMT_YUV444P9BE:
+ case AV_PIX_FMT_YUV422P9BE:
+ case AV_PIX_FMT_YUV420P9BE:
+ case AV_PIX_FMT_YUV444P10BE:
+ case AV_PIX_FMT_YUV422P10BE:
+ case AV_PIX_FMT_YUV420P10BE:
+ case AV_PIX_FMT_YUV444P12BE:
+ case AV_PIX_FMT_YUV422P12BE:
+ case AV_PIX_FMT_YUV420P12BE:
+ case AV_PIX_FMT_YUV444P14BE:
+ case AV_PIX_FMT_YUV422P14BE:
+ case AV_PIX_FMT_YUV420P14BE:
+ case AV_PIX_FMT_YUV420P16BE:
+ case AV_PIX_FMT_YUV422P16BE:
+ case AV_PIX_FMT_YUV444P16BE:
+
+ case AV_PIX_FMT_YUVA444P9BE:
+ case AV_PIX_FMT_YUVA422P9BE:
+ case AV_PIX_FMT_YUVA420P9BE:
+ case AV_PIX_FMT_YUVA444P10BE:
+ case AV_PIX_FMT_YUVA422P10BE:
+ case AV_PIX_FMT_YUVA420P10BE:
+ case AV_PIX_FMT_YUVA420P16BE:
+ case AV_PIX_FMT_YUVA422P16BE:
+ case AV_PIX_FMT_YUVA444P16BE:
+ c->chrToYV12 = bswap16UV_c;
+ break;
+#endif
+ }
+ if (c->chrSrcHSubSample) {
+ switch (srcFormat) {
+ case AV_PIX_FMT_RGBA64BE:
+ c->chrToYV12 = rgb64BEToUV_half_c;
+ break;
+ case AV_PIX_FMT_RGBA64LE:
+ c->chrToYV12 = rgb64LEToUV_half_c;
+ break;
+ case AV_PIX_FMT_RGB48BE:
+ c->chrToYV12 = rgb48BEToUV_half_c;
+ break;
+ case AV_PIX_FMT_RGB48LE:
+ c->chrToYV12 = rgb48LEToUV_half_c;
+ break;
+ case AV_PIX_FMT_BGR48BE:
+ c->chrToYV12 = bgr48BEToUV_half_c;
+ break;
+ case AV_PIX_FMT_BGR48LE:
+ c->chrToYV12 = bgr48LEToUV_half_c;
+ break;
+ case AV_PIX_FMT_RGB32:
+ c->chrToYV12 = bgr32ToUV_half_c;
+ break;
+ case AV_PIX_FMT_RGB32_1:
+ c->chrToYV12 = bgr321ToUV_half_c;
+ break;
+ case AV_PIX_FMT_BGR24:
+ c->chrToYV12 = bgr24ToUV_half_c;
+ break;
+ case AV_PIX_FMT_BGR565LE:
+ c->chrToYV12 = bgr16leToUV_half_c;
+ break;
+ case AV_PIX_FMT_BGR565BE:
+ c->chrToYV12 = bgr16beToUV_half_c;
+ break;
+ case AV_PIX_FMT_BGR555LE:
+ c->chrToYV12 = bgr15leToUV_half_c;
+ break;
+ case AV_PIX_FMT_BGR555BE:
+ c->chrToYV12 = bgr15beToUV_half_c;
+ break;
+ case AV_PIX_FMT_GBR24P :
+ c->chrToYV12 = gbr24pToUV_half_c;
+ break;
+ case AV_PIX_FMT_BGR444LE:
+ c->chrToYV12 = bgr12leToUV_half_c;
+ break;
+ case AV_PIX_FMT_BGR444BE:
+ c->chrToYV12 = bgr12beToUV_half_c;
+ break;
+ case AV_PIX_FMT_BGR32:
+ c->chrToYV12 = rgb32ToUV_half_c;
+ break;
+ case AV_PIX_FMT_BGR32_1:
+ c->chrToYV12 = rgb321ToUV_half_c;
+ break;
+ case AV_PIX_FMT_RGB24:
+ c->chrToYV12 = rgb24ToUV_half_c;
+ break;
+ case AV_PIX_FMT_RGB565LE:
+ c->chrToYV12 = rgb16leToUV_half_c;
+ break;
+ case AV_PIX_FMT_RGB565BE:
+ c->chrToYV12 = rgb16beToUV_half_c;
+ break;
+ case AV_PIX_FMT_RGB555LE:
+ c->chrToYV12 = rgb15leToUV_half_c;
+ break;
+ case AV_PIX_FMT_RGB555BE:
+ c->chrToYV12 = rgb15beToUV_half_c;
+ break;
+ case AV_PIX_FMT_RGB444LE:
+ c->chrToYV12 = rgb12leToUV_half_c;
+ break;
+ case AV_PIX_FMT_RGB444BE:
+ c->chrToYV12 = rgb12beToUV_half_c;
+ break;
+ }
+ } else {
+ switch (srcFormat) {
+ case AV_PIX_FMT_RGBA64BE:
+ c->chrToYV12 = rgb64BEToUV_c;
+ break;
+ case AV_PIX_FMT_RGBA64LE:
+ c->chrToYV12 = rgb64LEToUV_c;
+ break;
+ case AV_PIX_FMT_RGB48BE:
+ c->chrToYV12 = rgb48BEToUV_c;
+ break;
+ case AV_PIX_FMT_RGB48LE:
+ c->chrToYV12 = rgb48LEToUV_c;
+ break;
+ case AV_PIX_FMT_BGR48BE:
+ c->chrToYV12 = bgr48BEToUV_c;
+ break;
+ case AV_PIX_FMT_BGR48LE:
+ c->chrToYV12 = bgr48LEToUV_c;
+ break;
+ case AV_PIX_FMT_RGB32:
+ c->chrToYV12 = bgr32ToUV_c;
+ break;
+ case AV_PIX_FMT_RGB32_1:
+ c->chrToYV12 = bgr321ToUV_c;
+ break;
+ case AV_PIX_FMT_BGR24:
+ c->chrToYV12 = bgr24ToUV_c;
+ break;
+ case AV_PIX_FMT_BGR565LE:
+ c->chrToYV12 = bgr16leToUV_c;
+ break;
+ case AV_PIX_FMT_BGR565BE:
+ c->chrToYV12 = bgr16beToUV_c;
+ break;
+ case AV_PIX_FMT_BGR555LE:
+ c->chrToYV12 = bgr15leToUV_c;
+ break;
+ case AV_PIX_FMT_BGR555BE:
+ c->chrToYV12 = bgr15beToUV_c;
+ break;
+ case AV_PIX_FMT_BGR444LE:
+ c->chrToYV12 = bgr12leToUV_c;
+ break;
+ case AV_PIX_FMT_BGR444BE:
+ c->chrToYV12 = bgr12beToUV_c;
+ break;
+ case AV_PIX_FMT_BGR32:
+ c->chrToYV12 = rgb32ToUV_c;
+ break;
+ case AV_PIX_FMT_BGR32_1:
+ c->chrToYV12 = rgb321ToUV_c;
+ break;
+ case AV_PIX_FMT_RGB24:
+ c->chrToYV12 = rgb24ToUV_c;
+ break;
+ case AV_PIX_FMT_RGB565LE:
+ c->chrToYV12 = rgb16leToUV_c;
+ break;
+ case AV_PIX_FMT_RGB565BE:
+ c->chrToYV12 = rgb16beToUV_c;
+ break;
+ case AV_PIX_FMT_RGB555LE:
+ c->chrToYV12 = rgb15leToUV_c;
+ break;
+ case AV_PIX_FMT_RGB555BE:
+ c->chrToYV12 = rgb15beToUV_c;
+ break;
+ case AV_PIX_FMT_RGB444LE:
+ c->chrToYV12 = rgb12leToUV_c;
+ break;
+ case AV_PIX_FMT_RGB444BE:
+ c->chrToYV12 = rgb12beToUV_c;
+ break;
+ }
+ }
+
+ c->lumToYV12 = NULL;
+ c->alpToYV12 = NULL;
+ switch (srcFormat) {
+ case AV_PIX_FMT_GBRP9LE:
+ c->readLumPlanar = planar_rgb9le_to_y;
+ break;
+ case AV_PIX_FMT_GBRP10LE:
+ c->readLumPlanar = planar_rgb10le_to_y;
+ break;
+ case AV_PIX_FMT_GBRP12LE:
+ c->readLumPlanar = planar_rgb12le_to_y;
+ break;
+ case AV_PIX_FMT_GBRP14LE:
+ c->readLumPlanar = planar_rgb14le_to_y;
+ break;
+ case AV_PIX_FMT_GBRP16LE:
+ c->readLumPlanar = planar_rgb16le_to_y;
+ break;
+ case AV_PIX_FMT_GBRP9BE:
+ c->readLumPlanar = planar_rgb9be_to_y;
+ break;
+ case AV_PIX_FMT_GBRP10BE:
+ c->readLumPlanar = planar_rgb10be_to_y;
+ break;
+ case AV_PIX_FMT_GBRP12BE:
+ c->readLumPlanar = planar_rgb12be_to_y;
+ break;
+ case AV_PIX_FMT_GBRP14BE:
+ c->readLumPlanar = planar_rgb14be_to_y;
+ break;
+ case AV_PIX_FMT_GBRP16BE:
+ c->readLumPlanar = planar_rgb16be_to_y;
+ break;
+ case AV_PIX_FMT_GBRP:
+ c->readLumPlanar = planar_rgb_to_y;
+ break;
+#if HAVE_BIGENDIAN
+ case AV_PIX_FMT_YUV444P9LE:
+ case AV_PIX_FMT_YUV422P9LE:
+ case AV_PIX_FMT_YUV420P9LE:
+ case AV_PIX_FMT_YUV444P10LE:
+ case AV_PIX_FMT_YUV422P10LE:
+ case AV_PIX_FMT_YUV420P10LE:
+ case AV_PIX_FMT_YUV444P12LE:
+ case AV_PIX_FMT_YUV422P12LE:
+ case AV_PIX_FMT_YUV420P12LE:
+ case AV_PIX_FMT_YUV444P14LE:
+ case AV_PIX_FMT_YUV422P14LE:
+ case AV_PIX_FMT_YUV420P14LE:
+ case AV_PIX_FMT_YUV420P16LE:
+ case AV_PIX_FMT_YUV422P16LE:
+ case AV_PIX_FMT_YUV444P16LE:
+
+ case AV_PIX_FMT_GRAY16LE:
+ c->lumToYV12 = bswap16Y_c;
+ break;
+ case AV_PIX_FMT_YUVA444P9LE:
+ case AV_PIX_FMT_YUVA422P9LE:
+ case AV_PIX_FMT_YUVA420P9LE:
+ case AV_PIX_FMT_YUVA444P10LE:
+ case AV_PIX_FMT_YUVA422P10LE:
+ case AV_PIX_FMT_YUVA420P10LE:
+ case AV_PIX_FMT_YUVA420P16LE:
+ case AV_PIX_FMT_YUVA422P16LE:
+ case AV_PIX_FMT_YUVA444P16LE:
+ c->lumToYV12 = bswap16Y_c;
+ c->alpToYV12 = bswap16Y_c;
+ break;
+#else
+ case AV_PIX_FMT_YUV444P9BE:
+ case AV_PIX_FMT_YUV422P9BE:
+ case AV_PIX_FMT_YUV420P9BE:
+ case AV_PIX_FMT_YUV444P10BE:
+ case AV_PIX_FMT_YUV422P10BE:
+ case AV_PIX_FMT_YUV420P10BE:
+ case AV_PIX_FMT_YUV444P12BE:
+ case AV_PIX_FMT_YUV422P12BE:
+ case AV_PIX_FMT_YUV420P12BE:
+ case AV_PIX_FMT_YUV444P14BE:
+ case AV_PIX_FMT_YUV422P14BE:
+ case AV_PIX_FMT_YUV420P14BE:
+ case AV_PIX_FMT_YUV420P16BE:
+ case AV_PIX_FMT_YUV422P16BE:
+ case AV_PIX_FMT_YUV444P16BE:
+
+ case AV_PIX_FMT_GRAY16BE:
+ c->lumToYV12 = bswap16Y_c;
+ break;
+ case AV_PIX_FMT_YUVA444P9BE:
+ case AV_PIX_FMT_YUVA422P9BE:
+ case AV_PIX_FMT_YUVA420P9BE:
+ case AV_PIX_FMT_YUVA444P10BE:
+ case AV_PIX_FMT_YUVA422P10BE:
+ case AV_PIX_FMT_YUVA420P10BE:
+ case AV_PIX_FMT_YUVA420P16BE:
+ case AV_PIX_FMT_YUVA422P16BE:
+ case AV_PIX_FMT_YUVA444P16BE:
+ c->lumToYV12 = bswap16Y_c;
+ c->alpToYV12 = bswap16Y_c;
+ break;
+#endif
+ case AV_PIX_FMT_YUYV422:
+ case AV_PIX_FMT_Y400A:
+ c->lumToYV12 = yuy2ToY_c;
+ break;
+ case AV_PIX_FMT_UYVY422:
+ c->lumToYV12 = uyvyToY_c;
+ break;
+ case AV_PIX_FMT_BGR24:
+ c->lumToYV12 = bgr24ToY_c;
+ break;
+ case AV_PIX_FMT_BGR565LE:
+ c->lumToYV12 = bgr16leToY_c;
+ break;
+ case AV_PIX_FMT_BGR565BE:
+ c->lumToYV12 = bgr16beToY_c;
+ break;
+ case AV_PIX_FMT_BGR555LE:
+ c->lumToYV12 = bgr15leToY_c;
+ break;
+ case AV_PIX_FMT_BGR555BE:
+ c->lumToYV12 = bgr15beToY_c;
+ break;
+ case AV_PIX_FMT_BGR444LE:
+ c->lumToYV12 = bgr12leToY_c;
+ break;
+ case AV_PIX_FMT_BGR444BE:
+ c->lumToYV12 = bgr12beToY_c;
+ break;
+ case AV_PIX_FMT_RGB24:
+ c->lumToYV12 = rgb24ToY_c;
+ break;
+ case AV_PIX_FMT_RGB565LE:
+ c->lumToYV12 = rgb16leToY_c;
+ break;
+ case AV_PIX_FMT_RGB565BE:
+ c->lumToYV12 = rgb16beToY_c;
+ break;
+ case AV_PIX_FMT_RGB555LE:
+ c->lumToYV12 = rgb15leToY_c;
+ break;
+ case AV_PIX_FMT_RGB555BE:
+ c->lumToYV12 = rgb15beToY_c;
+ break;
+ case AV_PIX_FMT_RGB444LE:
+ c->lumToYV12 = rgb12leToY_c;
+ break;
+ case AV_PIX_FMT_RGB444BE:
+ c->lumToYV12 = rgb12beToY_c;
+ break;
+ case AV_PIX_FMT_RGB8:
+ case AV_PIX_FMT_BGR8:
+ case AV_PIX_FMT_PAL8:
+ case AV_PIX_FMT_BGR4_BYTE:
+ case AV_PIX_FMT_RGB4_BYTE:
+ c->lumToYV12 = palToY_c;
+ break;
+ case AV_PIX_FMT_MONOBLACK:
+ c->lumToYV12 = monoblack2Y_c;
+ break;
+ case AV_PIX_FMT_MONOWHITE:
+ c->lumToYV12 = monowhite2Y_c;
+ break;
+ case AV_PIX_FMT_RGB32:
+ c->lumToYV12 = bgr32ToY_c;
+ break;
+ case AV_PIX_FMT_RGB32_1:
+ c->lumToYV12 = bgr321ToY_c;
+ break;
+ case AV_PIX_FMT_BGR32:
+ c->lumToYV12 = rgb32ToY_c;
+ break;
+ case AV_PIX_FMT_BGR32_1:
+ c->lumToYV12 = rgb321ToY_c;
+ break;
+ case AV_PIX_FMT_RGB48BE:
+ c->lumToYV12 = rgb48BEToY_c;
+ break;
+ case AV_PIX_FMT_RGB48LE:
+ c->lumToYV12 = rgb48LEToY_c;
+ break;
+ case AV_PIX_FMT_BGR48BE:
+ c->lumToYV12 = bgr48BEToY_c;
+ break;
+ case AV_PIX_FMT_BGR48LE:
+ c->lumToYV12 = bgr48LEToY_c;
+ break;
+ case AV_PIX_FMT_RGBA64BE:
+ c->lumToYV12 = rgb64BEToY_c;
+ break;
+ case AV_PIX_FMT_RGBA64LE:
+ c->lumToYV12 = rgb64LEToY_c;
+ break;
+ }
+ if (c->alpPixBuf) {
+ if (is16BPS(srcFormat) || isNBPS(srcFormat)) {
+ if (HAVE_BIGENDIAN == !isBE(srcFormat))
+ c->alpToYV12 = bswap16Y_c;
+ }
+ switch (srcFormat) {
+ case AV_PIX_FMT_RGBA64LE:
+ case AV_PIX_FMT_RGBA64BE: c->alpToYV12 = rgba64ToA_c; break;
+ case AV_PIX_FMT_BGRA:
+ case AV_PIX_FMT_RGBA:
+ c->alpToYV12 = rgbaToA_c;
+ break;
+ case AV_PIX_FMT_ABGR:
+ case AV_PIX_FMT_ARGB:
+ c->alpToYV12 = abgrToA_c;
+ break;
+ case AV_PIX_FMT_Y400A:
+ c->alpToYV12 = uyvyToY_c;
+ break;
+ case AV_PIX_FMT_PAL8 :
+ c->alpToYV12 = palToA_c;
+ break;
+ }
+ }
+}
diff --git a/ffmpeg/libswscale/libswscale.pc b/ffmpeg/libswscale/libswscale.pc
new file mode 100644
index 0000000..523b5e4
--- /dev/null
+++ b/ffmpeg/libswscale/libswscale.pc
@@ -0,0 +1,14 @@
+prefix=/usr/local
+exec_prefix=${prefix}
+libdir=${prefix}/lib
+includedir=${prefix}/include
+
+Name: libswscale
+Description: FFmpeg image rescaling library
+Version: 2.2.100
+Requires:
+Requires.private: libavutil = 52.22.100
+Conflicts:
+Libs: -L${libdir} -lswscale
+Libs.private: -lm
+Cflags: -I${includedir}
diff --git a/ffmpeg/libswscale/libswscale.v b/ffmpeg/libswscale/libswscale.v
new file mode 100644
index 0000000..8b9a96a
--- /dev/null
+++ b/ffmpeg/libswscale/libswscale.v
@@ -0,0 +1,4 @@
+LIBSWSCALE_$MAJOR {
+ global: swscale_*; sws_*;
+ local: *;
+};
diff --git a/ffmpeg/libswscale/options.c b/ffmpeg/libswscale/options.c
new file mode 100644
index 0000000..fc571ac
--- /dev/null
+++ b/ffmpeg/libswscale/options.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avutil.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixfmt.h"
+#include "swscale.h"
+#include "swscale_internal.h"
+
+static const char *sws_context_to_name(void *ptr)
+{
+ return "swscaler";
+}
+
+#define OFFSET(x) offsetof(SwsContext, x)
+#define DEFAULT 0
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+
+static const AVOption options[] = {
+ { "sws_flags", "scaler flags", OFFSET(flags), AV_OPT_TYPE_FLAGS, { .i64 = DEFAULT }, 0, UINT_MAX, VE, "sws_flags" },
+ { "fast_bilinear", "fast bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FAST_BILINEAR }, INT_MIN, INT_MAX, VE, "sws_flags" },
+ { "bilinear", "bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BILINEAR }, INT_MIN, INT_MAX, VE, "sws_flags" },
+ { "bicubic", "bicubic", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BICUBIC }, INT_MIN, INT_MAX, VE, "sws_flags" },
+ { "experimental", "experimental", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_X }, INT_MIN, INT_MAX, VE, "sws_flags" },
+ { "neighbor", "nearest neighbor", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_POINT }, INT_MIN, INT_MAX, VE, "sws_flags" },
+ { "area", "averaging area", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_AREA }, INT_MIN, INT_MAX, VE, "sws_flags" },
+ { "bicublin", "luma bicubic, chroma bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BICUBLIN }, INT_MIN, INT_MAX, VE, "sws_flags" },
+ { "gauss", "gaussian", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_GAUSS }, INT_MIN, INT_MAX, VE, "sws_flags" },
+ { "sinc", "sinc", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_SINC }, INT_MIN, INT_MAX, VE, "sws_flags" },
+ { "lanczos", "lanczos", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_LANCZOS }, INT_MIN, INT_MAX, VE, "sws_flags" },
+ { "spline", "natural bicubic spline", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_SPLINE }, INT_MIN, INT_MAX, VE, "sws_flags" },
+ { "print_info", "print info", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_PRINT_INFO }, INT_MIN, INT_MAX, VE, "sws_flags" },
+ { "accurate_rnd", "accurate rounding", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ACCURATE_RND }, INT_MIN, INT_MAX, VE, "sws_flags" },
+ { "full_chroma_int", "full chroma interpolation", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FULL_CHR_H_INT }, INT_MIN, INT_MAX, VE, "sws_flags" },
+ { "full_chroma_inp", "full chroma input", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FULL_CHR_H_INP }, INT_MIN, INT_MAX, VE, "sws_flags" },
+ { "bitexact", "", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BITEXACT }, INT_MIN, INT_MAX, VE, "sws_flags" },
+ { "error_diffusion", "error diffusion dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ERROR_DIFFUSION}, INT_MIN, INT_MAX, VE, "sws_flags" },
+
+ { "srcw", "source width", OFFSET(srcW), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
+ { "srch", "source height", OFFSET(srcH), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
+ { "dstw", "destination width", OFFSET(dstW), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
+ { "dsth", "destination height", OFFSET(dstH), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
+ { "src_format", "source format", OFFSET(srcFormat), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, AV_PIX_FMT_NB - 1, VE },
+ { "dst_format", "destination format", OFFSET(dstFormat), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, AV_PIX_FMT_NB - 1, VE },
+ { "src_range", "source range", OFFSET(srcRange), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, 1, VE },
+ { "dst_range", "destination range", OFFSET(dstRange), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, 1, VE },
+ { "param0", "scaler param 0", OFFSET(param[0]), AV_OPT_TYPE_DOUBLE, { .dbl = SWS_PARAM_DEFAULT }, INT_MIN, INT_MAX, VE },
+ { "param1", "scaler param 1", OFFSET(param[1]), AV_OPT_TYPE_DOUBLE, { .dbl = SWS_PARAM_DEFAULT }, INT_MIN, INT_MAX, VE },
+
+ { NULL }
+};
+
+const AVClass sws_context_class = {
+ .class_name = "SWScaler",
+ .item_name = sws_context_to_name,
+ .option = options,
+ .category = AV_CLASS_CATEGORY_SWSCALER,
+ .version = LIBAVUTIL_VERSION_INT,
+};
+
+const AVClass *sws_get_class(void)
+{
+ return &sws_context_class;
+}
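+
+/*
+ * Added commentary (not part of the upstream file): a minimal sketch of how
+ * the AVOptions declared above are typically consumed through the public
+ * API.  The sizes, formats and flags used below are illustrative only.
+ *
+ *     struct SwsContext *ctx = sws_alloc_context();
+ *     av_opt_set_int(ctx, "srcw",       1920,               0);
+ *     av_opt_set_int(ctx, "srch",       1080,               0);
+ *     av_opt_set_int(ctx, "src_format", AV_PIX_FMT_YUV420P, 0);
+ *     av_opt_set_int(ctx, "dstw",       1280,               0);
+ *     av_opt_set_int(ctx, "dsth",       720,                0);
+ *     av_opt_set_int(ctx, "dst_format", AV_PIX_FMT_RGB24,   0);
+ *     av_opt_set    (ctx, "sws_flags",  "bilinear",         0);
+ *     if (sws_init_context(ctx, NULL, NULL) < 0) {
+ *         sws_freeContext(ctx);          // initialization failed
+ *         return AVERROR(EINVAL);
+ *     }
+ *     // ... sws_scale(ctx, ...) ...
+ *     sws_freeContext(ctx);
+ */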
diff --git a/ffmpeg/libswscale/output.c b/ffmpeg/libswscale/output.c
new file mode 100644
index 0000000..d9745fb
--- /dev/null
+++ b/ffmpeg/libswscale/output.c
@@ -0,0 +1,1722 @@
+/*
+ * Copyright (C) 2001-2012 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/avutil.h"
+#include "libavutil/avassert.h"
+#include "libavutil/bswap.h"
+#include "libavutil/cpu.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/pixdesc.h"
+#include "config.h"
+#include "rgb2rgb.h"
+#include "swscale.h"
+#include "swscale_internal.h"
+
+DECLARE_ALIGNED(8, const uint8_t, dither_2x2_4)[][8]={
+{ 1, 3, 1, 3, 1, 3, 1, 3, },
+{ 2, 0, 2, 0, 2, 0, 2, 0, },
+{ 1, 3, 1, 3, 1, 3, 1, 3, },
+};
+
+DECLARE_ALIGNED(8, const uint8_t, dither_2x2_8)[][8]={
+{ 6, 2, 6, 2, 6, 2, 6, 2, },
+{ 0, 4, 0, 4, 0, 4, 0, 4, },
+{ 6, 2, 6, 2, 6, 2, 6, 2, },
+};
+
+DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[][8]={
+{ 8, 4, 11, 7, 8, 4, 11, 7, },
+{ 2, 14, 1, 13, 2, 14, 1, 13, },
+{ 10, 6, 9, 5, 10, 6, 9, 5, },
+{ 0, 12, 3, 15, 0, 12, 3, 15, },
+{ 8, 4, 11, 7, 8, 4, 11, 7, },
+};
+
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[][8]={
+{ 17, 9, 23, 15, 16, 8, 22, 14, },
+{ 5, 29, 3, 27, 4, 28, 2, 26, },
+{ 21, 13, 19, 11, 20, 12, 18, 10, },
+{ 0, 24, 6, 30, 1, 25, 7, 31, },
+{ 16, 8, 22, 14, 17, 9, 23, 15, },
+{ 4, 28, 2, 26, 5, 29, 3, 27, },
+{ 20, 12, 18, 10, 21, 13, 19, 11, },
+{ 1, 25, 7, 31, 0, 24, 6, 30, },
+{ 17, 9, 23, 15, 16, 8, 22, 14, },
+};
+
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[][8]={
+{ 0, 55, 14, 68, 3, 58, 17, 72, },
+{ 37, 18, 50, 32, 40, 22, 54, 35, },
+{ 9, 64, 5, 59, 13, 67, 8, 63, },
+{ 46, 27, 41, 23, 49, 31, 44, 26, },
+{ 2, 57, 16, 71, 1, 56, 15, 70, },
+{ 39, 21, 52, 34, 38, 19, 51, 33, },
+{ 11, 66, 7, 62, 10, 65, 6, 60, },
+{ 48, 30, 43, 25, 47, 29, 42, 24, },
+{ 0, 55, 14, 68, 3, 58, 17, 72, },
+};
+
+#if 1
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[][8]={
+{117, 62, 158, 103, 113, 58, 155, 100, },
+{ 34, 199, 21, 186, 31, 196, 17, 182, },
+{144, 89, 131, 76, 141, 86, 127, 72, },
+{ 0, 165, 41, 206, 10, 175, 52, 217, },
+{110, 55, 151, 96, 120, 65, 162, 107, },
+{ 28, 193, 14, 179, 38, 203, 24, 189, },
+{138, 83, 124, 69, 148, 93, 134, 79, },
+{ 7, 172, 48, 213, 3, 168, 45, 210, },
+{117, 62, 158, 103, 113, 58, 155, 100, },
+};
+#elif 1
+// tries to correct a gamma of 1.5
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[][8]={
+{ 0, 143, 18, 200, 2, 156, 25, 215, },
+{ 78, 28, 125, 64, 89, 36, 138, 74, },
+{ 10, 180, 3, 161, 16, 195, 8, 175, },
+{109, 51, 93, 38, 121, 60, 105, 47, },
+{ 1, 152, 23, 210, 0, 147, 20, 205, },
+{ 85, 33, 134, 71, 81, 30, 130, 67, },
+{ 14, 190, 6, 171, 12, 185, 5, 166, },
+{117, 57, 101, 44, 113, 54, 97, 41, },
+{ 0, 143, 18, 200, 2, 156, 25, 215, },
+};
+#elif 1
+// tries to correct a gamma of 2.0
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[][8]={
+{ 0, 124, 8, 193, 0, 140, 12, 213, },
+{ 55, 14, 104, 42, 66, 19, 119, 52, },
+{ 3, 168, 1, 145, 6, 187, 3, 162, },
+{ 86, 31, 70, 21, 99, 39, 82, 28, },
+{ 0, 134, 11, 206, 0, 129, 9, 200, },
+{ 62, 17, 114, 48, 58, 16, 109, 45, },
+{ 5, 181, 2, 157, 4, 175, 1, 151, },
+{ 95, 36, 78, 26, 90, 34, 74, 24, },
+{ 0, 124, 8, 193, 0, 140, 12, 213, },
+};
+#else
+// tries to correct a gamma of 2.5
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[][8]={
+{ 0, 107, 3, 187, 0, 125, 6, 212, },
+{ 39, 7, 86, 28, 49, 11, 102, 36, },
+{ 1, 158, 0, 131, 3, 180, 1, 151, },
+{ 68, 19, 52, 12, 81, 25, 64, 17, },
+{ 0, 119, 5, 203, 0, 113, 4, 195, },
+{ 45, 9, 96, 33, 42, 8, 91, 30, },
+{ 2, 172, 1, 144, 2, 165, 0, 137, },
+{ 77, 23, 60, 15, 72, 21, 56, 14, },
+{ 0, 107, 3, 187, 0, 125, 6, 212, },
+};
+#endif
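+
+/* Added commentary (not part of the upstream file): given the #if 1 / #elif 1
+ * chain above, only the first (non-gamma-corrected) dither_8x8_220 table is
+ * actually compiled in; the gamma 1.5 / 2.0 / 2.5 variants are retained as
+ * alternatives that can be selected by editing the preprocessor conditions. */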
+
+#define output_pixel(pos, val, bias, signedness) \
+ if (big_endian) { \
+ AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
+ } else { \
+ AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
+ }
+
+static av_always_inline void
+yuv2plane1_16_c_template(const int32_t *src, uint16_t *dest, int dstW,
+ int big_endian, int output_bits)
+{
+ int i;
+ int shift = 3;
+ av_assert0(output_bits == 16);
+
+ for (i = 0; i < dstW; i++) {
+ int val = src[i] + (1 << (shift - 1));
+ output_pixel(&dest[i], val, 0, uint);
+ }
+}
+
+static av_always_inline void
+yuv2planeX_16_c_template(const int16_t *filter, int filterSize,
+ const int32_t **src, uint16_t *dest, int dstW,
+ int big_endian, int output_bits)
+{
+ int i;
+ int shift = 15;
+ av_assert0(output_bits == 16);
+
+ for (i = 0; i < dstW; i++) {
+ int val = 1 << (shift - 1);
+ int j;
+
+ /* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline
+         * filters (or anything with negative coeffs), the range can be slightly
+ * wider in both directions. To account for this overflow, we subtract
+ * a constant so it always fits in the signed range (assuming a
+ * reasonable filterSize), and re-add that at the end. */
+ val -= 0x40000000;
+ for (j = 0; j < filterSize; j++)
+ val += src[j][i] * (unsigned)filter[j];
+
+ output_pixel(&dest[i], val, 0x8000, int);
+ }
+}
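+
+/* Added commentary (not part of the upstream file): the -0x40000000 bias above
+ * pairs with the 0x8000 bias that output_pixel(..., 0x8000, int) re-adds on
+ * store.  Since 0x40000000 == 0x8000 << 15 and shift == 15, the subtraction
+ * lowers the shifted value by exactly 0x8000, and for any integer v
+ * av_clip_int16(v) + 0x8000 == av_clip_uint16(v + 0x8000), so the net effect
+ * is an unsigned clip of the unbiased sum into [0, 0xFFFF]. */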
+
+#undef output_pixel
+
+#define output_pixel(pos, val) \
+ if (big_endian) { \
+ AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
+ } else { \
+ AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
+ }
+
+static av_always_inline void
+yuv2plane1_10_c_template(const int16_t *src, uint16_t *dest, int dstW,
+ int big_endian, int output_bits)
+{
+ int i;
+ int shift = 15 - output_bits;
+
+ for (i = 0; i < dstW; i++) {
+ int val = src[i] + (1 << (shift - 1));
+ output_pixel(&dest[i], val);
+ }
+}
+
+static av_always_inline void
+yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
+ const int16_t **src, uint16_t *dest, int dstW,
+ int big_endian, int output_bits)
+{
+ int i;
+ int shift = 11 + 16 - output_bits;
+
+ for (i = 0; i < dstW; i++) {
+ int val = 1 << (shift - 1);
+ int j;
+
+ for (j = 0; j < filterSize; j++)
+ val += src[j][i] * filter[j];
+
+ output_pixel(&dest[i], val);
+ }
+}
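+
+/* Added commentary (not part of the upstream file): the 9-14 bit intermediates
+ * are kept at 15-bit scale (hence shift = 15 - output_bits in the plane1 case
+ * above), and the vertical filter coefficients sum to roughly 1 << 12, so the
+ * weighted sum sits at 27-bit scale and shift = 11 + 16 - output_bits, i.e.
+ * 27 - output_bits, scales it back down to output_bits. */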
+
+#undef output_pixel
+
+#define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
+static void yuv2plane1_ ## bits ## BE_LE ## _c(const int16_t *src, \
+ uint8_t *dest, int dstW, \
+ const uint8_t *dither, int offset)\
+{ \
+ yuv2plane1_ ## template_size ## _c_template((const typeX_t *) src, \
+ (uint16_t *) dest, dstW, is_be, bits); \
+}\
+static void yuv2planeX_ ## bits ## BE_LE ## _c(const int16_t *filter, int filterSize, \
+ const int16_t **src, uint8_t *dest, int dstW, \
+ const uint8_t *dither, int offset)\
+{ \
+ yuv2planeX_## template_size ## _c_template(filter, \
+ filterSize, (const typeX_t **) src, \
+ (uint16_t *) dest, dstW, is_be, bits); \
+}
+yuv2NBPS( 9, BE, 1, 10, int16_t)
+yuv2NBPS( 9, LE, 0, 10, int16_t)
+yuv2NBPS(10, BE, 1, 10, int16_t)
+yuv2NBPS(10, LE, 0, 10, int16_t)
+yuv2NBPS(12, BE, 1, 10, int16_t)
+yuv2NBPS(12, LE, 0, 10, int16_t)
+yuv2NBPS(14, BE, 1, 10, int16_t)
+yuv2NBPS(14, LE, 0, 10, int16_t)
+yuv2NBPS(16, BE, 1, 16, int32_t)
+yuv2NBPS(16, LE, 0, 16, int32_t)
+
+static void yuv2planeX_8_c(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset)
+{
+ int i;
+ for (i=0; i<dstW; i++) {
+ int val = dither[(i + offset) & 7] << 12;
+ int j;
+ for (j=0; j<filterSize; j++)
+ val += src[j][i] * filter[j];
+
+ dest[i]= av_clip_uint8(val>>19);
+ }
+}
+
+static void yuv2plane1_8_c(const int16_t *src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset)
+{
+ int i;
+ for (i=0; i<dstW; i++) {
+ int val = (src[i] + dither[(i + offset) & 7]) >> 7;
+ dest[i]= av_clip_uint8(val);
+ }
+}
+
+static void yuv2nv12cX_c(SwsContext *c, const int16_t *chrFilter, int chrFilterSize,
+ const int16_t **chrUSrc, const int16_t **chrVSrc,
+ uint8_t *dest, int chrDstW)
+{
+ enum AVPixelFormat dstFormat = c->dstFormat;
+ const uint8_t *chrDither = c->chrDither8;
+ int i;
+
+ if (dstFormat == AV_PIX_FMT_NV12)
+ for (i=0; i<chrDstW; i++) {
+ int u = chrDither[i & 7] << 12;
+ int v = chrDither[(i + 3) & 7] << 12;
+ int j;
+ for (j=0; j<chrFilterSize; j++) {
+ u += chrUSrc[j][i] * chrFilter[j];
+ v += chrVSrc[j][i] * chrFilter[j];
+ }
+
+ dest[2*i]= av_clip_uint8(u>>19);
+ dest[2*i+1]= av_clip_uint8(v>>19);
+ }
+ else
+ for (i=0; i<chrDstW; i++) {
+ int u = chrDither[i & 7] << 12;
+ int v = chrDither[(i + 3) & 7] << 12;
+ int j;
+ for (j=0; j<chrFilterSize; j++) {
+ u += chrUSrc[j][i] * chrFilter[j];
+ v += chrVSrc[j][i] * chrFilter[j];
+ }
+
+ dest[2*i]= av_clip_uint8(v>>19);
+ dest[2*i+1]= av_clip_uint8(u>>19);
+ }
+}
+
+#define accumulate_bit(acc, val) \
+ acc <<= 1; \
+ acc |= (val) >= (128 + 110)
+#define output_pixel(pos, acc) \
+ if (target == AV_PIX_FMT_MONOBLACK) { \
+ pos = acc; \
+ } else { \
+ pos = ~acc; \
+ }
+
+static av_always_inline void
+yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc, int chrFilterSize,
+ const int16_t **alpSrc, uint8_t *dest, int dstW,
+ int y, enum AVPixelFormat target)
+{
+ const uint8_t * const d128=dither_8x8_220[y&7];
+ int i;
+ unsigned acc = 0;
+ int err = 0;
+
+ for (i = 0; i < dstW; i += 2) {
+ int j;
+ int Y1 = 1 << 18;
+ int Y2 = 1 << 18;
+
+ for (j = 0; j < lumFilterSize; j++) {
+ Y1 += lumSrc[j][i] * lumFilter[j];
+ Y2 += lumSrc[j][i+1] * lumFilter[j];
+ }
+ Y1 >>= 19;
+ Y2 >>= 19;
+ if ((Y1 | Y2) & 0x100) {
+ Y1 = av_clip_uint8(Y1);
+ Y2 = av_clip_uint8(Y2);
+ }
+ if (c->flags & SWS_ERROR_DIFFUSION) {
+ Y1 += (7*err + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2] + 8 - 256)>>4;
+ c->dither_error[0][i] = err;
+ acc = 2*acc + (Y1 >= 128);
+ Y1 -= 220*(acc&1);
+
+ err = Y2 + ((7*Y1 + 1*c->dither_error[0][i+1] + 5*c->dither_error[0][i+2] + 3*c->dither_error[0][i+3] + 8 - 256)>>4);
+ c->dither_error[0][i+1] = Y1;
+ acc = 2*acc + (err >= 128);
+ err -= 220*(acc&1);
+ } else {
+ accumulate_bit(acc, Y1 + d128[(i + 0) & 7]);
+ accumulate_bit(acc, Y2 + d128[(i + 1) & 7]);
+ }
+ if ((i & 7) == 6) {
+ output_pixel(*dest++, acc);
+ }
+ }
+ c->dither_error[0][i] = err;
+
+ if (i & 6) {
+ output_pixel(*dest, acc);
+ }
+}
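+
+/* Added commentary (not part of the upstream file): the (7, 1, 5, 3) / 16
+ * weights in the SWS_ERROR_DIFFUSION branch above are the classic
+ * Floyd-Steinberg kernel, applied in gather form from the running error of
+ * the previous pixel plus the one-line history kept in c->dither_error[0]. */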
+
+static av_always_inline void
+yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf[2], uint8_t *dest, int dstW,
+ int yalpha, int uvalpha, int y,
+ enum AVPixelFormat target)
+{
+ const int16_t *buf0 = buf[0], *buf1 = buf[1];
+ const uint8_t * const d128 = dither_8x8_220[y & 7];
+ int yalpha1 = 4096 - yalpha;
+ int i;
+
+ if (c->flags & SWS_ERROR_DIFFUSION) {
+ int err = 0;
+ int acc = 0;
+ for (i = 0; i < dstW; i +=2) {
+ int Y;
+
+ Y = (buf0[i + 0] * yalpha1 + buf1[i + 0] * yalpha) >> 19;
+ Y += (7*err + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2] + 8 - 256)>>4;
+ c->dither_error[0][i] = err;
+ acc = 2*acc + (Y >= 128);
+ Y -= 220*(acc&1);
+
+ err = (buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19;
+ err += (7*Y + 1*c->dither_error[0][i+1] + 5*c->dither_error[0][i+2] + 3*c->dither_error[0][i+3] + 8 - 256)>>4;
+ c->dither_error[0][i+1] = Y;
+ acc = 2*acc + (err >= 128);
+ err -= 220*(acc&1);
+
+ if ((i & 7) == 6)
+ output_pixel(*dest++, acc);
+ }
+ c->dither_error[0][i] = err;
+ } else {
+ for (i = 0; i < dstW; i += 8) {
+ int Y, acc = 0;
+
+ Y = (buf0[i + 0] * yalpha1 + buf1[i + 0] * yalpha) >> 19;
+ accumulate_bit(acc, Y + d128[0]);
+ Y = (buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19;
+ accumulate_bit(acc, Y + d128[1]);
+ Y = (buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19;
+ accumulate_bit(acc, Y + d128[2]);
+ Y = (buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19;
+ accumulate_bit(acc, Y + d128[3]);
+ Y = (buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19;
+ accumulate_bit(acc, Y + d128[4]);
+ Y = (buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19;
+ accumulate_bit(acc, Y + d128[5]);
+ Y = (buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19;
+ accumulate_bit(acc, Y + d128[6]);
+ Y = (buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19;
+ accumulate_bit(acc, Y + d128[7]);
+
+ output_pixel(*dest++, acc);
+ }
+ }
+}
+
+static av_always_inline void
+yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf0, uint8_t *dest, int dstW,
+ int uvalpha, int y, enum AVPixelFormat target)
+{
+ const uint8_t * const d128 = dither_8x8_220[y & 7];
+ int i;
+
+ if (c->flags & SWS_ERROR_DIFFUSION) {
+ int err = 0;
+ int acc = 0;
+ for (i = 0; i < dstW; i +=2) {
+ int Y;
+
+ Y = ((buf0[i + 0] + 64) >> 7);
+ Y += (7*err + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2] + 8 - 256)>>4;
+ c->dither_error[0][i] = err;
+ acc = 2*acc + (Y >= 128);
+ Y -= 220*(acc&1);
+
+ err = ((buf0[i + 1] + 64) >> 7);
+ err += (7*Y + 1*c->dither_error[0][i+1] + 5*c->dither_error[0][i+2] + 3*c->dither_error[0][i+3] + 8 - 256)>>4;
+ c->dither_error[0][i+1] = Y;
+ acc = 2*acc + (err >= 128);
+ err -= 220*(acc&1);
+
+ if ((i & 7) == 6)
+ output_pixel(*dest++, acc);
+ }
+ c->dither_error[0][i] = err;
+ } else {
+ for (i = 0; i < dstW; i += 8) {
+ int acc = 0;
+ accumulate_bit(acc, ((buf0[i + 0] + 64) >> 7) + d128[0]);
+ accumulate_bit(acc, ((buf0[i + 1] + 64) >> 7) + d128[1]);
+ accumulate_bit(acc, ((buf0[i + 2] + 64) >> 7) + d128[2]);
+ accumulate_bit(acc, ((buf0[i + 3] + 64) >> 7) + d128[3]);
+ accumulate_bit(acc, ((buf0[i + 4] + 64) >> 7) + d128[4]);
+ accumulate_bit(acc, ((buf0[i + 5] + 64) >> 7) + d128[5]);
+ accumulate_bit(acc, ((buf0[i + 6] + 64) >> 7) + d128[6]);
+ accumulate_bit(acc, ((buf0[i + 7] + 64) >> 7) + d128[7]);
+
+ output_pixel(*dest++, acc);
+ }
+ }
+}
+
+#undef output_pixel
+#undef accumulate_bit
+
+#define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
+static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
+ const int16_t **lumSrc, int lumFilterSize, \
+ const int16_t *chrFilter, const int16_t **chrUSrc, \
+ const int16_t **chrVSrc, int chrFilterSize, \
+ const int16_t **alpSrc, uint8_t *dest, int dstW, \
+ int y) \
+{ \
+ name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
+ chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
+ alpSrc, dest, dstW, y, fmt); \
+} \
+ \
+static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
+ const int16_t *ubuf[2], const int16_t *vbuf[2], \
+ const int16_t *abuf[2], uint8_t *dest, int dstW, \
+ int yalpha, int uvalpha, int y) \
+{ \
+ name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
+ dest, dstW, yalpha, uvalpha, y, fmt); \
+} \
+ \
+static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
+ const int16_t *ubuf[2], const int16_t *vbuf[2], \
+ const int16_t *abuf0, uint8_t *dest, int dstW, \
+ int uvalpha, int y) \
+{ \
+ name ## base ## _1_c_template(c, buf0, ubuf, vbuf, \
+ abuf0, dest, dstW, uvalpha, \
+ y, fmt); \
+}
+
+YUV2PACKEDWRAPPER(yuv2mono,, white, AV_PIX_FMT_MONOWHITE)
+YUV2PACKEDWRAPPER(yuv2mono,, black, AV_PIX_FMT_MONOBLACK)
+
+#define output_pixels(pos, Y1, U, Y2, V) \
+ if (target == AV_PIX_FMT_YUYV422) { \
+ dest[pos + 0] = Y1; \
+ dest[pos + 1] = U; \
+ dest[pos + 2] = Y2; \
+ dest[pos + 3] = V; \
+ } else { \
+ dest[pos + 0] = U; \
+ dest[pos + 1] = Y1; \
+ dest[pos + 2] = V; \
+ dest[pos + 3] = Y2; \
+ }
+
+static av_always_inline void
+yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc, int chrFilterSize,
+ const int16_t **alpSrc, uint8_t *dest, int dstW,
+ int y, enum AVPixelFormat target)
+{
+ int i;
+
+ for (i = 0; i < ((dstW + 1) >> 1); i++) {
+ int j;
+ int Y1 = 1 << 18;
+ int Y2 = 1 << 18;
+ int U = 1 << 18;
+ int V = 1 << 18;
+
+ for (j = 0; j < lumFilterSize; j++) {
+ Y1 += lumSrc[j][i * 2] * lumFilter[j];
+ Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ U += chrUSrc[j][i] * chrFilter[j];
+ V += chrVSrc[j][i] * chrFilter[j];
+ }
+ Y1 >>= 19;
+ Y2 >>= 19;
+ U >>= 19;
+ V >>= 19;
+ if ((Y1 | Y2 | U | V) & 0x100) {
+ Y1 = av_clip_uint8(Y1);
+ Y2 = av_clip_uint8(Y2);
+ U = av_clip_uint8(U);
+ V = av_clip_uint8(V);
+ }
+ output_pixels(4*i, Y1, U, Y2, V);
+ }
+}
+
+static av_always_inline void
+yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf[2], uint8_t *dest, int dstW,
+ int yalpha, int uvalpha, int y,
+ enum AVPixelFormat target)
+{
+ const int16_t *buf0 = buf[0], *buf1 = buf[1],
+ *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+ *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
+ int yalpha1 = 4096 - yalpha;
+ int uvalpha1 = 4096 - uvalpha;
+ int i;
+
+ for (i = 0; i < ((dstW + 1) >> 1); i++) {
+ int Y1 = (buf0[i * 2] * yalpha1 + buf1[i * 2] * yalpha) >> 19;
+ int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 19;
+ int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha) >> 19;
+ int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha) >> 19;
+
+ if ((Y1 | Y2 | U | V) & 0x100) {
+ Y1 = av_clip_uint8(Y1);
+ Y2 = av_clip_uint8(Y2);
+ U = av_clip_uint8(U);
+ V = av_clip_uint8(V);
+ }
+
+ output_pixels(i * 4, Y1, U, Y2, V);
+ }
+}
+
+static av_always_inline void
+yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf0, uint8_t *dest, int dstW,
+ int uvalpha, int y, enum AVPixelFormat target)
+{
+ const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
+ int i;
+
+ if (uvalpha < 2048) {
+ for (i = 0; i < ((dstW + 1) >> 1); i++) {
+ int Y1 = (buf0[i * 2 ]+64) >> 7;
+ int Y2 = (buf0[i * 2 + 1]+64) >> 7;
+ int U = (ubuf0[i] +64) >> 7;
+ int V = (vbuf0[i] +64) >> 7;
+
+ if ((Y1 | Y2 | U | V) & 0x100) {
+ Y1 = av_clip_uint8(Y1);
+ Y2 = av_clip_uint8(Y2);
+ U = av_clip_uint8(U);
+ V = av_clip_uint8(V);
+ }
+
+ output_pixels(i * 4, Y1, U, Y2, V);
+ }
+ } else {
+ const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
+ for (i = 0; i < ((dstW + 1) >> 1); i++) {
+ int Y1 = (buf0[i * 2 ] + 64) >> 7;
+ int Y2 = (buf0[i * 2 + 1] + 64) >> 7;
+ int U = (ubuf0[i] + ubuf1[i]+128) >> 8;
+ int V = (vbuf0[i] + vbuf1[i]+128) >> 8;
+
+ if ((Y1 | Y2 | U | V) & 0x100) {
+ Y1 = av_clip_uint8(Y1);
+ Y2 = av_clip_uint8(Y2);
+ U = av_clip_uint8(U);
+ V = av_clip_uint8(V);
+ }
+
+ output_pixels(i * 4, Y1, U, Y2, V);
+ }
+ }
+}
+
+#undef output_pixels
+
+YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, AV_PIX_FMT_YUYV422)
+YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, AV_PIX_FMT_UYVY422)
+
+#define R_B ((target == AV_PIX_FMT_RGB48LE || target == AV_PIX_FMT_RGB48BE) ? R : B)
+#define B_R ((target == AV_PIX_FMT_RGB48LE || target == AV_PIX_FMT_RGB48BE) ? B : R)
+#define output_pixel(pos, val) \
+ if (isBE(target)) { \
+ AV_WB16(pos, val); \
+ } else { \
+ AV_WL16(pos, val); \
+ }
+
+static av_always_inline void
+yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
+ const int32_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int32_t **chrUSrc,
+ const int32_t **chrVSrc, int chrFilterSize,
+ const int32_t **alpSrc, uint16_t *dest, int dstW,
+ int y, enum AVPixelFormat target)
+{
+ int i;
+
+ for (i = 0; i < ((dstW + 1) >> 1); i++) {
+ int j;
+ int Y1 = -0x40000000;
+ int Y2 = -0x40000000;
+ int U = -128 << 23; // 19
+ int V = -128 << 23;
+ int R, G, B;
+
+ for (j = 0; j < lumFilterSize; j++) {
+ Y1 += lumSrc[j][i * 2] * (unsigned)lumFilter[j];
+ Y2 += lumSrc[j][i * 2 + 1] * (unsigned)lumFilter[j];
+ }
+        for (j = 0; j < chrFilterSize; j++) {
+ U += chrUSrc[j][i] * (unsigned)chrFilter[j];
+ V += chrVSrc[j][i] * (unsigned)chrFilter[j];
+ }
+
+ // 8bit: 12+15=27; 16-bit: 12+19=31
+ Y1 >>= 14; // 10
+ Y1 += 0x10000;
+ Y2 >>= 14;
+ Y2 += 0x10000;
+ U >>= 14;
+ V >>= 14;
+
+ // 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
+ Y1 -= c->yuv2rgb_y_offset;
+ Y2 -= c->yuv2rgb_y_offset;
+ Y1 *= c->yuv2rgb_y_coeff;
+ Y2 *= c->yuv2rgb_y_coeff;
+ Y1 += 1 << 13; // 21
+ Y2 += 1 << 13;
+ // 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
+
+ R = V * c->yuv2rgb_v2r_coeff;
+ G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
+ B = U * c->yuv2rgb_u2b_coeff;
+
+ // 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
+ output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
+ output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
+ output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
+ output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
+ output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
+ output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
+ dest += 6;
+ }
+}
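+
+/* Added commentary (not part of the upstream file), summarizing the bit
+ * accounting noted inline for the 16-bit path: 19-bit intermediates times
+ * 12-bit filter coefficients give ~31-bit sums, >> 14 leaves ~17 bits, the
+ * multiplication by the 13-bit yuv2rgb coefficients brings that back up to
+ * ~30 bits, and the final av_clip_uintp2(..., 30) >> 14 yields the 16-bit
+ * output samples. */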
+
+static av_always_inline void
+yuv2rgb48_2_c_template(SwsContext *c, const int32_t *buf[2],
+ const int32_t *ubuf[2], const int32_t *vbuf[2],
+ const int32_t *abuf[2], uint16_t *dest, int dstW,
+ int yalpha, int uvalpha, int y,
+ enum AVPixelFormat target)
+{
+ const int32_t *buf0 = buf[0], *buf1 = buf[1],
+ *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+ *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
+ int yalpha1 = 4096 - yalpha;
+ int uvalpha1 = 4096 - uvalpha;
+ int i;
+
+ for (i = 0; i < ((dstW + 1) >> 1); i++) {
+ int Y1 = (buf0[i * 2] * yalpha1 + buf1[i * 2] * yalpha) >> 14;
+ int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 14;
+ int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha + (-128 << 23)) >> 14;
+ int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha + (-128 << 23)) >> 14;
+ int R, G, B;
+
+ Y1 -= c->yuv2rgb_y_offset;
+ Y2 -= c->yuv2rgb_y_offset;
+ Y1 *= c->yuv2rgb_y_coeff;
+ Y2 *= c->yuv2rgb_y_coeff;
+ Y1 += 1 << 13;
+ Y2 += 1 << 13;
+
+ R = V * c->yuv2rgb_v2r_coeff;
+ G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
+ B = U * c->yuv2rgb_u2b_coeff;
+
+ output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
+ output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
+ output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
+ output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
+ output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
+ output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
+ dest += 6;
+ }
+}
+
+static av_always_inline void
+yuv2rgb48_1_c_template(SwsContext *c, const int32_t *buf0,
+ const int32_t *ubuf[2], const int32_t *vbuf[2],
+ const int32_t *abuf0, uint16_t *dest, int dstW,
+ int uvalpha, int y, enum AVPixelFormat target)
+{
+ const int32_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
+ int i;
+
+ if (uvalpha < 2048) {
+ for (i = 0; i < ((dstW + 1) >> 1); i++) {
+ int Y1 = (buf0[i * 2] ) >> 2;
+ int Y2 = (buf0[i * 2 + 1]) >> 2;
+ int U = (ubuf0[i] + (-128 << 11)) >> 2;
+ int V = (vbuf0[i] + (-128 << 11)) >> 2;
+ int R, G, B;
+
+ Y1 -= c->yuv2rgb_y_offset;
+ Y2 -= c->yuv2rgb_y_offset;
+ Y1 *= c->yuv2rgb_y_coeff;
+ Y2 *= c->yuv2rgb_y_coeff;
+ Y1 += 1 << 13;
+ Y2 += 1 << 13;
+
+ R = V * c->yuv2rgb_v2r_coeff;
+ G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
+ B = U * c->yuv2rgb_u2b_coeff;
+
+ output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
+ output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
+ output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
+ output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
+ output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
+ output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
+ dest += 6;
+ }
+ } else {
+ const int32_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
+ for (i = 0; i < ((dstW + 1) >> 1); i++) {
+ int Y1 = (buf0[i * 2] ) >> 2;
+ int Y2 = (buf0[i * 2 + 1]) >> 2;
+ int U = (ubuf0[i] + ubuf1[i] + (-128 << 12)) >> 3;
+ int V = (vbuf0[i] + vbuf1[i] + (-128 << 12)) >> 3;
+ int R, G, B;
+
+ Y1 -= c->yuv2rgb_y_offset;
+ Y2 -= c->yuv2rgb_y_offset;
+ Y1 *= c->yuv2rgb_y_coeff;
+ Y2 *= c->yuv2rgb_y_coeff;
+ Y1 += 1 << 13;
+ Y2 += 1 << 13;
+
+ R = V * c->yuv2rgb_v2r_coeff;
+ G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
+ B = U * c->yuv2rgb_u2b_coeff;
+
+ output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
+ output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
+ output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
+ output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
+ output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
+ output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
+ dest += 6;
+ }
+ }
+}
+
+#undef output_pixel
+#undef R_B
+#undef B_R
+
+#define YUV2PACKED16WRAPPER(name, base, ext, fmt) \
+static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
+ const int16_t **_lumSrc, int lumFilterSize, \
+ const int16_t *chrFilter, const int16_t **_chrUSrc, \
+ const int16_t **_chrVSrc, int chrFilterSize, \
+ const int16_t **_alpSrc, uint8_t *_dest, int dstW, \
+ int y) \
+{ \
+ const int32_t **lumSrc = (const int32_t **) _lumSrc, \
+ **chrUSrc = (const int32_t **) _chrUSrc, \
+ **chrVSrc = (const int32_t **) _chrVSrc, \
+ **alpSrc = (const int32_t **) _alpSrc; \
+ uint16_t *dest = (uint16_t *) _dest; \
+ name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
+ chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
+ alpSrc, dest, dstW, y, fmt); \
+} \
+ \
+static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
+ const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
+ const int16_t *_abuf[2], uint8_t *_dest, int dstW, \
+ int yalpha, int uvalpha, int y) \
+{ \
+ const int32_t **buf = (const int32_t **) _buf, \
+ **ubuf = (const int32_t **) _ubuf, \
+ **vbuf = (const int32_t **) _vbuf, \
+ **abuf = (const int32_t **) _abuf; \
+ uint16_t *dest = (uint16_t *) _dest; \
+ name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
+ dest, dstW, yalpha, uvalpha, y, fmt); \
+} \
+ \
+static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
+ const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
+ const int16_t *_abuf0, uint8_t *_dest, int dstW, \
+ int uvalpha, int y) \
+{ \
+ const int32_t *buf0 = (const int32_t *) _buf0, \
+ **ubuf = (const int32_t **) _ubuf, \
+ **vbuf = (const int32_t **) _vbuf, \
+ *abuf0 = (const int32_t *) _abuf0; \
+ uint16_t *dest = (uint16_t *) _dest; \
+ name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
+ dstW, uvalpha, y, fmt); \
+}
+
+YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48be, AV_PIX_FMT_RGB48BE)
+YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48le, AV_PIX_FMT_RGB48LE)
+YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48be, AV_PIX_FMT_BGR48BE)
+YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, AV_PIX_FMT_BGR48LE)
+
+/*
+ * Write out 2 RGB pixels in the target pixel format. This function takes an
+ * R/G/B LUT as generated by ff_yuv2rgb_c_init_tables(), which takes care of
+ * things like endianness conversion and shifting. The caller takes care of
+ * setting the correct offset in these tables from the chroma (U/V) values.
+ * This function then uses the luminance (Y1/Y2) values to write out the
+ * correct RGB values into the destination buffer.
+ */
+static av_always_inline void
+yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
+ unsigned A1, unsigned A2,
+ const void *_r, const void *_g, const void *_b, int y,
+ enum AVPixelFormat target, int hasAlpha)
+{
+ if (target == AV_PIX_FMT_ARGB || target == AV_PIX_FMT_RGBA ||
+ target == AV_PIX_FMT_ABGR || target == AV_PIX_FMT_BGRA) {
+ uint32_t *dest = (uint32_t *) _dest;
+ const uint32_t *r = (const uint32_t *) _r;
+ const uint32_t *g = (const uint32_t *) _g;
+ const uint32_t *b = (const uint32_t *) _b;
+
+#if CONFIG_SMALL
+ int sh = hasAlpha ? ((target == AV_PIX_FMT_RGB32_1 || target == AV_PIX_FMT_BGR32_1) ? 0 : 24) : 0;
+
+ dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
+ dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
+#else
+ if (hasAlpha) {
+ int sh = (target == AV_PIX_FMT_RGB32_1 || target == AV_PIX_FMT_BGR32_1) ? 0 : 24;
+
+ dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
+ dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
+ } else {
+ dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
+ dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
+ }
+#endif
+ } else if (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) {
+ uint8_t *dest = (uint8_t *) _dest;
+ const uint8_t *r = (const uint8_t *) _r;
+ const uint8_t *g = (const uint8_t *) _g;
+ const uint8_t *b = (const uint8_t *) _b;
+
+#define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
+#define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)
+
+ dest[i * 6 + 0] = r_b[Y1];
+ dest[i * 6 + 1] = g[Y1];
+ dest[i * 6 + 2] = b_r[Y1];
+ dest[i * 6 + 3] = r_b[Y2];
+ dest[i * 6 + 4] = g[Y2];
+ dest[i * 6 + 5] = b_r[Y2];
+#undef r_b
+#undef b_r
+ } else if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565 ||
+ target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555 ||
+ target == AV_PIX_FMT_RGB444 || target == AV_PIX_FMT_BGR444) {
+ uint16_t *dest = (uint16_t *) _dest;
+ const uint16_t *r = (const uint16_t *) _r;
+ const uint16_t *g = (const uint16_t *) _g;
+ const uint16_t *b = (const uint16_t *) _b;
+ int dr1, dg1, db1, dr2, dg2, db2;
+
+ if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565) {
+ dr1 = dither_2x2_8[ y & 1 ][0];
+ dg1 = dither_2x2_4[ y & 1 ][0];
+ db1 = dither_2x2_8[(y & 1) ^ 1][0];
+ dr2 = dither_2x2_8[ y & 1 ][1];
+ dg2 = dither_2x2_4[ y & 1 ][1];
+ db2 = dither_2x2_8[(y & 1) ^ 1][1];
+ } else if (target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555) {
+ dr1 = dither_2x2_8[ y & 1 ][0];
+ dg1 = dither_2x2_8[ y & 1 ][1];
+ db1 = dither_2x2_8[(y & 1) ^ 1][0];
+ dr2 = dither_2x2_8[ y & 1 ][1];
+ dg2 = dither_2x2_8[ y & 1 ][0];
+ db2 = dither_2x2_8[(y & 1) ^ 1][1];
+ } else {
+ dr1 = dither_4x4_16[ y & 3 ][0];
+ dg1 = dither_4x4_16[ y & 3 ][1];
+ db1 = dither_4x4_16[(y & 3) ^ 3][0];
+ dr2 = dither_4x4_16[ y & 3 ][1];
+ dg2 = dither_4x4_16[ y & 3 ][0];
+ db2 = dither_4x4_16[(y & 3) ^ 3][1];
+ }
+
+ dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
+ dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
+ } else /* 8/4-bit */ {
+ uint8_t *dest = (uint8_t *) _dest;
+ const uint8_t *r = (const uint8_t *) _r;
+ const uint8_t *g = (const uint8_t *) _g;
+ const uint8_t *b = (const uint8_t *) _b;
+ int dr1, dg1, db1, dr2, dg2, db2;
+
+ if (target == AV_PIX_FMT_RGB8 || target == AV_PIX_FMT_BGR8) {
+ const uint8_t * const d64 = dither_8x8_73[y & 7];
+ const uint8_t * const d32 = dither_8x8_32[y & 7];
+ dr1 = dg1 = d32[(i * 2 + 0) & 7];
+ db1 = d64[(i * 2 + 0) & 7];
+ dr2 = dg2 = d32[(i * 2 + 1) & 7];
+ db2 = d64[(i * 2 + 1) & 7];
+ } else {
+ const uint8_t * const d64 = dither_8x8_73 [y & 7];
+ const uint8_t * const d128 = dither_8x8_220[y & 7];
+ dr1 = db1 = d128[(i * 2 + 0) & 7];
+ dg1 = d64[(i * 2 + 0) & 7];
+ dr2 = db2 = d128[(i * 2 + 1) & 7];
+ dg2 = d64[(i * 2 + 1) & 7];
+ }
+
+ if (target == AV_PIX_FMT_RGB4 || target == AV_PIX_FMT_BGR4) {
+ dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
+ ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
+ } else {
+ dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
+ dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
+ }
+ }
+}
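+
+/* Added commentary (not part of the upstream file): the r[..] + g[..] + b[..]
+ * sums above assume the LUTs produced by ff_yuv2rgb_c_init_tables() store each
+ * channel's contribution already shifted to its final bit position within the
+ * packed pixel, so plain addition composes a complete pixel without masking;
+ * the dithered paths additionally index a few entries past the nominal Y
+ * range, which the table setup is expected to leave headroom for. */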
+
+static av_always_inline void
+yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc, int chrFilterSize,
+ const int16_t **alpSrc, uint8_t *dest, int dstW,
+ int y, enum AVPixelFormat target, int hasAlpha)
+{
+ int i;
+
+ for (i = 0; i < ((dstW + 1) >> 1); i++) {
+ int j, A1, A2;
+ int Y1 = 1 << 18;
+ int Y2 = 1 << 18;
+ int U = 1 << 18;
+ int V = 1 << 18;
+ const void *r, *g, *b;
+
+ for (j = 0; j < lumFilterSize; j++) {
+ Y1 += lumSrc[j][i * 2] * lumFilter[j];
+ Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ U += chrUSrc[j][i] * chrFilter[j];
+ V += chrVSrc[j][i] * chrFilter[j];
+ }
+ Y1 >>= 19;
+ Y2 >>= 19;
+ U >>= 19;
+ V >>= 19;
+ if (hasAlpha) {
+ A1 = 1 << 18;
+ A2 = 1 << 18;
+ for (j = 0; j < lumFilterSize; j++) {
+ A1 += alpSrc[j][i * 2 ] * lumFilter[j];
+ A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
+ }
+ A1 >>= 19;
+ A2 >>= 19;
+ if ((A1 | A2) & 0x100) {
+ A1 = av_clip_uint8(A1);
+ A2 = av_clip_uint8(A2);
+ }
+ }
+
+ r = c->table_rV[V + YUVRGB_TABLE_HEADROOM];
+ g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]);
+ b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+ yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
+ r, g, b, y, target, hasAlpha);
+ }
+}
+
+static av_always_inline void
+yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf[2], uint8_t *dest, int dstW,
+ int yalpha, int uvalpha, int y,
+ enum AVPixelFormat target, int hasAlpha)
+{
+ const int16_t *buf0 = buf[0], *buf1 = buf[1],
+ *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+ *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
+ *abuf0 = hasAlpha ? abuf[0] : NULL,
+ *abuf1 = hasAlpha ? abuf[1] : NULL;
+ int yalpha1 = 4096 - yalpha;
+ int uvalpha1 = 4096 - uvalpha;
+ int i;
+
+ for (i = 0; i < ((dstW + 1) >> 1); i++) {
+ int Y1 = (buf0[i * 2] * yalpha1 + buf1[i * 2] * yalpha) >> 19;
+ int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 19;
+ int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha) >> 19;
+ int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha) >> 19;
+ int A1, A2;
+ const void *r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+ *g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+ *b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+ if (hasAlpha) {
+ A1 = (abuf0[i * 2 ] * yalpha1 + abuf1[i * 2 ] * yalpha) >> 19;
+ A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
+ A1 = av_clip_uint8(A1);
+ A2 = av_clip_uint8(A2);
+ }
+
+ yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
+ r, g, b, y, target, hasAlpha);
+ }
+}
+
+static av_always_inline void
+yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf0, uint8_t *dest, int dstW,
+ int uvalpha, int y, enum AVPixelFormat target,
+ int hasAlpha)
+{
+ const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
+ int i;
+
+ if (uvalpha < 2048) {
+ for (i = 0; i < ((dstW + 1) >> 1); i++) {
+ int Y1 = (buf0[i * 2 ] + 64) >> 7;
+ int Y2 = (buf0[i * 2 + 1] + 64) >> 7;
+ int U = (ubuf0[i] + 64) >> 7;
+ int V = (vbuf0[i] + 64) >> 7;
+ int A1, A2;
+ const void *r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+ *g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+ *b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+ if (hasAlpha) {
+ A1 = abuf0[i * 2 ] * 255 + 16384 >> 15;
+ A2 = abuf0[i * 2 + 1] * 255 + 16384 >> 15;
+ A1 = av_clip_uint8(A1);
+ A2 = av_clip_uint8(A2);
+ }
+
+ yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
+ r, g, b, y, target, hasAlpha);
+ }
+ } else {
+ const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
+ for (i = 0; i < ((dstW + 1) >> 1); i++) {
+ int Y1 = (buf0[i * 2 ] + 64) >> 7;
+ int Y2 = (buf0[i * 2 + 1] + 64) >> 7;
+ int U = (ubuf0[i] + ubuf1[i] + 128) >> 8;
+ int V = (vbuf0[i] + vbuf1[i] + 128) >> 8;
+ int A1, A2;
+ const void *r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+ *g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+ *b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+ if (hasAlpha) {
+ A1 = (abuf0[i * 2 ] + 64) >> 7;
+ A2 = (abuf0[i * 2 + 1] + 64) >> 7;
+ A1 = av_clip_uint8(A1);
+ A2 = av_clip_uint8(A2);
+ }
+
+ yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
+ r, g, b, y, target, hasAlpha);
+ }
+ }
+}
+
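+/* The wrappers below bind the templates above to one output format:
+ * the _X_c variant applies full vertical filters, _2_c blends two input
+ * lines and _1_c reads a single line. */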
+#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
+static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
+ const int16_t **lumSrc, int lumFilterSize, \
+ const int16_t *chrFilter, const int16_t **chrUSrc, \
+ const int16_t **chrVSrc, int chrFilterSize, \
+ const int16_t **alpSrc, uint8_t *dest, int dstW, \
+ int y) \
+{ \
+ name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
+ chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
+ alpSrc, dest, dstW, y, fmt, hasAlpha); \
+}
+#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
+YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
+static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
+ const int16_t *ubuf[2], const int16_t *vbuf[2], \
+ const int16_t *abuf[2], uint8_t *dest, int dstW, \
+ int yalpha, int uvalpha, int y) \
+{ \
+ name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
+ dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
+} \
+ \
+static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
+ const int16_t *ubuf[2], const int16_t *vbuf[2], \
+ const int16_t *abuf0, uint8_t *dest, int dstW, \
+ int uvalpha, int y) \
+{ \
+ name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
+ dstW, uvalpha, y, fmt, hasAlpha); \
+}
+
+#if CONFIG_SMALL
+YUV2RGBWRAPPER(yuv2rgb,, 32_1, AV_PIX_FMT_RGB32_1, CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
+YUV2RGBWRAPPER(yuv2rgb,, 32, AV_PIX_FMT_RGB32, CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
+#else
+#if CONFIG_SWSCALE_ALPHA
+YUV2RGBWRAPPER(yuv2rgb,, a32_1, AV_PIX_FMT_RGB32_1, 1)
+YUV2RGBWRAPPER(yuv2rgb,, a32, AV_PIX_FMT_RGB32, 1)
+#endif
+YUV2RGBWRAPPER(yuv2rgb,, x32_1, AV_PIX_FMT_RGB32_1, 0)
+YUV2RGBWRAPPER(yuv2rgb,, x32, AV_PIX_FMT_RGB32, 0)
+#endif
+YUV2RGBWRAPPER(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
+YUV2RGBWRAPPER(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 16, AV_PIX_FMT_RGB565, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 15, AV_PIX_FMT_RGB555, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 12, AV_PIX_FMT_RGB444, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 8, AV_PIX_FMT_RGB8, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 4, AV_PIX_FMT_RGB4, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 4b, AV_PIX_FMT_RGB4_BYTE, 0)
+
+static av_always_inline void
+yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc, int chrFilterSize,
+ const int16_t **alpSrc, uint8_t *dest,
+ int dstW, int y, enum AVPixelFormat target, int hasAlpha)
+{
+ int i;
+ int step = (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) ? 3 : 4;
+ int err[4] = {0};
+ int isrgb8 = target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8;
+
+ for (i = 0; i < dstW; i++) {
+ int j;
+ int Y = 1<<9;
+ int U = (1<<9)-(128 << 19);
+ int V = (1<<9)-(128 << 19);
+ int R, G, B, A;
+
+ for (j = 0; j < lumFilterSize; j++) {
+ Y += lumSrc[j][i] * lumFilter[j];
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ U += chrUSrc[j][i] * chrFilter[j];
+ V += chrVSrc[j][i] * chrFilter[j];
+ }
+ Y >>= 10;
+ U >>= 10;
+ V >>= 10;
+ if (hasAlpha) {
+ A = 1 << 18;
+ for (j = 0; j < lumFilterSize; j++) {
+ A += alpSrc[j][i] * lumFilter[j];
+ }
+ A >>= 19;
+ if (A & 0x100)
+ A = av_clip_uint8(A);
+ }
+ Y -= c->yuv2rgb_y_offset;
+ Y *= c->yuv2rgb_y_coeff;
+ Y += 1 << 21;
+ R = Y + V*c->yuv2rgb_v2r_coeff;
+ G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
+ B = Y + U*c->yuv2rgb_u2b_coeff;
+ if ((R | G | B) & 0xC0000000) {
+ R = av_clip_uintp2(R, 30);
+ G = av_clip_uintp2(G, 30);
+ B = av_clip_uintp2(B, 30);
+ }
+
+ switch(target) {
+ case AV_PIX_FMT_ARGB:
+ dest[0] = hasAlpha ? A : 255;
+ dest[1] = R >> 22;
+ dest[2] = G >> 22;
+ dest[3] = B >> 22;
+ break;
+ case AV_PIX_FMT_RGB24:
+ dest[0] = R >> 22;
+ dest[1] = G >> 22;
+ dest[2] = B >> 22;
+ break;
+ case AV_PIX_FMT_RGBA:
+ dest[0] = R >> 22;
+ dest[1] = G >> 22;
+ dest[2] = B >> 22;
+ dest[3] = hasAlpha ? A : 255;
+ break;
+ case AV_PIX_FMT_ABGR:
+ dest[0] = hasAlpha ? A : 255;
+ dest[1] = B >> 22;
+ dest[2] = G >> 22;
+ dest[3] = R >> 22;
+ break;
+ case AV_PIX_FMT_BGR24:
+ dest[0] = B >> 22;
+ dest[1] = G >> 22;
+ dest[2] = R >> 22;
+ break;
+ case AV_PIX_FMT_BGRA:
+ dest[0] = B >> 22;
+ dest[1] = G >> 22;
+ dest[2] = R >> 22;
+ dest[3] = hasAlpha ? A : 255;
+ break;
+ case AV_PIX_FMT_BGR4_BYTE:
+ case AV_PIX_FMT_RGB4_BYTE:
+ case AV_PIX_FMT_BGR8:
+ case AV_PIX_FMT_RGB8:
+ {
+ int r,g,b;
+ R >>= 22;
+ G >>= 22;
+ B >>= 22;
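+            /* Error diffusion: fold in 7/16 of the previous pixel's error and
+             * 1/16, 5/16 and 3/16 of the errors stored from the previous line
+             * (a Floyd-Steinberg-style kernel). */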
+ R += (7*err[0] + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2])>>4;
+ G += (7*err[1] + 1*c->dither_error[1][i] + 5*c->dither_error[1][i+1] + 3*c->dither_error[1][i+2])>>4;
+ B += (7*err[2] + 1*c->dither_error[2][i] + 5*c->dither_error[2][i+1] + 3*c->dither_error[2][i+2])>>4;
+ c->dither_error[0][i] = err[0];
+ c->dither_error[1][i] = err[1];
+ c->dither_error[2][i] = err[2];
+ r = R >> (isrgb8 ? 5 : 7);
+ g = G >> (isrgb8 ? 5 : 6);
+ b = B >> (isrgb8 ? 6 : 7);
+ r = av_clip(r, 0, isrgb8 ? 7 : 1);
+ g = av_clip(g, 0, isrgb8 ? 7 : 3);
+ b = av_clip(b, 0, isrgb8 ? 3 : 1);
+ err[0] = R - r*(isrgb8 ? 36 : 255);
+ err[1] = G - g*(isrgb8 ? 36 : 85);
+ err[2] = B - b*(isrgb8 ? 85 : 255);
+ if(target == AV_PIX_FMT_BGR4_BYTE) {
+ dest[0] = r + 2*g + 8*b;
+ } else if(target == AV_PIX_FMT_RGB4_BYTE) {
+ dest[0] = b + 2*g + 8*r;
+ } else if(target == AV_PIX_FMT_BGR8) {
+ dest[0] = r + 8*g + 64*b;
+ } else if(target == AV_PIX_FMT_RGB8) {
+ dest[0] = b + 4*g + 32*r;
+ } else
+ av_assert2(0);
+ step = 1;
+ break;}
+ }
+ dest += step;
+ }
+ c->dither_error[0][i] = err[0];
+ c->dither_error[1][i] = err[1];
+ c->dither_error[2][i] = err[2];
+}
+
+#if CONFIG_SMALL
+YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA, CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
+YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR, CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
+YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA, CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
+YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB, CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
+#else
+#if CONFIG_SWSCALE_ALPHA
+YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA, 1)
+YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR, 1)
+YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA, 1)
+YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB, 1)
+#endif
+YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
+YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
+YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
+YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
+#endif
+YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
+YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
+
+YUV2RGBWRAPPERX(yuv2, rgb_full, bgr4_byte_full, AV_PIX_FMT_BGR4_BYTE, 0)
+YUV2RGBWRAPPERX(yuv2, rgb_full, rgb4_byte_full, AV_PIX_FMT_RGB4_BYTE, 0)
+YUV2RGBWRAPPERX(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
+YUV2RGBWRAPPERX(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
+
+static void
+yuv2gbrp_full_X_c(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc, int chrFilterSize,
+ const int16_t **alpSrc, uint8_t **dest,
+ int dstW, int y)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->dstFormat);
+ int i;
+ int hasAlpha = 0;
+ uint16_t **dest16 = (uint16_t**)dest;
+ int SH = 22 + 7 - desc->comp[0].depth_minus1;
+
+ for (i = 0; i < dstW; i++) {
+ int j;
+ int Y = 1 << 9;
+ int U = (1 << 9) - (128 << 19);
+ int V = (1 << 9) - (128 << 19);
+ int R, G, B, A;
+
+ for (j = 0; j < lumFilterSize; j++)
+ Y += lumSrc[j][i] * lumFilter[j];
+
+ for (j = 0; j < chrFilterSize; j++) {
+ U += chrUSrc[j][i] * chrFilter[j];
+ V += chrVSrc[j][i] * chrFilter[j];
+ }
+
+ Y >>= 10;
+ U >>= 10;
+ V >>= 10;
+
+ if (hasAlpha) {
+ A = 1 << 18;
+
+ for (j = 0; j < lumFilterSize; j++)
+ A += alpSrc[j][i] * lumFilter[j];
+
+ A >>= 19;
+
+ if (A & 0x100)
+ A = av_clip_uint8(A);
+ }
+
+ Y -= c->yuv2rgb_y_offset;
+ Y *= c->yuv2rgb_y_coeff;
+ Y += 1 << 21;
+ R = Y + V * c->yuv2rgb_v2r_coeff;
+ G = Y + V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
+ B = Y + U * c->yuv2rgb_u2b_coeff;
+
+ if ((R | G | B) & 0xC0000000) {
+ R = av_clip_uintp2(R, 30);
+ G = av_clip_uintp2(G, 30);
+ B = av_clip_uintp2(B, 30);
+ }
+
+ if (SH != 22) {
+ dest16[0][i] = G >> SH;
+ dest16[1][i] = B >> SH;
+ dest16[2][i] = R >> SH;
+ } else {
+ dest[0][i] = G >> 22;
+ dest[1][i] = B >> 22;
+ dest[2][i] = R >> 22;
+ }
+ }
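+    /* For >8-bit planar RGB whose byte order differs from the host,
+     * byte-swap the samples that were just written. */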
+ if (SH != 22 && (!isBE(c->dstFormat)) != (!HAVE_BIGENDIAN)) {
+ for (i = 0; i < dstW; i++) {
+ dest16[0][i] = av_bswap16(dest16[0][i]);
+ dest16[1][i] = av_bswap16(dest16[1][i]);
+ dest16[2][i] = av_bswap16(dest16[2][i]);
+ }
+ }
+}
+
+av_cold void ff_sws_init_output_funcs(SwsContext *c,
+ yuv2planar1_fn *yuv2plane1,
+ yuv2planarX_fn *yuv2planeX,
+ yuv2interleavedX_fn *yuv2nv12cX,
+ yuv2packed1_fn *yuv2packed1,
+ yuv2packed2_fn *yuv2packed2,
+ yuv2packedX_fn *yuv2packedX,
+ yuv2anyX_fn *yuv2anyX)
+{
+ enum AVPixelFormat dstFormat = c->dstFormat;
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat);
+
+ if (is16BPS(dstFormat)) {
+ *yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_c : yuv2planeX_16LE_c;
+ *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_c : yuv2plane1_16LE_c;
+ } else if (is9_OR_10BPS(dstFormat)) {
+ if (desc->comp[0].depth_minus1 == 8) {
+ *yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_c : yuv2planeX_9LE_c;
+ *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_c : yuv2plane1_9LE_c;
+ } else if (desc->comp[0].depth_minus1 == 9) {
+ *yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_c : yuv2planeX_10LE_c;
+ *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_c : yuv2plane1_10LE_c;
+ } else if (desc->comp[0].depth_minus1 == 11) {
+ *yuv2planeX = isBE(dstFormat) ? yuv2planeX_12BE_c : yuv2planeX_12LE_c;
+ *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_12BE_c : yuv2plane1_12LE_c;
+ } else if (desc->comp[0].depth_minus1 == 13) {
+ *yuv2planeX = isBE(dstFormat) ? yuv2planeX_14BE_c : yuv2planeX_14LE_c;
+ *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_14BE_c : yuv2plane1_14LE_c;
+ } else
+ av_assert0(0);
+ } else {
+ *yuv2plane1 = yuv2plane1_8_c;
+ *yuv2planeX = yuv2planeX_8_c;
+ if (dstFormat == AV_PIX_FMT_NV12 || dstFormat == AV_PIX_FMT_NV21)
+ *yuv2nv12cX = yuv2nv12cX_c;
+ }
+
+ if(c->flags & SWS_FULL_CHR_H_INT) {
+ switch (dstFormat) {
+ case AV_PIX_FMT_RGBA:
+#if CONFIG_SMALL
+ *yuv2packedX = yuv2rgba32_full_X_c;
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->alpPixBuf) {
+ *yuv2packedX = yuv2rgba32_full_X_c;
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ *yuv2packedX = yuv2rgbx32_full_X_c;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_ARGB:
+#if CONFIG_SMALL
+ *yuv2packedX = yuv2argb32_full_X_c;
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->alpPixBuf) {
+ *yuv2packedX = yuv2argb32_full_X_c;
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ *yuv2packedX = yuv2xrgb32_full_X_c;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_BGRA:
+#if CONFIG_SMALL
+ *yuv2packedX = yuv2bgra32_full_X_c;
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->alpPixBuf) {
+ *yuv2packedX = yuv2bgra32_full_X_c;
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ *yuv2packedX = yuv2bgrx32_full_X_c;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_ABGR:
+#if CONFIG_SMALL
+ *yuv2packedX = yuv2abgr32_full_X_c;
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->alpPixBuf) {
+ *yuv2packedX = yuv2abgr32_full_X_c;
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ *yuv2packedX = yuv2xbgr32_full_X_c;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_RGB24:
+ *yuv2packedX = yuv2rgb24_full_X_c;
+ break;
+ case AV_PIX_FMT_BGR24:
+ *yuv2packedX = yuv2bgr24_full_X_c;
+ break;
+ case AV_PIX_FMT_BGR4_BYTE:
+ *yuv2packedX = yuv2bgr4_byte_full_X_c;
+ break;
+ case AV_PIX_FMT_RGB4_BYTE:
+ *yuv2packedX = yuv2rgb4_byte_full_X_c;
+ break;
+ case AV_PIX_FMT_BGR8:
+ *yuv2packedX = yuv2bgr8_full_X_c;
+ break;
+ case AV_PIX_FMT_RGB8:
+ *yuv2packedX = yuv2rgb8_full_X_c;
+ break;
+ case AV_PIX_FMT_GBRP:
+ case AV_PIX_FMT_GBRP9BE:
+ case AV_PIX_FMT_GBRP9LE:
+ case AV_PIX_FMT_GBRP10BE:
+ case AV_PIX_FMT_GBRP10LE:
+ case AV_PIX_FMT_GBRP12BE:
+ case AV_PIX_FMT_GBRP12LE:
+ case AV_PIX_FMT_GBRP14BE:
+ case AV_PIX_FMT_GBRP14LE:
+ case AV_PIX_FMT_GBRP16BE:
+ case AV_PIX_FMT_GBRP16LE:
+ *yuv2anyX = yuv2gbrp_full_X_c;
+ break;
+ }
+ if (!*yuv2packedX && !*yuv2anyX)
+ goto YUV_PACKED;
+ } else {
+ YUV_PACKED:
+ switch (dstFormat) {
+ case AV_PIX_FMT_RGB48LE:
+ *yuv2packed1 = yuv2rgb48le_1_c;
+ *yuv2packed2 = yuv2rgb48le_2_c;
+ *yuv2packedX = yuv2rgb48le_X_c;
+ break;
+ case AV_PIX_FMT_RGB48BE:
+ *yuv2packed1 = yuv2rgb48be_1_c;
+ *yuv2packed2 = yuv2rgb48be_2_c;
+ *yuv2packedX = yuv2rgb48be_X_c;
+ break;
+ case AV_PIX_FMT_BGR48LE:
+ *yuv2packed1 = yuv2bgr48le_1_c;
+ *yuv2packed2 = yuv2bgr48le_2_c;
+ *yuv2packedX = yuv2bgr48le_X_c;
+ break;
+ case AV_PIX_FMT_BGR48BE:
+ *yuv2packed1 = yuv2bgr48be_1_c;
+ *yuv2packed2 = yuv2bgr48be_2_c;
+ *yuv2packedX = yuv2bgr48be_X_c;
+ break;
+ case AV_PIX_FMT_RGB32:
+ case AV_PIX_FMT_BGR32:
+#if CONFIG_SMALL
+ *yuv2packed1 = yuv2rgb32_1_c;
+ *yuv2packed2 = yuv2rgb32_2_c;
+ *yuv2packedX = yuv2rgb32_X_c;
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->alpPixBuf) {
+ *yuv2packed1 = yuv2rgba32_1_c;
+ *yuv2packed2 = yuv2rgba32_2_c;
+ *yuv2packedX = yuv2rgba32_X_c;
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ *yuv2packed1 = yuv2rgbx32_1_c;
+ *yuv2packed2 = yuv2rgbx32_2_c;
+ *yuv2packedX = yuv2rgbx32_X_c;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_RGB32_1:
+ case AV_PIX_FMT_BGR32_1:
+#if CONFIG_SMALL
+ *yuv2packed1 = yuv2rgb32_1_1_c;
+ *yuv2packed2 = yuv2rgb32_1_2_c;
+ *yuv2packedX = yuv2rgb32_1_X_c;
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->alpPixBuf) {
+ *yuv2packed1 = yuv2rgba32_1_1_c;
+ *yuv2packed2 = yuv2rgba32_1_2_c;
+ *yuv2packedX = yuv2rgba32_1_X_c;
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ *yuv2packed1 = yuv2rgbx32_1_1_c;
+ *yuv2packed2 = yuv2rgbx32_1_2_c;
+ *yuv2packedX = yuv2rgbx32_1_X_c;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_RGB24:
+ *yuv2packed1 = yuv2rgb24_1_c;
+ *yuv2packed2 = yuv2rgb24_2_c;
+ *yuv2packedX = yuv2rgb24_X_c;
+ break;
+ case AV_PIX_FMT_BGR24:
+ *yuv2packed1 = yuv2bgr24_1_c;
+ *yuv2packed2 = yuv2bgr24_2_c;
+ *yuv2packedX = yuv2bgr24_X_c;
+ break;
+ case AV_PIX_FMT_RGB565LE:
+ case AV_PIX_FMT_RGB565BE:
+ case AV_PIX_FMT_BGR565LE:
+ case AV_PIX_FMT_BGR565BE:
+ *yuv2packed1 = yuv2rgb16_1_c;
+ *yuv2packed2 = yuv2rgb16_2_c;
+ *yuv2packedX = yuv2rgb16_X_c;
+ break;
+ case AV_PIX_FMT_RGB555LE:
+ case AV_PIX_FMT_RGB555BE:
+ case AV_PIX_FMT_BGR555LE:
+ case AV_PIX_FMT_BGR555BE:
+ *yuv2packed1 = yuv2rgb15_1_c;
+ *yuv2packed2 = yuv2rgb15_2_c;
+ *yuv2packedX = yuv2rgb15_X_c;
+ break;
+ case AV_PIX_FMT_RGB444LE:
+ case AV_PIX_FMT_RGB444BE:
+ case AV_PIX_FMT_BGR444LE:
+ case AV_PIX_FMT_BGR444BE:
+ *yuv2packed1 = yuv2rgb12_1_c;
+ *yuv2packed2 = yuv2rgb12_2_c;
+ *yuv2packedX = yuv2rgb12_X_c;
+ break;
+ case AV_PIX_FMT_RGB8:
+ case AV_PIX_FMT_BGR8:
+ *yuv2packed1 = yuv2rgb8_1_c;
+ *yuv2packed2 = yuv2rgb8_2_c;
+ *yuv2packedX = yuv2rgb8_X_c;
+ break;
+ case AV_PIX_FMT_RGB4:
+ case AV_PIX_FMT_BGR4:
+ *yuv2packed1 = yuv2rgb4_1_c;
+ *yuv2packed2 = yuv2rgb4_2_c;
+ *yuv2packedX = yuv2rgb4_X_c;
+ break;
+ case AV_PIX_FMT_RGB4_BYTE:
+ case AV_PIX_FMT_BGR4_BYTE:
+ *yuv2packed1 = yuv2rgb4b_1_c;
+ *yuv2packed2 = yuv2rgb4b_2_c;
+ *yuv2packedX = yuv2rgb4b_X_c;
+ break;
+ }
+ }
+ switch (dstFormat) {
+ case AV_PIX_FMT_MONOWHITE:
+ *yuv2packed1 = yuv2monowhite_1_c;
+ *yuv2packed2 = yuv2monowhite_2_c;
+ *yuv2packedX = yuv2monowhite_X_c;
+ break;
+ case AV_PIX_FMT_MONOBLACK:
+ *yuv2packed1 = yuv2monoblack_1_c;
+ *yuv2packed2 = yuv2monoblack_2_c;
+ *yuv2packedX = yuv2monoblack_X_c;
+ break;
+ case AV_PIX_FMT_YUYV422:
+ *yuv2packed1 = yuv2yuyv422_1_c;
+ *yuv2packed2 = yuv2yuyv422_2_c;
+ *yuv2packedX = yuv2yuyv422_X_c;
+ break;
+ case AV_PIX_FMT_UYVY422:
+ *yuv2packed1 = yuv2uyvy422_1_c;
+ *yuv2packed2 = yuv2uyvy422_2_c;
+ *yuv2packedX = yuv2uyvy422_X_c;
+ break;
+ }
+}
diff --git a/ffmpeg/libswscale/ppc/Makefile b/ffmpeg/libswscale/ppc/Makefile
new file mode 100644
index 0000000..018955b
--- /dev/null
+++ b/ffmpeg/libswscale/ppc/Makefile
@@ -0,0 +1,3 @@
+ALTIVEC-OBJS += ppc/swscale_altivec.o \
+ ppc/yuv2rgb_altivec.o \
+ ppc/yuv2yuv_altivec.o \
diff --git a/ffmpeg/libswscale/ppc/swscale_altivec.c b/ffmpeg/libswscale/ppc/swscale_altivec.c
new file mode 100644
index 0000000..9ca2868
--- /dev/null
+++ b/ffmpeg/libswscale/ppc/swscale_altivec.c
@@ -0,0 +1,328 @@
+/*
+ * AltiVec-enhanced yuv2yuvX
+ *
+ * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
+ * based on the equivalent C code in swscale.c
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <inttypes.h>
+
+#include "config.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "yuv2rgb_altivec.h"
+
+#define vzero vec_splat_s32(0)
+
+#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do { \
+ vector signed short l2 = vec_ld(((x) << 1) + 16, src); \
+ vector signed short ls = vec_perm(l1, l2, perm); \
+ vector signed int i1 = vec_mule(filter, ls); \
+ vector signed int i2 = vec_mulo(filter, ls); \
+ vector signed int vf1 = vec_mergeh(i1, i2); \
+ vector signed int vf2 = vec_mergel(i1, i2); \
+ d1 = vec_add(d1, vf1); \
+ d2 = vec_add(d2, vf2); \
+ l1 = l2; \
+ } while (0)
+
+static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest,
+ const uint8_t *dither, int offset, int x)
+{
+ register int i, j;
+ DECLARE_ALIGNED(16, int, val)[16];
+ vector signed int vo1, vo2, vo3, vo4;
+ vector unsigned short vs1, vs2;
+ vector unsigned char vf;
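+    /* vec_splat_u32() only takes a 5-bit immediate, so the shift count of 19
+     * is built as 10 + 9. */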
+ vector unsigned int altivec_vectorShiftInt19 =
+ vec_add(vec_splat_u32(10), vec_splat_u32(9));
+
+ for (i = 0; i < 16; i++)
+ val[i] = dither[(x + i + offset) & 7] << 12;
+
+ vo1 = vec_ld(0, val);
+ vo2 = vec_ld(16, val);
+ vo3 = vec_ld(32, val);
+ vo4 = vec_ld(48, val);
+
+ for (j = 0; j < filterSize; j++) {
+ vector signed short l1, vLumFilter = vec_ld(j << 1, filter);
+ vector unsigned char perm, perm0 = vec_lvsl(j << 1, filter);
+ vLumFilter = vec_perm(vLumFilter, vLumFilter, perm0);
+ vLumFilter = vec_splat(vLumFilter, 0); // lumFilter[j] is loaded 8 times in vLumFilter
+
+ perm = vec_lvsl(x << 1, src[j]);
+ l1 = vec_ld(x << 1, src[j]);
+
+ yuv2planeX_8(vo1, vo2, l1, src[j], x, perm, vLumFilter);
+ yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
+ }
+
+ vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
+ vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
+ vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
+ vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
+ vs1 = vec_packsu(vo1, vo2);
+ vs2 = vec_packsu(vo3, vo4);
+ vf = vec_packsu(vs1, vs2);
+ vec_st(vf, 0, dest);
+}
+
+static inline void yuv2planeX_u(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset, int x)
+{
+ int i, j;
+
+ for (i = x; i < dstW; i++) {
+ int t = dither[(i + offset) & 7] << 12;
+ for (j = 0; j < filterSize; j++)
+ t += src[j][i] * filter[j];
+ dest[i] = av_clip_uint8(t >> 19);
+ }
+}
+
+static void yuv2planeX_altivec(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset)
+{
+ int dst_u = -(uintptr_t)dest & 15;
+ int i;
+
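+    /* Scalar code handles the unaligned head and the tail; the aligned middle
+     * is processed 16 pixels at a time with AltiVec. */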
+ yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
+
+ for (i = dst_u; i < dstW - 15; i += 16)
+ yuv2planeX_16_altivec(filter, filterSize, src, dest + i, dither,
+ offset, i);
+
+ yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
+}
+
+static void hScale_altivec_real(SwsContext *c, int16_t *dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ register int i;
+ DECLARE_ALIGNED(16, int, tempo)[4];
+
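+    /* Filter sizes that are not a multiple of 4 use the plain C loop below;
+     * sizes 4, 8 and 16 have dedicated AltiVec paths, and other multiples of
+     * 4 take the generic vector loop in the default case. */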
+ if (filterSize % 4) {
+ for (i = 0; i < dstW; i++) {
+ register int j;
+ register int srcPos = filterPos[i];
+ register int val = 0;
+ for (j = 0; j < filterSize; j++)
+ val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
+ dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
+ }
+ } else
+ switch (filterSize) {
+ case 4:
+ for (i = 0; i < dstW; i++) {
+ register int srcPos = filterPos[i];
+
+ vector unsigned char src_v0 = vec_ld(srcPos, src);
+ vector unsigned char src_v1, src_vF;
+ vector signed short src_v, filter_v;
+ vector signed int val_vEven, val_s;
+ if ((((uintptr_t)src + srcPos) % 16) > 12) {
+ src_v1 = vec_ld(srcPos + 16, src);
+ }
+ src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));
+
+ src_v = // vec_unpackh sign-extends...
+ (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
+ // now put our elements in the even slots
+ src_v = vec_mergeh(src_v, (vector signed short)vzero);
+
+ filter_v = vec_ld(i << 3, filter);
+ // The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2).
+
+ // The neat trick: We only care for half the elements,
+ // high or low depending on (i<<3)%16 (it's 0 or 8 here),
+ // and we're going to use vec_mule, so we choose
+ // carefully how to "unpack" the elements into the even slots.
+ if ((i << 3) % 16)
+ filter_v = vec_mergel(filter_v, (vector signed short)vzero);
+ else
+ filter_v = vec_mergeh(filter_v, (vector signed short)vzero);
+
+ val_vEven = vec_mule(src_v, filter_v);
+ val_s = vec_sums(val_vEven, vzero);
+ vec_st(val_s, 0, tempo);
+ dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
+ }
+ break;
+
+ case 8:
+ for (i = 0; i < dstW; i++) {
+ register int srcPos = filterPos[i];
+
+ vector unsigned char src_v0 = vec_ld(srcPos, src);
+ vector unsigned char src_v1, src_vF;
+ vector signed short src_v, filter_v;
+ vector signed int val_v, val_s;
+ if ((((uintptr_t)src + srcPos) % 16) > 8) {
+ src_v1 = vec_ld(srcPos + 16, src);
+ }
+ src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));
+
+ src_v = // vec_unpackh sign-extends...
+ (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
+ filter_v = vec_ld(i << 4, filter);
+ // the 4 above is 3 (filterSize == 8) + 1 (sizeof(short) == 2)
+
+ val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
+ val_s = vec_sums(val_v, vzero);
+ vec_st(val_s, 0, tempo);
+ dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
+ }
+ break;
+
+ case 16:
+ for (i = 0; i < dstW; i++) {
+ register int srcPos = filterPos[i];
+
+ vector unsigned char src_v0 = vec_ld(srcPos, src);
+ vector unsigned char src_v1 = vec_ld(srcPos + 16, src);
+ vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));
+
+ vector signed short src_vA = // vec_unpackh sign-extends...
+ (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
+ vector signed short src_vB = // vec_unpackh sign-extends...
+ (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));
+
+ vector signed short filter_v0 = vec_ld(i << 5, filter);
+ vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
+ // the 5 above are 4 (filterSize == 16) + 1 (sizeof(short) == 2)
+
+ vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
+ vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc);
+
+ vector signed int val_s = vec_sums(val_v, vzero);
+
+ vec_st(val_s, 0, tempo);
+ dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
+ }
+ break;
+
+ default:
+ for (i = 0; i < dstW; i++) {
+ register int j;
+ register int srcPos = filterPos[i];
+
+ vector signed int val_s, val_v = (vector signed int)vzero;
+ vector signed short filter_v0R = vec_ld(i * 2 * filterSize, filter);
+ vector unsigned char permF = vec_lvsl((i * 2 * filterSize), filter);
+
+ vector unsigned char src_v0 = vec_ld(srcPos, src);
+ vector unsigned char permS = vec_lvsl(srcPos, src);
+
+ for (j = 0; j < filterSize - 15; j += 16) {
+ vector unsigned char src_v1 = vec_ld(srcPos + j + 16, src);
+ vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS);
+
+ vector signed short src_vA = // vec_unpackh sign-extends...
+ (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
+ vector signed short src_vB = // vec_unpackh sign-extends...
+ (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));
+
+ vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
+ vector signed short filter_v2R = vec_ld((i * 2 * filterSize) + (j * 2) + 32, filter);
+ vector signed short filter_v0 = vec_perm(filter_v0R, filter_v1R, permF);
+ vector signed short filter_v1 = vec_perm(filter_v1R, filter_v2R, permF);
+
+ vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v);
+ val_v = vec_msums(src_vB, filter_v1, val_acc);
+
+ filter_v0R = filter_v2R;
+ src_v0 = src_v1;
+ }
+
+ if (j < filterSize - 7) {
+ // loading src_v0 is useless, it's already done above
+ // vector unsigned char src_v0 = vec_ld(srcPos + j, src);
+ vector unsigned char src_v1, src_vF;
+ vector signed short src_v, filter_v1R, filter_v;
+ if ((((uintptr_t)src + srcPos) % 16) > 8) {
+ src_v1 = vec_ld(srcPos + j + 16, src);
+ }
+ src_vF = vec_perm(src_v0, src_v1, permS);
+
+ src_v = // vec_unpackh sign-extends...
+ (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
+ // loading filter_v0R is useless, it's already done above
+ // vector signed short filter_v0R = vec_ld((i * 2 * filterSize) + j, filter);
+ filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
+ filter_v = vec_perm(filter_v0R, filter_v1R, permF);
+
+ val_v = vec_msums(src_v, filter_v, val_v);
+ }
+
+ val_s = vec_sums(val_v, vzero);
+
+ vec_st(val_s, 0, tempo);
+ dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
+ }
+ }
+}
+
+av_cold void ff_sws_init_swScale_altivec(SwsContext *c)
+{
+ enum AVPixelFormat dstFormat = c->dstFormat;
+
+ if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
+ return;
+
+ if (c->srcBpc == 8 && c->dstBpc <= 14) {
+ c->hyScale = c->hcScale = hScale_altivec_real;
+ }
+ if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
+ dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21 &&
+ !c->alpPixBuf) {
+ c->yuv2planeX = yuv2planeX_altivec;
+ }
+
+ /* The following list of supported dstFormat values should
+ * match what's found in the body of ff_yuv2packedX_altivec() */
+ if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->alpPixBuf) {
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_ABGR:
+ c->yuv2packedX = ff_yuv2abgr_X_altivec;
+ break;
+ case AV_PIX_FMT_BGRA:
+ c->yuv2packedX = ff_yuv2bgra_X_altivec;
+ break;
+ case AV_PIX_FMT_ARGB:
+ c->yuv2packedX = ff_yuv2argb_X_altivec;
+ break;
+ case AV_PIX_FMT_RGBA:
+ c->yuv2packedX = ff_yuv2rgba_X_altivec;
+ break;
+ case AV_PIX_FMT_BGR24:
+ c->yuv2packedX = ff_yuv2bgr24_X_altivec;
+ break;
+ case AV_PIX_FMT_RGB24:
+ c->yuv2packedX = ff_yuv2rgb24_X_altivec;
+ break;
+ }
+ }
+}
diff --git a/ffmpeg/libswscale/ppc/yuv2rgb_altivec.c b/ffmpeg/libswscale/ppc/yuv2rgb_altivec.c
new file mode 100644
index 0000000..a8501d9
--- /dev/null
+++ b/ffmpeg/libswscale/ppc/yuv2rgb_altivec.c
@@ -0,0 +1,854 @@
+/*
+ * AltiVec acceleration for colorspace conversion
+ *
+ * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * Convert I420/YV12 to RGB in various formats.
+ * It rejects images that are not in a 420 format,
+ * images whose width is not a multiple of 16,
+ * and images whose height is not a multiple of 2.
+ * Rejected images defer to the C simulation code.
+ *
+ * Lots of optimizations to be done here.
+ *
+ * 1. Need to fix saturation code. I just couldn't get it to fly with packs
+ * and adds, so we currently use max/min to clip.
+ *
+ * 2. The inefficient use of chroma loading needs a bit of brushing up.
+ *
+ * 3. Analysis of pipeline stalls needs to be done. Use shark to identify
+ * pipeline stalls.
+ *
+ *
+ * MODIFIED to calculate coeffs from currently selected color space.
+ * MODIFIED core to be a macro where you specify the output format.
+ * ADDED UYVY conversion, which is never called due to something in swscale.
+ * CORRECTED algorithm selection to be strict on input formats.
+ * ADDED runtime detection of AltiVec.
+ *
+ * ADDED altivec_yuv2packedX vertical scl + RGB converter
+ *
+ * March 27,2004
+ * PERFORMANCE ANALYSIS
+ *
+ * The C version uses 25% of the processor, or ~250 Mips, for the D1 rawvideo
+ * used as the test.
+ * The AltiVec version uses 10% of the processor, or ~100 Mips, for the same
+ * D1 sequence.
+ *
+ * 720 * 480 * 30 ~10MPS
+ *
+ * so we have roughly 10 clocks per pixel. This is too high, something has
+ * to be wrong.
+ *
+ * OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
+ * need for vec_min.
+ *
+ * OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to
+ * have the input video frame; it was just decompressed, so it probably resides
+ * in L1 caches. However, we are creating the output video stream. This needs
+ * to use the DSTST instruction to optimize for the cache. We couple this with
+ * the fact that we are not going to be visiting the input buffer again so we
+ * mark it Least Recently Used. This shaves 25% of the processor cycles off.
+ *
+ * Now memcpy is the largest mips consumer in the system, probably due
+ * to the inefficient X11 stuff.
+ *
+ * GL libraries seem to be very slow on this machine, a 1.33GHz PB running
+ * Jaguar; this is not the case for my 1GHz PB. I thought it might be
+ * a versioning issue, however I have libGL.1.2.dylib on both
+ * machines. (We need to figure this out now.)
+ *
+ * GL2 libraries work now with patch for RGB32.
+ *
+ * NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
+ *
+ * Integrated luma prescaling adjustment for saturation/contrast/brightness
+ * adjustment.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <assert.h>
+
+#include "config.h"
+#include "libswscale/rgb2rgb.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/pixdesc.h"
+#include "yuv2rgb_altivec.h"
+
+#undef PROFILE_THE_BEAST
+#undef INC_SCALING
+
+typedef unsigned char ubyte;
+typedef signed char sbyte;
+
+/* RGB interleaver, 16 planar pels 8-bit samples per channel in
+ * homogeneous vector registers x0,x1,x2 are interleaved with the
+ * following technique:
+ *
+ * o0 = vec_mergeh(x0, x1);
+ * o1 = vec_perm(o0, x2, perm_rgb_0);
+ * o2 = vec_perm(o0, x2, perm_rgb_1);
+ * o3 = vec_mergel(x0, x1);
+ * o4 = vec_perm(o3, o2, perm_rgb_2);
+ * o5 = vec_perm(o3, o2, perm_rgb_3);
+ *
+ * perm_rgb_0: o0(RG).h v1(B) --> o1*
+ * 0 1 2 3 4
+ * rgbr|gbrg|brgb|rgbr
+ * 0010 0100 1001 0010
+ * 0102 3145 2673 894A
+ *
+ * perm_rgb_1: o0(RG).h v1(B) --> o2
+ * 0 1 2 3 4
+ * gbrg|brgb|bbbb|bbbb
+ * 0100 1001 1111 1111
+ * B5CD 6EF7 89AB CDEF
+ *
+ * perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
+ * 0 1 2 3 4
+ * gbrg|brgb|rgbr|gbrg
+ * 1111 1111 0010 0100
+ * 89AB CDEF 0182 3945
+ *
+ * perm_rgb_3: o3(RG).l o2(rgbB.l) --> o5*
+ * 0 1 2 3 4
+ * brgb|rgbr|gbrg|brgb
+ * 1001 0010 0100 1001
+ * a67b 89cA BdCD eEFf
+ *
+ */
+static const vector unsigned char
+ perm_rgb_0 = { 0x00, 0x01, 0x10, 0x02, 0x03, 0x11, 0x04, 0x05,
+ 0x12, 0x06, 0x07, 0x13, 0x08, 0x09, 0x14, 0x0a },
+ perm_rgb_1 = { 0x0b, 0x15, 0x0c, 0x0d, 0x16, 0x0e, 0x0f, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f },
+ perm_rgb_2 = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x00, 0x01, 0x18, 0x02, 0x03, 0x19, 0x04, 0x05 },
+ perm_rgb_3 = { 0x1a, 0x06, 0x07, 0x1b, 0x08, 0x09, 0x1c, 0x0a,
+ 0x0b, 0x1d, 0x0c, 0x0d, 0x1e, 0x0e, 0x0f, 0x1f };
+
+#define vec_merge3(x2, x1, x0, y0, y1, y2) \
+ do { \
+ __typeof__(x0) o0, o2, o3; \
+ o0 = vec_mergeh(x0, x1); \
+ y0 = vec_perm(o0, x2, perm_rgb_0); \
+ o2 = vec_perm(o0, x2, perm_rgb_1); \
+ o3 = vec_mergel(x0, x1); \
+ y1 = vec_perm(o3, o2, perm_rgb_2); \
+ y2 = vec_perm(o3, o2, perm_rgb_3); \
+ } while (0)
+
+#define vec_mstbgr24(x0, x1, x2, ptr) \
+ do { \
+ __typeof__(x0) _0, _1, _2; \
+ vec_merge3(x0, x1, x2, _0, _1, _2); \
+ vec_st(_0, 0, ptr++); \
+ vec_st(_1, 0, ptr++); \
+ vec_st(_2, 0, ptr++); \
+ } while (0)
+
+#define vec_mstrgb24(x0, x1, x2, ptr) \
+ do { \
+ __typeof__(x0) _0, _1, _2; \
+ vec_merge3(x2, x1, x0, _0, _1, _2); \
+ vec_st(_0, 0, ptr++); \
+ vec_st(_1, 0, ptr++); \
+ vec_st(_2, 0, ptr++); \
+ } while (0)
+
+/* pack the pixels in rgb0 format
+ * msb R
+ * lsb 0
+ */
+#define vec_mstrgb32(T, x0, x1, x2, x3, ptr) \
+ do { \
+ T _0, _1, _2, _3; \
+ _0 = vec_mergeh(x0, x1); \
+ _1 = vec_mergeh(x2, x3); \
+ _2 = (T) vec_mergeh((vector unsigned short) _0, \
+ (vector unsigned short) _1); \
+ _3 = (T) vec_mergel((vector unsigned short) _0, \
+ (vector unsigned short) _1); \
+ vec_st(_2, 0 * 16, (T *) ptr); \
+ vec_st(_3, 1 * 16, (T *) ptr); \
+ _0 = vec_mergel(x0, x1); \
+ _1 = vec_mergel(x2, x3); \
+ _2 = (T) vec_mergeh((vector unsigned short) _0, \
+ (vector unsigned short) _1); \
+ _3 = (T) vec_mergel((vector unsigned short) _0, \
+ (vector unsigned short) _1); \
+ vec_st(_2, 2 * 16, (T *) ptr); \
+ vec_st(_3, 3 * 16, (T *) ptr); \
+ ptr += 4; \
+ } while (0)
+
+/*
+ *  1   0       1.4021   | | Y |
+ *  1  -0.3441  -0.7142  |x| Cb|
+ *  1   1.7718   0       | | Cr|
+ *
+ *
+ * Y: [-128 127]
+ * Cb/Cr : [-128 127]
+ *
+ * Typical YUV conversion works on Y: 0-255; this version has been
+ * optimized for JPEG decoding.
+ */
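+
+/*
+ * For reference, a scalar sketch of the per-pixel arithmetic the vector code
+ * below implements (Y in 0..255, Cb/Cr stored with a +128 bias, coefficients
+ * from the matrix above; the helper name is illustrative only and is not part
+ * of libswscale):
+ *
+ *     static void yuv_pixel_to_rgb(int y, int cb, int cr,
+ *                                  uint8_t *r, uint8_t *g, uint8_t *b)
+ *     {
+ *         cb -= 128;  // remove the chroma bias
+ *         cr -= 128;
+ *         *r = av_clip_uint8((int) (y + 1.4021 * cr));
+ *         *g = av_clip_uint8((int) (y - 0.3441 * cb - 0.7142 * cr));
+ *         *b = av_clip_uint8((int) (y + 1.7718 * cb));
+ *     }
+ */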
+
+#define vec_unh(x) \
+ (vector signed short) \
+ vec_perm(x, (__typeof__(x)) { 0 }, \
+ ((vector unsigned char) { \
+ 0x10, 0x00, 0x10, 0x01, 0x10, 0x02, 0x10, 0x03, \
+ 0x10, 0x04, 0x10, 0x05, 0x10, 0x06, 0x10, 0x07 }))
+
+#define vec_unl(x) \
+ (vector signed short) \
+ vec_perm(x, (__typeof__(x)) { 0 }, \
+ ((vector unsigned char) { \
+ 0x10, 0x08, 0x10, 0x09, 0x10, 0x0A, 0x10, 0x0B, \
+ 0x10, 0x0C, 0x10, 0x0D, 0x10, 0x0E, 0x10, 0x0F }))
+
+#define vec_clip_s16(x) \
+ vec_max(vec_min(x, ((vector signed short) { \
+ 235, 235, 235, 235, 235, 235, 235, 235 })), \
+ ((vector signed short) { 16, 16, 16, 16, 16, 16, 16, 16 }))
+
+#define vec_packclp(x, y) \
+ (vector unsigned char) \
+ vec_packs((vector unsigned short) \
+ vec_max(x, ((vector signed short) { 0 })), \
+ (vector unsigned short) \
+ vec_max(y, ((vector signed short) { 0 })))
+
+//#define out_pixels(a, b, c, ptr) vec_mstrgb32(__typeof__(a), ((__typeof__(a)) { 255 }), a, a, a, ptr)
+
+static inline void cvtyuvtoRGB(SwsContext *c, vector signed short Y,
+ vector signed short U, vector signed short V,
+ vector signed short *R, vector signed short *G,
+ vector signed short *B)
+{
+ vector signed short vx, ux, uvx;
+
+ Y = vec_mradds(Y, c->CY, c->OY);
+ U = vec_sub(U, (vector signed short)
+ vec_splat((vector signed short) { 128 }, 0));
+ V = vec_sub(V, (vector signed short)
+ vec_splat((vector signed short) { 128 }, 0));
+
+ // ux = (CBU * (u << c->CSHIFT) + 0x4000) >> 15;
+ ux = vec_sl(U, c->CSHIFT);
+ *B = vec_mradds(ux, c->CBU, Y);
+
+ // vx = (CRV * (v << c->CSHIFT) + 0x4000) >> 15;
+ vx = vec_sl(V, c->CSHIFT);
+ *R = vec_mradds(vx, c->CRV, Y);
+
+ // uvx = ((CGU * u) + (CGV * v)) >> 15;
+ uvx = vec_mradds(U, c->CGU, Y);
+ *G = vec_mradds(V, c->CGV, uvx);
+}
+
+/*
+ * ------------------------------------------------------------------------------
+ * CS converters
+ * ------------------------------------------------------------------------------
+ */
+
+#define DEFCSP420_CVT(name, out_pixels) \
+static int altivec_ ## name(SwsContext *c, const unsigned char **in, \
+ int *instrides, int srcSliceY, int srcSliceH, \
+ unsigned char **oplanes, int *outstrides) \
+{ \
+ int w = c->srcW; \
+ int h = srcSliceH; \
+ int i, j; \
+ int instrides_scl[3]; \
+ vector unsigned char y0, y1; \
+ \
+ vector signed char u, v; \
+ \
+ vector signed short Y0, Y1, Y2, Y3; \
+ vector signed short U, V; \
+ vector signed short vx, ux, uvx; \
+ vector signed short vx0, ux0, uvx0; \
+ vector signed short vx1, ux1, uvx1; \
+ vector signed short R0, G0, B0; \
+ vector signed short R1, G1, B1; \
+ vector unsigned char R, G, B; \
+ \
+ const vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \
+ vector unsigned char align_perm; \
+ \
+ vector signed short lCY = c->CY; \
+ vector signed short lOY = c->OY; \
+ vector signed short lCRV = c->CRV; \
+ vector signed short lCBU = c->CBU; \
+ vector signed short lCGU = c->CGU; \
+ vector signed short lCGV = c->CGV; \
+ vector unsigned short lCSHIFT = c->CSHIFT; \
+ \
+ const ubyte *y1i = in[0]; \
+ const ubyte *y2i = in[0] + instrides[0]; \
+ const ubyte *ui = in[1]; \
+ const ubyte *vi = in[2]; \
+ \
+ vector unsigned char *oute, *outo; \
+ \
+ /* loop moves y{1, 2}i by w */ \
+ instrides_scl[0] = instrides[0] * 2 - w; \
+ /* loop moves ui by w / 2 */ \
+ instrides_scl[1] = instrides[1] - w / 2; \
+ /* loop moves vi by w / 2 */ \
+ instrides_scl[2] = instrides[2] - w / 2; \
+ \
+ for (i = 0; i < h / 2; i++) { \
+ oute = (vector unsigned char *)(oplanes[0] + outstrides[0] * \
+ (srcSliceY + i * 2)); \
+ outo = oute + (outstrides[0] >> 4); \
+ vec_dstst(outo, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 0); \
+ vec_dstst(oute, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 1); \
+ \
+ for (j = 0; j < w / 16; j++) { \
+ y1ivP = (const vector unsigned char *) y1i; \
+ y2ivP = (const vector unsigned char *) y2i; \
+ uivP = (const vector unsigned char *) ui; \
+ vivP = (const vector unsigned char *) vi; \
+ \
+ align_perm = vec_lvsl(0, y1i); \
+ y0 = (vector unsigned char) \
+ vec_perm(y1ivP[0], y1ivP[1], align_perm); \
+ \
+ align_perm = vec_lvsl(0, y2i); \
+ y1 = (vector unsigned char) \
+ vec_perm(y2ivP[0], y2ivP[1], align_perm); \
+ \
+ align_perm = vec_lvsl(0, ui); \
+ u = (vector signed char) \
+ vec_perm(uivP[0], uivP[1], align_perm); \
+ \
+ align_perm = vec_lvsl(0, vi); \
+ v = (vector signed char) \
+ vec_perm(vivP[0], vivP[1], align_perm); \
+ \
+ u = (vector signed char) \
+ vec_sub(u, \
+ (vector signed char) \
+ vec_splat((vector signed char) { 128 }, 0)); \
+ v = (vector signed char) \
+ vec_sub(v, \
+ (vector signed char) \
+ vec_splat((vector signed char) { 128 }, 0)); \
+ \
+ U = vec_unpackh(u); \
+ V = vec_unpackh(v); \
+ \
+ Y0 = vec_unh(y0); \
+ Y1 = vec_unl(y0); \
+ Y2 = vec_unh(y1); \
+ Y3 = vec_unl(y1); \
+ \
+ Y0 = vec_mradds(Y0, lCY, lOY); \
+ Y1 = vec_mradds(Y1, lCY, lOY); \
+ Y2 = vec_mradds(Y2, lCY, lOY); \
+ Y3 = vec_mradds(Y3, lCY, lOY); \
+ \
+ /* ux = (CBU * (u << CSHIFT) + 0x4000) >> 15 */ \
+ ux = vec_sl(U, lCSHIFT); \
+ ux = vec_mradds(ux, lCBU, (vector signed short) { 0 }); \
+ ux0 = vec_mergeh(ux, ux); \
+ ux1 = vec_mergel(ux, ux); \
+ \
+ /* vx = (CRV * (v << CSHIFT) + 0x4000) >> 15; */ \
+ vx = vec_sl(V, lCSHIFT); \
+ vx = vec_mradds(vx, lCRV, (vector signed short) { 0 }); \
+ vx0 = vec_mergeh(vx, vx); \
+ vx1 = vec_mergel(vx, vx); \
+ \
+ /* uvx = ((CGU * u) + (CGV * v)) >> 15 */ \
+ uvx = vec_mradds(U, lCGU, (vector signed short) { 0 }); \
+ uvx = vec_mradds(V, lCGV, uvx); \
+ uvx0 = vec_mergeh(uvx, uvx); \
+ uvx1 = vec_mergel(uvx, uvx); \
+ \
+ R0 = vec_add(Y0, vx0); \
+ G0 = vec_add(Y0, uvx0); \
+ B0 = vec_add(Y0, ux0); \
+ R1 = vec_add(Y1, vx1); \
+ G1 = vec_add(Y1, uvx1); \
+ B1 = vec_add(Y1, ux1); \
+ \
+ R = vec_packclp(R0, R1); \
+ G = vec_packclp(G0, G1); \
+ B = vec_packclp(B0, B1); \
+ \
+ out_pixels(R, G, B, oute); \
+ \
+ R0 = vec_add(Y2, vx0); \
+ G0 = vec_add(Y2, uvx0); \
+ B0 = vec_add(Y2, ux0); \
+ R1 = vec_add(Y3, vx1); \
+ G1 = vec_add(Y3, uvx1); \
+ B1 = vec_add(Y3, ux1); \
+ R = vec_packclp(R0, R1); \
+ G = vec_packclp(G0, G1); \
+ B = vec_packclp(B0, B1); \
+ \
+ \
+ out_pixels(R, G, B, outo); \
+ \
+ y1i += 16; \
+ y2i += 16; \
+ ui += 8; \
+ vi += 8; \
+ } \
+ \
+ ui += instrides_scl[1]; \
+ vi += instrides_scl[2]; \
+ y1i += instrides_scl[0]; \
+ y2i += instrides_scl[0]; \
+ } \
+ return srcSliceH; \
+}
+
+#define out_abgr(a, b, c, ptr) \
+ vec_mstrgb32(__typeof__(a), ((__typeof__(a)) { 255 }), c, b, a, ptr)
+#define out_bgra(a, b, c, ptr) \
+ vec_mstrgb32(__typeof__(a), c, b, a, ((__typeof__(a)) { 255 }), ptr)
+#define out_rgba(a, b, c, ptr) \
+ vec_mstrgb32(__typeof__(a), a, b, c, ((__typeof__(a)) { 255 }), ptr)
+#define out_argb(a, b, c, ptr) \
+ vec_mstrgb32(__typeof__(a), ((__typeof__(a)) { 255 }), a, b, c, ptr)
+#define out_rgb24(a, b, c, ptr) vec_mstrgb24(a, b, c, ptr)
+#define out_bgr24(a, b, c, ptr) vec_mstbgr24(a, b, c, ptr)
+
+DEFCSP420_CVT(yuv2_abgr, out_abgr)
+DEFCSP420_CVT(yuv2_bgra, out_bgra)
+DEFCSP420_CVT(yuv2_rgba, out_rgba)
+DEFCSP420_CVT(yuv2_argb, out_argb)
+DEFCSP420_CVT(yuv2_rgb24, out_rgb24)
+DEFCSP420_CVT(yuv2_bgr24, out_bgr24)
+
+// uyvy|uyvy|uyvy|uyvy
+// 0123 4567 89ab cdef
+static const vector unsigned char
+ demux_u = { 0x10, 0x00, 0x10, 0x00,
+ 0x10, 0x04, 0x10, 0x04,
+ 0x10, 0x08, 0x10, 0x08,
+ 0x10, 0x0c, 0x10, 0x0c },
+ demux_v = { 0x10, 0x02, 0x10, 0x02,
+ 0x10, 0x06, 0x10, 0x06,
+ 0x10, 0x0A, 0x10, 0x0A,
+ 0x10, 0x0E, 0x10, 0x0E },
+ demux_y = { 0x10, 0x01, 0x10, 0x03,
+ 0x10, 0x05, 0x10, 0x07,
+ 0x10, 0x09, 0x10, 0x0B,
+ 0x10, 0x0D, 0x10, 0x0F };
+
+/*
+ * this is so I can play live CCIR raw video
+ */
+static int altivec_uyvy_rgb32(SwsContext *c, const unsigned char **in,
+ int *instrides, int srcSliceY, int srcSliceH,
+ unsigned char **oplanes, int *outstrides)
+{
+ int w = c->srcW;
+ int h = srcSliceH;
+ int i, j;
+ vector unsigned char uyvy;
+ vector signed short Y, U, V;
+ vector signed short R0, G0, B0, R1, G1, B1;
+ vector unsigned char R, G, B;
+ vector unsigned char *out;
+ const ubyte *img;
+
+ img = in[0];
+ out = (vector unsigned char *) (oplanes[0] + srcSliceY * outstrides[0]);
+
+ for (i = 0; i < h; i++)
+ for (j = 0; j < w / 16; j++) {
+ uyvy = vec_ld(0, img);
+
+ U = (vector signed short)
+ vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
+ V = (vector signed short)
+ vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
+ Y = (vector signed short)
+ vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
+
+ cvtyuvtoRGB(c, Y, U, V, &R0, &G0, &B0);
+
+ uyvy = vec_ld(16, img);
+
+ U = (vector signed short)
+ vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
+ V = (vector signed short)
+ vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
+ Y = (vector signed short)
+ vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
+
+ cvtyuvtoRGB(c, Y, U, V, &R1, &G1, &B1);
+
+ R = vec_packclp(R0, R1);
+ G = vec_packclp(G0, G1);
+ B = vec_packclp(B0, B1);
+
+ // vec_mstbgr24 (R,G,B, out);
+ out_rgba(R, G, B, out);
+
+ img += 32;
+ }
+ return srcSliceH;
+}
+
+/* OK, currently the acceleration routine only supports
+ * inputs whose width is a multiple of 16
+ * and whose height is a multiple of 2.
+ *
+ * Anything else falls back to the C code.
+ */
+av_cold SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
+{
+ if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
+ return NULL;
+
+ /*
+     * This seems not to matter too much; I tried a bunch of
+     * videos with abnormal widths and MPlayer crashed elsewhere:
+     * mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
+     * boom with X11 bad match.
+ *
+ */
+ if ((c->srcW & 0xf) != 0)
+ return NULL;
+
+ switch (c->srcFormat) {
+ case AV_PIX_FMT_YUV410P:
+ case AV_PIX_FMT_YUV420P:
+ /*case IMGFMT_CLPL: ??? */
+ case AV_PIX_FMT_GRAY8:
+ case AV_PIX_FMT_NV12:
+ case AV_PIX_FMT_NV21:
+ if ((c->srcH & 0x1) != 0)
+ return NULL;
+
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_RGB24:
+ av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
+ return altivec_yuv2_rgb24;
+ case AV_PIX_FMT_BGR24:
+ av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
+ return altivec_yuv2_bgr24;
+ case AV_PIX_FMT_ARGB:
+ av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
+ return altivec_yuv2_argb;
+ case AV_PIX_FMT_ABGR:
+ av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
+ return altivec_yuv2_abgr;
+ case AV_PIX_FMT_RGBA:
+ av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
+ return altivec_yuv2_rgba;
+ case AV_PIX_FMT_BGRA:
+ av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
+ return altivec_yuv2_bgra;
+ default: return NULL;
+ }
+ break;
+
+ case AV_PIX_FMT_UYVY422:
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_BGR32:
+ av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
+ return altivec_uyvy_rgb32;
+ default: return NULL;
+ }
+ break;
+ }
+ return NULL;
+}
+
+av_cold void ff_yuv2rgb_init_tables_altivec(SwsContext *c,
+ const int inv_table[4],
+ int brightness,
+ int contrast,
+ int saturation)
+{
+ union {
+ DECLARE_ALIGNED(16, signed short, tmp)[8];
+ vector signed short vec;
+ } buf;
+
+ buf.tmp[0] = ((0xffffLL) * contrast >> 8) >> 9; // cy
+ buf.tmp[1] = -256 * brightness; // oy
+ buf.tmp[2] = (inv_table[0] >> 3) * (contrast >> 16) * (saturation >> 16); // crv
+ buf.tmp[3] = (inv_table[1] >> 3) * (contrast >> 16) * (saturation >> 16); // cbu
+ buf.tmp[4] = -((inv_table[2] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgu
+ buf.tmp[5] = -((inv_table[3] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgv
+
+ c->CSHIFT = (vector unsigned short) vec_splat_u16(2);
+ c->CY = vec_splat((vector signed short) buf.vec, 0);
+ c->OY = vec_splat((vector signed short) buf.vec, 1);
+ c->CRV = vec_splat((vector signed short) buf.vec, 2);
+ c->CBU = vec_splat((vector signed short) buf.vec, 3);
+ c->CGU = vec_splat((vector signed short) buf.vec, 4);
+ c->CGV = vec_splat((vector signed short) buf.vec, 5);
+ return;
+}
+
+static av_always_inline void ff_yuv2packedX_altivec(SwsContext *c,
+ const int16_t *lumFilter,
+ const int16_t **lumSrc,
+ int lumFilterSize,
+ const int16_t *chrFilter,
+ const int16_t **chrUSrc,
+ const int16_t **chrVSrc,
+ int chrFilterSize,
+ const int16_t **alpSrc,
+ uint8_t *dest,
+ int dstW, int dstY,
+ enum AVPixelFormat target)
+{
+ int i, j;
+ vector signed short X, X0, X1, Y0, U0, V0, Y1, U1, V1, U, V;
+ vector signed short R0, G0, B0, R1, G1, B1;
+
+ vector unsigned char R, G, B;
+ vector unsigned char *out, *nout;
+
+ vector signed short RND = vec_splat_s16(1 << 3);
+ vector unsigned short SCL = vec_splat_u16(4);
+ DECLARE_ALIGNED(16, unsigned int, scratch)[16];
+
+ vector signed short *YCoeffs, *CCoeffs;
+
+ YCoeffs = c->vYCoeffsBank + dstY * lumFilterSize;
+ CCoeffs = c->vCCoeffsBank + dstY * chrFilterSize;
+
+ out = (vector unsigned char *) dest;
+
+ for (i = 0; i < dstW; i += 16) {
+ Y0 = RND;
+ Y1 = RND;
+ /* extract 16 coeffs from lumSrc */
+ for (j = 0; j < lumFilterSize; j++) {
+ X0 = vec_ld(0, &lumSrc[j][i]);
+ X1 = vec_ld(16, &lumSrc[j][i]);
+ Y0 = vec_mradds(X0, YCoeffs[j], Y0);
+ Y1 = vec_mradds(X1, YCoeffs[j], Y1);
+ }
+
+ U = RND;
+ V = RND;
+ /* extract 8 coeffs from U,V */
+ for (j = 0; j < chrFilterSize; j++) {
+ X = vec_ld(0, &chrUSrc[j][i / 2]);
+ U = vec_mradds(X, CCoeffs[j], U);
+ X = vec_ld(0, &chrVSrc[j][i / 2]);
+ V = vec_mradds(X, CCoeffs[j], V);
+ }
+
+ /* scale and clip signals */
+ Y0 = vec_sra(Y0, SCL);
+ Y1 = vec_sra(Y1, SCL);
+ U = vec_sra(U, SCL);
+ V = vec_sra(V, SCL);
+
+ Y0 = vec_clip_s16(Y0);
+ Y1 = vec_clip_s16(Y1);
+ U = vec_clip_s16(U);
+ V = vec_clip_s16(V);
+
+ /* now we have
+ * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
+ * U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
+ *
+ * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
+ * U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
+ * V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
+ */
+
+ U0 = vec_mergeh(U, U);
+ V0 = vec_mergeh(V, V);
+
+ U1 = vec_mergel(U, U);
+ V1 = vec_mergel(V, V);
+
+ cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
+ cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
+
+ R = vec_packclp(R0, R1);
+ G = vec_packclp(G0, G1);
+ B = vec_packclp(B0, B1);
+
+ switch (target) {
+ case AV_PIX_FMT_ABGR:
+ out_abgr(R, G, B, out);
+ break;
+ case AV_PIX_FMT_BGRA:
+ out_bgra(R, G, B, out);
+ break;
+ case AV_PIX_FMT_RGBA:
+ out_rgba(R, G, B, out);
+ break;
+ case AV_PIX_FMT_ARGB:
+ out_argb(R, G, B, out);
+ break;
+ case AV_PIX_FMT_RGB24:
+ out_rgb24(R, G, B, out);
+ break;
+ case AV_PIX_FMT_BGR24:
+ out_bgr24(R, G, B, out);
+ break;
+ default:
+ {
+ /* If this is reached, the caller should have called yuv2packedXinC
+ * instead. */
+ static int printed_error_message;
+ if (!printed_error_message) {
+ av_log(c, AV_LOG_ERROR,
+ "altivec_yuv2packedX doesn't support %s output\n",
+ av_get_pix_fmt_name(c->dstFormat));
+ printed_error_message = 1;
+ }
+ return;
+ }
+ }
+ }
+
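+    /* Render any final partial group of pixels into an aligned scratch
+     * buffer, then copy the tail out to dest. */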
+ if (i < dstW) {
+ i -= 16;
+
+ Y0 = RND;
+ Y1 = RND;
+ /* extract 16 coeffs from lumSrc */
+ for (j = 0; j < lumFilterSize; j++) {
+ X0 = vec_ld(0, &lumSrc[j][i]);
+ X1 = vec_ld(16, &lumSrc[j][i]);
+ Y0 = vec_mradds(X0, YCoeffs[j], Y0);
+ Y1 = vec_mradds(X1, YCoeffs[j], Y1);
+ }
+
+ U = RND;
+ V = RND;
+ /* extract 8 coeffs from U,V */
+ for (j = 0; j < chrFilterSize; j++) {
+ X = vec_ld(0, &chrUSrc[j][i / 2]);
+ U = vec_mradds(X, CCoeffs[j], U);
+ X = vec_ld(0, &chrVSrc[j][i / 2]);
+ V = vec_mradds(X, CCoeffs[j], V);
+ }
+
+ /* scale and clip signals */
+ Y0 = vec_sra(Y0, SCL);
+ Y1 = vec_sra(Y1, SCL);
+ U = vec_sra(U, SCL);
+ V = vec_sra(V, SCL);
+
+ Y0 = vec_clip_s16(Y0);
+ Y1 = vec_clip_s16(Y1);
+ U = vec_clip_s16(U);
+ V = vec_clip_s16(V);
+
+ /* now we have
+ * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
+ * U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
+ *
+ * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
+ * U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
+ * V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
+ */
+
+ U0 = vec_mergeh(U, U);
+ V0 = vec_mergeh(V, V);
+
+ U1 = vec_mergel(U, U);
+ V1 = vec_mergel(V, V);
+
+ cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
+ cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
+
+ R = vec_packclp(R0, R1);
+ G = vec_packclp(G0, G1);
+ B = vec_packclp(B0, B1);
+
+ nout = (vector unsigned char *) scratch;
+ switch (target) {
+ case AV_PIX_FMT_ABGR:
+ out_abgr(R, G, B, nout);
+ break;
+ case AV_PIX_FMT_BGRA:
+ out_bgra(R, G, B, nout);
+ break;
+ case AV_PIX_FMT_RGBA:
+ out_rgba(R, G, B, nout);
+ break;
+ case AV_PIX_FMT_ARGB:
+ out_argb(R, G, B, nout);
+ break;
+ case AV_PIX_FMT_RGB24:
+ out_rgb24(R, G, B, nout);
+ break;
+ case AV_PIX_FMT_BGR24:
+ out_bgr24(R, G, B, nout);
+ break;
+ default:
+ /* Unreachable, I think. */
+ av_log(c, AV_LOG_ERROR,
+ "altivec_yuv2packedX doesn't support %s output\n",
+ av_get_pix_fmt_name(c->dstFormat));
+ return;
+ }
+
+ memcpy(&((uint32_t *) dest)[i], scratch, (dstW - i) / 4);
+ }
+}
+
+#define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \
+void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, \
+ const int16_t *lumFilter, \
+ const int16_t **lumSrc, \
+ int lumFilterSize, \
+ const int16_t *chrFilter, \
+ const int16_t **chrUSrc, \
+ const int16_t **chrVSrc, \
+ int chrFilterSize, \
+ const int16_t **alpSrc, \
+ uint8_t *dest, int dstW, int dstY) \
+{ \
+ ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, \
+ chrFilter, chrUSrc, chrVSrc, \
+ chrFilterSize, alpSrc, \
+ dest, dstW, dstY, pixfmt); \
+}
+
+YUV2PACKEDX_WRAPPER(abgr, AV_PIX_FMT_ABGR);
+YUV2PACKEDX_WRAPPER(bgra, AV_PIX_FMT_BGRA);
+YUV2PACKEDX_WRAPPER(argb, AV_PIX_FMT_ARGB);
+YUV2PACKEDX_WRAPPER(rgba, AV_PIX_FMT_RGBA);
+YUV2PACKEDX_WRAPPER(rgb24, AV_PIX_FMT_RGB24);
+YUV2PACKEDX_WRAPPER(bgr24, AV_PIX_FMT_BGR24);
diff --git a/ffmpeg/libswscale/ppc/yuv2rgb_altivec.h b/ffmpeg/libswscale/ppc/yuv2rgb_altivec.h
new file mode 100644
index 0000000..aa52a47
--- /dev/null
+++ b/ffmpeg/libswscale/ppc/yuv2rgb_altivec.h
@@ -0,0 +1,51 @@
+/*
+ * AltiVec-enhanced yuv2yuvX
+ *
+ * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
+ * based on the equivalent C code in swscale.c
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_PPC_YUV2RGB_ALTIVEC_H
+#define SWSCALE_PPC_YUV2RGB_ALTIVEC_H
+
+#include <stdint.h>
+
+#include "libswscale/swscale_internal.h"
+
+#define YUV2PACKEDX_HEADER(suffix) \
+ void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, \
+ const int16_t *lumFilter, \
+ const int16_t **lumSrc, \
+ int lumFilterSize, \
+ const int16_t *chrFilter, \
+ const int16_t **chrUSrc, \
+ const int16_t **chrVSrc, \
+ int chrFilterSize, \
+ const int16_t **alpSrc, \
+ uint8_t *dest, \
+ int dstW, int dstY);
+
+YUV2PACKEDX_HEADER(abgr);
+YUV2PACKEDX_HEADER(bgra);
+YUV2PACKEDX_HEADER(argb);
+YUV2PACKEDX_HEADER(rgba);
+YUV2PACKEDX_HEADER(rgb24);
+YUV2PACKEDX_HEADER(bgr24);
+
+#endif /* SWSCALE_PPC_YUV2RGB_ALTIVEC_H */
diff --git a/ffmpeg/libswscale/ppc/yuv2yuv_altivec.c b/ffmpeg/libswscale/ppc/yuv2yuv_altivec.c
new file mode 100644
index 0000000..792deb9
--- /dev/null
+++ b/ffmpeg/libswscale/ppc/yuv2yuv_altivec.c
@@ -0,0 +1,194 @@
+/*
+ * AltiVec-enhanced yuv-to-yuv conversion routines.
+ *
+ * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
+ * based on the equivalent C code in swscale.c
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <inttypes.h>
+
+#include "config.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/cpu.h"
+
+static int yv12toyuy2_unscaled_altivec(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY,
+ int srcSliceH, uint8_t *dstParam[],
+ int dstStride_a[])
+{
+ uint8_t *dst = dstParam[0] + dstStride_a[0] * srcSliceY;
+ // yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH,
+ // srcStride[0], srcStride[1], dstStride[0]);
+ const uint8_t *ysrc = src[0];
+ const uint8_t *usrc = src[1];
+ const uint8_t *vsrc = src[2];
+ const int width = c->srcW;
+ const int height = srcSliceH;
+ const int lumStride = srcStride[0];
+ const int chromStride = srcStride[1];
+ const int dstStride = dstStride_a[0];
+ const vector unsigned char yperm = vec_lvsl(0, ysrc);
+ const int vertLumPerChroma = 2;
+ register unsigned int y;
+
+ /* This code assumes:
+ *
+     * 1) dst is 16-byte aligned
+ * 2) dstStride is a multiple of 16
+ * 3) width is a multiple of 16
+ * 4) lum & chrom stride are multiples of 8
+ */
+
+ for (y = 0; y < height; y++) {
+ int i;
+ for (i = 0; i < width - 31; i += 32) {
+ const unsigned int j = i >> 1;
+ vector unsigned char v_yA = vec_ld(i, ysrc);
+ vector unsigned char v_yB = vec_ld(i + 16, ysrc);
+ vector unsigned char v_yC = vec_ld(i + 32, ysrc);
+ vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
+ vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
+ vector unsigned char v_uA = vec_ld(j, usrc);
+ vector unsigned char v_uB = vec_ld(j + 16, usrc);
+ vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
+ vector unsigned char v_vA = vec_ld(j, vsrc);
+ vector unsigned char v_vB = vec_ld(j + 16, vsrc);
+ vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
+ vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
+ vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
+ vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
+ vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
+ vector unsigned char v_yuy2_2 = vec_mergeh(v_y2, v_uv_b);
+ vector unsigned char v_yuy2_3 = vec_mergel(v_y2, v_uv_b);
+ vec_st(v_yuy2_0, (i << 1), dst);
+ vec_st(v_yuy2_1, (i << 1) + 16, dst);
+ vec_st(v_yuy2_2, (i << 1) + 32, dst);
+ vec_st(v_yuy2_3, (i << 1) + 48, dst);
+ }
+ if (i < width) {
+ const unsigned int j = i >> 1;
+ vector unsigned char v_y1 = vec_ld(i, ysrc);
+ vector unsigned char v_u = vec_ld(j, usrc);
+ vector unsigned char v_v = vec_ld(j, vsrc);
+ vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
+ vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
+ vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
+ vec_st(v_yuy2_0, (i << 1), dst);
+ vec_st(v_yuy2_1, (i << 1) + 16, dst);
+ }
+ if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
+ usrc += chromStride;
+ vsrc += chromStride;
+ }
+ ysrc += lumStride;
+ dst += dstStride;
+ }
+
+ return srcSliceH;
+}
+
+static int yv12touyvy_unscaled_altivec(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY,
+ int srcSliceH, uint8_t *dstParam[],
+ int dstStride_a[])
+{
+ uint8_t *dst = dstParam[0] + dstStride_a[0] * srcSliceY;
+ // yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH,
+ // srcStride[0], srcStride[1], dstStride[0]);
+ const uint8_t *ysrc = src[0];
+ const uint8_t *usrc = src[1];
+ const uint8_t *vsrc = src[2];
+ const int width = c->srcW;
+ const int height = srcSliceH;
+ const int lumStride = srcStride[0];
+ const int chromStride = srcStride[1];
+ const int dstStride = dstStride_a[0];
+ const int vertLumPerChroma = 2;
+ const vector unsigned char yperm = vec_lvsl(0, ysrc);
+ register unsigned int y;
+
+ /* This code assumes:
+ *
+ * 1) dst is 16-byte aligned
+ * 2) dstStride is a multiple of 16
+ * 3) width is a multiple of 16
+ * 4) lum & chrom stride are multiples of 8
+ */
+
+ for (y = 0; y < height; y++) {
+ int i;
+ for (i = 0; i < width - 31; i += 32) {
+ const unsigned int j = i >> 1;
+ vector unsigned char v_yA = vec_ld(i, ysrc);
+ vector unsigned char v_yB = vec_ld(i + 16, ysrc);
+ vector unsigned char v_yC = vec_ld(i + 32, ysrc);
+ vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
+ vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
+ vector unsigned char v_uA = vec_ld(j, usrc);
+ vector unsigned char v_uB = vec_ld(j + 16, usrc);
+ vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
+ vector unsigned char v_vA = vec_ld(j, vsrc);
+ vector unsigned char v_vB = vec_ld(j + 16, vsrc);
+ vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
+ vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
+ vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
+ vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
+ vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
+ vector unsigned char v_uyvy_2 = vec_mergeh(v_uv_b, v_y2);
+ vector unsigned char v_uyvy_3 = vec_mergel(v_uv_b, v_y2);
+ vec_st(v_uyvy_0, (i << 1), dst);
+ vec_st(v_uyvy_1, (i << 1) + 16, dst);
+ vec_st(v_uyvy_2, (i << 1) + 32, dst);
+ vec_st(v_uyvy_3, (i << 1) + 48, dst);
+ }
+ if (i < width) {
+ const unsigned int j = i >> 1;
+ vector unsigned char v_y1 = vec_ld(i, ysrc);
+ vector unsigned char v_u = vec_ld(j, usrc);
+ vector unsigned char v_v = vec_ld(j, vsrc);
+ vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
+ vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
+ vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
+ vec_st(v_uyvy_0, (i << 1), dst);
+ vec_st(v_uyvy_1, (i << 1) + 16, dst);
+ }
+ if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
+ usrc += chromStride;
+ vsrc += chromStride;
+ }
+ ysrc += lumStride;
+ dst += dstStride;
+ }
+ return srcSliceH;
+}
+
+void ff_swscale_get_unscaled_altivec(SwsContext *c)
+{
+ if ((av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) && !(c->srcW & 15) &&
+ !(c->flags & SWS_BITEXACT) && c->srcFormat == AV_PIX_FMT_YUV420P) {
+ enum AVPixelFormat dstFormat = c->dstFormat;
+
+ // unscaled YV12 -> packed YUV, we want speed
+ if (dstFormat == AV_PIX_FMT_YUYV422)
+ c->swScale = yv12toyuy2_unscaled_altivec;
+ else if (dstFormat == AV_PIX_FMT_UYVY422)
+ c->swScale = yv12touyvy_unscaled_altivec;
+ }
+}
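Editor's note: ff_swscale_get_unscaled_altivec() only installs the fast path when AltiVec is available, the source width is a multiple of 16, SWS_BITEXACT is not requested, and the source is YUV420P. A minimal sketch of how a caller would reach it through the public libswscale API; convert_frame and the 640x480 dimensions are illustrative assumptions, not part of the original code:

    #include <stdint.h>
    #include <libswscale/swscale.h>
    #include <libavutil/pixfmt.h>

    /* Hypothetical caller: convert one 640x480 YUV420P frame to packed YUYV422.
     * 640 is a multiple of 16, so on an AltiVec CPU (and without SWS_BITEXACT)
     * c->swScale may be set to yv12toyuy2_unscaled_altivec above. */
    static int convert_frame(const uint8_t *const src[3], const int srcStride[3],
                             uint8_t *dst, int dstStride)
    {
        struct SwsContext *ctx = sws_getContext(640, 480, AV_PIX_FMT_YUV420P,
                                                640, 480, AV_PIX_FMT_YUYV422,
                                                SWS_FAST_BILINEAR, NULL, NULL, NULL);
        uint8_t *dstPlanes[4] = { dst, NULL, NULL, NULL };
        int dstStrides[4]     = { dstStride, 0, 0, 0 };
        int ret;

        if (!ctx)
            return -1;
        ret = sws_scale(ctx, src, srcStride, 0, 480, dstPlanes, dstStrides);
        sws_freeContext(ctx);
        return ret;
    }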
diff --git a/ffmpeg/libswscale/rgb2rgb.c b/ffmpeg/libswscale/rgb2rgb.c
new file mode 100644
index 0000000..1233a1d
--- /dev/null
+++ b/ffmpeg/libswscale/rgb2rgb.c
@@ -0,0 +1,390 @@
+/*
+ * software RGB to RGB converter
+ * plus software PAL8 to RGB converter
+ * software YUV to YUV converter
+ * software YUV to RGB converter
+ * Written by Nick Kurshev.
+ * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <inttypes.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/bswap.h"
+#include "config.h"
+#include "rgb2rgb.h"
+#include "swscale.h"
+#include "swscale_internal.h"
+
+void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
+void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
+void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
+void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size);
+void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
+void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
+void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
+void (*rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
+void (*rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
+
+void (*rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size);
+void (*rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size);
+void (*rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size);
+void (*rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size);
+void (*rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size);
+void (*rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size);
+void (*rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size);
+void (*rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size);
+
+void (*shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size);
+
+void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc,
+ const uint8_t *vsrc, uint8_t *dst,
+ int width, int height,
+ int lumStride, int chromStride, int dstStride);
+void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc,
+ const uint8_t *vsrc, uint8_t *dst,
+ int width, int height,
+ int lumStride, int chromStride, int dstStride);
+void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc,
+ const uint8_t *vsrc, uint8_t *dst,
+ int width, int height,
+ int lumStride, int chromStride, int dstStride);
+void (*yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc,
+ const uint8_t *vsrc, uint8_t *dst,
+ int width, int height,
+ int lumStride, int chromStride, int dstStride);
+void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst,
+ uint8_t *udst, uint8_t *vdst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride);
+void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst,
+ uint8_t *udst, uint8_t *vdst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride);
+void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
+ int srcStride, int dstStride);
+void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
+ int width, int height, int src1Stride,
+ int src2Stride, int dstStride);
+void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dst1, uint8_t *dst2,
+ int width, int height,
+ int srcStride1, int srcStride2,
+ int dstStride1, int dstStride2);
+void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2,
+ const uint8_t *src3, uint8_t *dst,
+ int width, int height,
+ int srcStride1, int srcStride2,
+ int srcStride3, int dstStride);
+void (*uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride);
+void (*uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride);
+void (*yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride);
+void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride);
+
+#define RGB2YUV_SHIFT 8
+#define BY ((int)( 0.098 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define BV ((int)(-0.071 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define BU ((int)( 0.439 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define GY ((int)( 0.504 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define GV ((int)(-0.368 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define GU ((int)(-0.291 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define RY ((int)( 0.257 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define RV ((int)( 0.439 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define RU ((int)(-0.148 * (1 << RGB2YUV_SHIFT) + 0.5))
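Editor's note: these are the BT.601 studio-range RGB->YUV coefficients in fixed point, scaled by 2^RGB2YUV_SHIFT and rounded, e.g. RY = (int)(0.257 * 256 + 0.5) = 66, GY = 129, BY = 25. A minimal self-check sketch of the luma path used later by rgb24toyv12_c (the helper name and the assert are illustrative only):

    #include <assert.h>

    /* Y = ((RY*r + GY*g + BY*b) >> RGB2YUV_SHIFT) + 16, as in rgb24toyv12_c().
     * With r = g = b = 255: (66 + 129 + 25) * 255 = 56100, 56100 >> 8 = 219,
     * and 219 + 16 = 235, i.e. studio-range white. */
    static void check_white_luma(void)
    {
        unsigned int r = 255, g = 255, b = 255;
        unsigned int Y = ((RY * r + GY * g + BY * b) >> RGB2YUV_SHIFT) + 16;
        assert(Y == 235);
    }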
+
+//plain C versions
+#include "rgb2rgb_template.c"
+
+/*
+ * RGB15->RGB16 original by Strepto/Astral
+ * ported to gcc & bugfixed: A'rpi
+ * MMXEXT, 3DNOW optimization by Nick Kurshev
+ * 32-bit C version and the and&add trick by Michael Niedermayer
+ */
+
+av_cold void sws_rgb2rgb_init(void)
+{
+ rgb2rgb_init_c();
+ if (HAVE_MMX)
+ rgb2rgb_init_x86();
+}
+
+void rgb32to24(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ int i, num_pixels = src_size >> 2;
+
+ for (i = 0; i < num_pixels; i++) {
+#if HAVE_BIGENDIAN
+ /* RGB32 (= A,B,G,R) -> BGR24 (= B,G,R) */
+ dst[3 * i + 0] = src[4 * i + 1];
+ dst[3 * i + 1] = src[4 * i + 2];
+ dst[3 * i + 2] = src[4 * i + 3];
+#else
+ dst[3 * i + 0] = src[4 * i + 2];
+ dst[3 * i + 1] = src[4 * i + 1];
+ dst[3 * i + 2] = src[4 * i + 0];
+#endif
+ }
+}
+
+void rgb24to32(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ int i;
+
+ for (i = 0; 3 * i < src_size; i++) {
+#if HAVE_BIGENDIAN
+ /* RGB24 (= R, G, B) -> BGR32 (= A, R, G, B) */
+ dst[4 * i + 0] = 255;
+ dst[4 * i + 1] = src[3 * i + 0];
+ dst[4 * i + 2] = src[3 * i + 1];
+ dst[4 * i + 3] = src[3 * i + 2];
+#else
+ dst[4 * i + 0] = src[3 * i + 2];
+ dst[4 * i + 1] = src[3 * i + 1];
+ dst[4 * i + 2] = src[3 * i + 0];
+ dst[4 * i + 3] = 255;
+#endif
+ }
+}
+
+void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ uint8_t *d = dst;
+ const uint16_t *s = (const uint16_t *)src;
+ const uint16_t *end = s + src_size / 2;
+
+ while (s < end) {
+ register uint16_t bgr = *s++;
+#if HAVE_BIGENDIAN
+ *d++ = 255;
+ *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+ *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
+ *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
+#else
+ *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
+ *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
+ *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+ *d++ = 255;
+#endif
+ }
+}
+
+void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ uint16_t rgb, r, g, b;
+ uint16_t *d = (uint16_t *)dst;
+ const uint16_t *s = (const uint16_t *)src;
+ const uint16_t *end = s + src_size / 2;
+
+ while (s < end) {
+ rgb = *s++;
+ r = rgb & 0xF00;
+ g = rgb & 0x0F0;
+ b = rgb & 0x00F;
+ r = (r << 3) | ((r & 0x800) >> 1);
+ g = (g << 2) | ((g & 0x080) >> 2);
+ b = (b << 1) | ( b >> 3);
+ *d++ = r | g | b;
+ }
+}
+
+void rgb16to24(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ uint8_t *d = dst;
+ const uint16_t *s = (const uint16_t *)src;
+ const uint16_t *end = s + src_size / 2;
+
+ while (s < end) {
+ register uint16_t bgr = *s++;
+ *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
+ *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
+ *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+ }
+}
+
+void rgb16tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ int i, num_pixels = src_size >> 1;
+
+ for (i = 0; i < num_pixels; i++) {
+ unsigned rgb = ((const uint16_t *)src)[i];
+ ((uint16_t *)dst)[i] = (rgb >> 11) | (rgb & 0x7E0) | (rgb << 11);
+ }
+}
+
+void rgb16tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ int i, num_pixels = src_size >> 1;
+
+ for (i = 0; i < num_pixels; i++) {
+ unsigned rgb = ((const uint16_t *)src)[i];
+ ((uint16_t *)dst)[i] = (rgb >> 11) | ((rgb & 0x7C0) >> 1) | ((rgb & 0x1F) << 10);
+ }
+}
+
+void rgb15tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ uint8_t *d = dst;
+ const uint16_t *s = (const uint16_t *)src;
+ const uint16_t *end = s + src_size / 2;
+
+ while (s < end) {
+ register uint16_t bgr = *s++;
+#if HAVE_BIGENDIAN
+ *d++ = 255;
+ *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+ *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
+ *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
+#else
+ *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
+ *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
+ *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+ *d++ = 255;
+#endif
+ }
+}
+
+void rgb15to24(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ uint8_t *d = dst;
+ const uint16_t *s = (const uint16_t *)src;
+ const uint16_t *end = s + src_size / 2;
+
+ while (s < end) {
+ register uint16_t bgr = *s++;
+ *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
+ *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
+ *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+ }
+}
+
+void rgb15tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ int i, num_pixels = src_size >> 1;
+
+ for (i = 0; i < num_pixels; i++) {
+ unsigned rgb = ((const uint16_t *)src)[i];
+ ((uint16_t *)dst)[i] = ((rgb & 0x7C00) >> 10) | ((rgb & 0x3E0) << 1) | (rgb << 11);
+ }
+}
+
+void rgb15tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ int i, num_pixels = src_size >> 1;
+
+ for (i = 0; i < num_pixels; i++) {
+ unsigned rgb = ((const uint16_t *)src)[i];
+ unsigned br = rgb & 0x7C1F;
+ ((uint16_t *)dst)[i] = (br >> 10) | (rgb & 0x3E0) | (br << 10);
+ }
+}
+
+void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ uint16_t *d = (uint16_t *)dst;
+ uint16_t *s = (uint16_t *)src;
+ int i, num_pixels = src_size >> 1;
+
+ for (i = 0; i < num_pixels; i++) {
+ unsigned rgb = s[i];
+ d[i] = (rgb << 8 | rgb & 0xF0 | rgb >> 8) & 0xFFF;
+ }
+}
+
+
+#define DEFINE_SHUFFLE_BYTES(a, b, c, d) \
+void shuffle_bytes_ ## a ## b ## c ## d(const uint8_t *src, \
+ uint8_t *dst, int src_size) \
+{ \
+ int i; \
+ \
+ for (i = 0; i < src_size; i += 4) { \
+ dst[i + 0] = src[i + a]; \
+ dst[i + 1] = src[i + b]; \
+ dst[i + 2] = src[i + c]; \
+ dst[i + 3] = src[i + d]; \
+ } \
+}
+
+DEFINE_SHUFFLE_BYTES(0, 3, 2, 1)
+DEFINE_SHUFFLE_BYTES(1, 2, 3, 0)
+DEFINE_SHUFFLE_BYTES(3, 0, 1, 2)
+DEFINE_SHUFFLE_BYTES(3, 2, 1, 0)
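Editor's note: each instantiation expands to a byte permutation over 4-byte pixels; the digits give the source byte index written to destination bytes 0..3, so shuffle_bytes_3210 reverses every 32-bit pixel (RGBA memory order becomes ABGR). A small illustrative check, not part of the original file:

    #include <assert.h>
    #include <stdint.h>

    static void check_shuffle_3210(void)
    {
        uint8_t src[4] = { 'R', 'G', 'B', 'A' };
        uint8_t dst[4];

        shuffle_bytes_3210(src, dst, sizeof(src));
        assert(dst[0] == 'A' && dst[1] == 'B' && dst[2] == 'G' && dst[3] == 'R');
    }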
+
+#define DEFINE_RGB48TOBGR48(need_bswap, swap) \
+void rgb48tobgr48_ ## need_bswap(const uint8_t *src, \
+ uint8_t *dst, int src_size) \
+{ \
+ uint16_t *d = (uint16_t *)dst; \
+ uint16_t *s = (uint16_t *)src; \
+ int i, num_pixels = src_size >> 1; \
+ \
+ for (i = 0; i < num_pixels; i += 3) { \
+ d[i ] = swap ? av_bswap16(s[i + 2]) : s[i + 2]; \
+ d[i + 1] = swap ? av_bswap16(s[i + 1]) : s[i + 1]; \
+ d[i + 2] = swap ? av_bswap16(s[i ]) : s[i ]; \
+ } \
+}
+
+DEFINE_RGB48TOBGR48(nobswap, 0)
+DEFINE_RGB48TOBGR48(bswap, 1)
+
+#define DEFINE_RGB64TOBGR48(need_bswap, swap) \
+void rgb64tobgr48_ ## need_bswap(const uint8_t *src, \
+ uint8_t *dst, int src_size) \
+{ \
+ uint16_t *d = (uint16_t *)dst; \
+ uint16_t *s = (uint16_t *)src; \
+ int i, num_pixels = src_size >> 3; \
+ \
+ for (i = 0; i < num_pixels; i++) { \
+ d[3 * i ] = swap ? av_bswap16(s[4 * i + 2]) : s[4 * i + 2]; \
+ d[3 * i + 1] = swap ? av_bswap16(s[4 * i + 1]) : s[4 * i + 1]; \
+ d[3 * i + 2] = swap ? av_bswap16(s[4 * i ]) : s[4 * i ]; \
+ } \
+}
+
+DEFINE_RGB64TOBGR48(nobswap, 0)
+DEFINE_RGB64TOBGR48(bswap, 1)
+
+#define DEFINE_RGB64TO48(need_bswap, swap) \
+void rgb64to48_ ## need_bswap(const uint8_t *src, \
+ uint8_t *dst, int src_size) \
+{ \
+ uint16_t *d = (uint16_t *)dst; \
+ uint16_t *s = (uint16_t *)src; \
+ int i, num_pixels = src_size >> 3; \
+ \
+ for (i = 0; i < num_pixels; i++) { \
+ d[3 * i ] = swap ? av_bswap16(s[4 * i ]) : s[4 * i ]; \
+ d[3 * i + 1] = swap ? av_bswap16(s[4 * i + 1]) : s[4 * i + 1]; \
+ d[3 * i + 2] = swap ? av_bswap16(s[4 * i + 2]) : s[4 * i + 2]; \
+ } \
+}
+
+DEFINE_RGB64TO48(nobswap, 0)
+DEFINE_RGB64TO48(bswap, 1)
diff --git a/ffmpeg/libswscale/rgb2rgb.h b/ffmpeg/libswscale/rgb2rgb.h
new file mode 100644
index 0000000..e37f0fb
--- /dev/null
+++ b/ffmpeg/libswscale/rgb2rgb.h
@@ -0,0 +1,166 @@
+/*
+ * software RGB to RGB converter
+ * plus software PAL8 to RGB converter
+ * software YUV to YUV converter
+ * software YUV to RGB converter
+ * Written by Nick Kurshev.
+ * YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_RGB2RGB_H
+#define SWSCALE_RGB2RGB_H
+
+#include <inttypes.h>
+
+#include "libswscale/swscale.h"
+#include "libavutil/avutil.h"
+
+/* A full collection of RGB to RGB(BGR) converters */
+extern void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
+
+extern void (*shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size);
+
+void rgb64tobgr48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb64tobgr48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb48tobgr48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb48tobgr48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb64to48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb64to48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb24to32(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb32to24(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb16to24(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb16tobgr16(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb16tobgr15(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb15tobgr32(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb15to24(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb15tobgr16(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb15tobgr15(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size);
+
+void shuffle_bytes_0321(const uint8_t *src, uint8_t *dst, int src_size);
+void shuffle_bytes_1230(const uint8_t *src, uint8_t *dst, int src_size);
+void shuffle_bytes_3012(const uint8_t *src, uint8_t *dst, int src_size);
+void shuffle_bytes_3210(const uint8_t *src, uint8_t *dst, int src_size);
+
+void rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height, int lumStride,
+ int chromStride, int srcStride);
+
+/**
+ * Height should be a multiple of 2 and width should be a multiple of 16.
+ * (If this is a problem for anyone then tell me, and I will fix it.)
+ */
+extern void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+ int width, int height,
+ int lumStride, int chromStride, int dstStride);
+
+/**
+ * Width should be a multiple of 16.
+ */
+extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+ int width, int height,
+ int lumStride, int chromStride, int dstStride);
+
+/**
+ * Height should be a multiple of 2 and width should be a multiple of 16.
+ * (If this is a problem for anyone then tell me, and I will fix it.)
+ */
+extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride);
+
+/**
+ * Height should be a multiple of 2 and width should be a multiple of 16.
+ * (If this is a problem for anyone then tell me, and I will fix it.)
+ */
+extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+ int width, int height,
+ int lumStride, int chromStride, int dstStride);
+
+/**
+ * Width should be a multiple of 16.
+ */
+extern void (*yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+ int width, int height,
+ int lumStride, int chromStride, int dstStride);
+
+/**
+ * Height should be a multiple of 2 and width should be a multiple of 2.
+ * (If this is a problem for anyone then tell me, and I will fix it.)
+ * Chrominance data is only taken from every second line, others are ignored.
+ * FIXME: Write high quality version.
+ */
+extern void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride);
+extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
+ int srcStride, int dstStride);
+
+extern void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
+ int width, int height, int src1Stride,
+ int src2Stride, int dstStride);
+
+extern void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dst1, uint8_t *dst2,
+ int width, int height,
+ int srcStride1, int srcStride2,
+ int dstStride1, int dstStride2);
+
+extern void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
+ uint8_t *dst,
+ int width, int height,
+ int srcStride1, int srcStride2,
+ int srcStride3, int dstStride);
+
+extern void (*uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride);
+extern void (*uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride);
+extern void (*yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride);
+extern void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride);
+
+void sws_rgb2rgb_init(void);
+
+void rgb2rgb_init_x86(void);
+
+#endif /* SWSCALE_RGB2RGB_H */
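Editor's note: the function pointers declared above are only valid after sws_rgb2rgb_init() has run; libswscale normally does this during context setup, but a direct user of this internal header would call it explicitly. A minimal, hypothetical usage sketch (swap_rb_example and the sample values are assumptions made for illustration):

    #include <stdint.h>
    #include "libswscale/rgb2rgb.h"

    /* Swap R and B in a tiny RGB24 buffer using the dispatched pointer
     * (plain C or SIMD, depending on the host CPU). */
    static void swap_rb_example(void)
    {
        uint8_t src[6] = { 10, 20, 30, 40, 50, 60 };   /* two RGB24 pixels */
        uint8_t dst[6];

        sws_rgb2rgb_init();              /* fills in the function pointers */
        rgb24tobgr24(src, dst, sizeof(src));
        /* dst now holds { 30, 20, 10, 60, 50, 40 } */
    }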
diff --git a/ffmpeg/libswscale/rgb2rgb_template.c b/ffmpeg/libswscale/rgb2rgb_template.c
new file mode 100644
index 0000000..8753594
--- /dev/null
+++ b/ffmpeg/libswscale/rgb2rgb_template.c
@@ -0,0 +1,927 @@
+/*
+ * software RGB to RGB converter
+ * plus software PAL8 to RGB converter
+ * software YUV to YUV converter
+ * software YUV to RGB converter
+ * Written by Nick Kurshev.
+ * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
+ * lot of big-endian byte order fixes by Alex Beregszaszi
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+
+static inline void rgb24tobgr32_c(const uint8_t *src, uint8_t *dst,
+ int src_size)
+{
+ uint8_t *dest = dst;
+ const uint8_t *s = src;
+ const uint8_t *end = s + src_size;
+
+ while (s < end) {
+#if HAVE_BIGENDIAN
+ /* RGB24 (= R, G, B) -> RGB32 (= A, B, G, R) */
+ *dest++ = 255;
+ *dest++ = s[2];
+ *dest++ = s[1];
+ *dest++ = s[0];
+ s += 3;
+#else
+ *dest++ = *s++;
+ *dest++ = *s++;
+ *dest++ = *s++;
+ *dest++ = 255;
+#endif
+ }
+}
+
+static inline void rgb32tobgr24_c(const uint8_t *src, uint8_t *dst,
+ int src_size)
+{
+ uint8_t *dest = dst;
+ const uint8_t *s = src;
+ const uint8_t *end = s + src_size;
+
+ while (s < end) {
+#if HAVE_BIGENDIAN
+ /* RGB32 (= A, B, G, R) -> RGB24 (= R, G, B) */
+ s++;
+ dest[2] = *s++;
+ dest[1] = *s++;
+ dest[0] = *s++;
+ dest += 3;
+#else
+ *dest++ = *s++;
+ *dest++ = *s++;
+ *dest++ = *s++;
+ s++;
+#endif
+ }
+}
+
+/*
+ * original by Strepto/Astral
+ * ported to gcc & bugfixed: A'rpi
+ * MMXEXT, 3DNOW optimization by Nick Kurshev
+ * 32-bit C version and the and&add trick by Michael Niedermayer
+ */
+static inline void rgb15to16_c(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ register uint8_t *d = dst;
+ register const uint8_t *s = src;
+ register const uint8_t *end = s + src_size;
+ const uint8_t *mm_end = end - 3;
+
+ while (s < mm_end) {
+ register unsigned x = *((const uint32_t *)s);
+ *((uint32_t *)d) = (x & 0x7FFF7FFF) + (x & 0x7FE07FE0);
+ d += 4;
+ s += 4;
+ }
+ if (s < end) {
+ register unsigned short x = *((const uint16_t *)s);
+ *((uint16_t *)d) = (x & 0x7FFF) + (x & 0x7FE0);
+ }
+}
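Editor's note: the and&add trick works because the R and G fields of RGB555 are contiguous. Adding (x & 0x7FE0) to (x & 0x7FFF) doubles those ten bits, i.e. shifts R and G one position left into their RGB565 slots, while B stays put and the new green LSB is left at 0. A tiny worked check, assuming the usual 0RRRRRGGGGGBBBBB layout (the helper is illustrative, not part of the file):

    #include <assert.h>
    #include <stdint.h>

    static void check_rgb15to16_trick(void)
    {
        uint16_t x = 0x7FFF;                      /* RGB555 white */
        uint16_t y = (x & 0x7FFF) + (x & 0x7FE0); /* same arithmetic as rgb15to16_c */

        /* R = 11111, G = 111110 (LSB stays 0), B = 11111 */
        assert(y == 0xFFDF);
    }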
+
+static inline void rgb16to15_c(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ register uint8_t *d = dst;
+ register const uint8_t *s = src;
+ register const uint8_t *end = s + src_size;
+ const uint8_t *mm_end = end - 3;
+
+ while (s < mm_end) {
+ register uint32_t x = *((const uint32_t *)s);
+ *((uint32_t *)d) = ((x >> 1) & 0x7FE07FE0) | (x & 0x001F001F);
+ s += 4;
+ d += 4;
+ }
+ if (s < end) {
+ register uint16_t x = *((const uint16_t *)s);
+ *((uint16_t *)d) = ((x >> 1) & 0x7FE0) | (x & 0x001F);
+ }
+}
+
+static inline void rgb32to16_c(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ uint16_t *d = (uint16_t *)dst;
+ const uint8_t *s = src;
+ const uint8_t *end = s + src_size;
+
+ while (s < end) {
+ register int rgb = *(const uint32_t *)s;
+ s += 4;
+ *d++ = ((rgb & 0xFF) >> 3) +
+ ((rgb & 0xFC00) >> 5) +
+ ((rgb & 0xF80000) >> 8);
+ }
+}
+
+static inline void rgb32tobgr16_c(const uint8_t *src, uint8_t *dst,
+ int src_size)
+{
+ uint16_t *d = (uint16_t *)dst;
+ const uint8_t *s = src;
+ const uint8_t *end = s + src_size;
+
+ while (s < end) {
+ register int rgb = *(const uint32_t *)s;
+ s += 4;
+ *d++ = ((rgb & 0xF8) << 8) +
+ ((rgb & 0xFC00) >> 5) +
+ ((rgb & 0xF80000) >> 19);
+ }
+}
+
+static inline void rgb32to15_c(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ uint16_t *d = (uint16_t *)dst;
+ const uint8_t *s = src;
+ const uint8_t *end = s + src_size;
+
+ while (s < end) {
+ register int rgb = *(const uint32_t *)s;
+ s += 4;
+ *d++ = ((rgb & 0xFF) >> 3) +
+ ((rgb & 0xF800) >> 6) +
+ ((rgb & 0xF80000) >> 9);
+ }
+}
+
+static inline void rgb32tobgr15_c(const uint8_t *src, uint8_t *dst,
+ int src_size)
+{
+ uint16_t *d = (uint16_t *)dst;
+ const uint8_t *s = src;
+ const uint8_t *end = s + src_size;
+
+ while (s < end) {
+ register int rgb = *(const uint32_t *)s;
+ s += 4;
+ *d++ = ((rgb & 0xF8) << 7) +
+ ((rgb & 0xF800) >> 6) +
+ ((rgb & 0xF80000) >> 19);
+ }
+}
+
+static inline void rgb24tobgr16_c(const uint8_t *src, uint8_t *dst,
+ int src_size)
+{
+ uint16_t *d = (uint16_t *)dst;
+ const uint8_t *s = src;
+ const uint8_t *end = s + src_size;
+
+ while (s < end) {
+ const int b = *s++;
+ const int g = *s++;
+ const int r = *s++;
+ *d++ = (b >> 3) | ((g & 0xFC) << 3) | ((r & 0xF8) << 8);
+ }
+}
+
+static inline void rgb24to16_c(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ uint16_t *d = (uint16_t *)dst;
+ const uint8_t *s = src;
+ const uint8_t *end = s + src_size;
+
+ while (s < end) {
+ const int r = *s++;
+ const int g = *s++;
+ const int b = *s++;
+ *d++ = (b >> 3) | ((g & 0xFC) << 3) | ((r & 0xF8) << 8);
+ }
+}
+
+static inline void rgb24tobgr15_c(const uint8_t *src, uint8_t *dst,
+ int src_size)
+{
+ uint16_t *d = (uint16_t *)dst;
+ const uint8_t *s = src;
+ const uint8_t *end = s + src_size;
+
+ while (s < end) {
+ const int b = *s++;
+ const int g = *s++;
+ const int r = *s++;
+ *d++ = (b >> 3) | ((g & 0xF8) << 2) | ((r & 0xF8) << 7);
+ }
+}
+
+static inline void rgb24to15_c(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ uint16_t *d = (uint16_t *)dst;
+ const uint8_t *s = src;
+ const uint8_t *end = s + src_size;
+
+ while (s < end) {
+ const int r = *s++;
+ const int g = *s++;
+ const int b = *s++;
+ *d++ = (b >> 3) | ((g & 0xF8) << 2) | ((r & 0xF8) << 7);
+ }
+}
+
+static inline void rgb15tobgr24_c(const uint8_t *src, uint8_t *dst,
+ int src_size)
+{
+ uint8_t *d = dst;
+ const uint16_t *s = (const uint16_t *)src;
+ const uint16_t *end = s + src_size / 2;
+
+ while (s < end) {
+ register uint16_t bgr = *s++;
+ *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+ *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
+ *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
+ }
+}
+
+static inline void rgb16tobgr24_c(const uint8_t *src, uint8_t *dst,
+ int src_size)
+{
+ uint8_t *d = (uint8_t *)dst;
+ const uint16_t *s = (const uint16_t *)src;
+ const uint16_t *end = s + src_size / 2;
+
+ while (s < end) {
+ register uint16_t bgr = *s++;
+ *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+ *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
+ *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
+ }
+}
+
+static inline void rgb15to32_c(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ uint8_t *d = dst;
+ const uint16_t *s = (const uint16_t *)src;
+ const uint16_t *end = s + src_size / 2;
+
+ while (s < end) {
+ register uint16_t bgr = *s++;
+#if HAVE_BIGENDIAN
+ *d++ = 255;
+ *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
+ *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
+ *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+#else
+ *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+ *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
+ *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
+ *d++ = 255;
+#endif
+ }
+}
+
+static inline void rgb16to32_c(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ uint8_t *d = dst;
+ const uint16_t *s = (const uint16_t *)src;
+ const uint16_t *end = s + src_size / 2;
+
+ while (s < end) {
+ register uint16_t bgr = *s++;
+#if HAVE_BIGENDIAN
+ *d++ = 255;
+ *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
+ *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
+ *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+#else
+ *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+ *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
+ *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
+ *d++ = 255;
+#endif
+ }
+}
+
+static inline void shuffle_bytes_2103_c(const uint8_t *src, uint8_t *dst,
+ int src_size)
+{
+ int idx = 15 - src_size;
+ const uint8_t *s = src - idx;
+ uint8_t *d = dst - idx;
+
+ for (; idx < 15; idx += 4) {
+ register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
+ v &= 0xff00ff;
+ *(uint32_t *)&d[idx] = (v >> 16) + g + (v << 16);
+ }
+}
+
+static inline void rgb24tobgr24_c(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ unsigned i;
+
+ for (i = 0; i < src_size; i += 3) {
+ register uint8_t x = src[i + 2];
+ dst[i + 1] = src[i + 1];
+ dst[i + 2] = src[i + 0];
+ dst[i + 0] = x;
+ }
+}
+
+static inline void yuvPlanartoyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
+ const uint8_t *vsrc, uint8_t *dst,
+ int width, int height,
+ int lumStride, int chromStride,
+ int dstStride, int vertLumPerChroma)
+{
+ int y, i;
+ const int chromWidth = width >> 1;
+
+ for (y = 0; y < height; y++) {
+#if HAVE_FAST_64BIT
+ uint64_t *ldst = (uint64_t *)dst;
+ const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
+ for (i = 0; i < chromWidth; i += 2) {
+ uint64_t k = yc[0] + (uc[0] << 8) +
+ (yc[1] << 16) + (unsigned)(vc[0] << 24);
+ uint64_t l = yc[2] + (uc[1] << 8) +
+ (yc[3] << 16) + (unsigned)(vc[1] << 24);
+ *ldst++ = k + (l << 32);
+ yc += 4;
+ uc += 2;
+ vc += 2;
+ }
+
+#else
+ int *idst = (int32_t *)dst;
+ const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
+
+ for (i = 0; i < chromWidth; i++) {
+#if HAVE_BIGENDIAN
+ *idst++ = (yc[0] << 24) + (uc[0] << 16) +
+ (yc[1] << 8) + (vc[0] << 0);
+#else
+ *idst++ = yc[0] + (uc[0] << 8) +
+ (yc[1] << 16) + (vc[0] << 24);
+#endif
+ yc += 2;
+ uc++;
+ vc++;
+ }
+#endif
+ if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
+ usrc += chromStride;
+ vsrc += chromStride;
+ }
+ ysrc += lumStride;
+ dst += dstStride;
+ }
+}
+
+/**
+ * Height should be a multiple of 2 and width should be a multiple of 16.
+ * (If this is a problem for anyone then tell me, and I will fix it.)
+ */
+static inline void yv12toyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
+ const uint8_t *vsrc, uint8_t *dst,
+ int width, int height, int lumStride,
+ int chromStride, int dstStride)
+{
+ //FIXME interpolate chroma
+ yuvPlanartoyuy2_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
+ chromStride, dstStride, 2);
+}
+
+static inline void yuvPlanartouyvy_c(const uint8_t *ysrc, const uint8_t *usrc,
+ const uint8_t *vsrc, uint8_t *dst,
+ int width, int height,
+ int lumStride, int chromStride,
+ int dstStride, int vertLumPerChroma)
+{
+ int y, i;
+ const int chromWidth = width >> 1;
+
+ for (y = 0; y < height; y++) {
+#if HAVE_FAST_64BIT
+ uint64_t *ldst = (uint64_t *)dst;
+ const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
+ for (i = 0; i < chromWidth; i += 2) {
+ uint64_t k = uc[0] + (yc[0] << 8) +
+ (vc[0] << 16) + (unsigned)(yc[1] << 24);
+ uint64_t l = uc[1] + (yc[2] << 8) +
+ (vc[1] << 16) + (unsigned)(yc[3] << 24);
+ *ldst++ = k + (l << 32);
+ yc += 4;
+ uc += 2;
+ vc += 2;
+ }
+
+#else
+ int *idst = (int32_t *)dst;
+ const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
+
+ for (i = 0; i < chromWidth; i++) {
+#if HAVE_BIGENDIAN
+ *idst++ = (uc[0] << 24) + (yc[0] << 16) +
+ (vc[0] << 8) + (yc[1] << 0);
+#else
+ *idst++ = uc[0] + (yc[0] << 8) +
+ (vc[0] << 16) + (yc[1] << 24);
+#endif
+ yc += 2;
+ uc++;
+ vc++;
+ }
+#endif
+ if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
+ usrc += chromStride;
+ vsrc += chromStride;
+ }
+ ysrc += lumStride;
+ dst += dstStride;
+ }
+}
+
+/**
+ * Height should be a multiple of 2 and width should be a multiple of 16.
+ * (If this is a problem for anyone then tell me, and I will fix it.)
+ */
+static inline void yv12touyvy_c(const uint8_t *ysrc, const uint8_t *usrc,
+ const uint8_t *vsrc, uint8_t *dst,
+ int width, int height, int lumStride,
+ int chromStride, int dstStride)
+{
+ //FIXME interpolate chroma
+ yuvPlanartouyvy_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
+ chromStride, dstStride, 2);
+}
+
+/**
+ * Width should be a multiple of 16.
+ */
+static inline void yuv422ptouyvy_c(const uint8_t *ysrc, const uint8_t *usrc,
+ const uint8_t *vsrc, uint8_t *dst,
+ int width, int height, int lumStride,
+ int chromStride, int dstStride)
+{
+ yuvPlanartouyvy_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
+ chromStride, dstStride, 1);
+}
+
+/**
+ * Width should be a multiple of 16.
+ */
+static inline void yuv422ptoyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
+ const uint8_t *vsrc, uint8_t *dst,
+ int width, int height, int lumStride,
+ int chromStride, int dstStride)
+{
+ yuvPlanartoyuy2_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
+ chromStride, dstStride, 1);
+}
+
+/**
+ * Height should be a multiple of 2 and width should be a multiple of 16.
+ * (If this is a problem for anyone then tell me, and I will fix it.)
+ */
+static inline void yuy2toyv12_c(const uint8_t *src, uint8_t *ydst,
+ uint8_t *udst, uint8_t *vdst,
+ int width, int height, int lumStride,
+ int chromStride, int srcStride)
+{
+ int y;
+ const int chromWidth = width >> 1;
+
+ for (y = 0; y < height; y += 2) {
+ int i;
+ for (i = 0; i < chromWidth; i++) {
+ ydst[2 * i + 0] = src[4 * i + 0];
+ udst[i] = src[4 * i + 1];
+ ydst[2 * i + 1] = src[4 * i + 2];
+ vdst[i] = src[4 * i + 3];
+ }
+ ydst += lumStride;
+ src += srcStride;
+
+ for (i = 0; i < chromWidth; i++) {
+ ydst[2 * i + 0] = src[4 * i + 0];
+ ydst[2 * i + 1] = src[4 * i + 2];
+ }
+ udst += chromStride;
+ vdst += chromStride;
+ ydst += lumStride;
+ src += srcStride;
+ }
+}
+
+static inline void planar2x_c(const uint8_t *src, uint8_t *dst, int srcWidth,
+ int srcHeight, int srcStride, int dstStride)
+{
+ int x, y;
+
+ dst[0] = src[0];
+
+ // first line
+ for (x = 0; x < srcWidth - 1; x++) {
+ dst[2 * x + 1] = (3 * src[x] + src[x + 1]) >> 2;
+ dst[2 * x + 2] = (src[x] + 3 * src[x + 1]) >> 2;
+ }
+ dst[2 * srcWidth - 1] = src[srcWidth - 1];
+
+ dst += dstStride;
+
+ for (y = 1; y < srcHeight; y++) {
+ const int mmxSize = 1;
+
+ dst[0] = (src[0] * 3 + src[srcStride]) >> 2;
+ dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2;
+
+ for (x = mmxSize - 1; x < srcWidth - 1; x++) {
+ dst[2 * x + 1] = (src[x + 0] * 3 + src[x + srcStride + 1]) >> 2;
+ dst[2 * x + dstStride + 2] = (src[x + 0] + 3 * src[x + srcStride + 1]) >> 2;
+ dst[2 * x + dstStride + 1] = (src[x + 1] + 3 * src[x + srcStride]) >> 2;
+ dst[2 * x + 2] = (src[x + 1] * 3 + src[x + srcStride]) >> 2;
+ }
+ dst[srcWidth * 2 - 1] = (src[srcWidth - 1] * 3 + src[srcWidth - 1 + srcStride]) >> 2;
+ dst[srcWidth * 2 - 1 + dstStride] = (src[srcWidth - 1] + 3 * src[srcWidth - 1 + srcStride]) >> 2;
+
+ dst += dstStride * 2;
+ src += srcStride;
+ }
+
+ // last line
+ dst[0] = src[0];
+
+ for (x = 0; x < srcWidth - 1; x++) {
+ dst[2 * x + 1] = (src[x] * 3 + src[x + 1]) >> 2;
+ dst[2 * x + 2] = (src[x] + 3 * src[x + 1]) >> 2;
+ }
+ dst[2 * srcWidth - 1] = src[srcWidth - 1];
+}
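Editor's note: planar2x_c is a 2x upsampler in which each new sample is a 3:1 blend of its two nearest source samples, computed as (3*a + b) >> 2, the fixed-point form of 0.75*a + 0.25*b. A quick worked check of that weighting with illustrative values (the helper is not part of the original file):

    #include <assert.h>
    #include <stdint.h>

    static void check_planar2x_weights(void)
    {
        uint8_t a = 100, b = 200;            /* two neighbouring source samples */

        /* same arithmetic as the first/last-line loops in planar2x_c */
        assert(((3 * a + b) >> 2) == 125);   /* 0.75*100 + 0.25*200 */
        assert(((a + 3 * b) >> 2) == 175);   /* 0.25*100 + 0.75*200 */
    }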
+
+/**
+ * Height should be a multiple of 2 and width should be a multiple of 16.
+ * (If this is a problem for anyone then tell me, and I will fix it.)
+ * Chrominance data is only taken from every second line, others are ignored.
+ * FIXME: Write HQ version.
+ */
+static inline void uyvytoyv12_c(const uint8_t *src, uint8_t *ydst,
+ uint8_t *udst, uint8_t *vdst,
+ int width, int height, int lumStride,
+ int chromStride, int srcStride)
+{
+ int y;
+ const int chromWidth = width >> 1;
+
+ for (y = 0; y < height; y += 2) {
+ int i;
+ for (i = 0; i < chromWidth; i++) {
+ udst[i] = src[4 * i + 0];
+ ydst[2 * i + 0] = src[4 * i + 1];
+ vdst[i] = src[4 * i + 2];
+ ydst[2 * i + 1] = src[4 * i + 3];
+ }
+ ydst += lumStride;
+ src += srcStride;
+
+ for (i = 0; i < chromWidth; i++) {
+ ydst[2 * i + 0] = src[4 * i + 1];
+ ydst[2 * i + 1] = src[4 * i + 3];
+ }
+ udst += chromStride;
+ vdst += chromStride;
+ ydst += lumStride;
+ src += srcStride;
+ }
+}
+
+/**
+ * Height should be a multiple of 2 and width should be a multiple of 2.
+ * (If this is a problem for anyone then tell me, and I will fix it.)
+ * Chrominance data is only taken from every second line,
+ * others are ignored in the C version.
+ * FIXME: Write HQ version.
+ */
+void rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height, int lumStride,
+ int chromStride, int srcStride)
+{
+ int y;
+ const int chromWidth = width >> 1;
+
+ for (y = 0; y < height; y += 2) {
+ int i;
+ for (i = 0; i < chromWidth; i++) {
+ unsigned int b = src[6 * i + 0];
+ unsigned int g = src[6 * i + 1];
+ unsigned int r = src[6 * i + 2];
+
+ unsigned int Y = ((RY * r + GY * g + BY * b) >> RGB2YUV_SHIFT) + 16;
+ unsigned int V = ((RV * r + GV * g + BV * b) >> RGB2YUV_SHIFT) + 128;
+ unsigned int U = ((RU * r + GU * g + BU * b) >> RGB2YUV_SHIFT) + 128;
+
+ udst[i] = U;
+ vdst[i] = V;
+ ydst[2 * i] = Y;
+
+ b = src[6 * i + 3];
+ g = src[6 * i + 4];
+ r = src[6 * i + 5];
+
+ Y = ((RY * r + GY * g + BY * b) >> RGB2YUV_SHIFT) + 16;
+ ydst[2 * i + 1] = Y;
+ }
+ ydst += lumStride;
+ src += srcStride;
+
+ if (y + 1 == height)
+ break;
+
+ for (i = 0; i < chromWidth; i++) {
+ unsigned int b = src[6 * i + 0];
+ unsigned int g = src[6 * i + 1];
+ unsigned int r = src[6 * i + 2];
+
+ unsigned int Y = ((RY * r + GY * g + BY * b) >> RGB2YUV_SHIFT) + 16;
+
+ ydst[2 * i] = Y;
+
+ b = src[6 * i + 3];
+ g = src[6 * i + 4];
+ r = src[6 * i + 5];
+
+ Y = ((RY * r + GY * g + BY * b) >> RGB2YUV_SHIFT) + 16;
+ ydst[2 * i + 1] = Y;
+ }
+ udst += chromStride;
+ vdst += chromStride;
+ ydst += lumStride;
+ src += srcStride;
+ }
+}
+
+static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dest, int width, int height,
+ int src1Stride, int src2Stride, int dstStride)
+{
+ int h;
+
+ for (h = 0; h < height; h++) {
+ int w;
+ for (w = 0; w < width; w++) {
+ dest[2 * w + 0] = src1[w];
+ dest[2 * w + 1] = src2[w];
+ }
+ dest += dstStride;
+ src1 += src1Stride;
+ src2 += src2Stride;
+ }
+}
+
+static inline void vu9_to_vu12_c(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dst1, uint8_t *dst2,
+ int width, int height,
+ int srcStride1, int srcStride2,
+ int dstStride1, int dstStride2)
+{
+ int x, y;
+ int w = width / 2;
+ int h = height / 2;
+
+ for (y = 0; y < h; y++) {
+ const uint8_t *s1 = src1 + srcStride1 * (y >> 1);
+ uint8_t *d = dst1 + dstStride1 * y;
+ for (x = 0; x < w; x++)
+ d[2 * x] = d[2 * x + 1] = s1[x];
+ }
+ for (y = 0; y < h; y++) {
+ const uint8_t *s2 = src2 + srcStride2 * (y >> 1);
+ uint8_t *d = dst2 + dstStride2 * y;
+ for (x = 0; x < w; x++)
+ d[2 * x] = d[2 * x + 1] = s2[x];
+ }
+}
+
+static inline void yvu9_to_yuy2_c(const uint8_t *src1, const uint8_t *src2,
+ const uint8_t *src3, uint8_t *dst,
+ int width, int height,
+ int srcStride1, int srcStride2,
+ int srcStride3, int dstStride)
+{
+ int x, y;
+ int w = width / 2;
+ int h = height;
+
+ for (y = 0; y < h; y++) {
+ const uint8_t *yp = src1 + srcStride1 * y;
+ const uint8_t *up = src2 + srcStride2 * (y >> 2);
+ const uint8_t *vp = src3 + srcStride3 * (y >> 2);
+ uint8_t *d = dst + dstStride * y;
+ for (x = 0; x < w; x++) {
+ const int x2 = x << 2;
+ d[8 * x + 0] = yp[x2];
+ d[8 * x + 1] = up[x];
+ d[8 * x + 2] = yp[x2 + 1];
+ d[8 * x + 3] = vp[x];
+ d[8 * x + 4] = yp[x2 + 2];
+ d[8 * x + 5] = up[x];
+ d[8 * x + 6] = yp[x2 + 3];
+ d[8 * x + 7] = vp[x];
+ }
+ }
+}
+
+static void extract_even_c(const uint8_t *src, uint8_t *dst, int count)
+{
+ dst += count;
+ src += count * 2;
+ count = -count;
+ while (count < 0) {
+ dst[count] = src[2 * count];
+ count++;
+ }
+}
+
+static void extract_even2_c(const uint8_t *src, uint8_t *dst0, uint8_t *dst1,
+ int count)
+{
+ dst0 += count;
+ dst1 += count;
+ src += count * 4;
+ count = -count;
+ while (count < 0) {
+ dst0[count] = src[4 * count + 0];
+ dst1[count] = src[4 * count + 2];
+ count++;
+ }
+}
+
+static void extract_even2avg_c(const uint8_t *src0, const uint8_t *src1,
+ uint8_t *dst0, uint8_t *dst1, int count)
+{
+ dst0 += count;
+ dst1 += count;
+ src0 += count * 4;
+ src1 += count * 4;
+ count = -count;
+ while (count < 0) {
+ dst0[count] = (src0[4 * count + 0] + src1[4 * count + 0]) >> 1;
+ dst1[count] = (src0[4 * count + 2] + src1[4 * count + 2]) >> 1;
+ count++;
+ }
+}
+
+static void extract_odd2_c(const uint8_t *src, uint8_t *dst0, uint8_t *dst1,
+ int count)
+{
+ dst0 += count;
+ dst1 += count;
+ src += count * 4;
+ count = -count;
+ src++;
+ while (count < 0) {
+ dst0[count] = src[4 * count + 0];
+ dst1[count] = src[4 * count + 2];
+ count++;
+ }
+}
+
+static void extract_odd2avg_c(const uint8_t *src0, const uint8_t *src1,
+ uint8_t *dst0, uint8_t *dst1, int count)
+{
+ dst0 += count;
+ dst1 += count;
+ src0 += count * 4;
+ src1 += count * 4;
+ count = -count;
+ src0++;
+ src1++;
+ while (count < 0) {
+ dst0[count] = (src0[4 * count + 0] + src1[4 * count + 0]) >> 1;
+ dst1[count] = (src0[4 * count + 2] + src1[4 * count + 2]) >> 1;
+ count++;
+ }
+}
+
+static void yuyvtoyuv420_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride)
+{
+ int y;
+ const int chromWidth = -((-width) >> 1);
+
+ for (y = 0; y < height; y++) {
+ extract_even_c(src, ydst, width);
+ if (y & 1) {
+ extract_odd2avg_c(src - srcStride, src, udst, vdst, chromWidth);
+ udst += chromStride;
+ vdst += chromStride;
+ }
+
+ src += srcStride;
+ ydst += lumStride;
+ }
+}
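Editor's note: chromWidth = -((-width) >> 1) is a ceiling division by two, so odd widths keep their last chroma sample, whereas plain width >> 1 would drop it. This relies on an arithmetic right shift of negative values (what gcc does; strictly it is implementation-defined in ISO C). A tiny check of the identity under that assumption:

    #include <assert.h>

    static void check_ceil_halving(void)
    {
        int width = 5;
        /* -((-5) >> 1) == -(-3) == 3 == ceil(5 / 2.0), assuming arithmetic >> */
        assert(-((-width) >> 1) == (width + 1) / 2);
    }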
+
+static void yuyvtoyuv422_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride)
+{
+ int y;
+ const int chromWidth = -((-width) >> 1);
+
+ for (y = 0; y < height; y++) {
+ extract_even_c(src, ydst, width);
+ extract_odd2_c(src, udst, vdst, chromWidth);
+
+ src += srcStride;
+ ydst += lumStride;
+ udst += chromStride;
+ vdst += chromStride;
+ }
+}
+
+static void uyvytoyuv420_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride)
+{
+ int y;
+ const int chromWidth = -((-width) >> 1);
+
+ for (y = 0; y < height; y++) {
+ extract_even_c(src + 1, ydst, width);
+ if (y & 1) {
+ extract_even2avg_c(src - srcStride, src, udst, vdst, chromWidth);
+ udst += chromStride;
+ vdst += chromStride;
+ }
+
+ src += srcStride;
+ ydst += lumStride;
+ }
+}
+
+static void uyvytoyuv422_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride)
+{
+ int y;
+ const int chromWidth = -((-width) >> 1);
+
+ for (y = 0; y < height; y++) {
+ extract_even_c(src + 1, ydst, width);
+ extract_even2_c(src, udst, vdst, chromWidth);
+
+ src += srcStride;
+ ydst += lumStride;
+ udst += chromStride;
+ vdst += chromStride;
+ }
+}
+
+static inline void rgb2rgb_init_c(void)
+{
+ rgb15to16 = rgb15to16_c;
+ rgb15tobgr24 = rgb15tobgr24_c;
+ rgb15to32 = rgb15to32_c;
+ rgb16tobgr24 = rgb16tobgr24_c;
+ rgb16to32 = rgb16to32_c;
+ rgb16to15 = rgb16to15_c;
+ rgb24tobgr16 = rgb24tobgr16_c;
+ rgb24tobgr15 = rgb24tobgr15_c;
+ rgb24tobgr32 = rgb24tobgr32_c;
+ rgb32to16 = rgb32to16_c;
+ rgb32to15 = rgb32to15_c;
+ rgb32tobgr24 = rgb32tobgr24_c;
+ rgb24to15 = rgb24to15_c;
+ rgb24to16 = rgb24to16_c;
+ rgb24tobgr24 = rgb24tobgr24_c;
+ shuffle_bytes_2103 = shuffle_bytes_2103_c;
+ rgb32tobgr16 = rgb32tobgr16_c;
+ rgb32tobgr15 = rgb32tobgr15_c;
+ yv12toyuy2 = yv12toyuy2_c;
+ yv12touyvy = yv12touyvy_c;
+ yuv422ptoyuy2 = yuv422ptoyuy2_c;
+ yuv422ptouyvy = yuv422ptouyvy_c;
+ yuy2toyv12 = yuy2toyv12_c;
+ planar2x = planar2x_c;
+ rgb24toyv12 = rgb24toyv12_c;
+ interleaveBytes = interleaveBytes_c;
+ vu9_to_vu12 = vu9_to_vu12_c;
+ yvu9_to_yuy2 = yvu9_to_yuy2_c;
+
+ uyvytoyuv420 = uyvytoyuv420_c;
+ uyvytoyuv422 = uyvytoyuv422_c;
+ yuyvtoyuv420 = yuyvtoyuv420_c;
+ yuyvtoyuv422 = yuyvtoyuv422_c;
+}
diff --git a/ffmpeg/libswscale/sparc/Makefile b/ffmpeg/libswscale/sparc/Makefile
new file mode 100644
index 0000000..2351ba4
--- /dev/null
+++ b/ffmpeg/libswscale/sparc/Makefile
@@ -0,0 +1 @@
+VIS-OBJS += sparc/yuv2rgb_vis.o \
diff --git a/ffmpeg/libswscale/sparc/yuv2rgb_vis.c b/ffmpeg/libswscale/sparc/yuv2rgb_vis.c
new file mode 100644
index 0000000..ed00837
--- /dev/null
+++ b/ffmpeg/libswscale/sparc/yuv2rgb_vis.c
@@ -0,0 +1,212 @@
+/*
+ * VIS optimized software YUV to RGB converter
+ * Copyright (c) 2007 Denes Balatoni <dbalatoni@programozo.hu>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <inttypes.h>
+#include <stdlib.h>
+
+#include "libavutil/attributes.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+
+#define YUV2RGB_INIT \
+ "wr %%g0, 0x10, %%gsr \n\t" \
+ "ldd [%5], %%f32 \n\t" \
+ "ldd [%5 + 8], %%f34 \n\t" \
+ "ldd [%5 + 16], %%f36 \n\t" \
+ "ldd [%5 + 24], %%f38 \n\t" \
+ "ldd [%5 + 32], %%f40 \n\t" \
+ "ldd [%5 + 40], %%f42 \n\t" \
+ "ldd [%5 + 48], %%f44 \n\t" \
+ "ldd [%5 + 56], %%f46 \n\t" \
+ "ldd [%5 + 64], %%f48 \n\t" \
+ "ldd [%5 + 72], %%f50 \n\t"
+
+#define YUV2RGB_KERNEL \
+ /* ^^^^ f0=Y f3=u f5=v */ \
+ "fmul8x16 %%f3, %%f48, %%f6 \n\t" \
+ "fmul8x16 %%f19, %%f48, %%f22 \n\t" \
+ "fmul8x16 %%f5, %%f44, %%f8 \n\t" \
+ "fmul8x16 %%f21, %%f44, %%f24 \n\t" \
+ "fmul8x16 %%f0, %%f42, %%f0 \n\t" \
+ "fmul8x16 %%f16, %%f42, %%f16 \n\t" \
+ "fmul8x16 %%f3, %%f50, %%f2 \n\t" \
+ "fmul8x16 %%f19, %%f50, %%f18 \n\t" \
+ "fmul8x16 %%f5, %%f46, %%f4 \n\t" \
+ "fmul8x16 %%f21, %%f46, %%f20 \n\t" \
+ \
+ "fpsub16 %%f6, %%f34, %%f6 \n\t" /* 1 */ \
+ "fpsub16 %%f22, %%f34, %%f22 \n\t" /* 1 */ \
+ "fpsub16 %%f8, %%f38, %%f8 \n\t" /* 3 */ \
+ "fpsub16 %%f24, %%f38, %%f24 \n\t" /* 3 */ \
+ "fpsub16 %%f0, %%f32, %%f0 \n\t" /* 0 */ \
+ "fpsub16 %%f16, %%f32, %%f16 \n\t" /* 0 */ \
+ "fpsub16 %%f2, %%f36, %%f2 \n\t" /* 2 */ \
+ "fpsub16 %%f18, %%f36, %%f18 \n\t" /* 2 */ \
+ "fpsub16 %%f4, %%f40, %%f4 \n\t" /* 4 */ \
+ "fpsub16 %%f20, %%f40, %%f20 \n\t" /* 4 */ \
+ \
+ "fpadd16 %%f0, %%f8, %%f8 \n\t" /* Gt */ \
+ "fpadd16 %%f16, %%f24, %%f24 \n\t" /* Gt */ \
+ "fpadd16 %%f0, %%f4, %%f4 \n\t" /* R */ \
+ "fpadd16 %%f16, %%f20, %%f20 \n\t" /* R */ \
+ "fpadd16 %%f0, %%f6, %%f6 \n\t" /* B */ \
+ "fpadd16 %%f16, %%f22, %%f22 \n\t" /* B */ \
+ "fpadd16 %%f8, %%f2, %%f2 \n\t" /* G */ \
+ "fpadd16 %%f24, %%f18, %%f18 \n\t" /* G */ \
+ \
+ "fpack16 %%f4, %%f4 \n\t" \
+ "fpack16 %%f20, %%f20 \n\t" \
+ "fpack16 %%f6, %%f6 \n\t" \
+ "fpack16 %%f22, %%f22 \n\t" \
+ "fpack16 %%f2, %%f2 \n\t" \
+ "fpack16 %%f18, %%f18 \n\t"
+
+// FIXME: must be changed to set alpha to 255 instead of 0
+static int vis_420P_ARGB32(SwsContext *c, uint8_t *src[], int srcStride[],
+ int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ int y, out1, out2, out3, out4, out5, out6;
+
+ for (y = 0; y < srcSliceH; ++y)
+ __asm__ volatile (
+ YUV2RGB_INIT
+ "wr %%g0, 0xd2, %%asi \n\t" /* ASI_FL16_P */
+ "1: \n\t"
+ "ldda [%1] %%asi, %%f2 \n\t"
+ "ldda [%1 + 2] %%asi, %%f18 \n\t"
+ "ldda [%2] %%asi, %%f4 \n\t"
+ "ldda [%2 + 2] %%asi, %%f20 \n\t"
+ "ld [%0], %%f0 \n\t"
+ "ld [%0+4], %%f16 \n\t"
+ "fpmerge %%f3, %%f3, %%f2 \n\t"
+ "fpmerge %%f19, %%f19, %%f18 \n\t"
+ "fpmerge %%f5, %%f5, %%f4 \n\t"
+ "fpmerge %%f21, %%f21, %%f20 \n\t"
+ YUV2RGB_KERNEL
+ "fzero %%f0 \n\t"
+ "fpmerge %%f4, %%f6, %%f8 \n\t" // r, b, t1
+ "fpmerge %%f20, %%f22, %%f24 \n\t" // r, b, t1
+ "fpmerge %%f0, %%f2, %%f10 \n\t" // 0, g, t2
+ "fpmerge %%f0, %%f18, %%f26 \n\t" // 0, g, t2
+ "fpmerge %%f10, %%f8, %%f4 \n\t" // t2, t1, msb
+ "fpmerge %%f26, %%f24, %%f20 \n\t" // t2, t1, msb
+ "fpmerge %%f11, %%f9, %%f6 \n\t" // t2, t1, lsb
+ "fpmerge %%f27, %%f25, %%f22 \n\t" // t2, t1, lsb
+ "std %%f4, [%3] \n\t"
+ "std %%f20, [%3 + 16] \n\t"
+ "std %%f6, [%3 + 8] \n\t"
+ "std %%f22, [%3 + 24] \n\t"
+
+ "add %0, 8, %0 \n\t"
+ "add %1, 4, %1 \n\t"
+ "add %2, 4, %2 \n\t"
+ "subcc %4, 8, %4 \n\t"
+ "bne 1b \n\t"
+ "add %3, 32, %3 \n\t" // delay slot
+ : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6)
+ : "0" (src[0] + (y + srcSliceY) * srcStride[0]), "1" (src[1] + ((y + srcSliceY) >> 1) * srcStride[1]),
+ "2" (src[2] + ((y + srcSliceY) >> 1) * srcStride[2]), "3" (dst[0] + (y + srcSliceY) * dstStride[0]),
+ "4" (c->dstW),
+ "5" (c->sparc_coeffs)
+ );
+
+ return srcSliceH;
+}
+
+// FIXME: must be changed to set alpha to 255 instead of 0
+static int vis_422P_ARGB32(SwsContext *c, uint8_t *src[], int srcStride[],
+ int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ int y, out1, out2, out3, out4, out5, out6;
+
+ for (y = 0; y < srcSliceH; ++y)
+ __asm__ volatile (
+ YUV2RGB_INIT
+ "wr %%g0, 0xd2, %%asi \n\t" /* ASI_FL16_P */
+ "1: \n\t"
+ "ldda [%1] %%asi, %%f2 \n\t"
+ "ldda [%1 + 2] %%asi, %%f18 \n\t"
+ "ldda [%2] %%asi, %%f4 \n\t"
+ "ldda [%2 + 2] %%asi, %%f20 \n\t"
+ "ld [%0], %%f0 \n\t"
+ "ld [%0 + 4], %%f16 \n\t"
+ "fpmerge %%f3, %%f3, %%f2 \n\t"
+ "fpmerge %%f19, %%f19, %%f18 \n\t"
+ "fpmerge %%f5, %%f5, %%f4 \n\t"
+ "fpmerge %%f21, %%f21, %%f20 \n\t"
+ YUV2RGB_KERNEL
+ "fzero %%f0 \n\t"
+ "fpmerge %%f4, %%f6, %%f8 \n\t" // r,b,t1
+ "fpmerge %%f20, %%f22, %%f24 \n\t" // r,b,t1
+ "fpmerge %%f0, %%f2, %%f10 \n\t" // 0,g,t2
+ "fpmerge %%f0, %%f18, %%f26 \n\t" // 0,g,t2
+ "fpmerge %%f10, %%f8, %%f4 \n\t" // t2,t1,msb
+ "fpmerge %%f26, %%f24, %%f20 \n\t" // t2,t1,msb
+ "fpmerge %%f11, %%f9, %%f6 \n\t" // t2,t1,lsb
+ "fpmerge %%f27, %%f25, %%f22 \n\t" // t2,t1,lsb
+ "std %%f4, [%3] \n\t"
+ "std %%f20, [%3 + 16] \n\t"
+ "std %%f6, [%3 + 8] \n\t"
+ "std %%f22, [%3 + 24] \n\t"
+
+ "add %0, 8, %0 \n\t"
+ "add %1, 4, %1 \n\t"
+ "add %2, 4, %2 \n\t"
+ "subcc %4, 8, %4 \n\t"
+ "bne 1b \n\t"
+ "add %3, 32, %3 \n\t" //delay slot
+ : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6)
+ : "0" (src[0] + (y + srcSliceY) * srcStride[0]), "1" (src[1] + (y + srcSliceY) * srcStride[1]),
+ "2" (src[2] + (y + srcSliceY) * srcStride[2]), "3" (dst[0] + (y + srcSliceY) * dstStride[0]),
+ "4" (c->dstW),
+ "5" (c->sparc_coeffs)
+ );
+
+ return srcSliceH;
+}
+
+av_cold SwsFunc ff_yuv2rgb_init_vis(SwsContext *c)
+{
+ c->sparc_coeffs[5] = c->yCoeff;
+ c->sparc_coeffs[6] = c->vgCoeff;
+ c->sparc_coeffs[7] = c->vrCoeff;
+ c->sparc_coeffs[8] = c->ubCoeff;
+ c->sparc_coeffs[9] = c->ugCoeff;
+
+ c->sparc_coeffs[0] = (((int16_t)c->yOffset * (int16_t)c->yCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
+ c->sparc_coeffs[1] = (((int16_t)c->uOffset * (int16_t)c->ubCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
+ c->sparc_coeffs[2] = (((int16_t)c->uOffset * (int16_t)c->ugCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
+ c->sparc_coeffs[3] = (((int16_t)c->vOffset * (int16_t)c->vgCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
+ c->sparc_coeffs[4] = (((int16_t)c->vOffset * (int16_t)c->vrCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
+
+ if (c->dstFormat == AV_PIX_FMT_RGB32 && c->srcFormat == AV_PIX_FMT_YUV422P && (c->dstW & 7) == 0) {
+ av_log(c, AV_LOG_INFO,
+ "SPARC VIS accelerated YUV422P -> RGB32 (WARNING: alpha value is wrong)\n");
+ return vis_422P_ARGB32;
+ } else if (c->dstFormat == AV_PIX_FMT_RGB32 && c->srcFormat == AV_PIX_FMT_YUV420P && (c->dstW & 7) == 0) {
+ av_log(c, AV_LOG_INFO,
+ "SPARC VIS accelerated YUV420P -> RGB32 (WARNING: alpha value is wrong)\n");
+ return vis_420P_ARGB32;
+ }
+ return NULL;
+}
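The multiply by 0x0001000100010001ULL above is a lane-replication trick: it copies one 16-bit fixed-point term into all four 16-bit lanes of a 64-bit word, so the VIS partitioned multiplies see the same coefficient in every lane. A standalone sketch of the idea (not part of the patch; the value is an arbitrary example):

    #include <stdint.h>
    #include <stdio.h>

    /* Replicate a 16-bit value into all four lanes of a 64-bit word,
     * as done for c->sparc_coeffs[0..4] above. */
    static uint64_t replicate16(uint16_t v)
    {
        return (uint64_t)v * 0x0001000100010001ULL;
    }

    int main(void)
    {
        uint16_t lane = (int16_t)-264 & 0xffff;   /* arbitrary example term */
        printf("%016llx\n", (unsigned long long)replicate16(lane));
        /* prints fef8fef8fef8fef8 - the same 16-bit pattern in every lane */
        return 0;
    }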
diff --git a/ffmpeg/libswscale/swscale-test.c b/ffmpeg/libswscale/swscale-test.c
new file mode 100644
index 0000000..aece61e
--- /dev/null
+++ b/ffmpeg/libswscale/swscale-test.c
@@ -0,0 +1,415 @@
+/*
+ * Copyright (C) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <stdarg.h>
+
+#undef HAVE_AV_CONFIG_H
+#include "libavutil/imgutils.h"
+#include "libavutil/mem.h"
+#include "libavutil/avutil.h"
+#include "libavutil/crc.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/lfg.h"
+#include "swscale.h"
+
+/* HACK Duplicated from swscale_internal.h.
+ * Should be removed when a cleaner pixel format system exists. */
+#define isGray(x) \
+ ((x) == AV_PIX_FMT_GRAY8 || \
+ (x) == AV_PIX_FMT_Y400A || \
+ (x) == AV_PIX_FMT_GRAY16BE || \
+ (x) == AV_PIX_FMT_GRAY16LE)
+#define hasChroma(x) \
+ (!(isGray(x) || \
+ (x) == AV_PIX_FMT_MONOBLACK || \
+ (x) == AV_PIX_FMT_MONOWHITE))
+#define isALPHA(x) \
+ ((x) == AV_PIX_FMT_BGR32 || \
+ (x) == AV_PIX_FMT_BGR32_1 || \
+ (x) == AV_PIX_FMT_RGB32 || \
+ (x) == AV_PIX_FMT_RGB32_1 || \
+ (x) == AV_PIX_FMT_YUVA420P)
+
+static uint64_t getSSD(const uint8_t *src1, const uint8_t *src2, int stride1,
+ int stride2, int w, int h)
+{
+ int x, y;
+ uint64_t ssd = 0;
+
+ for (y = 0; y < h; y++) {
+ for (x = 0; x < w; x++) {
+ int d = src1[x + y * stride1] - src2[x + y * stride2];
+ ssd += d * d;
+ }
+ }
+ return ssd;
+}
+
+struct Results {
+ uint64_t ssdY;
+ uint64_t ssdU;
+ uint64_t ssdV;
+ uint64_t ssdA;
+ uint32_t crc;
+};
+
+// test by ref -> src -> dst -> out & compare out against ref
+// ref & out are YV12
+static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
+ enum AVPixelFormat srcFormat, enum AVPixelFormat dstFormat,
+ int srcW, int srcH, int dstW, int dstH, int flags,
+ struct Results *r)
+{
+ const AVPixFmtDescriptor *desc_yuva420p = av_pix_fmt_desc_get(AV_PIX_FMT_YUVA420P);
+ const AVPixFmtDescriptor *desc_src = av_pix_fmt_desc_get(srcFormat);
+ const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(dstFormat);
+ static enum AVPixelFormat cur_srcFormat;
+ static int cur_srcW, cur_srcH;
+ static uint8_t *src[4];
+ static int srcStride[4];
+ uint8_t *dst[4] = { 0 };
+ uint8_t *out[4] = { 0 };
+ int dstStride[4] = {0};
+ int i;
+ uint64_t ssdY, ssdU = 0, ssdV = 0, ssdA = 0;
+ struct SwsContext *dstContext = NULL, *outContext = NULL;
+ uint32_t crc = 0;
+ int res = 0;
+
+ if (cur_srcFormat != srcFormat || cur_srcW != srcW || cur_srcH != srcH) {
+ struct SwsContext *srcContext = NULL;
+ int p;
+
+ for (p = 0; p < 4; p++)
+ av_freep(&src[p]);
+
+ av_image_fill_linesizes(srcStride, srcFormat, srcW);
+ for (p = 0; p < 4; p++) {
+ srcStride[p] = FFALIGN(srcStride[p], 16);
+ if (srcStride[p])
+ src[p] = av_mallocz(srcStride[p] * srcH + 16);
+ if (srcStride[p] && !src[p]) {
+ perror("Malloc");
+ res = -1;
+ goto end;
+ }
+ }
+ srcContext = sws_getContext(w, h, AV_PIX_FMT_YUVA420P, srcW, srcH,
+ srcFormat, SWS_BILINEAR, NULL, NULL, NULL);
+ if (!srcContext) {
+ fprintf(stderr, "Failed to get %s ---> %s\n",
+ desc_yuva420p->name,
+ desc_src->name);
+ res = -1;
+ goto end;
+ }
+ sws_scale(srcContext, (const uint8_t * const*)ref, refStride, 0, h, src, srcStride);
+ sws_freeContext(srcContext);
+
+ cur_srcFormat = srcFormat;
+ cur_srcW = srcW;
+ cur_srcH = srcH;
+ }
+
+ av_image_fill_linesizes(dstStride, dstFormat, dstW);
+ for (i = 0; i < 4; i++) {
+ /* Image buffers passed into libswscale can be allocated any way you
+ * prefer, as long as they're aligned enough for the architecture, and
+ * they're freed appropriately (such as using av_free for buffers
+ * allocated with av_malloc). */
+ /* An extra 16 bytes is being allocated because some scalers may write
+ * out of bounds. */
+ dstStride[i] = FFALIGN(dstStride[i], 16);
+ if (dstStride[i])
+ dst[i] = av_mallocz(dstStride[i] * dstH + 16);
+ if (dstStride[i] && !dst[i]) {
+ perror("Malloc");
+ res = -1;
+
+ goto end;
+ }
+ }
+
+ dstContext = sws_getContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat,
+ flags, NULL, NULL, NULL);
+ if (!dstContext) {
+ fprintf(stderr, "Failed to get %s ---> %s\n",
+ desc_src->name, desc_dst->name);
+ res = -1;
+ goto end;
+ }
+
+ printf(" %s %dx%d -> %s %3dx%3d flags=%2d",
+ desc_src->name, srcW, srcH,
+ desc_dst->name, dstW, dstH,
+ flags);
+ fflush(stdout);
+
+ sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride);
+
+ for (i = 0; i < 4 && dstStride[i]; i++)
+ crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i],
+ dstStride[i] * dstH);
+
+ if (r && crc == r->crc) {
+ ssdY = r->ssdY;
+ ssdU = r->ssdU;
+ ssdV = r->ssdV;
+ ssdA = r->ssdA;
+ } else {
+ for (i = 0; i < 4; i++) {
+ refStride[i] = FFALIGN(refStride[i], 16);
+ if (refStride[i])
+ out[i] = av_mallocz(refStride[i] * h);
+ if (refStride[i] && !out[i]) {
+ perror("Malloc");
+ res = -1;
+ goto end;
+ }
+ }
+ outContext = sws_getContext(dstW, dstH, dstFormat, w, h,
+ AV_PIX_FMT_YUVA420P, SWS_BILINEAR,
+ NULL, NULL, NULL);
+ if (!outContext) {
+ fprintf(stderr, "Failed to get %s ---> %s\n",
+ desc_dst->name,
+ desc_yuva420p->name);
+ res = -1;
+ goto end;
+ }
+ sws_scale(outContext, (const uint8_t * const*)dst, dstStride, 0, dstH, out, refStride);
+
+ ssdY = getSSD(ref[0], out[0], refStride[0], refStride[0], w, h);
+ if (hasChroma(srcFormat) && hasChroma(dstFormat)) {
+ //FIXME check that output is really gray
+ ssdU = getSSD(ref[1], out[1], refStride[1], refStride[1],
+ (w + 1) >> 1, (h + 1) >> 1);
+ ssdV = getSSD(ref[2], out[2], refStride[2], refStride[2],
+ (w + 1) >> 1, (h + 1) >> 1);
+ }
+ if (isALPHA(srcFormat) && isALPHA(dstFormat))
+ ssdA = getSSD(ref[3], out[3], refStride[3], refStride[3], w, h);
+
+ ssdY /= w * h;
+ ssdU /= w * h / 4;
+ ssdV /= w * h / 4;
+ ssdA /= w * h;
+
+ sws_freeContext(outContext);
+
+ for (i = 0; i < 4; i++)
+ if (refStride[i])
+ av_free(out[i]);
+ }
+
+ printf(" CRC=%08x SSD=%5"PRId64 ",%5"PRId64 ",%5"PRId64 ",%5"PRId64 "\n",
+ crc, ssdY, ssdU, ssdV, ssdA);
+
+end:
+ sws_freeContext(dstContext);
+
+ for (i = 0; i < 4; i++)
+ if (dstStride[i])
+ av_free(dst[i]);
+
+ return res;
+}
+
+static void selfTest(uint8_t *ref[4], int refStride[4], int w, int h,
+ enum AVPixelFormat srcFormat_in,
+ enum AVPixelFormat dstFormat_in)
+{
+ const int flags[] = { SWS_FAST_BILINEAR, SWS_BILINEAR, SWS_BICUBIC,
+ SWS_X, SWS_POINT, SWS_AREA, 0 };
+ const int srcW = w;
+ const int srcH = h;
+ const int dstW[] = { srcW - srcW / 3, srcW, srcW + srcW / 3, 0 };
+ const int dstH[] = { srcH - srcH / 3, srcH, srcH + srcH / 3, 0 };
+ enum AVPixelFormat srcFormat, dstFormat;
+ const AVPixFmtDescriptor *desc_src, *desc_dst;
+
+ for (srcFormat = srcFormat_in != AV_PIX_FMT_NONE ? srcFormat_in : 0;
+ srcFormat < AV_PIX_FMT_NB; srcFormat++) {
+ if (!sws_isSupportedInput(srcFormat) ||
+ !sws_isSupportedOutput(srcFormat))
+ continue;
+
+ desc_src = av_pix_fmt_desc_get(srcFormat);
+
+ for (dstFormat = dstFormat_in != AV_PIX_FMT_NONE ? dstFormat_in : 0;
+ dstFormat < AV_PIX_FMT_NB; dstFormat++) {
+ int i, j, k;
+ int res = 0;
+
+ if (!sws_isSupportedInput(dstFormat) ||
+ !sws_isSupportedOutput(dstFormat))
+ continue;
+
+ desc_dst = av_pix_fmt_desc_get(dstFormat);
+
+ printf("%s -> %s\n", desc_src->name, desc_dst->name);
+ fflush(stdout);
+
+ for (k = 0; flags[k] && !res; k++)
+ for (i = 0; dstW[i] && !res; i++)
+ for (j = 0; dstH[j] && !res; j++)
+ res = doTest(ref, refStride, w, h,
+ srcFormat, dstFormat,
+ srcW, srcH, dstW[i], dstH[j], flags[k],
+ NULL);
+ if (dstFormat_in != AV_PIX_FMT_NONE)
+ break;
+ }
+ if (srcFormat_in != AV_PIX_FMT_NONE)
+ break;
+ }
+}
+
+static int fileTest(uint8_t *ref[4], int refStride[4], int w, int h, FILE *fp,
+ enum AVPixelFormat srcFormat_in,
+ enum AVPixelFormat dstFormat_in)
+{
+ char buf[256];
+
+ while (fgets(buf, sizeof(buf), fp)) {
+ struct Results r;
+ enum AVPixelFormat srcFormat;
+ char srcStr[12];
+ int srcW, srcH;
+ enum AVPixelFormat dstFormat;
+ char dstStr[12];
+ int dstW, dstH;
+ int flags;
+ int ret;
+
+ ret = sscanf(buf,
+ " %12s %dx%d -> %12s %dx%d flags=%d CRC=%x"
+ " SSD=%"PRId64 ", %"PRId64 ", %"PRId64 ", %"PRId64 "\n",
+ srcStr, &srcW, &srcH, dstStr, &dstW, &dstH,
+ &flags, &r.crc, &r.ssdY, &r.ssdU, &r.ssdV, &r.ssdA);
+ if (ret != 12) {
+ srcStr[0] = dstStr[0] = 0;
+ ret = sscanf(buf, "%12s -> %12s\n", srcStr, dstStr);
+ }
+
+ srcFormat = av_get_pix_fmt(srcStr);
+ dstFormat = av_get_pix_fmt(dstStr);
+
+ if (srcFormat == AV_PIX_FMT_NONE || dstFormat == AV_PIX_FMT_NONE ||
+ srcW > 8192U || srcH > 8192U || dstW > 8192U || dstH > 8192U) {
+ fprintf(stderr, "malformed input file\n");
+ return -1;
+ }
+ if ((srcFormat_in != AV_PIX_FMT_NONE && srcFormat_in != srcFormat) ||
+ (dstFormat_in != AV_PIX_FMT_NONE && dstFormat_in != dstFormat))
+ continue;
+ if (ret != 12) {
+ printf("%s", buf);
+ continue;
+ }
+
+ doTest(ref, refStride, w, h,
+ srcFormat, dstFormat,
+ srcW, srcH, dstW, dstH, flags,
+ &r);
+ }
+
+ return 0;
+}
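Grounded in the option parsing and output format above: the test program is typically run either in self-test mode or against a reference produced by an earlier run (the binary name may differ by build setup), e.g.

    ./swscale-test > reference.txt
    ./swscale-test -ref reference.txt -src yuv420p -dst rgb24

Each reference line follows the printf format used by doTest(); a hypothetical line (the CRC and SSD values here are placeholders, not real results) looks like:

     yuva420p 96x96 -> rgb24 128x128 flags= 2 CRC=00000000 SSD=    0,    0,    0,    0

Lines of the short form "srcfmt -> dstfmt", as printed by selfTest(), are also accepted and simply echoed.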
+
+#define W 96
+#define H 96
+
+int main(int argc, char **argv)
+{
+ enum AVPixelFormat srcFormat = AV_PIX_FMT_NONE;
+ enum AVPixelFormat dstFormat = AV_PIX_FMT_NONE;
+ uint8_t *rgb_data = av_malloc(W * H * 4);
+ const uint8_t * const rgb_src[4] = { rgb_data, NULL, NULL, NULL };
+ int rgb_stride[4] = { 4 * W, 0, 0, 0 };
+ uint8_t *data = av_malloc(4 * W * H);
+ uint8_t *src[4] = { data, data + W * H, data + W * H * 2, data + W * H * 3 };
+ int stride[4] = { W, W, W, W };
+ int x, y;
+ struct SwsContext *sws;
+ AVLFG rand;
+ int res = -1;
+ int i;
+ FILE *fp = NULL;
+
+ if (!rgb_data || !data)
+ return -1;
+
+ for (i = 1; i < argc; i += 2) {
+ if (argv[i][0] != '-' || i + 1 == argc)
+ goto bad_option;
+ if (!strcmp(argv[i], "-ref")) {
+ fp = fopen(argv[i + 1], "r");
+ if (!fp) {
+ fprintf(stderr, "could not open '%s'\n", argv[i + 1]);
+ goto error;
+ }
+ } else if (!strcmp(argv[i], "-src")) {
+ srcFormat = av_get_pix_fmt(argv[i + 1]);
+ if (srcFormat == AV_PIX_FMT_NONE) {
+ fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
+ return -1;
+ }
+ } else if (!strcmp(argv[i], "-dst")) {
+ dstFormat = av_get_pix_fmt(argv[i + 1]);
+ if (dstFormat == AV_PIX_FMT_NONE) {
+ fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
+ return -1;
+ }
+ } else {
+bad_option:
+ fprintf(stderr, "bad option or argument missing (%s)\n", argv[i]);
+ goto error;
+ }
+ }
+
+ sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H,
+ AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL);
+
+ av_lfg_init(&rand, 1);
+
+ for (y = 0; y < H; y++)
+ for (x = 0; x < W * 4; x++)
+ rgb_data[ x + y * 4 * W] = av_lfg_get(&rand);
+ sws_scale(sws, rgb_src, rgb_stride, 0, H, src, stride);
+ sws_freeContext(sws);
+ av_free(rgb_data);
+
+ if(fp) {
+ res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat);
+ fclose(fp);
+ } else {
+ selfTest(src, stride, W, H, srcFormat, dstFormat);
+ res = 0;
+ }
+error:
+ av_free(data);
+
+ return res;
+}
diff --git a/ffmpeg/libswscale/swscale.c b/ffmpeg/libswscale/swscale.c
new file mode 100644
index 0000000..bb90819
--- /dev/null
+++ b/ffmpeg/libswscale/swscale.c
@@ -0,0 +1,974 @@
+/*
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <assert.h>
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "libavutil/avassert.h"
+#include "libavutil/avutil.h"
+#include "libavutil/bswap.h"
+#include "libavutil/cpu.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/pixdesc.h"
+#include "config.h"
+#include "rgb2rgb.h"
+#include "swscale_internal.h"
+#include "swscale.h"
+
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = {
+ { 36, 68, 60, 92, 34, 66, 58, 90, },
+ { 100, 4, 124, 28, 98, 2, 122, 26, },
+ { 52, 84, 44, 76, 50, 82, 42, 74, },
+ { 116, 20, 108, 12, 114, 18, 106, 10, },
+ { 32, 64, 56, 88, 38, 70, 62, 94, },
+ { 96, 0, 120, 24, 102, 6, 126, 30, },
+ { 48, 80, 40, 72, 54, 86, 46, 78, },
+ { 112, 16, 104, 8, 118, 22, 110, 14, },
+};
+
+DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] = {
+ 64, 64, 64, 64, 64, 64, 64, 64
+};
+
+static av_always_inline void fillPlane(uint8_t *plane, int stride, int width,
+ int height, int y, uint8_t val)
+{
+ int i;
+ uint8_t *ptr = plane + stride * y;
+ for (i = 0; i < height; i++) {
+ memset(ptr, val, width);
+ ptr += stride;
+ }
+}
+
+static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
+ int i;
+ int32_t *dst = (int32_t *) _dst;
+ const uint16_t *src = (const uint16_t *) _src;
+ int bits = desc->comp[0].depth_minus1;
+ int sh = bits - 4;
+
+ if((isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8) && desc->comp[0].depth_minus1<15)
+ sh= 9;
+
+ for (i = 0; i < dstW; i++) {
+ int j;
+ int srcPos = filterPos[i];
+ int val = 0;
+
+ for (j = 0; j < filterSize; j++) {
+ val += src[srcPos + j] * filter[filterSize * i + j];
+ }
+ // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
+ dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
+ }
+}
+
+static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
+ int i;
+ const uint16_t *src = (const uint16_t *) _src;
+ int sh = desc->comp[0].depth_minus1;
+
+ if(sh<15)
+ sh= isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8 ? 13 : desc->comp[0].depth_minus1;
+
+ for (i = 0; i < dstW; i++) {
+ int j;
+ int srcPos = filterPos[i];
+ int val = 0;
+
+ for (j = 0; j < filterSize; j++) {
+ val += src[srcPos + j] * filter[filterSize * i + j];
+ }
+ // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
+ dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
+ }
+}
+
+// bilinear / bicubic scaling
+static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ int i;
+ for (i = 0; i < dstW; i++) {
+ int j;
+ int srcPos = filterPos[i];
+ int val = 0;
+ for (j = 0; j < filterSize; j++) {
+ val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
+ }
+ dst[i] = FFMIN(val >> 7, (1 << 15) - 1); // the cubic equation does overflow ...
+ }
+}
+
+static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ int i;
+ int32_t *dst = (int32_t *) _dst;
+ for (i = 0; i < dstW; i++) {
+ int j;
+ int srcPos = filterPos[i];
+ int val = 0;
+ for (j = 0; j < filterSize; j++) {
+ val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
+ }
+ dst[i] = FFMIN(val >> 3, (1 << 19) - 1); // the cubic equation does overflow ...
+ }
+}
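hScale8To15_c, hScale8To19_c and the 16-bit variants above all evaluate the same one-dimensional convolution: filterPos[i] is the first source sample contributing to output pixel i, and filter[filterSize * i + j] are the per-tap weights built by the filter-construction code elsewhere (utils.c). A toy, self-contained sketch of that access pattern with a hand-built 2-tap filter (the real filters carry 14-bit weights summing to 1 << 14, so the >> 7 leaves a 15-bit result; here the toy weights sum to 1 << 7, so the output is just the pairwise average):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint8_t src[8]       = { 0, 16, 32, 48, 64, 80, 96, 112 };
        int32_t filterPos[4] = { 0, 2, 4, 6 };       /* 2:1 downscale positions */
        int16_t filter[8]    = { 64, 64, 64, 64, 64, 64, 64, 64 };
        int     filterSize   = 2;
        int16_t dst[4];

        for (int i = 0; i < 4; i++) {
            int val = 0;
            for (int j = 0; j < filterSize; j++)
                val += src[filterPos[i] + j] * filter[filterSize * i + j];
            dst[i] = val >> 7;
            printf("%d ", dst[i]);                   /* 8 40 72 104 */
        }
        printf("\n");
        return 0;
    }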
+
+// FIXME all pal and rgb srcFormats could do this conversion as well
+// FIXME all scalers more complex than bilinear could do half of this transform
+static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
+{
+ int i;
+ for (i = 0; i < width; i++) {
+ dstU[i] = (FFMIN(dstU[i], 30775) * 4663 - 9289992) >> 12; // -264
+ dstV[i] = (FFMIN(dstV[i], 30775) * 4663 - 9289992) >> 12; // -264
+ }
+}
+
+static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
+{
+ int i;
+ for (i = 0; i < width; i++) {
+ dstU[i] = (dstU[i] * 1799 + 4081085) >> 11; // 1469
+ dstV[i] = (dstV[i] * 1799 + 4081085) >> 11; // 1469
+ }
+}
+
+static void lumRangeToJpeg_c(int16_t *dst, int width)
+{
+ int i;
+ for (i = 0; i < width; i++)
+ dst[i] = (FFMIN(dst[i], 30189) * 19077 - 39057361) >> 14;
+}
+
+static void lumRangeFromJpeg_c(int16_t *dst, int width)
+{
+ int i;
+ for (i = 0; i < width; i++)
+ dst[i] = (dst[i] * 14071 + 33561947) >> 14;
+}
+
+static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
+{
+ int i;
+ int32_t *dstU = (int32_t *) _dstU;
+ int32_t *dstV = (int32_t *) _dstV;
+ for (i = 0; i < width; i++) {
+ dstU[i] = (FFMIN(dstU[i], 30775 << 4) * 4663 - (9289992 << 4)) >> 12; // -264
+ dstV[i] = (FFMIN(dstV[i], 30775 << 4) * 4663 - (9289992 << 4)) >> 12; // -264
+ }
+}
+
+static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
+{
+ int i;
+ int32_t *dstU = (int32_t *) _dstU;
+ int32_t *dstV = (int32_t *) _dstV;
+ for (i = 0; i < width; i++) {
+ dstU[i] = (dstU[i] * 1799 + (4081085 << 4)) >> 11; // 1469
+ dstV[i] = (dstV[i] * 1799 + (4081085 << 4)) >> 11; // 1469
+ }
+}
+
+static void lumRangeToJpeg16_c(int16_t *_dst, int width)
+{
+ int i;
+ int32_t *dst = (int32_t *) _dst;
+ for (i = 0; i < width; i++)
+ dst[i] = (FFMIN(dst[i], 30189 << 4) * 4769 - (39057361 << 2)) >> 12;
+}
+
+static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
+{
+ int i;
+ int32_t *dst = (int32_t *) _dst;
+ for (i = 0; i < width; i++)
+ dst[i] = (dst[i]*(14071/4) + (33561947<<4)/4)>>12;
+}
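The magic numbers in lumRangeToJpeg_c implement full = (limited - 16) * 255 / 219 on values that carry seven extra fractional bits (the 15-bit intermediate format): 19077 is roughly (255.0 / 219) * (1 << 14), and the bias is chosen so that 16 << 7 lands on zero. A quick standalone endpoint check using the constants from the code above:

    #include <stdio.h>

    int main(void)
    {
        int black = 16  << 7;                              /* limited-range black */
        int white = 235 << 7;                              /* limited-range white */
        printf("%d\n", (black * 19077 - 39057361) >> 14);  /* 0                   */
        printf("%d\n", (white * 19077 - 39057361) >> 14);  /* 32640 == 255 << 7   */
        return 0;
    }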
+
+static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
+ const uint8_t *src, int srcW, int xInc)
+{
+ int i;
+ unsigned int xpos = 0;
+ for (i = 0; i < dstWidth; i++) {
+ register unsigned int xx = xpos >> 16;
+ register unsigned int xalpha = (xpos & 0xFFFF) >> 9;
+ dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha;
+ xpos += xInc;
+ }
+ for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
+ dst[i] = src[srcW-1]*128;
+}
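hyscale_fast_c walks the source line with a 16.16 fixed-point position: xInc is roughly (srcW << 16) / dstW (the exact value is computed during context setup), xx = xpos >> 16 is the integer source index, and xalpha = (xpos & 0xFFFF) >> 9 is a 7-bit blend weight, so the result sits in the usual value << 7 intermediate range. A tiny numeric illustration with assumed widths:

    #include <stdio.h>

    int main(void)
    {
        int srcW = 640, dstW = 1280;                     /* hypothetical 2x upscale    */
        unsigned xInc   = ((unsigned)srcW << 16) / dstW; /* 0x8000, i.e. a step of 0.5 */
        unsigned xpos   = 3 * xInc;                      /* position of output pixel 3 */
        unsigned xx     = xpos >> 16;                    /* source index 1             */
        unsigned xalpha = (xpos & 0xFFFF) >> 9;          /* blend weight 64 out of 128 */
        printf("xx=%u xalpha=%u\n", xx, xalpha);
        return 0;
    }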
+
+// *** horizontal scale Y line to temp buffer
+static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
+ const uint8_t *src_in[4],
+ int srcW, int xInc,
+ const int16_t *hLumFilter,
+ const int32_t *hLumFilterPos,
+ int hLumFilterSize,
+ uint8_t *formatConvBuffer,
+ uint32_t *pal, int isAlpha)
+{
+ void (*toYV12)(uint8_t *, const uint8_t *, const uint8_t *, const uint8_t *, int, uint32_t *) =
+ isAlpha ? c->alpToYV12 : c->lumToYV12;
+ void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
+ const uint8_t *src = src_in[isAlpha ? 3 : 0];
+
+ if (toYV12) {
+ toYV12(formatConvBuffer, src, src_in[1], src_in[2], srcW, pal);
+ src = formatConvBuffer;
+ } else if (c->readLumPlanar && !isAlpha) {
+ c->readLumPlanar(formatConvBuffer, src_in, srcW);
+ src = formatConvBuffer;
+ }
+
+ if (!c->hyscale_fast) {
+ c->hyScale(c, dst, dstWidth, src, hLumFilter,
+ hLumFilterPos, hLumFilterSize);
+ } else { // fast bilinear upscale / crap downscale
+ c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
+ }
+
+ if (convertRange)
+ convertRange(dst, dstWidth);
+}
+
+static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
+ int dstWidth, const uint8_t *src1,
+ const uint8_t *src2, int srcW, int xInc)
+{
+ int i;
+ unsigned int xpos = 0;
+ for (i = 0; i < dstWidth; i++) {
+ register unsigned int xx = xpos >> 16;
+ register unsigned int xalpha = (xpos & 0xFFFF) >> 9;
+ dst1[i] = (src1[xx] * (xalpha ^ 127) + src1[xx + 1] * xalpha);
+ dst2[i] = (src2[xx] * (xalpha ^ 127) + src2[xx + 1] * xalpha);
+ xpos += xInc;
+ }
+ for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
+ dst1[i] = src1[srcW-1]*128;
+ dst2[i] = src2[srcW-1]*128;
+ }
+}
+
+static av_always_inline void hcscale(SwsContext *c, int16_t *dst1,
+ int16_t *dst2, int dstWidth,
+ const uint8_t *src_in[4],
+ int srcW, int xInc,
+ const int16_t *hChrFilter,
+ const int32_t *hChrFilterPos,
+ int hChrFilterSize,
+ uint8_t *formatConvBuffer, uint32_t *pal)
+{
+ const uint8_t *src1 = src_in[1], *src2 = src_in[2];
+ if (c->chrToYV12) {
+ uint8_t *buf2 = formatConvBuffer +
+ FFALIGN(srcW*2+78, 16);
+ c->chrToYV12(formatConvBuffer, buf2, src_in[0], src1, src2, srcW, pal);
+ src1= formatConvBuffer;
+ src2= buf2;
+ } else if (c->readChrPlanar) {
+ uint8_t *buf2 = formatConvBuffer +
+ FFALIGN(srcW*2+78, 16);
+ c->readChrPlanar(formatConvBuffer, buf2, src_in, srcW);
+ src1 = formatConvBuffer;
+ src2 = buf2;
+ }
+
+ if (!c->hcscale_fast) {
+ c->hcScale(c, dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
+ c->hcScale(c, dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
+ } else { // fast bilinear upscale / crap downscale
+ c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
+ }
+
+ if (c->chrConvertRange)
+ c->chrConvertRange(dst1, dst2, dstWidth);
+}
+
+#define DEBUG_SWSCALE_BUFFERS 0
+#define DEBUG_BUFFERS(...) \
+ if (DEBUG_SWSCALE_BUFFERS) \
+ av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
+
+static int swScale(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY,
+ int srcSliceH, uint8_t *dst[], int dstStride[])
+{
+    /* load a few things into local vars to make the code more readable
+     * and faster */
+ const int srcW = c->srcW;
+ const int dstW = c->dstW;
+ const int dstH = c->dstH;
+ const int chrDstW = c->chrDstW;
+ const int chrSrcW = c->chrSrcW;
+ const int lumXInc = c->lumXInc;
+ const int chrXInc = c->chrXInc;
+ const enum AVPixelFormat dstFormat = c->dstFormat;
+ const int flags = c->flags;
+ int32_t *vLumFilterPos = c->vLumFilterPos;
+ int32_t *vChrFilterPos = c->vChrFilterPos;
+ int32_t *hLumFilterPos = c->hLumFilterPos;
+ int32_t *hChrFilterPos = c->hChrFilterPos;
+ int16_t *hLumFilter = c->hLumFilter;
+ int16_t *hChrFilter = c->hChrFilter;
+ int32_t *lumMmxFilter = c->lumMmxFilter;
+ int32_t *chrMmxFilter = c->chrMmxFilter;
+ const int vLumFilterSize = c->vLumFilterSize;
+ const int vChrFilterSize = c->vChrFilterSize;
+ const int hLumFilterSize = c->hLumFilterSize;
+ const int hChrFilterSize = c->hChrFilterSize;
+ int16_t **lumPixBuf = c->lumPixBuf;
+ int16_t **chrUPixBuf = c->chrUPixBuf;
+ int16_t **chrVPixBuf = c->chrVPixBuf;
+ int16_t **alpPixBuf = c->alpPixBuf;
+ const int vLumBufSize = c->vLumBufSize;
+ const int vChrBufSize = c->vChrBufSize;
+ uint8_t *formatConvBuffer = c->formatConvBuffer;
+ uint32_t *pal = c->pal_yuv;
+ yuv2planar1_fn yuv2plane1 = c->yuv2plane1;
+ yuv2planarX_fn yuv2planeX = c->yuv2planeX;
+ yuv2interleavedX_fn yuv2nv12cX = c->yuv2nv12cX;
+ yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
+ yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
+ yuv2packedX_fn yuv2packedX = c->yuv2packedX;
+ yuv2anyX_fn yuv2anyX = c->yuv2anyX;
+ const int chrSrcSliceY = srcSliceY >> c->chrSrcVSubSample;
+ const int chrSrcSliceH = -((-srcSliceH) >> c->chrSrcVSubSample);
+ int should_dither = is9_OR_10BPS(c->srcFormat) ||
+ is16BPS(c->srcFormat);
+ int lastDstY;
+
+ /* vars which will change and which we need to store back in the context */
+ int dstY = c->dstY;
+ int lumBufIndex = c->lumBufIndex;
+ int chrBufIndex = c->chrBufIndex;
+ int lastInLumBuf = c->lastInLumBuf;
+ int lastInChrBuf = c->lastInChrBuf;
+
+ if (isPacked(c->srcFormat)) {
+ src[0] =
+ src[1] =
+ src[2] =
+ src[3] = src[0];
+ srcStride[0] =
+ srcStride[1] =
+ srcStride[2] =
+ srcStride[3] = srcStride[0];
+ }
+ srcStride[1] <<= c->vChrDrop;
+ srcStride[2] <<= c->vChrDrop;
+
+ DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
+ src[0], srcStride[0], src[1], srcStride[1],
+ src[2], srcStride[2], src[3], srcStride[3],
+ dst[0], dstStride[0], dst[1], dstStride[1],
+ dst[2], dstStride[2], dst[3], dstStride[3]);
+ DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
+ srcSliceY, srcSliceH, dstY, dstH);
+ DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
+ vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
+
+ if (dstStride[0]%16 !=0 || dstStride[1]%16 !=0 ||
+ dstStride[2]%16 !=0 || dstStride[3]%16 != 0) {
+ static int warnedAlready = 0; // FIXME maybe move this into the context
+ if (flags & SWS_PRINT_INFO && !warnedAlready) {
+ av_log(c, AV_LOG_WARNING,
+ "Warning: dstStride is not aligned!\n"
+ " ->cannot do aligned memory accesses anymore\n");
+ warnedAlready = 1;
+ }
+ }
+
+ if ( (uintptr_t)dst[0]%16 || (uintptr_t)dst[1]%16 || (uintptr_t)dst[2]%16
+ || (uintptr_t)src[0]%16 || (uintptr_t)src[1]%16 || (uintptr_t)src[2]%16
+ || dstStride[0]%16 || dstStride[1]%16 || dstStride[2]%16 || dstStride[3]%16
+ || srcStride[0]%16 || srcStride[1]%16 || srcStride[2]%16 || srcStride[3]%16
+ ) {
+ static int warnedAlready=0;
+ int cpu_flags = av_get_cpu_flags();
+ if (HAVE_MMXEXT && (cpu_flags & AV_CPU_FLAG_SSE2) && !warnedAlready){
+ av_log(c, AV_LOG_WARNING, "Warning: data is not aligned! This can lead to a speedloss\n");
+ warnedAlready=1;
+ }
+ }
+
+ /* Note the user might start scaling the picture in the middle so this
+ * will not get executed. This is not really intended but works
+ * currently, so people might do it. */
+ if (srcSliceY == 0) {
+ lumBufIndex = -1;
+ chrBufIndex = -1;
+ dstY = 0;
+ lastInLumBuf = -1;
+ lastInChrBuf = -1;
+ }
+
+ if (!should_dither) {
+ c->chrDither8 = c->lumDither8 = ff_sws_pb_64;
+ }
+ lastDstY = dstY;
+
+ for (; dstY < dstH; dstY++) {
+ const int chrDstY = dstY >> c->chrDstVSubSample;
+ uint8_t *dest[4] = {
+ dst[0] + dstStride[0] * dstY,
+ dst[1] + dstStride[1] * chrDstY,
+ dst[2] + dstStride[2] * chrDstY,
+ (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
+ };
+ int use_mmx_vfilter= c->use_mmx_vfilter;
+
+ // First line needed as input
+ const int firstLumSrcY = FFMAX(1 - vLumFilterSize, vLumFilterPos[dstY]);
+ const int firstLumSrcY2 = FFMAX(1 - vLumFilterSize, vLumFilterPos[FFMIN(dstY | ((1 << c->chrDstVSubSample) - 1), dstH - 1)]);
+ // First line needed as input
+ const int firstChrSrcY = FFMAX(1 - vChrFilterSize, vChrFilterPos[chrDstY]);
+
+ // Last line needed as input
+ int lastLumSrcY = FFMIN(c->srcH, firstLumSrcY + vLumFilterSize) - 1;
+ int lastLumSrcY2 = FFMIN(c->srcH, firstLumSrcY2 + vLumFilterSize) - 1;
+ int lastChrSrcY = FFMIN(c->chrSrcH, firstChrSrcY + vChrFilterSize) - 1;
+ int enough_lines;
+
+ // handle holes (FAST_BILINEAR & weird filters)
+ if (firstLumSrcY > lastInLumBuf)
+ lastInLumBuf = firstLumSrcY - 1;
+ if (firstChrSrcY > lastInChrBuf)
+ lastInChrBuf = firstChrSrcY - 1;
+ av_assert0(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
+ av_assert0(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
+
+ DEBUG_BUFFERS("dstY: %d\n", dstY);
+ DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
+ firstLumSrcY, lastLumSrcY, lastInLumBuf);
+ DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
+ firstChrSrcY, lastChrSrcY, lastInChrBuf);
+
+ // Do we have enough lines in this slice to output the dstY line
+ enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH &&
+ lastChrSrcY < -((-srcSliceY - srcSliceH) >> c->chrSrcVSubSample);
+
+ if (!enough_lines) {
+ lastLumSrcY = srcSliceY + srcSliceH - 1;
+ lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
+ DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
+ lastLumSrcY, lastChrSrcY);
+ }
+
+ // Do horizontal scaling
+ while (lastInLumBuf < lastLumSrcY) {
+ const uint8_t *src1[4] = {
+ src[0] + (lastInLumBuf + 1 - srcSliceY) * srcStride[0],
+ src[1] + (lastInLumBuf + 1 - srcSliceY) * srcStride[1],
+ src[2] + (lastInLumBuf + 1 - srcSliceY) * srcStride[2],
+ src[3] + (lastInLumBuf + 1 - srcSliceY) * srcStride[3],
+ };
+ lumBufIndex++;
+ av_assert0(lumBufIndex < 2 * vLumBufSize);
+ av_assert0(lastInLumBuf + 1 - srcSliceY < srcSliceH);
+ av_assert0(lastInLumBuf + 1 - srcSliceY >= 0);
+ hyscale(c, lumPixBuf[lumBufIndex], dstW, src1, srcW, lumXInc,
+ hLumFilter, hLumFilterPos, hLumFilterSize,
+ formatConvBuffer, pal, 0);
+ if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
+ hyscale(c, alpPixBuf[lumBufIndex], dstW, src1, srcW,
+ lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
+ formatConvBuffer, pal, 1);
+ lastInLumBuf++;
+ DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
+ lumBufIndex, lastInLumBuf);
+ }
+ while (lastInChrBuf < lastChrSrcY) {
+ const uint8_t *src1[4] = {
+ src[0] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[0],
+ src[1] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[1],
+ src[2] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[2],
+ src[3] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[3],
+ };
+ chrBufIndex++;
+ av_assert0(chrBufIndex < 2 * vChrBufSize);
+ av_assert0(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
+ av_assert0(lastInChrBuf + 1 - chrSrcSliceY >= 0);
+ // FIXME replace parameters through context struct (some at least)
+
+ if (c->needs_hcscale)
+ hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
+ chrDstW, src1, chrSrcW, chrXInc,
+ hChrFilter, hChrFilterPos, hChrFilterSize,
+ formatConvBuffer, pal);
+ lastInChrBuf++;
+ DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
+ chrBufIndex, lastInChrBuf);
+ }
+ // wrap buf index around to stay inside the ring buffer
+ if (lumBufIndex >= vLumBufSize)
+ lumBufIndex -= vLumBufSize;
+ if (chrBufIndex >= vChrBufSize)
+ chrBufIndex -= vChrBufSize;
+ if (!enough_lines)
+ break; // we can't output a dstY line so let's try with the next slice
+
+#if HAVE_MMX_INLINE
+ updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex,
+ lastInLumBuf, lastInChrBuf);
+#endif
+ if (should_dither) {
+ c->chrDither8 = dither_8x8_128[chrDstY & 7];
+ c->lumDither8 = dither_8x8_128[dstY & 7];
+ }
+ if (dstY >= dstH - 2) {
+ /* hmm looks like we can't use MMX here without overwriting
+ * this array's tail */
+ ff_sws_init_output_funcs(c, &yuv2plane1, &yuv2planeX, &yuv2nv12cX,
+ &yuv2packed1, &yuv2packed2, &yuv2packedX, &yuv2anyX);
+ use_mmx_vfilter= 0;
+ }
+
+ {
+ const int16_t **lumSrcPtr = (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
+ const int16_t **chrUSrcPtr = (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+ const int16_t **chrVSrcPtr = (const int16_t **)(void*) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+ const int16_t **alpSrcPtr = (CONFIG_SWSCALE_ALPHA && alpPixBuf) ?
+ (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
+ int16_t *vLumFilter = c->vLumFilter;
+ int16_t *vChrFilter = c->vChrFilter;
+
+ if (isPlanarYUV(dstFormat) ||
+ (isGray(dstFormat) && !isALPHA(dstFormat))) { // YV12 like
+ const int chrSkipMask = (1 << c->chrDstVSubSample) - 1;
+
+ vLumFilter += dstY * vLumFilterSize;
+ vChrFilter += chrDstY * vChrFilterSize;
+
+// av_assert0(use_mmx_vfilter != (
+// yuv2planeX == yuv2planeX_10BE_c
+// || yuv2planeX == yuv2planeX_10LE_c
+// || yuv2planeX == yuv2planeX_9BE_c
+// || yuv2planeX == yuv2planeX_9LE_c
+// || yuv2planeX == yuv2planeX_16BE_c
+// || yuv2planeX == yuv2planeX_16LE_c
+// || yuv2planeX == yuv2planeX_8_c) || !ARCH_X86);
+
+ if(use_mmx_vfilter){
+ vLumFilter= (int16_t *)c->lumMmxFilter;
+ vChrFilter= (int16_t *)c->chrMmxFilter;
+ }
+
+ if (vLumFilterSize == 1) {
+ yuv2plane1(lumSrcPtr[0], dest[0], dstW, c->lumDither8, 0);
+ } else {
+ yuv2planeX(vLumFilter, vLumFilterSize,
+ lumSrcPtr, dest[0],
+ dstW, c->lumDither8, 0);
+ }
+
+ if (!((dstY & chrSkipMask) || isGray(dstFormat))) {
+ if (yuv2nv12cX) {
+ yuv2nv12cX(c, vChrFilter,
+ vChrFilterSize, chrUSrcPtr, chrVSrcPtr,
+ dest[1], chrDstW);
+ } else if (vChrFilterSize == 1) {
+ yuv2plane1(chrUSrcPtr[0], dest[1], chrDstW, c->chrDither8, 0);
+ yuv2plane1(chrVSrcPtr[0], dest[2], chrDstW, c->chrDither8, 3);
+ } else {
+ yuv2planeX(vChrFilter,
+ vChrFilterSize, chrUSrcPtr, dest[1],
+ chrDstW, c->chrDither8, 0);
+ yuv2planeX(vChrFilter,
+ vChrFilterSize, chrVSrcPtr, dest[2],
+ chrDstW, c->chrDither8, use_mmx_vfilter ? (c->uv_offx2 >> 1) : 3);
+ }
+ }
+
+ if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
+ if(use_mmx_vfilter){
+ vLumFilter= (int16_t *)c->alpMmxFilter;
+ }
+ if (vLumFilterSize == 1) {
+ yuv2plane1(alpSrcPtr[0], dest[3], dstW,
+ c->lumDither8, 0);
+ } else {
+ yuv2planeX(vLumFilter,
+ vLumFilterSize, alpSrcPtr, dest[3],
+ dstW, c->lumDither8, 0);
+ }
+ }
+ } else if (yuv2packedX) {
+ av_assert1(lumSrcPtr + vLumFilterSize - 1 < (const int16_t **)lumPixBuf + vLumBufSize * 2);
+ av_assert1(chrUSrcPtr + vChrFilterSize - 1 < (const int16_t **)chrUPixBuf + vChrBufSize * 2);
+ if (c->yuv2packed1 && vLumFilterSize == 1 &&
+ vChrFilterSize <= 2) { // unscaled RGB
+ int chrAlpha = vChrFilterSize == 1 ? 0 : vChrFilter[2 * dstY + 1];
+ yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
+ alpPixBuf ? *alpSrcPtr : NULL,
+ dest[0], dstW, chrAlpha, dstY);
+ } else if (c->yuv2packed2 && vLumFilterSize == 2 &&
+ vChrFilterSize == 2) { // bilinear upscale RGB
+ int lumAlpha = vLumFilter[2 * dstY + 1];
+ int chrAlpha = vChrFilter[2 * dstY + 1];
+ lumMmxFilter[2] =
+ lumMmxFilter[3] = vLumFilter[2 * dstY] * 0x10001;
+ chrMmxFilter[2] =
+ chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
+ yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
+ alpPixBuf ? alpSrcPtr : NULL,
+ dest[0], dstW, lumAlpha, chrAlpha, dstY);
+ } else { // general RGB
+ yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
+ lumSrcPtr, vLumFilterSize,
+ vChrFilter + dstY * vChrFilterSize,
+ chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
+ alpSrcPtr, dest[0], dstW, dstY);
+ }
+ } else {
+ av_assert1(!yuv2packed1 && !yuv2packed2);
+ yuv2anyX(c, vLumFilter + dstY * vLumFilterSize,
+ lumSrcPtr, vLumFilterSize,
+ vChrFilter + dstY * vChrFilterSize,
+ chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
+ alpSrcPtr, dest, dstW, dstY);
+ }
+ }
+ }
+ if (isPlanar(dstFormat) && isALPHA(dstFormat) && !alpPixBuf) {
+ int length = dstW;
+ int height = dstY - lastDstY;
+
+ if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat);
+ fillPlane16(dst[3], dstStride[3], length, height, lastDstY,
+ 1, desc->comp[3].depth_minus1,
+ isBE(dstFormat));
+ } else
+ fillPlane(dst[3], dstStride[3], length, height, lastDstY, 255);
+ }
+
+#if HAVE_MMXEXT_INLINE
+ if (av_get_cpu_flags() & AV_CPU_FLAG_MMXEXT)
+ __asm__ volatile ("sfence" ::: "memory");
+#endif
+ emms_c();
+
+ /* store changed local vars back in the context */
+ c->dstY = dstY;
+ c->lumBufIndex = lumBufIndex;
+ c->chrBufIndex = chrBufIndex;
+ c->lastInLumBuf = lastInLumBuf;
+ c->lastInChrBuf = lastInChrBuf;
+
+ return dstY - lastDstY;
+}
+
+static av_cold void sws_init_swScale_c(SwsContext *c)
+{
+ enum AVPixelFormat srcFormat = c->srcFormat;
+
+ ff_sws_init_output_funcs(c, &c->yuv2plane1, &c->yuv2planeX,
+ &c->yuv2nv12cX, &c->yuv2packed1,
+ &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
+
+ ff_sws_init_input_funcs(c);
+
+ if (c->srcBpc == 8) {
+ if (c->dstBpc <= 14) {
+ c->hyScale = c->hcScale = hScale8To15_c;
+ if (c->flags & SWS_FAST_BILINEAR) {
+ c->hyscale_fast = hyscale_fast_c;
+ c->hcscale_fast = hcscale_fast_c;
+ }
+ } else {
+ c->hyScale = c->hcScale = hScale8To19_c;
+ }
+ } else {
+ c->hyScale = c->hcScale = c->dstBpc > 14 ? hScale16To19_c
+ : hScale16To15_c;
+ }
+
+ if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
+ if (c->dstBpc <= 14) {
+ if (c->srcRange) {
+ c->lumConvertRange = lumRangeFromJpeg_c;
+ c->chrConvertRange = chrRangeFromJpeg_c;
+ } else {
+ c->lumConvertRange = lumRangeToJpeg_c;
+ c->chrConvertRange = chrRangeToJpeg_c;
+ }
+ } else {
+ if (c->srcRange) {
+ c->lumConvertRange = lumRangeFromJpeg16_c;
+ c->chrConvertRange = chrRangeFromJpeg16_c;
+ } else {
+ c->lumConvertRange = lumRangeToJpeg16_c;
+ c->chrConvertRange = chrRangeToJpeg16_c;
+ }
+ }
+ }
+
+ if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
+ srcFormat == AV_PIX_FMT_MONOBLACK || srcFormat == AV_PIX_FMT_MONOWHITE))
+ c->needs_hcscale = 1;
+}
+
+SwsFunc ff_getSwsFunc(SwsContext *c)
+{
+ sws_init_swScale_c(c);
+
+ if (HAVE_MMX)
+ ff_sws_init_swScale_mmx(c);
+ if (HAVE_ALTIVEC)
+ ff_sws_init_swScale_altivec(c);
+
+ return swScale;
+}
+
+static void reset_ptr(const uint8_t *src[], int format)
+{
+ if (!isALPHA(format))
+ src[3] = NULL;
+ if (!isPlanar(format)) {
+ src[3] = src[2] = NULL;
+
+ if (!usePal(format))
+ src[1] = NULL;
+ }
+}
+
+static int check_image_pointers(const uint8_t * const data[4], enum AVPixelFormat pix_fmt,
+ const int linesizes[4])
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ int plane = desc->comp[i].plane;
+ if (!data[plane] || !linesizes[plane])
+ return 0;
+ }
+
+ return 1;
+}
+
+/**
+ * swscale wrapper, so we don't need to export the SwsContext.
+ * Assumes planar YUV to be in YUV order instead of YVU.
+ */
+int attribute_align_arg sws_scale(struct SwsContext *c,
+ const uint8_t * const srcSlice[],
+ const int srcStride[], int srcSliceY,
+ int srcSliceH, uint8_t *const dst[],
+ const int dstStride[])
+{
+ int i, ret;
+ const uint8_t *src2[4];
+ uint8_t *dst2[4];
+ uint8_t *rgb0_tmp = NULL;
+
+    if (!srcSlice || !srcStride || !dst || !dstStride) {
+ av_log(c, AV_LOG_ERROR, "One of the input parameters to sws_scale() is NULL, please check the calling code\n");
+ return 0;
+ }
+ memcpy(src2, srcSlice, sizeof(src2));
+ memcpy(dst2, dst, sizeof(dst2));
+
+ // do not mess up sliceDir if we have a "trailing" 0-size slice
+ if (srcSliceH == 0)
+ return 0;
+
+ if (!check_image_pointers(srcSlice, c->srcFormat, srcStride)) {
+ av_log(c, AV_LOG_ERROR, "bad src image pointers\n");
+ return 0;
+ }
+ if (!check_image_pointers((const uint8_t* const*)dst, c->dstFormat, dstStride)) {
+ av_log(c, AV_LOG_ERROR, "bad dst image pointers\n");
+ return 0;
+ }
+
+ if (c->sliceDir == 0 && srcSliceY != 0 && srcSliceY + srcSliceH != c->srcH) {
+ av_log(c, AV_LOG_ERROR, "Slices start in the middle!\n");
+ return 0;
+ }
+ if (c->sliceDir == 0) {
+ if (srcSliceY == 0) c->sliceDir = 1; else c->sliceDir = -1;
+ }
+
+ if (usePal(c->srcFormat)) {
+ for (i = 0; i < 256; i++) {
+ int p, r, g, b, y, u, v, a = 0xff;
+ if (c->srcFormat == AV_PIX_FMT_PAL8) {
+ p = ((const uint32_t *)(srcSlice[1]))[i];
+ a = (p >> 24) & 0xFF;
+ r = (p >> 16) & 0xFF;
+ g = (p >> 8) & 0xFF;
+ b = p & 0xFF;
+ } else if (c->srcFormat == AV_PIX_FMT_RGB8) {
+ r = ( i >> 5 ) * 36;
+ g = ((i >> 2) & 7) * 36;
+ b = ( i & 3) * 85;
+ } else if (c->srcFormat == AV_PIX_FMT_BGR8) {
+ b = ( i >> 6 ) * 85;
+ g = ((i >> 3) & 7) * 36;
+ r = ( i & 7) * 36;
+ } else if (c->srcFormat == AV_PIX_FMT_RGB4_BYTE) {
+ r = ( i >> 3 ) * 255;
+ g = ((i >> 1) & 3) * 85;
+ b = ( i & 1) * 255;
+ } else if (c->srcFormat == AV_PIX_FMT_GRAY8 || c->srcFormat == AV_PIX_FMT_GRAY8A) {
+ r = g = b = i;
+ } else {
+ av_assert1(c->srcFormat == AV_PIX_FMT_BGR4_BYTE);
+ b = ( i >> 3 ) * 255;
+ g = ((i >> 1) & 3) * 85;
+ r = ( i & 1) * 255;
+ }
+#define RGB2YUV_SHIFT 15
+#define BY ( (int) (0.114 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define BV (-(int) (0.081 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define BU ( (int) (0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define GY ( (int) (0.587 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define GV (-(int) (0.419 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define GU (-(int) (0.331 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define RY ( (int) (0.299 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define RV ( (int) (0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define RU (-(int) (0.169 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+
+ y = av_clip_uint8((RY * r + GY * g + BY * b + ( 33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
+ u = av_clip_uint8((RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
+ v = av_clip_uint8((RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
+ c->pal_yuv[i]= y + (u<<8) + (v<<16) + ((unsigned)a<<24);
+
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_BGR32:
+#if !HAVE_BIGENDIAN
+ case AV_PIX_FMT_RGB24:
+#endif
+ c->pal_rgb[i]= r + (g<<8) + (b<<16) + ((unsigned)a<<24);
+ break;
+ case AV_PIX_FMT_BGR32_1:
+#if HAVE_BIGENDIAN
+ case AV_PIX_FMT_BGR24:
+#endif
+ c->pal_rgb[i]= a + (r<<8) + (g<<16) + ((unsigned)b<<24);
+ break;
+ case AV_PIX_FMT_RGB32_1:
+#if HAVE_BIGENDIAN
+ case AV_PIX_FMT_RGB24:
+#endif
+ c->pal_rgb[i]= a + (b<<8) + (g<<16) + ((unsigned)r<<24);
+ break;
+ case AV_PIX_FMT_RGB32:
+#if !HAVE_BIGENDIAN
+ case AV_PIX_FMT_BGR24:
+#endif
+ default:
+ c->pal_rgb[i]= b + (g<<8) + (r<<16) + ((unsigned)a<<24);
+ }
+ }
+ }
+
+ if (c->src0Alpha && !c->dst0Alpha && isALPHA(c->dstFormat)) {
+ uint8_t *base;
+ int x,y;
+ rgb0_tmp = av_malloc(FFABS(srcStride[0]) * srcSliceH + 32);
+ base = srcStride[0] < 0 ? rgb0_tmp - srcStride[0] * (srcSliceH-1) : rgb0_tmp;
+ for (y=0; y<srcSliceH; y++){
+ memcpy(base + srcStride[0]*y, src2[0] + srcStride[0]*y, 4*c->srcW);
+ for (x=c->src0Alpha-1; x<4*c->srcW; x+=4) {
+ base[ srcStride[0]*y + x] = 0xFF;
+ }
+ }
+ src2[0] = base;
+ }
+
+ if (!srcSliceY && (c->flags & SWS_BITEXACT) && (c->flags & SWS_ERROR_DIFFUSION) && c->dither_error[0])
+ for (i = 0; i < 4; i++)
+ memset(c->dither_error[i], 0, sizeof(c->dither_error[0][0]) * (c->dstW+2));
+
+
+ // copy strides, so they can safely be modified
+ if (c->sliceDir == 1) {
+ // slices go from top to bottom
+ int srcStride2[4] = { srcStride[0], srcStride[1], srcStride[2],
+ srcStride[3] };
+ int dstStride2[4] = { dstStride[0], dstStride[1], dstStride[2],
+ dstStride[3] };
+
+ reset_ptr(src2, c->srcFormat);
+ reset_ptr((void*)dst2, c->dstFormat);
+
+ /* reset slice direction at end of frame */
+ if (srcSliceY + srcSliceH == c->srcH)
+ c->sliceDir = 0;
+
+ ret = c->swScale(c, src2, srcStride2, srcSliceY, srcSliceH, dst2,
+ dstStride2);
+ } else {
+ // slices go from bottom to top => we flip the image internally
+ int srcStride2[4] = { -srcStride[0], -srcStride[1], -srcStride[2],
+ -srcStride[3] };
+ int dstStride2[4] = { -dstStride[0], -dstStride[1], -dstStride[2],
+ -dstStride[3] };
+
+ src2[0] += (srcSliceH - 1) * srcStride[0];
+ if (!usePal(c->srcFormat))
+ src2[1] += ((srcSliceH >> c->chrSrcVSubSample) - 1) * srcStride[1];
+ src2[2] += ((srcSliceH >> c->chrSrcVSubSample) - 1) * srcStride[2];
+ src2[3] += (srcSliceH - 1) * srcStride[3];
+ dst2[0] += ( c->dstH - 1) * dstStride[0];
+ dst2[1] += ((c->dstH >> c->chrDstVSubSample) - 1) * dstStride[1];
+ dst2[2] += ((c->dstH >> c->chrDstVSubSample) - 1) * dstStride[2];
+ dst2[3] += ( c->dstH - 1) * dstStride[3];
+
+ reset_ptr(src2, c->srcFormat);
+ reset_ptr((void*)dst2, c->dstFormat);
+
+ /* reset slice direction at end of frame */
+ if (!srcSliceY)
+ c->sliceDir = 0;
+
+ ret = c->swScale(c, src2, srcStride2, c->srcH-srcSliceY-srcSliceH,
+ srcSliceH, dst2, dstStride2);
+ }
+
+ av_free(rgb0_tmp);
+ return ret;
+}
+
diff --git a/ffmpeg/libswscale/swscale.h b/ffmpeg/libswscale/swscale.h
new file mode 100644
index 0000000..5f6ae0f
--- /dev/null
+++ b/ffmpeg/libswscale/swscale.h
@@ -0,0 +1,355 @@
+/*
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_SWSCALE_H
+#define SWSCALE_SWSCALE_H
+
+/**
+ * @file
+ * @ingroup lsws
+ * external API header
+ */
+
+/**
+ * @defgroup lsws Libswscale
+ * @{
+ */
+
+#include <stdint.h>
+
+#include "libavutil/avutil.h"
+#include "libavutil/log.h"
+#include "libavutil/pixfmt.h"
+#include "version.h"
+
+/**
+ * Return the LIBSWSCALE_VERSION_INT constant.
+ */
+unsigned swscale_version(void);
+
+/**
+ * Return the libswscale build-time configuration.
+ */
+const char *swscale_configuration(void);
+
+/**
+ * Return the libswscale license.
+ */
+const char *swscale_license(void);
+
+/* values for the flags, the stuff on the command line is different */
+#define SWS_FAST_BILINEAR 1
+#define SWS_BILINEAR 2
+#define SWS_BICUBIC 4
+#define SWS_X 8
+#define SWS_POINT 0x10
+#define SWS_AREA 0x20
+#define SWS_BICUBLIN 0x40
+#define SWS_GAUSS 0x80
+#define SWS_SINC 0x100
+#define SWS_LANCZOS 0x200
+#define SWS_SPLINE 0x400
+
+#define SWS_SRC_V_CHR_DROP_MASK 0x30000
+#define SWS_SRC_V_CHR_DROP_SHIFT 16
+
+#define SWS_PARAM_DEFAULT 123456
+
+#define SWS_PRINT_INFO 0x1000
+
+//the following 3 flags are not completely implemented
+//internal chrominance subsampling info
+#define SWS_FULL_CHR_H_INT 0x2000
+//input subsampling info
+#define SWS_FULL_CHR_H_INP 0x4000
+#define SWS_DIRECT_BGR 0x8000
+#define SWS_ACCURATE_RND 0x40000
+#define SWS_BITEXACT 0x80000
+#define SWS_ERROR_DIFFUSION 0x800000
+
+#if FF_API_SWS_CPU_CAPS
+/**
+ * CPU caps are autodetected now, those flags
+ * are only provided for API compatibility.
+ */
+#define SWS_CPU_CAPS_MMX 0x80000000
+#define SWS_CPU_CAPS_MMXEXT 0x20000000
+#define SWS_CPU_CAPS_MMX2 0x20000000
+#define SWS_CPU_CAPS_3DNOW 0x40000000
+#define SWS_CPU_CAPS_ALTIVEC 0x10000000
+#define SWS_CPU_CAPS_BFIN 0x01000000
+#define SWS_CPU_CAPS_SSE2 0x02000000
+#endif
+
+#define SWS_MAX_REDUCE_CUTOFF 0.002
+
+#define SWS_CS_ITU709 1
+#define SWS_CS_FCC 4
+#define SWS_CS_ITU601 5
+#define SWS_CS_ITU624 5
+#define SWS_CS_SMPTE170M 5
+#define SWS_CS_SMPTE240M 7
+#define SWS_CS_DEFAULT 5
+
+/**
+ * Return a pointer to yuv<->rgb coefficients for the given colorspace
+ * suitable for sws_setColorspaceDetails().
+ *
+ * @param colorspace One of the SWS_CS_* macros. If invalid,
+ * SWS_CS_DEFAULT is used.
+ */
+const int *sws_getCoefficients(int colorspace);
+
+// when used for filters they must have an odd number of elements
+// coeffs cannot be shared between vectors
+typedef struct SwsVector {
+ double *coeff; ///< pointer to the list of coefficients
+ int length; ///< number of coefficients in the vector
+} SwsVector;
+
+// vectors can be shared
+typedef struct SwsFilter {
+ SwsVector *lumH;
+ SwsVector *lumV;
+ SwsVector *chrH;
+ SwsVector *chrV;
+} SwsFilter;
+
+struct SwsContext;
+
+/**
+ * Return a positive value if pix_fmt is a supported input format, 0
+ * otherwise.
+ */
+int sws_isSupportedInput(enum AVPixelFormat pix_fmt);
+
+/**
+ * Return a positive value if pix_fmt is a supported output format, 0
+ * otherwise.
+ */
+int sws_isSupportedOutput(enum AVPixelFormat pix_fmt);
+
+/**
+ * Allocate an empty SwsContext. This must be filled and passed to
+ * sws_init_context(). For filling see AVOptions, options.c and
+ * sws_setColorspaceDetails().
+ */
+struct SwsContext *sws_alloc_context(void);
+
+/**
+ * Initialize the swscaler context sws_context.
+ *
+ * @return zero or positive value on success, a negative value on
+ * error
+ */
+int sws_init_context(struct SwsContext *sws_context, SwsFilter *srcFilter, SwsFilter *dstFilter);
+
+/**
+ * Free the swscaler context swsContext.
+ * If swsContext is NULL, this function does nothing.
+ */
+void sws_freeContext(struct SwsContext *swsContext);
+
+#if FF_API_SWS_GETCONTEXT
+/**
+ * Allocate and return an SwsContext. You need it to perform
+ * scaling/conversion operations using sws_scale().
+ *
+ * @param srcW the width of the source image
+ * @param srcH the height of the source image
+ * @param srcFormat the source image format
+ * @param dstW the width of the destination image
+ * @param dstH the height of the destination image
+ * @param dstFormat the destination image format
+ * @param flags specify which algorithm and options to use for rescaling
+ * @return a pointer to an allocated context, or NULL in case of error
+ * @note this function is to be removed after a saner alternative is
+ * written
+ * @deprecated Use sws_getCachedContext() instead.
+ */
+struct SwsContext *sws_getContext(int srcW, int srcH, enum AVPixelFormat srcFormat,
+ int dstW, int dstH, enum AVPixelFormat dstFormat,
+ int flags, SwsFilter *srcFilter,
+ SwsFilter *dstFilter, const double *param);
+#endif
+
+/**
+ * Scale the image slice in srcSlice and put the resulting scaled
+ * slice in the image in dst. A slice is a sequence of consecutive
+ * rows in an image.
+ *
+ * Slices have to be provided in sequential order, either in
+ * top-bottom or bottom-top order. If slices are provided in
+ * non-sequential order the behavior of the function is undefined.
+ *
+ * @param c the scaling context previously created with
+ * sws_getContext()
+ * @param srcSlice the array containing the pointers to the planes of
+ * the source slice
+ * @param srcStride the array containing the strides for each plane of
+ * the source image
+ * @param srcSliceY the position in the source image of the slice to
+ * process, that is the number (counted starting from
+ * zero) in the image of the first row of the slice
+ * @param srcSliceH the height of the source slice, that is the number
+ * of rows in the slice
+ * @param dst the array containing the pointers to the planes of
+ * the destination image
+ * @param dstStride the array containing the strides for each plane of
+ * the destination image
+ * @return the height of the output slice
+ */
+int sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
+ const int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *const dst[], const int dstStride[]);
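A minimal usage sketch of the two calls documented above: convert one whole frame from YUV420P to RGB24 by submitting a single slice that starts at row 0 and covers the full height (buffer allocation and error handling are left to the caller; names here are illustrative):

    #include <libswscale/swscale.h>

    static int convert_frame(const uint8_t *const src[4], const int srcStride[4],
                             uint8_t *const dst[4], const int dstStride[4],
                             int w, int h)
    {
        struct SwsContext *sws = sws_getContext(w, h, AV_PIX_FMT_YUV420P,
                                                w, h, AV_PIX_FMT_RGB24,
                                                SWS_BILINEAR, NULL, NULL, NULL);
        int out_h;

        if (!sws)
            return -1;
        /* one slice starting at row 0 and spanning the full height */
        out_h = sws_scale(sws, src, srcStride, 0, h, dst, dstStride);
        sws_freeContext(sws);
        return out_h;               /* height of the written output slice */
    }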
+
+/**
+ * @param dstRange flag indicating the white-black range of the output (1=jpeg / 0=mpeg)
+ * @param srcRange flag indicating the white-black range of the input (1=jpeg / 0=mpeg)
+ * @param table the yuv2rgb coefficients describing the output yuv space, normally ff_yuv2rgb_coeffs[x]
+ * @param inv_table the yuv2rgb coefficients describing the input yuv space, normally ff_yuv2rgb_coeffs[x]
+ * @param brightness 16.16 fixed point brightness correction
+ * @param contrast 16.16 fixed point contrast correction
+ * @param saturation 16.16 fixed point saturation correction
+ * @return -1 if not supported
+ */
+int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
+ int srcRange, const int table[4], int dstRange,
+ int brightness, int contrast, int saturation);
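For example, a hedged sketch that switches an existing context to the BT.709 coefficient set with full-range input and limited-range output, keeping brightness/contrast/saturation at their neutral 16.16 values:

    #include <libswscale/swscale.h>

    static int use_bt709(struct SwsContext *sws)
    {
        const int *coefs = sws_getCoefficients(SWS_CS_ITU709);
        /* inv_table/srcRange describe the input, table/dstRange the output */
        return sws_setColorspaceDetails(sws, coefs, 1,       /* input: full range     */
                                        coefs, 0,            /* output: limited range */
                                        0, 1 << 16, 1 << 16);
    }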
+
+/**
+ * @return -1 if not supported
+ */
+int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table,
+ int *srcRange, int **table, int *dstRange,
+ int *brightness, int *contrast, int *saturation);
+
+/**
+ * Allocate and return an uninitialized vector with length coefficients.
+ */
+SwsVector *sws_allocVec(int length);
+
+/**
+ * Return a normalized Gaussian curve with the given variance, to be used as a
+ * blurring filter; quality = 3 is high quality, lower is lower quality.
+ */
+SwsVector *sws_getGaussianVec(double variance, double quality);
+
+/**
+ * Allocate and return a vector with length coefficients, all
+ * with the same value c.
+ */
+SwsVector *sws_getConstVec(double c, int length);
+
+/**
+ * Allocate and return a vector with just one coefficient, with
+ * value 1.0.
+ */
+SwsVector *sws_getIdentityVec(void);
+
+/**
+ * Scale all the coefficients of a by the scalar value.
+ */
+void sws_scaleVec(SwsVector *a, double scalar);
+
+/**
+ * Scale all the coefficients of a so that their sum equals height.
+ */
+void sws_normalizeVec(SwsVector *a, double height);
+void sws_convVec(SwsVector *a, SwsVector *b);
+void sws_addVec(SwsVector *a, SwsVector *b);
+void sws_subVec(SwsVector *a, SwsVector *b);
+void sws_shiftVec(SwsVector *a, int shift);
+
+/**
+ * Allocate and return a clone of the vector a, that is a vector
+ * with the same coefficients as a.
+ */
+SwsVector *sws_cloneVec(SwsVector *a);
+
+/**
+ * Print with av_log() a textual representation of the vector a
+ * if log_level <= av_log_level.
+ */
+void sws_printVec2(SwsVector *a, AVClass *log_ctx, int log_level);
+
+void sws_freeVec(SwsVector *a);
+
+SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
+ float lumaSharpen, float chromaSharpen,
+ float chromaHShift, float chromaVShift,
+ int verbose);
+void sws_freeFilter(SwsFilter *filter);
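The vector and filter helpers above let a caller ask the scaler to blur or sharpen while it converts. A small sketch (the blur amounts are arbitrary examples; the filter is only consumed while the context is being set up, so it can be freed right after):

    #include <libswscale/swscale.h>

    static struct SwsContext *blurring_context(int srcW, int srcH, int dstW, int dstH)
    {
        SwsFilter *blur = sws_getDefaultFilter(0.5f, 0.5f,   /* luma/chroma Gaussian blur */
                                               0.0f, 0.0f,   /* no sharpening             */
                                               0.0f, 0.0f,   /* no chroma shift           */
                                               0);
        struct SwsContext *sws = sws_getContext(srcW, srcH, AV_PIX_FMT_YUV420P,
                                                dstW, dstH, AV_PIX_FMT_YUV420P,
                                                SWS_BICUBIC, blur, NULL, NULL);
        sws_freeFilter(blur);
        return sws;
    }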
+
+/**
+ * Check if context can be reused, otherwise reallocate a new one.
+ *
+ * If context is NULL, just calls sws_getContext() to get a new
+ * context. Otherwise, checks if the parameters are the ones already
+ * saved in context. If that is the case, returns the current
+ * context. Otherwise, frees context and gets a new context with
+ * the new parameters.
+ *
+ * Be warned that srcFilter and dstFilter are not checked, they
+ * are assumed to remain the same.
+ */
+struct SwsContext *sws_getCachedContext(struct SwsContext *context,
+ int srcW, int srcH, enum AVPixelFormat srcFormat,
+ int dstW, int dstH, enum AVPixelFormat dstFormat,
+ int flags, SwsFilter *srcFilter,
+ SwsFilter *dstFilter, const double *param);
+
+/**
+ * Convert an 8-bit paletted frame into a frame with a color depth of 32 bits.
+ *
+ * The output frame will have the same packed format as the palette.
+ *
+ * @param src source frame buffer
+ * @param dst destination frame buffer
+ * @param num_pixels number of pixels to convert
+ * @param palette array with 256 entries, which must match the color arrangement (RGB or BGR) of src
+ */
+void sws_convertPalette8ToPacked32(const uint8_t *src, uint8_t *dst, int num_pixels, const uint8_t *palette);
+
+/**
+ * Convert an 8-bit paletted frame into a frame with a color depth of 24 bits.
+ *
+ * With the palette format "ABCD", the destination frame ends up with the format "ABC".
+ *
+ * @param src source frame buffer
+ * @param dst destination frame buffer
+ * @param num_pixels number of pixels to convert
+ * @param palette array with 256 entries, which must match the color arrangement (RGB or BGR) of src
+ */
+void sws_convertPalette8ToPacked24(const uint8_t *src, uint8_t *dst, int num_pixels, const uint8_t *palette);
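A usage sketch for the palette helpers above: expand a whole AV_PIX_FMT_PAL8 image line by line (for PAL8 the palette is the 256-entry, 4-bytes-per-entry second data plane of the frame; names are illustrative):

    #include <stdint.h>
    #include <libswscale/swscale.h>

    static void pal8_to_packed24(const uint8_t *src, int src_linesize,
                                 const uint8_t *palette,
                                 uint8_t *dst, int dst_linesize,
                                 int w, int h)
    {
        for (int y = 0; y < h; y++)
            sws_convertPalette8ToPacked24(src + y * src_linesize,
                                          dst + y * dst_linesize,
                                          w, palette);
    }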
+
+/**
+ * Get the AVClass for swsContext. It can be used in combination with
+ * AV_OPT_SEARCH_FAKE_OBJ for examining options.
+ *
+ * @see av_opt_find().
+ */
+const AVClass *sws_get_class(void);
+
+/**
+ * @}
+ */
+
+#endif /* SWSCALE_SWSCALE_H */
diff --git a/ffmpeg/libswscale/swscale_internal.h b/ffmpeg/libswscale/swscale_internal.h
new file mode 100644
index 0000000..83d3a00
--- /dev/null
+++ b/ffmpeg/libswscale/swscale_internal.h
@@ -0,0 +1,840 @@
+/*
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_SWSCALE_INTERNAL_H
+#define SWSCALE_SWSCALE_INTERNAL_H
+
+#include "config.h"
+
+#if HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+
+#include "libavutil/avassert.h"
+#include "libavutil/avutil.h"
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/log.h"
+#include "libavutil/pixfmt.h"
+#include "libavutil/pixdesc.h"
+
+#define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long
+
+#define YUVRGB_TABLE_HEADROOM 128
+
+#define FAST_BGR2YV12 // use 7-bit instead of 15-bit coefficients
+
+#define MAX_FILTER_SIZE 256
+
+#define DITHER1XBPP
+
+#if HAVE_BIGENDIAN
+#define ALT32_CORR (-1)
+#else
+#define ALT32_CORR 1
+#endif
+
+#if ARCH_X86_64
+# define APCK_PTR2 8
+# define APCK_COEF 16
+# define APCK_SIZE 24
+#else
+# define APCK_PTR2 4
+# define APCK_COEF 8
+# define APCK_SIZE 16
+#endif
+
+struct SwsContext;
+
+typedef int (*SwsFunc)(struct SwsContext *context, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[]);
+
+/**
+ * Write one line of horizontally scaled data to planar output
+ * without any additional vertical scaling (or point-scaling).
+ *
+ * @param src scaled source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param dest pointer to the output plane. For >8bit
+ * output, this is in uint16_t
+ * @param dstW width of destination in pixels
+ * @param dither ordered dither array of type uint8_t and size 8
+ * @param offset Dither offset
+ */
+typedef void (*yuv2planar1_fn)(const int16_t *src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset);
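+
+/*
+ * Sketch of a plain-C 8-bit implementation of such a function (illustrative
+ * only; the real versions live in output.c and the per-arch files):
+ *
+ *     static void yuv2plane1_8_sketch(const int16_t *src, uint8_t *dest,
+ *                                     int dstW, const uint8_t *dither, int offset)
+ *     {
+ *         int i;
+ *         for (i = 0; i < dstW; i++) {
+ *             int val = (src[i] + dither[(i + offset) & 7]) >> 7; // 15-bit -> 8-bit
+ *             dest[i] = av_clip_uint8(val);
+ *         }
+ *     }
+ */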
+
+/**
+ * Write one line of horizontally scaled data to planar output
+ * with multi-point vertical scaling between input pixels.
+ *
+ * @param filter vertical luma/alpha scaling coefficients, 12bit [0,4096]
+ * @param src scaled luma (Y) or alpha (A) source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param filterSize number of vertical input lines to scale
+ * @param dest pointer to output plane. For >8bit
+ * output, this is in uint16_t
+ * @param dstW width of destination in pixels
+ * @param offset Dither offset
+ */
+typedef void (*yuv2planarX_fn)(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset);
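+
+/*
+ * Sketch of a plain-C 8-bit multi-tap version (illustrative only; see
+ * output.c for the real implementations):
+ *
+ *     static void yuv2planeX_8_sketch(const int16_t *filter, int filterSize,
+ *                                     const int16_t **src, uint8_t *dest,
+ *                                     int dstW, const uint8_t *dither, int offset)
+ *     {
+ *         int i, j;
+ *         for (i = 0; i < dstW; i++) {
+ *             int val = dither[(i + offset) & 7] << 12; // dither in 12-bit filter scale
+ *             for (j = 0; j < filterSize; j++)
+ *                 val += src[j][i] * filter[j];         // 15-bit samples * 12-bit coeffs
+ *             dest[i] = av_clip_uint8(val >> 19);       // 27-bit sum back to 8 bits
+ *         }
+ *     }
+ */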
+
+/**
+ * Write one line of horizontally scaled chroma to interleaved output
+ * with multi-point vertical scaling between input pixels.
+ *
+ * @param c SWS scaling context
+ * @param chrFilter vertical chroma scaling coefficients, 12bit [0,4096]
+ * @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param chrFilterSize number of vertical chroma input lines to scale
+ * @param dest pointer to the output plane. For >8bit
+ * output, this is in uint16_t
+ * @param dstW width of chroma planes
+ */
+typedef void (*yuv2interleavedX_fn)(struct SwsContext *c,
+ const int16_t *chrFilter,
+ int chrFilterSize,
+ const int16_t **chrUSrc,
+ const int16_t **chrVSrc,
+ uint8_t *dest, int dstW);
+
+/**
+ * Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB
+ * output without any additional vertical scaling (or point-scaling). Note
+ * that this function may do chroma scaling, see the "uvalpha" argument.
+ *
+ * @param c SWS scaling context
+ * @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param dest pointer to the output plane. For 16bit output, this is
+ * uint16_t
+ * @param dstW width of lumSrc and alpSrc in pixels, number of pixels
+ * to write into dest[]
+ * @param uvalpha chroma scaling coefficient for the second line of chroma
+ * pixels, either 2048 or 0. If 0, one chroma input is used
+ *                  for 2 output pixels (or if the SWS_FULL_CHR_H_INT flag
+ * is set, it generates 1 output pixel). If 2048, two chroma
+ * input pixels should be averaged for 2 output pixels (this
+ *                  only happens if SWS_FULL_CHR_H_INT is not set)
+ * @param y vertical line number for this output. This does not need
+ * to be used to calculate the offset in the destination,
+ * but can be used to generate comfort noise using dithering
+ * for some output formats.
+ */
+typedef void (*yuv2packed1_fn)(struct SwsContext *c, const int16_t *lumSrc,
+ const int16_t *chrUSrc[2],
+ const int16_t *chrVSrc[2],
+ const int16_t *alpSrc, uint8_t *dest,
+ int dstW, int uvalpha, int y);
+/**
+ * Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB
+ * output by doing bilinear scaling between two input lines.
+ *
+ * @param c SWS scaling context
+ * @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param dest pointer to the output plane. For 16bit output, this is
+ * uint16_t
+ * @param dstW width of lumSrc and alpSrc in pixels, number of pixels
+ * to write into dest[]
+ * @param yalpha luma/alpha scaling coefficients for the second input line.
+ * The first line's coefficients can be calculated by using
+ * 4096 - yalpha
+ * @param uvalpha chroma scaling coefficient for the second input line. The
+ * first line's coefficients can be calculated by using
+ * 4096 - uvalpha
+ * @param y vertical line number for this output. This does not need
+ * to be used to calculate the offset in the destination,
+ * but can be used to generate comfort noise using dithering
+ * for some output formats.
+ */
+typedef void (*yuv2packed2_fn)(struct SwsContext *c, const int16_t *lumSrc[2],
+ const int16_t *chrUSrc[2],
+ const int16_t *chrVSrc[2],
+ const int16_t *alpSrc[2],
+ uint8_t *dest,
+ int dstW, int yalpha, int uvalpha, int y);
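+
+/*
+ * The blend implied by yalpha/uvalpha, sketched for one 8-bit luma sample
+ * (illustrative only; real code also handles higher bit depths and chroma):
+ *
+ *     // two adjacent input lines, 15-bit samples, 12-bit weights summing to 4096
+ *     int Y = (lumSrc[0][i] * (4096 - yalpha) + lumSrc[1][i] * yalpha) >> 19;
+ */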
+/**
+ * Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB
+ * output by doing multi-point vertical scaling between input pixels.
+ *
+ * @param c SWS scaling context
+ * @param lumFilter vertical luma/alpha scaling coefficients, 12bit [0,4096]
+ * @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param lumFilterSize number of vertical luma/alpha input lines to scale
+ * @param chrFilter vertical chroma scaling coefficients, 12bit [0,4096]
+ * @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param chrFilterSize number of vertical chroma input lines to scale
+ * @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param dest pointer to the output plane. For 16bit output, this is
+ * uint16_t
+ * @param dstW width of lumSrc and alpSrc in pixels, number of pixels
+ * to write into dest[]
+ * @param y vertical line number for this output. This does not need
+ * to be used to calculate the offset in the destination,
+ * but can be used to generate comfort noise using dithering
+ *                      for some output formats.
+ */
+typedef void (*yuv2packedX_fn)(struct SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter,
+ const int16_t **chrUSrc,
+ const int16_t **chrVSrc, int chrFilterSize,
+ const int16_t **alpSrc, uint8_t *dest,
+ int dstW, int y);
+
+/**
+ * Write one line of horizontally scaled Y/U/V/A to YUV/RGB
+ * output by doing multi-point vertical scaling between input pixels.
+ *
+ * @param c SWS scaling context
+ * @param lumFilter vertical luma/alpha scaling coefficients, 12bit [0,4096]
+ * @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param lumFilterSize number of vertical luma/alpha input lines to scale
+ * @param chrFilter vertical chroma scaling coefficients, 12bit [0,4096]
+ * @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param chrFilterSize number of vertical chroma input lines to scale
+ * @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output,
+ * 19-bit for 16bit output (in int32_t)
+ * @param dest pointer to the output planes. For 16bit output, this is
+ * uint16_t
+ * @param dstW width of lumSrc and alpSrc in pixels, number of pixels
+ * to write into dest[]
+ * @param y vertical line number for this output. This does not need
+ * to be used to calculate the offset in the destination,
+ * but can be used to generate comfort noise using dithering
+ *                      for some output formats.
+ */
+typedef void (*yuv2anyX_fn)(struct SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter,
+ const int16_t **chrUSrc,
+ const int16_t **chrVSrc, int chrFilterSize,
+ const int16_t **alpSrc, uint8_t **dest,
+ int dstW, int y);
+
+/* This struct should be aligned on at least a 32-byte boundary. */
+typedef struct SwsContext {
+ /**
+ * info on struct for av_log
+ */
+ const AVClass *av_class;
+
+ /**
+ * Note that src, dst, srcStride, dstStride will be copied in the
+ * sws_scale() wrapper so they can be freely modified here.
+ */
+ SwsFunc swScale;
+ int srcW; ///< Width of source luma/alpha planes.
+ int srcH; ///< Height of source luma/alpha planes.
+ int dstH; ///< Height of destination luma/alpha planes.
+ int chrSrcW; ///< Width of source chroma planes.
+ int chrSrcH; ///< Height of source chroma planes.
+ int chrDstW; ///< Width of destination chroma planes.
+ int chrDstH; ///< Height of destination chroma planes.
+ int lumXInc, chrXInc;
+ int lumYInc, chrYInc;
+ enum AVPixelFormat dstFormat; ///< Destination pixel format.
+ enum AVPixelFormat srcFormat; ///< Source pixel format.
+ int dstFormatBpp; ///< Number of bits per pixel of the destination pixel format.
+ int srcFormatBpp; ///< Number of bits per pixel of the source pixel format.
+ int dstBpc, srcBpc;
+ int chrSrcHSubSample; ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in source image.
+ int chrSrcVSubSample; ///< Binary logarithm of vertical subsampling factor between luma/alpha and chroma planes in source image.
+ int chrDstHSubSample; ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in destination image.
+ int chrDstVSubSample; ///< Binary logarithm of vertical subsampling factor between luma/alpha and chroma planes in destination image.
+ int vChrDrop; ///< Binary logarithm of extra vertical subsampling factor in source image chroma planes specified by user.
+ int sliceDir; ///< Direction that slices are fed to the scaler (1 = top-to-bottom, -1 = bottom-to-top).
+ double param[2]; ///< Input parameters for scaling algorithms that need them.
+
+ uint32_t pal_yuv[256];
+ uint32_t pal_rgb[256];
+
+ /**
+ * @name Scaled horizontal lines ring buffer.
+ * The horizontal scaler keeps just enough scaled lines in a ring buffer
+ * so they may be passed to the vertical scaler. The pointers to the
+ * allocated buffers for each line are duplicated in sequence in the ring
+ * buffer to simplify indexing and avoid wrapping around between lines
+ * inside the vertical scaler code. The wrapping is done before the
+ * vertical scaler is called.
+ */
+ //@{
+ int16_t **lumPixBuf; ///< Ring buffer for scaled horizontal luma plane lines to be fed to the vertical scaler.
+ int16_t **chrUPixBuf; ///< Ring buffer for scaled horizontal chroma plane lines to be fed to the vertical scaler.
+ int16_t **chrVPixBuf; ///< Ring buffer for scaled horizontal chroma plane lines to be fed to the vertical scaler.
+ int16_t **alpPixBuf; ///< Ring buffer for scaled horizontal alpha plane lines to be fed to the vertical scaler.
+ int vLumBufSize; ///< Number of vertical luma/alpha lines allocated in the ring buffer.
+ int vChrBufSize; ///< Number of vertical chroma lines allocated in the ring buffer.
+ int lastInLumBuf; ///< Last scaled horizontal luma/alpha line from source in the ring buffer.
+ int lastInChrBuf; ///< Last scaled horizontal chroma line from source in the ring buffer.
+ int lumBufIndex; ///< Index in ring buffer of the last scaled horizontal luma/alpha line from source.
+ int chrBufIndex; ///< Index in ring buffer of the last scaled horizontal chroma line from source.
+ //@}
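+ /* Illustration of the duplication described above (see the allocation in
+  * utils.c): the pointer arrays hold 2 * vLumBufSize (resp. 2 * vChrBufSize)
+  * entries, with lumPixBuf[i] and lumPixBuf[i + vLumBufSize] referring to the
+  * same allocated line, so the vertical scaler may index past the nominal end
+  * without an explicit modulo. */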
+
+ uint8_t *formatConvBuffer;
+
+ /**
+ * @name Horizontal and vertical filters.
+ * To better understand the following fields, here is a pseudo-code of
+ * their usage in filtering a horizontal line:
+ * @code
+ * for (i = 0; i < width; i++) {
+ * dst[i] = 0;
+ * for (j = 0; j < filterSize; j++)
+ * dst[i] += src[ filterPos[i] + j ] * filter[ filterSize * i + j ];
+ * dst[i] >>= FRAC_BITS; // The actual implementation is fixed-point.
+ * }
+ * @endcode
+ */
+ //@{
+ int16_t *hLumFilter; ///< Array of horizontal filter coefficients for luma/alpha planes.
+ int16_t *hChrFilter; ///< Array of horizontal filter coefficients for chroma planes.
+ int16_t *vLumFilter; ///< Array of vertical filter coefficients for luma/alpha planes.
+ int16_t *vChrFilter; ///< Array of vertical filter coefficients for chroma planes.
+ int32_t *hLumFilterPos; ///< Array of horizontal filter starting positions for each dst[i] for luma/alpha planes.
+ int32_t *hChrFilterPos; ///< Array of horizontal filter starting positions for each dst[i] for chroma planes.
+ int32_t *vLumFilterPos; ///< Array of vertical filter starting positions for each dst[i] for luma/alpha planes.
+ int32_t *vChrFilterPos; ///< Array of vertical filter starting positions for each dst[i] for chroma planes.
+ int hLumFilterSize; ///< Horizontal filter size for luma/alpha pixels.
+ int hChrFilterSize; ///< Horizontal filter size for chroma pixels.
+ int vLumFilterSize; ///< Vertical filter size for luma/alpha pixels.
+ int vChrFilterSize; ///< Vertical filter size for chroma pixels.
+ //@}
+
+ int lumMmxextFilterCodeSize; ///< Runtime-generated MMXEXT horizontal fast bilinear scaler code size for luma/alpha planes.
+ int chrMmxextFilterCodeSize; ///< Runtime-generated MMXEXT horizontal fast bilinear scaler code size for chroma planes.
+ uint8_t *lumMmxextFilterCode; ///< Runtime-generated MMXEXT horizontal fast bilinear scaler code for luma/alpha planes.
+ uint8_t *chrMmxextFilterCode; ///< Runtime-generated MMXEXT horizontal fast bilinear scaler code for chroma planes.
+
+ int canMMXEXTBeUsed;
+
+ int dstY; ///< Last destination vertical line output from last slice.
+ int flags; ///< Flags passed by the user to select scaler algorithm, optimizations, subsampling, etc...
+ void *yuvTable; // pointer to the yuv->rgb table start so it can be freed()
+ uint8_t *table_rV[256 + 2*YUVRGB_TABLE_HEADROOM];
+ uint8_t *table_gU[256 + 2*YUVRGB_TABLE_HEADROOM];
+ int table_gV[256 + 2*YUVRGB_TABLE_HEADROOM];
+ uint8_t *table_bU[256 + 2*YUVRGB_TABLE_HEADROOM];
+
+ int *dither_error[4];
+
+ //Colorspace stuff
+ int contrast, brightness, saturation; // for sws_getColorspaceDetails
+ int srcColorspaceTable[4];
+ int dstColorspaceTable[4];
+ int srcRange; ///< 0 = MPG YUV range, 1 = JPG YUV range (source image).
+ int dstRange; ///< 0 = MPG YUV range, 1 = JPG YUV range (destination image).
+ int src0Alpha;
+ int dst0Alpha;
+ int yuv2rgb_y_offset;
+ int yuv2rgb_y_coeff;
+ int yuv2rgb_v2r_coeff;
+ int yuv2rgb_v2g_coeff;
+ int yuv2rgb_u2g_coeff;
+ int yuv2rgb_u2b_coeff;
+
+#define RED_DITHER "0*8"
+#define GREEN_DITHER "1*8"
+#define BLUE_DITHER "2*8"
+#define Y_COEFF "3*8"
+#define VR_COEFF "4*8"
+#define UB_COEFF "5*8"
+#define VG_COEFF "6*8"
+#define UG_COEFF "7*8"
+#define Y_OFFSET "8*8"
+#define U_OFFSET "9*8"
+#define V_OFFSET "10*8"
+#define LUM_MMX_FILTER_OFFSET "11*8"
+#define CHR_MMX_FILTER_OFFSET "11*8+4*4*256"
+#define DSTW_OFFSET "11*8+4*4*256*2" //do not change, it is hardcoded in the ASM
+#define ESP_OFFSET "11*8+4*4*256*2+8"
+#define VROUNDER_OFFSET "11*8+4*4*256*2+16"
+#define U_TEMP "11*8+4*4*256*2+24"
+#define V_TEMP "11*8+4*4*256*2+32"
+#define Y_TEMP "11*8+4*4*256*2+40"
+#define ALP_MMX_FILTER_OFFSET "11*8+4*4*256*2+48"
+#define UV_OFF_PX "11*8+4*4*256*3+48"
+#define UV_OFF_BYTE "11*8+4*4*256*3+56"
+#define DITHER16 "11*8+4*4*256*3+64"
+#define DITHER32 "11*8+4*4*256*3+80"
+
+ DECLARE_ALIGNED(8, uint64_t, redDither);
+ DECLARE_ALIGNED(8, uint64_t, greenDither);
+ DECLARE_ALIGNED(8, uint64_t, blueDither);
+
+ DECLARE_ALIGNED(8, uint64_t, yCoeff);
+ DECLARE_ALIGNED(8, uint64_t, vrCoeff);
+ DECLARE_ALIGNED(8, uint64_t, ubCoeff);
+ DECLARE_ALIGNED(8, uint64_t, vgCoeff);
+ DECLARE_ALIGNED(8, uint64_t, ugCoeff);
+ DECLARE_ALIGNED(8, uint64_t, yOffset);
+ DECLARE_ALIGNED(8, uint64_t, uOffset);
+ DECLARE_ALIGNED(8, uint64_t, vOffset);
+ int32_t lumMmxFilter[4 * MAX_FILTER_SIZE];
+ int32_t chrMmxFilter[4 * MAX_FILTER_SIZE];
+ int dstW; ///< Width of destination luma/alpha planes.
+ DECLARE_ALIGNED(8, uint64_t, esp);
+ DECLARE_ALIGNED(8, uint64_t, vRounder);
+ DECLARE_ALIGNED(8, uint64_t, u_temp);
+ DECLARE_ALIGNED(8, uint64_t, v_temp);
+ DECLARE_ALIGNED(8, uint64_t, y_temp);
+ int32_t alpMmxFilter[4 * MAX_FILTER_SIZE];
+ // alignment of these values is not necessary, but merely here
+ // to maintain the same offset across x86-32 and x86-64. Once we
+ // use proper offset macros in the asm, they can be removed.
+ DECLARE_ALIGNED(8, ptrdiff_t, uv_off); ///< offset (in pixels) between u and v planes
+ DECLARE_ALIGNED(8, ptrdiff_t, uv_offx2); ///< offset (in bytes) between u and v planes
+ DECLARE_ALIGNED(8, uint16_t, dither16)[8];
+ DECLARE_ALIGNED(8, uint32_t, dither32)[8];
+
+ const uint8_t *chrDither8, *lumDither8;
+
+#if HAVE_ALTIVEC
+ vector signed short CY;
+ vector signed short CRV;
+ vector signed short CBU;
+ vector signed short CGU;
+ vector signed short CGV;
+ vector signed short OY;
+ vector unsigned short CSHIFT;
+ vector signed short *vYCoeffsBank, *vCCoeffsBank;
+#endif
+
+#if ARCH_BFIN
+ DECLARE_ALIGNED(4, uint32_t, oy);
+ DECLARE_ALIGNED(4, uint32_t, oc);
+ DECLARE_ALIGNED(4, uint32_t, zero);
+ DECLARE_ALIGNED(4, uint32_t, cy);
+ DECLARE_ALIGNED(4, uint32_t, crv);
+ DECLARE_ALIGNED(4, uint32_t, rmask);
+ DECLARE_ALIGNED(4, uint32_t, cbu);
+ DECLARE_ALIGNED(4, uint32_t, bmask);
+ DECLARE_ALIGNED(4, uint32_t, cgu);
+ DECLARE_ALIGNED(4, uint32_t, cgv);
+ DECLARE_ALIGNED(4, uint32_t, gmask);
+#endif
+
+#if HAVE_VIS
+ DECLARE_ALIGNED(8, uint64_t, sparc_coeffs)[10];
+#endif
+ int use_mmx_vfilter;
+
+ /* function pointers for swScale() */
+ yuv2planar1_fn yuv2plane1;
+ yuv2planarX_fn yuv2planeX;
+ yuv2interleavedX_fn yuv2nv12cX;
+ yuv2packed1_fn yuv2packed1;
+ yuv2packed2_fn yuv2packed2;
+ yuv2packedX_fn yuv2packedX;
+ yuv2anyX_fn yuv2anyX;
+
+ /// Unscaled conversion of luma plane to YV12 for horizontal scaler.
+ void (*lumToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
+ int width, uint32_t *pal);
+ /// Unscaled conversion of alpha plane to YV12 for horizontal scaler.
+ void (*alpToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
+ int width, uint32_t *pal);
+ /// Unscaled conversion of chroma planes to YV12 for horizontal scaler.
+ void (*chrToYV12)(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
+ int width, uint32_t *pal);
+
+ /**
+ * Functions to read planar input, such as planar RGB, and convert
+ * internally to Y/UV.
+ */
+ /** @{ */
+ void (*readLumPlanar)(uint8_t *dst, const uint8_t *src[4], int width);
+ void (*readChrPlanar)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4],
+ int width);
+ /** @} */
+
+ /**
+ * Scale one horizontal line of input data using a bilinear filter
+ * to produce one line of output data. Compared to SwsContext->hScale(),
+ * please take note of the following caveats when using these:
+ * - Scaling is done using only 7bit instead of 14bit coefficients.
+ * - You can use no more than 5 input pixels to produce 4 output
+ * pixels. Therefore, this filter should not be used for downscaling
+ * by more than ~20% in width (because that equals more than 5/4th
+ * downscaling and thus more than 5 pixels input per 4 pixels output).
+ * - In general, bilinear filters create artifacts during downscaling
+ * (even when <20%), because one output pixel will span more than one
+ * input pixel, and thus some pixels will need edges of both neighbor
+ * pixels to interpolate the output pixel. Since you can use at most
+ * two input pixels per output pixel in bilinear scaling, this is
+ * impossible and thus downscaling by any size will create artifacts.
+ * To enable this type of scaling, set SWS_FAST_BILINEAR
+ * in SwsContext->flags.
+ */
+ /** @{ */
+ void (*hyscale_fast)(struct SwsContext *c,
+ int16_t *dst, int dstWidth,
+ const uint8_t *src, int srcW, int xInc);
+ void (*hcscale_fast)(struct SwsContext *c,
+ int16_t *dst1, int16_t *dst2, int dstWidth,
+ const uint8_t *src1, const uint8_t *src2,
+ int srcW, int xInc);
+ /** @} */
+
+ /**
+ * Scale one horizontal line of input data using a filter over the input
+ * lines, to produce one (differently sized) line of output data.
+ *
+ * @param dst pointer to destination buffer for horizontally scaled
+ * data. If the number of bits per component of one
+ * destination pixel (SwsContext->dstBpc) is <= 10, data
+ * will be 15bpc in 16bits (int16_t) width. Else (i.e.
+ * SwsContext->dstBpc == 16), data will be 19bpc in
+ * 32bits (int32_t) width.
+ * @param dstW width of destination image
+ * @param src pointer to source data to be scaled. If the number of
+ * bits per component of a source pixel (SwsContext->srcBpc)
+ * is 8, this is 8bpc in 8bits (uint8_t) width. Else
+ * (i.e. SwsContext->srcBpc > 8), this is native depth
+ * in 16bits (uint16_t) width. In other words, for 9-bit
+ * YUV input, this is 9bpc, for 10-bit YUV input, this is
+ * 10bpc, and for 16-bit RGB or YUV, this is 16bpc.
+ * @param filter filter coefficients to be used per output pixel for
+ * scaling. This contains 14bpp filtering coefficients.
+ * Guaranteed to contain dstW * filterSize entries.
+ * @param filterPos position of the first input pixel to be used for
+ * each output pixel during scaling. Guaranteed to
+ * contain dstW entries.
+ * @param filterSize the number of input coefficients to be used (and
+ * thus the number of input pixels to be used) for
+ * creating a single output pixel. Is aligned to 4
+ * (and input coefficients thus padded with zeroes)
+ * to simplify creating SIMD code.
+ */
+ /** @{ */
+ void (*hyScale)(struct SwsContext *c, int16_t *dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize);
+ void (*hcScale)(struct SwsContext *c, int16_t *dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize);
+ /** @} */
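+ /*
+  * Sketch of the plain-C 8bpc-to-15bpc variant of the above (illustrative
+  * only; the actual implementations are in swscale.c and the arch subdirs):
+  *
+  *     for (i = 0; i < dstW; i++) {
+  *         int j, val = 0;
+  *         for (j = 0; j < filterSize; j++)
+  *             val += src[filterPos[i] + j] * filter[filterSize * i + j];
+  *         dst[i] = FFMIN(val >> 7, (1 << 15) - 1); // 8bpc * 14-bit coeffs -> 15bpc
+  *     }
+  */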
+
+ /// Color range conversion function for luma plane if needed.
+ void (*lumConvertRange)(int16_t *dst, int width);
+ /// Color range conversion function for chroma planes if needed.
+ void (*chrConvertRange)(int16_t *dst1, int16_t *dst2, int width);
+
+ int needs_hcscale; ///< Set if there are chroma planes to be converted.
+} SwsContext;
+//FIXME check init (where 0)
+
+SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c);
+int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
+ int fullRange, int brightness,
+ int contrast, int saturation);
+
+void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4],
+ int brightness, int contrast, int saturation);
+void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
+ int lastInLumBuf, int lastInChrBuf);
+
+SwsFunc ff_yuv2rgb_init_mmx(SwsContext *c);
+SwsFunc ff_yuv2rgb_init_vis(SwsContext *c);
+SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c);
+SwsFunc ff_yuv2rgb_get_func_ptr_bfin(SwsContext *c);
+void ff_bfin_get_unscaled_swscale(SwsContext *c);
+
+#if FF_API_SWS_FORMAT_NAME
+/**
+ * @deprecated Use av_get_pix_fmt_name() instead.
+ */
+attribute_deprecated
+const char *sws_format_name(enum AVPixelFormat format);
+#endif
+
+static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+ av_assert0(desc);
+ return desc->comp[0].depth_minus1 == 15;
+}
+
+static av_always_inline int is9_OR_10BPS(enum AVPixelFormat pix_fmt)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+ av_assert0(desc);
+ return desc->comp[0].depth_minus1 >= 8 && desc->comp[0].depth_minus1 <= 13;
+}
+
+#define isNBPS(x) is9_OR_10BPS(x)
+
+static av_always_inline int isBE(enum AVPixelFormat pix_fmt)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+ av_assert0(desc);
+ return desc->flags & PIX_FMT_BE;
+}
+
+static av_always_inline int isYUV(enum AVPixelFormat pix_fmt)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+ av_assert0(desc);
+ return !(desc->flags & PIX_FMT_RGB) && desc->nb_components >= 2;
+}
+
+static av_always_inline int isPlanarYUV(enum AVPixelFormat pix_fmt)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+ av_assert0(desc);
+ return ((desc->flags & PIX_FMT_PLANAR) && isYUV(pix_fmt));
+}
+
+static av_always_inline int isRGB(enum AVPixelFormat pix_fmt)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+ av_assert0(desc);
+ return (desc->flags & PIX_FMT_RGB);
+}
+
+#if 0 // FIXME
+#define isGray(x) \
+ (!(av_pix_fmt_desc_get(x)->flags & PIX_FMT_PAL) && \
+ av_pix_fmt_desc_get(x)->nb_components <= 2)
+#else
+#define isGray(x) \
+ ((x) == AV_PIX_FMT_GRAY8 || \
+ (x) == AV_PIX_FMT_Y400A || \
+ (x) == AV_PIX_FMT_GRAY16BE || \
+ (x) == AV_PIX_FMT_GRAY16LE)
+#endif
+
+#define isRGBinInt(x) \
+ ( \
+ (x) == AV_PIX_FMT_RGB48BE || \
+ (x) == AV_PIX_FMT_RGB48LE || \
+ (x) == AV_PIX_FMT_RGBA64BE || \
+ (x) == AV_PIX_FMT_RGBA64LE || \
+ (x) == AV_PIX_FMT_RGB32 || \
+ (x) == AV_PIX_FMT_RGB32_1 || \
+ (x) == AV_PIX_FMT_RGB24 || \
+ (x) == AV_PIX_FMT_RGB565BE || \
+ (x) == AV_PIX_FMT_RGB565LE || \
+ (x) == AV_PIX_FMT_RGB555BE || \
+ (x) == AV_PIX_FMT_RGB555LE || \
+ (x) == AV_PIX_FMT_RGB444BE || \
+ (x) == AV_PIX_FMT_RGB444LE || \
+ (x) == AV_PIX_FMT_RGB8 || \
+ (x) == AV_PIX_FMT_RGB4 || \
+ (x) == AV_PIX_FMT_RGB4_BYTE || \
+ (x) == AV_PIX_FMT_MONOBLACK || \
+ (x) == AV_PIX_FMT_MONOWHITE \
+ )
+#define isBGRinInt(x) \
+ ( \
+ (x) == AV_PIX_FMT_BGR48BE || \
+ (x) == AV_PIX_FMT_BGR48LE || \
+ (x) == AV_PIX_FMT_BGRA64BE || \
+ (x) == AV_PIX_FMT_BGRA64LE || \
+ (x) == AV_PIX_FMT_BGR32 || \
+ (x) == AV_PIX_FMT_BGR32_1 || \
+ (x) == AV_PIX_FMT_BGR24 || \
+ (x) == AV_PIX_FMT_BGR565BE || \
+ (x) == AV_PIX_FMT_BGR565LE || \
+ (x) == AV_PIX_FMT_BGR555BE || \
+ (x) == AV_PIX_FMT_BGR555LE || \
+ (x) == AV_PIX_FMT_BGR444BE || \
+ (x) == AV_PIX_FMT_BGR444LE || \
+ (x) == AV_PIX_FMT_BGR8 || \
+ (x) == AV_PIX_FMT_BGR4 || \
+ (x) == AV_PIX_FMT_BGR4_BYTE || \
+ (x) == AV_PIX_FMT_MONOBLACK || \
+ (x) == AV_PIX_FMT_MONOWHITE \
+ )
+
+#define isRGBinBytes(x) ( \
+ (x) == AV_PIX_FMT_RGB48BE \
+ || (x) == AV_PIX_FMT_RGB48LE \
+ || (x) == AV_PIX_FMT_RGBA64BE \
+ || (x) == AV_PIX_FMT_RGBA64LE \
+ || (x) == AV_PIX_FMT_RGBA \
+ || (x) == AV_PIX_FMT_ARGB \
+ || (x) == AV_PIX_FMT_RGB24 \
+ )
+#define isBGRinBytes(x) ( \
+ (x) == AV_PIX_FMT_BGR48BE \
+ || (x) == AV_PIX_FMT_BGR48LE \
+ || (x) == AV_PIX_FMT_BGRA64BE \
+ || (x) == AV_PIX_FMT_BGRA64LE \
+ || (x) == AV_PIX_FMT_BGRA \
+ || (x) == AV_PIX_FMT_ABGR \
+ || (x) == AV_PIX_FMT_BGR24 \
+ )
+
+#define isAnyRGB(x) \
+ ( \
+ isRGBinInt(x) || \
+ isBGRinInt(x) || \
+ isRGB(x) || \
+ (x)==AV_PIX_FMT_GBRP9LE || \
+ (x)==AV_PIX_FMT_GBRP9BE || \
+ (x)==AV_PIX_FMT_GBRP10LE || \
+ (x)==AV_PIX_FMT_GBRP10BE || \
+ (x)==AV_PIX_FMT_GBRP12LE || \
+ (x)==AV_PIX_FMT_GBRP12BE || \
+ (x)==AV_PIX_FMT_GBRP14LE || \
+ (x)==AV_PIX_FMT_GBRP14BE || \
+ (x)==AV_PIX_FMT_GBR24P \
+ )
+
+static av_always_inline int isALPHA(enum AVPixelFormat pix_fmt)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+ av_assert0(desc);
+ return desc->flags & PIX_FMT_ALPHA;
+}
+
+#if 1
+#define isPacked(x) ( \
+ (x)==AV_PIX_FMT_PAL8 \
+ || (x)==AV_PIX_FMT_YUYV422 \
+ || (x)==AV_PIX_FMT_UYVY422 \
+ || (x)==AV_PIX_FMT_Y400A \
+ || isRGBinInt(x) \
+ || isBGRinInt(x) \
+ )
+#else
+static av_always_inline int isPacked(enum AVPixelFormat pix_fmt)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+ av_assert0(desc);
+ return ((desc->nb_components >= 2 && !(desc->flags & PIX_FMT_PLANAR)) ||
+ pix_fmt == AV_PIX_FMT_PAL8);
+}
+
+#endif
+static av_always_inline int isPlanar(enum AVPixelFormat pix_fmt)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+ av_assert0(desc);
+ return (desc->nb_components >= 2 && (desc->flags & PIX_FMT_PLANAR));
+}
+
+static av_always_inline int isPackedRGB(enum AVPixelFormat pix_fmt)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+ av_assert0(desc);
+ return ((desc->flags & (PIX_FMT_PLANAR | PIX_FMT_RGB)) == PIX_FMT_RGB);
+}
+
+static av_always_inline int isPlanarRGB(enum AVPixelFormat pix_fmt)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+ av_assert0(desc);
+ return ((desc->flags & (PIX_FMT_PLANAR | PIX_FMT_RGB)) ==
+ (PIX_FMT_PLANAR | PIX_FMT_RGB));
+}
+
+static av_always_inline int usePal(enum AVPixelFormat pix_fmt)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+ av_assert0(desc);
+ return (desc->flags & PIX_FMT_PAL) || (desc->flags & PIX_FMT_PSEUDOPAL);
+}
+
+extern const uint64_t ff_dither4[2];
+extern const uint64_t ff_dither8[2];
+extern const uint8_t dithers[8][8][8];
+extern const uint16_t dither_scale[15][16];
+
+
+extern const AVClass sws_context_class;
+
+/**
+ * Set c->swScale to an unscaled converter if one exists for the specific
+ * source and destination formats, bit depths, flags, etc.
+ */
+void ff_get_unscaled_swscale(SwsContext *c);
+
+void ff_swscale_get_unscaled_altivec(SwsContext *c);
+
+/**
+ * Return function pointer to fastest main scaler path function depending
+ * on architecture and available optimizations.
+ */
+SwsFunc ff_getSwsFunc(SwsContext *c);
+
+void ff_sws_init_input_funcs(SwsContext *c);
+void ff_sws_init_output_funcs(SwsContext *c,
+ yuv2planar1_fn *yuv2plane1,
+ yuv2planarX_fn *yuv2planeX,
+ yuv2interleavedX_fn *yuv2nv12cX,
+ yuv2packed1_fn *yuv2packed1,
+ yuv2packed2_fn *yuv2packed2,
+ yuv2packedX_fn *yuv2packedX,
+ yuv2anyX_fn *yuv2anyX);
+void ff_sws_init_swScale_altivec(SwsContext *c);
+void ff_sws_init_swScale_mmx(SwsContext *c);
+
+static inline void fillPlane16(uint8_t *plane, int stride, int width, int height, int y,
+ int alpha, int bits, const int big_endian)
+{
+ int i, j;
+ uint8_t *ptr = plane + stride * y;
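+ /* full opacity for alpha planes, mid-range (e.g. neutral chroma) value otherwise, at the given bit depth */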
+ int v = alpha ? 0xFFFF>>(15-bits) : (1<<bits);
+ for (i = 0; i < height; i++) {
+#define FILL(wfunc) \
+ for (j = 0; j < width; j++) {\
+ wfunc(ptr+2*j, v);\
+ }
+ if (big_endian) {
+ FILL(AV_WB16);
+ } else {
+ FILL(AV_WL16);
+ }
+ ptr += stride;
+ }
+}
+
+#endif /* SWSCALE_SWSCALE_INTERNAL_H */
diff --git a/ffmpeg/libswscale/swscale_unscaled.c b/ffmpeg/libswscale/swscale_unscaled.c
new file mode 100644
index 0000000..4a22fca
--- /dev/null
+++ b/ffmpeg/libswscale/swscale_unscaled.c
@@ -0,0 +1,1146 @@
+/*
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <inttypes.h>
+#include <string.h>
+#include <math.h>
+#include <stdio.h>
+#include "config.h"
+#include "swscale.h"
+#include "swscale_internal.h"
+#include "rgb2rgb.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/cpu.h"
+#include "libavutil/avutil.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/bswap.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/avassert.h"
+
+DECLARE_ALIGNED(8, const uint8_t, dithers)[8][8][8]={
+{
+ { 0, 1, 0, 1, 0, 1, 0, 1,},
+ { 1, 0, 1, 0, 1, 0, 1, 0,},
+ { 0, 1, 0, 1, 0, 1, 0, 1,},
+ { 1, 0, 1, 0, 1, 0, 1, 0,},
+ { 0, 1, 0, 1, 0, 1, 0, 1,},
+ { 1, 0, 1, 0, 1, 0, 1, 0,},
+ { 0, 1, 0, 1, 0, 1, 0, 1,},
+ { 1, 0, 1, 0, 1, 0, 1, 0,},
+},{
+ { 1, 2, 1, 2, 1, 2, 1, 2,},
+ { 3, 0, 3, 0, 3, 0, 3, 0,},
+ { 1, 2, 1, 2, 1, 2, 1, 2,},
+ { 3, 0, 3, 0, 3, 0, 3, 0,},
+ { 1, 2, 1, 2, 1, 2, 1, 2,},
+ { 3, 0, 3, 0, 3, 0, 3, 0,},
+ { 1, 2, 1, 2, 1, 2, 1, 2,},
+ { 3, 0, 3, 0, 3, 0, 3, 0,},
+},{
+ { 2, 4, 3, 5, 2, 4, 3, 5,},
+ { 6, 0, 7, 1, 6, 0, 7, 1,},
+ { 3, 5, 2, 4, 3, 5, 2, 4,},
+ { 7, 1, 6, 0, 7, 1, 6, 0,},
+ { 2, 4, 3, 5, 2, 4, 3, 5,},
+ { 6, 0, 7, 1, 6, 0, 7, 1,},
+ { 3, 5, 2, 4, 3, 5, 2, 4,},
+ { 7, 1, 6, 0, 7, 1, 6, 0,},
+},{
+ { 4, 8, 7, 11, 4, 8, 7, 11,},
+ { 12, 0, 15, 3, 12, 0, 15, 3,},
+ { 6, 10, 5, 9, 6, 10, 5, 9,},
+ { 14, 2, 13, 1, 14, 2, 13, 1,},
+ { 4, 8, 7, 11, 4, 8, 7, 11,},
+ { 12, 0, 15, 3, 12, 0, 15, 3,},
+ { 6, 10, 5, 9, 6, 10, 5, 9,},
+ { 14, 2, 13, 1, 14, 2, 13, 1,},
+},{
+ { 9, 17, 15, 23, 8, 16, 14, 22,},
+ { 25, 1, 31, 7, 24, 0, 30, 6,},
+ { 13, 21, 11, 19, 12, 20, 10, 18,},
+ { 29, 5, 27, 3, 28, 4, 26, 2,},
+ { 8, 16, 14, 22, 9, 17, 15, 23,},
+ { 24, 0, 30, 6, 25, 1, 31, 7,},
+ { 12, 20, 10, 18, 13, 21, 11, 19,},
+ { 28, 4, 26, 2, 29, 5, 27, 3,},
+},{
+ { 18, 34, 30, 46, 17, 33, 29, 45,},
+ { 50, 2, 62, 14, 49, 1, 61, 13,},
+ { 26, 42, 22, 38, 25, 41, 21, 37,},
+ { 58, 10, 54, 6, 57, 9, 53, 5,},
+ { 16, 32, 28, 44, 19, 35, 31, 47,},
+ { 48, 0, 60, 12, 51, 3, 63, 15,},
+ { 24, 40, 20, 36, 27, 43, 23, 39,},
+ { 56, 8, 52, 4, 59, 11, 55, 7,},
+},{
+ { 18, 34, 30, 46, 17, 33, 29, 45,},
+ { 50, 2, 62, 14, 49, 1, 61, 13,},
+ { 26, 42, 22, 38, 25, 41, 21, 37,},
+ { 58, 10, 54, 6, 57, 9, 53, 5,},
+ { 16, 32, 28, 44, 19, 35, 31, 47,},
+ { 48, 0, 60, 12, 51, 3, 63, 15,},
+ { 24, 40, 20, 36, 27, 43, 23, 39,},
+ { 56, 8, 52, 4, 59, 11, 55, 7,},
+},{
+ { 36, 68, 60, 92, 34, 66, 58, 90,},
+ { 100, 4,124, 28, 98, 2,122, 26,},
+ { 52, 84, 44, 76, 50, 82, 42, 74,},
+ { 116, 20,108, 12,114, 18,106, 10,},
+ { 32, 64, 56, 88, 38, 70, 62, 94,},
+ { 96, 0,120, 24,102, 6,126, 30,},
+ { 48, 80, 40, 72, 54, 86, 46, 78,},
+ { 112, 16,104, 8,118, 22,110, 14,},
+}};
+
+const uint16_t dither_scale[15][16]={
+{ 2, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,},
+{ 2, 3, 7, 7, 13, 13, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,},
+{ 3, 3, 4, 15, 15, 29, 57, 57, 57, 113, 113, 113, 113, 113, 113, 113,},
+{ 3, 4, 4, 5, 31, 31, 61, 121, 241, 241, 241, 241, 481, 481, 481, 481,},
+{ 3, 4, 5, 5, 6, 63, 63, 125, 249, 497, 993, 993, 993, 993, 993, 1985,},
+{ 3, 5, 6, 6, 6, 7, 127, 127, 253, 505, 1009, 2017, 4033, 4033, 4033, 4033,},
+{ 3, 5, 6, 7, 7, 7, 8, 255, 255, 509, 1017, 2033, 4065, 8129,16257,16257,},
+{ 3, 5, 6, 8, 8, 8, 8, 9, 511, 511, 1021, 2041, 4081, 8161,16321,32641,},
+{ 3, 5, 7, 8, 9, 9, 9, 9, 10, 1023, 1023, 2045, 4089, 8177,16353,32705,},
+{ 3, 5, 7, 8, 10, 10, 10, 10, 10, 11, 2047, 2047, 4093, 8185,16369,32737,},
+{ 3, 5, 7, 8, 10, 11, 11, 11, 11, 11, 12, 4095, 4095, 8189,16377,32753,},
+{ 3, 5, 7, 9, 10, 12, 12, 12, 12, 12, 12, 13, 8191, 8191,16381,32761,},
+{ 3, 5, 7, 9, 10, 12, 13, 13, 13, 13, 13, 13, 14,16383,16383,32765,},
+{ 3, 5, 7, 9, 10, 12, 14, 14, 14, 14, 14, 14, 14, 15,32767,32767,},
+{ 3, 5, 7, 9, 11, 12, 14, 15, 15, 15, 15, 15, 15, 15, 16,65535,},
+};
+
+
+static void fillPlane(uint8_t *plane, int stride, int width, int height, int y,
+ uint8_t val)
+{
+ int i;
+ uint8_t *ptr = plane + stride * y;
+ for (i = 0; i < height; i++) {
+ memset(ptr, val, width);
+ ptr += stride;
+ }
+}
+
+static void copyPlane(const uint8_t *src, int srcStride,
+ int srcSliceY, int srcSliceH, int width,
+ uint8_t *dst, int dstStride)
+{
+ dst += dstStride * srcSliceY;
+ if (dstStride == srcStride && srcStride > 0) {
+ memcpy(dst, src, srcSliceH * dstStride);
+ } else {
+ int i;
+ for (i = 0; i < srcSliceH; i++) {
+ memcpy(dst, src, width);
+ src += srcStride;
+ dst += dstStride;
+ }
+ }
+}
+
+static int planarToNv12Wrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY,
+ int srcSliceH, uint8_t *dstParam[],
+ int dstStride[])
+{
+ uint8_t *dst = dstParam[1] + dstStride[1] * srcSliceY / 2;
+
+ copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+ dstParam[0], dstStride[0]);
+
+ if (c->dstFormat == AV_PIX_FMT_NV12)
+ interleaveBytes(src[1], src[2], dst, c->srcW / 2, srcSliceH / 2,
+ srcStride[1], srcStride[2], dstStride[0]);
+ else
+ interleaveBytes(src[2], src[1], dst, c->srcW / 2, srcSliceH / 2,
+ srcStride[2], srcStride[1], dstStride[0]);
+
+ return srcSliceH;
+}
+
+static int planarToYuy2Wrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dstParam[], int dstStride[])
+{
+ uint8_t *dst = dstParam[0] + dstStride[0] * srcSliceY;
+
+ yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0],
+ srcStride[1], dstStride[0]);
+
+ return srcSliceH;
+}
+
+static int planarToUyvyWrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dstParam[], int dstStride[])
+{
+ uint8_t *dst = dstParam[0] + dstStride[0] * srcSliceY;
+
+ yv12touyvy(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0],
+ srcStride[1], dstStride[0]);
+
+ return srcSliceH;
+}
+
+static int yuv422pToYuy2Wrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dstParam[], int dstStride[])
+{
+ uint8_t *dst = dstParam[0] + dstStride[0] * srcSliceY;
+
+ yuv422ptoyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0],
+ srcStride[1], dstStride[0]);
+
+ return srcSliceH;
+}
+
+static int yuv422pToUyvyWrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dstParam[], int dstStride[])
+{
+ uint8_t *dst = dstParam[0] + dstStride[0] * srcSliceY;
+
+ yuv422ptouyvy(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0],
+ srcStride[1], dstStride[0]);
+
+ return srcSliceH;
+}
+
+static int yuyvToYuv420Wrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dstParam[], int dstStride[])
+{
+ uint8_t *ydst = dstParam[0] + dstStride[0] * srcSliceY;
+ uint8_t *udst = dstParam[1] + dstStride[1] * srcSliceY / 2;
+ uint8_t *vdst = dstParam[2] + dstStride[2] * srcSliceY / 2;
+
+ yuyvtoyuv420(ydst, udst, vdst, src[0], c->srcW, srcSliceH, dstStride[0],
+ dstStride[1], srcStride[0]);
+
+ if (dstParam[3])
+ fillPlane(dstParam[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
+
+ return srcSliceH;
+}
+
+static int yuyvToYuv422Wrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dstParam[], int dstStride[])
+{
+ uint8_t *ydst = dstParam[0] + dstStride[0] * srcSliceY;
+ uint8_t *udst = dstParam[1] + dstStride[1] * srcSliceY;
+ uint8_t *vdst = dstParam[2] + dstStride[2] * srcSliceY;
+
+ yuyvtoyuv422(ydst, udst, vdst, src[0], c->srcW, srcSliceH, dstStride[0],
+ dstStride[1], srcStride[0]);
+
+ return srcSliceH;
+}
+
+static int uyvyToYuv420Wrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dstParam[], int dstStride[])
+{
+ uint8_t *ydst = dstParam[0] + dstStride[0] * srcSliceY;
+ uint8_t *udst = dstParam[1] + dstStride[1] * srcSliceY / 2;
+ uint8_t *vdst = dstParam[2] + dstStride[2] * srcSliceY / 2;
+
+ uyvytoyuv420(ydst, udst, vdst, src[0], c->srcW, srcSliceH, dstStride[0],
+ dstStride[1], srcStride[0]);
+
+ if (dstParam[3])
+ fillPlane(dstParam[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
+
+ return srcSliceH;
+}
+
+static int uyvyToYuv422Wrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dstParam[], int dstStride[])
+{
+ uint8_t *ydst = dstParam[0] + dstStride[0] * srcSliceY;
+ uint8_t *udst = dstParam[1] + dstStride[1] * srcSliceY;
+ uint8_t *vdst = dstParam[2] + dstStride[2] * srcSliceY;
+
+ uyvytoyuv422(ydst, udst, vdst, src[0], c->srcW, srcSliceH, dstStride[0],
+ dstStride[1], srcStride[0]);
+
+ return srcSliceH;
+}
+
+static void gray8aToPacked32(const uint8_t *src, uint8_t *dst, int num_pixels,
+ const uint8_t *palette)
+{
+ int i;
+ for (i = 0; i < num_pixels; i++)
+ ((uint32_t *) dst)[i] = ((const uint32_t *) palette)[src[i << 1]] | (src[(i << 1) + 1] << 24);
+}
+
+static void gray8aToPacked32_1(const uint8_t *src, uint8_t *dst, int num_pixels,
+ const uint8_t *palette)
+{
+ int i;
+
+ for (i = 0; i < num_pixels; i++)
+ ((uint32_t *) dst)[i] = ((const uint32_t *) palette)[src[i << 1]] | src[(i << 1) + 1];
+}
+
+static void gray8aToPacked24(const uint8_t *src, uint8_t *dst, int num_pixels,
+ const uint8_t *palette)
+{
+ int i;
+
+ for (i = 0; i < num_pixels; i++) {
+ //FIXME slow?
+ dst[0] = palette[src[i << 1] * 4 + 0];
+ dst[1] = palette[src[i << 1] * 4 + 1];
+ dst[2] = palette[src[i << 1] * 4 + 2];
+ dst += 3;
+ }
+}
+
+static int packed_16bpc_bswap(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ int i, j;
+ int srcstr = srcStride[0] >> 1;
+ int dststr = dstStride[0] >> 1;
+ uint16_t *dstPtr = (uint16_t *) dst[0];
+ const uint16_t *srcPtr = (const uint16_t *) src[0];
+ int min_stride = FFMIN(srcstr, dststr);
+
+ for (i = 0; i < srcSliceH; i++) {
+ for (j = 0; j < min_stride; j++) {
+ dstPtr[j] = av_bswap16(srcPtr[j]);
+ }
+ srcPtr += srcstr;
+ dstPtr += dststr;
+ }
+
+ return srcSliceH;
+}
+
+static int palToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[],
+ int srcSliceY, int srcSliceH, uint8_t *dst[],
+ int dstStride[])
+{
+ const enum AVPixelFormat srcFormat = c->srcFormat;
+ const enum AVPixelFormat dstFormat = c->dstFormat;
+ void (*conv)(const uint8_t *src, uint8_t *dst, int num_pixels,
+ const uint8_t *palette) = NULL;
+ int i;
+ uint8_t *dstPtr = dst[0] + dstStride[0] * srcSliceY;
+ const uint8_t *srcPtr = src[0];
+
+ if (srcFormat == AV_PIX_FMT_GRAY8A) {
+ switch (dstFormat) {
+ case AV_PIX_FMT_RGB32 : conv = gray8aToPacked32; break;
+ case AV_PIX_FMT_BGR32 : conv = gray8aToPacked32; break;
+ case AV_PIX_FMT_BGR32_1: conv = gray8aToPacked32_1; break;
+ case AV_PIX_FMT_RGB32_1: conv = gray8aToPacked32_1; break;
+ case AV_PIX_FMT_RGB24 : conv = gray8aToPacked24; break;
+ case AV_PIX_FMT_BGR24 : conv = gray8aToPacked24; break;
+ }
+ } else if (usePal(srcFormat)) {
+ switch (dstFormat) {
+ case AV_PIX_FMT_RGB32 : conv = sws_convertPalette8ToPacked32; break;
+ case AV_PIX_FMT_BGR32 : conv = sws_convertPalette8ToPacked32; break;
+ case AV_PIX_FMT_BGR32_1: conv = sws_convertPalette8ToPacked32; break;
+ case AV_PIX_FMT_RGB32_1: conv = sws_convertPalette8ToPacked32; break;
+ case AV_PIX_FMT_RGB24 : conv = sws_convertPalette8ToPacked24; break;
+ case AV_PIX_FMT_BGR24 : conv = sws_convertPalette8ToPacked24; break;
+ }
+ }
+
+ if (!conv)
+ av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
+ av_get_pix_fmt_name(srcFormat), av_get_pix_fmt_name(dstFormat));
+ else {
+ for (i = 0; i < srcSliceH; i++) {
+ conv(srcPtr, dstPtr, c->srcW, (uint8_t *) c->pal_rgb);
+ srcPtr += srcStride[0];
+ dstPtr += dstStride[0];
+ }
+ }
+
+ return srcSliceH;
+}
+
+static void gbr24ptopacked24(const uint8_t *src[], int srcStride[],
+ uint8_t *dst, int dstStride, int srcSliceH,
+ int width)
+{
+ int x, h, i;
+ for (h = 0; h < srcSliceH; h++) {
+ uint8_t *dest = dst + dstStride * h;
+ for (x = 0; x < width; x++) {
+ *dest++ = src[0][x];
+ *dest++ = src[1][x];
+ *dest++ = src[2][x];
+ }
+
+ for (i = 0; i < 3; i++)
+ src[i] += srcStride[i];
+ }
+}
+
+static void gbr24ptopacked32(const uint8_t *src[], int srcStride[],
+ uint8_t *dst, int dstStride, int srcSliceH,
+ int alpha_first, int width)
+{
+ int x, h, i;
+ for (h = 0; h < srcSliceH; h++) {
+ uint8_t *dest = dst + dstStride * h;
+
+ if (alpha_first) {
+ for (x = 0; x < width; x++) {
+ *dest++ = 0xff;
+ *dest++ = src[0][x];
+ *dest++ = src[1][x];
+ *dest++ = src[2][x];
+ }
+ } else {
+ for (x = 0; x < width; x++) {
+ *dest++ = src[0][x];
+ *dest++ = src[1][x];
+ *dest++ = src[2][x];
+ *dest++ = 0xff;
+ }
+ }
+
+ for (i = 0; i < 3; i++)
+ src[i] += srcStride[i];
+ }
+}
+
+static int planarRgbToRgbWrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ int alpha_first = 0;
+ const uint8_t *src102[] = { src[1], src[0], src[2] };
+ const uint8_t *src201[] = { src[2], src[0], src[1] };
+ int stride102[] = { srcStride[1], srcStride[0], srcStride[2] };
+ int stride201[] = { srcStride[2], srcStride[0], srcStride[1] };
+
+ if (c->srcFormat != AV_PIX_FMT_GBRP) {
+ av_log(c, AV_LOG_ERROR, "unsupported planar RGB conversion %s -> %s\n",
+ av_get_pix_fmt_name(c->srcFormat),
+ av_get_pix_fmt_name(c->dstFormat));
+ return srcSliceH;
+ }
+
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_BGR24:
+ gbr24ptopacked24(src102, stride102,
+ dst[0] + srcSliceY * dstStride[0], dstStride[0],
+ srcSliceH, c->srcW);
+ break;
+
+ case AV_PIX_FMT_RGB24:
+ gbr24ptopacked24(src201, stride201,
+ dst[0] + srcSliceY * dstStride[0], dstStride[0],
+ srcSliceH, c->srcW);
+ break;
+
+ case AV_PIX_FMT_ARGB:
+ alpha_first = 1;
+ case AV_PIX_FMT_RGBA:
+ gbr24ptopacked32(src201, stride201,
+ dst[0] + srcSliceY * dstStride[0], dstStride[0],
+ srcSliceH, alpha_first, c->srcW);
+ break;
+
+ case AV_PIX_FMT_ABGR:
+ alpha_first = 1;
+ case AV_PIX_FMT_BGRA:
+ gbr24ptopacked32(src102, stride102,
+ dst[0] + srcSliceY * dstStride[0], dstStride[0],
+ srcSliceH, alpha_first, c->srcW);
+ break;
+
+ default:
+ av_log(c, AV_LOG_ERROR,
+ "unsupported planar RGB conversion %s -> %s\n",
+ av_get_pix_fmt_name(c->srcFormat),
+ av_get_pix_fmt_name(c->dstFormat));
+ }
+
+ return srcSliceH;
+}
+
+static void packedtogbr24p(const uint8_t *src, int srcStride,
+ uint8_t *dst[], int dstStride[], int srcSliceH,
+ int alpha_first, int inc_size, int width)
+{
+ uint8_t *dest[3];
+ int x, h;
+
+ dest[0] = dst[0];
+ dest[1] = dst[1];
+ dest[2] = dst[2];
+
+ if (alpha_first)
+ src++;
+
+ for (h = 0; h < srcSliceH; h++) {
+ for (x = 0; x < width; x++) {
+ dest[0][x] = src[0];
+ dest[1][x] = src[1];
+ dest[2][x] = src[2];
+
+ src += inc_size;
+ }
+ src += srcStride - width * inc_size;
+ dest[0] += dstStride[0];
+ dest[1] += dstStride[1];
+ dest[2] += dstStride[2];
+ }
+}
+
+static int rgbToPlanarRgbWrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ int alpha_first = 0;
+ int stride102[] = { dstStride[1], dstStride[0], dstStride[2] };
+ int stride201[] = { dstStride[2], dstStride[0], dstStride[1] };
+ uint8_t *dst102[] = { dst[1] + srcSliceY * dstStride[1],
+ dst[0] + srcSliceY * dstStride[0],
+ dst[2] + srcSliceY * dstStride[2] };
+ uint8_t *dst201[] = { dst[2] + srcSliceY * dstStride[2],
+ dst[0] + srcSliceY * dstStride[0],
+ dst[1] + srcSliceY * dstStride[1] };
+
+ switch (c->srcFormat) {
+ case AV_PIX_FMT_RGB24:
+ packedtogbr24p((const uint8_t *) src[0], srcStride[0], dst201,
+ stride201, srcSliceH, alpha_first, 3, c->srcW);
+ break;
+ case AV_PIX_FMT_BGR24:
+ packedtogbr24p((const uint8_t *) src[0], srcStride[0], dst102,
+ stride102, srcSliceH, alpha_first, 3, c->srcW);
+ break;
+ case AV_PIX_FMT_ARGB:
+ alpha_first = 1;
+ case AV_PIX_FMT_RGBA:
+ packedtogbr24p((const uint8_t *) src[0], srcStride[0], dst201,
+ stride201, srcSliceH, alpha_first, 4, c->srcW);
+ break;
+ case AV_PIX_FMT_ABGR:
+ alpha_first = 1;
+ case AV_PIX_FMT_BGRA:
+ packedtogbr24p((const uint8_t *) src[0], srcStride[0], dst102,
+ stride102, srcSliceH, alpha_first, 4, c->srcW);
+ break;
+ default:
+ av_log(c, AV_LOG_ERROR,
+ "unsupported planar RGB conversion %s -> %s\n",
+ av_get_pix_fmt_name(c->srcFormat),
+ av_get_pix_fmt_name(c->dstFormat));
+ }
+
+ return srcSliceH;
+}
+
+#define isRGBA32(x) ( \
+ (x) == AV_PIX_FMT_ARGB \
+ || (x) == AV_PIX_FMT_RGBA \
+ || (x) == AV_PIX_FMT_BGRA \
+ || (x) == AV_PIX_FMT_ABGR \
+ )
+
+#define isRGBA64(x) ( \
+ (x) == AV_PIX_FMT_RGBA64LE \
+ || (x) == AV_PIX_FMT_RGBA64BE \
+ || (x) == AV_PIX_FMT_BGRA64LE \
+ || (x) == AV_PIX_FMT_BGRA64BE \
+ )
+
+#define isRGB48(x) ( \
+ (x) == AV_PIX_FMT_RGB48LE \
+ || (x) == AV_PIX_FMT_RGB48BE \
+ || (x) == AV_PIX_FMT_BGR48LE \
+ || (x) == AV_PIX_FMT_BGR48BE \
+ )
+
+/* {RGB,BGR}{15,16,24,32,32_1} -> {RGB,BGR}{15,16,24,32} */
+typedef void (* rgbConvFn) (const uint8_t *, uint8_t *, int);
+static rgbConvFn findRgbConvFn(SwsContext *c)
+{
+ const enum AVPixelFormat srcFormat = c->srcFormat;
+ const enum AVPixelFormat dstFormat = c->dstFormat;
+ const int srcId = c->srcFormatBpp;
+ const int dstId = c->dstFormatBpp;
+ rgbConvFn conv = NULL;
+
+#define IS_NOT_NE(bpp, desc) \
+ (((bpp + 7) >> 3) == 2 && \
+ (!(desc->flags & PIX_FMT_BE) != !HAVE_BIGENDIAN))
+
+#define CONV_IS(src, dst) (srcFormat == AV_PIX_FMT_##src && dstFormat == AV_PIX_FMT_##dst)
+
+ if (isRGBA32(srcFormat) && isRGBA32(dstFormat)) {
+ if ( CONV_IS(ABGR, RGBA)
+ || CONV_IS(ARGB, BGRA)
+ || CONV_IS(BGRA, ARGB)
+ || CONV_IS(RGBA, ABGR)) conv = shuffle_bytes_3210;
+ else if (CONV_IS(ABGR, ARGB)
+ || CONV_IS(ARGB, ABGR)) conv = shuffle_bytes_0321;
+ else if (CONV_IS(ABGR, BGRA)
+ || CONV_IS(ARGB, RGBA)) conv = shuffle_bytes_1230;
+ else if (CONV_IS(BGRA, RGBA)
+ || CONV_IS(RGBA, BGRA)) conv = shuffle_bytes_2103;
+ else if (CONV_IS(BGRA, ABGR)
+ || CONV_IS(RGBA, ARGB)) conv = shuffle_bytes_3012;
+ } else if (isRGB48(srcFormat) && isRGB48(dstFormat)) {
+ if (CONV_IS(RGB48LE, BGR48LE)
+ || CONV_IS(BGR48LE, RGB48LE)
+ || CONV_IS(RGB48BE, BGR48BE)
+ || CONV_IS(BGR48BE, RGB48BE)) conv = rgb48tobgr48_nobswap;
+ else if (CONV_IS(RGB48LE, BGR48BE)
+ || CONV_IS(BGR48LE, RGB48BE)
+ || CONV_IS(RGB48BE, BGR48LE)
+ || CONV_IS(BGR48BE, RGB48LE)) conv = rgb48tobgr48_bswap;
+ } else if (isRGBA64(srcFormat) && isRGB48(dstFormat)) {
+ if (CONV_IS(RGBA64LE, BGR48LE)
+ || CONV_IS(BGRA64LE, RGB48LE)
+ || CONV_IS(RGBA64BE, BGR48BE)
+ || CONV_IS(BGRA64BE, RGB48BE)) conv = rgb64tobgr48_nobswap;
+ else if (CONV_IS(RGBA64LE, BGR48BE)
+ || CONV_IS(BGRA64LE, RGB48BE)
+ || CONV_IS(RGBA64BE, BGR48LE)
+ || CONV_IS(BGRA64BE, RGB48LE)) conv = rgb64tobgr48_bswap;
+ else if (CONV_IS(RGBA64LE, RGB48LE)
+ || CONV_IS(BGRA64LE, BGR48LE)
+ || CONV_IS(RGBA64BE, RGB48BE)
+ || CONV_IS(BGRA64BE, BGR48BE)) conv = rgb64to48_nobswap;
+ else if (CONV_IS(RGBA64LE, RGB48BE)
+ || CONV_IS(BGRA64LE, BGR48BE)
+ || CONV_IS(RGBA64BE, RGB48LE)
+ || CONV_IS(BGRA64BE, BGR48LE)) conv = rgb64to48_bswap;
+ } else
+ /* BGR -> BGR */
+ if ((isBGRinInt(srcFormat) && isBGRinInt(dstFormat)) ||
+ (isRGBinInt(srcFormat) && isRGBinInt(dstFormat))) {
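+ /*
+  * The switch key packs the source bpp in the low 16 bits and the destination
+  * bpp in the high 16 bits, e.g. 24 bpp -> 16 bpp gives
+  * 0x18 | (0x10 << 16) == 0x00100018, which selects rgb24to16 below.
+  */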
+ switch (srcId | (dstId << 16)) {
+ case 0x000F000C: conv = rgb12to15; break;
+ case 0x000F0010: conv = rgb16to15; break;
+ case 0x000F0018: conv = rgb24to15; break;
+ case 0x000F0020: conv = rgb32to15; break;
+ case 0x0010000F: conv = rgb15to16; break;
+ case 0x00100018: conv = rgb24to16; break;
+ case 0x00100020: conv = rgb32to16; break;
+ case 0x0018000F: conv = rgb15to24; break;
+ case 0x00180010: conv = rgb16to24; break;
+ case 0x00180020: conv = rgb32to24; break;
+ case 0x0020000F: conv = rgb15to32; break;
+ case 0x00200010: conv = rgb16to32; break;
+ case 0x00200018: conv = rgb24to32; break;
+ }
+ } else if ((isBGRinInt(srcFormat) && isRGBinInt(dstFormat)) ||
+ (isRGBinInt(srcFormat) && isBGRinInt(dstFormat))) {
+ switch (srcId | (dstId << 16)) {
+ case 0x000C000C: conv = rgb12tobgr12; break;
+ case 0x000F000F: conv = rgb15tobgr15; break;
+ case 0x000F0010: conv = rgb16tobgr15; break;
+ case 0x000F0018: conv = rgb24tobgr15; break;
+ case 0x000F0020: conv = rgb32tobgr15; break;
+ case 0x0010000F: conv = rgb15tobgr16; break;
+ case 0x00100010: conv = rgb16tobgr16; break;
+ case 0x00100018: conv = rgb24tobgr16; break;
+ case 0x00100020: conv = rgb32tobgr16; break;
+ case 0x0018000F: conv = rgb15tobgr24; break;
+ case 0x00180010: conv = rgb16tobgr24; break;
+ case 0x00180018: conv = rgb24tobgr24; break;
+ case 0x00180020: conv = rgb32tobgr24; break;
+ case 0x0020000F: conv = rgb15tobgr32; break;
+ case 0x00200010: conv = rgb16tobgr32; break;
+ case 0x00200018: conv = rgb24tobgr32; break;
+ }
+ }
+
+ if ((dstFormat == AV_PIX_FMT_RGB32_1 || dstFormat == AV_PIX_FMT_BGR32_1) && !isRGBA32(srcFormat) && ALT32_CORR<0)
+ return NULL;
+
+ return conv;
+}
+
+/* {RGB,BGR}{15,16,24,32,32_1} -> {RGB,BGR}{15,16,24,32} */
+static int rgbToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[],
+ int srcSliceY, int srcSliceH, uint8_t *dst[],
+ int dstStride[])
+
+{
+ const enum AVPixelFormat srcFormat = c->srcFormat;
+ const enum AVPixelFormat dstFormat = c->dstFormat;
+ const AVPixFmtDescriptor *desc_src = av_pix_fmt_desc_get(c->srcFormat);
+ const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(c->dstFormat);
+ const int srcBpp = (c->srcFormatBpp + 7) >> 3;
+ const int dstBpp = (c->dstFormatBpp + 7) >> 3;
+ rgbConvFn conv = findRgbConvFn(c);
+
+ if (!conv) {
+ av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
+ av_get_pix_fmt_name(srcFormat), av_get_pix_fmt_name(dstFormat));
+ } else {
+ const uint8_t *srcPtr = src[0];
+ uint8_t *dstPtr = dst[0];
+ int src_bswap = IS_NOT_NE(c->srcFormatBpp, desc_src);
+ int dst_bswap = IS_NOT_NE(c->dstFormatBpp, desc_dst);
+
+ if ((srcFormat == AV_PIX_FMT_RGB32_1 || srcFormat == AV_PIX_FMT_BGR32_1) &&
+ !isRGBA32(dstFormat))
+ srcPtr += ALT32_CORR;
+
+ if ((dstFormat == AV_PIX_FMT_RGB32_1 || dstFormat == AV_PIX_FMT_BGR32_1) &&
+ !isRGBA32(srcFormat))
+ dstPtr += ALT32_CORR;
+
+ if (dstStride[0] * srcBpp == srcStride[0] * dstBpp && srcStride[0] > 0 &&
+ !(srcStride[0] % srcBpp) && !dst_bswap && !src_bswap)
+ conv(srcPtr, dstPtr + dstStride[0] * srcSliceY,
+ srcSliceH * srcStride[0]);
+ else {
+ int i, j;
+ dstPtr += dstStride[0] * srcSliceY;
+
+ for (i = 0; i < srcSliceH; i++) {
+ if(src_bswap) {
+ for(j=0; j<c->srcW; j++)
+ ((uint16_t*)c->formatConvBuffer)[j] = av_bswap16(((uint16_t*)srcPtr)[j]);
+ conv(c->formatConvBuffer, dstPtr, c->srcW * srcBpp);
+ }else
+ conv(srcPtr, dstPtr, c->srcW * srcBpp);
+ if(dst_bswap)
+ for(j=0; j<c->srcW; j++)
+ ((uint16_t*)dstPtr)[j] = av_bswap16(((uint16_t*)dstPtr)[j]);
+ srcPtr += srcStride[0];
+ dstPtr += dstStride[0];
+ }
+ }
+ }
+ return srcSliceH;
+}
+
+static int bgr24ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ rgb24toyv12(
+ src[0],
+ dst[0] + srcSliceY * dstStride[0],
+ dst[1] + (srcSliceY >> 1) * dstStride[1],
+ dst[2] + (srcSliceY >> 1) * dstStride[2],
+ c->srcW, srcSliceH,
+ dstStride[0], dstStride[1], srcStride[0]);
+ if (dst[3])
+ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
+ return srcSliceH;
+}
+
+static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+ dst[0], dstStride[0]);
+
+ planar2x(src[1], dst[1] + dstStride[1] * (srcSliceY >> 1), c->chrSrcW,
+ srcSliceH >> 2, srcStride[1], dstStride[1]);
+ planar2x(src[2], dst[2] + dstStride[2] * (srcSliceY >> 1), c->chrSrcW,
+ srcSliceH >> 2, srcStride[2], dstStride[2]);
+ if (dst[3])
+ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
+ return srcSliceH;
+}
+
+/* unscaled copy-like stuff (assumes nearly identical formats) */
+static int packedCopyWrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ if (dstStride[0] == srcStride[0] && srcStride[0] > 0)
+ memcpy(dst[0] + dstStride[0] * srcSliceY, src[0], srcSliceH * dstStride[0]);
+ else {
+ int i;
+ const uint8_t *srcPtr = src[0];
+ uint8_t *dstPtr = dst[0] + dstStride[0] * srcSliceY;
+ int length = 0;
+
+ /* universal length finder */
+ while (length + c->srcW <= FFABS(dstStride[0]) &&
+ length + c->srcW <= FFABS(srcStride[0]))
+ length += c->srcW;
+ av_assert1(length != 0);
+
+ for (i = 0; i < srcSliceH; i++) {
+ memcpy(dstPtr, srcPtr, length);
+ srcPtr += srcStride[0];
+ dstPtr += dstStride[0];
+ }
+ }
+ return srcSliceH;
+}
+
+#define DITHER_COPY(dst, dstStride, src, srcStride, bswap, dbswap)\
+ uint16_t scale= dither_scale[dst_depth-1][src_depth-1];\
+ int shift= src_depth-dst_depth + dither_scale[src_depth-2][dst_depth-1];\
+ for (i = 0; i < height; i++) {\
+ const uint8_t *dither= dithers[src_depth-9][i&7];\
+ for (j = 0; j < length-7; j+=8){\
+ dst[j+0] = dbswap((bswap(src[j+0]) + dither[0])*scale>>shift);\
+ dst[j+1] = dbswap((bswap(src[j+1]) + dither[1])*scale>>shift);\
+ dst[j+2] = dbswap((bswap(src[j+2]) + dither[2])*scale>>shift);\
+ dst[j+3] = dbswap((bswap(src[j+3]) + dither[3])*scale>>shift);\
+ dst[j+4] = dbswap((bswap(src[j+4]) + dither[4])*scale>>shift);\
+ dst[j+5] = dbswap((bswap(src[j+5]) + dither[5])*scale>>shift);\
+ dst[j+6] = dbswap((bswap(src[j+6]) + dither[6])*scale>>shift);\
+ dst[j+7] = dbswap((bswap(src[j+7]) + dither[7])*scale>>shift);\
+ }\
+ for (; j < length; j++)\
+ dst[j] = dbswap((bswap(src[j]) + dither[j&7])*scale>>shift);\
+ dst += dstStride;\
+ src += srcStride;\
+ }
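+/* DITHER_COPY above performs a dithered depth conversion while copying: each
+ * source sample is byte-swapped if needed (bswap), offset by an 8-entry
+ * per-line dither row, then mapped from src_depth to dst_depth bits via the
+ * precomputed dither_scale multiplier and shift, and finally byte-swapped for
+ * the output if needed (dbswap). The dither row spreads the truncation error
+ * across neighbouring pixels and lines. */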
+
+static int planarCopyWrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ const AVPixFmtDescriptor *desc_src = av_pix_fmt_desc_get(c->srcFormat);
+ const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(c->dstFormat);
+ int plane, i, j;
+ for (plane = 0; plane < 4; plane++) {
+ int length = (plane == 0 || plane == 3) ? c->srcW : -((-c->srcW ) >> c->chrDstHSubSample);
+ int y = (plane == 0 || plane == 3) ? srcSliceY: -((-srcSliceY) >> c->chrDstVSubSample);
+ int height = (plane == 0 || plane == 3) ? srcSliceH: -((-srcSliceH) >> c->chrDstVSubSample);
+ const uint8_t *srcPtr = src[plane];
+ uint8_t *dstPtr = dst[plane] + dstStride[plane] * y;
+ int shiftonly= plane==1 || plane==2 || (!c->srcRange && plane==0);
+
+ if (!dst[plane])
+ continue;
+ // ignore palette for GRAY8
+ if (plane == 1 && !dst[2]) continue;
+ if (!src[plane] || (plane == 1 && !src[2])) {
+ if (is16BPS(c->dstFormat) || isNBPS(c->dstFormat)) {
+ fillPlane16(dst[plane], dstStride[plane], length, height, y,
+ plane == 3, desc_dst->comp[plane].depth_minus1,
+ isBE(c->dstFormat));
+ } else {
+ fillPlane(dst[plane], dstStride[plane], length, height, y,
+ (plane == 3) ? 255 : 128);
+ }
+ } else {
+ if(isNBPS(c->srcFormat) || isNBPS(c->dstFormat)
+ || (is16BPS(c->srcFormat) != is16BPS(c->dstFormat))
+ ) {
+ const int src_depth = desc_src->comp[plane].depth_minus1 + 1;
+ const int dst_depth = desc_dst->comp[plane].depth_minus1 + 1;
+ const uint16_t *srcPtr2 = (const uint16_t *) srcPtr;
+ uint16_t *dstPtr2 = (uint16_t*)dstPtr;
+
+ if (dst_depth == 8) {
+ if(isBE(c->srcFormat) == HAVE_BIGENDIAN){
+ DITHER_COPY(dstPtr, dstStride[plane], srcPtr2, srcStride[plane]/2, , )
+ } else {
+ DITHER_COPY(dstPtr, dstStride[plane], srcPtr2, srcStride[plane]/2, av_bswap16, )
+ }
+ } else if (src_depth == 8) {
+ for (i = 0; i < height; i++) {
+ #define COPY816(w)\
+ if(shiftonly){\
+ for (j = 0; j < length; j++)\
+ w(&dstPtr2[j], srcPtr[j]<<(dst_depth-8));\
+ }else{\
+ for (j = 0; j < length; j++)\
+ w(&dstPtr2[j], (srcPtr[j]<<(dst_depth-8)) |\
+ (srcPtr[j]>>(2*8-dst_depth)));\
+ }
+ if(isBE(c->dstFormat)){
+ COPY816(AV_WB16)
+ } else {
+ COPY816(AV_WL16)
+ }
+ dstPtr2 += dstStride[plane]/2;
+ srcPtr += srcStride[plane];
+ }
+ } else if (src_depth <= dst_depth) {
+ int orig_length = length;
+ for (i = 0; i < height; i++) {
+ if(isBE(c->srcFormat) == HAVE_BIGENDIAN &&
+ isBE(c->dstFormat) == HAVE_BIGENDIAN &&
+ shiftonly) {
+ unsigned shift = dst_depth - src_depth;
+ length = orig_length;
+#if HAVE_FAST_64BIT
+#define FAST_COPY_UP(shift) \
+ for (j = 0; j < length - 3; j += 4) { \
+ uint64_t v = AV_RN64A(srcPtr2 + j); \
+ AV_WN64A(dstPtr2 + j, v << shift); \
+ } \
+ length &= 3;
+#else
+#define FAST_COPY_UP(shift) \
+ for (j = 0; j < length - 1; j += 2) { \
+ uint32_t v = AV_RN32A(srcPtr2 + j); \
+ AV_WN32A(dstPtr2 + j, v << shift); \
+ } \
+ length &= 1;
+#endif
+ switch (shift)
+ {
+ case 6: FAST_COPY_UP(6); break;
+ case 7: FAST_COPY_UP(7); break;
+ }
+ }
+#define COPY_UP(r,w) \
+ if(shiftonly){\
+ for (j = 0; j < length; j++){ \
+ unsigned int v= r(&srcPtr2[j]);\
+ w(&dstPtr2[j], v<<(dst_depth-src_depth));\
+ }\
+ }else{\
+ for (j = 0; j < length; j++){ \
+ unsigned int v= r(&srcPtr2[j]);\
+ w(&dstPtr2[j], (v<<(dst_depth-src_depth)) | \
+ (v>>(2*src_depth-dst_depth)));\
+ }\
+ }
+ if(isBE(c->srcFormat)){
+ if(isBE(c->dstFormat)){
+ COPY_UP(AV_RB16, AV_WB16)
+ } else {
+ COPY_UP(AV_RB16, AV_WL16)
+ }
+ } else {
+ if(isBE(c->dstFormat)){
+ COPY_UP(AV_RL16, AV_WB16)
+ } else {
+ COPY_UP(AV_RL16, AV_WL16)
+ }
+ }
+ dstPtr2 += dstStride[plane]/2;
+ srcPtr2 += srcStride[plane]/2;
+ }
+ } else {
+ if(isBE(c->srcFormat) == HAVE_BIGENDIAN){
+ if(isBE(c->dstFormat) == HAVE_BIGENDIAN){
+ DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, , )
+ } else {
+ DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, , av_bswap16)
+ }
+ }else{
+ if(isBE(c->dstFormat) == HAVE_BIGENDIAN){
+ DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, av_bswap16, )
+ } else {
+ DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, av_bswap16, av_bswap16)
+ }
+ }
+ }
+ } else if (is16BPS(c->srcFormat) && is16BPS(c->dstFormat) &&
+ isBE(c->srcFormat) != isBE(c->dstFormat)) {
+
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < length; j++)
+ ((uint16_t *) dstPtr)[j] = av_bswap16(((const uint16_t *) srcPtr)[j]);
+ srcPtr += srcStride[plane];
+ dstPtr += dstStride[plane];
+ }
+ } else if (dstStride[plane] == srcStride[plane] &&
+ srcStride[plane] > 0 && srcStride[plane] == length) {
+ memcpy(dst[plane] + dstStride[plane] * y, src[plane],
+ height * dstStride[plane]);
+ } else {
+ if (is16BPS(c->srcFormat) && is16BPS(c->dstFormat))
+ length *= 2;
+ else if (!desc_src->comp[0].depth_minus1)
+ length >>= 3; // monowhite/black
+ for (i = 0; i < height; i++) {
+ memcpy(dstPtr, srcPtr, length);
+ srcPtr += srcStride[plane];
+ dstPtr += dstStride[plane];
+ }
+ }
+ }
+ }
+ return srcSliceH;
+}
+
+
+#define IS_DIFFERENT_ENDIANESS(src_fmt, dst_fmt, pix_fmt) \
+ ((src_fmt == pix_fmt ## BE && dst_fmt == pix_fmt ## LE) || \
+ (src_fmt == pix_fmt ## LE && dst_fmt == pix_fmt ## BE))
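+/* e.g. IS_DIFFERENT_ENDIANESS(AV_PIX_FMT_RGB48LE, AV_PIX_FMT_RGB48BE,
+ * AV_PIX_FMT_RGB48) is true: the token-pasted BE/LE pair only matches when
+ * source and destination share the same layout with opposite byte order,
+ * which is exactly the case packed_16bpc_bswap handles below. */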
+
+
+void ff_get_unscaled_swscale(SwsContext *c)
+{
+ const enum AVPixelFormat srcFormat = c->srcFormat;
+ const enum AVPixelFormat dstFormat = c->dstFormat;
+ const int flags = c->flags;
+ const int dstH = c->dstH;
+ int needsDither;
+
+ needsDither = isAnyRGB(dstFormat) &&
+ c->dstFormatBpp < 24 &&
+ (c->dstFormatBpp < c->srcFormatBpp || (!isAnyRGB(srcFormat)));
+
+ /* yv12_to_nv12 */
+ if ((srcFormat == AV_PIX_FMT_YUV420P || srcFormat == AV_PIX_FMT_YUVA420P) &&
+ (dstFormat == AV_PIX_FMT_NV12 || dstFormat == AV_PIX_FMT_NV21)) {
+ c->swScale = planarToNv12Wrapper;
+ }
+ /* yuv2bgr */
+ if ((srcFormat == AV_PIX_FMT_YUV420P || srcFormat == AV_PIX_FMT_YUV422P ||
+ srcFormat == AV_PIX_FMT_YUVA420P) && isAnyRGB(dstFormat) &&
+ !(flags & (SWS_ACCURATE_RND|SWS_ERROR_DIFFUSION)) && !(dstH & 1)) {
+ c->swScale = ff_yuv2rgb_get_func_ptr(c);
+ }
+
+ if (srcFormat == AV_PIX_FMT_YUV410P &&
+ (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
+ !(flags & SWS_BITEXACT)) {
+ c->swScale = yvu9ToYv12Wrapper;
+ }
+
+ /* bgr24toYV12 */
+ if (srcFormat == AV_PIX_FMT_BGR24 &&
+ (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
+ !(flags & SWS_ACCURATE_RND))
+ c->swScale = bgr24ToYv12Wrapper;
+
+    /* RGB/BGR -> RGB/BGR (forms that need no dithering) */
+ if (isAnyRGB(srcFormat) && isAnyRGB(dstFormat) && findRgbConvFn(c)
+ && (!needsDither || (c->flags&(SWS_FAST_BILINEAR|SWS_POINT))))
+ c->swScale= rgbToRgbWrapper;
+
+#define isByteRGB(f) ( \
+ f == AV_PIX_FMT_RGB32 || \
+ f == AV_PIX_FMT_RGB32_1 || \
+ f == AV_PIX_FMT_RGB24 || \
+ f == AV_PIX_FMT_BGR32 || \
+ f == AV_PIX_FMT_BGR32_1 || \
+ f == AV_PIX_FMT_BGR24)
+
+ if (srcFormat == AV_PIX_FMT_GBRP && isPlanar(srcFormat) && isByteRGB(dstFormat))
+ c->swScale = planarRgbToRgbWrapper;
+
+ if (av_pix_fmt_desc_get(srcFormat)->comp[0].depth_minus1 == 7 &&
+ isPackedRGB(srcFormat) && dstFormat == AV_PIX_FMT_GBRP)
+ c->swScale = rgbToPlanarRgbWrapper;
+
+ /* bswap 16 bits per pixel/component packed formats */
+ if (IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGR444) ||
+ IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGR48) ||
+ IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGRA64) ||
+ IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGR555) ||
+ IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGR565) ||
+ IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_GRAY16) ||
+ IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGB444) ||
+ IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGB48) ||
+ IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGBA64) ||
+ IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGB555) ||
+ IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGB565))
+ c->swScale = packed_16bpc_bswap;
+
+ if (usePal(srcFormat) && isByteRGB(dstFormat))
+ c->swScale = palToRgbWrapper;
+
+ if (srcFormat == AV_PIX_FMT_YUV422P) {
+ if (dstFormat == AV_PIX_FMT_YUYV422)
+ c->swScale = yuv422pToYuy2Wrapper;
+ else if (dstFormat == AV_PIX_FMT_UYVY422)
+ c->swScale = yuv422pToUyvyWrapper;
+ }
+
+    /* LQ converters if -sws 0 or -sws 4 */
+ if (c->flags&(SWS_FAST_BILINEAR|SWS_POINT)) {
+ /* yv12_to_yuy2 */
+ if (srcFormat == AV_PIX_FMT_YUV420P || srcFormat == AV_PIX_FMT_YUVA420P) {
+ if (dstFormat == AV_PIX_FMT_YUYV422)
+ c->swScale = planarToYuy2Wrapper;
+ else if (dstFormat == AV_PIX_FMT_UYVY422)
+ c->swScale = planarToUyvyWrapper;
+ }
+ }
+ if (srcFormat == AV_PIX_FMT_YUYV422 &&
+ (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))
+ c->swScale = yuyvToYuv420Wrapper;
+ if (srcFormat == AV_PIX_FMT_UYVY422 &&
+ (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))
+ c->swScale = uyvyToYuv420Wrapper;
+ if (srcFormat == AV_PIX_FMT_YUYV422 && dstFormat == AV_PIX_FMT_YUV422P)
+ c->swScale = yuyvToYuv422Wrapper;
+ if (srcFormat == AV_PIX_FMT_UYVY422 && dstFormat == AV_PIX_FMT_YUV422P)
+ c->swScale = uyvyToYuv422Wrapper;
+
+#define isPlanarGray(x) (isGray(x) && (x) != AV_PIX_FMT_GRAY8A)
+ /* simple copy */
+ if ( srcFormat == dstFormat ||
+ (srcFormat == AV_PIX_FMT_YUVA420P && dstFormat == AV_PIX_FMT_YUV420P) ||
+ (srcFormat == AV_PIX_FMT_YUV420P && dstFormat == AV_PIX_FMT_YUVA420P) ||
+ (isPlanarYUV(srcFormat) && isPlanarGray(dstFormat)) ||
+ (isPlanarYUV(dstFormat) && isPlanarGray(srcFormat)) ||
+ (isPlanarGray(dstFormat) && isPlanarGray(srcFormat)) ||
+ (isPlanarYUV(srcFormat) && isPlanarYUV(dstFormat) &&
+ c->chrDstHSubSample == c->chrSrcHSubSample &&
+ c->chrDstVSubSample == c->chrSrcVSubSample &&
+ dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21 &&
+ srcFormat != AV_PIX_FMT_NV12 && srcFormat != AV_PIX_FMT_NV21))
+ {
+ if (isPacked(c->srcFormat))
+ c->swScale = packedCopyWrapper;
+ else /* Planar YUV or gray */
+ c->swScale = planarCopyWrapper;
+ }
+
+ if (ARCH_BFIN)
+ ff_bfin_get_unscaled_swscale(c);
+ if (HAVE_ALTIVEC)
+ ff_swscale_get_unscaled_altivec(c);
+}
+
+/* Convert 8-bit paletted pixels to the packed 32-bit format of the palette entries */
+void sws_convertPalette8ToPacked32(const uint8_t *src, uint8_t *dst,
+ int num_pixels, const uint8_t *palette)
+{
+ int i;
+
+ for (i = 0; i < num_pixels; i++)
+ ((uint32_t *) dst)[i] = ((const uint32_t *) palette)[src[i]];
+}
+
+/* Palette format: ABCD -> dst format: ABC */
+void sws_convertPalette8ToPacked24(const uint8_t *src, uint8_t *dst,
+ int num_pixels, const uint8_t *palette)
+{
+ int i;
+
+ for (i = 0; i < num_pixels; i++) {
+ //FIXME slow?
+ dst[0] = palette[src[i] * 4 + 0];
+ dst[1] = palette[src[i] * 4 + 1];
+ dst[2] = palette[src[i] * 4 + 2];
+ dst += 3;
+ }
+}
diff --git a/ffmpeg/libswscale/utils.c b/ffmpeg/libswscale/utils.c
new file mode 100644
index 0000000..932cf94
--- /dev/null
+++ b/ffmpeg/libswscale/utils.c
@@ -0,0 +1,1882 @@
+/*
+ * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#define _SVID_SOURCE // needed for MAP_ANONYMOUS
+#define _DARWIN_C_SOURCE // needed for MAP_ANON
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#if HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#if defined(MAP_ANON) && !defined(MAP_ANONYMOUS)
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+#endif
+#if HAVE_VIRTUALALLOC
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/avutil.h"
+#include "libavutil/bswap.h"
+#include "libavutil/cpu.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "rgb2rgb.h"
+#include "swscale.h"
+#include "swscale_internal.h"
+
+unsigned swscale_version(void)
+{
+ av_assert0(LIBSWSCALE_VERSION_MICRO >= 100);
+ return LIBSWSCALE_VERSION_INT;
+}
+
+const char *swscale_configuration(void)
+{
+ return FFMPEG_CONFIGURATION;
+}
+
+const char *swscale_license(void)
+{
+#define LICENSE_PREFIX "libswscale license: "
+ return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
+}
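+/* The string-literal concatenation plus pointer offset above returns only the
+ * license text while keeping the "libswscale license: " prefix embedded in
+ * the binary (presumably so it remains easy to locate with strings/grep). */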
+
+#define RET 0xC3 // near return opcode for x86
+
+typedef struct FormatEntry {
+ int is_supported_in, is_supported_out;
+} FormatEntry;
+
+static const FormatEntry format_entries[AV_PIX_FMT_NB] = {
+ [AV_PIX_FMT_YUV420P] = { 1, 1 },
+ [AV_PIX_FMT_YUYV422] = { 1, 1 },
+ [AV_PIX_FMT_RGB24] = { 1, 1 },
+ [AV_PIX_FMT_BGR24] = { 1, 1 },
+ [AV_PIX_FMT_YUV422P] = { 1, 1 },
+ [AV_PIX_FMT_YUV444P] = { 1, 1 },
+ [AV_PIX_FMT_YUV410P] = { 1, 1 },
+ [AV_PIX_FMT_YUV411P] = { 1, 1 },
+ [AV_PIX_FMT_GRAY8] = { 1, 1 },
+ [AV_PIX_FMT_MONOWHITE] = { 1, 1 },
+ [AV_PIX_FMT_MONOBLACK] = { 1, 1 },
+ [AV_PIX_FMT_PAL8] = { 1, 0 },
+ [AV_PIX_FMT_YUVJ420P] = { 1, 1 },
+ [AV_PIX_FMT_YUVJ422P] = { 1, 1 },
+ [AV_PIX_FMT_YUVJ444P] = { 1, 1 },
+ [AV_PIX_FMT_UYVY422] = { 1, 1 },
+ [AV_PIX_FMT_UYYVYY411] = { 0, 0 },
+ [AV_PIX_FMT_BGR8] = { 1, 1 },
+ [AV_PIX_FMT_BGR4] = { 0, 1 },
+ [AV_PIX_FMT_BGR4_BYTE] = { 1, 1 },
+ [AV_PIX_FMT_RGB8] = { 1, 1 },
+ [AV_PIX_FMT_RGB4] = { 0, 1 },
+ [AV_PIX_FMT_RGB4_BYTE] = { 1, 1 },
+ [AV_PIX_FMT_NV12] = { 1, 1 },
+ [AV_PIX_FMT_NV21] = { 1, 1 },
+ [AV_PIX_FMT_ARGB] = { 1, 1 },
+ [AV_PIX_FMT_RGBA] = { 1, 1 },
+ [AV_PIX_FMT_ABGR] = { 1, 1 },
+ [AV_PIX_FMT_BGRA] = { 1, 1 },
+ [AV_PIX_FMT_0RGB] = { 1, 1 },
+ [AV_PIX_FMT_RGB0] = { 1, 1 },
+ [AV_PIX_FMT_0BGR] = { 1, 1 },
+ [AV_PIX_FMT_BGR0] = { 1, 1 },
+ [AV_PIX_FMT_GRAY16BE] = { 1, 1 },
+ [AV_PIX_FMT_GRAY16LE] = { 1, 1 },
+ [AV_PIX_FMT_YUV440P] = { 1, 1 },
+ [AV_PIX_FMT_YUVJ440P] = { 1, 1 },
+ [AV_PIX_FMT_YUVA420P] = { 1, 1 },
+ [AV_PIX_FMT_YUVA422P] = { 1, 1 },
+ [AV_PIX_FMT_YUVA444P] = { 1, 1 },
+ [AV_PIX_FMT_YUVA420P9BE] = { 1, 1 },
+ [AV_PIX_FMT_YUVA420P9LE] = { 1, 1 },
+ [AV_PIX_FMT_YUVA422P9BE] = { 1, 1 },
+ [AV_PIX_FMT_YUVA422P9LE] = { 1, 1 },
+ [AV_PIX_FMT_YUVA444P9BE] = { 1, 1 },
+ [AV_PIX_FMT_YUVA444P9LE] = { 1, 1 },
+ [AV_PIX_FMT_YUVA420P10BE]= { 1, 1 },
+ [AV_PIX_FMT_YUVA420P10LE]= { 1, 1 },
+ [AV_PIX_FMT_YUVA422P10BE]= { 1, 1 },
+ [AV_PIX_FMT_YUVA422P10LE]= { 1, 1 },
+ [AV_PIX_FMT_YUVA444P10BE]= { 1, 1 },
+ [AV_PIX_FMT_YUVA444P10LE]= { 1, 1 },
+ [AV_PIX_FMT_YUVA420P16BE]= { 1, 1 },
+ [AV_PIX_FMT_YUVA420P16LE]= { 1, 1 },
+ [AV_PIX_FMT_YUVA422P16BE]= { 1, 1 },
+ [AV_PIX_FMT_YUVA422P16LE]= { 1, 1 },
+ [AV_PIX_FMT_YUVA444P16BE]= { 1, 1 },
+ [AV_PIX_FMT_YUVA444P16LE]= { 1, 1 },
+ [AV_PIX_FMT_RGB48BE] = { 1, 1 },
+ [AV_PIX_FMT_RGB48LE] = { 1, 1 },
+ [AV_PIX_FMT_RGBA64BE] = { 1, 0 },
+ [AV_PIX_FMT_RGBA64LE] = { 1, 0 },
+ [AV_PIX_FMT_RGB565BE] = { 1, 1 },
+ [AV_PIX_FMT_RGB565LE] = { 1, 1 },
+ [AV_PIX_FMT_RGB555BE] = { 1, 1 },
+ [AV_PIX_FMT_RGB555LE] = { 1, 1 },
+ [AV_PIX_FMT_BGR565BE] = { 1, 1 },
+ [AV_PIX_FMT_BGR565LE] = { 1, 1 },
+ [AV_PIX_FMT_BGR555BE] = { 1, 1 },
+ [AV_PIX_FMT_BGR555LE] = { 1, 1 },
+ [AV_PIX_FMT_YUV420P16LE] = { 1, 1 },
+ [AV_PIX_FMT_YUV420P16BE] = { 1, 1 },
+ [AV_PIX_FMT_YUV422P16LE] = { 1, 1 },
+ [AV_PIX_FMT_YUV422P16BE] = { 1, 1 },
+ [AV_PIX_FMT_YUV444P16LE] = { 1, 1 },
+ [AV_PIX_FMT_YUV444P16BE] = { 1, 1 },
+ [AV_PIX_FMT_RGB444LE] = { 1, 1 },
+ [AV_PIX_FMT_RGB444BE] = { 1, 1 },
+ [AV_PIX_FMT_BGR444LE] = { 1, 1 },
+ [AV_PIX_FMT_BGR444BE] = { 1, 1 },
+ [AV_PIX_FMT_Y400A] = { 1, 0 },
+ [AV_PIX_FMT_BGR48BE] = { 1, 1 },
+ [AV_PIX_FMT_BGR48LE] = { 1, 1 },
+ [AV_PIX_FMT_BGRA64BE] = { 0, 0 },
+ [AV_PIX_FMT_BGRA64LE] = { 0, 0 },
+ [AV_PIX_FMT_YUV420P9BE] = { 1, 1 },
+ [AV_PIX_FMT_YUV420P9LE] = { 1, 1 },
+ [AV_PIX_FMT_YUV420P10BE] = { 1, 1 },
+ [AV_PIX_FMT_YUV420P10LE] = { 1, 1 },
+ [AV_PIX_FMT_YUV420P12BE] = { 1, 1 },
+ [AV_PIX_FMT_YUV420P12LE] = { 1, 1 },
+ [AV_PIX_FMT_YUV420P14BE] = { 1, 1 },
+ [AV_PIX_FMT_YUV420P14LE] = { 1, 1 },
+ [AV_PIX_FMT_YUV422P9BE] = { 1, 1 },
+ [AV_PIX_FMT_YUV422P9LE] = { 1, 1 },
+ [AV_PIX_FMT_YUV422P10BE] = { 1, 1 },
+ [AV_PIX_FMT_YUV422P10LE] = { 1, 1 },
+ [AV_PIX_FMT_YUV422P12BE] = { 1, 1 },
+ [AV_PIX_FMT_YUV422P12LE] = { 1, 1 },
+ [AV_PIX_FMT_YUV422P14BE] = { 1, 1 },
+ [AV_PIX_FMT_YUV422P14LE] = { 1, 1 },
+ [AV_PIX_FMT_YUV444P9BE] = { 1, 1 },
+ [AV_PIX_FMT_YUV444P9LE] = { 1, 1 },
+ [AV_PIX_FMT_YUV444P10BE] = { 1, 1 },
+ [AV_PIX_FMT_YUV444P10LE] = { 1, 1 },
+ [AV_PIX_FMT_YUV444P12BE] = { 1, 1 },
+ [AV_PIX_FMT_YUV444P12LE] = { 1, 1 },
+ [AV_PIX_FMT_YUV444P14BE] = { 1, 1 },
+ [AV_PIX_FMT_YUV444P14LE] = { 1, 1 },
+ [AV_PIX_FMT_GBRP] = { 1, 1 },
+ [AV_PIX_FMT_GBRP9LE] = { 1, 1 },
+ [AV_PIX_FMT_GBRP9BE] = { 1, 1 },
+ [AV_PIX_FMT_GBRP10LE] = { 1, 1 },
+ [AV_PIX_FMT_GBRP10BE] = { 1, 1 },
+ [AV_PIX_FMT_GBRP12LE] = { 1, 1 },
+ [AV_PIX_FMT_GBRP12BE] = { 1, 1 },
+ [AV_PIX_FMT_GBRP14LE] = { 1, 1 },
+ [AV_PIX_FMT_GBRP14BE] = { 1, 1 },
+ [AV_PIX_FMT_GBRP16LE] = { 1, 0 },
+ [AV_PIX_FMT_GBRP16BE] = { 1, 0 },
+};
+
+int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
+{
+ return (unsigned)pix_fmt < AV_PIX_FMT_NB ?
+ format_entries[pix_fmt].is_supported_in : 0;
+}
+
+int sws_isSupportedOutput(enum AVPixelFormat pix_fmt)
+{
+ return (unsigned)pix_fmt < AV_PIX_FMT_NB ?
+ format_entries[pix_fmt].is_supported_out : 0;
+}
+
+extern const int32_t ff_yuv2rgb_coeffs[8][4];
+
+#if FF_API_SWS_FORMAT_NAME
+const char *sws_format_name(enum AVPixelFormat format)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(format);
+ if (desc)
+ return desc->name;
+ else
+ return "Unknown format";
+}
+#endif
+
+static double getSplineCoeff(double a, double b, double c, double d,
+ double dist)
+{
+ if (dist <= 1.0)
+ return ((d * dist + c) * dist + b) * dist + a;
+ else
+ return getSplineCoeff(0.0,
+ b + 2.0 * c + 3.0 * d,
+ c + 3.0 * d,
+ -b - 3.0 * c - 6.0 * d,
+ dist - 1.0);
+}
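+/* getSplineCoeff() evaluates a piecewise cubic: for dist <= 1 it returns the
+ * Horner form a + dist*(b + dist*(c + dist*d)); for larger distances it
+ * recurses with coefficients rewritten for the next unit interval, so the
+ * spline tail used by SWS_SPLINE below decays segment by segment. */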
+
+static int initFilter(int16_t **outFilter, int32_t **filterPos,
+ int *outFilterSize, int xInc, int srcW, int dstW,
+ int filterAlign, int one, int flags, int cpu_flags,
+ SwsVector *srcFilter, SwsVector *dstFilter,
+ double param[2])
+{
+ int i;
+ int filterSize;
+ int filter2Size;
+ int minFilterSize;
+ int64_t *filter = NULL;
+ int64_t *filter2 = NULL;
+ const int64_t fone = 1LL << (54 - FFMIN(av_log2(srcW/dstW), 8));
+ int ret = -1;
+
+ emms_c(); // FIXME should not be required but IS (even for non-MMX versions)
+
+ // NOTE: the +3 is for the MMX(+1) / SSE(+3) scaler which reads over the end
+ FF_ALLOC_OR_GOTO(NULL, *filterPos, (dstW + 3) * sizeof(**filterPos), fail);
+
+ if (FFABS(xInc - 0x10000) < 10) { // unscaled
+ int i;
+ filterSize = 1;
+ FF_ALLOCZ_OR_GOTO(NULL, filter,
+ dstW * sizeof(*filter) * filterSize, fail);
+
+ for (i = 0; i < dstW; i++) {
+ filter[i * filterSize] = fone;
+ (*filterPos)[i] = i;
+ }
+ } else if (flags & SWS_POINT) { // lame looking point sampling mode
+ int i;
+ int64_t xDstInSrc;
+ filterSize = 1;
+ FF_ALLOC_OR_GOTO(NULL, filter,
+ dstW * sizeof(*filter) * filterSize, fail);
+
+ xDstInSrc = xInc / 2 - 0x8000;
+ for (i = 0; i < dstW; i++) {
+ int xx = (xDstInSrc - ((filterSize - 1) << 15) + (1 << 15)) >> 16;
+
+ (*filterPos)[i] = xx;
+ filter[i] = fone;
+ xDstInSrc += xInc;
+ }
+ } else if ((xInc <= (1 << 16) && (flags & SWS_AREA)) ||
+ (flags & SWS_FAST_BILINEAR)) { // bilinear upscale
+ int i;
+ int64_t xDstInSrc;
+ filterSize = 2;
+ FF_ALLOC_OR_GOTO(NULL, filter,
+ dstW * sizeof(*filter) * filterSize, fail);
+
+ xDstInSrc = xInc / 2 - 0x8000;
+ for (i = 0; i < dstW; i++) {
+ int xx = (xDstInSrc - ((filterSize - 1) << 15) + (1 << 15)) >> 16;
+ int j;
+
+ (*filterPos)[i] = xx;
+ // bilinear upscale / linear interpolate / area averaging
+ for (j = 0; j < filterSize; j++) {
+ int64_t coeff= fone - FFABS(((int64_t)xx<<16) - xDstInSrc)*(fone>>16);
+ if (coeff < 0)
+ coeff = 0;
+ filter[i * filterSize + j] = coeff;
+ xx++;
+ }
+ xDstInSrc += xInc;
+ }
+ } else {
+ int64_t xDstInSrc;
+ int sizeFactor;
+
+ if (flags & SWS_BICUBIC)
+ sizeFactor = 4;
+ else if (flags & SWS_X)
+ sizeFactor = 8;
+ else if (flags & SWS_AREA)
+ sizeFactor = 1; // downscale only, for upscale it is bilinear
+ else if (flags & SWS_GAUSS)
+ sizeFactor = 8; // infinite ;)
+ else if (flags & SWS_LANCZOS)
+ sizeFactor = param[0] != SWS_PARAM_DEFAULT ? ceil(2 * param[0]) : 6;
+ else if (flags & SWS_SINC)
+ sizeFactor = 20; // infinite ;)
+ else if (flags & SWS_SPLINE)
+ sizeFactor = 20; // infinite ;)
+ else if (flags & SWS_BILINEAR)
+ sizeFactor = 2;
+ else {
+ av_assert0(0);
+ }
+
+ if (xInc <= 1 << 16)
+ filterSize = 1 + sizeFactor; // upscale
+ else
+ filterSize = 1 + (sizeFactor * srcW + dstW - 1) / dstW;
+
+ filterSize = FFMIN(filterSize, srcW - 2);
+ filterSize = FFMAX(filterSize, 1);
+
+ FF_ALLOC_OR_GOTO(NULL, filter,
+ dstW * sizeof(*filter) * filterSize, fail);
+
+ xDstInSrc = xInc - 0x10000;
+ for (i = 0; i < dstW; i++) {
+ int xx = (xDstInSrc - ((filterSize - 2) << 16)) / (1 << 17);
+ int j;
+ (*filterPos)[i] = xx;
+ for (j = 0; j < filterSize; j++) {
+ int64_t d = (FFABS(((int64_t)xx << 17) - xDstInSrc)) << 13;
+ double floatd;
+ int64_t coeff;
+
+ if (xInc > 1 << 16)
+ d = d * dstW / srcW;
+ floatd = d * (1.0 / (1 << 30));
+
+ if (flags & SWS_BICUBIC) {
+ int64_t B = (param[0] != SWS_PARAM_DEFAULT ? param[0] : 0) * (1 << 24);
+ int64_t C = (param[1] != SWS_PARAM_DEFAULT ? param[1] : 0.6) * (1 << 24);
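+                    /* B and C are the two parameters of the classic
+                     * Mitchell-Netravali (B,C) cubic family, stored in 1<<24
+                     * fixed point; the defaults B=0, C=0.6 are close to a
+                     * Catmull-Rom kernel (B=0, C=0.5). */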
+
+ if (d >= 1LL << 31) {
+ coeff = 0.0;
+ } else {
+ int64_t dd = (d * d) >> 30;
+ int64_t ddd = (dd * d) >> 30;
+
+ if (d < 1LL << 30)
+ coeff = (12 * (1 << 24) - 9 * B - 6 * C) * ddd +
+ (-18 * (1 << 24) + 12 * B + 6 * C) * dd +
+ (6 * (1 << 24) - 2 * B) * (1 << 30);
+ else
+ coeff = (-B - 6 * C) * ddd +
+ (6 * B + 30 * C) * dd +
+ (-12 * B - 48 * C) * d +
+ (8 * B + 24 * C) * (1 << 30);
+ }
+ coeff /= (1LL<<54)/fone;
+ }
+#if 0
+ else if (flags & SWS_X) {
+ double p = param ? param * 0.01 : 0.3;
+ coeff = d ? sin(d * M_PI) / (d * M_PI) : 1.0;
+ coeff *= pow(2.0, -p * d * d);
+ }
+#endif
+ else if (flags & SWS_X) {
+ double A = param[0] != SWS_PARAM_DEFAULT ? param[0] : 1.0;
+ double c;
+
+ if (floatd < 1.0)
+ c = cos(floatd * M_PI);
+ else
+ c = -1.0;
+ if (c < 0.0)
+ c = -pow(-c, A);
+ else
+ c = pow(c, A);
+ coeff = (c * 0.5 + 0.5) * fone;
+ } else if (flags & SWS_AREA) {
+ int64_t d2 = d - (1 << 29);
+ if (d2 * xInc < -(1LL << (29 + 16)))
+ coeff = 1.0 * (1LL << (30 + 16));
+ else if (d2 * xInc < (1LL << (29 + 16)))
+ coeff = -d2 * xInc + (1LL << (29 + 16));
+ else
+ coeff = 0.0;
+ coeff *= fone >> (30 + 16);
+ } else if (flags & SWS_GAUSS) {
+ double p = param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;
+ coeff = (pow(2.0, -p * floatd * floatd)) * fone;
+ } else if (flags & SWS_SINC) {
+ coeff = (d ? sin(floatd * M_PI) / (floatd * M_PI) : 1.0) * fone;
+ } else if (flags & SWS_LANCZOS) {
+ double p = param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;
+ coeff = (d ? sin(floatd * M_PI) * sin(floatd * M_PI / p) /
+ (floatd * floatd * M_PI * M_PI / p) : 1.0) * fone;
+ if (floatd > p)
+ coeff = 0;
+ } else if (flags & SWS_BILINEAR) {
+ coeff = (1 << 30) - d;
+ if (coeff < 0)
+ coeff = 0;
+ coeff *= fone >> 30;
+ } else if (flags & SWS_SPLINE) {
+ double p = -2.196152422706632;
+ coeff = getSplineCoeff(1.0, 0.0, p, -p - 1.0, floatd) * fone;
+ } else {
+ av_assert0(0);
+ }
+
+ filter[i * filterSize + j] = coeff;
+ xx++;
+ }
+ xDstInSrc += 2 * xInc;
+ }
+ }
+
+ /* apply src & dst Filter to filter -> filter2
+ * av_free(filter);
+ */
+ av_assert0(filterSize > 0);
+ filter2Size = filterSize;
+ if (srcFilter)
+ filter2Size += srcFilter->length - 1;
+ if (dstFilter)
+ filter2Size += dstFilter->length - 1;
+ av_assert0(filter2Size > 0);
+ FF_ALLOCZ_OR_GOTO(NULL, filter2, filter2Size * dstW * sizeof(*filter2), fail);
+
+ for (i = 0; i < dstW; i++) {
+ int j, k;
+
+ if (srcFilter) {
+ for (k = 0; k < srcFilter->length; k++) {
+ for (j = 0; j < filterSize; j++)
+ filter2[i * filter2Size + k + j] +=
+ srcFilter->coeff[k] * filter[i * filterSize + j];
+ }
+ } else {
+ for (j = 0; j < filterSize; j++)
+ filter2[i * filter2Size + j] = filter[i * filterSize + j];
+ }
+ // FIXME dstFilter
+
+ (*filterPos)[i] += (filterSize - 1) / 2 - (filter2Size - 1) / 2;
+ }
+ av_freep(&filter);
+
+ /* try to reduce the filter-size (step1 find size and shift left) */
+ // Assume it is near normalized (*0.5 or *2.0 is OK but * 0.001 is not).
+ minFilterSize = 0;
+ for (i = dstW - 1; i >= 0; i--) {
+ int min = filter2Size;
+ int j;
+ int64_t cutOff = 0.0;
+
+ /* get rid of near zero elements on the left by shifting left */
+ for (j = 0; j < filter2Size; j++) {
+ int k;
+ cutOff += FFABS(filter2[i * filter2Size]);
+
+ if (cutOff > SWS_MAX_REDUCE_CUTOFF * fone)
+ break;
+
+ /* preserve monotonicity because the core can't handle the
+ * filter otherwise */
+ if (i < dstW - 1 && (*filterPos)[i] >= (*filterPos)[i + 1])
+ break;
+
+ // move filter coefficients left
+ for (k = 1; k < filter2Size; k++)
+ filter2[i * filter2Size + k - 1] = filter2[i * filter2Size + k];
+ filter2[i * filter2Size + k - 1] = 0;
+ (*filterPos)[i]++;
+ }
+
+ cutOff = 0;
+ /* count near zeros on the right */
+ for (j = filter2Size - 1; j > 0; j--) {
+ cutOff += FFABS(filter2[i * filter2Size + j]);
+
+ if (cutOff > SWS_MAX_REDUCE_CUTOFF * fone)
+ break;
+ min--;
+ }
+
+ if (min > minFilterSize)
+ minFilterSize = min;
+ }
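+    /* At this point minFilterSize is the widest span of non-negligible
+     * coefficients over all output pixels: taps whose running absolute sum
+     * stays below SWS_MAX_REDUCE_CUTOFF * fone were either shifted out on the
+     * left (adjusting filterPos) or counted off on the right. */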
+
+ if (HAVE_ALTIVEC && cpu_flags & AV_CPU_FLAG_ALTIVEC) {
+ // we can handle the special case 4, so we don't want to go the full 8
+ if (minFilterSize < 5)
+ filterAlign = 4;
+
+ /* We really don't want to waste our time doing useless computation, so
+ * fall back on the scalar C code for very small filters.
+ * Vectorizing is worth it only if you have a decent-sized vector. */
+ if (minFilterSize < 3)
+ filterAlign = 1;
+ }
+
+ if (INLINE_MMX(cpu_flags)) {
+ // special case for unscaled vertical filtering
+ if (minFilterSize == 1 && filterAlign == 2)
+ filterAlign = 1;
+ }
+
+ av_assert0(minFilterSize > 0);
+ filterSize = (minFilterSize + (filterAlign - 1)) & (~(filterAlign - 1));
+ av_assert0(filterSize > 0);
+ filter = av_malloc(filterSize * dstW * sizeof(*filter));
+ if (filterSize >= MAX_FILTER_SIZE * 16 /
+ ((flags & SWS_ACCURATE_RND) ? APCK_SIZE : 16) || !filter) {
+        av_log(NULL, AV_LOG_ERROR, "sws: filterSize %d is too large, try less extreme scaling or increase MAX_FILTER_SIZE and recompile\n", filterSize);
+ goto fail;
+ }
+ *outFilterSize = filterSize;
+
+ if (flags & SWS_PRINT_INFO)
+ av_log(NULL, AV_LOG_VERBOSE,
+ "SwScaler: reducing / aligning filtersize %d -> %d\n",
+ filter2Size, filterSize);
+ /* try to reduce the filter-size (step2 reduce it) */
+ for (i = 0; i < dstW; i++) {
+ int j;
+
+ for (j = 0; j < filterSize; j++) {
+ if (j >= filter2Size)
+ filter[i * filterSize + j] = 0;
+ else
+ filter[i * filterSize + j] = filter2[i * filter2Size + j];
+ if ((flags & SWS_BITEXACT) && j >= minFilterSize)
+ filter[i * filterSize + j] = 0;
+ }
+ }
+
+ // FIXME try to align filterPos if possible
+
+ // fix borders
+ for (i = 0; i < dstW; i++) {
+ int j;
+ if ((*filterPos)[i] < 0) {
+ // move filter coefficients left to compensate for filterPos
+ for (j = 1; j < filterSize; j++) {
+ int left = FFMAX(j + (*filterPos)[i], 0);
+ filter[i * filterSize + left] += filter[i * filterSize + j];
+ filter[i * filterSize + j] = 0;
+ }
+ (*filterPos)[i]= 0;
+ }
+
+ if ((*filterPos)[i] + filterSize > srcW) {
+ int shift = (*filterPos)[i] + filterSize - srcW;
+ // move filter coefficients right to compensate for filterPos
+ for (j = filterSize - 2; j >= 0; j--) {
+ int right = FFMIN(j + shift, filterSize - 1);
+ filter[i * filterSize + right] += filter[i * filterSize + j];
+ filter[i * filterSize + j] = 0;
+ }
+ (*filterPos)[i]= srcW - filterSize;
+ }
+ }
+
+ // Note the +1 is for the MMX scaler which reads over the end
+ /* align at 16 for AltiVec (needed by hScale_altivec_real) */
+ FF_ALLOCZ_OR_GOTO(NULL, *outFilter,
+ *outFilterSize * (dstW + 3) * sizeof(int16_t), fail);
+
+ /* normalize & store in outFilter */
+ for (i = 0; i < dstW; i++) {
+ int j;
+ int64_t error = 0;
+ int64_t sum = 0;
+
+ for (j = 0; j < filterSize; j++) {
+ sum += filter[i * filterSize + j];
+ }
+ sum = (sum + one / 2) / one;
+ for (j = 0; j < *outFilterSize; j++) {
+ int64_t v = filter[i * filterSize + j] + error;
+ int intV = ROUNDED_DIV(v, sum);
+ (*outFilter)[i * (*outFilterSize) + j] = intV;
+ error = v - intV * sum;
+ }
+ }
+
+ (*filterPos)[dstW + 0] =
+ (*filterPos)[dstW + 1] =
+ (*filterPos)[dstW + 2] = (*filterPos)[dstW - 1]; /* the MMX/SSE scaler will
+ * read over the end */
+ for (i = 0; i < *outFilterSize; i++) {
+ int k = (dstW - 1) * (*outFilterSize) + i;
+ (*outFilter)[k + 1 * (*outFilterSize)] =
+ (*outFilter)[k + 2 * (*outFilterSize)] =
+ (*outFilter)[k + 3 * (*outFilterSize)] = (*outFilter)[k];
+ }
+
+ ret = 0;
+
+fail:
+ if(ret < 0)
+ av_log(NULL, AV_LOG_ERROR, "sws: initFilter failed\n");
+ av_free(filter);
+ av_free(filter2);
+ return ret;
+}
+
+#if HAVE_MMXEXT_INLINE
+static int init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
+ int16_t *filter, int32_t *filterPos,
+ int numSplits)
+{
+ uint8_t *fragmentA;
+ x86_reg imm8OfPShufW1A;
+ x86_reg imm8OfPShufW2A;
+ x86_reg fragmentLengthA;
+ uint8_t *fragmentB;
+ x86_reg imm8OfPShufW1B;
+ x86_reg imm8OfPShufW2B;
+ x86_reg fragmentLengthB;
+ int fragmentPos;
+
+ int xpos, i;
+
+ // create an optimized horizontal scaling routine
+ /* This scaler is made of runtime-generated MMXEXT code using specially tuned
+ * pshufw instructions. For every four output pixels, if four input pixels
+ * are enough for the fast bilinear scaling, then a chunk of fragmentB is
+ * used. If five input pixels are needed, then a chunk of fragmentA is used.
+ */
+
+ // code fragment
+
+ __asm__ volatile (
+ "jmp 9f \n\t"
+ // Begin
+ "0: \n\t"
+ "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
+ "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
+ "movd 1(%%"REG_c", %%"REG_S"), %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "pshufw $0xFF, %%mm1, %%mm1 \n\t"
+ "1: \n\t"
+ "pshufw $0xFF, %%mm0, %%mm0 \n\t"
+ "2: \n\t"
+ "psubw %%mm1, %%mm0 \n\t"
+ "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
+ "pmullw %%mm3, %%mm0 \n\t"
+ "psllw $7, %%mm1 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+
+ "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
+
+ "add $8, %%"REG_a" \n\t"
+ // End
+ "9: \n\t"
+ // "int $3 \n\t"
+ "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
+ "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
+ "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
+ "dec %1 \n\t"
+ "dec %2 \n\t"
+ "sub %0, %1 \n\t"
+ "sub %0, %2 \n\t"
+ "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
+ "sub %0, %3 \n\t"
+
+
+ : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
+ "=r" (fragmentLengthA)
+ );
+
+ __asm__ volatile (
+ "jmp 9f \n\t"
+ // Begin
+ "0: \n\t"
+ "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
+ "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "pshufw $0xFF, %%mm0, %%mm1 \n\t"
+ "1: \n\t"
+ "pshufw $0xFF, %%mm0, %%mm0 \n\t"
+ "2: \n\t"
+ "psubw %%mm1, %%mm0 \n\t"
+ "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
+ "pmullw %%mm3, %%mm0 \n\t"
+ "psllw $7, %%mm1 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+
+ "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
+
+ "add $8, %%"REG_a" \n\t"
+ // End
+ "9: \n\t"
+ // "int $3 \n\t"
+ "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
+ "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
+ "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
+ "dec %1 \n\t"
+ "dec %2 \n\t"
+ "sub %0, %1 \n\t"
+ "sub %0, %2 \n\t"
+ "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
+ "sub %0, %3 \n\t"
+
+
+ : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
+ "=r" (fragmentLengthB)
+ );
+
+ xpos = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
+ fragmentPos = 0;
+
+ for (i = 0; i < dstW / numSplits; i++) {
+ int xx = xpos >> 16;
+
+ if ((i & 3) == 0) {
+ int a = 0;
+ int b = ((xpos + xInc) >> 16) - xx;
+ int c = ((xpos + xInc * 2) >> 16) - xx;
+ int d = ((xpos + xInc * 3) >> 16) - xx;
+ int inc = (d + 1 < 4);
+ uint8_t *fragment = (d + 1 < 4) ? fragmentB : fragmentA;
+ x86_reg imm8OfPShufW1 = (d + 1 < 4) ? imm8OfPShufW1B : imm8OfPShufW1A;
+ x86_reg imm8OfPShufW2 = (d + 1 < 4) ? imm8OfPShufW2B : imm8OfPShufW2A;
+ x86_reg fragmentLength = (d + 1 < 4) ? fragmentLengthB : fragmentLengthA;
+ int maxShift = 3 - (d + inc);
+ int shift = 0;
+
+ if (filterCode) {
+ filter[i] = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9;
+ filter[i + 1] = (((xpos + xInc) & 0xFFFF) ^ 0xFFFF) >> 9;
+ filter[i + 2] = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
+ filter[i + 3] = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
+ filterPos[i / 2] = xx;
+
+ memcpy(filterCode + fragmentPos, fragment, fragmentLength);
+
+ filterCode[fragmentPos + imm8OfPShufW1] = (a + inc) |
+ ((b + inc) << 2) |
+ ((c + inc) << 4) |
+ ((d + inc) << 6);
+ filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) |
+ (c << 4) |
+ (d << 6);
+
+ if (i + 4 - inc >= dstW)
+ shift = maxShift; // avoid overread
+ else if ((filterPos[i / 2] & 3) <= maxShift)
+ shift = filterPos[i / 2] & 3; // align
+
+ if (shift && i >= shift) {
+ filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
+ filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
+ filterPos[i / 2] -= shift;
+ }
+ }
+
+ fragmentPos += fragmentLength;
+
+ if (filterCode)
+ filterCode[fragmentPos] = RET;
+ }
+ xpos += xInc;
+ }
+ if (filterCode)
+ filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part
+
+ return fragmentPos + 1;
+}
+#endif /* HAVE_MMXEXT_INLINE */
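+/* init_hscaler_mmxext() is used in two passes (see sws_init_context below):
+ * first with filterCode == NULL so that only the size of the generated code
+ * is returned, then again with real buffers to emit the fragments. The pshufw
+ * immediates patched into the code are the usual 2-bit-per-word selectors,
+ * a | (b << 2) | (c << 4) | (d << 6) picking source words a..d. */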
+
+static void getSubSampleFactors(int *h, int *v, enum AVPixelFormat format)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(format);
+ *h = desc->log2_chroma_w;
+ *v = desc->log2_chroma_h;
+}
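+/* e.g. AV_PIX_FMT_YUV420P yields h = v = 1 (chroma halved both ways),
+ * AV_PIX_FMT_YUV422P yields h = 1, v = 0 and AV_PIX_FMT_YUV444P yields 0, 0. */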
+
+int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
+ int srcRange, const int table[4], int dstRange,
+ int brightness, int contrast, int saturation)
+{
+ const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(c->dstFormat);
+ const AVPixFmtDescriptor *desc_src = av_pix_fmt_desc_get(c->srcFormat);
+ memcpy(c->srcColorspaceTable, inv_table, sizeof(int) * 4);
+ memcpy(c->dstColorspaceTable, table, sizeof(int) * 4);
+
+ if(!isYUV(c->dstFormat) && !isGray(c->dstFormat))
+ dstRange = 0;
+ if(!isYUV(c->srcFormat) && !isGray(c->srcFormat))
+ srcRange = 0;
+
+ c->brightness = brightness;
+ c->contrast = contrast;
+ c->saturation = saturation;
+ c->srcRange = srcRange;
+ c->dstRange = dstRange;
+
+ if (isYUV(c->dstFormat) || isGray(c->dstFormat))
+ return -1;
+
+ c->dstFormatBpp = av_get_bits_per_pixel(desc_dst);
+ c->srcFormatBpp = av_get_bits_per_pixel(desc_src);
+
+ ff_yuv2rgb_c_init_tables(c, inv_table, srcRange, brightness,
+ contrast, saturation);
+ // FIXME factorize
+
+ if (HAVE_ALTIVEC && av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)
+ ff_yuv2rgb_init_tables_altivec(c, inv_table, brightness,
+ contrast, saturation);
+ return 0;
+}
+
+int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table,
+ int *srcRange, int **table, int *dstRange,
+ int *brightness, int *contrast, int *saturation)
+{
+ if (!c || isYUV(c->dstFormat) || isGray(c->dstFormat))
+ return -1;
+
+ *inv_table = c->srcColorspaceTable;
+ *table = c->dstColorspaceTable;
+ *srcRange = c->srcRange;
+ *dstRange = c->dstRange;
+ *brightness = c->brightness;
+ *contrast = c->contrast;
+ *saturation = c->saturation;
+
+ return 0;
+}
+
+static int handle_jpeg(enum AVPixelFormat *format)
+{
+ switch (*format) {
+ case AV_PIX_FMT_YUVJ420P:
+ *format = AV_PIX_FMT_YUV420P;
+ return 1;
+ case AV_PIX_FMT_YUVJ422P:
+ *format = AV_PIX_FMT_YUV422P;
+ return 1;
+ case AV_PIX_FMT_YUVJ444P:
+ *format = AV_PIX_FMT_YUV444P;
+ return 1;
+ case AV_PIX_FMT_YUVJ440P:
+ *format = AV_PIX_FMT_YUV440P;
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static int handle_0alpha(enum AVPixelFormat *format)
+{
+ switch (*format) {
+ case AV_PIX_FMT_0BGR : *format = AV_PIX_FMT_ABGR ; return 1;
+ case AV_PIX_FMT_BGR0 : *format = AV_PIX_FMT_BGRA ; return 4;
+ case AV_PIX_FMT_0RGB : *format = AV_PIX_FMT_ARGB ; return 1;
+ case AV_PIX_FMT_RGB0 : *format = AV_PIX_FMT_RGBA ; return 4;
+ default: return 0;
+ }
+}
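+/* Maps the "0"-channel (undefined byte) packed RGB formats onto their
+ * alpha-carrying equivalents so the rest of the code can treat the spare byte
+ * as alpha; the nonzero return value is remembered in src0Alpha/dst0Alpha
+ * (see sws_getContext below), and returning 1 vs 4 appears to record whether
+ * that byte is the first or the last one of the pixel. */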
+
+SwsContext *sws_alloc_context(void)
+{
+ SwsContext *c = av_mallocz(sizeof(SwsContext));
+
+ if (c) {
+ c->av_class = &sws_context_class;
+ av_opt_set_defaults(c);
+ }
+
+ return c;
+}
+
+av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
+ SwsFilter *dstFilter)
+{
+ int i, j;
+ int usesVFilter, usesHFilter;
+ int unscaled;
+ SwsFilter dummyFilter = { NULL, NULL, NULL, NULL };
+ int srcW = c->srcW;
+ int srcH = c->srcH;
+ int dstW = c->dstW;
+ int dstH = c->dstH;
+ int dst_stride = FFALIGN(dstW * sizeof(int16_t) + 66, 16);
+ int flags, cpu_flags;
+ enum AVPixelFormat srcFormat = c->srcFormat;
+ enum AVPixelFormat dstFormat = c->dstFormat;
+ const AVPixFmtDescriptor *desc_src = av_pix_fmt_desc_get(srcFormat);
+ const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(dstFormat);
+
+ cpu_flags = av_get_cpu_flags();
+ flags = c->flags;
+ emms_c();
+ if (!rgb15to16)
+ sws_rgb2rgb_init();
+
+ unscaled = (srcW == dstW && srcH == dstH);
+
+ handle_jpeg(&srcFormat);
+ handle_jpeg(&dstFormat);
+ handle_0alpha(&srcFormat);
+ handle_0alpha(&dstFormat);
+
+ if(srcFormat!=c->srcFormat || dstFormat!=c->dstFormat){
+ av_log(c, AV_LOG_WARNING, "deprecated pixel format used, make sure you did set range correctly\n");
+ c->srcFormat= srcFormat;
+ c->dstFormat= dstFormat;
+ }
+
+ if (!sws_isSupportedInput(srcFormat)) {
+ av_log(c, AV_LOG_ERROR, "%s is not supported as input pixel format\n",
+ av_get_pix_fmt_name(srcFormat));
+ return AVERROR(EINVAL);
+ }
+ if (!sws_isSupportedOutput(dstFormat)) {
+ av_log(c, AV_LOG_ERROR, "%s is not supported as output pixel format\n",
+ av_get_pix_fmt_name(dstFormat));
+ return AVERROR(EINVAL);
+ }
+
+ i = flags & (SWS_POINT |
+ SWS_AREA |
+ SWS_BILINEAR |
+ SWS_FAST_BILINEAR |
+ SWS_BICUBIC |
+ SWS_X |
+ SWS_GAUSS |
+ SWS_LANCZOS |
+ SWS_SINC |
+ SWS_SPLINE |
+ SWS_BICUBLIN);
+ if (!i || (i & (i - 1))) {
+ av_log(c, AV_LOG_ERROR, "Exactly one scaler algorithm must be chosen, got %X\n", i);
+ return AVERROR(EINVAL);
+ }
+ /* sanity check */
+ if (srcW < 1 || srcH < 1 || dstW < 1 || dstH < 1) {
+ /* FIXME check if these are enough and try to lower them after
+ * fixing the relevant parts of the code */
+ av_log(c, AV_LOG_ERROR, "%dx%d -> %dx%d is invalid scaling dimension\n",
+ srcW, srcH, dstW, dstH);
+ return AVERROR(EINVAL);
+ }
+
+ if (!dstFilter)
+ dstFilter = &dummyFilter;
+ if (!srcFilter)
+ srcFilter = &dummyFilter;
+
+ c->lumXInc = (((int64_t)srcW << 16) + (dstW >> 1)) / dstW;
+ c->lumYInc = (((int64_t)srcH << 16) + (dstH >> 1)) / dstH;
+ c->dstFormatBpp = av_get_bits_per_pixel(desc_dst);
+ c->srcFormatBpp = av_get_bits_per_pixel(desc_src);
+ c->vRounder = 4 * 0x0001000100010001ULL;
+
+ usesVFilter = (srcFilter->lumV && srcFilter->lumV->length > 1) ||
+ (srcFilter->chrV && srcFilter->chrV->length > 1) ||
+ (dstFilter->lumV && dstFilter->lumV->length > 1) ||
+ (dstFilter->chrV && dstFilter->chrV->length > 1);
+ usesHFilter = (srcFilter->lumH && srcFilter->lumH->length > 1) ||
+ (srcFilter->chrH && srcFilter->chrH->length > 1) ||
+ (dstFilter->lumH && dstFilter->lumH->length > 1) ||
+ (dstFilter->chrH && dstFilter->chrH->length > 1);
+
+ getSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat);
+ getSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat);
+
+ if (isAnyRGB(dstFormat) && !(flags&SWS_FULL_CHR_H_INT)) {
+ if (dstW&1) {
+ av_log(c, AV_LOG_DEBUG, "Forcing full internal H chroma due to odd output size\n");
+ flags |= SWS_FULL_CHR_H_INT;
+ c->flags = flags;
+ }
+ }
+
+ if(dstFormat == AV_PIX_FMT_BGR4_BYTE ||
+ dstFormat == AV_PIX_FMT_RGB4_BYTE ||
+ dstFormat == AV_PIX_FMT_BGR8 ||
+ dstFormat == AV_PIX_FMT_RGB8) {
+ if (flags & SWS_ERROR_DIFFUSION && !(flags & SWS_FULL_CHR_H_INT)) {
+ av_log(c, AV_LOG_DEBUG,
+ "Error diffusion dither is only supported in full chroma interpolation for destination format '%s'\n",
+ av_get_pix_fmt_name(dstFormat));
+ flags |= SWS_FULL_CHR_H_INT;
+ c->flags = flags;
+ }
+ if (!(flags & SWS_ERROR_DIFFUSION) && (flags & SWS_FULL_CHR_H_INT)) {
+ av_log(c, AV_LOG_DEBUG,
+ "Ordered dither is not supported in full chroma interpolation for destination format '%s'\n",
+ av_get_pix_fmt_name(dstFormat));
+ flags |= SWS_ERROR_DIFFUSION;
+ c->flags = flags;
+ }
+ }
+ if (isPlanarRGB(dstFormat)) {
+ if (!(flags & SWS_FULL_CHR_H_INT)) {
+ av_log(c, AV_LOG_DEBUG,
+ "%s output is not supported with half chroma resolution, switching to full\n",
+ av_get_pix_fmt_name(dstFormat));
+ flags |= SWS_FULL_CHR_H_INT;
+ c->flags = flags;
+ }
+ }
+
+ /* reuse chroma for 2 pixels RGB/BGR unless user wants full
+ * chroma interpolation */
+ if (flags & SWS_FULL_CHR_H_INT &&
+ isAnyRGB(dstFormat) &&
+ !isPlanarRGB(dstFormat) &&
+ dstFormat != AV_PIX_FMT_RGBA &&
+ dstFormat != AV_PIX_FMT_ARGB &&
+ dstFormat != AV_PIX_FMT_BGRA &&
+ dstFormat != AV_PIX_FMT_ABGR &&
+ dstFormat != AV_PIX_FMT_RGB24 &&
+ dstFormat != AV_PIX_FMT_BGR24 &&
+ dstFormat != AV_PIX_FMT_BGR4_BYTE &&
+ dstFormat != AV_PIX_FMT_RGB4_BYTE &&
+ dstFormat != AV_PIX_FMT_BGR8 &&
+ dstFormat != AV_PIX_FMT_RGB8
+ ) {
+ av_log(c, AV_LOG_WARNING,
+ "full chroma interpolation for destination format '%s' not yet implemented\n",
+ av_get_pix_fmt_name(dstFormat));
+ flags &= ~SWS_FULL_CHR_H_INT;
+ c->flags = flags;
+ }
+ if (isAnyRGB(dstFormat) && !(flags & SWS_FULL_CHR_H_INT))
+ c->chrDstHSubSample = 1;
+
+ // drop some chroma lines if the user wants it
+ c->vChrDrop = (flags & SWS_SRC_V_CHR_DROP_MASK) >>
+ SWS_SRC_V_CHR_DROP_SHIFT;
+ c->chrSrcVSubSample += c->vChrDrop;
+
+ /* drop every other pixel for chroma calculation unless user
+ * wants full chroma */
+ if (isAnyRGB(srcFormat) && !(flags & SWS_FULL_CHR_H_INP) &&
+ srcFormat != AV_PIX_FMT_RGB8 && srcFormat != AV_PIX_FMT_BGR8 &&
+ srcFormat != AV_PIX_FMT_RGB4 && srcFormat != AV_PIX_FMT_BGR4 &&
+ srcFormat != AV_PIX_FMT_RGB4_BYTE && srcFormat != AV_PIX_FMT_BGR4_BYTE &&
+ srcFormat != AV_PIX_FMT_GBRP9BE && srcFormat != AV_PIX_FMT_GBRP9LE &&
+ srcFormat != AV_PIX_FMT_GBRP10BE && srcFormat != AV_PIX_FMT_GBRP10LE &&
+ srcFormat != AV_PIX_FMT_GBRP12BE && srcFormat != AV_PIX_FMT_GBRP12LE &&
+ srcFormat != AV_PIX_FMT_GBRP14BE && srcFormat != AV_PIX_FMT_GBRP14LE &&
+ srcFormat != AV_PIX_FMT_GBRP16BE && srcFormat != AV_PIX_FMT_GBRP16LE &&
+ ((dstW >> c->chrDstHSubSample) <= (srcW >> 1) ||
+ (flags & SWS_FAST_BILINEAR)))
+ c->chrSrcHSubSample = 1;
+
+ // Note the -((-x)>>y) is so that we always round toward +inf.
+ c->chrSrcW = -((-srcW) >> c->chrSrcHSubSample);
+ c->chrSrcH = -((-srcH) >> c->chrSrcVSubSample);
+ c->chrDstW = -((-dstW) >> c->chrDstHSubSample);
+ c->chrDstH = -((-dstH) >> c->chrDstVSubSample);
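+    /* Worked example of the round-toward-+inf trick: with srcW = 101 and
+     * chrSrcHSubSample = 1, plain 101 >> 1 gives 50, while -((-101) >> 1)
+     * gives 51, so the last odd luma column still gets a chroma sample. */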
+
+ FF_ALLOC_OR_GOTO(c, c->formatConvBuffer, FFALIGN(srcW*2+78, 16) * 2, fail);
+
+ /* unscaled special cases */
+ if (unscaled && !usesHFilter && !usesVFilter &&
+ (c->srcRange == c->dstRange || isAnyRGB(dstFormat))) {
+ ff_get_unscaled_swscale(c);
+
+ if (c->swScale) {
+ if (flags & SWS_PRINT_INFO)
+ av_log(c, AV_LOG_INFO,
+ "using unscaled %s -> %s special converter\n",
+ av_get_pix_fmt_name(srcFormat), av_get_pix_fmt_name(dstFormat));
+ return 0;
+ }
+ }
+
+ c->srcBpc = 1 + desc_src->comp[0].depth_minus1;
+ if (c->srcBpc < 8)
+ c->srcBpc = 8;
+ c->dstBpc = 1 + desc_dst->comp[0].depth_minus1;
+ if (c->dstBpc < 8)
+ c->dstBpc = 8;
+ if (isAnyRGB(srcFormat) || srcFormat == AV_PIX_FMT_PAL8)
+ c->srcBpc = 16;
+ if (c->dstBpc == 16)
+ dst_stride <<= 1;
+
+ if (INLINE_MMXEXT(cpu_flags) && c->srcBpc == 8 && c->dstBpc <= 14) {
+ c->canMMXEXTBeUsed = (dstW >= srcW && (dstW & 31) == 0 &&
+ (srcW & 15) == 0) ? 1 : 0;
+        if (!c->canMMXEXTBeUsed && dstW >= srcW && (srcW & 15) == 0
+            && (flags & SWS_FAST_BILINEAR)) {
+ if (flags & SWS_PRINT_INFO)
+ av_log(c, AV_LOG_INFO,
+ "output width is not a multiple of 32 -> no MMXEXT scaler\n");
+ }
+ if (usesHFilter || isNBPS(c->srcFormat) || is16BPS(c->srcFormat) || isAnyRGB(c->srcFormat))
+ c->canMMXEXTBeUsed = 0;
+ } else
+ c->canMMXEXTBeUsed = 0;
+
+ c->chrXInc = (((int64_t)c->chrSrcW << 16) + (c->chrDstW >> 1)) / c->chrDstW;
+ c->chrYInc = (((int64_t)c->chrSrcH << 16) + (c->chrDstH >> 1)) / c->chrDstH;
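+    /* The *Inc values are 16.16 fixed-point source steps per destination
+     * pixel, rounded to nearest: e.g. 960 -> 640 chroma columns gives
+     * chrXInc = ((960 << 16) + 320) / 640 = 0x18000, i.e. 1.5 source pixels
+     * advanced per output pixel. */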
+
+ /* Match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src
+ * to pixel n-2 of dst, but only for the FAST_BILINEAR mode otherwise do
+ * correct scaling.
+ * n-2 is the last chrominance sample available.
+ * This is not perfect, but no one should notice the difference, the more
+ * correct variant would be like the vertical one, but that would require
+ * some special code for the first and last pixel */
+ if (flags & SWS_FAST_BILINEAR) {
+ if (c->canMMXEXTBeUsed) {
+ c->lumXInc += 20;
+ c->chrXInc += 20;
+ }
+ // we don't use the x86 asm scaler if MMX is available
+ else if (INLINE_MMX(cpu_flags) && c->dstBpc <= 14) {
+ c->lumXInc = ((int64_t)(srcW - 2) << 16) / (dstW - 2) - 20;
+ c->chrXInc = ((int64_t)(c->chrSrcW - 2) << 16) / (c->chrDstW - 2) - 20;
+ }
+ }
+
+#define USE_MMAP (HAVE_MMAP && HAVE_MPROTECT && defined MAP_ANONYMOUS)
+
+ /* precalculate horizontal scaler filter coefficients */
+ {
+#if HAVE_MMXEXT_INLINE
+// can't downscale !!!
+ if (c->canMMXEXTBeUsed && (flags & SWS_FAST_BILINEAR)) {
+ c->lumMmxextFilterCodeSize = init_hscaler_mmxext(dstW, c->lumXInc, NULL,
+ NULL, NULL, 8);
+ c->chrMmxextFilterCodeSize = init_hscaler_mmxext(c->chrDstW, c->chrXInc,
+ NULL, NULL, NULL, 4);
+
+#if USE_MMAP
+ c->lumMmxextFilterCode = mmap(NULL, c->lumMmxextFilterCodeSize,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ -1, 0);
+ c->chrMmxextFilterCode = mmap(NULL, c->chrMmxextFilterCodeSize,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ -1, 0);
+#elif HAVE_VIRTUALALLOC
+ c->lumMmxextFilterCode = VirtualAlloc(NULL,
+ c->lumMmxextFilterCodeSize,
+ MEM_COMMIT,
+ PAGE_EXECUTE_READWRITE);
+ c->chrMmxextFilterCode = VirtualAlloc(NULL,
+ c->chrMmxextFilterCodeSize,
+ MEM_COMMIT,
+ PAGE_EXECUTE_READWRITE);
+#else
+ c->lumMmxextFilterCode = av_malloc(c->lumMmxextFilterCodeSize);
+ c->chrMmxextFilterCode = av_malloc(c->chrMmxextFilterCodeSize);
+#endif
+
+#ifdef MAP_ANONYMOUS
+ if (c->lumMmxextFilterCode == MAP_FAILED || c->chrMmxextFilterCode == MAP_FAILED)
+#else
+ if (!c->lumMmxextFilterCode || !c->chrMmxextFilterCode)
+#endif
+ {
+ av_log(c, AV_LOG_ERROR, "Failed to allocate MMX2FilterCode\n");
+ return AVERROR(ENOMEM);
+ }
+
+ FF_ALLOCZ_OR_GOTO(c, c->hLumFilter, (dstW / 8 + 8) * sizeof(int16_t), fail);
+ FF_ALLOCZ_OR_GOTO(c, c->hChrFilter, (c->chrDstW / 4 + 8) * sizeof(int16_t), fail);
+ FF_ALLOCZ_OR_GOTO(c, c->hLumFilterPos, (dstW / 2 / 8 + 8) * sizeof(int32_t), fail);
+ FF_ALLOCZ_OR_GOTO(c, c->hChrFilterPos, (c->chrDstW / 2 / 4 + 8) * sizeof(int32_t), fail);
+
+ init_hscaler_mmxext( dstW, c->lumXInc, c->lumMmxextFilterCode,
+ c->hLumFilter, (uint32_t*)c->hLumFilterPos, 8);
+ init_hscaler_mmxext(c->chrDstW, c->chrXInc, c->chrMmxextFilterCode,
+ c->hChrFilter, (uint32_t*)c->hChrFilterPos, 4);
+
+#if USE_MMAP
+ mprotect(c->lumMmxextFilterCode, c->lumMmxextFilterCodeSize, PROT_EXEC | PROT_READ);
+ mprotect(c->chrMmxextFilterCode, c->chrMmxextFilterCodeSize, PROT_EXEC | PROT_READ);
+#endif
+ } else
+#endif /* HAVE_MMXEXT_INLINE */
+ {
+ const int filterAlign =
+ (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? 4 :
+ (HAVE_ALTIVEC && cpu_flags & AV_CPU_FLAG_ALTIVEC) ? 8 :
+ 1;
+
+ if (initFilter(&c->hLumFilter, &c->hLumFilterPos,
+ &c->hLumFilterSize, c->lumXInc,
+ srcW, dstW, filterAlign, 1 << 14,
+ (flags & SWS_BICUBLIN) ? (flags | SWS_BICUBIC) : flags,
+ cpu_flags, srcFilter->lumH, dstFilter->lumH,
+ c->param) < 0)
+ goto fail;
+ if (initFilter(&c->hChrFilter, &c->hChrFilterPos,
+ &c->hChrFilterSize, c->chrXInc,
+ c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
+ (flags & SWS_BICUBLIN) ? (flags | SWS_BILINEAR) : flags,
+ cpu_flags, srcFilter->chrH, dstFilter->chrH,
+ c->param) < 0)
+ goto fail;
+ }
+ } // initialize horizontal stuff
+
+ /* precalculate vertical scaler filter coefficients */
+ {
+ const int filterAlign =
+ (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? 2 :
+ (HAVE_ALTIVEC && cpu_flags & AV_CPU_FLAG_ALTIVEC) ? 8 :
+ 1;
+
+ if (initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize,
+ c->lumYInc, srcH, dstH, filterAlign, (1 << 12),
+ (flags & SWS_BICUBLIN) ? (flags | SWS_BICUBIC) : flags,
+ cpu_flags, srcFilter->lumV, dstFilter->lumV,
+ c->param) < 0)
+ goto fail;
+ if (initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize,
+ c->chrYInc, c->chrSrcH, c->chrDstH,
+ filterAlign, (1 << 12),
+ (flags & SWS_BICUBLIN) ? (flags | SWS_BILINEAR) : flags,
+ cpu_flags, srcFilter->chrV, dstFilter->chrV,
+ c->param) < 0)
+ goto fail;
+
+#if HAVE_ALTIVEC
+ FF_ALLOC_OR_GOTO(c, c->vYCoeffsBank, sizeof(vector signed short) * c->vLumFilterSize * c->dstH, fail);
+ FF_ALLOC_OR_GOTO(c, c->vCCoeffsBank, sizeof(vector signed short) * c->vChrFilterSize * c->chrDstH, fail);
+
+ for (i = 0; i < c->vLumFilterSize * c->dstH; i++) {
+ int j;
+ short *p = (short *)&c->vYCoeffsBank[i];
+ for (j = 0; j < 8; j++)
+ p[j] = c->vLumFilter[i];
+ }
+
+ for (i = 0; i < c->vChrFilterSize * c->chrDstH; i++) {
+ int j;
+ short *p = (short *)&c->vCCoeffsBank[i];
+ for (j = 0; j < 8; j++)
+ p[j] = c->vChrFilter[i];
+ }
+#endif
+ }
+
+ // calculate buffer sizes so that they won't run out while handling these damn slices
+ c->vLumBufSize = c->vLumFilterSize;
+ c->vChrBufSize = c->vChrFilterSize;
+ for (i = 0; i < dstH; i++) {
+ int chrI = (int64_t)i * c->chrDstH / dstH;
+ int nextSlice = FFMAX(c->vLumFilterPos[i] + c->vLumFilterSize - 1,
+ ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)
+ << c->chrSrcVSubSample));
+
+ nextSlice >>= c->chrSrcVSubSample;
+ nextSlice <<= c->chrSrcVSubSample;
+ if (c->vLumFilterPos[i] + c->vLumBufSize < nextSlice)
+ c->vLumBufSize = nextSlice - c->vLumFilterPos[i];
+ if (c->vChrFilterPos[chrI] + c->vChrBufSize <
+ (nextSlice >> c->chrSrcVSubSample))
+ c->vChrBufSize = (nextSlice >> c->chrSrcVSubSample) -
+ c->vChrFilterPos[chrI];
+ }
+
+ for (i = 0; i < 4; i++)
+ FF_ALLOCZ_OR_GOTO(c, c->dither_error[i], (c->dstW+2) * sizeof(int), fail);
+
+ /* Allocate pixbufs (we use dynamic allocation because otherwise we would
+ * need to allocate several megabytes to handle all possible cases) */
+ FF_ALLOC_OR_GOTO(c, c->lumPixBuf, c->vLumBufSize * 3 * sizeof(int16_t *), fail);
+ FF_ALLOC_OR_GOTO(c, c->chrUPixBuf, c->vChrBufSize * 3 * sizeof(int16_t *), fail);
+ FF_ALLOC_OR_GOTO(c, c->chrVPixBuf, c->vChrBufSize * 3 * sizeof(int16_t *), fail);
+ if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat) && isALPHA(c->dstFormat))
+ FF_ALLOCZ_OR_GOTO(c, c->alpPixBuf, c->vLumBufSize * 3 * sizeof(int16_t *), fail);
+ /* Note we need at least one pixel more at the end because of the MMX code
+ * (just in case someone wants to replace the 4000/8000). */
+ /* align at 16 bytes for AltiVec */
+ for (i = 0; i < c->vLumBufSize; i++) {
+ FF_ALLOCZ_OR_GOTO(c, c->lumPixBuf[i + c->vLumBufSize],
+ dst_stride + 16, fail);
+ c->lumPixBuf[i] = c->lumPixBuf[i + c->vLumBufSize];
+ }
+ // 64 / c->scalingBpp is the same as 16 / sizeof(scaling_intermediate)
+ c->uv_off = (dst_stride>>1) + 64 / (c->dstBpc &~ 7);
+ c->uv_offx2 = dst_stride + 16;
+ for (i = 0; i < c->vChrBufSize; i++) {
+ FF_ALLOC_OR_GOTO(c, c->chrUPixBuf[i + c->vChrBufSize],
+ dst_stride * 2 + 32, fail);
+ c->chrUPixBuf[i] = c->chrUPixBuf[i + c->vChrBufSize];
+ c->chrVPixBuf[i] = c->chrVPixBuf[i + c->vChrBufSize]
+ = c->chrUPixBuf[i] + (dst_stride >> 1) + 8;
+ }
+ if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
+ for (i = 0; i < c->vLumBufSize; i++) {
+ FF_ALLOCZ_OR_GOTO(c, c->alpPixBuf[i + c->vLumBufSize],
+ dst_stride + 16, fail);
+ c->alpPixBuf[i] = c->alpPixBuf[i + c->vLumBufSize];
+ }
+
+ // try to avoid drawing green stuff between the right end and the stride end
+ for (i = 0; i < c->vChrBufSize; i++)
+ if(desc_dst->comp[0].depth_minus1 == 15){
+ av_assert0(c->dstBpc > 14);
+ for(j=0; j<dst_stride/2+1; j++)
+ ((int32_t*)(c->chrUPixBuf[i]))[j] = 1<<18;
+ } else
+ for(j=0; j<dst_stride+1; j++)
+ ((int16_t*)(c->chrUPixBuf[i]))[j] = 1<<14;
+
+ av_assert0(c->chrDstH <= dstH);
+
+ if (flags & SWS_PRINT_INFO) {
+ if (flags & SWS_FAST_BILINEAR)
+ av_log(c, AV_LOG_INFO, "FAST_BILINEAR scaler, ");
+ else if (flags & SWS_BILINEAR)
+ av_log(c, AV_LOG_INFO, "BILINEAR scaler, ");
+ else if (flags & SWS_BICUBIC)
+ av_log(c, AV_LOG_INFO, "BICUBIC scaler, ");
+ else if (flags & SWS_X)
+ av_log(c, AV_LOG_INFO, "Experimental scaler, ");
+ else if (flags & SWS_POINT)
+ av_log(c, AV_LOG_INFO, "Nearest Neighbor / POINT scaler, ");
+ else if (flags & SWS_AREA)
+ av_log(c, AV_LOG_INFO, "Area Averaging scaler, ");
+ else if (flags & SWS_BICUBLIN)
+ av_log(c, AV_LOG_INFO, "luma BICUBIC / chroma BILINEAR scaler, ");
+ else if (flags & SWS_GAUSS)
+ av_log(c, AV_LOG_INFO, "Gaussian scaler, ");
+ else if (flags & SWS_SINC)
+ av_log(c, AV_LOG_INFO, "Sinc scaler, ");
+ else if (flags & SWS_LANCZOS)
+ av_log(c, AV_LOG_INFO, "Lanczos scaler, ");
+ else if (flags & SWS_SPLINE)
+ av_log(c, AV_LOG_INFO, "Bicubic spline scaler, ");
+ else
+ av_log(c, AV_LOG_INFO, "ehh flags invalid?! ");
+
+ av_log(c, AV_LOG_INFO, "from %s to %s%s ",
+ av_get_pix_fmt_name(srcFormat),
+#ifdef DITHER1XBPP
+ dstFormat == AV_PIX_FMT_BGR555 || dstFormat == AV_PIX_FMT_BGR565 ||
+ dstFormat == AV_PIX_FMT_RGB444BE || dstFormat == AV_PIX_FMT_RGB444LE ||
+ dstFormat == AV_PIX_FMT_BGR444BE || dstFormat == AV_PIX_FMT_BGR444LE ?
+ "dithered " : "",
+#else
+ "",
+#endif
+ av_get_pix_fmt_name(dstFormat));
+
+ if (INLINE_MMXEXT(cpu_flags))
+ av_log(c, AV_LOG_INFO, "using MMXEXT\n");
+ else if (INLINE_AMD3DNOW(cpu_flags))
+ av_log(c, AV_LOG_INFO, "using 3DNOW\n");
+ else if (INLINE_MMX(cpu_flags))
+ av_log(c, AV_LOG_INFO, "using MMX\n");
+ else if (HAVE_ALTIVEC && cpu_flags & AV_CPU_FLAG_ALTIVEC)
+ av_log(c, AV_LOG_INFO, "using AltiVec\n");
+ else
+ av_log(c, AV_LOG_INFO, "using C\n");
+
+ av_log(c, AV_LOG_VERBOSE, "%dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
+ av_log(c, AV_LOG_DEBUG,
+ "lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
+ c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
+ av_log(c, AV_LOG_DEBUG,
+ "chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
+ c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH,
+ c->chrXInc, c->chrYInc);
+ }
+
+ c->swScale = ff_getSwsFunc(c);
+ return 0;
+fail: // FIXME replace things by appropriate error codes
+ return -1;
+}
+
+#if FF_API_SWS_GETCONTEXT
+SwsContext *sws_getContext(int srcW, int srcH, enum AVPixelFormat srcFormat,
+ int dstW, int dstH, enum AVPixelFormat dstFormat,
+ int flags, SwsFilter *srcFilter,
+ SwsFilter *dstFilter, const double *param)
+{
+ SwsContext *c;
+
+ if (!(c = sws_alloc_context()))
+ return NULL;
+
+ c->flags = flags;
+ c->srcW = srcW;
+ c->srcH = srcH;
+ c->dstW = dstW;
+ c->dstH = dstH;
+ c->srcRange = handle_jpeg(&srcFormat);
+ c->dstRange = handle_jpeg(&dstFormat);
+ c->src0Alpha = handle_0alpha(&srcFormat);
+ c->dst0Alpha = handle_0alpha(&dstFormat);
+ c->srcFormat = srcFormat;
+ c->dstFormat = dstFormat;
+
+ if (param) {
+ c->param[0] = param[0];
+ c->param[1] = param[1];
+ }
+ sws_setColorspaceDetails(c, ff_yuv2rgb_coeffs[SWS_CS_DEFAULT], c->srcRange,
+ ff_yuv2rgb_coeffs[SWS_CS_DEFAULT] /* FIXME*/,
+ c->dstRange, 0, 1 << 16, 1 << 16);
+
+ if (sws_init_context(c, srcFilter, dstFilter) < 0) {
+ sws_freeContext(c);
+ return NULL;
+ }
+
+ return c;
+}
+#endif
+
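As a rough illustration, the legacy entry point above is typically used like the sketch below; the frame buffers, strides and dimensions are placeholders for caller-owned data, not part of this patch.

#include <libswscale/swscale.h>

/* Sketch: convert one YUV420P frame to RGB24 using the legacy API.
 * src_data/src_stride and rgb_data/rgb_stride are caller-owned buffers. */
static int convert_frame(const uint8_t *const src_data[4], const int src_stride[4],
                         uint8_t *const rgb_data[4], const int rgb_stride[4],
                         int w, int h)
{
    struct SwsContext *sws = sws_getContext(w, h, AV_PIX_FMT_YUV420P,
                                            w, h, AV_PIX_FMT_RGB24,
                                            SWS_BILINEAR, NULL, NULL, NULL);
    if (!sws)
        return -1;
    sws_scale(sws, src_data, src_stride, 0, h, rgb_data, rgb_stride);
    sws_freeContext(sws);
    return 0;
}
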
+SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
+ float lumaSharpen, float chromaSharpen,
+ float chromaHShift, float chromaVShift,
+ int verbose)
+{
+ SwsFilter *filter = av_malloc(sizeof(SwsFilter));
+ if (!filter)
+ return NULL;
+
+ if (lumaGBlur != 0.0) {
+ filter->lumH = sws_getGaussianVec(lumaGBlur, 3.0);
+ filter->lumV = sws_getGaussianVec(lumaGBlur, 3.0);
+ } else {
+ filter->lumH = sws_getIdentityVec();
+ filter->lumV = sws_getIdentityVec();
+ }
+
+ if (chromaGBlur != 0.0) {
+ filter->chrH = sws_getGaussianVec(chromaGBlur, 3.0);
+ filter->chrV = sws_getGaussianVec(chromaGBlur, 3.0);
+ } else {
+ filter->chrH = sws_getIdentityVec();
+ filter->chrV = sws_getIdentityVec();
+ }
+
+ if (chromaSharpen != 0.0) {
+ SwsVector *id = sws_getIdentityVec();
+ sws_scaleVec(filter->chrH, -chromaSharpen);
+ sws_scaleVec(filter->chrV, -chromaSharpen);
+ sws_addVec(filter->chrH, id);
+ sws_addVec(filter->chrV, id);
+ sws_freeVec(id);
+ }
+
+ if (lumaSharpen != 0.0) {
+ SwsVector *id = sws_getIdentityVec();
+ sws_scaleVec(filter->lumH, -lumaSharpen);
+ sws_scaleVec(filter->lumV, -lumaSharpen);
+ sws_addVec(filter->lumH, id);
+ sws_addVec(filter->lumV, id);
+ sws_freeVec(id);
+ }
+
+ if (chromaHShift != 0.0)
+ sws_shiftVec(filter->chrH, (int)(chromaHShift + 0.5));
+
+ if (chromaVShift != 0.0)
+ sws_shiftVec(filter->chrV, (int)(chromaVShift + 0.5));
+
+ sws_normalizeVec(filter->chrH, 1.0);
+ sws_normalizeVec(filter->chrV, 1.0);
+ sws_normalizeVec(filter->lumH, 1.0);
+ sws_normalizeVec(filter->lumV, 1.0);
+
+ if (verbose)
+ sws_printVec2(filter->chrH, NULL, AV_LOG_DEBUG);
+ if (verbose)
+ sws_printVec2(filter->lumH, NULL, AV_LOG_DEBUG);
+
+ return filter;
+}
+
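The parameters above map directly onto the vector operations in the body; for example, a filter with a mild luma blur and some chroma sharpening could be built and attached as the source filter roughly as follows (sizes and formats are placeholders):

/* Sketch: 0.5 luma Gaussian blur, 0.7 chroma sharpen, no chroma shift. */
SwsFilter *f = sws_getDefaultFilter(0.5, 0.0, 0.0, 0.7, 0.0, 0.0, 0);
struct SwsContext *sws = f ? sws_getContext(1920, 1080, AV_PIX_FMT_YUV420P,
                                            1280,  720, AV_PIX_FMT_YUV420P,
                                            SWS_BICUBIC, f, NULL, NULL)
                           : NULL;
/* ... sws_scale() as usual ... */
sws_freeFilter(f);
sws_freeContext(sws);
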
+SwsVector *sws_allocVec(int length)
+{
+ SwsVector *vec;
+
+ if(length <= 0 || length > INT_MAX/ sizeof(double))
+ return NULL;
+
+ vec = av_malloc(sizeof(SwsVector));
+ if (!vec)
+ return NULL;
+ vec->length = length;
+ vec->coeff = av_malloc(sizeof(double) * length);
+ if (!vec->coeff)
+ av_freep(&vec);
+ return vec;
+}
+
+SwsVector *sws_getGaussianVec(double variance, double quality)
+{
+ const int length = (int)(variance * quality + 0.5) | 1;
+ int i;
+ double middle = (length - 1) * 0.5;
+ SwsVector *vec;
+
+ if(variance < 0 || quality < 0)
+ return NULL;
+
+ vec = sws_allocVec(length);
+
+ if (!vec)
+ return NULL;
+
+ for (i = 0; i < length; i++) {
+ double dist = i - middle;
+ vec->coeff[i] = exp(-dist * dist / (2 * variance * variance)) /
+ sqrt(2 * variance * M_PI);
+ }
+
+ sws_normalizeVec(vec, 1.0);
+
+ return vec;
+}
+
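The exponential above only gives coefficients proportional to a Gaussian; sws_normalizeVec() rescales them to a chosen sum. A short sketch of building and inspecting such a vector (the variance and quality values are arbitrary):

SwsVector *g = sws_getGaussianVec(2.0, 3.0); /* variance 2.0, quality 3.0 -> 7 taps */
if (g) {
    sws_normalizeVec(g, 1.0);                /* coefficients sum to 1.0 (also done internally) */
    sws_printVec2(g, NULL, AV_LOG_INFO);     /* crude ASCII plot of the curve */
    sws_freeVec(g);
}
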
+SwsVector *sws_getConstVec(double c, int length)
+{
+ int i;
+ SwsVector *vec = sws_allocVec(length);
+
+ if (!vec)
+ return NULL;
+
+ for (i = 0; i < length; i++)
+ vec->coeff[i] = c;
+
+ return vec;
+}
+
+SwsVector *sws_getIdentityVec(void)
+{
+ return sws_getConstVec(1.0, 1);
+}
+
+static double sws_dcVec(SwsVector *a)
+{
+ int i;
+ double sum = 0;
+
+ for (i = 0; i < a->length; i++)
+ sum += a->coeff[i];
+
+ return sum;
+}
+
+void sws_scaleVec(SwsVector *a, double scalar)
+{
+ int i;
+
+ for (i = 0; i < a->length; i++)
+ a->coeff[i] *= scalar;
+}
+
+void sws_normalizeVec(SwsVector *a, double height)
+{
+ sws_scaleVec(a, height / sws_dcVec(a));
+}
+
+static SwsVector *sws_getConvVec(SwsVector *a, SwsVector *b)
+{
+ int length = a->length + b->length - 1;
+ int i, j;
+ SwsVector *vec = sws_getConstVec(0.0, length);
+
+ if (!vec)
+ return NULL;
+
+ for (i = 0; i < a->length; i++) {
+ for (j = 0; j < b->length; j++) {
+ vec->coeff[i + j] += a->coeff[i] * b->coeff[j];
+ }
+ }
+
+ return vec;
+}
+
+static SwsVector *sws_sumVec(SwsVector *a, SwsVector *b)
+{
+ int length = FFMAX(a->length, b->length);
+ int i;
+ SwsVector *vec = sws_getConstVec(0.0, length);
+
+ if (!vec)
+ return NULL;
+
+ for (i = 0; i < a->length; i++)
+ vec->coeff[i + (length - 1) / 2 - (a->length - 1) / 2] += a->coeff[i];
+ for (i = 0; i < b->length; i++)
+ vec->coeff[i + (length - 1) / 2 - (b->length - 1) / 2] += b->coeff[i];
+
+ return vec;
+}
+
+static SwsVector *sws_diffVec(SwsVector *a, SwsVector *b)
+{
+ int length = FFMAX(a->length, b->length);
+ int i;
+ SwsVector *vec = sws_getConstVec(0.0, length);
+
+ if (!vec)
+ return NULL;
+
+ for (i = 0; i < a->length; i++)
+ vec->coeff[i + (length - 1) / 2 - (a->length - 1) / 2] += a->coeff[i];
+ for (i = 0; i < b->length; i++)
+ vec->coeff[i + (length - 1) / 2 - (b->length - 1) / 2] -= b->coeff[i];
+
+ return vec;
+}
+
+/* shift left, or right if "shift" is negative */
+static SwsVector *sws_getShiftedVec(SwsVector *a, int shift)
+{
+ int length = a->length + FFABS(shift) * 2;
+ int i;
+ SwsVector *vec = sws_getConstVec(0.0, length);
+
+ if (!vec)
+ return NULL;
+
+ for (i = 0; i < a->length; i++) {
+ vec->coeff[i + (length - 1) / 2 -
+ (a->length - 1) / 2 - shift] = a->coeff[i];
+ }
+
+ return vec;
+}
+
+void sws_shiftVec(SwsVector *a, int shift)
+{
+ SwsVector *shifted = sws_getShiftedVec(a, shift);
+ av_free(a->coeff);
+ a->coeff = shifted->coeff;
+ a->length = shifted->length;
+ av_free(shifted);
+}
+
+void sws_addVec(SwsVector *a, SwsVector *b)
+{
+ SwsVector *sum = sws_sumVec(a, b);
+ av_free(a->coeff);
+ a->coeff = sum->coeff;
+ a->length = sum->length;
+ av_free(sum);
+}
+
+void sws_subVec(SwsVector *a, SwsVector *b)
+{
+ SwsVector *diff = sws_diffVec(a, b);
+ av_free(a->coeff);
+ a->coeff = diff->coeff;
+ a->length = diff->length;
+ av_free(diff);
+}
+
+void sws_convVec(SwsVector *a, SwsVector *b)
+{
+ SwsVector *conv = sws_getConvVec(a, b);
+ av_free(a->coeff);
+ a->coeff = conv->coeff;
+ a->length = conv->length;
+ av_free(conv);
+}
+
+SwsVector *sws_cloneVec(SwsVector *a)
+{
+ int i;
+ SwsVector *vec = sws_allocVec(a->length);
+
+ if (!vec)
+ return NULL;
+
+ for (i = 0; i < a->length; i++)
+ vec->coeff[i] = a->coeff[i];
+
+ return vec;
+}
+
+void sws_printVec2(SwsVector *a, AVClass *log_ctx, int log_level)
+{
+ int i;
+ double max = 0;
+ double min = 0;
+ double range;
+
+ for (i = 0; i < a->length; i++)
+ if (a->coeff[i] > max)
+ max = a->coeff[i];
+
+ for (i = 0; i < a->length; i++)
+ if (a->coeff[i] < min)
+ min = a->coeff[i];
+
+ range = max - min;
+
+ for (i = 0; i < a->length; i++) {
+ int x = (int)((a->coeff[i] - min) * 60.0 / range + 0.5);
+ av_log(log_ctx, log_level, "%1.3f ", a->coeff[i]);
+ for (; x > 0; x--)
+ av_log(log_ctx, log_level, " ");
+ av_log(log_ctx, log_level, "|\n");
+ }
+}
+
+void sws_freeVec(SwsVector *a)
+{
+ if (!a)
+ return;
+ av_freep(&a->coeff);
+ a->length = 0;
+ av_free(a);
+}
+
+void sws_freeFilter(SwsFilter *filter)
+{
+ if (!filter)
+ return;
+
+ if (filter->lumH)
+ sws_freeVec(filter->lumH);
+ if (filter->lumV)
+ sws_freeVec(filter->lumV);
+ if (filter->chrH)
+ sws_freeVec(filter->chrH);
+ if (filter->chrV)
+ sws_freeVec(filter->chrV);
+ av_free(filter);
+}
+
+void sws_freeContext(SwsContext *c)
+{
+ int i;
+ if (!c)
+ return;
+
+ if (c->lumPixBuf) {
+ for (i = 0; i < c->vLumBufSize; i++)
+ av_freep(&c->lumPixBuf[i]);
+ av_freep(&c->lumPixBuf);
+ }
+
+ if (c->chrUPixBuf) {
+ for (i = 0; i < c->vChrBufSize; i++)
+ av_freep(&c->chrUPixBuf[i]);
+ av_freep(&c->chrUPixBuf);
+ av_freep(&c->chrVPixBuf);
+ }
+
+ if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+ for (i = 0; i < c->vLumBufSize; i++)
+ av_freep(&c->alpPixBuf[i]);
+ av_freep(&c->alpPixBuf);
+ }
+
+ for (i = 0; i < 4; i++)
+ av_freep(&c->dither_error[i]);
+
+ av_freep(&c->vLumFilter);
+ av_freep(&c->vChrFilter);
+ av_freep(&c->hLumFilter);
+ av_freep(&c->hChrFilter);
+#if HAVE_ALTIVEC
+ av_freep(&c->vYCoeffsBank);
+ av_freep(&c->vCCoeffsBank);
+#endif
+
+ av_freep(&c->vLumFilterPos);
+ av_freep(&c->vChrFilterPos);
+ av_freep(&c->hLumFilterPos);
+ av_freep(&c->hChrFilterPos);
+
+#if HAVE_MMX_INLINE
+#if USE_MMAP
+ if (c->lumMmxextFilterCode)
+ munmap(c->lumMmxextFilterCode, c->lumMmxextFilterCodeSize);
+ if (c->chrMmxextFilterCode)
+ munmap(c->chrMmxextFilterCode, c->chrMmxextFilterCodeSize);
+#elif HAVE_VIRTUALALLOC
+ if (c->lumMmxextFilterCode)
+ VirtualFree(c->lumMmxextFilterCode, 0, MEM_RELEASE);
+ if (c->chrMmxextFilterCode)
+ VirtualFree(c->chrMmxextFilterCode, 0, MEM_RELEASE);
+#else
+ av_free(c->lumMmxextFilterCode);
+ av_free(c->chrMmxextFilterCode);
+#endif
+ c->lumMmxextFilterCode = NULL;
+ c->chrMmxextFilterCode = NULL;
+#endif /* HAVE_MMX_INLINE */
+
+ av_freep(&c->yuvTable);
+ av_freep(&c->formatConvBuffer);
+
+ av_free(c);
+}
+
+struct SwsContext *sws_getCachedContext(struct SwsContext *context, int srcW,
+ int srcH, enum AVPixelFormat srcFormat,
+ int dstW, int dstH,
+ enum AVPixelFormat dstFormat, int flags,
+ SwsFilter *srcFilter,
+ SwsFilter *dstFilter,
+ const double *param)
+{
+ static const double default_param[2] = { SWS_PARAM_DEFAULT,
+ SWS_PARAM_DEFAULT };
+
+ if (!param)
+ param = default_param;
+
+ if (context &&
+ (context->srcW != srcW ||
+ context->srcH != srcH ||
+ context->srcFormat != srcFormat ||
+ context->dstW != dstW ||
+ context->dstH != dstH ||
+ context->dstFormat != dstFormat ||
+ context->flags != flags ||
+ context->param[0] != param[0] ||
+ context->param[1] != param[1])) {
+ sws_freeContext(context);
+ context = NULL;
+ }
+
+ if (!context) {
+ if (!(context = sws_alloc_context()))
+ return NULL;
+ context->srcW = srcW;
+ context->srcH = srcH;
+ context->srcRange = handle_jpeg(&srcFormat);
+ context->src0Alpha = handle_0alpha(&srcFormat);
+ context->srcFormat = srcFormat;
+ context->dstW = dstW;
+ context->dstH = dstH;
+ context->dstRange = handle_jpeg(&dstFormat);
+ context->dst0Alpha = handle_0alpha(&dstFormat);
+ context->dstFormat = dstFormat;
+ context->flags = flags;
+ context->param[0] = param[0];
+ context->param[1] = param[1];
+ sws_setColorspaceDetails(context, ff_yuv2rgb_coeffs[SWS_CS_DEFAULT],
+ context->srcRange,
+ ff_yuv2rgb_coeffs[SWS_CS_DEFAULT] /* FIXME*/,
+ context->dstRange, 0, 1 << 16, 1 << 16);
+ if (sws_init_context(context, srcFilter, dstFilter) < 0) {
+ sws_freeContext(context);
+ return NULL;
+ }
+ }
+ return context;
+}
diff --git a/ffmpeg/libswscale/version.h b/ffmpeg/libswscale/version.h
new file mode 100644
index 0000000..c430f2d
--- /dev/null
+++ b/ffmpeg/libswscale/version.h
@@ -0,0 +1,59 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_VERSION_H
+#define SWSCALE_VERSION_H
+
+/**
+ * @file
+ * swscale version macros
+ */
+
+#include "libavutil/avutil.h"
+
+#define LIBSWSCALE_VERSION_MAJOR 2
+#define LIBSWSCALE_VERSION_MINOR 2
+#define LIBSWSCALE_VERSION_MICRO 100
+
+#define LIBSWSCALE_VERSION_INT AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \
+ LIBSWSCALE_VERSION_MINOR, \
+ LIBSWSCALE_VERSION_MICRO)
+#define LIBSWSCALE_VERSION AV_VERSION(LIBSWSCALE_VERSION_MAJOR, \
+ LIBSWSCALE_VERSION_MINOR, \
+ LIBSWSCALE_VERSION_MICRO)
+#define LIBSWSCALE_BUILD LIBSWSCALE_VERSION_INT
+
+#define LIBSWSCALE_IDENT "SwS" AV_STRINGIFY(LIBSWSCALE_VERSION)
+
+/**
+ * FF_API_* defines may be placed below to indicate public API that will be
+ * dropped at a future version bump. The defines themselves are not part of
+ * the public API and may change, break or disappear at any time.
+ */
+
+#ifndef FF_API_SWS_GETCONTEXT
+#define FF_API_SWS_GETCONTEXT (LIBSWSCALE_VERSION_MAJOR < 3)
+#endif
+#ifndef FF_API_SWS_CPU_CAPS
+#define FF_API_SWS_CPU_CAPS (LIBSWSCALE_VERSION_MAJOR < 3)
+#endif
+#ifndef FF_API_SWS_FORMAT_NAME
+#define FF_API_SWS_FORMAT_NAME (LIBSWSCALE_VERSION_MAJOR < 3)
+#endif
+
+#endif /* SWSCALE_VERSION_H */
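A typical compile-time use of these macros is to gate code on the library version, for example:

#include <libswscale/version.h>

#if LIBSWSCALE_VERSION_INT >= AV_VERSION_INT(2, 2, 100)
/* rely on behaviour available from 2.2.100 onwards */
#endif
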
diff --git a/ffmpeg/libswscale/x86/Makefile b/ffmpeg/libswscale/x86/Makefile
new file mode 100644
index 0000000..7d219b4
--- /dev/null
+++ b/ffmpeg/libswscale/x86/Makefile
@@ -0,0 +1,11 @@
+$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
+
+OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
+
+MMX-OBJS += x86/rgb2rgb.o \
+ x86/swscale.o \
+ x86/yuv2rgb.o \
+
+YASM-OBJS += x86/input.o \
+ x86/output.o \
+ x86/scale.o \
diff --git a/ffmpeg/libswscale/x86/input.asm b/ffmpeg/libswscale/x86/input.asm
new file mode 100644
index 0000000..9d5a871
--- /dev/null
+++ b/ffmpeg/libswscale/x86/input.asm
@@ -0,0 +1,670 @@
+;******************************************************************************
+;* x86-optimized input routines: shuffle packed YUV formats into
+;* individual planes and convert RGB into YUV planes.
+;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+%define RY 0x20DE
+%define GY 0x4087
+%define BY 0x0C88
+%define RU 0xECFF
+%define GU 0xDAC8
+%define BU 0x3838
+%define RV 0x3838
+%define GV 0xD0E3
+%define BV 0xF6E4
+
+rgb_Yrnd: times 4 dd 0x80100 ; (16 << 15) + (1 << 8): +16 bias, rounding for the >> 9
+rgb_UVrnd: times 4 dd 0x400100 ; (128 << 15) + (1 << 8): +128 bias, rounding for the >> 9
+bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY
+bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY
+rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
+rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY
+bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU
+bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU
+rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU
+rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU
+bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV
+bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
+rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV
+rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV
+
+rgba_Ycoeff_rb: times 4 dw RY, BY
+rgba_Ycoeff_br: times 4 dw BY, RY
+rgba_Ycoeff_ga: times 4 dw GY, 0
+rgba_Ycoeff_ag: times 4 dw 0, GY
+rgba_Ucoeff_rb: times 4 dw RU, BU
+rgba_Ucoeff_br: times 4 dw BU, RU
+rgba_Ucoeff_ga: times 4 dw GU, 0
+rgba_Ucoeff_ag: times 4 dw 0, GU
+rgba_Vcoeff_rb: times 4 dw RV, BV
+rgba_Vcoeff_br: times 4 dw BV, RV
+rgba_Vcoeff_ga: times 4 dw GV, 0
+rgba_Vcoeff_ag: times 4 dw 0, GV
+
+shuf_rgb_12x4: db 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, \
+ 6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80
+shuf_rgb_3x56: db 2, 0x80, 3, 0x80, 4, 0x80, 5, 0x80, \
+ 8, 0x80, 9, 0x80, 10, 0x80, 11, 0x80
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; RGB to Y/UV.
+;
+; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
+; and
+; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
+; const uint8_t *unused, int w);
+;-----------------------------------------------------------------------------
+
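In plain C, the per-pixel arithmetic implemented below is roughly the following sketch for packed RGB24 input, using the coefficients and rounding constant from the data section above (the function name is illustrative, not the project's C fallback):

/* Output is the 15-bit fixed-point intermediate (about the 8-bit value * 64). */
static void rgb24_to_y_ref(int16_t *dst, const uint8_t *src, int w)
{
    for (int i = 0; i < w; i++) {
        int r = src[3 * i + 0], g = src[3 * i + 1], b = src[3 * i + 2];
        /* rgb_Yrnd = (16 << 15) + (1 << 8): +16 bias plus rounding for the >> 9 */
        dst[i] = (0x20DE * r + 0x4087 * g + 0x0C88 * b + (16 << 15) + (1 << 8)) >> 9;
    }
}
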
+; %1 = nr. of XMM registers
+; %2 = rgb or bgr
+%macro RGB24_TO_Y_FN 2-3
+cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, u3
+%if mmsize == 8
+ mova m5, [%2_Ycoeff_12x4]
+ mova m6, [%2_Ycoeff_3x56]
+%define coeff1 m5
+%define coeff2 m6
+%elif ARCH_X86_64
+ mova m8, [%2_Ycoeff_12x4]
+ mova m9, [%2_Ycoeff_3x56]
+%define coeff1 m8
+%define coeff2 m9
+%else ; x86-32 && mmsize == 16
+%define coeff1 [%2_Ycoeff_12x4]
+%define coeff2 [%2_Ycoeff_3x56]
+%endif ; x86-32/64 && mmsize == 8/16
+%if (ARCH_X86_64 || mmsize == 8) && %0 == 3
+ jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToY %+ SUFFIX).body
+%else ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
+.body:
+%if cpuflag(ssse3)
+ mova m7, [shuf_rgb_12x4]
+%define shuf_rgb1 m7
+%if ARCH_X86_64
+ mova m10, [shuf_rgb_3x56]
+%define shuf_rgb2 m10
+%else ; x86-32
+%define shuf_rgb2 [shuf_rgb_3x56]
+%endif ; x86-32/64
+%endif ; cpuflag(ssse3)
+%if ARCH_X86_64
+ movsxd wq, wd
+%endif
+ add wq, wq
+ add dstq, wq
+ neg wq
+%if notcpuflag(ssse3)
+ pxor m7, m7
+%endif ; !cpuflag(ssse3)
+ mova m4, [rgb_Yrnd]
+.loop:
+%if cpuflag(ssse3)
+ movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3]
+ movu m2, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7]
+ pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
+ pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
+ pshufb m3, m2, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
+ pshufb m2, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
+%else ; !cpuflag(ssse3)
+ movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 }
+ movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
+ movd m2, [srcq+6] ; (byte) { B2, G2, R2, B3 }
+ movd m3, [srcq+8] ; (byte) { R2, B3, G3, R3 }
+%if mmsize == 16 ; i.e. sse2
+ punpckldq m0, m2 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
+ punpckldq m1, m3 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
+ movd m2, [srcq+12] ; (byte) { B4, G4, R4, B5 }
+ movd m3, [srcq+14] ; (byte) { R4, B5, G5, R5 }
+ movd m5, [srcq+18] ; (byte) { B6, G6, R6, B7 }
+ movd m6, [srcq+20] ; (byte) { R6, B7, G7, R7 }
+ punpckldq m2, m5 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
+ punpckldq m3, m6 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
+%endif ; mmsize == 16
+ punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
+ punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
+ punpcklbw m2, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
+ punpcklbw m3, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
+%endif ; cpuflag(ssse3)
+ add srcq, 3 * mmsize / 2
+ pmaddwd m0, coeff1 ; (dword) { B0*BY + G0*GY, B1*BY, B2*BY + G2*GY, B3*BY }
+ pmaddwd m1, coeff2 ; (dword) { R0*RY, G1*GY + R1*RY, R2*RY, G3*GY + R3*RY }
+ pmaddwd m2, coeff1 ; (dword) { B4*BY + G4*GY, B5*BY, B6*BY + G6*GY, B7*BY }
+ pmaddwd m3, coeff2 ; (dword) { R4*RY, G5*GY + R5*RY, R6*RY, G7*GY + R7*RY }
+ paddd m0, m1 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[0-3]
+ paddd m2, m3 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7]
+ paddd m0, m4 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] }
+ paddd m2, m4 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] }
+ psrad m0, 9
+ psrad m2, 9
+ packssdw m0, m2 ; (word) { Y[0-7] }
+ mova [dstq+wq], m0
+ add wq, mmsize
+ jl .loop
+ REP_RET
+%endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
+%endmacro
+
+; %1 = nr. of XMM registers
+; %2 = rgb or bgr
+%macro RGB24_TO_UV_FN 2-3
+cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3
+%if ARCH_X86_64
+ mova m8, [%2_Ucoeff_12x4]
+ mova m9, [%2_Ucoeff_3x56]
+ mova m10, [%2_Vcoeff_12x4]
+ mova m11, [%2_Vcoeff_3x56]
+%define coeffU1 m8
+%define coeffU2 m9
+%define coeffV1 m10
+%define coeffV2 m11
+%else ; x86-32
+%define coeffU1 [%2_Ucoeff_12x4]
+%define coeffU2 [%2_Ucoeff_3x56]
+%define coeffV1 [%2_Vcoeff_12x4]
+%define coeffV2 [%2_Vcoeff_3x56]
+%endif ; x86-32/64
+%if ARCH_X86_64 && %0 == 3
+ jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToUV %+ SUFFIX).body
+%else ; ARCH_X86_64 && %0 == 3
+.body:
+%if cpuflag(ssse3)
+ mova m7, [shuf_rgb_12x4]
+%define shuf_rgb1 m7
+%if ARCH_X86_64
+ mova m12, [shuf_rgb_3x56]
+%define shuf_rgb2 m12
+%else ; x86-32
+%define shuf_rgb2 [shuf_rgb_3x56]
+%endif ; x86-32/64
+%endif ; cpuflag(ssse3)
+%if ARCH_X86_64
+ movsxd wq, dword r5m
+%else ; x86-32
+ mov wq, r5m
+%endif
+ add wq, wq
+ add dstUq, wq
+ add dstVq, wq
+ neg wq
+ mova m6, [rgb_UVrnd]
+%if notcpuflag(ssse3)
+ pxor m7, m7
+%endif
+.loop:
+%if cpuflag(ssse3)
+ movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3]
+ movu m4, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7]
+ pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
+ pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
+%else ; !cpuflag(ssse3)
+ movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 }
+ movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
+ movd m4, [srcq+6] ; (byte) { B2, G2, R2, B3 }
+ movd m5, [srcq+8] ; (byte) { R2, B3, G3, R3 }
+%if mmsize == 16
+ punpckldq m0, m4 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
+ punpckldq m1, m5 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
+ movd m4, [srcq+12] ; (byte) { B4, G4, R4, B5 }
+ movd m5, [srcq+14] ; (byte) { R4, B5, G5, R5 }
+%endif ; mmsize == 16
+ punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
+ punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
+%endif ; cpuflag(ssse3)
+ pmaddwd m2, m0, coeffV1 ; (dword) { B0*BV + G0*GV, B1*BV, B2*BV + G2*GV, B3*BV }
+ pmaddwd m3, m1, coeffV2 ; (dword) { R0*RV, G1*GV + R1*RV, R2*RV, G3*GV + R3*RV }
+ pmaddwd m0, coeffU1 ; (dword) { B0*BU + G0*GU, B1*BU, B2*BU + G2*GU, B3*BU }
+ pmaddwd m1, coeffU2 ; (dword) { R0*RU, G1*GU + R1*RU, R2*RU, G3*GU + R3*RU }
+ paddd m0, m1 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[0-3]
+ paddd m2, m3 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[0-3]
+%if cpuflag(ssse3)
+ pshufb m5, m4, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
+ pshufb m4, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
+%else ; !cpuflag(ssse3)
+%if mmsize == 16
+ movd m1, [srcq+18] ; (byte) { B6, G6, R6, B7 }
+ movd m3, [srcq+20] ; (byte) { R6, B7, G7, R7 }
+ punpckldq m4, m1 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
+ punpckldq m5, m3 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
+%endif ; mmsize == 16 && !cpuflag(ssse3)
+ punpcklbw m4, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
+ punpcklbw m5, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
+%endif ; cpuflag(ssse3)
+ add srcq, 3 * mmsize / 2
+ pmaddwd m1, m4, coeffU1 ; (dword) { B4*BU + G4*GU, B5*BU, B6*BU + G6*GU, B7*BU }
+ pmaddwd m3, m5, coeffU2 ; (dword) { R4*RU, G5*GU + R5*RU, R6*RU, G7*GU + R7*RU }
+ pmaddwd m4, coeffV1 ; (dword) { B4*BV + G4*GV, B5*BV, B6*BV + G6*GV, B7*BV }
+ pmaddwd m5, coeffV2 ; (dword) { R4*RV, G5*GV + R5*RV, R6*RV, G7*GV + R7*RV }
+ paddd m1, m3 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[4-7]
+ paddd m4, m5 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[4-7]
+ paddd m0, m6 ; += rgb_UVrnd, i.e. (dword) { U[0-3] }
+ paddd m2, m6 ; += rgb_UVrnd, i.e. (dword) { V[0-3] }
+ paddd m1, m6 ; += rgb_UVrnd, i.e. (dword) { U[4-7] }
+ paddd m4, m6 ; += rgb_UVrnd, i.e. (dword) { V[4-7] }
+ psrad m0, 9
+ psrad m2, 9
+ psrad m1, 9
+ psrad m4, 9
+ packssdw m0, m1 ; (word) { U[0-7] }
+ packssdw m2, m4 ; (word) { V[0-7] }
+%if mmsize == 8
+ mova [dstUq+wq], m0
+ mova [dstVq+wq], m2
+%else ; mmsize == 16
+ mova [dstUq+wq], m0
+ mova [dstVq+wq], m2
+%endif ; mmsize == 8/16
+ add wq, mmsize
+ jl .loop
+ REP_RET
+%endif ; ARCH_X86_64 && %0 == 3
+%endmacro
+
+; %1 = nr. of XMM registers for rgb-to-Y func
+; %2 = nr. of XMM registers for rgb-to-UV func
+%macro RGB24_FUNCS 2
+RGB24_TO_Y_FN %1, rgb
+RGB24_TO_Y_FN %1, bgr, rgb
+RGB24_TO_UV_FN %2, rgb
+RGB24_TO_UV_FN %2, bgr, rgb
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+RGB24_FUNCS 0, 0
+%endif
+
+INIT_XMM sse2
+RGB24_FUNCS 10, 12
+
+INIT_XMM ssse3
+RGB24_FUNCS 11, 13
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+RGB24_FUNCS 11, 13
+%endif
+
+; %1 = nr. of XMM registers
+; %2-5 = rgba, bgra, argb or abgr (in individual characters)
+%macro RGB32_TO_Y_FN 5-6
+cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, u3
+ mova m5, [rgba_Ycoeff_%2%4]
+ mova m6, [rgba_Ycoeff_%3%5]
+%if %0 == 6
+ jmp mangle(private_prefix %+ _ %+ %6 %+ ToY %+ SUFFIX).body
+%else ; %0 == 6
+.body:
+%if ARCH_X86_64
+ movsxd wq, wd
+%endif
+ lea srcq, [srcq+wq*4]
+ add wq, wq
+ add dstq, wq
+ neg wq
+ mova m4, [rgb_Yrnd]
+ pcmpeqb m7, m7
+ psrlw m7, 8 ; (word) { 0x00ff } x4
+.loop:
+ ; FIXME check alignment and use mova
+ movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
+ movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+ DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
+ pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3]
+ pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3]
+ pmaddwd m3, m5 ; (dword) { Bx*BY + Rx*RY }[4-7]
+ pmaddwd m2, m6 ; (dword) { Gx*GY }[4-7]
+ paddd m0, m4 ; += rgb_Yrnd
+ paddd m2, m4 ; += rgb_Yrnd
+ paddd m0, m1 ; (dword) { Y[0-3] }
+ paddd m2, m3 ; (dword) { Y[4-7] }
+ psrad m0, 9
+ psrad m2, 9
+ packssdw m0, m2 ; (word) { Y[0-7] }
+ mova [dstq+wq], m0
+ add wq, mmsize
+ jl .loop
+ REP_RET
+%endif ; %0 == 6
+%endmacro
+
+; %1 = nr. of XMM registers
+; %2-5 = rgba, bgra, argb or abgr (in individual characters)
+%macro RGB32_TO_UV_FN 5-6
+cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3
+%if ARCH_X86_64
+ mova m8, [rgba_Ucoeff_%2%4]
+ mova m9, [rgba_Ucoeff_%3%5]
+ mova m10, [rgba_Vcoeff_%2%4]
+ mova m11, [rgba_Vcoeff_%3%5]
+%define coeffU1 m8
+%define coeffU2 m9
+%define coeffV1 m10
+%define coeffV2 m11
+%else ; x86-32
+%define coeffU1 [rgba_Ucoeff_%2%4]
+%define coeffU2 [rgba_Ucoeff_%3%5]
+%define coeffV1 [rgba_Vcoeff_%2%4]
+%define coeffV2 [rgba_Vcoeff_%3%5]
+%endif ; x86-64/32
+%if ARCH_X86_64 && %0 == 6
+ jmp mangle(private_prefix %+ _ %+ %6 %+ ToUV %+ SUFFIX).body
+%else ; ARCH_X86_64 && %0 == 6
+.body:
+%if ARCH_X86_64
+ movsxd wq, dword r5m
+%else ; x86-32
+ mov wq, r5m
+%endif
+ add wq, wq
+ add dstUq, wq
+ add dstVq, wq
+ lea srcq, [srcq+wq*2]
+ neg wq
+ pcmpeqb m7, m7
+ psrlw m7, 8 ; (word) { 0x00ff } x4
+ mova m6, [rgb_UVrnd]
+.loop:
+ ; FIXME check alignment and use mova
+ movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
+ movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+ DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
+ pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3]
+ pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3]
+ pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3]
+ pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3]
+ paddd m3, m6 ; += rgb_UVrnd
+ paddd m1, m6 ; += rgb_UVrnd
+ paddd m2, m3 ; (dword) { V[0-3] }
+ paddd m0, m1 ; (dword) { U[0-3] }
+ pmaddwd m3, m5, coeffV1 ; (dword) { Bx*BV + Rx*RV }[4-7]
+ pmaddwd m1, m4, coeffV2 ; (dword) { Gx*GV }[4-7]
+ pmaddwd m5, coeffU1 ; (dword) { Bx*BU + Rx*RU }[4-7]
+ pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7]
+ paddd m3, m6 ; += rgb_UVrnd
+ paddd m5, m6 ; += rgb_UVrnd
+ psrad m0, 9
+ paddd m1, m3 ; (dword) { V[4-7] }
+ paddd m4, m5 ; (dword) { U[4-7] }
+ psrad m2, 9
+ psrad m4, 9
+ psrad m1, 9
+ packssdw m0, m4 ; (word) { U[0-7] }
+ packssdw m2, m1 ; (word) { V[0-7] }
+%if mmsize == 8
+ mova [dstUq+wq], m0
+ mova [dstVq+wq], m2
+%else ; mmsize == 16
+ mova [dstUq+wq], m0
+ mova [dstVq+wq], m2
+%endif ; mmsize == 8/16
+ add wq, mmsize
+ jl .loop
+ REP_RET
+%endif ; ARCH_X86_64 && %0 == 6
+%endmacro
+
+; %1 = nr. of XMM registers for rgb-to-Y func
+; %2 = nr. of XMM registers for rgb-to-UV func
+%macro RGB32_FUNCS 2
+RGB32_TO_Y_FN %1, r, g, b, a
+RGB32_TO_Y_FN %1, b, g, r, a, rgba
+RGB32_TO_Y_FN %1, a, r, g, b, rgba
+RGB32_TO_Y_FN %1, a, b, g, r, rgba
+
+RGB32_TO_UV_FN %2, r, g, b, a
+RGB32_TO_UV_FN %2, b, g, r, a, rgba
+RGB32_TO_UV_FN %2, a, r, g, b, rgba
+RGB32_TO_UV_FN %2, a, b, g, r, rgba
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+RGB32_FUNCS 0, 0
+%endif
+
+INIT_XMM sse2
+RGB32_FUNCS 8, 12
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+RGB32_FUNCS 8, 12
+%endif
+
+;-----------------------------------------------------------------------------
+; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
+;
+; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
+; and
+; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
+; const uint8_t *unused, int w);
+;-----------------------------------------------------------------------------
+
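The scalar equivalent of the luma shuffle below is a plain de-interleave, roughly as in this sketch (illustrative names, not the project's C fallback):

static void yuyv_to_y_ref(uint8_t *dst, const uint8_t *src, int w)
{
    for (int i = 0; i < w; i++)
        dst[i] = src[2 * i];        /* Y sits at even bytes in YUYV */
}

static void uyvy_to_y_ref(uint8_t *dst, const uint8_t *src, int w)
{
    for (int i = 0; i < w; i++)
        dst[i] = src[2 * i + 1];    /* Y sits at odd bytes in UYVY */
}
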
+; %1 = a (aligned) or u (unaligned)
+; %2 = yuyv or uyvy
+%macro LOOP_YUYV_TO_Y 2
+.loop_%1:
+ mov%1 m0, [srcq+wq*2] ; (byte) { Y0, U0, Y1, V0, ... }
+ mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
+%ifidn %2, yuyv
+ pand m0, m2 ; (word) { Y0, Y1, ..., Y7 }
+ pand m1, m2 ; (word) { Y8, Y9, ..., Y15 }
+%else ; uyvy
+ psrlw m0, 8 ; (word) { Y0, Y1, ..., Y7 }
+ psrlw m1, 8 ; (word) { Y8, Y9, ..., Y15 }
+%endif ; yuyv/uyvy
+ packuswb m0, m1 ; (byte) { Y0, ..., Y15 }
+ mova [dstq+wq], m0
+ add wq, mmsize
+ jl .loop_%1
+ REP_RET
+%endmacro
+
+; %1 = nr. of XMM registers
+; %2 = yuyv or uyvy
+; %3 = if specified, it means that the unaligned and aligned code in the loop
+; will be the same (i.e. YUYV+AVX), and thus we don't need to
+; split the loop into aligned and unaligned cases
+%macro YUYV_TO_Y_FN 2-3
+cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
+%if ARCH_X86_64
+ movsxd wq, wd
+%endif
+ add dstq, wq
+%if mmsize == 16
+ test srcq, 15
+%endif
+ lea srcq, [srcq+wq*2]
+%ifidn %2, yuyv
+ pcmpeqb m2, m2 ; (byte) { 0xff } x 16
+ psrlw m2, 8 ; (word) { 0x00ff } x 8
+%endif ; yuyv
+%if mmsize == 16
+ jnz .loop_u_start
+ neg wq
+ LOOP_YUYV_TO_Y a, %2
+.loop_u_start:
+ neg wq
+ LOOP_YUYV_TO_Y u, %2
+%else ; mmsize == 8
+ neg wq
+ LOOP_YUYV_TO_Y a, %2
+%endif ; mmsize == 8/16
+%endmacro
+
+; %1 = a (aligned) or u (unaligned)
+; %2 = yuyv or uyvy
+%macro LOOP_YUYV_TO_UV 2
+.loop_%1:
+%ifidn %2, yuyv
+ mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... }
+ mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
+ psrlw m0, 8 ; (word) { U0, V0, ..., U3, V3 }
+ psrlw m1, 8 ; (word) { U4, V4, ..., U7, V7 }
+%else ; uyvy
+%if cpuflag(avx)
+ vpand m0, m2, [srcq+wq*4] ; (word) { U0, V0, ..., U3, V3 }
+ vpand m1, m2, [srcq+wq*4+mmsize] ; (word) { U4, V4, ..., U7, V7 }
+%else
+ mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... }
+ mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
+ pand m0, m2 ; (word) { U0, V0, ..., U3, V3 }
+ pand m1, m2 ; (word) { U4, V4, ..., U7, V7 }
+%endif
+%endif ; yuyv/uyvy
+ packuswb m0, m1 ; (byte) { U0, V0, ..., U7, V7 }
+ pand m1, m0, m2 ; (word) { U0, U1, ..., U7 }
+ psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
+%if mmsize == 16
+ packuswb m1, m0 ; (byte) { U0, ... U7, V1, ... V7 }
+ movh [dstUq+wq], m1
+ movhps [dstVq+wq], m1
+%else ; mmsize == 8
+ packuswb m1, m1 ; (byte) { U0, ... U3 }
+ packuswb m0, m0 ; (byte) { V0, ... V3 }
+ movh [dstUq+wq], m1
+ movh [dstVq+wq], m0
+%endif ; mmsize == 8/16
+ add wq, mmsize / 2
+ jl .loop_%1
+ REP_RET
+%endmacro
+
+; %1 = nr. of XMM registers
+; %2 = yuyv or uyvy
+; %3 = if specified, it means that the unaligned and aligned code in the loop
+; will be the same (i.e. UYVY+AVX), and thus we don't need to
+; split the loop into aligned and unaligned cases
+%macro YUYV_TO_UV_FN 2-3
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
+%if ARCH_X86_64
+ movsxd wq, dword r5m
+%else ; x86-32
+ mov wq, r5m
+%endif
+ add dstUq, wq
+ add dstVq, wq
+%if mmsize == 16 && %0 == 2
+ test srcq, 15
+%endif
+ lea srcq, [srcq+wq*4]
+ pcmpeqb m2, m2 ; (byte) { 0xff } x 16
+ psrlw m2, 8 ; (word) { 0x00ff } x 8
+ ; NOTE: if uyvy+avx, u/a are identical
+%if mmsize == 16 && %0 == 2
+ jnz .loop_u_start
+ neg wq
+ LOOP_YUYV_TO_UV a, %2
+.loop_u_start:
+ neg wq
+ LOOP_YUYV_TO_UV u, %2
+%else ; mmsize == 8
+ neg wq
+ LOOP_YUYV_TO_UV a, %2
+%endif ; mmsize == 8/16
+%endmacro
+
+; %1 = a (aligned) or u (unaligned)
+; %2 = nv12 or nv21
+%macro LOOP_NVXX_TO_UV 2
+.loop_%1:
+ mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... }
+ mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
+ pand m2, m0, m5 ; (word) { U0, U1, ..., U7 }
+ pand m3, m1, m5 ; (word) { U8, U9, ..., U15 }
+ psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
+ psrlw m1, 8 ; (word) { V8, V9, ..., V15 }
+ packuswb m2, m3 ; (byte) { U0, ..., U15 }
+ packuswb m0, m1 ; (byte) { V0, ..., V15 }
+%ifidn %2, nv12
+ mova [dstUq+wq], m2
+ mova [dstVq+wq], m0
+%else ; nv21
+ mova [dstVq+wq], m2
+ mova [dstUq+wq], m0
+%endif ; nv12/21
+ add wq, mmsize
+ jl .loop_%1
+ REP_RET
+%endmacro
+
+; %1 = nr. of XMM registers
+; %2 = nv12 or nv21
+%macro NVXX_TO_UV_FN 2
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
+%if ARCH_X86_64
+ movsxd wq, dword r5m
+%else ; x86-32
+ mov wq, r5m
+%endif
+ add dstUq, wq
+ add dstVq, wq
+%if mmsize == 16
+ test srcq, 15
+%endif
+ lea srcq, [srcq+wq*2]
+ pcmpeqb m5, m5 ; (byte) { 0xff } x 16
+ psrlw m5, 8 ; (word) { 0x00ff } x 8
+%if mmsize == 16
+ jnz .loop_u_start
+ neg wq
+ LOOP_NVXX_TO_UV a, %2
+.loop_u_start:
+ neg wq
+ LOOP_NVXX_TO_UV u, %2
+%else ; mmsize == 8
+ neg wq
+ LOOP_NVXX_TO_UV a, %2
+%endif ; mmsize == 8/16
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+YUYV_TO_Y_FN 0, yuyv
+YUYV_TO_Y_FN 0, uyvy
+YUYV_TO_UV_FN 0, yuyv
+YUYV_TO_UV_FN 0, uyvy
+NVXX_TO_UV_FN 0, nv12
+NVXX_TO_UV_FN 0, nv21
+%endif
+
+INIT_XMM sse2
+YUYV_TO_Y_FN 3, yuyv
+YUYV_TO_Y_FN 2, uyvy
+YUYV_TO_UV_FN 3, yuyv
+YUYV_TO_UV_FN 3, uyvy
+NVXX_TO_UV_FN 5, nv12
+NVXX_TO_UV_FN 5, nv21
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
+; that's not faster in practice
+YUYV_TO_UV_FN 3, yuyv
+YUYV_TO_UV_FN 3, uyvy, 1
+NVXX_TO_UV_FN 5, nv12
+NVXX_TO_UV_FN 5, nv21
+%endif
diff --git a/ffmpeg/libswscale/x86/output.asm b/ffmpeg/libswscale/x86/output.asm
new file mode 100644
index 0000000..f9add35
--- /dev/null
+++ b/ffmpeg/libswscale/x86/output.asm
@@ -0,0 +1,413 @@
+;******************************************************************************
+;* x86-optimized vertical line scaling functions
+;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
+;* Kieran Kunhya <kieran@kunhya.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+minshort: times 8 dw 0x8000
+yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000
+yuv2yuvX_10_start: times 4 dd 0x10000
+yuv2yuvX_9_start: times 4 dd 0x20000
+yuv2yuvX_10_upper: times 8 dw 0x3ff
+yuv2yuvX_9_upper: times 8 dw 0x1ff
+pd_4: times 4 dd 4
+pd_4min0x40000:times 4 dd 4 - (0x40000)
+pw_16: times 8 dw 16
+pw_32: times 8 dw 32
+pw_512: times 8 dw 512
+pw_1024: times 8 dw 1024
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; vertical line scaling
+;
+; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
+; const uint8_t *dither, int offset)
+; and
+; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
+; const int16_t **src, uint8_t *dst, int dstW,
+; const uint8_t *dither, int offset)
+;
+; Scale one or $filterSize lines of source data to generate one line of output
+; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in
+; int32_t if $output_size is 16. $filter is 12 bits. $filterSize is a multiple
+; of 2. $offset is either 0 or 3. $dither holds 8 values.
+;-----------------------------------------------------------------------------
+
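For the 8-bit output case, the scalar operation performed by yuv2plane1 below is roughly the following sketch (illustrative name; dither holds 8 byte values and offset rotates where the pattern starts):

static void yuv2plane1_8_ref(const int16_t *src, uint8_t *dst, int dstW,
                             const uint8_t *dither, int offset)
{
    for (int i = 0; i < dstW; i++) {
        int val = (src[i] + dither[(i + offset) & 7]) >> 7;  /* 15-bit -> 8-bit */
        dst[i] = val < 0 ? 0 : val > 255 ? 255 : val;        /* clip to uint8 */
    }
}
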
+%macro yuv2planeX_fn 3
+
+%if ARCH_X86_32
+%define cntr_reg fltsizeq
+%define movsx mov
+%else
+%define cntr_reg r7
+%define movsx movsxd
+%endif
+
+cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
+%if %1 == 8 || %1 == 9 || %1 == 10
+ pxor m6, m6
+%endif ; %1 == 8/9/10
+
+%if %1 == 8
+%if ARCH_X86_32
+%assign pad 0x2c - (stack_offset & 15)
+ SUB rsp, pad
+%define m_dith m7
+%else ; x86-64
+%define m_dith m9
+%endif ; x86-32
+
+ ; create registers holding dither
+ movq m_dith, [ditherq] ; dither
+ test offsetd, offsetd
+ jz .no_rot
+%if mmsize == 16
+ punpcklqdq m_dith, m_dith
+%endif ; mmsize == 16
+ PALIGNR m_dith, m_dith, 3, m0
+.no_rot:
+%if mmsize == 16
+ punpcklbw m_dith, m6
+%if ARCH_X86_64
+ punpcklwd m8, m_dith, m6
+ pslld m8, 12
+%else ; x86-32
+ punpcklwd m5, m_dith, m6
+ pslld m5, 12
+%endif ; x86-32/64
+ punpckhwd m_dith, m6
+ pslld m_dith, 12
+%if ARCH_X86_32
+ mova [rsp+ 0], m5
+ mova [rsp+16], m_dith
+%endif
+%else ; mmsize == 8
+ punpcklbw m5, m_dith, m6
+ punpckhbw m_dith, m6
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpcklwd m3, m_dith, m6
+ punpckhwd m_dith, m6
+ pslld m4, 12
+ pslld m5, 12
+ pslld m3, 12
+ pslld m_dith, 12
+ mova [rsp+ 0], m4
+ mova [rsp+ 8], m5
+ mova [rsp+16], m3
+ mova [rsp+24], m_dith
+%endif ; mmsize == 8/16
+%endif ; %1 == 8
+
+ xor r5, r5
+
+.pixelloop:
+%assign %%i 0
+ ; the rep here is for the 8bit output mmx case, where dither covers
+ ; 8 pixels but we can only handle 2 pixels per register, and thus 4
+ ; pixels per iteration. In order to not have to keep track of where
+ ; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
+%if %1 == 8
+%assign %%repcnt 16/mmsize
+%else
+%assign %%repcnt 1
+%endif
+
+%rep %%repcnt
+
+%if %1 == 8
+%if ARCH_X86_32
+ mova m2, [rsp+mmsize*(0+%%i)]
+ mova m1, [rsp+mmsize*(1+%%i)]
+%else ; x86-64
+ mova m2, m8
+ mova m1, m_dith
+%endif ; x86-32/64
+%else ; %1 == 9/10/16
+ mova m1, [yuv2yuvX_%1_start]
+ mova m2, m1
+%endif ; %1 == 8/9/10/16
+ movsx cntr_reg, fltsizem
+.filterloop_ %+ %%i:
+ ; input pixels
+ mov r6, [srcq+gprsize*cntr_reg-2*gprsize]
+%if %1 == 16
+ mova m3, [r6+r5*4]
+ mova m5, [r6+r5*4+mmsize]
+%else ; %1 == 8/9/10
+ mova m3, [r6+r5*2]
+%endif ; %1 == 8/9/10/16
+ mov r6, [srcq+gprsize*cntr_reg-gprsize]
+%if %1 == 16
+ mova m4, [r6+r5*4]
+ mova m6, [r6+r5*4+mmsize]
+%else ; %1 == 8/9/10
+ mova m4, [r6+r5*2]
+%endif ; %1 == 8/9/10/16
+
+ ; coefficients
+ movd m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
+%if %1 == 16
+ pshuflw m7, m0, 0 ; coeff[0]
+ pshuflw m0, m0, 0x55 ; coeff[1]
+ pmovsxwd m7, m7 ; word -> dword
+ pmovsxwd m0, m0 ; word -> dword
+
+ pmulld m3, m7
+ pmulld m5, m7
+ pmulld m4, m0
+ pmulld m6, m0
+
+ paddd m2, m3
+ paddd m1, m5
+ paddd m2, m4
+ paddd m1, m6
+%else ; %1 == 10/9/8
+ punpcklwd m5, m3, m4
+ punpckhwd m3, m4
+ SPLATD m0
+
+ pmaddwd m5, m0
+ pmaddwd m3, m0
+
+ paddd m2, m5
+ paddd m1, m3
+%endif ; %1 == 8/9/10/16
+
+ sub cntr_reg, 2
+ jg .filterloop_ %+ %%i
+
+%if %1 == 16
+ psrad m2, 31 - %1
+ psrad m1, 31 - %1
+%else ; %1 == 10/9/8
+ psrad m2, 27 - %1
+ psrad m1, 27 - %1
+%endif ; %1 == 8/9/10/16
+
+%if %1 == 8
+ packssdw m2, m1
+ packuswb m2, m2
+ movh [dstq+r5*1], m2
+%else ; %1 == 9/10/16
+%if %1 == 16
+ packssdw m2, m1
+ paddw m2, [minshort]
+%else ; %1 == 9/10
+%if cpuflag(sse4)
+ packusdw m2, m1
+%else ; mmxext/sse2
+ packssdw m2, m1
+ pmaxsw m2, m6
+%endif ; mmxext/sse2/sse4/avx
+ pminsw m2, [yuv2yuvX_%1_upper]
+%endif ; %1 == 9/10/16
+ mova [dstq+r5*2], m2
+%endif ; %1 == 8/9/10/16
+
+ add r5, mmsize/2
+ sub wd, mmsize/2
+
+%assign %%i %%i+2
+%endrep
+ jg .pixelloop
+
+%if %1 == 8
+%if ARCH_X86_32
+ ADD rsp, pad
+ RET
+%else ; x86-64
+ REP_RET
+%endif ; x86-32/64
+%else ; %1 == 9/10/16
+ REP_RET
+%endif ; %1 == 8/9/10/16
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmxext
+yuv2planeX_fn 8, 0, 7
+yuv2planeX_fn 9, 0, 5
+yuv2planeX_fn 10, 0, 5
+%endif
+
+INIT_XMM sse2
+yuv2planeX_fn 8, 10, 7
+yuv2planeX_fn 9, 7, 5
+yuv2planeX_fn 10, 7, 5
+
+INIT_XMM sse4
+yuv2planeX_fn 8, 10, 7
+yuv2planeX_fn 9, 7, 5
+yuv2planeX_fn 10, 7, 5
+yuv2planeX_fn 16, 8, 5
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+yuv2planeX_fn 8, 10, 7
+yuv2planeX_fn 9, 7, 5
+yuv2planeX_fn 10, 7, 5
+%endif
+
+; %1=output-bpc, %2=alignment (u/a)
+%macro yuv2plane1_mainloop 2
+.loop_%2:
+%if %1 == 8
+ paddsw m0, m2, [srcq+wq*2+mmsize*0]
+ paddsw m1, m3, [srcq+wq*2+mmsize*1]
+ psraw m0, 7
+ psraw m1, 7
+ packuswb m0, m1
+ mov%2 [dstq+wq], m0
+%elif %1 == 16
+ paddd m0, m4, [srcq+wq*4+mmsize*0]
+ paddd m1, m4, [srcq+wq*4+mmsize*1]
+ paddd m2, m4, [srcq+wq*4+mmsize*2]
+ paddd m3, m4, [srcq+wq*4+mmsize*3]
+ psrad m0, 3
+ psrad m1, 3
+ psrad m2, 3
+ psrad m3, 3
+%if cpuflag(sse4) ; avx/sse4
+ packusdw m0, m1
+ packusdw m2, m3
+%else ; mmx/sse2
+ packssdw m0, m1
+ packssdw m2, m3
+ paddw m0, m5
+ paddw m2, m5
+%endif ; mmx/sse2/sse4/avx
+ mov%2 [dstq+wq*2+mmsize*0], m0
+ mov%2 [dstq+wq*2+mmsize*1], m2
+%else ; %1 == 9/10
+ paddsw m0, m2, [srcq+wq*2+mmsize*0]
+ paddsw m1, m2, [srcq+wq*2+mmsize*1]
+ psraw m0, 15 - %1
+ psraw m1, 15 - %1
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m3
+ pminsw m1, m3
+ mov%2 [dstq+wq*2+mmsize*0], m0
+ mov%2 [dstq+wq*2+mmsize*1], m1
+%endif
+ add wq, mmsize
+ jl .loop_%2
+%endmacro
+
+%macro yuv2plane1_fn 3
+cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
+ movsxdifnidn wq, wd
+ add wq, mmsize - 1
+ and wq, ~(mmsize - 1)
+%if %1 == 8
+ add dstq, wq
+%else ; %1 != 8
+ lea dstq, [dstq+wq*2]
+%endif ; %1 == 8
+%if %1 == 16
+ lea srcq, [srcq+wq*4]
+%else ; %1 != 16
+ lea srcq, [srcq+wq*2]
+%endif ; %1 == 16
+ neg wq
+
+%if %1 == 8
+ pxor m4, m4 ; zero
+
+ ; create registers holding dither
+ movq m3, [ditherq] ; dither
+ test offsetd, offsetd
+ jz .no_rot
+%if mmsize == 16
+ punpcklqdq m3, m3
+%endif ; mmsize == 16
+ PALIGNR m3, m3, 3, m2
+.no_rot:
+%if mmsize == 8
+ mova m2, m3
+ punpckhbw m3, m4 ; byte->word
+ punpcklbw m2, m4 ; byte->word
+%else
+ punpcklbw m3, m4
+ mova m2, m3
+%endif
+%elif %1 == 9
+ pxor m4, m4
+ mova m3, [pw_512]
+ mova m2, [pw_32]
+%elif %1 == 10
+ pxor m4, m4
+ mova m3, [pw_1024]
+ mova m2, [pw_16]
+%else ; %1 == 16
+%if cpuflag(sse4) ; sse4/avx
+ mova m4, [pd_4]
+%else ; mmx/sse2
+ mova m4, [pd_4min0x40000]
+ mova m5, [minshort]
+%endif ; mmx/sse2/sse4/avx
+%endif ; %1 == ..
+
+ ; actual pixel scaling
+%if mmsize == 8
+ yuv2plane1_mainloop %1, a
+%else ; mmsize == 16
+ test dstq, 15
+ jnz .unaligned
+ yuv2plane1_mainloop %1, a
+ REP_RET
+.unaligned:
+ yuv2plane1_mainloop %1, u
+%endif ; mmsize == 8/16
+ REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+yuv2plane1_fn 8, 0, 5
+yuv2plane1_fn 16, 0, 3
+
+INIT_MMX mmxext
+yuv2plane1_fn 9, 0, 3
+yuv2plane1_fn 10, 0, 3
+%endif
+
+INIT_XMM sse2
+yuv2plane1_fn 8, 5, 5
+yuv2plane1_fn 9, 5, 3
+yuv2plane1_fn 10, 5, 3
+yuv2plane1_fn 16, 6, 3
+
+INIT_XMM sse4
+yuv2plane1_fn 16, 5, 3
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+yuv2plane1_fn 8, 5, 5
+yuv2plane1_fn 9, 5, 3
+yuv2plane1_fn 10, 5, 3
+yuv2plane1_fn 16, 5, 3
+%endif
diff --git a/ffmpeg/libswscale/x86/rgb2rgb.c b/ffmpeg/libswscale/x86/rgb2rgb.c
new file mode 100644
index 0000000..1e20176
--- /dev/null
+++ b/ffmpeg/libswscale/x86/rgb2rgb.c
@@ -0,0 +1,149 @@
+/*
+ * software RGB to RGB converter
+ * plus software PAL8 to RGB converter
+ * software YUV to YUV converter
+ * software YUV to RGB converter
+ * Written by Nick Kurshev.
+ * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavutil/cpu.h"
+#include "libavutil/bswap.h"
+#include "libswscale/rgb2rgb.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+
+#if HAVE_INLINE_ASM
+
+DECLARE_ASM_CONST(8, uint64_t, mmx_ff) = 0x00000000000000FFULL;
+DECLARE_ASM_CONST(8, uint64_t, mmx_null) = 0x0000000000000000ULL;
+DECLARE_ASM_CONST(8, uint64_t, mmx_one) = 0xFFFFFFFFFFFFFFFFULL;
+DECLARE_ASM_CONST(8, uint64_t, mask32b) = 0x000000FF000000FFULL;
+DECLARE_ASM_CONST(8, uint64_t, mask32g) = 0x0000FF000000FF00ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask32r) = 0x00FF000000FF0000ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask32a) = 0xFF000000FF000000ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask32) = 0x00FFFFFF00FFFFFFULL;
+DECLARE_ASM_CONST(8, uint64_t, mask3216br) = 0x00F800F800F800F8ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask3216g) = 0x0000FC000000FC00ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask3215g) = 0x0000F8000000F800ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul3216) = 0x2000000420000004ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul3215) = 0x2000000820000008ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask24b) = 0x00FF0000FF0000FFULL;
+DECLARE_ASM_CONST(8, uint64_t, mask24g) = 0xFF0000FF0000FF00ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask24r) = 0x0000FF0000FF0000ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask24l) = 0x0000000000FFFFFFULL;
+DECLARE_ASM_CONST(8, uint64_t, mask24h) = 0x0000FFFFFF000000ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask24hh) = 0xffff000000000000ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask24hhh) = 0xffffffff00000000ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask24hhhh) = 0xffffffffffff0000ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask15b) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
+DECLARE_ASM_CONST(8, uint64_t, mask15rg) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
+DECLARE_ASM_CONST(8, uint64_t, mask15s) = 0xFFE0FFE0FFE0FFE0ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask15g) = 0x03E003E003E003E0ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask15r) = 0x7C007C007C007C00ULL;
+#define mask16b mask15b
+DECLARE_ASM_CONST(8, uint64_t, mask16g) = 0x07E007E007E007E0ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask16r) = 0xF800F800F800F800ULL;
+DECLARE_ASM_CONST(8, uint64_t, red_16mask) = 0x0000f8000000f800ULL;
+DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL;
+DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL;
+DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL;
+DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
+DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL;
+DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL;
+
+#define RGB2YUV_SHIFT 8
+#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
+#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
+#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
+#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
+#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
+#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
+#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
+#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
+#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
+
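These are the usual BT.601 limited-range coefficients in 8-bit fixed point; the luma conversion they encode works out to roughly the sketch below (illustrative helper; checked for white: (66 + 129 + 25) * 255 >> 8 == 219, plus 16 gives 235):

static inline uint8_t rgb_to_y601(uint8_t r, uint8_t g, uint8_t b)
{
    /* RY = 66, GY = 129, BY = 25 with RGB2YUV_SHIFT = 8 */
    return ((RY * r + GY * g + BY * b) >> RGB2YUV_SHIFT) + 16;
}
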
+// Note: We have C, MMX, MMXEXT and 3DNOW versions; there is no 3DNOW + MMXEXT one.
+
+#define COMPILE_TEMPLATE_MMXEXT 0
+#define COMPILE_TEMPLATE_AMD3DNOW 0
+#define COMPILE_TEMPLATE_SSE2 0
+
+//MMX versions
+#undef RENAME
+#define RENAME(a) a ## _MMX
+#include "rgb2rgb_template.c"
+
+// MMXEXT versions
+#undef RENAME
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 1
+#define RENAME(a) a ## _MMXEXT
+#include "rgb2rgb_template.c"
+
+//SSE2 versions
+#undef RENAME
+#undef COMPILE_TEMPLATE_SSE2
+#define COMPILE_TEMPLATE_SSE2 1
+#define RENAME(a) a ## _SSE2
+#include "rgb2rgb_template.c"
+
+//3DNOW versions
+#undef RENAME
+#undef COMPILE_TEMPLATE_MMXEXT
+#undef COMPILE_TEMPLATE_SSE2
+#undef COMPILE_TEMPLATE_AMD3DNOW
+#define COMPILE_TEMPLATE_MMXEXT 0
+#define COMPILE_TEMPLATE_SSE2 0
+#define COMPILE_TEMPLATE_AMD3DNOW 1
+#define RENAME(a) a ## _3DNOW
+#include "rgb2rgb_template.c"
+
+/*
+ RGB15->RGB16 original by Strepto/Astral
+ ported to gcc & bugfixed: A'rpi
+ MMXEXT, 3DNOW optimization by Nick Kurshev
+ 32-bit C version and the and&add trick by Michael Niedermayer
+*/
+
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void rgb2rgb_init_x86(void)
+{
+#if HAVE_INLINE_ASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (INLINE_MMX(cpu_flags))
+ rgb2rgb_init_MMX();
+ if (INLINE_AMD3DNOW(cpu_flags))
+ rgb2rgb_init_3DNOW();
+ if (INLINE_MMXEXT(cpu_flags))
+ rgb2rgb_init_MMXEXT();
+ if (INLINE_SSE2(cpu_flags))
+ rgb2rgb_init_SSE2();
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/ffmpeg/libswscale/x86/rgb2rgb_template.c b/ffmpeg/libswscale/x86/rgb2rgb_template.c
new file mode 100644
index 0000000..d802ab4
--- /dev/null
+++ b/ffmpeg/libswscale/x86/rgb2rgb_template.c
@@ -0,0 +1,2498 @@
+/*
+ * software RGB to RGB converter
+ * plus software PAL8 to RGB converter
+ * software YUV to YUV converter
+ * software YUV to RGB converter
+ * Written by Nick Kurshev.
+ * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
+ * lot of big-endian byte order fixes by Alex Beregszaszi
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+
+#undef PREFETCH
+#undef MOVNTQ
+#undef EMMS
+#undef SFENCE
+#undef PAVGB
+
+#if COMPILE_TEMPLATE_AMD3DNOW
+#define PREFETCH "prefetch"
+#define PAVGB "pavgusb"
+#elif COMPILE_TEMPLATE_MMXEXT
+#define PREFETCH "prefetchnta"
+#define PAVGB "pavgb"
+#else
+#define PREFETCH " # nop"
+#endif
+
+#if COMPILE_TEMPLATE_AMD3DNOW
+/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
+#define EMMS "femms"
+#else
+#define EMMS "emms"
+#endif
+
+#if COMPILE_TEMPLATE_MMXEXT
+#define MOVNTQ "movntq"
+#define SFENCE "sfence"
+#else
+#define MOVNTQ "movq"
+#define SFENCE " # nop"
+#endif
+
+#if !COMPILE_TEMPLATE_SSE2
+
+#if !COMPILE_TEMPLATE_AMD3DNOW
+
+static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ uint8_t *dest = dst;
+ const uint8_t *s = src;
+ const uint8_t *end;
+ const uint8_t *mm_end;
+ end = s + src_size;
+ __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
+ mm_end = end - 23;
+ __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
+ while (s < mm_end) {
+ __asm__ volatile(
+ PREFETCH" 32(%1) \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "punpckldq 3(%1), %%mm0 \n\t"
+ "movd 6(%1), %%mm1 \n\t"
+ "punpckldq 9(%1), %%mm1 \n\t"
+ "movd 12(%1), %%mm2 \n\t"
+ "punpckldq 15(%1), %%mm2 \n\t"
+ "movd 18(%1), %%mm3 \n\t"
+ "punpckldq 21(%1), %%mm3 \n\t"
+ "por %%mm7, %%mm0 \n\t"
+ "por %%mm7, %%mm1 \n\t"
+ "por %%mm7, %%mm2 \n\t"
+ "por %%mm7, %%mm3 \n\t"
+ MOVNTQ" %%mm0, (%0) \n\t"
+ MOVNTQ" %%mm1, 8(%0) \n\t"
+ MOVNTQ" %%mm2, 16(%0) \n\t"
+ MOVNTQ" %%mm3, 24(%0)"
+ :: "r"(dest), "r"(s)
+ :"memory");
+ dest += 32;
+ s += 24;
+ }
+ __asm__ volatile(SFENCE:::"memory");
+ __asm__ volatile(EMMS:::"memory");
+ while (s < end) {
+ *dest++ = *s++;
+ *dest++ = *s++;
+ *dest++ = *s++;
+ *dest++ = 255;
+ }
+}
+
+#define STORE_BGR24_MMX \
+ "psrlq $8, %%mm2 \n\t" \
+ "psrlq $8, %%mm3 \n\t" \
+ "psrlq $8, %%mm6 \n\t" \
+ "psrlq $8, %%mm7 \n\t" \
+ "pand "MANGLE(mask24l)", %%mm0\n\t" \
+ "pand "MANGLE(mask24l)", %%mm1\n\t" \
+ "pand "MANGLE(mask24l)", %%mm4\n\t" \
+ "pand "MANGLE(mask24l)", %%mm5\n\t" \
+ "pand "MANGLE(mask24h)", %%mm2\n\t" \
+ "pand "MANGLE(mask24h)", %%mm3\n\t" \
+ "pand "MANGLE(mask24h)", %%mm6\n\t" \
+ "pand "MANGLE(mask24h)", %%mm7\n\t" \
+ "por %%mm2, %%mm0 \n\t" \
+ "por %%mm3, %%mm1 \n\t" \
+ "por %%mm6, %%mm4 \n\t" \
+ "por %%mm7, %%mm5 \n\t" \
+ \
+ "movq %%mm1, %%mm2 \n\t" \
+ "movq %%mm4, %%mm3 \n\t" \
+ "psllq $48, %%mm2 \n\t" \
+ "psllq $32, %%mm3 \n\t" \
+ "por %%mm2, %%mm0 \n\t" \
+ "psrlq $16, %%mm1 \n\t" \
+ "psrlq $32, %%mm4 \n\t" \
+ "psllq $16, %%mm5 \n\t" \
+ "por %%mm3, %%mm1 \n\t" \
+ "por %%mm5, %%mm4 \n\t" \
+ \
+ MOVNTQ" %%mm0, (%0) \n\t" \
+ MOVNTQ" %%mm1, 8(%0) \n\t" \
+ MOVNTQ" %%mm4, 16(%0)"
+
+
+static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ uint8_t *dest = dst;
+ const uint8_t *s = src;
+ const uint8_t *end;
+ const uint8_t *mm_end;
+ end = s + src_size;
+ __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
+ mm_end = end - 31;
+ while (s < mm_end) {
+ __asm__ volatile(
+ PREFETCH" 32(%1) \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ "movq 16(%1), %%mm4 \n\t"
+ "movq 24(%1), %%mm5 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "movq %%mm1, %%mm3 \n\t"
+ "movq %%mm4, %%mm6 \n\t"
+ "movq %%mm5, %%mm7 \n\t"
+ STORE_BGR24_MMX
+ :: "r"(dest), "r"(s)
+ :"memory");
+ dest += 24;
+ s += 32;
+ }
+ __asm__ volatile(SFENCE:::"memory");
+ __asm__ volatile(EMMS:::"memory");
+ while (s < end) {
+ *dest++ = *s++;
+ *dest++ = *s++;
+ *dest++ = *s++;
+ s++;
+ }
+}
+
+/*
+ original by Strepto/Astral
+ ported to gcc & bugfixed: A'rpi
+ MMXEXT, 3DNOW optimization by Nick Kurshev
+ 32-bit C version and the and&add trick by Michael Niedermayer
+*/
+static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ register const uint8_t* s=src;
+ register uint8_t* d=dst;
+ register const uint8_t *end;
+ const uint8_t *mm_end;
+ end = s + src_size;
+ __asm__ volatile(PREFETCH" %0"::"m"(*s));
+ __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
+ mm_end = end - 15;
+ while (s<mm_end) {
+ __asm__ volatile(
+ PREFETCH" 32(%1) \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "pand %%mm4, %%mm0 \n\t"
+ "pand %%mm4, %%mm2 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm3, %%mm2 \n\t"
+ MOVNTQ" %%mm0, (%0) \n\t"
+ MOVNTQ" %%mm2, 8(%0)"
+ :: "r"(d), "r"(s)
+ );
+ d+=16;
+ s+=16;
+ }
+ __asm__ volatile(SFENCE:::"memory");
+ __asm__ volatile(EMMS:::"memory");
+ mm_end = end - 3;
+ while (s < mm_end) {
+ register unsigned x= *((const uint32_t *)s);
+ *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
+ d+=4;
+ s+=4;
+ }
+ if (s < end) {
+ register unsigned short x= *((const uint16_t *)s);
+ *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
+ }
+}
+
+static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ register const uint8_t* s=src;
+ register uint8_t* d=dst;
+ register const uint8_t *end;
+ const uint8_t *mm_end;
+ end = s + src_size;
+ __asm__ volatile(PREFETCH" %0"::"m"(*s));
+ __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
+ __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
+ mm_end = end - 15;
+ while (s<mm_end) {
+ __asm__ volatile(
+ PREFETCH" 32(%1) \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "psrlq $1, %%mm0 \n\t"
+ "psrlq $1, %%mm2 \n\t"
+ "pand %%mm7, %%mm0 \n\t"
+ "pand %%mm7, %%mm2 \n\t"
+ "pand %%mm6, %%mm1 \n\t"
+ "pand %%mm6, %%mm3 \n\t"
+ "por %%mm1, %%mm0 \n\t"
+ "por %%mm3, %%mm2 \n\t"
+ MOVNTQ" %%mm0, (%0) \n\t"
+ MOVNTQ" %%mm2, 8(%0)"
+ :: "r"(d), "r"(s)
+ );
+ d+=16;
+ s+=16;
+ }
+ __asm__ volatile(SFENCE:::"memory");
+ __asm__ volatile(EMMS:::"memory");
+ mm_end = end - 3;
+ while (s < mm_end) {
+ register uint32_t x= *((const uint32_t*)s);
+ *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
+ s+=4;
+ d+=4;
+ }
+ if (s < end) {
+ register uint16_t x= *((const uint16_t*)s);
+ *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
+ }
+}
+
+static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ const uint8_t *s = src;
+ const uint8_t *end;
+ const uint8_t *mm_end;
+ uint16_t *d = (uint16_t *)dst;
+ end = s + src_size;
+ mm_end = end - 15;
+ __asm__ volatile(
+ "movq %3, %%mm5 \n\t"
+ "movq %4, %%mm6 \n\t"
+ "movq %5, %%mm7 \n\t"
+ "jmp 2f \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ PREFETCH" 32(%1) \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd 4(%1), %%mm3 \n\t"
+ "punpckldq 8(%1), %%mm0 \n\t"
+ "punpckldq 12(%1), %%mm3 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm3, %%mm4 \n\t"
+ "pand %%mm6, %%mm0 \n\t"
+ "pand %%mm6, %%mm3 \n\t"
+ "pmaddwd %%mm7, %%mm0 \n\t"
+ "pmaddwd %%mm7, %%mm3 \n\t"
+ "pand %%mm5, %%mm1 \n\t"
+ "pand %%mm5, %%mm4 \n\t"
+ "por %%mm1, %%mm0 \n\t"
+ "por %%mm4, %%mm3 \n\t"
+ "psrld $5, %%mm0 \n\t"
+ "pslld $11, %%mm3 \n\t"
+ "por %%mm3, %%mm0 \n\t"
+ MOVNTQ" %%mm0, (%0) \n\t"
+ "add $16, %1 \n\t"
+ "add $8, %0 \n\t"
+ "2: \n\t"
+ "cmp %2, %1 \n\t"
+ " jb 1b \n\t"
+ : "+r" (d), "+r"(s)
+ : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
+ );
+ __asm__ volatile(SFENCE:::"memory");
+ __asm__ volatile(EMMS:::"memory");
+ while (s < end) {
+ register int rgb = *(const uint32_t*)s; s += 4;
+ *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
+ }
+}
+
+static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ const uint8_t *s = src;
+ const uint8_t *end;
+ const uint8_t *mm_end;
+ uint16_t *d = (uint16_t *)dst;
+ end = s + src_size;
+ __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
+ __asm__ volatile(
+ "movq %0, %%mm7 \n\t"
+ "movq %1, %%mm6 \n\t"
+ ::"m"(red_16mask),"m"(green_16mask));
+ mm_end = end - 15;
+ while (s < mm_end) {
+ __asm__ volatile(
+ PREFETCH" 32(%1) \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd 4(%1), %%mm3 \n\t"
+ "punpckldq 8(%1), %%mm0 \n\t"
+ "punpckldq 12(%1), %%mm3 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "movq %%mm3, %%mm4 \n\t"
+ "movq %%mm3, %%mm5 \n\t"
+ "psllq $8, %%mm0 \n\t"
+ "psllq $8, %%mm3 \n\t"
+ "pand %%mm7, %%mm0 \n\t"
+ "pand %%mm7, %%mm3 \n\t"
+ "psrlq $5, %%mm1 \n\t"
+ "psrlq $5, %%mm4 \n\t"
+ "pand %%mm6, %%mm1 \n\t"
+ "pand %%mm6, %%mm4 \n\t"
+ "psrlq $19, %%mm2 \n\t"
+ "psrlq $19, %%mm5 \n\t"
+ "pand %2, %%mm2 \n\t"
+ "pand %2, %%mm5 \n\t"
+ "por %%mm1, %%mm0 \n\t"
+ "por %%mm4, %%mm3 \n\t"
+ "por %%mm2, %%mm0 \n\t"
+ "por %%mm5, %%mm3 \n\t"
+ "psllq $16, %%mm3 \n\t"
+ "por %%mm3, %%mm0 \n\t"
+ MOVNTQ" %%mm0, (%0) \n\t"
+ :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
+ d += 4;
+ s += 16;
+ }
+ __asm__ volatile(SFENCE:::"memory");
+ __asm__ volatile(EMMS:::"memory");
+ while (s < end) {
+ register int rgb = *(const uint32_t*)s; s += 4;
+ *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
+ }
+}
+
+static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ const uint8_t *s = src;
+ const uint8_t *end;
+ const uint8_t *mm_end;
+ uint16_t *d = (uint16_t *)dst;
+ end = s + src_size;
+ mm_end = end - 15;
+ __asm__ volatile(
+ "movq %3, %%mm5 \n\t"
+ "movq %4, %%mm6 \n\t"
+ "movq %5, %%mm7 \n\t"
+ "jmp 2f \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ PREFETCH" 32(%1) \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd 4(%1), %%mm3 \n\t"
+ "punpckldq 8(%1), %%mm0 \n\t"
+ "punpckldq 12(%1), %%mm3 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm3, %%mm4 \n\t"
+ "pand %%mm6, %%mm0 \n\t"
+ "pand %%mm6, %%mm3 \n\t"
+ "pmaddwd %%mm7, %%mm0 \n\t"
+ "pmaddwd %%mm7, %%mm3 \n\t"
+ "pand %%mm5, %%mm1 \n\t"
+ "pand %%mm5, %%mm4 \n\t"
+ "por %%mm1, %%mm0 \n\t"
+ "por %%mm4, %%mm3 \n\t"
+ "psrld $6, %%mm0 \n\t"
+ "pslld $10, %%mm3 \n\t"
+ "por %%mm3, %%mm0 \n\t"
+ MOVNTQ" %%mm0, (%0) \n\t"
+ "add $16, %1 \n\t"
+ "add $8, %0 \n\t"
+ "2: \n\t"
+ "cmp %2, %1 \n\t"
+ " jb 1b \n\t"
+ : "+r" (d), "+r"(s)
+ : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
+ );
+ __asm__ volatile(SFENCE:::"memory");
+ __asm__ volatile(EMMS:::"memory");
+ while (s < end) {
+ register int rgb = *(const uint32_t*)s; s += 4;
+ *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
+ }
+}
+
+static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ const uint8_t *s = src;
+ const uint8_t *end;
+ const uint8_t *mm_end;
+ uint16_t *d = (uint16_t *)dst;
+ end = s + src_size;
+ __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
+ __asm__ volatile(
+ "movq %0, %%mm7 \n\t"
+ "movq %1, %%mm6 \n\t"
+ ::"m"(red_15mask),"m"(green_15mask));
+ mm_end = end - 15;
+ while (s < mm_end) {
+ __asm__ volatile(
+ PREFETCH" 32(%1) \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd 4(%1), %%mm3 \n\t"
+ "punpckldq 8(%1), %%mm0 \n\t"
+ "punpckldq 12(%1), %%mm3 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "movq %%mm3, %%mm4 \n\t"
+ "movq %%mm3, %%mm5 \n\t"
+ "psllq $7, %%mm0 \n\t"
+ "psllq $7, %%mm3 \n\t"
+ "pand %%mm7, %%mm0 \n\t"
+ "pand %%mm7, %%mm3 \n\t"
+ "psrlq $6, %%mm1 \n\t"
+ "psrlq $6, %%mm4 \n\t"
+ "pand %%mm6, %%mm1 \n\t"
+ "pand %%mm6, %%mm4 \n\t"
+ "psrlq $19, %%mm2 \n\t"
+ "psrlq $19, %%mm5 \n\t"
+ "pand %2, %%mm2 \n\t"
+ "pand %2, %%mm5 \n\t"
+ "por %%mm1, %%mm0 \n\t"
+ "por %%mm4, %%mm3 \n\t"
+ "por %%mm2, %%mm0 \n\t"
+ "por %%mm5, %%mm3 \n\t"
+ "psllq $16, %%mm3 \n\t"
+ "por %%mm3, %%mm0 \n\t"
+ MOVNTQ" %%mm0, (%0) \n\t"
+ ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
+ d += 4;
+ s += 16;
+ }
+ __asm__ volatile(SFENCE:::"memory");
+ __asm__ volatile(EMMS:::"memory");
+ while (s < end) {
+ register int rgb = *(const uint32_t*)s; s += 4;
+ *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
+ }
+}
+
+static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ const uint8_t *s = src;
+ const uint8_t *end;
+ const uint8_t *mm_end;
+ uint16_t *d = (uint16_t *)dst;
+ end = s + src_size;
+ __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
+ __asm__ volatile(
+ "movq %0, %%mm7 \n\t"
+ "movq %1, %%mm6 \n\t"
+ ::"m"(red_16mask),"m"(green_16mask));
+ mm_end = end - 11;
+ while (s < mm_end) {
+ __asm__ volatile(
+ PREFETCH" 32(%1) \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd 3(%1), %%mm3 \n\t"
+ "punpckldq 6(%1), %%mm0 \n\t"
+ "punpckldq 9(%1), %%mm3 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "movq %%mm3, %%mm4 \n\t"
+ "movq %%mm3, %%mm5 \n\t"
+ "psrlq $3, %%mm0 \n\t"
+ "psrlq $3, %%mm3 \n\t"
+ "pand %2, %%mm0 \n\t"
+ "pand %2, %%mm3 \n\t"
+ "psrlq $5, %%mm1 \n\t"
+ "psrlq $5, %%mm4 \n\t"
+ "pand %%mm6, %%mm1 \n\t"
+ "pand %%mm6, %%mm4 \n\t"
+ "psrlq $8, %%mm2 \n\t"
+ "psrlq $8, %%mm5 \n\t"
+ "pand %%mm7, %%mm2 \n\t"
+ "pand %%mm7, %%mm5 \n\t"
+ "por %%mm1, %%mm0 \n\t"
+ "por %%mm4, %%mm3 \n\t"
+ "por %%mm2, %%mm0 \n\t"
+ "por %%mm5, %%mm3 \n\t"
+ "psllq $16, %%mm3 \n\t"
+ "por %%mm3, %%mm0 \n\t"
+ MOVNTQ" %%mm0, (%0) \n\t"
+ ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
+ d += 4;
+ s += 12;
+ }
+ __asm__ volatile(SFENCE:::"memory");
+ __asm__ volatile(EMMS:::"memory");
+ while (s < end) {
+ const int b = *s++;
+ const int g = *s++;
+ const int r = *s++;
+ *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
+ }
+}
+
+static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ const uint8_t *s = src;
+ const uint8_t *end;
+ const uint8_t *mm_end;
+ uint16_t *d = (uint16_t *)dst;
+ end = s + src_size;
+ __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
+ __asm__ volatile(
+ "movq %0, %%mm7 \n\t"
+ "movq %1, %%mm6 \n\t"
+ ::"m"(red_16mask),"m"(green_16mask));
+ mm_end = end - 15;
+ while (s < mm_end) {
+ __asm__ volatile(
+ PREFETCH" 32(%1) \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd 3(%1), %%mm3 \n\t"
+ "punpckldq 6(%1), %%mm0 \n\t"
+ "punpckldq 9(%1), %%mm3 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "movq %%mm3, %%mm4 \n\t"
+ "movq %%mm3, %%mm5 \n\t"
+ "psllq $8, %%mm0 \n\t"
+ "psllq $8, %%mm3 \n\t"
+ "pand %%mm7, %%mm0 \n\t"
+ "pand %%mm7, %%mm3 \n\t"
+ "psrlq $5, %%mm1 \n\t"
+ "psrlq $5, %%mm4 \n\t"
+ "pand %%mm6, %%mm1 \n\t"
+ "pand %%mm6, %%mm4 \n\t"
+ "psrlq $19, %%mm2 \n\t"
+ "psrlq $19, %%mm5 \n\t"
+ "pand %2, %%mm2 \n\t"
+ "pand %2, %%mm5 \n\t"
+ "por %%mm1, %%mm0 \n\t"
+ "por %%mm4, %%mm3 \n\t"
+ "por %%mm2, %%mm0 \n\t"
+ "por %%mm5, %%mm3 \n\t"
+ "psllq $16, %%mm3 \n\t"
+ "por %%mm3, %%mm0 \n\t"
+ MOVNTQ" %%mm0, (%0) \n\t"
+ ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
+ d += 4;
+ s += 12;
+ }
+ __asm__ volatile(SFENCE:::"memory");
+ __asm__ volatile(EMMS:::"memory");
+ while (s < end) {
+ const int r = *s++;
+ const int g = *s++;
+ const int b = *s++;
+ *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
+ }
+}
+
+static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ const uint8_t *s = src;
+ const uint8_t *end;
+ const uint8_t *mm_end;
+ uint16_t *d = (uint16_t *)dst;
+ end = s + src_size;
+ __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
+ __asm__ volatile(
+ "movq %0, %%mm7 \n\t"
+ "movq %1, %%mm6 \n\t"
+ ::"m"(red_15mask),"m"(green_15mask));
+ mm_end = end - 11;
+ while (s < mm_end) {
+ __asm__ volatile(
+ PREFETCH" 32(%1) \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd 3(%1), %%mm3 \n\t"
+ "punpckldq 6(%1), %%mm0 \n\t"
+ "punpckldq 9(%1), %%mm3 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "movq %%mm3, %%mm4 \n\t"
+ "movq %%mm3, %%mm5 \n\t"
+ "psrlq $3, %%mm0 \n\t"
+ "psrlq $3, %%mm3 \n\t"
+ "pand %2, %%mm0 \n\t"
+ "pand %2, %%mm3 \n\t"
+ "psrlq $6, %%mm1 \n\t"
+ "psrlq $6, %%mm4 \n\t"
+ "pand %%mm6, %%mm1 \n\t"
+ "pand %%mm6, %%mm4 \n\t"
+ "psrlq $9, %%mm2 \n\t"
+ "psrlq $9, %%mm5 \n\t"
+ "pand %%mm7, %%mm2 \n\t"
+ "pand %%mm7, %%mm5 \n\t"
+ "por %%mm1, %%mm0 \n\t"
+ "por %%mm4, %%mm3 \n\t"
+ "por %%mm2, %%mm0 \n\t"
+ "por %%mm5, %%mm3 \n\t"
+ "psllq $16, %%mm3 \n\t"
+ "por %%mm3, %%mm0 \n\t"
+ MOVNTQ" %%mm0, (%0) \n\t"
+ ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
+ d += 4;
+ s += 12;
+ }
+ __asm__ volatile(SFENCE:::"memory");
+ __asm__ volatile(EMMS:::"memory");
+ while (s < end) {
+ const int b = *s++;
+ const int g = *s++;
+ const int r = *s++;
+ *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
+ }
+}
+
+static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ const uint8_t *s = src;
+ const uint8_t *end;
+ const uint8_t *mm_end;
+ uint16_t *d = (uint16_t *)dst;
+ end = s + src_size;
+ __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
+ __asm__ volatile(
+ "movq %0, %%mm7 \n\t"
+ "movq %1, %%mm6 \n\t"
+ ::"m"(red_15mask),"m"(green_15mask));
+ mm_end = end - 15;
+ while (s < mm_end) {
+ __asm__ volatile(
+ PREFETCH" 32(%1) \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd 3(%1), %%mm3 \n\t"
+ "punpckldq 6(%1), %%mm0 \n\t"
+ "punpckldq 9(%1), %%mm3 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "movq %%mm3, %%mm4 \n\t"
+ "movq %%mm3, %%mm5 \n\t"
+ "psllq $7, %%mm0 \n\t"
+ "psllq $7, %%mm3 \n\t"
+ "pand %%mm7, %%mm0 \n\t"
+ "pand %%mm7, %%mm3 \n\t"
+ "psrlq $6, %%mm1 \n\t"
+ "psrlq $6, %%mm4 \n\t"
+ "pand %%mm6, %%mm1 \n\t"
+ "pand %%mm6, %%mm4 \n\t"
+ "psrlq $19, %%mm2 \n\t"
+ "psrlq $19, %%mm5 \n\t"
+ "pand %2, %%mm2 \n\t"
+ "pand %2, %%mm5 \n\t"
+ "por %%mm1, %%mm0 \n\t"
+ "por %%mm4, %%mm3 \n\t"
+ "por %%mm2, %%mm0 \n\t"
+ "por %%mm5, %%mm3 \n\t"
+ "psllq $16, %%mm3 \n\t"
+ "por %%mm3, %%mm0 \n\t"
+ MOVNTQ" %%mm0, (%0) \n\t"
+ ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
+ d += 4;
+ s += 12;
+ }
+ __asm__ volatile(SFENCE:::"memory");
+ __asm__ volatile(EMMS:::"memory");
+ while (s < end) {
+ const int r = *s++;
+ const int g = *s++;
+ const int b = *s++;
+ *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
+ }
+}
+
+static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ const uint16_t *end;
+ const uint16_t *mm_end;
+ uint8_t *d = dst;
+ const uint16_t *s = (const uint16_t*)src;
+ end = s + src_size/2;
+ __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
+ mm_end = end - 7;
+ while (s < mm_end) {
+ __asm__ volatile(
+ PREFETCH" 32(%1) \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "movq (%1), %%mm2 \n\t"
+ "pand %2, %%mm0 \n\t"
+ "pand %3, %%mm1 \n\t"
+ "pand %4, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
+ "movq %%mm0, %%mm3 \n\t"
+ "movq %%mm1, %%mm4 \n\t"
+ "movq %%mm2, %%mm5 \n\t"
+ "punpcklwd %5, %%mm0 \n\t"
+ "punpcklwd %5, %%mm1 \n\t"
+ "punpcklwd %5, %%mm2 \n\t"
+ "punpckhwd %5, %%mm3 \n\t"
+ "punpckhwd %5, %%mm4 \n\t"
+ "punpckhwd %5, %%mm5 \n\t"
+ "psllq $8, %%mm1 \n\t"
+ "psllq $16, %%mm2 \n\t"
+ "por %%mm1, %%mm0 \n\t"
+ "por %%mm2, %%mm0 \n\t"
+ "psllq $8, %%mm4 \n\t"
+ "psllq $16, %%mm5 \n\t"
+ "por %%mm4, %%mm3 \n\t"
+ "por %%mm5, %%mm3 \n\t"
+
+ "movq %%mm0, %%mm6 \n\t"
+ "movq %%mm3, %%mm7 \n\t"
+
+ "movq 8(%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "pand %2, %%mm0 \n\t"
+ "pand %3, %%mm1 \n\t"
+ "pand %4, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
+ "movq %%mm0, %%mm3 \n\t"
+ "movq %%mm1, %%mm4 \n\t"
+ "movq %%mm2, %%mm5 \n\t"
+ "punpcklwd %5, %%mm0 \n\t"
+ "punpcklwd %5, %%mm1 \n\t"
+ "punpcklwd %5, %%mm2 \n\t"
+ "punpckhwd %5, %%mm3 \n\t"
+ "punpckhwd %5, %%mm4 \n\t"
+ "punpckhwd %5, %%mm5 \n\t"
+ "psllq $8, %%mm1 \n\t"
+ "psllq $16, %%mm2 \n\t"
+ "por %%mm1, %%mm0 \n\t"
+ "por %%mm2, %%mm0 \n\t"
+ "psllq $8, %%mm4 \n\t"
+ "psllq $16, %%mm5 \n\t"
+ "por %%mm4, %%mm3 \n\t"
+ "por %%mm5, %%mm3 \n\t"
+
+ :"=m"(*d)
+ :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
+ :"memory");
+ /* borrowed 32 to 24 */
+ __asm__ volatile(
+ "movq %%mm0, %%mm4 \n\t"
+ "movq %%mm3, %%mm5 \n\t"
+ "movq %%mm6, %%mm0 \n\t"
+ "movq %%mm7, %%mm1 \n\t"
+
+ "movq %%mm4, %%mm6 \n\t"
+ "movq %%mm5, %%mm7 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "movq %%mm1, %%mm3 \n\t"
+
+ STORE_BGR24_MMX
+
+ :: "r"(d), "m"(*s)
+ :"memory");
+ d += 24;
+ s += 8;
+ }
+ __asm__ volatile(SFENCE:::"memory");
+ __asm__ volatile(EMMS:::"memory");
+ while (s < end) {
+ register uint16_t bgr;
+ bgr = *s++;
+ *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+ *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+ *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
+ }
+}
+
+static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ const uint16_t *end;
+ const uint16_t *mm_end;
+ uint8_t *d = (uint8_t *)dst;
+ const uint16_t *s = (const uint16_t *)src;
+ end = s + src_size/2;
+ __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
+ mm_end = end - 7;
+ while (s < mm_end) {
+ __asm__ volatile(
+ PREFETCH" 32(%1) \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "movq (%1), %%mm2 \n\t"
+ "pand %2, %%mm0 \n\t"
+ "pand %3, %%mm1 \n\t"
+ "pand %4, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "psrlq $1, %%mm2 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
+ "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
+ "movq %%mm0, %%mm3 \n\t"
+ "movq %%mm1, %%mm4 \n\t"
+ "movq %%mm2, %%mm5 \n\t"
+ "punpcklwd %5, %%mm0 \n\t"
+ "punpcklwd %5, %%mm1 \n\t"
+ "punpcklwd %5, %%mm2 \n\t"
+ "punpckhwd %5, %%mm3 \n\t"
+ "punpckhwd %5, %%mm4 \n\t"
+ "punpckhwd %5, %%mm5 \n\t"
+ "psllq $8, %%mm1 \n\t"
+ "psllq $16, %%mm2 \n\t"
+ "por %%mm1, %%mm0 \n\t"
+ "por %%mm2, %%mm0 \n\t"
+ "psllq $8, %%mm4 \n\t"
+ "psllq $16, %%mm5 \n\t"
+ "por %%mm4, %%mm3 \n\t"
+ "por %%mm5, %%mm3 \n\t"
+
+ "movq %%mm0, %%mm6 \n\t"
+ "movq %%mm3, %%mm7 \n\t"
+
+ "movq 8(%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "pand %2, %%mm0 \n\t"
+ "pand %3, %%mm1 \n\t"
+ "pand %4, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "psrlq $1, %%mm2 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
+ "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
+ "movq %%mm0, %%mm3 \n\t"
+ "movq %%mm1, %%mm4 \n\t"
+ "movq %%mm2, %%mm5 \n\t"
+ "punpcklwd %5, %%mm0 \n\t"
+ "punpcklwd %5, %%mm1 \n\t"
+ "punpcklwd %5, %%mm2 \n\t"
+ "punpckhwd %5, %%mm3 \n\t"
+ "punpckhwd %5, %%mm4 \n\t"
+ "punpckhwd %5, %%mm5 \n\t"
+ "psllq $8, %%mm1 \n\t"
+ "psllq $16, %%mm2 \n\t"
+ "por %%mm1, %%mm0 \n\t"
+ "por %%mm2, %%mm0 \n\t"
+ "psllq $8, %%mm4 \n\t"
+ "psllq $16, %%mm5 \n\t"
+ "por %%mm4, %%mm3 \n\t"
+ "por %%mm5, %%mm3 \n\t"
+ :"=m"(*d)
+ :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
+ :"memory");
+ /* borrowed 32 to 24 */
+ __asm__ volatile(
+ "movq %%mm0, %%mm4 \n\t"
+ "movq %%mm3, %%mm5 \n\t"
+ "movq %%mm6, %%mm0 \n\t"
+ "movq %%mm7, %%mm1 \n\t"
+
+ "movq %%mm4, %%mm6 \n\t"
+ "movq %%mm5, %%mm7 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "movq %%mm1, %%mm3 \n\t"
+
+ STORE_BGR24_MMX
+
+ :: "r"(d), "m"(*s)
+ :"memory");
+ d += 24;
+ s += 8;
+ }
+ __asm__ volatile(SFENCE:::"memory");
+ __asm__ volatile(EMMS:::"memory");
+ while (s < end) {
+ register uint16_t bgr;
+ bgr = *s++;
+ *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+ *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+ *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
+ }
+}
+
+/*
+ * mm0 = 00 B3 00 B2 00 B1 00 B0
+ * mm1 = 00 G3 00 G2 00 G1 00 G0
+ * mm2 = 00 R3 00 R2 00 R1 00 R0
+ * mm6 = FF FF FF FF FF FF FF FF
+ * mm7 = 00 00 00 00 00 00 00 00
+ */
+#define PACK_RGB32 \
+ "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
+ "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
+ "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
+ "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
+ "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
+ "movq %%mm0, %%mm3 \n\t" \
+ "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
+ "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
+ MOVNTQ" %%mm0, (%0) \n\t" \
+ MOVNTQ" %%mm3, 8(%0) \n\t" \
+
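+/* PACK_RGB32 stores each group of four pixels as B, G, R, 0xFF bytes in
+ * memory (0xFFRRGGBB little-endian dwords), matching the scalar tail loops
+ * below that end with "*d++ = 255;". */
+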
+static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ const uint16_t *end;
+ const uint16_t *mm_end;
+ uint8_t *d = dst;
+ const uint16_t *s = (const uint16_t *)src;
+ end = s + src_size/2;
+ __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
+ __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
+ __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
+ mm_end = end - 3;
+ while (s < mm_end) {
+ __asm__ volatile(
+ PREFETCH" 32(%1) \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "movq (%1), %%mm2 \n\t"
+ "pand %2, %%mm0 \n\t"
+ "pand %3, %%mm1 \n\t"
+ "pand %4, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "pmulhw %5, %%mm0 \n\t"
+ "pmulhw %5, %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
+ PACK_RGB32
+ ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
+ :"memory");
+ d += 16;
+ s += 4;
+ }
+ __asm__ volatile(SFENCE:::"memory");
+ __asm__ volatile(EMMS:::"memory");
+ while (s < end) {
+ register uint16_t bgr;
+ bgr = *s++;
+ *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+ *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+ *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
+ *d++ = 255;
+ }
+}
+
+static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ const uint16_t *end;
+ const uint16_t *mm_end;
+ uint8_t *d = dst;
+ const uint16_t *s = (const uint16_t*)src;
+ end = s + src_size/2;
+ __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
+ __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
+ __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
+ mm_end = end - 3;
+ while (s < mm_end) {
+ __asm__ volatile(
+ PREFETCH" 32(%1) \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "movq (%1), %%mm2 \n\t"
+ "pand %2, %%mm0 \n\t"
+ "pand %3, %%mm1 \n\t"
+ "pand %4, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "psrlq $1, %%mm2 \n\t"
+ "pmulhw %5, %%mm0 \n\t"
+ "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
+ PACK_RGB32
+ ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
+ :"memory");
+ d += 16;
+ s += 4;
+ }
+ __asm__ volatile(SFENCE:::"memory");
+ __asm__ volatile(EMMS:::"memory");
+ while (s < end) {
+ register uint16_t bgr;
+ bgr = *s++;
+ *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+ *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+ *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
+ *d++ = 255;
+ }
+}
+
+static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ x86_reg idx = 15 - src_size;
+ const uint8_t *s = src-idx;
+ uint8_t *d = dst-idx;
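+ // idx is biased so that s[idx] == src[0]; the asm walks idx up towards zero
+ // in 16-byte steps, and the scalar loop below swaps R and B in whatever
+ // trailing dwords remain (idx < 15).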
+ __asm__ volatile(
+ "test %0, %0 \n\t"
+ "jns 2f \n\t"
+ PREFETCH" (%1, %0) \n\t"
+ "movq %3, %%mm7 \n\t"
+ "pxor %4, %%mm7 \n\t"
+ "movq %%mm7, %%mm6 \n\t"
+ "pxor %5, %%mm7 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ PREFETCH" 32(%1, %0) \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq 8(%1, %0), %%mm1 \n\t"
+# if COMPILE_TEMPLATE_MMXEXT
+ "pshufw $177, %%mm0, %%mm3 \n\t"
+ "pshufw $177, %%mm1, %%mm5 \n\t"
+ "pand %%mm7, %%mm0 \n\t"
+ "pand %%mm6, %%mm3 \n\t"
+ "pand %%mm7, %%mm1 \n\t"
+ "pand %%mm6, %%mm5 \n\t"
+ "por %%mm3, %%mm0 \n\t"
+ "por %%mm5, %%mm1 \n\t"
+# else
+ "movq %%mm0, %%mm2 \n\t"
+ "movq %%mm1, %%mm4 \n\t"
+ "pand %%mm7, %%mm0 \n\t"
+ "pand %%mm6, %%mm2 \n\t"
+ "pand %%mm7, %%mm1 \n\t"
+ "pand %%mm6, %%mm4 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "pslld $16, %%mm2 \n\t"
+ "psrld $16, %%mm3 \n\t"
+ "pslld $16, %%mm4 \n\t"
+ "psrld $16, %%mm5 \n\t"
+ "por %%mm2, %%mm0 \n\t"
+ "por %%mm4, %%mm1 \n\t"
+ "por %%mm3, %%mm0 \n\t"
+ "por %%mm5, %%mm1 \n\t"
+# endif
+ MOVNTQ" %%mm0, (%2, %0) \n\t"
+ MOVNTQ" %%mm1, 8(%2, %0) \n\t"
+ "add $16, %0 \n\t"
+ "js 1b \n\t"
+ SFENCE" \n\t"
+ EMMS" \n\t"
+ "2: \n\t"
+ : "+&r"(idx)
+ : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
+ : "memory");
+ for (; idx<15; idx+=4) {
+ register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
+ v &= 0xff00ff;
+ *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
+ }
+}
+
+static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
+{
+ unsigned i;
+ x86_reg mmx_size= 23 - src_size;
+ __asm__ volatile (
+ "test %%"REG_a", %%"REG_a" \n\t"
+ "jns 2f \n\t"
+ "movq "MANGLE(mask24r)", %%mm5 \n\t"
+ "movq "MANGLE(mask24g)", %%mm6 \n\t"
+ "movq "MANGLE(mask24b)", %%mm7 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ PREFETCH" 32(%1, %%"REG_a") \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
+ "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
+ "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
+ "psllq $16, %%mm0 \n\t" // 00 BGR BGR
+ "pand %%mm5, %%mm0 \n\t"
+ "pand %%mm6, %%mm1 \n\t"
+ "pand %%mm7, %%mm2 \n\t"
+ "por %%mm0, %%mm1 \n\t"
+ "por %%mm2, %%mm1 \n\t"
+ "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
+ MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
+ "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
+ "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
+ "pand %%mm7, %%mm0 \n\t"
+ "pand %%mm5, %%mm1 \n\t"
+ "pand %%mm6, %%mm2 \n\t"
+ "por %%mm0, %%mm1 \n\t"
+ "por %%mm2, %%mm1 \n\t"
+ "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
+ MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
+ "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
+ "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
+ "pand %%mm6, %%mm0 \n\t"
+ "pand %%mm7, %%mm1 \n\t"
+ "pand %%mm5, %%mm2 \n\t"
+ "por %%mm0, %%mm1 \n\t"
+ "por %%mm2, %%mm1 \n\t"
+ MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
+ "add $24, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ "2: \n\t"
+ : "+a" (mmx_size)
+ : "r" (src-mmx_size), "r"(dst-mmx_size)
+ );
+
+ __asm__ volatile(SFENCE:::"memory");
+ __asm__ volatile(EMMS:::"memory");
+
+ if (mmx_size==23) return; // finished, src_size was a multiple of 8 pixels (24 bytes)
+
+ src+= src_size;
+ dst+= src_size;
+ src_size= 23-mmx_size;
+ src-= src_size;
+ dst-= src_size;
+ for (i=0; i<src_size; i+=3) {
+ register uint8_t x;
+ x = src[i + 2];
+ dst[i + 1] = src[i + 1];
+ dst[i + 2] = src[i + 0];
+ dst[i + 0] = x;
+ }
+}
+
+static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+ int width, int height,
+ int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
+{
+ int y;
+ const x86_reg chromWidth= width>>1;
+ for (y=0; y<height; y++) {
+ //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
+ __asm__ volatile(
+ "xor %%"REG_a", %%"REG_a" \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
+ PREFETCH" 32(%2, %%"REG_a") \n\t"
+ PREFETCH" 32(%3, %%"REG_a") \n\t"
+ "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
+ "movq %%mm0, %%mm2 \n\t" // U(0)
+ "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
+ "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
+ "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
+
+ "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
+ "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
+ "movq %%mm3, %%mm4 \n\t" // Y(0)
+ "movq %%mm5, %%mm6 \n\t" // Y(8)
+ "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
+ "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
+ "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
+ "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
+
+ MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
+ MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
+ MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
+ MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
+
+ "add $8, %%"REG_a" \n\t"
+ "cmp %4, %%"REG_a" \n\t"
+ " jb 1b \n\t"
+ ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
+ : "%"REG_a
+ );
+ if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
+ usrc += chromStride;
+ vsrc += chromStride;
+ }
+ ysrc += lumStride;
+ dst += dstStride;
+ }
+ __asm__(EMMS" \n\t"
+ SFENCE" \n\t"
+ :::"memory");
+}
+
+/**
+ * Height should be a multiple of 2 and width should be a multiple of 16.
+ * (If this is a problem for anyone then tell me, and I will fix it.)
+ */
+static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+ int width, int height,
+ int lumStride, int chromStride, int dstStride)
+{
+ //FIXME interpolate chroma
+ RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
+}
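+
+/* For reference only: the per-line interleave performed by the MMX loop in
+ * yuvPlanartoyuy2 is equivalent to this scalar sketch (illustrative, assumes
+ * an even width; it is not part of the optimized path):
+ *
+ * for (int i = 0; i < width / 2; i++) {
+ * dst[4*i + 0] = ysrc[2*i]; // Y0
+ * dst[4*i + 1] = usrc[i]; // U
+ * dst[4*i + 2] = ysrc[2*i + 1]; // Y1
+ * dst[4*i + 3] = vsrc[i]; // V
+ * }
+ */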
+
+static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+ int width, int height,
+ int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
+{
+ int y;
+ const x86_reg chromWidth= width>>1;
+ for (y=0; y<height; y++) {
+ //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
+ __asm__ volatile(
+ "xor %%"REG_a", %%"REG_a" \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
+ PREFETCH" 32(%2, %%"REG_a") \n\t"
+ PREFETCH" 32(%3, %%"REG_a") \n\t"
+ "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
+ "movq %%mm0, %%mm2 \n\t" // U(0)
+ "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
+ "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
+ "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
+
+ "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
+ "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
+ "movq %%mm0, %%mm4 \n\t" // Y(0)
+ "movq %%mm2, %%mm6 \n\t" // Y(8)
+ "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
+ "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
+ "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
+ "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
+
+ MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
+ MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
+ MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
+ MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
+
+ "add $8, %%"REG_a" \n\t"
+ "cmp %4, %%"REG_a" \n\t"
+ " jb 1b \n\t"
+ ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
+ : "%"REG_a
+ );
+ if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
+ usrc += chromStride;
+ vsrc += chromStride;
+ }
+ ysrc += lumStride;
+ dst += dstStride;
+ }
+ __asm__(EMMS" \n\t"
+ SFENCE" \n\t"
+ :::"memory");
+}
+
+/**
+ * Height should be a multiple of 2 and width should be a multiple of 16.
+ * (If this is a problem for anyone then tell me, and I will fix it.)
+ */
+static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+ int width, int height,
+ int lumStride, int chromStride, int dstStride)
+{
+ //FIXME interpolate chroma
+ RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
+}
+
+/**
+ * Width should be a multiple of 16.
+ */
+static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+ int width, int height,
+ int lumStride, int chromStride, int dstStride)
+{
+ RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
+}
+
+/**
+ * Width should be a multiple of 16.
+ */
+static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+ int width, int height,
+ int lumStride, int chromStride, int dstStride)
+{
+ RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
+}
+
+/**
+ * Height should be a multiple of 2 and width should be a multiple of 16.
+ * (If this is a problem for anyone then tell me, and I will fix it.)
+ */
+static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride)
+{
+ int y;
+ const x86_reg chromWidth= width>>1;
+ for (y=0; y<height; y+=2) {
+ __asm__ volatile(
+ "xor %%"REG_a", %%"REG_a" \n\t"
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
+ "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
+ "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
+ "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
+ "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
+ "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
+ "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
+ "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
+ "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
+ "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
+ "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
+
+ MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
+
+ "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
+ "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
+ "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
+ "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
+ "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
+ "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
+ "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
+ "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
+ "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
+ "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
+
+ MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
+
+ "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
+ "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
+ "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
+ "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
+ "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
+ "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
+ "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
+ "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
+
+ MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
+ MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
+
+ "add $8, %%"REG_a" \n\t"
+ "cmp %4, %%"REG_a" \n\t"
+ " jb 1b \n\t"
+ ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
+ : "memory", "%"REG_a
+ );
+
+ ydst += lumStride;
+ src += srcStride;
+
+ __asm__ volatile(
+ "xor %%"REG_a", %%"REG_a" \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
+ "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
+ "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
+ "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
+ "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
+ "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
+ "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
+ "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
+ "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
+ "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
+ "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
+
+ MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
+ MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
+
+ "add $8, %%"REG_a" \n\t"
+ "cmp %4, %%"REG_a" \n\t"
+ " jb 1b \n\t"
+
+ ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
+ : "memory", "%"REG_a
+ );
+ udst += chromStride;
+ vdst += chromStride;
+ ydst += lumStride;
+ src += srcStride;
+ }
+ __asm__ volatile(EMMS" \n\t"
+ SFENCE" \n\t"
+ :::"memory");
+}
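+
+/* Illustrative scalar equivalent of one even source line handled above
+ * (chroma is taken from the even lines only, as in the MMX code; this is
+ * not part of the optimized path):
+ *
+ * for (int i = 0; i < width / 2; i++) {
+ * ydst[2*i] = src[4*i + 0];
+ * udst[i] = src[4*i + 1];
+ * ydst[2*i + 1] = src[4*i + 2];
+ * vdst[i] = src[4*i + 3];
+ * }
+ */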
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+
+#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
+static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
+{
+ int x,y;
+
+ dst[0]= src[0];
+
+ // first line
+ for (x=0; x<srcWidth-1; x++) {
+ dst[2*x+1]= (3*src[x] + src[x+1])>>2;
+ dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
+ }
+ dst[2*srcWidth-1]= src[srcWidth-1];
+
+ dst+= dstStride;
+
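+ // Main loop: 2x upscale where each output sample is a 3:1 (or 1:3) weighted
+ // mix of neighbouring source samples; the MMX part applies PAVGB twice to
+ // approximate the same 1:3 average with rounding.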
+ for (y=1; y<srcHeight; y++) {
+ const x86_reg mmxSize= srcWidth&~15;
+ __asm__ volatile(
+ "mov %4, %%"REG_a" \n\t"
+ "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
+ "movq (%0, %%"REG_a"), %%mm4 \n\t"
+ "movq %%mm4, %%mm2 \n\t"
+ "psllq $8, %%mm4 \n\t"
+ "pand %%mm0, %%mm2 \n\t"
+ "por %%mm2, %%mm4 \n\t"
+ "movq (%1, %%"REG_a"), %%mm5 \n\t"
+ "movq %%mm5, %%mm3 \n\t"
+ "psllq $8, %%mm5 \n\t"
+ "pand %%mm0, %%mm3 \n\t"
+ "por %%mm3, %%mm5 \n\t"
+ "1: \n\t"
+ "movq (%0, %%"REG_a"), %%mm0 \n\t"
+ "movq (%1, %%"REG_a"), %%mm1 \n\t"
+ "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
+ "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
+ PAVGB" %%mm0, %%mm5 \n\t"
+ PAVGB" %%mm0, %%mm3 \n\t"
+ PAVGB" %%mm0, %%mm5 \n\t"
+ PAVGB" %%mm0, %%mm3 \n\t"
+ PAVGB" %%mm1, %%mm4 \n\t"
+ PAVGB" %%mm1, %%mm2 \n\t"
+ PAVGB" %%mm1, %%mm4 \n\t"
+ PAVGB" %%mm1, %%mm2 \n\t"
+ "movq %%mm5, %%mm7 \n\t"
+ "movq %%mm4, %%mm6 \n\t"
+ "punpcklbw %%mm3, %%mm5 \n\t"
+ "punpckhbw %%mm3, %%mm7 \n\t"
+ "punpcklbw %%mm2, %%mm4 \n\t"
+ "punpckhbw %%mm2, %%mm6 \n\t"
+ MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
+ MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
+ MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
+ MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
+ "add $8, %%"REG_a" \n\t"
+ "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
+ "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
+ " js 1b \n\t"
+ :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
+ "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
+ "g" (-mmxSize)
+ : "%"REG_a
+ );
+
+ for (x=mmxSize-1; x<srcWidth-1; x++) {
+ dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
+ dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
+ dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
+ dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
+ }
+ dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
+ dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
+
+ dst+=dstStride*2;
+ src+=srcStride;
+ }
+
+ // last line
+ dst[0]= src[0];
+
+ for (x=0; x<srcWidth-1; x++) {
+ dst[2*x+1]= (3*src[x] + src[x+1])>>2;
+ dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
+ }
+ dst[2*srcWidth-1]= src[srcWidth-1];
+
+ __asm__ volatile(EMMS" \n\t"
+ SFENCE" \n\t"
+ :::"memory");
+}
+#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
+
+#if !COMPILE_TEMPLATE_AMD3DNOW
+/**
+ * Height should be a multiple of 2 and width should be a multiple of 16.
+ * (If this is a problem for anyone then tell me, and I will fix it.)
+ * Chrominance data is only taken from every second line, others are ignored.
+ * FIXME: Write HQ version.
+ */
+static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride)
+{
+ int y;
+ const x86_reg chromWidth= width>>1;
+ for (y=0; y<height; y+=2) {
+ __asm__ volatile(
+ "xor %%"REG_a", %%"REG_a" \n\t"
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
+ "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
+ "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
+ "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
+ "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
+ "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
+ "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
+ "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
+ "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
+ "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
+ "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
+
+ MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
+
+ "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
+ "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
+ "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
+ "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
+ "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
+ "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
+ "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
+ "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
+ "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
+ "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
+
+ MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
+
+ "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
+ "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
+ "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
+ "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
+ "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
+ "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
+ "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
+ "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
+
+ MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
+ MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
+
+ "add $8, %%"REG_a" \n\t"
+ "cmp %4, %%"REG_a" \n\t"
+ " jb 1b \n\t"
+ ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
+ : "memory", "%"REG_a
+ );
+
+ ydst += lumStride;
+ src += srcStride;
+
+ __asm__ volatile(
+ "xor %%"REG_a", %%"REG_a" \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
+ "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
+ "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
+ "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
+ "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
+ "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
+ "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
+ "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
+ "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
+ "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
+ "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
+
+ MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
+ MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
+
+ "add $8, %%"REG_a" \n\t"
+ "cmp %4, %%"REG_a" \n\t"
+ " jb 1b \n\t"
+
+ ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
+ : "memory", "%"REG_a
+ );
+ udst += chromStride;
+ vdst += chromStride;
+ ydst += lumStride;
+ src += srcStride;
+ }
+ __asm__ volatile(EMMS" \n\t"
+ SFENCE" \n\t"
+ :::"memory");
+}
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+
+/**
+ * Height should be a multiple of 2 and width should be a multiple of 2.
+ * (If this is a problem for anyone then tell me, and I will fix it.)
+ * Chrominance data is only taken from every second line,
+ * others are ignored in the C version.
+ * FIXME: Write HQ version.
+ */
+static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride)
+{
+ int y;
+ const x86_reg chromWidth= width>>1;
+ for (y=0; y<height-2; y+=2) {
+ int i;
+ for (i=0; i<2; i++) {
+ __asm__ volatile(
+ "mov %2, %%"REG_a" \n\t"
+ "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
+ "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
+ "pxor %%mm7, %%mm7 \n\t"
+ "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ PREFETCH" 64(%0, %%"REG_d") \n\t"
+ "movd (%0, %%"REG_d"), %%mm0 \n\t"
+ "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
+ "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "pmaddwd %%mm6, %%mm0 \n\t"
+ "pmaddwd %%mm6, %%mm1 \n\t"
+ "pmaddwd %%mm6, %%mm2 \n\t"
+ "pmaddwd %%mm6, %%mm3 \n\t"
+#ifndef FAST_BGR2YV12
+ "psrad $8, %%mm0 \n\t"
+ "psrad $8, %%mm1 \n\t"
+ "psrad $8, %%mm2 \n\t"
+ "psrad $8, %%mm3 \n\t"
+#endif
+ "packssdw %%mm1, %%mm0 \n\t"
+ "packssdw %%mm3, %%mm2 \n\t"
+ "pmaddwd %%mm5, %%mm0 \n\t"
+ "pmaddwd %%mm5, %%mm2 \n\t"
+ "packssdw %%mm2, %%mm0 \n\t"
+ "psraw $7, %%mm0 \n\t"
+
+ "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
+ "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
+ "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "pmaddwd %%mm6, %%mm4 \n\t"
+ "pmaddwd %%mm6, %%mm1 \n\t"
+ "pmaddwd %%mm6, %%mm2 \n\t"
+ "pmaddwd %%mm6, %%mm3 \n\t"
+#ifndef FAST_BGR2YV12
+ "psrad $8, %%mm4 \n\t"
+ "psrad $8, %%mm1 \n\t"
+ "psrad $8, %%mm2 \n\t"
+ "psrad $8, %%mm3 \n\t"
+#endif
+ "packssdw %%mm1, %%mm4 \n\t"
+ "packssdw %%mm3, %%mm2 \n\t"
+ "pmaddwd %%mm5, %%mm4 \n\t"
+ "pmaddwd %%mm5, %%mm2 \n\t"
+ "add $24, %%"REG_d" \n\t"
+ "packssdw %%mm2, %%mm4 \n\t"
+ "psraw $7, %%mm4 \n\t"
+
+ "packuswb %%mm4, %%mm0 \n\t"
+ "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
+
+ MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
+ "add $8, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
+ : "%"REG_a, "%"REG_d
+ );
+ ydst += lumStride;
+ src += srcStride;
+ }
+ src -= srcStride*2;
+ __asm__ volatile(
+ "mov %4, %%"REG_a" \n\t"
+ "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
+ "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
+ "pxor %%mm7, %%mm7 \n\t"
+ "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
+ "add %%"REG_d", %%"REG_d" \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ PREFETCH" 64(%0, %%"REG_d") \n\t"
+ PREFETCH" 64(%1, %%"REG_d") \n\t"
+#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
+ "movq (%0, %%"REG_d"), %%mm0 \n\t"
+ "movq (%1, %%"REG_d"), %%mm1 \n\t"
+ "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
+ "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "psrlq $24, %%mm0 \n\t"
+ "psrlq $24, %%mm2 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+#else
+ "movd (%0, %%"REG_d"), %%mm0 \n\t"
+ "movd (%1, %%"REG_d"), %%mm1 \n\t"
+ "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
+ "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm3, %%mm2 \n\t"
+ "paddw %%mm2, %%mm0 \n\t"
+ "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
+ "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
+ "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
+ "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "paddw %%mm1, %%mm4 \n\t"
+ "paddw %%mm3, %%mm2 \n\t"
+ "paddw %%mm4, %%mm2 \n\t"
+ "psrlw $2, %%mm0 \n\t"
+ "psrlw $2, %%mm2 \n\t"
+#endif
+ "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
+ "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
+
+ "pmaddwd %%mm0, %%mm1 \n\t"
+ "pmaddwd %%mm2, %%mm3 \n\t"
+ "pmaddwd %%mm6, %%mm0 \n\t"
+ "pmaddwd %%mm6, %%mm2 \n\t"
+#ifndef FAST_BGR2YV12
+ "psrad $8, %%mm0 \n\t"
+ "psrad $8, %%mm1 \n\t"
+ "psrad $8, %%mm2 \n\t"
+ "psrad $8, %%mm3 \n\t"
+#endif
+ "packssdw %%mm2, %%mm0 \n\t"
+ "packssdw %%mm3, %%mm1 \n\t"
+ "pmaddwd %%mm5, %%mm0 \n\t"
+ "pmaddwd %%mm5, %%mm1 \n\t"
+ "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
+ "psraw $7, %%mm0 \n\t"
+
+#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
+ "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
+ "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
+ "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
+ "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
+ PAVGB" %%mm1, %%mm4 \n\t"
+ PAVGB" %%mm3, %%mm2 \n\t"
+ "movq %%mm4, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "psrlq $24, %%mm4 \n\t"
+ "psrlq $24, %%mm2 \n\t"
+ PAVGB" %%mm1, %%mm4 \n\t"
+ PAVGB" %%mm3, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+#else
+ "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
+ "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
+ "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
+ "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "paddw %%mm1, %%mm4 \n\t"
+ "paddw %%mm3, %%mm2 \n\t"
+ "paddw %%mm2, %%mm4 \n\t"
+ "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
+ "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
+ "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
+ "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "paddw %%mm1, %%mm5 \n\t"
+ "paddw %%mm3, %%mm2 \n\t"
+ "paddw %%mm5, %%mm2 \n\t"
+ "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
+ "psrlw $2, %%mm4 \n\t"
+ "psrlw $2, %%mm2 \n\t"
+#endif
+ "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
+ "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
+
+ "pmaddwd %%mm4, %%mm1 \n\t"
+ "pmaddwd %%mm2, %%mm3 \n\t"
+ "pmaddwd %%mm6, %%mm4 \n\t"
+ "pmaddwd %%mm6, %%mm2 \n\t"
+#ifndef FAST_BGR2YV12
+ "psrad $8, %%mm4 \n\t"
+ "psrad $8, %%mm1 \n\t"
+ "psrad $8, %%mm2 \n\t"
+ "psrad $8, %%mm3 \n\t"
+#endif
+ "packssdw %%mm2, %%mm4 \n\t"
+ "packssdw %%mm3, %%mm1 \n\t"
+ "pmaddwd %%mm5, %%mm4 \n\t"
+ "pmaddwd %%mm5, %%mm1 \n\t"
+ "add $24, %%"REG_d" \n\t"
+ "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
+ "psraw $7, %%mm4 \n\t"
+
+ "movq %%mm0, %%mm1 \n\t"
+ "punpckldq %%mm4, %%mm0 \n\t"
+ "punpckhdq %%mm4, %%mm1 \n\t"
+ "packsswb %%mm1, %%mm0 \n\t"
+ "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
+ "movd %%mm0, (%2, %%"REG_a") \n\t"
+ "punpckhdq %%mm0, %%mm0 \n\t"
+ "movd %%mm0, (%3, %%"REG_a") \n\t"
+ "add $4, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
+ : "%"REG_a, "%"REG_d
+ );
+
+ udst += chromStride;
+ vdst += chromStride;
+ src += srcStride*2;
+ }
+
+ __asm__ volatile(EMMS" \n\t"
+ SFENCE" \n\t"
+ :::"memory");
+
+ rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
+}
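+/* Outline of the conversion above: each luma sample is a weighted sum of one
+ * B,G,R triplet using ff_bgr2YCoeff plus ff_bgr2YOffset; each chroma sample
+ * uses the average of a 2x2 block (two neighbouring pixels on each of the
+ * two source lines) with ff_bgr2UCoeff/ff_bgr2VCoeff plus ff_bgr2UVOffset.
+ * The remaining height-y rows at the bottom are finished by the C fallback
+ * rgb24toyv12_c(). */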
+#endif /* !COMPILE_TEMPLATE_SSE2 */
+
+#if !COMPILE_TEMPLATE_AMD3DNOW
+static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
+ int width, int height, int src1Stride,
+ int src2Stride, int dstStride)
+{
+ int h;
+
+ for (h=0; h < height; h++) {
+ int w;
+
+#if COMPILE_TEMPLATE_SSE2
+ __asm__(
+ "xor %%"REG_a", %%"REG_a" \n\t"
+ "1: \n\t"
+ PREFETCH" 64(%1, %%"REG_a") \n\t"
+ PREFETCH" 64(%2, %%"REG_a") \n\t"
+ "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
+ "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
+ "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
+ "punpcklbw %%xmm2, %%xmm0 \n\t"
+ "punpckhbw %%xmm2, %%xmm1 \n\t"
+ "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
+ "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
+ "add $16, %%"REG_a" \n\t"
+ "cmp %3, %%"REG_a" \n\t"
+ " jb 1b \n\t"
+ ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
+ : "memory", "%"REG_a""
+ );
+#else
+ __asm__(
+ "xor %%"REG_a", %%"REG_a" \n\t"
+ "1: \n\t"
+ PREFETCH" 64(%1, %%"REG_a") \n\t"
+ PREFETCH" 64(%2, %%"REG_a") \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq (%2, %%"REG_a"), %%mm4 \n\t"
+ "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
+ "punpcklbw %%mm4, %%mm0 \n\t"
+ "punpckhbw %%mm4, %%mm1 \n\t"
+ "punpcklbw %%mm5, %%mm2 \n\t"
+ "punpckhbw %%mm5, %%mm3 \n\t"
+ MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
+ MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
+ MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
+ MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
+ "add $16, %%"REG_a" \n\t"
+ "cmp %3, %%"REG_a" \n\t"
+ " jb 1b \n\t"
+ ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
+ : "memory", "%"REG_a
+ );
+#endif
+ for (w= (width&(~15)); w < width; w++) {
+ dest[2*w+0] = src1[w];
+ dest[2*w+1] = src2[w];
+ }
+ dest += dstStride;
+ src1 += src1Stride;
+ src2 += src2Stride;
+ }
+ __asm__(
+ EMMS" \n\t"
+ SFENCE" \n\t"
+ ::: "memory"
+ );
+}
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+
+#if !COMPILE_TEMPLATE_SSE2
+#if !COMPILE_TEMPLATE_AMD3DNOW
+static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dst1, uint8_t *dst2,
+ int width, int height,
+ int srcStride1, int srcStride2,
+ int dstStride1, int dstStride2)
+{
+ x86_reg x, y;
+ int w,h;
+ w=width/2; h=height/2;
+ __asm__ volatile(
+ PREFETCH" %0 \n\t"
+ PREFETCH" %1 \n\t"
+ ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
+ for (y=0;y<h;y++) {
+ const uint8_t* s1=src1+srcStride1*(y>>1);
+ uint8_t* d=dst1+dstStride1*y;
+ x=0;
+ for (;x<w-31;x+=32) {
+ __asm__ volatile(
+ PREFETCH" 32(%1,%2) \n\t"
+ "movq (%1,%2), %%mm0 \n\t"
+ "movq 8(%1,%2), %%mm2 \n\t"
+ "movq 16(%1,%2), %%mm4 \n\t"
+ "movq 24(%1,%2), %%mm6 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "movq %%mm6, %%mm7 \n\t"
+ "punpcklbw %%mm0, %%mm0 \n\t"
+ "punpckhbw %%mm1, %%mm1 \n\t"
+ "punpcklbw %%mm2, %%mm2 \n\t"
+ "punpckhbw %%mm3, %%mm3 \n\t"
+ "punpcklbw %%mm4, %%mm4 \n\t"
+ "punpckhbw %%mm5, %%mm5 \n\t"
+ "punpcklbw %%mm6, %%mm6 \n\t"
+ "punpckhbw %%mm7, %%mm7 \n\t"
+ MOVNTQ" %%mm0, (%0,%2,2) \n\t"
+ MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
+ MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
+ MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
+ MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
+ MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
+ MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
+ MOVNTQ" %%mm7, 56(%0,%2,2)"
+ :: "r"(d), "r"(s1), "r"(x)
+ :"memory");
+ }
+ for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
+ }
+ for (y=0;y<h;y++) {
+ const uint8_t* s2=src2+srcStride2*(y>>1);
+ uint8_t* d=dst2+dstStride2*y;
+ x=0;
+ for (;x<w-31;x+=32) {
+ __asm__ volatile(
+ PREFETCH" 32(%1,%2) \n\t"
+ "movq (%1,%2), %%mm0 \n\t"
+ "movq 8(%1,%2), %%mm2 \n\t"
+ "movq 16(%1,%2), %%mm4 \n\t"
+ "movq 24(%1,%2), %%mm6 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "movq %%mm6, %%mm7 \n\t"
+ "punpcklbw %%mm0, %%mm0 \n\t"
+ "punpckhbw %%mm1, %%mm1 \n\t"
+ "punpcklbw %%mm2, %%mm2 \n\t"
+ "punpckhbw %%mm3, %%mm3 \n\t"
+ "punpcklbw %%mm4, %%mm4 \n\t"
+ "punpckhbw %%mm5, %%mm5 \n\t"
+ "punpcklbw %%mm6, %%mm6 \n\t"
+ "punpckhbw %%mm7, %%mm7 \n\t"
+ MOVNTQ" %%mm0, (%0,%2,2) \n\t"
+ MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
+ MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
+ MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
+ MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
+ MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
+ MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
+ MOVNTQ" %%mm7, 56(%0,%2,2)"
+ :: "r"(d), "r"(s2), "r"(x)
+ :"memory");
+ }
+ for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
+ }
+ __asm__(
+ EMMS" \n\t"
+ SFENCE" \n\t"
+ ::: "memory"
+ );
+}
+
+static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
+ uint8_t *dst,
+ int width, int height,
+ int srcStride1, int srcStride2,
+ int srcStride3, int dstStride)
+{
+ x86_reg x;
+ int y,w,h;
+ w=width/2; h=height;
+ for (y=0;y<h;y++) {
+ const uint8_t* yp=src1+srcStride1*y;
+ const uint8_t* up=src2+srcStride2*(y>>2);
+ const uint8_t* vp=src3+srcStride3*(y>>2);
+ uint8_t* d=dst+dstStride*y;
+ x=0;
+ for (;x<w-7;x+=8) {
+ __asm__ volatile(
+ PREFETCH" 32(%1, %0) \n\t"
+ PREFETCH" 32(%2, %0) \n\t"
+ PREFETCH" 32(%3, %0) \n\t"
+ "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
+ "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
+ "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
+ "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
+ "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
+ "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
+ "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
+ "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
+ "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
+ "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
+
+ "movq %%mm1, %%mm6 \n\t"
+ "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
+ "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
+ "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
+ MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
+ MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
+
+ "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
+ "movq 8(%1, %0, 4), %%mm0 \n\t"
+ "movq %%mm0, %%mm3 \n\t"
+ "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
+ "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
+ MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
+ MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
+
+ "movq %%mm4, %%mm6 \n\t"
+ "movq 16(%1, %0, 4), %%mm0 \n\t"
+ "movq %%mm0, %%mm3 \n\t"
+ "punpcklbw %%mm5, %%mm4 \n\t"
+ "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
+ "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
+ MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
+ MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
+
+ "punpckhbw %%mm5, %%mm6 \n\t"
+ "movq 24(%1, %0, 4), %%mm0 \n\t"
+ "movq %%mm0, %%mm3 \n\t"
+ "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
+ "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
+ MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
+ MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
+
+ : "+r" (x)
+ : "r"(yp), "r" (up), "r"(vp), "r"(d)
+ :"memory");
+ }
+ for (; x<w; x++) {
+ const int x2 = x<<2;
+ d[8*x+0] = yp[x2];
+ d[8*x+1] = up[x];
+ d[8*x+2] = yp[x2+1];
+ d[8*x+3] = vp[x];
+ d[8*x+4] = yp[x2+2];
+ d[8*x+5] = up[x];
+ d[8*x+6] = yp[x2+3];
+ d[8*x+7] = vp[x];
+ }
+ }
+ __asm__(
+ EMMS" \n\t"
+ SFENCE" \n\t"
+ ::: "memory"
+ );
+}
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+
+static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
+{
+ dst += count;
+ src += 2*count;
+ count= - count;
+
+ if(count <= -16) {
+ count += 15;
+ __asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "psrlw $8, %%mm7 \n\t"
+ "1: \n\t"
+ "movq -30(%1, %0, 2), %%mm0 \n\t"
+ "movq -22(%1, %0, 2), %%mm1 \n\t"
+ "movq -14(%1, %0, 2), %%mm2 \n\t"
+ "movq -6(%1, %0, 2), %%mm3 \n\t"
+ "pand %%mm7, %%mm0 \n\t"
+ "pand %%mm7, %%mm1 \n\t"
+ "pand %%mm7, %%mm2 \n\t"
+ "pand %%mm7, %%mm3 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ MOVNTQ" %%mm0,-15(%2, %0) \n\t"
+ MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
+ "add $16, %0 \n\t"
+ " js 1b \n\t"
+ : "+r"(count)
+ : "r"(src), "r"(dst)
+ );
+ count -= 15;
+ }
+ while(count<0) {
+ dst[count]= src[2*count];
+ count++;
+ }
+}
+
+#if !COMPILE_TEMPLATE_AMD3DNOW
+static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
+{
+ dst0+= count;
+ dst1+= count;
+ src += 4*count;
+ count= - count;
+ if(count <= -8) {
+ count += 7;
+ __asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "psrlw $8, %%mm7 \n\t"
+ "1: \n\t"
+ "movq -28(%1, %0, 4), %%mm0 \n\t"
+ "movq -20(%1, %0, 4), %%mm1 \n\t"
+ "movq -12(%1, %0, 4), %%mm2 \n\t"
+ "movq -4(%1, %0, 4), %%mm3 \n\t"
+ "pand %%mm7, %%mm0 \n\t"
+ "pand %%mm7, %%mm1 \n\t"
+ "pand %%mm7, %%mm2 \n\t"
+ "pand %%mm7, %%mm3 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "psrlw $8, %%mm0 \n\t"
+ "psrlw $8, %%mm2 \n\t"
+ "pand %%mm7, %%mm1 \n\t"
+ "pand %%mm7, %%mm3 \n\t"
+ "packuswb %%mm2, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm1 \n\t"
+ MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
+ MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
+ "add $8, %0 \n\t"
+ " js 1b \n\t"
+ : "+r"(count)
+ : "r"(src), "r"(dst0), "r"(dst1)
+ );
+ count -= 7;
+ }
+ while(count<0) {
+ dst0[count]= src[4*count+0];
+ dst1[count]= src[4*count+2];
+ count++;
+ }
+}
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+
+static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
+{
+ dst0 += count;
+ dst1 += count;
+ src0 += 4*count;
+ src1 += 4*count;
+ count= - count;
+#ifdef PAVGB
+ if(count <= -8) {
+ count += 7;
+ __asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "psrlw $8, %%mm7 \n\t"
+ "1: \n\t"
+ "movq -28(%1, %0, 4), %%mm0 \n\t"
+ "movq -20(%1, %0, 4), %%mm1 \n\t"
+ "movq -12(%1, %0, 4), %%mm2 \n\t"
+ "movq -4(%1, %0, 4), %%mm3 \n\t"
+ PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
+ PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
+ PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
+ PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
+ "pand %%mm7, %%mm0 \n\t"
+ "pand %%mm7, %%mm1 \n\t"
+ "pand %%mm7, %%mm2 \n\t"
+ "pand %%mm7, %%mm3 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "psrlw $8, %%mm0 \n\t"
+ "psrlw $8, %%mm2 \n\t"
+ "pand %%mm7, %%mm1 \n\t"
+ "pand %%mm7, %%mm3 \n\t"
+ "packuswb %%mm2, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm1 \n\t"
+ MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
+ MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
+ "add $8, %0 \n\t"
+ " js 1b \n\t"
+ : "+r"(count)
+ : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
+ );
+ count -= 7;
+ }
+#endif
+ while(count<0) {
+ dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
+ dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
+ count++;
+ }
+}
+
+#if !COMPILE_TEMPLATE_AMD3DNOW
+static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
+{
+ dst0+= count;
+ dst1+= count;
+ src += 4*count;
+ count= - count;
+ if(count <= -8) {
+ count += 7;
+ __asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "psrlw $8, %%mm7 \n\t"
+ "1: \n\t"
+ "movq -28(%1, %0, 4), %%mm0 \n\t"
+ "movq -20(%1, %0, 4), %%mm1 \n\t"
+ "movq -12(%1, %0, 4), %%mm2 \n\t"
+ "movq -4(%1, %0, 4), %%mm3 \n\t"
+ "psrlw $8, %%mm0 \n\t"
+ "psrlw $8, %%mm1 \n\t"
+ "psrlw $8, %%mm2 \n\t"
+ "psrlw $8, %%mm3 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "psrlw $8, %%mm0 \n\t"
+ "psrlw $8, %%mm2 \n\t"
+ "pand %%mm7, %%mm1 \n\t"
+ "pand %%mm7, %%mm3 \n\t"
+ "packuswb %%mm2, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm1 \n\t"
+ MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
+ MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
+ "add $8, %0 \n\t"
+ " js 1b \n\t"
+ : "+r"(count)
+ : "r"(src), "r"(dst0), "r"(dst1)
+ );
+ count -= 7;
+ }
+ src++;
+ while(count<0) {
+ dst0[count]= src[4*count+0];
+ dst1[count]= src[4*count+2];
+ count++;
+ }
+}
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+
+static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
+{
+ dst0 += count;
+ dst1 += count;
+ src0 += 4*count;
+ src1 += 4*count;
+ count= - count;
+#ifdef PAVGB
+ if(count <= -8) {
+ count += 7;
+ __asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "psrlw $8, %%mm7 \n\t"
+ "1: \n\t"
+ "movq -28(%1, %0, 4), %%mm0 \n\t"
+ "movq -20(%1, %0, 4), %%mm1 \n\t"
+ "movq -12(%1, %0, 4), %%mm2 \n\t"
+ "movq -4(%1, %0, 4), %%mm3 \n\t"
+ PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
+ PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
+ PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
+ PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
+ "psrlw $8, %%mm0 \n\t"
+ "psrlw $8, %%mm1 \n\t"
+ "psrlw $8, %%mm2 \n\t"
+ "psrlw $8, %%mm3 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "psrlw $8, %%mm0 \n\t"
+ "psrlw $8, %%mm2 \n\t"
+ "pand %%mm7, %%mm1 \n\t"
+ "pand %%mm7, %%mm3 \n\t"
+ "packuswb %%mm2, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm1 \n\t"
+ MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
+ MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
+ "add $8, %0 \n\t"
+ " js 1b \n\t"
+ : "+r"(count)
+ : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
+ );
+ count -= 7;
+ }
+#endif
+ src0++;
+ src1++;
+ while(count<0) {
+ dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
+ dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
+ count++;
+ }
+}
+
+static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride)
+{
+ int y;
+    const int chromWidth= -((-width)>>1); // = ceil(width / 2)
+
+ for (y=0; y<height; y++) {
+ RENAME(extract_even)(src, ydst, width);
+ if(y&1) {
+ RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
+ udst+= chromStride;
+ vdst+= chromStride;
+ }
+
+ src += srcStride;
+ ydst+= lumStride;
+ }
+ __asm__(
+ EMMS" \n\t"
+ SFENCE" \n\t"
+ ::: "memory"
+ );
+}
+
+#if !COMPILE_TEMPLATE_AMD3DNOW
+static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride)
+{
+ int y;
+ const int chromWidth= -((-width)>>1);
+
+ for (y=0; y<height; y++) {
+ RENAME(extract_even)(src, ydst, width);
+ RENAME(extract_odd2)(src, udst, vdst, chromWidth);
+
+ src += srcStride;
+ ydst+= lumStride;
+ udst+= chromStride;
+ vdst+= chromStride;
+ }
+ __asm__(
+ EMMS" \n\t"
+ SFENCE" \n\t"
+ ::: "memory"
+ );
+}
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+
+static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride)
+{
+ int y;
+ const int chromWidth= -((-width)>>1);
+
+ for (y=0; y<height; y++) {
+ RENAME(extract_even)(src+1, ydst, width);
+ if(y&1) {
+ RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
+ udst+= chromStride;
+ vdst+= chromStride;
+ }
+
+ src += srcStride;
+ ydst+= lumStride;
+ }
+ __asm__(
+ EMMS" \n\t"
+ SFENCE" \n\t"
+ ::: "memory"
+ );
+}
+
+#if !COMPILE_TEMPLATE_AMD3DNOW
+static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride)
+{
+ int y;
+ const int chromWidth= -((-width)>>1);
+
+ for (y=0; y<height; y++) {
+ RENAME(extract_even)(src+1, ydst, width);
+ RENAME(extract_even2)(src, udst, vdst, chromWidth);
+
+ src += srcStride;
+ ydst+= lumStride;
+ udst+= chromStride;
+ vdst+= chromStride;
+ }
+ __asm__(
+ EMMS" \n\t"
+ SFENCE" \n\t"
+ ::: "memory"
+ );
+}
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+#endif /* !COMPILE_TEMPLATE_SSE2 */
+
+static inline void RENAME(rgb2rgb_init)(void)
+{
+#if !COMPILE_TEMPLATE_SSE2
+#if !COMPILE_TEMPLATE_AMD3DNOW
+ rgb15to16 = RENAME(rgb15to16);
+ rgb15tobgr24 = RENAME(rgb15tobgr24);
+ rgb15to32 = RENAME(rgb15to32);
+ rgb16tobgr24 = RENAME(rgb16tobgr24);
+ rgb16to32 = RENAME(rgb16to32);
+ rgb16to15 = RENAME(rgb16to15);
+ rgb24tobgr16 = RENAME(rgb24tobgr16);
+ rgb24tobgr15 = RENAME(rgb24tobgr15);
+ rgb24tobgr32 = RENAME(rgb24tobgr32);
+ rgb32to16 = RENAME(rgb32to16);
+ rgb32to15 = RENAME(rgb32to15);
+ rgb32tobgr24 = RENAME(rgb32tobgr24);
+ rgb24to15 = RENAME(rgb24to15);
+ rgb24to16 = RENAME(rgb24to16);
+ rgb24tobgr24 = RENAME(rgb24tobgr24);
+ shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
+ rgb32tobgr16 = RENAME(rgb32tobgr16);
+ rgb32tobgr15 = RENAME(rgb32tobgr15);
+ yv12toyuy2 = RENAME(yv12toyuy2);
+ yv12touyvy = RENAME(yv12touyvy);
+ yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
+ yuv422ptouyvy = RENAME(yuv422ptouyvy);
+ yuy2toyv12 = RENAME(yuy2toyv12);
+ vu9_to_vu12 = RENAME(vu9_to_vu12);
+ yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
+ uyvytoyuv422 = RENAME(uyvytoyuv422);
+ yuyvtoyuv422 = RENAME(yuyvtoyuv422);
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+
+#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
+ planar2x = RENAME(planar2x);
+#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
+ rgb24toyv12 = RENAME(rgb24toyv12);
+
+ yuyvtoyuv420 = RENAME(yuyvtoyuv420);
+ uyvytoyuv420 = RENAME(uyvytoyuv420);
+#endif /* !COMPILE_TEMPLATE_SSE2 */
+
+#if !COMPILE_TEMPLATE_AMD3DNOW
+ interleaveBytes = RENAME(interleaveBytes);
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+}
diff --git a/ffmpeg/libswscale/x86/scale.asm b/ffmpeg/libswscale/x86/scale.asm
new file mode 100644
index 0000000..c6dafde
--- /dev/null
+++ b/ffmpeg/libswscale/x86/scale.asm
@@ -0,0 +1,431 @@
+;******************************************************************************
+;* x86-optimized horizontal line scaling functions
+;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+max_19bit_int: times 4 dd 0x7ffff
+max_19bit_flt: times 4 dd 524287.0
+minshort: times 8 dw 0x8000
+unicoeff: times 4 dd 0x20000000
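+
+; max_19bit_{int,flt} clamp the 19-bit intermediate output (as integers with
+; pminsd on SSE4, otherwise via minps in float); minshort and unicoeff implement
+; the unsigned->signed bias trick used for 16-bit input (see the pmaddwd comments
+; in the macro below)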
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; horizontal line scaling
+;
+; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
+; (SwsContext *c, int{16,32}_t *dst,
+; int dstW, const uint{8,16}_t *src,
+; const int16_t *filter,
+; const int32_t *filterPos, int filterSize);
+;
+; Scale one horizontal line. Input is either 8-bit or 16-bit wide
+; ($source_width can be 8, 9, 10, 12, 14 or 16; the difference is whether we
+; have to downscale before multiplying). The filter is 14-bit. Output is either
+; 15-bit (in int16_t) or 19-bit (in int32_t), as given by $intermediate_nbits.
+; Each output pixel is generated from $filterSize input pixels; the position of
+; the first one is given in filterPos[nOutputPixel].
+;-----------------------------------------------------------------------------
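+
+; A rough scalar equivalent of what each variant computes (reference sketch only;
+; the final shift matches the "psrad m0, 14 + %1 - %2" near the end of the macro):
+;
+;     for (i = 0; i < dstW; i++) {
+;         int j, val = 0;
+;         for (j = 0; j < filterSize; j++)
+;             val += src[filterPos[i] + j] * filter[i * filterSize + j];
+;         dst[i] = clip(val >> (14 + source_width - intermediate_nbits));
+;     }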
+
+; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
+%macro SCALE_FUNC 6
+%ifnidn %3, X
+cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
+%else
+cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
+%endif
+%if ARCH_X86_64
+ movsxd wq, wd
+%define mov32 movsxd
+%else ; x86-32
+%define mov32 mov
+%endif ; x86-64
+%if %2 == 19
+%if mmsize == 8 ; mmx
+ mova m2, [max_19bit_int]
+%elif cpuflag(sse4)
+ mova m2, [max_19bit_int]
+%else ; ssse3/sse2
+ mova m2, [max_19bit_flt]
+%endif ; mmx/sse2/ssse3/sse4
+%endif ; %2 == 19
+%if %1 == 16
+ mova m6, [minshort]
+ mova m7, [unicoeff]
+%elif %1 == 8
+ pxor m3, m3
+%endif ; %1 == 8/16
+
+%if %1 == 8
+%define movlh movd
+%define movbh movh
+%define srcmul 1
+%else ; %1 == 9-16
+%define movlh movq
+%define movbh movu
+%define srcmul 2
+%endif ; %1 == 8/9-16
+
+%ifnidn %3, X
+
+ ; setup loop
+%if %3 == 8
+ shl wq, 1 ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
+%define wshr 1
+%else ; %3 == 4
+%define wshr 0
+%endif ; %3 == 8
+ lea filterq, [filterq+wq*8]
+%if %2 == 15
+ lea dstq, [dstq+wq*(2>>wshr)]
+%else ; %2 == 19
+ lea dstq, [dstq+wq*(4>>wshr)]
+%endif ; %2 == 15/19
+ lea fltposq, [fltposq+wq*(4>>wshr)]
+ neg wq
+
+.loop:
+%if %3 == 4 ; filterSize == 4 scaling
+ ; load 2x4 or 4x4 source pixels into m0/m1
+ mov32 pos0q, dword [fltposq+wq*4+ 0] ; filterPos[0]
+ mov32 pos1q, dword [fltposq+wq*4+ 4] ; filterPos[1]
+ movlh m0, [srcq+pos0q*srcmul] ; src[filterPos[0] + {0,1,2,3}]
+%if mmsize == 8
+ movlh m1, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
+%else ; mmsize == 16
+%if %1 > 8
+ movhps m0, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
+%else ; %1 == 8
+ movd m4, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
+%endif
+ mov32 pos0q, dword [fltposq+wq*4+ 8] ; filterPos[2]
+ mov32 pos1q, dword [fltposq+wq*4+12] ; filterPos[3]
+ movlh m1, [srcq+pos0q*srcmul] ; src[filterPos[2] + {0,1,2,3}]
+%if %1 > 8
+ movhps m1, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}]
+%else ; %1 == 8
+ movd m5, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}]
+ punpckldq m0, m4
+ punpckldq m1, m5
+%endif ; %1 == 8
+%endif ; mmsize == 8/16
+%if %1 == 8
+ punpcklbw m0, m3 ; byte -> word
+ punpcklbw m1, m3 ; byte -> word
+%endif ; %1 == 8
+
+ ; multiply with filter coefficients
+%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
+ ; add back 0x8000 * sum(coeffs) after the horizontal add
+ psubw m0, m6
+ psubw m1, m6
+%endif ; %1 == 16
+ pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
+ pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
+
+ ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
+%if mmsize == 8 ; mmx
+ movq m4, m0
+ punpckldq m0, m1
+ punpckhdq m4, m1
+ paddd m0, m4
+%elif notcpuflag(ssse3) ; sse2
+ mova m4, m0
+ shufps m0, m1, 10001000b
+ shufps m4, m1, 11011101b
+ paddd m0, m4
+%else ; ssse3/sse4
+ phaddd m0, m1 ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
+ ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
+ ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
+ ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
+%endif ; mmx/sse2/ssse3/sse4
+%else ; %3 == 8, i.e. filterSize == 8 scaling
+ ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
+ mov32 pos0q, dword [fltposq+wq*2+0] ; filterPos[0]
+ mov32 pos1q, dword [fltposq+wq*2+4] ; filterPos[1]
+ movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
+%if mmsize == 8
+ movbh m1, [srcq+(pos0q+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}]
+ movbh m4, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3}]
+ movbh m5, [srcq+(pos1q+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}]
+%else ; mmsize == 16
+ movbh m1, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
+ mov32 pos0q, dword [fltposq+wq*2+8] ; filterPos[2]
+ mov32 pos1q, dword [fltposq+wq*2+12] ; filterPos[3]
+ movbh m4, [srcq+ pos0q *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
+ movbh m5, [srcq+ pos1q *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
+%endif ; mmsize == 8/16
+%if %1 == 8
+ punpcklbw m0, m3 ; byte -> word
+ punpcklbw m1, m3 ; byte -> word
+ punpcklbw m4, m3 ; byte -> word
+ punpcklbw m5, m3 ; byte -> word
+%endif ; %1 == 8
+
+ ; multiply
+%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
+ ; add back 0x8000 * sum(coeffs) after the horizontal add
+ psubw m0, m6
+ psubw m1, m6
+ psubw m4, m6
+ psubw m5, m6
+%endif ; %1 == 16
+ pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
+ pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
+ pmaddwd m4, [filterq+wq*8+mmsize*2] ; *= filter[{16,17,..,22,23}]
+ pmaddwd m5, [filterq+wq*8+mmsize*3] ; *= filter[{24,25,..,30,31}]
+
+ ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
+%if mmsize == 8
+ paddd m0, m1
+ paddd m4, m5
+ movq m1, m0
+ punpckldq m0, m4
+ punpckhdq m1, m4
+ paddd m0, m1
+%elif notcpuflag(ssse3) ; sse2
+%if %1 == 8
+%define mex m6
+%else
+%define mex m3
+%endif
+ ; emulate horizontal add as transpose + vertical add
+ mova mex, m0
+ punpckldq m0, m1
+ punpckhdq mex, m1
+ paddd m0, mex
+ mova m1, m4
+ punpckldq m4, m5
+ punpckhdq m1, m5
+ paddd m4, m1
+ mova m1, m0
+ punpcklqdq m0, m4
+ punpckhqdq m1, m4
+ paddd m0, m1
+%else ; ssse3/sse4
+ ; FIXME if we rearrange the filter in pairs of 4, we can
+ ; load pixels likewise and use 2 x paddd + phaddd instead
+ ; of 3 x phaddd here, faster on older cpus
+ phaddd m0, m1
+ phaddd m4, m5
+ phaddd m0, m4 ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
+ ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
+ ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
+ ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
+%endif ; mmx/sse2/ssse3/sse4
+%endif ; %3 == 4/8
+
+%else ; %3 == X, i.e. any filterSize scaling
+
+%ifidn %4, X4
+%define dlt 4
+%else ; %4 == X || %4 == X8
+%define dlt 0
+%endif ; %4 ==/!= X4
+%if ARCH_X86_64
+%define srcq r8
+%define pos1q r7
+%define srcendq r9
+ movsxd fltsizeq, fltsized ; filterSize
+ lea srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
+%else ; x86-32
+%define srcq srcmemq
+%define pos1q dstq
+%define srcendq r6m
+ lea pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
+ mov srcendq, pos0q
+%endif ; x86-32/64
+ lea fltposq, [fltposq+wq*4]
+%if %2 == 15
+ lea dstq, [dstq+wq*2]
+%else ; %2 == 19
+ lea dstq, [dstq+wq*4]
+%endif ; %2 == 15/19
+ movifnidn dstmp, dstq
+ neg wq
+
+.loop:
+ mov32 pos0q, dword [fltposq+wq*4+0] ; filterPos[0]
+ mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1]
+ ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
+ pxor m4, m4
+ pxor m5, m5
+ mov srcq, srcmemmp
+
+.innerloop:
+ ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
+ movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
+ movbh m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
+%if %1 == 8
+ punpcklbw m0, m3
+ punpcklbw m1, m3
+%endif ; %1 == 8
+
+ ; multiply
+%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
+ ; add back 0x8000 * sum(coeffs) after the horizontal add
+ psubw m0, m6
+ psubw m1, m6
+%endif ; %1 == 16
+ pmaddwd m0, [filterq] ; filter[{0,1,2,3(,4,5,6,7)}]
+ pmaddwd m1, [filterq+(fltsizeq+dlt)*2]; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
+ paddd m4, m0
+ paddd m5, m1
+ add filterq, mmsize
+ add srcq, srcmul*mmsize/2
+ cmp srcq, srcendq ; while (src += 4) < &src[filterSize]
+ jl .innerloop
+
+%ifidn %4, X4
+ mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1]
+ movlh m0, [srcq+ pos0q *srcmul] ; split last 4 srcpx of dstpx[0]
+ sub pos1q, fltsizeq ; and first 4 srcpx of dstpx[1]
+%if %1 > 8
+ movhps m0, [srcq+(pos1q+dlt)*srcmul]
+%else ; %1 == 8
+ movd m1, [srcq+(pos1q+dlt)*srcmul]
+ punpckldq m0, m1
+%endif ; %1 == 8
+%if %1 == 8
+ punpcklbw m0, m3
+%endif ; %1 == 8
+%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
+ ; add back 0x8000 * sum(coeffs) after the horizontal add
+ psubw m0, m6
+%endif ; %1 == 16
+ pmaddwd m0, [filterq]
+%endif ; %4 == X4
+
+ lea filterq, [filterq+(fltsizeq+dlt)*2]
+
+%if mmsize == 8 ; mmx
+ movq m0, m4
+ punpckldq m4, m5
+ punpckhdq m0, m5
+ paddd m0, m4
+%else ; mmsize == 16
+%if notcpuflag(ssse3) ; sse2
+ mova m1, m4
+ punpcklqdq m4, m5
+ punpckhqdq m1, m5
+ paddd m4, m1
+%else ; ssse3/sse4
+ phaddd m4, m5
+%endif ; sse2/ssse3/sse4
+%ifidn %4, X4
+ paddd m4, m0
+%endif ; %4 == X4
+%if notcpuflag(ssse3) ; sse2
+ pshufd m4, m4, 11011000b
+ movhlps m0, m4
+ paddd m0, m4
+%else ; ssse3/sse4
+ phaddd m4, m4
+ SWAP 0, 4
+%endif ; sse2/ssse3/sse4
+%endif ; mmsize == 8/16
+%endif ; %3 ==/!= X
+
+%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
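+    ; sum((x - 0x8000) * coeff) == sum(x * coeff) - 0x8000 * sum(coeff); assuming the
+    ; 14-bit coefficients sum to 1.0 (0x4000), 0x8000 * sum(coeff) is exactly unicoeff
+    ; (0x20000000), so adding m7 here restores the unsigned result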
+ paddd m0, m7
+%endif ; %1 == 16
+
+ ; clip, store
+ psrad m0, 14 + %1 - %2
+%ifidn %3, X
+ movifnidn dstq, dstmp
+%endif ; %3 == X
+%if %2 == 15
+ packssdw m0, m0
+%ifnidn %3, X
+ movh [dstq+wq*(2>>wshr)], m0
+%else ; %3 == X
+ movd [dstq+wq*2], m0
+%endif ; %3 ==/!= X
+%else ; %2 == 19
+%if mmsize == 8
+ PMINSD_MMX m0, m2, m4
+%elif cpuflag(sse4)
+ pminsd m0, m2
+%else ; sse2/ssse3
+ cvtdq2ps m0, m0
+ minps m0, m2
+ cvtps2dq m0, m0
+%endif ; mmx/sse2/ssse3/sse4
+%ifnidn %3, X
+ mova [dstq+wq*(4>>wshr)], m0
+%else ; %3 == X
+ movq [dstq+wq*4], m0
+%endif ; %3 ==/!= X
+%endif ; %2 == 15/19
+%ifnidn %3, X
+    add wq, (mmsize<<wshr)/4 ; both the 8-tap and 4-tap cases really only do 4 pixels
+                             ; (2 for mmx) per iteration; see the "shl wq, 1" above for why
+%else ; %3 == X
+ add wq, 2
+%endif ; %3 ==/!= X
+ jl .loop
+ REP_RET
+%endmacro
+
+; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
+%macro SCALE_FUNCS 3
+SCALE_FUNC %1, %2, 4, 4, 6, %3
+SCALE_FUNC %1, %2, 8, 8, 6, %3
+%if mmsize == 8
+SCALE_FUNC %1, %2, X, X, 7, %3
+%else
+SCALE_FUNC %1, %2, X, X4, 7, %3
+SCALE_FUNC %1, %2, X, X8, 7, %3
+%endif
+%endmacro
+
+; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
+%macro SCALE_FUNCS2 3
+%if notcpuflag(sse4)
+SCALE_FUNCS 8, 15, %1
+SCALE_FUNCS 9, 15, %2
+SCALE_FUNCS 10, 15, %2
+SCALE_FUNCS 12, 15, %2
+SCALE_FUNCS 14, 15, %2
+SCALE_FUNCS 16, 15, %3
+%endif ; !sse4
+SCALE_FUNCS 8, 19, %1
+SCALE_FUNCS 9, 19, %2
+SCALE_FUNCS 10, 19, %2
+SCALE_FUNCS 12, 19, %2
+SCALE_FUNCS 14, 19, %2
+SCALE_FUNCS 16, 19, %3
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+SCALE_FUNCS2 0, 0, 0
+%endif
+INIT_XMM sse2
+SCALE_FUNCS2 6, 7, 8
+INIT_XMM ssse3
+SCALE_FUNCS2 6, 6, 8
+INIT_XMM sse4
+SCALE_FUNCS2 6, 6, 8
diff --git a/ffmpeg/libswscale/x86/swscale.c b/ffmpeg/libswscale/x86/swscale.c
new file mode 100644
index 0000000..2f67b1b
--- /dev/null
+++ b/ffmpeg/libswscale/x86/swscale.c
@@ -0,0 +1,585 @@
+/*
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <inttypes.h>
+#include "config.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavutil/cpu.h"
+#include "libavutil/pixdesc.h"
+
+#if HAVE_INLINE_ASM
+
+#define DITHER1XBPP
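+/* enable the ordered-dither additions in the 15/16 bpp RGB output paths of the
+ * included templates (the DITHER1XBPP blocks in swscale_template.c) */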
+
+DECLARE_ASM_CONST(8, uint64_t, bF8)= 0xF8F8F8F8F8F8F8F8LL;
+DECLARE_ASM_CONST(8, uint64_t, bFC)= 0xFCFCFCFCFCFCFCFCLL;
+DECLARE_ASM_CONST(8, uint64_t, w10)= 0x0010001000100010LL;
+DECLARE_ASM_CONST(8, uint64_t, w02)= 0x0002000200020002LL;
+
+const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
+ 0x0103010301030103LL,
+ 0x0200020002000200LL,};
+
+const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = {
+ 0x0602060206020602LL,
+ 0x0004000400040004LL,};
+
+DECLARE_ASM_CONST(8, uint64_t, b16Mask)= 0x001F001F001F001FLL;
+DECLARE_ASM_CONST(8, uint64_t, g16Mask)= 0x07E007E007E007E0LL;
+DECLARE_ASM_CONST(8, uint64_t, r16Mask)= 0xF800F800F800F800LL;
+DECLARE_ASM_CONST(8, uint64_t, b15Mask)= 0x001F001F001F001FLL;
+DECLARE_ASM_CONST(8, uint64_t, g15Mask)= 0x03E003E003E003E0LL;
+DECLARE_ASM_CONST(8, uint64_t, r15Mask)= 0x7C007C007C007C00LL;
+
+DECLARE_ALIGNED(8, const uint64_t, ff_M24A) = 0x00FF0000FF0000FFLL;
+DECLARE_ALIGNED(8, const uint64_t, ff_M24B) = 0xFF0000FF0000FF00LL;
+DECLARE_ALIGNED(8, const uint64_t, ff_M24C) = 0x0000FF0000FF0000LL;
+
+#ifdef FAST_BGR2YV12
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000000210041000DULL;
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000FFEEFFDC0038ULL;
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00000038FFD2FFF8ULL;
+#else
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000020E540830C8BULL;
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000ED0FDAC23831ULL;
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00003831D0E6F6EAULL;
+#endif /* FAST_BGR2YV12 */
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL;
+DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
+DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL;
+
+
+//MMX versions
+#if HAVE_MMX_INLINE
+#undef RENAME
+#define COMPILE_TEMPLATE_MMXEXT 0
+#define RENAME(a) a ## _MMX
+#include "swscale_template.c"
+#endif
+
+// MMXEXT versions
+#if HAVE_MMXEXT_INLINE
+#undef RENAME
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 1
+#define RENAME(a) a ## _MMXEXT
+#include "swscale_template.c"
+#endif
+
+void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
+ int lastInLumBuf, int lastInChrBuf)
+{
+ const int dstH= c->dstH;
+ const int flags= c->flags;
+ int16_t **lumPixBuf= c->lumPixBuf;
+ int16_t **chrUPixBuf= c->chrUPixBuf;
+ int16_t **alpPixBuf= c->alpPixBuf;
+ const int vLumBufSize= c->vLumBufSize;
+ const int vChrBufSize= c->vChrBufSize;
+ int32_t *vLumFilterPos= c->vLumFilterPos;
+ int32_t *vChrFilterPos= c->vChrFilterPos;
+ int16_t *vLumFilter= c->vLumFilter;
+ int16_t *vChrFilter= c->vChrFilter;
+ int32_t *lumMmxFilter= c->lumMmxFilter;
+ int32_t *chrMmxFilter= c->chrMmxFilter;
+ int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
+ const int vLumFilterSize= c->vLumFilterSize;
+ const int vChrFilterSize= c->vChrFilterSize;
+ const int chrDstY= dstY>>c->chrDstVSubSample;
+ const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
+ const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
+
+ c->blueDither= ff_dither8[dstY&1];
+ if (c->dstFormat == AV_PIX_FMT_RGB555 || c->dstFormat == AV_PIX_FMT_BGR555)
+ c->greenDither= ff_dither8[dstY&1];
+ else
+ c->greenDither= ff_dither4[dstY&1];
+ c->redDither= ff_dither8[(dstY+1)&1];
+ if (dstY < dstH - 2) {
+ const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
+ const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+ const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
+ int i;
+
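+        /* if the vertical filter window reaches outside the source image,
+         * build temporary pointer arrays that replicate the first/last
+         * available lines instead */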
+ if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
+ const int16_t **tmpY = (const int16_t **) lumPixBuf + 2 * vLumBufSize;
+ int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
+ for (i = 0; i < neg; i++)
+ tmpY[i] = lumSrcPtr[neg];
+ for ( ; i < end; i++)
+ tmpY[i] = lumSrcPtr[i];
+ for ( ; i < vLumFilterSize; i++)
+ tmpY[i] = tmpY[i-1];
+ lumSrcPtr = tmpY;
+
+ if (alpSrcPtr) {
+ const int16_t **tmpA = (const int16_t **) alpPixBuf + 2 * vLumBufSize;
+ for (i = 0; i < neg; i++)
+ tmpA[i] = alpSrcPtr[neg];
+ for ( ; i < end; i++)
+ tmpA[i] = alpSrcPtr[i];
+ for ( ; i < vLumFilterSize; i++)
+ tmpA[i] = tmpA[i - 1];
+ alpSrcPtr = tmpA;
+ }
+ }
+ if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) {
+ const int16_t **tmpU = (const int16_t **) chrUPixBuf + 2 * vChrBufSize;
+ int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
+ for (i = 0; i < neg; i++) {
+ tmpU[i] = chrUSrcPtr[neg];
+ }
+ for ( ; i < end; i++) {
+ tmpU[i] = chrUSrcPtr[i];
+ }
+ for ( ; i < vChrFilterSize; i++) {
+ tmpU[i] = tmpU[i - 1];
+ }
+ chrUSrcPtr = tmpU;
+ }
+
+ if (flags & SWS_ACCURATE_RND) {
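+            /* accurate-rounding layout: store pointers to two consecutive source
+             * lines together with their two coefficients packed into one dword
+             * (APCK_* offsets), so the asm can process two taps per pmaddwd */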
+ int s= APCK_SIZE / 8;
+ for (i=0; i<vLumFilterSize; i+=2) {
+ *(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
+ *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
+ lumMmxFilter[s*i+APCK_COEF/4 ]=
+ lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
+ + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
+ if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
+ *(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
+ *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
+ alpMmxFilter[s*i+APCK_COEF/4 ]=
+ alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
+ }
+ }
+ for (i=0; i<vChrFilterSize; i+=2) {
+ *(const void**)&chrMmxFilter[s*i ]= chrUSrcPtr[i ];
+ *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrUSrcPtr[i+(vChrFilterSize>1)];
+ chrMmxFilter[s*i+APCK_COEF/4 ]=
+ chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
+ + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
+ }
+ } else {
+ for (i=0; i<vLumFilterSize; i++) {
+ *(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i];
+ lumMmxFilter[4*i+2]=
+ lumMmxFilter[4*i+3]=
+ ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U;
+ if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
+ *(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
+ alpMmxFilter[4*i+2]=
+ alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
+ }
+ }
+ for (i=0; i<vChrFilterSize; i++) {
+ *(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i];
+ chrMmxFilter[4*i+2]=
+ chrMmxFilter[4*i+3]=
+ ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001U;
+ }
+ }
+ }
+}
+
+#if HAVE_MMXEXT
+static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset)
+{
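+    /* movntdq below requires a 16-byte aligned destination; fall back to the
+     * MMXEXT version for unaligned dest */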
+ if(((int)dest) & 15){
+ return yuv2yuvX_MMXEXT(filter, filterSize, src, dest, dstW, dither, offset);
+ }
+ if (offset) {
+ __asm__ volatile("movq (%0), %%xmm3\n\t"
+ "movdqa %%xmm3, %%xmm4\n\t"
+ "psrlq $24, %%xmm3\n\t"
+ "psllq $40, %%xmm4\n\t"
+ "por %%xmm4, %%xmm3\n\t"
+ :: "r"(dither)
+ );
+ } else {
+ __asm__ volatile("movq (%0), %%xmm3\n\t"
+ :: "r"(dither)
+ );
+ }
+ filterSize--;
+ __asm__ volatile(
+ "pxor %%xmm0, %%xmm0\n\t"
+ "punpcklbw %%xmm0, %%xmm3\n\t"
+ "movd %0, %%xmm1\n\t"
+ "punpcklwd %%xmm1, %%xmm1\n\t"
+ "punpckldq %%xmm1, %%xmm1\n\t"
+ "punpcklqdq %%xmm1, %%xmm1\n\t"
+ "psllw $3, %%xmm1\n\t"
+ "paddw %%xmm1, %%xmm3\n\t"
+ "psraw $4, %%xmm3\n\t"
+ ::"m"(filterSize)
+ );
+ __asm__ volatile(
+ "movdqa %%xmm3, %%xmm4\n\t"
+ "movdqa %%xmm3, %%xmm7\n\t"
+ "movl %3, %%ecx\n\t"
+ "mov %0, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ ".p2align 4 \n\t" /* FIXME Unroll? */\
+ "1: \n\t"\
+ "movddup 8(%%"REG_d"), %%xmm0 \n\t" /* filterCoeff */\
+ "movdqa (%%"REG_S", %%"REG_c", 2), %%xmm2 \n\t" /* srcData */\
+ "movdqa 16(%%"REG_S", %%"REG_c", 2), %%xmm5 \n\t" /* srcData */\
+ "add $16, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ "test %%"REG_S", %%"REG_S" \n\t"\
+ "pmulhw %%xmm0, %%xmm2 \n\t"\
+ "pmulhw %%xmm0, %%xmm5 \n\t"\
+ "paddw %%xmm2, %%xmm3 \n\t"\
+ "paddw %%xmm5, %%xmm4 \n\t"\
+ " jnz 1b \n\t"\
+ "psraw $3, %%xmm3 \n\t"\
+ "psraw $3, %%xmm4 \n\t"\
+ "packuswb %%xmm4, %%xmm3 \n\t"
+ "movntdq %%xmm3, (%1, %%"REG_c")\n\t"
+ "add $16, %%"REG_c" \n\t"\
+ "cmp %2, %%"REG_c" \n\t"\
+ "movdqa %%xmm7, %%xmm3\n\t"
+ "movdqa %%xmm7, %%xmm4\n\t"
+ "mov %0, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ "jb 1b \n\t"\
+ :: "g" (filter),
+ "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
+ : "%"REG_d, "%"REG_S, "%"REG_c
+ );
+}
+#endif
+
+#endif /* HAVE_INLINE_ASM */
+
+#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
+extern void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
+ SwsContext *c, int16_t *data, \
+ int dstW, const uint8_t *src, \
+ const int16_t *filter, \
+ const int32_t *filterPos, int filterSize)
+
+#define SCALE_FUNCS(filter_n, opt) \
+ SCALE_FUNC(filter_n, 8, 15, opt); \
+ SCALE_FUNC(filter_n, 9, 15, opt); \
+ SCALE_FUNC(filter_n, 10, 15, opt); \
+ SCALE_FUNC(filter_n, 12, 15, opt); \
+ SCALE_FUNC(filter_n, 14, 15, opt); \
+ SCALE_FUNC(filter_n, 16, 15, opt); \
+ SCALE_FUNC(filter_n, 8, 19, opt); \
+ SCALE_FUNC(filter_n, 9, 19, opt); \
+ SCALE_FUNC(filter_n, 10, 19, opt); \
+ SCALE_FUNC(filter_n, 12, 19, opt); \
+ SCALE_FUNC(filter_n, 14, 19, opt); \
+ SCALE_FUNC(filter_n, 16, 19, opt)
+
+#define SCALE_FUNCS_MMX(opt) \
+ SCALE_FUNCS(4, opt); \
+ SCALE_FUNCS(8, opt); \
+ SCALE_FUNCS(X, opt)
+
+#define SCALE_FUNCS_SSE(opt) \
+ SCALE_FUNCS(4, opt); \
+ SCALE_FUNCS(8, opt); \
+ SCALE_FUNCS(X4, opt); \
+ SCALE_FUNCS(X8, opt)
+
+#if ARCH_X86_32
+SCALE_FUNCS_MMX(mmx);
+#endif
+SCALE_FUNCS_SSE(sse2);
+SCALE_FUNCS_SSE(ssse3);
+SCALE_FUNCS_SSE(sse4);
+
+#define VSCALEX_FUNC(size, opt) \
+extern void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \
+ const int16_t **src, uint8_t *dest, int dstW, \
+ const uint8_t *dither, int offset)
+#define VSCALEX_FUNCS(opt) \
+ VSCALEX_FUNC(8, opt); \
+ VSCALEX_FUNC(9, opt); \
+ VSCALEX_FUNC(10, opt)
+
+#if ARCH_X86_32
+VSCALEX_FUNCS(mmxext);
+#endif
+VSCALEX_FUNCS(sse2);
+VSCALEX_FUNCS(sse4);
+VSCALEX_FUNC(16, sse4);
+VSCALEX_FUNCS(avx);
+
+#define VSCALE_FUNC(size, opt) \
+extern void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int dstW, \
+ const uint8_t *dither, int offset)
+#define VSCALE_FUNCS(opt1, opt2) \
+ VSCALE_FUNC(8, opt1); \
+ VSCALE_FUNC(9, opt2); \
+ VSCALE_FUNC(10, opt2); \
+ VSCALE_FUNC(16, opt1)
+
+#if ARCH_X86_32
+VSCALE_FUNCS(mmx, mmxext);
+#endif
+VSCALE_FUNCS(sse2, sse2);
+VSCALE_FUNC(16, sse4);
+VSCALE_FUNCS(avx, avx);
+
+#define INPUT_Y_FUNC(fmt, opt) \
+extern void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \
+ const uint8_t *unused1, const uint8_t *unused2, \
+ int w, uint32_t *unused)
+#define INPUT_UV_FUNC(fmt, opt) \
+extern void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
+ const uint8_t *unused0, \
+ const uint8_t *src1, \
+ const uint8_t *src2, \
+ int w, uint32_t *unused)
+#define INPUT_FUNC(fmt, opt) \
+ INPUT_Y_FUNC(fmt, opt); \
+ INPUT_UV_FUNC(fmt, opt)
+#define INPUT_FUNCS(opt) \
+ INPUT_FUNC(uyvy, opt); \
+ INPUT_FUNC(yuyv, opt); \
+ INPUT_UV_FUNC(nv12, opt); \
+ INPUT_UV_FUNC(nv21, opt); \
+ INPUT_FUNC(rgba, opt); \
+ INPUT_FUNC(bgra, opt); \
+ INPUT_FUNC(argb, opt); \
+ INPUT_FUNC(abgr, opt); \
+ INPUT_FUNC(rgb24, opt); \
+ INPUT_FUNC(bgr24, opt)
+
+#if ARCH_X86_32
+INPUT_FUNCS(mmx);
+#endif
+INPUT_FUNCS(sse2);
+INPUT_FUNCS(ssse3);
+INPUT_FUNCS(avx);
+
+av_cold void ff_sws_init_swScale_mmx(SwsContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_INLINE_ASM
+ if (cpu_flags & AV_CPU_FLAG_MMX)
+ sws_init_swScale_MMX(c);
+#if HAVE_MMXEXT_INLINE
+ if (cpu_flags & AV_CPU_FLAG_MMXEXT)
+ sws_init_swScale_MMXEXT(c);
+ if (cpu_flags & AV_CPU_FLAG_SSE3){
+ if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND))
+ c->yuv2planeX = yuv2yuvX_sse3;
+ }
+#endif
+#endif /* HAVE_INLINE_ASM */
+
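+/* pick an external asm hscale function matching the source bit depth; destination
+ * depths up to 14 bits use the 15-bit intermediate variants, deeper ones the
+ * 19-bit variants */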
+#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
+ if (c->srcBpc == 8) { \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
+ ff_hscale8to19_ ## filtersize ## _ ## opt1; \
+ } else if (c->srcBpc == 9) { \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
+ ff_hscale9to19_ ## filtersize ## _ ## opt1; \
+ } else if (c->srcBpc == 10) { \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
+ ff_hscale10to19_ ## filtersize ## _ ## opt1; \
+ } else if (c->srcBpc == 12) { \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale12to15_ ## filtersize ## _ ## opt2 : \
+ ff_hscale12to19_ ## filtersize ## _ ## opt1; \
+ } else if (c->srcBpc == 14 || ((c->srcFormat==AV_PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth_minus1<15)) { \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \
+ ff_hscale14to19_ ## filtersize ## _ ## opt1; \
+ } else { /* c->srcBpc == 16 */ \
+ av_assert0(c->srcBpc == 16);\
+ hscalefn = c->dstBpc <= 14 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
+ ff_hscale16to19_ ## filtersize ## _ ## opt1; \
+ } \
+} while (0)
+#define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
+ switch (filtersize) { \
+ case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
+ case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
+ default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \
+ }
+#define ASSIGN_VSCALEX_FUNC(vscalefn, opt, do_16_case, condition_8bit) \
+switch(c->dstBpc){ \
+ case 16: do_16_case; break; \
+ case 10: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_10_ ## opt; break; \
+ case 9: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_ ## opt; break; \
+ default: if (condition_8bit) /*vscalefn = ff_yuv2planeX_8_ ## opt;*/ break; \
+ }
+#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \
+ switch(c->dstBpc){ \
+ case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \
+ case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \
+ case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \
+ case 8: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \
+ default: av_assert0(c->dstBpc>8); \
+ }
+#define case_rgb(x, X, opt) \
+ case AV_PIX_FMT_ ## X: \
+ c->lumToYV12 = ff_ ## x ## ToY_ ## opt; \
+ if (!c->chrSrcHSubSample) \
+ c->chrToYV12 = ff_ ## x ## ToUV_ ## opt; \
+ break
+#if ARCH_X86_32
+ if (EXTERNAL_MMX(cpu_flags)) {
+ ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
+ ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
+ ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmxext, cpu_flags & AV_CPU_FLAG_MMXEXT);
+
+ switch (c->srcFormat) {
+ case AV_PIX_FMT_Y400A:
+ c->lumToYV12 = ff_yuyvToY_mmx;
+ if (c->alpPixBuf)
+ c->alpToYV12 = ff_uyvyToY_mmx;
+ break;
+ case AV_PIX_FMT_YUYV422:
+ c->lumToYV12 = ff_yuyvToY_mmx;
+ c->chrToYV12 = ff_yuyvToUV_mmx;
+ break;
+ case AV_PIX_FMT_UYVY422:
+ c->lumToYV12 = ff_uyvyToY_mmx;
+ c->chrToYV12 = ff_uyvyToUV_mmx;
+ break;
+ case AV_PIX_FMT_NV12:
+ c->chrToYV12 = ff_nv12ToUV_mmx;
+ break;
+ case AV_PIX_FMT_NV21:
+ c->chrToYV12 = ff_nv21ToUV_mmx;
+ break;
+ case_rgb(rgb24, RGB24, mmx);
+ case_rgb(bgr24, BGR24, mmx);
+ case_rgb(bgra, BGRA, mmx);
+ case_rgb(rgba, RGBA, mmx);
+ case_rgb(abgr, ABGR, mmx);
+ case_rgb(argb, ARGB, mmx);
+ default:
+ break;
+ }
+ }
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmxext, , 1);
+ }
+#endif /* ARCH_X86_32 */
+#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
+ switch (filtersize) { \
+ case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
+ case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
+ default: if (filtersize & 4) ASSIGN_SCALE_FUNC2(hscalefn, X4, opt1, opt2); \
+ else ASSIGN_SCALE_FUNC2(hscalefn, X8, opt1, opt2); \
+ break; \
+ }
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2);
+ ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
+ ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, ,
+ HAVE_ALIGNED_STACK || ARCH_X86_64);
+ ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1);
+
+ switch (c->srcFormat) {
+ case AV_PIX_FMT_Y400A:
+ c->lumToYV12 = ff_yuyvToY_sse2;
+ if (c->alpPixBuf)
+ c->alpToYV12 = ff_uyvyToY_sse2;
+ break;
+ case AV_PIX_FMT_YUYV422:
+ c->lumToYV12 = ff_yuyvToY_sse2;
+ c->chrToYV12 = ff_yuyvToUV_sse2;
+ break;
+ case AV_PIX_FMT_UYVY422:
+ c->lumToYV12 = ff_uyvyToY_sse2;
+ c->chrToYV12 = ff_uyvyToUV_sse2;
+ break;
+ case AV_PIX_FMT_NV12:
+ c->chrToYV12 = ff_nv12ToUV_sse2;
+ break;
+ case AV_PIX_FMT_NV21:
+ c->chrToYV12 = ff_nv21ToUV_sse2;
+ break;
+ case_rgb(rgb24, RGB24, sse2);
+ case_rgb(bgr24, BGR24, sse2);
+ case_rgb(bgra, BGRA, sse2);
+ case_rgb(rgba, RGBA, sse2);
+ case_rgb(abgr, ABGR, sse2);
+ case_rgb(argb, ARGB, sse2);
+ default:
+ break;
+ }
+ }
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
+ ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, ssse3, ssse3);
+ switch (c->srcFormat) {
+ case_rgb(rgb24, RGB24, ssse3);
+ case_rgb(bgr24, BGR24, ssse3);
+ default:
+ break;
+ }
+ }
+ if (EXTERNAL_SSE4(cpu_flags)) {
+        /* the Xto15 scalers don't need special sse4 functions */
+ ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3);
+ ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
+ ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse4,
+ if (!isBE(c->dstFormat)) c->yuv2planeX = ff_yuv2planeX_16_sse4,
+ HAVE_ALIGNED_STACK || ARCH_X86_64);
+ if (c->dstBpc == 16 && !isBE(c->dstFormat))
+ c->yuv2plane1 = ff_yuv2plane1_16_sse4;
+ }
+
+ if (EXTERNAL_AVX(cpu_flags)) {
+ ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, ,
+ HAVE_ALIGNED_STACK || ARCH_X86_64);
+ ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1);
+
+ switch (c->srcFormat) {
+ case AV_PIX_FMT_YUYV422:
+ c->chrToYV12 = ff_yuyvToUV_avx;
+ break;
+ case AV_PIX_FMT_UYVY422:
+ c->chrToYV12 = ff_uyvyToUV_avx;
+ break;
+ case AV_PIX_FMT_NV12:
+ c->chrToYV12 = ff_nv12ToUV_avx;
+ break;
+ case AV_PIX_FMT_NV21:
+ c->chrToYV12 = ff_nv21ToUV_avx;
+ break;
+ case_rgb(rgb24, RGB24, avx);
+ case_rgb(bgr24, BGR24, avx);
+ case_rgb(bgra, BGRA, avx);
+ case_rgb(rgba, RGBA, avx);
+ case_rgb(abgr, ABGR, avx);
+ case_rgb(argb, ARGB, avx);
+ default:
+ break;
+ }
+ }
+}
diff --git a/ffmpeg/libswscale/x86/swscale_template.c b/ffmpeg/libswscale/x86/swscale_template.c
new file mode 100644
index 0000000..f2567c1
--- /dev/null
+++ b/ffmpeg/libswscale/x86/swscale_template.c
@@ -0,0 +1,1717 @@
+/*
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#undef REAL_MOVNTQ
+#undef MOVNTQ
+#undef MOVNTQ2
+#undef PREFETCH
+
+#if COMPILE_TEMPLATE_MMXEXT
+#define PREFETCH "prefetchnta"
+#else
+#define PREFETCH " # nop"
+#endif
+
+#if COMPILE_TEMPLATE_MMXEXT
+#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
+#define MOVNTQ2 "movntq "
+#else
+#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
+#define MOVNTQ2 "movq "
+#endif
+#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
+
+#if !COMPILE_TEMPLATE_MMXEXT
+static av_always_inline void
+dither_8to16(const uint8_t *srcDither, int rot)
+{
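+    /* load the 8-byte dither pattern into mm3 (low half) and mm4 (high half) as
+     * 16-bit words, first rotating it by 3 bytes when rot is set */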
+ if (rot) {
+ __asm__ volatile("pxor %%mm0, %%mm0\n\t"
+ "movq (%0), %%mm3\n\t"
+ "movq %%mm3, %%mm4\n\t"
+ "psrlq $24, %%mm3\n\t"
+ "psllq $40, %%mm4\n\t"
+ "por %%mm4, %%mm3\n\t"
+ "movq %%mm3, %%mm4\n\t"
+ "punpcklbw %%mm0, %%mm3\n\t"
+ "punpckhbw %%mm0, %%mm4\n\t"
+ :: "r"(srcDither)
+ );
+ } else {
+ __asm__ volatile("pxor %%mm0, %%mm0\n\t"
+ "movq (%0), %%mm3\n\t"
+ "movq %%mm3, %%mm4\n\t"
+ "punpcklbw %%mm0, %%mm3\n\t"
+ "punpckhbw %%mm0, %%mm4\n\t"
+ :: "r"(srcDither)
+ );
+ }
+}
+#endif
+
+static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset)
+{
+ dither_8to16(dither, offset);
+ filterSize--;
+ __asm__ volatile(
+ "movd %0, %%mm1\n\t"
+ "punpcklwd %%mm1, %%mm1\n\t"
+ "punpckldq %%mm1, %%mm1\n\t"
+ "psllw $3, %%mm1\n\t"
+ "paddw %%mm1, %%mm3\n\t"
+ "paddw %%mm1, %%mm4\n\t"
+ "psraw $4, %%mm3\n\t"
+ "psraw $4, %%mm4\n\t"
+ ::"m"(filterSize)
+ );
+
+ __asm__ volatile(\
+ "movq %%mm3, %%mm6\n\t"
+ "movq %%mm4, %%mm7\n\t"
+ "movl %3, %%ecx\n\t"
+ "mov %0, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ ".p2align 4 \n\t" /* FIXME Unroll? */\
+ "1: \n\t"\
+ "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
+ "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\
+ "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\
+ "add $16, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ "test %%"REG_S", %%"REG_S" \n\t"\
+ "pmulhw %%mm0, %%mm2 \n\t"\
+ "pmulhw %%mm0, %%mm5 \n\t"\
+ "paddw %%mm2, %%mm3 \n\t"\
+ "paddw %%mm5, %%mm4 \n\t"\
+ " jnz 1b \n\t"\
+ "psraw $3, %%mm3 \n\t"\
+ "psraw $3, %%mm4 \n\t"\
+ "packuswb %%mm4, %%mm3 \n\t"
+ MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t"
+ "add $8, %%"REG_c" \n\t"\
+ "cmp %2, %%"REG_c" \n\t"\
+ "movq %%mm6, %%mm3\n\t"
+ "movq %%mm7, %%mm4\n\t"
+ "mov %0, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ "jb 1b \n\t"\
+ :: "g" (filter),
+ "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
+ : "%"REG_d, "%"REG_S, "%"REG_c
+ );
+}
+
+#define YSCALEYUV2PACKEDX_UV \
+ __asm__ volatile(\
+ "xor %%"REG_a", %%"REG_a" \n\t"\
+ ".p2align 4 \n\t"\
+ "nop \n\t"\
+ "1: \n\t"\
+ "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
+ "movq %%mm3, %%mm4 \n\t"\
+ ".p2align 4 \n\t"\
+ "2: \n\t"\
+ "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
+ "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
+ "add %6, %%"REG_S" \n\t" \
+ "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
+ "add $16, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ "pmulhw %%mm0, %%mm2 \n\t"\
+ "pmulhw %%mm0, %%mm5 \n\t"\
+ "paddw %%mm2, %%mm3 \n\t"\
+ "paddw %%mm5, %%mm4 \n\t"\
+ "test %%"REG_S", %%"REG_S" \n\t"\
+ " jnz 2b \n\t"\
+
+#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
+ "lea "offset"(%0), %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
+ "movq "#dst1", "#dst2" \n\t"\
+ ".p2align 4 \n\t"\
+ "2: \n\t"\
+ "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
+ "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
+ "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
+ "add $16, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ "pmulhw "#coeff", "#src1" \n\t"\
+ "pmulhw "#coeff", "#src2" \n\t"\
+ "paddw "#src1", "#dst1" \n\t"\
+ "paddw "#src2", "#dst2" \n\t"\
+ "test %%"REG_S", %%"REG_S" \n\t"\
+ " jnz 2b \n\t"\
+
+#define YSCALEYUV2PACKEDX \
+ YSCALEYUV2PACKEDX_UV \
+ YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
+
+#define YSCALEYUV2PACKEDX_END \
+ :: "r" (&c->redDither), \
+ "m" (dummy), "m" (dummy), "m" (dummy),\
+ "r" (dest), "m" (dstW_reg), "m"(uv_off) \
+ : "%"REG_a, "%"REG_d, "%"REG_S \
+ );
+
+#define YSCALEYUV2PACKEDX_ACCURATE_UV \
+ __asm__ volatile(\
+ "xor %%"REG_a", %%"REG_a" \n\t"\
+ ".p2align 4 \n\t"\
+ "nop \n\t"\
+ "1: \n\t"\
+ "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ "pxor %%mm4, %%mm4 \n\t"\
+ "pxor %%mm5, %%mm5 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ "pxor %%mm7, %%mm7 \n\t"\
+ ".p2align 4 \n\t"\
+ "2: \n\t"\
+ "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
+ "add %6, %%"REG_S" \n\t" \
+ "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
+ "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
+ "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
+ "movq %%mm0, %%mm3 \n\t"\
+ "punpcklwd %%mm1, %%mm0 \n\t"\
+ "punpckhwd %%mm1, %%mm3 \n\t"\
+ "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
+ "pmaddwd %%mm1, %%mm0 \n\t"\
+ "pmaddwd %%mm1, %%mm3 \n\t"\
+ "paddd %%mm0, %%mm4 \n\t"\
+ "paddd %%mm3, %%mm5 \n\t"\
+ "add %6, %%"REG_S" \n\t" \
+ "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
+ "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
+ "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
+ "test %%"REG_S", %%"REG_S" \n\t"\
+ "movq %%mm2, %%mm0 \n\t"\
+ "punpcklwd %%mm3, %%mm2 \n\t"\
+ "punpckhwd %%mm3, %%mm0 \n\t"\
+ "pmaddwd %%mm1, %%mm2 \n\t"\
+ "pmaddwd %%mm1, %%mm0 \n\t"\
+ "paddd %%mm2, %%mm6 \n\t"\
+ "paddd %%mm0, %%mm7 \n\t"\
+ " jnz 2b \n\t"\
+ "psrad $16, %%mm4 \n\t"\
+ "psrad $16, %%mm5 \n\t"\
+ "psrad $16, %%mm6 \n\t"\
+ "psrad $16, %%mm7 \n\t"\
+ "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
+ "packssdw %%mm5, %%mm4 \n\t"\
+ "packssdw %%mm7, %%mm6 \n\t"\
+ "paddw %%mm0, %%mm4 \n\t"\
+ "paddw %%mm0, %%mm6 \n\t"\
+ "movq %%mm4, "U_TEMP"(%0) \n\t"\
+ "movq %%mm6, "V_TEMP"(%0) \n\t"\
+
+#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
+ "lea "offset"(%0), %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ "pxor %%mm1, %%mm1 \n\t"\
+ "pxor %%mm5, %%mm5 \n\t"\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ ".p2align 4 \n\t"\
+ "2: \n\t"\
+ "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
+ "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
+ "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
+ "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
+ "movq %%mm0, %%mm3 \n\t"\
+ "punpcklwd %%mm4, %%mm0 \n\t"\
+ "punpckhwd %%mm4, %%mm3 \n\t"\
+ "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
+ "pmaddwd %%mm4, %%mm0 \n\t"\
+ "pmaddwd %%mm4, %%mm3 \n\t"\
+ "paddd %%mm0, %%mm1 \n\t"\
+ "paddd %%mm3, %%mm5 \n\t"\
+ "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
+ "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
+ "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
+ "test %%"REG_S", %%"REG_S" \n\t"\
+ "movq %%mm2, %%mm0 \n\t"\
+ "punpcklwd %%mm3, %%mm2 \n\t"\
+ "punpckhwd %%mm3, %%mm0 \n\t"\
+ "pmaddwd %%mm4, %%mm2 \n\t"\
+ "pmaddwd %%mm4, %%mm0 \n\t"\
+ "paddd %%mm2, %%mm7 \n\t"\
+ "paddd %%mm0, %%mm6 \n\t"\
+ " jnz 2b \n\t"\
+ "psrad $16, %%mm1 \n\t"\
+ "psrad $16, %%mm5 \n\t"\
+ "psrad $16, %%mm7 \n\t"\
+ "psrad $16, %%mm6 \n\t"\
+ "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
+ "packssdw %%mm5, %%mm1 \n\t"\
+ "packssdw %%mm6, %%mm7 \n\t"\
+ "paddw %%mm0, %%mm1 \n\t"\
+ "paddw %%mm0, %%mm7 \n\t"\
+ "movq "U_TEMP"(%0), %%mm3 \n\t"\
+ "movq "V_TEMP"(%0), %%mm4 \n\t"\
+
+#define YSCALEYUV2PACKEDX_ACCURATE \
+ YSCALEYUV2PACKEDX_ACCURATE_UV \
+ YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
+
+#define YSCALEYUV2RGBX \
+ "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
+ "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
+ "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
+ "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
+ "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
+ "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
+ /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
+ "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
+ "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
+ "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
+ "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
+ "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
+ "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
+ /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
+ "paddw %%mm3, %%mm4 \n\t"\
+ "movq %%mm2, %%mm0 \n\t"\
+ "movq %%mm5, %%mm6 \n\t"\
+ "movq %%mm4, %%mm3 \n\t"\
+ "punpcklwd %%mm2, %%mm2 \n\t"\
+ "punpcklwd %%mm5, %%mm5 \n\t"\
+ "punpcklwd %%mm4, %%mm4 \n\t"\
+ "paddw %%mm1, %%mm2 \n\t"\
+ "paddw %%mm1, %%mm5 \n\t"\
+ "paddw %%mm1, %%mm4 \n\t"\
+ "punpckhwd %%mm0, %%mm0 \n\t"\
+ "punpckhwd %%mm6, %%mm6 \n\t"\
+ "punpckhwd %%mm3, %%mm3 \n\t"\
+ "paddw %%mm7, %%mm0 \n\t"\
+ "paddw %%mm7, %%mm6 \n\t"\
+ "paddw %%mm7, %%mm3 \n\t"\
+ /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
+ "packuswb %%mm0, %%mm2 \n\t"\
+ "packuswb %%mm6, %%mm5 \n\t"\
+ "packuswb %%mm3, %%mm4 \n\t"\
+
+#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
+ "movq "#b", "#q2" \n\t" /* B */\
+ "movq "#r", "#t" \n\t" /* R */\
+ "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
+ "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
+ "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
+ "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
+ "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
+ "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
+ "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
+ "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
+ "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
+ "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
+\
+ MOVNTQ( q0, (dst, index, 4))\
+ MOVNTQ( b, 8(dst, index, 4))\
+ MOVNTQ( q2, 16(dst, index, 4))\
+ MOVNTQ( q3, 24(dst, index, 4))\
+\
+ "add $8, "#index" \n\t"\
+ "cmp "#dstw", "#index" \n\t"\
+ " jb 1b \n\t"
+#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
+
+static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc,
+ int chrFilterSize, const int16_t **alpSrc,
+ uint8_t *dest, int dstW, int dstY)
+{
+ x86_reg dummy=0;
+ x86_reg dstW_reg = dstW;
+ x86_reg uv_off = c->uv_offx2;
+
+ if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+ YSCALEYUV2PACKEDX_ACCURATE
+ YSCALEYUV2RGBX
+ "movq %%mm2, "U_TEMP"(%0) \n\t"
+ "movq %%mm4, "V_TEMP"(%0) \n\t"
+ "movq %%mm5, "Y_TEMP"(%0) \n\t"
+ YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
+ "movq "Y_TEMP"(%0), %%mm5 \n\t"
+ "psraw $3, %%mm1 \n\t"
+ "psraw $3, %%mm7 \n\t"
+ "packuswb %%mm7, %%mm1 \n\t"
+ WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
+ YSCALEYUV2PACKEDX_END
+ } else {
+ YSCALEYUV2PACKEDX_ACCURATE
+ YSCALEYUV2RGBX
+ "pcmpeqd %%mm7, %%mm7 \n\t"
+ WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ YSCALEYUV2PACKEDX_END
+ }
+}
+
+static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc,
+ int chrFilterSize, const int16_t **alpSrc,
+ uint8_t *dest, int dstW, int dstY)
+{
+ x86_reg dummy=0;
+ x86_reg dstW_reg = dstW;
+ x86_reg uv_off = c->uv_offx2;
+
+ if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+ YSCALEYUV2PACKEDX
+ YSCALEYUV2RGBX
+ YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
+ "psraw $3, %%mm1 \n\t"
+ "psraw $3, %%mm7 \n\t"
+ "packuswb %%mm7, %%mm1 \n\t"
+ WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
+ YSCALEYUV2PACKEDX_END
+ } else {
+ YSCALEYUV2PACKEDX
+ YSCALEYUV2RGBX
+ "pcmpeqd %%mm7, %%mm7 \n\t"
+ WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ YSCALEYUV2PACKEDX_END
+ }
+}
+
+#define REAL_WRITERGB16(dst, dstw, index) \
+ "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
+ "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
+ "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
+ "psrlq $3, %%mm2 \n\t"\
+\
+ "movq %%mm2, %%mm1 \n\t"\
+ "movq %%mm4, %%mm3 \n\t"\
+\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm5, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm4 \n\t"\
+ "punpckhbw %%mm5, %%mm1 \n\t"\
+\
+ "psllq $3, %%mm3 \n\t"\
+ "psllq $3, %%mm4 \n\t"\
+\
+ "por %%mm3, %%mm2 \n\t"\
+ "por %%mm4, %%mm1 \n\t"\
+\
+ MOVNTQ(%%mm2, (dst, index, 2))\
+ MOVNTQ(%%mm1, 8(dst, index, 2))\
+\
+ "add $8, "#index" \n\t"\
+ "cmp "#dstw", "#index" \n\t"\
+ " jb 1b \n\t"
+#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
+
+static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc,
+ int chrFilterSize, const int16_t **alpSrc,
+ uint8_t *dest, int dstW, int dstY)
+{
+ x86_reg dummy=0;
+ x86_reg dstW_reg = dstW;
+ x86_reg uv_off = c->uv_offx2;
+
+ YSCALEYUV2PACKEDX_ACCURATE
+ YSCALEYUV2RGBX
+ "pxor %%mm7, %%mm7 \n\t"
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+ "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
+ "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
+ "paddusb "RED_DITHER"(%0), %%mm5\n\t"
+#endif
+ WRITERGB16(%4, %5, %%REGa)
+ YSCALEYUV2PACKEDX_END
+}
+
+static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc,
+ int chrFilterSize, const int16_t **alpSrc,
+ uint8_t *dest, int dstW, int dstY)
+{
+ x86_reg dummy=0;
+ x86_reg dstW_reg = dstW;
+ x86_reg uv_off = c->uv_offx2;
+
+ YSCALEYUV2PACKEDX
+ YSCALEYUV2RGBX
+ "pxor %%mm7, %%mm7 \n\t"
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+ "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
+ "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
+ "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
+#endif
+ WRITERGB16(%4, %5, %%REGa)
+ YSCALEYUV2PACKEDX_END
+}
+
+#define REAL_WRITERGB15(dst, dstw, index) \
+ "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
+ "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
+ "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
+ "psrlq $3, %%mm2 \n\t"\
+ "psrlq $1, %%mm5 \n\t"\
+\
+ "movq %%mm2, %%mm1 \n\t"\
+ "movq %%mm4, %%mm3 \n\t"\
+\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm5, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm4 \n\t"\
+ "punpckhbw %%mm5, %%mm1 \n\t"\
+\
+ "psllq $2, %%mm3 \n\t"\
+ "psllq $2, %%mm4 \n\t"\
+\
+ "por %%mm3, %%mm2 \n\t"\
+ "por %%mm4, %%mm1 \n\t"\
+\
+ MOVNTQ(%%mm2, (dst, index, 2))\
+ MOVNTQ(%%mm1, 8(dst, index, 2))\
+\
+ "add $8, "#index" \n\t"\
+ "cmp "#dstw", "#index" \n\t"\
+ " jb 1b \n\t"
+#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
+
+static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc,
+ int chrFilterSize, const int16_t **alpSrc,
+ uint8_t *dest, int dstW, int dstY)
+{
+ x86_reg dummy=0;
+ x86_reg dstW_reg = dstW;
+ x86_reg uv_off = c->uv_offx2;
+
+ YSCALEYUV2PACKEDX_ACCURATE
+ YSCALEYUV2RGBX
+ "pxor %%mm7, %%mm7 \n\t"
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+ "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
+ "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
+ "paddusb "RED_DITHER"(%0), %%mm5\n\t"
+#endif
+ WRITERGB15(%4, %5, %%REGa)
+ YSCALEYUV2PACKEDX_END
+}
+
+static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc,
+ int chrFilterSize, const int16_t **alpSrc,
+ uint8_t *dest, int dstW, int dstY)
+{
+ x86_reg dummy=0;
+ x86_reg dstW_reg = dstW;
+ x86_reg uv_off = c->uv_offx2;
+
+ YSCALEYUV2PACKEDX
+ YSCALEYUV2RGBX
+ "pxor %%mm7, %%mm7 \n\t"
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+ "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
+ "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
+ "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
+#endif
+ WRITERGB15(%4, %5, %%REGa)
+ YSCALEYUV2PACKEDX_END
+}
+
+#define WRITEBGR24MMX(dst, dstw, index) \
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
+ "movq %%mm2, %%mm1 \n\t" /* B */\
+ "movq %%mm5, %%mm6 \n\t" /* R */\
+ "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
+ "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
+ "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
+ "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
+ "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
+ "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
+ "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
+ "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
+ "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
+ "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
+\
+ "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
+ "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
+ "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
+ "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
+\
+ "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
+ "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
+ "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
+ "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
+\
+ "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
+ "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
+ "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
+ "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
+\
+ "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
+ "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
+ "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
+ "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
+ MOVNTQ(%%mm0, (dst))\
+\
+ "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
+ "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
+ "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
+ "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
+ MOVNTQ(%%mm6, 8(dst))\
+\
+ "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
+ "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
+ "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
+ MOVNTQ(%%mm5, 16(dst))\
+\
+ "add $24, "#dst" \n\t"\
+\
+ "add $8, "#index" \n\t"\
+ "cmp "#dstw", "#index" \n\t"\
+ " jb 1b \n\t"
+
+#define WRITEBGR24MMXEXT(dst, dstw, index) \
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
+ "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
+ "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
+ "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
+ "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
+ "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
+\
+ "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
+ "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
+ "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
+\
+ "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
+ "por %%mm1, %%mm6 \n\t"\
+ "por %%mm3, %%mm6 \n\t"\
+ MOVNTQ(%%mm6, (dst))\
+\
+ "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
+ "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
+ "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
+ "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
+\
+ "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
+ "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
+ "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
+\
+ "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
+ "por %%mm3, %%mm6 \n\t"\
+ MOVNTQ(%%mm6, 8(dst))\
+\
+ "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
+ "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
+ "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
+\
+ "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
+ "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
+ "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
+\
+ "por %%mm1, %%mm3 \n\t"\
+ "por %%mm3, %%mm6 \n\t"\
+ MOVNTQ(%%mm6, 16(dst))\
+\
+ "add $24, "#dst" \n\t"\
+\
+ "add $8, "#index" \n\t"\
+ "cmp "#dstw", "#index" \n\t"\
+ " jb 1b \n\t"
+
+#if COMPILE_TEMPLATE_MMXEXT
+#undef WRITEBGR24
+#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
+#else
+#undef WRITEBGR24
+#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
+#endif
+
+static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc,
+ int chrFilterSize, const int16_t **alpSrc,
+ uint8_t *dest, int dstW, int dstY)
+{
+ x86_reg dummy=0;
+ x86_reg dstW_reg = dstW;
+ x86_reg uv_off = c->uv_offx2;
+
+ YSCALEYUV2PACKEDX_ACCURATE
+ YSCALEYUV2RGBX
+ "pxor %%mm7, %%mm7 \n\t"
+ "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
+ "add %4, %%"REG_c" \n\t"
+ WRITEBGR24(%%REGc, %5, %%REGa)
+ :: "r" (&c->redDither),
+ "m" (dummy), "m" (dummy), "m" (dummy),
+ "r" (dest), "m" (dstW_reg), "m"(uv_off)
+ : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
+ );
+}
+
+static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc,
+ int chrFilterSize, const int16_t **alpSrc,
+ uint8_t *dest, int dstW, int dstY)
+{
+ x86_reg dummy=0;
+ x86_reg dstW_reg = dstW;
+ x86_reg uv_off = c->uv_offx2;
+
+ YSCALEYUV2PACKEDX
+ YSCALEYUV2RGBX
+ "pxor %%mm7, %%mm7 \n\t"
+ "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
+ "add %4, %%"REG_c" \n\t"
+ WRITEBGR24(%%REGc, %5, %%REGa)
+ :: "r" (&c->redDither),
+ "m" (dummy), "m" (dummy), "m" (dummy),
+ "r" (dest), "m" (dstW_reg), "m"(uv_off)
+ : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
+ );
+}
+
+#define REAL_WRITEYUY2(dst, dstw, index) \
+ "packuswb %%mm3, %%mm3 \n\t"\
+ "packuswb %%mm4, %%mm4 \n\t"\
+ "packuswb %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm4, %%mm3 \n\t"\
+ "movq %%mm1, %%mm7 \n\t"\
+ "punpcklbw %%mm3, %%mm1 \n\t"\
+ "punpckhbw %%mm3, %%mm7 \n\t"\
+\
+ MOVNTQ(%%mm1, (dst, index, 2))\
+ MOVNTQ(%%mm7, 8(dst, index, 2))\
+\
+ "add $8, "#index" \n\t"\
+ "cmp "#dstw", "#index" \n\t"\
+ " jb 1b \n\t"
+#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
+
+static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc,
+ int chrFilterSize, const int16_t **alpSrc,
+ uint8_t *dest, int dstW, int dstY)
+{
+ x86_reg dummy=0;
+ x86_reg dstW_reg = dstW;
+ x86_reg uv_off = c->uv_offx2;
+
+ YSCALEYUV2PACKEDX_ACCURATE
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+ "psraw $3, %%mm3 \n\t"
+ "psraw $3, %%mm4 \n\t"
+ "psraw $3, %%mm1 \n\t"
+ "psraw $3, %%mm7 \n\t"
+ WRITEYUY2(%4, %5, %%REGa)
+ YSCALEYUV2PACKEDX_END
+}
+
+static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc,
+ int chrFilterSize, const int16_t **alpSrc,
+ uint8_t *dest, int dstW, int dstY)
+{
+ x86_reg dummy=0;
+ x86_reg dstW_reg = dstW;
+ x86_reg uv_off = c->uv_offx2;
+
+ YSCALEYUV2PACKEDX
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+ "psraw $3, %%mm3 \n\t"
+ "psraw $3, %%mm4 \n\t"
+ "psraw $3, %%mm1 \n\t"
+ "psraw $3, %%mm7 \n\t"
+ WRITEYUY2(%4, %5, %%REGa)
+ YSCALEYUV2PACKEDX_END
+}
+
+#define REAL_YSCALEYUV2RGB_UV(index, c) \
+ "xor "#index", "#index" \n\t"\
+ ".p2align 4 \n\t"\
+ "1: \n\t"\
+ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
+ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
+ "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
+ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
+ "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
+ "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
+ "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
+ "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
+ "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
+ "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
+ "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
+ "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
+ "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
+ "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
+ "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
+ "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
+ "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
+ "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
+ "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
+ "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
+ "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
+ /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
+
+#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
+ "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
+ "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
+ "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
+ "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
+ "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
+ "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
+ "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
+ "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
+ "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
+ "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
+ "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
+ "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
+
+#define REAL_YSCALEYUV2RGB_COEFF(c) \
+ "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
+ "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
+ "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
+ "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
+ "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
+ "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
+ /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
+ "paddw %%mm3, %%mm4 \n\t"\
+ "movq %%mm2, %%mm0 \n\t"\
+ "movq %%mm5, %%mm6 \n\t"\
+ "movq %%mm4, %%mm3 \n\t"\
+ "punpcklwd %%mm2, %%mm2 \n\t"\
+ "punpcklwd %%mm5, %%mm5 \n\t"\
+ "punpcklwd %%mm4, %%mm4 \n\t"\
+ "paddw %%mm1, %%mm2 \n\t"\
+ "paddw %%mm1, %%mm5 \n\t"\
+ "paddw %%mm1, %%mm4 \n\t"\
+ "punpckhwd %%mm0, %%mm0 \n\t"\
+ "punpckhwd %%mm6, %%mm6 \n\t"\
+ "punpckhwd %%mm3, %%mm3 \n\t"\
+ "paddw %%mm7, %%mm0 \n\t"\
+ "paddw %%mm7, %%mm6 \n\t"\
+ "paddw %%mm7, %%mm3 \n\t"\
+ /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
+ "packuswb %%mm0, %%mm2 \n\t"\
+ "packuswb %%mm6, %%mm5 \n\t"\
+ "packuswb %%mm3, %%mm4 \n\t"\
+
+#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
+
+#define YSCALEYUV2RGB(index, c) \
+ REAL_YSCALEYUV2RGB_UV(index, c) \
+ REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
+ REAL_YSCALEYUV2RGB_COEFF(c)
+
+/**
+ * vertical bilinear scale YV12 to RGB
+ */
+static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf[2], uint8_t *dest,
+ int dstW, int yalpha, int uvalpha, int y)
+{
+ const int16_t *buf0 = buf[0], *buf1 = buf[1],
+ *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
+
+ if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+ const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
+#if ARCH_X86_64
+ __asm__ volatile(
+ YSCALEYUV2RGB(%%r8, %5)
+ YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
+ "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
+ "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
+ "packuswb %%mm7, %%mm1 \n\t"
+ WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
+ :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
+ "a" (&c->redDither),
+ "r" (abuf0), "r" (abuf1)
+ : "%r8"
+ );
+#else
+ c->u_temp=(intptr_t)abuf0;
+ c->v_temp=(intptr_t)abuf1;
+ __asm__ volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB(%%REGBP, %5)
+ "push %0 \n\t"
+ "push %1 \n\t"
+ "mov "U_TEMP"(%5), %0 \n\t"
+ "mov "V_TEMP"(%5), %1 \n\t"
+ YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
+ "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
+ "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
+ "packuswb %%mm7, %%mm1 \n\t"
+ "pop %1 \n\t"
+ "pop %0 \n\t"
+ WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+#endif
+ } else {
+ __asm__ volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB(%%REGBP, %5)
+ "pcmpeqd %%mm7, %%mm7 \n\t"
+ WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ }
+}
+
+static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf[2], uint8_t *dest,
+ int dstW, int yalpha, int uvalpha, int y)
+{
+ const int16_t *buf0 = buf[0], *buf1 = buf[1],
+ *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
+
+ //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
+ __asm__ volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB(%%REGBP, %5)
+ "pxor %%mm7, %%mm7 \n\t"
+ WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+}
+
+static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf[2], uint8_t *dest,
+ int dstW, int yalpha, int uvalpha, int y)
+{
+ const int16_t *buf0 = buf[0], *buf1 = buf[1],
+ *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
+
+ //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
+ __asm__ volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB(%%REGBP, %5)
+ "pxor %%mm7, %%mm7 \n\t"
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+ "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
+ "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
+ "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
+#endif
+ WRITERGB15(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+}
+
+static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf[2], uint8_t *dest,
+ int dstW, int yalpha, int uvalpha, int y)
+{
+ const int16_t *buf0 = buf[0], *buf1 = buf[1],
+ *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
+
+ //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
+ __asm__ volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB(%%REGBP, %5)
+ "pxor %%mm7, %%mm7 \n\t"
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+ "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
+ "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
+ "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
+#endif
+ WRITERGB16(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+}
+
+#define REAL_YSCALEYUV2PACKED(index, c) \
+ "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
+ "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
+ "psraw $3, %%mm0 \n\t"\
+ "psraw $3, %%mm1 \n\t"\
+ "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
+ "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
+ "xor "#index", "#index" \n\t"\
+ ".p2align 4 \n\t"\
+ "1: \n\t"\
+ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
+ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
+ "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
+ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
+ "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
+ "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
+ "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
+ "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
+ "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
+ "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
+ "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
+ "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
+ "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
+ "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
+ "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
+ "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
+ "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
+ "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
+ "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
+ "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
+ "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
+ "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
+ "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
+ "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
+ "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
+ "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
+ "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
+
+#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
+
+static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf[2], uint8_t *dest,
+ int dstW, int yalpha, int uvalpha, int y)
+{
+ const int16_t *buf0 = buf[0], *buf1 = buf[1],
+ *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
+
+ //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
+ __asm__ volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2PACKED(%%REGBP, %5)
+ WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+}
+
+#define REAL_YSCALEYUV2RGB1(index, c) \
+ "xor "#index", "#index" \n\t"\
+ ".p2align 4 \n\t"\
+ "1: \n\t"\
+ "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
+ "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
+ "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
+ "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
+ "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
+ "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
+ "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
+ "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
+ "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
+ "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
+ "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
+ "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
+ /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
+ "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
+ "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
+ "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
+ "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
+ "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
+ "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
+ "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
+ "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
+ "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
+ "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
+ /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
+ "paddw %%mm3, %%mm4 \n\t"\
+ "movq %%mm2, %%mm0 \n\t"\
+ "movq %%mm5, %%mm6 \n\t"\
+ "movq %%mm4, %%mm3 \n\t"\
+ "punpcklwd %%mm2, %%mm2 \n\t"\
+ "punpcklwd %%mm5, %%mm5 \n\t"\
+ "punpcklwd %%mm4, %%mm4 \n\t"\
+ "paddw %%mm1, %%mm2 \n\t"\
+ "paddw %%mm1, %%mm5 \n\t"\
+ "paddw %%mm1, %%mm4 \n\t"\
+ "punpckhwd %%mm0, %%mm0 \n\t"\
+ "punpckhwd %%mm6, %%mm6 \n\t"\
+ "punpckhwd %%mm3, %%mm3 \n\t"\
+ "paddw %%mm7, %%mm0 \n\t"\
+ "paddw %%mm7, %%mm6 \n\t"\
+ "paddw %%mm7, %%mm3 \n\t"\
+ /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
+ "packuswb %%mm0, %%mm2 \n\t"\
+ "packuswb %%mm6, %%mm5 \n\t"\
+ "packuswb %%mm3, %%mm4 \n\t"\
+
+#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
+
+// do vertical chrominance interpolation
+#define REAL_YSCALEYUV2RGB1b(index, c) \
+ "xor "#index", "#index" \n\t"\
+ ".p2align 4 \n\t"\
+ "1: \n\t"\
+ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
+ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
+ "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
+ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
+ "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
+ "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
+ "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
+ "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
+ "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
+ "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
+ "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
+ "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
+ "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
+ "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
+ "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
+ "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
+ /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
+ "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
+ "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
+ "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
+ "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
+ "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
+ "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
+ "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
+ "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
+ "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
+ "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
+ /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
+ "paddw %%mm3, %%mm4 \n\t"\
+ "movq %%mm2, %%mm0 \n\t"\
+ "movq %%mm5, %%mm6 \n\t"\
+ "movq %%mm4, %%mm3 \n\t"\
+ "punpcklwd %%mm2, %%mm2 \n\t"\
+ "punpcklwd %%mm5, %%mm5 \n\t"\
+ "punpcklwd %%mm4, %%mm4 \n\t"\
+ "paddw %%mm1, %%mm2 \n\t"\
+ "paddw %%mm1, %%mm5 \n\t"\
+ "paddw %%mm1, %%mm4 \n\t"\
+ "punpckhwd %%mm0, %%mm0 \n\t"\
+ "punpckhwd %%mm6, %%mm6 \n\t"\
+ "punpckhwd %%mm3, %%mm3 \n\t"\
+ "paddw %%mm7, %%mm0 \n\t"\
+ "paddw %%mm7, %%mm6 \n\t"\
+ "paddw %%mm7, %%mm3 \n\t"\
+ /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
+ "packuswb %%mm0, %%mm2 \n\t"\
+ "packuswb %%mm6, %%mm5 \n\t"\
+ "packuswb %%mm3, %%mm4 \n\t"\
+
+#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
+
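+/* Illustrative scalar sketch (not part of the template) of the two unscaled
+ * chroma paths defined above: YSCALEYUV2RGB1 reads one chroma line directly,
+ * while YSCALEYUV2RGB1b averages the two neighbouring lines:
+ *
+ *     uv_plain = uvbuf0[i] >> 4;                // YSCALEYUV2RGB1
+ *     uv_avg   = (uvbuf0[i] + uvbuf1[i]) >> 5;  // YSCALEYUV2RGB1b
+ *
+ * The callers below pick the plain variant when uvalpha < 2048, accepting a
+ * half-pixel chroma shift in exchange for a slightly faster loop.
+ */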
+#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
+ "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
+ "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
+ "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
+ "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
+ "packuswb %%mm1, %%mm7 \n\t"
+#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
+
+/**
+ * YV12 to RGB without scaling or interpolating
+ */
+static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf0, uint8_t *dest,
+ int dstW, int uvalpha, int y)
+{
+ const int16_t *ubuf0 = ubuf[0];
+ const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
+
+ if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
+ const int16_t *ubuf1 = ubuf[0];
+ if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+ __asm__ volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB1(%%REGBP, %5)
+ YSCALEYUV2RGB1_ALPHA(%%REGBP)
+ WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ } else {
+ __asm__ volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB1(%%REGBP, %5)
+ "pcmpeqd %%mm7, %%mm7 \n\t"
+ WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ }
+ } else {
+ const int16_t *ubuf1 = ubuf[1];
+ if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+ __asm__ volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB1b(%%REGBP, %5)
+ YSCALEYUV2RGB1_ALPHA(%%REGBP)
+ WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ } else {
+ __asm__ volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB1b(%%REGBP, %5)
+ "pcmpeqd %%mm7, %%mm7 \n\t"
+ WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ }
+ }
+}
+
+static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf0, uint8_t *dest,
+ int dstW, int uvalpha, int y)
+{
+ const int16_t *ubuf0 = ubuf[0];
+ const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
+
+ if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
+ const int16_t *ubuf1 = ubuf[0];
+ __asm__ volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB1(%%REGBP, %5)
+ "pxor %%mm7, %%mm7 \n\t"
+ WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ } else {
+ const int16_t *ubuf1 = ubuf[1];
+ __asm__ volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB1b(%%REGBP, %5)
+ "pxor %%mm7, %%mm7 \n\t"
+ WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ }
+}
+
+static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf0, uint8_t *dest,
+ int dstW, int uvalpha, int y)
+{
+ const int16_t *ubuf0 = ubuf[0];
+ const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
+
+ if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
+ const int16_t *ubuf1 = ubuf[0];
+ __asm__ volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB1(%%REGBP, %5)
+ "pxor %%mm7, %%mm7 \n\t"
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+ "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
+ "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
+ "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
+#endif
+ WRITERGB15(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ } else {
+ const int16_t *ubuf1 = ubuf[1];
+ __asm__ volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB1b(%%REGBP, %5)
+ "pxor %%mm7, %%mm7 \n\t"
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+ "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
+ "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
+ "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
+#endif
+ WRITERGB15(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ }
+}
+
+static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf0, uint8_t *dest,
+ int dstW, int uvalpha, int y)
+{
+ const int16_t *ubuf0 = ubuf[0];
+ const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
+
+ if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
+ const int16_t *ubuf1 = ubuf[0];
+ __asm__ volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB1(%%REGBP, %5)
+ "pxor %%mm7, %%mm7 \n\t"
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+ "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
+ "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
+ "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
+#endif
+ WRITERGB16(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ } else {
+ const int16_t *ubuf1 = ubuf[1];
+ __asm__ volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB1b(%%REGBP, %5)
+ "pxor %%mm7, %%mm7 \n\t"
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+ "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
+ "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
+ "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
+#endif
+ WRITERGB16(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ }
+}
+
+#define REAL_YSCALEYUV2PACKED1(index, c) \
+ "xor "#index", "#index" \n\t"\
+ ".p2align 4 \n\t"\
+ "1: \n\t"\
+ "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
+ "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
+ "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
+ "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
+ "psraw $7, %%mm3 \n\t" \
+ "psraw $7, %%mm4 \n\t" \
+ "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
+ "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
+ "psraw $7, %%mm1 \n\t" \
+ "psraw $7, %%mm7 \n\t" \
+
+#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
+
+#define REAL_YSCALEYUV2PACKED1b(index, c) \
+ "xor "#index", "#index" \n\t"\
+ ".p2align 4 \n\t"\
+ "1: \n\t"\
+ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
+ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
+ "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
+ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
+ "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
+ "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
+ "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
+ "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
+ "psrlw $8, %%mm3 \n\t" \
+ "psrlw $8, %%mm4 \n\t" \
+ "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
+ "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
+ "psraw $7, %%mm1 \n\t" \
+ "psraw $7, %%mm7 \n\t"
+#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
+
+static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf0, uint8_t *dest,
+ int dstW, int uvalpha, int y)
+{
+ const int16_t *ubuf0 = ubuf[0];
+ const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
+
+ if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
+ const int16_t *ubuf1 = ubuf[0];
+ __asm__ volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2PACKED1(%%REGBP, %5)
+ WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ } else {
+ const int16_t *ubuf1 = ubuf[1];
+ __asm__ volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2PACKED1b(%%REGBP, %5)
+ WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ }
+}
+
+#if COMPILE_TEMPLATE_MMXEXT
+static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
+ int dstWidth, const uint8_t *src,
+ int srcW, int xInc)
+{
+ int32_t *filterPos = c->hLumFilterPos;
+ int16_t *filter = c->hLumFilter;
+ void *mmxextFilterCode = c->lumMmxextFilterCode;
+ int i;
+#if defined(PIC)
+ uint64_t ebxsave;
+#endif
+#if ARCH_X86_64
+ uint64_t retsave;
+#endif
+
+ __asm__ volatile(
+#if defined(PIC)
+ "mov %%"REG_b", %5 \n\t"
+#if ARCH_X86_64
+ "mov -8(%%rsp), %%"REG_a" \n\t"
+ "mov %%"REG_a", %6 \n\t"
+#endif
+#else
+#if ARCH_X86_64
+ "mov -8(%%rsp), %%"REG_a" \n\t"
+ "mov %%"REG_a", %5 \n\t"
+#endif
+#endif
+ "pxor %%mm7, %%mm7 \n\t"
+ "mov %0, %%"REG_c" \n\t"
+ "mov %1, %%"REG_D" \n\t"
+ "mov %2, %%"REG_d" \n\t"
+ "mov %3, %%"REG_b" \n\t"
+ "xor %%"REG_a", %%"REG_a" \n\t" // i
+ PREFETCH" (%%"REG_c") \n\t"
+ PREFETCH" 32(%%"REG_c") \n\t"
+ PREFETCH" 64(%%"REG_c") \n\t"
+
+#if ARCH_X86_64
+#define CALL_MMXEXT_FILTER_CODE \
+ "movl (%%"REG_b"), %%esi \n\t"\
+ "call *%4 \n\t"\
+ "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
+ "add %%"REG_S", %%"REG_c" \n\t"\
+ "add %%"REG_a", %%"REG_D" \n\t"\
+ "xor %%"REG_a", %%"REG_a" \n\t"\
+
+#else
+#define CALL_MMXEXT_FILTER_CODE \
+ "movl (%%"REG_b"), %%esi \n\t"\
+ "call *%4 \n\t"\
+ "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
+ "add %%"REG_a", %%"REG_D" \n\t"\
+ "xor %%"REG_a", %%"REG_a" \n\t"\
+
+#endif /* ARCH_X86_64 */
+
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+
+#if defined(PIC)
+ "mov %5, %%"REG_b" \n\t"
+#if ARCH_X86_64
+ "mov %6, %%"REG_a" \n\t"
+ "mov %%"REG_a", -8(%%rsp) \n\t"
+#endif
+#else
+#if ARCH_X86_64
+ "mov %5, %%"REG_a" \n\t"
+ "mov %%"REG_a", -8(%%rsp) \n\t"
+#endif
+#endif
+ :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
+ "m" (mmxextFilterCode)
+#if defined(PIC)
+ ,"m" (ebxsave)
+#endif
+#if ARCH_X86_64
+ ,"m"(retsave)
+#endif
+ : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
+#if !defined(PIC)
+ ,"%"REG_b
+#endif
+ );
+
+ for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
+ dst[i] = src[srcW-1]*128;
+}
+
+static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
+ int dstWidth, const uint8_t *src1,
+ const uint8_t *src2, int srcW, int xInc)
+{
+ int32_t *filterPos = c->hChrFilterPos;
+ int16_t *filter = c->hChrFilter;
+ void *mmxextFilterCode = c->chrMmxextFilterCode;
+ int i;
+#if defined(PIC)
+ DECLARE_ALIGNED(8, uint64_t, ebxsave);
+#endif
+#if ARCH_X86_64
+ DECLARE_ALIGNED(8, uint64_t, retsave);
+#endif
+
+ __asm__ volatile(
+#if defined(PIC)
+ "mov %%"REG_b", %7 \n\t"
+#if ARCH_X86_64
+ "mov -8(%%rsp), %%"REG_a" \n\t"
+ "mov %%"REG_a", %8 \n\t"
+#endif
+#else
+#if ARCH_X86_64
+ "mov -8(%%rsp), %%"REG_a" \n\t"
+ "mov %%"REG_a", %7 \n\t"
+#endif
+#endif
+ "pxor %%mm7, %%mm7 \n\t"
+ "mov %0, %%"REG_c" \n\t"
+ "mov %1, %%"REG_D" \n\t"
+ "mov %2, %%"REG_d" \n\t"
+ "mov %3, %%"REG_b" \n\t"
+ "xor %%"REG_a", %%"REG_a" \n\t" // i
+ PREFETCH" (%%"REG_c") \n\t"
+ PREFETCH" 32(%%"REG_c") \n\t"
+ PREFETCH" 64(%%"REG_c") \n\t"
+
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ "xor %%"REG_a", %%"REG_a" \n\t" // i
+ "mov %5, %%"REG_c" \n\t" // src
+ "mov %6, %%"REG_D" \n\t" // buf2
+ PREFETCH" (%%"REG_c") \n\t"
+ PREFETCH" 32(%%"REG_c") \n\t"
+ PREFETCH" 64(%%"REG_c") \n\t"
+
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+
+#if defined(PIC)
+ "mov %7, %%"REG_b" \n\t"
+#if ARCH_X86_64
+ "mov %8, %%"REG_a" \n\t"
+ "mov %%"REG_a", -8(%%rsp) \n\t"
+#endif
+#else
+#if ARCH_X86_64
+ "mov %7, %%"REG_a" \n\t"
+ "mov %%"REG_a", -8(%%rsp) \n\t"
+#endif
+#endif
+ :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
+ "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
+#if defined(PIC)
+ ,"m" (ebxsave)
+#endif
+#if ARCH_X86_64
+ ,"m"(retsave)
+#endif
+ : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
+#if !defined(PIC)
+ ,"%"REG_b
+#endif
+ );
+
+ for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
+ dst1[i] = src1[srcW-1]*128;
+ dst2[i] = src2[srcW-1]*128;
+ }
+}
+#endif /* COMPILE_TEMPLATE_MMXEXT */
+
+static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
+{
+ enum AVPixelFormat dstFormat = c->dstFormat;
+
+ c->use_mmx_vfilter= 0;
+ if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12
+ && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
+ if (c->flags & SWS_ACCURATE_RND) {
+ if (!(c->flags & SWS_FULL_CHR_H_INT)) {
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
+ case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
+ case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
+ case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
+ case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
+ default: break;
+ }
+ }
+ } else {
+ c->use_mmx_vfilter= 1;
+ c->yuv2planeX = RENAME(yuv2yuvX );
+ if (!(c->flags & SWS_FULL_CHR_H_INT)) {
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
+ case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
+ case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
+ case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
+ case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
+ default: break;
+ }
+ }
+ }
+ if (!(c->flags & SWS_FULL_CHR_H_INT)) {
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_RGB32:
+ c->yuv2packed1 = RENAME(yuv2rgb32_1);
+ c->yuv2packed2 = RENAME(yuv2rgb32_2);
+ break;
+ case AV_PIX_FMT_BGR24:
+ c->yuv2packed1 = RENAME(yuv2bgr24_1);
+ c->yuv2packed2 = RENAME(yuv2bgr24_2);
+ break;
+ case AV_PIX_FMT_RGB555:
+ c->yuv2packed1 = RENAME(yuv2rgb555_1);
+ c->yuv2packed2 = RENAME(yuv2rgb555_2);
+ break;
+ case AV_PIX_FMT_RGB565:
+ c->yuv2packed1 = RENAME(yuv2rgb565_1);
+ c->yuv2packed2 = RENAME(yuv2rgb565_2);
+ break;
+ case AV_PIX_FMT_YUYV422:
+ c->yuv2packed1 = RENAME(yuv2yuyv422_1);
+ c->yuv2packed2 = RENAME(yuv2yuyv422_2);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ if (c->srcBpc == 8 && c->dstBpc <= 14) {
+ // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
+#if COMPILE_TEMPLATE_MMXEXT
+ if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
+ c->hyscale_fast = RENAME(hyscale_fast);
+ c->hcscale_fast = RENAME(hcscale_fast);
+ } else {
+#endif /* COMPILE_TEMPLATE_MMXEXT */
+ c->hyscale_fast = NULL;
+ c->hcscale_fast = NULL;
+#if COMPILE_TEMPLATE_MMXEXT
+ }
+#endif /* COMPILE_TEMPLATE_MMXEXT */
+ }
+}
diff --git a/ffmpeg/libswscale/x86/w64xmmtest.c b/ffmpeg/libswscale/x86/w64xmmtest.c
new file mode 100644
index 0000000..dd9a2a4
--- /dev/null
+++ b/ffmpeg/libswscale/x86/w64xmmtest.c
@@ -0,0 +1,31 @@
+/*
+ * check XMM registers for clobbers on Win64
+ * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/w64xmmtest.h"
+#include "libswscale/swscale.h"
+
+wrap(sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
+ const int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *const dst[], const int dstStride[]))
+{
+ testxmmclobbers(sws_scale, c, srcSlice, srcStride, srcSliceY,
+ srcSliceH, dst, dstStride);
+}
diff --git a/ffmpeg/libswscale/x86/yuv2rgb.c b/ffmpeg/libswscale/x86/yuv2rgb.c
new file mode 100644
index 0000000..3938e6b
--- /dev/null
+++ b/ffmpeg/libswscale/x86/yuv2rgb.c
@@ -0,0 +1,113 @@
+/*
+ * software YUV to RGB converter
+ *
+ * Copyright (C) 2009 Konstantin Shishkov
+ *
+ * MMX/MMXEXT template stuff (needed for fast movntq support),
+ * 1,4,8bpp support and context / deglobalize stuff
+ * by Michael Niedermayer (michaelni@gmx.at)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <assert.h>
+
+#include "config.h"
+#include "libswscale/rgb2rgb.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/attributes.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/cpu.h"
+
+#if HAVE_INLINE_ASM
+
+#define DITHER1XBPP // only for MMX
+
+/* hope these constant values are cache line aligned */
+DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw) = 0x00ff00ff00ff00ffULL;
+DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL;
+DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL;
+DECLARE_ASM_CONST(8, uint64_t, pb_e0) = 0xe0e0e0e0e0e0e0e0ULL;
+DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL;
+DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;
+
+//MMX versions
+#if HAVE_MMX_INLINE
+#undef RENAME
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 0
+#define RENAME(a) a ## _MMX
+#include "yuv2rgb_template.c"
+#endif /* HAVE_MMX_INLINE */
+
+// MMXEXT versions
+#if HAVE_MMXEXT_INLINE
+#undef RENAME
+#undef COMPILE_TEMPLATE_MMXEXT
+#define COMPILE_TEMPLATE_MMXEXT 1
+#define RENAME(a) a ## _MMXEXT
+#include "yuv2rgb_template.c"
+#endif /* HAVE_MMXEXT_INLINE */
+
+#endif /* HAVE_INLINE_ASM */
+
+av_cold SwsFunc ff_yuv2rgb_init_mmx(SwsContext *c)
+{
+#if HAVE_INLINE_ASM
+ int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_MMXEXT_INLINE
+ if (cpu_flags & AV_CPU_FLAG_MMXEXT) {
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_RGB24:
+ return yuv420_rgb24_MMXEXT;
+ case AV_PIX_FMT_BGR24:
+ return yuv420_bgr24_MMXEXT;
+ }
+ }
+#endif
+
+ if (cpu_flags & AV_CPU_FLAG_MMX) {
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_RGB32:
+ if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
+#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
+ return yuva420_rgb32_MMX;
+#endif
+ break;
+ } else return yuv420_rgb32_MMX;
+ case AV_PIX_FMT_BGR32:
+ if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
+#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
+ return yuva420_bgr32_MMX;
+#endif
+ break;
+ } else return yuv420_bgr32_MMX;
+ case AV_PIX_FMT_RGB24: return yuv420_rgb24_MMX;
+ case AV_PIX_FMT_BGR24: return yuv420_bgr24_MMX;
+ case AV_PIX_FMT_RGB565: return yuv420_rgb16_MMX;
+ case AV_PIX_FMT_RGB555: return yuv420_rgb15_MMX;
+ }
+ }
+#endif /* HAVE_INLINE_ASM */
+
+ return NULL;
+}
diff --git a/ffmpeg/libswscale/x86/yuv2rgb_template.c b/ffmpeg/libswscale/x86/yuv2rgb_template.c
new file mode 100644
index 0000000..c879102
--- /dev/null
+++ b/ffmpeg/libswscale/x86/yuv2rgb_template.c
@@ -0,0 +1,451 @@
+/*
+ * software YUV to RGB converter
+ *
+ * Copyright (C) 2001-2007 Michael Niedermayer
+ * (c) 2010 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#undef MOVNTQ
+#undef EMMS
+#undef SFENCE
+
+#if COMPILE_TEMPLATE_MMXEXT
+#define MOVNTQ "movntq"
+#define SFENCE "sfence"
+#else
+#define MOVNTQ "movq"
+#define SFENCE " # nop"
+#endif
+
+#define REG_BLUE "0"
+#define REG_RED "1"
+#define REG_GREEN "2"
+#define REG_ALPHA "3"
+
+#define YUV2RGB_LOOP(depth) \
+ h_size = (c->dstW + 7) & ~7; \
+ if (h_size * depth > FFABS(dstStride[0])) \
+ h_size -= 8; \
+ \
+ vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \
+ \
+ __asm__ volatile ("pxor %mm4, %mm4\n\t"); \
+ for (y = 0; y < srcSliceH; y++) { \
+ uint8_t *image = dst[0] + (y + srcSliceY) * dstStride[0]; \
+ const uint8_t *py = src[0] + y * srcStride[0]; \
+ const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \
+ const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \
+ x86_reg index = -h_size / 2; \
+
+#define YUV2RGB_INITIAL_LOAD \
+ __asm__ volatile ( \
+ "movq (%5, %0, 2), %%mm6\n\t" \
+ "movd (%2, %0), %%mm0\n\t" \
+ "movd (%3, %0), %%mm1\n\t" \
+ "1: \n\t" \
+
+/* YUV2RGB core
+ * Conversion is performed in the usual way:
+ * R = Y' * Ycoef + Vred * V'
+ * G = Y' * Ycoef + Vgreen * V' + Ugreen * U'
+ * B = Y' * Ycoef + Ublue * U'
+ *
+ * where X' = X * 8 - Xoffset (multiplication is performed to increase
+ * precision a bit).
+ * Since it operates in the YUV420 colorspace, the Y component is additionally
+ * split into Y1 and Y2 for even and odd pixels.
+ *
+ * Input:
+ * mm0 - U (4 elems), mm1 - V (4 elems), mm6 - Y (8 elems), mm4 - zero register
+ * Output:
+ * mm1 - R, mm2 - G, mm0 - B
+ */
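+/* Illustrative scalar sketch of the same arithmetic (not part of the
+ * template; the coefficient/offset names below stand in for the fixed-point
+ * values that the psubw/pmulhw instructions read from the SwsContext):
+ *
+ *     int up = (U << 3) - u_offset, vp = (V << 3) - v_offset;
+ *     int yp = (((Y << 3) - y_offset) * y_coeff) >> 16;
+ *     int R  = yp + ((vp * vr_coeff) >> 16);
+ *     int G  = yp + ((up * ug_coeff) >> 16) + ((vp * vg_coeff) >> 16);
+ *     int B  = yp + ((up * ub_coeff) >> 16);
+ *     // each channel is finally saturated to 0..255 (packuswb below)
+ */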
+#define YUV2RGB \
+ /* convert Y, U, V into Y1', Y2', U', V' */ \
+ "movq %%mm6, %%mm7\n\t" \
+ "punpcklbw %%mm4, %%mm0\n\t" \
+ "punpcklbw %%mm4, %%mm1\n\t" \
+ "pand "MANGLE(mmx_00ffw)", %%mm6\n\t" \
+ "psrlw $8, %%mm7\n\t" \
+ "psllw $3, %%mm0\n\t" \
+ "psllw $3, %%mm1\n\t" \
+ "psllw $3, %%mm6\n\t" \
+ "psllw $3, %%mm7\n\t" \
+ "psubsw "U_OFFSET"(%4), %%mm0\n\t" \
+ "psubsw "V_OFFSET"(%4), %%mm1\n\t" \
+ "psubw "Y_OFFSET"(%4), %%mm6\n\t" \
+ "psubw "Y_OFFSET"(%4), %%mm7\n\t" \
+\
+ /* multiply by coefficients */ \
+ "movq %%mm0, %%mm2\n\t" \
+ "movq %%mm1, %%mm3\n\t" \
+ "pmulhw "UG_COEFF"(%4), %%mm2\n\t" \
+ "pmulhw "VG_COEFF"(%4), %%mm3\n\t" \
+ "pmulhw "Y_COEFF" (%4), %%mm6\n\t" \
+ "pmulhw "Y_COEFF" (%4), %%mm7\n\t" \
+ "pmulhw "UB_COEFF"(%4), %%mm0\n\t" \
+ "pmulhw "VR_COEFF"(%4), %%mm1\n\t" \
+ "paddsw %%mm3, %%mm2\n\t" \
+ /* now: mm0 = UB, mm1 = VR, mm2 = CG */ \
+ /* mm6 = Y1, mm7 = Y2 */ \
+\
+ /* produce RGB */ \
+ "movq %%mm7, %%mm3\n\t" \
+ "movq %%mm7, %%mm5\n\t" \
+ "paddsw %%mm0, %%mm3\n\t" \
+ "paddsw %%mm1, %%mm5\n\t" \
+ "paddsw %%mm2, %%mm7\n\t" \
+ "paddsw %%mm6, %%mm0\n\t" \
+ "paddsw %%mm6, %%mm1\n\t" \
+ "paddsw %%mm6, %%mm2\n\t" \
+
+#define RGB_PACK_INTERLEAVE \
+ /* pack and interleave even/odd pixels */ \
+ "packuswb %%mm1, %%mm0\n\t" \
+ "packuswb %%mm5, %%mm3\n\t" \
+ "packuswb %%mm2, %%mm2\n\t" \
+ "movq %%mm0, %%mm1\n\n" \
+ "packuswb %%mm7, %%mm7\n\t" \
+ "punpcklbw %%mm3, %%mm0\n\t" \
+ "punpckhbw %%mm3, %%mm1\n\t" \
+ "punpcklbw %%mm7, %%mm2\n\t" \
+
+#define YUV2RGB_ENDLOOP(depth) \
+ "movq 8 (%5, %0, 2), %%mm6\n\t" \
+ "movd 4 (%3, %0), %%mm1\n\t" \
+ "movd 4 (%2, %0), %%mm0\n\t" \
+ "add $"AV_STRINGIFY(depth * 8)", %1\n\t" \
+ "add $4, %0\n\t" \
+ "js 1b\n\t" \
+
+#define YUV2RGB_OPERANDS \
+ : "+r" (index), "+r" (image) \
+ : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
+ "r" (py - 2*index) \
+ : "memory" \
+ ); \
+ } \
+
+#define YUV2RGB_OPERANDS_ALPHA \
+ : "+r" (index), "+r" (image) \
+ : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
+ "r" (py - 2*index), "r" (pa - 2*index) \
+ : "memory" \
+ ); \
+ } \
+
+#define YUV2RGB_ENDFUNC \
+ __asm__ volatile (SFENCE"\n\t" \
+ "emms \n\t"); \
+ return srcSliceH; \
+
+#define IF0(x)
+#define IF1(x) x
+
+#define RGB_PACK16(gmask, is15) \
+ "pand "MANGLE(mmx_redmask)", %%mm0\n\t" \
+ "pand "MANGLE(mmx_redmask)", %%mm1\n\t" \
+ "movq %%mm2, %%mm3\n\t" \
+ "psllw $"AV_STRINGIFY(3-is15)", %%mm2\n\t" \
+ "psrlw $"AV_STRINGIFY(5+is15)", %%mm3\n\t" \
+ "psrlw $3, %%mm0\n\t" \
+ IF##is15("psrlw $1, %%mm1\n\t") \
+ "pand "MANGLE(pb_e0)", %%mm2\n\t" \
+ "pand "MANGLE(gmask)", %%mm3\n\t" \
+ "por %%mm2, %%mm0\n\t" \
+ "por %%mm3, %%mm1\n\t" \
+ "movq %%mm0, %%mm2\n\t" \
+ "punpcklbw %%mm1, %%mm0\n\t" \
+ "punpckhbw %%mm1, %%mm2\n\t" \
+ MOVNTQ " %%mm0, (%1)\n\t" \
+ MOVNTQ " %%mm2, 8(%1)\n\t" \
+
+#define DITHER_RGB \
+ "paddusb "BLUE_DITHER"(%4), %%mm0\n\t" \
+ "paddusb "GREEN_DITHER"(%4), %%mm2\n\t" \
+ "paddusb "RED_DITHER"(%4), %%mm1\n\t" \
+
+#if !COMPILE_TEMPLATE_MMXEXT
+static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
+ int srcStride[],
+ int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ int y, h_size, vshift;
+
+ YUV2RGB_LOOP(2)
+
+#ifdef DITHER1XBPP
+ c->blueDither = ff_dither8[y & 1];
+ c->greenDither = ff_dither8[y & 1];
+ c->redDither = ff_dither8[(y + 1) & 1];
+#endif
+
+ YUV2RGB_INITIAL_LOAD
+ YUV2RGB
+ RGB_PACK_INTERLEAVE
+#ifdef DITHER1XBPP
+ DITHER_RGB
+#endif
+ RGB_PACK16(pb_03, 1)
+
+ YUV2RGB_ENDLOOP(2)
+ YUV2RGB_OPERANDS
+ YUV2RGB_ENDFUNC
+}
+
+static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
+ int srcStride[],
+ int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ int y, h_size, vshift;
+
+ YUV2RGB_LOOP(2)
+
+#ifdef DITHER1XBPP
+ c->blueDither = ff_dither8[y & 1];
+ c->greenDither = ff_dither4[y & 1];
+ c->redDither = ff_dither8[(y + 1) & 1];
+#endif
+
+ YUV2RGB_INITIAL_LOAD
+ YUV2RGB
+ RGB_PACK_INTERLEAVE
+#ifdef DITHER1XBPP
+ DITHER_RGB
+#endif
+ RGB_PACK16(pb_07, 0)
+
+ YUV2RGB_ENDLOOP(2)
+ YUV2RGB_OPERANDS
+ YUV2RGB_ENDFUNC
+}
+#endif /* !COMPILE_TEMPLATE_MMXEXT */
+
+#define RGB_PACK24(blue, red)\
+ "packuswb %%mm3, %%mm0 \n" /* R0 R2 R4 R6 R1 R3 R5 R7 */\
+ "packuswb %%mm5, %%mm1 \n" /* B0 B2 B4 B6 B1 B3 B5 B7 */\
+ "packuswb %%mm7, %%mm2 \n" /* G0 G2 G4 G6 G1 G3 G5 G7 */\
+ "movq %%mm"red", %%mm3 \n"\
+ "movq %%mm"blue", %%mm6 \n"\
+ "psrlq $32, %%mm"red" \n" /* R1 R3 R5 R7 */\
+ "punpcklbw %%mm2, %%mm3 \n" /* R0 G0 R2 G2 R4 G4 R6 G6 */\
+ "punpcklbw %%mm"red", %%mm6 \n" /* B0 R1 B2 R3 B4 R5 B6 R7 */\
+ "movq %%mm3, %%mm5 \n"\
+ "punpckhbw %%mm"blue", %%mm2 \n" /* G1 B1 G3 B3 G5 B5 G7 B7 */\
+ "punpcklwd %%mm6, %%mm3 \n" /* R0 G0 B0 R1 R2 G2 B2 R3 */\
+ "punpckhwd %%mm6, %%mm5 \n" /* R4 G4 B4 R5 R6 G6 B6 R7 */\
+ RGB_PACK24_B
+
+#if COMPILE_TEMPLATE_MMXEXT
+DECLARE_ASM_CONST(8, int16_t, mask1101[4]) = {-1,-1, 0,-1};
+DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0};
+DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0};
+DECLARE_ASM_CONST(8, int16_t, mask1001[4]) = {-1, 0, 0,-1};
+DECLARE_ASM_CONST(8, int16_t, mask0100[4]) = { 0,-1, 0, 0};
+#undef RGB_PACK24_B
+#define RGB_PACK24_B\
+ "pshufw $0xc6, %%mm2, %%mm1 \n"\
+ "pshufw $0x84, %%mm3, %%mm6 \n"\
+ "pshufw $0x38, %%mm5, %%mm7 \n"\
+ "pand "MANGLE(mask1101)", %%mm6 \n" /* R0 G0 B0 R1 -- -- R2 G2 */\
+ "movq %%mm1, %%mm0 \n"\
+ "pand "MANGLE(mask0110)", %%mm7 \n" /* -- -- R6 G6 B6 R7 -- -- */\
+ "movq %%mm1, %%mm2 \n"\
+ "pand "MANGLE(mask0100)", %%mm1 \n" /* -- -- G3 B3 -- -- -- -- */\
+ "psrlq $48, %%mm3 \n" /* B2 R3 -- -- -- -- -- -- */\
+ "pand "MANGLE(mask0010)", %%mm0 \n" /* -- -- -- -- G1 B1 -- -- */\
+ "psllq $32, %%mm5 \n" /* -- -- -- -- R4 G4 B4 R5 */\
+ "pand "MANGLE(mask1001)", %%mm2 \n" /* G5 B5 -- -- -- -- G7 B7 */\
+ "por %%mm3, %%mm1 \n"\
+ "por %%mm6, %%mm0 \n"\
+ "por %%mm5, %%mm1 \n"\
+ "por %%mm7, %%mm2 \n"\
+ MOVNTQ" %%mm0, (%1) \n"\
+ MOVNTQ" %%mm1, 8(%1) \n"\
+ MOVNTQ" %%mm2, 16(%1) \n"\
+
+#else
+#undef RGB_PACK24_B
+#define RGB_PACK24_B\
+ "movd %%mm3, (%1) \n" /* R0 G0 B0 R1 */\
+ "movd %%mm2, 4(%1) \n" /* G1 B1 */\
+ "psrlq $32, %%mm3 \n"\
+ "psrlq $16, %%mm2 \n"\
+ "movd %%mm3, 6(%1) \n" /* R2 G2 B2 R3 */\
+ "movd %%mm2, 10(%1) \n" /* G3 B3 */\
+ "psrlq $16, %%mm2 \n"\
+ "movd %%mm5, 12(%1) \n" /* R4 G4 B4 R5 */\
+ "movd %%mm2, 16(%1) \n" /* G5 B5 */\
+ "psrlq $32, %%mm5 \n"\
+ "movd %%mm2, 20(%1) \n" /* -- -- G7 B7 */\
+ "movd %%mm5, 18(%1) \n" /* R6 G6 B6 R7 */\
+
+#endif
+
+static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
+ int srcStride[],
+ int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ int y, h_size, vshift;
+
+ YUV2RGB_LOOP(3)
+
+ YUV2RGB_INITIAL_LOAD
+ YUV2RGB
+ RGB_PACK24(REG_BLUE, REG_RED)
+
+ YUV2RGB_ENDLOOP(3)
+ YUV2RGB_OPERANDS
+ YUV2RGB_ENDFUNC
+}
+
+static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
+ int srcStride[],
+ int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ int y, h_size, vshift;
+
+ YUV2RGB_LOOP(3)
+
+ YUV2RGB_INITIAL_LOAD
+ YUV2RGB
+ RGB_PACK24(REG_RED, REG_BLUE)
+
+ YUV2RGB_ENDLOOP(3)
+ YUV2RGB_OPERANDS
+ YUV2RGB_ENDFUNC
+}
+
+
+#define SET_EMPTY_ALPHA \
+ "pcmpeqd %%mm"REG_ALPHA", %%mm"REG_ALPHA"\n\t" /* set alpha to 0xFF */ \
+
+#define LOAD_ALPHA \
+ "movq (%6, %0, 2), %%mm"REG_ALPHA"\n\t" \
+
+#define RGB_PACK32(red, green, blue, alpha) \
+ "movq %%mm"blue", %%mm5\n\t" \
+ "movq %%mm"red", %%mm6\n\t" \
+ "punpckhbw %%mm"green", %%mm5\n\t" \
+ "punpcklbw %%mm"green", %%mm"blue"\n\t" \
+ "punpckhbw %%mm"alpha", %%mm6\n\t" \
+ "punpcklbw %%mm"alpha", %%mm"red"\n\t" \
+ "movq %%mm"blue", %%mm"green"\n\t" \
+ "movq %%mm5, %%mm"alpha"\n\t" \
+ "punpcklwd %%mm"red", %%mm"blue"\n\t" \
+ "punpckhwd %%mm"red", %%mm"green"\n\t" \
+ "punpcklwd %%mm6, %%mm5\n\t" \
+ "punpckhwd %%mm6, %%mm"alpha"\n\t" \
+ MOVNTQ " %%mm"blue", 0(%1)\n\t" \
+ MOVNTQ " %%mm"green", 8(%1)\n\t" \
+ MOVNTQ " %%mm5, 16(%1)\n\t" \
+ MOVNTQ " %%mm"alpha", 24(%1)\n\t" \
+
+#if !COMPILE_TEMPLATE_MMXEXT
+static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
+ int srcStride[],
+ int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ int y, h_size, vshift;
+
+ YUV2RGB_LOOP(4)
+
+ YUV2RGB_INITIAL_LOAD
+ YUV2RGB
+ RGB_PACK_INTERLEAVE
+ SET_EMPTY_ALPHA
+ RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
+
+ YUV2RGB_ENDLOOP(4)
+ YUV2RGB_OPERANDS
+ YUV2RGB_ENDFUNC
+}
+
+#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
+static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
+ int srcStride[],
+ int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ int y, h_size, vshift;
+
+ YUV2RGB_LOOP(4)
+
+ const uint8_t *pa = src[3] + y * srcStride[3];
+ YUV2RGB_INITIAL_LOAD
+ YUV2RGB
+ RGB_PACK_INTERLEAVE
+ LOAD_ALPHA
+ RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
+
+ YUV2RGB_ENDLOOP(4)
+ YUV2RGB_OPERANDS_ALPHA
+ YUV2RGB_ENDFUNC
+}
+#endif
+
+static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
+ int srcStride[],
+ int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ int y, h_size, vshift;
+
+ YUV2RGB_LOOP(4)
+
+ YUV2RGB_INITIAL_LOAD
+ YUV2RGB
+ RGB_PACK_INTERLEAVE
+ SET_EMPTY_ALPHA
+ RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
+
+ YUV2RGB_ENDLOOP(4)
+ YUV2RGB_OPERANDS
+ YUV2RGB_ENDFUNC
+}
+
+#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
+static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
+ int srcStride[],
+ int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ int y, h_size, vshift;
+
+ YUV2RGB_LOOP(4)
+
+ const uint8_t *pa = src[3] + y * srcStride[3];
+ YUV2RGB_INITIAL_LOAD
+ YUV2RGB
+ RGB_PACK_INTERLEAVE
+ LOAD_ALPHA
+ RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
+
+ YUV2RGB_ENDLOOP(4)
+ YUV2RGB_OPERANDS_ALPHA
+ YUV2RGB_ENDFUNC
+}
+#endif
+
+#endif /* !COMPILE_TEMPLATE_MMXEXT */
diff --git a/ffmpeg/libswscale/yuv2rgb.c b/ffmpeg/libswscale/yuv2rgb.c
new file mode 100644
index 0000000..d12abda
--- /dev/null
+++ b/ffmpeg/libswscale/yuv2rgb.c
@@ -0,0 +1,927 @@
+/*
+ * software YUV to RGB converter
+ *
+ * Copyright (C) 2009 Konstantin Shishkov
+ *
+ * 1,4,8bpp support and context / deglobalize stuff
+ * by Michael Niedermayer (michaelni@gmx.at)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "libavutil/cpu.h"
+#include "libavutil/bswap.h"
+#include "config.h"
+#include "rgb2rgb.h"
+#include "swscale.h"
+#include "swscale_internal.h"
+#include "libavutil/pixdesc.h"
+
+extern const uint8_t dither_2x2_4[3][8];
+extern const uint8_t dither_2x2_8[3][8];
+extern const uint8_t dither_4x4_16[5][8];
+extern const uint8_t dither_8x8_32[9][8];
+extern const uint8_t dither_8x8_73[9][8];
+extern const uint8_t dither_8x8_220[9][8];
+
+const int32_t ff_yuv2rgb_coeffs[8][4] = {
+ { 117504, 138453, 13954, 34903 }, /* no sequence_display_extension */
+ { 117504, 138453, 13954, 34903 }, /* ITU-R Rec. 709 (1990) */
+ { 104597, 132201, 25675, 53279 }, /* unspecified */
+ { 104597, 132201, 25675, 53279 }, /* reserved */
+ { 104448, 132798, 24759, 53109 }, /* FCC */
+ { 104597, 132201, 25675, 53279 }, /* ITU-R Rec. 624-4 System B, G */
+ { 104597, 132201, 25675, 53279 }, /* SMPTE 170M */
+ { 117579, 136230, 16907, 35559 } /* SMPTE 240M (1987) */
+};
+
+const int *sws_getCoefficients(int colorspace)
+{
+ if (colorspace > 7 || colorspace < 0)
+ colorspace = SWS_CS_DEFAULT;
+ return ff_yuv2rgb_coeffs[colorspace];
+}
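+/* Illustrative use from client code (ctx, srcRange and dstRange are assumed
+ * to be the caller's): switch an existing context to Rec.709 coefficients
+ * with neutral brightness/contrast/saturation:
+ *
+ *     sws_setColorspaceDetails(ctx, sws_getCoefficients(SWS_CS_ITU709),
+ *                              srcRange,
+ *                              sws_getCoefficients(SWS_CS_DEFAULT),
+ *                              dstRange, 0, 1 << 16, 1 << 16);
+ */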
+
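+/*
+ * The C converters below are table driven: LOADCHROMA fetches, per chroma
+ * pair, pointers into per-component luma tables built by
+ * ff_yuv2rgb_c_init_tables() (r and b directly, g as table_gU[U] plus the
+ * integer offset table_gV[V]), so every output pixel then costs three loads
+ * indexed by Y plus the adds/stores done by the PUT* macros.
+ */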
+#define LOADCHROMA(i) \
+ U = pu[i]; \
+ V = pv[i]; \
+ r = (void *)c->table_rV[V+YUVRGB_TABLE_HEADROOM]; \
+ g = (void *)(c->table_gU[U+YUVRGB_TABLE_HEADROOM] + c->table_gV[V+YUVRGB_TABLE_HEADROOM]); \
+ b = (void *)c->table_bU[U+YUVRGB_TABLE_HEADROOM];
+
+#define PUTRGB(dst, src, i) \
+ Y = src[2 * i]; \
+ dst[2 * i] = r[Y] + g[Y] + b[Y]; \
+ Y = src[2 * i + 1]; \
+ dst[2 * i + 1] = r[Y] + g[Y] + b[Y];
+
+#define PUTRGB24(dst, src, i) \
+ Y = src[2 * i]; \
+ dst[6 * i + 0] = r[Y]; \
+ dst[6 * i + 1] = g[Y]; \
+ dst[6 * i + 2] = b[Y]; \
+ Y = src[2 * i + 1]; \
+ dst[6 * i + 3] = r[Y]; \
+ dst[6 * i + 4] = g[Y]; \
+ dst[6 * i + 5] = b[Y];
+
+#define PUTBGR24(dst, src, i) \
+ Y = src[2 * i]; \
+ dst[6 * i + 0] = b[Y]; \
+ dst[6 * i + 1] = g[Y]; \
+ dst[6 * i + 2] = r[Y]; \
+ Y = src[2 * i + 1]; \
+ dst[6 * i + 3] = b[Y]; \
+ dst[6 * i + 4] = g[Y]; \
+ dst[6 * i + 5] = r[Y];
+
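+/* Like PUTRGB, but also adds the alpha sample shifted to bit position s
+ * (24 when alpha occupies the top byte of the 32-bit pixel, 0 when it
+ * occupies the low byte); the tables leave the alpha byte zero whenever an
+ * alpha plane is present, so a plain add is enough. */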
+#define PUTRGBA(dst, ysrc, asrc, i, s) \
+ Y = ysrc[2 * i]; \
+ dst[2 * i] = r[Y] + g[Y] + b[Y] + (asrc[2 * i] << s); \
+ Y = ysrc[2 * i + 1]; \
+ dst[2 * i + 1] = r[Y] + g[Y] + b[Y] + (asrc[2 * i + 1] << s);
+
+#define PUTRGB48(dst, src, i) \
+ Y = src[ 2 * i]; \
+ dst[12 * i + 0] = dst[12 * i + 1] = r[Y]; \
+ dst[12 * i + 2] = dst[12 * i + 3] = g[Y]; \
+ dst[12 * i + 4] = dst[12 * i + 5] = b[Y]; \
+ Y = src[ 2 * i + 1]; \
+ dst[12 * i + 6] = dst[12 * i + 7] = r[Y]; \
+ dst[12 * i + 8] = dst[12 * i + 9] = g[Y]; \
+ dst[12 * i + 10] = dst[12 * i + 11] = b[Y];
+
+#define PUTBGR48(dst, src, i) \
+ Y = src[2 * i]; \
+ dst[12 * i + 0] = dst[12 * i + 1] = b[Y]; \
+ dst[12 * i + 2] = dst[12 * i + 3] = g[Y]; \
+ dst[12 * i + 4] = dst[12 * i + 5] = r[Y]; \
+ Y = src[2 * i + 1]; \
+ dst[12 * i + 6] = dst[12 * i + 7] = b[Y]; \
+ dst[12 * i + 8] = dst[12 * i + 9] = g[Y]; \
+ dst[12 * i + 10] = dst[12 * i + 11] = r[Y];
+
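+/*
+ * Boilerplate shared by all converters below: YUV2RGBFUNC opens a function
+ * that processes two output rows per iteration (sharing one chroma row),
+ * eight pixels at a time; ENDYUV2RGBLINE closes the preceding pixel block,
+ * advances the pointers and opens handling of a (4 >> ss)-pixel leftover for
+ * widths that are not a multiple of 8; ENDYUV2RGBFUNC / CLOSEYUV2RGBFUNC
+ * close the row loop and return the slice height.
+ */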
+#define YUV2RGBFUNC(func_name, dst_type, alpha) \
+ static int func_name(SwsContext *c, const uint8_t *src[], \
+ int srcStride[], int srcSliceY, int srcSliceH, \
+ uint8_t *dst[], int dstStride[]) \
+ { \
+ int y; \
+ \
+ if (!alpha && c->srcFormat == AV_PIX_FMT_YUV422P) { \
+ srcStride[1] *= 2; \
+ srcStride[2] *= 2; \
+ } \
+ for (y = 0; y < srcSliceH; y += 2) { \
+ dst_type *dst_1 = \
+ (dst_type *)(dst[0] + (y + srcSliceY) * dstStride[0]); \
+ dst_type *dst_2 = \
+ (dst_type *)(dst[0] + (y + srcSliceY + 1) * dstStride[0]); \
+ dst_type av_unused *r, *g, *b; \
+ const uint8_t *py_1 = src[0] + y * srcStride[0]; \
+ const uint8_t *py_2 = py_1 + srcStride[0]; \
+ const uint8_t *pu = src[1] + (y >> 1) * srcStride[1]; \
+ const uint8_t *pv = src[2] + (y >> 1) * srcStride[2]; \
+ const uint8_t av_unused *pa_1, *pa_2; \
+ unsigned int h_size = c->dstW >> 3; \
+ if (alpha) { \
+ pa_1 = src[3] + y * srcStride[3]; \
+ pa_2 = pa_1 + srcStride[3]; \
+ } \
+ while (h_size--) { \
+ int av_unused U, V, Y; \
+
+#define ENDYUV2RGBLINE(dst_delta, ss) \
+ pu += 4 >> ss; \
+ pv += 4 >> ss; \
+ py_1 += 8 >> ss; \
+ py_2 += 8 >> ss; \
+ dst_1 += dst_delta >> ss; \
+ dst_2 += dst_delta >> ss; \
+ } \
+ if (c->dstW & (4 >> ss)) { \
+ int av_unused Y, U, V; \
+
+#define ENDYUV2RGBFUNC() \
+ } \
+ } \
+ return srcSliceH; \
+ }
+
+#define CLOSEYUV2RGBFUNC(dst_delta) \
+ ENDYUV2RGBLINE(dst_delta, 0) \
+ ENDYUV2RGBFUNC()
+
+YUV2RGBFUNC(yuv2rgb_c_48, uint8_t, 0)
+ LOADCHROMA(0);
+ PUTRGB48(dst_1, py_1, 0);
+ PUTRGB48(dst_2, py_2, 0);
+
+ LOADCHROMA(1);
+ PUTRGB48(dst_2, py_2, 1);
+ PUTRGB48(dst_1, py_1, 1);
+
+ LOADCHROMA(2);
+ PUTRGB48(dst_1, py_1, 2);
+ PUTRGB48(dst_2, py_2, 2);
+
+ LOADCHROMA(3);
+ PUTRGB48(dst_2, py_2, 3);
+ PUTRGB48(dst_1, py_1, 3);
+ENDYUV2RGBLINE(48, 0)
+ LOADCHROMA(0);
+ PUTRGB48(dst_1, py_1, 0);
+ PUTRGB48(dst_2, py_2, 0);
+
+ LOADCHROMA(1);
+ PUTRGB48(dst_2, py_2, 1);
+ PUTRGB48(dst_1, py_1, 1);
+ENDYUV2RGBLINE(48, 1)
+ LOADCHROMA(0);
+ PUTRGB48(dst_1, py_1, 0);
+ PUTRGB48(dst_2, py_2, 0);
+ENDYUV2RGBFUNC()
+
+YUV2RGBFUNC(yuv2rgb_c_bgr48, uint8_t, 0)
+ LOADCHROMA(0);
+ PUTBGR48(dst_1, py_1, 0);
+ PUTBGR48(dst_2, py_2, 0);
+
+ LOADCHROMA(1);
+ PUTBGR48(dst_2, py_2, 1);
+ PUTBGR48(dst_1, py_1, 1);
+
+ LOADCHROMA(2);
+ PUTBGR48(dst_1, py_1, 2);
+ PUTBGR48(dst_2, py_2, 2);
+
+ LOADCHROMA(3);
+ PUTBGR48(dst_2, py_2, 3);
+ PUTBGR48(dst_1, py_1, 3);
+ENDYUV2RGBLINE(48, 0)
+ LOADCHROMA(0);
+ PUTBGR48(dst_1, py_1, 0);
+ PUTBGR48(dst_2, py_2, 0);
+
+ LOADCHROMA(1);
+ PUTBGR48(dst_2, py_2, 1);
+ PUTBGR48(dst_1, py_1, 1);
+ENDYUV2RGBLINE(48, 1)
+ LOADCHROMA(0);
+ PUTBGR48(dst_1, py_1, 0);
+ PUTBGR48(dst_2, py_2, 0);
+ENDYUV2RGBFUNC()
+
+YUV2RGBFUNC(yuv2rgb_c_32, uint32_t, 0)
+ LOADCHROMA(0);
+ PUTRGB(dst_1, py_1, 0);
+ PUTRGB(dst_2, py_2, 0);
+
+ LOADCHROMA(1);
+ PUTRGB(dst_2, py_2, 1);
+ PUTRGB(dst_1, py_1, 1);
+
+ LOADCHROMA(2);
+ PUTRGB(dst_1, py_1, 2);
+ PUTRGB(dst_2, py_2, 2);
+
+ LOADCHROMA(3);
+ PUTRGB(dst_2, py_2, 3);
+ PUTRGB(dst_1, py_1, 3);
+ENDYUV2RGBLINE(8, 0)
+ LOADCHROMA(0);
+ PUTRGB(dst_1, py_1, 0);
+ PUTRGB(dst_2, py_2, 0);
+
+ LOADCHROMA(1);
+ PUTRGB(dst_2, py_2, 1);
+ PUTRGB(dst_1, py_1, 1);
+ENDYUV2RGBLINE(8, 1)
+ LOADCHROMA(0);
+ PUTRGB(dst_1, py_1, 0);
+ PUTRGB(dst_2, py_2, 0);
+ENDYUV2RGBFUNC()
+
+YUV2RGBFUNC(yuva2rgba_c, uint32_t, 1)
+ LOADCHROMA(0);
+ PUTRGBA(dst_1, py_1, pa_1, 0, 24);
+ PUTRGBA(dst_2, py_2, pa_2, 0, 24);
+
+ LOADCHROMA(1);
+    PUTRGBA(dst_2, py_2, pa_2, 1, 24);
+    PUTRGBA(dst_1, py_1, pa_1, 1, 24);
+
+ LOADCHROMA(2);
+ PUTRGBA(dst_1, py_1, pa_1, 2, 24);
+ PUTRGBA(dst_2, py_2, pa_2, 2, 24);
+
+ LOADCHROMA(3);
+    PUTRGBA(dst_2, py_2, pa_2, 3, 24);
+    PUTRGBA(dst_1, py_1, pa_1, 3, 24);
+    pa_1 += 8;
+    pa_2 += 8;
+ENDYUV2RGBLINE(8, 0)
+ LOADCHROMA(0);
+ PUTRGBA(dst_1, py_1, pa_1, 0, 24);
+ PUTRGBA(dst_2, py_2, pa_2, 0, 24);
+
+ LOADCHROMA(1);
+    PUTRGBA(dst_2, py_2, pa_2, 1, 24);
+    PUTRGBA(dst_1, py_1, pa_1, 1, 24);
+    pa_1 += 4;
+    pa_2 += 4;
+ENDYUV2RGBLINE(8, 1)
+ LOADCHROMA(0);
+ PUTRGBA(dst_1, py_1, pa_1, 0, 24);
+ PUTRGBA(dst_2, py_2, pa_2, 0, 24);
+ENDYUV2RGBFUNC()
+
+YUV2RGBFUNC(yuva2argb_c, uint32_t, 1)
+ LOADCHROMA(0);
+ PUTRGBA(dst_1, py_1, pa_1, 0, 0);
+ PUTRGBA(dst_2, py_2, pa_2, 0, 0);
+
+ LOADCHROMA(1);
+ PUTRGBA(dst_2, py_2, pa_2, 1, 0);
+ PUTRGBA(dst_1, py_1, pa_1, 1, 0);
+
+ LOADCHROMA(2);
+ PUTRGBA(dst_1, py_1, pa_1, 2, 0);
+ PUTRGBA(dst_2, py_2, pa_2, 2, 0);
+
+ LOADCHROMA(3);
+ PUTRGBA(dst_2, py_2, pa_2, 3, 0);
+ PUTRGBA(dst_1, py_1, pa_1, 3, 0);
+    pa_1 += 8;
+    pa_2 += 8;
+ENDYUV2RGBLINE(8, 0)
+ LOADCHROMA(0);
+ PUTRGBA(dst_1, py_1, pa_1, 0, 0);
+ PUTRGBA(dst_2, py_2, pa_2, 0, 0);
+
+ LOADCHROMA(1);
+ PUTRGBA(dst_2, py_2, pa_2, 1, 0);
+ PUTRGBA(dst_1, py_1, pa_1, 1, 0);
+    pa_1 += 4;
+    pa_2 += 4;
+ENDYUV2RGBLINE(8, 1)
+ LOADCHROMA(0);
+ PUTRGBA(dst_1, py_1, pa_1, 0, 0);
+ PUTRGBA(dst_2, py_2, pa_2, 0, 0);
+ENDYUV2RGBFUNC()
+
+YUV2RGBFUNC(yuv2rgb_c_24_rgb, uint8_t, 0)
+ LOADCHROMA(0);
+ PUTRGB24(dst_1, py_1, 0);
+ PUTRGB24(dst_2, py_2, 0);
+
+ LOADCHROMA(1);
+ PUTRGB24(dst_2, py_2, 1);
+ PUTRGB24(dst_1, py_1, 1);
+
+ LOADCHROMA(2);
+ PUTRGB24(dst_1, py_1, 2);
+ PUTRGB24(dst_2, py_2, 2);
+
+ LOADCHROMA(3);
+ PUTRGB24(dst_2, py_2, 3);
+ PUTRGB24(dst_1, py_1, 3);
+ENDYUV2RGBLINE(24, 0)
+ LOADCHROMA(0);
+ PUTRGB24(dst_1, py_1, 0);
+ PUTRGB24(dst_2, py_2, 0);
+
+ LOADCHROMA(1);
+ PUTRGB24(dst_2, py_2, 1);
+ PUTRGB24(dst_1, py_1, 1);
+ENDYUV2RGBLINE(24, 1)
+ LOADCHROMA(0);
+ PUTRGB24(dst_1, py_1, 0);
+ PUTRGB24(dst_2, py_2, 0);
+ENDYUV2RGBFUNC()
+
+// only trivial mods from yuv2rgb_c_24_rgb
+YUV2RGBFUNC(yuv2rgb_c_24_bgr, uint8_t, 0)
+ LOADCHROMA(0);
+ PUTBGR24(dst_1, py_1, 0);
+ PUTBGR24(dst_2, py_2, 0);
+
+ LOADCHROMA(1);
+ PUTBGR24(dst_2, py_2, 1);
+ PUTBGR24(dst_1, py_1, 1);
+
+ LOADCHROMA(2);
+ PUTBGR24(dst_1, py_1, 2);
+ PUTBGR24(dst_2, py_2, 2);
+
+ LOADCHROMA(3);
+ PUTBGR24(dst_2, py_2, 3);
+ PUTBGR24(dst_1, py_1, 3);
+ENDYUV2RGBLINE(24, 0)
+ LOADCHROMA(0);
+ PUTBGR24(dst_1, py_1, 0);
+ PUTBGR24(dst_2, py_2, 0);
+
+ LOADCHROMA(1);
+ PUTBGR24(dst_2, py_2, 1);
+ PUTBGR24(dst_1, py_1, 1);
+ENDYUV2RGBLINE(24, 1)
+ LOADCHROMA(0);
+ PUTBGR24(dst_1, py_1, 0);
+ PUTBGR24(dst_2, py_2, 0);
+ENDYUV2RGBFUNC()
+
+YUV2RGBFUNC(yuv2rgb_c_16_ordered_dither, uint16_t, 0)
+ const uint8_t *d16 = dither_2x2_8[y & 1];
+ const uint8_t *e16 = dither_2x2_4[y & 1];
+ const uint8_t *f16 = dither_2x2_8[(y & 1)^1];
+
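+/* Ordered dither: o selects the horizontal position within the dither rows
+ * fetched above, and the "+ 8" used for the second output row steps into the
+ * next row of the dither tables (which are laid out so that this stays in
+ * bounds); the offsets are simply added to Y before the table lookups. */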
+#define PUTRGB16(dst, src, i, o) \
+ Y = src[2 * i]; \
+ dst[2 * i] = r[Y + d16[0 + o]] + \
+ g[Y + e16[0 + o]] + \
+ b[Y + f16[0 + o]]; \
+ Y = src[2 * i + 1]; \
+ dst[2 * i + 1] = r[Y + d16[1 + o]] + \
+ g[Y + e16[1 + o]] + \
+ b[Y + f16[1 + o]];
+ LOADCHROMA(0);
+ PUTRGB16(dst_1, py_1, 0, 0);
+ PUTRGB16(dst_2, py_2, 0, 0 + 8);
+
+ LOADCHROMA(1);
+ PUTRGB16(dst_2, py_2, 1, 2 + 8);
+ PUTRGB16(dst_1, py_1, 1, 2);
+
+ LOADCHROMA(2);
+ PUTRGB16(dst_1, py_1, 2, 4);
+ PUTRGB16(dst_2, py_2, 2, 4 + 8);
+
+ LOADCHROMA(3);
+ PUTRGB16(dst_2, py_2, 3, 6 + 8);
+ PUTRGB16(dst_1, py_1, 3, 6);
+CLOSEYUV2RGBFUNC(8)
+
+YUV2RGBFUNC(yuv2rgb_c_15_ordered_dither, uint16_t, 0)
+ const uint8_t *d16 = dither_2x2_8[y & 1];
+ const uint8_t *e16 = dither_2x2_8[(y & 1)^1];
+
+#define PUTRGB15(dst, src, i, o) \
+ Y = src[2 * i]; \
+ dst[2 * i] = r[Y + d16[0 + o]] + \
+ g[Y + d16[1 + o]] + \
+ b[Y + e16[0 + o]]; \
+ Y = src[2 * i + 1]; \
+ dst[2 * i + 1] = r[Y + d16[1 + o]] + \
+ g[Y + d16[0 + o]] + \
+ b[Y + e16[1 + o]];
+ LOADCHROMA(0);
+ PUTRGB15(dst_1, py_1, 0, 0);
+ PUTRGB15(dst_2, py_2, 0, 0 + 8);
+
+ LOADCHROMA(1);
+ PUTRGB15(dst_2, py_2, 1, 2 + 8);
+ PUTRGB15(dst_1, py_1, 1, 2);
+
+ LOADCHROMA(2);
+ PUTRGB15(dst_1, py_1, 2, 4);
+ PUTRGB15(dst_2, py_2, 2, 4 + 8);
+
+ LOADCHROMA(3);
+ PUTRGB15(dst_2, py_2, 3, 6 + 8);
+ PUTRGB15(dst_1, py_1, 3, 6);
+CLOSEYUV2RGBFUNC(8)
+
+// r, g, b, dst_1, dst_2
+YUV2RGBFUNC(yuv2rgb_c_12_ordered_dither, uint16_t, 0)
+ const uint8_t *d16 = dither_4x4_16[y & 3];
+
+#define PUTRGB12(dst, src, i, o) \
+ Y = src[2 * i]; \
+ dst[2 * i] = r[Y + d16[0 + o]] + \
+ g[Y + d16[0 + o]] + \
+ b[Y + d16[0 + o]]; \
+ Y = src[2 * i + 1]; \
+ dst[2 * i + 1] = r[Y + d16[1 + o]] + \
+ g[Y + d16[1 + o]] + \
+ b[Y + d16[1 + o]];
+
+ LOADCHROMA(0);
+ PUTRGB12(dst_1, py_1, 0, 0);
+ PUTRGB12(dst_2, py_2, 0, 0 + 8);
+
+ LOADCHROMA(1);
+ PUTRGB12(dst_2, py_2, 1, 2 + 8);
+ PUTRGB12(dst_1, py_1, 1, 2);
+
+ LOADCHROMA(2);
+ PUTRGB12(dst_1, py_1, 2, 4);
+ PUTRGB12(dst_2, py_2, 2, 4 + 8);
+
+ LOADCHROMA(3);
+ PUTRGB12(dst_2, py_2, 3, 6 + 8);
+ PUTRGB12(dst_1, py_1, 3, 6);
+CLOSEYUV2RGBFUNC(8)
+
+// r, g, b, dst_1, dst_2
+YUV2RGBFUNC(yuv2rgb_c_8_ordered_dither, uint8_t, 0)
+ const uint8_t *d32 = dither_8x8_32[y & 7];
+ const uint8_t *d64 = dither_8x8_73[y & 7];
+
+#define PUTRGB8(dst, src, i, o) \
+ Y = src[2 * i]; \
+ dst[2 * i] = r[Y + d32[0 + o]] + \
+ g[Y + d32[0 + o]] + \
+ b[Y + d64[0 + o]]; \
+ Y = src[2 * i + 1]; \
+ dst[2 * i + 1] = r[Y + d32[1 + o]] + \
+ g[Y + d32[1 + o]] + \
+ b[Y + d64[1 + o]];
+
+ LOADCHROMA(0);
+ PUTRGB8(dst_1, py_1, 0, 0);
+ PUTRGB8(dst_2, py_2, 0, 0 + 8);
+
+ LOADCHROMA(1);
+ PUTRGB8(dst_2, py_2, 1, 2 + 8);
+ PUTRGB8(dst_1, py_1, 1, 2);
+
+ LOADCHROMA(2);
+ PUTRGB8(dst_1, py_1, 2, 4);
+ PUTRGB8(dst_2, py_2, 2, 4 + 8);
+
+ LOADCHROMA(3);
+ PUTRGB8(dst_2, py_2, 3, 6 + 8);
+ PUTRGB8(dst_1, py_1, 3, 6);
+CLOSEYUV2RGBFUNC(8)
+
+YUV2RGBFUNC(yuv2rgb_c_4_ordered_dither, uint8_t, 0)
+ const uint8_t * d64 = dither_8x8_73[y & 7];
+ const uint8_t *d128 = dither_8x8_220[y & 7];
+ int acc;
+
+#define PUTRGB4D(dst, src, i, o) \
+ Y = src[2 * i]; \
+ acc = r[Y + d128[0 + o]] + \
+ g[Y + d64[0 + o]] + \
+ b[Y + d128[0 + o]]; \
+ Y = src[2 * i + 1]; \
+ acc |= (r[Y + d128[1 + o]] + \
+ g[Y + d64[1 + o]] + \
+ b[Y + d128[1 + o]]) << 4; \
+ dst[i] = acc;
+
+ LOADCHROMA(0);
+ PUTRGB4D(dst_1, py_1, 0, 0);
+ PUTRGB4D(dst_2, py_2, 0, 0 + 8);
+
+ LOADCHROMA(1);
+ PUTRGB4D(dst_2, py_2, 1, 2 + 8);
+ PUTRGB4D(dst_1, py_1, 1, 2);
+
+ LOADCHROMA(2);
+ PUTRGB4D(dst_1, py_1, 2, 4);
+ PUTRGB4D(dst_2, py_2, 2, 4 + 8);
+
+ LOADCHROMA(3);
+ PUTRGB4D(dst_2, py_2, 3, 6 + 8);
+ PUTRGB4D(dst_1, py_1, 3, 6);
+CLOSEYUV2RGBFUNC(4)
+
+YUV2RGBFUNC(yuv2rgb_c_4b_ordered_dither, uint8_t, 0)
+ const uint8_t *d64 = dither_8x8_73[y & 7];
+ const uint8_t *d128 = dither_8x8_220[y & 7];
+
+#define PUTRGB4DB(dst, src, i, o) \
+ Y = src[2 * i]; \
+ dst[2 * i] = r[Y + d128[0 + o]] + \
+ g[Y + d64[0 + o]] + \
+ b[Y + d128[0 + o]]; \
+ Y = src[2 * i + 1]; \
+ dst[2 * i + 1] = r[Y + d128[1 + o]] + \
+ g[Y + d64[1 + o]] + \
+ b[Y + d128[1 + o]];
+
+ LOADCHROMA(0);
+ PUTRGB4DB(dst_1, py_1, 0, 0);
+ PUTRGB4DB(dst_2, py_2, 0, 0 + 8);
+
+ LOADCHROMA(1);
+ PUTRGB4DB(dst_2, py_2, 1, 2 + 8);
+ PUTRGB4DB(dst_1, py_1, 1, 2);
+
+ LOADCHROMA(2);
+ PUTRGB4DB(dst_1, py_1, 2, 4);
+ PUTRGB4DB(dst_2, py_2, 2, 4 + 8);
+
+ LOADCHROMA(3);
+ PUTRGB4DB(dst_2, py_2, 3, 6 + 8);
+ PUTRGB4DB(dst_1, py_1, 3, 6);
+CLOSEYUV2RGBFUNC(8)
+
+YUV2RGBFUNC(yuv2rgb_c_1_ordered_dither, uint8_t, 0)
+ const uint8_t *d128 = dither_8x8_220[y & 7];
+ char out_1 = 0, out_2 = 0;
+ g = c->table_gU[128 + YUVRGB_TABLE_HEADROOM] + c->table_gV[128 + YUVRGB_TABLE_HEADROOM];
+
+#define PUTRGB1(out, src, i, o) \
+ Y = src[2 * i]; \
+ out += out + g[Y + d128[0 + o]]; \
+ Y = src[2 * i + 1]; \
+ out += out + g[Y + d128[1 + o]];
+
+ PUTRGB1(out_1, py_1, 0, 0);
+ PUTRGB1(out_2, py_2, 0, 0 + 8);
+
+ PUTRGB1(out_2, py_2, 1, 2 + 8);
+ PUTRGB1(out_1, py_1, 1, 2);
+
+ PUTRGB1(out_1, py_1, 2, 4);
+ PUTRGB1(out_2, py_2, 2, 4 + 8);
+
+ PUTRGB1(out_2, py_2, 3, 6 + 8);
+ PUTRGB1(out_1, py_1, 3, 6);
+
+ dst_1[0] = out_1;
+ dst_2[0] = out_2;
+CLOSEYUV2RGBFUNC(1)
+
+SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
+{
+ SwsFunc t = NULL;
+
+ if (HAVE_MMX)
+ t = ff_yuv2rgb_init_mmx(c);
+ else if (HAVE_VIS)
+ t = ff_yuv2rgb_init_vis(c);
+ else if (HAVE_ALTIVEC)
+ t = ff_yuv2rgb_init_altivec(c);
+ else if (ARCH_BFIN)
+ t = ff_yuv2rgb_get_func_ptr_bfin(c);
+
+ if (t)
+ return t;
+
+ av_log(c, AV_LOG_WARNING,
+ "No accelerated colorspace conversion found from %s to %s.\n",
+ av_get_pix_fmt_name(c->srcFormat), av_get_pix_fmt_name(c->dstFormat));
+
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_BGR48BE:
+ case AV_PIX_FMT_BGR48LE:
+ return yuv2rgb_c_bgr48;
+ case AV_PIX_FMT_RGB48BE:
+ case AV_PIX_FMT_RGB48LE:
+ return yuv2rgb_c_48;
+ case AV_PIX_FMT_ARGB:
+ case AV_PIX_FMT_ABGR:
+ if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat))
+ return yuva2argb_c;
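+        /* fall through: without an alpha plane, ARGB/ABGR use the plain
+         * 32 bit converter, whose tables already fill in an opaque alpha */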
+ case AV_PIX_FMT_RGBA:
+ case AV_PIX_FMT_BGRA:
+ return (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) ? yuva2rgba_c : yuv2rgb_c_32;
+ case AV_PIX_FMT_RGB24:
+ return yuv2rgb_c_24_rgb;
+ case AV_PIX_FMT_BGR24:
+ return yuv2rgb_c_24_bgr;
+ case AV_PIX_FMT_RGB565:
+ case AV_PIX_FMT_BGR565:
+ return yuv2rgb_c_16_ordered_dither;
+ case AV_PIX_FMT_RGB555:
+ case AV_PIX_FMT_BGR555:
+ return yuv2rgb_c_15_ordered_dither;
+ case AV_PIX_FMT_RGB444:
+ case AV_PIX_FMT_BGR444:
+ return yuv2rgb_c_12_ordered_dither;
+ case AV_PIX_FMT_RGB8:
+ case AV_PIX_FMT_BGR8:
+ return yuv2rgb_c_8_ordered_dither;
+ case AV_PIX_FMT_RGB4:
+ case AV_PIX_FMT_BGR4:
+ return yuv2rgb_c_4_ordered_dither;
+ case AV_PIX_FMT_RGB4_BYTE:
+ case AV_PIX_FMT_BGR4_BYTE:
+ return yuv2rgb_c_4b_ordered_dither;
+ case AV_PIX_FMT_MONOBLACK:
+ return yuv2rgb_c_1_ordered_dither;
+ }
+ return NULL;
+}
+
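+/*
+ * Build the chroma lookup tables: for every possible U or V value (plus
+ * YUVRGB_TABLE_HEADROOM on each side, clipped to 0..255) fill_table() stores
+ * a pointer into the per-component luma table, displaced by that chroma's
+ * contribution relative to the mid value 128, while fill_gv_table() stores
+ * the equivalent integer offset that LOADCHROMA adds on top of table_gU.
+ */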
+static void fill_table(uint8_t* table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize,
+ const int64_t inc, void *y_tab)
+{
+ int i;
+ uint8_t *y_table = y_tab;
+
+ y_table -= elemsize * (inc >> 9);
+
+ for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
+ int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, 255)*inc;
+ table[i] = y_table + elemsize * (cb >> 16);
+ }
+}
+
+static void fill_gv_table(int table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize, const int64_t inc)
+{
+ int i;
+ int off = -(inc >> 9);
+
+ for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
+ int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, 255)*inc;
+ table[i] = elemsize * (off + (cb >> 16));
+ }
+}
+
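+/* Round a 16.16 fixed-point value to the nearest integer, saturated to the
+ * signed 16-bit range (returned in a uint16_t so it can be replicated into
+ * the packed 64-bit constants below); e.g. roundToInt16(0x18000) == 2. */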
+static uint16_t roundToInt16(int64_t f)
+{
+ int r = (f + (1 << 15)) >> 16;
+
+ if (r < -0x7FFF)
+ return 0x8000;
+ else if (r > 0x7FFF)
+ return 0x7FFF;
+ else
+ return r;
+}
+
+av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
+ int fullRange, int brightness,
+ int contrast, int saturation)
+{
+ const int isRgb = c->dstFormat == AV_PIX_FMT_RGB32 ||
+ c->dstFormat == AV_PIX_FMT_RGB32_1 ||
+ c->dstFormat == AV_PIX_FMT_BGR24 ||
+ c->dstFormat == AV_PIX_FMT_RGB565BE ||
+ c->dstFormat == AV_PIX_FMT_RGB565LE ||
+ c->dstFormat == AV_PIX_FMT_RGB555BE ||
+ c->dstFormat == AV_PIX_FMT_RGB555LE ||
+ c->dstFormat == AV_PIX_FMT_RGB444BE ||
+ c->dstFormat == AV_PIX_FMT_RGB444LE ||
+ c->dstFormat == AV_PIX_FMT_RGB8 ||
+ c->dstFormat == AV_PIX_FMT_RGB4 ||
+ c->dstFormat == AV_PIX_FMT_RGB4_BYTE ||
+ c->dstFormat == AV_PIX_FMT_MONOBLACK;
+ const int isNotNe = c->dstFormat == AV_PIX_FMT_NE(RGB565LE, RGB565BE) ||
+ c->dstFormat == AV_PIX_FMT_NE(RGB555LE, RGB555BE) ||
+ c->dstFormat == AV_PIX_FMT_NE(RGB444LE, RGB444BE) ||
+ c->dstFormat == AV_PIX_FMT_NE(BGR565LE, BGR565BE) ||
+ c->dstFormat == AV_PIX_FMT_NE(BGR555LE, BGR555BE) ||
+ c->dstFormat == AV_PIX_FMT_NE(BGR444LE, BGR444BE);
+ const int bpp = c->dstFormatBpp;
+ uint8_t *y_table;
+ uint16_t *y_table16;
+ uint32_t *y_table32;
+ int i, base, rbase, gbase, bbase, av_uninit(abase), needAlpha;
+ const int yoffs = fullRange ? 384 : 326;
+
+ int64_t crv = inv_table[0];
+ int64_t cbu = inv_table[1];
+ int64_t cgu = -inv_table[2];
+ int64_t cgv = -inv_table[3];
+ int64_t cy = 1 << 16;
+ int64_t oy = 0;
+ int64_t yb = 0;
+
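+    /* The inv_table coefficients assume limited-range input: for limited
+     * range only the luma scale and offset need adjusting, for full-range
+     * input the chroma coefficients are rescaled by 224/255 instead. */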
+ if (!fullRange) {
+ cy = (cy * 255) / 219;
+ oy = 16 << 16;
+ } else {
+ crv = (crv * 224) / 255;
+ cbu = (cbu * 224) / 255;
+ cgu = (cgu * 224) / 255;
+ cgv = (cgv * 224) / 255;
+ }
+
+ cy = (cy * contrast) >> 16;
+ crv = (crv * contrast * saturation) >> 32;
+ cbu = (cbu * contrast * saturation) >> 32;
+ cgu = (cgu * contrast * saturation) >> 32;
+ cgv = (cgv * contrast * saturation) >> 32;
+ oy -= 256 * brightness;
+
+ c->uOffset = 0x0400040004000400LL;
+ c->vOffset = 0x0400040004000400LL;
+ c->yCoeff = roundToInt16(cy * 8192) * 0x0001000100010001ULL;
+ c->vrCoeff = roundToInt16(crv * 8192) * 0x0001000100010001ULL;
+ c->ubCoeff = roundToInt16(cbu * 8192) * 0x0001000100010001ULL;
+ c->vgCoeff = roundToInt16(cgv * 8192) * 0x0001000100010001ULL;
+ c->ugCoeff = roundToInt16(cgu * 8192) * 0x0001000100010001ULL;
+ c->yOffset = roundToInt16(oy * 8) * 0x0001000100010001ULL;
+
+ c->yuv2rgb_y_coeff = (int16_t)roundToInt16(cy << 13);
+ c->yuv2rgb_y_offset = (int16_t)roundToInt16(oy << 9);
+ c->yuv2rgb_v2r_coeff = (int16_t)roundToInt16(crv << 13);
+ c->yuv2rgb_v2g_coeff = (int16_t)roundToInt16(cgv << 13);
+ c->yuv2rgb_u2g_coeff = (int16_t)roundToInt16(cgu << 13);
+ c->yuv2rgb_u2b_coeff = (int16_t)roundToInt16(cbu << 13);
+
+ //scale coefficients by cy
+ crv = ((crv << 16) + 0x8000) / cy;
+ cbu = ((cbu << 16) + 0x8000) / cy;
+ cgu = ((cgu << 16) + 0x8000) / cy;
+ cgv = ((cgv << 16) + 0x8000) / cy;
+
+ av_free(c->yuvTable);
+
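+    /* The per-component luma tables built below have 1024 entries, leaving
+     * generous headroom around the 0..255 luma range so that the chroma
+     * displacements from fill_table()/fill_gv_table() and the dither biases
+     * can never index out of bounds; yoffs selects where the luma origin
+     * sits within each table. */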
+ switch (bpp) {
+ case 1:
+ c->yuvTable = av_malloc(1024);
+ y_table = c->yuvTable;
+ yb = -(384 << 16) - oy;
+ for (i = 0; i < 1024 - 110; i++) {
+ y_table[i + 110] = av_clip_uint8((yb + 0x8000) >> 16) >> 7;
+ yb += cy;
+ }
+ fill_table(c->table_gU, 1, cgu, y_table + yoffs);
+ fill_gv_table(c->table_gV, 1, cgv);
+ break;
+ case 4:
+ case 4 | 128:
+ rbase = isRgb ? 3 : 0;
+ gbase = 1;
+ bbase = isRgb ? 0 : 3;
+ c->yuvTable = av_malloc(1024 * 3);
+ y_table = c->yuvTable;
+ yb = -(384 << 16) - oy;
+ for (i = 0; i < 1024 - 110; i++) {
+ int yval = av_clip_uint8((yb + 0x8000) >> 16);
+ y_table[i + 110] = (yval >> 7) << rbase;
+ y_table[i + 37 + 1024] = ((yval + 43) / 85) << gbase;
+ y_table[i + 110 + 2048] = (yval >> 7) << bbase;
+ yb += cy;
+ }
+ fill_table(c->table_rV, 1, crv, y_table + yoffs);
+ fill_table(c->table_gU, 1, cgu, y_table + yoffs + 1024);
+ fill_table(c->table_bU, 1, cbu, y_table + yoffs + 2048);
+ fill_gv_table(c->table_gV, 1, cgv);
+ break;
+ case 8:
+ rbase = isRgb ? 5 : 0;
+ gbase = isRgb ? 2 : 3;
+ bbase = isRgb ? 0 : 6;
+ c->yuvTable = av_malloc(1024 * 3);
+ y_table = c->yuvTable;
+ yb = -(384 << 16) - oy;
+ for (i = 0; i < 1024 - 38; i++) {
+ int yval = av_clip_uint8((yb + 0x8000) >> 16);
+ y_table[i + 16] = ((yval + 18) / 36) << rbase;
+ y_table[i + 16 + 1024] = ((yval + 18) / 36) << gbase;
+ y_table[i + 37 + 2048] = ((yval + 43) / 85) << bbase;
+ yb += cy;
+ }
+ fill_table(c->table_rV, 1, crv, y_table + yoffs);
+ fill_table(c->table_gU, 1, cgu, y_table + yoffs + 1024);
+ fill_table(c->table_bU, 1, cbu, y_table + yoffs + 2048);
+ fill_gv_table(c->table_gV, 1, cgv);
+ break;
+ case 12:
+ rbase = isRgb ? 8 : 0;
+ gbase = 4;
+ bbase = isRgb ? 0 : 8;
+ c->yuvTable = av_malloc(1024 * 3 * 2);
+ y_table16 = c->yuvTable;
+ yb = -(384 << 16) - oy;
+ for (i = 0; i < 1024; i++) {
+ uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16);
+ y_table16[i] = (yval >> 4) << rbase;
+ y_table16[i + 1024] = (yval >> 4) << gbase;
+ y_table16[i + 2048] = (yval >> 4) << bbase;
+ yb += cy;
+ }
+ if (isNotNe)
+ for (i = 0; i < 1024 * 3; i++)
+ y_table16[i] = av_bswap16(y_table16[i]);
+ fill_table(c->table_rV, 2, crv, y_table16 + yoffs);
+ fill_table(c->table_gU, 2, cgu, y_table16 + yoffs + 1024);
+ fill_table(c->table_bU, 2, cbu, y_table16 + yoffs + 2048);
+ fill_gv_table(c->table_gV, 2, cgv);
+ break;
+ case 15:
+ case 16:
+ rbase = isRgb ? bpp - 5 : 0;
+ gbase = 5;
+ bbase = isRgb ? 0 : (bpp - 5);
+ c->yuvTable = av_malloc(1024 * 3 * 2);
+ y_table16 = c->yuvTable;
+ yb = -(384 << 16) - oy;
+ for (i = 0; i < 1024; i++) {
+ uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16);
+ y_table16[i] = (yval >> 3) << rbase;
+ y_table16[i + 1024] = (yval >> (18 - bpp)) << gbase;
+ y_table16[i + 2048] = (yval >> 3) << bbase;
+ yb += cy;
+ }
+ if (isNotNe)
+ for (i = 0; i < 1024 * 3; i++)
+ y_table16[i] = av_bswap16(y_table16[i]);
+ fill_table(c->table_rV, 2, crv, y_table16 + yoffs);
+ fill_table(c->table_gU, 2, cgu, y_table16 + yoffs + 1024);
+ fill_table(c->table_bU, 2, cbu, y_table16 + yoffs + 2048);
+ fill_gv_table(c->table_gV, 2, cgv);
+ break;
+ case 24:
+ case 48:
+ c->yuvTable = av_malloc(1024);
+ y_table = c->yuvTable;
+ yb = -(384 << 16) - oy;
+ for (i = 0; i < 1024; i++) {
+ y_table[i] = av_clip_uint8((yb + 0x8000) >> 16);
+ yb += cy;
+ }
+ fill_table(c->table_rV, 1, crv, y_table + yoffs);
+ fill_table(c->table_gU, 1, cgu, y_table + yoffs);
+ fill_table(c->table_bU, 1, cbu, y_table + yoffs);
+ fill_gv_table(c->table_gV, 1, cgv);
+ break;
+ case 32:
+ base = (c->dstFormat == AV_PIX_FMT_RGB32_1 ||
+ c->dstFormat == AV_PIX_FMT_BGR32_1) ? 8 : 0;
+ rbase = base + (isRgb ? 16 : 0);
+ gbase = base + 8;
+ bbase = base + (isRgb ? 0 : 16);
+ needAlpha = CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat);
+ if (!needAlpha)
+ abase = (base + 24) & 31;
+ c->yuvTable = av_malloc(1024 * 3 * 4);
+ y_table32 = c->yuvTable;
+ yb = -(384 << 16) - oy;
+ for (i = 0; i < 1024; i++) {
+ unsigned yval = av_clip_uint8((yb + 0x8000) >> 16);
+ y_table32[i] = (yval << rbase) +
+ (needAlpha ? 0 : (255u << abase));
+ y_table32[i + 1024] = yval << gbase;
+ y_table32[i + 2048] = yval << bbase;
+ yb += cy;
+ }
+ fill_table(c->table_rV, 4, crv, y_table32 + yoffs);
+ fill_table(c->table_gU, 4, cgu, y_table32 + yoffs + 1024);
+ fill_table(c->table_bU, 4, cbu, y_table32 + yoffs + 2048);
+ fill_gv_table(c->table_gV, 4, cgv);
+ break;
+ default:
+ c->yuvTable = NULL;
+        if (!isPlanar(c->dstFormat) || bpp <= 24)
+ av_log(c, AV_LOG_ERROR, "%ibpp not supported by yuv2rgb\n", bpp);
+ return -1;
+ }
+ return 0;
+}