Diffstat (limited to 'ffmpeg/libavcodec/ppc')
22 files changed, 0 insertions, 5091 deletions
diff --git a/ffmpeg/libavcodec/ppc/Makefile b/ffmpeg/libavcodec/ppc/Makefile deleted file mode 100644 index 71b23da..0000000 --- a/ffmpeg/libavcodec/ppc/Makefile +++ /dev/null @@ -1,24 +0,0 @@ -OBJS += ppc/dsputil_ppc.o \ - ppc/fmtconvert_altivec.o \ - ppc/videodsp_ppc.o \ - -OBJS-$(CONFIG_FFT) += ppc/fft_altivec.o -OBJS-$(CONFIG_H264CHROMA) += ppc/h264chroma_init.o -OBJS-$(CONFIG_H264DSP) += ppc/h264dsp.o -OBJS-$(CONFIG_H264QPEL) += ppc/h264qpel.o -OBJS-$(CONFIG_HPELDSP) += ppc/hpeldsp_altivec.o -OBJS-$(CONFIG_MPEGAUDIODSP) += ppc/mpegaudiodsp_altivec.o -OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o -OBJS-$(CONFIG_VC1_DECODER) += ppc/vc1dsp_altivec.o -OBJS-$(CONFIG_VORBIS_DECODER) += ppc/vorbisdsp_altivec.o -OBJS-$(CONFIG_VP3DSP) += ppc/vp3dsp_altivec.o -OBJS-$(CONFIG_VP8_DECODER) += ppc/vp8dsp_altivec.o - -ALTIVEC-OBJS += ppc/dsputil_altivec.o \ - ppc/fdct_altivec.o \ - ppc/gmc_altivec.o \ - ppc/idct_altivec.o \ - ppc/int_altivec.o \ - -FFT-OBJS-$(HAVE_GNU_AS) += ppc/fft_altivec_s.o -ALTIVEC-OBJS-$(CONFIG_FFT) += $(FFT-OBJS-yes) diff --git a/ffmpeg/libavcodec/ppc/asm.S b/ffmpeg/libavcodec/ppc/asm.S deleted file mode 100644 index bbbf8a4..0000000 --- a/ffmpeg/libavcodec/ppc/asm.S +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (c) 2009 Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" - -#define GLUE(a, b) a ## b -#define JOIN(a, b) GLUE(a, b) -#define X(s) JOIN(EXTERN_ASM, s) - -#if ARCH_PPC64 - -#define PTR .quad -#define lp ld -#define lpx ldx -#define stp std -#define stpu stdu -#define PS 8 -#define L(s) JOIN(., s) - -.macro extfunc name - .global X(\name) - .section .opd, "aw" -X(\name): - .quad L(\name), .TOC.@tocbase, 0 - .previous - .type X(\name), STT_FUNC -L(\name): -.endm - -.macro movrel rd, sym, gp - ld \rd, \sym@got(r2) -.endm - -.macro get_got rd -.endm - -#else /* ARCH_PPC64 */ - -#define PTR .int -#define lp lwz -#define lpx lwzx -#define stp stw -#define stpu stwu -#define PS 4 -#define L(s) s - -.macro extfunc name - .global X(\name) - .type X(\name), STT_FUNC -X(\name): -\name: -.endm - -.macro movrel rd, sym, gp -#if CONFIG_PIC - lwz \rd, \sym@got(\gp) -#else - lis \rd, \sym@ha - la \rd, \sym@l(\rd) -#endif -.endm - -.macro get_got rd -#if CONFIG_PIC - bcl 20, 31, .Lgot\@ -.Lgot\@: - mflr \rd - addis \rd, \rd, _GLOBAL_OFFSET_TABLE_ - .Lgot\@@ha - addi \rd, \rd, _GLOBAL_OFFSET_TABLE_ - .Lgot\@@l -#endif -.endm - -#endif /* ARCH_PPC64 */ - -#if HAVE_IBM_ASM - -.macro DEFINE_REG n - .equiv r\n, \n - .equiv f\n, \n - .equiv v\n, \n -.endm - -DEFINE_REG 0 -DEFINE_REG 1 -DEFINE_REG 2 -DEFINE_REG 3 -DEFINE_REG 4 -DEFINE_REG 5 -DEFINE_REG 6 -DEFINE_REG 7 -DEFINE_REG 8 -DEFINE_REG 9 -DEFINE_REG 10 -DEFINE_REG 11 -DEFINE_REG 12 -DEFINE_REG 13 -DEFINE_REG 14 -DEFINE_REG 15 -DEFINE_REG 16 -DEFINE_REG 17 -DEFINE_REG 18 -DEFINE_REG 19 -DEFINE_REG 20 -DEFINE_REG 21 -DEFINE_REG 22 -DEFINE_REG 23 -DEFINE_REG 24 -DEFINE_REG 25 -DEFINE_REG 26 -DEFINE_REG 27 -DEFINE_REG 28 -DEFINE_REG 29 -DEFINE_REG 30 -DEFINE_REG 31 - -#endif /* HAVE_IBM_ASM */ diff --git a/ffmpeg/libavcodec/ppc/dsputil_altivec.c b/ffmpeg/libavcodec/ppc/dsputil_altivec.c deleted file mode 100644 index f36e394..0000000 --- a/ffmpeg/libavcodec/ppc/dsputil_altivec.c +++ /dev/null @@ -1,963 +0,0 @@ -/* - * Copyright (c) 2002 Brian Foley - * Copyright (c) 2002 Dieter Shirley - * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#if HAVE_ALTIVEC_H -#include <altivec.h> -#endif -#include "libavutil/attributes.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/dsputil.h" -#include "dsputil_altivec.h" - -static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s; - const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); - vector unsigned char perm1 = vec_lvsl(0, pix2); - vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); - vector unsigned char pix2l, pix2r; - vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; - vector unsigned int sad; - vector signed int sumdiffs; - - s = 0; - sad = (vector unsigned int)vec_splat_u32(0); - for (i = 0; i < h; i++) { - /* Read unaligned pixels into our vectors. The vectors are as follows: - pix1v: pix1[0]-pix1[15] - pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] */ - pix1v = vec_ld( 0, pix1); - pix2l = vec_ld( 0, pix2); - pix2r = vec_ld(16, pix2); - pix2v = vec_perm(pix2l, pix2r, perm1); - pix2iv = vec_perm(pix2l, pix2r, perm2); - - /* Calculate the average vector */ - avgv = vec_avg(pix2v, pix2iv); - - /* Calculate a sum of abs differences vector */ - t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2 += line_size; - } - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s; - const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); - vector unsigned char perm = vec_lvsl(0, pix2); - vector unsigned char pix2l, pix2r; - vector unsigned char pix1v, pix2v, pix3v, avgv, t5; - vector unsigned int sad; - vector signed int sumdiffs; - uint8_t *pix3 = pix2 + line_size; - - s = 0; - sad = (vector unsigned int)vec_splat_u32(0); - - /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one - iteration becomes pix2 in the next iteration. We can use this - fact to avoid a potentially expensive unaligned read, each - time around the loop. - Read unaligned pixels into our vectors. The vectors are as follows: - pix2v: pix2[0]-pix2[15] - Split the pixel vectors into shorts */ - pix2l = vec_ld( 0, pix2); - pix2r = vec_ld(15, pix2); - pix2v = vec_perm(pix2l, pix2r, perm); - - for (i = 0; i < h; i++) { - /* Read unaligned pixels into our vectors. 
The vectors are as follows: - pix1v: pix1[0]-pix1[15] - pix3v: pix3[0]-pix3[15] */ - pix1v = vec_ld(0, pix1); - - pix2l = vec_ld( 0, pix3); - pix2r = vec_ld(15, pix3); - pix3v = vec_perm(pix2l, pix2r, perm); - - /* Calculate the average vector */ - avgv = vec_avg(pix2v, pix3v); - - /* Calculate a sum of abs differences vector */ - t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2v = pix3v; - pix3 += line_size; - - } - - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - return s; -} - -static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s; - uint8_t *pix3 = pix2 + line_size; - const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); - const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2); - vector unsigned char avgv, t5; - vector unsigned char perm1 = vec_lvsl(0, pix2); - vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); - vector unsigned char pix2l, pix2r; - vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; - vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; - vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; - vector unsigned short avghv, avglv; - vector unsigned short t1, t2, t3, t4; - vector unsigned int sad; - vector signed int sumdiffs; - - sad = (vector unsigned int)vec_splat_u32(0); - - s = 0; - - /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one - iteration becomes pix2 in the next iteration. We can use this - fact to avoid a potentially expensive unaligned read, as well - as some splitting, and vector addition each time around the loop. - Read unaligned pixels into our vectors. The vectors are as follows: - pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] - Split the pixel vectors into shorts */ - pix2l = vec_ld( 0, pix2); - pix2r = vec_ld(16, pix2); - pix2v = vec_perm(pix2l, pix2r, perm1); - pix2iv = vec_perm(pix2l, pix2r, perm2); - - pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v); - pix2lv = (vector unsigned short) vec_mergel(zero, pix2v); - pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv); - pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv); - t1 = vec_add(pix2hv, pix2ihv); - t2 = vec_add(pix2lv, pix2ilv); - - for (i = 0; i < h; i++) { - /* Read unaligned pixels into our vectors. The vectors are as follows: - pix1v: pix1[0]-pix1[15] - pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16] */ - pix1v = vec_ld(0, pix1); - - pix2l = vec_ld( 0, pix3); - pix2r = vec_ld(16, pix3); - pix3v = vec_perm(pix2l, pix2r, perm1); - pix3iv = vec_perm(pix2l, pix2r, perm2); - - /* Note that AltiVec does have vec_avg, but this works on vector pairs - and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding - would mean that, for example, avg(3,0,0,1) = 2, when it should be 1. - Instead, we have to split the pixel vectors into vectors of shorts, - and do the averaging by hand. 
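/* Editorial aside, not part of the deleted file: a minimal standalone C check of the
   rounding point made in the comment above. vec_avg() rounds up on every pairwise
   average, so nesting it biases the result, while a single (a+b+c+d+2)>>2 rounds only
   once; this reproduces the avg(3,0,0,1) example. Names here are illustrative only. */
#include <stdio.h>

static unsigned avg2_round_up(unsigned a, unsigned b) { return (a + b + 1) >> 1; }

int main(void)
{
    unsigned a = 3, b = 0, c = 0, d = 1;
    unsigned nested = avg2_round_up(avg2_round_up(a, b), avg2_round_up(c, d)); /* 2 */
    unsigned direct = (a + b + c + d + 2) >> 2;                                /* 1 */
    printf("nested=%u direct=%u\n", nested, direct);
    return 0;
}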
*/ - - /* Split the pixel vectors into shorts */ - pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); - pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); - pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); - pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); - - /* Do the averaging on them */ - t3 = vec_add(pix3hv, pix3ihv); - t4 = vec_add(pix3lv, pix3ilv); - - avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); - avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); - - /* Pack the shorts back into a result */ - avgv = vec_pack(avghv, avglv); - - /* Calculate a sum of abs differences vector */ - t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix3 += line_size; - /* Transfer the calculated values for pix3 into pix2 */ - t1 = t3; - t2 = t4; - } - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s; - const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); - vector unsigned char perm = vec_lvsl(0, pix2); - vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sad; - vector signed int sumdiffs; - - sad = (vector unsigned int)vec_splat_u32(0); - - - for (i = 0; i < h; i++) { - /* Read potentially unaligned pixels into t1 and t2 */ - vector unsigned char pix2l = vec_ld( 0, pix2); - vector unsigned char pix2r = vec_ld(15, pix2); - t1 = vec_ld(0, pix1); - t2 = vec_perm(pix2l, pix2r, perm); - - /* Calculate a sum of abs differences vector */ - t3 = vec_max(t1, t2); - t4 = vec_min(t1, t2); - t5 = vec_sub(t3, t4); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s; - const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); - const vector unsigned char permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0}; - vector unsigned char perm1 = vec_lvsl(0, pix1); - vector unsigned char perm2 = vec_lvsl(0, pix2); - vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sad; - vector signed int sumdiffs; - - sad = (vector unsigned int)vec_splat_u32(0); - - for (i = 0; i < h; i++) { - /* Read potentially unaligned pixels into t1 and t2 - Since we're reading 16 pixels, and actually only want 8, - mask out the last 8 pixels. The 0s don't change the sum. 
*/ - vector unsigned char pix1l = vec_ld( 0, pix1); - vector unsigned char pix1r = vec_ld(15, pix1); - vector unsigned char pix2l = vec_ld( 0, pix2); - vector unsigned char pix2r = vec_ld(15, pix2); - t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear); - t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear); - - /* Calculate a sum of abs differences vector */ - t3 = vec_max(t1, t2); - t4 = vec_min(t1, t2); - t5 = vec_sub(t3, t4); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -static int pix_norm1_altivec(uint8_t *pix, int line_size) -{ - int i; - int s; - const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); - vector unsigned char perm = vec_lvsl(0, pix); - vector unsigned char pixv; - vector unsigned int sv; - vector signed int sum; - - sv = (vector unsigned int)vec_splat_u32(0); - - s = 0; - for (i = 0; i < 16; i++) { - /* Read in the potentially unaligned pixels */ - vector unsigned char pixl = vec_ld( 0, pix); - vector unsigned char pixr = vec_ld(15, pix); - pixv = vec_perm(pixl, pixr, perm); - - /* Square the values, and add them to our sum */ - sv = vec_msum(pixv, pixv, sv); - - pix += line_size; - } - /* Sum up the four partial sums, and put the result into s */ - sum = vec_sums((vector signed int) sv, (vector signed int) zero); - sum = vec_splat(sum, 3); - vec_ste(sum, 0, &s); - - return s; -} - -/** - * Sum of Squared Errors for a 8x8 block. - * AltiVec-enhanced. - * It's the sad8_altivec code above w/ squaring added. - */ -static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s; - const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); - const vector unsigned char permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0}; - vector unsigned char perm1 = vec_lvsl(0, pix1); - vector unsigned char perm2 = vec_lvsl(0, pix2); - vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sum; - vector signed int sumsqr; - - sum = (vector unsigned int)vec_splat_u32(0); - - for (i = 0; i < h; i++) { - /* Read potentially unaligned pixels into t1 and t2 - Since we're reading 16 pixels, and actually only want 8, - mask out the last 8 pixels. The 0s don't change the sum. */ - vector unsigned char pix1l = vec_ld( 0, pix1); - vector unsigned char pix1r = vec_ld(15, pix1); - vector unsigned char pix2l = vec_ld( 0, pix2); - vector unsigned char pix2r = vec_ld(15, pix2); - t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear); - t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear); - - /* Since we want to use unsigned chars, we can take advantage - of the fact that abs(a-b)^2 = (a-b)^2. */ - - /* Calculate abs differences vector */ - t3 = vec_max(t1, t2); - t4 = vec_min(t1, t2); - t5 = vec_sub(t3, t4); - - /* Square the values and add them to our sum */ - sum = vec_msum(t5, t5, sum); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); - sumsqr = vec_splat(sumsqr, 3); - vec_ste(sumsqr, 0, &s); - - return s; -} - -/** - * Sum of Squared Errors for a 16x16 block. - * AltiVec-enhanced. 
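/* Editorial aside, not part of the deleted file: a plain-C reference of the 8x8 SSE
   computed above, illustrating the trick noted in the comment: since |a-b| and a-b
   differ only in sign, |a-b|^2 == (a-b)^2, so squaring the unsigned absolute
   difference yields the exact squared error. sse8_c_ref is a hypothetical helper,
   not FFmpeg API. */
#include <stdint.h>

static int sse8_c_ref(const uint8_t *pix1, const uint8_t *pix2,
                      int line_size, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++) {
            unsigned d = pix1[x] > pix2[x] ? pix1[x] - pix2[x]
                                           : pix2[x] - pix1[x];
            sum += (int)(d * d);   /* same as (pix1[x] - pix2[x])^2 */
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}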
- * It's the sad16_altivec code above w/ squaring added. - */ -static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s; - const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); - vector unsigned char perm = vec_lvsl(0, pix2); - vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sum; - vector signed int sumsqr; - - sum = (vector unsigned int)vec_splat_u32(0); - - for (i = 0; i < h; i++) { - /* Read potentially unaligned pixels into t1 and t2 */ - vector unsigned char pix2l = vec_ld( 0, pix2); - vector unsigned char pix2r = vec_ld(15, pix2); - t1 = vec_ld(0, pix1); - t2 = vec_perm(pix2l, pix2r, perm); - - /* Since we want to use unsigned chars, we can take advantage - of the fact that abs(a-b)^2 = (a-b)^2. */ - - /* Calculate abs differences vector */ - t3 = vec_max(t1, t2); - t4 = vec_min(t1, t2); - t5 = vec_sub(t3, t4); - - /* Square the values and add them to our sum */ - sum = vec_msum(t5, t5, sum); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); - sumsqr = vec_splat(sumsqr, 3); - vec_ste(sumsqr, 0, &s); - - return s; -} - -static int pix_sum_altivec(uint8_t * pix, int line_size) -{ - const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); - vector unsigned char perm = vec_lvsl(0, pix); - vector unsigned char t1; - vector unsigned int sad; - vector signed int sumdiffs; - - int i; - int s; - - sad = (vector unsigned int)vec_splat_u32(0); - - for (i = 0; i < 16; i++) { - /* Read the potentially unaligned 16 pixels into t1 */ - vector unsigned char pixl = vec_ld( 0, pix); - vector unsigned char pixr = vec_ld(15, pix); - t1 = vec_perm(pixl, pixr, perm); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t1, sad); - - pix += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels, int line_size) -{ - int i; - vector unsigned char perm = vec_lvsl(0, pixels); - vector unsigned char bytes; - const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); - vector signed short shorts; - - for (i = 0; i < 8; i++) { - // Read potentially unaligned pixels. - // We're reading 16 pixels, and actually only want 8, - // but we simply ignore the extras. 
- vector unsigned char pixl = vec_ld( 0, pixels); - vector unsigned char pixr = vec_ld(15, pixels); - bytes = vec_perm(pixl, pixr, perm); - - // convert the bytes into shorts - shorts = (vector signed short)vec_mergeh(zero, bytes); - - // save the data to the block, we assume the block is 16-byte aligned - vec_st(shorts, i*16, (vector signed short*)block); - - pixels += line_size; - } -} - -static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1, - const uint8_t *s2, int stride) -{ - int i; - vector unsigned char perm1 = vec_lvsl(0, s1); - vector unsigned char perm2 = vec_lvsl(0, s2); - vector unsigned char bytes, pixl, pixr; - const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); - vector signed short shorts1, shorts2; - - for (i = 0; i < 4; i++) { - // Read potentially unaligned pixels - // We're reading 16 pixels, and actually only want 8, - // but we simply ignore the extras. - pixl = vec_ld( 0, s1); - pixr = vec_ld(15, s1); - bytes = vec_perm(pixl, pixr, perm1); - - // convert the bytes into shorts - shorts1 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the same for the second block of pixels - pixl = vec_ld( 0, s2); - pixr = vec_ld(15, s2); - bytes = vec_perm(pixl, pixr, perm2); - - // convert the bytes into shorts - shorts2 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the subtraction - shorts1 = vec_sub(shorts1, shorts2); - - // save the data to the block, we assume the block is 16-byte aligned - vec_st(shorts1, 0, (vector signed short*)block); - - s1 += stride; - s2 += stride; - block += 8; - - - // The code below is a copy of the code above... This is a manual - // unroll. - - // Read potentially unaligned pixels - // We're reading 16 pixels, and actually only want 8, - // but we simply ignore the extras. 
- pixl = vec_ld( 0, s1); - pixr = vec_ld(15, s1); - bytes = vec_perm(pixl, pixr, perm1); - - // convert the bytes into shorts - shorts1 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the same for the second block of pixels - pixl = vec_ld( 0, s2); - pixr = vec_ld(15, s2); - bytes = vec_perm(pixl, pixr, perm2); - - // convert the bytes into shorts - shorts2 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the subtraction - shorts1 = vec_sub(shorts1, shorts2); - - // save the data to the block, we assume the block is 16-byte aligned - vec_st(shorts1, 0, (vector signed short*)block); - - s1 += stride; - s2 += stride; - block += 8; - } -} - - -static void clear_block_altivec(int16_t *block) { - LOAD_ZERO; - vec_st(zero_s16v, 0, block); - vec_st(zero_s16v, 16, block); - vec_st(zero_s16v, 32, block); - vec_st(zero_s16v, 48, block); - vec_st(zero_s16v, 64, block); - vec_st(zero_s16v, 80, block); - vec_st(zero_s16v, 96, block); - vec_st(zero_s16v, 112, block); -} - - -static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { - register int i; - register vector unsigned char vdst, vsrc; - - /* dst and src are 16 bytes-aligned (guaranteed) */ - for (i = 0 ; (i + 15) < w ; i+=16) { - vdst = vec_ld(i, (unsigned char*)dst); - vsrc = vec_ld(i, (unsigned char*)src); - vdst = vec_add(vsrc, vdst); - vec_st(vdst, i, (unsigned char*)dst); - } - /* if w is not a multiple of 16 */ - for (; (i < w) ; i++) { - dst[i] = src[i]; - } -} - -static int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ - int sum; - register const vector unsigned char vzero = - (const vector unsigned char)vec_splat_u8(0); - register vector signed short temp0, temp1, temp2, temp3, temp4, - temp5, temp6, temp7; - { - register const vector signed short vprod1 =(const vector signed short) - { 1,-1, 1,-1, 1,-1, 1,-1 }; - register const vector signed short vprod2 =(const vector signed short) - { 1, 1,-1,-1, 1, 1,-1,-1 }; - register const vector signed short vprod3 =(const vector signed short) - { 1, 1, 1, 1,-1,-1,-1,-1 }; - register const vector unsigned char perm1 = (const vector unsigned char) - {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, - 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D}; - register const vector unsigned char perm2 = (const vector unsigned char) - {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, - 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B}; - register const vector unsigned char perm3 = (const vector unsigned char) - {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07}; - -#define ONEITERBUTTERFLY(i, res) \ - { \ - register vector unsigned char src1, src2, srcO; \ - register vector unsigned char dst1, dst2, dstO; \ - register vector signed short srcV, dstV; \ - register vector signed short but0, but1, but2, op1, op2, op3; \ - src1 = vec_ld(stride * i, src); \ - src2 = vec_ld((stride * i) + 15, src); \ - srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ - dst1 = vec_ld(stride * i, dst); \ - dst2 = vec_ld((stride * i) + 15, dst); \ - dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ - /* promote the unsigned chars to signed shorts */ \ - /* we're in the 8x8 function, we only care for the first 8 */ \ - srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \ - (vector signed char)srcO); \ - dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \ - (vector signed char)dstO); \ - /* subtractions inside the first butterfly */ \ - but0 = vec_sub(srcV, 
dstV); \ - op1 = vec_perm(but0, but0, perm1); \ - but1 = vec_mladd(but0, vprod1, op1); \ - op2 = vec_perm(but1, but1, perm2); \ - but2 = vec_mladd(but1, vprod2, op2); \ - op3 = vec_perm(but2, but2, perm3); \ - res = vec_mladd(but2, vprod3, op3); \ - } - ONEITERBUTTERFLY(0, temp0); - ONEITERBUTTERFLY(1, temp1); - ONEITERBUTTERFLY(2, temp2); - ONEITERBUTTERFLY(3, temp3); - ONEITERBUTTERFLY(4, temp4); - ONEITERBUTTERFLY(5, temp5); - ONEITERBUTTERFLY(6, temp6); - ONEITERBUTTERFLY(7, temp7); - } -#undef ONEITERBUTTERFLY - { - register vector signed int vsum; - register vector signed short line0 = vec_add(temp0, temp1); - register vector signed short line1 = vec_sub(temp0, temp1); - register vector signed short line2 = vec_add(temp2, temp3); - register vector signed short line3 = vec_sub(temp2, temp3); - register vector signed short line4 = vec_add(temp4, temp5); - register vector signed short line5 = vec_sub(temp4, temp5); - register vector signed short line6 = vec_add(temp6, temp7); - register vector signed short line7 = vec_sub(temp6, temp7); - - register vector signed short line0B = vec_add(line0, line2); - register vector signed short line2B = vec_sub(line0, line2); - register vector signed short line1B = vec_add(line1, line3); - register vector signed short line3B = vec_sub(line1, line3); - register vector signed short line4B = vec_add(line4, line6); - register vector signed short line6B = vec_sub(line4, line6); - register vector signed short line5B = vec_add(line5, line7); - register vector signed short line7B = vec_sub(line5, line7); - - register vector signed short line0C = vec_add(line0B, line4B); - register vector signed short line4C = vec_sub(line0B, line4B); - register vector signed short line1C = vec_add(line1B, line5B); - register vector signed short line5C = vec_sub(line1B, line5B); - register vector signed short line2C = vec_add(line2B, line6B); - register vector signed short line6C = vec_sub(line2B, line6B); - register vector signed short line3C = vec_add(line3B, line7B); - register vector signed short line7C = vec_sub(line3B, line7B); - - vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); - vsum = vec_sum4s(vec_abs(line1C), vsum); - vsum = vec_sum4s(vec_abs(line2C), vsum); - vsum = vec_sum4s(vec_abs(line3C), vsum); - vsum = vec_sum4s(vec_abs(line4C), vsum); - vsum = vec_sum4s(vec_abs(line5C), vsum); - vsum = vec_sum4s(vec_abs(line6C), vsum); - vsum = vec_sum4s(vec_abs(line7C), vsum); - vsum = vec_sums(vsum, (vector signed int)vzero); - vsum = vec_splat(vsum, 3); - vec_ste(vsum, 0, &sum); - } - return sum; -} - -/* -16x8 works with 16 elements; it allows to avoid replicating loads, and -give the compiler more rooms for scheduling. It's only used from -inside hadamard8_diff16_altivec. - -Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has a LOT -of spill code, it seems gcc (unlike xlc) cannot keep everything in registers -by itself. The following code include hand-made registers allocation. It's not -clean, but on a 7450 the resulting code is much faster (best case fall from -700+ cycles to 550). - -xlc doesn't add spill code, but it doesn't know how to schedule for the 7450, -and its code isn't much faster than gcc-3.3 on the 7450 (but uses 25% less -instructions...) - -On the 970, the hand-made RA is still a win (around 690 vs. around 780), but -xlc goes to around 660 on the regular C code... 
-*/ - -static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) { - int sum; - register vector signed short - temp0 __asm__ ("v0"), - temp1 __asm__ ("v1"), - temp2 __asm__ ("v2"), - temp3 __asm__ ("v3"), - temp4 __asm__ ("v4"), - temp5 __asm__ ("v5"), - temp6 __asm__ ("v6"), - temp7 __asm__ ("v7"); - register vector signed short - temp0S __asm__ ("v8"), - temp1S __asm__ ("v9"), - temp2S __asm__ ("v10"), - temp3S __asm__ ("v11"), - temp4S __asm__ ("v12"), - temp5S __asm__ ("v13"), - temp6S __asm__ ("v14"), - temp7S __asm__ ("v15"); - register const vector unsigned char vzero __asm__ ("v31") = - (const vector unsigned char)vec_splat_u8(0); - { - register const vector signed short vprod1 __asm__ ("v16") = - (const vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1 }; - register const vector signed short vprod2 __asm__ ("v17") = - (const vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1 }; - register const vector signed short vprod3 __asm__ ("v18") = - (const vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1 }; - register const vector unsigned char perm1 __asm__ ("v19") = - (const vector unsigned char) - {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, - 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D}; - register const vector unsigned char perm2 __asm__ ("v20") = - (const vector unsigned char) - {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, - 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B}; - register const vector unsigned char perm3 __asm__ ("v21") = - (const vector unsigned char) - {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07}; - -#define ONEITERBUTTERFLY(i, res1, res2) \ - { \ - register vector unsigned char src1 __asm__ ("v22"), \ - src2 __asm__ ("v23"), \ - dst1 __asm__ ("v24"), \ - dst2 __asm__ ("v25"), \ - srcO __asm__ ("v22"), \ - dstO __asm__ ("v23"); \ - \ - register vector signed short srcV __asm__ ("v24"), \ - dstV __asm__ ("v25"), \ - srcW __asm__ ("v26"), \ - dstW __asm__ ("v27"), \ - but0 __asm__ ("v28"), \ - but0S __asm__ ("v29"), \ - op1 __asm__ ("v30"), \ - but1 __asm__ ("v22"), \ - op1S __asm__ ("v23"), \ - but1S __asm__ ("v24"), \ - op2 __asm__ ("v25"), \ - but2 __asm__ ("v26"), \ - op2S __asm__ ("v27"), \ - but2S __asm__ ("v28"), \ - op3 __asm__ ("v29"), \ - op3S __asm__ ("v30"); \ - \ - src1 = vec_ld(stride * i, src); \ - src2 = vec_ld((stride * i) + 16, src); \ - srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ - dst1 = vec_ld(stride * i, dst); \ - dst2 = vec_ld((stride * i) + 16, dst); \ - dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ - /* promote the unsigned chars to signed shorts */ \ - srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \ - (vector signed char)srcO); \ - dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \ - (vector signed char)dstO); \ - srcW = (vector signed short)vec_mergel((vector signed char)vzero, \ - (vector signed char)srcO); \ - dstW = (vector signed short)vec_mergel((vector signed char)vzero, \ - (vector signed char)dstO); \ - /* subtractions inside the first butterfly */ \ - but0 = vec_sub(srcV, dstV); \ - but0S = vec_sub(srcW, dstW); \ - op1 = vec_perm(but0, but0, perm1); \ - but1 = vec_mladd(but0, vprod1, op1); \ - op1S = vec_perm(but0S, but0S, perm1); \ - but1S = vec_mladd(but0S, vprod1, op1S); \ - op2 = vec_perm(but1, but1, perm2); \ - but2 = vec_mladd(but1, vprod2, op2); \ - op2S = vec_perm(but1S, but1S, perm2); \ - but2S = vec_mladd(but1S, vprod2, op2S); \ - op3 = vec_perm(but2, but2, 
perm3); \ - res1 = vec_mladd(but2, vprod3, op3); \ - op3S = vec_perm(but2S, but2S, perm3); \ - res2 = vec_mladd(but2S, vprod3, op3S); \ - } - ONEITERBUTTERFLY(0, temp0, temp0S); - ONEITERBUTTERFLY(1, temp1, temp1S); - ONEITERBUTTERFLY(2, temp2, temp2S); - ONEITERBUTTERFLY(3, temp3, temp3S); - ONEITERBUTTERFLY(4, temp4, temp4S); - ONEITERBUTTERFLY(5, temp5, temp5S); - ONEITERBUTTERFLY(6, temp6, temp6S); - ONEITERBUTTERFLY(7, temp7, temp7S); - } -#undef ONEITERBUTTERFLY - { - register vector signed int vsum; - register vector signed short line0S, line1S, line2S, line3S, line4S, - line5S, line6S, line7S, line0BS,line2BS, - line1BS,line3BS,line4BS,line6BS,line5BS, - line7BS,line0CS,line4CS,line1CS,line5CS, - line2CS,line6CS,line3CS,line7CS; - - register vector signed short line0 = vec_add(temp0, temp1); - register vector signed short line1 = vec_sub(temp0, temp1); - register vector signed short line2 = vec_add(temp2, temp3); - register vector signed short line3 = vec_sub(temp2, temp3); - register vector signed short line4 = vec_add(temp4, temp5); - register vector signed short line5 = vec_sub(temp4, temp5); - register vector signed short line6 = vec_add(temp6, temp7); - register vector signed short line7 = vec_sub(temp6, temp7); - - register vector signed short line0B = vec_add(line0, line2); - register vector signed short line2B = vec_sub(line0, line2); - register vector signed short line1B = vec_add(line1, line3); - register vector signed short line3B = vec_sub(line1, line3); - register vector signed short line4B = vec_add(line4, line6); - register vector signed short line6B = vec_sub(line4, line6); - register vector signed short line5B = vec_add(line5, line7); - register vector signed short line7B = vec_sub(line5, line7); - - register vector signed short line0C = vec_add(line0B, line4B); - register vector signed short line4C = vec_sub(line0B, line4B); - register vector signed short line1C = vec_add(line1B, line5B); - register vector signed short line5C = vec_sub(line1B, line5B); - register vector signed short line2C = vec_add(line2B, line6B); - register vector signed short line6C = vec_sub(line2B, line6B); - register vector signed short line3C = vec_add(line3B, line7B); - register vector signed short line7C = vec_sub(line3B, line7B); - - vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); - vsum = vec_sum4s(vec_abs(line1C), vsum); - vsum = vec_sum4s(vec_abs(line2C), vsum); - vsum = vec_sum4s(vec_abs(line3C), vsum); - vsum = vec_sum4s(vec_abs(line4C), vsum); - vsum = vec_sum4s(vec_abs(line5C), vsum); - vsum = vec_sum4s(vec_abs(line6C), vsum); - vsum = vec_sum4s(vec_abs(line7C), vsum); - - line0S = vec_add(temp0S, temp1S); - line1S = vec_sub(temp0S, temp1S); - line2S = vec_add(temp2S, temp3S); - line3S = vec_sub(temp2S, temp3S); - line4S = vec_add(temp4S, temp5S); - line5S = vec_sub(temp4S, temp5S); - line6S = vec_add(temp6S, temp7S); - line7S = vec_sub(temp6S, temp7S); - - line0BS = vec_add(line0S, line2S); - line2BS = vec_sub(line0S, line2S); - line1BS = vec_add(line1S, line3S); - line3BS = vec_sub(line1S, line3S); - line4BS = vec_add(line4S, line6S); - line6BS = vec_sub(line4S, line6S); - line5BS = vec_add(line5S, line7S); - line7BS = vec_sub(line5S, line7S); - - line0CS = vec_add(line0BS, line4BS); - line4CS = vec_sub(line0BS, line4BS); - line1CS = vec_add(line1BS, line5BS); - line5CS = vec_sub(line1BS, line5BS); - line2CS = vec_add(line2BS, line6BS); - line6CS = vec_sub(line2BS, line6BS); - line3CS = vec_add(line3BS, line7BS); - line7CS = vec_sub(line3BS, line7BS); - - vsum = 
vec_sum4s(vec_abs(line0CS), vsum); - vsum = vec_sum4s(vec_abs(line1CS), vsum); - vsum = vec_sum4s(vec_abs(line2CS), vsum); - vsum = vec_sum4s(vec_abs(line3CS), vsum); - vsum = vec_sum4s(vec_abs(line4CS), vsum); - vsum = vec_sum4s(vec_abs(line5CS), vsum); - vsum = vec_sum4s(vec_abs(line6CS), vsum); - vsum = vec_sum4s(vec_abs(line7CS), vsum); - vsum = vec_sums(vsum, (vector signed int)vzero); - vsum = vec_splat(vsum, 3); - vec_ste(vsum, 0, &sum); - } - return sum; -} - -static int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ - int score; - score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8); - if (h==16) { - dst += 8*stride; - src += 8*stride; - score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8); - } - return score; -} - -av_cold void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx) -{ - const int high_bit_depth = avctx->bits_per_raw_sample > 8; - - c->pix_abs[0][1] = sad16_x2_altivec; - c->pix_abs[0][2] = sad16_y2_altivec; - c->pix_abs[0][3] = sad16_xy2_altivec; - c->pix_abs[0][0] = sad16_altivec; - c->pix_abs[1][0] = sad8_altivec; - c->sad[0]= sad16_altivec; - c->sad[1]= sad8_altivec; - c->pix_norm1 = pix_norm1_altivec; - c->sse[1]= sse8_altivec; - c->sse[0]= sse16_altivec; - c->pix_sum = pix_sum_altivec; - c->diff_pixels = diff_pixels_altivec; - c->add_bytes= add_bytes_altivec; - if (!high_bit_depth) { - c->get_pixels = get_pixels_altivec; - c->clear_block = clear_block_altivec; - } - - c->hadamard8_diff[0] = hadamard8_diff16_altivec; - c->hadamard8_diff[1] = hadamard8_diff8x8_altivec; -} diff --git a/ffmpeg/libavcodec/ppc/dsputil_altivec.h b/ffmpeg/libavcodec/ppc/dsputil_altivec.h deleted file mode 100644 index 0e769ab..0000000 --- a/ffmpeg/libavcodec/ppc/dsputil_altivec.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2002 Brian Foley - * Copyright (c) 2002 Dieter Shirley - * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
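/* Editorial aside, not part of the deleted files: a plain-C sketch of what the
   hadamard8_diff routines above compute: take the src-dst difference block, apply an
   8-point Hadamard butterfly along rows and then columns, and sum the absolute
   values. Helper names are illustrative; this is a reference of the idea under that
   assumption, not a bit-exact drop-in for the AltiVec code. */
#include <stdlib.h>

static void hadamard8(int v[8])
{
    for (int step = 1; step < 8; step <<= 1)        /* three butterfly passes */
        for (int i = 0; i < 8; i += step << 1)
            for (int j = i; j < i + step; j++) {
                int a = v[j], b = v[j + step];
                v[j]        = a + b;
                v[j + step] = a - b;
            }
}

static int hadamard8x8_satd(const int diff[8][8])
{
    int tmp[8][8], sum = 0;
    for (int i = 0; i < 8; i++) {                   /* transform rows */
        for (int j = 0; j < 8; j++) tmp[i][j] = diff[i][j];
        hadamard8(tmp[i]);
    }
    for (int j = 0; j < 8; j++) {                   /* transform columns, accumulate */
        int col[8];
        for (int i = 0; i < 8; i++) col[i] = tmp[i][j];
        hadamard8(col);
        for (int i = 0; i < 8; i++) sum += abs(col[i]);
    }
    return sum;
}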
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_PPC_DSPUTIL_ALTIVEC_H -#define AVCODEC_PPC_DSPUTIL_ALTIVEC_H - -#include <stdint.h> -#include "libavcodec/dsputil.h" - -void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); - -void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); - -void ff_fdct_altivec(int16_t *block); -void ff_gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h, - int x16, int y16, int rounder); -void ff_idct_put_altivec(uint8_t *dest, int line_size, int16_t *block); -void ff_idct_add_altivec(uint8_t *dest, int line_size, int16_t *block); - -void ff_dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx); -void ff_float_init_altivec(DSPContext* c, AVCodecContext *avctx); -void ff_int_init_altivec(DSPContext* c, AVCodecContext *avctx); - -#endif /* AVCODEC_PPC_DSPUTIL_ALTIVEC_H */ diff --git a/ffmpeg/libavcodec/ppc/dsputil_ppc.c b/ffmpeg/libavcodec/ppc/dsputil_ppc.c deleted file mode 100644 index 7454ea0..0000000 --- a/ffmpeg/libavcodec/ppc/dsputil_ppc.c +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (c) 2002 Brian Foley - * Copyright (c) 2002 Dieter Shirley - * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <string.h> - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/mem.h" -#include "libavutil/ppc/cpu.h" -#include "dsputil_altivec.h" - -/* ***** WARNING ***** WARNING ***** WARNING ***** */ -/* -clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a -cache line size not equal to 32 bytes. -Fortunately all processor used by Apple up to at least the 7450 (aka second -generation G4) use 32 bytes cache line. -This is due to the use of the 'dcbz' instruction. It simply clear to zero a -single cache line, so you need to know the cache line size to use it ! -It's absurd, but it's fast... - -update 24/06/2003 : Apple released yesterday the G5, with a PPC970. cache line -size: 128 bytes. Oups. -The semantic of dcbz was changed, it always clear 32 bytes. so the function -below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl, -which is defined to clear a cache line (as dcbz before). So we still can -distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required. 
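/* Editorial aside, not part of the deleted file: a quick arithmetic check of the loop
   bounds used below. The cleared array is 6 blocks of 64 int16_t coefficients,
   i.e. 6*64*2 = 768 bytes, so in the aligned case the dcbz path issues 24 32-byte
   cache-line clears and the dcbzl path issues 6 128-byte clears. */
#include <stdio.h>

int main(void)
{
    const unsigned block_bytes = 6 * 64 * sizeof(short);      /* 768 */
    printf("dcbz  (32 B):  %u clears\n", block_bytes / 32);   /* 24 */
    printf("dcbzl (128 B): %u clears\n", block_bytes / 128);  /* 6 */
    return 0;
}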
- -see <http://developer.apple.com/technotes/tn/tn2087.html> -and <http://developer.apple.com/technotes/tn/tn2086.html> -*/ -static void clear_blocks_dcbz32_ppc(int16_t *blocks) -{ - register int misal = ((unsigned long)blocks & 0x00000010); - register int i = 0; - if (misal) { - ((unsigned long*)blocks)[0] = 0L; - ((unsigned long*)blocks)[1] = 0L; - ((unsigned long*)blocks)[2] = 0L; - ((unsigned long*)blocks)[3] = 0L; - i += 16; - } - for ( ; i < sizeof(int16_t)*6*64-31 ; i += 32) { - __asm__ volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory"); - } - if (misal) { - ((unsigned long*)blocks)[188] = 0L; - ((unsigned long*)blocks)[189] = 0L; - ((unsigned long*)blocks)[190] = 0L; - ((unsigned long*)blocks)[191] = 0L; - i += 16; - } -} - -/* same as above, when dcbzl clear a whole 128B cache line - i.e. the PPC970 aka G5 */ -#if HAVE_DCBZL -static void clear_blocks_dcbz128_ppc(int16_t *blocks) -{ - register int misal = ((unsigned long)blocks & 0x0000007f); - register int i = 0; - if (misal) { - // we could probably also optimize this case, - // but there's not much point as the machines - // aren't available yet (2003-06-26) - memset(blocks, 0, sizeof(int16_t)*6*64); - } - else - for ( ; i < sizeof(int16_t)*6*64 ; i += 128) { - __asm__ volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory"); - } -} -#else -static void clear_blocks_dcbz128_ppc(int16_t *blocks) -{ - memset(blocks, 0, sizeof(int16_t)*6*64); -} -#endif - -#if HAVE_DCBZL -/* check dcbz report how many bytes are set to 0 by dcbz */ -/* update 24/06/2003 : replace dcbz by dcbzl to get - the intended effect (Apple "fixed" dcbz) - unfortunately this cannot be used unless the assembler - knows about dcbzl ... */ -static long check_dcbzl_effect(void) -{ - register char *fakedata = av_malloc(1024); - register char *fakedata_middle; - register long zero = 0; - register long i = 0; - long count = 0; - - if (!fakedata) { - return 0L; - } - - fakedata_middle = (fakedata + 512); - - memset(fakedata, 0xFF, 1024); - - /* below the constraint "b" seems to mean "Address base register" - in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... 
*/ - __asm__ volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero)); - - for (i = 0; i < 1024 ; i ++) { - if (fakedata[i] == (char)0) - count++; - } - - av_free(fakedata); - - return count; -} -#else -static long check_dcbzl_effect(void) -{ - return 0; -} -#endif - -av_cold void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx) -{ - const int high_bit_depth = avctx->bits_per_raw_sample > 8; - int mm_flags = av_get_cpu_flags(); - - // Common optimizations whether AltiVec is available or not - if (!high_bit_depth) { - switch (check_dcbzl_effect()) { - case 32: - c->clear_blocks = clear_blocks_dcbz32_ppc; - break; - case 128: - c->clear_blocks = clear_blocks_dcbz128_ppc; - break; - default: - break; - } - } - - if (PPC_ALTIVEC(mm_flags)) { - ff_dsputil_init_altivec(c, avctx); - ff_int_init_altivec(c, avctx); - c->gmc1 = ff_gmc1_altivec; - -#if CONFIG_ENCODERS - if (avctx->bits_per_raw_sample <= 8 && - (avctx->dct_algo == FF_DCT_AUTO || - avctx->dct_algo == FF_DCT_ALTIVEC)) { - c->fdct = ff_fdct_altivec; - } -#endif //CONFIG_ENCODERS - - if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) { - if ((avctx->idct_algo == FF_IDCT_AUTO) || - (avctx->idct_algo == FF_IDCT_ALTIVEC)) { - c->idct_put = ff_idct_put_altivec; - c->idct_add = ff_idct_add_altivec; - c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; - } - } - - } -} diff --git a/ffmpeg/libavcodec/ppc/fdct_altivec.c b/ffmpeg/libavcodec/ppc/fdct_altivec.c deleted file mode 100644 index ff816e2..0000000 --- a/ffmpeg/libavcodec/ppc/fdct_altivec.c +++ /dev/null @@ -1,460 +0,0 @@ -/* - * Copyright (C) 2003 James Klicman <james@klicman.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#if HAVE_ALTIVEC_H -#include <altivec.h> -#endif -#include "libavutil/common.h" -#include "dsputil_altivec.h" - -#define vs16(v) ((vector signed short)(v)) -#define vs32(v) ((vector signed int)(v)) -#define vu8(v) ((vector unsigned char)(v)) -#define vu16(v) ((vector unsigned short)(v)) -#define vu32(v) ((vector unsigned int)(v)) - - -#define C1 0.98078525066375732421875000 /* cos(1*PI/16) */ -#define C2 0.92387950420379638671875000 /* cos(2*PI/16) */ -#define C3 0.83146959543228149414062500 /* cos(3*PI/16) */ -#define C4 0.70710676908493041992187500 /* cos(4*PI/16) */ -#define C5 0.55557024478912353515625000 /* cos(5*PI/16) */ -#define C6 0.38268342614173889160156250 /* cos(6*PI/16) */ -#define C7 0.19509032368659973144531250 /* cos(7*PI/16) */ -#define SQRT_2 1.41421353816986083984375000 /* sqrt(2) */ - - -#define W0 -(2 * C2) -#define W1 (2 * C6) -#define W2 (SQRT_2 * C6) -#define W3 (SQRT_2 * C3) -#define W4 (SQRT_2 * (-C1 + C3 + C5 - C7)) -#define W5 (SQRT_2 * ( C1 + C3 - C5 + C7)) -#define W6 (SQRT_2 * ( C1 + C3 + C5 - C7)) -#define W7 (SQRT_2 * ( C1 + C3 - C5 - C7)) -#define W8 (SQRT_2 * ( C7 - C3)) -#define W9 (SQRT_2 * (-C1 - C3)) -#define WA (SQRT_2 * (-C3 - C5)) -#define WB (SQRT_2 * ( C5 - C3)) - - -static vector float fdctconsts[3] = { - { W0, W1, W2, W3 }, - { W4, W5, W6, W7 }, - { W8, W9, WA, WB } -}; - -#define LD_W0 vec_splat(cnsts0, 0) -#define LD_W1 vec_splat(cnsts0, 1) -#define LD_W2 vec_splat(cnsts0, 2) -#define LD_W3 vec_splat(cnsts0, 3) -#define LD_W4 vec_splat(cnsts1, 0) -#define LD_W5 vec_splat(cnsts1, 1) -#define LD_W6 vec_splat(cnsts1, 2) -#define LD_W7 vec_splat(cnsts1, 3) -#define LD_W8 vec_splat(cnsts2, 0) -#define LD_W9 vec_splat(cnsts2, 1) -#define LD_WA vec_splat(cnsts2, 2) -#define LD_WB vec_splat(cnsts2, 3) - - -#define FDCTROW(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \ - x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \ - x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \ - x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \ - x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \ - x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \ - x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \ - x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \ - x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \ - \ - b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \ - b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \ - b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \ - b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \ - \ - b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \ - b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \ - b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \ - cnst = LD_W2; \ - b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \ - cnst = LD_W1; \ - b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \ - cnst = LD_W0; \ - b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \ - \ - x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \ - x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \ - x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \ - x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \ - x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \ - cnst = LD_W3; \ - x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \ - \ - cnst = LD_W8; \ - x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \ - cnst = LD_W9; \ - x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \ - cnst = LD_WA; \ - x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \ - cnst = LD_WB; \ - x3 
= vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \ - \ - cnst = LD_W4; \ - b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \ - cnst = LD_W5; \ - b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \ - cnst = LD_W6; \ - b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \ - cnst = LD_W7; \ - b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \ - \ - b7 = vec_add(b7, x2); /* b7 = b7 + x2; */ \ - b5 = vec_add(b5, x3); /* b5 = b5 + x3; */ \ - b3 = vec_add(b3, x2); /* b3 = b3 + x2; */ \ - b1 = vec_add(b1, x3); /* b1 = b1 + x3; */ \ - /* }}} */ - -#define FDCTCOL(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \ - x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \ - x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \ - x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \ - x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \ - x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \ - x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \ - x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \ - x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \ - \ - b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \ - b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \ - b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \ - b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \ - \ - b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \ - b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \ - b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \ - cnst = LD_W2; \ - b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \ - cnst = LD_W1; \ - b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \ - cnst = LD_W0; \ - b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \ - \ - x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \ - x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \ - x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \ - x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \ - x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \ - cnst = LD_W3; \ - x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \ - \ - cnst = LD_W8; \ - x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \ - cnst = LD_W9; \ - x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \ - cnst = LD_WA; \ - x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \ - cnst = LD_WB; \ - x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \ - \ - cnst = LD_W4; \ - b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \ - cnst = LD_W5; \ - b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \ - cnst = LD_W6; \ - b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \ - cnst = LD_W7; \ - b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \ - \ - b7 = vec_add(b7, x2); /* b7 += x2; */ \ - b5 = vec_add(b5, x3); /* b5 += x3; */ \ - b3 = vec_add(b3, x2); /* b3 += x2; */ \ - b1 = vec_add(b1, x3); /* b1 += x3; */ \ - /* }}} */ - - - -/* two dimensional discrete cosine transform */ - -void ff_fdct_altivec(int16_t *block) -{ - vector signed short *bp; - vector float *cp; - vector float b00, b10, b20, b30, b40, b50, b60, b70; - vector float b01, b11, b21, b31, b41, b51, b61, b71; - vector float mzero, cnst, cnsts0, cnsts1, cnsts2; - vector float x0, x1, x2, x3, x4, x5, x6, x7, x8; - - /* setup constants {{{ */ - /* mzero = -0.0 */ - mzero = ((vector float)vec_splat_u32(-1)); - mzero = ((vector float)vec_sl(vu32(mzero), vu32(mzero))); - cp = fdctconsts; - cnsts0 = vec_ld(0, cp); cp++; - cnsts1 = vec_ld(0, cp); cp++; - cnsts2 = vec_ld(0, cp); - /* }}} */ - - - /* 8x8 matrix transpose (vector short[8]) {{{ */ -#define MERGE_S16(hl,a,b) vec_merge##hl(vs16(a), vs16(b)) - - bp = (vector signed short*)block; - b00 = ((vector float)vec_ld(0, bp)); - b40 = ((vector float)vec_ld(16*4, bp)); - b01 = ((vector float)MERGE_S16(h, b00, b40)); - 
b11 = ((vector float)MERGE_S16(l, b00, b40)); - bp++; - b10 = ((vector float)vec_ld(0, bp)); - b50 = ((vector float)vec_ld(16*4, bp)); - b21 = ((vector float)MERGE_S16(h, b10, b50)); - b31 = ((vector float)MERGE_S16(l, b10, b50)); - bp++; - b20 = ((vector float)vec_ld(0, bp)); - b60 = ((vector float)vec_ld(16*4, bp)); - b41 = ((vector float)MERGE_S16(h, b20, b60)); - b51 = ((vector float)MERGE_S16(l, b20, b60)); - bp++; - b30 = ((vector float)vec_ld(0, bp)); - b70 = ((vector float)vec_ld(16*4, bp)); - b61 = ((vector float)MERGE_S16(h, b30, b70)); - b71 = ((vector float)MERGE_S16(l, b30, b70)); - - x0 = ((vector float)MERGE_S16(h, b01, b41)); - x1 = ((vector float)MERGE_S16(l, b01, b41)); - x2 = ((vector float)MERGE_S16(h, b11, b51)); - x3 = ((vector float)MERGE_S16(l, b11, b51)); - x4 = ((vector float)MERGE_S16(h, b21, b61)); - x5 = ((vector float)MERGE_S16(l, b21, b61)); - x6 = ((vector float)MERGE_S16(h, b31, b71)); - x7 = ((vector float)MERGE_S16(l, b31, b71)); - - b00 = ((vector float)MERGE_S16(h, x0, x4)); - b10 = ((vector float)MERGE_S16(l, x0, x4)); - b20 = ((vector float)MERGE_S16(h, x1, x5)); - b30 = ((vector float)MERGE_S16(l, x1, x5)); - b40 = ((vector float)MERGE_S16(h, x2, x6)); - b50 = ((vector float)MERGE_S16(l, x2, x6)); - b60 = ((vector float)MERGE_S16(h, x3, x7)); - b70 = ((vector float)MERGE_S16(l, x3, x7)); - -#undef MERGE_S16 - /* }}} */ - - -/* Some of the initial calculations can be done as vector short before - * conversion to vector float. The following code section takes advantage - * of this. - */ - /* fdct rows {{{ */ - x0 = ((vector float)vec_add(vs16(b00), vs16(b70))); - x7 = ((vector float)vec_sub(vs16(b00), vs16(b70))); - x1 = ((vector float)vec_add(vs16(b10), vs16(b60))); - x6 = ((vector float)vec_sub(vs16(b10), vs16(b60))); - x2 = ((vector float)vec_add(vs16(b20), vs16(b50))); - x5 = ((vector float)vec_sub(vs16(b20), vs16(b50))); - x3 = ((vector float)vec_add(vs16(b30), vs16(b40))); - x4 = ((vector float)vec_sub(vs16(b30), vs16(b40))); - - b70 = ((vector float)vec_add(vs16(x0), vs16(x3))); - b10 = ((vector float)vec_add(vs16(x1), vs16(x2))); - - b00 = ((vector float)vec_add(vs16(b70), vs16(b10))); - b40 = ((vector float)vec_sub(vs16(b70), vs16(b10))); - -#define CTF0(n) \ - b##n##1 = ((vector float)vec_unpackl(vs16(b##n##0))); \ - b##n##0 = ((vector float)vec_unpackh(vs16(b##n##0))); \ - b##n##1 = vec_ctf(vs32(b##n##1), 0); \ - b##n##0 = vec_ctf(vs32(b##n##0), 0); - - CTF0(0); - CTF0(4); - - b20 = ((vector float)vec_sub(vs16(x0), vs16(x3))); - b60 = ((vector float)vec_sub(vs16(x1), vs16(x2))); - - CTF0(2); - CTF0(6); - -#undef CTF0 - - x0 = vec_add(b60, b20); - x1 = vec_add(b61, b21); - - cnst = LD_W2; - x0 = vec_madd(cnst, x0, mzero); - x1 = vec_madd(cnst, x1, mzero); - cnst = LD_W1; - b20 = vec_madd(cnst, b20, x0); - b21 = vec_madd(cnst, b21, x1); - cnst = LD_W0; - b60 = vec_madd(cnst, b60, x0); - b61 = vec_madd(cnst, b61, x1); - -#define CTFX(x,b) \ - b##0 = ((vector float)vec_unpackh(vs16(x))); \ - b##1 = ((vector float)vec_unpackl(vs16(x))); \ - b##0 = vec_ctf(vs32(b##0), 0); \ - b##1 = vec_ctf(vs32(b##1), 0); \ - - CTFX(x4, b7); - CTFX(x5, b5); - CTFX(x6, b3); - CTFX(x7, b1); - -#undef CTFX - - - x0 = vec_add(b70, b10); - x1 = vec_add(b50, b30); - x2 = vec_add(b70, b30); - x3 = vec_add(b50, b10); - x8 = vec_add(x2, x3); - cnst = LD_W3; - x8 = vec_madd(cnst, x8, mzero); - - cnst = LD_W8; - x0 = vec_madd(cnst, x0, mzero); - cnst = LD_W9; - x1 = vec_madd(cnst, x1, mzero); - cnst = LD_WA; - x2 = vec_madd(cnst, x2, x8); - cnst = LD_WB; - x3 = 
vec_madd(cnst, x3, x8); - - cnst = LD_W4; - b70 = vec_madd(cnst, b70, x0); - cnst = LD_W5; - b50 = vec_madd(cnst, b50, x1); - cnst = LD_W6; - b30 = vec_madd(cnst, b30, x1); - cnst = LD_W7; - b10 = vec_madd(cnst, b10, x0); - - b70 = vec_add(b70, x2); - b50 = vec_add(b50, x3); - b30 = vec_add(b30, x2); - b10 = vec_add(b10, x3); - - - x0 = vec_add(b71, b11); - x1 = vec_add(b51, b31); - x2 = vec_add(b71, b31); - x3 = vec_add(b51, b11); - x8 = vec_add(x2, x3); - cnst = LD_W3; - x8 = vec_madd(cnst, x8, mzero); - - cnst = LD_W8; - x0 = vec_madd(cnst, x0, mzero); - cnst = LD_W9; - x1 = vec_madd(cnst, x1, mzero); - cnst = LD_WA; - x2 = vec_madd(cnst, x2, x8); - cnst = LD_WB; - x3 = vec_madd(cnst, x3, x8); - - cnst = LD_W4; - b71 = vec_madd(cnst, b71, x0); - cnst = LD_W5; - b51 = vec_madd(cnst, b51, x1); - cnst = LD_W6; - b31 = vec_madd(cnst, b31, x1); - cnst = LD_W7; - b11 = vec_madd(cnst, b11, x0); - - b71 = vec_add(b71, x2); - b51 = vec_add(b51, x3); - b31 = vec_add(b31, x2); - b11 = vec_add(b11, x3); - /* }}} */ - - - /* 8x8 matrix transpose (vector float[8][2]) {{{ */ - x0 = vec_mergel(b00, b20); - x1 = vec_mergeh(b00, b20); - x2 = vec_mergel(b10, b30); - x3 = vec_mergeh(b10, b30); - - b00 = vec_mergeh(x1, x3); - b10 = vec_mergel(x1, x3); - b20 = vec_mergeh(x0, x2); - b30 = vec_mergel(x0, x2); - - x4 = vec_mergel(b41, b61); - x5 = vec_mergeh(b41, b61); - x6 = vec_mergel(b51, b71); - x7 = vec_mergeh(b51, b71); - - b41 = vec_mergeh(x5, x7); - b51 = vec_mergel(x5, x7); - b61 = vec_mergeh(x4, x6); - b71 = vec_mergel(x4, x6); - - x0 = vec_mergel(b01, b21); - x1 = vec_mergeh(b01, b21); - x2 = vec_mergel(b11, b31); - x3 = vec_mergeh(b11, b31); - - x4 = vec_mergel(b40, b60); - x5 = vec_mergeh(b40, b60); - x6 = vec_mergel(b50, b70); - x7 = vec_mergeh(b50, b70); - - b40 = vec_mergeh(x1, x3); - b50 = vec_mergel(x1, x3); - b60 = vec_mergeh(x0, x2); - b70 = vec_mergel(x0, x2); - - b01 = vec_mergeh(x5, x7); - b11 = vec_mergel(x5, x7); - b21 = vec_mergeh(x4, x6); - b31 = vec_mergel(x4, x6); - /* }}} */ - - - FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70); - FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71); - - - /* round, convert back to short {{{ */ -#define CTS(n) \ - b##n##0 = vec_round(b##n##0); \ - b##n##1 = vec_round(b##n##1); \ - b##n##0 = ((vector float)vec_cts(b##n##0, 0)); \ - b##n##1 = ((vector float)vec_cts(b##n##1, 0)); \ - b##n##0 = ((vector float)vec_pack(vs32(b##n##0), vs32(b##n##1))); \ - vec_st(vs16(b##n##0), 0, bp); - - bp = (vector signed short*)block; - CTS(0); bp++; - CTS(1); bp++; - CTS(2); bp++; - CTS(3); bp++; - CTS(4); bp++; - CTS(5); bp++; - CTS(6); bp++; - CTS(7); - -#undef CTS - /* }}} */ -} diff --git a/ffmpeg/libavcodec/ppc/fft_altivec.c b/ffmpeg/libavcodec/ppc/fft_altivec.c deleted file mode 100644 index 2357198..0000000 --- a/ffmpeg/libavcodec/ppc/fft_altivec.c +++ /dev/null @@ -1,155 +0,0 @@ -/* - * FFT/IFFT transforms - * AltiVec-enabled - * Copyright (c) 2009 Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/cpu.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/fft.h" - -/** - * Do a complex FFT with the parameters defined in ff_fft_init(). The - * input data must be permuted before with s->revtab table. No - * 1.0/sqrt(n) normalization is done. - * AltiVec-enabled - * This code assumes that the 'z' pointer is 16 bytes-aligned - * It also assumes all FFTComplex are 8 bytes-aligned pair of float - */ - -void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z); -void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z); - -#if HAVE_GNU_AS && HAVE_ALTIVEC -static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input) -{ - int j, k; - int n = 1 << s->mdct_bits; - int n4 = n >> 2; - int n8 = n >> 3; - int n32 = n >> 5; - const uint16_t *revtabj = s->revtab; - const uint16_t *revtabk = s->revtab+n4; - const vec_f *tcos = (const vec_f*)(s->tcos+n8); - const vec_f *tsin = (const vec_f*)(s->tsin+n8); - const vec_f *pin = (const vec_f*)(input+n4); - vec_f *pout = (vec_f*)(output+n4); - - /* pre rotation */ - k = n32-1; - do { - vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d; -#define CMULA(p,o0,o1,o2,o3)\ - a = pin[ k*2+p]; /* { z[k].re, z[k].im, z[k+1].re, z[k+1].im } */\ - b = pin[-k*2-p-1]; /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\ - re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } */\ - im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } */\ - cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\ - sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\ - r##p = im*cos - re*sin;\ - i##p = re*cos + im*sin; -#define STORE2(v,dst)\ - j = dst;\ - vec_ste(v, 0, output+j*2);\ - vec_ste(v, 4, output+j*2); -#define STORE8(p)\ - a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\ - b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\ - c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\ - d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\ - STORE2(a, revtabk[ p*2-4]);\ - STORE2(b, revtabk[ p*2-3]);\ - STORE2(c, revtabj[-p*2+2]);\ - STORE2(d, revtabj[-p*2+3]); - - cos0 = tcos[k]; - sin0 = tsin[k]; - cos1 = tcos[-k-1]; - sin1 = tsin[-k-1]; - CMULA(0, 0,1,2,3); - CMULA(1, 2,3,0,1); - STORE8(0); - STORE8(1); - revtabj += 4; - revtabk -= 4; - k--; - } while(k >= 0); - - ff_fft_calc_altivec(s, (FFTComplex*)output); - - /* post rotation + reordering */ - j = -n32; - k = n32-1; - do { - vec_f cos,sin,re,im,a,b,c,d; -#define CMULB(d0,d1,o)\ - re = pout[o*2];\ - im = pout[o*2+1];\ - cos = tcos[o];\ - sin = tsin[o];\ - d0 = im*sin - re*cos;\ - d1 = re*sin + im*cos; - - CMULB(a,b,j); - CMULB(c,d,k); - pout[2*j] = vec_perm(a, d, vcprm(0,s3,1,s2)); - pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0)); - pout[2*k] = vec_perm(c, b, vcprm(0,s3,1,s2)); - pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0)); - j++; - k--; - } while(k >= 0); -} - -static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input) -{ - int k; - int n = 1 << s->mdct_bits; - int n4 = n >> 2; - int n16 = n >> 4; - vec_u32 sign = {1U<<31,1U<<31,1U<<31,1U<<31}; - vec_u32 *p0 = (vec_u32*)(output+n4); - vec_u32 *p1 = (vec_u32*)(output+n4*3); - - 
imdct_half_altivec(s, output + n4, input); - - for (k = 0; k < n16; k++) { - vec_u32 a = p0[k] ^ sign; - vec_u32 b = p1[-k-1]; - p0[-k-1] = vec_perm(a, a, vcprm(3,2,1,0)); - p1[k] = vec_perm(b, b, vcprm(3,2,1,0)); - } -} -#endif /* HAVE_GNU_AS && HAVE_ALTIVEC */ - -av_cold void ff_fft_init_ppc(FFTContext *s) -{ -#if HAVE_GNU_AS && HAVE_ALTIVEC - if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) - return; - - s->fft_calc = ff_fft_calc_interleave_altivec; - if (s->mdct_bits >= 5) { - s->imdct_calc = imdct_calc_altivec; - s->imdct_half = imdct_half_altivec; - } -#endif /* HAVE_GNU_AS && HAVE_ALTIVEC */ -} diff --git a/ffmpeg/libavcodec/ppc/fft_altivec_s.S b/ffmpeg/libavcodec/ppc/fft_altivec_s.S deleted file mode 100644 index 16ce838..0000000 --- a/ffmpeg/libavcodec/ppc/fft_altivec_s.S +++ /dev/null @@ -1,449 +0,0 @@ -/* - * FFT transform with Altivec optimizations - * Copyright (c) 2009 Loren Merritt - * - * This algorithm (though not any of the implementation details) is - * based on libdjbfft by D. J. Bernstein. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* - * These functions are not individually interchangeable with the C versions. - * While C takes arrays of FFTComplex, Altivec leaves intermediate results - * in blocks as convenient to the vector size. - * i.e. {4x real, 4x imaginary, 4x real, ...} - * - * I ignore standard calling convention. - * Instead, the following registers are treated as global constants: - * v14: zero - * v15..v18: cosines - * v19..v29: permutations - * r9: 16 - * r12: ff_cos_tabs - * and the rest are free for local use. 
- */ - -#include "config.h" -#include "asm.S" - -.text - -.macro addi2 ra, imm // add 32-bit immediate -.if \imm & 0xffff - addi \ra, \ra, \imm@l -.endif -.if (\imm+0x8000)>>16 - addis \ra, \ra, \imm@ha -.endif -.endm - -.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3 - vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2} - vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3} - vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5} - vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7} - vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4} - vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8} - vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1} - vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3} - vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3} - vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3} -.endm - -.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3 - vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2} - vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3} - vperm \b2,\b0,\b1,v20 - vperm \b3,\b0,\b1,v21 - vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5} - vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7} - vaddfp \b0,\b2,\b3 - vsubfp \b1,\b2,\b3 - vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4} - vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8} - vmrghw \b2,\b0,\b1 - vperm \b3,\b0,\b1,v22 - vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1} - vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3} - vaddfp \b0,\b2,\b3 - vsubfp \b1,\b2,\b3 - vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3} - vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3} - vperm \b2,\b0,\b1,v23 - vperm \b3,\b0,\b1,v24 -.endm - -.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1 - vmrghw \b2,\b0,\b1 // vcprm(0,s0,1,s1) // {r4,r6,i4,i6} - vmrglw \b3,\b0,\b1 // vcprm(2,s2,3,s3) // {r5,r7,i5,i7} - vperm \a2,\a0,\a1,v20 // FFT4 ... 
- vperm \a3,\a0,\a1,v21 - vaddfp \b0,\b2,\b3 // {t1,t3,t2,t4} - vsubfp \b1,\b2,\b3 // {r5,r7,i5,i7} - vperm \b4,\b1,\b1,v25 // vcprm(2,3,0,1) // {i5,i7,r5,r7} - vaddfp \a0,\a2,\a3 - vsubfp \a1,\a2,\a3 - vmaddfp \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2) - vmaddfp \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9} - vmrghw \a2,\a0,\a1 - vperm \a3,\a0,\a1,v22 - vperm \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8} - vperm \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta} - vaddfp \a0,\a2,\a3 - vsubfp \a1,\a2,\a3 - vaddfp \b0,\b2,\b3 // {t1,t2,t9,ta} - vsubfp \b1,\b2,\b3 // {t6,t5,tc,tb} - vperm \a2,\a0,\a1,v23 - vperm \a3,\a0,\a1,v24 - vperm \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb} - vperm \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc} - vsubfp \b0,\a2,\b2 // {r4,r5,r6,r7} - vsubfp \b1,\a3,\b3 // {i4,i5,i6,i7} - vaddfp \a0,\a2,\b2 // {r0,r1,r2,r3} - vaddfp \a1,\a3,\b3 // {i0,i1,i2,i3} -.endm - -.macro BF d0,d1,s0,s1 - vsubfp \d1,\s0,\s1 - vaddfp \d0,\s0,\s1 -.endm - -.macro zip d0,d1,s0,s1 - vmrghw \d0,\s0,\s1 - vmrglw \d1,\s0,\s1 -.endm - -.macro def_fft4 interleave -fft4\interleave\()_altivec: - lvx v0, 0,r3 - lvx v1,r9,r3 - FFT4 v0,v1,v2,v3 -.ifnb \interleave - zip v0,v1,v2,v3 - stvx v0, 0,r3 - stvx v1,r9,r3 -.else - stvx v2, 0,r3 - stvx v3,r9,r3 -.endif - blr -.endm - -.macro def_fft8 interleave -fft8\interleave\()_altivec: - addi r4,r3,32 - lvx v0, 0,r3 - lvx v1,r9,r3 - lvx v2, 0,r4 - lvx v3,r9,r4 - FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8 -.ifnb \interleave - zip v4,v5,v0,v1 - zip v6,v7,v2,v3 - stvx v4, 0,r3 - stvx v5,r9,r3 - stvx v6, 0,r4 - stvx v7,r9,r4 -.else - stvx v0, 0,r3 - stvx v1,r9,r3 - stvx v2, 0,r4 - stvx v3,r9,r4 -.endif - blr -.endm - -.macro def_fft16 interleave -fft16\interleave\()_altivec: - addi r5,r3,64 - addi r6,r3,96 - addi r4,r3,32 - lvx v0, 0,r5 - lvx v1,r9,r5 - lvx v2, 0,r6 - lvx v3,r9,r6 - FFT4x2 v0,v1,v2,v3,v4,v5,v6,v7 - lvx v0, 0,r3 - lvx v1,r9,r3 - lvx v2, 0,r4 - lvx v3,r9,r4 - FFT8 v0,v1,v2,v3,v8,v9,v10,v11,v12 - vmaddfp v8,v4,v15,v14 // r2*wre - vmaddfp v9,v5,v15,v14 // i2*wre - vmaddfp v10,v6,v15,v14 // r3*wre - vmaddfp v11,v7,v15,v14 // i3*wre - vmaddfp v8,v5,v16,v8 // i2*wim - vnmsubfp v9,v4,v16,v9 // r2*wim - vnmsubfp v10,v7,v16,v10 // i3*wim - vmaddfp v11,v6,v16,v11 // r3*wim - BF v10,v12,v10,v8 - BF v11,v13,v9,v11 - BF v0,v4,v0,v10 - BF v3,v7,v3,v12 - BF v1,v5,v1,v11 - BF v2,v6,v2,v13 -.ifnb \interleave - zip v8, v9,v0,v1 - zip v10,v11,v2,v3 - zip v12,v13,v4,v5 - zip v14,v15,v6,v7 - stvx v8, 0,r3 - stvx v9,r9,r3 - stvx v10, 0,r4 - stvx v11,r9,r4 - stvx v12, 0,r5 - stvx v13,r9,r5 - stvx v14, 0,r6 - stvx v15,r9,r6 -.else - stvx v0, 0,r3 - stvx v4, 0,r5 - stvx v3,r9,r4 - stvx v7,r9,r6 - stvx v1,r9,r3 - stvx v5,r9,r5 - stvx v2, 0,r4 - stvx v6, 0,r6 -.endif - blr -.endm - -// void pass(float *z, float *wre, int n) -.macro PASS interleave, suffix -fft_pass\suffix\()_altivec: - mtctr r5 - slwi r0,r5,4 - slwi r7,r5,6 // o2 - slwi r5,r5,5 // o1 - add r10,r5,r7 // o3 - add r0,r4,r0 // wim - addi r6,r5,16 // o1+16 - addi r8,r7,16 // o2+16 - addi r11,r10,16 // o3+16 -1: - lvx v8, 0,r4 // wre - lvx v10, 0,r0 // wim - sub r0,r0,r9 - lvx v9, 0,r0 - vperm v9,v9,v10,v19 // vcprm(s0,3,2,1) => wim[0 .. 
-3] - lvx v4,r3,r7 // r2 = z[o2] - lvx v5,r3,r8 // i2 = z[o2+16] - lvx v6,r3,r10 // r3 = z[o3] - lvx v7,r3,r11 // i3 = z[o3+16] - vmaddfp v10,v4,v8,v14 // r2*wre - vmaddfp v11,v5,v8,v14 // i2*wre - vmaddfp v12,v6,v8,v14 // r3*wre - vmaddfp v13,v7,v8,v14 // i3*wre - lvx v0, 0,r3 // r0 = z[0] - lvx v3,r3,r6 // i1 = z[o1+16] - vmaddfp v10,v5,v9,v10 // i2*wim - vnmsubfp v11,v4,v9,v11 // r2*wim - vnmsubfp v12,v7,v9,v12 // i3*wim - vmaddfp v13,v6,v9,v13 // r3*wim - lvx v1,r3,r9 // i0 = z[16] - lvx v2,r3,r5 // r1 = z[o1] - BF v12,v8,v12,v10 - BF v13,v9,v11,v13 - BF v0,v4,v0,v12 - BF v3,v7,v3,v8 -.if !\interleave - stvx v0, 0,r3 - stvx v4,r3,r7 - stvx v3,r3,r6 - stvx v7,r3,r11 -.endif - BF v1,v5,v1,v13 - BF v2,v6,v2,v9 -.if !\interleave - stvx v1,r3,r9 - stvx v2,r3,r5 - stvx v5,r3,r8 - stvx v6,r3,r10 -.else - vmrghw v8,v0,v1 - vmrglw v9,v0,v1 - stvx v8, 0,r3 - stvx v9,r3,r9 - vmrghw v8,v2,v3 - vmrglw v9,v2,v3 - stvx v8,r3,r5 - stvx v9,r3,r6 - vmrghw v8,v4,v5 - vmrglw v9,v4,v5 - stvx v8,r3,r7 - stvx v9,r3,r8 - vmrghw v8,v6,v7 - vmrglw v9,v6,v7 - stvx v8,r3,r10 - stvx v9,r3,r11 -.endif - addi r3,r3,32 - addi r4,r4,16 - bdnz 1b - sub r3,r3,r5 - blr -.endm - -#define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */ - -#define WORD_0 0x00,0x01,0x02,0x03 -#define WORD_1 0x04,0x05,0x06,0x07 -#define WORD_2 0x08,0x09,0x0a,0x0b -#define WORD_3 0x0c,0x0d,0x0e,0x0f -#define WORD_s0 0x10,0x11,0x12,0x13 -#define WORD_s1 0x14,0x15,0x16,0x17 -#define WORD_s2 0x18,0x19,0x1a,0x1b -#define WORD_s3 0x1c,0x1d,0x1e,0x1f - -#define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d - - .rodata - .align 4 -fft_data: - .float 0, 0, 0, 0 - .float 1, 0.92387953, M_SQRT1_2, 0.38268343 - .float 0, 0.38268343, M_SQRT1_2, 0.92387953 - .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2 - .float M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 - vcprm(s0,3,2,1) - vcprm(0,1,s2,s1) - vcprm(2,3,s0,s3) - vcprm(2,s3,3,s2) - vcprm(0,1,s0,s1) - vcprm(2,3,s2,s3) - vcprm(2,3,0,1) - vcprm(1,2,s3,s0) - vcprm(0,3,s2,s1) - vcprm(0,2,s1,s3) - vcprm(1,3,s0,s2) - -.macro lvm b, r, regs:vararg - lvx \r, 0, \b - addi \b, \b, 16 - .ifnb \regs - lvm \b, \regs - .endif -.endm - -.macro stvm b, r, regs:vararg - stvx \r, 0, \b - addi \b, \b, 16 - .ifnb \regs - stvm \b, \regs - .endif -.endm - -.macro fft_calc interleave -extfunc ff_fft_calc\interleave\()_altivec - mflr r0 - stp r0, 2*PS(r1) - stpu r1, -(160+16*PS)(r1) - get_got r11 - addi r6, r1, 16*PS - stvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 - mfvrsave r0 - stw r0, 15*PS(r1) - li r6, 0xfffffffc - mtvrsave r6 - - movrel r6, fft_data, r11 - lvm r6, v14, v15, v16, v17, v18, v19, v20, v21 - lvm r6, v22, v23, v24, v25, v26, v27, v28, v29 - - li r9, 16 - movrel r12, X(ff_cos_tabs), r11 - - movrel r6, fft_dispatch_tab\interleave\()_altivec, r11 - lwz r3, 0(r3) - subi r3, r3, 2 - slwi r3, r3, 2+ARCH_PPC64 - lpx r3, r3, r6 - mtctr r3 - mr r3, r4 - bctrl - - addi r6, r1, 16*PS - lvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 - lwz r6, 15*PS(r1) - mtvrsave r6 - lp r1, 0(r1) - lp r0, 2*PS(r1) - mtlr r0 - blr -.endm - -.macro DECL_FFT suffix, bits, n, n2, n4 -fft\n\suffix\()_altivec: - mflr r0 - stp r0,PS*(\bits-3)(r1) - bl fft\n2\()_altivec - addi2 r3,\n*4 - bl fft\n4\()_altivec - addi2 r3,\n*2 - bl fft\n4\()_altivec - addi2 r3,\n*-6 - lp r0,PS*(\bits-3)(r1) - lp r4,\bits*PS(r12) - mtlr r0 - li r5,\n/16 - b fft_pass\suffix\()_altivec -.endm - -.macro DECL_FFTS interleave, suffix - .text - def_fft4 \suffix - def_fft8 \suffix - def_fft16 \suffix - PASS \interleave, \suffix - 
DECL_FFT \suffix, 5, 32, 16, 8 - DECL_FFT \suffix, 6, 64, 32, 16 - DECL_FFT \suffix, 7, 128, 64, 32 - DECL_FFT \suffix, 8, 256, 128, 64 - DECL_FFT \suffix, 9, 512, 256, 128 - DECL_FFT \suffix,10, 1024, 512, 256 - DECL_FFT \suffix,11, 2048, 1024, 512 - DECL_FFT \suffix,12, 4096, 2048, 1024 - DECL_FFT \suffix,13, 8192, 4096, 2048 - DECL_FFT \suffix,14,16384, 8192, 4096 - DECL_FFT \suffix,15,32768,16384, 8192 - DECL_FFT \suffix,16,65536,32768,16384 - - fft_calc \suffix - - .rodata - .align 3 -fft_dispatch_tab\suffix\()_altivec: - PTR fft4\suffix\()_altivec - PTR fft8\suffix\()_altivec - PTR fft16\suffix\()_altivec - PTR fft32\suffix\()_altivec - PTR fft64\suffix\()_altivec - PTR fft128\suffix\()_altivec - PTR fft256\suffix\()_altivec - PTR fft512\suffix\()_altivec - PTR fft1024\suffix\()_altivec - PTR fft2048\suffix\()_altivec - PTR fft4096\suffix\()_altivec - PTR fft8192\suffix\()_altivec - PTR fft16384\suffix\()_altivec - PTR fft32768\suffix\()_altivec - PTR fft65536\suffix\()_altivec -.endm - -DECL_FFTS 0 -DECL_FFTS 1, _interleave diff --git a/ffmpeg/libavcodec/ppc/fmtconvert_altivec.c b/ffmpeg/libavcodec/ppc/fmtconvert_altivec.c deleted file mode 100644 index cd32e39..0000000 --- a/ffmpeg/libavcodec/ppc/fmtconvert_altivec.c +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/mem.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/fmtconvert.h" -#include "dsputil_altivec.h" - -#if HAVE_ALTIVEC - -static void int32_to_float_fmul_scalar_altivec(float *dst, const int32_t *src, - float mul, int len) -{ - union { - vector float v; - float s[4]; - } mul_u; - int i; - vector float src1, src2, dst1, dst2, mul_v, zero; - - zero = (vector float)vec_splat_u32(0); - mul_u.s[0] = mul; - mul_v = vec_splat(mul_u.v, 0); - - for (i = 0; i < len; i += 8) { - src1 = vec_ctf(vec_ld(0, src+i), 0); - src2 = vec_ctf(vec_ld(16, src+i), 0); - dst1 = vec_madd(src1, mul_v, zero); - dst2 = vec_madd(src2, mul_v, zero); - vec_st(dst1, 0, dst+i); - vec_st(dst2, 16, dst+i); - } -} - - -static vector signed short float_to_int16_one_altivec(const float *src) -{ - vector float s0 = vec_ld(0, src); - vector float s1 = vec_ld(16, src); - vector signed int t0 = vec_cts(s0, 0); - vector signed int t1 = vec_cts(s1, 0); - return vec_packs(t0,t1); -} - -static void float_to_int16_altivec(int16_t *dst, const float *src, long len) -{ - int i; - vector signed short d0, d1, d; - vector unsigned char align; - if (((long)dst) & 15) { //FIXME - for (i = 0; i < len - 7; i += 8) { - d0 = vec_ld(0, dst+i); - d = float_to_int16_one_altivec(src + i); - d1 = vec_ld(15, dst+i); - d1 = vec_perm(d1, d0, vec_lvsl(0, dst + i)); - align = vec_lvsr(0, dst + i); - d0 = vec_perm(d1, d, align); - d1 = vec_perm(d, d1, align); - vec_st(d0, 0, dst + i); - vec_st(d1, 15, dst + i); - } - } else { - for (i = 0; i < len - 7; i += 8) { - d = float_to_int16_one_altivec(src + i); - vec_st(d, 0, dst + i); - } - } -} - -#define VSTE_INC(dst, v, elem, inc) do { \ - vector signed short s = vec_splat(v, elem); \ - vec_ste(s, 0, dst); \ - dst += inc; \ - } while (0) - -static void float_to_int16_stride_altivec(int16_t *dst, const float *src, - long len, int stride) -{ - int i; - vector signed short d; - - for (i = 0; i < len - 7; i += 8) { - d = float_to_int16_one_altivec(src + i); - VSTE_INC(dst, d, 0, stride); - VSTE_INC(dst, d, 1, stride); - VSTE_INC(dst, d, 2, stride); - VSTE_INC(dst, d, 3, stride); - VSTE_INC(dst, d, 4, stride); - VSTE_INC(dst, d, 5, stride); - VSTE_INC(dst, d, 6, stride); - VSTE_INC(dst, d, 7, stride); - } -} - -static void float_to_int16_interleave_altivec(int16_t *dst, const float **src, - long len, int channels) -{ - int i; - vector signed short d0, d1, d2, c0, c1, t0, t1; - vector unsigned char align; - - if (channels == 1) - float_to_int16_altivec(dst, src[0], len); - else { - if (channels == 2) { - if (((long)dst) & 15) { - for (i = 0; i < len - 7; i += 8) { - d0 = vec_ld(0, dst + i); - t0 = float_to_int16_one_altivec(src[0] + i); - d1 = vec_ld(31, dst + i); - t1 = float_to_int16_one_altivec(src[1] + i); - c0 = vec_mergeh(t0, t1); - c1 = vec_mergel(t0, t1); - d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i)); - align = vec_lvsr(0, dst + i); - d0 = vec_perm(d2, c0, align); - d1 = vec_perm(c0, c1, align); - vec_st(d0, 0, dst + i); - d0 = vec_perm(c1, d2, align); - vec_st(d1, 15, dst + i); - vec_st(d0, 31, dst + i); - dst += 8; - } - } else { - for (i = 0; i < len - 7; i += 8) { - t0 = float_to_int16_one_altivec(src[0] + i); - t1 = float_to_int16_one_altivec(src[1] + i); - d0 = 
vec_mergeh(t0, t1); - d1 = vec_mergel(t0, t1); - vec_st(d0, 0, dst + i); - vec_st(d1, 16, dst + i); - dst += 8; - } - } - } else { - for (i = 0; i < channels; i++) - float_to_int16_stride_altivec(dst + i, src[i], len, channels); - } - } -} - -#endif /* HAVE_ALTIVEC */ - -av_cold void ff_fmt_convert_init_ppc(FmtConvertContext *c, - AVCodecContext *avctx) -{ -#if HAVE_ALTIVEC - if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) - return; - - c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec; - if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { - c->float_to_int16 = float_to_int16_altivec; - c->float_to_int16_interleave = float_to_int16_interleave_altivec; - } -#endif /* HAVE_ALTIVEC */ -} diff --git a/ffmpeg/libavcodec/ppc/gmc_altivec.c b/ffmpeg/libavcodec/ppc/gmc_altivec.c deleted file mode 100644 index 45d850a..0000000 --- a/ffmpeg/libavcodec/ppc/gmc_altivec.c +++ /dev/null @@ -1,130 +0,0 @@ -/* - * GMC (Global Motion Compensation) - * AltiVec-enabled - * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/mem.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "dsputil_altivec.h" - -/* - altivec-enhanced gmc1. ATM this code assume stride is a multiple of 8, - to preserve proper dst alignment. -*/ -void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder) -{ - const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder; - const DECLARE_ALIGNED(16, unsigned short, ABCD)[8] = - { - (16-x16)*(16-y16), /* A */ - ( x16)*(16-y16), /* B */ - (16-x16)*( y16), /* C */ - ( x16)*( y16), /* D */ - 0, 0, 0, 0 /* padding */ - }; - register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); - register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8); - register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD; - register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD; - int i; - unsigned long dst_odd = (unsigned long)dst & 0x0000000F; - unsigned long src_really_odd = (unsigned long)src & 0x0000000F; - - tempA = vec_ld(0, (const unsigned short*)ABCD); - Av = vec_splat(tempA, 0); - Bv = vec_splat(tempA, 1); - Cv = vec_splat(tempA, 2); - Dv = vec_splat(tempA, 3); - - rounderV = vec_splat((vec_u16)vec_lde(0, &rounder_a), 0); - - // we'll be able to pick-up our 9 char elements - // at src from those 32 bytes - // we load the first batch here, as inside the loop - // we can re-use 'src+stride' from one iteration - // as the 'src' of the next. 
- src_0 = vec_ld(0, src); - src_1 = vec_ld(16, src); - srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src)); - - if (src_really_odd != 0x0000000F) { - // if (src & 0xF) == 0xF, then (src+1) is properly aligned - // on the second vector. - srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src)); - } else { - srcvB = src_1; - } - srcvA = vec_mergeh(vczero, srcvA); - srcvB = vec_mergeh(vczero, srcvB); - - for(i=0; i<h; i++) { - dst_odd = (unsigned long)dst & 0x0000000F; - src_really_odd = (((unsigned long)src) + stride) & 0x0000000F; - - dstv = vec_ld(0, dst); - - // we we'll be able to pick-up our 9 char elements - // at src + stride from those 32 bytes - // then reuse the resulting 2 vectors srvcC and srcvD - // as the next srcvA and srcvB - src_0 = vec_ld(stride + 0, src); - src_1 = vec_ld(stride + 16, src); - srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src)); - - if (src_really_odd != 0x0000000F) { - // if (src & 0xF) == 0xF, then (src+1) is properly aligned - // on the second vector. - srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src)); - } else { - srcvD = src_1; - } - - srcvC = vec_mergeh(vczero, srcvC); - srcvD = vec_mergeh(vczero, srcvD); - - - // OK, now we (finally) do the math :-) - // those four instructions replaces 32 int muls & 32 int adds. - // isn't AltiVec nice ? - tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV); - tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA); - tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB); - tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC); - - srcvA = srcvC; - srcvB = srcvD; - - tempD = vec_sr(tempD, vcsr8); - - dstv2 = vec_pack(tempD, (vector unsigned short)vczero); - - if (dst_odd) { - dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1)); - } else { - dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3)); - } - - vec_st(dstv2, 0, dst); - - dst += stride; - src += stride; - } -} diff --git a/ffmpeg/libavcodec/ppc/h264chroma_init.c b/ffmpeg/libavcodec/ppc/h264chroma_init.c deleted file mode 100644 index 921f2de..0000000 --- a/ffmpeg/libavcodec/ppc/h264chroma_init.c +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/intreadwrite.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/h264chroma.h" -#include "dsputil_altivec.h" - -#if HAVE_ALTIVEC -#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s -#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s) - -#define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC -#define PREFIX_h264_chroma_mc8_altivec put_h264_chroma_mc8_altivec -#define PREFIX_h264_chroma_mc8_num altivec_put_h264_chroma_mc8_num -#include "h264chroma_template.c" -#undef OP_U8_ALTIVEC -#undef PREFIX_h264_chroma_mc8_altivec -#undef PREFIX_h264_chroma_mc8_num - -#define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC -#define PREFIX_h264_chroma_mc8_altivec avg_h264_chroma_mc8_altivec -#define PREFIX_h264_chroma_mc8_num altivec_avg_h264_chroma_mc8_num -#include "h264chroma_template.c" -#undef OP_U8_ALTIVEC -#undef PREFIX_h264_chroma_mc8_altivec -#undef PREFIX_h264_chroma_mc8_num -#endif /* HAVE_ALTIVEC */ - -av_cold void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth) -{ -#if HAVE_ALTIVEC - const int high_bit_depth = bit_depth > 8; - - if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) - return; - - if (!high_bit_depth) { - c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec; - c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec; - } -#endif /* HAVE_ALTIVEC */ -} diff --git a/ffmpeg/libavcodec/ppc/h264chroma_template.c b/ffmpeg/libavcodec/ppc/h264chroma_template.c deleted file mode 100644 index 7436e11..0000000 --- a/ffmpeg/libavcodec/ppc/h264chroma_template.c +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/mem.h" - -/* this code assume that stride % 16 == 0 */ - -#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \ - vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\ - vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\ -\ - psum = vec_mladd(vA, vsrc0ssH, BIAS1);\ - psum = vec_mladd(vB, vsrc1ssH, psum);\ - psum = vec_mladd(vC, vsrc2ssH, psum);\ - psum = vec_mladd(vD, vsrc3ssH, psum);\ - psum = BIAS2(psum);\ - psum = vec_sr(psum, v6us);\ -\ - vdst = vec_ld(0, dst);\ - ppsum = (vec_u8)vec_pack(psum, psum);\ - vfdst = vec_perm(vdst, ppsum, fperm);\ -\ - OP_U8_ALTIVEC(fsum, vfdst, vdst);\ -\ - vec_st(fsum, 0, dst);\ -\ - vsrc0ssH = vsrc2ssH;\ - vsrc1ssH = vsrc3ssH;\ -\ - dst += stride;\ - src += stride; - -#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \ -\ - vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\ - vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\ -\ - psum = vec_mladd(vA, vsrc0ssH, v32ss);\ - psum = vec_mladd(vE, vsrc1ssH, psum);\ - psum = vec_sr(psum, v6us);\ -\ - vdst = vec_ld(0, dst);\ - ppsum = (vec_u8)vec_pack(psum, psum);\ - vfdst = vec_perm(vdst, ppsum, fperm);\ -\ - OP_U8_ALTIVEC(fsum, vfdst, vdst);\ -\ - vec_st(fsum, 0, dst);\ -\ - dst += stride;\ - src += stride; - -#define noop(a) a -#define add28(a) vec_add(v28ss, a) - -#ifdef PREFIX_h264_chroma_mc8_altivec -static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, - int stride, int h, int x, int y) { - DECLARE_ALIGNED(16, signed int, ABCD)[4] = - {((8 - x) * (8 - y)), - (( x) * (8 - y)), - ((8 - x) * ( y)), - (( x) * ( y))}; - register int i; - vec_u8 fperm; - const vec_s32 vABCD = vec_ld(0, ABCD); - const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); - const vec_s16 vB = vec_splat((vec_s16)vABCD, 3); - const vec_s16 vC = vec_splat((vec_s16)vABCD, 5); - const vec_s16 vD = vec_splat((vec_s16)vABCD, 7); - LOAD_ZERO; - const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); - const vec_u16 v6us = vec_splat_u16(6); - register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; - register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 
1 : 0; - - vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1; - vec_u8 vsrc0uc, vsrc1uc; - vec_s16 vsrc0ssH, vsrc1ssH; - vec_u8 vsrcCuc, vsrc2uc, vsrc3uc; - vec_s16 vsrc2ssH, vsrc3ssH, psum; - vec_u8 vdst, ppsum, vfdst, fsum; - - if (((unsigned long)dst) % 16 == 0) { - fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, - 0x14, 0x15, 0x16, 0x17, - 0x08, 0x09, 0x0A, 0x0B, - 0x0C, 0x0D, 0x0E, 0x0F}; - } else { - fperm = (vec_u8){0x00, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07, - 0x18, 0x19, 0x1A, 0x1B, - 0x1C, 0x1D, 0x1E, 0x1F}; - } - - vsrcAuc = vec_ld(0, src); - - if (loadSecond) - vsrcBuc = vec_ld(16, src); - vsrcperm0 = vec_lvsl(0, src); - vsrcperm1 = vec_lvsl(1, src); - - vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); - if (reallyBadAlign) - vsrc1uc = vsrcBuc; - else - vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); - - vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc); - vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc); - - if (ABCD[3]) { - if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE(v32ss, noop) - } - } else { - vec_u8 vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrcDuc = vec_ld(stride + 16, src); - vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - if (reallyBadAlign) - vsrc3uc = vsrcDuc; - else - vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE(v32ss, noop) - } - } - } else { - const vec_s16 vE = vec_add(vB, vC); - if (ABCD[2]) { // x == 0 B == 0 - if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - CHROMA_MC8_ALTIVEC_CORE_SIMPLE - - vsrc0uc = vsrc1uc; - } - } else { - vec_u8 vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrcDuc = vec_ld(stride + 15, src); - vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - CHROMA_MC8_ALTIVEC_CORE_SIMPLE - - vsrc0uc = vsrc1uc; - } - } - } else { // y == 0 C == 0 - if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(0, src); - vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE_SIMPLE - } - } else { - vec_u8 vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(0, src); - vsrcDuc = vec_ld(15, src); - vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - if (reallyBadAlign) - vsrc1uc = vsrcDuc; - else - vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE_SIMPLE - } - } - } - } -} -#endif - -/* this code assume that stride % 16 == 0 */ -#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec -static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { - DECLARE_ALIGNED(16, signed int, ABCD)[4] = - {((8 - x) * (8 - y)), - (( x) * (8 - y)), - ((8 - x) * ( y)), - (( x) * ( y))}; - register int i; - vec_u8 fperm; - const vec_s32 vABCD = vec_ld(0, ABCD); - const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); - const vec_s16 vB = vec_splat((vec_s16)vABCD, 3); - const vec_s16 vC = vec_splat((vec_s16)vABCD, 5); - const vec_s16 vD = vec_splat((vec_s16)vABCD, 7); - LOAD_ZERO; - const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); - const vec_u16 v6us = vec_splat_u16(6); - register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 
0 : 1; - register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; - - vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1; - vec_u8 vsrc0uc, vsrc1uc; - vec_s16 vsrc0ssH, vsrc1ssH; - vec_u8 vsrcCuc, vsrc2uc, vsrc3uc; - vec_s16 vsrc2ssH, vsrc3ssH, psum; - vec_u8 vdst, ppsum, vfdst, fsum; - - if (((unsigned long)dst) % 16 == 0) { - fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, - 0x14, 0x15, 0x16, 0x17, - 0x08, 0x09, 0x0A, 0x0B, - 0x0C, 0x0D, 0x0E, 0x0F}; - } else { - fperm = (vec_u8){0x00, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07, - 0x18, 0x19, 0x1A, 0x1B, - 0x1C, 0x1D, 0x1E, 0x1F}; - } - - vsrcAuc = vec_ld(0, src); - - if (loadSecond) - vsrcBuc = vec_ld(16, src); - vsrcperm0 = vec_lvsl(0, src); - vsrcperm1 = vec_lvsl(1, src); - - vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); - if (reallyBadAlign) - vsrc1uc = vsrcBuc; - else - vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); - - vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc); - vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc); - - if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { - - - vsrcCuc = vec_ld(stride + 0, src); - - vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28) - } - } else { - vec_u8 vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrcDuc = vec_ld(stride + 16, src); - - vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - if (reallyBadAlign) - vsrc3uc = vsrcDuc; - else - vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28) - } - } -} -#endif - -#undef noop -#undef add28 -#undef CHROMA_MC8_ALTIVEC_CORE diff --git a/ffmpeg/libavcodec/ppc/hpeldsp_altivec.c b/ffmpeg/libavcodec/ppc/hpeldsp_altivec.c deleted file mode 100644 index 345ec39..0000000 --- a/ffmpeg/libavcodec/ppc/hpeldsp_altivec.c +++ /dev/null @@ -1,467 +0,0 @@ -/* - * Copyright (c) 2002 Brian Foley - * Copyright (c) 2002 Dieter Shirley - * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" - -#if HAVE_ALTIVEC_H -#include <altivec.h> -#endif - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/hpeldsp.h" -#include "dsputil_altivec.h" - -#if HAVE_ALTIVEC -/* next one assumes that ((line_size % 16) == 0) */ -void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - register vector unsigned char pixelsv1, pixelsv2; - register vector unsigned char pixelsv1B, pixelsv2B; - register vector unsigned char pixelsv1C, pixelsv2C; - register vector unsigned char pixelsv1D, pixelsv2D; - - register vector unsigned char perm = vec_lvsl(0, pixels); - int i; - register ptrdiff_t line_size_2 = line_size << 1; - register ptrdiff_t line_size_3 = line_size + line_size_2; - register ptrdiff_t line_size_4 = line_size << 2; - -// hand-unrolling the loop by 4 gains about 15% -// mininum execution time goes from 74 to 60 cycles -// it's faster than -funroll-loops, but using -// -funroll-loops w/ this is bad - 74 cycles again. -// all this is on a 7450, tuning for the 7450 - for (i = 0; i < h; i += 4) { - pixelsv1 = vec_ld( 0, pixels); - pixelsv2 = vec_ld(15, pixels); - pixelsv1B = vec_ld(line_size, pixels); - pixelsv2B = vec_ld(15 + line_size, pixels); - pixelsv1C = vec_ld(line_size_2, pixels); - pixelsv2C = vec_ld(15 + line_size_2, pixels); - pixelsv1D = vec_ld(line_size_3, pixels); - pixelsv2D = vec_ld(15 + line_size_3, pixels); - vec_st(vec_perm(pixelsv1, pixelsv2, perm), - 0, (unsigned char*)block); - vec_st(vec_perm(pixelsv1B, pixelsv2B, perm), - line_size, (unsigned char*)block); - vec_st(vec_perm(pixelsv1C, pixelsv2C, perm), - line_size_2, (unsigned char*)block); - vec_st(vec_perm(pixelsv1D, pixelsv2D, perm), - line_size_3, (unsigned char*)block); - pixels+=line_size_4; - block +=line_size_4; - } -} - -/* next one assumes that ((line_size % 16) == 0) */ -#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) -void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; - register vector unsigned char perm = vec_lvsl(0, pixels); - int i; - - for (i = 0; i < h; i++) { - pixelsv1 = vec_ld( 0, pixels); - pixelsv2 = vec_ld(16,pixels); - blockv = vec_ld(0, block); - pixelsv = vec_perm(pixelsv1, pixelsv2, perm); - blockv = vec_avg(blockv,pixelsv); - vec_st(blockv, 0, (unsigned char*)block); - pixels+=line_size; - block +=line_size; - } -} - -/* next one assumes that ((line_size % 8) == 0) */ -static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h) -{ - register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; - int i; - - for (i = 0; i < h; i++) { - /* block is 8 bytes-aligned, so we're either in the - left block (16 bytes-aligned) or in the right block (not) */ - int rightside = ((unsigned long)block & 0x0000000F); - - blockv = vec_ld(0, block); - pixelsv1 = vec_ld( 0, pixels); - pixelsv2 = vec_ld(16, pixels); - pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); - - if (rightside) { - pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); - } else { - pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); - } - - 
blockv = vec_avg(blockv, pixelsv); - - vec_st(blockv, 0, block); - - pixels += line_size; - block += line_size; - } -} - -/* next one assumes that ((line_size % 8) == 0) */ -static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - register int i; - register vector unsigned char pixelsv1, pixelsv2, pixelsavg; - register vector unsigned char blockv, temp1, temp2; - register vector unsigned short pixelssum1, pixelssum2, temp3; - register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); - register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vctwo); - - for (i = 0; i < h ; i++) { - int rightside = ((unsigned long)block & 0x0000000F); - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - pixelssum1 = vec_add(pixelssum2, vctwo); - pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); - - if (rightside) { - blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); - } else { - blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); - } - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } -} - -/* next one assumes that ((line_size % 8) == 0) */ -static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - register int i; - register vector unsigned char pixelsv1, pixelsv2, pixelsavg; - register vector unsigned char blockv, temp1, temp2; - register vector unsigned short pixelssum1, pixelssum2, temp3; - register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); - register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); - register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vcone); - - for (i = 0; i < h ; i++) { - int rightside = ((unsigned long)block & 0x0000000F); - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - pixelssum1 = vec_add(pixelssum2, vcone); - pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); - - if (rightside) { - blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); - } else { - blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); - } - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } -} - -/* next one assumes that ((line_size % 16) == 0) */ -static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h) -{ - register int i; - register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; - register vector unsigned char blockv, temp1, temp2; - register vector unsigned short temp3, temp4, - pixelssum1, pixelssum2, pixelssum3, pixelssum4; - register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); - register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum3 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum3 = vec_add(pixelssum3, vctwo); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vctwo); - - for (i = 0; i < h ; i++) { - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - - pixelssum4 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp4 = vec_add(pixelssum3, pixelssum4); - temp4 = vec_sra(temp4, vctwo); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - - pixelssum3 = vec_add(pixelssum4, vctwo); - pixelssum1 = vec_add(pixelssum2, vctwo); - - blockv = vec_packsu(temp3, temp4); - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } -} - -/* next one assumes that ((line_size % 16) == 0) */ -static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h) -{ - register int i; - register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; - register vector 
unsigned char blockv, temp1, temp2; - register vector unsigned short temp3, temp4, - pixelssum1, pixelssum2, pixelssum3, pixelssum4; - register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); - register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); - register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum3 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum3 = vec_add(pixelssum3, vcone); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vcone); - - for (i = 0; i < h ; i++) { - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - - pixelssum4 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp4 = vec_add(pixelssum3, pixelssum4); - temp4 = vec_sra(temp4, vctwo); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - - pixelssum3 = vec_add(pixelssum4, vcone); - pixelssum1 = vec_add(pixelssum2, vcone); - - blockv = vec_packsu(temp3, temp4); - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } -} - -/* next one assumes that ((line_size % 8) == 0) */ -static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - register int i; - register vector unsigned char pixelsv1, pixelsv2, pixelsavg; - register vector unsigned char blockv, temp1, temp2, blocktemp; - register vector unsigned short pixelssum1, pixelssum2, temp3; - - register const vector unsigned char vczero = (const vector unsigned char) - vec_splat_u8(0); - register const vector unsigned short vctwo = (const vector unsigned short) - vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vctwo); - - for (i = 0; i < h ; i++) { - int rightside = ((unsigned long)block & 0x0000000F); - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, 
pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - pixelssum1 = vec_add(pixelssum2, vctwo); - pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); - - if (rightside) { - blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); - } else { - blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); - } - - blockv = vec_avg(blocktemp, blockv); - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } -} -#endif /* HAVE_ALTIVEC */ - -av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags) -{ -#if HAVE_ALTIVEC - if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) - return; - - c->avg_pixels_tab[0][0] = ff_avg_pixels16_altivec; - c->avg_pixels_tab[1][0] = avg_pixels8_altivec; - c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec; - - c->put_pixels_tab[0][0] = ff_put_pixels16_altivec; - c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec; - c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec; - - c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec; - c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec; - c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec; -#endif /* HAVE_ALTIVEC */ -} diff --git a/ffmpeg/libavcodec/ppc/idct_altivec.c b/ffmpeg/libavcodec/ppc/idct_altivec.c deleted file mode 100644 index c6f2cd8..0000000 --- a/ffmpeg/libavcodec/ppc/idct_altivec.c +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) 2001 Michel Lespinasse - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* - * NOTE: This code is based on GPL code from the libmpeg2 project. The - * author, Michel Lespinasses, has given explicit permission to release - * under LGPL as part of FFmpeg. - */ - -/* - * FFmpeg integration by Dieter Shirley - * - * This file is a direct copy of the AltiVec IDCT module from the libmpeg2 - * project. I've deleted all of the libmpeg2-specific code, renamed the - * functions and reordered the function parameters. The only change to the - * IDCT function itself was to factor out the partial transposition, and to - * perform a full transpose at the end of the function. 
- */ - - -#include <stdlib.h> /* malloc(), free() */ -#include <string.h> -#include "config.h" -#if HAVE_ALTIVEC_H -#include <altivec.h> -#endif -#include "libavutil/ppc/types_altivec.h" -#include "dsputil_altivec.h" - -#define IDCT_HALF \ - /* 1st stage */ \ - t1 = vec_mradds (a1, vx7, vx1 ); \ - t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \ - t7 = vec_mradds (a2, vx5, vx3); \ - t3 = vec_mradds (ma2, vx3, vx5); \ - \ - /* 2nd stage */ \ - t5 = vec_adds (vx0, vx4); \ - t0 = vec_subs (vx0, vx4); \ - t2 = vec_mradds (a0, vx6, vx2); \ - t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \ - t6 = vec_adds (t8, t3); \ - t3 = vec_subs (t8, t3); \ - t8 = vec_subs (t1, t7); \ - t1 = vec_adds (t1, t7); \ - \ - /* 3rd stage */ \ - t7 = vec_adds (t5, t2); \ - t2 = vec_subs (t5, t2); \ - t5 = vec_adds (t0, t4); \ - t0 = vec_subs (t0, t4); \ - t4 = vec_subs (t8, t3); \ - t3 = vec_adds (t8, t3); \ - \ - /* 4th stage */ \ - vy0 = vec_adds (t7, t1); \ - vy7 = vec_subs (t7, t1); \ - vy1 = vec_mradds (c4, t3, t5); \ - vy6 = vec_mradds (mc4, t3, t5); \ - vy2 = vec_mradds (c4, t4, t0); \ - vy5 = vec_mradds (mc4, t4, t0); \ - vy3 = vec_adds (t2, t6); \ - vy4 = vec_subs (t2, t6); - - -#define IDCT \ - vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \ - vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \ - vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias; \ - vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \ - vec_u16 shift; \ - \ - c4 = vec_splat (constants[0], 0); \ - a0 = vec_splat (constants[0], 1); \ - a1 = vec_splat (constants[0], 2); \ - a2 = vec_splat (constants[0], 3); \ - mc4 = vec_splat (constants[0], 4); \ - ma2 = vec_splat (constants[0], 5); \ - bias = (vec_s16)vec_splat ((vec_s32)constants[0], 3); \ - \ - zero = vec_splat_s16 (0); \ - shift = vec_splat_u16 (4); \ - \ - vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \ - vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \ - vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \ - vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \ - vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \ - vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \ - vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \ - vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \ - \ - IDCT_HALF \ - \ - vx0 = vec_mergeh (vy0, vy4); \ - vx1 = vec_mergel (vy0, vy4); \ - vx2 = vec_mergeh (vy1, vy5); \ - vx3 = vec_mergel (vy1, vy5); \ - vx4 = vec_mergeh (vy2, vy6); \ - vx5 = vec_mergel (vy2, vy6); \ - vx6 = vec_mergeh (vy3, vy7); \ - vx7 = vec_mergel (vy3, vy7); \ - \ - vy0 = vec_mergeh (vx0, vx4); \ - vy1 = vec_mergel (vx0, vx4); \ - vy2 = vec_mergeh (vx1, vx5); \ - vy3 = vec_mergel (vx1, vx5); \ - vy4 = vec_mergeh (vx2, vx6); \ - vy5 = vec_mergel (vx2, vx6); \ - vy6 = vec_mergeh (vx3, vx7); \ - vy7 = vec_mergel (vx3, vx7); \ - \ - vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \ - vx1 = vec_mergel (vy0, vy4); \ - vx2 = vec_mergeh (vy1, vy5); \ - vx3 = vec_mergel (vy1, vy5); \ - vx4 = vec_mergeh (vy2, vy6); \ - vx5 = vec_mergel (vy2, vy6); \ - vx6 = vec_mergeh (vy3, vy7); \ - vx7 = vec_mergel (vy3, vy7); \ - \ - IDCT_HALF \ - \ - shift = vec_splat_u16 (6); \ - vx0 = vec_sra (vy0, shift); \ - vx1 = vec_sra (vy1, shift); \ - vx2 = vec_sra (vy2, shift); \ - vx3 = vec_sra (vy3, shift); \ - vx4 = vec_sra (vy4, shift); \ - vx5 = vec_sra (vy5, shift); \ - vx6 = vec_sra (vy6, shift); \ - vx7 = vec_sra (vy7, shift); - - -static const vec_s16 constants[5] = { - {23170, 13573, 6518, 21895, 
-23170, -21895, 32, 31}, - {16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725}, - {22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521}, - {21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692}, - {19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722} -}; - -void ff_idct_put_altivec(uint8_t* dest, int stride, int16_t *blk) -{ - vec_s16 *block = (vec_s16*)blk; - vec_u8 tmp; - - IDCT - -#define COPY(dest,src) \ - tmp = vec_packsu (src, src); \ - vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \ - vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest); - - COPY (dest, vx0) dest += stride; - COPY (dest, vx1) dest += stride; - COPY (dest, vx2) dest += stride; - COPY (dest, vx3) dest += stride; - COPY (dest, vx4) dest += stride; - COPY (dest, vx5) dest += stride; - COPY (dest, vx6) dest += stride; - COPY (dest, vx7) -} - -void ff_idct_add_altivec(uint8_t* dest, int stride, int16_t *blk) -{ - vec_s16 *block = (vec_s16*)blk; - vec_u8 tmp; - vec_s16 tmp2, tmp3; - vec_u8 perm0; - vec_u8 perm1; - vec_u8 p0, p1, p; - - IDCT - - p0 = vec_lvsl (0, dest); - p1 = vec_lvsl (stride, dest); - p = vec_splat_u8 (-1); - perm0 = vec_mergeh (p, p0); - perm1 = vec_mergeh (p, p1); - -#define ADD(dest,src,perm) \ - /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \ - tmp = vec_ld (0, dest); \ - tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm); \ - tmp3 = vec_adds (tmp2, src); \ - tmp = vec_packsu (tmp3, tmp3); \ - vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \ - vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest); - - ADD (dest, vx0, perm0) dest += stride; - ADD (dest, vx1, perm1) dest += stride; - ADD (dest, vx2, perm0) dest += stride; - ADD (dest, vx3, perm1) dest += stride; - ADD (dest, vx4, perm0) dest += stride; - ADD (dest, vx5, perm1) dest += stride; - ADD (dest, vx6, perm0) dest += stride; - ADD (dest, vx7, perm1) -} diff --git a/ffmpeg/libavcodec/ppc/int_altivec.c b/ffmpeg/libavcodec/ppc/int_altivec.c deleted file mode 100644 index d4e0c85..0000000 --- a/ffmpeg/libavcodec/ppc/int_altivec.c +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - ** @file - ** integer misc ops. 
- **/ - -#include "config.h" -#if HAVE_ALTIVEC_H -#include <altivec.h> -#endif - -#include "libavutil/attributes.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavcodec/dsputil.h" - -#include "dsputil_altivec.h" - -static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2, - int size) { - int i, size16; - vector signed char vpix1; - vector signed short vpix2, vdiff, vpix1l,vpix1h; - union { vector signed int vscore; - int32_t score[4]; - } u; - u.vscore = vec_splat_s32(0); -// -//XXX lazy way, fix it later - -#define vec_unaligned_load(b) \ - vec_perm(vec_ld(0,b),vec_ld(15,b),vec_lvsl(0, b)); - - size16 = size >> 4; - while(size16) { -// score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]); - //load pix1 and the first batch of pix2 - - vpix1 = vec_unaligned_load(pix1); - vpix2 = vec_unaligned_load(pix2); - pix2 += 8; - //unpack - vpix1h = vec_unpackh(vpix1); - vdiff = vec_sub(vpix1h, vpix2); - vpix1l = vec_unpackl(vpix1); - // load another batch from pix2 - vpix2 = vec_unaligned_load(pix2); - u.vscore = vec_msum(vdiff, vdiff, u.vscore); - vdiff = vec_sub(vpix1l, vpix2); - u.vscore = vec_msum(vdiff, vdiff, u.vscore); - pix1 += 16; - pix2 += 8; - size16--; - } - u.vscore = vec_sums(u.vscore, vec_splat_s32(0)); - - size %= 16; - for (i = 0; i < size; i++) { - u.score[3] += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]); - } - return u.score[3]; -} - -static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2, - int order) -{ - int i; - LOAD_ZERO; - register vec_s16 vec1; - register vec_s32 res = vec_splat_s32(0), t; - int32_t ires; - - for(i = 0; i < order; i += 8){ - vec1 = vec_unaligned_load(v1); - t = vec_msum(vec1, vec_ld(0, v2), zero_s32v); - res = vec_sums(t, res); - v1 += 8; - v2 += 8; - } - res = vec_splat(res, 3); - vec_ste(res, 0, &ires); - return ires; -} - -static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul) -{ - LOAD_ZERO; - vec_s16 *pv1 = (vec_s16*)v1; - register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul}; - register vec_s16 t0, t1, i0, i1, i4; - register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3); - register vec_s32 res = zero_s32v; - register vec_u8 align = vec_lvsl(0, v2); - int32_t ires; - order >>= 4; - do { - i1 = vec_ld(16, v2); - t0 = vec_perm(i2, i1, align); - i2 = vec_ld(32, v2); - t1 = vec_perm(i1, i2, align); - i0 = pv1[0]; - i1 = pv1[1]; - res = vec_msum(t0, i0, res); - res = vec_msum(t1, i1, res); - i4 = vec_ld(16, v3); - t0 = vec_perm(i3, i4, align); - i3 = vec_ld(32, v3); - t1 = vec_perm(i4, i3, align); - pv1[0] = vec_mladd(t0, muls, i0); - pv1[1] = vec_mladd(t1, muls, i1); - pv1 += 2; - v2 += 16; - v3 += 16; - } while(--order); - res = vec_splat(vec_sums(res, zero_s32v), 3); - vec_ste(res, 0, &ires); - return ires; -} - -av_cold void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx) -{ - c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec; - c->scalarproduct_int16 = scalarproduct_int16_altivec; - c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec; -} diff --git a/ffmpeg/libavcodec/ppc/mathops.h b/ffmpeg/libavcodec/ppc/mathops.h deleted file mode 100644 index dbd714f..0000000 --- a/ffmpeg/libavcodec/ppc/mathops.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * simple math operations - * Copyright (c) 2001, 2002 Fabrice Bellard - * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_PPC_MATHOPS_H -#define AVCODEC_PPC_MATHOPS_H - -#include <stdint.h> -#include "config.h" -#include "libavutil/common.h" - -#if HAVE_PPC4XX -/* signed 16x16 -> 32 multiply add accumulate */ -#define MAC16(rt, ra, rb) \ - __asm__ ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); - -/* signed 16x16 -> 32 multiply */ -#define MUL16(ra, rb) \ - ({ int __rt; \ - __asm__ ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \ - __rt; }) -#endif - -#define MULH MULH -static inline av_const int MULH(int a, int b){ - int r; - __asm__ ("mulhw %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); - return r; -} - -#if !ARCH_PPC64 -static inline av_const int64_t MAC64(int64_t d, int a, int b) -{ - union { uint64_t x; unsigned hl[2]; } x = { d }; - int h, l; - __asm__ ("mullw %3, %4, %5 \n\t" - "mulhw %2, %4, %5 \n\t" - "addc %1, %1, %3 \n\t" - "adde %0, %0, %2 \n\t" - : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l) - : "r"(a), "r"(b)); - return x.x; -} -#define MAC64(d, a, b) ((d) = MAC64(d, a, b)) - -static inline av_const int64_t MLS64(int64_t d, int a, int b) -{ - union { uint64_t x; unsigned hl[2]; } x = { d }; - int h, l; - __asm__ ("mullw %3, %4, %5 \n\t" - "mulhw %2, %4, %5 \n\t" - "subfc %1, %3, %1 \n\t" - "subfe %0, %2, %0 \n\t" - : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l) - : "r"(a), "r"(b)); - return x.x; -} -#define MLS64(d, a, b) ((d) = MLS64(d, a, b)) -#endif - -#endif /* AVCODEC_PPC_MATHOPS_H */ diff --git a/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c b/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c deleted file mode 100644 index cedc1c8..0000000 --- a/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2002 Dieter Shirley - * - * dct_unquantize_h263_altivec: - * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdlib.h> -#include <stdio.h> - -#include "config.h" -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/mpegvideo.h" -#include "dsputil_altivec.h" - -#if HAVE_ALTIVEC - -/* AltiVec version of dct_unquantize_h263 - this code assumes `block' is 16 bytes-aligned */ -static void dct_unquantize_h263_altivec(MpegEncContext *s, - int16_t *block, int n, int qscale) -{ - int i, level, qmul, qadd; - int nCoeffs; - - assert(s->block_last_index[n]>=0); - - qadd = (qscale - 1) | 1; - qmul = qscale << 1; - - if (s->mb_intra) { - if (!s->h263_aic) { - if (n < 4) - block[0] = block[0] * s->y_dc_scale; - else - block[0] = block[0] * s->c_dc_scale; - }else - qadd = 0; - i = 1; - nCoeffs= 63; //does not always use zigzag table - } else { - i = 0; - nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; - } - - { - register const vector signed short vczero = (const vector signed short)vec_splat_s16(0); - DECLARE_ALIGNED(16, short, qmul8) = qmul; - DECLARE_ALIGNED(16, short, qadd8) = qadd; - register vector signed short blockv, qmulv, qaddv, nqaddv, temp1; - register vector bool short blockv_null, blockv_neg; - register short backup_0 = block[0]; - register int j = 0; - - qmulv = vec_splat((vec_s16)vec_lde(0, &qmul8), 0); - qaddv = vec_splat((vec_s16)vec_lde(0, &qadd8), 0); - nqaddv = vec_sub(vczero, qaddv); - - // vectorize all the 16 bytes-aligned blocks - // of 8 elements - for(; (j + 7) <= nCoeffs ; j+=8) { - blockv = vec_ld(j << 1, block); - blockv_neg = vec_cmplt(blockv, vczero); - blockv_null = vec_cmpeq(blockv, vczero); - // choose between +qadd or -qadd as the third operand - temp1 = vec_sel(qaddv, nqaddv, blockv_neg); - // multiply & add (block{i,i+7} * qmul [+-] qadd) - temp1 = vec_mladd(blockv, qmulv, temp1); - // put 0 where block[{i,i+7} used to have 0 - blockv = vec_sel(temp1, blockv, blockv_null); - vec_st(blockv, j << 1, block); - } - - // if nCoeffs isn't a multiple of 8, finish the job - // using good old scalar units. - // (we could do it using a truncated vector, - // but I'm not sure it's worth the hassle) - for(; j <= nCoeffs ; j++) { - level = block[j]; - if (level) { - if (level < 0) { - level = level * qmul - qadd; - } else { - level = level * qmul + qadd; - } - block[j] = level; - } - } - - if (i == 1) { - // cheat. this avoid special-casing the first iteration - block[0] = backup_0; - } - } -} - -#endif /* HAVE_ALTIVEC */ - -av_cold void ff_MPV_common_init_ppc(MpegEncContext *s) -{ -#if HAVE_ALTIVEC - if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) - return; - - if ((s->avctx->dct_algo == FF_DCT_AUTO) || - (s->avctx->dct_algo == FF_DCT_ALTIVEC)) { - s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec; - s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec; - } -#endif /* HAVE_ALTIVEC */ -} diff --git a/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c b/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c deleted file mode 100644 index 1b73dd0..0000000 --- a/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c +++ /dev/null @@ -1,354 +0,0 @@ -/* - * VC-1 and WMV3 decoder - DSP functions AltiVec-optimized - * Copyright (c) 2006 Konstantin Shishkov - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/attributes.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/vc1dsp.h" - -#if HAVE_ALTIVEC - -// main steps of 8x8 transform -#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \ -do { \ - t0 = vec_sl(vec_add(s0, s4), vec_2); \ - t0 = vec_add(vec_sl(t0, vec_1), t0); \ - t0 = vec_add(t0, vec_rnd); \ - t1 = vec_sl(vec_sub(s0, s4), vec_2); \ - t1 = vec_add(vec_sl(t1, vec_1), t1); \ - t1 = vec_add(t1, vec_rnd); \ - t2 = vec_add(vec_sl(s6, vec_2), vec_sl(s6, vec_1)); \ - t2 = vec_add(t2, vec_sl(s2, vec_4)); \ - t3 = vec_add(vec_sl(s2, vec_2), vec_sl(s2, vec_1)); \ - t3 = vec_sub(t3, vec_sl(s6, vec_4)); \ - t4 = vec_add(t0, t2); \ - t5 = vec_add(t1, t3); \ - t6 = vec_sub(t1, t3); \ - t7 = vec_sub(t0, t2); \ -\ - t0 = vec_sl(vec_add(s1, s3), vec_4); \ - t0 = vec_add(t0, vec_sl(s5, vec_3)); \ - t0 = vec_add(t0, vec_sl(s7, vec_2)); \ - t0 = vec_add(t0, vec_sub(s5, s3)); \ -\ - t1 = vec_sl(vec_sub(s1, s5), vec_4); \ - t1 = vec_sub(t1, vec_sl(s7, vec_3)); \ - t1 = vec_sub(t1, vec_sl(s3, vec_2)); \ - t1 = vec_sub(t1, vec_add(s1, s7)); \ -\ - t2 = vec_sl(vec_sub(s7, s3), vec_4); \ - t2 = vec_add(t2, vec_sl(s1, vec_3)); \ - t2 = vec_add(t2, vec_sl(s5, vec_2)); \ - t2 = vec_add(t2, vec_sub(s1, s7)); \ -\ - t3 = vec_sl(vec_sub(s5, s7), vec_4); \ - t3 = vec_sub(t3, vec_sl(s3, vec_3)); \ - t3 = vec_add(t3, vec_sl(s1, vec_2)); \ - t3 = vec_sub(t3, vec_add(s3, s5)); \ -\ - s0 = vec_add(t4, t0); \ - s1 = vec_add(t5, t1); \ - s2 = vec_add(t6, t2); \ - s3 = vec_add(t7, t3); \ - s4 = vec_sub(t7, t3); \ - s5 = vec_sub(t6, t2); \ - s6 = vec_sub(t5, t1); \ - s7 = vec_sub(t4, t0); \ -}while(0) - -#define SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7) \ -do { \ - s0 = vec_sra(s0, vec_3); \ - s1 = vec_sra(s1, vec_3); \ - s2 = vec_sra(s2, vec_3); \ - s3 = vec_sra(s3, vec_3); \ - s4 = vec_sra(s4, vec_3); \ - s5 = vec_sra(s5, vec_3); \ - s6 = vec_sra(s6, vec_3); \ - s7 = vec_sra(s7, vec_3); \ -}while(0) - -#define SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7) \ -do { \ - s0 = vec_sra(s0, vec_7); \ - s1 = vec_sra(s1, vec_7); \ - s2 = vec_sra(s2, vec_7); \ - s3 = vec_sra(s3, vec_7); \ - s4 = vec_sra(vec_add(s4, vec_1s), vec_7); \ - s5 = vec_sra(vec_add(s5, vec_1s), vec_7); \ - s6 = vec_sra(vec_add(s6, vec_1s), vec_7); \ - s7 = vec_sra(vec_add(s7, vec_1s), vec_7); \ -}while(0) - -/* main steps of 4x4 transform */ -#define STEP4(s0, s1, s2, s3, vec_rnd) \ -do { \ - t1 = vec_add(vec_sl(s0, vec_4), s0); \ - t1 = vec_add(t1, vec_rnd); \ - t2 = vec_add(vec_sl(s2, vec_4), s2); \ - t0 = vec_add(t1, t2); \ - t1 = vec_sub(t1, t2); \ - t3 = vec_sl(vec_sub(s3, s1), vec_1); \ - t3 = vec_add(t3, vec_sl(t3, vec_2)); \ - t2 = vec_add(t3, vec_sl(s1, vec_5)); \ - t3 = vec_add(t3, vec_sl(s3, vec_3)); \ - t3 = 
vec_add(t3, vec_sl(s3, vec_2)); \ - s0 = vec_add(t0, t2); \ - s1 = vec_sub(t1, t3); \ - s2 = vec_add(t1, t3); \ - s3 = vec_sub(t0, t2); \ -}while (0) - -#define SHIFT_HOR4(s0, s1, s2, s3) \ - s0 = vec_sra(s0, vec_3); \ - s1 = vec_sra(s1, vec_3); \ - s2 = vec_sra(s2, vec_3); \ - s3 = vec_sra(s3, vec_3); - -#define SHIFT_VERT4(s0, s1, s2, s3) \ - s0 = vec_sra(s0, vec_7); \ - s1 = vec_sra(s1, vec_7); \ - s2 = vec_sra(s2, vec_7); \ - s3 = vec_sra(s3, vec_7); - -/** Do inverse transform on 8x8 block -*/ -static void vc1_inv_trans_8x8_altivec(int16_t block[64]) -{ - vector signed short src0, src1, src2, src3, src4, src5, src6, src7; - vector signed int s0, s1, s2, s3, s4, s5, s6, s7; - vector signed int s8, s9, sA, sB, sC, sD, sE, sF; - vector signed int t0, t1, t2, t3, t4, t5, t6, t7; - const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4)); - const vector unsigned int vec_7 = vec_splat_u32(7); - const vector unsigned int vec_4 = vec_splat_u32(4); - const vector signed int vec_4s = vec_splat_s32(4); - const vector unsigned int vec_3 = vec_splat_u32(3); - const vector unsigned int vec_2 = vec_splat_u32(2); - const vector signed int vec_1s = vec_splat_s32(1); - const vector unsigned int vec_1 = vec_splat_u32(1); - - src0 = vec_ld( 0, block); - src1 = vec_ld( 16, block); - src2 = vec_ld( 32, block); - src3 = vec_ld( 48, block); - src4 = vec_ld( 64, block); - src5 = vec_ld( 80, block); - src6 = vec_ld( 96, block); - src7 = vec_ld(112, block); - - s0 = vec_unpackl(src0); - s1 = vec_unpackl(src1); - s2 = vec_unpackl(src2); - s3 = vec_unpackl(src3); - s4 = vec_unpackl(src4); - s5 = vec_unpackl(src5); - s6 = vec_unpackl(src6); - s7 = vec_unpackl(src7); - s8 = vec_unpackh(src0); - s9 = vec_unpackh(src1); - sA = vec_unpackh(src2); - sB = vec_unpackh(src3); - sC = vec_unpackh(src4); - sD = vec_unpackh(src5); - sE = vec_unpackh(src6); - sF = vec_unpackh(src7); - STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s); - SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7); - STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s); - SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF); - src0 = vec_pack(s8, s0); - src1 = vec_pack(s9, s1); - src2 = vec_pack(sA, s2); - src3 = vec_pack(sB, s3); - src4 = vec_pack(sC, s4); - src5 = vec_pack(sD, s5); - src6 = vec_pack(sE, s6); - src7 = vec_pack(sF, s7); - TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7); - - s0 = vec_unpackl(src0); - s1 = vec_unpackl(src1); - s2 = vec_unpackl(src2); - s3 = vec_unpackl(src3); - s4 = vec_unpackl(src4); - s5 = vec_unpackl(src5); - s6 = vec_unpackl(src6); - s7 = vec_unpackl(src7); - s8 = vec_unpackh(src0); - s9 = vec_unpackh(src1); - sA = vec_unpackh(src2); - sB = vec_unpackh(src3); - sC = vec_unpackh(src4); - sD = vec_unpackh(src5); - sE = vec_unpackh(src6); - sF = vec_unpackh(src7); - STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_64); - SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7); - STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_64); - SHIFT_VERT8(s8, s9, sA, sB, sC, sD, sE, sF); - src0 = vec_pack(s8, s0); - src1 = vec_pack(s9, s1); - src2 = vec_pack(sA, s2); - src3 = vec_pack(sB, s3); - src4 = vec_pack(sC, s4); - src5 = vec_pack(sD, s5); - src6 = vec_pack(sE, s6); - src7 = vec_pack(sF, s7); - - vec_st(src0, 0, block); - vec_st(src1, 16, block); - vec_st(src2, 32, block); - vec_st(src3, 48, block); - vec_st(src4, 64, block); - vec_st(src5, 80, block); - vec_st(src6, 96, block); - vec_st(src7,112, block); -} - -/** Do inverse transform on 8x4 part of block -*/ -static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, int16_t *block) -{ - 
vector signed short src0, src1, src2, src3, src4, src5, src6, src7; - vector signed int s0, s1, s2, s3, s4, s5, s6, s7; - vector signed int s8, s9, sA, sB, sC, sD, sE, sF; - vector signed int t0, t1, t2, t3, t4, t5, t6, t7; - const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4)); - const vector unsigned int vec_7 = vec_splat_u32(7); - const vector unsigned int vec_5 = vec_splat_u32(5); - const vector unsigned int vec_4 = vec_splat_u32(4); - const vector signed int vec_4s = vec_splat_s32(4); - const vector unsigned int vec_3 = vec_splat_u32(3); - const vector unsigned int vec_2 = vec_splat_u32(2); - const vector unsigned int vec_1 = vec_splat_u32(1); - vector unsigned char tmp; - vector signed short tmp2, tmp3; - vector unsigned char perm0, perm1, p0, p1, p; - - src0 = vec_ld( 0, block); - src1 = vec_ld( 16, block); - src2 = vec_ld( 32, block); - src3 = vec_ld( 48, block); - src4 = vec_ld( 64, block); - src5 = vec_ld( 80, block); - src6 = vec_ld( 96, block); - src7 = vec_ld(112, block); - - TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7); - s0 = vec_unpackl(src0); - s1 = vec_unpackl(src1); - s2 = vec_unpackl(src2); - s3 = vec_unpackl(src3); - s4 = vec_unpackl(src4); - s5 = vec_unpackl(src5); - s6 = vec_unpackl(src6); - s7 = vec_unpackl(src7); - s8 = vec_unpackh(src0); - s9 = vec_unpackh(src1); - sA = vec_unpackh(src2); - sB = vec_unpackh(src3); - sC = vec_unpackh(src4); - sD = vec_unpackh(src5); - sE = vec_unpackh(src6); - sF = vec_unpackh(src7); - STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s); - SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7); - STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s); - SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF); - src0 = vec_pack(s8, s0); - src1 = vec_pack(s9, s1); - src2 = vec_pack(sA, s2); - src3 = vec_pack(sB, s3); - src4 = vec_pack(sC, s4); - src5 = vec_pack(sD, s5); - src6 = vec_pack(sE, s6); - src7 = vec_pack(sF, s7); - TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7); - - s0 = vec_unpackh(src0); - s1 = vec_unpackh(src1); - s2 = vec_unpackh(src2); - s3 = vec_unpackh(src3); - s8 = vec_unpackl(src0); - s9 = vec_unpackl(src1); - sA = vec_unpackl(src2); - sB = vec_unpackl(src3); - STEP4(s0, s1, s2, s3, vec_64); - SHIFT_VERT4(s0, s1, s2, s3); - STEP4(s8, s9, sA, sB, vec_64); - SHIFT_VERT4(s8, s9, sA, sB); - src0 = vec_pack(s0, s8); - src1 = vec_pack(s1, s9); - src2 = vec_pack(s2, sA); - src3 = vec_pack(s3, sB); - - p0 = vec_lvsl (0, dest); - p1 = vec_lvsl (stride, dest); - p = vec_splat_u8 (-1); - perm0 = vec_mergeh (p, p0); - perm1 = vec_mergeh (p, p1); - -#define ADD(dest,src,perm) \ - /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \ - tmp = vec_ld (0, dest); \ - tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), perm); \ - tmp3 = vec_adds (tmp2, src); \ - tmp = vec_packsu (tmp3, tmp3); \ - vec_ste ((vector unsigned int)tmp, 0, (unsigned int *)dest); \ - vec_ste ((vector unsigned int)tmp, 4, (unsigned int *)dest); - - ADD (dest, src0, perm0) dest += stride; - ADD (dest, src1, perm1) dest += stride; - ADD (dest, src2, perm0) dest += stride; - ADD (dest, src3, perm1) -} - -#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s -#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s) - -#define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC -#define PREFIX_no_rnd_vc1_chroma_mc8_altivec put_no_rnd_vc1_chroma_mc8_altivec -#include "h264chroma_template.c" -#undef OP_U8_ALTIVEC -#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec - -#define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC -#define PREFIX_no_rnd_vc1_chroma_mc8_altivec avg_no_rnd_vc1_chroma_mc8_altivec 
-#include "h264chroma_template.c" -#undef OP_U8_ALTIVEC -#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec - -#endif /* HAVE_ALTIVEC */ - -av_cold void ff_vc1dsp_init_ppc(VC1DSPContext *dsp) -{ -#if HAVE_ALTIVEC - if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) - return; - - dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec; - dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec; - dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_altivec; - dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_altivec; -#endif /* HAVE_ALTIVEC */ -} diff --git a/ffmpeg/libavcodec/ppc/videodsp_ppc.c b/ffmpeg/libavcodec/ppc/videodsp_ppc.c deleted file mode 100644 index 9157022..0000000 --- a/ffmpeg/libavcodec/ppc/videodsp_ppc.c +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2003-2004 Romain Dolbeau - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavcodec/videodsp.h" - -static void prefetch_ppc(uint8_t *mem, ptrdiff_t stride, int h) -{ - register const uint8_t *p = mem; - do { - __asm__ volatile ("dcbt 0,%0" : : "r" (p)); - p += stride; - } while(--h); -} - -av_cold void ff_videodsp_init_ppc(VideoDSPContext *ctx, int bpc) -{ - ctx->prefetch = prefetch_ppc; -} diff --git a/ffmpeg/libavcodec/ppc/vorbisdsp_altivec.c b/ffmpeg/libavcodec/ppc/vorbisdsp_altivec.c deleted file mode 100644 index d243bf6..0000000 --- a/ffmpeg/libavcodec/ppc/vorbisdsp_altivec.c +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#if HAVE_ALTIVEC_H -#include <altivec.h> -#endif -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavcodec/vorbisdsp.h" - -#if HAVE_ALTIVEC -static void vorbis_inverse_coupling_altivec(float *mag, float *ang, - intptr_t blocksize) -{ - int i; - vector float m, a; - vector bool int t0, t1; - const vector unsigned int v_31 = //XXX - vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1)); - for (i = 0; i < blocksize; i += 4) { - m = vec_ld(0, mag+i); - a = vec_ld(0, ang+i); - t0 = vec_cmple(m, (vector float)vec_splat_u32(0)); - t1 = vec_cmple(a, (vector float)vec_splat_u32(0)); - a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31)); - t0 = (vector bool int)vec_and(a, t1); - t1 = (vector bool int)vec_andc(a, t1); - a = vec_sub(m, (vector float)t1); - m = vec_add(m, (vector float)t0); - vec_stl(a, 0, ang+i); - vec_stl(m, 0, mag+i); - } -} -#endif /* HAVE_ALTIVEC */ - -av_cold void ff_vorbisdsp_init_ppc(VorbisDSPContext *c) -{ -#if HAVE_ALTIVEC - if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) - return; - - c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec; -#endif /* HAVE_ALTIVEC */ -} diff --git a/ffmpeg/libavcodec/ppc/vp3dsp_altivec.c b/ffmpeg/libavcodec/ppc/vp3dsp_altivec.c deleted file mode 100644 index 56c2d0b..0000000 --- a/ffmpeg/libavcodec/ppc/vp3dsp_altivec.c +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (C) 2009 David Conrad - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <string.h> - -#include "config.h" -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/vp3dsp.h" -#include "dsputil_altivec.h" - -#if HAVE_ALTIVEC - -static const vec_s16 constants = - {0, 64277, 60547, 54491, 46341, 36410, 25080, 12785}; -static const vec_u8 interleave_high = - {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29}; - -#define IDCT_START \ - vec_s16 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;\ - vec_s16 Ed, Gd, Add, Bdd, Fd, Hd;\ - vec_s16 eight = vec_splat_s16(8);\ - vec_u16 four = vec_splat_u16(4);\ -\ - vec_s16 C1 = vec_splat(constants, 1);\ - vec_s16 C2 = vec_splat(constants, 2);\ - vec_s16 C3 = vec_splat(constants, 3);\ - vec_s16 C4 = vec_splat(constants, 4);\ - vec_s16 C5 = vec_splat(constants, 5);\ - vec_s16 C6 = vec_splat(constants, 6);\ - vec_s16 C7 = vec_splat(constants, 7);\ -\ - vec_s16 b0 = vec_ld(0x00, block);\ - vec_s16 b1 = vec_ld(0x10, block);\ - vec_s16 b2 = vec_ld(0x20, block);\ - vec_s16 b3 = vec_ld(0x30, block);\ - vec_s16 b4 = vec_ld(0x40, block);\ - vec_s16 b5 = vec_ld(0x50, block);\ - vec_s16 b6 = vec_ld(0x60, block);\ - vec_s16 b7 = vec_ld(0x70, block); - -// these functions do (a*C)>>16 -// things are tricky because a is signed, but C unsigned. -// M15 is used if C fits in 15 bit unsigned (C6,C7) -// M16 is used if C requires 16 bits unsigned -static inline vec_s16 M15(vec_s16 a, vec_s16 C) -{ - return (vec_s16)vec_perm(vec_mule(a,C), vec_mulo(a,C), interleave_high); -} -static inline vec_s16 M16(vec_s16 a, vec_s16 C) -{ - return vec_add(a, M15(a, C)); -} - -#define IDCT_1D(ADD, SHIFT)\ - A = vec_add(M16(b1, C1), M15(b7, C7));\ - B = vec_sub(M15(b1, C7), M16(b7, C1));\ - C = vec_add(M16(b3, C3), M16(b5, C5));\ - D = vec_sub(M16(b5, C3), M16(b3, C5));\ -\ - Ad = M16(vec_sub(A, C), C4);\ - Bd = M16(vec_sub(B, D), C4);\ -\ - Cd = vec_add(A, C);\ - Dd = vec_add(B, D);\ -\ - E = ADD(M16(vec_add(b0, b4), C4));\ - F = ADD(M16(vec_sub(b0, b4), C4));\ -\ - G = vec_add(M16(b2, C2), M15(b6, C6));\ - H = vec_sub(M15(b2, C6), M16(b6, C2));\ -\ - Ed = vec_sub(E, G);\ - Gd = vec_add(E, G);\ -\ - Add = vec_add(F, Ad);\ - Bdd = vec_sub(Bd, H);\ -\ - Fd = vec_sub(F, Ad);\ - Hd = vec_add(Bd, H);\ -\ - b0 = SHIFT(vec_add(Gd, Cd));\ - b7 = SHIFT(vec_sub(Gd, Cd));\ -\ - b1 = SHIFT(vec_add(Add, Hd));\ - b2 = SHIFT(vec_sub(Add, Hd));\ -\ - b3 = SHIFT(vec_add(Ed, Dd));\ - b4 = SHIFT(vec_sub(Ed, Dd));\ -\ - b5 = SHIFT(vec_add(Fd, Bdd));\ - b6 = SHIFT(vec_sub(Fd, Bdd)); - -#define NOP(a) a -#define ADD8(a) vec_add(a, eight) -#define SHIFT4(a) vec_sra(a, four) - -static void vp3_idct_put_altivec(uint8_t *dst, int stride, int16_t block[64]) -{ - vec_u8 t; - IDCT_START - - // pixels are signed; so add 128*16 in addition to the normal 8 - vec_s16 v2048 = vec_sl(vec_splat_s16(1), vec_splat_u16(11)); - eight = vec_add(eight, v2048); - - IDCT_1D(NOP, NOP) - TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7); - IDCT_1D(ADD8, SHIFT4) - -#define PUT(a)\ - t = vec_packsu(a, a);\ - vec_ste((vec_u32)t, 0, (unsigned int *)dst);\ - vec_ste((vec_u32)t, 4, (unsigned int *)dst); - - PUT(b0) dst += stride; - PUT(b1) dst += stride; - PUT(b2) dst += stride; - PUT(b3) dst += stride; - PUT(b4) dst += stride; - PUT(b5) dst += stride; - PUT(b6) dst += stride; 
- PUT(b7) - memset(block, 0, sizeof(*block) * 64); -} - -static void vp3_idct_add_altivec(uint8_t *dst, int stride, int16_t block[64]) -{ - LOAD_ZERO; - vec_u8 t, vdst; - vec_s16 vdst_16; - vec_u8 vdst_mask = vec_mergeh(vec_splat_u8(-1), vec_lvsl(0, dst)); - - IDCT_START - - IDCT_1D(NOP, NOP) - TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7); - IDCT_1D(ADD8, SHIFT4) - -#define ADD(a)\ - vdst = vec_ld(0, dst);\ - vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask);\ - vdst_16 = vec_adds(a, vdst_16);\ - t = vec_packsu(vdst_16, vdst_16);\ - vec_ste((vec_u32)t, 0, (unsigned int *)dst);\ - vec_ste((vec_u32)t, 4, (unsigned int *)dst); - - ADD(b0) dst += stride; - ADD(b1) dst += stride; - ADD(b2) dst += stride; - ADD(b3) dst += stride; - ADD(b4) dst += stride; - ADD(b5) dst += stride; - ADD(b6) dst += stride; - ADD(b7) - memset(block, 0, sizeof(*block) * 64); -} - -#endif /* HAVE_ALTIVEC */ - -av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags) -{ -#if HAVE_ALTIVEC - if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) - return; - - c->idct_put = vp3_idct_put_altivec; - c->idct_add = vp3_idct_add_altivec; -#endif -} diff --git a/ffmpeg/libavcodec/ppc/vp8dsp_altivec.c b/ffmpeg/libavcodec/ppc/vp8dsp_altivec.c deleted file mode 100644 index c858d8a..0000000 --- a/ffmpeg/libavcodec/ppc/vp8dsp_altivec.c +++ /dev/null @@ -1,345 +0,0 @@ -/* - * VP8 compatible video decoder - * - * Copyright (C) 2010 David Conrad - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/cpu.h" -#include "libavutil/mem.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/vp8dsp.h" -#include "dsputil_altivec.h" - -#if HAVE_ALTIVEC -#define REPT4(...) 
{ __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ } - -// h subpel filter uses msum to multiply+add 4 pixel taps at once -static const vec_s8 h_subpel_filters_inner[7] = -{ - REPT4( -6, 123, 12, -1), - REPT4(-11, 108, 36, -8), - REPT4( -9, 93, 50, -6), - REPT4(-16, 77, 77, -16), - REPT4( -6, 50, 93, -9), - REPT4( -8, 36, 108, -11), - REPT4( -1, 12, 123, -6), -}; - -// for 6tap filters, these are the outer two taps -// The zeros mask off pixels 4-7 when filtering 0-3 -// and vice-versa -static const vec_s8 h_subpel_filters_outer[3] = -{ - REPT4(0, 0, 2, 1), - REPT4(0, 0, 3, 3), - REPT4(0, 0, 1, 2), -}; - -#define LOAD_H_SUBPEL_FILTER(i) \ - vec_s8 filter_inner = h_subpel_filters_inner[i]; \ - vec_s8 filter_outerh = h_subpel_filters_outer[(i)>>1]; \ - vec_s8 filter_outerl = vec_sld(filter_outerh, filter_outerh, 2) - -#define FILTER_H(dstv, off) \ - a = vec_ld((off)-is6tap-1, src); \ - b = vec_ld((off)-is6tap-1+15, src); \ -\ - pixh = vec_perm(a, b, permh##off); \ - pixl = vec_perm(a, b, perml##off); \ - filth = vec_msum(filter_inner, pixh, c64); \ - filtl = vec_msum(filter_inner, pixl, c64); \ -\ - if (is6tap) { \ - outer = vec_perm(a, b, perm_6tap##off); \ - filth = vec_msum(filter_outerh, outer, filth); \ - filtl = vec_msum(filter_outerl, outer, filtl); \ - } \ - if (w == 4) \ - filtl = filth; /* discard pixels 4-7 */ \ - dstv = vec_packs(filth, filtl); \ - dstv = vec_sra(dstv, c7) - -static av_always_inline -void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride, - uint8_t *src, ptrdiff_t src_stride, - int h, int mx, int w, int is6tap) -{ - LOAD_H_SUBPEL_FILTER(mx-1); - vec_u8 align_vec0, align_vec8, permh0, permh8, filt; - vec_u8 perm_6tap0, perm_6tap8, perml0, perml8; - vec_u8 a, b, pixh, pixl, outer; - vec_s16 f16h, f16l; - vec_s32 filth, filtl; - - vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 }; - vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 }; - vec_u8 perm_inner = is6tap ? 
perm_inner6 : perm_inner4; - vec_u8 perm_outer = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 }; - vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6)); - vec_u16 c7 = vec_splat_u16(7); - - align_vec0 = vec_lvsl( -is6tap-1, src); - align_vec8 = vec_lvsl(8-is6tap-1, src); - - permh0 = vec_perm(align_vec0, align_vec0, perm_inner); - permh8 = vec_perm(align_vec8, align_vec8, perm_inner); - perm_inner = vec_add(perm_inner, vec_splat_u8(4)); - perml0 = vec_perm(align_vec0, align_vec0, perm_inner); - perml8 = vec_perm(align_vec8, align_vec8, perm_inner); - perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer); - perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer); - - while (h --> 0) { - FILTER_H(f16h, 0); - - if (w == 16) { - FILTER_H(f16l, 8); - filt = vec_packsu(f16h, f16l); - vec_st(filt, 0, dst); - } else { - filt = vec_packsu(f16h, f16h); - vec_ste((vec_u32)filt, 0, (uint32_t*)dst); - if (w == 8) - vec_ste((vec_u32)filt, 4, (uint32_t*)dst); - } - src += src_stride; - dst += dst_stride; - } -} - -// v subpel filter does a simple vertical multiply + add -static const vec_u8 v_subpel_filters[7] = -{ - { 0, 6, 123, 12, 1, 0 }, - { 2, 11, 108, 36, 8, 1 }, - { 0, 9, 93, 50, 6, 0 }, - { 3, 16, 77, 77, 16, 3 }, - { 0, 6, 50, 93, 9, 0 }, - { 1, 8, 36, 108, 11, 2 }, - { 0, 1, 12, 123, 6, 0 }, -}; - -#define LOAD_V_SUBPEL_FILTER(i) \ - vec_u8 subpel_filter = v_subpel_filters[i]; \ - vec_u8 f0 = vec_splat(subpel_filter, 0); \ - vec_u8 f1 = vec_splat(subpel_filter, 1); \ - vec_u8 f2 = vec_splat(subpel_filter, 2); \ - vec_u8 f3 = vec_splat(subpel_filter, 3); \ - vec_u8 f4 = vec_splat(subpel_filter, 4); \ - vec_u8 f5 = vec_splat(subpel_filter, 5) - -#define FILTER_V(dstv, vec_mul) \ - s1f = (vec_s16)vec_mul(s1, f1); \ - s2f = (vec_s16)vec_mul(s2, f2); \ - s3f = (vec_s16)vec_mul(s3, f3); \ - s4f = (vec_s16)vec_mul(s4, f4); \ - s2f = vec_subs(s2f, s1f); \ - s3f = vec_subs(s3f, s4f); \ - if (is6tap) { \ - s0f = (vec_s16)vec_mul(s0, f0); \ - s5f = (vec_s16)vec_mul(s5, f5); \ - s2f = vec_adds(s2f, s0f); \ - s3f = vec_adds(s3f, s5f); \ - } \ - dstv = vec_adds(s2f, s3f); \ - dstv = vec_adds(dstv, c64); \ - dstv = vec_sra(dstv, c7) - -static av_always_inline -void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride, - uint8_t *src, ptrdiff_t src_stride, - int h, int my, int w, int is6tap) -{ - LOAD_V_SUBPEL_FILTER(my-1); - vec_u8 s0, s1, s2, s3, s4, s5, filt, align_vech, perm_vec, align_vecl; - vec_s16 s0f, s1f, s2f, s3f, s4f, s5f, f16h, f16l; - vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6)); - vec_u16 c7 = vec_splat_u16(7); - - // we want pixels 0-7 to be in the even positions and 8-15 in the odd, - // so combine this permute with the alignment permute vector - align_vech = vec_lvsl(0, src); - align_vecl = vec_sld(align_vech, align_vech, 8); - if (w ==16) - perm_vec = vec_mergeh(align_vech, align_vecl); - else - perm_vec = vec_mergeh(align_vech, align_vech); - - if (is6tap) - s0 = load_with_perm_vec(-2*src_stride, src, perm_vec); - s1 = load_with_perm_vec(-1*src_stride, src, perm_vec); - s2 = load_with_perm_vec( 0*src_stride, src, perm_vec); - s3 = load_with_perm_vec( 1*src_stride, src, perm_vec); - if (is6tap) - s4 = load_with_perm_vec( 2*src_stride, src, perm_vec); - - src += (2+is6tap)*src_stride; - - while (h --> 0) { - if (is6tap) - s5 = load_with_perm_vec(0, src, perm_vec); - else - s4 = load_with_perm_vec(0, src, perm_vec); - - FILTER_V(f16h, vec_mule); - - if (w == 16) { - FILTER_V(f16l, vec_mulo); - filt = vec_packsu(f16h, f16l); - vec_st(filt, 0, dst); - } else { - 
filt = vec_packsu(f16h, f16h); - if (w == 4) - filt = (vec_u8)vec_splat((vec_u32)filt, 0); - else - vec_ste((vec_u32)filt, 4, (uint32_t*)dst); - vec_ste((vec_u32)filt, 0, (uint32_t*)dst); - } - - if (is6tap) - s0 = s1; - s1 = s2; - s2 = s3; - s3 = s4; - if (is6tap) - s4 = s5; - - dst += dst_stride; - src += src_stride; - } -} - -#define EPEL_FUNCS(WIDTH, TAPS) \ -static av_noinline \ -void put_vp8_epel ## WIDTH ## _h ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \ -{ \ - put_vp8_epel_h_altivec_core(dst, dst_stride, src, src_stride, h, mx, WIDTH, TAPS == 6); \ -} \ -\ -static av_noinline \ -void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \ -{ \ - put_vp8_epel_v_altivec_core(dst, dst_stride, src, src_stride, h, my, WIDTH, TAPS == 6); \ -} - -#define EPEL_HV(WIDTH, HTAPS, VTAPS) \ -static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \ -{ \ - DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \ - if (VTAPS == 6) { \ - put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-2*sstride, sstride, h+5, mx, my); \ - put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+2*16, 16, h, mx, my); \ - } else { \ - put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-sstride, sstride, h+4, mx, my); \ - put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+16, 16, h, mx, my); \ - } \ -} - -EPEL_FUNCS(16,6) -EPEL_FUNCS(8, 6) -EPEL_FUNCS(8, 4) -EPEL_FUNCS(4, 6) -EPEL_FUNCS(4, 4) - -EPEL_HV(16, 6,6) -EPEL_HV(8, 6,6) -EPEL_HV(8, 4,6) -EPEL_HV(8, 6,4) -EPEL_HV(8, 4,4) -EPEL_HV(4, 6,6) -EPEL_HV(4, 4,6) -EPEL_HV(4, 6,4) -EPEL_HV(4, 4,4) - -static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) -{ - register vector unsigned char pixelsv1, pixelsv2; - register vector unsigned char pixelsv1B, pixelsv2B; - register vector unsigned char pixelsv1C, pixelsv2C; - register vector unsigned char pixelsv1D, pixelsv2D; - - register vector unsigned char perm = vec_lvsl(0, src); - int i; - register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1; - register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2; - register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2; - -// hand-unrolling the loop by 4 gains about 15% -// mininum execution time goes from 74 to 60 cycles -// it's faster than -funroll-loops, but using -// -funroll-loops w/ this is bad - 74 cycles again. 
-// all this is on a 7450, tuning for the 7450 - for (i = 0; i < h; i += 4) { - pixelsv1 = vec_ld( 0, src); - pixelsv2 = vec_ld(15, src); - pixelsv1B = vec_ld(sstride, src); - pixelsv2B = vec_ld(15 + sstride, src); - pixelsv1C = vec_ld(sstride2, src); - pixelsv2C = vec_ld(15 + sstride2, src); - pixelsv1D = vec_ld(sstride3, src); - pixelsv2D = vec_ld(15 + sstride3, src); - vec_st(vec_perm(pixelsv1, pixelsv2, perm), - 0, (unsigned char*)dst); - vec_st(vec_perm(pixelsv1B, pixelsv2B, perm), - dstride, (unsigned char*)dst); - vec_st(vec_perm(pixelsv1C, pixelsv2C, perm), - dstride2, (unsigned char*)dst); - vec_st(vec_perm(pixelsv1D, pixelsv2D, perm), - dstride3, (unsigned char*)dst); - src += sstride4; - dst += dstride4; - } -} - -#endif /* HAVE_ALTIVEC */ - -av_cold void ff_vp8dsp_init_ppc(VP8DSPContext *c) -{ -#if HAVE_ALTIVEC - if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) - return; - - c->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_altivec; - c->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_altivec; - c->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_altivec; - c->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_altivec; - - c->put_vp8_epel_pixels_tab[1][0][2] = put_vp8_epel8_h6_altivec; - c->put_vp8_epel_pixels_tab[1][2][0] = put_vp8_epel8_v6_altivec; - c->put_vp8_epel_pixels_tab[1][0][1] = put_vp8_epel8_h4_altivec; - c->put_vp8_epel_pixels_tab[1][1][0] = put_vp8_epel8_v4_altivec; - - c->put_vp8_epel_pixels_tab[1][2][2] = put_vp8_epel8_h6v6_altivec; - c->put_vp8_epel_pixels_tab[1][1][1] = put_vp8_epel8_h4v4_altivec; - c->put_vp8_epel_pixels_tab[1][1][2] = put_vp8_epel8_h6v4_altivec; - c->put_vp8_epel_pixels_tab[1][2][1] = put_vp8_epel8_h4v6_altivec; - - c->put_vp8_epel_pixels_tab[2][0][2] = put_vp8_epel4_h6_altivec; - c->put_vp8_epel_pixels_tab[2][2][0] = put_vp8_epel4_v6_altivec; - c->put_vp8_epel_pixels_tab[2][0][1] = put_vp8_epel4_h4_altivec; - c->put_vp8_epel_pixels_tab[2][1][0] = put_vp8_epel4_v4_altivec; - - c->put_vp8_epel_pixels_tab[2][2][2] = put_vp8_epel4_h6v6_altivec; - c->put_vp8_epel_pixels_tab[2][1][1] = put_vp8_epel4_h4v4_altivec; - c->put_vp8_epel_pixels_tab[2][1][2] = put_vp8_epel4_h6v4_altivec; - c->put_vp8_epel_pixels_tab[2][2][1] = put_vp8_epel4_h4v6_altivec; -#endif /* HAVE_ALTIVEC */ -} |
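A pattern worth calling out from the deleted code above is the AltiVec unaligned load: vec_ld only fetches from 16-byte-aligned addresses, so int_altivec.c's vec_unaligned_load() macro, the hpeldsp pixel loops and vp8dsp's FILTER_H all merge two aligned loads with a permute pattern derived from the pointer's misalignment via vec_lvsl. A minimal standalone sketch of the idiom follows; load_unaligned is an illustrative name chosen here, not a symbol from these files.

    /*
     * Sketch of the unaligned-load idiom used throughout the deleted files.
     * Two aligned 16-byte loads are combined with a vec_lvsl permute so the
     * result holds the 16 bytes starting exactly at p.
     */
    #include <altivec.h>
    #include <stdint.h>

    static inline vector unsigned char load_unaligned(const uint8_t *p)
    {
        vector unsigned char hi   = vec_ld(0,  p);  /* aligned block containing p        */
        vector unsigned char lo   = vec_ld(15, p);  /* aligned block containing p + 15   */
        vector unsigned char perm = vec_lvsl(0, p); /* {o, o+1, ...} with o = p & 15     */
        return vec_perm(hi, lo, perm);              /* 16 bytes starting exactly at p    */
    }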
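Similarly, the M15/M16 helpers in vp3dsp_altivec.c compute (a*C)>>16 for unsigned 16-bit IDCT constants using only signed multiplies, as their in-code comment explains: a signed 16x16 multiply misreads C >= 0x8000 as C - 65536, leaving the high half of the product short by exactly a, which M16 adds back. A scalar sketch of the same arithmetic, illustrative only and assuming arithmetic right shift of negative values:

    #include <stdint.h>

    /* high half of the signed 16x16 product; matches M15 for C < 0x8000 */
    static inline int32_t m15(int16_t a, uint16_t C)
    {
        return ((int32_t)a * (int16_t)C) >> 16;
    }

    /* full unsigned-constant case: compensate for C being >= 0x8000 */
    static inline int32_t m16(int16_t a, uint16_t C)
    {
        return m15(a, C) + a;
    }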