Diffstat (limited to 'ffmpeg1/libavcodec/ppc')
26 files changed, 0 insertions, 6724 deletions
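Reading note (not part of the patch): nearly every routine removed below is built on the same AltiVec idiom — two aligned vec_ld loads plus a vec_lvsl permute mask to assemble an unaligned 16-byte read, element-wise absolute differences via vec_max/vec_min/vec_sub, and a vec_sum4s/vec_sums reduction into a scalar. As a reading aid only, here is a minimal standalone sketch of that idiom, modeled on the deleted sad16_altivec(); the function name sad16_sketch and the bare-bones framing are mine, and it assumes a PowerPC compiler with AltiVec enabled (e.g. -maltivec) and <altivec.h> available.

#include <altivec.h>
#include <stdint.h>

/* Sketch: sum of absolute differences over h rows of 16 pixels.
 * Mirrors the deleted sad16_altivec(): pix1 is assumed 16-byte
 * aligned, pix2 may be unaligned. Illustrative only. */
static int sad16_sketch(uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    /* permute mask that shifts the two aligned loads into place */
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    int i, s = 0;

    for (i = 0; i < h; i++) {
        /* read the two aligned quadwords covering pix2[0..15] and
         * permute out the 16 bytes actually wanted */
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        vector unsigned char t1    = vec_ld(0, pix1);
        vector unsigned char t2    = vec_perm(pix2l, pix2r, perm);

        /* |a - b| for unsigned bytes: max(a,b) - min(a,b) */
        vector unsigned char diff =
            vec_sub(vec_max(t1, t2), vec_min(t1, t2));

        /* accumulate four partial 32-bit sums */
        sad = vec_sum4s(diff, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* fold the four partial sums into one scalar result */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}

The deleted sad16_x2/_y2/_xy2 variants below differ only in how they form the second operand (averaging two permuted reads, or carrying the previous row's vector to skip a reload) before feeding the same reduction.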
diff --git a/ffmpeg1/libavcodec/ppc/Makefile b/ffmpeg1/libavcodec/ppc/Makefile deleted file mode 100644 index febbb0a..0000000 --- a/ffmpeg1/libavcodec/ppc/Makefile +++ /dev/null @@ -1,24 +0,0 @@ -OBJS += ppc/dsputil_ppc.o \ - ppc/videodsp_ppc.o \ - -OBJS-$(CONFIG_H264CHROMA) += ppc/h264chroma_init.o -OBJS-$(CONFIG_H264QPEL) += ppc/h264_qpel.o -OBJS-$(CONFIG_HPELDSP) += ppc/hpeldsp_altivec.o -OBJS-$(CONFIG_VORBIS_DECODER) += ppc/vorbisdsp_altivec.o -OBJS-$(CONFIG_VP3DSP) += ppc/vp3dsp_altivec.o - -FFT-OBJS-$(HAVE_GNU_AS) += ppc/fft_altivec_s.o -ALTIVEC-OBJS-$(CONFIG_FFT) += ppc/fft_altivec.o \ - $(FFT-OBJS-yes) -ALTIVEC-OBJS-$(CONFIG_H264DSP) += ppc/h264_altivec.o -ALTIVEC-OBJS-$(CONFIG_MPEGAUDIODSP) += ppc/mpegaudiodec_altivec.o -ALTIVEC-OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o -ALTIVEC-OBJS-$(CONFIG_VC1_DECODER) += ppc/vc1dsp_altivec.o -ALTIVEC-OBJS-$(CONFIG_VP8_DECODER) += ppc/vp8dsp_altivec.o - -ALTIVEC-OBJS += ppc/dsputil_altivec.o \ - ppc/fdct_altivec.o \ - ppc/fmtconvert_altivec.o \ - ppc/gmc_altivec.o \ - ppc/idct_altivec.o \ - ppc/int_altivec.o \ diff --git a/ffmpeg1/libavcodec/ppc/asm.S b/ffmpeg1/libavcodec/ppc/asm.S deleted file mode 100644 index bbbf8a4..0000000 --- a/ffmpeg1/libavcodec/ppc/asm.S +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (c) 2009 Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" - -#define GLUE(a, b) a ## b -#define JOIN(a, b) GLUE(a, b) -#define X(s) JOIN(EXTERN_ASM, s) - -#if ARCH_PPC64 - -#define PTR .quad -#define lp ld -#define lpx ldx -#define stp std -#define stpu stdu -#define PS 8 -#define L(s) JOIN(., s) - -.macro extfunc name - .global X(\name) - .section .opd, "aw" -X(\name): - .quad L(\name), .TOC.@tocbase, 0 - .previous - .type X(\name), STT_FUNC -L(\name): -.endm - -.macro movrel rd, sym, gp - ld \rd, \sym@got(r2) -.endm - -.macro get_got rd -.endm - -#else /* ARCH_PPC64 */ - -#define PTR .int -#define lp lwz -#define lpx lwzx -#define stp stw -#define stpu stwu -#define PS 4 -#define L(s) s - -.macro extfunc name - .global X(\name) - .type X(\name), STT_FUNC -X(\name): -\name: -.endm - -.macro movrel rd, sym, gp -#if CONFIG_PIC - lwz \rd, \sym@got(\gp) -#else - lis \rd, \sym@ha - la \rd, \sym@l(\rd) -#endif -.endm - -.macro get_got rd -#if CONFIG_PIC - bcl 20, 31, .Lgot\@ -.Lgot\@: - mflr \rd - addis \rd, \rd, _GLOBAL_OFFSET_TABLE_ - .Lgot\@@ha - addi \rd, \rd, _GLOBAL_OFFSET_TABLE_ - .Lgot\@@l -#endif -.endm - -#endif /* ARCH_PPC64 */ - -#if HAVE_IBM_ASM - -.macro DEFINE_REG n - .equiv r\n, \n - .equiv f\n, \n - .equiv v\n, \n -.endm - -DEFINE_REG 0 -DEFINE_REG 1 -DEFINE_REG 2 -DEFINE_REG 3 -DEFINE_REG 4 -DEFINE_REG 5 -DEFINE_REG 6 -DEFINE_REG 7 -DEFINE_REG 8 -DEFINE_REG 9 -DEFINE_REG 10 -DEFINE_REG 11 -DEFINE_REG 12 -DEFINE_REG 13 -DEFINE_REG 14 -DEFINE_REG 15 -DEFINE_REG 16 -DEFINE_REG 17 -DEFINE_REG 18 -DEFINE_REG 19 -DEFINE_REG 20 -DEFINE_REG 21 -DEFINE_REG 22 -DEFINE_REG 23 -DEFINE_REG 24 -DEFINE_REG 25 -DEFINE_REG 26 -DEFINE_REG 27 -DEFINE_REG 28 -DEFINE_REG 29 -DEFINE_REG 30 -DEFINE_REG 31 - -#endif /* HAVE_IBM_ASM */ diff --git a/ffmpeg1/libavcodec/ppc/dsputil_altivec.c b/ffmpeg1/libavcodec/ppc/dsputil_altivec.c deleted file mode 100644 index f36e394..0000000 --- a/ffmpeg1/libavcodec/ppc/dsputil_altivec.c +++ /dev/null @@ -1,963 +0,0 @@ -/* - * Copyright (c) 2002 Brian Foley - * Copyright (c) 2002 Dieter Shirley - * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#if HAVE_ALTIVEC_H -#include <altivec.h> -#endif -#include "libavutil/attributes.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/dsputil.h" -#include "dsputil_altivec.h" - -static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s; - const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); - vector unsigned char perm1 = vec_lvsl(0, pix2); - vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); - vector unsigned char pix2l, pix2r; - vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; - vector unsigned int sad; - vector signed int sumdiffs; - - s = 0; - sad = (vector unsigned int)vec_splat_u32(0); - for (i = 0; i < h; i++) { - /* Read unaligned pixels into our vectors. The vectors are as follows: - pix1v: pix1[0]-pix1[15] - pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] */ - pix1v = vec_ld( 0, pix1); - pix2l = vec_ld( 0, pix2); - pix2r = vec_ld(16, pix2); - pix2v = vec_perm(pix2l, pix2r, perm1); - pix2iv = vec_perm(pix2l, pix2r, perm2); - - /* Calculate the average vector */ - avgv = vec_avg(pix2v, pix2iv); - - /* Calculate a sum of abs differences vector */ - t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2 += line_size; - } - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s; - const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); - vector unsigned char perm = vec_lvsl(0, pix2); - vector unsigned char pix2l, pix2r; - vector unsigned char pix1v, pix2v, pix3v, avgv, t5; - vector unsigned int sad; - vector signed int sumdiffs; - uint8_t *pix3 = pix2 + line_size; - - s = 0; - sad = (vector unsigned int)vec_splat_u32(0); - - /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one - iteration becomes pix2 in the next iteration. We can use this - fact to avoid a potentially expensive unaligned read, each - time around the loop. - Read unaligned pixels into our vectors. The vectors are as follows: - pix2v: pix2[0]-pix2[15] - Split the pixel vectors into shorts */ - pix2l = vec_ld( 0, pix2); - pix2r = vec_ld(15, pix2); - pix2v = vec_perm(pix2l, pix2r, perm); - - for (i = 0; i < h; i++) { - /* Read unaligned pixels into our vectors. 
The vectors are as follows: - pix1v: pix1[0]-pix1[15] - pix3v: pix3[0]-pix3[15] */ - pix1v = vec_ld(0, pix1); - - pix2l = vec_ld( 0, pix3); - pix2r = vec_ld(15, pix3); - pix3v = vec_perm(pix2l, pix2r, perm); - - /* Calculate the average vector */ - avgv = vec_avg(pix2v, pix3v); - - /* Calculate a sum of abs differences vector */ - t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2v = pix3v; - pix3 += line_size; - - } - - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - return s; -} - -static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s; - uint8_t *pix3 = pix2 + line_size; - const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); - const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2); - vector unsigned char avgv, t5; - vector unsigned char perm1 = vec_lvsl(0, pix2); - vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); - vector unsigned char pix2l, pix2r; - vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; - vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; - vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; - vector unsigned short avghv, avglv; - vector unsigned short t1, t2, t3, t4; - vector unsigned int sad; - vector signed int sumdiffs; - - sad = (vector unsigned int)vec_splat_u32(0); - - s = 0; - - /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one - iteration becomes pix2 in the next iteration. We can use this - fact to avoid a potentially expensive unaligned read, as well - as some splitting, and vector addition each time around the loop. - Read unaligned pixels into our vectors. The vectors are as follows: - pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] - Split the pixel vectors into shorts */ - pix2l = vec_ld( 0, pix2); - pix2r = vec_ld(16, pix2); - pix2v = vec_perm(pix2l, pix2r, perm1); - pix2iv = vec_perm(pix2l, pix2r, perm2); - - pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v); - pix2lv = (vector unsigned short) vec_mergel(zero, pix2v); - pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv); - pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv); - t1 = vec_add(pix2hv, pix2ihv); - t2 = vec_add(pix2lv, pix2ilv); - - for (i = 0; i < h; i++) { - /* Read unaligned pixels into our vectors. The vectors are as follows: - pix1v: pix1[0]-pix1[15] - pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16] */ - pix1v = vec_ld(0, pix1); - - pix2l = vec_ld( 0, pix3); - pix2r = vec_ld(16, pix3); - pix3v = vec_perm(pix2l, pix2r, perm1); - pix3iv = vec_perm(pix2l, pix2r, perm2); - - /* Note that AltiVec does have vec_avg, but this works on vector pairs - and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding - would mean that, for example, avg(3,0,0,1) = 2, when it should be 1. - Instead, we have to split the pixel vectors into vectors of shorts, - and do the averaging by hand. 
*/ - - /* Split the pixel vectors into shorts */ - pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); - pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); - pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); - pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); - - /* Do the averaging on them */ - t3 = vec_add(pix3hv, pix3ihv); - t4 = vec_add(pix3lv, pix3ilv); - - avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); - avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); - - /* Pack the shorts back into a result */ - avgv = vec_pack(avghv, avglv); - - /* Calculate a sum of abs differences vector */ - t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix3 += line_size; - /* Transfer the calculated values for pix3 into pix2 */ - t1 = t3; - t2 = t4; - } - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s; - const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); - vector unsigned char perm = vec_lvsl(0, pix2); - vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sad; - vector signed int sumdiffs; - - sad = (vector unsigned int)vec_splat_u32(0); - - - for (i = 0; i < h; i++) { - /* Read potentially unaligned pixels into t1 and t2 */ - vector unsigned char pix2l = vec_ld( 0, pix2); - vector unsigned char pix2r = vec_ld(15, pix2); - t1 = vec_ld(0, pix1); - t2 = vec_perm(pix2l, pix2r, perm); - - /* Calculate a sum of abs differences vector */ - t3 = vec_max(t1, t2); - t4 = vec_min(t1, t2); - t5 = vec_sub(t3, t4); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s; - const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); - const vector unsigned char permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0}; - vector unsigned char perm1 = vec_lvsl(0, pix1); - vector unsigned char perm2 = vec_lvsl(0, pix2); - vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sad; - vector signed int sumdiffs; - - sad = (vector unsigned int)vec_splat_u32(0); - - for (i = 0; i < h; i++) { - /* Read potentially unaligned pixels into t1 and t2 - Since we're reading 16 pixels, and actually only want 8, - mask out the last 8 pixels. The 0s don't change the sum. 
*/ - vector unsigned char pix1l = vec_ld( 0, pix1); - vector unsigned char pix1r = vec_ld(15, pix1); - vector unsigned char pix2l = vec_ld( 0, pix2); - vector unsigned char pix2r = vec_ld(15, pix2); - t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear); - t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear); - - /* Calculate a sum of abs differences vector */ - t3 = vec_max(t1, t2); - t4 = vec_min(t1, t2); - t5 = vec_sub(t3, t4); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -static int pix_norm1_altivec(uint8_t *pix, int line_size) -{ - int i; - int s; - const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); - vector unsigned char perm = vec_lvsl(0, pix); - vector unsigned char pixv; - vector unsigned int sv; - vector signed int sum; - - sv = (vector unsigned int)vec_splat_u32(0); - - s = 0; - for (i = 0; i < 16; i++) { - /* Read in the potentially unaligned pixels */ - vector unsigned char pixl = vec_ld( 0, pix); - vector unsigned char pixr = vec_ld(15, pix); - pixv = vec_perm(pixl, pixr, perm); - - /* Square the values, and add them to our sum */ - sv = vec_msum(pixv, pixv, sv); - - pix += line_size; - } - /* Sum up the four partial sums, and put the result into s */ - sum = vec_sums((vector signed int) sv, (vector signed int) zero); - sum = vec_splat(sum, 3); - vec_ste(sum, 0, &s); - - return s; -} - -/** - * Sum of Squared Errors for a 8x8 block. - * AltiVec-enhanced. - * It's the sad8_altivec code above w/ squaring added. - */ -static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s; - const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); - const vector unsigned char permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0}; - vector unsigned char perm1 = vec_lvsl(0, pix1); - vector unsigned char perm2 = vec_lvsl(0, pix2); - vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sum; - vector signed int sumsqr; - - sum = (vector unsigned int)vec_splat_u32(0); - - for (i = 0; i < h; i++) { - /* Read potentially unaligned pixels into t1 and t2 - Since we're reading 16 pixels, and actually only want 8, - mask out the last 8 pixels. The 0s don't change the sum. */ - vector unsigned char pix1l = vec_ld( 0, pix1); - vector unsigned char pix1r = vec_ld(15, pix1); - vector unsigned char pix2l = vec_ld( 0, pix2); - vector unsigned char pix2r = vec_ld(15, pix2); - t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear); - t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear); - - /* Since we want to use unsigned chars, we can take advantage - of the fact that abs(a-b)^2 = (a-b)^2. */ - - /* Calculate abs differences vector */ - t3 = vec_max(t1, t2); - t4 = vec_min(t1, t2); - t5 = vec_sub(t3, t4); - - /* Square the values and add them to our sum */ - sum = vec_msum(t5, t5, sum); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); - sumsqr = vec_splat(sumsqr, 3); - vec_ste(sumsqr, 0, &s); - - return s; -} - -/** - * Sum of Squared Errors for a 16x16 block. - * AltiVec-enhanced. 
- * It's the sad16_altivec code above w/ squaring added. - */ -static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s; - const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); - vector unsigned char perm = vec_lvsl(0, pix2); - vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sum; - vector signed int sumsqr; - - sum = (vector unsigned int)vec_splat_u32(0); - - for (i = 0; i < h; i++) { - /* Read potentially unaligned pixels into t1 and t2 */ - vector unsigned char pix2l = vec_ld( 0, pix2); - vector unsigned char pix2r = vec_ld(15, pix2); - t1 = vec_ld(0, pix1); - t2 = vec_perm(pix2l, pix2r, perm); - - /* Since we want to use unsigned chars, we can take advantage - of the fact that abs(a-b)^2 = (a-b)^2. */ - - /* Calculate abs differences vector */ - t3 = vec_max(t1, t2); - t4 = vec_min(t1, t2); - t5 = vec_sub(t3, t4); - - /* Square the values and add them to our sum */ - sum = vec_msum(t5, t5, sum); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); - sumsqr = vec_splat(sumsqr, 3); - vec_ste(sumsqr, 0, &s); - - return s; -} - -static int pix_sum_altivec(uint8_t * pix, int line_size) -{ - const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); - vector unsigned char perm = vec_lvsl(0, pix); - vector unsigned char t1; - vector unsigned int sad; - vector signed int sumdiffs; - - int i; - int s; - - sad = (vector unsigned int)vec_splat_u32(0); - - for (i = 0; i < 16; i++) { - /* Read the potentially unaligned 16 pixels into t1 */ - vector unsigned char pixl = vec_ld( 0, pix); - vector unsigned char pixr = vec_ld(15, pix); - t1 = vec_perm(pixl, pixr, perm); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t1, sad); - - pix += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels, int line_size) -{ - int i; - vector unsigned char perm = vec_lvsl(0, pixels); - vector unsigned char bytes; - const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); - vector signed short shorts; - - for (i = 0; i < 8; i++) { - // Read potentially unaligned pixels. - // We're reading 16 pixels, and actually only want 8, - // but we simply ignore the extras. 
- vector unsigned char pixl = vec_ld( 0, pixels); - vector unsigned char pixr = vec_ld(15, pixels); - bytes = vec_perm(pixl, pixr, perm); - - // convert the bytes into shorts - shorts = (vector signed short)vec_mergeh(zero, bytes); - - // save the data to the block, we assume the block is 16-byte aligned - vec_st(shorts, i*16, (vector signed short*)block); - - pixels += line_size; - } -} - -static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1, - const uint8_t *s2, int stride) -{ - int i; - vector unsigned char perm1 = vec_lvsl(0, s1); - vector unsigned char perm2 = vec_lvsl(0, s2); - vector unsigned char bytes, pixl, pixr; - const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); - vector signed short shorts1, shorts2; - - for (i = 0; i < 4; i++) { - // Read potentially unaligned pixels - // We're reading 16 pixels, and actually only want 8, - // but we simply ignore the extras. - pixl = vec_ld( 0, s1); - pixr = vec_ld(15, s1); - bytes = vec_perm(pixl, pixr, perm1); - - // convert the bytes into shorts - shorts1 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the same for the second block of pixels - pixl = vec_ld( 0, s2); - pixr = vec_ld(15, s2); - bytes = vec_perm(pixl, pixr, perm2); - - // convert the bytes into shorts - shorts2 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the subtraction - shorts1 = vec_sub(shorts1, shorts2); - - // save the data to the block, we assume the block is 16-byte aligned - vec_st(shorts1, 0, (vector signed short*)block); - - s1 += stride; - s2 += stride; - block += 8; - - - // The code below is a copy of the code above... This is a manual - // unroll. - - // Read potentially unaligned pixels - // We're reading 16 pixels, and actually only want 8, - // but we simply ignore the extras. 
- pixl = vec_ld( 0, s1); - pixr = vec_ld(15, s1); - bytes = vec_perm(pixl, pixr, perm1); - - // convert the bytes into shorts - shorts1 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the same for the second block of pixels - pixl = vec_ld( 0, s2); - pixr = vec_ld(15, s2); - bytes = vec_perm(pixl, pixr, perm2); - - // convert the bytes into shorts - shorts2 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the subtraction - shorts1 = vec_sub(shorts1, shorts2); - - // save the data to the block, we assume the block is 16-byte aligned - vec_st(shorts1, 0, (vector signed short*)block); - - s1 += stride; - s2 += stride; - block += 8; - } -} - - -static void clear_block_altivec(int16_t *block) { - LOAD_ZERO; - vec_st(zero_s16v, 0, block); - vec_st(zero_s16v, 16, block); - vec_st(zero_s16v, 32, block); - vec_st(zero_s16v, 48, block); - vec_st(zero_s16v, 64, block); - vec_st(zero_s16v, 80, block); - vec_st(zero_s16v, 96, block); - vec_st(zero_s16v, 112, block); -} - - -static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { - register int i; - register vector unsigned char vdst, vsrc; - - /* dst and src are 16 bytes-aligned (guaranteed) */ - for (i = 0 ; (i + 15) < w ; i+=16) { - vdst = vec_ld(i, (unsigned char*)dst); - vsrc = vec_ld(i, (unsigned char*)src); - vdst = vec_add(vsrc, vdst); - vec_st(vdst, i, (unsigned char*)dst); - } - /* if w is not a multiple of 16 */ - for (; (i < w) ; i++) { - dst[i] = src[i]; - } -} - -static int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ - int sum; - register const vector unsigned char vzero = - (const vector unsigned char)vec_splat_u8(0); - register vector signed short temp0, temp1, temp2, temp3, temp4, - temp5, temp6, temp7; - { - register const vector signed short vprod1 =(const vector signed short) - { 1,-1, 1,-1, 1,-1, 1,-1 }; - register const vector signed short vprod2 =(const vector signed short) - { 1, 1,-1,-1, 1, 1,-1,-1 }; - register const vector signed short vprod3 =(const vector signed short) - { 1, 1, 1, 1,-1,-1,-1,-1 }; - register const vector unsigned char perm1 = (const vector unsigned char) - {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, - 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D}; - register const vector unsigned char perm2 = (const vector unsigned char) - {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, - 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B}; - register const vector unsigned char perm3 = (const vector unsigned char) - {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07}; - -#define ONEITERBUTTERFLY(i, res) \ - { \ - register vector unsigned char src1, src2, srcO; \ - register vector unsigned char dst1, dst2, dstO; \ - register vector signed short srcV, dstV; \ - register vector signed short but0, but1, but2, op1, op2, op3; \ - src1 = vec_ld(stride * i, src); \ - src2 = vec_ld((stride * i) + 15, src); \ - srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ - dst1 = vec_ld(stride * i, dst); \ - dst2 = vec_ld((stride * i) + 15, dst); \ - dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ - /* promote the unsigned chars to signed shorts */ \ - /* we're in the 8x8 function, we only care for the first 8 */ \ - srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \ - (vector signed char)srcO); \ - dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \ - (vector signed char)dstO); \ - /* subtractions inside the first butterfly */ \ - but0 = vec_sub(srcV, 
dstV); \ - op1 = vec_perm(but0, but0, perm1); \ - but1 = vec_mladd(but0, vprod1, op1); \ - op2 = vec_perm(but1, but1, perm2); \ - but2 = vec_mladd(but1, vprod2, op2); \ - op3 = vec_perm(but2, but2, perm3); \ - res = vec_mladd(but2, vprod3, op3); \ - } - ONEITERBUTTERFLY(0, temp0); - ONEITERBUTTERFLY(1, temp1); - ONEITERBUTTERFLY(2, temp2); - ONEITERBUTTERFLY(3, temp3); - ONEITERBUTTERFLY(4, temp4); - ONEITERBUTTERFLY(5, temp5); - ONEITERBUTTERFLY(6, temp6); - ONEITERBUTTERFLY(7, temp7); - } -#undef ONEITERBUTTERFLY - { - register vector signed int vsum; - register vector signed short line0 = vec_add(temp0, temp1); - register vector signed short line1 = vec_sub(temp0, temp1); - register vector signed short line2 = vec_add(temp2, temp3); - register vector signed short line3 = vec_sub(temp2, temp3); - register vector signed short line4 = vec_add(temp4, temp5); - register vector signed short line5 = vec_sub(temp4, temp5); - register vector signed short line6 = vec_add(temp6, temp7); - register vector signed short line7 = vec_sub(temp6, temp7); - - register vector signed short line0B = vec_add(line0, line2); - register vector signed short line2B = vec_sub(line0, line2); - register vector signed short line1B = vec_add(line1, line3); - register vector signed short line3B = vec_sub(line1, line3); - register vector signed short line4B = vec_add(line4, line6); - register vector signed short line6B = vec_sub(line4, line6); - register vector signed short line5B = vec_add(line5, line7); - register vector signed short line7B = vec_sub(line5, line7); - - register vector signed short line0C = vec_add(line0B, line4B); - register vector signed short line4C = vec_sub(line0B, line4B); - register vector signed short line1C = vec_add(line1B, line5B); - register vector signed short line5C = vec_sub(line1B, line5B); - register vector signed short line2C = vec_add(line2B, line6B); - register vector signed short line6C = vec_sub(line2B, line6B); - register vector signed short line3C = vec_add(line3B, line7B); - register vector signed short line7C = vec_sub(line3B, line7B); - - vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); - vsum = vec_sum4s(vec_abs(line1C), vsum); - vsum = vec_sum4s(vec_abs(line2C), vsum); - vsum = vec_sum4s(vec_abs(line3C), vsum); - vsum = vec_sum4s(vec_abs(line4C), vsum); - vsum = vec_sum4s(vec_abs(line5C), vsum); - vsum = vec_sum4s(vec_abs(line6C), vsum); - vsum = vec_sum4s(vec_abs(line7C), vsum); - vsum = vec_sums(vsum, (vector signed int)vzero); - vsum = vec_splat(vsum, 3); - vec_ste(vsum, 0, &sum); - } - return sum; -} - -/* -16x8 works with 16 elements; it allows to avoid replicating loads, and -give the compiler more rooms for scheduling. It's only used from -inside hadamard8_diff16_altivec. - -Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has a LOT -of spill code, it seems gcc (unlike xlc) cannot keep everything in registers -by itself. The following code include hand-made registers allocation. It's not -clean, but on a 7450 the resulting code is much faster (best case fall from -700+ cycles to 550). - -xlc doesn't add spill code, but it doesn't know how to schedule for the 7450, -and its code isn't much faster than gcc-3.3 on the 7450 (but uses 25% less -instructions...) - -On the 970, the hand-made RA is still a win (around 690 vs. around 780), but -xlc goes to around 660 on the regular C code... 
-*/ - -static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) { - int sum; - register vector signed short - temp0 __asm__ ("v0"), - temp1 __asm__ ("v1"), - temp2 __asm__ ("v2"), - temp3 __asm__ ("v3"), - temp4 __asm__ ("v4"), - temp5 __asm__ ("v5"), - temp6 __asm__ ("v6"), - temp7 __asm__ ("v7"); - register vector signed short - temp0S __asm__ ("v8"), - temp1S __asm__ ("v9"), - temp2S __asm__ ("v10"), - temp3S __asm__ ("v11"), - temp4S __asm__ ("v12"), - temp5S __asm__ ("v13"), - temp6S __asm__ ("v14"), - temp7S __asm__ ("v15"); - register const vector unsigned char vzero __asm__ ("v31") = - (const vector unsigned char)vec_splat_u8(0); - { - register const vector signed short vprod1 __asm__ ("v16") = - (const vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1 }; - register const vector signed short vprod2 __asm__ ("v17") = - (const vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1 }; - register const vector signed short vprod3 __asm__ ("v18") = - (const vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1 }; - register const vector unsigned char perm1 __asm__ ("v19") = - (const vector unsigned char) - {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, - 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D}; - register const vector unsigned char perm2 __asm__ ("v20") = - (const vector unsigned char) - {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, - 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B}; - register const vector unsigned char perm3 __asm__ ("v21") = - (const vector unsigned char) - {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07}; - -#define ONEITERBUTTERFLY(i, res1, res2) \ - { \ - register vector unsigned char src1 __asm__ ("v22"), \ - src2 __asm__ ("v23"), \ - dst1 __asm__ ("v24"), \ - dst2 __asm__ ("v25"), \ - srcO __asm__ ("v22"), \ - dstO __asm__ ("v23"); \ - \ - register vector signed short srcV __asm__ ("v24"), \ - dstV __asm__ ("v25"), \ - srcW __asm__ ("v26"), \ - dstW __asm__ ("v27"), \ - but0 __asm__ ("v28"), \ - but0S __asm__ ("v29"), \ - op1 __asm__ ("v30"), \ - but1 __asm__ ("v22"), \ - op1S __asm__ ("v23"), \ - but1S __asm__ ("v24"), \ - op2 __asm__ ("v25"), \ - but2 __asm__ ("v26"), \ - op2S __asm__ ("v27"), \ - but2S __asm__ ("v28"), \ - op3 __asm__ ("v29"), \ - op3S __asm__ ("v30"); \ - \ - src1 = vec_ld(stride * i, src); \ - src2 = vec_ld((stride * i) + 16, src); \ - srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ - dst1 = vec_ld(stride * i, dst); \ - dst2 = vec_ld((stride * i) + 16, dst); \ - dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ - /* promote the unsigned chars to signed shorts */ \ - srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \ - (vector signed char)srcO); \ - dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \ - (vector signed char)dstO); \ - srcW = (vector signed short)vec_mergel((vector signed char)vzero, \ - (vector signed char)srcO); \ - dstW = (vector signed short)vec_mergel((vector signed char)vzero, \ - (vector signed char)dstO); \ - /* subtractions inside the first butterfly */ \ - but0 = vec_sub(srcV, dstV); \ - but0S = vec_sub(srcW, dstW); \ - op1 = vec_perm(but0, but0, perm1); \ - but1 = vec_mladd(but0, vprod1, op1); \ - op1S = vec_perm(but0S, but0S, perm1); \ - but1S = vec_mladd(but0S, vprod1, op1S); \ - op2 = vec_perm(but1, but1, perm2); \ - but2 = vec_mladd(but1, vprod2, op2); \ - op2S = vec_perm(but1S, but1S, perm2); \ - but2S = vec_mladd(but1S, vprod2, op2S); \ - op3 = vec_perm(but2, but2, 
perm3); \ - res1 = vec_mladd(but2, vprod3, op3); \ - op3S = vec_perm(but2S, but2S, perm3); \ - res2 = vec_mladd(but2S, vprod3, op3S); \ - } - ONEITERBUTTERFLY(0, temp0, temp0S); - ONEITERBUTTERFLY(1, temp1, temp1S); - ONEITERBUTTERFLY(2, temp2, temp2S); - ONEITERBUTTERFLY(3, temp3, temp3S); - ONEITERBUTTERFLY(4, temp4, temp4S); - ONEITERBUTTERFLY(5, temp5, temp5S); - ONEITERBUTTERFLY(6, temp6, temp6S); - ONEITERBUTTERFLY(7, temp7, temp7S); - } -#undef ONEITERBUTTERFLY - { - register vector signed int vsum; - register vector signed short line0S, line1S, line2S, line3S, line4S, - line5S, line6S, line7S, line0BS,line2BS, - line1BS,line3BS,line4BS,line6BS,line5BS, - line7BS,line0CS,line4CS,line1CS,line5CS, - line2CS,line6CS,line3CS,line7CS; - - register vector signed short line0 = vec_add(temp0, temp1); - register vector signed short line1 = vec_sub(temp0, temp1); - register vector signed short line2 = vec_add(temp2, temp3); - register vector signed short line3 = vec_sub(temp2, temp3); - register vector signed short line4 = vec_add(temp4, temp5); - register vector signed short line5 = vec_sub(temp4, temp5); - register vector signed short line6 = vec_add(temp6, temp7); - register vector signed short line7 = vec_sub(temp6, temp7); - - register vector signed short line0B = vec_add(line0, line2); - register vector signed short line2B = vec_sub(line0, line2); - register vector signed short line1B = vec_add(line1, line3); - register vector signed short line3B = vec_sub(line1, line3); - register vector signed short line4B = vec_add(line4, line6); - register vector signed short line6B = vec_sub(line4, line6); - register vector signed short line5B = vec_add(line5, line7); - register vector signed short line7B = vec_sub(line5, line7); - - register vector signed short line0C = vec_add(line0B, line4B); - register vector signed short line4C = vec_sub(line0B, line4B); - register vector signed short line1C = vec_add(line1B, line5B); - register vector signed short line5C = vec_sub(line1B, line5B); - register vector signed short line2C = vec_add(line2B, line6B); - register vector signed short line6C = vec_sub(line2B, line6B); - register vector signed short line3C = vec_add(line3B, line7B); - register vector signed short line7C = vec_sub(line3B, line7B); - - vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); - vsum = vec_sum4s(vec_abs(line1C), vsum); - vsum = vec_sum4s(vec_abs(line2C), vsum); - vsum = vec_sum4s(vec_abs(line3C), vsum); - vsum = vec_sum4s(vec_abs(line4C), vsum); - vsum = vec_sum4s(vec_abs(line5C), vsum); - vsum = vec_sum4s(vec_abs(line6C), vsum); - vsum = vec_sum4s(vec_abs(line7C), vsum); - - line0S = vec_add(temp0S, temp1S); - line1S = vec_sub(temp0S, temp1S); - line2S = vec_add(temp2S, temp3S); - line3S = vec_sub(temp2S, temp3S); - line4S = vec_add(temp4S, temp5S); - line5S = vec_sub(temp4S, temp5S); - line6S = vec_add(temp6S, temp7S); - line7S = vec_sub(temp6S, temp7S); - - line0BS = vec_add(line0S, line2S); - line2BS = vec_sub(line0S, line2S); - line1BS = vec_add(line1S, line3S); - line3BS = vec_sub(line1S, line3S); - line4BS = vec_add(line4S, line6S); - line6BS = vec_sub(line4S, line6S); - line5BS = vec_add(line5S, line7S); - line7BS = vec_sub(line5S, line7S); - - line0CS = vec_add(line0BS, line4BS); - line4CS = vec_sub(line0BS, line4BS); - line1CS = vec_add(line1BS, line5BS); - line5CS = vec_sub(line1BS, line5BS); - line2CS = vec_add(line2BS, line6BS); - line6CS = vec_sub(line2BS, line6BS); - line3CS = vec_add(line3BS, line7BS); - line7CS = vec_sub(line3BS, line7BS); - - vsum = 
vec_sum4s(vec_abs(line0CS), vsum); - vsum = vec_sum4s(vec_abs(line1CS), vsum); - vsum = vec_sum4s(vec_abs(line2CS), vsum); - vsum = vec_sum4s(vec_abs(line3CS), vsum); - vsum = vec_sum4s(vec_abs(line4CS), vsum); - vsum = vec_sum4s(vec_abs(line5CS), vsum); - vsum = vec_sum4s(vec_abs(line6CS), vsum); - vsum = vec_sum4s(vec_abs(line7CS), vsum); - vsum = vec_sums(vsum, (vector signed int)vzero); - vsum = vec_splat(vsum, 3); - vec_ste(vsum, 0, &sum); - } - return sum; -} - -static int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ - int score; - score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8); - if (h==16) { - dst += 8*stride; - src += 8*stride; - score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8); - } - return score; -} - -av_cold void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx) -{ - const int high_bit_depth = avctx->bits_per_raw_sample > 8; - - c->pix_abs[0][1] = sad16_x2_altivec; - c->pix_abs[0][2] = sad16_y2_altivec; - c->pix_abs[0][3] = sad16_xy2_altivec; - c->pix_abs[0][0] = sad16_altivec; - c->pix_abs[1][0] = sad8_altivec; - c->sad[0]= sad16_altivec; - c->sad[1]= sad8_altivec; - c->pix_norm1 = pix_norm1_altivec; - c->sse[1]= sse8_altivec; - c->sse[0]= sse16_altivec; - c->pix_sum = pix_sum_altivec; - c->diff_pixels = diff_pixels_altivec; - c->add_bytes= add_bytes_altivec; - if (!high_bit_depth) { - c->get_pixels = get_pixels_altivec; - c->clear_block = clear_block_altivec; - } - - c->hadamard8_diff[0] = hadamard8_diff16_altivec; - c->hadamard8_diff[1] = hadamard8_diff8x8_altivec; -} diff --git a/ffmpeg1/libavcodec/ppc/dsputil_altivec.h b/ffmpeg1/libavcodec/ppc/dsputil_altivec.h deleted file mode 100644 index 0e769ab..0000000 --- a/ffmpeg1/libavcodec/ppc/dsputil_altivec.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2002 Brian Foley - * Copyright (c) 2002 Dieter Shirley - * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_PPC_DSPUTIL_ALTIVEC_H -#define AVCODEC_PPC_DSPUTIL_ALTIVEC_H - -#include <stdint.h> -#include "libavcodec/dsputil.h" - -void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); - -void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); - -void ff_fdct_altivec(int16_t *block); -void ff_gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h, - int x16, int y16, int rounder); -void ff_idct_put_altivec(uint8_t *dest, int line_size, int16_t *block); -void ff_idct_add_altivec(uint8_t *dest, int line_size, int16_t *block); - -void ff_dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx); -void ff_float_init_altivec(DSPContext* c, AVCodecContext *avctx); -void ff_int_init_altivec(DSPContext* c, AVCodecContext *avctx); - -#endif /* AVCODEC_PPC_DSPUTIL_ALTIVEC_H */ diff --git a/ffmpeg1/libavcodec/ppc/dsputil_ppc.c b/ffmpeg1/libavcodec/ppc/dsputil_ppc.c deleted file mode 100644 index 6112b0c..0000000 --- a/ffmpeg1/libavcodec/ppc/dsputil_ppc.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) 2002 Brian Foley - * Copyright (c) 2002 Dieter Shirley - * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <string.h> - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/mem.h" -#include "dsputil_altivec.h" - -/* ***** WARNING ***** WARNING ***** WARNING ***** */ -/* -clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a -cache line size not equal to 32 bytes. -Fortunately all processor used by Apple up to at least the 7450 (aka second -generation G4) use 32 bytes cache line. -This is due to the use of the 'dcbz' instruction. It simply clear to zero a -single cache line, so you need to know the cache line size to use it ! -It's absurd, but it's fast... - -update 24/06/2003 : Apple released yesterday the G5, with a PPC970. cache line -size: 128 bytes. Oups. -The semantic of dcbz was changed, it always clear 32 bytes. so the function -below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl, -which is defined to clear a cache line (as dcbz before). So we still can -distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required. 
- -see <http://developer.apple.com/technotes/tn/tn2087.html> -and <http://developer.apple.com/technotes/tn/tn2086.html> -*/ -static void clear_blocks_dcbz32_ppc(int16_t *blocks) -{ - register int misal = ((unsigned long)blocks & 0x00000010); - register int i = 0; - if (misal) { - ((unsigned long*)blocks)[0] = 0L; - ((unsigned long*)blocks)[1] = 0L; - ((unsigned long*)blocks)[2] = 0L; - ((unsigned long*)blocks)[3] = 0L; - i += 16; - } - for ( ; i < sizeof(int16_t)*6*64-31 ; i += 32) { - __asm__ volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory"); - } - if (misal) { - ((unsigned long*)blocks)[188] = 0L; - ((unsigned long*)blocks)[189] = 0L; - ((unsigned long*)blocks)[190] = 0L; - ((unsigned long*)blocks)[191] = 0L; - i += 16; - } -} - -/* same as above, when dcbzl clear a whole 128B cache line - i.e. the PPC970 aka G5 */ -#if HAVE_DCBZL -static void clear_blocks_dcbz128_ppc(int16_t *blocks) -{ - register int misal = ((unsigned long)blocks & 0x0000007f); - register int i = 0; - if (misal) { - // we could probably also optimize this case, - // but there's not much point as the machines - // aren't available yet (2003-06-26) - memset(blocks, 0, sizeof(int16_t)*6*64); - } - else - for ( ; i < sizeof(int16_t)*6*64 ; i += 128) { - __asm__ volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory"); - } -} -#else -static void clear_blocks_dcbz128_ppc(int16_t *blocks) -{ - memset(blocks, 0, sizeof(int16_t)*6*64); -} -#endif - -#if HAVE_DCBZL -/* check dcbz report how many bytes are set to 0 by dcbz */ -/* update 24/06/2003 : replace dcbz by dcbzl to get - the intended effect (Apple "fixed" dcbz) - unfortunately this cannot be used unless the assembler - knows about dcbzl ... */ -static long check_dcbzl_effect(void) -{ - register char *fakedata = av_malloc(1024); - register char *fakedata_middle; - register long zero = 0; - register long i = 0; - long count = 0; - - if (!fakedata) { - return 0L; - } - - fakedata_middle = (fakedata + 512); - - memset(fakedata, 0xFF, 1024); - - /* below the constraint "b" seems to mean "Address base register" - in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... 
*/ - __asm__ volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero)); - - for (i = 0; i < 1024 ; i ++) { - if (fakedata[i] == (char)0) - count++; - } - - av_free(fakedata); - - return count; -} -#else -static long check_dcbzl_effect(void) -{ - return 0; -} -#endif - -av_cold void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx) -{ - const int high_bit_depth = avctx->bits_per_raw_sample > 8; - int mm_flags = av_get_cpu_flags(); - - // Common optimizations whether AltiVec is available or not - if (!high_bit_depth) { - switch (check_dcbzl_effect()) { - case 32: - c->clear_blocks = clear_blocks_dcbz32_ppc; - break; - case 128: - c->clear_blocks = clear_blocks_dcbz128_ppc; - break; - default: - break; - } - } - -#if HAVE_ALTIVEC - if (mm_flags & AV_CPU_FLAG_ALTIVEC) { - ff_dsputil_init_altivec(c, avctx); - ff_int_init_altivec(c, avctx); - c->gmc1 = ff_gmc1_altivec; - -#if CONFIG_ENCODERS - if (avctx->bits_per_raw_sample <= 8 && - (avctx->dct_algo == FF_DCT_AUTO || - avctx->dct_algo == FF_DCT_ALTIVEC)) { - c->fdct = ff_fdct_altivec; - } -#endif //CONFIG_ENCODERS - - if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) { - if ((avctx->idct_algo == FF_IDCT_AUTO) || - (avctx->idct_algo == FF_IDCT_ALTIVEC)) { - c->idct_put = ff_idct_put_altivec; - c->idct_add = ff_idct_add_altivec; - c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; - } - } - - } -#endif /* HAVE_ALTIVEC */ -} diff --git a/ffmpeg1/libavcodec/ppc/fdct_altivec.c b/ffmpeg1/libavcodec/ppc/fdct_altivec.c deleted file mode 100644 index acab127..0000000 --- a/ffmpeg1/libavcodec/ppc/fdct_altivec.c +++ /dev/null @@ -1,462 +0,0 @@ -/* - * Copyright (C) 2003 James Klicman <james@klicman.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#if HAVE_ALTIVEC_H -#include <altivec.h> -#endif -#include "libavutil/common.h" -#include "dsputil_altivec.h" - -#define vs16(v) ((vector signed short)(v)) -#define vs32(v) ((vector signed int)(v)) -#define vu8(v) ((vector unsigned char)(v)) -#define vu16(v) ((vector unsigned short)(v)) -#define vu32(v) ((vector unsigned int)(v)) - - -#define C1 0.98078525066375732421875000 /* cos(1*PI/16) */ -#define C2 0.92387950420379638671875000 /* cos(2*PI/16) */ -#define C3 0.83146959543228149414062500 /* cos(3*PI/16) */ -#define C4 0.70710676908493041992187500 /* cos(4*PI/16) */ -#define C5 0.55557024478912353515625000 /* cos(5*PI/16) */ -#define C6 0.38268342614173889160156250 /* cos(6*PI/16) */ -#define C7 0.19509032368659973144531250 /* cos(7*PI/16) */ -#define SQRT_2 1.41421353816986083984375000 /* sqrt(2) */ - - -#define W0 -(2 * C2) -#define W1 (2 * C6) -#define W2 (SQRT_2 * C6) -#define W3 (SQRT_2 * C3) -#define W4 (SQRT_2 * (-C1 + C3 + C5 - C7)) -#define W5 (SQRT_2 * ( C1 + C3 - C5 + C7)) -#define W6 (SQRT_2 * ( C1 + C3 + C5 - C7)) -#define W7 (SQRT_2 * ( C1 + C3 - C5 - C7)) -#define W8 (SQRT_2 * ( C7 - C3)) -#define W9 (SQRT_2 * (-C1 - C3)) -#define WA (SQRT_2 * (-C3 - C5)) -#define WB (SQRT_2 * ( C5 - C3)) - - -static vector float fdctconsts[3] = { - { W0, W1, W2, W3 }, - { W4, W5, W6, W7 }, - { W8, W9, WA, WB } -}; - -#define LD_W0 vec_splat(cnsts0, 0) -#define LD_W1 vec_splat(cnsts0, 1) -#define LD_W2 vec_splat(cnsts0, 2) -#define LD_W3 vec_splat(cnsts0, 3) -#define LD_W4 vec_splat(cnsts1, 0) -#define LD_W5 vec_splat(cnsts1, 1) -#define LD_W6 vec_splat(cnsts1, 2) -#define LD_W7 vec_splat(cnsts1, 3) -#define LD_W8 vec_splat(cnsts2, 0) -#define LD_W9 vec_splat(cnsts2, 1) -#define LD_WA vec_splat(cnsts2, 2) -#define LD_WB vec_splat(cnsts2, 3) - - -#define FDCTROW(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \ - x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \ - x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \ - x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \ - x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \ - x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \ - x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \ - x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \ - x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \ - \ - b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \ - b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \ - b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \ - b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \ - \ - b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \ - b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \ - b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \ - cnst = LD_W2; \ - b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \ - cnst = LD_W1; \ - b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \ - cnst = LD_W0; \ - b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \ - \ - x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \ - x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \ - x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \ - x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \ - x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \ - cnst = LD_W3; \ - x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \ - \ - cnst = LD_W8; \ - x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \ - cnst = LD_W9; \ - x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \ - cnst = LD_WA; \ - x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \ - cnst = LD_WB; \ - x3 
= vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \ - \ - cnst = LD_W4; \ - b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \ - cnst = LD_W5; \ - b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \ - cnst = LD_W6; \ - b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \ - cnst = LD_W7; \ - b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \ - \ - b7 = vec_add(b7, x2); /* b7 = b7 + x2; */ \ - b5 = vec_add(b5, x3); /* b5 = b5 + x3; */ \ - b3 = vec_add(b3, x2); /* b3 = b3 + x2; */ \ - b1 = vec_add(b1, x3); /* b1 = b1 + x3; */ \ - /* }}} */ - -#define FDCTCOL(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \ - x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \ - x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \ - x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \ - x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \ - x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \ - x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \ - x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \ - x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \ - \ - b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \ - b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \ - b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \ - b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \ - \ - b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \ - b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \ - b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \ - cnst = LD_W2; \ - b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \ - cnst = LD_W1; \ - b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \ - cnst = LD_W0; \ - b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \ - \ - x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \ - x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \ - x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \ - x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \ - x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \ - cnst = LD_W3; \ - x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \ - \ - cnst = LD_W8; \ - x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \ - cnst = LD_W9; \ - x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \ - cnst = LD_WA; \ - x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \ - cnst = LD_WB; \ - x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \ - \ - cnst = LD_W4; \ - b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \ - cnst = LD_W5; \ - b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \ - cnst = LD_W6; \ - b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \ - cnst = LD_W7; \ - b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \ - \ - b7 = vec_add(b7, x2); /* b7 += x2; */ \ - b5 = vec_add(b5, x3); /* b5 += x3; */ \ - b3 = vec_add(b3, x2); /* b3 += x2; */ \ - b1 = vec_add(b1, x3); /* b1 += x3; */ \ - /* }}} */ - - - -/* two dimensional discrete cosine transform */ - -void ff_fdct_altivec(int16_t *block) -{ - vector signed short *bp; - vector float *cp; - vector float b00, b10, b20, b30, b40, b50, b60, b70; - vector float b01, b11, b21, b31, b41, b51, b61, b71; - vector float mzero, cnst, cnsts0, cnsts1, cnsts2; - vector float x0, x1, x2, x3, x4, x5, x6, x7, x8; - - /* setup constants {{{ */ - /* mzero = -0.0 */ - mzero = ((vector float)vec_splat_u32(-1)); - mzero = ((vector float)vec_sl(vu32(mzero), vu32(mzero))); - cp = fdctconsts; - cnsts0 = vec_ld(0, cp); cp++; - cnsts1 = vec_ld(0, cp); cp++; - cnsts2 = vec_ld(0, cp); - /* }}} */ - - - /* 8x8 matrix transpose (vector short[8]) {{{ */ -#define MERGE_S16(hl,a,b) vec_merge##hl(vs16(a), vs16(b)) - - bp = (vector signed short*)block; - b00 = ((vector float)vec_ld(0, bp)); - b40 = ((vector float)vec_ld(16*4, bp)); - b01 = ((vector float)MERGE_S16(h, b00, b40)); - 
b11 = ((vector float)MERGE_S16(l, b00, b40)); - bp++; - b10 = ((vector float)vec_ld(0, bp)); - b50 = ((vector float)vec_ld(16*4, bp)); - b21 = ((vector float)MERGE_S16(h, b10, b50)); - b31 = ((vector float)MERGE_S16(l, b10, b50)); - bp++; - b20 = ((vector float)vec_ld(0, bp)); - b60 = ((vector float)vec_ld(16*4, bp)); - b41 = ((vector float)MERGE_S16(h, b20, b60)); - b51 = ((vector float)MERGE_S16(l, b20, b60)); - bp++; - b30 = ((vector float)vec_ld(0, bp)); - b70 = ((vector float)vec_ld(16*4, bp)); - b61 = ((vector float)MERGE_S16(h, b30, b70)); - b71 = ((vector float)MERGE_S16(l, b30, b70)); - - x0 = ((vector float)MERGE_S16(h, b01, b41)); - x1 = ((vector float)MERGE_S16(l, b01, b41)); - x2 = ((vector float)MERGE_S16(h, b11, b51)); - x3 = ((vector float)MERGE_S16(l, b11, b51)); - x4 = ((vector float)MERGE_S16(h, b21, b61)); - x5 = ((vector float)MERGE_S16(l, b21, b61)); - x6 = ((vector float)MERGE_S16(h, b31, b71)); - x7 = ((vector float)MERGE_S16(l, b31, b71)); - - b00 = ((vector float)MERGE_S16(h, x0, x4)); - b10 = ((vector float)MERGE_S16(l, x0, x4)); - b20 = ((vector float)MERGE_S16(h, x1, x5)); - b30 = ((vector float)MERGE_S16(l, x1, x5)); - b40 = ((vector float)MERGE_S16(h, x2, x6)); - b50 = ((vector float)MERGE_S16(l, x2, x6)); - b60 = ((vector float)MERGE_S16(h, x3, x7)); - b70 = ((vector float)MERGE_S16(l, x3, x7)); - -#undef MERGE_S16 - /* }}} */ - - -/* Some of the initial calculations can be done as vector short before - * conversion to vector float. The following code section takes advantage - * of this. - */ - /* fdct rows {{{ */ - x0 = ((vector float)vec_add(vs16(b00), vs16(b70))); - x7 = ((vector float)vec_sub(vs16(b00), vs16(b70))); - x1 = ((vector float)vec_add(vs16(b10), vs16(b60))); - x6 = ((vector float)vec_sub(vs16(b10), vs16(b60))); - x2 = ((vector float)vec_add(vs16(b20), vs16(b50))); - x5 = ((vector float)vec_sub(vs16(b20), vs16(b50))); - x3 = ((vector float)vec_add(vs16(b30), vs16(b40))); - x4 = ((vector float)vec_sub(vs16(b30), vs16(b40))); - - b70 = ((vector float)vec_add(vs16(x0), vs16(x3))); - b10 = ((vector float)vec_add(vs16(x1), vs16(x2))); - - b00 = ((vector float)vec_add(vs16(b70), vs16(b10))); - b40 = ((vector float)vec_sub(vs16(b70), vs16(b10))); - -#define CTF0(n) \ - b##n##1 = ((vector float)vec_unpackl(vs16(b##n##0))); \ - b##n##0 = ((vector float)vec_unpackh(vs16(b##n##0))); \ - b##n##1 = vec_ctf(vs32(b##n##1), 0); \ - b##n##0 = vec_ctf(vs32(b##n##0), 0); - - CTF0(0); - CTF0(4); - - b20 = ((vector float)vec_sub(vs16(x0), vs16(x3))); - b60 = ((vector float)vec_sub(vs16(x1), vs16(x2))); - - CTF0(2); - CTF0(6); - -#undef CTF0 - - x0 = vec_add(b60, b20); - x1 = vec_add(b61, b21); - - cnst = LD_W2; - x0 = vec_madd(cnst, x0, mzero); - x1 = vec_madd(cnst, x1, mzero); - cnst = LD_W1; - b20 = vec_madd(cnst, b20, x0); - b21 = vec_madd(cnst, b21, x1); - cnst = LD_W0; - b60 = vec_madd(cnst, b60, x0); - b61 = vec_madd(cnst, b61, x1); - -#define CTFX(x,b) \ - b##0 = ((vector float)vec_unpackh(vs16(x))); \ - b##1 = ((vector float)vec_unpackl(vs16(x))); \ - b##0 = vec_ctf(vs32(b##0), 0); \ - b##1 = vec_ctf(vs32(b##1), 0); \ - - CTFX(x4, b7); - CTFX(x5, b5); - CTFX(x6, b3); - CTFX(x7, b1); - -#undef CTFX - - - x0 = vec_add(b70, b10); - x1 = vec_add(b50, b30); - x2 = vec_add(b70, b30); - x3 = vec_add(b50, b10); - x8 = vec_add(x2, x3); - cnst = LD_W3; - x8 = vec_madd(cnst, x8, mzero); - - cnst = LD_W8; - x0 = vec_madd(cnst, x0, mzero); - cnst = LD_W9; - x1 = vec_madd(cnst, x1, mzero); - cnst = LD_WA; - x2 = vec_madd(cnst, x2, x8); - cnst = LD_WB; - x3 = 
vec_madd(cnst, x3, x8); - - cnst = LD_W4; - b70 = vec_madd(cnst, b70, x0); - cnst = LD_W5; - b50 = vec_madd(cnst, b50, x1); - cnst = LD_W6; - b30 = vec_madd(cnst, b30, x1); - cnst = LD_W7; - b10 = vec_madd(cnst, b10, x0); - - b70 = vec_add(b70, x2); - b50 = vec_add(b50, x3); - b30 = vec_add(b30, x2); - b10 = vec_add(b10, x3); - - - x0 = vec_add(b71, b11); - x1 = vec_add(b51, b31); - x2 = vec_add(b71, b31); - x3 = vec_add(b51, b11); - x8 = vec_add(x2, x3); - cnst = LD_W3; - x8 = vec_madd(cnst, x8, mzero); - - cnst = LD_W8; - x0 = vec_madd(cnst, x0, mzero); - cnst = LD_W9; - x1 = vec_madd(cnst, x1, mzero); - cnst = LD_WA; - x2 = vec_madd(cnst, x2, x8); - cnst = LD_WB; - x3 = vec_madd(cnst, x3, x8); - - cnst = LD_W4; - b71 = vec_madd(cnst, b71, x0); - cnst = LD_W5; - b51 = vec_madd(cnst, b51, x1); - cnst = LD_W6; - b31 = vec_madd(cnst, b31, x1); - cnst = LD_W7; - b11 = vec_madd(cnst, b11, x0); - - b71 = vec_add(b71, x2); - b51 = vec_add(b51, x3); - b31 = vec_add(b31, x2); - b11 = vec_add(b11, x3); - /* }}} */ - - - /* 8x8 matrix transpose (vector float[8][2]) {{{ */ - x0 = vec_mergel(b00, b20); - x1 = vec_mergeh(b00, b20); - x2 = vec_mergel(b10, b30); - x3 = vec_mergeh(b10, b30); - - b00 = vec_mergeh(x1, x3); - b10 = vec_mergel(x1, x3); - b20 = vec_mergeh(x0, x2); - b30 = vec_mergel(x0, x2); - - x4 = vec_mergel(b41, b61); - x5 = vec_mergeh(b41, b61); - x6 = vec_mergel(b51, b71); - x7 = vec_mergeh(b51, b71); - - b41 = vec_mergeh(x5, x7); - b51 = vec_mergel(x5, x7); - b61 = vec_mergeh(x4, x6); - b71 = vec_mergel(x4, x6); - - x0 = vec_mergel(b01, b21); - x1 = vec_mergeh(b01, b21); - x2 = vec_mergel(b11, b31); - x3 = vec_mergeh(b11, b31); - - x4 = vec_mergel(b40, b60); - x5 = vec_mergeh(b40, b60); - x6 = vec_mergel(b50, b70); - x7 = vec_mergeh(b50, b70); - - b40 = vec_mergeh(x1, x3); - b50 = vec_mergel(x1, x3); - b60 = vec_mergeh(x0, x2); - b70 = vec_mergel(x0, x2); - - b01 = vec_mergeh(x5, x7); - b11 = vec_mergel(x5, x7); - b21 = vec_mergeh(x4, x6); - b31 = vec_mergel(x4, x6); - /* }}} */ - - - FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70); - FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71); - - - /* round, convert back to short {{{ */ -#define CTS(n) \ - b##n##0 = vec_round(b##n##0); \ - b##n##1 = vec_round(b##n##1); \ - b##n##0 = ((vector float)vec_cts(b##n##0, 0)); \ - b##n##1 = ((vector float)vec_cts(b##n##1, 0)); \ - b##n##0 = ((vector float)vec_pack(vs32(b##n##0), vs32(b##n##1))); \ - vec_st(vs16(b##n##0), 0, bp); - - bp = (vector signed short*)block; - CTS(0); bp++; - CTS(1); bp++; - CTS(2); bp++; - CTS(3); bp++; - CTS(4); bp++; - CTS(5); bp++; - CTS(6); bp++; - CTS(7); - -#undef CTS - /* }}} */ -} - -/* vim:set foldmethod=marker foldlevel=0: */ diff --git a/ffmpeg1/libavcodec/ppc/fft_altivec.c b/ffmpeg1/libavcodec/ppc/fft_altivec.c deleted file mode 100644 index 651ee26..0000000 --- a/ffmpeg1/libavcodec/ppc/fft_altivec.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * FFT/IFFT transforms - * AltiVec-enabled - * Copyright (c) 2009 Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/fft.h" - -/** - * Do a complex FFT with the parameters defined in ff_fft_init(). The - * input data must be permuted before with s->revtab table. No - * 1.0/sqrt(n) normalization is done. - * AltiVec-enabled - * This code assumes that the 'z' pointer is 16 bytes-aligned - * It also assumes all FFTComplex are 8 bytes-aligned pair of float - */ - -void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z); -void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z); - -#if HAVE_GNU_AS -static void ff_imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input) -{ - int j, k; - int n = 1 << s->mdct_bits; - int n4 = n >> 2; - int n8 = n >> 3; - int n32 = n >> 5; - const uint16_t *revtabj = s->revtab; - const uint16_t *revtabk = s->revtab+n4; - const vec_f *tcos = (const vec_f*)(s->tcos+n8); - const vec_f *tsin = (const vec_f*)(s->tsin+n8); - const vec_f *pin = (const vec_f*)(input+n4); - vec_f *pout = (vec_f*)(output+n4); - - /* pre rotation */ - k = n32-1; - do { - vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d; -#define CMULA(p,o0,o1,o2,o3)\ - a = pin[ k*2+p]; /* { z[k].re, z[k].im, z[k+1].re, z[k+1].im } */\ - b = pin[-k*2-p-1]; /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\ - re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } */\ - im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } */\ - cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\ - sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\ - r##p = im*cos - re*sin;\ - i##p = re*cos + im*sin; -#define STORE2(v,dst)\ - j = dst;\ - vec_ste(v, 0, output+j*2);\ - vec_ste(v, 4, output+j*2); -#define STORE8(p)\ - a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\ - b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\ - c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\ - d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\ - STORE2(a, revtabk[ p*2-4]);\ - STORE2(b, revtabk[ p*2-3]);\ - STORE2(c, revtabj[-p*2+2]);\ - STORE2(d, revtabj[-p*2+3]); - - cos0 = tcos[k]; - sin0 = tsin[k]; - cos1 = tcos[-k-1]; - sin1 = tsin[-k-1]; - CMULA(0, 0,1,2,3); - CMULA(1, 2,3,0,1); - STORE8(0); - STORE8(1); - revtabj += 4; - revtabk -= 4; - k--; - } while(k >= 0); - - ff_fft_calc_altivec(s, (FFTComplex*)output); - - /* post rotation + reordering */ - j = -n32; - k = n32-1; - do { - vec_f cos,sin,re,im,a,b,c,d; -#define CMULB(d0,d1,o)\ - re = pout[o*2];\ - im = pout[o*2+1];\ - cos = tcos[o];\ - sin = tsin[o];\ - d0 = im*sin - re*cos;\ - d1 = re*sin + im*cos; - - CMULB(a,b,j); - CMULB(c,d,k); - pout[2*j] = vec_perm(a, d, vcprm(0,s3,1,s2)); - pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0)); - pout[2*k] = vec_perm(c, b, vcprm(0,s3,1,s2)); - pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0)); - j++; - k--; - } while(k >= 0); -} - -static void ff_imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input) -{ - int k; - int n = 1 << s->mdct_bits; - int n4 = n >> 2; - int n16 = n >> 4; - vec_u32 sign = {1U<<31,1U<<31,1U<<31,1U<<31}; - vec_u32 *p0 = (vec_u32*)(output+n4); - vec_u32 *p1 = (vec_u32*)(output+n4*3); - - ff_imdct_half_altivec(s, output+n4, input); - - for (k = 0; k < n16; k++) { 
- vec_u32 a = p0[k] ^ sign; - vec_u32 b = p1[-k-1]; - p0[-k-1] = vec_perm(a, a, vcprm(3,2,1,0)); - p1[k] = vec_perm(b, b, vcprm(3,2,1,0)); - } -} -#endif /* HAVE_GNU_AS */ - -av_cold void ff_fft_init_altivec(FFTContext *s) -{ -#if HAVE_GNU_AS - s->fft_calc = ff_fft_calc_interleave_altivec; - if (s->mdct_bits >= 5) { - s->imdct_calc = ff_imdct_calc_altivec; - s->imdct_half = ff_imdct_half_altivec; - } -#endif -} diff --git a/ffmpeg1/libavcodec/ppc/fft_altivec_s.S b/ffmpeg1/libavcodec/ppc/fft_altivec_s.S deleted file mode 100644 index 16ce838..0000000 --- a/ffmpeg1/libavcodec/ppc/fft_altivec_s.S +++ /dev/null @@ -1,449 +0,0 @@ -/* - * FFT transform with Altivec optimizations - * Copyright (c) 2009 Loren Merritt - * - * This algorithm (though not any of the implementation details) is - * based on libdjbfft by D. J. Bernstein. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* - * These functions are not individually interchangeable with the C versions. - * While C takes arrays of FFTComplex, Altivec leaves intermediate results - * in blocks as convenient to the vector size. - * i.e. {4x real, 4x imaginary, 4x real, ...} - * - * I ignore standard calling convention. - * Instead, the following registers are treated as global constants: - * v14: zero - * v15..v18: cosines - * v19..v29: permutations - * r9: 16 - * r12: ff_cos_tabs - * and the rest are free for local use. 
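The comment above spells out the non-standard conventions of this assembly: a few registers hold global constants, and intermediate results are kept not as an array of FFTComplex pairs but in blocks of four reals followed by four imaginaries, which is what one 128-bit vector register holds naturally. Purely as an illustration (the helper name and struct are mine, not part of the source), the repacking between the two layouts looks like this:

#include <stddef.h>

typedef struct { float re, im; } cplx_ref;   /* stand-in for FFTComplex */

/* Repack an array-of-structures buffer into the blocked layout described
 * above: {4x real, 4x imaginary, 4x real, 4x imaginary, ...}.
 * Assumes n is a multiple of 4. */
static void aos_to_blocked4(const cplx_ref *z, float *blk, size_t n)
{
    for (size_t base = 0; base < n; base += 4) {
        for (size_t i = 0; i < 4; i++) {
            blk[2 * base + i]     = z[base + i].re;
            blk[2 * base + 4 + i] = z[base + i].im;
        }
    }
}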
- */ - -#include "config.h" -#include "asm.S" - -.text - -.macro addi2 ra, imm // add 32-bit immediate -.if \imm & 0xffff - addi \ra, \ra, \imm@l -.endif -.if (\imm+0x8000)>>16 - addis \ra, \ra, \imm@ha -.endif -.endm - -.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3 - vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2} - vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3} - vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5} - vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7} - vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4} - vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8} - vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1} - vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3} - vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3} - vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3} -.endm - -.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3 - vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2} - vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3} - vperm \b2,\b0,\b1,v20 - vperm \b3,\b0,\b1,v21 - vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5} - vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7} - vaddfp \b0,\b2,\b3 - vsubfp \b1,\b2,\b3 - vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4} - vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8} - vmrghw \b2,\b0,\b1 - vperm \b3,\b0,\b1,v22 - vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1} - vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3} - vaddfp \b0,\b2,\b3 - vsubfp \b1,\b2,\b3 - vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3} - vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3} - vperm \b2,\b0,\b1,v23 - vperm \b3,\b0,\b1,v24 -.endm - -.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1 - vmrghw \b2,\b0,\b1 // vcprm(0,s0,1,s1) // {r4,r6,i4,i6} - vmrglw \b3,\b0,\b1 // vcprm(2,s2,3,s3) // {r5,r7,i5,i7} - vperm \a2,\a0,\a1,v20 // FFT4 ... 
- vperm \a3,\a0,\a1,v21 - vaddfp \b0,\b2,\b3 // {t1,t3,t2,t4} - vsubfp \b1,\b2,\b3 // {r5,r7,i5,i7} - vperm \b4,\b1,\b1,v25 // vcprm(2,3,0,1) // {i5,i7,r5,r7} - vaddfp \a0,\a2,\a3 - vsubfp \a1,\a2,\a3 - vmaddfp \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2) - vmaddfp \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9} - vmrghw \a2,\a0,\a1 - vperm \a3,\a0,\a1,v22 - vperm \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8} - vperm \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta} - vaddfp \a0,\a2,\a3 - vsubfp \a1,\a2,\a3 - vaddfp \b0,\b2,\b3 // {t1,t2,t9,ta} - vsubfp \b1,\b2,\b3 // {t6,t5,tc,tb} - vperm \a2,\a0,\a1,v23 - vperm \a3,\a0,\a1,v24 - vperm \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb} - vperm \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc} - vsubfp \b0,\a2,\b2 // {r4,r5,r6,r7} - vsubfp \b1,\a3,\b3 // {i4,i5,i6,i7} - vaddfp \a0,\a2,\b2 // {r0,r1,r2,r3} - vaddfp \a1,\a3,\b3 // {i0,i1,i2,i3} -.endm - -.macro BF d0,d1,s0,s1 - vsubfp \d1,\s0,\s1 - vaddfp \d0,\s0,\s1 -.endm - -.macro zip d0,d1,s0,s1 - vmrghw \d0,\s0,\s1 - vmrglw \d1,\s0,\s1 -.endm - -.macro def_fft4 interleave -fft4\interleave\()_altivec: - lvx v0, 0,r3 - lvx v1,r9,r3 - FFT4 v0,v1,v2,v3 -.ifnb \interleave - zip v0,v1,v2,v3 - stvx v0, 0,r3 - stvx v1,r9,r3 -.else - stvx v2, 0,r3 - stvx v3,r9,r3 -.endif - blr -.endm - -.macro def_fft8 interleave -fft8\interleave\()_altivec: - addi r4,r3,32 - lvx v0, 0,r3 - lvx v1,r9,r3 - lvx v2, 0,r4 - lvx v3,r9,r4 - FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8 -.ifnb \interleave - zip v4,v5,v0,v1 - zip v6,v7,v2,v3 - stvx v4, 0,r3 - stvx v5,r9,r3 - stvx v6, 0,r4 - stvx v7,r9,r4 -.else - stvx v0, 0,r3 - stvx v1,r9,r3 - stvx v2, 0,r4 - stvx v3,r9,r4 -.endif - blr -.endm - -.macro def_fft16 interleave -fft16\interleave\()_altivec: - addi r5,r3,64 - addi r6,r3,96 - addi r4,r3,32 - lvx v0, 0,r5 - lvx v1,r9,r5 - lvx v2, 0,r6 - lvx v3,r9,r6 - FFT4x2 v0,v1,v2,v3,v4,v5,v6,v7 - lvx v0, 0,r3 - lvx v1,r9,r3 - lvx v2, 0,r4 - lvx v3,r9,r4 - FFT8 v0,v1,v2,v3,v8,v9,v10,v11,v12 - vmaddfp v8,v4,v15,v14 // r2*wre - vmaddfp v9,v5,v15,v14 // i2*wre - vmaddfp v10,v6,v15,v14 // r3*wre - vmaddfp v11,v7,v15,v14 // i3*wre - vmaddfp v8,v5,v16,v8 // i2*wim - vnmsubfp v9,v4,v16,v9 // r2*wim - vnmsubfp v10,v7,v16,v10 // i3*wim - vmaddfp v11,v6,v16,v11 // r3*wim - BF v10,v12,v10,v8 - BF v11,v13,v9,v11 - BF v0,v4,v0,v10 - BF v3,v7,v3,v12 - BF v1,v5,v1,v11 - BF v2,v6,v2,v13 -.ifnb \interleave - zip v8, v9,v0,v1 - zip v10,v11,v2,v3 - zip v12,v13,v4,v5 - zip v14,v15,v6,v7 - stvx v8, 0,r3 - stvx v9,r9,r3 - stvx v10, 0,r4 - stvx v11,r9,r4 - stvx v12, 0,r5 - stvx v13,r9,r5 - stvx v14, 0,r6 - stvx v15,r9,r6 -.else - stvx v0, 0,r3 - stvx v4, 0,r5 - stvx v3,r9,r4 - stvx v7,r9,r6 - stvx v1,r9,r3 - stvx v5,r9,r5 - stvx v2, 0,r4 - stvx v6, 0,r6 -.endif - blr -.endm - -// void pass(float *z, float *wre, int n) -.macro PASS interleave, suffix -fft_pass\suffix\()_altivec: - mtctr r5 - slwi r0,r5,4 - slwi r7,r5,6 // o2 - slwi r5,r5,5 // o1 - add r10,r5,r7 // o3 - add r0,r4,r0 // wim - addi r6,r5,16 // o1+16 - addi r8,r7,16 // o2+16 - addi r11,r10,16 // o3+16 -1: - lvx v8, 0,r4 // wre - lvx v10, 0,r0 // wim - sub r0,r0,r9 - lvx v9, 0,r0 - vperm v9,v9,v10,v19 // vcprm(s0,3,2,1) => wim[0 .. 
-3] - lvx v4,r3,r7 // r2 = z[o2] - lvx v5,r3,r8 // i2 = z[o2+16] - lvx v6,r3,r10 // r3 = z[o3] - lvx v7,r3,r11 // i3 = z[o3+16] - vmaddfp v10,v4,v8,v14 // r2*wre - vmaddfp v11,v5,v8,v14 // i2*wre - vmaddfp v12,v6,v8,v14 // r3*wre - vmaddfp v13,v7,v8,v14 // i3*wre - lvx v0, 0,r3 // r0 = z[0] - lvx v3,r3,r6 // i1 = z[o1+16] - vmaddfp v10,v5,v9,v10 // i2*wim - vnmsubfp v11,v4,v9,v11 // r2*wim - vnmsubfp v12,v7,v9,v12 // i3*wim - vmaddfp v13,v6,v9,v13 // r3*wim - lvx v1,r3,r9 // i0 = z[16] - lvx v2,r3,r5 // r1 = z[o1] - BF v12,v8,v12,v10 - BF v13,v9,v11,v13 - BF v0,v4,v0,v12 - BF v3,v7,v3,v8 -.if !\interleave - stvx v0, 0,r3 - stvx v4,r3,r7 - stvx v3,r3,r6 - stvx v7,r3,r11 -.endif - BF v1,v5,v1,v13 - BF v2,v6,v2,v9 -.if !\interleave - stvx v1,r3,r9 - stvx v2,r3,r5 - stvx v5,r3,r8 - stvx v6,r3,r10 -.else - vmrghw v8,v0,v1 - vmrglw v9,v0,v1 - stvx v8, 0,r3 - stvx v9,r3,r9 - vmrghw v8,v2,v3 - vmrglw v9,v2,v3 - stvx v8,r3,r5 - stvx v9,r3,r6 - vmrghw v8,v4,v5 - vmrglw v9,v4,v5 - stvx v8,r3,r7 - stvx v9,r3,r8 - vmrghw v8,v6,v7 - vmrglw v9,v6,v7 - stvx v8,r3,r10 - stvx v9,r3,r11 -.endif - addi r3,r3,32 - addi r4,r4,16 - bdnz 1b - sub r3,r3,r5 - blr -.endm - -#define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */ - -#define WORD_0 0x00,0x01,0x02,0x03 -#define WORD_1 0x04,0x05,0x06,0x07 -#define WORD_2 0x08,0x09,0x0a,0x0b -#define WORD_3 0x0c,0x0d,0x0e,0x0f -#define WORD_s0 0x10,0x11,0x12,0x13 -#define WORD_s1 0x14,0x15,0x16,0x17 -#define WORD_s2 0x18,0x19,0x1a,0x1b -#define WORD_s3 0x1c,0x1d,0x1e,0x1f - -#define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d - - .rodata - .align 4 -fft_data: - .float 0, 0, 0, 0 - .float 1, 0.92387953, M_SQRT1_2, 0.38268343 - .float 0, 0.38268343, M_SQRT1_2, 0.92387953 - .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2 - .float M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 - vcprm(s0,3,2,1) - vcprm(0,1,s2,s1) - vcprm(2,3,s0,s3) - vcprm(2,s3,3,s2) - vcprm(0,1,s0,s1) - vcprm(2,3,s2,s3) - vcprm(2,3,0,1) - vcprm(1,2,s3,s0) - vcprm(0,3,s2,s1) - vcprm(0,2,s1,s3) - vcprm(1,3,s0,s2) - -.macro lvm b, r, regs:vararg - lvx \r, 0, \b - addi \b, \b, 16 - .ifnb \regs - lvm \b, \regs - .endif -.endm - -.macro stvm b, r, regs:vararg - stvx \r, 0, \b - addi \b, \b, 16 - .ifnb \regs - stvm \b, \regs - .endif -.endm - -.macro fft_calc interleave -extfunc ff_fft_calc\interleave\()_altivec - mflr r0 - stp r0, 2*PS(r1) - stpu r1, -(160+16*PS)(r1) - get_got r11 - addi r6, r1, 16*PS - stvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 - mfvrsave r0 - stw r0, 15*PS(r1) - li r6, 0xfffffffc - mtvrsave r6 - - movrel r6, fft_data, r11 - lvm r6, v14, v15, v16, v17, v18, v19, v20, v21 - lvm r6, v22, v23, v24, v25, v26, v27, v28, v29 - - li r9, 16 - movrel r12, X(ff_cos_tabs), r11 - - movrel r6, fft_dispatch_tab\interleave\()_altivec, r11 - lwz r3, 0(r3) - subi r3, r3, 2 - slwi r3, r3, 2+ARCH_PPC64 - lpx r3, r3, r6 - mtctr r3 - mr r3, r4 - bctrl - - addi r6, r1, 16*PS - lvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 - lwz r6, 15*PS(r1) - mtvrsave r6 - lp r1, 0(r1) - lp r0, 2*PS(r1) - mtlr r0 - blr -.endm - -.macro DECL_FFT suffix, bits, n, n2, n4 -fft\n\suffix\()_altivec: - mflr r0 - stp r0,PS*(\bits-3)(r1) - bl fft\n2\()_altivec - addi2 r3,\n*4 - bl fft\n4\()_altivec - addi2 r3,\n*2 - bl fft\n4\()_altivec - addi2 r3,\n*-6 - lp r0,PS*(\bits-3)(r1) - lp r4,\bits*PS(r12) - mtlr r0 - li r5,\n/16 - b fft_pass\suffix\()_altivec -.endm - -.macro DECL_FFTS interleave, suffix - .text - def_fft4 \suffix - def_fft8 \suffix - def_fft16 \suffix - PASS \interleave, \suffix - 
DECL_FFT \suffix, 5, 32, 16, 8 - DECL_FFT \suffix, 6, 64, 32, 16 - DECL_FFT \suffix, 7, 128, 64, 32 - DECL_FFT \suffix, 8, 256, 128, 64 - DECL_FFT \suffix, 9, 512, 256, 128 - DECL_FFT \suffix,10, 1024, 512, 256 - DECL_FFT \suffix,11, 2048, 1024, 512 - DECL_FFT \suffix,12, 4096, 2048, 1024 - DECL_FFT \suffix,13, 8192, 4096, 2048 - DECL_FFT \suffix,14,16384, 8192, 4096 - DECL_FFT \suffix,15,32768,16384, 8192 - DECL_FFT \suffix,16,65536,32768,16384 - - fft_calc \suffix - - .rodata - .align 3 -fft_dispatch_tab\suffix\()_altivec: - PTR fft4\suffix\()_altivec - PTR fft8\suffix\()_altivec - PTR fft16\suffix\()_altivec - PTR fft32\suffix\()_altivec - PTR fft64\suffix\()_altivec - PTR fft128\suffix\()_altivec - PTR fft256\suffix\()_altivec - PTR fft512\suffix\()_altivec - PTR fft1024\suffix\()_altivec - PTR fft2048\suffix\()_altivec - PTR fft4096\suffix\()_altivec - PTR fft8192\suffix\()_altivec - PTR fft16384\suffix\()_altivec - PTR fft32768\suffix\()_altivec - PTR fft65536\suffix\()_altivec -.endm - -DECL_FFTS 0 -DECL_FFTS 1, _interleave diff --git a/ffmpeg1/libavcodec/ppc/fmtconvert_altivec.c b/ffmpeg1/libavcodec/ppc/fmtconvert_altivec.c deleted file mode 100644 index b29c7d4..0000000 --- a/ffmpeg1/libavcodec/ppc/fmtconvert_altivec.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
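Each DECL_FFT expansion above builds an N-point transform from one N/2-point call, two N/4-point calls and a combining fft_pass, and fft_calc picks the entry point out of fft_dispatch_tab by the log2 of the transform size. For orientation only, here is a plain scalar radix-2 FFT showing the butterfly-and-twiddle structure such passes implement; the assembly actually uses a split-radix scheme with fft4/fft8/fft16 base cases and the blocked data layout, so this is not a drop-in equivalent:

#include <complex.h>
#include <math.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/* Reference-only recursive radix-2 decimation-in-time FFT. */
static void fft_ref(float complex *z, int n)
{
    if (n < 2)
        return;
    float complex even[n / 2], odd[n / 2];
    for (int i = 0; i < n / 2; i++) {
        even[i] = z[2 * i];
        odd[i]  = z[2 * i + 1];
    }
    fft_ref(even, n / 2);
    fft_ref(odd,  n / 2);
    for (int k = 0; k < n / 2; k++) {
        float complex w = cexpf(-2.0f * (float)(M_PI * k / n) * I);
        z[k]         = even[k] + w * odd[k];
        z[k + n / 2] = even[k] - w * odd[k];
    }
}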
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/fmtconvert.h" - -#include "libavutil/ppc/util_altivec.h" -#include "libavutil/attributes.h" -#include "libavutil/mem.h" -#include "dsputil_altivec.h" - -static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, - float mul, int len) -{ - union { - vector float v; - float s[4]; - } mul_u; - int i; - vector float src1, src2, dst1, dst2, mul_v, zero; - - zero = (vector float)vec_splat_u32(0); - mul_u.s[0] = mul; - mul_v = vec_splat(mul_u.v, 0); - - for (i = 0; i < len; i += 8) { - src1 = vec_ctf(vec_ld(0, src+i), 0); - src2 = vec_ctf(vec_ld(16, src+i), 0); - dst1 = vec_madd(src1, mul_v, zero); - dst2 = vec_madd(src2, mul_v, zero); - vec_st(dst1, 0, dst+i); - vec_st(dst2, 16, dst+i); - } -} - - -static vector signed short float_to_int16_one_altivec(const float *src) -{ - vector float s0 = vec_ld(0, src); - vector float s1 = vec_ld(16, src); - vector signed int t0 = vec_cts(s0, 0); - vector signed int t1 = vec_cts(s1, 0); - return vec_packs(t0,t1); -} - -static void float_to_int16_altivec(int16_t *dst, const float *src, long len) -{ - int i; - vector signed short d0, d1, d; - vector unsigned char align; - if (((long)dst) & 15) { //FIXME - for (i = 0; i < len - 7; i += 8) { - d0 = vec_ld(0, dst+i); - d = float_to_int16_one_altivec(src + i); - d1 = vec_ld(15, dst+i); - d1 = vec_perm(d1, d0, vec_lvsl(0, dst + i)); - align = vec_lvsr(0, dst + i); - d0 = vec_perm(d1, d, align); - d1 = vec_perm(d, d1, align); - vec_st(d0, 0, dst + i); - vec_st(d1, 15, dst + i); - } - } else { - for (i = 0; i < len - 7; i += 8) { - d = float_to_int16_one_altivec(src + i); - vec_st(d, 0, dst + i); - } - } -} - -#define VSTE_INC(dst, v, elem, inc) do { \ - vector signed short s = vec_splat(v, elem); \ - vec_ste(s, 0, dst); \ - dst += inc; \ - } while (0) - -static void float_to_int16_stride_altivec(int16_t *dst, const float *src, - long len, int stride) -{ - int i; - vector signed short d; - - for (i = 0; i < len - 7; i += 8) { - d = float_to_int16_one_altivec(src + i); - VSTE_INC(dst, d, 0, stride); - VSTE_INC(dst, d, 1, stride); - VSTE_INC(dst, d, 2, stride); - VSTE_INC(dst, d, 3, stride); - VSTE_INC(dst, d, 4, stride); - VSTE_INC(dst, d, 5, stride); - VSTE_INC(dst, d, 6, stride); - VSTE_INC(dst, d, 7, stride); - } -} - -static void float_to_int16_interleave_altivec(int16_t *dst, const float **src, - long len, int channels) -{ - int i; - vector signed short d0, d1, d2, c0, c1, t0, t1; - vector unsigned char align; - - if (channels == 1) - float_to_int16_altivec(dst, src[0], len); - else { - if (channels == 2) { - if (((long)dst) & 15) { - for (i = 0; i < len - 7; i += 8) { - d0 = vec_ld(0, dst + i); - t0 = float_to_int16_one_altivec(src[0] + i); - d1 = vec_ld(31, dst + i); - t1 = float_to_int16_one_altivec(src[1] + i); - c0 = vec_mergeh(t0, t1); - c1 = vec_mergel(t0, t1); - d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i)); - align = vec_lvsr(0, dst + i); - d0 = vec_perm(d2, c0, align); - d1 = vec_perm(c0, c1, align); - vec_st(d0, 0, dst + i); - d0 = vec_perm(c1, d2, align); - vec_st(d1, 15, dst + i); - vec_st(d0, 31, dst + i); - dst += 8; - } - } else { - for (i = 0; i < len - 7; i += 8) { - t0 = float_to_int16_one_altivec(src[0] + i); - t1 = float_to_int16_one_altivec(src[1] + i); - d0 = vec_mergeh(t0, t1); - d1 = vec_mergel(t0, t1); - vec_st(d0, 0, dst + i); - 
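float_to_int16_one_altivec above relies on two properties of the vector conversion: vec_cts(..., 0) truncates each float toward zero, and vec_packs saturates the 32-bit results to the int16 range instead of wrapping. A scalar model of the per-sample behaviour (NaN handling left aside):

#include <stdint.h>

/* Scalar model of vec_cts(.., 0) followed by vec_packs: truncate toward
 * zero, then saturate to the signed 16-bit range. */
static int16_t float_to_int16_scalar(float x)
{
    if (x >= 32767.0f)
        return 32767;
    if (x <= -32768.0f)
        return -32768;
    return (int16_t)x;   /* a C float-to-int cast also truncates toward zero */
}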
vec_st(d1, 16, dst + i); - dst += 8; - } - } - } else { - for (i = 0; i < channels; i++) - float_to_int16_stride_altivec(dst + i, src[i], len, channels); - } - } -} - -av_cold void ff_fmt_convert_init_altivec(FmtConvertContext *c, AVCodecContext *avctx) -{ - c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec; - if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { - c->float_to_int16 = float_to_int16_altivec; - c->float_to_int16_interleave = float_to_int16_interleave_altivec; - } -} diff --git a/ffmpeg1/libavcodec/ppc/gmc_altivec.c b/ffmpeg1/libavcodec/ppc/gmc_altivec.c deleted file mode 100644 index 4db761d..0000000 --- a/ffmpeg1/libavcodec/ppc/gmc_altivec.c +++ /dev/null @@ -1,130 +0,0 @@ -/* - * GMC (Global Motion Compensation) - * AltiVec-enabled - * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/mem.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "dsputil_altivec.h" - -/* - altivec-enhanced gmc1. ATM this code assume stride is a multiple of 8, - to preserve proper dst alignment. -*/ -void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder) -{ - const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder; - const DECLARE_ALIGNED(16, unsigned short, ABCD)[8] = - { - (16-x16)*(16-y16), /* A */ - ( x16)*(16-y16), /* B */ - (16-x16)*( y16), /* C */ - ( x16)*( y16), /* D */ - 0, 0, 0, 0 /* padding */ - }; - register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); - register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8); - register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD; - register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD; - int i; - unsigned long dst_odd = (unsigned long)dst & 0x0000000F; - unsigned long src_really_odd = (unsigned long)src & 0x0000000F; - - tempA = vec_ld(0, (const unsigned short*)ABCD); - Av = vec_splat(tempA, 0); - Bv = vec_splat(tempA, 1); - Cv = vec_splat(tempA, 2); - Dv = vec_splat(tempA, 3); - - rounderV = vec_splat((vec_u16)vec_lde(0, &rounder_a), 0); - - // we'll be able to pick-up our 9 char elements - // at src from those 32 bytes - // we load the first batch here, as inside the loop - // we can re-use 'src+stride' from one iteration - // as the 'src' of the next. - src_0 = vec_ld(0, src); - src_1 = vec_ld(16, src); - srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src)); - - if (src_really_odd != 0x0000000F) { - // if src & 0xF == 0xF, then (src+1) is properly aligned - // on the second vector. 
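The alignment comment above refers to the standard AltiVec idiom this file (and the qpel and deblocking code further down) uses for unaligned reads: two aligned vec_ld loads bracketing the address, combined through a permute vector from vec_lvsl. A minimal sketch of that idiom, kept separate from the GMC-specific handling of the src & 0xF == 0xF corner case:

#include <altivec.h>
#include <stdint.h>

/* Load 16 bytes from a possibly unaligned address: two aligned loads plus
 * a vec_lvsl-generated permute. Only valid while p+15 stays readable. */
static inline vector unsigned char load16_unaligned(const uint8_t *p)
{
    vector unsigned char hi   = vec_ld(0,  p);
    vector unsigned char lo   = vec_ld(15, p);
    vector unsigned char perm = vec_lvsl(0, p);
    return vec_perm(hi, lo, perm);
}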
- srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src)); - } else { - srcvB = src_1; - } - srcvA = vec_mergeh(vczero, srcvA); - srcvB = vec_mergeh(vczero, srcvB); - - for(i=0; i<h; i++) { - dst_odd = (unsigned long)dst & 0x0000000F; - src_really_odd = (((unsigned long)src) + stride) & 0x0000000F; - - dstv = vec_ld(0, dst); - - // we we'll be able to pick-up our 9 char elements - // at src + stride from those 32 bytes - // then reuse the resulting 2 vectors srvcC and srcvD - // as the next srcvA and srcvB - src_0 = vec_ld(stride + 0, src); - src_1 = vec_ld(stride + 16, src); - srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src)); - - if (src_really_odd != 0x0000000F) { - // if src & 0xF == 0xF, then (src+1) is properly aligned - // on the second vector. - srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src)); - } else { - srcvD = src_1; - } - - srcvC = vec_mergeh(vczero, srcvC); - srcvD = vec_mergeh(vczero, srcvD); - - - // OK, now we (finally) do the math :-) - // those four instructions replaces 32 int muls & 32 int adds. - // isn't AltiVec nice ? - tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV); - tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA); - tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB); - tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC); - - srcvA = srcvC; - srcvB = srcvD; - - tempD = vec_sr(tempD, vcsr8); - - dstv2 = vec_pack(tempD, (vector unsigned short)vczero); - - if (dst_odd) { - dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1)); - } else { - dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3)); - } - - vec_st(dstv2, 0, dst); - - dst += stride; - src += stride; - } -} diff --git a/ffmpeg1/libavcodec/ppc/h264_altivec.c b/ffmpeg1/libavcodec/ppc/h264_altivec.c deleted file mode 100644 index 3c2bb4d..0000000 --- a/ffmpeg1/libavcodec/ppc/h264_altivec.c +++ /dev/null @@ -1,748 +0,0 @@ -/* - * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
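The four vec_mladd lines above are the whole of the GMC arithmetic: each output pixel is a bilinear blend of four neighbours with the weights A=(16-x16)(16-y16), B=x16(16-y16), C=(16-x16)y16 and D=x16*y16, plus the rounder, shifted down by 8. A scalar restatement of that per-pixel formula, for reference:

#include <stdint.h>

/* Per-pixel scalar equivalent of the vec_mladd chain in ff_gmc1_altivec. */
static uint8_t gmc1_pixel(const uint8_t *src, int stride,
                          int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = x16        * (16 - y16);
    const int C = (16 - x16) * y16;
    const int D = x16        * y16;

    return (uint8_t)((A * src[0] + B * src[1] +
                      C * src[stride] + D * src[stride + 1] + rounder) >> 8);
}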
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/intreadwrite.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/h264data.h" -#include "libavcodec/h264dsp.h" - -/**************************************************************************** - * IDCT transform: - ****************************************************************************/ - -#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3) \ - /* 1st stage */ \ - vz0 = vec_add(vb0,vb2); /* temp[0] = Y[0] + Y[2] */ \ - vz1 = vec_sub(vb0,vb2); /* temp[1] = Y[0] - Y[2] */ \ - vz2 = vec_sra(vb1,vec_splat_u16(1)); \ - vz2 = vec_sub(vz2,vb3); /* temp[2] = Y[1].1/2 - Y[3] */ \ - vz3 = vec_sra(vb3,vec_splat_u16(1)); \ - vz3 = vec_add(vb1,vz3); /* temp[3] = Y[1] + Y[3].1/2 */ \ - /* 2nd stage: output */ \ - va0 = vec_add(vz0,vz3); /* x[0] = temp[0] + temp[3] */ \ - va1 = vec_add(vz1,vz2); /* x[1] = temp[1] + temp[2] */ \ - va2 = vec_sub(vz1,vz2); /* x[2] = temp[1] - temp[2] */ \ - va3 = vec_sub(vz0,vz3) /* x[3] = temp[0] - temp[3] */ - -#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \ - b0 = vec_mergeh( a0, a0 ); \ - b1 = vec_mergeh( a1, a0 ); \ - b2 = vec_mergeh( a2, a0 ); \ - b3 = vec_mergeh( a3, a0 ); \ - a0 = vec_mergeh( b0, b2 ); \ - a1 = vec_mergel( b0, b2 ); \ - a2 = vec_mergeh( b1, b3 ); \ - a3 = vec_mergel( b1, b3 ); \ - b0 = vec_mergeh( a0, a2 ); \ - b1 = vec_mergel( a0, a2 ); \ - b2 = vec_mergeh( a1, a3 ); \ - b3 = vec_mergel( a1, a3 ) - -#define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \ - vdst_orig = vec_ld(0, dst); \ - vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \ - vdst_ss = (vec_s16) vec_mergeh(zero_u8v, vdst); \ - va = vec_add(va, vdst_ss); \ - va_u8 = vec_packsu(va, zero_s16v); \ - va_u32 = vec_splat((vec_u32)va_u8, 0); \ - vec_ste(va_u32, element, (uint32_t*)dst); - -static void ff_h264_idct_add_altivec(uint8_t *dst, int16_t *block, int stride) -{ - vec_s16 va0, va1, va2, va3; - vec_s16 vz0, vz1, vz2, vz3; - vec_s16 vtmp0, vtmp1, vtmp2, vtmp3; - vec_u8 va_u8; - vec_u32 va_u32; - vec_s16 vdst_ss; - const vec_u16 v6us = vec_splat_u16(6); - vec_u8 vdst, vdst_orig; - vec_u8 vdst_mask = vec_lvsl(0, dst); - int element = ((unsigned long)dst & 0xf) >> 2; - LOAD_ZERO; - - block[0] += 32; /* add 32 as a DC-level for rounding */ - - vtmp0 = vec_ld(0,block); - vtmp1 = vec_sld(vtmp0, vtmp0, 8); - vtmp2 = vec_ld(16,block); - vtmp3 = vec_sld(vtmp2, vtmp2, 8); - memset(block, 0, 16 * sizeof(int16_t)); - - VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); - VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3); - VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); - - va0 = vec_sra(va0,v6us); - va1 = vec_sra(va1,v6us); - va2 = vec_sra(va2,v6us); - va3 = vec_sra(va3,v6us); - - VEC_LOAD_U8_ADD_S16_STORE_U8(va0); - dst += stride; - VEC_LOAD_U8_ADD_S16_STORE_U8(va1); - dst += stride; - VEC_LOAD_U8_ADD_S16_STORE_U8(va2); - dst += stride; - VEC_LOAD_U8_ADD_S16_STORE_U8(va3); -} - -#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\ - /* a0 = SRC(0) + SRC(4); */ \ - vec_s16 a0v = vec_add(s0, s4); \ - /* a2 = SRC(0) - SRC(4); */ \ - vec_s16 a2v = vec_sub(s0, s4); \ - /* a4 = (SRC(2)>>1) - SRC(6); */ \ - vec_s16 a4v = vec_sub(vec_sra(s2, onev), s6); \ - /* a6 = (SRC(6)>>1) + SRC(2); */ \ - 
vec_s16 a6v = vec_add(vec_sra(s6, onev), s2); \ - /* b0 = a0 + a6; */ \ - vec_s16 b0v = vec_add(a0v, a6v); \ - /* b2 = a2 + a4; */ \ - vec_s16 b2v = vec_add(a2v, a4v); \ - /* b4 = a2 - a4; */ \ - vec_s16 b4v = vec_sub(a2v, a4v); \ - /* b6 = a0 - a6; */ \ - vec_s16 b6v = vec_sub(a0v, a6v); \ - /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \ - /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \ - vec_s16 a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \ - /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \ - /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \ - vec_s16 a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\ - /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \ - /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \ - vec_s16 a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\ - /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \ - vec_s16 a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\ - /* b1 = (a7>>2) + a1; */ \ - vec_s16 b1v = vec_add( vec_sra(a7v, twov), a1v); \ - /* b3 = a3 + (a5>>2); */ \ - vec_s16 b3v = vec_add(a3v, vec_sra(a5v, twov)); \ - /* b5 = (a3>>2) - a5; */ \ - vec_s16 b5v = vec_sub( vec_sra(a3v, twov), a5v); \ - /* b7 = a7 - (a1>>2); */ \ - vec_s16 b7v = vec_sub( a7v, vec_sra(a1v, twov)); \ - /* DST(0, b0 + b7); */ \ - d0 = vec_add(b0v, b7v); \ - /* DST(1, b2 + b5); */ \ - d1 = vec_add(b2v, b5v); \ - /* DST(2, b4 + b3); */ \ - d2 = vec_add(b4v, b3v); \ - /* DST(3, b6 + b1); */ \ - d3 = vec_add(b6v, b1v); \ - /* DST(4, b6 - b1); */ \ - d4 = vec_sub(b6v, b1v); \ - /* DST(5, b4 - b3); */ \ - d5 = vec_sub(b4v, b3v); \ - /* DST(6, b2 - b5); */ \ - d6 = vec_sub(b2v, b5v); \ - /* DST(7, b0 - b7); */ \ - d7 = vec_sub(b0v, b7v); \ -} - -#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \ - /* unaligned load */ \ - vec_u8 hv = vec_ld( 0, dest ); \ - vec_u8 lv = vec_ld( 7, dest ); \ - vec_u8 dstv = vec_perm( hv, lv, (vec_u8)perm_ldv ); \ - vec_s16 idct_sh6 = vec_sra(idctv, sixv); \ - vec_u16 dst16 = (vec_u16)vec_mergeh(zero_u8v, dstv); \ - vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16); \ - vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum); \ - vec_u8 edgehv; \ - /* unaligned store */ \ - vec_u8 bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\ - vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv ); \ - lv = vec_sel( lv, bodyv, edgelv ); \ - vec_st( lv, 7, dest ); \ - hv = vec_ld( 0, dest ); \ - edgehv = vec_perm( zero_u8v, sel, perm_stv ); \ - hv = vec_sel( hv, bodyv, edgehv ); \ - vec_st( hv, 0, dest ); \ - } - -static void ff_h264_idct8_add_altivec( uint8_t *dst, int16_t *dct, int stride ) { - vec_s16 s0, s1, s2, s3, s4, s5, s6, s7; - vec_s16 d0, d1, d2, d3, d4, d5, d6, d7; - vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7; - - vec_u8 perm_ldv = vec_lvsl(0, dst); - vec_u8 perm_stv = vec_lvsr(8, dst); - - const vec_u16 onev = vec_splat_u16(1); - const vec_u16 twov = vec_splat_u16(2); - const vec_u16 sixv = vec_splat_u16(6); - - const vec_u8 sel = (vec_u8) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1}; - LOAD_ZERO; - - dct[0] += 32; // rounding for the >>6 at the end - - s0 = vec_ld(0x00, (int16_t*)dct); - s1 = vec_ld(0x10, (int16_t*)dct); - s2 = vec_ld(0x20, (int16_t*)dct); - s3 = vec_ld(0x30, (int16_t*)dct); - s4 = vec_ld(0x40, (int16_t*)dct); - s5 = vec_ld(0x50, (int16_t*)dct); - s6 = vec_ld(0x60, (int16_t*)dct); - s7 = vec_ld(0x70, (int16_t*)dct); - memset(dct, 0, 64 * sizeof(int16_t)); - - IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, - d0, d1, d2, d3, 
d4, d5, d6, d7); - - TRANSPOSE8( d0, d1, d2, d3, d4, d5, d6, d7 ); - - IDCT8_1D_ALTIVEC(d0, d1, d2, d3, d4, d5, d6, d7, - idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7); - - ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel); -} - -static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *block, int stride, int size) -{ - vec_s16 dc16; - vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner; - LOAD_ZERO; - DECLARE_ALIGNED(16, int, dc); - int i; - - dc = (block[0] + 32) >> 6; - block[0] = 0; - dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1); - - if (size == 4) - dc16 = vec_sld(dc16, zero_s16v, 8); - dcplus = vec_packsu(dc16, zero_s16v); - dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v); - - aligner = vec_lvsr(0, dst); - dcplus = vec_perm(dcplus, dcplus, aligner); - dcminus = vec_perm(dcminus, dcminus, aligner); - - for (i = 0; i < size; i += 4) { - v0 = vec_ld(0, dst+0*stride); - v1 = vec_ld(0, dst+1*stride); - v2 = vec_ld(0, dst+2*stride); - v3 = vec_ld(0, dst+3*stride); - - v0 = vec_adds(v0, dcplus); - v1 = vec_adds(v1, dcplus); - v2 = vec_adds(v2, dcplus); - v3 = vec_adds(v3, dcplus); - - v0 = vec_subs(v0, dcminus); - v1 = vec_subs(v1, dcminus); - v2 = vec_subs(v2, dcminus); - v3 = vec_subs(v3, dcminus); - - vec_st(v0, 0, dst+0*stride); - vec_st(v1, 0, dst+1*stride); - vec_st(v2, 0, dst+2*stride); - vec_st(v3, 0, dst+3*stride); - - dst += 4*stride; - } -} - -static void h264_idct_dc_add_altivec(uint8_t *dst, int16_t *block, int stride) -{ - h264_idct_dc_add_internal(dst, block, stride, 4); -} - -static void ff_h264_idct8_dc_add_altivec(uint8_t *dst, int16_t *block, int stride) -{ - h264_idct_dc_add_internal(dst, block, stride, 8); -} - -static void ff_h264_idct_add16_altivec(uint8_t *dst, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15*8]){ - int i; - for(i=0; i<16; i++){ - int nnz = nnzc[ scan8[i] ]; - if(nnz){ - if(nnz==1 && block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride); - else ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride); - } - } -} - -static void ff_h264_idct_add16intra_altivec(uint8_t *dst, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15*8]){ - int i; - for(i=0; i<16; i++){ - if(nnzc[ scan8[i] ]) ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride); - else if(block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride); - } -} - -static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15*8]){ - int i; - for(i=0; i<16; i+=4){ - int nnz = nnzc[ scan8[i] ]; - if(nnz){ - if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_altivec(dst + block_offset[i], block + i*16, stride); - else ff_h264_idct8_add_altivec (dst + block_offset[i], block + i*16, stride); - } - } -} - -static void ff_h264_idct_add8_altivec(uint8_t **dest, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15*8]){ - int i, j; - 
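h264_idct_dc_add_internal above handles blocks whose only nonzero coefficient is the DC term: the DC value is rounded, shifted down by 6 and added with saturation to every pixel of the 4x4 or 8x8 block (the dcplus/dcminus pair emulates a signed add on unsigned bytes). A scalar sketch of the same operation:

#include <stdint.h>

static inline uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Scalar model of h264_idct_dc_add_internal for a size x size block. */
static void idct_dc_add_scalar(uint8_t *dst, int16_t *block, int stride, int size)
{
    const int dc = (block[0] + 32) >> 6;
    block[0] = 0;
    for (int y = 0; y < size; y++, dst += stride)
        for (int x = 0; x < size; x++)
            dst[x] = clip_uint8(dst[x] + dc);
}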
for (j = 1; j < 3; j++) { - for(i = j * 16; i < j * 16 + 4; i++){ - if(nnzc[ scan8[i] ]) - ff_h264_idct_add_altivec(dest[j-1] + block_offset[i], block + i*16, stride); - else if(block[i*16]) - h264_idct_dc_add_altivec(dest[j-1] + block_offset[i], block + i*16, stride); - } - } -} - -#define transpose4x16(r0, r1, r2, r3) { \ - register vec_u8 r4; \ - register vec_u8 r5; \ - register vec_u8 r6; \ - register vec_u8 r7; \ - \ - r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \ - r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \ - r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \ - r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \ - \ - r0 = vec_mergeh(r4, r6); /*all set 0*/ \ - r1 = vec_mergel(r4, r6); /*all set 1*/ \ - r2 = vec_mergeh(r5, r7); /*all set 2*/ \ - r3 = vec_mergel(r5, r7); /*all set 3*/ \ -} - -static inline void write16x4(uint8_t *dst, int dst_stride, - register vec_u8 r0, register vec_u8 r1, - register vec_u8 r2, register vec_u8 r3) { - DECLARE_ALIGNED(16, unsigned char, result)[64]; - uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; - int int_dst_stride = dst_stride/4; - - vec_st(r0, 0, result); - vec_st(r1, 16, result); - vec_st(r2, 32, result); - vec_st(r3, 48, result); - /* FIXME: there has to be a better way!!!! */ - *dst_int = *src_int; - *(dst_int+ int_dst_stride) = *(src_int + 1); - *(dst_int+ 2*int_dst_stride) = *(src_int + 2); - *(dst_int+ 3*int_dst_stride) = *(src_int + 3); - *(dst_int+ 4*int_dst_stride) = *(src_int + 4); - *(dst_int+ 5*int_dst_stride) = *(src_int + 5); - *(dst_int+ 6*int_dst_stride) = *(src_int + 6); - *(dst_int+ 7*int_dst_stride) = *(src_int + 7); - *(dst_int+ 8*int_dst_stride) = *(src_int + 8); - *(dst_int+ 9*int_dst_stride) = *(src_int + 9); - *(dst_int+10*int_dst_stride) = *(src_int + 10); - *(dst_int+11*int_dst_stride) = *(src_int + 11); - *(dst_int+12*int_dst_stride) = *(src_int + 12); - *(dst_int+13*int_dst_stride) = *(src_int + 13); - *(dst_int+14*int_dst_stride) = *(src_int + 14); - *(dst_int+15*int_dst_stride) = *(src_int + 15); -} - -/** @brief performs a 6x16 transpose of data in src, and stores it to dst - @todo FIXME: see if we can't spare some vec_lvsl() by them factorizing - out of unaligned_load() */ -#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\ - register vec_u8 r0 = unaligned_load(0, src); \ - register vec_u8 r1 = unaligned_load( src_stride, src); \ - register vec_u8 r2 = unaligned_load(2* src_stride, src); \ - register vec_u8 r3 = unaligned_load(3* src_stride, src); \ - register vec_u8 r4 = unaligned_load(4* src_stride, src); \ - register vec_u8 r5 = unaligned_load(5* src_stride, src); \ - register vec_u8 r6 = unaligned_load(6* src_stride, src); \ - register vec_u8 r7 = unaligned_load(7* src_stride, src); \ - register vec_u8 r14 = unaligned_load(14*src_stride, src); \ - register vec_u8 r15 = unaligned_load(15*src_stride, src); \ - \ - r8 = unaligned_load( 8*src_stride, src); \ - r9 = unaligned_load( 9*src_stride, src); \ - r10 = unaligned_load(10*src_stride, src); \ - r11 = unaligned_load(11*src_stride, src); \ - r12 = unaligned_load(12*src_stride, src); \ - r13 = unaligned_load(13*src_stride, src); \ - \ - /*Merge first pairs*/ \ - r0 = vec_mergeh(r0, r8); /*0, 8*/ \ - r1 = vec_mergeh(r1, r9); /*1, 9*/ \ - r2 = vec_mergeh(r2, r10); /*2,10*/ \ - r3 = vec_mergeh(r3, r11); /*3,11*/ \ - r4 = vec_mergeh(r4, r12); /*4,12*/ \ - r5 = vec_mergeh(r5, r13); /*5,13*/ \ - r6 = vec_mergeh(r6, r14); /*6,14*/ \ - r7 = vec_mergeh(r7, r15); /*7,15*/ \ - \ - /*Merge second pairs*/ \ - r8 = vec_mergeh(r0, r4); /*0,4, 8,12 set 0*/ 
\ - r9 = vec_mergel(r0, r4); /*0,4, 8,12 set 1*/ \ - r10 = vec_mergeh(r1, r5); /*1,5, 9,13 set 0*/ \ - r11 = vec_mergel(r1, r5); /*1,5, 9,13 set 1*/ \ - r12 = vec_mergeh(r2, r6); /*2,6,10,14 set 0*/ \ - r13 = vec_mergel(r2, r6); /*2,6,10,14 set 1*/ \ - r14 = vec_mergeh(r3, r7); /*3,7,11,15 set 0*/ \ - r15 = vec_mergel(r3, r7); /*3,7,11,15 set 1*/ \ - \ - /*Third merge*/ \ - r0 = vec_mergeh(r8, r12); /*0,2,4,6,8,10,12,14 set 0*/ \ - r1 = vec_mergel(r8, r12); /*0,2,4,6,8,10,12,14 set 1*/ \ - r2 = vec_mergeh(r9, r13); /*0,2,4,6,8,10,12,14 set 2*/ \ - r4 = vec_mergeh(r10, r14); /*1,3,5,7,9,11,13,15 set 0*/ \ - r5 = vec_mergel(r10, r14); /*1,3,5,7,9,11,13,15 set 1*/ \ - r6 = vec_mergeh(r11, r15); /*1,3,5,7,9,11,13,15 set 2*/ \ - /* Don't need to compute 3 and 7*/ \ - \ - /*Final merge*/ \ - r8 = vec_mergeh(r0, r4); /*all set 0*/ \ - r9 = vec_mergel(r0, r4); /*all set 1*/ \ - r10 = vec_mergeh(r1, r5); /*all set 2*/ \ - r11 = vec_mergel(r1, r5); /*all set 3*/ \ - r12 = vec_mergeh(r2, r6); /*all set 4*/ \ - r13 = vec_mergel(r2, r6); /*all set 5*/ \ - /* Don't need to compute 14 and 15*/ \ - \ -} - -// out: o = |x-y| < a -static inline vec_u8 diff_lt_altivec ( register vec_u8 x, - register vec_u8 y, - register vec_u8 a) { - - register vec_u8 diff = vec_subs(x, y); - register vec_u8 diffneg = vec_subs(y, x); - register vec_u8 o = vec_or(diff, diffneg); /* |x-y| */ - o = (vec_u8)vec_cmplt(o, a); - return o; -} - -static inline vec_u8 h264_deblock_mask ( register vec_u8 p0, - register vec_u8 p1, - register vec_u8 q0, - register vec_u8 q1, - register vec_u8 alpha, - register vec_u8 beta) { - - register vec_u8 mask; - register vec_u8 tempmask; - - mask = diff_lt_altivec(p0, q0, alpha); - tempmask = diff_lt_altivec(p1, p0, beta); - mask = vec_and(mask, tempmask); - tempmask = diff_lt_altivec(q1, q0, beta); - mask = vec_and(mask, tempmask); - - return mask; -} - -// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) -static inline vec_u8 h264_deblock_q1(register vec_u8 p0, - register vec_u8 p1, - register vec_u8 p2, - register vec_u8 q0, - register vec_u8 tc0) { - - register vec_u8 average = vec_avg(p0, q0); - register vec_u8 temp; - register vec_u8 uncliped; - register vec_u8 ones; - register vec_u8 max; - register vec_u8 min; - register vec_u8 newp1; - - temp = vec_xor(average, p2); - average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ - ones = vec_splat_u8(1); - temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ - uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */ - max = vec_adds(p1, tc0); - min = vec_subs(p1, tc0); - newp1 = vec_max(min, uncliped); - newp1 = vec_min(max, newp1); - return newp1; -} - -#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \ - \ - const vec_u8 A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \ - \ - register vec_u8 pq0bit = vec_xor(p0,q0); \ - register vec_u8 q1minus; \ - register vec_u8 p0minus; \ - register vec_u8 stage1; \ - register vec_u8 stage2; \ - register vec_u8 vec160; \ - register vec_u8 delta; \ - register vec_u8 deltaneg; \ - \ - q1minus = vec_nor(q1, q1); /* 255 - q1 */ \ - stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \ - stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \ - p0minus = vec_nor(p0, p0); /* 255 - p0 */ \ - stage1 = vec_avg(q0, p0minus); /* (q0 - p0 + 256)>>1 */ \ - pq0bit = vec_and(pq0bit, vec_splat_u8(1)); \ - stage2 = vec_avg(stage2, pq0bit); /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \ - stage2 = vec_adds(stage2, stage1); /* 160 + ((p0 - q0) 
+ (p1 - q1) >> 2 + 1) >> 1 */ \ - vec160 = vec_ld(0, &A0v); \ - deltaneg = vec_subs(vec160, stage2); /* -d */ \ - delta = vec_subs(stage2, vec160); /* d */ \ - deltaneg = vec_min(tc0masked, deltaneg); \ - delta = vec_min(tc0masked, delta); \ - p0 = vec_subs(p0, deltaneg); \ - q0 = vec_subs(q0, delta); \ - p0 = vec_adds(p0, delta); \ - q0 = vec_adds(q0, deltaneg); \ -} - -#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \ - DECLARE_ALIGNED(16, unsigned char, temp)[16]; \ - register vec_u8 alphavec; \ - register vec_u8 betavec; \ - register vec_u8 mask; \ - register vec_u8 p1mask; \ - register vec_u8 q1mask; \ - register vector signed char tc0vec; \ - register vec_u8 finaltc0; \ - register vec_u8 tc0masked; \ - register vec_u8 newp1; \ - register vec_u8 newq1; \ - \ - temp[0] = alpha; \ - temp[1] = beta; \ - alphavec = vec_ld(0, temp); \ - betavec = vec_splat(alphavec, 0x1); \ - alphavec = vec_splat(alphavec, 0x0); \ - mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */ \ - \ - AV_COPY32(temp, tc0); \ - tc0vec = vec_ld(0, (signed char*)temp); \ - tc0vec = vec_mergeh(tc0vec, tc0vec); \ - tc0vec = vec_mergeh(tc0vec, tc0vec); \ - mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \ - finaltc0 = vec_and((vec_u8)tc0vec, mask); /* tc = tc0 */ \ - \ - p1mask = diff_lt_altivec(p2, p0, betavec); \ - p1mask = vec_and(p1mask, mask); /* if ( |p2 - p0| < beta) */ \ - tc0masked = vec_and(p1mask, (vec_u8)tc0vec); \ - finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \ - newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \ - /*end if*/ \ - \ - q1mask = diff_lt_altivec(q2, q0, betavec); \ - q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\ - tc0masked = vec_and(q1mask, (vec_u8)tc0vec); \ - finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \ - newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ - /*end if*/ \ - \ - h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \ - p1 = newp1; \ - q1 = newq1; \ -} - -static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { - - if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { - register vec_u8 p2 = vec_ld(-3*stride, pix); - register vec_u8 p1 = vec_ld(-2*stride, pix); - register vec_u8 p0 = vec_ld(-1*stride, pix); - register vec_u8 q0 = vec_ld(0, pix); - register vec_u8 q1 = vec_ld(stride, pix); - register vec_u8 q2 = vec_ld(2*stride, pix); - h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); - vec_st(p1, -2*stride, pix); - vec_st(p0, -1*stride, pix); - vec_st(q0, 0, pix); - vec_st(q1, stride, pix); - } -} - -static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { - - register vec_u8 line0, line1, line2, line3, line4, line5; - if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) - return; - readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5); - h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0); - transpose4x16(line1, line2, line3, line4); - write16x4(pix-2, stride, line1, line2, line3, line4); -} - -static av_always_inline -void weight_h264_W_altivec(uint8_t *block, int stride, int height, - int log2_denom, int weight, int offset, int w) -{ - int y, aligned; - vec_u8 vblock; - vec_s16 vtemp, vweight, voffset, v0, v1; - vec_u16 vlog2_denom; - DECLARE_ALIGNED(16, int32_t, temp)[4]; - LOAD_ZERO; - - offset <<= log2_denom; - if(log2_denom) offset += 1<<(log2_denom-1); - temp[0] = log2_denom; - temp[1] = 
weight; - temp[2] = offset; - - vtemp = (vec_s16)vec_ld(0, temp); - vlog2_denom = (vec_u16)vec_splat(vtemp, 1); - vweight = vec_splat(vtemp, 3); - voffset = vec_splat(vtemp, 5); - aligned = !((unsigned long)block & 0xf); - - for (y = 0; y < height; y++) { - vblock = vec_ld(0, block); - - v0 = (vec_s16)vec_mergeh(zero_u8v, vblock); - v1 = (vec_s16)vec_mergel(zero_u8v, vblock); - - if (w == 16 || aligned) { - v0 = vec_mladd(v0, vweight, zero_s16v); - v0 = vec_adds(v0, voffset); - v0 = vec_sra(v0, vlog2_denom); - } - if (w == 16 || !aligned) { - v1 = vec_mladd(v1, vweight, zero_s16v); - v1 = vec_adds(v1, voffset); - v1 = vec_sra(v1, vlog2_denom); - } - vblock = vec_packsu(v0, v1); - vec_st(vblock, 0, block); - - block += stride; - } -} - -static av_always_inline -void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height, - int log2_denom, int weightd, int weights, int offset, int w) -{ - int y, dst_aligned, src_aligned; - vec_u8 vsrc, vdst; - vec_s16 vtemp, vweights, vweightd, voffset, v0, v1, v2, v3; - vec_u16 vlog2_denom; - DECLARE_ALIGNED(16, int32_t, temp)[4]; - LOAD_ZERO; - - offset = ((offset + 1) | 1) << log2_denom; - temp[0] = log2_denom+1; - temp[1] = weights; - temp[2] = weightd; - temp[3] = offset; - - vtemp = (vec_s16)vec_ld(0, temp); - vlog2_denom = (vec_u16)vec_splat(vtemp, 1); - vweights = vec_splat(vtemp, 3); - vweightd = vec_splat(vtemp, 5); - voffset = vec_splat(vtemp, 7); - dst_aligned = !((unsigned long)dst & 0xf); - src_aligned = !((unsigned long)src & 0xf); - - for (y = 0; y < height; y++) { - vdst = vec_ld(0, dst); - vsrc = vec_ld(0, src); - - v0 = (vec_s16)vec_mergeh(zero_u8v, vdst); - v1 = (vec_s16)vec_mergel(zero_u8v, vdst); - v2 = (vec_s16)vec_mergeh(zero_u8v, vsrc); - v3 = (vec_s16)vec_mergel(zero_u8v, vsrc); - - if (w == 8) { - if (src_aligned) - v3 = v2; - else - v2 = v3; - } - - if (w == 16 || dst_aligned) { - v0 = vec_mladd(v0, vweightd, zero_s16v); - v2 = vec_mladd(v2, vweights, zero_s16v); - - v0 = vec_adds(v0, voffset); - v0 = vec_adds(v0, v2); - v0 = vec_sra(v0, vlog2_denom); - } - if (w == 16 || !dst_aligned) { - v1 = vec_mladd(v1, vweightd, zero_s16v); - v3 = vec_mladd(v3, vweights, zero_s16v); - - v1 = vec_adds(v1, voffset); - v1 = vec_adds(v1, v3); - v1 = vec_sra(v1, vlog2_denom); - } - vdst = vec_packsu(v0, v1); - vec_st(vdst, 0, dst); - - dst += stride; - src += stride; - } -} - -#define H264_WEIGHT(W) \ -static void ff_weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \ - int log2_denom, int weight, int offset){ \ - weight_h264_W_altivec(block, stride, height, log2_denom, weight, offset, W); \ -}\ -static void ff_biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \ - int log2_denom, int weightd, int weights, int offset){ \ - biweight_h264_W_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \ -} - -H264_WEIGHT(16) -H264_WEIGHT( 8) - -av_cold void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, - const int chroma_format_idc) -{ - if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) { - if (bit_depth == 8) { - c->h264_idct_add = ff_h264_idct_add_altivec; - if (chroma_format_idc == 1) - c->h264_idct_add8 = ff_h264_idct_add8_altivec; - c->h264_idct_add16 = ff_h264_idct_add16_altivec; - c->h264_idct_add16intra = ff_h264_idct_add16intra_altivec; - c->h264_idct_dc_add= h264_idct_dc_add_altivec; - c->h264_idct8_dc_add = ff_h264_idct8_dc_add_altivec; - c->h264_idct8_add = ff_h264_idct8_add_altivec; - c->h264_idct8_add4 = 
ff_h264_idct8_add4_altivec; - c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec; - c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec; - - c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_altivec; - c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_altivec; - c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_altivec; - c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_altivec; - } - } -} diff --git a/ffmpeg1/libavcodec/ppc/h264_qpel.c b/ffmpeg1/libavcodec/ppc/h264_qpel.c deleted file mode 100644 index 429ae42..0000000 --- a/ffmpeg1/libavcodec/ppc/h264_qpel.c +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/attributes.h" -#include "libavcodec/h264qpel.h" - -#if HAVE_ALTIVEC -#include "libavutil/cpu.h" -#include "libavutil/intreadwrite.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "dsputil_altivec.h" - -#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s -#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s) - -#define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC -#define PREFIX_h264_qpel16_h_lowpass_altivec put_h264_qpel16_h_lowpass_altivec -#define PREFIX_h264_qpel16_h_lowpass_num altivec_put_h264_qpel16_h_lowpass_num -#define PREFIX_h264_qpel16_v_lowpass_altivec put_h264_qpel16_v_lowpass_altivec -#define PREFIX_h264_qpel16_v_lowpass_num altivec_put_h264_qpel16_v_lowpass_num -#define PREFIX_h264_qpel16_hv_lowpass_altivec put_h264_qpel16_hv_lowpass_altivec -#define PREFIX_h264_qpel16_hv_lowpass_num altivec_put_h264_qpel16_hv_lowpass_num -#include "h264_qpel_template.c" -#undef OP_U8_ALTIVEC -#undef PREFIX_h264_qpel16_h_lowpass_altivec -#undef PREFIX_h264_qpel16_h_lowpass_num -#undef PREFIX_h264_qpel16_v_lowpass_altivec -#undef PREFIX_h264_qpel16_v_lowpass_num -#undef PREFIX_h264_qpel16_hv_lowpass_altivec -#undef PREFIX_h264_qpel16_hv_lowpass_num - -#define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC -#define PREFIX_h264_qpel16_h_lowpass_altivec avg_h264_qpel16_h_lowpass_altivec -#define PREFIX_h264_qpel16_h_lowpass_num altivec_avg_h264_qpel16_h_lowpass_num -#define PREFIX_h264_qpel16_v_lowpass_altivec avg_h264_qpel16_v_lowpass_altivec -#define PREFIX_h264_qpel16_v_lowpass_num altivec_avg_h264_qpel16_v_lowpass_num -#define PREFIX_h264_qpel16_hv_lowpass_altivec avg_h264_qpel16_hv_lowpass_altivec -#define PREFIX_h264_qpel16_hv_lowpass_num altivec_avg_h264_qpel16_hv_lowpass_num -#include "h264_qpel_template.c" -#undef OP_U8_ALTIVEC -#undef PREFIX_h264_qpel16_h_lowpass_altivec -#undef PREFIX_h264_qpel16_h_lowpass_num -#undef PREFIX_h264_qpel16_v_lowpass_altivec -#undef PREFIX_h264_qpel16_v_lowpass_num -#undef PREFIX_h264_qpel16_hv_lowpass_altivec -#undef 
PREFIX_h264_qpel16_hv_lowpass_num - -#define H264_MC(OPNAME, SIZE, CODETYPE) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE (uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - ff_ ## OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{ \ - DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, 
ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ - OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ -{\ - DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\ -}\ - -static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, - const uint8_t * src2, int dst_stride, - int src_stride1, int h) -{ - int i; - vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align; - - mask_ = vec_lvsl(0, src2); - - for (i = 0; i < h; i++) { - - tmp1 = vec_ld(i * src_stride1, src1); - mask = vec_lvsl(i * src_stride1, src1); - tmp2 = vec_ld(i * src_stride1 + 15, src1); - - a = vec_perm(tmp1, tmp2, mask); - - tmp1 = vec_ld(i * 16, src2); - tmp2 = vec_ld(i * 16 + 15, src2); - - b = vec_perm(tmp1, tmp2, mask_); - - tmp1 = vec_ld(0, dst); - mask = vec_lvsl(0, dst); - tmp2 = vec_ld(15, dst); - - d = vec_avg(a, b); - - edges = vec_perm(tmp2, tmp1, mask); - - align = vec_lvsr(0, dst); - - tmp2 = 
vec_perm(d, edges, align); - tmp1 = vec_perm(edges, d, align); - - vec_st(tmp2, 15, dst); - vec_st(tmp1, 0 , dst); - - dst += dst_stride; - } -} - -static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, - const uint8_t * src2, int dst_stride, - int src_stride1, int h) -{ - int i; - vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align; - - mask_ = vec_lvsl(0, src2); - - for (i = 0; i < h; i++) { - - tmp1 = vec_ld(i * src_stride1, src1); - mask = vec_lvsl(i * src_stride1, src1); - tmp2 = vec_ld(i * src_stride1 + 15, src1); - - a = vec_perm(tmp1, tmp2, mask); - - tmp1 = vec_ld(i * 16, src2); - tmp2 = vec_ld(i * 16 + 15, src2); - - b = vec_perm(tmp1, tmp2, mask_); - - tmp1 = vec_ld(0, dst); - mask = vec_lvsl(0, dst); - tmp2 = vec_ld(15, dst); - - d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b)); - - edges = vec_perm(tmp2, tmp1, mask); - - align = vec_lvsr(0, dst); - - tmp2 = vec_perm(d, edges, align); - tmp1 = vec_perm(edges, d, align); - - vec_st(tmp2, 15, dst); - vec_st(tmp1, 0 , dst); - - dst += dst_stride; - } -} - -/* Implemented but could be faster -#define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h) -#define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h) - */ - -H264_MC(put_, 16, altivec) -H264_MC(avg_, 16, altivec) -#endif /* HAVE_ALTIVEC */ - -av_cold void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth) -{ -#if HAVE_ALTIVEC - const int high_bit_depth = bit_depth > 8; - - if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) { - if (!high_bit_depth) { -#define dspfunc(PFX, IDX, NUM) \ - c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \ - c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \ - c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \ - c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \ - c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \ - c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \ - c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \ - c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \ - c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \ - c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \ - c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \ - c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \ - c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \ - c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \ - c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \ - c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec - - dspfunc(put_h264_qpel, 0, 16); - dspfunc(avg_h264_qpel, 0, 16); -#undef dspfunc - } - } -#endif /* HAVE_ALTIVEC */ -} diff --git a/ffmpeg1/libavcodec/ppc/h264_qpel_template.c b/ffmpeg1/libavcodec/ppc/h264_qpel_template.c deleted file mode 100644 index cfc4560..0000000 --- a/ffmpeg1/libavcodec/ppc/h264_qpel_template.c +++ /dev/null @@ -1,507 +0,0 @@ -/* - * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/mem.h" - -#ifdef DEBUG -#define ASSERT_ALIGNED(ptr) assert(((unsigned long)ptr&0x0000000F)); -#else -#define ASSERT_ALIGNED(ptr) ; -#endif - -/* this code assume stride % 16 == 0 */ -#ifdef PREFIX_h264_qpel16_h_lowpass_altivec -static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { - register int i; - - LOAD_ZERO; - const vec_u8 permM2 = vec_lvsl(-2, src); - const vec_u8 permM1 = vec_lvsl(-1, src); - const vec_u8 permP0 = vec_lvsl(+0, src); - const vec_u8 permP1 = vec_lvsl(+1, src); - const vec_u8 permP2 = vec_lvsl(+2, src); - const vec_u8 permP3 = vec_lvsl(+3, src); - const vec_s16 v5ss = vec_splat_s16(5); - const vec_u16 v5us = vec_splat_u16(5); - const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); - const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); - - vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; - - register int align = ((((unsigned long)src) - 2) % 16); - - vec_s16 srcP0A, srcP0B, srcP1A, srcP1B, - srcP2A, srcP2B, srcP3A, srcP3B, - srcM1A, srcM1B, srcM2A, srcM2B, - sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, - pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, - psumA, psumB, sumA, sumB; - - vec_u8 sum, fsum; - - for (i = 0 ; i < 16 ; i ++) { - vec_u8 srcR1 = vec_ld(-2, src); - vec_u8 srcR2 = vec_ld(14, src); - - switch (align) { - default: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = vec_perm(srcR1, srcR2, permP3); - } break; - case 11: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = srcR2; - } break; - case 12: { - vec_u8 srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = srcR2; - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 13: { - vec_u8 srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = srcR2; - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 14: { - vec_u8 srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = srcR2; - srcP1 = vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 15: { - vec_u8 srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = srcR2; - srcP0 = vec_perm(srcR2, srcR3, permP0); - srcP1 = vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - } - - srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0); - 
srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0); - srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1); - srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1); - - srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2); - srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2); - srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3); - srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3); - - srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1); - srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1); - srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2); - srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2); - - sum1A = vec_adds(srcP0A, srcP1A); - sum1B = vec_adds(srcP0B, srcP1B); - sum2A = vec_adds(srcM1A, srcP2A); - sum2B = vec_adds(srcM1B, srcP2B); - sum3A = vec_adds(srcM2A, srcP3A); - sum3B = vec_adds(srcM2B, srcP3B); - - pp1A = vec_mladd(sum1A, v20ss, v16ss); - pp1B = vec_mladd(sum1B, v20ss, v16ss); - - pp2A = vec_mladd(sum2A, v5ss, zero_s16v); - pp2B = vec_mladd(sum2B, v5ss, zero_s16v); - - pp3A = vec_add(sum3A, pp1A); - pp3B = vec_add(sum3B, pp1B); - - psumA = vec_sub(pp3A, pp2A); - psumB = vec_sub(pp3B, pp2B); - - sumA = vec_sra(psumA, v5us); - sumB = vec_sra(psumB, v5us); - - sum = vec_packsu(sumA, sumB); - - ASSERT_ALIGNED(dst); - - OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst)); - - vec_st(fsum, 0, dst); - - src += srcStride; - dst += dstStride; - } -} -#endif - -/* this code assume stride % 16 == 0 */ -#ifdef PREFIX_h264_qpel16_v_lowpass_altivec -static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { - register int i; - - LOAD_ZERO; - const vec_u8 perm = vec_lvsl(0, src); - const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); - const vec_u16 v5us = vec_splat_u16(5); - const vec_s16 v5ss = vec_splat_s16(5); - const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); - - uint8_t *srcbis = src - (srcStride * 2); - - const vec_u8 srcM2a = vec_ld(0, srcbis); - const vec_u8 srcM2b = vec_ld(16, srcbis); - const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm); - //srcbis += srcStride; - const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride); - const vec_u8 srcM1b = vec_ld(16, srcbis); - const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm); - //srcbis += srcStride; - const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride); - const vec_u8 srcP0b = vec_ld(16, srcbis); - const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm); - //srcbis += srcStride; - const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride); - const vec_u8 srcP1b = vec_ld(16, srcbis); - const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm); - //srcbis += srcStride; - const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride); - const vec_u8 srcP2b = vec_ld(16, srcbis); - const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm); - //srcbis += srcStride; - - vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2); - vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2); - vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1); - vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1); - vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0); - vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0); - vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1); - vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1); - vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2); - vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2); - - vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, - psumA, psumB, sumA, sumB, - srcP3ssA, srcP3ssB, - sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; - - vec_u8 sum, fsum, srcP3a, srcP3b, srcP3; - - for (i = 0 ; i 
< 16 ; i++) { - srcP3a = vec_ld(0, srcbis += srcStride); - srcP3b = vec_ld(16, srcbis); - srcP3 = vec_perm(srcP3a, srcP3b, perm); - srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3); - srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3); - //srcbis += srcStride; - - sum1A = vec_adds(srcP0ssA, srcP1ssA); - sum1B = vec_adds(srcP0ssB, srcP1ssB); - sum2A = vec_adds(srcM1ssA, srcP2ssA); - sum2B = vec_adds(srcM1ssB, srcP2ssB); - sum3A = vec_adds(srcM2ssA, srcP3ssA); - sum3B = vec_adds(srcM2ssB, srcP3ssB); - - srcM2ssA = srcM1ssA; - srcM2ssB = srcM1ssB; - srcM1ssA = srcP0ssA; - srcM1ssB = srcP0ssB; - srcP0ssA = srcP1ssA; - srcP0ssB = srcP1ssB; - srcP1ssA = srcP2ssA; - srcP1ssB = srcP2ssB; - srcP2ssA = srcP3ssA; - srcP2ssB = srcP3ssB; - - pp1A = vec_mladd(sum1A, v20ss, v16ss); - pp1B = vec_mladd(sum1B, v20ss, v16ss); - - pp2A = vec_mladd(sum2A, v5ss, zero_s16v); - pp2B = vec_mladd(sum2B, v5ss, zero_s16v); - - pp3A = vec_add(sum3A, pp1A); - pp3B = vec_add(sum3B, pp1B); - - psumA = vec_sub(pp3A, pp2A); - psumB = vec_sub(pp3B, pp2B); - - sumA = vec_sra(psumA, v5us); - sumB = vec_sra(psumB, v5us); - - sum = vec_packsu(sumA, sumB); - - ASSERT_ALIGNED(dst); - - OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst)); - - vec_st(fsum, 0, dst); - - dst += dstStride; - } -} -#endif - -/* this code assume stride % 16 == 0 *and* tmp is properly aligned */ -#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec -static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { - register int i; - LOAD_ZERO; - const vec_u8 permM2 = vec_lvsl(-2, src); - const vec_u8 permM1 = vec_lvsl(-1, src); - const vec_u8 permP0 = vec_lvsl(+0, src); - const vec_u8 permP1 = vec_lvsl(+1, src); - const vec_u8 permP2 = vec_lvsl(+2, src); - const vec_u8 permP3 = vec_lvsl(+3, src); - const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); - const vec_u32 v10ui = vec_splat_u32(10); - const vec_s16 v5ss = vec_splat_s16(5); - const vec_s16 v1ss = vec_splat_s16(1); - const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); - const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); - - register int align = ((((unsigned long)src) - 2) % 16); - - vec_s16 srcP0A, srcP0B, srcP1A, srcP1B, - srcP2A, srcP2B, srcP3A, srcP3B, - srcM1A, srcM1B, srcM2A, srcM2B, - sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, - pp1A, pp1B, pp2A, pp2B, psumA, psumB; - - const vec_u8 mperm = (const vec_u8) - {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, - 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F}; - int16_t *tmpbis = tmp; - - vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, - tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, - tmpP2ssA, tmpP2ssB; - - vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, - pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, - pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, - ssumAe, ssumAo, ssumBe, ssumBo; - vec_u8 fsum, sumv, sum; - vec_s16 ssume, ssumo; - - src -= (2 * srcStride); - for (i = 0 ; i < 21 ; i ++) { - vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; - vec_u8 srcR1 = vec_ld(-2, src); - vec_u8 srcR2 = vec_ld(14, src); - - switch (align) { - default: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = vec_perm(srcR1, srcR2, permP3); - } break; - case 11: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = 
vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = srcR2; - } break; - case 12: { - vec_u8 srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = srcR2; - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 13: { - vec_u8 srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = srcR2; - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 14: { - vec_u8 srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = srcR2; - srcP1 = vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 15: { - vec_u8 srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = srcR2; - srcP0 = vec_perm(srcR2, srcR3, permP0); - srcP1 = vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - } - - srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0); - srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0); - srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1); - srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1); - - srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2); - srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2); - srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3); - srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3); - - srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1); - srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1); - srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2); - srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2); - - sum1A = vec_adds(srcP0A, srcP1A); - sum1B = vec_adds(srcP0B, srcP1B); - sum2A = vec_adds(srcM1A, srcP2A); - sum2B = vec_adds(srcM1B, srcP2B); - sum3A = vec_adds(srcM2A, srcP3A); - sum3B = vec_adds(srcM2B, srcP3B); - - pp1A = vec_mladd(sum1A, v20ss, sum3A); - pp1B = vec_mladd(sum1B, v20ss, sum3B); - - pp2A = vec_mladd(sum2A, v5ss, zero_s16v); - pp2B = vec_mladd(sum2B, v5ss, zero_s16v); - - psumA = vec_sub(pp1A, pp2A); - psumB = vec_sub(pp1B, pp2B); - - vec_st(psumA, 0, tmp); - vec_st(psumB, 16, tmp); - - src += srcStride; - tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ - } - - tmpM2ssA = vec_ld(0, tmpbis); - tmpM2ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpM1ssA = vec_ld(0, tmpbis); - tmpM1ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpP0ssA = vec_ld(0, tmpbis); - tmpP0ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpP1ssA = vec_ld(0, tmpbis); - tmpP1ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpP2ssA = vec_ld(0, tmpbis); - tmpP2ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - - for (i = 0 ; i < 16 ; i++) { - const vec_s16 tmpP3ssA = vec_ld(0, tmpbis); - const vec_s16 tmpP3ssB = vec_ld(16, tmpbis); - - const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA); - const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB); - const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA); - const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB); - const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA); - const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB); - - tmpbis += tmpStride; - - tmpM2ssA = tmpM1ssA; - tmpM2ssB = tmpM1ssB; - tmpM1ssA = tmpP0ssA; - tmpM1ssB = tmpP0ssB; - tmpP0ssA = tmpP1ssA; - tmpP0ssB = 
tmpP1ssB; - tmpP1ssA = tmpP2ssA; - tmpP1ssB = tmpP2ssB; - tmpP2ssA = tmpP3ssA; - tmpP2ssB = tmpP3ssB; - - pp1Ae = vec_mule(sum1A, v20ss); - pp1Ao = vec_mulo(sum1A, v20ss); - pp1Be = vec_mule(sum1B, v20ss); - pp1Bo = vec_mulo(sum1B, v20ss); - - pp2Ae = vec_mule(sum2A, v5ss); - pp2Ao = vec_mulo(sum2A, v5ss); - pp2Be = vec_mule(sum2B, v5ss); - pp2Bo = vec_mulo(sum2B, v5ss); - - pp3Ae = vec_sra((vec_s32)sum3A, v16ui); - pp3Ao = vec_mulo(sum3A, v1ss); - pp3Be = vec_sra((vec_s32)sum3B, v16ui); - pp3Bo = vec_mulo(sum3B, v1ss); - - pp1cAe = vec_add(pp1Ae, v512si); - pp1cAo = vec_add(pp1Ao, v512si); - pp1cBe = vec_add(pp1Be, v512si); - pp1cBo = vec_add(pp1Bo, v512si); - - pp32Ae = vec_sub(pp3Ae, pp2Ae); - pp32Ao = vec_sub(pp3Ao, pp2Ao); - pp32Be = vec_sub(pp3Be, pp2Be); - pp32Bo = vec_sub(pp3Bo, pp2Bo); - - sumAe = vec_add(pp1cAe, pp32Ae); - sumAo = vec_add(pp1cAo, pp32Ao); - sumBe = vec_add(pp1cBe, pp32Be); - sumBo = vec_add(pp1cBo, pp32Bo); - - ssumAe = vec_sra(sumAe, v10ui); - ssumAo = vec_sra(sumAo, v10ui); - ssumBe = vec_sra(sumBe, v10ui); - ssumBo = vec_sra(sumBo, v10ui); - - ssume = vec_packs(ssumAe, ssumBe); - ssumo = vec_packs(ssumAo, ssumBo); - - sumv = vec_packsu(ssume, ssumo); - sum = vec_perm(sumv, sumv, mperm); - - ASSERT_ALIGNED(dst); - - OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst)); - - vec_st(fsum, 0, dst); - - dst += dstStride; - } -} -#endif diff --git a/ffmpeg1/libavcodec/ppc/h264chroma_init.c b/ffmpeg1/libavcodec/ppc/h264chroma_init.c deleted file mode 100644 index f9e2a76..0000000 --- a/ffmpeg1/libavcodec/ppc/h264chroma_init.c +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/attributes.h" -#include "libavcodec/h264chroma.h" - -#if HAVE_ALTIVEC -#include "libavutil/cpu.h" -#include "libavutil/intreadwrite.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "dsputil_altivec.h" - -#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s -#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s) - -#define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC -#define PREFIX_h264_chroma_mc8_altivec put_h264_chroma_mc8_altivec -#define PREFIX_h264_chroma_mc8_num altivec_put_h264_chroma_mc8_num -#include "h264chroma_template.c" -#undef OP_U8_ALTIVEC -#undef PREFIX_h264_chroma_mc8_altivec -#undef PREFIX_h264_chroma_mc8_num - -#define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC -#define PREFIX_h264_chroma_mc8_altivec avg_h264_chroma_mc8_altivec -#define PREFIX_h264_chroma_mc8_num altivec_avg_h264_chroma_mc8_num -#include "h264chroma_template.c" -#undef OP_U8_ALTIVEC -#undef PREFIX_h264_chroma_mc8_altivec -#undef PREFIX_h264_chroma_mc8_num -#endif /* HAVE_ALTIVEC */ - -av_cold void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth) -{ -#if HAVE_ALTIVEC - const int high_bit_depth = bit_depth > 8; - - if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) { - if (!high_bit_depth) { - c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec; - c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec; - } - } -#endif /* HAVE_ALTIVEC */ -} diff --git a/ffmpeg1/libavcodec/ppc/h264chroma_template.c b/ffmpeg1/libavcodec/ppc/h264chroma_template.c deleted file mode 100644 index 7436e11..0000000 --- a/ffmpeg1/libavcodec/ppc/h264chroma_template.c +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/mem.h" - -/* this code assume that stride % 16 == 0 */ - -#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \ - vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\ - vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\ -\ - psum = vec_mladd(vA, vsrc0ssH, BIAS1);\ - psum = vec_mladd(vB, vsrc1ssH, psum);\ - psum = vec_mladd(vC, vsrc2ssH, psum);\ - psum = vec_mladd(vD, vsrc3ssH, psum);\ - psum = BIAS2(psum);\ - psum = vec_sr(psum, v6us);\ -\ - vdst = vec_ld(0, dst);\ - ppsum = (vec_u8)vec_pack(psum, psum);\ - vfdst = vec_perm(vdst, ppsum, fperm);\ -\ - OP_U8_ALTIVEC(fsum, vfdst, vdst);\ -\ - vec_st(fsum, 0, dst);\ -\ - vsrc0ssH = vsrc2ssH;\ - vsrc1ssH = vsrc3ssH;\ -\ - dst += stride;\ - src += stride; - -#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \ -\ - vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\ - vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\ -\ - psum = vec_mladd(vA, vsrc0ssH, v32ss);\ - psum = vec_mladd(vE, vsrc1ssH, psum);\ - psum = vec_sr(psum, v6us);\ -\ - vdst = vec_ld(0, dst);\ - ppsum = (vec_u8)vec_pack(psum, psum);\ - vfdst = vec_perm(vdst, ppsum, fperm);\ -\ - OP_U8_ALTIVEC(fsum, vfdst, vdst);\ -\ - vec_st(fsum, 0, dst);\ -\ - dst += stride;\ - src += stride; - -#define noop(a) a -#define add28(a) vec_add(v28ss, a) - -#ifdef PREFIX_h264_chroma_mc8_altivec -static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, - int stride, int h, int x, int y) { - DECLARE_ALIGNED(16, signed int, ABCD)[4] = - {((8 - x) * (8 - y)), - (( x) * (8 - y)), - ((8 - x) * ( y)), - (( x) * ( y))}; - register int i; - vec_u8 fperm; - const vec_s32 vABCD = vec_ld(0, ABCD); - const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); - const vec_s16 vB = vec_splat((vec_s16)vABCD, 3); - const vec_s16 vC = vec_splat((vec_s16)vABCD, 5); - const vec_s16 vD = vec_splat((vec_s16)vABCD, 7); - LOAD_ZERO; - const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); - const vec_u16 v6us = vec_splat_u16(6); - register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; - register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 
1 : 0; - - vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1; - vec_u8 vsrc0uc, vsrc1uc; - vec_s16 vsrc0ssH, vsrc1ssH; - vec_u8 vsrcCuc, vsrc2uc, vsrc3uc; - vec_s16 vsrc2ssH, vsrc3ssH, psum; - vec_u8 vdst, ppsum, vfdst, fsum; - - if (((unsigned long)dst) % 16 == 0) { - fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, - 0x14, 0x15, 0x16, 0x17, - 0x08, 0x09, 0x0A, 0x0B, - 0x0C, 0x0D, 0x0E, 0x0F}; - } else { - fperm = (vec_u8){0x00, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07, - 0x18, 0x19, 0x1A, 0x1B, - 0x1C, 0x1D, 0x1E, 0x1F}; - } - - vsrcAuc = vec_ld(0, src); - - if (loadSecond) - vsrcBuc = vec_ld(16, src); - vsrcperm0 = vec_lvsl(0, src); - vsrcperm1 = vec_lvsl(1, src); - - vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); - if (reallyBadAlign) - vsrc1uc = vsrcBuc; - else - vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); - - vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc); - vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc); - - if (ABCD[3]) { - if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE(v32ss, noop) - } - } else { - vec_u8 vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrcDuc = vec_ld(stride + 16, src); - vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - if (reallyBadAlign) - vsrc3uc = vsrcDuc; - else - vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE(v32ss, noop) - } - } - } else { - const vec_s16 vE = vec_add(vB, vC); - if (ABCD[2]) { // x == 0 B == 0 - if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - CHROMA_MC8_ALTIVEC_CORE_SIMPLE - - vsrc0uc = vsrc1uc; - } - } else { - vec_u8 vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrcDuc = vec_ld(stride + 15, src); - vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - CHROMA_MC8_ALTIVEC_CORE_SIMPLE - - vsrc0uc = vsrc1uc; - } - } - } else { // y == 0 C == 0 - if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(0, src); - vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE_SIMPLE - } - } else { - vec_u8 vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(0, src); - vsrcDuc = vec_ld(15, src); - vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - if (reallyBadAlign) - vsrc1uc = vsrcDuc; - else - vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE_SIMPLE - } - } - } - } -} -#endif - -/* this code assume that stride % 16 == 0 */ -#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec -static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { - DECLARE_ALIGNED(16, signed int, ABCD)[4] = - {((8 - x) * (8 - y)), - (( x) * (8 - y)), - ((8 - x) * ( y)), - (( x) * ( y))}; - register int i; - vec_u8 fperm; - const vec_s32 vABCD = vec_ld(0, ABCD); - const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); - const vec_s16 vB = vec_splat((vec_s16)vABCD, 3); - const vec_s16 vC = vec_splat((vec_s16)vABCD, 5); - const vec_s16 vD = vec_splat((vec_s16)vABCD, 7); - LOAD_ZERO; - const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); - const vec_u16 v6us = vec_splat_u16(6); - register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 
0 : 1; - register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; - - vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1; - vec_u8 vsrc0uc, vsrc1uc; - vec_s16 vsrc0ssH, vsrc1ssH; - vec_u8 vsrcCuc, vsrc2uc, vsrc3uc; - vec_s16 vsrc2ssH, vsrc3ssH, psum; - vec_u8 vdst, ppsum, vfdst, fsum; - - if (((unsigned long)dst) % 16 == 0) { - fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, - 0x14, 0x15, 0x16, 0x17, - 0x08, 0x09, 0x0A, 0x0B, - 0x0C, 0x0D, 0x0E, 0x0F}; - } else { - fperm = (vec_u8){0x00, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07, - 0x18, 0x19, 0x1A, 0x1B, - 0x1C, 0x1D, 0x1E, 0x1F}; - } - - vsrcAuc = vec_ld(0, src); - - if (loadSecond) - vsrcBuc = vec_ld(16, src); - vsrcperm0 = vec_lvsl(0, src); - vsrcperm1 = vec_lvsl(1, src); - - vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); - if (reallyBadAlign) - vsrc1uc = vsrcBuc; - else - vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); - - vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc); - vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc); - - if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { - - - vsrcCuc = vec_ld(stride + 0, src); - - vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28) - } - } else { - vec_u8 vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrcDuc = vec_ld(stride + 16, src); - - vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - if (reallyBadAlign) - vsrc3uc = vsrcDuc; - else - vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28) - } - } -} -#endif - -#undef noop -#undef add28 -#undef CHROMA_MC8_ALTIVEC_CORE diff --git a/ffmpeg1/libavcodec/ppc/hpeldsp_altivec.c b/ffmpeg1/libavcodec/ppc/hpeldsp_altivec.c deleted file mode 100644 index 4309d39..0000000 --- a/ffmpeg1/libavcodec/ppc/hpeldsp_altivec.c +++ /dev/null @@ -1,464 +0,0 @@ -/* - * Copyright (c) 2002 Brian Foley - * Copyright (c) 2002 Dieter Shirley - * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/cpu.h" -#include "libavcodec/hpeldsp.h" - -#if HAVE_ALTIVEC -#if HAVE_ALTIVEC_H -#include <altivec.h> -#endif -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "dsputil_altivec.h" - -/* next one assumes that ((line_size % 16) == 0) */ -void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - register vector unsigned char pixelsv1, pixelsv2; - register vector unsigned char pixelsv1B, pixelsv2B; - register vector unsigned char pixelsv1C, pixelsv2C; - register vector unsigned char pixelsv1D, pixelsv2D; - - register vector unsigned char perm = vec_lvsl(0, pixels); - int i; - register ptrdiff_t line_size_2 = line_size << 1; - register ptrdiff_t line_size_3 = line_size + line_size_2; - register ptrdiff_t line_size_4 = line_size << 2; - -// hand-unrolling the loop by 4 gains about 15% -// mininum execution time goes from 74 to 60 cycles -// it's faster than -funroll-loops, but using -// -funroll-loops w/ this is bad - 74 cycles again. -// all this is on a 7450, tuning for the 7450 - for (i = 0; i < h; i += 4) { - pixelsv1 = vec_ld( 0, pixels); - pixelsv2 = vec_ld(15, pixels); - pixelsv1B = vec_ld(line_size, pixels); - pixelsv2B = vec_ld(15 + line_size, pixels); - pixelsv1C = vec_ld(line_size_2, pixels); - pixelsv2C = vec_ld(15 + line_size_2, pixels); - pixelsv1D = vec_ld(line_size_3, pixels); - pixelsv2D = vec_ld(15 + line_size_3, pixels); - vec_st(vec_perm(pixelsv1, pixelsv2, perm), - 0, (unsigned char*)block); - vec_st(vec_perm(pixelsv1B, pixelsv2B, perm), - line_size, (unsigned char*)block); - vec_st(vec_perm(pixelsv1C, pixelsv2C, perm), - line_size_2, (unsigned char*)block); - vec_st(vec_perm(pixelsv1D, pixelsv2D, perm), - line_size_3, (unsigned char*)block); - pixels+=line_size_4; - block +=line_size_4; - } -} - -/* next one assumes that ((line_size % 16) == 0) */ -#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) -void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; - register vector unsigned char perm = vec_lvsl(0, pixels); - int i; - - for (i = 0; i < h; i++) { - pixelsv1 = vec_ld( 0, pixels); - pixelsv2 = vec_ld(16,pixels); - blockv = vec_ld(0, block); - pixelsv = vec_perm(pixelsv1, pixelsv2, perm); - blockv = vec_avg(blockv,pixelsv); - vec_st(blockv, 0, (unsigned char*)block); - pixels+=line_size; - block +=line_size; - } -} - -/* next one assumes that ((line_size % 8) == 0) */ -static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h) -{ - register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; - int i; - - for (i = 0; i < h; i++) { - /* block is 8 bytes-aligned, so we're either in the - left block (16 bytes-aligned) or in the right block (not) */ - int rightside = ((unsigned long)block & 0x0000000F); - - blockv = vec_ld(0, block); - pixelsv1 = vec_ld( 0, pixels); - pixelsv2 = vec_ld(16, pixels); - pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); - - if (rightside) { - pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); - } else { - pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); - } - - blockv = vec_avg(blockv, pixelsv); - - 
vec_st(blockv, 0, block); - - pixels += line_size; - block += line_size; - } -} - -/* next one assumes that ((line_size % 8) == 0) */ -static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - register int i; - register vector unsigned char pixelsv1, pixelsv2, pixelsavg; - register vector unsigned char blockv, temp1, temp2; - register vector unsigned short pixelssum1, pixelssum2, temp3; - register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); - register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vctwo); - - for (i = 0; i < h ; i++) { - int rightside = ((unsigned long)block & 0x0000000F); - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - pixelssum1 = vec_add(pixelssum2, vctwo); - pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); - - if (rightside) { - blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); - } else { - blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); - } - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } -} - -/* next one assumes that ((line_size % 8) == 0) */ -static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - register int i; - register vector unsigned char pixelsv1, pixelsv2, pixelsavg; - register vector unsigned char blockv, temp1, temp2; - register vector unsigned short pixelssum1, pixelssum2, temp3; - register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); - register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); - register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vcone); - - for (i = 0; i < h ; i++) { - int rightside = ((unsigned long)block & 0x0000000F); - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, 
vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - pixelssum1 = vec_add(pixelssum2, vcone); - pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); - - if (rightside) { - blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); - } else { - blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); - } - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } -} - -/* next one assumes that ((line_size % 16) == 0) */ -static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h) -{ - register int i; - register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; - register vector unsigned char blockv, temp1, temp2; - register vector unsigned short temp3, temp4, - pixelssum1, pixelssum2, pixelssum3, pixelssum4; - register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); - register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum3 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum3 = vec_add(pixelssum3, vctwo); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vctwo); - - for (i = 0; i < h ; i++) { - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - - pixelssum4 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp4 = vec_add(pixelssum3, pixelssum4); - temp4 = vec_sra(temp4, vctwo); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - - pixelssum3 = vec_add(pixelssum4, vctwo); - pixelssum1 = vec_add(pixelssum2, vctwo); - - blockv = vec_packsu(temp3, temp4); - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } -} - -/* next one assumes that ((line_size % 16) == 0) */ -static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h) -{ - register int i; - register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; - register vector unsigned char blockv, temp1, temp2; - 
register vector unsigned short temp3, temp4, - pixelssum1, pixelssum2, pixelssum3, pixelssum4; - register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); - register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); - register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum3 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum3 = vec_add(pixelssum3, vcone); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vcone); - - for (i = 0; i < h ; i++) { - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - - pixelssum4 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp4 = vec_add(pixelssum3, pixelssum4); - temp4 = vec_sra(temp4, vctwo); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - - pixelssum3 = vec_add(pixelssum4, vcone); - pixelssum1 = vec_add(pixelssum2, vcone); - - blockv = vec_packsu(temp3, temp4); - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } -} - -/* next one assumes that ((line_size % 8) == 0) */ -static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - register int i; - register vector unsigned char pixelsv1, pixelsv2, pixelsavg; - register vector unsigned char blockv, temp1, temp2, blocktemp; - register vector unsigned short pixelssum1, pixelssum2, temp3; - - register const vector unsigned char vczero = (const vector unsigned char) - vec_splat_u8(0); - register const vector unsigned short vctwo = (const vector unsigned short) - vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vctwo); - - for (i = 0; i < h ; i++) { - int rightside = ((unsigned long)block & 0x0000000F); - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned 
long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - pixelssum1 = vec_add(pixelssum2, vctwo); - pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); - - if (rightside) { - blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); - } else { - blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); - } - - blockv = vec_avg(blocktemp, blockv); - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } -} -#endif /* HAVE_ALTIVEC */ - -void ff_hpeldsp_init_ppc(HpelDSPContext* c, int flags) -{ -#if HAVE_ALTIVEC - int mm_flags = av_get_cpu_flags(); - - if (mm_flags & AV_CPU_FLAG_ALTIVEC) { - c->put_pixels_tab[0][0] = ff_put_pixels16_altivec; - c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec; - c->avg_pixels_tab[0][0] = ff_avg_pixels16_altivec; - c->avg_pixels_tab[1][0] = avg_pixels8_altivec; - c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec; - c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec; - c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec; - c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec; - c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec; - } -#endif /* HAVE_ALTIVEC */ -} diff --git a/ffmpeg1/libavcodec/ppc/idct_altivec.c b/ffmpeg1/libavcodec/ppc/idct_altivec.c deleted file mode 100644 index c6f2cd8..0000000 --- a/ffmpeg1/libavcodec/ppc/idct_altivec.c +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) 2001 Michel Lespinasse - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* - * NOTE: This code is based on GPL code from the libmpeg2 project. The - * author, Michel Lespinasses, has given explicit permission to release - * under LGPL as part of FFmpeg. - */ - -/* - * FFmpeg integration by Dieter Shirley - * - * This file is a direct copy of the AltiVec IDCT module from the libmpeg2 - * project. I've deleted all of the libmpeg2-specific code, renamed the - * functions and reordered the function parameters. The only change to the - * IDCT function itself was to factor out the partial transposition, and to - * perform a full transpose at the end of the function. 
- */ - - -#include <stdlib.h> /* malloc(), free() */ -#include <string.h> -#include "config.h" -#if HAVE_ALTIVEC_H -#include <altivec.h> -#endif -#include "libavutil/ppc/types_altivec.h" -#include "dsputil_altivec.h" - -#define IDCT_HALF \ - /* 1st stage */ \ - t1 = vec_mradds (a1, vx7, vx1 ); \ - t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \ - t7 = vec_mradds (a2, vx5, vx3); \ - t3 = vec_mradds (ma2, vx3, vx5); \ - \ - /* 2nd stage */ \ - t5 = vec_adds (vx0, vx4); \ - t0 = vec_subs (vx0, vx4); \ - t2 = vec_mradds (a0, vx6, vx2); \ - t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \ - t6 = vec_adds (t8, t3); \ - t3 = vec_subs (t8, t3); \ - t8 = vec_subs (t1, t7); \ - t1 = vec_adds (t1, t7); \ - \ - /* 3rd stage */ \ - t7 = vec_adds (t5, t2); \ - t2 = vec_subs (t5, t2); \ - t5 = vec_adds (t0, t4); \ - t0 = vec_subs (t0, t4); \ - t4 = vec_subs (t8, t3); \ - t3 = vec_adds (t8, t3); \ - \ - /* 4th stage */ \ - vy0 = vec_adds (t7, t1); \ - vy7 = vec_subs (t7, t1); \ - vy1 = vec_mradds (c4, t3, t5); \ - vy6 = vec_mradds (mc4, t3, t5); \ - vy2 = vec_mradds (c4, t4, t0); \ - vy5 = vec_mradds (mc4, t4, t0); \ - vy3 = vec_adds (t2, t6); \ - vy4 = vec_subs (t2, t6); - - -#define IDCT \ - vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \ - vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \ - vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias; \ - vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \ - vec_u16 shift; \ - \ - c4 = vec_splat (constants[0], 0); \ - a0 = vec_splat (constants[0], 1); \ - a1 = vec_splat (constants[0], 2); \ - a2 = vec_splat (constants[0], 3); \ - mc4 = vec_splat (constants[0], 4); \ - ma2 = vec_splat (constants[0], 5); \ - bias = (vec_s16)vec_splat ((vec_s32)constants[0], 3); \ - \ - zero = vec_splat_s16 (0); \ - shift = vec_splat_u16 (4); \ - \ - vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \ - vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \ - vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \ - vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \ - vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \ - vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \ - vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \ - vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \ - \ - IDCT_HALF \ - \ - vx0 = vec_mergeh (vy0, vy4); \ - vx1 = vec_mergel (vy0, vy4); \ - vx2 = vec_mergeh (vy1, vy5); \ - vx3 = vec_mergel (vy1, vy5); \ - vx4 = vec_mergeh (vy2, vy6); \ - vx5 = vec_mergel (vy2, vy6); \ - vx6 = vec_mergeh (vy3, vy7); \ - vx7 = vec_mergel (vy3, vy7); \ - \ - vy0 = vec_mergeh (vx0, vx4); \ - vy1 = vec_mergel (vx0, vx4); \ - vy2 = vec_mergeh (vx1, vx5); \ - vy3 = vec_mergel (vx1, vx5); \ - vy4 = vec_mergeh (vx2, vx6); \ - vy5 = vec_mergel (vx2, vx6); \ - vy6 = vec_mergeh (vx3, vx7); \ - vy7 = vec_mergel (vx3, vx7); \ - \ - vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \ - vx1 = vec_mergel (vy0, vy4); \ - vx2 = vec_mergeh (vy1, vy5); \ - vx3 = vec_mergel (vy1, vy5); \ - vx4 = vec_mergeh (vy2, vy6); \ - vx5 = vec_mergel (vy2, vy6); \ - vx6 = vec_mergeh (vy3, vy7); \ - vx7 = vec_mergel (vy3, vy7); \ - \ - IDCT_HALF \ - \ - shift = vec_splat_u16 (6); \ - vx0 = vec_sra (vy0, shift); \ - vx1 = vec_sra (vy1, shift); \ - vx2 = vec_sra (vy2, shift); \ - vx3 = vec_sra (vy3, shift); \ - vx4 = vec_sra (vy4, shift); \ - vx5 = vec_sra (vy5, shift); \ - vx6 = vec_sra (vy6, shift); \ - vx7 = vec_sra (vy7, shift); - - -static const vec_s16 constants[5] = { - {23170, 13573, 6518, 21895, 
-23170, -21895, 32, 31}, - {16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725}, - {22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521}, - {21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692}, - {19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722} -}; - -void ff_idct_put_altivec(uint8_t* dest, int stride, int16_t *blk) -{ - vec_s16 *block = (vec_s16*)blk; - vec_u8 tmp; - - IDCT - -#define COPY(dest,src) \ - tmp = vec_packsu (src, src); \ - vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \ - vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest); - - COPY (dest, vx0) dest += stride; - COPY (dest, vx1) dest += stride; - COPY (dest, vx2) dest += stride; - COPY (dest, vx3) dest += stride; - COPY (dest, vx4) dest += stride; - COPY (dest, vx5) dest += stride; - COPY (dest, vx6) dest += stride; - COPY (dest, vx7) -} - -void ff_idct_add_altivec(uint8_t* dest, int stride, int16_t *blk) -{ - vec_s16 *block = (vec_s16*)blk; - vec_u8 tmp; - vec_s16 tmp2, tmp3; - vec_u8 perm0; - vec_u8 perm1; - vec_u8 p0, p1, p; - - IDCT - - p0 = vec_lvsl (0, dest); - p1 = vec_lvsl (stride, dest); - p = vec_splat_u8 (-1); - perm0 = vec_mergeh (p, p0); - perm1 = vec_mergeh (p, p1); - -#define ADD(dest,src,perm) \ - /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \ - tmp = vec_ld (0, dest); \ - tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm); \ - tmp3 = vec_adds (tmp2, src); \ - tmp = vec_packsu (tmp3, tmp3); \ - vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \ - vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest); - - ADD (dest, vx0, perm0) dest += stride; - ADD (dest, vx1, perm1) dest += stride; - ADD (dest, vx2, perm0) dest += stride; - ADD (dest, vx3, perm1) dest += stride; - ADD (dest, vx4, perm0) dest += stride; - ADD (dest, vx5, perm1) dest += stride; - ADD (dest, vx6, perm0) dest += stride; - ADD (dest, vx7, perm1) -} diff --git a/ffmpeg1/libavcodec/ppc/int_altivec.c b/ffmpeg1/libavcodec/ppc/int_altivec.c deleted file mode 100644 index 4386b13..0000000 --- a/ffmpeg1/libavcodec/ppc/int_altivec.c +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - ** @file - ** integer misc ops. 
- **/ - -#include "config.h" -#if HAVE_ALTIVEC_H -#include <altivec.h> -#endif - -#include "libavutil/attributes.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavcodec/dsputil.h" - -#include "dsputil_altivec.h" - -static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2, - int size) { - int i, size16; - vector signed char vpix1; - vector signed short vpix2, vdiff, vpix1l,vpix1h; - union { vector signed int vscore; - int32_t score[4]; - } u; - u.vscore = vec_splat_s32(0); -// -//XXX lazy way, fix it later - -#define vec_unaligned_load(b) \ - vec_perm(vec_ld(0,b),vec_ld(15,b),vec_lvsl(0, b)); - - size16 = size >> 4; - while(size16) { -// score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]); - //load pix1 and the first batch of pix2 - - vpix1 = vec_unaligned_load(pix1); - vpix2 = vec_unaligned_load(pix2); - pix2 += 8; - //unpack - vpix1h = vec_unpackh(vpix1); - vdiff = vec_sub(vpix1h, vpix2); - vpix1l = vec_unpackl(vpix1); - // load another batch from pix2 - vpix2 = vec_unaligned_load(pix2); - u.vscore = vec_msum(vdiff, vdiff, u.vscore); - vdiff = vec_sub(vpix1l, vpix2); - u.vscore = vec_msum(vdiff, vdiff, u.vscore); - pix1 += 16; - pix2 += 8; - size16--; - } - u.vscore = vec_sums(u.vscore, vec_splat_s32(0)); - - size %= 16; - for (i = 0; i < size; i++) { - u.score[3] += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]); - } - return u.score[3]; -} - -static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2, - int order) -{ - int i; - LOAD_ZERO; - const vec_s16 *pv; - register vec_s16 vec1; - register vec_s32 res = vec_splat_s32(0), t; - int32_t ires; - - for(i = 0; i < order; i += 8){ - pv = (const vec_s16*)v1; - vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1)); - t = vec_msum(vec1, vec_ld(0, v2), zero_s32v); - res = vec_sums(t, res); - v1 += 8; - v2 += 8; - } - res = vec_splat(res, 3); - vec_ste(res, 0, &ires); - return ires; -} - -static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul) -{ - LOAD_ZERO; - vec_s16 *pv1 = (vec_s16*)v1; - register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul}; - register vec_s16 t0, t1, i0, i1, i4; - register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3); - register vec_s32 res = zero_s32v; - register vec_u8 align = vec_lvsl(0, v2); - int32_t ires; - order >>= 4; - do { - i1 = vec_ld(16, v2); - t0 = vec_perm(i2, i1, align); - i2 = vec_ld(32, v2); - t1 = vec_perm(i1, i2, align); - i0 = pv1[0]; - i1 = pv1[1]; - res = vec_msum(t0, i0, res); - res = vec_msum(t1, i1, res); - i4 = vec_ld(16, v3); - t0 = vec_perm(i3, i4, align); - i3 = vec_ld(32, v3); - t1 = vec_perm(i4, i3, align); - pv1[0] = vec_mladd(t0, muls, i0); - pv1[1] = vec_mladd(t1, muls, i1); - pv1 += 2; - v2 += 8; - v3 += 8; - } while(--order); - res = vec_splat(vec_sums(res, zero_s32v), 3); - vec_ste(res, 0, &ires); - return ires; -} - -av_cold void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx) -{ - c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec; - c->scalarproduct_int16 = scalarproduct_int16_altivec; - c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec; -} diff --git a/ffmpeg1/libavcodec/ppc/mathops.h b/ffmpeg1/libavcodec/ppc/mathops.h deleted file mode 100644 index dbd714f..0000000 --- a/ffmpeg1/libavcodec/ppc/mathops.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * simple math operations - * Copyright (c) 2001, 2002 Fabrice Bellard - * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_PPC_MATHOPS_H -#define AVCODEC_PPC_MATHOPS_H - -#include <stdint.h> -#include "config.h" -#include "libavutil/common.h" - -#if HAVE_PPC4XX -/* signed 16x16 -> 32 multiply add accumulate */ -#define MAC16(rt, ra, rb) \ - __asm__ ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); - -/* signed 16x16 -> 32 multiply */ -#define MUL16(ra, rb) \ - ({ int __rt; \ - __asm__ ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \ - __rt; }) -#endif - -#define MULH MULH -static inline av_const int MULH(int a, int b){ - int r; - __asm__ ("mulhw %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); - return r; -} - -#if !ARCH_PPC64 -static inline av_const int64_t MAC64(int64_t d, int a, int b) -{ - union { uint64_t x; unsigned hl[2]; } x = { d }; - int h, l; - __asm__ ("mullw %3, %4, %5 \n\t" - "mulhw %2, %4, %5 \n\t" - "addc %1, %1, %3 \n\t" - "adde %0, %0, %2 \n\t" - : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l) - : "r"(a), "r"(b)); - return x.x; -} -#define MAC64(d, a, b) ((d) = MAC64(d, a, b)) - -static inline av_const int64_t MLS64(int64_t d, int a, int b) -{ - union { uint64_t x; unsigned hl[2]; } x = { d }; - int h, l; - __asm__ ("mullw %3, %4, %5 \n\t" - "mulhw %2, %4, %5 \n\t" - "subfc %1, %3, %1 \n\t" - "subfe %0, %2, %0 \n\t" - : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l) - : "r"(a), "r"(b)); - return x.x; -} -#define MLS64(d, a, b) ((d) = MLS64(d, a, b)) -#endif - -#endif /* AVCODEC_PPC_MATHOPS_H */ diff --git a/ffmpeg1/libavcodec/ppc/mpegaudiodec_altivec.c b/ffmpeg1/libavcodec/ppc/mpegaudiodec_altivec.c deleted file mode 100644 index 1152fd7..0000000 --- a/ffmpeg1/libavcodec/ppc/mpegaudiodec_altivec.c +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Altivec optimized MP3 decoding functions - * Copyright (c) 2010 Vitor Sessak - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "dsputil_altivec.h" -#include "libavutil/attributes.h" -#include "libavutil/internal.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/mpegaudiodsp.h" - -#define MACS(rt, ra, rb) rt+=(ra)*(rb) -#define MLSS(rt, ra, rb) rt-=(ra)*(rb) - -#define SUM8(op, sum, w, p) \ -{ \ - op(sum, (w)[0 * 64], (p)[0 * 64]); \ - op(sum, (w)[1 * 64], (p)[1 * 64]); \ - op(sum, (w)[2 * 64], (p)[2 * 64]); \ - op(sum, (w)[3 * 64], (p)[3 * 64]); \ - op(sum, (w)[4 * 64], (p)[4 * 64]); \ - op(sum, (w)[5 * 64], (p)[5 * 64]); \ - op(sum, (w)[6 * 64], (p)[6 * 64]); \ - op(sum, (w)[7 * 64], (p)[7 * 64]); \ -} - -static void apply_window(const float *buf, const float *win1, - const float *win2, float *sum1, float *sum2, int len) -{ - const vector float *win1a = (const vector float *) win1; - const vector float *win2a = (const vector float *) win2; - const vector float *bufa = (const vector float *) buf; - vector float *sum1a = (vector float *) sum1; - vector float *sum2a = (vector float *) sum2; - vector float av_uninit(v0), av_uninit(v4); - vector float v1, v2, v3; - - len = len >> 2; - -#define MULT(a, b) \ - { \ - v1 = vec_ld(a, win1a); \ - v2 = vec_ld(b, win2a); \ - v3 = vec_ld(a, bufa); \ - v0 = vec_madd(v3, v1, v0); \ - v4 = vec_madd(v2, v3, v4); \ - } - - while (len--) { - v0 = vec_xor(v0, v0); - v4 = vec_xor(v4, v4); - - MULT( 0, 0); - MULT( 256, 64); - MULT( 512, 128); - MULT( 768, 192); - MULT(1024, 256); - MULT(1280, 320); - MULT(1536, 384); - MULT(1792, 448); - - vec_st(v0, 0, sum1a); - vec_st(v4, 0, sum2a); - sum1a++; - sum2a++; - win1a++; - win2a++; - bufa++; - } -} - -static void apply_window_mp3(float *in, float *win, int *unused, float *out, - int incr) -{ - LOCAL_ALIGNED_16(float, suma, [17]); - LOCAL_ALIGNED_16(float, sumb, [17]); - LOCAL_ALIGNED_16(float, sumc, [17]); - LOCAL_ALIGNED_16(float, sumd, [17]); - - float sum; - int j; - float *out2 = out + 32 * incr; - - /* copy to avoid wrap */ - memcpy(in + 512, in, 32 * sizeof(*in)); - - apply_window(in + 16, win , win + 512, suma, sumc, 16); - apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16); - - SUM8(MLSS, suma[0], win + 32, in + 48); - - sumc[ 0] = 0; - sumb[16] = 0; - sumd[16] = 0; - - out[0 ] = suma[ 0]; - out += incr; - out2 -= incr; - for(j=1;j<16;j++) { - *out = suma[ j] - sumd[16-j]; - *out2 = -sumb[16-j] - sumc[ j]; - out += incr; - out2 -= incr; - } - - sum = 0; - SUM8(MLSS, sum, win + 16 + 32, in + 32); - *out = sum; -} - -av_cold void ff_mpadsp_init_altivec(MPADSPContext *s) -{ - s->apply_window_float = apply_window_mp3; -} diff --git a/ffmpeg1/libavcodec/ppc/mpegvideo_altivec.c b/ffmpeg1/libavcodec/ppc/mpegvideo_altivec.c deleted file mode 100644 index bf490b0..0000000 --- a/ffmpeg1/libavcodec/ppc/mpegvideo_altivec.c +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2002 Dieter Shirley - * - * dct_unquantize_h263_altivec: - * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdlib.h> -#include <stdio.h> - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/mpegvideo.h" - -#include "dsputil_altivec.h" - -/* AltiVec version of dct_unquantize_h263 - this code assumes `block' is 16 bytes-aligned */ -static void dct_unquantize_h263_altivec(MpegEncContext *s, - int16_t *block, int n, int qscale) -{ - int i, level, qmul, qadd; - int nCoeffs; - - assert(s->block_last_index[n]>=0); - - qadd = (qscale - 1) | 1; - qmul = qscale << 1; - - if (s->mb_intra) { - if (!s->h263_aic) { - if (n < 4) - block[0] = block[0] * s->y_dc_scale; - else - block[0] = block[0] * s->c_dc_scale; - }else - qadd = 0; - i = 1; - nCoeffs= 63; //does not always use zigzag table - } else { - i = 0; - nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; - } - - { - register const vector signed short vczero = (const vector signed short)vec_splat_s16(0); - DECLARE_ALIGNED(16, short, qmul8) = qmul; - DECLARE_ALIGNED(16, short, qadd8) = qadd; - register vector signed short blockv, qmulv, qaddv, nqaddv, temp1; - register vector bool short blockv_null, blockv_neg; - register short backup_0 = block[0]; - register int j = 0; - - qmulv = vec_splat((vec_s16)vec_lde(0, &qmul8), 0); - qaddv = vec_splat((vec_s16)vec_lde(0, &qadd8), 0); - nqaddv = vec_sub(vczero, qaddv); - - // vectorize all the 16 bytes-aligned blocks - // of 8 elements - for(; (j + 7) <= nCoeffs ; j+=8) { - blockv = vec_ld(j << 1, block); - blockv_neg = vec_cmplt(blockv, vczero); - blockv_null = vec_cmpeq(blockv, vczero); - // choose between +qadd or -qadd as the third operand - temp1 = vec_sel(qaddv, nqaddv, blockv_neg); - // multiply & add (block{i,i+7} * qmul [+-] qadd) - temp1 = vec_mladd(blockv, qmulv, temp1); - // put 0 where block[{i,i+7} used to have 0 - blockv = vec_sel(temp1, blockv, blockv_null); - vec_st(blockv, j << 1, block); - } - - // if nCoeffs isn't a multiple of 8, finish the job - // using good old scalar units. - // (we could do it using a truncated vector, - // but I'm not sure it's worth the hassle) - for(; j <= nCoeffs ; j++) { - level = block[j]; - if (level) { - if (level < 0) { - level = level * qmul - qadd; - } else { - level = level * qmul + qadd; - } - block[j] = level; - } - } - - if (i == 1) { - // cheat. 
this avoid special-casing the first iteration - block[0] = backup_0; - } - } -} - - -av_cold void ff_MPV_common_init_altivec(MpegEncContext *s) -{ - if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) return; - - if ((s->avctx->dct_algo == FF_DCT_AUTO) || - (s->avctx->dct_algo == FF_DCT_ALTIVEC)) { - s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec; - s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec; - } -} diff --git a/ffmpeg1/libavcodec/ppc/vc1dsp_altivec.c b/ffmpeg1/libavcodec/ppc/vc1dsp_altivec.c deleted file mode 100644 index 9c2ad70..0000000 --- a/ffmpeg1/libavcodec/ppc/vc1dsp_altivec.c +++ /dev/null @@ -1,347 +0,0 @@ -/* - * VC-1 and WMV3 decoder - DSP functions AltiVec-optimized - * Copyright (c) 2006 Konstantin Shishkov - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/vc1dsp.h" - -// main steps of 8x8 transform -#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \ -do { \ - t0 = vec_sl(vec_add(s0, s4), vec_2); \ - t0 = vec_add(vec_sl(t0, vec_1), t0); \ - t0 = vec_add(t0, vec_rnd); \ - t1 = vec_sl(vec_sub(s0, s4), vec_2); \ - t1 = vec_add(vec_sl(t1, vec_1), t1); \ - t1 = vec_add(t1, vec_rnd); \ - t2 = vec_add(vec_sl(s6, vec_2), vec_sl(s6, vec_1)); \ - t2 = vec_add(t2, vec_sl(s2, vec_4)); \ - t3 = vec_add(vec_sl(s2, vec_2), vec_sl(s2, vec_1)); \ - t3 = vec_sub(t3, vec_sl(s6, vec_4)); \ - t4 = vec_add(t0, t2); \ - t5 = vec_add(t1, t3); \ - t6 = vec_sub(t1, t3); \ - t7 = vec_sub(t0, t2); \ -\ - t0 = vec_sl(vec_add(s1, s3), vec_4); \ - t0 = vec_add(t0, vec_sl(s5, vec_3)); \ - t0 = vec_add(t0, vec_sl(s7, vec_2)); \ - t0 = vec_add(t0, vec_sub(s5, s3)); \ -\ - t1 = vec_sl(vec_sub(s1, s5), vec_4); \ - t1 = vec_sub(t1, vec_sl(s7, vec_3)); \ - t1 = vec_sub(t1, vec_sl(s3, vec_2)); \ - t1 = vec_sub(t1, vec_add(s1, s7)); \ -\ - t2 = vec_sl(vec_sub(s7, s3), vec_4); \ - t2 = vec_add(t2, vec_sl(s1, vec_3)); \ - t2 = vec_add(t2, vec_sl(s5, vec_2)); \ - t2 = vec_add(t2, vec_sub(s1, s7)); \ -\ - t3 = vec_sl(vec_sub(s5, s7), vec_4); \ - t3 = vec_sub(t3, vec_sl(s3, vec_3)); \ - t3 = vec_add(t3, vec_sl(s1, vec_2)); \ - t3 = vec_sub(t3, vec_add(s3, s5)); \ -\ - s0 = vec_add(t4, t0); \ - s1 = vec_add(t5, t1); \ - s2 = vec_add(t6, t2); \ - s3 = vec_add(t7, t3); \ - s4 = vec_sub(t7, t3); \ - s5 = vec_sub(t6, t2); \ - s6 = vec_sub(t5, t1); \ - s7 = vec_sub(t4, t0); \ -}while(0) - -#define SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7) \ -do { \ - s0 = vec_sra(s0, vec_3); \ - s1 = vec_sra(s1, vec_3); \ - s2 = vec_sra(s2, vec_3); \ - s3 = vec_sra(s3, vec_3); \ - s4 = vec_sra(s4, vec_3); \ - s5 = vec_sra(s5, vec_3); \ - s6 = vec_sra(s6, vec_3); \ - s7 = vec_sra(s7, vec_3); \ -}while(0) - -#define SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7) 
\ -do { \ - s0 = vec_sra(s0, vec_7); \ - s1 = vec_sra(s1, vec_7); \ - s2 = vec_sra(s2, vec_7); \ - s3 = vec_sra(s3, vec_7); \ - s4 = vec_sra(vec_add(s4, vec_1s), vec_7); \ - s5 = vec_sra(vec_add(s5, vec_1s), vec_7); \ - s6 = vec_sra(vec_add(s6, vec_1s), vec_7); \ - s7 = vec_sra(vec_add(s7, vec_1s), vec_7); \ -}while(0) - -/* main steps of 4x4 transform */ -#define STEP4(s0, s1, s2, s3, vec_rnd) \ -do { \ - t1 = vec_add(vec_sl(s0, vec_4), s0); \ - t1 = vec_add(t1, vec_rnd); \ - t2 = vec_add(vec_sl(s2, vec_4), s2); \ - t0 = vec_add(t1, t2); \ - t1 = vec_sub(t1, t2); \ - t3 = vec_sl(vec_sub(s3, s1), vec_1); \ - t3 = vec_add(t3, vec_sl(t3, vec_2)); \ - t2 = vec_add(t3, vec_sl(s1, vec_5)); \ - t3 = vec_add(t3, vec_sl(s3, vec_3)); \ - t3 = vec_add(t3, vec_sl(s3, vec_2)); \ - s0 = vec_add(t0, t2); \ - s1 = vec_sub(t1, t3); \ - s2 = vec_add(t1, t3); \ - s3 = vec_sub(t0, t2); \ -}while (0) - -#define SHIFT_HOR4(s0, s1, s2, s3) \ - s0 = vec_sra(s0, vec_3); \ - s1 = vec_sra(s1, vec_3); \ - s2 = vec_sra(s2, vec_3); \ - s3 = vec_sra(s3, vec_3); - -#define SHIFT_VERT4(s0, s1, s2, s3) \ - s0 = vec_sra(s0, vec_7); \ - s1 = vec_sra(s1, vec_7); \ - s2 = vec_sra(s2, vec_7); \ - s3 = vec_sra(s3, vec_7); - -/** Do inverse transform on 8x8 block -*/ -static void vc1_inv_trans_8x8_altivec(int16_t block[64]) -{ - vector signed short src0, src1, src2, src3, src4, src5, src6, src7; - vector signed int s0, s1, s2, s3, s4, s5, s6, s7; - vector signed int s8, s9, sA, sB, sC, sD, sE, sF; - vector signed int t0, t1, t2, t3, t4, t5, t6, t7; - const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4)); - const vector unsigned int vec_7 = vec_splat_u32(7); - const vector unsigned int vec_4 = vec_splat_u32(4); - const vector signed int vec_4s = vec_splat_s32(4); - const vector unsigned int vec_3 = vec_splat_u32(3); - const vector unsigned int vec_2 = vec_splat_u32(2); - const vector signed int vec_1s = vec_splat_s32(1); - const vector unsigned int vec_1 = vec_splat_u32(1); - - src0 = vec_ld( 0, block); - src1 = vec_ld( 16, block); - src2 = vec_ld( 32, block); - src3 = vec_ld( 48, block); - src4 = vec_ld( 64, block); - src5 = vec_ld( 80, block); - src6 = vec_ld( 96, block); - src7 = vec_ld(112, block); - - s0 = vec_unpackl(src0); - s1 = vec_unpackl(src1); - s2 = vec_unpackl(src2); - s3 = vec_unpackl(src3); - s4 = vec_unpackl(src4); - s5 = vec_unpackl(src5); - s6 = vec_unpackl(src6); - s7 = vec_unpackl(src7); - s8 = vec_unpackh(src0); - s9 = vec_unpackh(src1); - sA = vec_unpackh(src2); - sB = vec_unpackh(src3); - sC = vec_unpackh(src4); - sD = vec_unpackh(src5); - sE = vec_unpackh(src6); - sF = vec_unpackh(src7); - STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s); - SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7); - STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s); - SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF); - src0 = vec_pack(s8, s0); - src1 = vec_pack(s9, s1); - src2 = vec_pack(sA, s2); - src3 = vec_pack(sB, s3); - src4 = vec_pack(sC, s4); - src5 = vec_pack(sD, s5); - src6 = vec_pack(sE, s6); - src7 = vec_pack(sF, s7); - TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7); - - s0 = vec_unpackl(src0); - s1 = vec_unpackl(src1); - s2 = vec_unpackl(src2); - s3 = vec_unpackl(src3); - s4 = vec_unpackl(src4); - s5 = vec_unpackl(src5); - s6 = vec_unpackl(src6); - s7 = vec_unpackl(src7); - s8 = vec_unpackh(src0); - s9 = vec_unpackh(src1); - sA = vec_unpackh(src2); - sB = vec_unpackh(src3); - sC = vec_unpackh(src4); - sD = vec_unpackh(src5); - sE = vec_unpackh(src6); - sF = vec_unpackh(src7); - STEP8(s0, s1, s2, s3, s4, 
s5, s6, s7, vec_64); - SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7); - STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_64); - SHIFT_VERT8(s8, s9, sA, sB, sC, sD, sE, sF); - src0 = vec_pack(s8, s0); - src1 = vec_pack(s9, s1); - src2 = vec_pack(sA, s2); - src3 = vec_pack(sB, s3); - src4 = vec_pack(sC, s4); - src5 = vec_pack(sD, s5); - src6 = vec_pack(sE, s6); - src7 = vec_pack(sF, s7); - - vec_st(src0, 0, block); - vec_st(src1, 16, block); - vec_st(src2, 32, block); - vec_st(src3, 48, block); - vec_st(src4, 64, block); - vec_st(src5, 80, block); - vec_st(src6, 96, block); - vec_st(src7,112, block); -} - -/** Do inverse transform on 8x4 part of block -*/ -static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, int16_t *block) -{ - vector signed short src0, src1, src2, src3, src4, src5, src6, src7; - vector signed int s0, s1, s2, s3, s4, s5, s6, s7; - vector signed int s8, s9, sA, sB, sC, sD, sE, sF; - vector signed int t0, t1, t2, t3, t4, t5, t6, t7; - const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4)); - const vector unsigned int vec_7 = vec_splat_u32(7); - const vector unsigned int vec_5 = vec_splat_u32(5); - const vector unsigned int vec_4 = vec_splat_u32(4); - const vector signed int vec_4s = vec_splat_s32(4); - const vector unsigned int vec_3 = vec_splat_u32(3); - const vector unsigned int vec_2 = vec_splat_u32(2); - const vector unsigned int vec_1 = vec_splat_u32(1); - vector unsigned char tmp; - vector signed short tmp2, tmp3; - vector unsigned char perm0, perm1, p0, p1, p; - - src0 = vec_ld( 0, block); - src1 = vec_ld( 16, block); - src2 = vec_ld( 32, block); - src3 = vec_ld( 48, block); - src4 = vec_ld( 64, block); - src5 = vec_ld( 80, block); - src6 = vec_ld( 96, block); - src7 = vec_ld(112, block); - - TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7); - s0 = vec_unpackl(src0); - s1 = vec_unpackl(src1); - s2 = vec_unpackl(src2); - s3 = vec_unpackl(src3); - s4 = vec_unpackl(src4); - s5 = vec_unpackl(src5); - s6 = vec_unpackl(src6); - s7 = vec_unpackl(src7); - s8 = vec_unpackh(src0); - s9 = vec_unpackh(src1); - sA = vec_unpackh(src2); - sB = vec_unpackh(src3); - sC = vec_unpackh(src4); - sD = vec_unpackh(src5); - sE = vec_unpackh(src6); - sF = vec_unpackh(src7); - STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s); - SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7); - STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s); - SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF); - src0 = vec_pack(s8, s0); - src1 = vec_pack(s9, s1); - src2 = vec_pack(sA, s2); - src3 = vec_pack(sB, s3); - src4 = vec_pack(sC, s4); - src5 = vec_pack(sD, s5); - src6 = vec_pack(sE, s6); - src7 = vec_pack(sF, s7); - TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7); - - s0 = vec_unpackh(src0); - s1 = vec_unpackh(src1); - s2 = vec_unpackh(src2); - s3 = vec_unpackh(src3); - s8 = vec_unpackl(src0); - s9 = vec_unpackl(src1); - sA = vec_unpackl(src2); - sB = vec_unpackl(src3); - STEP4(s0, s1, s2, s3, vec_64); - SHIFT_VERT4(s0, s1, s2, s3); - STEP4(s8, s9, sA, sB, vec_64); - SHIFT_VERT4(s8, s9, sA, sB); - src0 = vec_pack(s0, s8); - src1 = vec_pack(s1, s9); - src2 = vec_pack(s2, sA); - src3 = vec_pack(s3, sB); - - p0 = vec_lvsl (0, dest); - p1 = vec_lvsl (stride, dest); - p = vec_splat_u8 (-1); - perm0 = vec_mergeh (p, p0); - perm1 = vec_mergeh (p, p1); - -#define ADD(dest,src,perm) \ - /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \ - tmp = vec_ld (0, dest); \ - tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), perm); \ - tmp3 = vec_adds (tmp2, src); \ - tmp = vec_packsu (tmp3, tmp3); \ - 
vec_ste ((vector unsigned int)tmp, 0, (unsigned int *)dest); \ - vec_ste ((vector unsigned int)tmp, 4, (unsigned int *)dest); - - ADD (dest, src0, perm0) dest += stride; - ADD (dest, src1, perm1) dest += stride; - ADD (dest, src2, perm0) dest += stride; - ADD (dest, src3, perm1) -} - -#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s -#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s) - -#define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC -#define PREFIX_no_rnd_vc1_chroma_mc8_altivec put_no_rnd_vc1_chroma_mc8_altivec -#include "h264chroma_template.c" -#undef OP_U8_ALTIVEC -#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec - -#define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC -#define PREFIX_no_rnd_vc1_chroma_mc8_altivec avg_no_rnd_vc1_chroma_mc8_altivec -#include "h264chroma_template.c" -#undef OP_U8_ALTIVEC -#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec - -av_cold void ff_vc1dsp_init_altivec(VC1DSPContext *dsp) -{ - if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) - return; - - dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec; - dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec; - dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_altivec; - dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_altivec; -} diff --git a/ffmpeg1/libavcodec/ppc/videodsp_ppc.c b/ffmpeg1/libavcodec/ppc/videodsp_ppc.c deleted file mode 100644 index 9157022..0000000 --- a/ffmpeg1/libavcodec/ppc/videodsp_ppc.c +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2003-2004 Romain Dolbeau - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavcodec/videodsp.h" - -static void prefetch_ppc(uint8_t *mem, ptrdiff_t stride, int h) -{ - register const uint8_t *p = mem; - do { - __asm__ volatile ("dcbt 0,%0" : : "r" (p)); - p += stride; - } while(--h); -} - -av_cold void ff_videodsp_init_ppc(VideoDSPContext *ctx, int bpc) -{ - ctx->prefetch = prefetch_ppc; -} diff --git a/ffmpeg1/libavcodec/ppc/vorbisdsp_altivec.c b/ffmpeg1/libavcodec/ppc/vorbisdsp_altivec.c deleted file mode 100644 index 08a2b26..0000000 --- a/ffmpeg1/libavcodec/ppc/vorbisdsp_altivec.c +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#if HAVE_ALTIVEC_H -#include <altivec.h> -#endif -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavcodec/vorbisdsp.h" - -#if HAVE_ALTIVEC -static void vorbis_inverse_coupling_altivec(float *mag, float *ang, - intptr_t blocksize) -{ - int i; - vector float m, a; - vector bool int t0, t1; - const vector unsigned int v_31 = //XXX - vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1)); - for (i = 0; i < blocksize; i += 4) { - m = vec_ld(0, mag+i); - a = vec_ld(0, ang+i); - t0 = vec_cmple(m, (vector float)vec_splat_u32(0)); - t1 = vec_cmple(a, (vector float)vec_splat_u32(0)); - a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31)); - t0 = (vector bool int)vec_and(a, t1); - t1 = (vector bool int)vec_andc(a, t1); - a = vec_sub(m, (vector float)t1); - m = vec_add(m, (vector float)t0); - vec_stl(a, 0, ang+i); - vec_stl(m, 0, mag+i); - } -} -#endif /* HAVE_ALTIVEC */ - -av_cold void ff_vorbisdsp_init_ppc(VorbisDSPContext *c) -{ -#if HAVE_ALTIVEC - if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) { - c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec; - } -#endif /* HAVE_ALTIVEC */ -} diff --git a/ffmpeg1/libavcodec/ppc/vp3dsp_altivec.c b/ffmpeg1/libavcodec/ppc/vp3dsp_altivec.c deleted file mode 100644 index cc587b0..0000000 --- a/ffmpeg1/libavcodec/ppc/vp3dsp_altivec.c +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (C) 2009 David Conrad - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <string.h> - -#include "config.h" -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavcodec/vp3dsp.h" - -#if HAVE_ALTIVEC - -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "dsputil_altivec.h" - -static const vec_s16 constants = - {0, 64277, 60547, 54491, 46341, 36410, 25080, 12785}; -static const vec_u8 interleave_high = - {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29}; - -#define IDCT_START \ - vec_s16 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;\ - vec_s16 Ed, Gd, Add, Bdd, Fd, Hd;\ - vec_s16 eight = vec_splat_s16(8);\ - vec_u16 four = vec_splat_u16(4);\ -\ - vec_s16 C1 = vec_splat(constants, 1);\ - vec_s16 C2 = vec_splat(constants, 2);\ - vec_s16 C3 = vec_splat(constants, 3);\ - vec_s16 C4 = vec_splat(constants, 4);\ - vec_s16 C5 = vec_splat(constants, 5);\ - vec_s16 C6 = vec_splat(constants, 6);\ - vec_s16 C7 = vec_splat(constants, 7);\ -\ - vec_s16 b0 = vec_ld(0x00, block);\ - vec_s16 b1 = vec_ld(0x10, block);\ - vec_s16 b2 = vec_ld(0x20, block);\ - vec_s16 b3 = vec_ld(0x30, block);\ - vec_s16 b4 = vec_ld(0x40, block);\ - vec_s16 b5 = vec_ld(0x50, block);\ - vec_s16 b6 = vec_ld(0x60, block);\ - vec_s16 b7 = vec_ld(0x70, block); - -// these functions do (a*C)>>16 -// things are tricky because a is signed, but C unsigned. -// M15 is used if C fits in 15 bit unsigned (C6,C7) -// M16 is used if C requires 16 bits unsigned -static inline vec_s16 M15(vec_s16 a, vec_s16 C) -{ - return (vec_s16)vec_perm(vec_mule(a,C), vec_mulo(a,C), interleave_high); -} -static inline vec_s16 M16(vec_s16 a, vec_s16 C) -{ - return vec_add(a, M15(a, C)); -} - -#define IDCT_1D(ADD, SHIFT)\ - A = vec_add(M16(b1, C1), M15(b7, C7));\ - B = vec_sub(M15(b1, C7), M16(b7, C1));\ - C = vec_add(M16(b3, C3), M16(b5, C5));\ - D = vec_sub(M16(b5, C3), M16(b3, C5));\ -\ - Ad = M16(vec_sub(A, C), C4);\ - Bd = M16(vec_sub(B, D), C4);\ -\ - Cd = vec_add(A, C);\ - Dd = vec_add(B, D);\ -\ - E = ADD(M16(vec_add(b0, b4), C4));\ - F = ADD(M16(vec_sub(b0, b4), C4));\ -\ - G = vec_add(M16(b2, C2), M15(b6, C6));\ - H = vec_sub(M15(b2, C6), M16(b6, C2));\ -\ - Ed = vec_sub(E, G);\ - Gd = vec_add(E, G);\ -\ - Add = vec_add(F, Ad);\ - Bdd = vec_sub(Bd, H);\ -\ - Fd = vec_sub(F, Ad);\ - Hd = vec_add(Bd, H);\ -\ - b0 = SHIFT(vec_add(Gd, Cd));\ - b7 = SHIFT(vec_sub(Gd, Cd));\ -\ - b1 = SHIFT(vec_add(Add, Hd));\ - b2 = SHIFT(vec_sub(Add, Hd));\ -\ - b3 = SHIFT(vec_add(Ed, Dd));\ - b4 = SHIFT(vec_sub(Ed, Dd));\ -\ - b5 = SHIFT(vec_add(Fd, Bdd));\ - b6 = SHIFT(vec_sub(Fd, Bdd)); - -#define NOP(a) a -#define ADD8(a) vec_add(a, eight) -#define SHIFT4(a) vec_sra(a, four) - -static void vp3_idct_put_altivec(uint8_t *dst, int stride, int16_t block[64]) -{ - vec_u8 t; - IDCT_START - - // pixels are signed; so add 128*16 in addition to the normal 8 - vec_s16 v2048 = vec_sl(vec_splat_s16(1), vec_splat_u16(11)); - eight = vec_add(eight, v2048); - - IDCT_1D(NOP, NOP) - TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7); - IDCT_1D(ADD8, SHIFT4) - -#define PUT(a)\ - t = vec_packsu(a, a);\ - vec_ste((vec_u32)t, 0, (unsigned int *)dst);\ - vec_ste((vec_u32)t, 4, (unsigned int *)dst); - - PUT(b0) dst += stride; - PUT(b1) dst += stride; - PUT(b2) dst += stride; - PUT(b3) dst += stride; - PUT(b4) dst += stride; - PUT(b5) dst += stride; - PUT(b6) dst += 
stride; - PUT(b7) - memset(block, 0, sizeof(*block) * 64); -} - -static void vp3_idct_add_altivec(uint8_t *dst, int stride, int16_t block[64]) -{ - LOAD_ZERO; - vec_u8 t, vdst; - vec_s16 vdst_16; - vec_u8 vdst_mask = vec_mergeh(vec_splat_u8(-1), vec_lvsl(0, dst)); - - IDCT_START - - IDCT_1D(NOP, NOP) - TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7); - IDCT_1D(ADD8, SHIFT4) - -#define ADD(a)\ - vdst = vec_ld(0, dst);\ - vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask);\ - vdst_16 = vec_adds(a, vdst_16);\ - t = vec_packsu(vdst_16, vdst_16);\ - vec_ste((vec_u32)t, 0, (unsigned int *)dst);\ - vec_ste((vec_u32)t, 4, (unsigned int *)dst); - - ADD(b0) dst += stride; - ADD(b1) dst += stride; - ADD(b2) dst += stride; - ADD(b3) dst += stride; - ADD(b4) dst += stride; - ADD(b5) dst += stride; - ADD(b6) dst += stride; - ADD(b7) - memset(block, 0, sizeof(*block) * 64); -} - -#endif /* HAVE_ALTIVEC */ - -av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags) -{ -#if HAVE_ALTIVEC - if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) { - c->idct_put = vp3_idct_put_altivec; - c->idct_add = vp3_idct_add_altivec; - } -#endif -} diff --git a/ffmpeg1/libavcodec/ppc/vp8dsp_altivec.c b/ffmpeg1/libavcodec/ppc/vp8dsp_altivec.c deleted file mode 100644 index 14d8784..0000000 --- a/ffmpeg1/libavcodec/ppc/vp8dsp_altivec.c +++ /dev/null @@ -1,304 +0,0 @@ -/* - * VP8 compatible video decoder - * - * Copyright (C) 2010 David Conrad - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/cpu.h" -#include "libavutil/mem.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/vp8dsp.h" -#include "dsputil_altivec.h" - -#define REPT4(...) 
{ __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ } - -// h subpel filter uses msum to multiply+add 4 pixel taps at once -static const vec_s8 h_subpel_filters_inner[7] = -{ - REPT4( -6, 123, 12, -1), - REPT4(-11, 108, 36, -8), - REPT4( -9, 93, 50, -6), - REPT4(-16, 77, 77, -16), - REPT4( -6, 50, 93, -9), - REPT4( -8, 36, 108, -11), - REPT4( -1, 12, 123, -6), -}; - -// for 6tap filters, these are the outer two taps -// The zeros mask off pixels 4-7 when filtering 0-3 -// and vice-versa -static const vec_s8 h_subpel_filters_outer[3] = -{ - REPT4(0, 0, 2, 1), - REPT4(0, 0, 3, 3), - REPT4(0, 0, 1, 2), -}; - -#define LOAD_H_SUBPEL_FILTER(i) \ - vec_s8 filter_inner = h_subpel_filters_inner[i]; \ - vec_s8 filter_outerh = h_subpel_filters_outer[(i)>>1]; \ - vec_s8 filter_outerl = vec_sld(filter_outerh, filter_outerh, 2) - -#define FILTER_H(dstv, off) \ - a = vec_ld((off)-is6tap-1, src); \ - b = vec_ld((off)-is6tap-1+15, src); \ -\ - pixh = vec_perm(a, b, permh##off); \ - pixl = vec_perm(a, b, perml##off); \ - filth = vec_msum(filter_inner, pixh, c64); \ - filtl = vec_msum(filter_inner, pixl, c64); \ -\ - if (is6tap) { \ - outer = vec_perm(a, b, perm_6tap##off); \ - filth = vec_msum(filter_outerh, outer, filth); \ - filtl = vec_msum(filter_outerl, outer, filtl); \ - } \ - if (w == 4) \ - filtl = filth; /* discard pixels 4-7 */ \ - dstv = vec_packs(filth, filtl); \ - dstv = vec_sra(dstv, c7) - -static av_always_inline -void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride, - uint8_t *src, ptrdiff_t src_stride, - int h, int mx, int w, int is6tap) -{ - LOAD_H_SUBPEL_FILTER(mx-1); - vec_u8 align_vec0, align_vec8, permh0, permh8, filt; - vec_u8 perm_6tap0, perm_6tap8, perml0, perml8; - vec_u8 a, b, pixh, pixl, outer; - vec_s16 f16h, f16l; - vec_s32 filth, filtl; - - vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 }; - vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 }; - vec_u8 perm_inner = is6tap ? 
perm_inner6 : perm_inner4; - vec_u8 perm_outer = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 }; - vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6)); - vec_u16 c7 = vec_splat_u16(7); - - align_vec0 = vec_lvsl( -is6tap-1, src); - align_vec8 = vec_lvsl(8-is6tap-1, src); - - permh0 = vec_perm(align_vec0, align_vec0, perm_inner); - permh8 = vec_perm(align_vec8, align_vec8, perm_inner); - perm_inner = vec_add(perm_inner, vec_splat_u8(4)); - perml0 = vec_perm(align_vec0, align_vec0, perm_inner); - perml8 = vec_perm(align_vec8, align_vec8, perm_inner); - perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer); - perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer); - - while (h --> 0) { - FILTER_H(f16h, 0); - - if (w == 16) { - FILTER_H(f16l, 8); - filt = vec_packsu(f16h, f16l); - vec_st(filt, 0, dst); - } else { - filt = vec_packsu(f16h, f16h); - vec_ste((vec_u32)filt, 0, (uint32_t*)dst); - if (w == 8) - vec_ste((vec_u32)filt, 4, (uint32_t*)dst); - } - src += src_stride; - dst += dst_stride; - } -} - -// v subpel filter does a simple vertical multiply + add -static const vec_u8 v_subpel_filters[7] = -{ - { 0, 6, 123, 12, 1, 0 }, - { 2, 11, 108, 36, 8, 1 }, - { 0, 9, 93, 50, 6, 0 }, - { 3, 16, 77, 77, 16, 3 }, - { 0, 6, 50, 93, 9, 0 }, - { 1, 8, 36, 108, 11, 2 }, - { 0, 1, 12, 123, 6, 0 }, -}; - -#define LOAD_V_SUBPEL_FILTER(i) \ - vec_u8 subpel_filter = v_subpel_filters[i]; \ - vec_u8 f0 = vec_splat(subpel_filter, 0); \ - vec_u8 f1 = vec_splat(subpel_filter, 1); \ - vec_u8 f2 = vec_splat(subpel_filter, 2); \ - vec_u8 f3 = vec_splat(subpel_filter, 3); \ - vec_u8 f4 = vec_splat(subpel_filter, 4); \ - vec_u8 f5 = vec_splat(subpel_filter, 5) - -#define FILTER_V(dstv, vec_mul) \ - s1f = (vec_s16)vec_mul(s1, f1); \ - s2f = (vec_s16)vec_mul(s2, f2); \ - s3f = (vec_s16)vec_mul(s3, f3); \ - s4f = (vec_s16)vec_mul(s4, f4); \ - s2f = vec_subs(s2f, s1f); \ - s3f = vec_subs(s3f, s4f); \ - if (is6tap) { \ - s0f = (vec_s16)vec_mul(s0, f0); \ - s5f = (vec_s16)vec_mul(s5, f5); \ - s2f = vec_adds(s2f, s0f); \ - s3f = vec_adds(s3f, s5f); \ - } \ - dstv = vec_adds(s2f, s3f); \ - dstv = vec_adds(dstv, c64); \ - dstv = vec_sra(dstv, c7) - -static av_always_inline -void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride, - uint8_t *src, ptrdiff_t src_stride, - int h, int my, int w, int is6tap) -{ - LOAD_V_SUBPEL_FILTER(my-1); - vec_u8 s0, s1, s2, s3, s4, s5, filt, align_vech, perm_vec, align_vecl; - vec_s16 s0f, s1f, s2f, s3f, s4f, s5f, f16h, f16l; - vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6)); - vec_u16 c7 = vec_splat_u16(7); - - // we want pixels 0-7 to be in the even positions and 8-15 in the odd, - // so combine this permute with the alignment permute vector - align_vech = vec_lvsl(0, src); - align_vecl = vec_sld(align_vech, align_vech, 8); - if (w ==16) - perm_vec = vec_mergeh(align_vech, align_vecl); - else - perm_vec = vec_mergeh(align_vech, align_vech); - - if (is6tap) - s0 = load_with_perm_vec(-2*src_stride, src, perm_vec); - s1 = load_with_perm_vec(-1*src_stride, src, perm_vec); - s2 = load_with_perm_vec( 0*src_stride, src, perm_vec); - s3 = load_with_perm_vec( 1*src_stride, src, perm_vec); - if (is6tap) - s4 = load_with_perm_vec( 2*src_stride, src, perm_vec); - - src += (2+is6tap)*src_stride; - - while (h --> 0) { - if (is6tap) - s5 = load_with_perm_vec(0, src, perm_vec); - else - s4 = load_with_perm_vec(0, src, perm_vec); - - FILTER_V(f16h, vec_mule); - - if (w == 16) { - FILTER_V(f16l, vec_mulo); - filt = vec_packsu(f16h, f16l); - vec_st(filt, 0, dst); - } else { - 
filt = vec_packsu(f16h, f16h); - if (w == 4) - filt = (vec_u8)vec_splat((vec_u32)filt, 0); - else - vec_ste((vec_u32)filt, 4, (uint32_t*)dst); - vec_ste((vec_u32)filt, 0, (uint32_t*)dst); - } - - if (is6tap) - s0 = s1; - s1 = s2; - s2 = s3; - s3 = s4; - if (is6tap) - s4 = s5; - - dst += dst_stride; - src += src_stride; - } -} - -#define EPEL_FUNCS(WIDTH, TAPS) \ -static av_noinline \ -void put_vp8_epel ## WIDTH ## _h ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \ -{ \ - put_vp8_epel_h_altivec_core(dst, dst_stride, src, src_stride, h, mx, WIDTH, TAPS == 6); \ -} \ -\ -static av_noinline \ -void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \ -{ \ - put_vp8_epel_v_altivec_core(dst, dst_stride, src, src_stride, h, my, WIDTH, TAPS == 6); \ -} - -#define EPEL_HV(WIDTH, HTAPS, VTAPS) \ -static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t stride, uint8_t *src, ptrdiff_t s, int h, int mx, int my) \ -{ \ - DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \ - if (VTAPS == 6) { \ - put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-2*stride, stride, h+5, mx, my); \ - put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, stride, tmp+2*16, 16, h, mx, my); \ - } else { \ - put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-stride, stride, h+4, mx, my); \ - put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, stride, tmp+16, 16, h, mx, my); \ - } \ -} - -EPEL_FUNCS(16,6) -EPEL_FUNCS(8, 6) -EPEL_FUNCS(8, 4) -EPEL_FUNCS(4, 6) -EPEL_FUNCS(4, 4) - -EPEL_HV(16, 6,6) -EPEL_HV(8, 6,6) -EPEL_HV(8, 4,6) -EPEL_HV(8, 6,4) -EPEL_HV(8, 4,4) -EPEL_HV(4, 6,6) -EPEL_HV(4, 4,6) -EPEL_HV(4, 6,4) -EPEL_HV(4, 4,4) - -static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t stride, uint8_t *src, ptrdiff_t s, int h, int mx, int my) -{ - ff_put_pixels16_altivec(dst, src, stride, h); -} - -av_cold void ff_vp8dsp_init_altivec(VP8DSPContext *c) -{ - if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) - return; - - c->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_altivec; - c->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_altivec; - c->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_altivec; - c->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_altivec; - - c->put_vp8_epel_pixels_tab[1][0][2] = put_vp8_epel8_h6_altivec; - c->put_vp8_epel_pixels_tab[1][2][0] = put_vp8_epel8_v6_altivec; - c->put_vp8_epel_pixels_tab[1][0][1] = put_vp8_epel8_h4_altivec; - c->put_vp8_epel_pixels_tab[1][1][0] = put_vp8_epel8_v4_altivec; - - c->put_vp8_epel_pixels_tab[1][2][2] = put_vp8_epel8_h6v6_altivec; - c->put_vp8_epel_pixels_tab[1][1][1] = put_vp8_epel8_h4v4_altivec; - c->put_vp8_epel_pixels_tab[1][1][2] = put_vp8_epel8_h6v4_altivec; - c->put_vp8_epel_pixels_tab[1][2][1] = put_vp8_epel8_h4v6_altivec; - - c->put_vp8_epel_pixels_tab[2][0][2] = put_vp8_epel4_h6_altivec; - c->put_vp8_epel_pixels_tab[2][2][0] = put_vp8_epel4_v6_altivec; - c->put_vp8_epel_pixels_tab[2][0][1] = put_vp8_epel4_h4_altivec; - c->put_vp8_epel_pixels_tab[2][1][0] = put_vp8_epel4_v4_altivec; - - c->put_vp8_epel_pixels_tab[2][2][2] = put_vp8_epel4_h6v6_altivec; - c->put_vp8_epel_pixels_tab[2][1][1] = put_vp8_epel4_h4v4_altivec; - c->put_vp8_epel_pixels_tab[2][1][2] = put_vp8_epel4_h6v4_altivec; - c->put_vp8_epel_pixels_tab[2][2][1] = put_vp8_epel4_h4v6_altivec; -} |