summaryrefslogtreecommitdiff
path: root/ffmpeg/libavcodec/ppc/vp8dsp_altivec.c
diff options
context:
space:
mode:
Diffstat (limited to 'ffmpeg/libavcodec/ppc/vp8dsp_altivec.c')
-rw-r--r--ffmpeg/libavcodec/ppc/vp8dsp_altivec.c57
1 files changed, 49 insertions, 8 deletions
diff --git a/ffmpeg/libavcodec/ppc/vp8dsp_altivec.c b/ffmpeg/libavcodec/ppc/vp8dsp_altivec.c
index 14d8784..c858d8a 100644
--- a/ffmpeg/libavcodec/ppc/vp8dsp_altivec.c
+++ b/ffmpeg/libavcodec/ppc/vp8dsp_altivec.c
@@ -20,6 +20,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/types_altivec.h"
@@ -27,6 +28,7 @@
#include "libavcodec/vp8dsp.h"
#include "dsputil_altivec.h"
+#if HAVE_ALTIVEC
#define REPT4(...) { __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ }
// h subpel filter uses msum to multiply+add 4 pixel taps at once
@@ -239,15 +241,15 @@ void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst
}
#define EPEL_HV(WIDTH, HTAPS, VTAPS) \
-static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t stride, uint8_t *src, ptrdiff_t s, int h, int mx, int my) \
+static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \
{ \
DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \
if (VTAPS == 6) { \
- put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-2*stride, stride, h+5, mx, my); \
- put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, stride, tmp+2*16, 16, h, mx, my); \
+ put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-2*sstride, sstride, h+5, mx, my); \
+ put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+2*16, 16, h, mx, my); \
} else { \
- put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-stride, stride, h+4, mx, my); \
- put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, stride, tmp+16, 16, h, mx, my); \
+ put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-sstride, sstride, h+4, mx, my); \
+ put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+16, 16, h, mx, my); \
} \
}
@@ -267,13 +269,51 @@ EPEL_HV(4, 4,6)
EPEL_HV(4, 6,4)
EPEL_HV(4, 4,4)
-static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t stride, uint8_t *src, ptrdiff_t s, int h, int mx, int my)
+static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
{
- ff_put_pixels16_altivec(dst, src, stride, h);
+ register vector unsigned char pixelsv1, pixelsv2;
+ register vector unsigned char pixelsv1B, pixelsv2B;
+ register vector unsigned char pixelsv1C, pixelsv2C;
+ register vector unsigned char pixelsv1D, pixelsv2D;
+
+ register vector unsigned char perm = vec_lvsl(0, src);
+ int i;
+ register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1;
+ register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2;
+ register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2;
+
+// hand-unrolling the loop by 4 gains about 15%
+// mininum execution time goes from 74 to 60 cycles
+// it's faster than -funroll-loops, but using
+// -funroll-loops w/ this is bad - 74 cycles again.
+// all this is on a 7450, tuning for the 7450
+ for (i = 0; i < h; i += 4) {
+ pixelsv1 = vec_ld( 0, src);
+ pixelsv2 = vec_ld(15, src);
+ pixelsv1B = vec_ld(sstride, src);
+ pixelsv2B = vec_ld(15 + sstride, src);
+ pixelsv1C = vec_ld(sstride2, src);
+ pixelsv2C = vec_ld(15 + sstride2, src);
+ pixelsv1D = vec_ld(sstride3, src);
+ pixelsv2D = vec_ld(15 + sstride3, src);
+ vec_st(vec_perm(pixelsv1, pixelsv2, perm),
+ 0, (unsigned char*)dst);
+ vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
+ dstride, (unsigned char*)dst);
+ vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
+ dstride2, (unsigned char*)dst);
+ vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
+ dstride3, (unsigned char*)dst);
+ src += sstride4;
+ dst += dstride4;
+ }
}
-av_cold void ff_vp8dsp_init_altivec(VP8DSPContext *c)
+#endif /* HAVE_ALTIVEC */
+
+av_cold void ff_vp8dsp_init_ppc(VP8DSPContext *c)
{
+#if HAVE_ALTIVEC
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;
@@ -301,4 +341,5 @@ av_cold void ff_vp8dsp_init_altivec(VP8DSPContext *c)
c->put_vp8_epel_pixels_tab[2][1][1] = put_vp8_epel4_h4v4_altivec;
c->put_vp8_epel_pixels_tab[2][1][2] = put_vp8_epel4_h6v4_altivec;
c->put_vp8_epel_pixels_tab[2][2][1] = put_vp8_epel4_h4v6_altivec;
+#endif /* HAVE_ALTIVEC */
}