From 8992cb1d0d07edc33d274f6d7924ecdf6f83d994 Mon Sep 17 00:00:00 2001
From: Tim Redfern
Date: Thu, 5 Sep 2013 17:57:22 +0100
Subject: making act segmenter

---
 ffmpeg/libavcodec/arm/Makefile | 116 ++
 ffmpeg/libavcodec/arm/aac.h | 143 ++
 ffmpeg/libavcodec/arm/aacpsdsp_init_arm.c | 57 +
 ffmpeg/libavcodec/arm/aacpsdsp_neon.S | 272 ++++
 ffmpeg/libavcodec/arm/ac3dsp_arm.S | 36 +
 ffmpeg/libavcodec/arm/ac3dsp_armv6.S | 84 +
 ffmpeg/libavcodec/arm/ac3dsp_init_arm.c | 70 +
 ffmpeg/libavcodec/arm/ac3dsp_neon.S | 154 ++
 ffmpeg/libavcodec/arm/asm-offsets.h | 39 +
 ffmpeg/libavcodec/arm/dca.h | 105 ++
 ffmpeg/libavcodec/arm/dcadsp_init_arm.c | 36 +
 ffmpeg/libavcodec/arm/dcadsp_neon.S | 61 +
 ffmpeg/libavcodec/arm/dsputil_arm.S | 125 ++
 ffmpeg/libavcodec/arm/dsputil_arm.h | 32 +
 ffmpeg/libavcodec/arm/dsputil_armv6.S | 381 +++++
 ffmpeg/libavcodec/arm/dsputil_init_arm.c | 86 +
 ffmpeg/libavcodec/arm/dsputil_init_armv5te.c | 37 +
 ffmpeg/libavcodec/arm/dsputil_init_armv6.c | 85 +
 ffmpeg/libavcodec/arm/dsputil_init_neon.c | 81 +
 ffmpeg/libavcodec/arm/dsputil_neon.S | 209 +++
 ffmpeg/libavcodec/arm/fft_fixed_init_arm.c | 48 +
 ffmpeg/libavcodec/arm/fft_fixed_neon.S | 261 +++
 ffmpeg/libavcodec/arm/fft_init_arm.c | 77 +
 ffmpeg/libavcodec/arm/fft_neon.S | 375 +++++
 ffmpeg/libavcodec/arm/flacdsp_arm.S | 146 ++
 ffmpeg/libavcodec/arm/flacdsp_init_arm.c | 32 +
 ffmpeg/libavcodec/arm/fmtconvert_init_arm.c | 52 +
 ffmpeg/libavcodec/arm/fmtconvert_neon.S | 392 +++++
 ffmpeg/libavcodec/arm/fmtconvert_vfp.S | 78 +
 ffmpeg/libavcodec/arm/h264chroma_init_arm.c | 51 +
 ffmpeg/libavcodec/arm/h264cmc_neon.S | 400 +++++
 ffmpeg/libavcodec/arm/h264dsp_init_arm.c | 111 ++
 ffmpeg/libavcodec/arm/h264dsp_neon.S | 541 +++++++
 ffmpeg/libavcodec/arm/h264idct_neon.S | 413 +++++
 ffmpeg/libavcodec/arm/h264pred_init_arm.c | 92 ++
 ffmpeg/libavcodec/arm/h264pred_neon.S | 359 +++++
 ffmpeg/libavcodec/arm/h264qpel_init_arm.c | 171 ++
 ffmpeg/libavcodec/arm/h264qpel_neon.S | 955 +++++++++++
 ffmpeg/libavcodec/arm/hpeldsp_arm.S | 611 +++++++
 ffmpeg/libavcodec/arm/hpeldsp_arm.h | 29 +
 ffmpeg/libavcodec/arm/hpeldsp_armv6.S | 259 +++
 ffmpeg/libavcodec/arm/hpeldsp_init_arm.c | 68 +
 ffmpeg/libavcodec/arm/hpeldsp_init_armv6.c | 66 +
 ffmpeg/libavcodec/arm/hpeldsp_init_neon.c | 86 +
 ffmpeg/libavcodec/arm/hpeldsp_neon.S | 410 +++++
 ffmpeg/libavcodec/arm/int_neon.S | 92 ++
 ffmpeg/libavcodec/arm/jrevdct_arm.S | 383 +++++
 ffmpeg/libavcodec/arm/mathops.h | 108 ++
 ffmpeg/libavcodec/arm/mdct_fixed_neon.S | 193 +++
 ffmpeg/libavcodec/arm/mdct_neon.S | 301 ++++
 ffmpeg/libavcodec/arm/mpegaudiodsp_fixed_armv6.S | 143 ++
 ffmpeg/libavcodec/arm/mpegaudiodsp_init_arm.c | 38 +
 ffmpeg/libavcodec/arm/mpegvideo_arm.c | 52 +
 ffmpeg/libavcodec/arm/mpegvideo_arm.h | 26 +
 ffmpeg/libavcodec/arm/mpegvideo_armv5te.c | 102 ++
 ffmpeg/libavcodec/arm/mpegvideo_armv5te_s.S | 114 ++
 ffmpeg/libavcodec/arm/mpegvideo_neon.S | 107 ++
 ffmpeg/libavcodec/arm/neon.S | 59 +
 ffmpeg/libavcodec/arm/rdft_neon.S | 150 ++
 ffmpeg/libavcodec/arm/rv34dsp_init_arm.c | 46 +
 ffmpeg/libavcodec/arm/rv34dsp_neon.S | 156 ++
 ffmpeg/libavcodec/arm/rv40dsp_init_arm.c | 148 ++
 ffmpeg/libavcodec/arm/rv40dsp_neon.S | 920 +++++++++++
 ffmpeg/libavcodec/arm/sbrdsp_init_arm.c | 73 +
 ffmpeg/libavcodec/arm/sbrdsp_neon.S | 411 +++++
 ffmpeg/libavcodec/arm/simple_idct_arm.S | 479 ++++++
 ffmpeg/libavcodec/arm/simple_idct_armv5te.S | 620 +++++++
 ffmpeg/libavcodec/arm/simple_idct_armv6.S | 425 +++++
 ffmpeg/libavcodec/arm/simple_idct_neon.S | 375 +++++
 ffmpeg/libavcodec/arm/synth_filter_neon.S | 115 ++
 ffmpeg/libavcodec/arm/videodsp_arm.h | 29 +
 ffmpeg/libavcodec/arm/videodsp_armv5te.S | 31 +
 ffmpeg/libavcodec/arm/videodsp_init_arm.c | 30 +
 ffmpeg/libavcodec/arm/videodsp_init_armv5te.c | 33 +
 ffmpeg/libavcodec/arm/vorbisdsp_init_arm.c | 37 +
 ffmpeg/libavcodec/arm/vorbisdsp_neon.S | 83 +
 ffmpeg/libavcodec/arm/vp3dsp_init_arm.c | 45 +
 ffmpeg/libavcodec/arm/vp3dsp_neon.S | 395 +++++
 ffmpeg/libavcodec/arm/vp56_arith.h | 121 ++
 ffmpeg/libavcodec/arm/vp56dsp_init_arm.c | 39 +
 ffmpeg/libavcodec/arm/vp56dsp_neon.S | 121 ++
 ffmpeg/libavcodec/arm/vp8.h | 35 +
 ffmpeg/libavcodec/arm/vp8_armv6.S | 248 +++
 ffmpeg/libavcodec/arm/vp8dsp.h | 78 +
 ffmpeg/libavcodec/arm/vp8dsp_armv6.S | 1634 +++++++++++++++++++
 ffmpeg/libavcodec/arm/vp8dsp_init_arm.c | 34 +
 ffmpeg/libavcodec/arm/vp8dsp_init_armv6.c | 120 ++
 ffmpeg/libavcodec/arm/vp8dsp_init_neon.c | 116 ++
 ffmpeg/libavcodec/arm/vp8dsp_neon.S | 1867 ++++++++++++++++++++++
 89 files changed, 19024 insertions(+)
 create mode 100644 ffmpeg/libavcodec/arm/Makefile
 create mode 100644 ffmpeg/libavcodec/arm/aac.h
 create mode 100644 ffmpeg/libavcodec/arm/aacpsdsp_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/aacpsdsp_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/ac3dsp_arm.S
 create mode 100644 ffmpeg/libavcodec/arm/ac3dsp_armv6.S
 create mode 100644 ffmpeg/libavcodec/arm/ac3dsp_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/ac3dsp_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/asm-offsets.h
 create mode 100644 ffmpeg/libavcodec/arm/dca.h
 create mode 100644 ffmpeg/libavcodec/arm/dcadsp_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/dcadsp_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/dsputil_arm.S
 create mode 100644 ffmpeg/libavcodec/arm/dsputil_arm.h
 create mode 100644 ffmpeg/libavcodec/arm/dsputil_armv6.S
 create mode 100644 ffmpeg/libavcodec/arm/dsputil_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/dsputil_init_armv5te.c
 create mode 100644 ffmpeg/libavcodec/arm/dsputil_init_armv6.c
 create mode 100644 ffmpeg/libavcodec/arm/dsputil_init_neon.c
 create mode 100644 ffmpeg/libavcodec/arm/dsputil_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/fft_fixed_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/fft_fixed_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/fft_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/fft_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/flacdsp_arm.S
 create mode 100644 ffmpeg/libavcodec/arm/flacdsp_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/fmtconvert_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/fmtconvert_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/fmtconvert_vfp.S
 create mode 100644 ffmpeg/libavcodec/arm/h264chroma_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/h264cmc_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/h264dsp_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/h264dsp_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/h264idct_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/h264pred_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/h264pred_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/h264qpel_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/h264qpel_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/hpeldsp_arm.S
 create mode 100644 ffmpeg/libavcodec/arm/hpeldsp_arm.h
 create mode 100644 ffmpeg/libavcodec/arm/hpeldsp_armv6.S
 create mode 100644 ffmpeg/libavcodec/arm/hpeldsp_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/hpeldsp_init_armv6.c
 create mode 100644 ffmpeg/libavcodec/arm/hpeldsp_init_neon.c
 create mode 100644 ffmpeg/libavcodec/arm/hpeldsp_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/int_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/jrevdct_arm.S
 create mode 100644 ffmpeg/libavcodec/arm/mathops.h
 create mode 100644 ffmpeg/libavcodec/arm/mdct_fixed_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/mdct_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
 create mode 100644 ffmpeg/libavcodec/arm/mpegaudiodsp_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/mpegvideo_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/mpegvideo_arm.h
 create mode 100644 ffmpeg/libavcodec/arm/mpegvideo_armv5te.c
 create mode 100644 ffmpeg/libavcodec/arm/mpegvideo_armv5te_s.S
 create mode 100644 ffmpeg/libavcodec/arm/mpegvideo_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/neon.S
 create mode 100644 ffmpeg/libavcodec/arm/rdft_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/rv34dsp_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/rv34dsp_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/rv40dsp_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/rv40dsp_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/sbrdsp_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/sbrdsp_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/simple_idct_arm.S
 create mode 100644 ffmpeg/libavcodec/arm/simple_idct_armv5te.S
 create mode 100644 ffmpeg/libavcodec/arm/simple_idct_armv6.S
 create mode 100644 ffmpeg/libavcodec/arm/simple_idct_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/synth_filter_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/videodsp_arm.h
 create mode 100644 ffmpeg/libavcodec/arm/videodsp_armv5te.S
 create mode 100644 ffmpeg/libavcodec/arm/videodsp_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/videodsp_init_armv5te.c
 create mode 100644 ffmpeg/libavcodec/arm/vorbisdsp_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/vorbisdsp_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/vp3dsp_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/vp3dsp_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/vp56_arith.h
 create mode 100644 ffmpeg/libavcodec/arm/vp56dsp_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/vp56dsp_neon.S
 create mode 100644 ffmpeg/libavcodec/arm/vp8.h
 create mode 100644 ffmpeg/libavcodec/arm/vp8_armv6.S
 create mode 100644 ffmpeg/libavcodec/arm/vp8dsp.h
 create mode 100644 ffmpeg/libavcodec/arm/vp8dsp_armv6.S
 create mode 100644 ffmpeg/libavcodec/arm/vp8dsp_init_arm.c
 create mode 100644 ffmpeg/libavcodec/arm/vp8dsp_init_armv6.c
 create mode 100644 ffmpeg/libavcodec/arm/vp8dsp_init_neon.c
 create mode 100644 ffmpeg/libavcodec/arm/vp8dsp_neon.S

(limited to 'ffmpeg/libavcodec/arm')

diff --git a/ffmpeg/libavcodec/arm/Makefile b/ffmpeg/libavcodec/arm/Makefile
new file mode 100644
index 0000000..011404c
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/Makefile
@@ -0,0 +1,116 @@
+ARCH_HEADERS = mathops.h
+
+OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_init_arm.o \
+ arm/ac3dsp_arm.o
+
+OBJS-$(CONFIG_AAC_DECODER) += arm/sbrdsp_init_arm.o \
+ arm/aacpsdsp_init_arm.o
+
+OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o \
+
+ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o
+
+OBJS-$(CONFIG_FLAC_DECODER) += arm/flacdsp_init_arm.o \
+ arm/flacdsp_arm.o \
+
+OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o
+ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o
+
+OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o
+OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_init_arm.o
+OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o
+OBJS-$(CONFIG_VP5_DECODER) += arm/vp56dsp_init_arm.o
+OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_init_arm.o
+OBJS-$(CONFIG_VP8_DECODER) += arm/vp8dsp_init_arm.o
+ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o \
+ arm/vp8dsp_init_armv6.o \
+ arm/vp8dsp_armv6.o
+
+OBJS-$(CONFIG_H264CHROMA) += arm/h264chroma_init_arm.o
+OBJS-$(CONFIG_H264DSP) += arm/h264dsp_init_arm.o
+OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
+OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_init_arm.o
+
+OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_arm.o \
+ arm/hpeldsp_init_arm.o
+
+OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_init_arm.o
+OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_init_arm.o \
+ arm/rv40dsp_init_arm.o \
+
+OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_arm.o \
+
+OBJS += arm/dsputil_init_arm.o \
+ arm/dsputil_arm.o \
+ arm/fft_init_arm.o \
+ arm/fft_fixed_init_arm.o \
+ arm/fmtconvert_init_arm.o \
+ arm/jrevdct_arm.o \
+ arm/simple_idct_arm.o \
+
+ARMV5TE-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_armv5te.o \
+ arm/mpegvideo_armv5te_s.o \
+
+ARMV5TE-OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_armv5te.o \
+ arm/videodsp_armv5te.o \
+
+ARMV5TE-OBJS += arm/dsputil_init_armv5te.o \
+ arm/simple_idct_armv5te.o \
+
+ARMV6-OBJS += arm/dsputil_init_armv6.o \
+ arm/dsputil_armv6.o \
+ arm/simple_idct_armv6.o \
+
+ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_armv6.o \
+ arm/hpeldsp_init_armv6.o
+
+VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o
+
+NEON-OBJS-$(CONFIG_FFT) += arm/fft_neon.o \
+ arm/fft_fixed_neon.o \
+
+NEON-OBJS-$(CONFIG_MDCT) += arm/mdct_neon.o \
+ arm/mdct_fixed_neon.o \
+
+NEON-OBJS-$(CONFIG_RDFT) += arm/rdft_neon.o \
+
+NEON-OBJS-$(CONFIG_H264CHROMA) += arm/h264cmc_neon.o
+NEON-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_neon.o \
+ arm/h264idct_neon.o \
+
+NEON-OBJS-$(CONFIG_H264PRED) += arm/h264pred_neon.o \
+
+NEON-OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_neon.o \
+
+NEON-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_neon.o \
+ arm/hpeldsp_init_neon.o
+
+NEON-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_neon.o
+
+NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/sbrdsp_neon.o \
+ arm/aacpsdsp_neon.o
+
+NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \
+ arm/synth_filter_neon.o \
+
+NEON-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_neon.o
+NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
+NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
+ arm/rv40dsp_neon.o \
+
+NEON-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_neon.o
+
+NEON-OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_neon.o
+
+NEON-OBJS-$(CONFIG_VP5_DECODER) += arm/vp56dsp_neon.o \
+
+NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_neon.o \
+
+NEON-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8dsp_init_neon.o \
+ arm/vp8dsp_neon.o
+
+NEON-OBJS += arm/dsputil_init_neon.o \
+ arm/dsputil_neon.o \
+ arm/fmtconvert_neon.o \
+ arm/int_neon.o \
+ arm/simple_idct_neon.o \
diff --git a/ffmpeg/libavcodec/arm/aac.h b/ffmpeg/libavcodec/arm/aac.h
new file mode 100644
index 0000000..cafa881
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/aac.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_AAC_H +#define AVCODEC_ARM_AAC_H + +#include "config.h" + +#if HAVE_NEON_INLINE + +#define VMUL2 VMUL2 +static inline float *VMUL2(float *dst, const float *v, unsigned idx, + const float *scale) +{ + unsigned v0, v1; + __asm__ ("ubfx %0, %6, #0, #4 \n\t" + "ubfx %1, %6, #4, #4 \n\t" + "ldr %0, [%5, %0, lsl #2] \n\t" + "ldr %1, [%5, %1, lsl #2] \n\t" + "vld1.32 {d1[]}, [%7,:32] \n\t" + "vmov d0, %0, %1 \n\t" + "vmul.f32 d0, d0, d1 \n\t" + "vst1.32 {d0}, [%2,:64]! \n\t" + : "=&r"(v0), "=&r"(v1), "+r"(dst), "=m"(dst[0]), "=m"(dst[1]) + : "r"(v), "r"(idx), "r"(scale) + : "d0", "d1"); + return dst; +} + +#define VMUL4 VMUL4 +static inline float *VMUL4(float *dst, const float *v, unsigned idx, + const float *scale) +{ + unsigned v0, v1, v2, v3; + __asm__ ("ubfx %0, %10, #0, #2 \n\t" + "ubfx %1, %10, #2, #2 \n\t" + "ldr %0, [%9, %0, lsl #2] \n\t" + "ubfx %2, %10, #4, #2 \n\t" + "ldr %1, [%9, %1, lsl #2] \n\t" + "ubfx %3, %10, #6, #2 \n\t" + "ldr %2, [%9, %2, lsl #2] \n\t" + "vmov d0, %0, %1 \n\t" + "ldr %3, [%9, %3, lsl #2] \n\t" + "vld1.32 {d2[],d3[]},[%11,:32] \n\t" + "vmov d1, %2, %3 \n\t" + "vmul.f32 q0, q0, q1 \n\t" + "vst1.32 {q0}, [%4,:128]! \n\t" + : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst), + "=m"(dst[0]), "=m"(dst[1]), "=m"(dst[2]), "=m"(dst[3]) + : "r"(v), "r"(idx), "r"(scale) + : "d0", "d1", "d2", "d3"); + return dst; +} + +#define VMUL2S VMUL2S +static inline float *VMUL2S(float *dst, const float *v, unsigned idx, + unsigned sign, const float *scale) +{ + unsigned v0, v1, v2, v3; + __asm__ ("ubfx %0, %8, #0, #4 \n\t" + "ubfx %1, %8, #4, #4 \n\t" + "ldr %0, [%7, %0, lsl #2] \n\t" + "lsl %2, %10, #30 \n\t" + "ldr %1, [%7, %1, lsl #2] \n\t" + "lsl %3, %10, #31 \n\t" + "vmov d0, %0, %1 \n\t" + "bic %2, %2, #1<<30 \n\t" + "vld1.32 {d1[]}, [%9,:32] \n\t" + "vmov d2, %2, %3 \n\t" + "veor d0, d0, d2 \n\t" + "vmul.f32 d0, d0, d1 \n\t" + "vst1.32 {d0}, [%4,:64]! \n\t" + : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst), + "=m"(dst[0]), "=m"(dst[1]) + : "r"(v), "r"(idx), "r"(scale), "r"(sign) + : "d0", "d1", "d2"); + return dst; +} + +#define VMUL4S VMUL4S +static inline float *VMUL4S(float *dst, const float *v, unsigned idx, + unsigned sign, const float *scale) +{ + unsigned v0, v1, v2, v3, nz; + __asm__ ("vld1.32 {d2[],d3[]},[%13,:32] \n\t" + "ubfx %0, %12, #0, #2 \n\t" + "ubfx %1, %12, #2, #2 \n\t" + "ldr %0, [%11,%0, lsl #2] \n\t" + "ubfx %2, %12, #4, #2 \n\t" + "ldr %1, [%11,%1, lsl #2] \n\t" + "ubfx %3, %12, #6, #2 \n\t" + "ldr %2, [%11,%2, lsl #2] \n\t" + "vmov d0, %0, %1 \n\t" + "ldr %3, [%11,%3, lsl #2] \n\t" + "lsr %6, %12, #12 \n\t" + "rbit %6, %6 \n\t" + "vmov d1, %2, %3 \n\t" + "lsls %6, %6, #1 \n\t" + "and %0, %5, #1<<31 \n\t" + "it cs \n\t" + "lslcs %5, %5, #1 \n\t" + "lsls %6, %6, #1 \n\t" + "and %1, %5, #1<<31 \n\t" + "it cs \n\t" + "lslcs %5, %5, #1 \n\t" + "lsls %6, %6, #1 \n\t" + "and %2, %5, #1<<31 \n\t" + "it cs \n\t" + "lslcs %5, %5, #1 \n\t" + "vmov d4, %0, %1 \n\t" + "and %3, %5, #1<<31 \n\t" + "vmov d5, %2, %3 \n\t" + "veor q0, q0, q2 \n\t" + "vmul.f32 q0, q0, q1 \n\t" + "vst1.32 {q0}, [%4,:128]! 
\n\t" + : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst), + "+r"(sign), "=r"(nz), + "=m"(dst[0]), "=m"(dst[1]), "=m"(dst[2]), "=m"(dst[3]) + : "r"(v), "r"(idx), "r"(scale) + : "cc", "d0", "d1", "d2", "d3", "d4", "d5"); + return dst; +} + +#endif /* HAVE_NEON_INLINE */ + +#endif /* AVCODEC_ARM_AAC_H */ diff --git a/ffmpeg/libavcodec/arm/aacpsdsp_init_arm.c b/ffmpeg/libavcodec/arm/aacpsdsp_init_arm.c new file mode 100644 index 0000000..6326376 --- /dev/null +++ b/ffmpeg/libavcodec/arm/aacpsdsp_init_arm.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2012 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/arm/cpu.h" +#include "libavutil/attributes.h" +#include "libavcodec/aacpsdsp.h" + +void ff_ps_add_squares_neon(float *dst, const float (*src)[2], int n); +void ff_ps_mul_pair_single_neon(float (*dst)[2], float (*src0)[2], + float *src1, int n); +void ff_ps_hybrid_analysis_neon(float (*out)[2], float (*in)[2], + const float (*filter)[8][2], + int stride, int n); +void ff_ps_hybrid_analysis_ileave_neon(float (*out)[32][2], float L[2][38][64], + int i, int len); +void ff_ps_hybrid_synthesis_deint_neon(float out[2][38][64], float (*in)[32][2], + int i, int len); +void ff_ps_decorrelate_neon(float (*out)[2], float (*delay)[2], + float (*ap_delay)[PS_QMF_TIME_SLOTS+PS_MAX_AP_DELAY][2], + const float phi_fract[2], float (*Q_fract)[2], + const float *transient_gain, float g_decay_slope, + int len); +void ff_ps_stereo_interpolate_neon(float (*l)[2], float (*r)[2], + float h[2][4], float h_step[2][4], + int len); + +av_cold void ff_psdsp_init_arm(PSDSPContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + s->add_squares = ff_ps_add_squares_neon; + s->mul_pair_single = ff_ps_mul_pair_single_neon; + s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_neon; + s->hybrid_analysis = ff_ps_hybrid_analysis_neon; + s->stereo_interpolate[0] = ff_ps_stereo_interpolate_neon; + } +} diff --git a/ffmpeg/libavcodec/arm/aacpsdsp_neon.S b/ffmpeg/libavcodec/arm/aacpsdsp_neon.S new file mode 100644 index 0000000..fb00900 --- /dev/null +++ b/ffmpeg/libavcodec/arm/aacpsdsp_neon.S @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2012 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_ps_add_squares_neon, export=1 + mov r3, r0 + sub r2, r2, #4 + vld1.32 {q0}, [r1,:128]! + vmul.f32 q0, q0, q0 + vld1.32 {q2}, [r1,:128]! + vmul.f32 q2, q2, q2 + vld1.32 {q1}, [r0,:128]! +1: + vpadd.f32 d6, d0, d1 + vld1.32 {q0}, [r1,:128]! + vpadd.f32 d7, d4, d5 + vmul.f32 q0, q0, q0 + vld1.32 {q2}, [r1,:128]! + vadd.f32 q3, q1, q3 + vld1.32 {q1}, [r0,:128]! + vmul.f32 q2, q2, q2 + vst1.32 {q3}, [r3,:128]! + subs r2, r2, #4 + bgt 1b + vpadd.f32 d6, d0, d1 + vpadd.f32 d7, d4, d5 + vadd.f32 q1, q1, q3 + vst1.32 {q1}, [r3,:128]! + bx lr +endfunc + +function ff_ps_mul_pair_single_neon, export=1 + sub r3, r3, #4 + tst r1, #8 + bne 2f + vld1.32 {q0}, [r1,:128]! +1: + vld1.32 {q3}, [r2,:128]! + vmul.f32 d4, d0, d6[0] + vmul.f32 d5, d1, d6[1] + vld1.32 {q1}, [r1,:128]! + vmul.f32 d6, d2, d7[0] + vmul.f32 d7, d3, d7[1] + vld1.32 {q0}, [r1,:128]! + vst1.32 {q2,q3}, [r0,:128]! + subs r3, r3, #4 + bgt 1b + vld1.32 {q3}, [r2,:128]! + vmul.f32 d4, d0, d6[0] + vmul.f32 d5, d1, d6[1] + vld1.32 {q1}, [r1,:128]! + vmul.f32 d6, d2, d7[0] + vmul.f32 d7, d3, d7[1] + vst1.32 {q2,q3}, [r0,:128]! + bx lr +2: + vld1.32 {d0}, [r1,:64]! + vld1.32 {d1,d2}, [r1,:128]! +1: + vld1.32 {q3}, [r2,:128]! + vmul.f32 d4, d0, d6[0] + vmul.f32 d5, d1, d6[1] + vld1.32 {d0,d1}, [r1,:128]! + vmul.f32 d6, d2, d7[0] + vmul.f32 d7, d0, d7[1] + vmov d0, d1 + vld1.32 {d1,d2}, [r1,:128]! + vst1.32 {q2,q3}, [r0,:128]! + subs r3, r3, #4 + bgt 1b + vld1.32 {q3}, [r2,:128]! + vmul.f32 d4, d0, d6[0] + vmul.f32 d5, d1, d6[1] + vld1.32 {d0}, [r1,:64]! + vmul.f32 d6, d2, d7[0] + vmul.f32 d7, d0, d7[1] + vst1.32 {q2,q3}, [r0,:128]! + bx lr +endfunc + +function ff_ps_hybrid_synthesis_deint_neon, export=1 + push {r4-r8,lr} + add r0, r0, r2, lsl #2 + add r1, r1, r2, lsl #5+1+2 + rsb r2, r2, #64 + mov r5, #64*4 + mov lr, r0 + add r4, r0, #38*64*4 + mov r12, r3 +2: + vld1.32 {d0,d1}, [r1,:128]! + vst1.32 {d0[0]}, [lr,:32], r5 + vst1.32 {d0[1]}, [r4,:32], r5 + vst1.32 {d1[0]}, [lr,:32], r5 + vst1.32 {d1[1]}, [r4,:32], r5 + subs r12, r12, #2 + bgt 2b + add r0, r0, #4 + sub r2, r2, #1 + tst r2, #2 + bne 6f +1: + mov lr, r0 + add r4, r0, #38*64*4 + add r6, r1, # 32*2*4 + add r7, r1, #2*32*2*4 + add r8, r1, #3*32*2*4 + mov r12, r3 +2: + vld1.32 {d0,d1}, [r1,:128]! + vld1.32 {d2,d3}, [r6,:128]! + vld1.32 {d4,d5}, [r7,:128]! + vld1.32 {d6,d7}, [r8,:128]! + vst4.32 {d0[0],d2[0],d4[0],d6[0]}, [lr,:128], r5 + vst4.32 {d0[1],d2[1],d4[1],d6[1]}, [r4,:128], r5 + vst4.32 {d1[0],d3[0],d5[0],d7[0]}, [lr,:128], r5 + vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r4,:128], r5 + subs r12, r12, #2 + bgt 2b + add r0, r0, #16 + add r1, r1, #3*32*2*4 + subs r2, r2, #4 + bgt 1b + pop {r4-r8,pc} +6: + mov lr, r0 + add r4, r0, #38*64*4 + add r6, r1, #32*2*4 + mov r12, r3 +2: + vld1.32 {d0,d1}, [r1,:128]! + vld1.32 {d2,d3}, [r6,:128]! 
+ vst2.32 {d0[0],d2[0]}, [lr,:64], r5 + vst2.32 {d0[1],d2[1]}, [r4,:64], r5 + vst2.32 {d1[0],d3[0]}, [lr,:64], r5 + vst2.32 {d1[1],d3[1]}, [r4,:64], r5 + subs r12, r12, #2 + bgt 2b + add r0, r0, #8 + add r1, r1, #32*2*4 + sub r2, r2, #2 + b 1b +endfunc + +function ff_ps_hybrid_analysis_neon, export=1 + vldm r1, {d19-d31} + ldr r12, [sp] + lsl r3, r3, #3 + vadd.f32 d16, d19, d31 + vadd.f32 d17, d20, d30 + vsub.f32 d18, d19, d31 + vsub.f32 d19, d20, d30 + vsub.f32 d0, d21, d29 + vsub.f32 d1, d22, d28 + vadd.f32 d2, d21, d29 + vadd.f32 d3, d22, d28 + vadd.f32 d20, d23, d27 + vadd.f32 d21, d24, d26 + vsub.f32 d22, d23, d27 + vsub.f32 d23, d24, d26 + vmov.i32 d6, #1<<31 + vmov.i32 d7, #0 + vmov.f32 q14, #0.0 + vmov.f32 q15, #0.0 + vtrn.32 d6, d7 + vrev64.32 q9, q9 + vrev64.32 q0, q0 + vrev64.32 q11, q11 + veor q9, q9, q3 + veor q0, q0, q3 + veor q11, q11, q3 + vld1.32 {q13}, [r2,:128]! + vtrn.32 q8, q9 + vtrn.32 q1, q0 + vtrn.32 q10, q11 + sub r12, r12, #1 + vmla.f32 q14, q8, q13 + vld1.32 {q2}, [r2,:128]! + vmla.f32 q15, q9, q13 +1: + vmla.f32 q14, q1, q2 + vld1.32 {q13}, [r2,:128]! + vmla.f32 q15, q0, q2 + vmla.f32 q14, q10, q13 + vld1.32 {q2}, [r2,:128]! + vmla.f32 q15, q11, q13 + vld1.32 {q13}, [r2,:128]! + vadd.f32 d6, d28, d29 + vadd.f32 d7, d30, d31 + vmov.f32 q14, #0.0 + vmov.f32 q15, #0.0 + vmla.f32 q14, q8, q13 + vpadd.f32 d6, d6, d7 + vmla.f32 q15, q9, q13 + vmla.f32 d6, d25, d4[0] + vld1.32 {q2}, [r2,:128]! + vst1.32 {d6}, [r0,:64], r3 + subs r12, r12, #1 + bgt 1b + vmla.f32 q14, q1, q2 + vld1.32 {q13}, [r2,:128]! + vmla.f32 q15, q0, q2 + vmla.f32 q14, q10, q13 + vld1.32 {q2}, [r2,:128]! + vmla.f32 q15, q11, q13 + vadd.f32 d6, d28, d29 + vadd.f32 d7, d30, d31 + vpadd.f32 d6, d6, d7 + vmla.f32 d6, d25, d4[0] + vst1.32 {d6}, [r0,:64], r3 + bx lr +endfunc + +function ff_ps_stereo_interpolate_neon, export=1 + vld1.32 {q0}, [r2] + vld1.32 {q14}, [r3] + vadd.f32 q15, q14, q14 + mov r2, r0 + mov r3, r1 + ldr r12, [sp] + vadd.f32 q1, q0, q14 + vadd.f32 q0, q0, q15 + vld1.32 {q2}, [r0,:64]! + vld1.32 {q3}, [r1,:64]! + subs r12, r12, #1 + beq 2f +1: + vmul.f32 d16, d4, d2[0] + vmul.f32 d17, d5, d0[0] + vmul.f32 d18, d4, d2[1] + vmul.f32 d19, d5, d0[1] + vmla.f32 d16, d6, d3[0] + vmla.f32 d17, d7, d1[0] + vmla.f32 d18, d6, d3[1] + vmla.f32 d19, d7, d1[1] + vadd.f32 q1, q1, q15 + vadd.f32 q0, q0, q15 + vld1.32 {q2}, [r0,:64]! + vld1.32 {q3}, [r1,:64]! + vst1.32 {q8}, [r2,:64]! + vst1.32 {q9}, [r3,:64]! + subs r12, r12, #2 + bgt 1b + it lt + bxlt lr +2: + vmul.f32 d16, d4, d2[0] + vmul.f32 d18, d4, d2[1] + vmla.f32 d16, d6, d3[0] + vmla.f32 d18, d6, d3[1] + vst1.32 {d16}, [r2,:64]! + vst1.32 {d18}, [r3,:64]! + bx lr +endfunc diff --git a/ffmpeg/libavcodec/arm/ac3dsp_arm.S b/ffmpeg/libavcodec/arm/ac3dsp_arm.S new file mode 100644 index 0000000..ed8eb37 --- /dev/null +++ b/ffmpeg/libavcodec/arm/ac3dsp_arm.S @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2011 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_ac3_update_bap_counts_arm, export=1 + push {lr} + ldrb lr, [r1], #1 +1: + lsl r3, lr, #1 + ldrh r12, [r0, r3] + subs r2, r2, #1 + it gt + ldrbgt lr, [r1], #1 + add r12, r12, #1 + strh r12, [r0, r3] + bgt 1b + pop {pc} +endfunc diff --git a/ffmpeg/libavcodec/arm/ac3dsp_armv6.S b/ffmpeg/libavcodec/arm/ac3dsp_armv6.S new file mode 100644 index 0000000..2028d0b --- /dev/null +++ b/ffmpeg/libavcodec/arm/ac3dsp_armv6.S @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2011 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_ac3_bit_alloc_calc_bap_armv6, export=1 + ldr r12, [sp] + cmp r12, #-960 + beq 4f + push {r4-r11,lr} + add r5, sp, #40 + movrelx r4, X(ff_ac3_bin_to_band_tab), r11 + movrelx lr, X(ff_ac3_band_start_tab) + ldm r5, {r5-r7} + ldrb r4, [r4, r2] + add r1, r1, r2, lsl #1 @ psd + start + add r0, r0, r4, lsl #1 @ mask + band + add r4, r4, lr + add r7, r7, r2 @ bap + start +1: + ldrsh r9, [r0], #2 @ mask[band] + mov r8, #0xff0 + sub r9, r9, r12 @ - snr_offset + ldrb r10, [r4, #1]! @ band_start_tab[++band] + subs r9, r9, r5 @ - floor + it lt + movlt r9, #0 + cmp r10, r3 @ - end + and r9, r9, r8, lsl #1 @ & 0x1fe0 + ite gt + subgt r8, r3, r2 + suble r8, r10, r2 + mov r2, r10 + add r9, r9, r5 @ + floor => m + tst r8, #1 + add r11, r7, r8 + bne 3f + b 5f +2: + ldrsh r8, [r1], #2 + ldrsh lr, [r1], #2 + sub r8, r8, r9 + sub lr, lr, r9 + usat r8, #6, r8, asr #5 @ address + usat lr, #6, lr, asr #5 + ldrb r8, [r6, r8] @ bap_tab[address] + ldrb lr, [r6, lr] + strb r8, [r7], #1 @ bap[bin] + strb lr, [r7], #1 +5: cmp r7, r11 + blo 2b + cmp r3, r10 + bgt 1b + pop {r4-r11,pc} +3: + ldrsh r8, [r1], #2 @ psd[bin] + sub r8, r8, r9 @ - m + usat r8, #6, r8, asr #5 @ address + ldrb r8, [r6, r8] @ bap_tab[address] + strb r8, [r7], #1 @ bap[bin] + b 5b +4: + ldr r0, [sp, #12] + mov r1, #0 + mov r2, #256 + b X(memset) +endfunc diff --git a/ffmpeg/libavcodec/arm/ac3dsp_init_arm.c b/ffmpeg/libavcodec/arm/ac3dsp_init_arm.c new file mode 100644 index 0000000..ffe0747 --- /dev/null +++ b/ffmpeg/libavcodec/arm/ac3dsp_init_arm.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2011 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "libavutil/arm/cpu.h" +#include "libavutil/attributes.h" +#include "libavcodec/ac3dsp.h" +#include "config.h" + +void ff_ac3_exponent_min_neon(uint8_t *exp, int num_reuse_blocks, int nb_coefs); +int ff_ac3_max_msb_abs_int16_neon(const int16_t *src, int len); +void ff_ac3_lshift_int16_neon(int16_t *src, unsigned len, unsigned shift); +void ff_ac3_rshift_int32_neon(int32_t *src, unsigned len, unsigned shift); +void ff_float_to_fixed24_neon(int32_t *dst, const float *src, unsigned int len); +void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs); +void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4], + const int32_t *coef0, + const int32_t *coef1, + int len); +void ff_ac3_sum_square_butterfly_float_neon(float sum[4], + const float *coef0, + const float *coef1, + int len); + +void ff_ac3_bit_alloc_calc_bap_armv6(int16_t *mask, int16_t *psd, + int start, int end, + int snr_offset, int floor, + const uint8_t *bap_tab, uint8_t *bap); + +void ff_ac3_update_bap_counts_arm(uint16_t mant_cnt[16], uint8_t *bap, int len); + +av_cold void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact) +{ + int cpu_flags = av_get_cpu_flags(); + + c->update_bap_counts = ff_ac3_update_bap_counts_arm; + + if (have_armv6(cpu_flags)) { + c->bit_alloc_calc_bap = ff_ac3_bit_alloc_calc_bap_armv6; + } + + if (have_neon(cpu_flags)) { + c->ac3_exponent_min = ff_ac3_exponent_min_neon; + c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_neon; + c->ac3_lshift_int16 = ff_ac3_lshift_int16_neon; + c->ac3_rshift_int32 = ff_ac3_rshift_int32_neon; + c->float_to_fixed24 = ff_float_to_fixed24_neon; + c->extract_exponents = ff_ac3_extract_exponents_neon; + c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon; + c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_neon; + } +} diff --git a/ffmpeg/libavcodec/arm/ac3dsp_neon.S b/ffmpeg/libavcodec/arm/ac3dsp_neon.S new file mode 100644 index 0000000..42f35e3 --- /dev/null +++ b/ffmpeg/libavcodec/arm/ac3dsp_neon.S @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2011 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_ac3_max_msb_abs_int16_neon, export=1 + vmov.i16 q0, #0 + vmov.i16 q2, #0 +1: vld1.16 {q1}, [r0,:128]! + vabs.s16 q1, q1 + vld1.16 {q3}, [r0,:128]! 
+ vabs.s16 q3, q3 + vorr q0, q0, q1 + vorr q2, q2, q3 + subs r1, r1, #16 + bgt 1b + vorr q0, q0, q2 + vorr d0, d0, d1 + vpmax.u16 d0, d0, d0 + vpmax.u16 d0, d0, d0 + vmov.u16 r0, d0[0] + bx lr +endfunc + +function ff_ac3_exponent_min_neon, export=1 + cmp r1, #0 + it eq + bxeq lr + push {lr} + mov r12, #256 +1: + vld1.8 {q0}, [r0,:128] + mov lr, r1 + add r3, r0, #256 +2: vld1.8 {q1}, [r3,:128], r12 + subs lr, lr, #1 + vmin.u8 q0, q0, q1 + bgt 2b + subs r2, r2, #16 + vst1.8 {q0}, [r0,:128]! + bgt 1b + pop {pc} +endfunc + +function ff_ac3_lshift_int16_neon, export=1 + vdup.16 q0, r2 +1: vld1.16 {q1}, [r0,:128] + vshl.s16 q1, q1, q0 + vst1.16 {q1}, [r0,:128]! + subs r1, r1, #8 + bgt 1b + bx lr +endfunc + +function ff_ac3_rshift_int32_neon, export=1 + rsb r2, r2, #0 + vdup.32 q0, r2 +1: vld1.32 {q1}, [r0,:128] + vshl.s32 q1, q1, q0 + vst1.32 {q1}, [r0,:128]! + subs r1, r1, #4 + bgt 1b + bx lr +endfunc + +function ff_float_to_fixed24_neon, export=1 +1: vld1.32 {q0-q1}, [r1,:128]! + vcvt.s32.f32 q0, q0, #24 + vld1.32 {q2-q3}, [r1,:128]! + vcvt.s32.f32 q1, q1, #24 + vcvt.s32.f32 q2, q2, #24 + vst1.32 {q0-q1}, [r0,:128]! + vcvt.s32.f32 q3, q3, #24 + vst1.32 {q2-q3}, [r0,:128]! + subs r2, r2, #16 + bgt 1b + bx lr +endfunc + +function ff_ac3_extract_exponents_neon, export=1 + vmov.i32 q15, #8 +1: + vld1.32 {q0}, [r1,:128]! + vabs.s32 q1, q0 + vclz.i32 q3, q1 + vsub.i32 q3, q3, q15 + vmovn.i32 d6, q3 + vmovn.i16 d6, q3 + vst1.32 {d6[0]}, [r0,:32]! + subs r2, r2, #4 + bgt 1b + bx lr +endfunc + +function ff_ac3_sum_square_butterfly_int32_neon, export=1 + vmov.i64 q0, #0 + vmov.i64 q1, #0 + vmov.i64 q2, #0 + vmov.i64 q3, #0 +1: + vld1.32 {d16}, [r1]! + vld1.32 {d17}, [r2]! + vadd.s32 d18, d16, d17 + vsub.s32 d19, d16, d17 + vmlal.s32 q0, d16, d16 + vmlal.s32 q1, d17, d17 + vmlal.s32 q2, d18, d18 + vmlal.s32 q3, d19, d19 + subs r3, r3, #2 + bgt 1b + vadd.s64 d0, d0, d1 + vadd.s64 d1, d2, d3 + vadd.s64 d2, d4, d5 + vadd.s64 d3, d6, d7 + vst1.64 {q0-q1}, [r0] + bx lr +endfunc + +function ff_ac3_sum_square_butterfly_float_neon, export=1 + vmov.f32 q0, #0.0 + vmov.f32 q1, #0.0 +1: + vld1.32 {d16}, [r1]! + vld1.32 {d17}, [r2]! + vadd.f32 d18, d16, d17 + vsub.f32 d19, d16, d17 + vmla.f32 d0, d16, d16 + vmla.f32 d1, d17, d17 + vmla.f32 d2, d18, d18 + vmla.f32 d3, d19, d19 + subs r3, r3, #2 + bgt 1b + vpadd.f32 d0, d0, d1 + vpadd.f32 d1, d2, d3 + vst1.32 {q0}, [r0] + bx lr +endfunc diff --git a/ffmpeg/libavcodec/arm/asm-offsets.h b/ffmpeg/libavcodec/arm/asm-offsets.h new file mode 100644 index 0000000..5cfc5cb --- /dev/null +++ b/ffmpeg/libavcodec/arm/asm-offsets.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2010 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_ASM_OFFSETS_H +#define AVCODEC_ARM_ASM_OFFSETS_H + +#ifndef __ASSEMBLER__ +#include +#define CHK_OFFS(s, m, o) struct check_##o { \ + int x_##o[offsetof(s, m) == o? 1: -1]; \ + } +#endif + +/* MpegEncContext */ +#define Y_DC_SCALE 0xa8 +#define C_DC_SCALE 0xac +#define AC_PRED 0xb0 +#define BLOCK_LAST_INDEX 0xb4 +#define H263_AIC 0xe4 +#define INTER_SCANTAB_RASTER_END 0x12c + +#endif /* AVCODEC_ARM_ASM_OFFSETS_H */ diff --git a/ffmpeg/libavcodec/arm/dca.h b/ffmpeg/libavcodec/arm/dca.h new file mode 100644 index 0000000..2cfd18a --- /dev/null +++ b/ffmpeg/libavcodec/arm/dca.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2011 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_DCA_H +#define AVCODEC_ARM_DCA_H + +#include + +#include "config.h" +#include "libavcodec/mathops.h" + +#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4) && !CONFIG_THUMB + +#define decode_blockcodes decode_blockcodes +static inline int decode_blockcodes(int code1, int code2, int levels, + int *values) +{ + int v0, v1, v2, v3, v4, v5; + + __asm__ ("smmul %8, %14, %18 \n" + "smmul %11, %15, %18 \n" + "smlabb %14, %8, %17, %14 \n" + "smlabb %15, %11, %17, %15 \n" + "smmul %9, %8, %18 \n" + "smmul %12, %11, %18 \n" + "sub %14, %14, %16, lsr #1 \n" + "sub %15, %15, %16, lsr #1 \n" + "smlabb %8, %9, %17, %8 \n" + "smlabb %11, %12, %17, %11 \n" + "smmul %10, %9, %18 \n" + "smmul %13, %12, %18 \n" + "str %14, %0 \n" + "str %15, %4 \n" + "sub %8, %8, %16, lsr #1 \n" + "sub %11, %11, %16, lsr #1 \n" + "smlabb %9, %10, %17, %9 \n" + "smlabb %12, %13, %17, %12 \n" + "smmul %14, %10, %18 \n" + "smmul %15, %13, %18 \n" + "str %8, %1 \n" + "str %11, %5 \n" + "sub %9, %9, %16, lsr #1 \n" + "sub %12, %12, %16, lsr #1 \n" + "smlabb %10, %14, %17, %10 \n" + "smlabb %13, %15, %17, %13 \n" + "str %9, %2 \n" + "str %12, %6 \n" + "sub %10, %10, %16, lsr #1 \n" + "sub %13, %13, %16, lsr #1 \n" + "str %10, %3 \n" + "str %13, %7 \n" + : "=m"(values[0]), "=m"(values[1]), + "=m"(values[2]), "=m"(values[3]), + "=m"(values[4]), "=m"(values[5]), + "=m"(values[6]), "=m"(values[7]), + "=&r"(v0), "=&r"(v1), "=&r"(v2), + "=&r"(v3), "=&r"(v4), "=&r"(v5), + "+&r"(code1), "+&r"(code2) + : "r"(levels - 1), "r"(-levels), "r"(ff_inverse[levels])); + + return code1 | code2; +} + +#endif + +#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y + +#define int8x8_fmul_int32 int8x8_fmul_int32 +static inline void int8x8_fmul_int32(float *dst, const int8_t *src, int scale) +{ + __asm__ ("vcvt.f32.s32 %2, %2, #4 \n" + "vld1.8 {d0}, [%1,:64] \n" + "vmovl.s8 q0, d0 \n" + "vmovl.s16 q1, d1 \n" + 
"vmovl.s16 q0, d0 \n" + "vcvt.f32.s32 q0, q0 \n" + "vcvt.f32.s32 q1, q1 \n" + "vmul.f32 q0, q0, %y2 \n" + "vmul.f32 q1, q1, %y2 \n" + "vst1.32 {q0-q1}, [%m0,:128] \n" + : "=Um"(*(float (*)[8])dst) + : "r"(src), "x"(scale) + : "d0", "d1", "d2", "d3"); +} + +#endif + +#endif /* AVCODEC_ARM_DCA_H */ diff --git a/ffmpeg/libavcodec/arm/dcadsp_init_arm.c b/ffmpeg/libavcodec/arm/dcadsp_init_arm.c new file mode 100644 index 0000000..56568e0 --- /dev/null +++ b/ffmpeg/libavcodec/arm/dcadsp_init_arm.c @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2010 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/arm/cpu.h" +#include "libavutil/attributes.h" +#include "libavcodec/dcadsp.h" + +void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs, + int decifactor, float scale); + +av_cold void ff_dcadsp_init_arm(DCADSPContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) + s->lfe_fir = ff_dca_lfe_fir_neon; +} diff --git a/ffmpeg/libavcodec/arm/dcadsp_neon.S b/ffmpeg/libavcodec/arm/dcadsp_neon.S new file mode 100644 index 0000000..6a6c77a --- /dev/null +++ b/ffmpeg/libavcodec/arm/dcadsp_neon.S @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2010 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_dca_lfe_fir_neon, export=1 + push {r4-r6,lr} + + add r4, r0, r3, lsl #2 @ out2 + add r5, r2, #256*4-16 @ cf1 + sub r1, r1, #12 + cmp r3, #32 + ite eq + moveq r6, #256/32 + movne r6, #256/64 +NOVFP vldr s0, [sp, #16] @ scale + mov lr, #-16 +1: + vmov.f32 q2, #0.0 @ v0 + vmov.f32 q3, #0.0 @ v1 + mov r12, r6 +2: + vld1.32 {q8}, [r2,:128]! @ cf0 + vld1.32 {q9}, [r5,:128], lr @ cf1 + vld1.32 {q1}, [r1], lr @ in + subs r12, r12, #4 + vrev64.32 q10, q8 + vmla.f32 q3, q1, q9 + vmla.f32 d4, d2, d21 + vmla.f32 d5, d3, d20 + bne 2b + + add r1, r1, r6, lsl #2 + subs r3, r3, #1 + vadd.f32 d4, d4, d5 + vadd.f32 d6, d6, d7 + vpadd.f32 d4, d4, d6 + vmul.f32 d5, d4, d0[0] + vst1.32 {d5[0]}, [r0,:32]! + vst1.32 {d5[1]}, [r4,:32]! 
+ bne 1b + + pop {r4-r6,pc} +endfunc diff --git a/ffmpeg/libavcodec/arm/dsputil_arm.S b/ffmpeg/libavcodec/arm/dsputil_arm.S new file mode 100644 index 0000000..586a833 --- /dev/null +++ b/ffmpeg/libavcodec/arm/dsputil_arm.S @@ -0,0 +1,125 @@ +@ +@ ARMv4 optimized DSP utils +@ Copyright (c) 2004 AGAWA Koji +@ +@ This file is part of FFmpeg. +@ +@ FFmpeg is free software; you can redistribute it and/or +@ modify it under the terms of the GNU Lesser General Public +@ License as published by the Free Software Foundation; either +@ version 2.1 of the License, or (at your option) any later version. +@ +@ FFmpeg is distributed in the hope that it will be useful, +@ but WITHOUT ANY WARRANTY; without even the implied warranty of +@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +@ Lesser General Public License for more details. +@ +@ You should have received a copy of the GNU Lesser General Public +@ License along with FFmpeg; if not, write to the Free Software +@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +@ + +#include "config.h" +#include "libavutil/arm/asm.S" + +#if !HAVE_ARMV5TE_EXTERNAL +#define pld @ +#endif + + .align 5 +@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride) +function ff_add_pixels_clamped_arm, export=1 + push {r4-r10} + mov r10, #8 +1: + ldr r4, [r1] /* load dest */ + /* block[0] and block[1]*/ + ldrsh r5, [r0] + ldrsh r7, [r0, #2] + and r6, r4, #0xFF + and r8, r4, #0xFF00 + add r6, r6, r5 + add r8, r7, r8, lsr #8 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + it ne + movne r6, r5, lsr #24 + tst r8, #0x100 + it ne + movne r8, r7, lsr #24 + mov r9, r6 + ldrsh r5, [r0, #4] /* moved form [A] */ + orr r9, r9, r8, lsl #8 + /* block[2] and block[3] */ + /* [A] */ + ldrsh r7, [r0, #6] + and r6, r4, #0xFF0000 + and r8, r4, #0xFF000000 + add r6, r5, r6, lsr #16 + add r8, r7, r8, lsr #24 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + it ne + movne r6, r5, lsr #24 + tst r8, #0x100 + it ne + movne r8, r7, lsr #24 + orr r9, r9, r6, lsl #16 + ldr r4, [r1, #4] /* moved form [B] */ + orr r9, r9, r8, lsl #24 + /* store dest */ + ldrsh r5, [r0, #8] /* moved form [C] */ + str r9, [r1] + + /* load dest */ + /* [B] */ + /* block[4] and block[5] */ + /* [C] */ + ldrsh r7, [r0, #10] + and r6, r4, #0xFF + and r8, r4, #0xFF00 + add r6, r6, r5 + add r8, r7, r8, lsr #8 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + it ne + movne r6, r5, lsr #24 + tst r8, #0x100 + it ne + movne r8, r7, lsr #24 + mov r9, r6 + ldrsh r5, [r0, #12] /* moved from [D] */ + orr r9, r9, r8, lsl #8 + /* block[6] and block[7] */ + /* [D] */ + ldrsh r7, [r0, #14] + and r6, r4, #0xFF0000 + and r8, r4, #0xFF000000 + add r6, r5, r6, lsr #16 + add r8, r7, r8, lsr #24 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + it ne + movne r6, r5, lsr #24 + tst r8, #0x100 + it ne + movne r8, r7, lsr #24 + orr r9, r9, r6, lsl #16 + add r0, r0, #16 /* moved from [E] */ + orr r9, r9, r8, lsl #24 + subs r10, r10, #1 /* moved from [F] */ + /* store dest */ + str r9, [r1, #4] + + /* [E] */ + /* [F] */ + add r1, r1, r2 + bne 1b + + pop {r4-r10} + bx lr +endfunc diff --git a/ffmpeg/libavcodec/arm/dsputil_arm.h b/ffmpeg/libavcodec/arm/dsputil_arm.h new file mode 100644 index 0000000..b7b5bdc --- /dev/null +++ b/ffmpeg/libavcodec/arm/dsputil_arm.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_DSPUTIL_H +#define AVCODEC_ARM_DSPUTIL_H + +#include "libavcodec/avcodec.h" +#include "libavcodec/dsputil.h" + +void ff_dsputil_init_armv5te(DSPContext* c, AVCodecContext *avctx); +void ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx); +void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx); +void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx); + +#endif /* AVCODEC_ARM_DSPUTIL_H */ diff --git a/ffmpeg/libavcodec/arm/dsputil_armv6.S b/ffmpeg/libavcodec/arm/dsputil_armv6.S new file mode 100644 index 0000000..6ec238b --- /dev/null +++ b/ffmpeg/libavcodec/arm/dsputil_armv6.S @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_add_pixels_clamped_armv6, export=1 + push {r4-r8,lr} + mov r3, #8 +1: + ldm r0!, {r4,r5,r12,lr} + ldrd r6, r7, [r1] + pkhbt r8, r4, r5, lsl #16 + pkhtb r5, r5, r4, asr #16 + pkhbt r4, r12, lr, lsl #16 + pkhtb lr, lr, r12, asr #16 + pld [r1, r2] + uxtab16 r8, r8, r6 + uxtab16 r5, r5, r6, ror #8 + uxtab16 r4, r4, r7 + uxtab16 lr, lr, r7, ror #8 + usat16 r8, #8, r8 + usat16 r5, #8, r5 + usat16 r4, #8, r4 + usat16 lr, #8, lr + orr r6, r8, r5, lsl #8 + orr r7, r4, lr, lsl #8 + subs r3, r3, #1 + strd_post r6, r7, r1, r2 + bgt 1b + pop {r4-r8,pc} +endfunc + +function ff_get_pixels_armv6, export=1 + pld [r1, r2] + push {r4-r8, lr} + mov lr, #8 +1: + ldrd_post r4, r5, r1, r2 + subs lr, lr, #1 + uxtb16 r6, r4 + uxtb16 r4, r4, ror #8 + uxtb16 r12, r5 + uxtb16 r8, r5, ror #8 + pld [r1, r2] + pkhbt r5, r6, r4, lsl #16 + pkhtb r6, r4, r6, asr #16 + pkhbt r7, r12, r8, lsl #16 + pkhtb r12, r8, r12, asr #16 + stm r0!, {r5,r6,r7,r12} + bgt 1b + + pop {r4-r8, pc} +endfunc + +function ff_diff_pixels_armv6, export=1 + pld [r1, r3] + pld [r2, r3] + push {r4-r9, lr} + mov lr, #8 +1: + ldrd_post r4, r5, r1, r3 + ldrd_post r6, r7, r2, r3 + uxtb16 r8, r4 + uxtb16 r4, r4, ror #8 + uxtb16 r9, r6 + uxtb16 r6, r6, ror #8 + pld [r1, r3] + ssub16 r9, r8, r9 + ssub16 r6, r4, r6 + uxtb16 r8, r5 + uxtb16 r5, r5, ror #8 + pld [r2, r3] + pkhbt r4, r9, r6, lsl #16 + pkhtb r6, r6, r9, asr #16 + uxtb16 r9, r7 + uxtb16 r7, r7, ror #8 + ssub16 r9, r8, r9 + ssub16 r5, r5, r7 + subs lr, lr, #1 + pkhbt r8, r9, r5, lsl #16 + pkhtb r9, r5, r9, asr #16 + stm r0!, {r4,r6,r8,r9} + bgt 1b + + pop {r4-r9, pc} +endfunc + +function ff_pix_abs16_armv6, export=1 + ldr r0, [sp] + push {r4-r9, lr} + mov r12, #0 + mov lr, #0 + ldm r1, {r4-r7} + ldr r8, [r2] +1: + ldr r9, [r2, #4] + pld [r1, r3] + usada8 r12, r4, r8, r12 + ldr r8, [r2, #8] + pld [r2, r3] + usada8 lr, r5, r9, lr + ldr r9, [r2, #12] + usada8 r12, r6, r8, r12 + subs r0, r0, #1 + usada8 lr, r7, r9, lr + beq 2f + add r1, r1, r3 + ldm r1, {r4-r7} + add r2, r2, r3 + ldr r8, [r2] + b 1b +2: + add r0, r12, lr + pop {r4-r9, pc} +endfunc + +function ff_pix_abs16_x2_armv6, export=1 + ldr r12, [sp] + push {r4-r11, lr} + mov r0, #0 + mov lr, #1 + orr lr, lr, lr, lsl #8 + orr lr, lr, lr, lsl #16 +1: + ldr r8, [r2] + ldr r9, [r2, #4] + lsr r10, r8, #8 + ldr r4, [r1] + lsr r6, r9, #8 + orr r10, r10, r9, lsl #24 + ldr r5, [r2, #8] + eor r11, r8, r10 + uhadd8 r7, r8, r10 + orr r6, r6, r5, lsl #24 + and r11, r11, lr + uadd8 r7, r7, r11 + ldr r8, [r1, #4] + usada8 r0, r4, r7, r0 + eor r7, r9, r6 + lsr r10, r5, #8 + and r7, r7, lr + uhadd8 r4, r9, r6 + ldr r6, [r2, #12] + uadd8 r4, r4, r7 + pld [r1, r3] + orr r10, r10, r6, lsl #24 + usada8 r0, r8, r4, r0 + ldr r4, [r1, #8] + eor r11, r5, r10 + ldrb r7, [r2, #16] + and r11, r11, lr + uhadd8 r8, r5, r10 + ldr r5, [r1, #12] + uadd8 r8, r8, r11 + pld [r2, r3] + lsr r10, r6, #8 + usada8 r0, r4, r8, r0 + orr r10, r10, r7, lsl #24 + subs r12, r12, #1 + eor r11, r6, r10 + add r1, r1, r3 + uhadd8 r9, r6, r10 + and r11, r11, lr + uadd8 r9, r9, r11 + add r2, r2, r3 + usada8 r0, r5, r9, r0 + bgt 1b + + pop {r4-r11, pc} +endfunc + +.macro usad_y2 p0, p1, p2, p3, n0, n1, n2, n3 + ldr \n0, [r2] + eor \n1, \p0, \n0 + uhadd8 \p0, \p0, \n0 + and \n1, \n1, lr + ldr \n2, [r1] + uadd8 \p0, \p0, \n1 + ldr 
\n1, [r2, #4] + usada8 r0, \p0, \n2, r0 + pld [r1, r3] + eor \n3, \p1, \n1 + uhadd8 \p1, \p1, \n1 + and \n3, \n3, lr + ldr \p0, [r1, #4] + uadd8 \p1, \p1, \n3 + ldr \n2, [r2, #8] + usada8 r0, \p1, \p0, r0 + pld [r2, r3] + eor \p0, \p2, \n2 + uhadd8 \p2, \p2, \n2 + and \p0, \p0, lr + ldr \p1, [r1, #8] + uadd8 \p2, \p2, \p0 + ldr \n3, [r2, #12] + usada8 r0, \p2, \p1, r0 + eor \p1, \p3, \n3 + uhadd8 \p3, \p3, \n3 + and \p1, \p1, lr + ldr \p0, [r1, #12] + uadd8 \p3, \p3, \p1 + add r1, r1, r3 + usada8 r0, \p3, \p0, r0 + add r2, r2, r3 +.endm + +function ff_pix_abs16_y2_armv6, export=1 + pld [r1] + pld [r2] + ldr r12, [sp] + push {r4-r11, lr} + mov r0, #0 + mov lr, #1 + orr lr, lr, lr, lsl #8 + orr lr, lr, lr, lsl #16 + ldr r4, [r2] + ldr r5, [r2, #4] + ldr r6, [r2, #8] + ldr r7, [r2, #12] + add r2, r2, r3 +1: + usad_y2 r4, r5, r6, r7, r8, r9, r10, r11 + subs r12, r12, #2 + usad_y2 r8, r9, r10, r11, r4, r5, r6, r7 + bgt 1b + + pop {r4-r11, pc} +endfunc + +function ff_pix_abs8_armv6, export=1 + pld [r2, r3] + ldr r12, [sp] + push {r4-r9, lr} + mov r0, #0 + mov lr, #0 + ldrd_post r4, r5, r1, r3 +1: + subs r12, r12, #2 + ldr r7, [r2, #4] + ldr_post r6, r2, r3 + ldrd_post r8, r9, r1, r3 + usada8 r0, r4, r6, r0 + pld [r2, r3] + usada8 lr, r5, r7, lr + ldr r7, [r2, #4] + ldr_post r6, r2, r3 + beq 2f + ldrd_post r4, r5, r1, r3 + usada8 r0, r8, r6, r0 + pld [r2, r3] + usada8 lr, r9, r7, lr + b 1b +2: + usada8 r0, r8, r6, r0 + usada8 lr, r9, r7, lr + add r0, r0, lr + pop {r4-r9, pc} +endfunc + +function ff_sse16_armv6, export=1 + ldr r12, [sp] + push {r4-r9, lr} + mov r0, #0 +1: + ldrd r4, r5, [r1] + ldr r8, [r2] + uxtb16 lr, r4 + uxtb16 r4, r4, ror #8 + uxtb16 r9, r8 + uxtb16 r8, r8, ror #8 + ldr r7, [r2, #4] + usub16 lr, lr, r9 + usub16 r4, r4, r8 + smlad r0, lr, lr, r0 + uxtb16 r6, r5 + uxtb16 lr, r5, ror #8 + uxtb16 r8, r7 + uxtb16 r9, r7, ror #8 + smlad r0, r4, r4, r0 + ldrd r4, r5, [r1, #8] + usub16 r6, r6, r8 + usub16 r8, lr, r9 + ldr r7, [r2, #8] + smlad r0, r6, r6, r0 + uxtb16 lr, r4 + uxtb16 r4, r4, ror #8 + uxtb16 r9, r7 + uxtb16 r7, r7, ror #8 + smlad r0, r8, r8, r0 + ldr r8, [r2, #12] + usub16 lr, lr, r9 + usub16 r4, r4, r7 + smlad r0, lr, lr, r0 + uxtb16 r6, r5 + uxtb16 r5, r5, ror #8 + uxtb16 r9, r8 + uxtb16 r8, r8, ror #8 + smlad r0, r4, r4, r0 + usub16 r6, r6, r9 + usub16 r5, r5, r8 + smlad r0, r6, r6, r0 + add r1, r1, r3 + add r2, r2, r3 + subs r12, r12, #1 + smlad r0, r5, r5, r0 + bgt 1b + + pop {r4-r9, pc} +endfunc + +function ff_pix_norm1_armv6, export=1 + push {r4-r6, lr} + mov r12, #16 + mov lr, #0 +1: + ldm r0, {r2-r5} + uxtb16 r6, r2 + uxtb16 r2, r2, ror #8 + smlad lr, r6, r6, lr + uxtb16 r6, r3 + smlad lr, r2, r2, lr + uxtb16 r3, r3, ror #8 + smlad lr, r6, r6, lr + uxtb16 r6, r4 + smlad lr, r3, r3, lr + uxtb16 r4, r4, ror #8 + smlad lr, r6, r6, lr + uxtb16 r6, r5 + smlad lr, r4, r4, lr + uxtb16 r5, r5, ror #8 + smlad lr, r6, r6, lr + subs r12, r12, #1 + add r0, r0, r1 + smlad lr, r5, r5, lr + bgt 1b + + mov r0, lr + pop {r4-r6, pc} +endfunc + +function ff_pix_sum_armv6, export=1 + push {r4-r7, lr} + mov r12, #16 + mov r2, #0 + mov r3, #0 + mov lr, #0 + ldr r4, [r0] +1: + subs r12, r12, #1 + ldr r5, [r0, #4] + usada8 r2, r4, lr, r2 + ldr r6, [r0, #8] + usada8 r3, r5, lr, r3 + ldr r7, [r0, #12] + usada8 r2, r6, lr, r2 + beq 2f + ldr_pre r4, r0, r1 + usada8 r3, r7, lr, r3 + bgt 1b +2: + usada8 r3, r7, lr, r3 + add r0, r2, r3 + pop {r4-r7, pc} +endfunc diff --git a/ffmpeg/libavcodec/arm/dsputil_init_arm.c b/ffmpeg/libavcodec/arm/dsputil_init_arm.c new file mode 100644 index 
0000000..68991fa --- /dev/null +++ b/ffmpeg/libavcodec/arm/dsputil_init_arm.c @@ -0,0 +1,86 @@ +/* + * ARM optimized DSP utils + * Copyright (c) 2001 Lionel Ulmer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/arm/cpu.h" +#include "dsputil_arm.h" + +void ff_j_rev_dct_arm(int16_t *data); +void ff_simple_idct_arm(int16_t *data); + +/* XXX: local hack */ +static void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size); +static void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size); + +void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest, + int line_size); + +/* XXX: those functions should be suppressed ASAP when all IDCTs are + converted */ +static void j_rev_dct_arm_put(uint8_t *dest, int line_size, int16_t *block) +{ + ff_j_rev_dct_arm (block); + ff_put_pixels_clamped(block, dest, line_size); +} +static void j_rev_dct_arm_add(uint8_t *dest, int line_size, int16_t *block) +{ + ff_j_rev_dct_arm (block); + ff_add_pixels_clamped(block, dest, line_size); +} +static void simple_idct_arm_put(uint8_t *dest, int line_size, int16_t *block) +{ + ff_simple_idct_arm (block); + ff_put_pixels_clamped(block, dest, line_size); +} +static void simple_idct_arm_add(uint8_t *dest, int line_size, int16_t *block) +{ + ff_simple_idct_arm (block); + ff_add_pixels_clamped(block, dest, line_size); +} + +av_cold void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx) +{ + int cpu_flags = av_get_cpu_flags(); + + ff_put_pixels_clamped = c->put_pixels_clamped; + ff_add_pixels_clamped = c->add_pixels_clamped; + + if (!avctx->lowres && avctx->bits_per_raw_sample <= 8) { + if(avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_ARM){ + c->idct_put = j_rev_dct_arm_put; + c->idct_add = j_rev_dct_arm_add; + c->idct = ff_j_rev_dct_arm; + c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM; + } else if (avctx->idct_algo == FF_IDCT_SIMPLEARM){ + c->idct_put = simple_idct_arm_put; + c->idct_add = simple_idct_arm_add; + c->idct = ff_simple_idct_arm; + c->idct_permutation_type = FF_NO_IDCT_PERM; + } + } + + c->add_pixels_clamped = ff_add_pixels_clamped_arm; + + if (have_armv5te(cpu_flags)) ff_dsputil_init_armv5te(c, avctx); + if (have_armv6(cpu_flags)) ff_dsputil_init_armv6(c, avctx); + if (have_neon(cpu_flags)) ff_dsputil_init_neon(c, avctx); +} diff --git a/ffmpeg/libavcodec/arm/dsputil_init_armv5te.c b/ffmpeg/libavcodec/arm/dsputil_init_armv5te.c new file mode 100644 index 0000000..841fbfa --- /dev/null +++ b/ffmpeg/libavcodec/arm/dsputil_init_armv5te.c @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "dsputil_arm.h" + +void ff_simple_idct_armv5te(int16_t *data); +void ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, int16_t *data); +void ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, int16_t *data); + +av_cold void ff_dsputil_init_armv5te(DSPContext *c, AVCodecContext *avctx) +{ + if (!avctx->lowres && avctx->bits_per_raw_sample <= 8 && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) { + c->idct_put = ff_simple_idct_put_armv5te; + c->idct_add = ff_simple_idct_add_armv5te; + c->idct = ff_simple_idct_armv5te; + c->idct_permutation_type = FF_NO_IDCT_PERM; + } +} diff --git a/ffmpeg/libavcodec/arm/dsputil_init_armv6.c b/ffmpeg/libavcodec/arm/dsputil_init_armv6.c new file mode 100644 index 0000000..8f38302 --- /dev/null +++ b/ffmpeg/libavcodec/arm/dsputil_init_armv6.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "libavcodec/avcodec.h" +#include "dsputil_arm.h" + +void ff_simple_idct_armv6(int16_t *data); +void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data); +void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data); + +void ff_add_pixels_clamped_armv6(const int16_t *block, + uint8_t *restrict pixels, + int line_size); + +void ff_get_pixels_armv6(int16_t *block, const uint8_t *pixels, int stride); +void ff_diff_pixels_armv6(int16_t *block, const uint8_t *s1, + const uint8_t *s2, int stride); + +int ff_pix_abs16_armv6(void *s, uint8_t *blk1, uint8_t *blk2, + int line_size, int h); +int ff_pix_abs16_x2_armv6(void *s, uint8_t *blk1, uint8_t *blk2, + int line_size, int h); +int ff_pix_abs16_y2_armv6(void *s, uint8_t *blk1, uint8_t *blk2, + int line_size, int h); + +int ff_pix_abs8_armv6(void *s, uint8_t *blk1, uint8_t *blk2, + int line_size, int h); + +int ff_sse16_armv6(void *s, uint8_t *blk1, uint8_t *blk2, + int line_size, int h); + +int ff_pix_norm1_armv6(uint8_t *pix, int line_size); +int ff_pix_sum_armv6(uint8_t *pix, int line_size); + +av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx) +{ + const int high_bit_depth = avctx->bits_per_raw_sample > 8; + + if (!avctx->lowres && avctx->bits_per_raw_sample <= 8 && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEARMV6)) { + c->idct_put = ff_simple_idct_put_armv6; + c->idct_add = ff_simple_idct_add_armv6; + c->idct = ff_simple_idct_armv6; + c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM; + } + + if (!high_bit_depth) + c->get_pixels = ff_get_pixels_armv6; + c->add_pixels_clamped = ff_add_pixels_clamped_armv6; + c->diff_pixels = ff_diff_pixels_armv6; + + c->pix_abs[0][0] = ff_pix_abs16_armv6; + c->pix_abs[0][1] = ff_pix_abs16_x2_armv6; + c->pix_abs[0][2] = ff_pix_abs16_y2_armv6; + + c->pix_abs[1][0] = ff_pix_abs8_armv6; + + c->sad[0] = ff_pix_abs16_armv6; + c->sad[1] = ff_pix_abs8_armv6; + + c->sse[0] = ff_sse16_armv6; + + c->pix_norm1 = ff_pix_norm1_armv6; + c->pix_sum = ff_pix_sum_armv6; +} diff --git a/ffmpeg/libavcodec/arm/dsputil_init_neon.c b/ffmpeg/libavcodec/arm/dsputil_init_neon.c new file mode 100644 index 0000000..6d19af7 --- /dev/null +++ b/ffmpeg/libavcodec/arm/dsputil_init_neon.c @@ -0,0 +1,81 @@ +/* + * ARM NEON optimised DSP functions + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
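As a point of reference for the ARMv6 motion-estimation kernels above (ff_pix_abs16_armv6 and friends, which fold four byte differences into each usada8 issue and keep two running accumulators), here is a plain scalar sketch of the 16-wide sum of absolute differences they compute. The name pix_abs16_sketch is illustrative only and not part of the patch.

#include <stdint.h>
#include <stdlib.h>

/* Scalar 16xh sum of absolute differences (SAD); the ARMv6 version
 * above handles four byte differences per usada8 instruction and keeps
 * two partial sums (r12/lr) that are added at the end. */
static int pix_abs16_sketch(const uint8_t *blk1, const uint8_t *blk2,
                            int line_size, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++)
            sum += abs(blk1[x] - blk2[x]);
        blk1 += line_size;
        blk2 += line_size;
    }
    return sum;
}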
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "libavutil/attributes.h" +#include "libavcodec/avcodec.h" +#include "dsputil_arm.h" + +void ff_simple_idct_neon(int16_t *data); +void ff_simple_idct_put_neon(uint8_t *dest, int line_size, int16_t *data); +void ff_simple_idct_add_neon(uint8_t *dest, int line_size, int16_t *data); + +void ff_clear_block_neon(int16_t *block); +void ff_clear_blocks_neon(int16_t *blocks); + +void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int); +void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int); +void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int); + +void ff_vector_clipf_neon(float *dst, const float *src, float min, float max, + int len); +void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min, + int32_t max, unsigned int len); + +int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len); +int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2, + const int16_t *v3, int len, int mul); + +void ff_apply_window_int16_neon(int16_t *dst, const int16_t *src, + const int16_t *window, unsigned n); + +av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) +{ + const int high_bit_depth = avctx->bits_per_raw_sample > 8; + + if (!avctx->lowres && avctx->bits_per_raw_sample <= 8) { + if (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLENEON) { + c->idct_put = ff_simple_idct_put_neon; + c->idct_add = ff_simple_idct_add_neon; + c->idct = ff_simple_idct_neon; + c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM; + } + } + + if (!high_bit_depth) { + c->clear_block = ff_clear_block_neon; + c->clear_blocks = ff_clear_blocks_neon; + } + + c->add_pixels_clamped = ff_add_pixels_clamped_neon; + c->put_pixels_clamped = ff_put_pixels_clamped_neon; + c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; + + c->vector_clipf = ff_vector_clipf_neon; + c->vector_clip_int32 = ff_vector_clip_int32_neon; + + c->scalarproduct_int16 = ff_scalarproduct_int16_neon; + c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon; + + c->apply_window_int16 = ff_apply_window_int16_neon; +} diff --git a/ffmpeg/libavcodec/arm/dsputil_neon.S b/ffmpeg/libavcodec/arm/dsputil_neon.S new file mode 100644 index 0000000..307e122 --- /dev/null +++ b/ffmpeg/libavcodec/arm/dsputil_neon.S @@ -0,0 +1,209 @@ +/* + * ARM NEON optimised DSP functions + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
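The clamped block-store hooks installed just above (put_pixels_clamped, add_pixels_clamped and the signed variant) take an 8x8 block of 16-bit IDCT output and write it to 8-bit pixels with saturation; the NEON bodies that follow do the same with vqmovun/vaddw. A scalar sketch of the assumed semantics, with illustrative helper names:

#include <stdint.h>

static inline uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* Store an 8x8 block of IDCT coefficients as saturated 8-bit pixels. */
static void put_pixels_clamped_sketch(const int16_t *block, uint8_t *pixels,
                                      int line_size)
{
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++)
            pixels[x] = clip_uint8(block[x]);
        block  += 8;
        pixels += line_size;
    }
}

/* Add the block onto the existing pixels, again with saturation. */
static void add_pixels_clamped_sketch(const int16_t *block, uint8_t *pixels,
                                      int line_size)
{
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++)
            pixels[x] = clip_uint8(pixels[x] + block[x]);
        block  += 8;
        pixels += line_size;
    }
}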
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_clear_block_neon, export=1 + vmov.i16 q0, #0 + .rept 8 + vst1.16 {q0}, [r0,:128]! + .endr + bx lr +endfunc + +function ff_clear_blocks_neon, export=1 + vmov.i16 q0, #0 + .rept 8*6 + vst1.16 {q0}, [r0,:128]! + .endr + bx lr +endfunc + +function ff_put_pixels_clamped_neon, export=1 + vld1.16 {d16-d19}, [r0,:128]! + vqmovun.s16 d0, q8 + vld1.16 {d20-d23}, [r0,:128]! + vqmovun.s16 d1, q9 + vld1.16 {d24-d27}, [r0,:128]! + vqmovun.s16 d2, q10 + vld1.16 {d28-d31}, [r0,:128]! + vqmovun.s16 d3, q11 + vst1.8 {d0}, [r1,:64], r2 + vqmovun.s16 d4, q12 + vst1.8 {d1}, [r1,:64], r2 + vqmovun.s16 d5, q13 + vst1.8 {d2}, [r1,:64], r2 + vqmovun.s16 d6, q14 + vst1.8 {d3}, [r1,:64], r2 + vqmovun.s16 d7, q15 + vst1.8 {d4}, [r1,:64], r2 + vst1.8 {d5}, [r1,:64], r2 + vst1.8 {d6}, [r1,:64], r2 + vst1.8 {d7}, [r1,:64], r2 + bx lr +endfunc + +function ff_put_signed_pixels_clamped_neon, export=1 + vmov.u8 d31, #128 + vld1.16 {d16-d17}, [r0,:128]! + vqmovn.s16 d0, q8 + vld1.16 {d18-d19}, [r0,:128]! + vqmovn.s16 d1, q9 + vld1.16 {d16-d17}, [r0,:128]! + vqmovn.s16 d2, q8 + vld1.16 {d18-d19}, [r0,:128]! + vadd.u8 d0, d0, d31 + vld1.16 {d20-d21}, [r0,:128]! + vadd.u8 d1, d1, d31 + vld1.16 {d22-d23}, [r0,:128]! + vadd.u8 d2, d2, d31 + vst1.8 {d0}, [r1,:64], r2 + vqmovn.s16 d3, q9 + vst1.8 {d1}, [r1,:64], r2 + vqmovn.s16 d4, q10 + vst1.8 {d2}, [r1,:64], r2 + vqmovn.s16 d5, q11 + vld1.16 {d24-d25}, [r0,:128]! + vadd.u8 d3, d3, d31 + vld1.16 {d26-d27}, [r0,:128]! + vadd.u8 d4, d4, d31 + vadd.u8 d5, d5, d31 + vst1.8 {d3}, [r1,:64], r2 + vqmovn.s16 d6, q12 + vst1.8 {d4}, [r1,:64], r2 + vqmovn.s16 d7, q13 + vst1.8 {d5}, [r1,:64], r2 + vadd.u8 d6, d6, d31 + vadd.u8 d7, d7, d31 + vst1.8 {d6}, [r1,:64], r2 + vst1.8 {d7}, [r1,:64], r2 + bx lr +endfunc + +function ff_add_pixels_clamped_neon, export=1 + mov r3, r1 + vld1.8 {d16}, [r1,:64], r2 + vld1.16 {d0-d1}, [r0,:128]! + vaddw.u8 q0, q0, d16 + vld1.8 {d17}, [r1,:64], r2 + vld1.16 {d2-d3}, [r0,:128]! + vqmovun.s16 d0, q0 + vld1.8 {d18}, [r1,:64], r2 + vaddw.u8 q1, q1, d17 + vld1.16 {d4-d5}, [r0,:128]! + vaddw.u8 q2, q2, d18 + vst1.8 {d0}, [r3,:64], r2 + vqmovun.s16 d2, q1 + vld1.8 {d19}, [r1,:64], r2 + vld1.16 {d6-d7}, [r0,:128]! + vaddw.u8 q3, q3, d19 + vqmovun.s16 d4, q2 + vst1.8 {d2}, [r3,:64], r2 + vld1.8 {d16}, [r1,:64], r2 + vqmovun.s16 d6, q3 + vld1.16 {d0-d1}, [r0,:128]! + vaddw.u8 q0, q0, d16 + vst1.8 {d4}, [r3,:64], r2 + vld1.8 {d17}, [r1,:64], r2 + vld1.16 {d2-d3}, [r0,:128]! + vaddw.u8 q1, q1, d17 + vst1.8 {d6}, [r3,:64], r2 + vqmovun.s16 d0, q0 + vld1.8 {d18}, [r1,:64], r2 + vld1.16 {d4-d5}, [r0,:128]! + vaddw.u8 q2, q2, d18 + vst1.8 {d0}, [r3,:64], r2 + vqmovun.s16 d2, q1 + vld1.8 {d19}, [r1,:64], r2 + vqmovun.s16 d4, q2 + vld1.16 {d6-d7}, [r0,:128]! + vaddw.u8 q3, q3, d19 + vst1.8 {d2}, [r3,:64], r2 + vqmovun.s16 d6, q3 + vst1.8 {d4}, [r3,:64], r2 + vst1.8 {d6}, [r3,:64], r2 + bx lr +endfunc + +function ff_vector_clipf_neon, export=1 +VFP vdup.32 q1, d0[1] +VFP vdup.32 q0, d0[0] +NOVFP vdup.32 q0, r2 +NOVFP vdup.32 q1, r3 +NOVFP ldr r2, [sp] + vld1.f32 {q2},[r1,:128]! + vmin.f32 q10, q2, q1 + vld1.f32 {q3},[r1,:128]! + vmin.f32 q11, q3, q1 +1: vmax.f32 q8, q10, q0 + vmax.f32 q9, q11, q0 + subs r2, r2, #8 + beq 2f + vld1.f32 {q2},[r1,:128]! + vmin.f32 q10, q2, q1 + vld1.f32 {q3},[r1,:128]! 
+ vmin.f32 q11, q3, q1 + vst1.f32 {q8},[r0,:128]! + vst1.f32 {q9},[r0,:128]! + b 1b +2: vst1.f32 {q8},[r0,:128]! + vst1.f32 {q9},[r0,:128]! + bx lr +endfunc + +function ff_apply_window_int16_neon, export=1 + push {r4,lr} + add r4, r1, r3, lsl #1 + add lr, r0, r3, lsl #1 + sub r4, r4, #16 + sub lr, lr, #16 + mov r12, #-16 +1: + vld1.16 {q0}, [r1,:128]! + vld1.16 {q2}, [r2,:128]! + vld1.16 {q1}, [r4,:128], r12 + vrev64.16 q3, q2 + vqrdmulh.s16 q0, q0, q2 + vqrdmulh.s16 d2, d2, d7 + vqrdmulh.s16 d3, d3, d6 + vst1.16 {q0}, [r0,:128]! + vst1.16 {q1}, [lr,:128], r12 + subs r3, r3, #16 + bgt 1b + + pop {r4,pc} +endfunc + +function ff_vector_clip_int32_neon, export=1 + vdup.32 q0, r2 + vdup.32 q1, r3 + ldr r2, [sp] +1: + vld1.32 {q2-q3}, [r1,:128]! + vmin.s32 q2, q2, q1 + vmin.s32 q3, q3, q1 + vmax.s32 q2, q2, q0 + vmax.s32 q3, q3, q0 + vst1.32 {q2-q3}, [r0,:128]! + subs r2, r2, #8 + bgt 1b + bx lr +endfunc diff --git a/ffmpeg/libavcodec/arm/fft_fixed_init_arm.c b/ffmpeg/libavcodec/arm/fft_fixed_init_arm.c new file mode 100644 index 0000000..ef098f4 --- /dev/null +++ b/ffmpeg/libavcodec/arm/fft_fixed_init_arm.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/cpu.h" + +#define CONFIG_FFT_FLOAT 0 +#include "libavcodec/fft.h" + +void ff_fft_fixed_calc_neon(FFTContext *s, FFTComplex *z); +void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *o, const FFTSample *i); +void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *o, const FFTSample *i); + +av_cold void ff_fft_fixed_init_arm(FFTContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + s->fft_permutation = FF_FFT_PERM_SWAP_LSBS; +#if CONFIG_FFT + s->fft_calc = ff_fft_fixed_calc_neon; +#endif + +#if CONFIG_MDCT + if (!s->inverse && s->nbits >= 3) { + s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; + s->mdct_calc = ff_mdct_fixed_calc_neon; + s->mdct_calcw = ff_mdct_fixed_calcw_neon; + } +#endif + } +} diff --git a/ffmpeg/libavcodec/arm/fft_fixed_neon.S b/ffmpeg/libavcodec/arm/fft_fixed_neon.S new file mode 100644 index 0000000..fa33eac --- /dev/null +++ b/ffmpeg/libavcodec/arm/fft_fixed_neon.S @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2011 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
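The ff_apply_window_int16_neon loop above walks the signal from both ends at once and multiplies by the window with vqrdmulh.s16, a rounded Q15 multiply with saturation. A scalar sketch of that operation, assuming (as the symmetric two-pointer walk suggests) that window[] holds the first len/2 coefficients of a symmetric window; names are illustrative:

#include <stdint.h>

/* Rounded Q15 multiply, roughly what vqrdmulh.s16 computes; the NEON
 * instruction additionally saturates, which this sketch omits. */
static inline int16_t mul_q15(int16_t a, int16_t b)
{
    return (int16_t)((a * b + (1 << 14)) >> 15);
}

/* Apply a symmetric Q15 window to a block of int16 samples. */
static void apply_window_int16_sketch(int16_t *dst, const int16_t *src,
                                      const int16_t *window, unsigned len)
{
    for (unsigned i = 0, j = len - 1; i < len / 2; i++, j--) {
        int16_t w = window[i];
        dst[i] = mul_q15(src[i], w);
        dst[j] = mul_q15(src[j], w);
    }
}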
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +.macro bflies d0, d1, r0, r1 + vrev64.32 \r0, \d1 @ t5, t6, t1, t2 + vhsub.s16 \r1, \d1, \r0 @ t1-t5, t2-t6, t5-t1, t6-t2 + vhadd.s16 \r0, \d1, \r0 @ t1+t5, t2+t6, t5+t1, t6+t2 + vext.16 \r1, \r1, \r1, #1 @ t2-t6, t5-t1, t6-t2, t1-t5 + vtrn.32 \r0, \r1 @ t1+t5, t2+t6, t2-t6, t5-t1 + @ t5, t6, t4, t3 + vhsub.s16 \d1, \d0, \r0 + vhadd.s16 \d0, \d0, \r0 +.endm + +.macro transform01 q0, q1, d3, c0, c1, r0, w0, w1 + vrev32.16 \r0, \d3 + vmull.s16 \w0, \d3, \c0 + vmlal.s16 \w0, \r0, \c1 + vshrn.s32 \d3, \w0, #15 + bflies \q0, \q1, \w0, \w1 +.endm + +.macro transform2 d0, d1, d2, d3, q0, q1, c0, c1, c2, c3, \ + r0, r1, w0, w1 + vrev32.16 \r0, \d1 + vrev32.16 \r1, \d3 + vmull.s16 \w0, \d1, \c0 + vmlal.s16 \w0, \r0, \c1 + vmull.s16 \w1, \d3, \c2 + vmlal.s16 \w1, \r1, \c3 + vshrn.s32 \d1, \w0, #15 + vshrn.s32 \d3, \w1, #15 + bflies \q0, \q1, \w0, \w1 +.endm + +.macro fft4 d0, d1, r0, r1 + vhsub.s16 \r0, \d0, \d1 @ t3, t4, t8, t7 + vhsub.s16 \r1, \d1, \d0 + vhadd.s16 \d0, \d0, \d1 @ t1, t2, t6, t5 + vmov.i64 \d1, #0xffff00000000 + vbit \r0, \r1, \d1 + vrev64.16 \r1, \r0 @ t7, t8, t4, t3 + vtrn.32 \r0, \r1 @ t3, t4, t7, t8 + vtrn.32 \d0, \r0 @ t1, t2, t3, t4, t6, t5, t8, t7 + vhsub.s16 \d1, \d0, \r0 @ r2, i2, r3, i1 + vhadd.s16 \d0, \d0, \r0 @ r0, i0, r1, i3 +.endm + +.macro fft8 d0, d1, d2, d3, q0, q1, c0, c1, r0, r1, w0, w1 + fft4 \d0, \d1, \r0, \r1 + vtrn.32 \d0, \d1 @ z0, z2, z1, z3 + vhadd.s16 \r0, \d2, \d3 @ t1, t2, t3, t4 + vhsub.s16 \d3, \d2, \d3 @ z5, z7 + vmov \d2, \r0 + transform01 \q0, \q1, \d3, \c0, \c1, \r0, \w0, \w1 +.endm + +function fft4_neon + vld1.16 {d0-d1}, [r0] + fft4 d0, d1, d2, d3 + vst1.16 {d0-d1}, [r0] + bx lr +endfunc + +function fft8_neon + vld1.16 {d0-d3}, [r0,:128] + movrel r1, coefs + vld1.16 {d30}, [r1,:64] + vdup.16 d31, d30[0] + fft8 d0, d1, d2, d3, q0, q1, d31, d30, d20, d21, q8, q9 + vtrn.32 d0, d1 + vtrn.32 d2, d3 + vst1.16 {d0-d3}, [r0,:128] + bx lr +endfunc + +function fft16_neon + vld1.16 {d0-d3}, [r0,:128]! + vld1.16 {d4-d7}, [r0,:128] + movrel r1, coefs + sub r0, r0, #32 + vld1.16 {d28-d31},[r1,:128] + vdup.16 d31, d28[0] + fft8 d0, d1, d2, d3, q0, q1, d31, d28, d20, d21, q8, q9 + vswp d5, d6 + fft4 q2, q3, q8, q9 + vswp d5, d6 + vtrn.32 q0, q1 @ z0, z4, z2, z6, z1, z5, z3, z7 + vtrn.32 q2, q3 @ z8, z12,z10,z14,z9, z13,z11,z15 + vswp d1, d2 + vdup.16 d31, d28[0] + transform01 q0, q2, d5, d31, d28, d20, q8, q9 + vdup.16 d26, d29[0] + vdup.16 d27, d30[0] + transform2 d2, d6, d3, d7, q1, q3, d26, d30, d27, d29, \ + d20, d21, q8, q9 + vtrn.32 q0, q1 + vtrn.32 q2, q3 + vst1.16 {d0-d3}, [r0,:128]! + vst1.16 {d4-d7}, [r0,:128] + bx lr +endfunc + +function fft_pass_neon + push {r4,lr} + movrel lr, coefs+24 + vld1.16 {d30}, [lr,:64] + lsl r12, r2, #3 + vmov d31, d30 + add r3, r1, r2, lsl #2 + mov lr, #-8 + sub r3, r3, #2 + mov r4, r0 + vld1.16 {d27[]}, [r3,:16] + sub r3, r3, #6 + vld1.16 {q0}, [r4,:128], r12 + vld1.16 {q1}, [r4,:128], r12 + vld1.16 {q2}, [r4,:128], r12 + vld1.16 {q3}, [r4,:128], r12 + vld1.16 {d28}, [r1,:64]! 
+ vld1.16 {d29}, [r3,:64], lr + vswp d1, d2 + vswp d5, d6 + vtrn.32 d0, d1 + vtrn.32 d4, d5 + vdup.16 d25, d28[1] + vmul.s16 d27, d27, d31 + transform01 q0, q2, d5, d25, d27, d20, q8, q9 + b 2f +1: + mov r4, r0 + vdup.16 d26, d29[0] + vld1.16 {q0}, [r4,:128], r12 + vld1.16 {q1}, [r4,:128], r12 + vld1.16 {q2}, [r4,:128], r12 + vld1.16 {q3}, [r4,:128], r12 + vld1.16 {d28}, [r1,:64]! + vld1.16 {d29}, [r3,:64], lr + vswp d1, d2 + vswp d5, d6 + vtrn.32 d0, d1 + vtrn.32 d4, d5 + vdup.16 d24, d28[0] + vdup.16 d25, d28[1] + vdup.16 d27, d29[3] + vmul.s16 q13, q13, q15 + transform2 d0, d4, d1, d5, q0, q2, d24, d26, d25, d27, \ + d16, d17, q9, q10 +2: + vtrn.32 d2, d3 + vtrn.32 d6, d7 + vdup.16 d24, d28[2] + vdup.16 d26, d29[2] + vdup.16 d25, d28[3] + vdup.16 d27, d29[1] + vmul.s16 q13, q13, q15 + transform2 d2, d6, d3, d7, q1, q3, d24, d26, d25, d27, \ + d16, d17, q9, q10 + vtrn.32 d0, d1 + vtrn.32 d2, d3 + vtrn.32 d4, d5 + vtrn.32 d6, d7 + vswp d1, d2 + vswp d5, d6 + mov r4, r0 + vst1.16 {q0}, [r4,:128], r12 + vst1.16 {q1}, [r4,:128], r12 + vst1.16 {q2}, [r4,:128], r12 + vst1.16 {q3}, [r4,:128], r12 + add r0, r0, #16 + subs r2, r2, #2 + bgt 1b + pop {r4,pc} +endfunc + +#define F_SQRT1_2 23170 +#define F_COS_16_1 30274 +#define F_COS_16_3 12540 + +const coefs, align=4 + .short F_SQRT1_2, -F_SQRT1_2, -F_SQRT1_2, F_SQRT1_2 + .short F_COS_16_1,-F_COS_16_1,-F_COS_16_1, F_COS_16_1 + .short F_COS_16_3,-F_COS_16_3,-F_COS_16_3, F_COS_16_3 + .short 1, -1, -1, 1 +endconst + +.macro def_fft n, n2, n4 +function fft\n\()_neon + push {r4, lr} + mov r4, r0 + bl fft\n2\()_neon + add r0, r4, #\n4*2*4 + bl fft\n4\()_neon + add r0, r4, #\n4*3*4 + bl fft\n4\()_neon + mov r0, r4 + pop {r4, lr} + movrelx r1, X(ff_cos_\n\()_fixed) + mov r2, #\n4/2 + b fft_pass_neon +endfunc +.endm + + def_fft 32, 16, 8 + def_fft 64, 32, 16 + def_fft 128, 64, 32 + def_fft 256, 128, 64 + def_fft 512, 256, 128 + def_fft 1024, 512, 256 + def_fft 2048, 1024, 512 + def_fft 4096, 2048, 1024 + def_fft 8192, 4096, 2048 + def_fft 16384, 8192, 4096 + def_fft 32768, 16384, 8192 + def_fft 65536, 32768, 16384 + +function ff_fft_fixed_calc_neon, export=1 + ldr r2, [r0] + sub r2, r2, #2 + movrel r3, fft_fixed_tab_neon + ldr r3, [r3, r2, lsl #2] + mov r0, r1 + bx r3 +endfunc + +const fft_fixed_tab_neon + .word fft4_neon + .word fft8_neon + .word fft16_neon + .word fft32_neon + .word fft64_neon + .word fft128_neon + .word fft256_neon + .word fft512_neon + .word fft1024_neon + .word fft2048_neon + .word fft4096_neon + .word fft8192_neon + .word fft16384_neon + .word fft32768_neon + .word fft65536_neon +endconst diff --git a/ffmpeg/libavcodec/arm/fft_init_arm.c b/ffmpeg/libavcodec/arm/fft_init_arm.c new file mode 100644 index 0000000..8c98abc --- /dev/null +++ b/ffmpeg/libavcodec/arm/fft_init_arm.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
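The F_SQRT1_2, F_COS_16_1 and F_COS_16_3 constants defined for the fixed-point FFT above are Q15 values, i.e. the real coefficient scaled by 2^15; for example 23170 is approximately 0.70710678 * 32768. A small check program, not part of the patch, that reproduces them:

#include <math.h>
#include <stdio.h>

int main(void)
{
    const double pi = 3.14159265358979323846;
    /* Q15 encodings of sqrt(1/2), cos(pi/8) and cos(3*pi/8) */
    printf("%ld %ld %ld\n",
           lround(sqrt(0.5)       * 32768),   /* 23170 = F_SQRT1_2  */
           lround(cos(pi / 8)     * 32768),   /* 30274 = F_COS_16_1 */
           lround(cos(3 * pi / 8) * 32768));  /* 12540 = F_COS_16_3 */
    return 0;
}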
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/cpu.h" +#include "libavcodec/fft.h" +#include "libavcodec/rdft.h" +#include "libavcodec/synth_filter.h" + +void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); +void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); + +void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); + +void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z); + +void ff_synth_filter_float_neon(FFTContext *imdct, + float *synth_buf_ptr, int *synth_buf_offset, + float synth_buf2[32], const float window[512], + float out[32], const float in[32], + float scale); + +av_cold void ff_fft_init_arm(FFTContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { +#if CONFIG_FFT + s->fft_permute = ff_fft_permute_neon; + s->fft_calc = ff_fft_calc_neon; +#endif +#if CONFIG_MDCT + s->imdct_calc = ff_imdct_calc_neon; + s->imdct_half = ff_imdct_half_neon; + s->mdct_calc = ff_mdct_calc_neon; + s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; +#endif + } +} + +#if CONFIG_RDFT +av_cold void ff_rdft_init_arm(RDFTContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) + s->rdft_calc = ff_rdft_calc_neon; +} +#endif + +#if CONFIG_DCA_DECODER +av_cold void ff_synth_filter_init_arm(SynthFilterContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) + s->synth_filter_float = ff_synth_filter_float_neon; +} +#endif diff --git a/ffmpeg/libavcodec/arm/fft_neon.S b/ffmpeg/libavcodec/arm/fft_neon.S new file mode 100644 index 0000000..8b9ae2a --- /dev/null +++ b/ffmpeg/libavcodec/arm/fft_neon.S @@ -0,0 +1,375 @@ +/* + * ARM NEON optimised FFT + * + * Copyright (c) 2009 Mans Rullgard + * Copyright (c) 2009 Naotoshi Nojiri + * + * This algorithm (though not any of the implementation details) is + * based on libdjbfft by D. J. Bernstein. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +#define M_SQRT1_2 0.70710678118654752440 + + +function fft4_neon + vld1.32 {d0-d3}, [r0,:128] + + vext.32 q8, q1, q1, #1 @ i2,r3 d3=i3,r2 + vsub.f32 d6, d0, d1 @ r0-r1,i0-i1 + vsub.f32 d7, d16, d17 @ r3-r2,i2-i3 + vadd.f32 d4, d0, d1 @ r0+r1,i0+i1 + vadd.f32 d5, d2, d3 @ i2+i3,r2+r3 + vadd.f32 d1, d6, d7 + vsub.f32 d3, d6, d7 + vadd.f32 d0, d4, d5 + vsub.f32 d2, d4, d5 + + vst1.32 {d0-d3}, [r0,:128] + + bx lr +endfunc + +function fft8_neon + mov r1, r0 + vld1.32 {d0-d3}, [r1,:128]! 
+ vld1.32 {d16-d19}, [r1,:128] + + movw r2, #0x04f3 @ sqrt(1/2) + movt r2, #0x3f35 + eor r3, r2, #1<<31 + vdup.32 d31, r2 + + vext.32 q11, q1, q1, #1 @ i2,r3,i3,r2 + vadd.f32 d4, d16, d17 @ r4+r5,i4+i5 + vmov d28, r3, r2 + vadd.f32 d5, d18, d19 @ r6+r7,i6+i7 + vsub.f32 d17, d16, d17 @ r4-r5,i4-i5 + vsub.f32 d19, d18, d19 @ r6-r7,i6-i7 + vrev64.32 d29, d28 + vadd.f32 d20, d0, d1 @ r0+r1,i0+i1 + vadd.f32 d21, d2, d3 @ r2+r3,i2+i3 + vmul.f32 d26, d17, d28 @ -a2r*w,a2i*w + vext.32 q3, q2, q2, #1 + vmul.f32 d27, d19, d29 @ a3r*w,-a3i*w + vsub.f32 d23, d22, d23 @ i2-i3,r3-r2 + vsub.f32 d22, d0, d1 @ r0-r1,i0-i1 + vmul.f32 d24, d17, d31 @ a2r*w,a2i*w + vmul.f32 d25, d19, d31 @ a3r*w,a3i*w + vadd.f32 d0, d20, d21 + vsub.f32 d2, d20, d21 + vadd.f32 d1, d22, d23 + vrev64.32 q13, q13 + vsub.f32 d3, d22, d23 + vsub.f32 d6, d6, d7 + vadd.f32 d24, d24, d26 @ a2r+a2i,a2i-a2r t1,t2 + vadd.f32 d25, d25, d27 @ a3r-a3i,a3i+a3r t5,t6 + vadd.f32 d7, d4, d5 + vsub.f32 d18, d2, d6 + vext.32 q13, q12, q12, #1 + vadd.f32 d2, d2, d6 + vsub.f32 d16, d0, d7 + vadd.f32 d5, d25, d24 + vsub.f32 d4, d26, d27 + vadd.f32 d0, d0, d7 + vsub.f32 d17, d1, d5 + vsub.f32 d19, d3, d4 + vadd.f32 d3, d3, d4 + vadd.f32 d1, d1, d5 + + vst1.32 {d16-d19}, [r1,:128] + vst1.32 {d0-d3}, [r0,:128] + + bx lr +endfunc + +function fft16_neon + movrel r1, mppm + vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3} + pld [r0, #32] + vld1.32 {d2-d3}, [r1,:128] + vext.32 q13, q9, q9, #1 + vld1.32 {d22-d25}, [r0,:128]! @ q11{r4,i4,r5,i5} q12{r6,i5,r7,i7} + vadd.f32 d4, d16, d17 + vsub.f32 d5, d16, d17 + vadd.f32 d18, d18, d19 + vsub.f32 d19, d26, d27 + + vadd.f32 d20, d22, d23 + vsub.f32 d22, d22, d23 + vsub.f32 d23, d24, d25 + vadd.f32 q8, q2, q9 @ {r0,i0,r1,i1} + vadd.f32 d21, d24, d25 + vmul.f32 d24, d22, d2 + vsub.f32 q9, q2, q9 @ {r2,i2,r3,i3} + vmul.f32 d25, d23, d3 + vuzp.32 d16, d17 @ {r0,r1,i0,i1} + vmul.f32 q1, q11, d2[1] + vuzp.32 d18, d19 @ {r2,r3,i2,i3} + vrev64.32 q12, q12 + vadd.f32 q11, q12, q1 @ {t1a,t2a,t5,t6} + vld1.32 {d24-d27}, [r0,:128]! 
@ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11} + vzip.32 q10, q11 + vld1.32 {d28-d31}, [r0,:128] @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15} + vadd.f32 d0, d22, d20 + vadd.f32 d1, d21, d23 + vsub.f32 d2, d21, d23 + vsub.f32 d3, d22, d20 + sub r0, r0, #96 + vext.32 q13, q13, q13, #1 + vsub.f32 q10, q8, q0 @ {r4,r5,i4,i5} + vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1} + vext.32 q15, q15, q15, #1 + vsub.f32 q11, q9, q1 @ {r6,r7,i6,i7} + vswp d25, d26 @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10} + vadd.f32 q9, q9, q1 @ {r2,r3,i2,i3} + vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14} + vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6} + vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a} + movrelx r2, X(ff_cos_16) + vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8} + vrev64.32 d1, d1 + vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a} + vrev64.32 d3, d3 + movrel r3, pmmp + vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8} + vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a} + vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9} + vadd.f32 q14, q1, q15 @ {r12,i12,r13,i13} + vld1.32 {d4-d5}, [r2,:64] + vsub.f32 q13, q0, q13 @ {r10,i10,r11,i11} + vsub.f32 q15, q1, q15 @ {r14,i14,r15,i15} + vswp d25, d28 @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13} + vld1.32 {d6-d7}, [r3,:128] + vrev64.32 q1, q14 + vmul.f32 q14, q14, d4[1] + vmul.f32 q1, q1, q3 + vmla.f32 q14, q1, d5[1] @ {t1a,t2a,t5a,t6a} + vswp d27, d30 @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15} + vzip.32 q12, q14 + vadd.f32 d0, d28, d24 + vadd.f32 d1, d25, d29 + vsub.f32 d2, d25, d29 + vsub.f32 d3, d28, d24 + vsub.f32 q12, q8, q0 @ {r8,r9,i8,i9} + vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1} + vsub.f32 q14, q10, q1 @ {r12,r13,i12,i13} + mov r1, #32 + vadd.f32 q10, q10, q1 @ {r4,r5,i4,i5} + vrev64.32 q0, q13 + vmul.f32 q13, q13, d5[0] + vrev64.32 q1, q15 + vmul.f32 q15, q15, d5[1] + vst2.32 {d16-d17},[r0,:128], r1 + vmul.f32 q0, q0, q3 + vst2.32 {d20-d21},[r0,:128], r1 + vmul.f32 q1, q1, q3 + vmla.f32 q13, q0, d5[0] @ {t1,t2,t5,t6} + vmla.f32 q15, q1, d4[1] @ {t1a,t2a,t5a,t6a} + vst2.32 {d24-d25},[r0,:128], r1 + vst2.32 {d28-d29},[r0,:128] + vzip.32 q13, q15 + sub r0, r0, #80 + vadd.f32 d0, d30, d26 + vadd.f32 d1, d27, d31 + vsub.f32 d2, d27, d31 + vsub.f32 d3, d30, d26 + vsub.f32 q13, q9, q0 @ {r10,r11,i10,i11} + vadd.f32 q9, q9, q0 @ {r2,r3,i2,i3} + vsub.f32 q15, q11, q1 @ {r14,r15,i14,i15} + vadd.f32 q11, q11, q1 @ {r6,r7,i6,i7} + vst2.32 {d18-d19},[r0,:128], r1 + vst2.32 {d22-d23},[r0,:128], r1 + vst2.32 {d26-d27},[r0,:128], r1 + vst2.32 {d30-d31},[r0,:128] + bx lr +endfunc + +function fft_pass_neon + push {r4-r6,lr} + mov r6, r2 @ n + lsl r5, r2, #3 @ 2 * n * sizeof FFTSample + lsl r4, r2, #4 @ 2 * n * sizeof FFTComplex + lsl r2, r2, #5 @ 4 * n * sizeof FFTComplex + add r3, r2, r4 + add r4, r4, r0 @ &z[o1] + add r2, r2, r0 @ &z[o2] + add r3, r3, r0 @ &z[o3] + vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]} + movrel r12, pmmp + vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]} + add r5, r5, r1 @ wim + vld1.32 {d6-d7}, [r12,:128] @ pmmp + vswp d21, d22 + vld1.32 {d4}, [r1,:64]! @ {wre[0],wre[1]} + sub r5, r5, #4 @ wim-- + vrev64.32 q1, q11 + vmul.f32 q11, q11, d4[1] + vmul.f32 q1, q1, q3 + vld1.32 {d5[0]}, [r5,:32] @ d5[0] = wim[-1] + vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a} + vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]} + sub r6, r6, #1 @ n-- + vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]} + vzip.32 q10, q11 + vadd.f32 d0, d22, d20 + vadd.f32 d1, d21, d23 + vsub.f32 d2, d21, d23 + vsub.f32 d3, d22, d20 + vsub.f32 q10, q8, q0 + vadd.f32 q8, q8, q0 + vsub.f32 q11, q9, q1 + vadd.f32 q9, q9, q1 + vst2.32 {d20-d21},[r2,:128]! 
@ {z[o2],z[o2+1]} + vst2.32 {d16-d17},[r0,:128]! @ {z[0],z[1]} + vst2.32 {d22-d23},[r3,:128]! @ {z[o3],z[o3+1]} + vst2.32 {d18-d19},[r4,:128]! @ {z[o1],z[o1+1]} + sub r5, r5, #8 @ wim -= 2 +1: + vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]} + vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]} + vswp d21, d22 + vld1.32 {d4}, [r1]! @ {wre[0],wre[1]} + vrev64.32 q0, q10 + vmul.f32 q10, q10, d4[0] + vrev64.32 q1, q11 + vmul.f32 q11, q11, d4[1] + vld1.32 {d5}, [r5] @ {wim[-1],wim[0]} + vmul.f32 q0, q0, q3 + sub r5, r5, #8 @ wim -= 2 + vmul.f32 q1, q1, q3 + vmla.f32 q10, q0, d5[1] @ {t1,t2,t5,t6} + vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a} + vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]} + subs r6, r6, #1 @ n-- + vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]} + vzip.32 q10, q11 + vadd.f32 d0, d22, d20 + vadd.f32 d1, d21, d23 + vsub.f32 d2, d21, d23 + vsub.f32 d3, d22, d20 + vsub.f32 q10, q8, q0 + vadd.f32 q8, q8, q0 + vsub.f32 q11, q9, q1 + vadd.f32 q9, q9, q1 + vst2.32 {d20-d21}, [r2,:128]! @ {z[o2],z[o2+1]} + vst2.32 {d16-d17}, [r0,:128]! @ {z[0],z[1]} + vst2.32 {d22-d23}, [r3,:128]! @ {z[o3],z[o3+1]} + vst2.32 {d18-d19}, [r4,:128]! @ {z[o1],z[o1+1]} + bne 1b + + pop {r4-r6,pc} +endfunc + +.macro def_fft n, n2, n4 + .align 6 +function fft\n\()_neon + push {r4, lr} + mov r4, r0 + bl fft\n2\()_neon + add r0, r4, #\n4*2*8 + bl fft\n4\()_neon + add r0, r4, #\n4*3*8 + bl fft\n4\()_neon + mov r0, r4 + pop {r4, lr} + movrelx r1, X(ff_cos_\n) + mov r2, #\n4/2 + b fft_pass_neon +endfunc +.endm + + def_fft 32, 16, 8 + def_fft 64, 32, 16 + def_fft 128, 64, 32 + def_fft 256, 128, 64 + def_fft 512, 256, 128 + def_fft 1024, 512, 256 + def_fft 2048, 1024, 512 + def_fft 4096, 2048, 1024 + def_fft 8192, 4096, 2048 + def_fft 16384, 8192, 4096 + def_fft 32768, 16384, 8192 + def_fft 65536, 32768, 16384 + +function ff_fft_calc_neon, export=1 + ldr r2, [r0] + sub r2, r2, #2 + movrel r3, fft_tab_neon + ldr r3, [r3, r2, lsl #2] + mov r0, r1 + bx r3 +endfunc + +function ff_fft_permute_neon, export=1 + push {r4,lr} + mov r12, #1 + ldr r2, [r0] @ nbits + ldr r3, [r0, #12] @ tmp_buf + ldr r0, [r0, #8] @ revtab + lsl r12, r12, r2 + mov r2, r12 +1: + vld1.32 {d0-d1}, [r1,:128]! + ldr r4, [r0], #4 + uxth lr, r4 + uxth r4, r4, ror #16 + add lr, r3, lr, lsl #3 + add r4, r3, r4, lsl #3 + vst1.32 {d0}, [lr,:64] + vst1.32 {d1}, [r4,:64] + subs r12, r12, #2 + bgt 1b + + sub r1, r1, r2, lsl #3 +1: + vld1.32 {d0-d3}, [r3,:128]! + vst1.32 {d0-d3}, [r1,:128]! + subs r2, r2, #4 + bgt 1b + + pop {r4,pc} +endfunc + +const fft_tab_neon + .word fft4_neon + .word fft8_neon + .word fft16_neon + .word fft32_neon + .word fft64_neon + .word fft128_neon + .word fft256_neon + .word fft512_neon + .word fft1024_neon + .word fft2048_neon + .word fft4096_neon + .word fft8192_neon + .word fft16384_neon + .word fft32768_neon + .word fft65536_neon +endconst + +const pmmp, align=4 + .float +1.0, -1.0, -1.0, +1.0 +endconst + +const mppm, align=4 + .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 +endconst diff --git a/ffmpeg/libavcodec/arm/flacdsp_arm.S b/ffmpeg/libavcodec/arm/flacdsp_arm.S new file mode 100644 index 0000000..f8861c5 --- /dev/null +++ b/ffmpeg/libavcodec/arm/flacdsp_arm.S @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2012 Mans Rullgard + * + * This file is part of FFmpeg + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
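ff_fft_permute_neon above performs the pre-FFT reordering by scattering input elements through revtab into tmp_buf and then copying the result back; the NEON code simply reads two packed 16-bit destination indices per 32-bit load. A scalar sketch of the same permutation; the type and function names here are stand-ins, not the patch's definitions:

#include <stdint.h>

typedef struct { float re, im; } cplx_sketch;   /* stand-in for FFTComplex */

/* Scatter z[] through the bit-reversal table into tmp_buf, then copy
 * the permuted data back into z in linear order. */
static void fft_permute_sketch(const uint16_t *revtab, cplx_sketch *z,
                               cplx_sketch *tmp_buf, int n)
{
    for (int i = 0; i < n; i++)
        tmp_buf[revtab[i]] = z[i];
    for (int i = 0; i < n; i++)
        z[i] = tmp_buf[i];
}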
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function flac_lpc_16_1_arm + ldr r12, [sp] + push {r4, lr} + ldr r1, [r1] + subs r12, r12, #2 + ldr lr, [r0], #4 + beq 2f + it lt + poplt {r4, pc} +1: + mul r4, lr, r1 + ldm r0, {r2, lr} + add_sh r2, r2, r4, asr r3 + mul r4, r2, r1 + subs r12, r12, #2 + add_sh lr, lr, r4, asr r3 + stm r0!, {r2, lr} + bgt 1b + it lt + poplt {r4, pc} +2: + mul r4, lr, r1 + ldr r2, [r0] + add_sh r2, r2, r4, asr r3 + str r2, [r0] + pop {r4, pc} +endfunc + +function flac_lpc_16_2_arm + ldr r12, [sp] + subs r12, r12, r2 + it le + bxle lr + + push {r4-r9, lr} + ldm r0!, {r6, r7} + ldm r1, {r8, r9} + subs r12, r12, #1 + beq 2f +1: + mul r4, r6, r8 + mul r5, r7, r8 + mla r4, r7, r9, r4 + ldm r0, {r6, r7} + add_sh r6, r6, r4, asr r3 + mla r5, r6, r9, r5 + add_sh r7, r7, r5, asr r3 + stm r0!, {r6, r7} + subs r12, r12, #2 + bgt 1b + it lt + poplt {r4-r9, pc} +2: + mul r4, r6, r8 + mla r4, r7, r9, r4 + ldr r5, [r0] + add_sh r5, r5, r4, asr r3 + str r5, [r0] + pop {r4-r9, pc} +endfunc + +function ff_flac_lpc_16_arm, export=1 + cmp r2, #2 + blt flac_lpc_16_1_arm + beq flac_lpc_16_2_arm + + ldr r12, [sp] + subs r12, r12, r2 + it le + bxle lr + + push {r4-r9, lr} + + subs r12, r12, #1 + beq 3f +1: + sub lr, r2, #2 + mov r4, #0 + mov r5, #0 + + ldr r7, [r0], #4 + ldr r9, [r1], #4 +2: + mla r4, r7, r9, r4 + ldm r0!, {r6, r7} + mla r5, r6, r9, r5 + ldm r1!, {r8, r9} + mla r4, r6, r8, r4 + subs lr, lr, #2 + mla r5, r7, r8, r5 + bgt 2b + blt 6f + + mla r4, r7, r9, r4 + ldr r7, [r0], #4 + mla r5, r7, r9, r5 + ldr r9, [r1], #4 +6: + mla r4, r7, r9, r4 + ldm r0, {r6, r7} + add_sh r6, r6, r4, asr r3 + mla r5, r6, r9, r5 + add_sh r7, r7, r5, asr r3 + stm r0!, {r6, r7} + sub r0, r0, r2, lsl #2 + sub r1, r1, r2, lsl #2 + + subs r12, r12, #2 + bgt 1b + it lt + poplt {r4-r9, pc} +3: + mov r4, #0 +4: + ldr r5, [r1], #4 + ldr r6, [r0], #4 + mla r4, r5, r6, r4 + subs r2, r2, #1 + bgt 4b + ldr r5, [r0] + add_sh r5, r5, r4, asr r3 + str r5, [r0] + pop {r4-r9, pc} +endfunc diff --git a/ffmpeg/libavcodec/arm/flacdsp_init_arm.c b/ffmpeg/libavcodec/arm/flacdsp_init_arm.c new file mode 100644 index 0000000..9b93942 --- /dev/null +++ b/ffmpeg/libavcodec/arm/flacdsp_init_arm.c @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2012 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/flacdsp.h" +#include "config.h" + +void ff_flac_lpc_16_arm(int32_t *samples, const int coeffs[32], int order, + int qlevel, int len); + +av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, + int bps) +{ + if (bps <= 16) + c->lpc = ff_flac_lpc_16_arm; +} diff --git a/ffmpeg/libavcodec/arm/fmtconvert_init_arm.c b/ffmpeg/libavcodec/arm/fmtconvert_init_arm.c new file mode 100644 index 0000000..1d99c97 --- /dev/null +++ b/ffmpeg/libavcodec/arm/fmtconvert_init_arm.c @@ -0,0 +1,52 @@ +/* + * ARM optimized Format Conversion Utils + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "libavutil/attributes.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/fmtconvert.h" + +void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src, + float mul, int len); + +void ff_float_to_int16_neon(int16_t *dst, const float *src, long len); +void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); + +void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); + +av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_vfp(cpu_flags) && have_armv6(cpu_flags)) { + c->float_to_int16 = ff_float_to_int16_vfp; + } + + if (have_neon(cpu_flags)) { + c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon; + + if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { + c->float_to_int16 = ff_float_to_int16_neon; + c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; + } + } +} diff --git a/ffmpeg/libavcodec/arm/fmtconvert_neon.S b/ffmpeg/libavcodec/arm/fmtconvert_neon.S new file mode 100644 index 0000000..55d070e --- /dev/null +++ b/ffmpeg/libavcodec/arm/fmtconvert_neon.S @@ -0,0 +1,392 @@ +/* + * ARM NEON optimised Format Conversion Utils + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
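ff_flac_lpc_16_arm, declared and wired up in flacdsp_init_arm.c above, restores FLAC LPC-coded samples in place: each sample past the warm-up region gets the quantised prediction from the previous "order" samples added to it. A scalar sketch of that recurrence; the name is illustrative and the 64-bit accumulator is chosen defensively rather than taken from the patch:

#include <stdint.h>

/* In-place FLAC LPC reconstruction for samples of at most 16 bits:
 * samples[i] += (sum_j coeffs[j] * samples[i - order + j]) >> qlevel. */
static void flac_lpc_16_sketch(int32_t *samples, const int coeffs[32],
                               int order, int qlevel, int len)
{
    for (int i = order; i < len; i++) {
        int64_t sum = 0;
        for (int j = 0; j < order; j++)
            sum += (int64_t)coeffs[j] * samples[i - order + j];
        samples[i] += (int32_t)(sum >> qlevel);
    }
}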
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/arm/asm.S" + +function ff_float_to_int16_neon, export=1 + subs r2, r2, #8 + vld1.64 {d0-d1}, [r1,:128]! + vcvt.s32.f32 q8, q0, #16 + vld1.64 {d2-d3}, [r1,:128]! + vcvt.s32.f32 q9, q1, #16 + beq 3f + bics ip, r2, #15 + beq 2f +1: subs ip, ip, #16 + vshrn.s32 d4, q8, #16 + vld1.64 {d0-d1}, [r1,:128]! + vcvt.s32.f32 q0, q0, #16 + vshrn.s32 d5, q9, #16 + vld1.64 {d2-d3}, [r1,:128]! + vcvt.s32.f32 q1, q1, #16 + vshrn.s32 d6, q0, #16 + vst1.64 {d4-d5}, [r0,:128]! + vshrn.s32 d7, q1, #16 + vld1.64 {d16-d17},[r1,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r1,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.64 {d6-d7}, [r0,:128]! + bne 1b + ands r2, r2, #15 + beq 3f +2: vld1.64 {d0-d1}, [r1,:128]! + vshrn.s32 d4, q8, #16 + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r1,:128]! + vshrn.s32 d5, q9, #16 + vcvt.s32.f32 q1, q1, #16 + vshrn.s32 d6, q0, #16 + vst1.64 {d4-d5}, [r0,:128]! + vshrn.s32 d7, q1, #16 + vst1.64 {d6-d7}, [r0,:128]! + bx lr +3: vshrn.s32 d4, q8, #16 + vshrn.s32 d5, q9, #16 + vst1.64 {d4-d5}, [r0,:128]! + bx lr +endfunc + +function ff_float_to_int16_interleave_neon, export=1 + cmp r3, #2 + itt lt + ldrlt r1, [r1] + blt ff_float_to_int16_neon + bne 4f + + ldr r3, [r1] + ldr r1, [r1, #4] + + subs r2, r2, #8 + vld1.64 {d0-d1}, [r3,:128]! + vcvt.s32.f32 q8, q0, #16 + vld1.64 {d2-d3}, [r3,:128]! + vcvt.s32.f32 q9, q1, #16 + vld1.64 {d20-d21},[r1,:128]! + vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r1,:128]! + vcvt.s32.f32 q11, q11, #16 + beq 3f + bics ip, r2, #15 + beq 2f +1: subs ip, ip, #16 + vld1.64 {d0-d1}, [r3,:128]! + vcvt.s32.f32 q0, q0, #16 + vsri.32 q10, q8, #16 + vld1.64 {d2-d3}, [r3,:128]! + vcvt.s32.f32 q1, q1, #16 + vld1.64 {d24-d25},[r1,:128]! + vcvt.s32.f32 q12, q12, #16 + vld1.64 {d26-d27},[r1,:128]! + vsri.32 q11, q9, #16 + vst1.64 {d20-d21},[r0,:128]! + vcvt.s32.f32 q13, q13, #16 + vst1.64 {d22-d23},[r0,:128]! + vsri.32 q12, q0, #16 + vld1.64 {d16-d17},[r3,:128]! + vsri.32 q13, q1, #16 + vst1.64 {d24-d25},[r0,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r3,:128]! + vcvt.s32.f32 q9, q9, #16 + vld1.64 {d20-d21},[r1,:128]! + vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r1,:128]! + vcvt.s32.f32 q11, q11, #16 + vst1.64 {d26-d27},[r0,:128]! + bne 1b + ands r2, r2, #15 + beq 3f +2: vsri.32 q10, q8, #16 + vld1.64 {d0-d1}, [r3,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r3,:128]! + vcvt.s32.f32 q1, q1, #16 + vld1.64 {d24-d25},[r1,:128]! + vcvt.s32.f32 q12, q12, #16 + vsri.32 q11, q9, #16 + vld1.64 {d26-d27},[r1,:128]! + vcvt.s32.f32 q13, q13, #16 + vst1.64 {d20-d21},[r0,:128]! + vsri.32 q12, q0, #16 + vst1.64 {d22-d23},[r0,:128]! + vsri.32 q13, q1, #16 + vst1.64 {d24-d27},[r0,:128]! + bx lr +3: vsri.32 q10, q8, #16 + vsri.32 q11, q9, #16 + vst1.64 {d20-d23},[r0,:128]! + bx lr + +4: push {r4-r8,lr} + cmp r3, #4 + lsl ip, r3, #1 + blt 4f + + @ 4 channels +5: ldmia r1!, {r4-r7} + mov lr, r2 + mov r8, r0 + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vld1.64 {d20-d21},[r6,:128]! + vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r7,:128]! + vcvt.s32.f32 q11, q11, #16 +6: subs lr, lr, #8 + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vsri.32 q9, q8, #16 + vld1.64 {d2-d3}, [r5,:128]! 
+ vcvt.s32.f32 q1, q1, #16 + vsri.32 q11, q10, #16 + vld1.64 {d4-d5}, [r6,:128]! + vcvt.s32.f32 q2, q2, #16 + vzip.32 d18, d22 + vld1.64 {d6-d7}, [r7,:128]! + vcvt.s32.f32 q3, q3, #16 + vzip.32 d19, d23 + vst1.64 {d18}, [r8], ip + vsri.32 q1, q0, #16 + vst1.64 {d22}, [r8], ip + vsri.32 q3, q2, #16 + vst1.64 {d19}, [r8], ip + vzip.32 d2, d6 + vst1.64 {d23}, [r8], ip + vzip.32 d3, d7 + beq 7f + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vst1.64 {d2}, [r8], ip + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.64 {d6}, [r8], ip + vld1.64 {d20-d21},[r6,:128]! + vcvt.s32.f32 q10, q10, #16 + vst1.64 {d3}, [r8], ip + vld1.64 {d22-d23},[r7,:128]! + vcvt.s32.f32 q11, q11, #16 + vst1.64 {d7}, [r8], ip + b 6b +7: vst1.64 {d2}, [r8], ip + vst1.64 {d6}, [r8], ip + vst1.64 {d3}, [r8], ip + vst1.64 {d7}, [r8], ip + subs r3, r3, #4 + it eq + popeq {r4-r8,pc} + cmp r3, #4 + add r0, r0, #8 + bge 5b + + @ 2 channels +4: cmp r3, #2 + blt 4f + ldmia r1!, {r4-r5} + mov lr, r2 + mov r8, r0 + tst lr, #8 + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vld1.64 {d20-d21},[r4,:128]! + vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r5,:128]! + vcvt.s32.f32 q11, q11, #16 + beq 6f + subs lr, lr, #8 + beq 7f + vsri.32 d18, d16, #16 + vsri.32 d19, d17, #16 + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vst1.32 {d18[0]}, [r8], ip + vsri.32 d22, d20, #16 + vst1.32 {d18[1]}, [r8], ip + vsri.32 d23, d21, #16 + vst1.32 {d19[0]}, [r8], ip + vst1.32 {d19[1]}, [r8], ip + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.32 {d22[0]}, [r8], ip + vst1.32 {d22[1]}, [r8], ip + vld1.64 {d20-d21},[r4,:128]! + vcvt.s32.f32 q10, q10, #16 + vst1.32 {d23[0]}, [r8], ip + vst1.32 {d23[1]}, [r8], ip + vld1.64 {d22-d23},[r5,:128]! + vcvt.s32.f32 q11, q11, #16 +6: subs lr, lr, #16 + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vsri.32 d18, d16, #16 + vld1.64 {d2-d3}, [r5,:128]! + vcvt.s32.f32 q1, q1, #16 + vsri.32 d19, d17, #16 + vld1.64 {d4-d5}, [r4,:128]! + vcvt.s32.f32 q2, q2, #16 + vld1.64 {d6-d7}, [r5,:128]! + vcvt.s32.f32 q3, q3, #16 + vst1.32 {d18[0]}, [r8], ip + vsri.32 d22, d20, #16 + vst1.32 {d18[1]}, [r8], ip + vsri.32 d23, d21, #16 + vst1.32 {d19[0]}, [r8], ip + vsri.32 d2, d0, #16 + vst1.32 {d19[1]}, [r8], ip + vsri.32 d3, d1, #16 + vst1.32 {d22[0]}, [r8], ip + vsri.32 d6, d4, #16 + vst1.32 {d22[1]}, [r8], ip + vsri.32 d7, d5, #16 + vst1.32 {d23[0]}, [r8], ip + vst1.32 {d23[1]}, [r8], ip + beq 6f + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vst1.32 {d2[0]}, [r8], ip + vst1.32 {d2[1]}, [r8], ip + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.32 {d3[0]}, [r8], ip + vst1.32 {d3[1]}, [r8], ip + vld1.64 {d20-d21},[r4,:128]! + vcvt.s32.f32 q10, q10, #16 + vst1.32 {d6[0]}, [r8], ip + vst1.32 {d6[1]}, [r8], ip + vld1.64 {d22-d23},[r5,:128]! 
+ vcvt.s32.f32 q11, q11, #16 + vst1.32 {d7[0]}, [r8], ip + vst1.32 {d7[1]}, [r8], ip + bgt 6b +6: vst1.32 {d2[0]}, [r8], ip + vst1.32 {d2[1]}, [r8], ip + vst1.32 {d3[0]}, [r8], ip + vst1.32 {d3[1]}, [r8], ip + vst1.32 {d6[0]}, [r8], ip + vst1.32 {d6[1]}, [r8], ip + vst1.32 {d7[0]}, [r8], ip + vst1.32 {d7[1]}, [r8], ip + b 8f +7: vsri.32 d18, d16, #16 + vsri.32 d19, d17, #16 + vst1.32 {d18[0]}, [r8], ip + vsri.32 d22, d20, #16 + vst1.32 {d18[1]}, [r8], ip + vsri.32 d23, d21, #16 + vst1.32 {d19[0]}, [r8], ip + vst1.32 {d19[1]}, [r8], ip + vst1.32 {d22[0]}, [r8], ip + vst1.32 {d22[1]}, [r8], ip + vst1.32 {d23[0]}, [r8], ip + vst1.32 {d23[1]}, [r8], ip +8: subs r3, r3, #2 + add r0, r0, #4 + it eq + popeq {r4-r8,pc} + + @ 1 channel +4: ldr r4, [r1],#4 + tst r2, #8 + mov lr, r2 + mov r5, r0 + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r4,:128]! + vcvt.s32.f32 q1, q1, #16 + bne 8f +6: subs lr, lr, #16 + vld1.64 {d4-d5}, [r4,:128]! + vcvt.s32.f32 q2, q2, #16 + vld1.64 {d6-d7}, [r4,:128]! + vcvt.s32.f32 q3, q3, #16 + vst1.16 {d0[1]}, [r5,:16], ip + vst1.16 {d0[3]}, [r5,:16], ip + vst1.16 {d1[1]}, [r5,:16], ip + vst1.16 {d1[3]}, [r5,:16], ip + vst1.16 {d2[1]}, [r5,:16], ip + vst1.16 {d2[3]}, [r5,:16], ip + vst1.16 {d3[1]}, [r5,:16], ip + vst1.16 {d3[3]}, [r5,:16], ip + beq 7f + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r4,:128]! + vcvt.s32.f32 q1, q1, #16 +7: vst1.16 {d4[1]}, [r5,:16], ip + vst1.16 {d4[3]}, [r5,:16], ip + vst1.16 {d5[1]}, [r5,:16], ip + vst1.16 {d5[3]}, [r5,:16], ip + vst1.16 {d6[1]}, [r5,:16], ip + vst1.16 {d6[3]}, [r5,:16], ip + vst1.16 {d7[1]}, [r5,:16], ip + vst1.16 {d7[3]}, [r5,:16], ip + bgt 6b + pop {r4-r8,pc} +8: subs lr, lr, #8 + vst1.16 {d0[1]}, [r5,:16], ip + vst1.16 {d0[3]}, [r5,:16], ip + vst1.16 {d1[1]}, [r5,:16], ip + vst1.16 {d1[3]}, [r5,:16], ip + vst1.16 {d2[1]}, [r5,:16], ip + vst1.16 {d2[3]}, [r5,:16], ip + vst1.16 {d3[1]}, [r5,:16], ip + vst1.16 {d3[3]}, [r5,:16], ip + it eq + popeq {r4-r8,pc} + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r4,:128]! + vcvt.s32.f32 q1, q1, #16 + b 6b +endfunc + +function ff_int32_to_float_fmul_scalar_neon, export=1 +VFP vdup.32 q0, d0[0] +VFP len .req r2 +NOVFP vdup.32 q0, r2 +NOVFP len .req r3 + + vld1.32 {q1},[r1,:128]! + vcvt.f32.s32 q3, q1 + vld1.32 {q2},[r1,:128]! + vcvt.f32.s32 q8, q2 +1: subs len, len, #8 + pld [r1, #16] + vmul.f32 q9, q3, q0 + vmul.f32 q10, q8, q0 + beq 2f + vld1.32 {q1},[r1,:128]! + vcvt.f32.s32 q3, q1 + vld1.32 {q2},[r1,:128]! + vcvt.f32.s32 q8, q2 + vst1.32 {q9}, [r0,:128]! + vst1.32 {q10},[r0,:128]! + b 1b +2: vst1.32 {q9}, [r0,:128]! + vst1.32 {q10},[r0,:128]! + bx lr + .unreq len +endfunc diff --git a/ffmpeg/libavcodec/arm/fmtconvert_vfp.S b/ffmpeg/libavcodec/arm/fmtconvert_vfp.S new file mode 100644 index 0000000..7b012bc --- /dev/null +++ b/ffmpeg/libavcodec/arm/fmtconvert_vfp.S @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2008 Siarhei Siamashka + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/arm/asm.S" + +/** + * ARM VFP optimized float to int16 conversion. + * Assume that len is a positive number and is multiple of 8, destination + * buffer is at least 4 bytes aligned (8 bytes alignment is better for + * performance), little-endian byte sex. + */ +@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len) +function ff_float_to_int16_vfp, export=1 + push {r4-r8,lr} + vpush {d8-d11} + vldmia r1!, {s16-s23} + vcvt.s32.f32 s0, s16 + vcvt.s32.f32 s1, s17 + vcvt.s32.f32 s2, s18 + vcvt.s32.f32 s3, s19 + vcvt.s32.f32 s4, s20 + vcvt.s32.f32 s5, s21 + vcvt.s32.f32 s6, s22 + vcvt.s32.f32 s7, s23 +1: + subs r2, r2, #8 + vmov r3, r4, s0, s1 + vmov r5, r6, s2, s3 + vmov r7, r8, s4, s5 + vmov ip, lr, s6, s7 + it gt + vldmiagt r1!, {s16-s23} + ssat r4, #16, r4 + ssat r3, #16, r3 + ssat r6, #16, r6 + ssat r5, #16, r5 + pkhbt r3, r3, r4, lsl #16 + pkhbt r4, r5, r6, lsl #16 + itttt gt + vcvtgt.s32.f32 s0, s16 + vcvtgt.s32.f32 s1, s17 + vcvtgt.s32.f32 s2, s18 + vcvtgt.s32.f32 s3, s19 + itttt gt + vcvtgt.s32.f32 s4, s20 + vcvtgt.s32.f32 s5, s21 + vcvtgt.s32.f32 s6, s22 + vcvtgt.s32.f32 s7, s23 + ssat r8, #16, r8 + ssat r7, #16, r7 + ssat lr, #16, lr + ssat ip, #16, ip + pkhbt r5, r7, r8, lsl #16 + pkhbt r6, ip, lr, lsl #16 + stmia r0!, {r3-r6} + bgt 1b + + vpop {d8-d11} + pop {r4-r8,pc} +endfunc diff --git a/ffmpeg/libavcodec/arm/h264chroma_init_arm.c b/ffmpeg/libavcodec/arm/h264chroma_init_arm.c new file mode 100644 index 0000000..13f7e0d --- /dev/null +++ b/ffmpeg/libavcodec/arm/h264chroma_init_arm.c @@ -0,0 +1,51 @@ +/* + * ARM NEON optimised H.264 chroma functions + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
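The float_to_int16 conversions above (VFP and NEON) assume the floats are already scaled to the int16 range, as the fmtconvert_vfp.S comment notes; the NEON path converts to Q16.16 with vcvt #16, saturating at the 32-bit stage, and then keeps the integer part with a narrowing shift. A scalar sketch of the intended result; the name is illustrative, and the exact rounding of the assembly paths differs slightly, which is why fmtconvert_init_arm.c above only installs the NEON versions when CODEC_FLAG_BITEXACT is not set:

#include <math.h>
#include <stdint.h>

/* Round each pre-scaled float to the nearest integer and clip it to
 * the int16 range. */
static void float_to_int16_sketch(int16_t *dst, const float *src, long len)
{
    for (long i = 0; i < len; i++) {
        long v = lrintf(src[i]);
        if (v < -32768) v = -32768;
        if (v >  32767) v =  32767;
        dst[i] = (int16_t)v;
    }
}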
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/h264chroma.h" + +void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); +void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); +void ff_put_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int); + +void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); +void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); +void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int); + +av_cold void ff_h264chroma_init_arm(H264ChromaContext *c, int bit_depth) +{ + const int high_bit_depth = bit_depth > 8; + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags) && !high_bit_depth) { + c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon; + c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon; + c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon; + + c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon; + c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon; + c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon; + } +} diff --git a/ffmpeg/libavcodec/arm/h264cmc_neon.S b/ffmpeg/libavcodec/arm/h264cmc_neon.S new file mode 100644 index 0000000..3427e36 --- /dev/null +++ b/ffmpeg/libavcodec/arm/h264cmc_neon.S @@ -0,0 +1,400 @@ +/* + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ +.macro h264_chroma_mc8 type, codec=h264 +function ff_\type\()_\codec\()_chroma_mc8_neon, export=1 + push {r4-r7, lr} + ldrd r4, r5, [sp, #20] + .ifc \type,avg + mov lr, r0 + .endif + pld [r1] + pld [r1, r2] + + .ifc \codec,rv40 + movrel r6, rv40bias + lsr r7, r5, #1 + add r6, r6, r7, lsl #3 + lsr r7, r4, #1 + add r6, r6, r7, lsl #1 + vld1.16 {d22[],d23[]}, [r6,:16] + .endif + +A muls r7, r4, r5 +T mul r7, r4, r5 +T cmp r7, #0 + rsb r6, r7, r5, lsl #3 + rsb r12, r7, r4, lsl #3 + sub r4, r7, r4, lsl #3 + sub r4, r4, r5, lsl #3 + add r4, r4, #64 + + beq 2f + + vdup.8 d0, r4 + vdup.8 d1, r12 + vld1.8 {d4, d5}, [r1], r2 + vdup.8 d2, r6 + vdup.8 d3, r7 + vext.8 d5, d4, d5, #1 + +1: vld1.8 {d6, d7}, [r1], r2 + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d5, d1 + vext.8 d7, d6, d7, #1 + vld1.8 {d4, d5}, [r1], r2 + vmlal.u8 q8, d6, d2 + pld [r1] + vext.8 d5, d4, d5, #1 + vmlal.u8 q8, d7, d3 + vmull.u8 q9, d6, d0 + subs r3, r3, #2 + vmlal.u8 q9, d7, d1 + vmlal.u8 q9, d4, d2 + vmlal.u8 q9, d5, d3 + pld [r1, r2] + .ifc \codec,h264 + vrshrn.u16 d16, q8, #6 + vrshrn.u16 d17, q9, #6 + .else + vadd.u16 q8, q8, q11 + vadd.u16 q9, q9, q11 + vshrn.u16 d16, q8, #6 + vshrn.u16 d17, q9, #6 + .endif + .ifc \type,avg + vld1.8 {d20}, [lr,:64], r2 + vld1.8 {d21}, [lr,:64], r2 + vrhadd.u8 q8, q8, q10 + .endif + vst1.8 {d16}, [r0,:64], r2 + vst1.8 {d17}, [r0,:64], r2 + bgt 1b + + pop {r4-r7, pc} + +2: tst r6, r6 + add r12, r12, r6 + vdup.8 d0, r4 + vdup.8 d1, r12 + + beq 4f + + vld1.8 {d4}, [r1], r2 + +3: vld1.8 {d6}, [r1], r2 + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d6, d1 + vld1.8 {d4}, [r1], r2 + vmull.u8 q9, d6, d0 + vmlal.u8 q9, d4, d1 + pld [r1] + .ifc \codec,h264 + vrshrn.u16 d16, q8, #6 + vrshrn.u16 d17, q9, #6 + .else + vadd.u16 q8, q8, q11 + vadd.u16 q9, q9, q11 + vshrn.u16 d16, q8, #6 + vshrn.u16 d17, q9, #6 + .endif + pld [r1, r2] + .ifc \type,avg + vld1.8 {d20}, [lr,:64], r2 + vld1.8 {d21}, [lr,:64], r2 + vrhadd.u8 q8, q8, q10 + .endif + subs r3, r3, #2 + vst1.8 {d16}, [r0,:64], r2 + vst1.8 {d17}, [r0,:64], r2 + bgt 3b + + pop {r4-r7, pc} + +4: vld1.8 {d4, d5}, [r1], r2 + vld1.8 {d6, d7}, [r1], r2 + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + pld [r1] + subs r3, r3, #2 + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d5, d1 + vmull.u8 q9, d6, d0 + vmlal.u8 q9, d7, d1 + pld [r1, r2] + .ifc \codec,h264 + vrshrn.u16 d16, q8, #6 + vrshrn.u16 d17, q9, #6 + .else + vadd.u16 q8, q8, q11 + vadd.u16 q9, q9, q11 + vshrn.u16 d16, q8, #6 + vshrn.u16 d17, q9, #6 + .endif + .ifc \type,avg + vld1.8 {d20}, [lr,:64], r2 + vld1.8 {d21}, [lr,:64], r2 + vrhadd.u8 q8, q8, q10 + .endif + vst1.8 {d16}, [r0,:64], r2 + vst1.8 {d17}, [r0,:64], r2 + bgt 4b + + pop {r4-r7, pc} +endfunc +.endm + +/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ +.macro h264_chroma_mc4 type, codec=h264 +function ff_\type\()_\codec\()_chroma_mc4_neon, export=1 + push {r4-r7, lr} + ldrd r4, r5, [sp, #20] + .ifc \type,avg + mov lr, r0 + .endif + pld [r1] + pld [r1, r2] + + .ifc \codec,rv40 + movrel r6, rv40bias + lsr r7, r5, #1 + add r6, r6, r7, lsl #3 + lsr r7, r4, #1 + add r6, r6, r7, lsl #1 + vld1.16 {d22[],d23[]}, [r6,:16] + .endif + +A muls r7, r4, r5 +T mul r7, r4, r5 +T cmp r7, #0 + rsb r6, r7, r5, lsl #3 
+ rsb r12, r7, r4, lsl #3 + sub r4, r7, r4, lsl #3 + sub r4, r4, r5, lsl #3 + add r4, r4, #64 + + beq 2f + + vdup.8 d0, r4 + vdup.8 d1, r12 + vld1.8 {d4}, [r1], r2 + vdup.8 d2, r6 + vdup.8 d3, r7 + + vext.8 d5, d4, d5, #1 + vtrn.32 d4, d5 + + vtrn.32 d0, d1 + vtrn.32 d2, d3 + +1: vld1.8 {d6}, [r1], r2 + vext.8 d7, d6, d7, #1 + vtrn.32 d6, d7 + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d6, d2 + vld1.8 {d4}, [r1], r2 + vext.8 d5, d4, d5, #1 + vtrn.32 d4, d5 + pld [r1] + vmull.u8 q9, d6, d0 + vmlal.u8 q9, d4, d2 + vadd.i16 d16, d16, d17 + vadd.i16 d17, d18, d19 + .ifc \codec,h264 + vrshrn.u16 d16, q8, #6 + .else + vadd.u16 q8, q8, q11 + vshrn.u16 d16, q8, #6 + .endif + subs r3, r3, #2 + pld [r1, r2] + .ifc \type,avg + vld1.32 {d20[0]}, [lr,:32], r2 + vld1.32 {d20[1]}, [lr,:32], r2 + vrhadd.u8 d16, d16, d20 + .endif + vst1.32 {d16[0]}, [r0,:32], r2 + vst1.32 {d16[1]}, [r0,:32], r2 + bgt 1b + + pop {r4-r7, pc} + +2: tst r6, r6 + add r12, r12, r6 + vdup.8 d0, r4 + vdup.8 d1, r12 + vtrn.32 d0, d1 + + beq 4f + + vext.32 d1, d0, d1, #1 + vld1.32 {d4[0]}, [r1], r2 + +3: vld1.32 {d4[1]}, [r1], r2 + vmull.u8 q8, d4, d0 + vld1.32 {d4[0]}, [r1], r2 + vmull.u8 q9, d4, d1 + vadd.i16 d16, d16, d17 + vadd.i16 d17, d18, d19 + pld [r1] + .ifc \codec,h264 + vrshrn.u16 d16, q8, #6 + .else + vadd.u16 q8, q8, q11 + vshrn.u16 d16, q8, #6 + .endif + .ifc \type,avg + vld1.32 {d20[0]}, [lr,:32], r2 + vld1.32 {d20[1]}, [lr,:32], r2 + vrhadd.u8 d16, d16, d20 + .endif + subs r3, r3, #2 + pld [r1, r2] + vst1.32 {d16[0]}, [r0,:32], r2 + vst1.32 {d16[1]}, [r0,:32], r2 + bgt 3b + + pop {r4-r7, pc} + +4: vld1.8 {d4}, [r1], r2 + vld1.8 {d6}, [r1], r2 + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vtrn.32 d4, d5 + vtrn.32 d6, d7 + vmull.u8 q8, d4, d0 + vmull.u8 q9, d6, d0 + subs r3, r3, #2 + vadd.i16 d16, d16, d17 + vadd.i16 d17, d18, d19 + pld [r1] + .ifc \codec,h264 + vrshrn.u16 d16, q8, #6 + .else + vadd.u16 q8, q8, q11 + vshrn.u16 d16, q8, #6 + .endif + .ifc \type,avg + vld1.32 {d20[0]}, [lr,:32], r2 + vld1.32 {d20[1]}, [lr,:32], r2 + vrhadd.u8 d16, d16, d20 + .endif + pld [r1] + vst1.32 {d16[0]}, [r0,:32], r2 + vst1.32 {d16[1]}, [r0,:32], r2 + bgt 4b + + pop {r4-r7, pc} +endfunc +.endm + +.macro h264_chroma_mc2 type +function ff_\type\()_h264_chroma_mc2_neon, export=1 + push {r4-r6, lr} + ldr r4, [sp, #16] + ldr lr, [sp, #20] + pld [r1] + pld [r1, r2] + orrs r5, r4, lr + beq 2f + + mul r5, r4, lr + rsb r6, r5, lr, lsl #3 + rsb r12, r5, r4, lsl #3 + sub r4, r5, r4, lsl #3 + sub r4, r4, lr, lsl #3 + add r4, r4, #64 + vdup.8 d0, r4 + vdup.8 d2, r12 + vdup.8 d1, r6 + vdup.8 d3, r5 + vtrn.16 q0, q1 +1: + vld1.32 {d4[0]}, [r1], r2 + vld1.32 {d4[1]}, [r1], r2 + vrev64.32 d5, d4 + vld1.32 {d5[1]}, [r1] + vext.8 q3, q2, q2, #1 + vtrn.16 q2, q3 + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d5, d1 + .ifc \type,avg + vld1.16 {d18[0]}, [r0,:16], r2 + vld1.16 {d18[1]}, [r0,:16] + sub r0, r0, r2 + .endif + vtrn.32 d16, d17 + vadd.i16 d16, d16, d17 + vrshrn.u16 d16, q8, #6 + .ifc \type,avg + vrhadd.u8 d16, d16, d18 + .endif + vst1.16 {d16[0]}, [r0,:16], r2 + vst1.16 {d16[1]}, [r0,:16], r2 + subs r3, r3, #2 + bgt 1b + pop {r4-r6, pc} +2: + .ifc \type,put + ldrh_post r5, r1, r2 + strh_post r5, r0, r2 + ldrh_post r6, r1, r2 + strh_post r6, r0, r2 + .else + vld1.16 {d16[0]}, [r1], r2 + vld1.16 {d16[1]}, [r1], r2 + vld1.16 {d18[0]}, [r0,:16], r2 + vld1.16 {d18[1]}, [r0,:16] + sub r0, r0, r2 + vrhadd.u8 d16, d16, d18 + vst1.16 {d16[0]}, [r0,:16], r2 + vst1.16 {d16[1]}, [r0,:16], r2 + .endif + subs r3, r3, #2 + bgt 2b + pop {r4-r6, pc} +endfunc +.endm + 
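Note on the weight setup in the macros above: r7 = x*y, r6 = 8*y - x*y, r12 = 8*x - x*y and r4 = 64 - 8*x - 8*y + x*y = (8-x)*(8-y) are the four taps of the standard H.264 bilinear chroma filter, rounded with +32 and shifted by 6 (the vrshrn #6 in the h264 path). A minimal scalar sketch of what the NEON code vectorizes is given below for reference only; it is not part of the patch, and the name put_chroma_mc_c and the explicit width parameter w are assumptions of this sketch.

    #include <stdint.h>

    /* Scalar reference for the bilinear chroma MC performed by the macros
     * above: dst[i] = (A*a + B*b + C*c + D*d + 32) >> 6 with
     * A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy. */
    static void put_chroma_mc_c(uint8_t *dst, const uint8_t *src,
                                int stride, int h, int w, int x, int y)
    {
        const int A = (8 - x) * (8 - y);
        const int B =      x  * (8 - y);
        const int C = (8 - x) *      y;
        const int D =      x  *      y;

        for (int j = 0; j < h; j++) {
            for (int i = 0; i < w; i++)
                dst[i] = (A * src[i]          + B * src[i + 1] +
                          C * src[i + stride] + D * src[i + stride + 1] +
                          32) >> 6;
            dst += stride;
            src += stride;
        }
    }

The rv40 variants in this file replace the fixed +32 rounding with a bias looked up from the rv40bias table and use a plain vshrn instead of vrshrn, which is why the macros take a codec argument.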
+#if CONFIG_H264_DECODER + h264_chroma_mc8 put + h264_chroma_mc8 avg + h264_chroma_mc4 put + h264_chroma_mc4 avg + h264_chroma_mc2 put + h264_chroma_mc2 avg +#endif + +#if CONFIG_RV40_DECODER +const rv40bias + .short 0, 16, 32, 16 + .short 32, 28, 32, 28 + .short 0, 32, 16, 32 + .short 32, 28, 32, 28 +endconst + + h264_chroma_mc8 put, rv40 + h264_chroma_mc8 avg, rv40 + h264_chroma_mc4 put, rv40 + h264_chroma_mc4 avg, rv40 +#endif diff --git a/ffmpeg/libavcodec/arm/h264dsp_init_arm.c b/ffmpeg/libavcodec/arm/h264dsp_init_arm.c new file mode 100644 index 0000000..785b604 --- /dev/null +++ b/ffmpeg/libavcodec/arm/h264dsp_init_arm.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2010 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "libavutil/attributes.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/h264dsp.h" + +void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); + +void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height, + int log2_den, int weight, int offset); +void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height, + int log2_den, int weight, int offset); +void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height, + int log2_den, int weight, int offset); + +void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride, + int height, int log2_den, int weightd, + int weights, int offset); +void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride, + int height, int log2_den, int weightd, + int weights, int offset); +void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride, + int height, int log2_den, int weightd, + int weights, int offset); + +void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset, + int16_t *block, int stride, + const uint8_t nnzc[6*8]); +void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset, + int16_t *block, int stride, + const uint8_t nnzc[6*8]); +void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset, + int16_t *block, int stride, + const uint8_t nnzc[6*8]); + +void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset, + int16_t *block, 
int stride, + const uint8_t nnzc[6*8]); + +static av_cold void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, + const int chroma_format_idc) +{ +#if HAVE_NEON + if (bit_depth == 8) { + c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon; + c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; + if(chroma_format_idc == 1){ + c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; + } + + c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon; + c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon; + c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon; + + c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon; + c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon; + c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon; + + c->h264_idct_add = ff_h264_idct_add_neon; + c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; + c->h264_idct_add16 = ff_h264_idct_add16_neon; + c->h264_idct_add16intra = ff_h264_idct_add16intra_neon; + if (chroma_format_idc == 1) + c->h264_idct_add8 = ff_h264_idct_add8_neon; + c->h264_idct8_add = ff_h264_idct8_add_neon; + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon; + c->h264_idct8_add4 = ff_h264_idct8_add4_neon; + } +#endif // HAVE_NEON +} + +av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth, + const int chroma_format_idc) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) + ff_h264dsp_init_neon(c, bit_depth, chroma_format_idc); +} diff --git a/ffmpeg/libavcodec/arm/h264dsp_neon.S b/ffmpeg/libavcodec/arm/h264dsp_neon.S new file mode 100644 index 0000000..274a547 --- /dev/null +++ b/ffmpeg/libavcodec/arm/h264dsp_neon.S @@ -0,0 +1,541 @@ +/* + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" +#include "neon.S" + + /* H.264 loop filter */ + +.macro h264_loop_filter_start + ldr r12, [sp] + tst r2, r2 + ldr r12, [r12] + it ne + tstne r3, r3 + vmov.32 d24[0], r12 + and r12, r12, r12, lsl #16 + it eq + bxeq lr + ands r12, r12, r12, lsl #8 + it lt + bxlt lr +.endm + +.macro h264_loop_filter_luma + vdup.8 q11, r2 @ alpha + vmovl.u8 q12, d24 + vabd.u8 q6, q8, q0 @ abs(p0 - q0) + vmovl.u16 q12, d24 + vabd.u8 q14, q9, q8 @ abs(p1 - p0) + vsli.16 q12, q12, #8 + vabd.u8 q15, q1, q0 @ abs(q1 - q0) + vsli.32 q12, q12, #16 + vclt.u8 q6, q6, q11 @ < alpha + vdup.8 q11, r3 @ beta + vclt.s8 q7, q12, #0 + vclt.u8 q14, q14, q11 @ < beta + vclt.u8 q15, q15, q11 @ < beta + vbic q6, q6, q7 + vabd.u8 q4, q10, q8 @ abs(p2 - p0) + vand q6, q6, q14 + vabd.u8 q5, q2, q0 @ abs(q2 - q0) + vclt.u8 q4, q4, q11 @ < beta + vand q6, q6, q15 + vclt.u8 q5, q5, q11 @ < beta + vand q4, q4, q6 + vand q5, q5, q6 + vand q12, q12, q6 + vrhadd.u8 q14, q8, q0 + vsub.i8 q6, q12, q4 + vqadd.u8 q7, q9, q12 + vhadd.u8 q10, q10, q14 + vsub.i8 q6, q6, q5 + vhadd.u8 q14, q2, q14 + vmin.u8 q7, q7, q10 + vqsub.u8 q11, q9, q12 + vqadd.u8 q2, q1, q12 + vmax.u8 q7, q7, q11 + vqsub.u8 q11, q1, q12 + vmin.u8 q14, q2, q14 + vmovl.u8 q2, d0 + vmax.u8 q14, q14, q11 + vmovl.u8 q10, d1 + vsubw.u8 q2, q2, d16 + vsubw.u8 q10, q10, d17 + vshl.i16 q2, q2, #2 + vshl.i16 q10, q10, #2 + vaddw.u8 q2, q2, d18 + vaddw.u8 q10, q10, d19 + vsubw.u8 q2, q2, d2 + vsubw.u8 q10, q10, d3 + vrshrn.i16 d4, q2, #3 + vrshrn.i16 d5, q10, #3 + vbsl q4, q7, q9 + vbsl q5, q14, q1 + vneg.s8 q7, q6 + vmovl.u8 q14, d16 + vmin.s8 q2, q2, q6 + vmovl.u8 q6, d17 + vmax.s8 q2, q2, q7 + vmovl.u8 q11, d0 + vmovl.u8 q12, d1 + vaddw.s8 q14, q14, d4 + vaddw.s8 q6, q6, d5 + vsubw.s8 q11, q11, d4 + vsubw.s8 q12, q12, d5 + vqmovun.s16 d16, q14 + vqmovun.s16 d17, q6 + vqmovun.s16 d0, q11 + vqmovun.s16 d1, q12 +.endm + +function ff_h264_v_loop_filter_luma_neon, export=1 + h264_loop_filter_start + + vld1.8 {d0, d1}, [r0,:128], r1 + vld1.8 {d2, d3}, [r0,:128], r1 + vld1.8 {d4, d5}, [r0,:128], r1 + sub r0, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + vld1.8 {d20,d21}, [r0,:128], r1 + vld1.8 {d18,d19}, [r0,:128], r1 + vld1.8 {d16,d17}, [r0,:128], r1 + + vpush {d8-d15} + + h264_loop_filter_luma + + sub r0, r0, r1, lsl #1 + vst1.8 {d8, d9}, [r0,:128], r1 + vst1.8 {d16,d17}, [r0,:128], r1 + vst1.8 {d0, d1}, [r0,:128], r1 + vst1.8 {d10,d11}, [r0,:128] + + vpop {d8-d15} + bx lr +endfunc + +function ff_h264_h_loop_filter_luma_neon, export=1 + h264_loop_filter_start + + sub r0, r0, #4 + vld1.8 {d6}, [r0], r1 + vld1.8 {d20}, [r0], r1 + vld1.8 {d18}, [r0], r1 + vld1.8 {d16}, [r0], r1 + vld1.8 {d0}, [r0], r1 + vld1.8 {d2}, [r0], r1 + vld1.8 {d4}, [r0], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d7}, [r0], r1 + vld1.8 {d21}, [r0], r1 + vld1.8 {d19}, [r0], r1 + vld1.8 {d17}, [r0], r1 + vld1.8 {d1}, [r0], r1 + vld1.8 {d3}, [r0], r1 + vld1.8 {d5}, [r0], r1 + vld1.8 {d27}, [r0], r1 + + transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13 + + vpush {d8-d15} + + h264_loop_filter_luma + + transpose_4x4 q4, q8, q0, q5 + + sub r0, r0, r1, lsl #4 + add r0, r0, #2 + vst1.32 {d8[0]}, [r0], r1 + vst1.32 {d16[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d10[0]}, [r0], r1 + vst1.32 {d8[1]}, [r0], r1 + vst1.32 {d16[1]}, [r0], r1 + vst1.32 {d0[1]}, [r0], r1 + 
vst1.32 {d10[1]}, [r0], r1 + vst1.32 {d9[0]}, [r0], r1 + vst1.32 {d17[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + vst1.32 {d11[0]}, [r0], r1 + vst1.32 {d9[1]}, [r0], r1 + vst1.32 {d17[1]}, [r0], r1 + vst1.32 {d1[1]}, [r0], r1 + vst1.32 {d11[1]}, [r0], r1 + + vpop {d8-d15} + bx lr +endfunc + +.macro h264_loop_filter_chroma + vdup.8 d22, r2 @ alpha + vmovl.u8 q12, d24 + vabd.u8 d26, d16, d0 @ abs(p0 - q0) + vmovl.u8 q2, d0 + vabd.u8 d28, d18, d16 @ abs(p1 - p0) + vsubw.u8 q2, q2, d16 + vsli.16 d24, d24, #8 + vshl.i16 q2, q2, #2 + vabd.u8 d30, d2, d0 @ abs(q1 - q0) + vaddw.u8 q2, q2, d18 + vclt.u8 d26, d26, d22 @ < alpha + vsubw.u8 q2, q2, d2 + vdup.8 d22, r3 @ beta + vrshrn.i16 d4, q2, #3 + vclt.u8 d28, d28, d22 @ < beta + vclt.u8 d30, d30, d22 @ < beta + vmin.s8 d4, d4, d24 + vneg.s8 d25, d24 + vand d26, d26, d28 + vmax.s8 d4, d4, d25 + vand d26, d26, d30 + vmovl.u8 q11, d0 + vand d4, d4, d26 + vmovl.u8 q14, d16 + vaddw.s8 q14, q14, d4 + vsubw.s8 q11, q11, d4 + vqmovun.s16 d16, q14 + vqmovun.s16 d0, q11 +.endm + +function ff_h264_v_loop_filter_chroma_neon, export=1 + h264_loop_filter_start + + sub r0, r0, r1, lsl #1 + vld1.8 {d18}, [r0,:64], r1 + vld1.8 {d16}, [r0,:64], r1 + vld1.8 {d0}, [r0,:64], r1 + vld1.8 {d2}, [r0,:64] + + h264_loop_filter_chroma + + sub r0, r0, r1, lsl #1 + vst1.8 {d16}, [r0,:64], r1 + vst1.8 {d0}, [r0,:64], r1 + + bx lr +endfunc + +function ff_h264_h_loop_filter_chroma_neon, export=1 + h264_loop_filter_start + + sub r0, r0, #2 + vld1.32 {d18[0]}, [r0], r1 + vld1.32 {d16[0]}, [r0], r1 + vld1.32 {d0[0]}, [r0], r1 + vld1.32 {d2[0]}, [r0], r1 + vld1.32 {d18[1]}, [r0], r1 + vld1.32 {d16[1]}, [r0], r1 + vld1.32 {d0[1]}, [r0], r1 + vld1.32 {d2[1]}, [r0], r1 + + vtrn.16 d18, d0 + vtrn.16 d16, d2 + vtrn.8 d18, d16 + vtrn.8 d0, d2 + + h264_loop_filter_chroma + + vtrn.16 d18, d0 + vtrn.16 d16, d2 + vtrn.8 d18, d16 + vtrn.8 d0, d2 + + sub r0, r0, r1, lsl #3 + vst1.32 {d18[0]}, [r0], r1 + vst1.32 {d16[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d2[0]}, [r0], r1 + vst1.32 {d18[1]}, [r0], r1 + vst1.32 {d16[1]}, [r0], r1 + vst1.32 {d0[1]}, [r0], r1 + vst1.32 {d2[1]}, [r0], r1 + + bx lr +endfunc + +@ Biweighted prediction + +.macro biweight_16 macs, macd + vdup.8 d0, r4 + vdup.8 d1, r5 + vmov q2, q8 + vmov q3, q8 +1: subs r3, r3, #2 + vld1.8 {d20-d21},[r0,:128], r2 + \macd q2, d0, d20 + pld [r0] + \macd q3, d0, d21 + vld1.8 {d22-d23},[r1,:128], r2 + \macs q2, d1, d22 + pld [r1] + \macs q3, d1, d23 + vmov q12, q8 + vld1.8 {d28-d29},[r0,:128], r2 + vmov q13, q8 + \macd q12, d0, d28 + pld [r0] + \macd q13, d0, d29 + vld1.8 {d30-d31},[r1,:128], r2 + \macs q12, d1, d30 + pld [r1] + \macs q13, d1, d31 + vshl.s16 q2, q2, q9 + vshl.s16 q3, q3, q9 + vqmovun.s16 d4, q2 + vqmovun.s16 d5, q3 + vshl.s16 q12, q12, q9 + vshl.s16 q13, q13, q9 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vmov q3, q8 + vst1.8 {d4- d5}, [r6,:128], r2 + vmov q2, q8 + vst1.8 {d24-d25},[r6,:128], r2 + bne 1b + pop {r4-r6, pc} +.endm + +.macro biweight_8 macs, macd + vdup.8 d0, r4 + vdup.8 d1, r5 + vmov q1, q8 + vmov q10, q8 +1: subs r3, r3, #2 + vld1.8 {d4},[r0,:64], r2 + \macd q1, d0, d4 + pld [r0] + vld1.8 {d5},[r1,:64], r2 + \macs q1, d1, d5 + pld [r1] + vld1.8 {d6},[r0,:64], r2 + \macd q10, d0, d6 + pld [r0] + vld1.8 {d7},[r1,:64], r2 + \macs q10, d1, d7 + pld [r1] + vshl.s16 q1, q1, q9 + vqmovun.s16 d2, q1 + vshl.s16 q10, q10, q9 + vqmovun.s16 d4, q10 + vmov q10, q8 + vst1.8 {d2},[r6,:64], r2 + vmov q1, q8 + vst1.8 {d4},[r6,:64], r2 + bne 1b + pop {r4-r6, pc} +.endm + +.macro biweight_4 macs, macd + 
vdup.8 d0, r4 + vdup.8 d1, r5 + vmov q1, q8 + vmov q10, q8 +1: subs r3, r3, #4 + vld1.32 {d4[0]},[r0,:32], r2 + vld1.32 {d4[1]},[r0,:32], r2 + \macd q1, d0, d4 + pld [r0] + vld1.32 {d5[0]},[r1,:32], r2 + vld1.32 {d5[1]},[r1,:32], r2 + \macs q1, d1, d5 + pld [r1] + blt 2f + vld1.32 {d6[0]},[r0,:32], r2 + vld1.32 {d6[1]},[r0,:32], r2 + \macd q10, d0, d6 + pld [r0] + vld1.32 {d7[0]},[r1,:32], r2 + vld1.32 {d7[1]},[r1,:32], r2 + \macs q10, d1, d7 + pld [r1] + vshl.s16 q1, q1, q9 + vqmovun.s16 d2, q1 + vshl.s16 q10, q10, q9 + vqmovun.s16 d4, q10 + vmov q10, q8 + vst1.32 {d2[0]},[r6,:32], r2 + vst1.32 {d2[1]},[r6,:32], r2 + vmov q1, q8 + vst1.32 {d4[0]},[r6,:32], r2 + vst1.32 {d4[1]},[r6,:32], r2 + bne 1b + pop {r4-r6, pc} +2: vshl.s16 q1, q1, q9 + vqmovun.s16 d2, q1 + vst1.32 {d2[0]},[r6,:32], r2 + vst1.32 {d2[1]},[r6,:32], r2 + pop {r4-r6, pc} +.endm + +.macro biweight_func w +function ff_biweight_h264_pixels_\w\()_neon, export=1 + push {r4-r6, lr} + ldr r12, [sp, #16] + add r4, sp, #20 + ldm r4, {r4-r6} + lsr lr, r4, #31 + add r6, r6, #1 + eors lr, lr, r5, lsr #30 + orr r6, r6, #1 + vdup.16 q9, r12 + lsl r6, r6, r12 + vmvn q9, q9 + vdup.16 q8, r6 + mov r6, r0 + beq 10f + subs lr, lr, #1 + beq 20f + subs lr, lr, #1 + beq 30f + b 40f +10: biweight_\w vmlal.u8, vmlal.u8 +20: rsb r4, r4, #0 + biweight_\w vmlal.u8, vmlsl.u8 +30: rsb r4, r4, #0 + rsb r5, r5, #0 + biweight_\w vmlsl.u8, vmlsl.u8 +40: rsb r5, r5, #0 + biweight_\w vmlsl.u8, vmlal.u8 +endfunc +.endm + + biweight_func 16 + biweight_func 8 + biweight_func 4 + +@ Weighted prediction + +.macro weight_16 add + vdup.8 d0, r12 +1: subs r2, r2, #2 + vld1.8 {d20-d21},[r0,:128], r1 + vmull.u8 q2, d0, d20 + pld [r0] + vmull.u8 q3, d0, d21 + vld1.8 {d28-d29},[r0,:128], r1 + vmull.u8 q12, d0, d28 + pld [r0] + vmull.u8 q13, d0, d29 + \add q2, q8, q2 + vrshl.s16 q2, q2, q9 + \add q3, q8, q3 + vrshl.s16 q3, q3, q9 + vqmovun.s16 d4, q2 + vqmovun.s16 d5, q3 + \add q12, q8, q12 + vrshl.s16 q12, q12, q9 + \add q13, q8, q13 + vrshl.s16 q13, q13, q9 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vst1.8 {d4- d5}, [r4,:128], r1 + vst1.8 {d24-d25},[r4,:128], r1 + bne 1b + pop {r4, pc} +.endm + +.macro weight_8 add + vdup.8 d0, r12 +1: subs r2, r2, #2 + vld1.8 {d4},[r0,:64], r1 + vmull.u8 q1, d0, d4 + pld [r0] + vld1.8 {d6},[r0,:64], r1 + vmull.u8 q10, d0, d6 + \add q1, q8, q1 + pld [r0] + vrshl.s16 q1, q1, q9 + vqmovun.s16 d2, q1 + \add q10, q8, q10 + vrshl.s16 q10, q10, q9 + vqmovun.s16 d4, q10 + vst1.8 {d2},[r4,:64], r1 + vst1.8 {d4},[r4,:64], r1 + bne 1b + pop {r4, pc} +.endm + +.macro weight_4 add + vdup.8 d0, r12 + vmov q1, q8 + vmov q10, q8 +1: subs r2, r2, #4 + vld1.32 {d4[0]},[r0,:32], r1 + vld1.32 {d4[1]},[r0,:32], r1 + vmull.u8 q1, d0, d4 + pld [r0] + blt 2f + vld1.32 {d6[0]},[r0,:32], r1 + vld1.32 {d6[1]},[r0,:32], r1 + vmull.u8 q10, d0, d6 + pld [r0] + \add q1, q8, q1 + vrshl.s16 q1, q1, q9 + vqmovun.s16 d2, q1 + \add q10, q8, q10 + vrshl.s16 q10, q10, q9 + vqmovun.s16 d4, q10 + vmov q10, q8 + vst1.32 {d2[0]},[r4,:32], r1 + vst1.32 {d2[1]},[r4,:32], r1 + vmov q1, q8 + vst1.32 {d4[0]},[r4,:32], r1 + vst1.32 {d4[1]},[r4,:32], r1 + bne 1b + pop {r4, pc} +2: \add q1, q8, q1 + vrshl.s16 q1, q1, q9 + vqmovun.s16 d2, q1 + vst1.32 {d2[0]},[r4,:32], r1 + vst1.32 {d2[1]},[r4,:32], r1 + pop {r4, pc} +.endm + +.macro weight_func w +function ff_weight_h264_pixels_\w\()_neon, export=1 + push {r4, lr} + ldr r12, [sp, #8] + ldr r4, [sp, #12] + cmp r3, #1 + lsl r4, r4, r3 + vdup.16 q8, r4 + mov r4, r0 + ble 20f + rsb lr, r3, #1 + vdup.16 q9, lr + cmp r12, #0 + blt 
10f + weight_\w vhadd.s16 +10: rsb r12, r12, #0 + weight_\w vhsub.s16 +20: rsb lr, r3, #0 + vdup.16 q9, lr + cmp r12, #0 + blt 10f + weight_\w vadd.s16 +10: rsb r12, r12, #0 + weight_\w vsub.s16 +endfunc +.endm + + weight_func 16 + weight_func 8 + weight_func 4 diff --git a/ffmpeg/libavcodec/arm/h264idct_neon.S b/ffmpeg/libavcodec/arm/h264idct_neon.S new file mode 100644 index 0000000..fa5b90c --- /dev/null +++ b/ffmpeg/libavcodec/arm/h264idct_neon.S @@ -0,0 +1,413 @@ +/* + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_h264_idct_add_neon, export=1 + vld1.64 {d0-d3}, [r1,:128] + vmov.i16 q15, #0 + + vswp d1, d2 + vst1.16 {q15}, [r1,:128]! + vadd.i16 d4, d0, d1 + vst1.16 {q15}, [r1,:128]! + vshr.s16 q8, q1, #1 + vsub.i16 d5, d0, d1 + vadd.i16 d6, d2, d17 + vsub.i16 d7, d16, d3 + vadd.i16 q0, q2, q3 + vsub.i16 q1, q2, q3 + + vtrn.16 d0, d1 + vtrn.16 d3, d2 + vtrn.32 d0, d3 + vtrn.32 d1, d2 + + vadd.i16 d4, d0, d3 + vld1.32 {d18[0]}, [r0,:32], r2 + vswp d1, d3 + vshr.s16 q8, q1, #1 + vld1.32 {d19[1]}, [r0,:32], r2 + vsub.i16 d5, d0, d1 + vld1.32 {d18[1]}, [r0,:32], r2 + vadd.i16 d6, d16, d3 + vld1.32 {d19[0]}, [r0,:32], r2 + vsub.i16 d7, d2, d17 + sub r0, r0, r2, lsl #2 + vadd.i16 q0, q2, q3 + vsub.i16 q1, q2, q3 + + vrshr.s16 q0, q0, #6 + vrshr.s16 q1, q1, #6 + + vaddw.u8 q0, q0, d18 + vaddw.u8 q1, q1, d19 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + + vst1.32 {d0[0]}, [r0,:32], r2 + vst1.32 {d1[1]}, [r0,:32], r2 + vst1.32 {d0[1]}, [r0,:32], r2 + vst1.32 {d1[0]}, [r0,:32], r2 + + sub r1, r1, #32 + bx lr +endfunc + +function ff_h264_idct_dc_add_neon, export=1 + mov r3, #0 + vld1.16 {d2[],d3[]}, [r1,:16] + strh r3, [r1] + vrshr.s16 q1, q1, #6 + vld1.32 {d0[0]}, [r0,:32], r2 + vld1.32 {d0[1]}, [r0,:32], r2 + vaddw.u8 q2, q1, d0 + vld1.32 {d1[0]}, [r0,:32], r2 + vld1.32 {d1[1]}, [r0,:32], r2 + vaddw.u8 q1, q1, d1 + vqmovun.s16 d0, q2 + vqmovun.s16 d1, q1 + sub r0, r0, r2, lsl #2 + vst1.32 {d0[0]}, [r0,:32], r2 + vst1.32 {d0[1]}, [r0,:32], r2 + vst1.32 {d1[0]}, [r0,:32], r2 + vst1.32 {d1[1]}, [r0,:32], r2 + bx lr +endfunc + +function ff_h264_idct_add16_neon, export=1 + push {r4-r8,lr} + mov r4, r0 + mov r5, r1 + mov r1, r2 + mov r2, r3 + ldr r6, [sp, #24] + movrel r7, scan8 + mov ip, #16 +1: ldrb r8, [r7], #1 + ldr r0, [r5], #4 + ldrb r8, [r6, r8] + subs r8, r8, #1 + blt 2f + ldrsh lr, [r1] + add r0, r0, r4 + it ne + movne lr, #0 + cmp lr, #0 + ite ne + adrne lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB + adreq lr, ff_h264_idct_add_neon + CONFIG_THUMB + blx lr +2: subs ip, ip, #1 + add r1, r1, #32 + bne 1b + pop {r4-r8,pc} +endfunc + +function ff_h264_idct_add16intra_neon, export=1 + push {r4-r8,lr} + mov r4, r0 + mov r5, r1 + mov r1, r2 + mov r2, r3 + ldr r6, [sp, #24] + movrel r7, scan8 + mov 
ip, #16 +1: ldrb r8, [r7], #1 + ldr r0, [r5], #4 + ldrb r8, [r6, r8] + add r0, r0, r4 + cmp r8, #0 + ldrsh r8, [r1] + iteet ne + adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB + adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB + cmpeq r8, #0 + blxne lr + subs ip, ip, #1 + add r1, r1, #32 + bne 1b + pop {r4-r8,pc} +endfunc + +function ff_h264_idct_add8_neon, export=1 + push {r4-r10,lr} + ldm r0, {r4,r9} + add r5, r1, #16*4 + add r1, r2, #16*32 + mov r2, r3 + mov r10, r1 + ldr r6, [sp, #32] + movrel r7, scan8+16 + mov r12, #0 +1: ldrb r8, [r7, r12] + ldr r0, [r5, r12, lsl #2] + ldrb r8, [r6, r8] + add r0, r0, r4 + add r1, r10, r12, lsl #5 + cmp r8, #0 + ldrsh r8, [r1] + iteet ne + adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB + adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB + cmpeq r8, #0 + blxne lr + add r12, r12, #1 + cmp r12, #4 + itt eq + moveq r12, #16 + moveq r4, r9 + cmp r12, #20 + blt 1b + pop {r4-r10,pc} +endfunc + +.macro idct8x8_cols pass + .if \pass == 0 + qa .req q2 + qb .req q14 + vshr.s16 q2, q10, #1 + vadd.i16 q0, q8, q12 + vld1.16 {q14-q15},[r1,:128] + vst1.16 {q7}, [r1,:128]! + vst1.16 {q7}, [r1,:128]! + vsub.i16 q1, q8, q12 + vshr.s16 q3, q14, #1 + vsub.i16 q2, q2, q14 + vadd.i16 q3, q3, q10 + .else + qa .req q14 + qb .req q2 + vtrn.32 q8, q10 + vtrn.16 q12, q13 + vtrn.32 q9, q11 + vtrn.32 q12, q2 + vtrn.32 q13, q15 + vswp d21, d4 + vshr.s16 q14, q10, #1 + vswp d17, d24 + vshr.s16 q3, q2, #1 + vswp d19, d26 + vadd.i16 q0, q8, q12 + vswp d23, d30 + vsub.i16 q1, q8, q12 + vsub.i16 q14, q14, q2 + vadd.i16 q3, q3, q10 + .endif + vadd.i16 q10, q1, qa + vsub.i16 q12, q1, qa + vadd.i16 q8, q0, q3 + vsub.i16 qb, q0, q3 + vsub.i16 q0, q13, q11 + vadd.i16 q1, q15, q9 + vsub.i16 qa, q15, q9 + vadd.i16 q3, q13, q11 + vsub.i16 q0, q0, q15 + vsub.i16 q1, q1, q11 + vadd.i16 qa, qa, q13 + vadd.i16 q3, q3, q9 + vshr.s16 q9, q9, #1 + vshr.s16 q11, q11, #1 + vshr.s16 q13, q13, #1 + vshr.s16 q15, q15, #1 + vsub.i16 q0, q0, q15 + vsub.i16 q1, q1, q11 + vadd.i16 qa, qa, q13 + vadd.i16 q3, q3, q9 + vshr.s16 q9, q0, #2 + vshr.s16 q11, q1, #2 + vshr.s16 q13, qa, #2 + vshr.s16 q15, q3, #2 + vsub.i16 q3, q3, q9 + vsub.i16 qa, q11, qa + vadd.i16 q1, q1, q13 + vadd.i16 q0, q0, q15 + .if \pass == 0 + vsub.i16 q15, q8, q3 + vadd.i16 q8, q8, q3 + vadd.i16 q9, q10, q2 + vsub.i16 q2, q10, q2 + vtrn.16 q8, q9 + vadd.i16 q10, q12, q1 + vtrn.16 q2, q15 + vadd.i16 q11, q14, q0 + vsub.i16 q13, q12, q1 + vtrn.16 q10, q11 + vsub.i16 q12, q14, q0 + .else + vsub.i16 q15, q8, q3 + vadd.i16 q8, q8, q3 + vadd.i16 q9, q10, q14 + vsub.i16 q14, q10, q14 + vadd.i16 q10, q12, q1 + vsub.i16 q13, q12, q1 + vadd.i16 q11, q2, q0 + vsub.i16 q12, q2, q0 + .endif + .unreq qa + .unreq qb +.endm + +function ff_h264_idct8_add_neon, export=1 + vmov.i16 q7, #0 + vld1.16 {q8-q9}, [r1,:128] + vst1.16 {q7}, [r1,:128]! + vst1.16 {q7}, [r1,:128]! + vld1.16 {q10-q11},[r1,:128] + vst1.16 {q7}, [r1,:128]! + vst1.16 {q7}, [r1,:128]! + vld1.16 {q12-q13},[r1,:128] + vst1.16 {q7}, [r1,:128]! + vst1.16 {q7}, [r1,:128]! 
+ + idct8x8_cols 0 + idct8x8_cols 1 + + mov r3, r0 + vrshr.s16 q8, q8, #6 + vld1.8 {d0}, [r0,:64], r2 + vrshr.s16 q9, q9, #6 + vld1.8 {d1}, [r0,:64], r2 + vrshr.s16 q10, q10, #6 + vld1.8 {d2}, [r0,:64], r2 + vrshr.s16 q11, q11, #6 + vld1.8 {d3}, [r0,:64], r2 + vrshr.s16 q12, q12, #6 + vld1.8 {d4}, [r0,:64], r2 + vrshr.s16 q13, q13, #6 + vld1.8 {d5}, [r0,:64], r2 + vrshr.s16 q14, q14, #6 + vld1.8 {d6}, [r0,:64], r2 + vrshr.s16 q15, q15, #6 + vld1.8 {d7}, [r0,:64], r2 + vaddw.u8 q8, q8, d0 + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vqmovun.s16 d0, q8 + vaddw.u8 q11, q11, d3 + vqmovun.s16 d1, q9 + vaddw.u8 q12, q12, d4 + vqmovun.s16 d2, q10 + vst1.8 {d0}, [r3,:64], r2 + vaddw.u8 q13, q13, d5 + vqmovun.s16 d3, q11 + vst1.8 {d1}, [r3,:64], r2 + vaddw.u8 q14, q14, d6 + vqmovun.s16 d4, q12 + vst1.8 {d2}, [r3,:64], r2 + vaddw.u8 q15, q15, d7 + vqmovun.s16 d5, q13 + vst1.8 {d3}, [r3,:64], r2 + vqmovun.s16 d6, q14 + vqmovun.s16 d7, q15 + vst1.8 {d4}, [r3,:64], r2 + vst1.8 {d5}, [r3,:64], r2 + vst1.8 {d6}, [r3,:64], r2 + vst1.8 {d7}, [r3,:64], r2 + + sub r1, r1, #128 + bx lr +endfunc + +function ff_h264_idct8_dc_add_neon, export=1 + mov r3, #0 + vld1.16 {d30[],d31[]},[r1,:16] + strh r3, [r1] + vld1.32 {d0}, [r0,:64], r2 + vrshr.s16 q15, q15, #6 + vld1.32 {d1}, [r0,:64], r2 + vld1.32 {d2}, [r0,:64], r2 + vaddw.u8 q8, q15, d0 + vld1.32 {d3}, [r0,:64], r2 + vaddw.u8 q9, q15, d1 + vld1.32 {d4}, [r0,:64], r2 + vaddw.u8 q10, q15, d2 + vld1.32 {d5}, [r0,:64], r2 + vaddw.u8 q11, q15, d3 + vld1.32 {d6}, [r0,:64], r2 + vaddw.u8 q12, q15, d4 + vld1.32 {d7}, [r0,:64], r2 + vaddw.u8 q13, q15, d5 + vaddw.u8 q14, q15, d6 + vaddw.u8 q15, q15, d7 + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + sub r0, r0, r2, lsl #3 + vst1.32 {d0}, [r0,:64], r2 + vqmovun.s16 d4, q12 + vst1.32 {d1}, [r0,:64], r2 + vqmovun.s16 d5, q13 + vst1.32 {d2}, [r0,:64], r2 + vqmovun.s16 d6, q14 + vst1.32 {d3}, [r0,:64], r2 + vqmovun.s16 d7, q15 + vst1.32 {d4}, [r0,:64], r2 + vst1.32 {d5}, [r0,:64], r2 + vst1.32 {d6}, [r0,:64], r2 + vst1.32 {d7}, [r0,:64], r2 + bx lr +endfunc + +function ff_h264_idct8_add4_neon, export=1 + push {r4-r8,lr} + mov r4, r0 + mov r5, r1 + mov r1, r2 + mov r2, r3 + ldr r6, [sp, #24] + movrel r7, scan8 + mov r12, #16 +1: ldrb r8, [r7], #4 + ldr r0, [r5], #16 + ldrb r8, [r6, r8] + subs r8, r8, #1 + blt 2f + ldrsh lr, [r1] + add r0, r0, r4 + it ne + movne lr, #0 + cmp lr, #0 + ite ne + adrne lr, ff_h264_idct8_dc_add_neon + CONFIG_THUMB + adreq lr, ff_h264_idct8_add_neon + CONFIG_THUMB + blx lr +2: subs r12, r12, #4 + add r1, r1, #128 + bne 1b + pop {r4-r8,pc} +endfunc + +const scan8 + .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8 + .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8 + .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8 + .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8 + .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8 + .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8 + .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8 + .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8 + .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8 + .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8 + .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8 + .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8 +endconst diff --git a/ffmpeg/libavcodec/arm/h264pred_init_arm.c b/ffmpeg/libavcodec/arm/h264pred_init_arm.c new file mode 100644 index 0000000..5ec39ce --- /dev/null +++ b/ffmpeg/libavcodec/arm/h264pred_init_arm.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "libavutil/attributes.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/h264pred.h" + +void ff_pred16x16_vert_neon(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_hor_neon(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_plane_neon(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_dc_neon(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_128_dc_neon(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_left_dc_neon(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_top_dc_neon(uint8_t *src, ptrdiff_t stride); + +void ff_pred8x8_vert_neon(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_hor_neon(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_plane_neon(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_dc_neon(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_128_dc_neon(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_left_dc_neon(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_top_dc_neon(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_l0t_dc_neon(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride); + +static av_cold void ff_h264_pred_init_neon(H264PredContext *h, int codec_id, + const int bit_depth, + const int chroma_format_idc) +{ +#if HAVE_NEON + const int high_depth = bit_depth > 8; + + if (high_depth) + return; + if(chroma_format_idc == 1){ + h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon; + h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_hor_neon; + if (codec_id != AV_CODEC_ID_VP8) + h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon; + h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon; + if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP8) { + h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_neon; + h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon; + h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon; + h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon; + h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon; + h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon; + h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon; + } + } + + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon; + h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon; + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon; + h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon; + h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon; + h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon; + if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP8) + h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon; +#endif // HAVE_NEON +} + +av_cold void 
ff_h264_pred_init_arm(H264PredContext *h, int codec_id, + int bit_depth, const int chroma_format_idc) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) + ff_h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc); +} diff --git a/ffmpeg/libavcodec/arm/h264pred_neon.S b/ffmpeg/libavcodec/arm/h264pred_neon.S new file mode 100644 index 0000000..4dc47ba --- /dev/null +++ b/ffmpeg/libavcodec/arm/h264pred_neon.S @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + + .macro ldcol.8 rd, rs, rt, n=8, hi=0 +.if \n == 8 || \hi == 0 + vld1.8 {\rd[0]}, [\rs], \rt + vld1.8 {\rd[1]}, [\rs], \rt + vld1.8 {\rd[2]}, [\rs], \rt + vld1.8 {\rd[3]}, [\rs], \rt +.endif +.if \n == 8 || \hi == 1 + vld1.8 {\rd[4]}, [\rs], \rt + vld1.8 {\rd[5]}, [\rs], \rt + vld1.8 {\rd[6]}, [\rs], \rt + vld1.8 {\rd[7]}, [\rs], \rt +.endif + .endm + + .macro add16x8 dq, dl, dh, rl, rh + vaddl.u8 \dq, \rl, \rh + vadd.u16 \dl, \dl, \dh + vpadd.u16 \dl, \dl, \dl + vpadd.u16 \dl, \dl, \dl + .endm + +function ff_pred16x16_128_dc_neon, export=1 + vmov.i8 q0, #128 + b .L_pred16x16_dc_end +endfunc + +function ff_pred16x16_top_dc_neon, export=1 + sub r2, r0, r1 + vld1.8 {q0}, [r2,:128] + add16x8 q0, d0, d1, d0, d1 + vrshrn.u16 d0, q0, #4 + vdup.8 q0, d0[0] + b .L_pred16x16_dc_end +endfunc + +function ff_pred16x16_left_dc_neon, export=1 + sub r2, r0, #1 + ldcol.8 d0, r2, r1 + ldcol.8 d1, r2, r1 + add16x8 q0, d0, d1, d0, d1 + vrshrn.u16 d0, q0, #4 + vdup.8 q0, d0[0] + b .L_pred16x16_dc_end +endfunc + +function ff_pred16x16_dc_neon, export=1 + sub r2, r0, r1 + vld1.8 {q0}, [r2,:128] + sub r2, r0, #1 + ldcol.8 d2, r2, r1 + ldcol.8 d3, r2, r1 + vaddl.u8 q0, d0, d1 + vaddl.u8 q1, d2, d3 + vadd.u16 q0, q0, q1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0, d0 + vpadd.u16 d0, d0, d0 + vrshrn.u16 d0, q0, #5 + vdup.8 q0, d0[0] +.L_pred16x16_dc_end: + mov r3, #8 +6: vst1.8 {q0}, [r0,:128], r1 + vst1.8 {q0}, [r0,:128], r1 + subs r3, r3, #1 + bne 6b + bx lr +endfunc + +function ff_pred16x16_hor_neon, export=1 + sub r2, r0, #1 + mov r3, #16 +1: vld1.8 {d0[],d1[]},[r2], r1 + vst1.8 {q0}, [r0,:128], r1 + subs r3, r3, #1 + bne 1b + bx lr +endfunc + +function ff_pred16x16_vert_neon, export=1 + sub r0, r0, r1 + vld1.8 {q0}, [r0,:128], r1 + mov r3, #8 +1: vst1.8 {q0}, [r0,:128], r1 + vst1.8 {q0}, [r0,:128], r1 + subs r3, r3, #1 + bne 1b + bx lr +endfunc + +function ff_pred16x16_plane_neon, export=1 + sub r3, r0, r1 + add r2, r3, #8 + sub r3, r3, #1 + vld1.8 {d0}, [r3] + vld1.8 {d2}, [r2,:64], r1 + ldcol.8 d1, r3, r1 + add r3, r3, r1 + ldcol.8 d3, r3, r1 + vrev64.8 q0, q0 + vaddl.u8 q8, d2, d3 + vsubl.u8 q2, d2, d0 + vsubl.u8 q3, d3, d1 + movrel r3, p16weight + vld1.8 {q0}, [r3,:128] + vmul.s16 q2, q2, q0 + vmul.s16 q3, q3, q0 + 
vadd.i16 d4, d4, d5 + vadd.i16 d5, d6, d7 + vpadd.i16 d4, d4, d5 + vpadd.i16 d4, d4, d4 + vshll.s16 q3, d4, #2 + vaddw.s16 q2, q3, d4 + vrshrn.s32 d4, q2, #6 + mov r3, #0 + vtrn.16 d4, d5 + vadd.i16 d2, d4, d5 + vshl.i16 d3, d2, #3 + vrev64.16 d16, d17 + vsub.i16 d3, d3, d2 + vadd.i16 d16, d16, d0 + vshl.i16 d2, d16, #4 + vsub.i16 d2, d2, d3 + vshl.i16 d3, d4, #4 + vext.16 q0, q0, q0, #7 + vsub.i16 d6, d5, d3 + vmov.16 d0[0], r3 + vmul.i16 q0, q0, d4[0] + vdup.16 q1, d2[0] + vdup.16 q2, d4[0] + vdup.16 q3, d6[0] + vshl.i16 q2, q2, #3 + vadd.i16 q1, q1, q0 + vadd.i16 q3, q3, q2 + mov r3, #16 +1: + vqshrun.s16 d0, q1, #5 + vadd.i16 q1, q1, q2 + vqshrun.s16 d1, q1, #5 + vadd.i16 q1, q1, q3 + vst1.8 {q0}, [r0,:128], r1 + subs r3, r3, #1 + bne 1b + bx lr +endfunc + +const p16weight, align=4 + .short 1,2,3,4,5,6,7,8 +endconst + +function ff_pred8x8_hor_neon, export=1 + sub r2, r0, #1 + mov r3, #8 +1: vld1.8 {d0[]}, [r2], r1 + vst1.8 {d0}, [r0,:64], r1 + subs r3, r3, #1 + bne 1b + bx lr +endfunc + +function ff_pred8x8_vert_neon, export=1 + sub r0, r0, r1 + vld1.8 {d0}, [r0,:64], r1 + mov r3, #4 +1: vst1.8 {d0}, [r0,:64], r1 + vst1.8 {d0}, [r0,:64], r1 + subs r3, r3, #1 + bne 1b + bx lr +endfunc + +function ff_pred8x8_plane_neon, export=1 + sub r3, r0, r1 + add r2, r3, #4 + sub r3, r3, #1 + vld1.32 {d0[0]}, [r3] + vld1.32 {d2[0]}, [r2,:32], r1 + ldcol.8 d0, r3, r1, 4, hi=1 + add r3, r3, r1 + ldcol.8 d3, r3, r1, 4 + vaddl.u8 q8, d2, d3 + vrev32.8 d0, d0 + vtrn.32 d2, d3 + vsubl.u8 q2, d2, d0 + movrel r3, p16weight + vld1.16 {q0}, [r3,:128] + vmul.s16 d4, d4, d0 + vmul.s16 d5, d5, d0 + vpadd.i16 d4, d4, d5 + vpaddl.s16 d4, d4 + vshl.i32 d5, d4, #4 + vadd.s32 d4, d4, d5 + vrshrn.s32 d4, q2, #5 + mov r3, #0 + vtrn.16 d4, d5 + vadd.i16 d2, d4, d5 + vshl.i16 d3, d2, #2 + vrev64.16 d16, d16 + vsub.i16 d3, d3, d2 + vadd.i16 d16, d16, d0 + vshl.i16 d2, d16, #4 + vsub.i16 d2, d2, d3 + vshl.i16 d3, d4, #3 + vext.16 q0, q0, q0, #7 + vsub.i16 d6, d5, d3 + vmov.16 d0[0], r3 + vmul.i16 q0, q0, d4[0] + vdup.16 q1, d2[0] + vdup.16 q2, d4[0] + vdup.16 q3, d6[0] + vshl.i16 q2, q2, #3 + vadd.i16 q1, q1, q0 + vadd.i16 q3, q3, q2 + mov r3, #8 +1: + vqshrun.s16 d0, q1, #5 + vadd.i16 q1, q1, q3 + vst1.8 {d0}, [r0,:64], r1 + subs r3, r3, #1 + bne 1b + bx lr +endfunc + +function ff_pred8x8_128_dc_neon, export=1 + vmov.i8 q0, #128 + b .L_pred8x8_dc_end +endfunc + +function ff_pred8x8_top_dc_neon, export=1 + sub r2, r0, r1 + vld1.8 {d0}, [r2,:64] + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0, d0 + vrshrn.u16 d0, q0, #2 + vdup.8 d1, d0[1] + vdup.8 d0, d0[0] + vtrn.32 d0, d1 + b .L_pred8x8_dc_end +endfunc + +function ff_pred8x8_left_dc_neon, export=1 + sub r2, r0, #1 + ldcol.8 d0, r2, r1 + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0, d0 + vrshrn.u16 d0, q0, #2 + vdup.8 d1, d0[1] + vdup.8 d0, d0[0] + b .L_pred8x8_dc_end +endfunc + +function ff_pred8x8_dc_neon, export=1 + sub r2, r0, r1 + vld1.8 {d0}, [r2,:64] + sub r2, r0, #1 + ldcol.8 d1, r2, r1 + vtrn.32 d0, d1 + vpaddl.u8 q0, q0 + vpadd.u16 d0, d0, d1 + vpadd.u16 d1, d0, d0 + vrshrn.u16 d2, q0, #3 + vrshrn.u16 d3, q0, #2 + vdup.8 d0, d2[4] + vdup.8 d1, d3[3] + vdup.8 d4, d3[2] + vdup.8 d5, d2[5] + vtrn.32 q0, q2 +.L_pred8x8_dc_end: + mov r3, #4 + add r2, r0, r1, lsl #2 +6: vst1.8 {d0}, [r0,:64], r1 + vst1.8 {d1}, [r2,:64], r1 + subs r3, r3, #1 + bne 6b + bx lr +endfunc + +function ff_pred8x8_l0t_dc_neon, export=1 + sub r2, r0, r1 + vld1.8 {d0}, [r2,:64] + sub r2, r0, #1 + ldcol.8 d1, r2, r1, 4 + vtrn.32 d0, d1 + vpaddl.u8 q0, q0 + vpadd.u16 d0, d0, d1 + vpadd.u16 d1, d0, d0 + vrshrn.u16 d2, 
q0, #3 + vrshrn.u16 d3, q0, #2 + vdup.8 d0, d2[4] + vdup.8 d1, d3[0] + vdup.8 q2, d3[2] + vtrn.32 q0, q2 + b .L_pred8x8_dc_end +endfunc + +function ff_pred8x8_l00_dc_neon, export=1 + sub r2, r0, #1 + ldcol.8 d0, r2, r1, 4 + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0, d0 + vrshrn.u16 d0, q0, #2 + vmov.i8 d1, #128 + vdup.8 d0, d0[0] + b .L_pred8x8_dc_end +endfunc + +function ff_pred8x8_0lt_dc_neon, export=1 + sub r2, r0, r1 + vld1.8 {d0}, [r2,:64] + add r2, r0, r1, lsl #2 + sub r2, r2, #1 + ldcol.8 d1, r2, r1, 4, hi=1 + vtrn.32 d0, d1 + vpaddl.u8 q0, q0 + vpadd.u16 d0, d0, d1 + vpadd.u16 d1, d0, d0 + vrshrn.u16 d3, q0, #2 + vrshrn.u16 d2, q0, #3 + vdup.8 d0, d3[0] + vdup.8 d1, d3[3] + vdup.8 d4, d3[2] + vdup.8 d5, d2[5] + vtrn.32 q0, q2 + b .L_pred8x8_dc_end +endfunc + +function ff_pred8x8_0l0_dc_neon, export=1 + add r2, r0, r1, lsl #2 + sub r2, r2, #1 + ldcol.8 d1, r2, r1, 4 + vpaddl.u8 d2, d1 + vpadd.u16 d2, d2, d2 + vrshrn.u16 d1, q1, #2 + vmov.i8 d0, #128 + vdup.8 d1, d1[0] + b .L_pred8x8_dc_end +endfunc diff --git a/ffmpeg/libavcodec/arm/h264qpel_init_arm.c b/ffmpeg/libavcodec/arm/h264qpel_init_arm.c new file mode 100644 index 0000000..eaa1324 --- /dev/null +++ b/ffmpeg/libavcodec/arm/h264qpel_init_arm.c @@ -0,0 +1,171 @@ +/* + * ARM NEON optimised DSP functions + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/h264qpel.h" + +void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t); + +void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_put_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t); + +void ff_avg_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t); +void 
ff_avg_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t); + +void ff_avg_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t); +void ff_avg_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t); + +av_cold void ff_h264qpel_init_arm(H264QpelContext *c, int bit_depth) +{ + const int high_bit_depth = bit_depth > 8; + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags) && !high_bit_depth) { + c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon; + c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon; + c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon; + c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon; + c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon; + c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon; + c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon; + c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon; + c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon; + c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon; + c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon; + c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon; + c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon; + c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon; + c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon; + c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon; + + c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon; + c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon; + c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon; + c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon; + c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon; + c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon; + c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon; + c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon; + c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon; + c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon; + c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon; + c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon; + c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon; + c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon; + c->put_h264_qpel_pixels_tab[1][14] = 
ff_put_h264_qpel8_mc23_neon; + c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon; + + c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon; + c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon; + c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon; + c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon; + c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon; + c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon; + c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon; + c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon; + c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon; + c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon; + c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon; + c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon; + c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon; + c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon; + c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon; + c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon; + + c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon; + c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon; + c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon; + c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon; + c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon; + c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon; + c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon; + c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon; + c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon; + c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon; + c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon; + c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon; + c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon; + c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon; + c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon; + c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon; + } +} diff --git a/ffmpeg/libavcodec/arm/h264qpel_neon.S b/ffmpeg/libavcodec/arm/h264qpel_neon.S new file mode 100644 index 0000000..21336c6 --- /dev/null +++ b/ffmpeg/libavcodec/arm/h264qpel_neon.S @@ -0,0 +1,955 @@ +/* + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" +#include "neon.S" + + /* H.264 qpel MC */ + +.macro lowpass_const r + movw \r, #5 + movt \r, #20 + vmov.32 d6[0], \r +.endm + +.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 + .if \narrow + t0 .req q0 + t1 .req q8 + .else + t0 .req \d0 + t1 .req \d1 + .endif + vext.8 d2, \r0, \r1, #2 + vext.8 d3, \r0, \r1, #3 + vaddl.u8 q1, d2, d3 + vext.8 d4, \r0, \r1, #1 + vext.8 d5, \r0, \r1, #4 + vaddl.u8 q2, d4, d5 + vext.8 d30, \r0, \r1, #5 + vaddl.u8 t0, \r0, d30 + vext.8 d18, \r2, \r3, #2 + vmla.i16 t0, q1, d6[1] + vext.8 d19, \r2, \r3, #3 + vaddl.u8 q9, d18, d19 + vext.8 d20, \r2, \r3, #1 + vmls.i16 t0, q2, d6[0] + vext.8 d21, \r2, \r3, #4 + vaddl.u8 q10, d20, d21 + vext.8 d31, \r2, \r3, #5 + vaddl.u8 t1, \r2, d31 + vmla.i16 t1, q9, d6[1] + vmls.i16 t1, q10, d6[0] + .if \narrow + vqrshrun.s16 \d0, t0, #5 + vqrshrun.s16 \d1, t1, #5 + .endif + .unreq t0 + .unreq t1 +.endm + +.macro lowpass_8_1 r0, r1, d0, narrow=1 + .if \narrow + t0 .req q0 + .else + t0 .req \d0 + .endif + vext.8 d2, \r0, \r1, #2 + vext.8 d3, \r0, \r1, #3 + vaddl.u8 q1, d2, d3 + vext.8 d4, \r0, \r1, #1 + vext.8 d5, \r0, \r1, #4 + vaddl.u8 q2, d4, d5 + vext.8 d30, \r0, \r1, #5 + vaddl.u8 t0, \r0, d30 + vmla.i16 t0, q1, d6[1] + vmls.i16 t0, q2, d6[0] + .if \narrow + vqrshrun.s16 \d0, t0, #5 + .endif + .unreq t0 +.endm + +.macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d + vext.16 q1, \r0, \r1, #2 + vext.16 q0, \r0, \r1, #3 + vaddl.s16 q9, d2, d0 + vext.16 q2, \r0, \r1, #1 + vaddl.s16 q1, d3, d1 + vext.16 q3, \r0, \r1, #4 + vaddl.s16 q10, d4, d6 + vext.16 \r1, \r0, \r1, #5 + vaddl.s16 q2, d5, d7 + vaddl.s16 q0, \h0, \h1 + vaddl.s16 q8, \l0, \l1 + + vshl.i32 q3, q9, #4 + vshl.i32 q9, q9, #2 + vshl.i32 q15, q10, #2 + vadd.i32 q9, q9, q3 + vadd.i32 q10, q10, q15 + + vshl.i32 q3, q1, #4 + vshl.i32 q1, q1, #2 + vshl.i32 q15, q2, #2 + vadd.i32 q1, q1, q3 + vadd.i32 q2, q2, q15 + + vadd.i32 q9, q9, q8 + vsub.i32 q9, q9, q10 + + vadd.i32 q1, q1, q0 + vsub.i32 q1, q1, q2 + + vrshrn.s32 d18, q9, #10 + vrshrn.s32 d19, q1, #10 + + vqmovun.s16 \d, q9 +.endm + +function put_h264_qpel16_h_lowpass_neon_packed + mov r4, lr + mov r12, #16 + mov r3, #8 + bl put_h264_qpel8_h_lowpass_neon + sub r1, r1, r2, lsl #4 + add r1, r1, #8 + mov r12, #16 + mov lr, r4 + b put_h264_qpel8_h_lowpass_neon +endfunc + +.macro h264_qpel_h_lowpass type +function \type\()_h264_qpel16_h_lowpass_neon + push {lr} + mov r12, #16 + bl \type\()_h264_qpel8_h_lowpass_neon + sub r0, r0, r3, lsl #4 + sub r1, r1, r2, lsl #4 + add r0, r0, #8 + add r1, r1, #8 + mov r12, #16 + pop {lr} +endfunc + +function \type\()_h264_qpel8_h_lowpass_neon +1: vld1.8 {d0, d1}, [r1], r2 + vld1.8 {d16,d17}, [r1], r2 + subs r12, r12, #2 + lowpass_8 d0, d1, d16, d17, d0, d16 + .ifc \type,avg + vld1.8 {d2}, [r0,:64], r3 + vrhadd.u8 d0, d0, d2 + vld1.8 {d3}, [r0,:64] + vrhadd.u8 d16, d16, d3 + sub r0, r0, r3 + .endif + vst1.8 {d0}, [r0,:64], r3 + vst1.8 {d16}, [r0,:64], r3 + bne 1b + bx lr +endfunc +.endm + + h264_qpel_h_lowpass put + h264_qpel_h_lowpass avg + +.macro h264_qpel_h_lowpass_l2 type +function \type\()_h264_qpel16_h_lowpass_l2_neon + push {lr} + mov r12, #16 + bl \type\()_h264_qpel8_h_lowpass_l2_neon + sub r0, r0, r2, lsl #4 + sub r1, r1, r2, lsl #4 + sub r3, r3, r2, lsl #4 + add r0, r0, #8 + add r1, r1, #8 + add r3, r3, #8 + mov r12, #16 + pop 
{lr} +endfunc + +function \type\()_h264_qpel8_h_lowpass_l2_neon +1: vld1.8 {d0, d1}, [r1], r2 + vld1.8 {d16,d17}, [r1], r2 + vld1.8 {d28}, [r3], r2 + vld1.8 {d29}, [r3], r2 + subs r12, r12, #2 + lowpass_8 d0, d1, d16, d17, d0, d1 + vrhadd.u8 q0, q0, q14 + .ifc \type,avg + vld1.8 {d2}, [r0,:64], r2 + vrhadd.u8 d0, d0, d2 + vld1.8 {d3}, [r0,:64] + vrhadd.u8 d1, d1, d3 + sub r0, r0, r2 + .endif + vst1.8 {d0}, [r0,:64], r2 + vst1.8 {d1}, [r0,:64], r2 + bne 1b + bx lr +endfunc +.endm + + h264_qpel_h_lowpass_l2 put + h264_qpel_h_lowpass_l2 avg + +function put_h264_qpel16_v_lowpass_neon_packed + mov r4, lr + mov r2, #8 + bl put_h264_qpel8_v_lowpass_neon + sub r1, r1, r3, lsl #2 + bl put_h264_qpel8_v_lowpass_neon + sub r1, r1, r3, lsl #4 + sub r1, r1, r3, lsl #2 + add r1, r1, #8 + bl put_h264_qpel8_v_lowpass_neon + sub r1, r1, r3, lsl #2 + mov lr, r4 + b put_h264_qpel8_v_lowpass_neon +endfunc + +.macro h264_qpel_v_lowpass type +function \type\()_h264_qpel16_v_lowpass_neon + mov r4, lr + bl \type\()_h264_qpel8_v_lowpass_neon + sub r1, r1, r3, lsl #2 + bl \type\()_h264_qpel8_v_lowpass_neon + sub r0, r0, r2, lsl #4 + add r0, r0, #8 + sub r1, r1, r3, lsl #4 + sub r1, r1, r3, lsl #2 + add r1, r1, #8 + bl \type\()_h264_qpel8_v_lowpass_neon + sub r1, r1, r3, lsl #2 + mov lr, r4 +endfunc + +function \type\()_h264_qpel8_v_lowpass_neon + vld1.8 {d8}, [r1], r3 + vld1.8 {d10}, [r1], r3 + vld1.8 {d12}, [r1], r3 + vld1.8 {d14}, [r1], r3 + vld1.8 {d22}, [r1], r3 + vld1.8 {d24}, [r1], r3 + vld1.8 {d26}, [r1], r3 + vld1.8 {d28}, [r1], r3 + vld1.8 {d9}, [r1], r3 + vld1.8 {d11}, [r1], r3 + vld1.8 {d13}, [r1], r3 + vld1.8 {d15}, [r1], r3 + vld1.8 {d23}, [r1] + + transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 + lowpass_8 d8, d9, d10, d11, d8, d10 + lowpass_8 d12, d13, d14, d15, d12, d14 + lowpass_8 d22, d23, d24, d25, d22, d24 + lowpass_8 d26, d27, d28, d29, d26, d28 + transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28 + + .ifc \type,avg + vld1.8 {d9}, [r0,:64], r2 + vrhadd.u8 d8, d8, d9 + vld1.8 {d11}, [r0,:64], r2 + vrhadd.u8 d10, d10, d11 + vld1.8 {d13}, [r0,:64], r2 + vrhadd.u8 d12, d12, d13 + vld1.8 {d15}, [r0,:64], r2 + vrhadd.u8 d14, d14, d15 + vld1.8 {d23}, [r0,:64], r2 + vrhadd.u8 d22, d22, d23 + vld1.8 {d25}, [r0,:64], r2 + vrhadd.u8 d24, d24, d25 + vld1.8 {d27}, [r0,:64], r2 + vrhadd.u8 d26, d26, d27 + vld1.8 {d29}, [r0,:64], r2 + vrhadd.u8 d28, d28, d29 + sub r0, r0, r2, lsl #3 + .endif + + vst1.8 {d8}, [r0,:64], r2 + vst1.8 {d10}, [r0,:64], r2 + vst1.8 {d12}, [r0,:64], r2 + vst1.8 {d14}, [r0,:64], r2 + vst1.8 {d22}, [r0,:64], r2 + vst1.8 {d24}, [r0,:64], r2 + vst1.8 {d26}, [r0,:64], r2 + vst1.8 {d28}, [r0,:64], r2 + + bx lr +endfunc +.endm + + h264_qpel_v_lowpass put + h264_qpel_v_lowpass avg + +.macro h264_qpel_v_lowpass_l2 type +function \type\()_h264_qpel16_v_lowpass_l2_neon + mov r4, lr + bl \type\()_h264_qpel8_v_lowpass_l2_neon + sub r1, r1, r3, lsl #2 + bl \type\()_h264_qpel8_v_lowpass_l2_neon + sub r0, r0, r3, lsl #4 + sub r12, r12, r2, lsl #4 + add r0, r0, #8 + add r12, r12, #8 + sub r1, r1, r3, lsl #4 + sub r1, r1, r3, lsl #2 + add r1, r1, #8 + bl \type\()_h264_qpel8_v_lowpass_l2_neon + sub r1, r1, r3, lsl #2 + mov lr, r4 +endfunc + +function \type\()_h264_qpel8_v_lowpass_l2_neon + vld1.8 {d8}, [r1], r3 + vld1.8 {d10}, [r1], r3 + vld1.8 {d12}, [r1], r3 + vld1.8 {d14}, [r1], r3 + vld1.8 {d22}, [r1], r3 + vld1.8 {d24}, [r1], r3 + vld1.8 {d26}, [r1], r3 + vld1.8 {d28}, [r1], r3 + vld1.8 {d9}, [r1], r3 + vld1.8 {d11}, [r1], r3 + vld1.8 {d13}, [r1], r3 + vld1.8 {d15}, [r1], r3 + vld1.8 {d23}, [r1] 
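+    @ all 13 source rows are now loaded (8 output rows plus the 5 extra rows the
+    @ 6-tap filter needs); the transpose lets the horizontal lowpass_8 macro run
+    @ the (1,-5,20,20,-5,1) filter with rounding >>5 vertically, and a second
+    @ transpose restores row order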
+ + transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 + lowpass_8 d8, d9, d10, d11, d8, d9 + lowpass_8 d12, d13, d14, d15, d12, d13 + lowpass_8 d22, d23, d24, d25, d22, d23 + lowpass_8 d26, d27, d28, d29, d26, d27 + transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27 + + vld1.8 {d0}, [r12], r2 + vld1.8 {d1}, [r12], r2 + vld1.8 {d2}, [r12], r2 + vld1.8 {d3}, [r12], r2 + vld1.8 {d4}, [r12], r2 + vrhadd.u8 q0, q0, q4 + vld1.8 {d5}, [r12], r2 + vrhadd.u8 q1, q1, q6 + vld1.8 {d10}, [r12], r2 + vrhadd.u8 q2, q2, q11 + vld1.8 {d11}, [r12], r2 + vrhadd.u8 q5, q5, q13 + + .ifc \type,avg + vld1.8 {d16}, [r0,:64], r3 + vrhadd.u8 d0, d0, d16 + vld1.8 {d17}, [r0,:64], r3 + vrhadd.u8 d1, d1, d17 + vld1.8 {d16}, [r0,:64], r3 + vrhadd.u8 d2, d2, d16 + vld1.8 {d17}, [r0,:64], r3 + vrhadd.u8 d3, d3, d17 + vld1.8 {d16}, [r0,:64], r3 + vrhadd.u8 d4, d4, d16 + vld1.8 {d17}, [r0,:64], r3 + vrhadd.u8 d5, d5, d17 + vld1.8 {d16}, [r0,:64], r3 + vrhadd.u8 d10, d10, d16 + vld1.8 {d17}, [r0,:64], r3 + vrhadd.u8 d11, d11, d17 + sub r0, r0, r3, lsl #3 + .endif + + vst1.8 {d0}, [r0,:64], r3 + vst1.8 {d1}, [r0,:64], r3 + vst1.8 {d2}, [r0,:64], r3 + vst1.8 {d3}, [r0,:64], r3 + vst1.8 {d4}, [r0,:64], r3 + vst1.8 {d5}, [r0,:64], r3 + vst1.8 {d10}, [r0,:64], r3 + vst1.8 {d11}, [r0,:64], r3 + + bx lr +endfunc +.endm + + h264_qpel_v_lowpass_l2 put + h264_qpel_v_lowpass_l2 avg + +function put_h264_qpel8_hv_lowpass_neon_top + lowpass_const r12 + mov r12, #12 +1: vld1.8 {d0, d1}, [r1], r3 + vld1.8 {d16,d17}, [r1], r3 + subs r12, r12, #2 + lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0 + vst1.8 {d22-d25}, [r4,:128]! + bne 1b + + vld1.8 {d0, d1}, [r1] + lowpass_8_1 d0, d1, q12, narrow=0 + + mov r12, #-16 + add r4, r4, r12 + vld1.8 {d30,d31}, [r4,:128], r12 + vld1.8 {d20,d21}, [r4,:128], r12 + vld1.8 {d18,d19}, [r4,:128], r12 + vld1.8 {d16,d17}, [r4,:128], r12 + vld1.8 {d14,d15}, [r4,:128], r12 + vld1.8 {d12,d13}, [r4,:128], r12 + vld1.8 {d10,d11}, [r4,:128], r12 + vld1.8 {d8, d9}, [r4,:128], r12 + vld1.8 {d6, d7}, [r4,:128], r12 + vld1.8 {d4, d5}, [r4,:128], r12 + vld1.8 {d2, d3}, [r4,:128], r12 + vld1.8 {d0, d1}, [r4,:128] + + swap4 d1, d3, d5, d7, d8, d10, d12, d14 + transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7 + + swap4 d17, d19, d21, d31, d24, d26, d28, d22 + transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11 + + vst1.8 {d30,d31}, [r4,:128]! + vst1.8 {d6, d7}, [r4,:128]! + vst1.8 {d20,d21}, [r4,:128]! + vst1.8 {d4, d5}, [r4,:128]! + vst1.8 {d18,d19}, [r4,:128]! + vst1.8 {d2, d3}, [r4,:128]! + vst1.8 {d16,d17}, [r4,:128]! 
+ vst1.8 {d0, d1}, [r4,:128] + + lowpass_8.16 q4, q12, d8, d9, d24, d25, d8 + lowpass_8.16 q5, q13, d10, d11, d26, d27, d9 + lowpass_8.16 q6, q14, d12, d13, d28, d29, d10 + lowpass_8.16 q7, q11, d14, d15, d22, d23, d11 + + vld1.8 {d16,d17}, [r4,:128], r12 + vld1.8 {d30,d31}, [r4,:128], r12 + lowpass_8.16 q8, q15, d16, d17, d30, d31, d12 + vld1.8 {d16,d17}, [r4,:128], r12 + vld1.8 {d30,d31}, [r4,:128], r12 + lowpass_8.16 q8, q15, d16, d17, d30, d31, d13 + vld1.8 {d16,d17}, [r4,:128], r12 + vld1.8 {d30,d31}, [r4,:128], r12 + lowpass_8.16 q8, q15, d16, d17, d30, d31, d14 + vld1.8 {d16,d17}, [r4,:128], r12 + vld1.8 {d30,d31}, [r4,:128] + lowpass_8.16 q8, q15, d16, d17, d30, d31, d15 + + transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11 + + bx lr +endfunc + +.macro h264_qpel8_hv_lowpass type +function \type\()_h264_qpel8_hv_lowpass_neon + mov r10, lr + bl put_h264_qpel8_hv_lowpass_neon_top + .ifc \type,avg + vld1.8 {d0}, [r0,:64], r2 + vrhadd.u8 d12, d12, d0 + vld1.8 {d1}, [r0,:64], r2 + vrhadd.u8 d13, d13, d1 + vld1.8 {d2}, [r0,:64], r2 + vrhadd.u8 d14, d14, d2 + vld1.8 {d3}, [r0,:64], r2 + vrhadd.u8 d15, d15, d3 + vld1.8 {d4}, [r0,:64], r2 + vrhadd.u8 d8, d8, d4 + vld1.8 {d5}, [r0,:64], r2 + vrhadd.u8 d9, d9, d5 + vld1.8 {d6}, [r0,:64], r2 + vrhadd.u8 d10, d10, d6 + vld1.8 {d7}, [r0,:64], r2 + vrhadd.u8 d11, d11, d7 + sub r0, r0, r2, lsl #3 + .endif + + vst1.8 {d12}, [r0,:64], r2 + vst1.8 {d13}, [r0,:64], r2 + vst1.8 {d14}, [r0,:64], r2 + vst1.8 {d15}, [r0,:64], r2 + vst1.8 {d8}, [r0,:64], r2 + vst1.8 {d9}, [r0,:64], r2 + vst1.8 {d10}, [r0,:64], r2 + vst1.8 {d11}, [r0,:64], r2 + + mov lr, r10 + bx lr +endfunc +.endm + + h264_qpel8_hv_lowpass put + h264_qpel8_hv_lowpass avg + +.macro h264_qpel8_hv_lowpass_l2 type +function \type\()_h264_qpel8_hv_lowpass_l2_neon + mov r10, lr + bl put_h264_qpel8_hv_lowpass_neon_top + + vld1.8 {d0, d1}, [r2,:128]! + vld1.8 {d2, d3}, [r2,:128]! + vrhadd.u8 q0, q0, q6 + vld1.8 {d4, d5}, [r2,:128]! + vrhadd.u8 q1, q1, q7 + vld1.8 {d6, d7}, [r2,:128]! 
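+    @ the hv lowpass result sits in q6, q7, q4, q5; it is rounding-averaged with
+    @ the second prediction streamed from the scratch buffer addressed by r2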
+ vrhadd.u8 q2, q2, q4 + vrhadd.u8 q3, q3, q5 + .ifc \type,avg + vld1.8 {d16}, [r0,:64], r3 + vrhadd.u8 d0, d0, d16 + vld1.8 {d17}, [r0,:64], r3 + vrhadd.u8 d1, d1, d17 + vld1.8 {d18}, [r0,:64], r3 + vrhadd.u8 d2, d2, d18 + vld1.8 {d19}, [r0,:64], r3 + vrhadd.u8 d3, d3, d19 + vld1.8 {d20}, [r0,:64], r3 + vrhadd.u8 d4, d4, d20 + vld1.8 {d21}, [r0,:64], r3 + vrhadd.u8 d5, d5, d21 + vld1.8 {d22}, [r0,:64], r3 + vrhadd.u8 d6, d6, d22 + vld1.8 {d23}, [r0,:64], r3 + vrhadd.u8 d7, d7, d23 + sub r0, r0, r3, lsl #3 + .endif + vst1.8 {d0}, [r0,:64], r3 + vst1.8 {d1}, [r0,:64], r3 + vst1.8 {d2}, [r0,:64], r3 + vst1.8 {d3}, [r0,:64], r3 + vst1.8 {d4}, [r0,:64], r3 + vst1.8 {d5}, [r0,:64], r3 + vst1.8 {d6}, [r0,:64], r3 + vst1.8 {d7}, [r0,:64], r3 + + mov lr, r10 + bx lr +endfunc +.endm + + h264_qpel8_hv_lowpass_l2 put + h264_qpel8_hv_lowpass_l2 avg + +.macro h264_qpel16_hv type +function \type\()_h264_qpel16_hv_lowpass_neon + mov r9, lr + bl \type\()_h264_qpel8_hv_lowpass_neon + sub r1, r1, r3, lsl #2 + bl \type\()_h264_qpel8_hv_lowpass_neon + sub r1, r1, r3, lsl #4 + sub r1, r1, r3, lsl #2 + add r1, r1, #8 + sub r0, r0, r2, lsl #4 + add r0, r0, #8 + bl \type\()_h264_qpel8_hv_lowpass_neon + sub r1, r1, r3, lsl #2 + mov lr, r9 + b \type\()_h264_qpel8_hv_lowpass_neon +endfunc + +function \type\()_h264_qpel16_hv_lowpass_l2_neon + mov r9, lr + sub r2, r4, #256 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon + sub r1, r1, r3, lsl #2 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon + sub r1, r1, r3, lsl #4 + sub r1, r1, r3, lsl #2 + add r1, r1, #8 + sub r0, r0, r3, lsl #4 + add r0, r0, #8 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon + sub r1, r1, r3, lsl #2 + mov lr, r9 + b \type\()_h264_qpel8_hv_lowpass_l2_neon +endfunc +.endm + + h264_qpel16_hv put + h264_qpel16_hv avg + +.macro h264_qpel8 type +function ff_\type\()_h264_qpel8_mc10_neon, export=1 + lowpass_const r3 + mov r3, r1 + sub r1, r1, #2 + mov r12, #8 + b \type\()_h264_qpel8_h_lowpass_l2_neon +endfunc + +function ff_\type\()_h264_qpel8_mc20_neon, export=1 + lowpass_const r3 + sub r1, r1, #2 + mov r3, r2 + mov r12, #8 + b \type\()_h264_qpel8_h_lowpass_neon +endfunc + +function ff_\type\()_h264_qpel8_mc30_neon, export=1 + lowpass_const r3 + add r3, r1, #1 + sub r1, r1, #2 + mov r12, #8 + b \type\()_h264_qpel8_h_lowpass_l2_neon +endfunc + +function ff_\type\()_h264_qpel8_mc01_neon, export=1 + push {lr} + mov r12, r1 +\type\()_h264_qpel8_mc01: + lowpass_const r3 + mov r3, r2 + sub r1, r1, r2, lsl #1 + vpush {d8-d15} + bl \type\()_h264_qpel8_v_lowpass_l2_neon + vpop {d8-d15} + pop {pc} +endfunc + +function ff_\type\()_h264_qpel8_mc11_neon, export=1 + push {r0, r1, r11, lr} +\type\()_h264_qpel8_mc11: + lowpass_const r3 + mov r11, sp +A bic sp, sp, #15 +T bic r0, r11, #15 +T mov sp, r0 + sub sp, sp, #64 + mov r0, sp + sub r1, r1, #2 + mov r3, #8 + mov r12, #8 + vpush {d8-d15} + bl put_h264_qpel8_h_lowpass_neon + ldrd r0, r1, [r11], #8 + mov r3, r2 + add r12, sp, #64 + sub r1, r1, r2, lsl #1 + mov r2, #8 + bl \type\()_h264_qpel8_v_lowpass_l2_neon + vpop {d8-d15} + mov sp, r11 + pop {r11, pc} +endfunc + +function ff_\type\()_h264_qpel8_mc21_neon, export=1 + push {r0, r1, r4, r10, r11, lr} +\type\()_h264_qpel8_mc21: + lowpass_const r3 + mov r11, sp +A bic sp, sp, #15 +T bic r0, r11, #15 +T mov sp, r0 + sub sp, sp, #(8*8+16*12) + sub r1, r1, #2 + mov r3, #8 + mov r0, sp + mov r12, #8 + vpush {d8-d15} + bl put_h264_qpel8_h_lowpass_neon + mov r4, r0 + ldrd r0, r1, [r11], #8 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, r2 + sub r2, r4, #64 + bl 
\type\()_h264_qpel8_hv_lowpass_l2_neon + vpop {d8-d15} + mov sp, r11 + pop {r4, r10, r11, pc} +endfunc + +function ff_\type\()_h264_qpel8_mc31_neon, export=1 + add r1, r1, #1 + push {r0, r1, r11, lr} + sub r1, r1, #1 + b \type\()_h264_qpel8_mc11 +endfunc + +function ff_\type\()_h264_qpel8_mc02_neon, export=1 + push {lr} + lowpass_const r3 + sub r1, r1, r2, lsl #1 + mov r3, r2 + vpush {d8-d15} + bl \type\()_h264_qpel8_v_lowpass_neon + vpop {d8-d15} + pop {pc} +endfunc + +function ff_\type\()_h264_qpel8_mc12_neon, export=1 + push {r0, r1, r4, r10, r11, lr} +\type\()_h264_qpel8_mc12: + lowpass_const r3 + mov r11, sp +A bic sp, sp, #15 +T bic r0, r11, #15 +T mov sp, r0 + sub sp, sp, #(8*8+16*12) + sub r1, r1, r2, lsl #1 + mov r3, r2 + mov r2, #8 + mov r0, sp + vpush {d8-d15} + bl put_h264_qpel8_v_lowpass_neon + mov r4, r0 + ldrd r0, r1, [r11], #8 + sub r1, r1, r3, lsl #1 + sub r1, r1, #2 + sub r2, r4, #64 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon + vpop {d8-d15} + mov sp, r11 + pop {r4, r10, r11, pc} +endfunc + +function ff_\type\()_h264_qpel8_mc22_neon, export=1 + push {r4, r10, r11, lr} + mov r11, sp +A bic sp, sp, #15 +T bic r4, r11, #15 +T mov sp, r4 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, r2 + sub sp, sp, #(16*12) + mov r4, sp + vpush {d8-d15} + bl \type\()_h264_qpel8_hv_lowpass_neon + vpop {d8-d15} + mov sp, r11 + pop {r4, r10, r11, pc} +endfunc + +function ff_\type\()_h264_qpel8_mc32_neon, export=1 + push {r0, r1, r4, r10, r11, lr} + add r1, r1, #1 + b \type\()_h264_qpel8_mc12 +endfunc + +function ff_\type\()_h264_qpel8_mc03_neon, export=1 + push {lr} + add r12, r1, r2 + b \type\()_h264_qpel8_mc01 +endfunc + +function ff_\type\()_h264_qpel8_mc13_neon, export=1 + push {r0, r1, r11, lr} + add r1, r1, r2 + b \type\()_h264_qpel8_mc11 +endfunc + +function ff_\type\()_h264_qpel8_mc23_neon, export=1 + push {r0, r1, r4, r10, r11, lr} + add r1, r1, r2 + b \type\()_h264_qpel8_mc21 +endfunc + +function ff_\type\()_h264_qpel8_mc33_neon, export=1 + add r1, r1, #1 + push {r0, r1, r11, lr} + add r1, r1, r2 + sub r1, r1, #1 + b \type\()_h264_qpel8_mc11 +endfunc +.endm + + h264_qpel8 put + h264_qpel8 avg + +.macro h264_qpel16 type +function ff_\type\()_h264_qpel16_mc10_neon, export=1 + lowpass_const r3 + mov r3, r1 + sub r1, r1, #2 + b \type\()_h264_qpel16_h_lowpass_l2_neon +endfunc + +function ff_\type\()_h264_qpel16_mc20_neon, export=1 + lowpass_const r3 + sub r1, r1, #2 + mov r3, r2 + b \type\()_h264_qpel16_h_lowpass_neon +endfunc + +function ff_\type\()_h264_qpel16_mc30_neon, export=1 + lowpass_const r3 + add r3, r1, #1 + sub r1, r1, #2 + b \type\()_h264_qpel16_h_lowpass_l2_neon +endfunc + +function ff_\type\()_h264_qpel16_mc01_neon, export=1 + push {r4, lr} + mov r12, r1 +\type\()_h264_qpel16_mc01: + lowpass_const r3 + mov r3, r2 + sub r1, r1, r2, lsl #1 + vpush {d8-d15} + bl \type\()_h264_qpel16_v_lowpass_l2_neon + vpop {d8-d15} + pop {r4, pc} +endfunc + +function ff_\type\()_h264_qpel16_mc11_neon, export=1 + push {r0, r1, r4, r11, lr} +\type\()_h264_qpel16_mc11: + lowpass_const r3 + mov r11, sp +A bic sp, sp, #15 +T bic r0, r11, #15 +T mov sp, r0 + sub sp, sp, #256 + mov r0, sp + sub r1, r1, #2 + mov r3, #16 + vpush {d8-d15} + bl put_h264_qpel16_h_lowpass_neon + ldrd r0, r1, [r11], #8 + mov r3, r2 + add r12, sp, #64 + sub r1, r1, r2, lsl #1 + mov r2, #16 + bl \type\()_h264_qpel16_v_lowpass_l2_neon + vpop {d8-d15} + mov sp, r11 + pop {r4, r11, pc} +endfunc + +function ff_\type\()_h264_qpel16_mc21_neon, export=1 + push {r0, r1, r4-r5, r9-r11, lr} +\type\()_h264_qpel16_mc21: + 
lowpass_const r3 + mov r11, sp +A bic sp, sp, #15 +T bic r0, r11, #15 +T mov sp, r0 + sub sp, sp, #(16*16+16*12) + sub r1, r1, #2 + mov r0, sp + vpush {d8-d15} + bl put_h264_qpel16_h_lowpass_neon_packed + mov r4, r0 + ldrd r0, r1, [r11], #8 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, r2 + bl \type\()_h264_qpel16_hv_lowpass_l2_neon + vpop {d8-d15} + mov sp, r11 + pop {r4-r5, r9-r11, pc} +endfunc + +function ff_\type\()_h264_qpel16_mc31_neon, export=1 + add r1, r1, #1 + push {r0, r1, r4, r11, lr} + sub r1, r1, #1 + b \type\()_h264_qpel16_mc11 +endfunc + +function ff_\type\()_h264_qpel16_mc02_neon, export=1 + push {r4, lr} + lowpass_const r3 + sub r1, r1, r2, lsl #1 + mov r3, r2 + vpush {d8-d15} + bl \type\()_h264_qpel16_v_lowpass_neon + vpop {d8-d15} + pop {r4, pc} +endfunc + +function ff_\type\()_h264_qpel16_mc12_neon, export=1 + push {r0, r1, r4-r5, r9-r11, lr} +\type\()_h264_qpel16_mc12: + lowpass_const r3 + mov r11, sp +A bic sp, sp, #15 +T bic r0, r11, #15 +T mov sp, r0 + sub sp, sp, #(16*16+16*12) + sub r1, r1, r2, lsl #1 + mov r0, sp + mov r3, r2 + vpush {d8-d15} + bl put_h264_qpel16_v_lowpass_neon_packed + mov r4, r0 + ldrd r0, r1, [r11], #8 + sub r1, r1, r3, lsl #1 + sub r1, r1, #2 + mov r2, r3 + bl \type\()_h264_qpel16_hv_lowpass_l2_neon + vpop {d8-d15} + mov sp, r11 + pop {r4-r5, r9-r11, pc} +endfunc + +function ff_\type\()_h264_qpel16_mc22_neon, export=1 + push {r4, r9-r11, lr} + lowpass_const r3 + mov r11, sp +A bic sp, sp, #15 +T bic r4, r11, #15 +T mov sp, r4 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, r2 + sub sp, sp, #(16*12) + mov r4, sp + vpush {d8-d15} + bl \type\()_h264_qpel16_hv_lowpass_neon + vpop {d8-d15} + mov sp, r11 + pop {r4, r9-r11, pc} +endfunc + +function ff_\type\()_h264_qpel16_mc32_neon, export=1 + push {r0, r1, r4-r5, r9-r11, lr} + add r1, r1, #1 + b \type\()_h264_qpel16_mc12 +endfunc + +function ff_\type\()_h264_qpel16_mc03_neon, export=1 + push {r4, lr} + add r12, r1, r2 + b \type\()_h264_qpel16_mc01 +endfunc + +function ff_\type\()_h264_qpel16_mc13_neon, export=1 + push {r0, r1, r4, r11, lr} + add r1, r1, r2 + b \type\()_h264_qpel16_mc11 +endfunc + +function ff_\type\()_h264_qpel16_mc23_neon, export=1 + push {r0, r1, r4-r5, r9-r11, lr} + add r1, r1, r2 + b \type\()_h264_qpel16_mc21 +endfunc + +function ff_\type\()_h264_qpel16_mc33_neon, export=1 + add r1, r1, #1 + push {r0, r1, r4, r11, lr} + add r1, r1, r2 + sub r1, r1, #1 + b \type\()_h264_qpel16_mc11 +endfunc +.endm + + h264_qpel16 put + h264_qpel16 avg diff --git a/ffmpeg/libavcodec/arm/hpeldsp_arm.S b/ffmpeg/libavcodec/arm/hpeldsp_arm.S new file mode 100644 index 0000000..2f3d311 --- /dev/null +++ b/ffmpeg/libavcodec/arm/hpeldsp_arm.S @@ -0,0 +1,611 @@ +@ +@ ARMv4 optimized DSP utils +@ Copyright (c) 2004 AGAWA Koji +@ +@ This file is part of FFmpeg. +@ +@ FFmpeg is free software; you can redistribute it and/or +@ modify it under the terms of the GNU Lesser General Public +@ License as published by the Free Software Foundation; either +@ version 2.1 of the License, or (at your option) any later version. +@ +@ FFmpeg is distributed in the hope that it will be useful, +@ but WITHOUT ANY WARRANTY; without even the implied warranty of +@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +@ Lesser General Public License for more details. 
+@ +@ You should have received a copy of the GNU Lesser General Public +@ License along with FFmpeg; if not, write to the Free Software +@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +@ + +#include "config.h" +#include "libavutil/arm/asm.S" + +#if !HAVE_ARMV5TE_EXTERNAL +#define pld @ +#endif + +.macro ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4 + mov \Rd0, \Rn0, lsr #(\shift * 8) + mov \Rd1, \Rn1, lsr #(\shift * 8) + mov \Rd2, \Rn2, lsr #(\shift * 8) + mov \Rd3, \Rn3, lsr #(\shift * 8) + orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8) + orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8) + orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8) + orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8) +.endm +.macro ALIGN_DWORD shift, R0, R1, R2 + mov \R0, \R0, lsr #(\shift * 8) + orr \R0, \R0, \R1, lsl #(32 - \shift * 8) + mov \R1, \R1, lsr #(\shift * 8) + orr \R1, \R1, \R2, lsl #(32 - \shift * 8) +.endm +.macro ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2 + mov \Rdst0, \Rsrc0, lsr #(\shift * 8) + mov \Rdst1, \Rsrc1, lsr #(\shift * 8) + orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8)) + orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8)) +.endm + +.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask + @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) + @ Rmask = 0xFEFEFEFE + @ Rn = destroy + eor \Rd0, \Rn0, \Rm0 + eor \Rd1, \Rn1, \Rm1 + orr \Rn0, \Rn0, \Rm0 + orr \Rn1, \Rn1, \Rm1 + and \Rd0, \Rd0, \Rmask + and \Rd1, \Rd1, \Rmask + sub \Rd0, \Rn0, \Rd0, lsr #1 + sub \Rd1, \Rn1, \Rd1, lsr #1 +.endm + +.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask + @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) + @ Rmask = 0xFEFEFEFE + @ Rn = destroy + eor \Rd0, \Rn0, \Rm0 + eor \Rd1, \Rn1, \Rm1 + and \Rn0, \Rn0, \Rm0 + and \Rn1, \Rn1, \Rm1 + and \Rd0, \Rd0, \Rmask + and \Rd1, \Rd1, \Rmask + add \Rd0, \Rn0, \Rd0, lsr #1 + add \Rd1, \Rn1, \Rd1, lsr #1 +.endm + +.macro JMP_ALIGN tmp, reg + ands \tmp, \reg, #3 + bic \reg, \reg, #3 + beq 1f + subs \tmp, \tmp, #1 + beq 2f + subs \tmp, \tmp, #1 + beq 3f + b 4f +.endm + +@ ---------------------------------------------------------------- + .align 5 +function ff_put_pixels16_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r11, lr} + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r7} + add r1, r1, r2 + stm r0, {r4-r7} + pld [r1] + subs r3, r3, #1 + add r0, r0, r2 + bne 1b + pop {r4-r11, pc} + .align 5 +2: + ldm r1, {r4-r8} + add r1, r1, r2 + ALIGN_QWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8 + pld [r1] + subs r3, r3, #1 + stm r0, {r9-r12} + add r0, r0, r2 + bne 2b + pop {r4-r11, pc} + .align 5 +3: + ldm r1, {r4-r8} + add r1, r1, r2 + ALIGN_QWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8 + pld [r1] + subs r3, r3, #1 + stm r0, {r9-r12} + add r0, r0, r2 + bne 3b + pop {r4-r11, pc} + .align 5 +4: + ldm r1, {r4-r8} + add r1, r1, r2 + ALIGN_QWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8 + pld [r1] + subs r3, r3, #1 + stm r0, {r9-r12} + add r0, r0, r2 + bne 4b + pop {r4-r11,pc} +endfunc + +@ ---------------------------------------------------------------- + .align 5 +function ff_put_pixels8_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r5,lr} + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r5} + add r1, r1, r2 + subs r3, r3, #1 + pld [r1] + stm r0, {r4-r5} + add r0, r0, r2 + bne 1b + pop {r4-r5,pc} + .align 5 +2: + ldm 
r1, {r4-r5, r12} + add r1, r1, r2 + ALIGN_DWORD 1, r4, r5, r12 + pld [r1] + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 2b + pop {r4-r5,pc} + .align 5 +3: + ldm r1, {r4-r5, r12} + add r1, r1, r2 + ALIGN_DWORD 2, r4, r5, r12 + pld [r1] + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 3b + pop {r4-r5,pc} + .align 5 +4: + ldm r1, {r4-r5, r12} + add r1, r1, r2 + ALIGN_DWORD 3, r4, r5, r12 + pld [r1] + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 4b + pop {r4-r5,pc} +endfunc + +@ ---------------------------------------------------------------- + .align 5 +function ff_put_pixels8_x2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r10,lr} + ldr r12, =0xfefefefe + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 + pld [r1] + RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 1b + pop {r4-r10,pc} + .align 5 +2: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 + ALIGN_DWORD_D 2, r8, r9, r4, r5, r10 + pld [r1] + RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 2b + pop {r4-r10,pc} + .align 5 +3: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 2, r6, r7, r4, r5, r10 + ALIGN_DWORD_D 3, r8, r9, r4, r5, r10 + pld [r1] + RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 3b + pop {r4-r10,pc} + .align 5 +4: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 3, r6, r7, r4, r5, r10 + pld [r1] + RND_AVG32 r8, r9, r6, r7, r5, r10, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 4b + pop {r4-r10,pc} +endfunc + + .align 5 +function ff_put_no_rnd_pixels8_x2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r10,lr} + ldr r12, =0xfefefefe + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 1b + pop {r4-r10,pc} + .align 5 +2: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 + ALIGN_DWORD_D 2, r8, r9, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 2b + pop {r4-r10,pc} + .align 5 +3: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 2, r6, r7, r4, r5, r10 + ALIGN_DWORD_D 3, r8, r9, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 3b + pop {r4-r10,pc} + .align 5 +4: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 3, r6, r7, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 4b + pop {r4-r10,pc} +endfunc + + +@ ---------------------------------------------------------------- + .align 5 +function ff_put_pixels8_y2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r11,lr} + mov r3, r3, lsr #1 + ldr r12, =0xfefefefe + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r5} + add r1, r1, r2 +6: ldm r1, {r6-r7} + add r1, r1, r2 + pld [r1] + RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + ldm r1, {r4-r5} + add r1, r1, r2 + stm r0, 
{r8-r9} + add r0, r0, r2 + pld [r1] + RND_AVG32 r8, r9, r6, r7, r4, r5, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +2: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r7, r8, r9 + RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r4, r5, r6 + subs r3, r3, #1 + RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +3: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r7, r8, r9 + RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r4, r5, r6 + subs r3, r3, #1 + RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +4: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r7, r8, r9 + RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r4, r5, r6 + subs r3, r3, #1 + RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} +endfunc + + .align 5 +function ff_put_no_rnd_pixels8_y2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r11,lr} + mov r3, r3, lsr #1 + ldr r12, =0xfefefefe + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r5} + add r1, r1, r2 +6: ldm r1, {r6-r7} + add r1, r1, r2 + pld [r1] + NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + ldm r1, {r4-r5} + add r1, r1, r2 + stm r0, {r8-r9} + add r0, r0, r2 + pld [r1] + NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +2: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r7, r8, r9 + NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r4, r5, r6 + subs r3, r3, #1 + NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +3: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r7, r8, r9 + NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r4, r5, r6 + subs r3, r3, #1 + NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +4: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r7, r8, r9 + NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r4, r5, r6 + subs r3, r3, #1 + NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} +endfunc + + .ltorg + +@ ---------------------------------------------------------------- +.macro RND_XY2_IT 
align, rnd + @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202) + @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2) +.if \align == 0 + ldm r1, {r6-r8} +.elseif \align == 3 + ldm r1, {r5-r7} +.else + ldm r1, {r8-r10} +.endif + add r1, r1, r2 + pld [r1] +.if \align == 0 + ALIGN_DWORD_D 1, r4, r5, r6, r7, r8 +.elseif \align == 1 + ALIGN_DWORD_D 1, r4, r5, r8, r9, r10 + ALIGN_DWORD_D 2, r6, r7, r8, r9, r10 +.elseif \align == 2 + ALIGN_DWORD_D 2, r4, r5, r8, r9, r10 + ALIGN_DWORD_D 3, r6, r7, r8, r9, r10 +.elseif \align == 3 + ALIGN_DWORD_D 3, r4, r5, r5, r6, r7 +.endif + ldr r14, =0x03030303 + tst r3, #1 + and r8, r4, r14 + and r9, r5, r14 + and r10, r6, r14 + and r11, r7, r14 + it eq + andeq r14, r14, r14, \rnd #1 + add r8, r8, r10 + add r9, r9, r11 + ldr r12, =0xfcfcfcfc >> 2 + itt eq + addeq r8, r8, r14 + addeq r9, r9, r14 + and r4, r12, r4, lsr #2 + and r5, r12, r5, lsr #2 + and r6, r12, r6, lsr #2 + and r7, r12, r7, lsr #2 + add r10, r4, r6 + add r11, r5, r7 + subs r3, r3, #1 +.endm + +.macro RND_XY2_EXPAND align, rnd + RND_XY2_IT \align, \rnd +6: push {r8-r11} + RND_XY2_IT \align, \rnd + pop {r4-r7} + add r4, r4, r8 + add r5, r5, r9 + ldr r14, =0x0f0f0f0f + add r6, r6, r10 + add r7, r7, r11 + and r4, r14, r4, lsr #2 + and r5, r14, r5, lsr #2 + add r4, r4, r6 + add r5, r5, r7 + stm r0, {r4-r5} + add r0, r0, r2 + bge 6b + pop {r4-r11,pc} +.endm + + .align 5 +function ff_put_pixels8_xy2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r11,lr} @ R14 is also called LR + JMP_ALIGN r5, r1 +1: RND_XY2_EXPAND 0, lsl + .align 5 +2: RND_XY2_EXPAND 1, lsl + .align 5 +3: RND_XY2_EXPAND 2, lsl + .align 5 +4: RND_XY2_EXPAND 3, lsl +endfunc + + .align 5 +function ff_put_no_rnd_pixels8_xy2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r11,lr} + JMP_ALIGN r5, r1 +1: RND_XY2_EXPAND 0, lsr + .align 5 +2: RND_XY2_EXPAND 1, lsr + .align 5 +3: RND_XY2_EXPAND 2, lsr + .align 5 +4: RND_XY2_EXPAND 3, lsr +endfunc diff --git a/ffmpeg/libavcodec/arm/hpeldsp_arm.h b/ffmpeg/libavcodec/arm/hpeldsp_arm.h new file mode 100644 index 0000000..e79bc6f --- /dev/null +++ b/ffmpeg/libavcodec/arm/hpeldsp_arm.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_HPELDSP_H +#define AVCODEC_ARM_HPELDSP_H + +#include "libavcodec/hpeldsp.h" + +void ff_hpeldsp_init_armv6(HpelDSPContext* c, int flags); +void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags); + +#endif /* AVCODEC_ARM_HPELDSP_H */ diff --git a/ffmpeg/libavcodec/arm/hpeldsp_armv6.S b/ffmpeg/libavcodec/arm/hpeldsp_armv6.S new file mode 100644 index 0000000..cd50150 --- /dev/null +++ b/ffmpeg/libavcodec/arm/hpeldsp_armv6.S @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +.macro call_2x_pixels type, subp +function ff_\type\()_pixels16\subp\()_armv6, export=1 + push {r0-r3, lr} + bl ff_\type\()_pixels8\subp\()_armv6 + pop {r0-r3, lr} + add r0, r0, #8 + add r1, r1, #8 + b ff_\type\()_pixels8\subp\()_armv6 +endfunc +.endm + +call_2x_pixels avg +call_2x_pixels put, _x2 +call_2x_pixels put, _y2 +call_2x_pixels put, _x2_no_rnd +call_2x_pixels put, _y2_no_rnd + +function ff_put_pixels16_armv6, export=1 + push {r4-r11} +1: + ldr r5, [r1, #4] + ldr r6, [r1, #8] + ldr r7, [r1, #12] + ldr_post r4, r1, r2 + strd r6, r7, [r0, #8] + ldr r9, [r1, #4] + strd_post r4, r5, r0, r2 + ldr r10, [r1, #8] + ldr r11, [r1, #12] + ldr_post r8, r1, r2 + strd r10, r11, [r0, #8] + subs r3, r3, #2 + strd_post r8, r9, r0, r2 + bne 1b + + pop {r4-r11} + bx lr +endfunc + +function ff_put_pixels8_armv6, export=1 + push {r4-r7} +1: + ldr r5, [r1, #4] + ldr_post r4, r1, r2 + ldr r7, [r1, #4] + strd_post r4, r5, r0, r2 + ldr_post r6, r1, r2 + subs r3, r3, #2 + strd_post r6, r7, r0, r2 + bne 1b + + pop {r4-r7} + bx lr +endfunc + +function ff_put_pixels8_x2_armv6, export=1 + push {r4-r11, lr} + mov r12, #1 + orr r12, r12, r12, lsl #8 + orr r12, r12, r12, lsl #16 +1: + ldr r4, [r1] + subs r3, r3, #2 + ldr r5, [r1, #4] + ldr r7, [r1, #5] + lsr r6, r4, #8 + ldr_pre r8, r1, r2 + orr r6, r6, r5, lsl #24 + ldr r9, [r1, #4] + ldr r11, [r1, #5] + lsr r10, r8, #8 + add r1, r1, r2 + orr r10, r10, r9, lsl #24 + eor r14, r4, r6 + uhadd8 r4, r4, r6 + eor r6, r5, r7 + uhadd8 r5, r5, r7 + and r14, r14, r12 + and r6, r6, r12 + uadd8 r4, r4, r14 + eor r14, r8, r10 + uadd8 r5, r5, r6 + eor r6, r9, r11 + uhadd8 r8, r8, r10 + and r14, r14, r12 + uhadd8 r9, r9, r11 + and r6, r6, r12 + uadd8 r8, r8, r14 + strd_post r4, r5, r0, r2 + uadd8 r9, r9, r6 + strd_post r8, r9, r0, r2 + bne 1b + + pop {r4-r11, pc} +endfunc + +function ff_put_pixels8_y2_armv6, export=1 + push {r4-r11} + mov r12, #1 + orr r12, r12, r12, lsl #8 + orr r12, r12, r12, lsl #16 + ldr r4, [r1] + ldr r5, [r1, #4] + ldr_pre r6, r1, r2 + 
ldr r7, [r1, #4] +1: + subs r3, r3, #2 + uhadd8 r8, r4, r6 + eor r10, r4, r6 + uhadd8 r9, r5, r7 + eor r11, r5, r7 + and r10, r10, r12 + ldr_pre r4, r1, r2 + uadd8 r8, r8, r10 + and r11, r11, r12 + uadd8 r9, r9, r11 + ldr r5, [r1, #4] + uhadd8 r10, r4, r6 + eor r6, r4, r6 + uhadd8 r11, r5, r7 + and r6, r6, r12 + eor r7, r5, r7 + uadd8 r10, r10, r6 + and r7, r7, r12 + ldr_pre r6, r1, r2 + uadd8 r11, r11, r7 + strd_post r8, r9, r0, r2 + ldr r7, [r1, #4] + strd_post r10, r11, r0, r2 + bne 1b + + pop {r4-r11} + bx lr +endfunc + +function ff_put_pixels8_x2_no_rnd_armv6, export=1 + push {r4-r9, lr} +1: + subs r3, r3, #2 + ldr r4, [r1] + ldr r5, [r1, #4] + ldr r7, [r1, #5] + ldr_pre r8, r1, r2 + ldr r9, [r1, #4] + ldr r14, [r1, #5] + add r1, r1, r2 + lsr r6, r4, #8 + orr r6, r6, r5, lsl #24 + lsr r12, r8, #8 + orr r12, r12, r9, lsl #24 + uhadd8 r4, r4, r6 + uhadd8 r5, r5, r7 + uhadd8 r8, r8, r12 + uhadd8 r9, r9, r14 + stm r0, {r4,r5} + add r0, r0, r2 + stm r0, {r8,r9} + add r0, r0, r2 + bne 1b + + pop {r4-r9, pc} +endfunc + +function ff_put_pixels8_y2_no_rnd_armv6, export=1 + push {r4-r9, lr} + ldr r4, [r1] + ldr r5, [r1, #4] + ldr_pre r6, r1, r2 + ldr r7, [r1, #4] +1: + subs r3, r3, #2 + uhadd8 r8, r4, r6 + ldr_pre r4, r1, r2 + uhadd8 r9, r5, r7 + ldr r5, [r1, #4] + uhadd8 r12, r4, r6 + ldr_pre r6, r1, r2 + uhadd8 r14, r5, r7 + ldr r7, [r1, #4] + stm r0, {r8,r9} + add r0, r0, r2 + stm r0, {r12,r14} + add r0, r0, r2 + bne 1b + + pop {r4-r9, pc} +endfunc + +function ff_avg_pixels8_armv6, export=1 + pld [r1, r2] + push {r4-r10, lr} + mov lr, #1 + orr lr, lr, lr, lsl #8 + orr lr, lr, lr, lsl #16 + ldrd r4, r5, [r0] + ldr r10, [r1, #4] + ldr_post r9, r1, r2 + subs r3, r3, #2 +1: + pld [r1, r2] + eor r8, r4, r9 + uhadd8 r4, r4, r9 + eor r12, r5, r10 + ldrd_reg r6, r7, r0, r2 + uhadd8 r5, r5, r10 + and r8, r8, lr + ldr r10, [r1, #4] + and r12, r12, lr + uadd8 r4, r4, r8 + ldr_post r9, r1, r2 + eor r8, r6, r9 + uadd8 r5, r5, r12 + pld [r1, r2, lsl #1] + eor r12, r7, r10 + uhadd8 r6, r6, r9 + strd_post r4, r5, r0, r2 + uhadd8 r7, r7, r10 + beq 2f + and r8, r8, lr + ldrd_reg r4, r5, r0, r2 + uadd8 r6, r6, r8 + ldr r10, [r1, #4] + and r12, r12, lr + subs r3, r3, #2 + uadd8 r7, r7, r12 + ldr_post r9, r1, r2 + strd_post r6, r7, r0, r2 + b 1b +2: + and r8, r8, lr + and r12, r12, lr + uadd8 r6, r6, r8 + uadd8 r7, r7, r12 + strd_post r6, r7, r0, r2 + + pop {r4-r10, pc} +endfunc diff --git a/ffmpeg/libavcodec/arm/hpeldsp_init_arm.c b/ffmpeg/libavcodec/arm/hpeldsp_init_arm.c new file mode 100644 index 0000000..bae93eb --- /dev/null +++ b/ffmpeg/libavcodec/arm/hpeldsp_init_arm.c @@ -0,0 +1,68 @@ +/* + * ARM optimized DSP utils + * Copyright (c) 2001 Lionel Ulmer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/cpu.h" +#include "libavcodec/bit_depth_template.c" // for CALL_2X_PIXELS +#include "hpeldsp_arm.h" + +void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); +void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); +void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); +void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); + +void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); +void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); +void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); + +void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); + +CALL_2X_PIXELS(ff_put_pixels16_x2_arm, ff_put_pixels8_x2_arm, 8) +CALL_2X_PIXELS(ff_put_pixels16_y2_arm, ff_put_pixels8_y2_arm, 8) +CALL_2X_PIXELS(ff_put_pixels16_xy2_arm, ff_put_pixels8_xy2_arm, 8) +CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm, ff_put_no_rnd_pixels8_x2_arm, 8) +CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm, ff_put_no_rnd_pixels8_y2_arm, 8) +CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm,8) + +void ff_hpeldsp_init_arm(HpelDSPContext* c, int flags) +{ + int cpu_flags = av_get_cpu_flags(); + + c->put_pixels_tab[0][0] = ff_put_pixels16_arm; + c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm; + c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm; + c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm; + c->put_pixels_tab[1][0] = ff_put_pixels8_arm; + c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm; + c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm; + c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm; + + c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm; + c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm; + c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm; + c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm; + c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm; + c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm; + c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm; + c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm; + + if (have_armv6(cpu_flags)) ff_hpeldsp_init_armv6(c, flags); + if (have_neon(cpu_flags)) ff_hpeldsp_init_neon(c, flags); +} diff --git a/ffmpeg/libavcodec/arm/hpeldsp_init_armv6.c b/ffmpeg/libavcodec/arm/hpeldsp_init_armv6.c new file mode 100644 index 0000000..da4caf8 --- /dev/null +++ b/ffmpeg/libavcodec/arm/hpeldsp_init_armv6.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "libavutil/attributes.h" +#include "hpeldsp_arm.h" + +void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); + +void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); + +void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); + +void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); + +void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); + +void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); + +av_cold void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags) +{ + c->put_pixels_tab[0][0] = ff_put_pixels16_armv6; + c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6; + c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6; +/* c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */ + c->put_pixels_tab[1][0] = ff_put_pixels8_armv6; + c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6; + c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6; +/* c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */ + + c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6; + c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6; + c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6; +/* c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */ + c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6; + c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6; + c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6; +/* c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */ + + c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6; + c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6; +} diff --git a/ffmpeg/libavcodec/arm/hpeldsp_init_neon.c b/ffmpeg/libavcodec/arm/hpeldsp_init_neon.c new file mode 100644 index 0000000..d577735 --- /dev/null +++ b/ffmpeg/libavcodec/arm/hpeldsp_init_neon.c @@ -0,0 +1,86 @@ +/* + * ARM NEON optimised DSP functions + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "hpeldsp_arm.h" + +void ff_put_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_put_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); + +void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_avg_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_avg_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_avg_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_avg_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_avg_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_avg_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); +void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); + +void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags) +{ + c->put_pixels_tab[0][0] = ff_put_pixels16_neon; + c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon; + c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon; + c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon; + c->put_pixels_tab[1][0] = ff_put_pixels8_neon; + c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon; + c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon; + c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon; + + c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon; + c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon; + c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon; + c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon; + c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon; + c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon; + c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon; + c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon; + + c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon; + c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon; + c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon; + c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon; + c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon; + c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon; + c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon; + 
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon; + + c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon; + c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon; + c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon; + c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon; +} diff --git a/ffmpeg/libavcodec/arm/hpeldsp_neon.S b/ffmpeg/libavcodec/arm/hpeldsp_neon.S new file mode 100644 index 0000000..cf4a6cf --- /dev/null +++ b/ffmpeg/libavcodec/arm/hpeldsp_neon.S @@ -0,0 +1,410 @@ +/* + * ARM NEON optimised DSP functions + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +.macro pixels16 rnd=1, avg=0 + .if \avg + mov r12, r0 + .endif +1: vld1.8 {q0}, [r1], r2 + vld1.8 {q1}, [r1], r2 + vld1.8 {q2}, [r1], r2 + pld [r1, r2, lsl #2] + vld1.8 {q3}, [r1], r2 + pld [r1] + pld [r1, r2] + pld [r1, r2, lsl #1] + .if \avg + vld1.8 {q8}, [r12,:128], r2 + vrhadd.u8 q0, q0, q8 + vld1.8 {q9}, [r12,:128], r2 + vrhadd.u8 q1, q1, q9 + vld1.8 {q10}, [r12,:128], r2 + vrhadd.u8 q2, q2, q10 + vld1.8 {q11}, [r12,:128], r2 + vrhadd.u8 q3, q3, q11 + .endif + subs r3, r3, #4 + vst1.64 {q0}, [r0,:128], r2 + vst1.64 {q1}, [r0,:128], r2 + vst1.64 {q2}, [r0,:128], r2 + vst1.64 {q3}, [r0,:128], r2 + bne 1b + bx lr +.endm + +.macro pixels16_x2 rnd=1, avg=0 +1: vld1.8 {d0-d2}, [r1], r2 + vld1.8 {d4-d6}, [r1], r2 + pld [r1] + pld [r1, r2] + subs r3, r3, #2 + vext.8 q1, q0, q1, #1 + avg q0, q0, q1 + vext.8 q3, q2, q3, #1 + avg q2, q2, q3 + .if \avg + vld1.8 {q1}, [r0,:128], r2 + vld1.8 {q3}, [r0,:128] + vrhadd.u8 q0, q0, q1 + vrhadd.u8 q2, q2, q3 + sub r0, r0, r2 + .endif + vst1.8 {q0}, [r0,:128], r2 + vst1.8 {q2}, [r0,:128], r2 + bne 1b + bx lr +.endm + +.macro pixels16_y2 rnd=1, avg=0 + sub r3, r3, #2 + vld1.8 {q0}, [r1], r2 + vld1.8 {q1}, [r1], r2 +1: subs r3, r3, #2 + avg q2, q0, q1 + vld1.8 {q0}, [r1], r2 + avg q3, q0, q1 + vld1.8 {q1}, [r1], r2 + pld [r1] + pld [r1, r2] + .if \avg + vld1.8 {q8}, [r0,:128], r2 + vld1.8 {q9}, [r0,:128] + vrhadd.u8 q2, q2, q8 + vrhadd.u8 q3, q3, q9 + sub r0, r0, r2 + .endif + vst1.8 {q2}, [r0,:128], r2 + vst1.8 {q3}, [r0,:128], r2 + bne 1b + + avg q2, q0, q1 + vld1.8 {q0}, [r1], r2 + avg q3, q0, q1 + .if \avg + vld1.8 {q8}, [r0,:128], r2 + vld1.8 {q9}, [r0,:128] + vrhadd.u8 q2, q2, q8 + vrhadd.u8 q3, q3, q9 + sub r0, r0, r2 + .endif + vst1.8 {q2}, [r0,:128], r2 + vst1.8 {q3}, [r0,:128], r2 + + bx lr +.endm + +.macro pixels16_xy2 rnd=1, avg=0 + sub r3, r3, #2 + vld1.8 {d0-d2}, [r1], r2 + vld1.8 {d4-d6}, [r1], r2 +NRND vmov.i16 q13, #1 + pld [r1] + pld [r1, r2] + vext.8 q1, q0, q1, #1 + vext.8 q3, q2, q3, #1 + vaddl.u8 q8, d0, d2 + vaddl.u8 q10, d1, d3 + vaddl.u8 q9, d4, d6 + vaddl.u8 q11, d5, d7 +1: subs r3, r3, #2 + vld1.8 {d0-d2}, [r1], r2 + vadd.u16 q12, q8, q9 + pld [r1] 
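+    @ q12 holds the 2x2 pixel sums a+b+c+d; the rounding variant narrows them as
+    @ (sum+2)>>2 with vrshrn, while the no_rnd variant adds q13 (=1) first and
+    @ uses the truncating vshrn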
+NRND vadd.u16 q12, q12, q13 + vext.8 q15, q0, q1, #1 + vadd.u16 q1 , q10, q11 + shrn d28, q12, #2 +NRND vadd.u16 q1, q1, q13 + shrn d29, q1, #2 + .if \avg + vld1.8 {q8}, [r0,:128] + vrhadd.u8 q14, q14, q8 + .endif + vaddl.u8 q8, d0, d30 + vld1.8 {d2-d4}, [r1], r2 + vaddl.u8 q10, d1, d31 + vst1.8 {q14}, [r0,:128], r2 + vadd.u16 q12, q8, q9 + pld [r1, r2] +NRND vadd.u16 q12, q12, q13 + vext.8 q2, q1, q2, #1 + vadd.u16 q0, q10, q11 + shrn d30, q12, #2 +NRND vadd.u16 q0, q0, q13 + shrn d31, q0, #2 + .if \avg + vld1.8 {q9}, [r0,:128] + vrhadd.u8 q15, q15, q9 + .endif + vaddl.u8 q9, d2, d4 + vaddl.u8 q11, d3, d5 + vst1.8 {q15}, [r0,:128], r2 + bgt 1b + + vld1.8 {d0-d2}, [r1], r2 + vadd.u16 q12, q8, q9 +NRND vadd.u16 q12, q12, q13 + vext.8 q15, q0, q1, #1 + vadd.u16 q1 , q10, q11 + shrn d28, q12, #2 +NRND vadd.u16 q1, q1, q13 + shrn d29, q1, #2 + .if \avg + vld1.8 {q8}, [r0,:128] + vrhadd.u8 q14, q14, q8 + .endif + vaddl.u8 q8, d0, d30 + vaddl.u8 q10, d1, d31 + vst1.8 {q14}, [r0,:128], r2 + vadd.u16 q12, q8, q9 +NRND vadd.u16 q12, q12, q13 + vadd.u16 q0, q10, q11 + shrn d30, q12, #2 +NRND vadd.u16 q0, q0, q13 + shrn d31, q0, #2 + .if \avg + vld1.8 {q9}, [r0,:128] + vrhadd.u8 q15, q15, q9 + .endif + vst1.8 {q15}, [r0,:128], r2 + + bx lr +.endm + +.macro pixels8 rnd=1, avg=0 +1: vld1.8 {d0}, [r1], r2 + vld1.8 {d1}, [r1], r2 + vld1.8 {d2}, [r1], r2 + pld [r1, r2, lsl #2] + vld1.8 {d3}, [r1], r2 + pld [r1] + pld [r1, r2] + pld [r1, r2, lsl #1] + .if \avg + vld1.8 {d4}, [r0,:64], r2 + vrhadd.u8 d0, d0, d4 + vld1.8 {d5}, [r0,:64], r2 + vrhadd.u8 d1, d1, d5 + vld1.8 {d6}, [r0,:64], r2 + vrhadd.u8 d2, d2, d6 + vld1.8 {d7}, [r0,:64], r2 + vrhadd.u8 d3, d3, d7 + sub r0, r0, r2, lsl #2 + .endif + subs r3, r3, #4 + vst1.8 {d0}, [r0,:64], r2 + vst1.8 {d1}, [r0,:64], r2 + vst1.8 {d2}, [r0,:64], r2 + vst1.8 {d3}, [r0,:64], r2 + bne 1b + bx lr +.endm + +.macro pixels8_x2 rnd=1, avg=0 +1: vld1.8 {q0}, [r1], r2 + vext.8 d1, d0, d1, #1 + vld1.8 {q1}, [r1], r2 + vext.8 d3, d2, d3, #1 + pld [r1] + pld [r1, r2] + subs r3, r3, #2 + vswp d1, d2 + avg q0, q0, q1 + .if \avg + vld1.8 {d4}, [r0,:64], r2 + vld1.8 {d5}, [r0,:64] + vrhadd.u8 q0, q0, q2 + sub r0, r0, r2 + .endif + vst1.8 {d0}, [r0,:64], r2 + vst1.8 {d1}, [r0,:64], r2 + bne 1b + bx lr +.endm + +.macro pixels8_y2 rnd=1, avg=0 + sub r3, r3, #2 + vld1.8 {d0}, [r1], r2 + vld1.8 {d1}, [r1], r2 +1: subs r3, r3, #2 + avg d4, d0, d1 + vld1.8 {d0}, [r1], r2 + avg d5, d0, d1 + vld1.8 {d1}, [r1], r2 + pld [r1] + pld [r1, r2] + .if \avg + vld1.8 {d2}, [r0,:64], r2 + vld1.8 {d3}, [r0,:64] + vrhadd.u8 q2, q2, q1 + sub r0, r0, r2 + .endif + vst1.8 {d4}, [r0,:64], r2 + vst1.8 {d5}, [r0,:64], r2 + bne 1b + + avg d4, d0, d1 + vld1.8 {d0}, [r1], r2 + avg d5, d0, d1 + .if \avg + vld1.8 {d2}, [r0,:64], r2 + vld1.8 {d3}, [r0,:64] + vrhadd.u8 q2, q2, q1 + sub r0, r0, r2 + .endif + vst1.8 {d4}, [r0,:64], r2 + vst1.8 {d5}, [r0,:64], r2 + + bx lr +.endm + +.macro pixels8_xy2 rnd=1, avg=0 + sub r3, r3, #2 + vld1.8 {q0}, [r1], r2 + vld1.8 {q1}, [r1], r2 +NRND vmov.i16 q11, #1 + pld [r1] + pld [r1, r2] + vext.8 d4, d0, d1, #1 + vext.8 d6, d2, d3, #1 + vaddl.u8 q8, d0, d4 + vaddl.u8 q9, d2, d6 +1: subs r3, r3, #2 + vld1.8 {q0}, [r1], r2 + pld [r1] + vadd.u16 q10, q8, q9 + vext.8 d4, d0, d1, #1 +NRND vadd.u16 q10, q10, q11 + vaddl.u8 q8, d0, d4 + shrn d5, q10, #2 + vld1.8 {q1}, [r1], r2 + vadd.u16 q10, q8, q9 + pld [r1, r2] + .if \avg + vld1.8 {d7}, [r0,:64] + vrhadd.u8 d5, d5, d7 + .endif +NRND vadd.u16 q10, q10, q11 + vst1.8 {d5}, [r0,:64], r2 + shrn d7, q10, #2 + .if \avg + vld1.8 {d5}, 
[r0,:64] + vrhadd.u8 d7, d7, d5 + .endif + vext.8 d6, d2, d3, #1 + vaddl.u8 q9, d2, d6 + vst1.8 {d7}, [r0,:64], r2 + bgt 1b + + vld1.8 {q0}, [r1], r2 + vadd.u16 q10, q8, q9 + vext.8 d4, d0, d1, #1 +NRND vadd.u16 q10, q10, q11 + vaddl.u8 q8, d0, d4 + shrn d5, q10, #2 + vadd.u16 q10, q8, q9 + .if \avg + vld1.8 {d7}, [r0,:64] + vrhadd.u8 d5, d5, d7 + .endif +NRND vadd.u16 q10, q10, q11 + vst1.8 {d5}, [r0,:64], r2 + shrn d7, q10, #2 + .if \avg + vld1.8 {d5}, [r0,:64] + vrhadd.u8 d7, d7, d5 + .endif + vst1.8 {d7}, [r0,:64], r2 + + bx lr +.endm + +.macro pixfunc pfx, name, suf, rnd=1, avg=0 + .if \rnd + .macro avg rd, rn, rm + vrhadd.u8 \rd, \rn, \rm + .endm + .macro shrn rd, rn, rm + vrshrn.u16 \rd, \rn, \rm + .endm + .macro NRND insn:vararg + .endm + .else + .macro avg rd, rn, rm + vhadd.u8 \rd, \rn, \rm + .endm + .macro shrn rd, rn, rm + vshrn.u16 \rd, \rn, \rm + .endm + .macro NRND insn:vararg + \insn + .endm + .endif +function ff_\pfx\name\suf\()_neon, export=1 + \name \rnd, \avg +endfunc + .purgem avg + .purgem shrn + .purgem NRND +.endm + +.macro pixfunc2 pfx, name, avg=0 + pixfunc \pfx, \name, rnd=1, avg=\avg + pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg +.endm + +function ff_put_h264_qpel16_mc00_neon, export=1 + mov r3, #16 +endfunc + + pixfunc put_, pixels16, avg=0 + pixfunc2 put_, pixels16_x2, avg=0 + pixfunc2 put_, pixels16_y2, avg=0 + pixfunc2 put_, pixels16_xy2, avg=0 + +function ff_avg_h264_qpel16_mc00_neon, export=1 + mov r3, #16 +endfunc + + pixfunc avg_, pixels16, avg=1 + pixfunc2 avg_, pixels16_x2, avg=1 + pixfunc2 avg_, pixels16_y2, avg=1 + pixfunc2 avg_, pixels16_xy2, avg=1 + +function ff_put_h264_qpel8_mc00_neon, export=1 + mov r3, #8 +endfunc + + pixfunc put_, pixels8, avg=0 + pixfunc2 put_, pixels8_x2, avg=0 + pixfunc2 put_, pixels8_y2, avg=0 + pixfunc2 put_, pixels8_xy2, avg=0 + +function ff_avg_h264_qpel8_mc00_neon, export=1 + mov r3, #8 +endfunc + + pixfunc avg_, pixels8, avg=1 + pixfunc avg_, pixels8_x2, avg=1 + pixfunc avg_, pixels8_y2, avg=1 + pixfunc avg_, pixels8_xy2, avg=1 diff --git a/ffmpeg/libavcodec/arm/int_neon.S b/ffmpeg/libavcodec/arm/int_neon.S new file mode 100644 index 0000000..6b28a97 --- /dev/null +++ b/ffmpeg/libavcodec/arm/int_neon.S @@ -0,0 +1,92 @@ +/* + * ARM NEON optimised integer operations + * Copyright (c) 2009 Kostya Shishkov + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + + .fpu neon + +function ff_scalarproduct_int16_neon, export=1 + vmov.i16 q0, #0 + vmov.i16 q1, #0 + vmov.i16 q2, #0 + vmov.i16 q3, #0 +1: vld1.16 {d16-d17}, [r0]! + vld1.16 {d20-d21}, [r1,:128]! + vmlal.s16 q0, d16, d20 + vld1.16 {d18-d19}, [r0]! + vmlal.s16 q1, d17, d21 + vld1.16 {d22-d23}, [r1,:128]! 
+ vmlal.s16 q2, d18, d22 + vmlal.s16 q3, d19, d23 + subs r2, r2, #16 + bne 1b + + vpadd.s32 d16, d0, d1 + vpadd.s32 d17, d2, d3 + vpadd.s32 d10, d4, d5 + vpadd.s32 d11, d6, d7 + vpadd.s32 d0, d16, d17 + vpadd.s32 d1, d10, d11 + vpadd.s32 d2, d0, d1 + vpaddl.s32 d3, d2 + vmov.32 r0, d3[0] + bx lr +endfunc + +@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul) +function ff_scalarproduct_and_madd_int16_neon, export=1 + vld1.16 {d28[],d29[]}, [sp] + vmov.i16 q0, #0 + vmov.i16 q1, #0 + vmov.i16 q2, #0 + vmov.i16 q3, #0 + mov r12, r0 + +1: vld1.16 {d16-d17}, [r0,:128]! + vld1.16 {d18-d19}, [r1]! + vld1.16 {d20-d21}, [r2]! + vld1.16 {d22-d23}, [r0,:128]! + vld1.16 {d24-d25}, [r1]! + vld1.16 {d26-d27}, [r2]! + vmul.s16 q10, q10, q14 + vmul.s16 q13, q13, q14 + vmlal.s16 q0, d16, d18 + vmlal.s16 q1, d17, d19 + vadd.s16 q10, q8, q10 + vadd.s16 q13, q11, q13 + vmlal.s16 q2, d22, d24 + vmlal.s16 q3, d23, d25 + vst1.16 {q10}, [r12,:128]! + subs r3, r3, #16 + vst1.16 {q13}, [r12,:128]! + bne 1b + + vpadd.s32 d16, d0, d1 + vpadd.s32 d17, d2, d3 + vpadd.s32 d10, d4, d5 + vpadd.s32 d11, d6, d7 + vpadd.s32 d0, d16, d17 + vpadd.s32 d1, d10, d11 + vpadd.s32 d2, d0, d1 + vpaddl.s32 d3, d2 + vmov.32 r0, d3[0] + bx lr +endfunc diff --git a/ffmpeg/libavcodec/arm/jrevdct_arm.S b/ffmpeg/libavcodec/arm/jrevdct_arm.S new file mode 100644 index 0000000..f951e2a --- /dev/null +++ b/ffmpeg/libavcodec/arm/jrevdct_arm.S @@ -0,0 +1,383 @@ +/* + C-like prototype : + void j_rev_dct_arm(DCTBLOCK data) + + With DCTBLOCK being a pointer to an array of 64 'signed shorts' + + Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org) + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +*/ + +#include "libavutil/arm/asm.S" + +#define FIX_0_298631336 2446 +#define FIX_0_541196100 4433 +#define FIX_0_765366865 6270 +#define FIX_1_175875602 9633 +#define FIX_1_501321110 12299 +#define FIX_2_053119869 16819 +#define FIX_3_072711026 25172 +#define FIX_M_0_390180644 -3196 +#define FIX_M_0_899976223 -7373 +#define FIX_M_1_847759065 -15137 +#define FIX_M_1_961570560 -16069 +#define FIX_M_2_562915447 -20995 +#define FIX_0xFFFF 0xFFFF + +#define FIX_0_298631336_ID 0 +#define FIX_0_541196100_ID 4 +#define FIX_0_765366865_ID 8 +#define FIX_1_175875602_ID 12 +#define FIX_1_501321110_ID 16 +#define FIX_2_053119869_ID 20 +#define FIX_3_072711026_ID 24 +#define FIX_M_0_390180644_ID 28 +#define FIX_M_0_899976223_ID 32 +#define FIX_M_1_847759065_ID 36 +#define FIX_M_1_961570560_ID 40 +#define FIX_M_2_562915447_ID 44 +#define FIX_0xFFFF_ID 48 + +function ff_j_rev_dct_arm, export=1 + push {r0, r4 - r11, lr} + + mov lr, r0 @ lr = pointer to the current row + mov r12, #8 @ r12 = row-counter + movrel r11, const_array @ r11 = base pointer to the constants array +row_loop: + ldrsh r0, [lr, # 0] @ r0 = 'd0' + ldrsh r2, [lr, # 2] @ r2 = 'd2' + + @ Optimization for row that have all items except the first set to 0 + @ (this works as the int16_t are always 4-byte aligned) + ldr r5, [lr, # 0] + ldr r6, [lr, # 4] + ldr r3, [lr, # 8] + ldr r4, [lr, #12] + orr r3, r3, r4 + orr r3, r3, r6 + orrs r5, r3, r5 + beq end_of_row_loop @ nothing to be done as ALL of them are '0' + orrs r3, r3, r2 + beq empty_row + + ldrsh r1, [lr, # 8] @ r1 = 'd1' + ldrsh r4, [lr, # 4] @ r4 = 'd4' + ldrsh r6, [lr, # 6] @ r6 = 'd6' + + ldr r3, [r11, #FIX_0_541196100_ID] + add r7, r2, r6 + ldr r5, [r11, #FIX_M_1_847759065_ID] + mul r7, r3, r7 @ r7 = z1 + ldr r3, [r11, #FIX_0_765366865_ID] + mla r6, r5, r6, r7 @ r6 = tmp2 + add r5, r0, r4 @ r5 = tmp0 + mla r2, r3, r2, r7 @ r2 = tmp3 + sub r3, r0, r4 @ r3 = tmp1 + + add r0, r2, r5, lsl #13 @ r0 = tmp10 + rsb r2, r2, r5, lsl #13 @ r2 = tmp13 + add r4, r6, r3, lsl #13 @ r4 = tmp11 + rsb r3, r6, r3, lsl #13 @ r3 = tmp12 + + push {r0, r2, r3, r4} @ save on the stack tmp10, tmp13, tmp12, tmp11 + + ldrsh r3, [lr, #10] @ r3 = 'd3' + ldrsh r5, [lr, #12] @ r5 = 'd5' + ldrsh r7, [lr, #14] @ r7 = 'd7' + + add r0, r3, r5 @ r0 = 'z2' + add r2, r1, r7 @ r2 = 'z1' + add r4, r3, r7 @ r4 = 'z3' + add r6, r1, r5 @ r6 = 'z4' + ldr r9, [r11, #FIX_1_175875602_ID] + add r8, r4, r6 @ r8 = z3 + z4 + ldr r10, [r11, #FIX_M_0_899976223_ID] + mul r8, r9, r8 @ r8 = 'z5' + ldr r9, [r11, #FIX_M_2_562915447_ID] + mul r2, r10, r2 @ r2 = 'z1' + ldr r10, [r11, #FIX_M_1_961570560_ID] + mul r0, r9, r0 @ r0 = 'z2' + ldr r9, [r11, #FIX_M_0_390180644_ID] + mla r4, r10, r4, r8 @ r4 = 'z3' + ldr r10, [r11, #FIX_0_298631336_ID] + mla r6, r9, r6, r8 @ r6 = 'z4' + ldr r9, [r11, #FIX_2_053119869_ID] + mla r7, r10, r7, r2 @ r7 = tmp0 + z1 + ldr r10, [r11, #FIX_3_072711026_ID] + mla r5, r9, r5, r0 @ r5 = tmp1 + z2 + ldr r9, [r11, #FIX_1_501321110_ID] + mla r3, r10, r3, r0 @ r3 = tmp2 + z2 + add r7, r7, r4 @ r7 = tmp0 + mla r1, r9, r1, r2 @ r1 = tmp3 + z1 + add r5, r5, r6 @ r5 = tmp1 + add r3, r3, r4 @ r3 = tmp2 + add r1, r1, r6 @ r1 = tmp3 + + pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11 + @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 + + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) + add r8, r0, r1 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 0] + + @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) + sub r8, r0, r1 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + 
strh r8, [lr, #14] + + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) + add r8, r6, r3 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 2] + + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) + sub r8, r6, r3 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, #12] + + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) + add r8, r4, r5 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 4] + + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) + sub r8, r4, r5 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, #10] + + @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) + add r8, r2, r7 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 6] + + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) + sub r8, r2, r7 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 8] + + @ End of row loop + add lr, lr, #16 + subs r12, r12, #1 + bne row_loop + beq start_column_loop + +empty_row: + ldr r1, [r11, #FIX_0xFFFF_ID] + mov r0, r0, lsl #2 + and r0, r0, r1 + add r0, r0, r0, lsl #16 + str r0, [lr, # 0] + str r0, [lr, # 4] + str r0, [lr, # 8] + str r0, [lr, #12] + +end_of_row_loop: + @ End of loop + add lr, lr, #16 + subs r12, r12, #1 + bne row_loop + +start_column_loop: + @ Start of column loop + pop {lr} + mov r12, #8 +column_loop: + ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0' + ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2' + ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4' + ldrsh r6, [lr, #(12*8)] @ r6 = 'd6' + + ldr r3, [r11, #FIX_0_541196100_ID] + add r1, r2, r6 + ldr r5, [r11, #FIX_M_1_847759065_ID] + mul r1, r3, r1 @ r1 = z1 + ldr r3, [r11, #FIX_0_765366865_ID] + mla r6, r5, r6, r1 @ r6 = tmp2 + add r5, r0, r4 @ r5 = tmp0 + mla r2, r3, r2, r1 @ r2 = tmp3 + sub r3, r0, r4 @ r3 = tmp1 + + add r0, r2, r5, lsl #13 @ r0 = tmp10 + rsb r2, r2, r5, lsl #13 @ r2 = tmp13 + add r4, r6, r3, lsl #13 @ r4 = tmp11 + rsb r6, r6, r3, lsl #13 @ r6 = tmp12 + + ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1' + ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3' + ldrsh r5, [lr, #(10*8)] @ r5 = 'd5' + ldrsh r7, [lr, #(14*8)] @ r7 = 'd7' + + @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats) + orr r9, r1, r3 + orr r10, r5, r7 + orrs r10, r9, r10 + beq empty_odd_column + + push {r0, r2, r4, r6} @ save on the stack tmp10, tmp13, tmp12, tmp11 + + add r0, r3, r5 @ r0 = 'z2' + add r2, r1, r7 @ r2 = 'z1' + add r4, r3, r7 @ r4 = 'z3' + add r6, r1, r5 @ r6 = 'z4' + ldr r9, [r11, #FIX_1_175875602_ID] + add r8, r4, r6 + ldr r10, [r11, #FIX_M_0_899976223_ID] + mul r8, r9, r8 @ r8 = 'z5' + ldr r9, [r11, #FIX_M_2_562915447_ID] + mul r2, r10, r2 @ r2 = 'z1' + ldr r10, [r11, #FIX_M_1_961570560_ID] + mul r0, r9, r0 @ r0 = 'z2' + ldr r9, [r11, #FIX_M_0_390180644_ID] + mla r4, r10, r4, r8 @ r4 = 'z3' + ldr r10, [r11, #FIX_0_298631336_ID] + mla r6, r9, r6, r8 @ r6 = 'z4' + ldr r9, [r11, #FIX_2_053119869_ID] + mla r7, r10, r7, r2 @ r7 = tmp0 + z1 + ldr r10, [r11, #FIX_3_072711026_ID] + mla r5, r9, r5, r0 @ r5 = tmp1 + z2 + ldr r9, [r11, #FIX_1_501321110_ID] + mla r3, r10, r3, r0 @ r3 = tmp2 + z2 + add r7, r7, r4 @ r7 = tmp0 + mla r1, r9, r1, r2 @ r1 = tmp3 + z1 + add r5, r5, r6 @ r5 = tmp1 + add r3, r3, r4 @ r3 = tmp2 + add r1, r1, r6 @ r1 = tmp3 + + pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12 + @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 + + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) + add r8, r0, r1 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 0*8)] + + @ Compute DESCALE(tmp10 - tmp3, 
CONST_BITS+PASS1_BITS+3) + sub r8, r0, r1 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #(14*8)] + + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) + add r8, r4, r3 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 2*8)] + + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) + sub r8, r4, r3 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #(12*8)] + + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) + add r8, r6, r5 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 4*8)] + + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) + sub r8, r6, r5 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #(10*8)] + + @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) + add r8, r2, r7 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 6*8)] + + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) + sub r8, r2, r7 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 8*8)] + + @ End of row loop + add lr, lr, #2 + subs r12, r12, #1 + bne column_loop + beq the_end + +empty_odd_column: + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) + add r0, r0, #(1<<17) + mov r0, r0, asr #18 + strh r0, [lr, #( 0*8)] + strh r0, [lr, #(14*8)] + + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) + add r4, r4, #(1<<17) + mov r4, r4, asr #18 + strh r4, [lr, #( 2*8)] + strh r4, [lr, #(12*8)] + + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) + add r6, r6, #(1<<17) + mov r6, r6, asr #18 + strh r6, [lr, #( 4*8)] + strh r6, [lr, #(10*8)] + + @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) + add r2, r2, #(1<<17) + mov r2, r2, asr #18 + strh r2, [lr, #( 6*8)] + strh r2, [lr, #( 8*8)] + + @ End of row loop + add lr, lr, #2 + subs r12, r12, #1 + bne column_loop + +the_end: + @ The end.... + pop {r4 - r11, pc} +endfunc + +const const_array + .word FIX_0_298631336 + .word FIX_0_541196100 + .word FIX_0_765366865 + .word FIX_1_175875602 + .word FIX_1_501321110 + .word FIX_2_053119869 + .word FIX_3_072711026 + .word FIX_M_0_390180644 + .word FIX_M_0_899976223 + .word FIX_M_1_847759065 + .word FIX_M_1_961570560 + .word FIX_M_2_562915447 + .word FIX_0xFFFF +endconst diff --git a/ffmpeg/libavcodec/arm/mathops.h b/ffmpeg/libavcodec/arm/mathops.h new file mode 100644 index 0000000..dc57c55 --- /dev/null +++ b/ffmpeg/libavcodec/arm/mathops.h @@ -0,0 +1,108 @@ +/* + * simple math operations + * Copyright (c) 2006 Michael Niedermayer et al + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_MATHOPS_H +#define AVCODEC_ARM_MATHOPS_H + +#include +#include "config.h" +#include "libavutil/common.h" + +#if HAVE_INLINE_ASM + +#if HAVE_ARMV6_INLINE +#define MULH MULH +static inline av_const int MULH(int a, int b) +{ + int r; + __asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); + return r; +} + +#define FASTDIV FASTDIV +static av_always_inline av_const int FASTDIV(int a, int b) +{ + int r; + __asm__ ("cmp %2, #2 \n\t" + "ldr %0, [%3, %2, lsl #2] \n\t" + "ite le \n\t" + "lsrle %0, %1, #1 \n\t" + "smmulgt %0, %0, %1 \n\t" + : "=&r"(r) : "r"(a), "r"(b), "r"(ff_inverse) : "cc"); + return r; +} + +#else /* HAVE_ARMV6_INLINE */ + +#define FASTDIV FASTDIV +static av_always_inline av_const int FASTDIV(int a, int b) +{ + int r, t; + __asm__ ("umull %1, %0, %2, %3" + : "=&r"(r), "=&r"(t) : "r"(a), "r"(ff_inverse[b])); + return r; +} +#endif + +#define MLS64(d, a, b) MAC64(d, -(a), b) + +#if HAVE_ARMV5TE_INLINE + +/* signed 16x16 -> 32 multiply add accumulate */ +# define MAC16(rt, ra, rb) \ + __asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb)); + +/* signed 16x16 -> 32 multiply */ +# define MUL16 MUL16 +static inline av_const int MUL16(int ra, int rb) +{ + int rt; + __asm__ ("smulbb %0, %1, %2" : "=r"(rt) : "r"(ra), "r"(rb)); + return rt; +} + +#endif + +#define mid_pred mid_pred +static inline av_const int mid_pred(int a, int b, int c) +{ + int m; + __asm__ ( + "mov %0, %2 \n\t" + "cmp %1, %2 \n\t" + "itt gt \n\t" + "movgt %0, %1 \n\t" + "movgt %1, %2 \n\t" + "cmp %1, %3 \n\t" + "it le \n\t" + "movle %1, %3 \n\t" + "cmp %0, %1 \n\t" + "it gt \n\t" + "movgt %0, %1 \n\t" + : "=&r"(m), "+r"(a) + : "r"(b), "r"(c) + : "cc"); + return m; +} + +#endif /* HAVE_INLINE_ASM */ + +#endif /* AVCODEC_ARM_MATHOPS_H */ diff --git a/ffmpeg/libavcodec/arm/mdct_fixed_neon.S b/ffmpeg/libavcodec/arm/mdct_fixed_neon.S new file mode 100644 index 0000000..c77be59 --- /dev/null +++ b/ffmpeg/libavcodec/arm/mdct_fixed_neon.S @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2011 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +.macro prerot dst, rt + lsr r3, r6, #2 @ n4 + add \rt, r4, r6, lsr #1 @ revtab + n4 + add r9, r3, r3, lsl #1 @ n3 + add r8, r7, r6 @ tcos + n4 + add r3, r2, r6, lsr #1 @ in + n4 + add r9, r2, r9, lsl #1 @ in + n3 + sub r8, r8, #16 + sub r10, r3, #16 + sub r11, r9, #16 + mov r12, #-16 +1: + vld2.16 {d0,d1}, [r9, :128]! + vld2.16 {d2,d3}, [r11,:128], r12 + vld2.16 {d4,d5}, [r3, :128]! + vld2.16 {d6,d7}, [r10,:128], r12 + vld2.16 {d16,d17},[r7, :128]! 
@ cos, sin + vld2.16 {d18,d19},[r8, :128], r12 + vrev64.16 q1, q1 + vrev64.16 q3, q3 + vrev64.16 q9, q9 + vneg.s16 d0, d0 + vneg.s16 d2, d2 + vneg.s16 d16, d16 + vneg.s16 d18, d18 + vhsub.s16 d0, d0, d3 @ re + vhsub.s16 d4, d7, d4 @ im + vhsub.s16 d6, d6, d5 + vhsub.s16 d2, d2, d1 + vmull.s16 q10, d0, d16 + vmlsl.s16 q10, d4, d17 + vmull.s16 q11, d0, d17 + vmlal.s16 q11, d4, d16 + vmull.s16 q12, d6, d18 + vmlsl.s16 q12, d2, d19 + vmull.s16 q13, d6, d19 + vmlal.s16 q13, d2, d18 + vshrn.s32 d0, q10, #15 + vshrn.s32 d1, q11, #15 + vshrn.s32 d2, q12, #15 + vshrn.s32 d3, q13, #15 + vzip.16 d0, d1 + vzip.16 d2, d3 + ldrh lr, [r4], #2 + ldrh r2, [\rt, #-2]! + add lr, \dst, lr, lsl #2 + add r2, \dst, r2, lsl #2 + vst1.32 {d0[0]}, [lr,:32] + vst1.32 {d2[0]}, [r2,:32] + ldrh lr, [r4], #2 + ldrh r2, [\rt, #-2]! + add lr, \dst, lr, lsl #2 + add r2, \dst, r2, lsl #2 + vst1.32 {d0[1]}, [lr,:32] + vst1.32 {d2[1]}, [r2,:32] + ldrh lr, [r4], #2 + ldrh r2, [\rt, #-2]! + add lr, \dst, lr, lsl #2 + add r2, \dst, r2, lsl #2 + vst1.32 {d1[0]}, [lr,:32] + vst1.32 {d3[0]}, [r2,:32] + ldrh lr, [r4], #2 + ldrh r2, [\rt, #-2]! + add lr, \dst, lr, lsl #2 + add r2, \dst, r2, lsl #2 + vst1.32 {d1[1]}, [lr,:32] + vst1.32 {d3[1]}, [r2,:32] + subs r6, r6, #32 + bgt 1b +.endm + +function ff_mdct_fixed_calc_neon, export=1 + push {r1,r4-r11,lr} + + ldr r4, [r0, #8] @ revtab + ldr r6, [r0, #16] @ mdct_size; n + ldr r7, [r0, #24] @ tcos + + prerot r1, r5 + + mov r4, r0 + bl X(ff_fft_fixed_calc_neon) + + pop {r5} + mov r12, #-16 + ldr r6, [r4, #16] @ mdct_size; n + ldr r7, [r4, #24] @ tcos + add r5, r5, r6, lsr #1 + add r7, r7, r6, lsr #1 + sub r1, r5, #16 + sub r2, r7, #16 +1: + vld2.16 {d4,d5}, [r7,:128]! + vld2.16 {d6,d7}, [r2,:128], r12 + vld2.16 {d0,d1}, [r5,:128] + vld2.16 {d2,d3}, [r1,:128] + vrev64.16 q3, q3 + vrev64.16 q1, q1 + vneg.s16 q3, q3 + vneg.s16 q2, q2 + vmull.s16 q11, d2, d6 + vmlal.s16 q11, d3, d7 + vmull.s16 q8, d0, d5 + vmlsl.s16 q8, d1, d4 + vmull.s16 q9, d0, d4 + vmlal.s16 q9, d1, d5 + vmull.s16 q10, d2, d7 + vmlsl.s16 q10, d3, d6 + vshrn.s32 d0, q11, #15 + vshrn.s32 d1, q8, #15 + vshrn.s32 d2, q9, #15 + vshrn.s32 d3, q10, #15 + vrev64.16 q0, q0 + vst2.16 {d2,d3}, [r5,:128]! + vst2.16 {d0,d1}, [r1,:128], r12 + subs r6, r6, #32 + bgt 1b + + pop {r4-r11,pc} +endfunc + +function ff_mdct_fixed_calcw_neon, export=1 + push {r1,r4-r11,lr} + + ldrd r4, r5, [r0, #8] @ revtab, tmp_buf + ldr r6, [r0, #16] @ mdct_size; n + ldr r7, [r0, #24] @ tcos + + prerot r5, r1 + + mov r4, r0 + mov r1, r5 + bl X(ff_fft_fixed_calc_neon) + + pop {r7} + mov r12, #-16 + ldr r6, [r4, #16] @ mdct_size; n + ldr r9, [r4, #24] @ tcos + add r5, r5, r6, lsr #1 + add r7, r7, r6 + add r9, r9, r6, lsr #1 + sub r3, r5, #16 + sub r1, r7, #16 + sub r2, r9, #16 +1: + vld2.16 {d4,d5}, [r9,:128]! + vld2.16 {d6,d7}, [r2,:128], r12 + vld2.16 {d0,d1}, [r5,:128]! + vld2.16 {d2,d3}, [r3,:128], r12 + vrev64.16 q3, q3 + vrev64.16 q1, q1 + vneg.s16 q3, q3 + vneg.s16 q2, q2 + vmull.s16 q8, d2, d6 + vmlal.s16 q8, d3, d7 + vmull.s16 q9, d0, d5 + vmlsl.s16 q9, d1, d4 + vmull.s16 q10, d0, d4 + vmlal.s16 q10, d1, d5 + vmull.s16 q11, d2, d7 + vmlsl.s16 q11, d3, d6 + vrev64.32 q8, q8 + vrev64.32 q9, q9 + vst2.32 {q10,q11},[r7,:128]! 
+ vst2.32 {d16,d18},[r1,:128], r12 + vst2.32 {d17,d19},[r1,:128], r12 + subs r6, r6, #32 + bgt 1b + + pop {r4-r11,pc} +endfunc diff --git a/ffmpeg/libavcodec/arm/mdct_neon.S b/ffmpeg/libavcodec/arm/mdct_neon.S new file mode 100644 index 0000000..e481cd1 --- /dev/null +++ b/ffmpeg/libavcodec/arm/mdct_neon.S @@ -0,0 +1,301 @@ +/* + * ARM NEON optimised MDCT + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +#define ff_fft_calc_neon X(ff_fft_calc_neon) + +function ff_imdct_half_neon, export=1 + push {r4-r8,lr} + + mov r12, #1 + ldr lr, [r0, #20] @ mdct_bits + ldr r4, [r0, #24] @ tcos + ldr r3, [r0, #8] @ revtab + lsl r12, r12, lr @ n = 1 << nbits + lsr lr, r12, #2 @ n4 = n >> 2 + add r7, r2, r12, lsl #1 + mov r12, #-16 + sub r7, r7, #16 + + vld2.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0 + vld2.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x + vrev64.32 d17, d17 + vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2 + vmul.f32 d6, d17, d2 + vmul.f32 d7, d0, d2 +1: + subs lr, lr, #2 + ldr r6, [r3], #4 + vmul.f32 d4, d0, d3 + vmul.f32 d5, d17, d3 + vsub.f32 d4, d6, d4 + vadd.f32 d5, d5, d7 + uxth r8, r6, ror #16 + uxth r6, r6 + add r8, r1, r8, lsl #3 + add r6, r1, r6, lsl #3 + beq 1f + vld2.32 {d16-d17},[r7,:128],r12 + vld2.32 {d0-d1}, [r2,:128]! + vrev64.32 d17, d17 + vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2 + vmul.f32 d6, d17, d2 + vmul.f32 d7, d0, d2 + vst2.32 {d4[0],d5[0]}, [r6,:64] + vst2.32 {d4[1],d5[1]}, [r8,:64] + b 1b +1: + vst2.32 {d4[0],d5[0]}, [r6,:64] + vst2.32 {d4[1],d5[1]}, [r8,:64] + + mov r4, r0 + mov r6, r1 + bl ff_fft_calc_neon + + mov r12, #1 + ldr lr, [r4, #20] @ mdct_bits + ldr r4, [r4, #24] @ tcos + lsl r12, r12, lr @ n = 1 << nbits + lsr lr, r12, #3 @ n8 = n >> 3 + + add r4, r4, lr, lsl #3 + add r6, r6, lr, lsl #3 + sub r1, r4, #16 + sub r3, r6, #16 + + mov r7, #-16 + mov r8, r6 + mov r0, r3 + + vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0 + vld2.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3 + vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0 +1: + subs lr, lr, #2 + vmul.f32 d7, d0, d18 + vld2.32 {d17,d19},[r4,:128]! @ d17=c2,c3 d19=s2,s3 + vmul.f32 d4, d1, d18 + vmul.f32 d5, d21, d19 + vmul.f32 d6, d20, d19 + vmul.f32 d22, d1, d16 + vmul.f32 d23, d21, d17 + vmul.f32 d24, d0, d16 + vmul.f32 d25, d20, d17 + vadd.f32 d7, d7, d22 + vadd.f32 d6, d6, d23 + vsub.f32 d4, d4, d24 + vsub.f32 d5, d5, d25 + beq 1f + vld2.32 {d0-d1}, [r3,:128], r7 + vld2.32 {d20-d21},[r6,:128]! + vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0 + vrev64.32 q3, q3 + vst2.32 {d4,d6}, [r0,:128], r7 + vst2.32 {d5,d7}, [r8,:128]! 
+ b 1b +1: + vrev64.32 q3, q3 + vst2.32 {d4,d6}, [r0,:128] + vst2.32 {d5,d7}, [r8,:128] + + pop {r4-r8,pc} +endfunc + +function ff_imdct_calc_neon, export=1 + push {r4-r6,lr} + + ldr r3, [r0, #20] + mov r4, #1 + mov r5, r1 + lsl r4, r4, r3 + add r1, r1, r4 + + bl ff_imdct_half_neon + + add r0, r5, r4, lsl #2 + add r1, r5, r4, lsl #1 + sub r0, r0, #8 + sub r2, r1, #16 + mov r3, #-16 + mov r6, #-8 + vmov.i32 d30, #1<<31 +1: + vld1.32 {d0-d1}, [r2,:128], r3 + pld [r0, #-16] + vrev64.32 q0, q0 + vld1.32 {d2-d3}, [r1,:128]! + veor d4, d1, d30 + pld [r2, #-16] + vrev64.32 q1, q1 + veor d5, d0, d30 + vst1.32 {d2}, [r0,:64], r6 + vst1.32 {d3}, [r0,:64], r6 + vst1.32 {d4-d5}, [r5,:128]! + subs r4, r4, #16 + bgt 1b + + pop {r4-r6,pc} +endfunc + +function ff_mdct_calc_neon, export=1 + push {r4-r10,lr} + + mov r12, #1 + ldr lr, [r0, #20] @ mdct_bits + ldr r4, [r0, #24] @ tcos + ldr r3, [r0, #8] @ revtab + lsl lr, r12, lr @ n = 1 << nbits + add r7, r2, lr @ in4u + sub r9, r7, #16 @ in4d + add r2, r7, lr, lsl #1 @ in3u + add r8, r9, lr, lsl #1 @ in3d + add r5, r4, lr, lsl #1 + sub r5, r5, #16 + sub r3, r3, #4 + mov r12, #-16 + + vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0 + vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0 + vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0 + vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1 + vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0 + vsub.f32 d0, d18, d0 @ in4d-in4u I + vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1 + vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1 + vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3 + vadd.f32 d1, d1, d19 @ in3u+in3d -R + vsub.f32 d16, d16, d2 @ in0u-in2d R + vadd.f32 d17, d17, d3 @ in2u+in1d -I +1: + vmul.f32 d7, d0, d21 @ I*s +A ldr r10, [r3, lr, lsr #1] +T lsr r10, lr, #1 +T ldr r10, [r3, r10] + vmul.f32 d6, d1, d20 @ -R*c + ldr r6, [r3, #4]! + vmul.f32 d4, d1, d21 @ -R*s + vmul.f32 d5, d0, d20 @ I*c + vmul.f32 d24, d16, d30 @ R*c + vmul.f32 d25, d17, d31 @ -I*s + vmul.f32 d22, d16, d31 @ R*s + vmul.f32 d23, d17, d30 @ I*c + subs lr, lr, #16 + vsub.f32 d6, d6, d7 @ -R*c-I*s + vadd.f32 d7, d4, d5 @ -R*s+I*c + vsub.f32 d24, d25, d24 @ I*s-R*c + vadd.f32 d25, d22, d23 @ R*s-I*c + beq 1f + mov r12, #-16 + vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0 + vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0 + vneg.f32 d7, d7 @ R*s-I*c + vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0 + vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1 + vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0 + vsub.f32 d0, d18, d0 @ in4d-in4u I + vld2.32 {d20,d21},[r4,:128]! 
@ c0,c1 s0,s1 + vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1 + vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3 + vadd.f32 d1, d1, d19 @ in3u+in3d -R + vsub.f32 d16, d16, d2 @ in0u-in2d R + vadd.f32 d17, d17, d3 @ in2u+in1d -I + uxth r12, r6, ror #16 + uxth r6, r6 + add r12, r1, r12, lsl #3 + add r6, r1, r6, lsl #3 + vst2.32 {d6[0],d7[0]}, [r6,:64] + vst2.32 {d6[1],d7[1]}, [r12,:64] + uxth r6, r10, ror #16 + uxth r10, r10 + add r6 , r1, r6, lsl #3 + add r10, r1, r10, lsl #3 + vst2.32 {d24[0],d25[0]},[r10,:64] + vst2.32 {d24[1],d25[1]},[r6,:64] + b 1b +1: + vneg.f32 d7, d7 @ R*s-I*c + uxth r12, r6, ror #16 + uxth r6, r6 + add r12, r1, r12, lsl #3 + add r6, r1, r6, lsl #3 + vst2.32 {d6[0],d7[0]}, [r6,:64] + vst2.32 {d6[1],d7[1]}, [r12,:64] + uxth r6, r10, ror #16 + uxth r10, r10 + add r6 , r1, r6, lsl #3 + add r10, r1, r10, lsl #3 + vst2.32 {d24[0],d25[0]},[r10,:64] + vst2.32 {d24[1],d25[1]},[r6,:64] + + mov r4, r0 + mov r6, r1 + bl ff_fft_calc_neon + + mov r12, #1 + ldr lr, [r4, #20] @ mdct_bits + ldr r4, [r4, #24] @ tcos + lsl r12, r12, lr @ n = 1 << nbits + lsr lr, r12, #3 @ n8 = n >> 3 + + add r4, r4, lr, lsl #3 + add r6, r6, lr, lsl #3 + sub r1, r4, #16 + sub r3, r6, #16 + + mov r7, #-16 + mov r8, r6 + mov r0, r3 + + vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0 + vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3 + vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0 +1: + subs lr, lr, #2 + vmul.f32 d7, d0, d18 @ r1*s1,r0*s0 + vld2.32 {d17,d19},[r4,:128]! @ c2,c3 s2,s3 + vmul.f32 d4, d1, d18 @ i1*s1,i0*s0 + vmul.f32 d5, d21, d19 @ i2*s2,i3*s3 + vmul.f32 d6, d20, d19 @ r2*s2,r3*s3 + vmul.f32 d24, d0, d16 @ r1*c1,r0*c0 + vmul.f32 d25, d20, d17 @ r2*c2,r3*c3 + vmul.f32 d22, d21, d17 @ i2*c2,i3*c3 + vmul.f32 d23, d1, d16 @ i1*c1,i0*c0 + vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0 + vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3 + vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3 + vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0 + vneg.f32 q2, q2 + beq 1f + vld2.32 {d0-d1}, [r3,:128], r7 + vld2.32 {d20-d21},[r6,:128]! + vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0 + vrev64.32 q3, q3 + vst2.32 {d4,d6}, [r0,:128], r7 + vst2.32 {d5,d7}, [r8,:128]! + b 1b +1: + vrev64.32 q3, q3 + vst2.32 {d4,d6}, [r0,:128] + vst2.32 {d5,d7}, [r8,:128] + + pop {r4-r10,pc} +endfunc diff --git a/ffmpeg/libavcodec/arm/mpegaudiodsp_fixed_armv6.S b/ffmpeg/libavcodec/arm/mpegaudiodsp_fixed_armv6.S new file mode 100644 index 0000000..49bd0bc --- /dev/null +++ b/ffmpeg/libavcodec/arm/mpegaudiodsp_fixed_armv6.S @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2011 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +.macro skip args:vararg +.endm + +.macro sum8 lo, hi, w, p, t1, t2, t3, t4, rsb=skip, offs=0 + ldr \t1, [\w, #4*\offs] + ldr \t2, [\p, #4]! 
+ \rsb \t1, \t1, #0 + .irpc i, 135 + ldr \t3, [\w, #4*64*\i+4*\offs] + ldr \t4, [\p, #4*64*\i] + smlal \lo, \hi, \t1, \t2 + \rsb \t3, \t3, #0 + ldr \t1, [\w, #4*64*(\i+1)+4*\offs] + ldr \t2, [\p, #4*64*(\i+1)] + smlal \lo, \hi, \t3, \t4 + \rsb \t1, \t1, #0 + .endr + ldr \t3, [\w, #4*64*7+4*\offs] + ldr \t4, [\p, #4*64*7] + smlal \lo, \hi, \t1, \t2 + \rsb \t3, \t3, #0 + smlal \lo, \hi, \t3, \t4 +.endm + +.macro round rd, lo, hi + lsr \rd, \lo, #24 + bic \lo, \lo, #0xff000000 + orr \rd, \rd, \hi, lsl #8 + mov \hi, #0 + ssat \rd, #16, \rd +.endm + +function ff_mpadsp_apply_window_fixed_armv6, export=1 + push {r2,r4-r11,lr} + + add r4, r0, #4*512 @ synth_buf + 512 + .rept 4 + ldm r0!, {r5-r12} + stm r4!, {r5-r12} + .endr + + ldr r4, [sp, #40] @ incr + sub r0, r0, #4*17 @ synth_buf + 16 + ldr r8, [r2] @ sum:low + add r2, r0, #4*32 @ synth_buf + 48 + rsb r5, r4, r4, lsl #5 @ 31 * incr + lsl r4, r4, #1 + asr r9, r8, #31 @ sum:high + add r5, r3, r5, lsl #1 @ samples2 + add r6, r1, #4*32 @ w2 + str r4, [sp, #40] + + sum8 r8, r9, r1, r0, r10, r11, r12, lr + sum8 r8, r9, r1, r2, r10, r11, r12, lr, rsb, 32 + round r10, r8, r9 + strh_post r10, r3, r4 + + mov lr, #15 +1: + ldr r12, [r0, #4]! + ldr r11, [r6, #-4]! + ldr r10, [r1, #4]! + .irpc i, 0246 + .if \i + ldr r11, [r6, #4*64*\i] + ldr r10, [r1, #4*64*\i] + .endif + rsb r11, r11, #0 + smlal r8, r9, r10, r12 + ldr r10, [r0, #4*64*(\i+1)] + .ifeq \i + smull r4, r7, r11, r12 + .else + smlal r4, r7, r11, r12 + .endif + ldr r11, [r6, #4*64*(\i+1)] + ldr r12, [r1, #4*64*(\i+1)] + rsb r11, r11, #0 + smlal r8, r9, r12, r10 + .iflt \i-6 + ldr r12, [r0, #4*64*(\i+2)] + .else + ldr r12, [r2, #-4]! + .endif + smlal r4, r7, r11, r10 + .endr + .irpc i, 0246 + ldr r10, [r1, #4*64*\i+4*32] + rsb r12, r12, #0 + ldr r11, [r6, #4*64*\i+4*32] + smlal r8, r9, r10, r12 + ldr r10, [r2, #4*64*(\i+1)] + smlal r4, r7, r11, r12 + ldr r12, [r1, #4*64*(\i+1)+4*32] + rsb r10, r10, #0 + ldr r11, [r6, #4*64*(\i+1)+4*32] + smlal r8, r9, r12, r10 + .iflt \i-6 + ldr r12, [r2, #4*64*(\i+2)] + .else + ldr r12, [sp, #40] + .endif + smlal r4, r7, r11, r10 + .endr + round r10, r8, r9 + adds r8, r8, r4 + adc r9, r9, r7 + strh_post r10, r3, r12 + round r11, r8, r9 + subs lr, lr, #1 + strh_dpost r11, r5, r12 + bgt 1b + + sum8 r8, r9, r1, r0, r10, r11, r12, lr, rsb, 33 + pop {r4} + round r10, r8, r9 + str r8, [r4] + strh r10, [r3] + + pop {r4-r11,pc} +endfunc diff --git a/ffmpeg/libavcodec/arm/mpegaudiodsp_init_arm.c b/ffmpeg/libavcodec/arm/mpegaudiodsp_init_arm.c new file mode 100644 index 0000000..e73aee6 --- /dev/null +++ b/ffmpeg/libavcodec/arm/mpegaudiodsp_init_arm.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2011 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "libavutil/attributes.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/mpegaudiodsp.h" +#include "config.h" + +void ff_mpadsp_apply_window_fixed_armv6(int32_t *synth_buf, int32_t *window, + int *dither, int16_t *out, int incr); + +av_cold void ff_mpadsp_init_arm(MPADSPContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_armv6(cpu_flags)) { + s->apply_window_fixed = ff_mpadsp_apply_window_fixed_armv6; + } +} diff --git a/ffmpeg/libavcodec/arm/mpegvideo_arm.c b/ffmpeg/libavcodec/arm/mpegvideo_arm.c new file mode 100644 index 0000000..6566798 --- /dev/null +++ b/ffmpeg/libavcodec/arm/mpegvideo_arm.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2002 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/cpu.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/mpegvideo.h" +#include "mpegvideo_arm.h" +#include "asm-offsets.h" + +#if HAVE_NEON +CHK_OFFS(MpegEncContext, y_dc_scale, Y_DC_SCALE); +CHK_OFFS(MpegEncContext, c_dc_scale, C_DC_SCALE); +CHK_OFFS(MpegEncContext, ac_pred, AC_PRED); +CHK_OFFS(MpegEncContext, block_last_index, BLOCK_LAST_INDEX); +CHK_OFFS(MpegEncContext, inter_scantable.raster_end, INTER_SCANTAB_RASTER_END); +CHK_OFFS(MpegEncContext, h263_aic, H263_AIC); +#endif + +void ff_dct_unquantize_h263_inter_neon(MpegEncContext *s, int16_t *block, + int n, int qscale); +void ff_dct_unquantize_h263_intra_neon(MpegEncContext *s, int16_t *block, + int n, int qscale); + +av_cold void ff_MPV_common_init_arm(MpegEncContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_armv5te(cpu_flags)) + ff_MPV_common_init_armv5te(s); + + if (have_neon(cpu_flags)) { + s->dct_unquantize_h263_intra = ff_dct_unquantize_h263_intra_neon; + s->dct_unquantize_h263_inter = ff_dct_unquantize_h263_inter_neon; + } +} diff --git a/ffmpeg/libavcodec/arm/mpegvideo_arm.h b/ffmpeg/libavcodec/arm/mpegvideo_arm.h new file mode 100644 index 0000000..4ff93b7 --- /dev/null +++ b/ffmpeg/libavcodec/arm/mpegvideo_arm.h @@ -0,0 +1,26 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_MPEGVIDEO_H +#define AVCODEC_ARM_MPEGVIDEO_H + +#include "libavcodec/mpegvideo.h" + +void ff_MPV_common_init_armv5te(MpegEncContext *s); + +#endif /* AVCODEC_ARM_MPEGVIDEO_H */ diff --git a/ffmpeg/libavcodec/arm/mpegvideo_armv5te.c b/ffmpeg/libavcodec/arm/mpegvideo_armv5te.c new file mode 100644 index 0000000..a572290 --- /dev/null +++ b/ffmpeg/libavcodec/arm/mpegvideo_armv5te.c @@ -0,0 +1,102 @@ +/* + * Optimization of some functions from mpegvideo.c for armv5te + * Copyright (c) 2007 Siarhei Siamashka + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/avassert.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/mpegvideo.h" +#include "mpegvideo_arm.h" + +void ff_dct_unquantize_h263_armv5te(int16_t *block, int qmul, int qadd, int count); + +#ifdef ENABLE_ARM_TESTS +/** + * h263 dequantizer supplementary function, it is performance critical and needs to + * have optimized implementations for each architecture. 
Is also used as a reference + * implementation in regression tests + */ +static inline void dct_unquantize_h263_helper_c(int16_t *block, int qmul, int qadd, int count) +{ + int i, level; + for (i = 0; i < count; i++) { + level = block[i]; + if (level) { + if (level < 0) { + level = level * qmul - qadd; + } else { + level = level * qmul + qadd; + } + block[i] = level; + } + } +} +#endif + +static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s, + int16_t *block, int n, int qscale) +{ + int level, qmul, qadd; + int nCoeffs; + + av_assert2(s->block_last_index[n]>=0); + + qmul = qscale << 1; + + if (!s->h263_aic) { + if (n < 4) + level = block[0] * s->y_dc_scale; + else + level = block[0] * s->c_dc_scale; + qadd = (qscale - 1) | 1; + }else{ + qadd = 0; + level = block[0]; + } + if(s->ac_pred) + nCoeffs=63; + else + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; + + ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1); + block[0] = level; +} + +static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s, + int16_t *block, int n, int qscale) +{ + int qmul, qadd; + int nCoeffs; + + av_assert2(s->block_last_index[n]>=0); + + qadd = (qscale - 1) | 1; + qmul = qscale << 1; + + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; + + ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1); +} + +av_cold void ff_MPV_common_init_armv5te(MpegEncContext *s) +{ + s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te; + s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te; +} diff --git a/ffmpeg/libavcodec/arm/mpegvideo_armv5te_s.S b/ffmpeg/libavcodec/arm/mpegvideo_armv5te_s.S new file mode 100644 index 0000000..8687d6b --- /dev/null +++ b/ffmpeg/libavcodec/arm/mpegvideo_armv5te_s.S @@ -0,0 +1,114 @@ +/* + * Optimization of some functions from mpegvideo.c for armv5te + * Copyright (c) 2007 Siarhei Siamashka + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/arm/asm.S" + +/* + * Special optimized version of dct_unquantize_h263_helper_c, it + * requires the block to be at least 8 bytes aligned, and may process + * more elements than requested. But it is guaranteed to never + * process more than 64 elements provided that count argument is <= 64, + * so it is safe. This function is optimized for a common distribution + * of values for nCoeffs (they are mostly multiple of 8 plus one or + * two extra elements). So this function processes data as 8 elements + * per loop iteration and contains optional 2 elements processing in + * the end. 
+ * + * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770) + */ + +.macro dequant_t dst, src, mul, add, tmp + rsbs \tmp, ip, \src, asr #16 + it gt + addgt \tmp, \add, #0 + it lt + rsblt \tmp, \add, #0 + it ne + smlatbne \dst, \src, \mul, \tmp +.endm + +.macro dequant_b dst, src, mul, add, tmp + rsbs \tmp, ip, \src, lsl #16 + it gt + addgt \tmp, \add, #0 + it lt + rsblt \tmp, \add, #0 + it ne + smlabbne \dst, \src, \mul, \tmp +.endm + +function ff_dct_unquantize_h263_armv5te, export=1 + push {r4-r9,lr} + mov ip, #0 + subs r3, r3, #2 + ble 2f + ldrd r4, r5, [r0, #0] +1: + ldrd r6, r7, [r0, #8] + + dequant_t r9, r4, r1, r2, r9 + dequant_t lr, r5, r1, r2, lr + dequant_b r4, r4, r1, r2, r8 + dequant_b r5, r5, r1, r2, r8 + + strh r4, [r0], #2 + strh r9, [r0], #2 + strh r5, [r0], #2 + strh lr, [r0], #2 + + dequant_t r9, r6, r1, r2, r9 + dequant_t lr, r7, r1, r2, lr + dequant_b r6, r6, r1, r2, r8 + dequant_b r7, r7, r1, r2, r8 + + strh r6, [r0], #2 + strh r9, [r0], #2 + strh r7, [r0], #2 + strh lr, [r0], #2 + + subs r3, r3, #8 + it gt + ldrdgt r4, r5, [r0, #0] /* load data early to avoid load/use pipeline stall */ + bgt 1b + + adds r3, r3, #2 + it le + pople {r4-r9,pc} +2: + ldrsh r9, [r0, #0] + ldrsh lr, [r0, #2] + mov r8, r2 + cmp r9, #0 + it lt + rsblt r8, r2, #0 + it ne + smlabbne r9, r9, r1, r8 + mov r8, r2 + cmp lr, #0 + it lt + rsblt r8, r2, #0 + it ne + smlabbne lr, lr, r1, r8 + strh r9, [r0], #2 + strh lr, [r0], #2 + pop {r4-r9,pc} +endfunc diff --git a/ffmpeg/libavcodec/arm/mpegvideo_neon.S b/ffmpeg/libavcodec/arm/mpegvideo_neon.S new file mode 100644 index 0000000..e05df8e --- /dev/null +++ b/ffmpeg/libavcodec/arm/mpegvideo_neon.S @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2010 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" +#include "asm-offsets.h" + +function ff_dct_unquantize_h263_inter_neon, export=1 + add r12, r0, #BLOCK_LAST_INDEX + ldr r12, [r12, r2, lsl #2] + add r0, r0, #INTER_SCANTAB_RASTER_END + ldrb r12, [r0, r12] + sub r2, r3, #1 + lsl r0, r3, #1 + orr r2, r2, #1 + add r3, r12, #1 +endfunc + +function ff_dct_unquantize_h263_neon, export=1 + vdup.16 q15, r0 @ qmul + vdup.16 q14, r2 @ qadd + vneg.s16 q13, q14 + cmp r3, #4 + mov r0, r1 + ble 2f +1: + vld1.16 {q0}, [r0,:128]! + vclt.s16 q3, q0, #0 + vld1.16 {q8}, [r0,:128]! + vceq.s16 q1, q0, #0 + vmul.s16 q2, q0, q15 + vclt.s16 q11, q8, #0 + vmul.s16 q10, q8, q15 + vbsl q3, q13, q14 + vbsl q11, q13, q14 + vadd.s16 q2, q2, q3 + vceq.s16 q9, q8, #0 + vadd.s16 q10, q10, q11 + vbif q0, q2, q1 + vbif q8, q10, q9 + subs r3, r3, #16 + vst1.16 {q0}, [r1,:128]! + vst1.16 {q8}, [r1,:128]! 
+ it le + bxle lr + cmp r3, #8 + bgt 1b +2: + vld1.16 {d0}, [r0,:64] + vclt.s16 d3, d0, #0 + vceq.s16 d1, d0, #0 + vmul.s16 d2, d0, d30 + vbsl d3, d26, d28 + vadd.s16 d2, d2, d3 + vbif d0, d2, d1 + vst1.16 {d0}, [r1,:64] + bx lr +endfunc + +function ff_dct_unquantize_h263_intra_neon, export=1 + push {r4-r6,lr} + add r12, r0, #BLOCK_LAST_INDEX + ldr r6, [r0, #AC_PRED] + add lr, r0, #INTER_SCANTAB_RASTER_END + cmp r6, #0 + it ne + movne r12, #63 + bne 1f + ldr r12, [r12, r2, lsl #2] + ldrb r12, [lr, r12] +1: ldr r5, [r0, #H263_AIC] + ldrsh r4, [r1] + cmp r5, #0 + mov r5, r1 + it ne + movne r2, #0 + bne 2f + cmp r2, #4 + it ge + addge r0, r0, #4 + sub r2, r3, #1 + ldr r6, [r0, #Y_DC_SCALE] + orr r2, r2, #1 + smulbb r4, r4, r6 +2: lsl r0, r3, #1 + add r3, r12, #1 + bl ff_dct_unquantize_h263_neon + vmov.16 d0[0], r4 + vst1.16 {d0[0]}, [r5] + pop {r4-r6,pc} +endfunc diff --git a/ffmpeg/libavcodec/arm/neon.S b/ffmpeg/libavcodec/arm/neon.S new file mode 100644 index 0000000..716a607 --- /dev/null +++ b/ffmpeg/libavcodec/arm/neon.S @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7 + vtrn.32 \r0, \r4 + vtrn.32 \r1, \r5 + vtrn.32 \r2, \r6 + vtrn.32 \r3, \r7 + vtrn.16 \r0, \r2 + vtrn.16 \r1, \r3 + vtrn.16 \r4, \r6 + vtrn.16 \r5, \r7 + vtrn.8 \r0, \r1 + vtrn.8 \r2, \r3 + vtrn.8 \r4, \r5 + vtrn.8 \r6, \r7 +.endm + +.macro transpose_4x4 r0, r1, r2, r3 + vtrn.16 \r0, \r2 + vtrn.16 \r1, \r3 + vtrn.8 \r0, \r1 + vtrn.8 \r2, \r3 +.endm + +.macro swap4 r0, r1, r2, r3, r4, r5, r6, r7 + vswp \r0, \r4 + vswp \r1, \r5 + vswp \r2, \r6 + vswp \r3, \r7 +.endm + +.macro transpose16_4x4 r0, r1, r2, r3, r4, r5, r6, r7 + vtrn.32 \r0, \r2 + vtrn.32 \r1, \r3 + vtrn.32 \r4, \r6 + vtrn.32 \r5, \r7 + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 + vtrn.16 \r4, \r5 + vtrn.16 \r6, \r7 +.endm diff --git a/ffmpeg/libavcodec/arm/rdft_neon.S b/ffmpeg/libavcodec/arm/rdft_neon.S new file mode 100644 index 0000000..781d976 --- /dev/null +++ b/ffmpeg/libavcodec/arm/rdft_neon.S @@ -0,0 +1,150 @@ +/* + * ARM NEON optimised RDFT + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_rdft_calc_neon, export=1 + push {r4-r8,lr} + + ldr r6, [r0, #4] @ inverse + mov r4, r0 + mov r5, r1 + + lsls r6, r6, #31 + bne 1f + add r0, r4, #20 + bl X(ff_fft_permute_neon) + add r0, r4, #20 + mov r1, r5 + bl X(ff_fft_calc_neon) +1: + ldr r12, [r4, #0] @ nbits + mov r2, #1 + lsl r12, r2, r12 + add r0, r5, #8 + add r1, r5, r12, lsl #2 + lsr r12, r12, #2 + ldr r2, [r4, #12] @ tcos + sub r12, r12, #2 + ldr r3, [r4, #16] @ tsin + mov r7, r0 + sub r1, r1, #8 + mov lr, r1 + mov r8, #-8 + vld1.32 {d0}, [r0,:64]! @ d1[0,1] + vld1.32 {d1}, [r1,:64], r8 @ d2[0,1] + vld1.32 {d4}, [r2,:64]! @ tcos[i] + vld1.32 {d5}, [r3,:64]! @ tsin[i] + vmov.f32 d18, #0.5 @ k1 + vdup.32 d19, r6 + pld [r0, #32] + veor d19, d18, d19 @ k2 + vmov.i32 d16, #0 + vmov.i32 d17, #1<<31 + pld [r1, #-32] + vtrn.32 d16, d17 + pld [r2, #32] + vrev64.32 d16, d16 @ d16=1,0 d17=0,1 + pld [r3, #32] +2: + veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1] + vld1.32 {d24}, [r0,:64]! @ d1[0,1] + vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1] + vld1.32 {d25}, [r1,:64], r8 @ d2[0,1] + vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1] + veor q3, q12, q8 @ -d1[0],d1[1], d2[0],-d2[1] + pld [r0, #32] + vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re + pld [r1, #-32] + vadd.f32 d0, d24, d7 @ d1[0]+d2[0], d1[1]-d2[1] + vadd.f32 d1, d6, d25 @ -d1[0]+d2[0], d1[1]+d2[1] + vmul.f32 q11, q0, q9 @ ev.re, ev.im, od.im, od.re + veor d7, d21, d16 @ -od.im, od.re + vrev64.32 d3, d21 @ od.re, od.im + veor d6, d20, d17 @ ev.re,-ev.im + veor d2, d3, d16 @ -od.re, od.im + vmla.f32 d20, d3, d4[1] + vmla.f32 d20, d7, d5[1] + vmla.f32 d6, d2, d4[1] + vmla.f32 d6, d21, d5[1] + vld1.32 {d4}, [r2,:64]! @ tcos[i] + veor d7, d23, d16 @ -od.im, od.re + vld1.32 {d5}, [r3,:64]! @ tsin[i] + veor d24, d22, d17 @ ev.re,-ev.im + vrev64.32 d3, d23 @ od.re, od.im + pld [r2, #32] + veor d2, d3, d16 @ -od.re, od.im + pld [r3, #32] + vmla.f32 d22, d3, d4[0] + vmla.f32 d22, d7, d5[0] + vmla.f32 d24, d2, d4[0] + vmla.f32 d24, d23, d5[0] + vld1.32 {d0}, [r0,:64]! @ d1[0,1] + vld1.32 {d1}, [r1,:64], r8 @ d2[0,1] + vst1.32 {d20}, [r7,:64]! + vst1.32 {d6}, [lr,:64], r8 + vst1.32 {d22}, [r7,:64]! 
+ vst1.32 {d24}, [lr,:64], r8 + subs r12, r12, #2 + bgt 2b + + veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1] + vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1] + vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1] + ldr r2, [r4, #8] @ sign_convention + vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re + add r0, r0, #4 + bfc r2, #0, #31 + vld1.32 {d0[0]}, [r0,:32] + veor d7, d21, d16 @ -od.im, od.re + vrev64.32 d3, d21 @ od.re, od.im + veor d6, d20, d17 @ ev.re,-ev.im + vld1.32 {d22}, [r5,:64] + vdup.32 d1, r2 + vmov d23, d22 + veor d2, d3, d16 @ -od.re, od.im + vtrn.32 d22, d23 + veor d0, d0, d1 + veor d23, d23, d17 + vmla.f32 d20, d3, d4[1] + vmla.f32 d20, d7, d5[1] + vmla.f32 d6, d2, d4[1] + vmla.f32 d6, d21, d5[1] + vadd.f32 d22, d22, d23 + vst1.32 {d20}, [r7,:64] + vst1.32 {d6}, [lr,:64] + vst1.32 {d0[0]}, [r0,:32] + vst1.32 {d22}, [r5,:64] + + cmp r6, #0 + it eq + popeq {r4-r8,pc} + + vmul.f32 d22, d22, d18 + vst1.32 {d22}, [r5,:64] + add r0, r4, #20 + mov r1, r5 + bl X(ff_fft_permute_neon) + add r0, r4, #20 + mov r1, r5 + pop {r4-r8,lr} + b X(ff_fft_calc_neon) +endfunc diff --git a/ffmpeg/libavcodec/arm/rv34dsp_init_arm.c b/ffmpeg/libavcodec/arm/rv34dsp_init_arm.c new file mode 100644 index 0000000..8bfe90b --- /dev/null +++ b/ffmpeg/libavcodec/arm/rv34dsp_init_arm.c @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2011 Janne Grunau + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "libavutil/attributes.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/rv34dsp.h" +#include "libavutil/arm/cpu.h" + +void ff_rv34_inv_transform_noround_neon(int16_t *block); + +void ff_rv34_inv_transform_noround_dc_neon(int16_t *block); + +void ff_rv34_idct_add_neon(uint8_t *dst, ptrdiff_t stride, int16_t *block); +void ff_rv34_idct_dc_add_neon(uint8_t *dst, ptrdiff_t stride, int dc); + +av_cold void ff_rv34dsp_init_arm(RV34DSPContext *c) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + c->rv34_inv_transform = ff_rv34_inv_transform_noround_neon; + c->rv34_inv_transform_dc = ff_rv34_inv_transform_noround_dc_neon; + + c->rv34_idct_add = ff_rv34_idct_add_neon; + c->rv34_idct_dc_add = ff_rv34_idct_dc_add_neon; + } +} diff --git a/ffmpeg/libavcodec/arm/rv34dsp_neon.S b/ffmpeg/libavcodec/arm/rv34dsp_neon.S new file mode 100644 index 0000000..a29123f --- /dev/null +++ b/ffmpeg/libavcodec/arm/rv34dsp_neon.S @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2011 Janne Grunau + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" +#include "neon.S" + +.macro rv34_inv_transform r0 + vld1.16 {q14-q15}, [\r0,:128] + vmov.s16 d0, #13 + vshll.s16 q12, d29, #3 + vshll.s16 q13, d29, #4 + vshll.s16 q9, d31, #3 + vshll.s16 q1, d31, #4 + vmull.s16 q10, d28, d0 + vmlal.s16 q10, d30, d0 + vmull.s16 q11, d28, d0 + vmlsl.s16 q11, d30, d0 + vsubw.s16 q12, q12, d29 @ z2 = block[i+4*1]*7 + vaddw.s16 q13, q13, d29 @ z3 = block[i+4*1]*17 + vsubw.s16 q9, q9, d31 + vaddw.s16 q1, q1, d31 + vadd.s32 q13, q13, q9 @ z3 = 17*block[i+4*1] + 7*block[i+4*3] + vsub.s32 q12, q12, q1 @ z2 = 7*block[i+4*1] - 17*block[i+4*3] + vadd.s32 q1, q10, q13 @ z0 + z3 + vadd.s32 q2, q11, q12 @ z1 + z2 + vsub.s32 q8, q10, q13 @ z0 - z3 + vsub.s32 q3, q11, q12 @ z1 - z2 + vtrn.32 q1, q2 + vtrn.32 q3, q8 + vswp d3, d6 + vswp d5, d16 + vmov.s32 d0, #13 + vadd.s32 q10, q1, q3 + vsub.s32 q11, q1, q3 + vshl.s32 q12, q2, #3 + vshl.s32 q9, q2, #4 + vmul.s32 q13, q11, d0[0] + vshl.s32 q11, q8, #4 + vadd.s32 q9, q9, q2 + vshl.s32 q15, q8, #3 + vsub.s32 q12, q12, q2 + vadd.s32 q11, q11, q8 + vmul.s32 q14, q10, d0[0] + vsub.s32 q8, q15, q8 + vsub.s32 q12, q12, q11 + vadd.s32 q9, q9, q8 + vadd.s32 q2, q13, q12 @ z1 + z2 + vadd.s32 q1, q14, q9 @ z0 + z3 + vsub.s32 q3, q13, q12 @ z1 - z2 + vsub.s32 q15, q14, q9 @ z0 - z3 +.endm + +/* void rv34_idct_add_c(uint8_t *dst, int stride, int16_t *block) */ +function ff_rv34_idct_add_neon, export=1 + mov r3, r0 + rv34_inv_transform r2 + vmov.i16 q12, #0 + vrshrn.s32 d16, q1, #10 @ (z0 + z3) >> 10 + vrshrn.s32 d17, q2, #10 @ (z1 + z2) >> 10 + vrshrn.s32 d18, q3, #10 @ (z1 - z2) >> 10 + vrshrn.s32 d19, q15, #10 @ (z0 - z3) >> 10 + vld1.32 {d28[]}, [r0,:32], r1 + vld1.32 {d29[]}, [r0,:32], r1 + vtrn.32 q8, q9 + vld1.32 {d28[1]}, [r0,:32], r1 + vld1.32 {d29[1]}, [r0,:32], r1 + vst1.16 {q12}, [r2,:128]! @ memset(block, 0, 16) + vst1.16 {q12}, [r2,:128] @ memset(block+16, 0, 16) + vtrn.16 d16, d17 + vtrn.32 d28, d29 + vtrn.16 d18, d19 + vaddw.u8 q0, q8, d28 + vaddw.u8 q1, q9, d29 + vqmovun.s16 d28, q0 + vqmovun.s16 d29, q1 + vst1.32 {d28[0]}, [r3,:32], r1 + vst1.32 {d28[1]}, [r3,:32], r1 + vst1.32 {d29[0]}, [r3,:32], r1 + vst1.32 {d29[1]}, [r3,:32], r1 + bx lr +endfunc + +/* void rv34_inv_transform_noround_neon(int16_t *block); */ +function ff_rv34_inv_transform_noround_neon, export=1 + rv34_inv_transform r0 + vshl.s32 q11, q2, #1 + vshl.s32 q10, q1, #1 + vshl.s32 q12, q3, #1 + vshl.s32 q13, q15, #1 + vadd.s32 q11, q11, q2 + vadd.s32 q10, q10, q1 + vadd.s32 q12, q12, q3 + vadd.s32 q13, q13, q15 + vshrn.s32 d0, q10, #11 @ (z0 + z3)*3 >> 11 + vshrn.s32 d1, q11, #11 @ (z1 + z2)*3 >> 11 + vshrn.s32 d2, q12, #11 @ (z1 - z2)*3 >> 11 + vshrn.s32 d3, q13, #11 @ (z0 - z3)*3 >> 11 + vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0,:64]! + vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0,:64]! + vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0,:64]! + vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0,:64]! 
+ bx lr +endfunc + +/* void ff_rv34_idct_dc_add_neon(uint8_t *dst, int stride, int dc) */ +function ff_rv34_idct_dc_add_neon, export=1 + mov r3, r0 + vld1.32 {d28[]}, [r0,:32], r1 + vld1.32 {d29[]}, [r0,:32], r1 + vdup.16 d0, r2 + vmov.s16 d1, #169 + vld1.32 {d28[1]}, [r0,:32], r1 + vmull.s16 q1, d0, d1 @ dc * 13 * 13 + vld1.32 {d29[1]}, [r0,:32], r1 + vrshrn.s32 d0, q1, #10 @ (dc * 13 * 13 + 0x200) >> 10 + vmov d1, d0 + vaddw.u8 q2, q0, d28 + vaddw.u8 q3, q0, d29 + vqmovun.s16 d28, q2 + vqmovun.s16 d29, q3 + vst1.32 {d28[0]}, [r3,:32], r1 + vst1.32 {d29[0]}, [r3,:32], r1 + vst1.32 {d28[1]}, [r3,:32], r1 + vst1.32 {d29[1]}, [r3,:32], r1 + bx lr +endfunc + +/* void rv34_inv_transform_dc_noround_c(int16_t *block) */ +function ff_rv34_inv_transform_noround_dc_neon, export=1 + vld1.16 {d28[]}, [r0,:16] @ block[0] + vmov.i16 d4, #251 + vorr.s16 d4, #256 @ 13^2 * 3 + vmull.s16 q3, d28, d4 + vshrn.s32 d0, q3, #11 + vmov.i16 d1, d0 + vst1.64 {q0}, [r0,:128]! + vst1.64 {q0}, [r0,:128]! + bx lr +endfunc diff --git a/ffmpeg/libavcodec/arm/rv40dsp_init_arm.c b/ffmpeg/libavcodec/arm/rv40dsp_init_arm.c new file mode 100644 index 0000000..fec3702 --- /dev/null +++ b/ffmpeg/libavcodec/arm/rv40dsp_init_arm.c @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2011 Janne Grunau + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
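For reference, the 4-point transform implemented by the rv34_inv_transform macro above, and the two DC-only shortcuts, amount to the following scalar arithmetic (taken from the z0..z3 comments and the 13/7/17 constants visible in the NEON code; a sketch, not FFmpeg's reference C):

    #include <stdint.h>

    /* One 4-point RV34 inverse-transform pass, as described in the comments:
     * z0/z1 use the factor 13, z2/z3 mix the factors 7 and 17. */
    static void rv34_row(int32_t out[4], const int16_t in[4])
    {
        int z0 = 13 * (in[0] + in[2]);
        int z1 = 13 * (in[0] - in[2]);
        int z2 =  7 * in[1] - 17 * in[3];
        int z3 = 17 * in[1] +  7 * in[3];

        out[0] = z0 + z3;
        out[1] = z1 + z2;
        out[2] = z1 - z2;
        out[3] = z0 - z3;
    }

    /* DC-only cases, matching the constants in the NEON code: idct_dc_add uses
     * (dc*13*13 + 0x200) >> 10, the no-round variant dc*13*13*3 >> 11. */
    static int rv34_dc(int dc)         { return (dc * 13 * 13 + 0x200) >> 10; }
    static int rv34_dc_noround(int dc) { return (dc * 13 * 13 * 3) >> 11; }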
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "libavutil/attributes.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/rv34dsp.h" +#include "libavutil/arm/cpu.h" + +#define DECL_QPEL3(type, w, pos) \ + void ff_##type##_rv40_qpel##w##_mc##pos##_neon(uint8_t *dst, uint8_t *src,\ + ptrdiff_t stride) +#define DECL_QPEL2(w, pos) \ + DECL_QPEL3(put, w, pos); \ + DECL_QPEL3(avg, w, pos) + +#define DECL_QPEL_XY(x, y) \ + DECL_QPEL2(16, x ## y); \ + DECL_QPEL2(8, x ## y) + +#define DECL_QPEL_Y(y) \ + DECL_QPEL_XY(0, y); \ + DECL_QPEL_XY(1, y); \ + DECL_QPEL_XY(2, y); \ + DECL_QPEL_XY(3, y); \ + +DECL_QPEL_Y(0); +DECL_QPEL_Y(1); +DECL_QPEL_Y(2); +DECL_QPEL_Y(3); + +void ff_put_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); +void ff_put_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); + +void ff_avg_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); +void ff_avg_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); + +void ff_rv40_weight_func_16_neon(uint8_t *, uint8_t *, uint8_t *, int, int, ptrdiff_t); +void ff_rv40_weight_func_8_neon(uint8_t *, uint8_t *, uint8_t *, int, int, ptrdiff_t); + +int ff_rv40_h_loop_filter_strength_neon(uint8_t *src, ptrdiff_t stride, + int beta, int beta2, int edge, + int *p1, int *q1); +int ff_rv40_v_loop_filter_strength_neon(uint8_t *src, ptrdiff_t stride, + int beta, int beta2, int edge, + int *p1, int *q1); + +void ff_rv40_h_weak_loop_filter_neon(uint8_t *src, ptrdiff_t stride, int filter_p1, + int filter_q1, int alpha, int beta, + int lim_p0q0, int lim_q1, int lim_p1); +void ff_rv40_v_weak_loop_filter_neon(uint8_t *src, ptrdiff_t stride, int filter_p1, + int filter_q1, int alpha, int beta, + int lim_p0q0, int lim_q1, int lim_p1); + +static av_cold void ff_rv40dsp_init_neon(RV34DSPContext *c) +{ + c->put_pixels_tab[0][ 1] = ff_put_rv40_qpel16_mc10_neon; + c->put_pixels_tab[0][ 3] = ff_put_rv40_qpel16_mc30_neon; + c->put_pixels_tab[0][ 4] = ff_put_rv40_qpel16_mc01_neon; + c->put_pixels_tab[0][ 5] = ff_put_rv40_qpel16_mc11_neon; + c->put_pixels_tab[0][ 6] = ff_put_rv40_qpel16_mc21_neon; + c->put_pixels_tab[0][ 7] = ff_put_rv40_qpel16_mc31_neon; + c->put_pixels_tab[0][ 9] = ff_put_rv40_qpel16_mc12_neon; + c->put_pixels_tab[0][10] = ff_put_rv40_qpel16_mc22_neon; + c->put_pixels_tab[0][11] = ff_put_rv40_qpel16_mc32_neon; + c->put_pixels_tab[0][12] = ff_put_rv40_qpel16_mc03_neon; + c->put_pixels_tab[0][13] = ff_put_rv40_qpel16_mc13_neon; + c->put_pixels_tab[0][14] = ff_put_rv40_qpel16_mc23_neon; + c->put_pixels_tab[0][15] = ff_put_rv40_qpel16_mc33_neon; + c->avg_pixels_tab[0][ 1] = ff_avg_rv40_qpel16_mc10_neon; + c->avg_pixels_tab[0][ 3] = ff_avg_rv40_qpel16_mc30_neon; + c->avg_pixels_tab[0][ 4] = ff_avg_rv40_qpel16_mc01_neon; + c->avg_pixels_tab[0][ 5] = ff_avg_rv40_qpel16_mc11_neon; + c->avg_pixels_tab[0][ 6] = ff_avg_rv40_qpel16_mc21_neon; + c->avg_pixels_tab[0][ 7] = ff_avg_rv40_qpel16_mc31_neon; + c->avg_pixels_tab[0][ 9] = ff_avg_rv40_qpel16_mc12_neon; + c->avg_pixels_tab[0][10] = ff_avg_rv40_qpel16_mc22_neon; + c->avg_pixels_tab[0][11] = ff_avg_rv40_qpel16_mc32_neon; + c->avg_pixels_tab[0][12] = ff_avg_rv40_qpel16_mc03_neon; + c->avg_pixels_tab[0][13] = ff_avg_rv40_qpel16_mc13_neon; + c->avg_pixels_tab[0][14] = ff_avg_rv40_qpel16_mc23_neon; + c->avg_pixels_tab[0][15] = ff_avg_rv40_qpel16_mc33_neon; + 
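The table slots written above follow the usual quarter-pel convention: the function for sub-pixel offset (x, y) goes into index x + 4*y (mc10 -> 1, mc01 -> 4, mc33 -> 15); slots 0, 2 and 8 are simply not overridden here and keep whatever the generic initialisation installed. A trivial helper, named purely for illustration:

    /* Hypothetical helper: map a quarter-pel offset (x, y), each in 0..3,
     * to the put/avg_pixels_tab slot assigned above. */
    static inline int rv40_qpel_index(int x, int y)
    {
        return x + 4 * y;   /* e.g. mc10 -> 1, mc01 -> 4, mc33 -> 15 */
    }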
c->put_pixels_tab[1][ 1] = ff_put_rv40_qpel8_mc10_neon; + c->put_pixels_tab[1][ 3] = ff_put_rv40_qpel8_mc30_neon; + c->put_pixels_tab[1][ 4] = ff_put_rv40_qpel8_mc01_neon; + c->put_pixels_tab[1][ 5] = ff_put_rv40_qpel8_mc11_neon; + c->put_pixels_tab[1][ 6] = ff_put_rv40_qpel8_mc21_neon; + c->put_pixels_tab[1][ 7] = ff_put_rv40_qpel8_mc31_neon; + c->put_pixels_tab[1][ 9] = ff_put_rv40_qpel8_mc12_neon; + c->put_pixels_tab[1][10] = ff_put_rv40_qpel8_mc22_neon; + c->put_pixels_tab[1][11] = ff_put_rv40_qpel8_mc32_neon; + c->put_pixels_tab[1][12] = ff_put_rv40_qpel8_mc03_neon; + c->put_pixels_tab[1][13] = ff_put_rv40_qpel8_mc13_neon; + c->put_pixels_tab[1][14] = ff_put_rv40_qpel8_mc23_neon; + c->put_pixels_tab[1][15] = ff_put_rv40_qpel8_mc33_neon; + c->avg_pixels_tab[1][ 1] = ff_avg_rv40_qpel8_mc10_neon; + c->avg_pixels_tab[1][ 3] = ff_avg_rv40_qpel8_mc30_neon; + c->avg_pixels_tab[1][ 4] = ff_avg_rv40_qpel8_mc01_neon; + c->avg_pixels_tab[1][ 5] = ff_avg_rv40_qpel8_mc11_neon; + c->avg_pixels_tab[1][ 6] = ff_avg_rv40_qpel8_mc21_neon; + c->avg_pixels_tab[1][ 7] = ff_avg_rv40_qpel8_mc31_neon; + c->avg_pixels_tab[1][ 9] = ff_avg_rv40_qpel8_mc12_neon; + c->avg_pixels_tab[1][10] = ff_avg_rv40_qpel8_mc22_neon; + c->avg_pixels_tab[1][11] = ff_avg_rv40_qpel8_mc32_neon; + c->avg_pixels_tab[1][12] = ff_avg_rv40_qpel8_mc03_neon; + c->avg_pixels_tab[1][13] = ff_avg_rv40_qpel8_mc13_neon; + c->avg_pixels_tab[1][14] = ff_avg_rv40_qpel8_mc23_neon; + c->avg_pixels_tab[1][15] = ff_avg_rv40_qpel8_mc33_neon; + + c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_neon; + c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_neon; + c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon; + c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon; + + c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_16_neon; + c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_8_neon; + + c->rv40_loop_filter_strength[0] = ff_rv40_h_loop_filter_strength_neon; + c->rv40_loop_filter_strength[1] = ff_rv40_v_loop_filter_strength_neon; + c->rv40_weak_loop_filter[0] = ff_rv40_h_weak_loop_filter_neon; + c->rv40_weak_loop_filter[1] = ff_rv40_v_weak_loop_filter_neon; +} + +av_cold void ff_rv40dsp_init_arm(RV34DSPContext *c) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) + ff_rv40dsp_init_neon(c); +} diff --git a/ffmpeg/libavcodec/arm/rv40dsp_neon.S b/ffmpeg/libavcodec/arm/rv40dsp_neon.S new file mode 100644 index 0000000..6bd45eb --- /dev/null +++ b/ffmpeg/libavcodec/arm/rv40dsp_neon.S @@ -0,0 +1,920 @@ +/* + * Copyright (c) 2011 Janne Grunau + * Copyright (c) 2011 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" +#include "neon.S" + +.macro qpel_lowpass r0, r1, rc1, rc2, shift + vext.8 d25, \r0, \r1, #1 @ src[-1] + vext.8 d26, \r0, \r1, #4 @ src[ 2] + vext.8 d24, \r0, \r1, #5 @ src[ 3] + vaddl.u8 q9, d25, d26 + vaddl.u8 q8, \r0, d24 + vext.8 d27, \r0, \r1, #2 @ src[ 0] + vshl.s16 q12, q9, #2 + vsub.s16 q8, q8, q9 + vext.8 d28, \r0, \r1, #3 @ src[ 1] + vsub.s16 q8, q8, q12 + vmlal.u8 q8, d27, \rc1 + vmlal.u8 q8, d28, \rc2 + vqrshrun.s16 \r0, q8, #\shift +.endm + +.macro qpel_lowpass_x2 r0, r1, r2, r3, rc1, rc2, shift + vext.8 d25, \r0, \r1, #1 @ src[-1] + vext.8 d26, \r0, \r1, #4 @ src[ 2] + vext.8 d24, \r0, \r1, #5 @ src[ 3] + vaddl.u8 q9, d25, d26 + vaddl.u8 q8, \r0, d24 + vext.8 d29, \r0, \r1, #2 @ src[ 0] + vext.8 d28, \r0, \r1, #3 @ src[ 1] + vshl.s16 q10, q9, #2 + vext.8 \r1, \r2, \r3, #1 @ src[-1] + vsub.s16 q8, q8, q9 + vext.8 d22, \r2, \r3, #4 @ src[ 2] + vext.8 \r0, \r2, \r3, #5 @ src[ 3] + vaddl.u8 q13, \r1, d22 + vaddl.u8 q12, \r2, \r0 + vsub.s16 q8, q8, q10 + vshl.s16 q9, q13, #2 + vsub.s16 q12, q12, q13 + vmlal.u8 q8, d29, \rc1 + vmlal.u8 q8, d28, \rc2 + vsub.s16 q12, q12, q9 + vext.8 d26, \r2, \r3, #2 @ src[ 0] + vext.8 d27, \r2, \r3, #3 @ src[ 1] + vmlal.u8 q12, d26, \rc1 + vmlal.u8 q12, d27, \rc2 + vqrshrun.s16 \r0, q8, #\shift + vqrshrun.s16 \r2, q12, #\shift +.endm + +.macro rv40_qpel8_h shift +function put_rv40_qpel8_h_lp_packed_s\shift\()_neon +1: + vld1.8 {q2}, [r1], r2 + vld1.8 {q3}, [r1], r2 + qpel_lowpass_x2 d4, d5, d6, d7, d0, d1, \shift + vst1.8 {d4}, [r12,:64]! + vst1.8 {d6}, [r12,:64]! + subs r3, r3, #2 + bgt 1b + vld1.8 {q2}, [r1] + qpel_lowpass d4, d5, d0, d1, \shift + vst1.8 {d4}, [r12,:64]! + bx lr +endfunc +.endm + +.macro rv40_qpel8_v shift, type +function \type\()_rv40_qpel8_v_lp_packed_s\shift\()_neon + vld1.64 {d2}, [r1,:64]! + vld1.64 {d3}, [r1,:64]! + vld1.64 {d4}, [r1,:64]! + vld1.64 {d5}, [r1,:64]! + vld1.64 {d6}, [r1,:64]! + vld1.64 {d7}, [r1,:64]! + vld1.64 {d8}, [r1,:64]! + vld1.64 {d9}, [r1,:64]! + vld1.64 {d10}, [r1,:64]! + vld1.64 {d11}, [r1,:64]! + vld1.64 {d12}, [r1,:64]! + vld1.64 {d13}, [r1,:64]! + vld1.64 {d14}, [r1,:64]! 
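The qpel_lowpass/qpel_lowpass_x2 macros above implement RV40's 6-tap interpolation filter. With the coefficient pairs the callers load into d0/d1 (52/20 or 20/52 with a shift of 6, 20/20 with a shift of 5), one output sample reduces to the scalar form below; this is a sketch of the arithmetic the macro performs, with hypothetical names:

    #include <stdint.h>

    static inline uint8_t clip_u8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    /* One output sample of the RV40 6-tap lowpass as done by qpel_lowpass:
     * src points at the centre sample, c1/c2 are the two large taps (52/20,
     * 20/52 or 20/20) and shift is 6 or 5 accordingly. */
    static uint8_t rv40_lowpass(const uint8_t *src, int c1, int c2, int shift)
    {
        int sum = src[-2] + src[3]
                - 5 * (src[-1] + src[2])
                + c1 * src[0]
                + c2 * src[1];
        return clip_u8((sum + (1 << (shift - 1))) >> shift);   /* vqrshrun */
    }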
+ transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9 + transpose_8x8 d10, d11, d12, d13, d14, d15, d30, d31 + qpel_lowpass_x2 d2, d10, d3, d11, d0, d1, \shift + qpel_lowpass_x2 d4, d12, d5, d13, d0, d1, \shift + qpel_lowpass_x2 d6, d14, d7, d15, d0, d1, \shift + qpel_lowpass_x2 d8, d30, d9, d31, d0, d1, \shift + transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9 + .ifc \type,avg + vld1.64 d12, [r0,:64], r2 + vld1.64 d13, [r0,:64], r2 + vld1.64 d14, [r0,:64], r2 + vld1.64 d15, [r0,:64], r2 + vld1.64 d16, [r0,:64], r2 + vld1.64 d17, [r0,:64], r2 + vld1.64 d18, [r0,:64], r2 + vld1.64 d19, [r0,:64], r2 + sub r0, r0, r2, lsl #3 + vrhadd.u8 q1, q1, q6 + vrhadd.u8 q2, q2, q7 + vrhadd.u8 q3, q3, q8 + vrhadd.u8 q4, q4, q9 + .endif + vst1.64 d2, [r0,:64], r2 + vst1.64 d3, [r0,:64], r2 + vst1.64 d4, [r0,:64], r2 + vst1.64 d5, [r0,:64], r2 + vst1.64 d6, [r0,:64], r2 + vst1.64 d7, [r0,:64], r2 + vst1.64 d8, [r0,:64], r2 + vst1.64 d9, [r0,:64], r2 + bx lr +endfunc +.endm + + rv40_qpel8_h 5 + rv40_qpel8_h 6 + +.macro rv40_qpel type +function \type\()_rv40_qpel8_h_lowpass_neon + .ifc \type,avg + mov r12, r0 + .endif +1: + vld1.8 {q2}, [r1], r2 + vld1.8 {q3}, [r1], r2 + qpel_lowpass_x2 d4, d5, d6, d7, d0, d1, 6 + .ifc \type,avg + vld1.8 {d3}, [r12,:64], r2 + vld1.8 {d16}, [r12,:64], r2 + vrhadd.u8 d4, d4, d3 + vrhadd.u8 d6, d6, d16 + .endif + vst1.8 {d4}, [r0,:64], r2 + vst1.8 {d6}, [r0,:64], r2 + subs r3, r3, #2 + bgt 1b + bx lr +endfunc + +function \type\()_rv40_qpel8_v_lowpass_neon + vld1.64 {d2}, [r1], r2 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r2 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r2 + vld1.64 {d7}, [r1], r2 + vld1.64 {d8}, [r1], r2 + vld1.64 {d9}, [r1], r2 + vld1.64 {d10}, [r1], r2 + vld1.64 {d11}, [r1], r2 + vld1.64 {d12}, [r1], r2 + vld1.64 {d13}, [r1], r2 + vld1.64 {d14}, [r1] + transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9 + transpose_8x8 d10, d11, d12, d13, d14, d15, d30, d31 + qpel_lowpass_x2 d2, d10, d3, d11, d0, d1, 6 + qpel_lowpass_x2 d4, d12, d5, d13, d0, d1, 6 + qpel_lowpass_x2 d6, d14, d7, d15, d0, d1, 6 + qpel_lowpass_x2 d8, d30, d9, d31, d0, d1, 6 + transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9 + .ifc \type,avg + vld1.64 d12, [r0,:64], r2 + vld1.64 d13, [r0,:64], r2 + vld1.64 d14, [r0,:64], r2 + vld1.64 d15, [r0,:64], r2 + vld1.64 d16, [r0,:64], r2 + vld1.64 d17, [r0,:64], r2 + vld1.64 d18, [r0,:64], r2 + vld1.64 d19, [r0,:64], r2 + sub r0, r0, r2, lsl #3 + vrhadd.u8 q1, q1, q6 + vrhadd.u8 q2, q2, q7 + vrhadd.u8 q3, q3, q8 + vrhadd.u8 q4, q4, q9 + .endif + vst1.64 d2, [r0,:64], r2 + vst1.64 d3, [r0,:64], r2 + vst1.64 d4, [r0,:64], r2 + vst1.64 d5, [r0,:64], r2 + vst1.64 d6, [r0,:64], r2 + vst1.64 d7, [r0,:64], r2 + vst1.64 d8, [r0,:64], r2 + vst1.64 d9, [r0,:64], r2 + bx lr +endfunc + + rv40_qpel8_v 5, \type + rv40_qpel8_v 6, \type + +function ff_\type\()_rv40_qpel8_mc10_neon, export=1 + sub r1, r1, #2 + mov r3, #8 + vmov.i8 d0, #52 + vmov.i8 d1, #20 + b \type\()_rv40_qpel8_h_lowpass_neon +endfunc + +function ff_\type\()_rv40_qpel8_mc30_neon, export=1 + sub r1, r1, #2 + mov r3, #8 + vmov.i8 d0, #20 + vmov.i8 d1, #52 + b \type\()_rv40_qpel8_h_lowpass_neon +endfunc + +function ff_\type\()_rv40_qpel8_mc01_neon, export=1 + push {r4, lr} + vpush {d8-d15} + sub r1, r1, r2, lsl #1 + vmov.i8 d0, #52 + vmov.i8 d1, #20 + bl \type\()_rv40_qpel8_v_lowpass_neon + vpop {d8-d15} + pop {r4, pc} +endfunc + +function ff_\type\()_rv40_qpel8_mc11_neon, export=1 + push {r4, lr} + vpush {d8-d15} + sub sp, sp, #14*8 + add r12, sp, #7 + bic r12, r12, #7 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, 
#12 + vmov.i8 d0, #52 + vmov.i8 d1, #20 + bl put_rv40_qpel8_h_lp_packed_s6_neon + add r1, sp, #7 + bic r1, r1, #7 + bl \type\()_rv40_qpel8_v_lp_packed_s6_neon + add sp, sp, #14*8 + vpop {d8-d15} + pop {r4, pc} +endfunc + +function ff_\type\()_rv40_qpel8_mc21_neon, export=1 + push {r4, lr} + vpush {d8-d15} + sub sp, sp, #14*8 + add r12, sp, #7 + bic r12, r12, #7 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, #12 + vmov.i8 d0, #20 + vmov.i8 d1, #20 + bl put_rv40_qpel8_h_lp_packed_s5_neon + add r1, sp, #7 + bic r1, r1, #7 + vmov.i8 d0, #52 + bl \type\()_rv40_qpel8_v_lp_packed_s6_neon + add sp, sp, #14*8 + vpop {d8-d15} + pop {r4, pc} +endfunc + +function ff_\type\()_rv40_qpel8_mc31_neon, export=1 + push {r4, lr} + vpush {d8-d15} + sub sp, sp, #14*8 + add r12, sp, #7 + bic r12, r12, #7 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, #12 + vmov.i8 d0, #20 + vmov.i8 d1, #52 + bl put_rv40_qpel8_h_lp_packed_s6_neon + add r1, sp, #7 + bic r1, r1, #7 + vswp d0, d1 + bl \type\()_rv40_qpel8_v_lp_packed_s6_neon + add sp, sp, #14*8 + vpop {d8-d15} + pop {r4, pc} +endfunc + +function ff_\type\()_rv40_qpel8_mc12_neon, export=1 + push {r4, lr} + vpush {d8-d15} + sub sp, sp, #14*8 + add r12, sp, #7 + bic r12, r12, #7 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, #12 + vmov.i8 d0, #52 + vmov.i8 d1, #20 + bl put_rv40_qpel8_h_lp_packed_s6_neon + add r1, sp, #7 + bic r1, r1, #7 + vmov.i8 d0, #20 + bl \type\()_rv40_qpel8_v_lp_packed_s5_neon + add sp, sp, #14*8 + vpop {d8-d15} + pop {r4, pc} +endfunc + +function ff_\type\()_rv40_qpel8_mc22_neon, export=1 + push {r4, lr} + vpush {d8-d15} + sub sp, sp, #14*8 + add r12, sp, #7 + bic r12, r12, #7 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, #12 + vmov.i8 d0, #20 + vmov.i8 d1, #20 + bl put_rv40_qpel8_h_lp_packed_s5_neon + add r1, sp, #7 + bic r1, r1, #7 + bl \type\()_rv40_qpel8_v_lp_packed_s5_neon + add sp, sp, #14*8 + vpop {d8-d15} + pop {r4, pc} +endfunc + +function ff_\type\()_rv40_qpel8_mc32_neon, export=1 + push {r4, lr} + vpush {d8-d15} + sub sp, sp, #14*8 + add r12, sp, #7 + bic r12, r12, #7 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, #12 + vmov.i8 d0, #20 + vmov.i8 d1, #52 + bl put_rv40_qpel8_h_lp_packed_s6_neon + add r1, sp, #7 + bic r1, r1, #7 + vmov.i8 d1, #20 + bl \type\()_rv40_qpel8_v_lp_packed_s5_neon + add sp, sp, #14*8 + vpop {d8-d15} + pop {r4, pc} +endfunc + +function ff_\type\()_rv40_qpel8_mc03_neon, export=1 + push {r4, lr} + vpush {d8-d15} + sub r1, r1, r2, lsl #1 + vmov.i8 d0, #20 + vmov.i8 d1, #52 + bl \type\()_rv40_qpel8_v_lowpass_neon + vpop {d8-d15} + pop {r4, pc} +endfunc + +function ff_\type\()_rv40_qpel8_mc33_neon, export=1 + mov r3, #8 + b X(ff_\type\()_pixels8_xy2_neon) +endfunc + +function ff_\type\()_rv40_qpel8_mc13_neon, export=1 + push {r4, lr} + vpush {d8-d15} + sub sp, sp, #14*8 + add r12, sp, #7 + bic r12, r12, #7 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, #12 + vmov.i8 d0, #52 + vmov.i8 d1, #20 + bl put_rv40_qpel8_h_lp_packed_s6_neon + add r1, sp, #7 + bic r1, r1, #7 + vswp d0, d1 + bl \type\()_rv40_qpel8_v_lp_packed_s6_neon + add sp, sp, #14*8 + vpop {d8-d15} + pop {r4, pc} +endfunc + +function ff_\type\()_rv40_qpel8_mc23_neon, export=1 + push {r4, lr} + vpush {d8-d15} + sub sp, sp, #14*8 + add r12, sp, #7 + bic r12, r12, #7 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, #12 + vmov.i8 d0, #20 + vmov.i8 d1, #20 + bl put_rv40_qpel8_h_lp_packed_s5_neon + add r1, sp, #7 + bic r1, r1, #7 + vmov.i8 d1, #52 + bl \type\()_rv40_qpel8_v_lp_packed_s6_neon + add sp, sp, #14*8 + vpop {d8-d15} + pop 
{r4, pc} +endfunc + +function ff_\type\()_rv40_qpel16_mc10_neon, export=1 + vmov.i8 d0, #52 + vmov.i8 d1, #20 +.L\type\()_rv40_qpel16_h: + push {r1, lr} + sub r1, r1, #2 + mov r3, #16 + bl \type\()_rv40_qpel8_h_lowpass_neon + pop {r1, lr} + sub r0, r0, r2, lsl #4 + add r0, r0, #8 + add r1, r1, #6 + mov r3, #16 + b \type\()_rv40_qpel8_h_lowpass_neon +endfunc + +function ff_\type\()_rv40_qpel16_mc30_neon, export=1 + vmov.i8 d0, #20 + vmov.i8 d1, #52 + b .L\type\()_rv40_qpel16_h +endfunc + +function ff_\type\()_rv40_qpel16_mc01_neon, export=1 + vmov.i8 d0, #52 + vmov.i8 d1, #20 +.L\type\()_rv40_qpel16_v: + sub r1, r1, r2, lsl #1 + push {r1, lr} + vpush {d8-d15} + bl \type\()_rv40_qpel8_v_lowpass_neon + sub r1, r1, r2, lsl #2 + bl \type\()_rv40_qpel8_v_lowpass_neon + ldr r1, [sp, #64] + sub r0, r0, r2, lsl #4 + add r0, r0, #8 + add r1, r1, #8 + bl \type\()_rv40_qpel8_v_lowpass_neon + sub r1, r1, r2, lsl #2 + bl \type\()_rv40_qpel8_v_lowpass_neon + vpop {d8-d15} + pop {r1, pc} +endfunc + +function ff_\type\()_rv40_qpel16_mc11_neon, export=1 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + push {r1, lr} + vpush {d8-d15} + sub sp, sp, #44*8 + add r12, sp, #7 + bic r12, r12, #7 + mov r3, #20 + vmov.i8 d0, #52 + vmov.i8 d1, #20 + bl put_rv40_qpel8_h_lp_packed_s6_neon + ldr r1, [sp, #416] + add r1, r1, #8 + mov r3, #20 + bl put_rv40_qpel8_h_lp_packed_s6_neon +.L\type\()_rv40_qpel16_v_s6: + add r1, sp, #7 + bic r1, r1, #7 + bl \type\()_rv40_qpel8_v_lp_packed_s6_neon + sub r1, r1, #40 + bl \type\()_rv40_qpel8_v_lp_packed_s6_neon + sub r0, r0, r2, lsl #4 + add r0, r0, #8 + bl \type\()_rv40_qpel8_v_lp_packed_s6_neon + sub r1, r1, #40 + bl \type\()_rv40_qpel8_v_lp_packed_s6_neon + add sp, sp, #44*8 + vpop {d8-d15} + pop {r1, pc} +endfunc + +function ff_\type\()_rv40_qpel16_mc21_neon, export=1 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + push {r1, lr} + vpush {d8-d15} + sub sp, sp, #44*8 + add r12, sp, #7 + bic r12, r12, #7 + mov r3, #20 + vmov.i8 d0, #20 + vmov.i8 d1, #20 + bl put_rv40_qpel8_h_lp_packed_s5_neon + ldr r1, [sp, #416] + add r1, r1, #8 + mov r3, #20 + bl put_rv40_qpel8_h_lp_packed_s5_neon + vmov.i8 d0, #52 + b .L\type\()_rv40_qpel16_v_s6 +endfunc + +function ff_\type\()_rv40_qpel16_mc31_neon, export=1 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + push {r1, lr} + vpush {d8-d15} + sub sp, sp, #44*8 + add r12, sp, #7 + bic r12, r12, #7 + mov r3, #20 + vmov.i8 d0, #20 + vmov.i8 d1, #52 + bl put_rv40_qpel8_h_lp_packed_s6_neon + ldr r1, [sp, #416] + add r1, r1, #8 + mov r3, #20 + bl put_rv40_qpel8_h_lp_packed_s6_neon + vswp d0, d1 + b .L\type\()_rv40_qpel16_v_s6 +endfunc + +function ff_\type\()_rv40_qpel16_mc12_neon, export=1 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + push {r1, lr} + vpush {d8-d15} + sub sp, sp, #44*8 + add r12, sp, #7 + bic r12, r12, #7 + mov r3, #20 + vmov.i8 d0, #52 + vmov.i8 d1, #20 + bl put_rv40_qpel8_h_lp_packed_s6_neon + ldr r1, [sp, #416] + add r1, r1, #8 + mov r3, #20 + bl put_rv40_qpel8_h_lp_packed_s6_neon + vmov.i8 d0, #20 +.L\type\()_rv40_qpel16_v_s5: + add r1, sp, #7 + bic r1, r1, #7 + bl \type\()_rv40_qpel8_v_lp_packed_s5_neon + sub r1, r1, #40 + bl \type\()_rv40_qpel8_v_lp_packed_s5_neon + sub r0, r0, r2, lsl #4 + add r0, r0, #8 + bl \type\()_rv40_qpel8_v_lp_packed_s5_neon + sub r1, r1, #40 + bl \type\()_rv40_qpel8_v_lp_packed_s5_neon + add sp, sp, #44*8 + vpop {d8-d15} + pop {r1, pc} +endfunc + +function ff_\type\()_rv40_qpel16_mc22_neon, export=1 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + push {r1, lr} + vpush {d8-d15} + sub sp, sp, #44*8 + add r12, sp, #7 + bic r12, 
r12, #7 + mov r3, #20 + vmov.i8 d0, #20 + vmov.i8 d1, #20 + bl put_rv40_qpel8_h_lp_packed_s5_neon + ldr r1, [sp, #416] + add r1, r1, #8 + mov r3, #20 + bl put_rv40_qpel8_h_lp_packed_s5_neon + b .L\type\()_rv40_qpel16_v_s5 +endfunc + +function ff_\type\()_rv40_qpel16_mc32_neon, export=1 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + push {r1, lr} + vpush {d8-d15} + sub sp, sp, #44*8 + add r12, sp, #7 + bic r12, r12, #7 + mov r3, #20 + vmov.i8 d0, #20 + vmov.i8 d1, #52 + bl put_rv40_qpel8_h_lp_packed_s6_neon + ldr r1, [sp, #416] + add r1, r1, #8 + mov r3, #20 + bl put_rv40_qpel8_h_lp_packed_s6_neon + vmov.i8 d1, #20 + b .L\type\()_rv40_qpel16_v_s5 +endfunc + +function ff_\type\()_rv40_qpel16_mc03_neon, export=1 + vmov.i8 d0, #20 + vmov.i8 d1, #52 + b .L\type\()_rv40_qpel16_v +endfunc + +function ff_\type\()_rv40_qpel16_mc13_neon, export=1 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + push {r1, lr} + vpush {d8-d15} + sub sp, sp, #44*8 + add r12, sp, #7 + bic r12, r12, #7 + mov r3, #20 + vmov.i8 d0, #52 + vmov.i8 d1, #20 + bl put_rv40_qpel8_h_lp_packed_s6_neon + ldr r1, [sp, #416] + add r1, r1, #8 + mov r3, #20 + bl put_rv40_qpel8_h_lp_packed_s6_neon + vswp d0, d1 + b .L\type\()_rv40_qpel16_v_s6 +endfunc + +function ff_\type\()_rv40_qpel16_mc23_neon, export=1 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + push {r1, lr} + vpush {d8-d15} + sub sp, sp, #44*8 + add r12, sp, #7 + bic r12, r12, #7 + mov r3, #20 + vmov.i8 d0, #20 + vmov.i8 d1, #20 + bl put_rv40_qpel8_h_lp_packed_s5_neon + ldr r1, [sp, #416] + add r1, r1, #8 + mov r3, #20 + bl put_rv40_qpel8_h_lp_packed_s5_neon + vmov.i8 d1, #52 + b .L\type\()_rv40_qpel16_v_s6 +endfunc + +function ff_\type\()_rv40_qpel16_mc33_neon, export=1 + mov r3, #16 + b X(ff_\type\()_pixels16_xy2_neon) +endfunc +.endm + + rv40_qpel put + rv40_qpel avg + +.macro rv40_weight + vmovl.u8 q8, d2 + vmovl.u8 q9, d3 + vmovl.u8 q10, d4 + vmovl.u8 q11, d5 + vmull.u16 q2, d16, d0[2] + vmull.u16 q3, d17, d0[2] + vmull.u16 q8, d18, d0[2] + vmull.u16 q9, d19, d0[2] + vmull.u16 q12, d20, d0[0] + vmull.u16 q13, d21, d0[0] + vmull.u16 q14, d22, d0[0] + vmull.u16 q15, d23, d0[0] + vshrn.i32 d4, q2, #9 + vshrn.i32 d5, q3, #9 + vshrn.i32 d6, q8, #9 + vshrn.i32 d7, q9, #9 + vshrn.i32 d16, q12, #9 + vshrn.i32 d17, q13, #9 + vshrn.i32 d18, q14, #9 + vshrn.i32 d19, q15, #9 + vadd.u16 q2, q2, q8 + vadd.u16 q3, q3, q9 + vrshrn.i16 d2, q2, #5 + vrshrn.i16 d3, q3, #5 +.endm + +/* void ff_rv40_weight_func_16_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2, + int w1, int w2, int stride) */ +function ff_rv40_weight_func_16_neon, export=1 + ldr r12, [sp] + vmov d0, r3, r12 + ldr r12, [sp, #4] + mov r3, #16 +1: + vld1.8 {q1}, [r1,:128], r12 + vld1.8 {q2}, [r2,:128], r12 + rv40_weight + vst1.8 {q1}, [r0,:128], r12 + subs r3, r3, #1 + bne 1b + bx lr +endfunc + +/* void ff_rv40_weight_func_8_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2, + int w1, int w2, int stride) */ +function ff_rv40_weight_func_8_neon, export=1 + ldr r12, [sp] + vmov d0, r3, r12 + ldr r12, [sp, #4] + mov r3, #8 +1: + vld1.8 {d2}, [r1,:64], r12 + vld1.8 {d3}, [r1,:64], r12 + vld1.8 {d4}, [r2,:64], r12 + vld1.8 {d5}, [r2,:64], r12 + rv40_weight + vst1.8 {d2}, [r0,:64], r12 + vst1.8 {d3}, [r0,:64], r12 + subs r3, r3, #2 + bne 1b + bx lr +endfunc + +function ff_rv40_h_loop_filter_strength_neon, export=1 + pkhbt r2, r3, r2, lsl #18 + + ldr r3, [r0] + ldr_dpre r12, r0, r1 + teq r3, r12 + beq 1f + + sub r0, r0, r1, lsl #1 + + vld1.32 {d4[]}, [r0,:32], r1 @ -3 + vld1.32 {d0[]}, [r0,:32], r1 @ -2 + vld1.32 {d4[1]}, [r0,:32], r1 @ -1 + 
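Scalar restatement of the rv40_weight macro used by the two weight functions above (note that w2 multiplies src1 and w1 multiplies src2, matching the vmull operand order); a sketch with hypothetical names:

    #include <stddef.h>
    #include <stdint.h>

    /* Per-pixel operation of ff_rv40_weight_func_{16,8}, per the NEON code:
     * two >>9 partial products, +16 rounding, final >>5. */
    static void rv40_weight(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                            int w1, int w2, ptrdiff_t stride, int size)
    {
        for (int i = 0; i < size; i++) {
            for (int j = 0; j < size; j++)
                dst[j] = (((w2 * src1[j]) >> 9) + ((w1 * src2[j]) >> 9) + 0x10) >> 5;
            dst  += stride;
            src1 += stride;
            src2 += stride;
        }
    }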
vld1.32 {d5[]}, [r0,:32], r1 @ 0 + vld1.32 {d1[]}, [r0,:32], r1 @ 1 + vld1.32 {d5[0]}, [r0,:32], r1 @ 2 + + vpaddl.u8 q8, q0 @ -2, -2, -2, -2, 1, 1, 1, 1 + vpaddl.u8 q9, q2 @ -3, -3, -1, -1, 2, 2, 0, 0 + vdup.32 d30, r2 @ beta2, beta << 2 + vpadd.u16 d16, d16, d17 @ -2, -2, 1, 1 + vpadd.u16 d18, d18, d19 @ -3, -1, 2, 0 + vabd.u16 d16, d18, d16 + vclt.u16 d16, d16, d30 + + ldrd r2, r3, [sp, #4] + vmovl.u16 q12, d16 + vtrn.16 d16, d17 + vshr.u32 q12, q12, #15 + ldr r0, [sp] + vst1.32 {d24[1]}, [r2,:32] + vst1.32 {d25[1]}, [r3,:32] + + cmp r0, #0 + it eq + bxeq lr + + vand d18, d16, d17 + vtrn.32 d18, d19 + vand d18, d18, d19 + vmov.u16 r0, d18[0] + bx lr +1: + ldrd r2, r3, [sp, #4] + mov r0, #0 + str r0, [r2] + str r0, [r3] + bx lr +endfunc + +function ff_rv40_v_loop_filter_strength_neon, export=1 + sub r0, r0, #3 + pkhbt r2, r3, r2, lsl #18 + + vld1.8 {d0}, [r0], r1 + vld1.8 {d1}, [r0], r1 + vld1.8 {d2}, [r0], r1 + vld1.8 {d3}, [r0], r1 + + vaddl.u8 q0, d0, d1 + vaddl.u8 q1, d2, d3 + vdup.32 q15, r2 + vadd.u16 q0, q0, q1 @ -3, -2, -1, 0, 1, 2 + vext.16 q1, q0, q0, #1 @ -2, -1, 0, 1, 2 + vabd.u16 q0, q1, q0 + vclt.u16 q0, q0, q15 + + ldrd r2, r3, [sp, #4] + vmovl.u16 q1, d0 + vext.16 d1, d0, d1, #3 + vshr.u32 q1, q1, #15 + ldr r0, [sp] + vst1.32 {d2[1]}, [r2,:32] + vst1.32 {d3[1]}, [r3,:32] + + cmp r0, #0 + it eq + bxeq lr + + vand d0, d0, d1 + vtrn.16 d0, d1 + vand d0, d0, d1 + vmov.u16 r0, d0[0] + bx lr +endfunc + +.macro rv40_weak_loop_filter + vdup.16 d30, r2 @ filter_p1 + vdup.16 d31, r3 @ filter_q1 + ldrd r2, r3, [sp] + vdup.16 d28, r2 @ alpha + vdup.16 d29, r3 @ beta + ldr r12, [sp, #8] + vdup.16 d25, r12 @ lim_p0q0 + ldrd r2, r3, [sp, #12] + vsubl.u8 q9, d5, d4 @ x, t + vabdl.u8 q8, d5, d4 @ x, abs(t) + vneg.s16 q15, q15 + vceq.i16 d16, d19, #0 @ !t + vshl.s16 d19, d19, #2 @ t << 2 + vmul.u16 d18, d17, d28 @ alpha * abs(t) + vand d24, d30, d31 @ filter_p1 & filter_q1 + vsubl.u8 q1, d0, d4 @ p1p2, p1p0 + vsubl.u8 q3, d1, d5 @ q1q2, q1q0 + vmov.i16 d22, #3 + vshr.u16 d18, d18, #7 + vadd.i16 d22, d22, d24 @ 3 - (filter_p1 & filter_q1) + vsubl.u8 q10, d0, d1 @ src[-2] - src[1] + vcle.u16 d18, d18, d22 + vand d20, d20, d24 + vneg.s16 d23, d25 @ -lim_p0q0 + vadd.s16 d19, d19, d20 + vbic d16, d18, d16 @ t && u <= 3 - (fp1 & fq1) + vtrn.32 d4, d5 @ -3, 2, -1, 0 + vrshr.s16 d19, d19, #3 + vmov d28, d29 @ beta + vswp d3, d6 @ q1q2, p1p0 + vmin.s16 d19, d19, d25 + vand d30, d30, d16 + vand d31, d31, d16 + vadd.s16 q10, q1, q3 @ p1p2 + p1p0, q1q2 + q1q0 + vmax.s16 d19, d19, d23 @ diff + vabs.s16 q1, q1 @ abs(p1p2), abs(q1q2) + vand d18, d19, d16 @ diff + vcle.u16 q1, q1, q14 + vneg.s16 d19, d18 @ -diff + vdup.16 d26, r3 @ lim_p1 + vaddw.u8 q2, q9, d5 @ src[-1]+diff, src[0]-diff + vhsub.s16 q11, q10, q9 + vand q1, q1, q15 + vqmovun.s16 d4, q2 @ -1, 0 + vand q9, q11, q1 + vdup.16 d27, r2 @ lim_q1 + vneg.s16 q9, q9 + vneg.s16 q14, q13 + vmin.s16 q9, q9, q13 + vtrn.32 d0, d1 @ -2, 1, -2, 1 + vmax.s16 q9, q9, q14 + vaddw.u8 q3, q9, d0 + vqmovun.s16 d5, q3 @ -2, 1 +.endm + +function ff_rv40_h_weak_loop_filter_neon, export=1 + sub r0, r0, r1, lsl #1 + sub r0, r0, r1 + + vld1.32 {d4[]}, [r0,:32], r1 + vld1.32 {d0[]}, [r0,:32], r1 + vld1.32 {d4[1]}, [r0,:32], r1 + vld1.32 {d5[]}, [r0,:32], r1 + vld1.32 {d1[]}, [r0,:32], r1 + vld1.32 {d5[0]}, [r0,:32] + + sub r0, r0, r1, lsl #2 + + rv40_weak_loop_filter + + vst1.32 {d5[0]}, [r0,:32], r1 + vst1.32 {d4[0]}, [r0,:32], r1 + vst1.32 {d4[1]}, [r0,:32], r1 + vst1.32 {d5[1]}, [r0,:32], r1 + + bx lr +endfunc + +function ff_rv40_v_weak_loop_filter_neon, export=1 + 
sub r12, r0, #3 + sub r0, r0, #2 + + vld1.8 {d4}, [r12], r1 + vld1.8 {d5}, [r12], r1 + vld1.8 {d2}, [r12], r1 + vld1.8 {d3}, [r12], r1 + + vtrn.16 q2, q1 + vtrn.8 d4, d5 + vtrn.8 d2, d3 + + vrev64.32 d5, d5 + vtrn.32 q2, q1 + vdup.32 d0, d3[0] + vdup.32 d1, d2[0] + + rv40_weak_loop_filter + + vtrn.32 q2, q3 + vswp d4, d5 + + vst4.8 {d4[0],d5[0],d6[0],d7[0]}, [r0], r1 + vst4.8 {d4[1],d5[1],d6[1],d7[1]}, [r0], r1 + vst4.8 {d4[2],d5[2],d6[2],d7[2]}, [r0], r1 + vst4.8 {d4[3],d5[3],d6[3],d7[3]}, [r0], r1 + + bx lr +endfunc diff --git a/ffmpeg/libavcodec/arm/sbrdsp_init_arm.c b/ffmpeg/libavcodec/arm/sbrdsp_init_arm.c new file mode 100644 index 0000000..4da7967 --- /dev/null +++ b/ffmpeg/libavcodec/arm/sbrdsp_init_arm.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2012 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/arm/cpu.h" +#include "libavutil/attributes.h" +#include "libavcodec/sbrdsp.h" + +void ff_sbr_sum64x5_neon(float *z); +float ff_sbr_sum_square_neon(float (*x)[2], int n); +void ff_sbr_neg_odd_64_neon(float *x); +void ff_sbr_qmf_pre_shuffle_neon(float *z); +void ff_sbr_qmf_post_shuffle_neon(float W[32][2], const float *z); +void ff_sbr_qmf_deint_neg_neon(float *v, const float *src); +void ff_sbr_qmf_deint_bfly_neon(float *v, const float *src0, const float *src1); +void ff_sbr_hf_g_filt_neon(float (*Y)[2], const float (*X_high)[40][2], + const float *g_filt, int m_max, intptr_t ixh); +void ff_sbr_hf_gen_neon(float (*X_high)[2], const float (*X_low)[2], + const float alpha0[2], const float alpha1[2], + float bw, int start, int end); +void ff_sbr_autocorrelate_neon(const float x[40][2], float phi[3][2][2]); + +void ff_sbr_hf_apply_noise_0_neon(float Y[64][2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); +void ff_sbr_hf_apply_noise_1_neon(float Y[64][2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); +void ff_sbr_hf_apply_noise_2_neon(float Y[64][2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); +void ff_sbr_hf_apply_noise_3_neon(float Y[64][2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); + +av_cold void ff_sbrdsp_init_arm(SBRDSPContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + s->sum64x5 = ff_sbr_sum64x5_neon; + s->sum_square = ff_sbr_sum_square_neon; + s->neg_odd_64 = ff_sbr_neg_odd_64_neon; + s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_neon; + s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_neon; + s->qmf_deint_neg = ff_sbr_qmf_deint_neg_neon; + s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_neon; + s->hf_g_filt = ff_sbr_hf_g_filt_neon; + s->hf_gen = ff_sbr_hf_gen_neon; + s->autocorrelate = ff_sbr_autocorrelate_neon; + s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_neon; + 
s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_neon; + s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_neon; + s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_neon; + } +} diff --git a/ffmpeg/libavcodec/arm/sbrdsp_neon.S b/ffmpeg/libavcodec/arm/sbrdsp_neon.S new file mode 100644 index 0000000..610397f --- /dev/null +++ b/ffmpeg/libavcodec/arm/sbrdsp_neon.S @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2012 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_sbr_sum64x5_neon, export=1 + push {lr} + add r1, r0, # 64*4 + add r2, r0, #128*4 + add r3, r0, #192*4 + add lr, r0, #256*4 + mov r12, #64 +1: + vld1.32 {q0}, [r0,:128] + vld1.32 {q1}, [r1,:128]! + vadd.f32 q0, q0, q1 + vld1.32 {q2}, [r2,:128]! + vadd.f32 q0, q0, q2 + vld1.32 {q3}, [r3,:128]! + vadd.f32 q0, q0, q3 + vld1.32 {q8}, [lr,:128]! + vadd.f32 q0, q0, q8 + vst1.32 {q0}, [r0,:128]! + subs r12, #4 + bgt 1b + pop {pc} +endfunc + +function ff_sbr_sum_square_neon, export=1 + vmov.f32 q0, #0.0 +1: + vld1.32 {q1}, [r0,:128]! + vmla.f32 q0, q1, q1 + subs r1, r1, #2 + bgt 1b + vadd.f32 d0, d0, d1 + vpadd.f32 d0, d0, d0 +NOVFP vmov.32 r0, d0[0] + bx lr +endfunc + +function ff_sbr_neg_odd_64_neon, export=1 + mov r1, r0 + vmov.i32 q8, #1<<31 + vld2.32 {q0,q1}, [r0,:128]! + veor q1, q1, q8 + vld2.32 {q2,q3}, [r0,:128]! + .rept 3 + vst2.32 {q0,q1}, [r1,:128]! + veor q3, q3, q8 + vld2.32 {q0,q1}, [r0,:128]! + vst2.32 {q2,q3}, [r1,:128]! + veor q1, q1, q8 + vld2.32 {q2,q3}, [r0,:128]! + .endr + veor q3, q3, q8 + vst2.32 {q0,q1}, [r1,:128]! + vst2.32 {q2,q3}, [r1,:128]! + bx lr +endfunc + +function ff_sbr_qmf_pre_shuffle_neon, export=1 + add r1, r0, #60*4 + add r2, r0, #64*4 + vld1.32 {d0}, [r0,:64]! + vst1.32 {d0}, [r2,:64]! + mov r3, #-16 + mov r12, #24 + vmov.i32 q8, #1<<31 + vld1.32 {q0}, [r1,:128], r3 + vld1.32 {d2}, [r0,:64]! +1: + vld1.32 {d3,d4}, [r0,:128]! + vrev64.32 q0, q0 + vld1.32 {q9}, [r1,:128], r3 + veor q0, q0, q8 + vld1.32 {d5,d6}, [r0,:128]! + vswp d0, d1 + vrev64.32 q9, q9 + vst2.32 {q0,q1}, [r2,:64]! + vmov q10, q2 + veor q9, q9, q8 + vmov d2, d6 + vswp d18, d19 + vld1.32 {q0}, [r1,:128], r3 + vst2.32 {q9,q10}, [r2,:64]! + subs r12, r12, #8 + bgt 1b + vld1.32 {d3,d4}, [r0,:128]! + vrev64.32 q0, q0 + vld1.32 {q9}, [r1,:128], r3 + veor q0, q0, q8 + vld1.32 {d5}, [r0,:64]! + vswp d0, d1 + vrev64.32 q9, q9 + vst2.32 {q0,q1}, [r2,:64]! + vswp d4, d5 + veor q1, q9, q8 + vst2.32 {d3,d5}, [r2,:64]! + vst2.32 {d2[0],d4[0]}, [r2,:64]! + bx lr +endfunc + +function ff_sbr_qmf_post_shuffle_neon, export=1 + add r2, r1, #60*4 + mov r3, #-16 + mov r12, #32 + vmov.i32 q8, #1<<31 + vld1.32 {q0}, [r2,:128], r3 + vld1.32 {q1}, [r1,:128]! +1: + pld [r2, #-32] + vrev64.32 q0, q0 + vswp d2, d3 + veor q0, q0, q8 + vld1.32 {q2}, [r2,:128], r3 + vld1.32 {q3}, [r1,:128]! 
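The first two SBR helpers above are plain reductions which the NEON loops process four floats at a time; their scalar equivalents look roughly like this (sketches of the observed behaviour, not copies of the reference C):

    /* z[0..63] accumulates the four following 64-float blocks. */
    static void sbr_sum64x5(float *z)
    {
        for (int k = 0; k < 64; k++)
            z[k] += z[k + 64] + z[k + 128] + z[k + 192] + z[k + 256];
    }

    /* Energy of n complex coefficients, as returned by ff_sbr_sum_square_neon. */
    static float sbr_sum_square(float (*x)[2], int n)
    {
        float sum = 0.0f;
        for (int i = 0; i < n; i++)
            sum += x[i][0] * x[i][0] + x[i][1] * x[i][1];
        return sum;
    }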
+ vst2.32 {d1,d3}, [r0,:128]! + vst2.32 {d0,d2}, [r0,:128]! + pld [r2, #-32] + vrev64.32 q2, q2 + vswp d6, d7 + veor q2, q2, q8 + vld1.32 {q0}, [r2,:128], r3 + vld1.32 {q1}, [r1,:128]! + vst2.32 {d5,d7}, [r0,:128]! + vst2.32 {d4,d6}, [r0,:128]! + subs r12, r12, #8 + bgt 1b + bx lr +endfunc + +function ff_sbr_qmf_deint_neg_neon, export=1 + add r1, r1, #60*4 + add r2, r0, #62*4 + mov r3, #-16 + mov r12, #32 + vmov.i32 d2, #1<<31 +1: + vld2.32 {d0,d1}, [r1,:128], r3 + veor d0, d0, d2 + vrev64.32 d1, d1 + vst1.32 {d0}, [r2,:64] + vst1.32 {d1}, [r0,:64]! + sub r2, r2, #8 + subs r12, r12, #2 + bgt 1b + bx lr +endfunc + +function ff_sbr_qmf_deint_bfly_neon, export=1 + push {lr} + add r2, r2, #60*4 + add r3, r0, #124*4 + mov r12, #64 + mov lr, #-16 +1: + vld1.32 {q0}, [r1,:128]! + vld1.32 {q1}, [r2,:128], lr + vrev64.32 q2, q0 + vrev64.32 q3, q1 + vadd.f32 d3, d4, d3 + vadd.f32 d2, d5, d2 + vsub.f32 d0, d0, d7 + vsub.f32 d1, d1, d6 + vst1.32 {q1}, [r3,:128], lr + vst1.32 {q0}, [r0,:128]! + subs r12, r12, #4 + bgt 1b + pop {pc} +endfunc + +function ff_sbr_hf_g_filt_neon, export=1 + ldr r12, [sp] + add r1, r1, r12, lsl #3 + mov r12, #40*2*4 + sub r3, r3, #1 + vld2.32 {d2[],d3[]},[r2,:64]! + vld1.32 {d0}, [r1,:64], r12 +1: + vld1.32 {d1}, [r1,:64], r12 + vmul.f32 q3, q0, q1 + vld2.32 {d2[],d3[]},[r2,:64]! + vld1.32 {d0}, [r1,:64], r12 + vst1.32 {q3}, [r0,:64]! + subs r3, r3, #2 + bgt 1b + it lt + bxlt lr + vmul.f32 d0, d0, d2 + vst1.32 {d0}, [r0,:64]! + bx lr +endfunc + +function ff_sbr_hf_gen_neon, export=1 +NOVFP vld1.32 {d1[]}, [sp,:32] +VFP vdup.32 d1, d0[0] + vmul.f32 d0, d1, d1 + vld1.32 {d3}, [r2,:64] + vld1.32 {d2}, [r3,:64] + vmul.f32 q0, q0, q1 + ldrd r2, r3, [sp, #4*!HAVE_VFP_ARGS] + vtrn.32 d0, d1 + vneg.f32 d18, d1 + vtrn.32 d18, d1 + add r0, r0, r2, lsl #3 + add r1, r1, r2, lsl #3 + sub r1, r1, #2*8 + sub r3, r3, r2 + vld1.32 {q1}, [r1,:128]! +1: + vld1.32 {q3}, [r1,:128]! + vrev64.32 q2, q1 + vmov q8, q3 + vrev64.32 d20, d3 + vrev64.32 d21, d6 + vmla.f32 q3, q1, d0[0] + vmla.f32 d6, d4, d18 + vmla.f32 d7, d20, d18 + vmla.f32 d6, d3, d0[1] + vmla.f32 d7, d16, d0[1] + vmla.f32 d6, d5, d1 + vmla.f32 d7, d21, d1 + vmov q1, q8 + vst1.32 {q3}, [r0,:128]! + subs r3, r3, #2 + bgt 1b + bx lr +endfunc + +function ff_sbr_autocorrelate_neon, export=1 + vld1.32 {q0}, [r0,:128]! + vmov.f32 q1, #0.0 + vmov.f32 q3, #0.0 + vmov.f32 d20, #0.0 + vmul.f32 d21, d1, d1 + vmov q8, q0 + vmov q11, q0 + mov r12, #36 +1: + vld1.32 {q2}, [r0,:128]! + vrev64.32 q12, q2 + vmla.f32 q10, q2, q2 + vmla.f32 d2, d1, d4 + vmla.f32 d3, d1, d24 + vmla.f32 d6, d0, d4 + vmla.f32 d7, d0, d24 + vmla.f32 d2, d4, d5 + vmla.f32 d3, d4, d25 + vmla.f32 d6, d1, d5 + vmla.f32 d7, d1, d25 + vmov q0, q2 + subs r12, r12, #2 + bgt 1b + vld1.32 {q2}, [r0,:128]! + vrev64.32 q12, q2 + vmla.f32 d2, d1, d4 + vmla.f32 d3, d1, d24 + vmla.f32 d6, d0, d4 + vmla.f32 d7, d0, d24 + vadd.f32 d20, d20, d21 + vrev64.32 d18, d17 + vmla.f32 d6, d1, d5 + vmla.f32 d7, d1, d25 + vmov q0, q1 + vmla.f32 d0, d16, d17 + vmla.f32 d1, d16, d18 + vmla.f32 d2, d4, d5 + vmla.f32 d3, d4, d25 + vneg.f32 s15, s15 + vmov d21, d20 + vpadd.f32 d0, d0, d2 + vpadd.f32 d7, d6, d7 + vtrn.32 d1, d3 + vsub.f32 d6, d1, d3 + vmla.f32 d20, d22, d22 + vmla.f32 d21, d4, d4 + vtrn.32 d0, d6 + vpadd.f32 d20, d20, d21 + vst1.32 {q3}, [r1,:128]! 
+ vst1.32 {d20[1]}, [r1,:32] + add r1, r1, #2*4 + vst1.32 {d0}, [r1,:64] + add r1, r1, #4*4 + vst1.32 {d20[0]}, [r1,:32] + bx lr +endfunc + +function ff_sbr_hf_apply_noise_0_neon, export=1 + vmov.i32 d3, #0 +.Lhf_apply_noise_0: + push {r4,lr} + movrelx r4, X(ff_sbr_noise_table) + ldr r12, [sp, #12] + add r3, r3, #1 + bfc r3, #9, #23 + sub r12, r12, #1 +1: + add lr, r4, r3, lsl #3 + vld2.32 {q0}, [r0,:64] + vld2.32 {q3}, [lr,:64] + vld1.32 {d2}, [r1,:64]! + vld1.32 {d18}, [r2,:64]! + vceq.f32 d16, d2, #0 + veor d2, d2, d3 + vmov q2, q0 + vmla.f32 d0, d6, d18 + vmla.f32 d1, d7, d18 + vadd.f32 d4, d4, d2 + add r3, r3, #2 + bfc r3, #9, #23 + vbif d0, d4, d16 + vbif d1, d5, d16 + vst2.32 {q0}, [r0,:64]! + subs r12, r12, #2 + bgt 1b + blt 2f + add lr, r4, r3, lsl #3 + vld1.32 {d0}, [r0,:64] + vld1.32 {d6}, [lr,:64] + vld1.32 {d2[]}, [r1,:32]! + vld1.32 {d3[]}, [r2,:32]! + vceq.f32 d4, d2, #0 + veor d2, d2, d3 + vmov d1, d0 + vmla.f32 d0, d6, d3 + vadd.f32 s2, s2, s4 + vbif d0, d1, d4 + vst1.32 {d0}, [r0,:64]! +2: + pop {r4,pc} +endfunc + +function ff_sbr_hf_apply_noise_1_neon, export=1 + ldr r12, [sp] + push {r4,lr} + lsl r12, r12, #31 + eor lr, r12, #1<<31 + vmov d3, r12, lr +.Lhf_apply_noise_1: + movrelx r4, X(ff_sbr_noise_table) + ldr r12, [sp, #12] + add r3, r3, #1 + bfc r3, #9, #23 + sub r12, r12, #1 +1: + add lr, r4, r3, lsl #3 + vld2.32 {q0}, [r0,:64] + vld2.32 {q3}, [lr,:64] + vld1.32 {d2}, [r1,:64]! + vld1.32 {d18}, [r2,:64]! + vceq.f32 d16, d2, #0 + veor d2, d2, d3 + vmov q2, q0 + vmla.f32 d0, d6, d18 + vmla.f32 d1, d7, d18 + vadd.f32 d5, d5, d2 + add r3, r3, #2 + bfc r3, #9, #23 + vbif d0, d4, d16 + vbif d1, d5, d16 + vst2.32 {q0}, [r0,:64]! + subs r12, r12, #2 + bgt 1b + blt 2f + add lr, r4, r3, lsl #3 + vld1.32 {d0}, [r0,:64] + vld1.32 {d6}, [lr,:64] + vld1.32 {d2[]}, [r1,:32]! + vld1.32 {d18[]}, [r2,:32]! + vceq.f32 d4, d2, #0 + veor d2, d2, d3 + vmov d1, d0 + vmla.f32 d0, d6, d18 + vadd.f32 s3, s3, s5 + vbif d0, d1, d4 + vst1.32 {d0}, [r0,:64]! +2: + pop {r4,pc} +endfunc + +function ff_sbr_hf_apply_noise_2_neon, export=1 + vmov.i32 d3, #1<<31 + b .Lhf_apply_noise_0 +endfunc + +function ff_sbr_hf_apply_noise_3_neon, export=1 + ldr r12, [sp] + push {r4,lr} + lsl r12, r12, #31 + eor lr, r12, #1<<31 + vmov d3, lr, r12 + b .Lhf_apply_noise_1 +endfunc diff --git a/ffmpeg/libavcodec/arm/simple_idct_arm.S b/ffmpeg/libavcodec/arm/simple_idct_arm.S new file mode 100644 index 0000000..dd1c815 --- /dev/null +++ b/ffmpeg/libavcodec/arm/simple_idct_arm.S @@ -0,0 +1,479 @@ +/* + * Copyright (C) 2002 Frederic 'dilb' Boulay + * + * Author: Frederic Boulay + * + * The function defined in this file is derived from the simple_idct function + * from the libavcodec library part of the FFmpeg project. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +/* useful constants for the algorithm */ +#define W1 22725 +#define W2 21407 +#define W3 19266 +#define W4 16383 +#define W5 12873 +#define W6 8867 +#define W7 4520 +#define MASK_MSHW 0xFFFF0000 + +#define ROW_SHIFT 11 +#define ROW_SHIFT2MSHW (16-11) +#define COL_SHIFT 20 +#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */ +#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */ + + +function ff_simple_idct_arm, export=1 + @@ void simple_idct_arm(int16_t *block) + @@ save stack for reg needed (take all of them), + @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block + @@ so it must not be overwritten, if it is not saved!! + @@ R12 is another scratch register, so it should not be saved too + @@ save all registers + stmfd sp!, {r4-r11, r14} @ R14 is also called LR + @@ at this point, R0=block, other registers are free. + add r14, r0, #112 @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R12=block. + @@ add 2 temporary variables in the stack: R0 and R14 + sub sp, sp, #8 @ allow 2 local variables + str r0, [sp, #0] @ save block in sp[0] + @@ stack status + @@ sp+4 free + @@ sp+0 R0 (block) + + + @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free + + +__row_loop: + @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :) + ldr r1, [r14, #0] @ R1=(int32)(R12)[0]=ROWr32[0] (relative row cast to a 32b pointer) + ldr r2, [r14, #4] @ R2=(int32)(R12)[1]=ROWr32[1] + ldr r3, [r14, #8] @ R3=ROWr32[2] + ldr r4, [r14, #12] @ R4=ROWr32[3] + @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop), + @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row) + @@ else follow the complete algorithm. 
+ @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1], + @@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free + orr r5, r4, r3 @ R5=R4 | R3 + orr r5, r5, r2 @ R5=R4 | R3 | R2 + orrs r6, r5, r1 @ Test R5 | R1 (the aim is to check if everything is null) + beq __end_row_loop + mov r7, r1, asr #16 @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later) + ldrsh r6, [r14, #0] @ R6=ROWr16[0] + orrs r5, r5, r7 @ R5=R4 | R3 | R2 | R7 + beq __almost_empty_row + +__b_evaluation: + @@ at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3], + @@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free, + @@ R12=__const_ptr_, R14=&block[n] + @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3 + + @@ MUL16(b0, W1, row[1]); + @@ MUL16(b1, W3, row[1]); + @@ MUL16(b2, W5, row[1]); + @@ MUL16(b3, W7, row[1]); + @@ MAC16(b0, W3, row[3]); + @@ MAC16(b1, -W7, row[3]); + @@ MAC16(b2, -W1, row[3]); + @@ MAC16(b3, -W5, row[3]); + ldr r8, =W1 @ R8=W1 + mov r2, r2, asr #16 @ R2=ROWr16[3] + mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + ldr r9, =W3 @ R9=W3 + ldr r10, =W5 @ R10=W5 + mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + ldr r11, =W7 @ R11=W7 + mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + teq r2, #0 @ if null avoid muls + itttt ne + mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + rsbne r2, r2, #0 @ R2=-ROWr16[3] + mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + it ne + mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + + @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3], + @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, + @@ R12=__const_ptr_, R14=&block[n] + @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; + @@ if (temp != 0) {} + orrs r2, r3, r4 @ R2=ROWr32[2] | ROWr32[3] + beq __end_b_evaluation + + @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3], + @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, + @@ R12=__const_ptr_, R14=&block[n] + @@ MAC16(b0, W5, row[5]); + @@ MAC16(b2, W7, row[5]); + @@ MAC16(b3, W3, row[5]); + @@ MAC16(b1, -W1, row[5]); + @@ MAC16(b0, W7, row[7]); + @@ MAC16(b2, W3, row[7]); + @@ MAC16(b3, -W1, row[7]); + @@ MAC16(b1, -W5, row[7]); + mov r3, r3, asr #16 @ R3=ROWr16[5] + teq r3, #0 @ if null avoid muls + it ne + mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0 + mov r4, r4, asr #16 @ R4=ROWr16[7] + itttt ne + mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2 + mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3 + rsbne r3, r3, #0 @ R3=-ROWr16[5] + mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5]=b1 + @@ R3 is free now + teq r4, #0 @ if null avoid muls + itttt ne + mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0 + mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2 + rsbne r4, r4, #0 @ R4=-ROWr16[7] + mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3 + it ne + mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1 + @@ R4 is free now 
+__end_b_evaluation: + @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free), + @@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), + @@ R12=__const_ptr_, R14=&block[n] + +__a_evaluation: + @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1)); + @@ a1 = a0 + W6 * row[2]; + @@ a2 = a0 - W6 * row[2]; + @@ a3 = a0 - W2 * row[2]; + @@ a0 = a0 + W2 * row[2]; + ldr r9, =W4 @ R9=W4 + mul r6, r9, r6 @ R6=W4*ROWr16[0] + ldr r10, =W6 @ R10=W6 + ldrsh r4, [r14, #4] @ R4=ROWr16[2] (a3 not defined yet) + add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0) + + mul r11, r10, r4 @ R11=W6*ROWr16[2] + ldr r8, =W2 @ R8=W2 + sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2) + @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; + @@ if (temp != 0) {} + teq r2, #0 + beq __end_bef_a_evaluation + + add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) + mul r11, r8, r4 @ R11=W2*ROWr16[2] + sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) + add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) + + + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, + @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free), + @@ R12=__const_ptr_, R14=&block[n] + + + @@ a0 += W4*row[4] + @@ a1 -= W4*row[4] + @@ a2 -= W4*row[4] + @@ a3 += W4*row[4] + ldrsh r11, [r14, #8] @ R11=ROWr16[4] + teq r11, #0 @ if null avoid muls + it ne + mulne r11, r9, r11 @ R11=W4*ROWr16[4] + @@ R9 is free now + ldrsh r9, [r14, #12] @ R9=ROWr16[6] + itttt ne + addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) + subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) + subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) + addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) + @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead + teq r9, #0 @ if null avoid muls + itttt ne + mulne r11, r10, r9 @ R11=W6*ROWr16[6] + addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) + mulne r10, r8, r9 @ R10=W2*ROWr16[6] + @@ a0 += W6*row[6]; + @@ a3 -= W6*row[6]; + @@ a1 -= W2*row[6]; + @@ a2 += W2*row[6]; + subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) + itt ne + subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) + addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) + +__end_a_evaluation: + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, + @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), + @@ R12=__const_ptr_, R14=&block[n] + @@ row[0] = (a0 + b0) >> ROW_SHIFT; + @@ row[1] = (a1 + b1) >> ROW_SHIFT; + @@ row[2] = (a2 + b2) >> ROW_SHIFT; + @@ row[3] = (a3 + b3) >> ROW_SHIFT; + @@ row[4] = (a3 - b3) >> ROW_SHIFT; + @@ row[5] = (a2 - b2) >> ROW_SHIFT; + @@ row[6] = (a1 - b1) >> ROW_SHIFT; + @@ row[7] = (a0 - b0) >> ROW_SHIFT; + add r8, r6, r0 @ R8=a0+b0 + add r9, r2, r1 @ R9=a1+b1 + @@ put 2 16 bits half-words in a 32bits word + @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!) 
+ ldr r10, =MASK_MSHW @ R10=0xFFFF0000 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5) + mvn r11, r10 @ R11= NOT R10= 0x0000FFFF + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11) + orr r8, r8, r9 + str r8, [r14, #0] + + add r8, r3, r5 @ R8=a2+b2 + add r9, r4, r7 @ R9=a3+b3 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5) + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11) + orr r8, r8, r9 + str r8, [r14, #4] + + sub r8, r4, r7 @ R8=a3-b3 + sub r9, r3, r5 @ R9=a2-b2 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5) + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11) + orr r8, r8, r9 + str r8, [r14, #8] + + sub r8, r2, r1 @ R8=a1-b1 + sub r9, r6, r0 @ R9=a0-b0 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5) + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11) + orr r8, r8, r9 + str r8, [r14, #12] + + bal __end_row_loop + +__almost_empty_row: + @@ the row was empty, except ROWr16[0], now, management of this special case + @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1], + @@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1], + @@ R8=0xFFFF (temp), R9-R11 free + mov r8, #0x10000 @ R8=0xFFFF (2 steps needed!) it saves a ldr call (because of delay run). + sub r8, r8, #1 @ R8 is now ready. + and r5, r8, r6, lsl #3 @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF + orr r5, r5, r5, lsl #16 @ R5=R5 | (R5<<16) + str r5, [r14, #0] @ R14[0]=ROWr32[0]=R5 + str r5, [r14, #4] @ R14[4]=ROWr32[1]=R5 + str r5, [r14, #8] @ R14[8]=ROWr32[2]=R5 + str r5, [r14, #12] @ R14[12]=ROWr32[3]=R5 + +__end_row_loop: + @@ at this point, R0-R11 (free) + @@ R12=__const_ptr_, R14=&block[n] + ldr r0, [sp, #0] @ R0=block + teq r0, r14 @ compare current &block[8*n] to block, when block is reached, the loop is finished. + sub r14, r14, #16 + bne __row_loop + + + + @@ at this point, R0=block, R1-R11 (free) + @@ R12=__const_ptr_, R14=&block[n] + add r14, r0, #14 @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block. 
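Stripped of the register bookkeeping, the row pass above is the even/odd 8-point IDCT butterfly spelled out in the MUL16/MAC16 comments. A compact C restatement, repeating the W1..W7 and ROW_SHIFT constants from the top of this file (a sketch of what the comments describe, not a drop-in replacement):

    #include <stdint.h>

    #define W1 22725
    #define W2 21407
    #define W3 19266
    #define W4 16383
    #define W5 12873
    #define W6 8867
    #define W7 4520
    #define ROW_SHIFT 11

    static void idct_row(int16_t row[8])
    {
        int a0 = W4 * row[0] + (1 << (ROW_SHIFT - 1));
        int a1 = a0 + W6 * row[2] - W4 * row[4] - W2 * row[6];
        int a2 = a0 - W6 * row[2] - W4 * row[4] + W2 * row[6];
        int a3 = a0 - W2 * row[2] + W4 * row[4] - W6 * row[6];
        a0    +=      W2 * row[2] + W4 * row[4] + W6 * row[6];

        int b0 = W1 * row[1] + W3 * row[3] + W5 * row[5] + W7 * row[7];
        int b1 = W3 * row[1] - W7 * row[3] - W1 * row[5] - W5 * row[7];
        int b2 = W5 * row[1] - W1 * row[3] + W7 * row[5] + W3 * row[7];
        int b3 = W7 * row[1] - W5 * row[3] + W3 * row[5] - W1 * row[7];

        row[0] = (a0 + b0) >> ROW_SHIFT;
        row[1] = (a1 + b1) >> ROW_SHIFT;
        row[2] = (a2 + b2) >> ROW_SHIFT;
        row[3] = (a3 + b3) >> ROW_SHIFT;
        row[4] = (a3 - b3) >> ROW_SHIFT;
        row[5] = (a2 - b2) >> ROW_SHIFT;
        row[6] = (a1 - b1) >> ROW_SHIFT;
        row[7] = (a0 - b0) >> ROW_SHIFT;
    }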
+__col_loop: + +__b_evaluation2: + @@ at this point, R0=block (temp), R1-R11 (free) + @@ R12=__const_ptr_, R14=&block[n] + @@ proceed with b0-b3 first, followed by a0-a3 + @@ MUL16(b0, W1, col[8x1]); + @@ MUL16(b1, W3, col[8x1]); + @@ MUL16(b2, W5, col[8x1]); + @@ MUL16(b3, W7, col[8x1]); + @@ MAC16(b0, W3, col[8x3]); + @@ MAC16(b1, -W7, col[8x3]); + @@ MAC16(b2, -W1, col[8x3]); + @@ MAC16(b3, -W5, col[8x3]); + ldr r8, =W1 @ R8=W1 + ldrsh r7, [r14, #16] + mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + ldr r9, =W3 @ R9=W3 + ldr r10, =W5 @ R10=W5 + mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + ldr r11, =W7 @ R11=W7 + mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + ldrsh r2, [r14, #48] + mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + teq r2, #0 @ if 0, then avoid muls + itttt ne + mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + rsbne r2, r2, #0 @ R2=-ROWr16[3] + mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + it ne + mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + + @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), + @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, + @@ R12=__const_ptr_, R14=&block[n] + @@ MAC16(b0, W5, col[5x8]); + @@ MAC16(b2, W7, col[5x8]); + @@ MAC16(b3, W3, col[5x8]); + @@ MAC16(b1, -W1, col[5x8]); + @@ MAC16(b0, W7, col[7x8]); + @@ MAC16(b2, W3, col[7x8]); + @@ MAC16(b3, -W1, col[7x8]); + @@ MAC16(b1, -W5, col[7x8]); + ldrsh r3, [r14, #80] @ R3=COLr16[5x8] + teq r3, #0 @ if 0 then avoid muls + itttt ne + mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0 + mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2 + mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3 + rsbne r3, r3, #0 @ R3=-ROWr16[5x8] + ldrsh r4, [r14, #112] @ R4=COLr16[7x8] + it ne + mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5x8]=b1 + @@ R3 is free now + teq r4, #0 @ if 0 then avoid muls + itttt ne + mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0 + mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2 + rsbne r4, r4, #0 @ R4=-ROWr16[7x8] + mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3 + it ne + mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1 + @@ R4 is free now +__end_b_evaluation2: + @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), + @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), + @@ R12=__const_ptr_, R14=&block[n] + +__a_evaluation2: + @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1)); + @@ a1 = a0 + W6 * row[2]; + @@ a2 = a0 - W6 * row[2]; + @@ a3 = a0 - W2 * row[2]; + @@ a0 = a0 + W2 * row[2]; + ldrsh r6, [r14, #0] + ldr r9, =W4 @ R9=W4 + mul r6, r9, r6 @ R6=W4*ROWr16[0] + ldr r10, =W6 @ R10=W6 + ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet) + add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0) + mul r11, r10, r4 @ R11=W6*ROWr16[2] + ldr r8, =W2 @ R8=W2 + add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) + sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2) + mul r11, r8, r4 @ R11=W2*ROWr16[2] + sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) + add r6, r6, r11 @ 
R6=a0+W2*ROWr16[2] (a0) + + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, + @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free), + @@ R12=__const_ptr_, R14=&block[n] + @@ a0 += W4*row[4] + @@ a1 -= W4*row[4] + @@ a2 -= W4*row[4] + @@ a3 += W4*row[4] + ldrsh r11, [r14, #64] @ R11=ROWr16[4] + teq r11, #0 @ if null avoid muls + itttt ne + mulne r11, r9, r11 @ R11=W4*ROWr16[4] + @@ R9 is free now + addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) + subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) + subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) + ldrsh r9, [r14, #96] @ R9=ROWr16[6] + it ne + addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) + @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead + teq r9, #0 @ if null avoid muls + itttt ne + mulne r11, r10, r9 @ R11=W6*ROWr16[6] + addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) + mulne r10, r8, r9 @ R10=W2*ROWr16[6] + @@ a0 += W6*row[6]; + @@ a3 -= W6*row[6]; + @@ a1 -= W2*row[6]; + @@ a2 += W2*row[6]; + subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) + itt ne + subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) + addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) +__end_a_evaluation2: + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, + @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), + @@ R12=__const_ptr_, R14=&block[n] + @@ col[0 ] = ((a0 + b0) >> COL_SHIFT); + @@ col[8 ] = ((a1 + b1) >> COL_SHIFT); + @@ col[16] = ((a2 + b2) >> COL_SHIFT); + @@ col[24] = ((a3 + b3) >> COL_SHIFT); + @@ col[32] = ((a3 - b3) >> COL_SHIFT); + @@ col[40] = ((a2 - b2) >> COL_SHIFT); + @@ col[48] = ((a1 - b1) >> COL_SHIFT); + @@ col[56] = ((a0 - b0) >> COL_SHIFT); + @@@@@ no optimization here @@@@@ + add r8, r6, r0 @ R8=a0+b0 + add r9, r2, r1 @ R9=a1+b1 + mov r8, r8, asr #COL_SHIFT + mov r9, r9, asr #COL_SHIFT + strh r8, [r14, #0] + strh r9, [r14, #16] + add r8, r3, r5 @ R8=a2+b2 + add r9, r4, r7 @ R9=a3+b3 + mov r8, r8, asr #COL_SHIFT + mov r9, r9, asr #COL_SHIFT + strh r8, [r14, #32] + strh r9, [r14, #48] + sub r8, r4, r7 @ R8=a3-b3 + sub r9, r3, r5 @ R9=a2-b2 + mov r8, r8, asr #COL_SHIFT + mov r9, r9, asr #COL_SHIFT + strh r8, [r14, #64] + strh r9, [r14, #80] + sub r8, r2, r1 @ R8=a1-b1 + sub r9, r6, r0 @ R9=a0-b0 + mov r8, r8, asr #COL_SHIFT + mov r9, r9, asr #COL_SHIFT + strh r8, [r14, #96] + strh r9, [r14, #112] + +__end_col_loop: + @@ at this point, R0-R11 (free) + @@ R12=__const_ptr_, R14=&block[n] + ldr r0, [sp, #0] @ R0=block + teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished. + sub r14, r14, #2 + bne __col_loop + + + + +__end_simple_idct_arm: + @@ restore registers to previous status! + add sp, sp, #8 @@ the local variables! + ldmfd sp!, {r4-r11, r15} @@ update PC with LR content. + + + +@@ kind of sub-function, here not to overload the common case. +__end_bef_a_evaluation: + add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) + mul r11, r8, r4 @ R11=W2*ROWr16[2] + sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) + add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) + bal __end_a_evaluation diff --git a/ffmpeg/libavcodec/arm/simple_idct_armv5te.S b/ffmpeg/libavcodec/arm/simple_idct_armv5te.S new file mode 100644 index 0000000..d1f10b7 --- /dev/null +++ b/ffmpeg/libavcodec/arm/simple_idct_armv5te.S @@ -0,0 +1,620 @@ +/* + * Simple IDCT + * + * Copyright (c) 2001 Michael Niedermayer + * Copyright (c) 2006 Mans Rullgard + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define ROW_SHIFT 11 +#define COL_SHIFT 20 + +#define W13 (W1 | (W3 << 16)) +#define W26 (W2 | (W6 << 16)) +#define W57 (W5 | (W7 << 16)) + +function idct_row_armv5te + str lr, [sp, #-4]! + + ldrd v1, v2, [a1, #8] + ldrd a3, a4, [a1] /* a3 = row[1:0], a4 = row[3:2] */ + orrs v1, v1, v2 + itt eq + cmpeq v1, a4 + cmpeq v1, a3, lsr #16 + beq row_dc_only + + mov v1, #(1<<(ROW_SHIFT-1)) + mov ip, #16384 + sub ip, ip, #1 /* ip = W4 */ + smlabb v1, ip, a3, v1 /* v1 = W4*row[0]+(1<<(RS-1)) */ + ldr ip, =W26 /* ip = W2 | (W6 << 16) */ + smultb a2, ip, a4 + smulbb lr, ip, a4 + add v2, v1, a2 + sub v3, v1, a2 + sub v4, v1, lr + add v1, v1, lr + + ldr ip, =W13 /* ip = W1 | (W3 << 16) */ + ldr lr, =W57 /* lr = W5 | (W7 << 16) */ + smulbt v5, ip, a3 + smultt v6, lr, a4 + smlatt v5, ip, a4, v5 + smultt a2, ip, a3 + smulbt v7, lr, a3 + sub v6, v6, a2 + smulbt a2, ip, a4 + smultt fp, lr, a3 + sub v7, v7, a2 + smulbt a2, lr, a4 + ldrd a3, a4, [a1, #8] /* a3=row[5:4] a4=row[7:6] */ + sub fp, fp, a2 + + orrs a2, a3, a4 + beq 1f + + smlabt v5, lr, a3, v5 + smlabt v6, ip, a3, v6 + smlatt v5, lr, a4, v5 + smlabt v6, lr, a4, v6 + smlatt v7, lr, a3, v7 + smlatt fp, ip, a3, fp + smulbt a2, ip, a4 + smlatt v7, ip, a4, v7 + sub fp, fp, a2 + + ldr ip, =W26 /* ip = W2 | (W6 << 16) */ + mov a2, #16384 + sub a2, a2, #1 /* a2 = W4 */ + smulbb a2, a2, a3 /* a2 = W4*row[4] */ + smultb lr, ip, a4 /* lr = W6*row[6] */ + add v1, v1, a2 /* v1 += W4*row[4] */ + add v1, v1, lr /* v1 += W6*row[6] */ + add v4, v4, a2 /* v4 += W4*row[4] */ + sub v4, v4, lr /* v4 -= W6*row[6] */ + smulbb lr, ip, a4 /* lr = W2*row[6] */ + sub v2, v2, a2 /* v2 -= W4*row[4] */ + sub v2, v2, lr /* v2 -= W2*row[6] */ + sub v3, v3, a2 /* v3 -= W4*row[4] */ + add v3, v3, lr /* v3 += W2*row[6] */ + +1: add a2, v1, v5 + mov a3, a2, lsr #11 + bic a3, a3, #0x1f0000 + sub a2, v2, v6 + mov a2, a2, lsr #11 + add a3, a3, a2, lsl #16 + add a2, v3, v7 + mov a4, a2, lsr #11 + bic a4, a4, #0x1f0000 + add a2, v4, fp + mov a2, a2, lsr #11 + add a4, a4, a2, lsl #16 + strd a3, a4, [a1] + + sub a2, v4, fp + mov a3, a2, lsr #11 + bic a3, a3, #0x1f0000 + sub a2, v3, v7 + mov a2, a2, lsr #11 + add a3, a3, a2, lsl #16 + add a2, v2, v6 + mov a4, a2, lsr #11 + bic a4, a4, #0x1f0000 + sub a2, v1, v5 + mov a2, a2, lsr #11 + add a4, a4, a2, lsl #16 + strd a3, a4, [a1, #8] + + ldr pc, [sp], #4 + 
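+/* Editor's note (illustrative sketch, not part of the original source):
+ * the DC-only path below replicates (row[0] * 8) across the whole row.
+ * row[0] is first copied into both halfwords of a3; the "bic #0xe000"
+ * clears the top three bits of the low halfword so that the following
+ * << 3 cannot carry into the upper halfword. Roughly, in C:
+ *     int16_t dc = row[0] << 3;
+ *     for (i = 0; i < 8; i++)
+ *         row[i] = dc;
+ */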
+row_dc_only: + orr a3, a3, a3, lsl #16 + bic a3, a3, #0xe000 + mov a3, a3, lsl #3 + mov a4, a3 + strd a3, a4, [a1] + strd a3, a4, [a1, #8] + + ldr pc, [sp], #4 +endfunc + + .macro idct_col + ldr a4, [a1] /* a4 = col[1:0] */ + mov ip, #16384 + sub ip, ip, #1 /* ip = W4 */ +#if 0 + mov v1, #(1<<(COL_SHIFT-1)) + smlabt v2, ip, a4, v1 /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */ + smlabb v1, ip, a4, v1 /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */ + ldr a4, [a1, #(16*4)] +#else + mov v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */ + add v2, v1, a4, asr #16 + rsb v2, v2, v2, lsl #14 + mov a4, a4, lsl #16 + add v1, v1, a4, asr #16 + ldr a4, [a1, #(16*4)] + rsb v1, v1, v1, lsl #14 +#endif + + smulbb lr, ip, a4 + smulbt a3, ip, a4 + sub v3, v1, lr + sub v5, v1, lr + add v7, v1, lr + add v1, v1, lr + sub v4, v2, a3 + sub v6, v2, a3 + add fp, v2, a3 + ldr ip, =W26 + ldr a4, [a1, #(16*2)] + add v2, v2, a3 + + smulbb lr, ip, a4 + smultb a3, ip, a4 + add v1, v1, lr + sub v7, v7, lr + add v3, v3, a3 + sub v5, v5, a3 + smulbt lr, ip, a4 + smultt a3, ip, a4 + add v2, v2, lr + sub fp, fp, lr + add v4, v4, a3 + ldr a4, [a1, #(16*6)] + sub v6, v6, a3 + + smultb lr, ip, a4 + smulbb a3, ip, a4 + add v1, v1, lr + sub v7, v7, lr + sub v3, v3, a3 + add v5, v5, a3 + smultt lr, ip, a4 + smulbt a3, ip, a4 + add v2, v2, lr + sub fp, fp, lr + sub v4, v4, a3 + add v6, v6, a3 + + stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp} + + ldr ip, =W13 + ldr a4, [a1, #(16*1)] + ldr lr, =W57 + smulbb v1, ip, a4 + smultb v3, ip, a4 + smulbb v5, lr, a4 + smultb v7, lr, a4 + smulbt v2, ip, a4 + smultt v4, ip, a4 + smulbt v6, lr, a4 + smultt fp, lr, a4 + rsb v4, v4, #0 + ldr a4, [a1, #(16*3)] + rsb v3, v3, #0 + + smlatb v1, ip, a4, v1 + smlatb v3, lr, a4, v3 + smulbb a3, ip, a4 + smulbb a2, lr, a4 + sub v5, v5, a3 + sub v7, v7, a2 + smlatt v2, ip, a4, v2 + smlatt v4, lr, a4, v4 + smulbt a3, ip, a4 + smulbt a2, lr, a4 + sub v6, v6, a3 + ldr a4, [a1, #(16*5)] + sub fp, fp, a2 + + smlabb v1, lr, a4, v1 + smlabb v3, ip, a4, v3 + smlatb v5, lr, a4, v5 + smlatb v7, ip, a4, v7 + smlabt v2, lr, a4, v2 + smlabt v4, ip, a4, v4 + smlatt v6, lr, a4, v6 + ldr a3, [a1, #(16*7)] + smlatt fp, ip, a4, fp + + smlatb v1, lr, a3, v1 + smlabb v3, lr, a3, v3 + smlatb v5, ip, a3, v5 + smulbb a4, ip, a3 + smlatt v2, lr, a3, v2 + sub v7, v7, a4 + smlabt v4, lr, a3, v4 + smulbt a4, ip, a3 + smlatt v6, ip, a3, v6 + sub fp, fp, a4 + .endm + +function idct_col_armv5te + str lr, [sp, #-4]! 
+ + idct_col + + ldmfd sp!, {a3, a4} + adds a2, a3, v1 + mov a2, a2, lsr #20 + it mi + orrmi a2, a2, #0xf000 + add ip, a4, v2 + mov ip, ip, asr #20 + orr a2, a2, ip, lsl #16 + str a2, [a1] + subs a3, a3, v1 + mov a2, a3, lsr #20 + it mi + orrmi a2, a2, #0xf000 + sub a4, a4, v2 + mov a4, a4, asr #20 + orr a2, a2, a4, lsl #16 + ldmfd sp!, {a3, a4} + str a2, [a1, #(16*7)] + + subs a2, a3, v3 + mov a2, a2, lsr #20 + it mi + orrmi a2, a2, #0xf000 + sub ip, a4, v4 + mov ip, ip, asr #20 + orr a2, a2, ip, lsl #16 + str a2, [a1, #(16*1)] + adds a3, a3, v3 + mov a2, a3, lsr #20 + it mi + orrmi a2, a2, #0xf000 + add a4, a4, v4 + mov a4, a4, asr #20 + orr a2, a2, a4, lsl #16 + ldmfd sp!, {a3, a4} + str a2, [a1, #(16*6)] + + adds a2, a3, v5 + mov a2, a2, lsr #20 + it mi + orrmi a2, a2, #0xf000 + add ip, a4, v6 + mov ip, ip, asr #20 + orr a2, a2, ip, lsl #16 + str a2, [a1, #(16*2)] + subs a3, a3, v5 + mov a2, a3, lsr #20 + it mi + orrmi a2, a2, #0xf000 + sub a4, a4, v6 + mov a4, a4, asr #20 + orr a2, a2, a4, lsl #16 + ldmfd sp!, {a3, a4} + str a2, [a1, #(16*5)] + + adds a2, a3, v7 + mov a2, a2, lsr #20 + it mi + orrmi a2, a2, #0xf000 + add ip, a4, fp + mov ip, ip, asr #20 + orr a2, a2, ip, lsl #16 + str a2, [a1, #(16*3)] + subs a3, a3, v7 + mov a2, a3, lsr #20 + it mi + orrmi a2, a2, #0xf000 + sub a4, a4, fp + mov a4, a4, asr #20 + orr a2, a2, a4, lsl #16 + str a2, [a1, #(16*4)] + + ldr pc, [sp], #4 +endfunc + +.macro clip dst, src:vararg + movs \dst, \src + it mi + movmi \dst, #0 + cmp \dst, #255 + it gt + movgt \dst, #255 +.endm + +.macro aclip dst, src:vararg + adds \dst, \src + it mi + movmi \dst, #0 + cmp \dst, #255 + it gt + movgt \dst, #255 +.endm + +function idct_col_put_armv5te + str lr, [sp, #-4]! + + idct_col + + ldmfd sp!, {a3, a4} + ldr lr, [sp, #32] + add a2, a3, v1 + clip a2, a2, asr #20 + add ip, a4, v2 + clip ip, ip, asr #20 + orr a2, a2, ip, lsl #8 + sub a3, a3, v1 + clip a3, a3, asr #20 + sub a4, a4, v2 + clip a4, a4, asr #20 + ldr v1, [sp, #28] + strh a2, [v1] + add a2, v1, #2 + str a2, [sp, #28] + orr a2, a3, a4, lsl #8 + rsb v2, lr, lr, lsl #3 + ldmfd sp!, {a3, a4} + strh_pre a2, v2, v1 + + sub a2, a3, v3 + clip a2, a2, asr #20 + sub ip, a4, v4 + clip ip, ip, asr #20 + orr a2, a2, ip, lsl #8 + strh_pre a2, v1, lr + add a3, a3, v3 + clip a2, a3, asr #20 + add a4, a4, v4 + clip a4, a4, asr #20 + orr a2, a2, a4, lsl #8 + ldmfd sp!, {a3, a4} + strh_dpre a2, v2, lr + + add a2, a3, v5 + clip a2, a2, asr #20 + add ip, a4, v6 + clip ip, ip, asr #20 + orr a2, a2, ip, lsl #8 + strh_pre a2, v1, lr + sub a3, a3, v5 + clip a2, a3, asr #20 + sub a4, a4, v6 + clip a4, a4, asr #20 + orr a2, a2, a4, lsl #8 + ldmfd sp!, {a3, a4} + strh_dpre a2, v2, lr + + add a2, a3, v7 + clip a2, a2, asr #20 + add ip, a4, fp + clip ip, ip, asr #20 + orr a2, a2, ip, lsl #8 + strh a2, [v1, lr] + sub a3, a3, v7 + clip a2, a3, asr #20 + sub a4, a4, fp + clip a4, a4, asr #20 + orr a2, a2, a4, lsl #8 + strh_dpre a2, v2, lr + + ldr pc, [sp], #4 +endfunc + +function idct_col_add_armv5te + str lr, [sp, #-4]! 
+ + idct_col + + ldr lr, [sp, #36] + + ldmfd sp!, {a3, a4} + ldrh ip, [lr] + add a2, a3, v1 + sub a3, a3, v1 + and v1, ip, #255 + aclip a2, v1, a2, asr #20 + add v1, a4, v2 + mov v1, v1, asr #20 + aclip v1, v1, ip, lsr #8 + orr a2, a2, v1, lsl #8 + ldr v1, [sp, #32] + sub a4, a4, v2 + rsb v2, v1, v1, lsl #3 + ldrh_pre ip, v2, lr + strh a2, [lr] + and a2, ip, #255 + aclip a3, a2, a3, asr #20 + mov a4, a4, asr #20 + aclip a4, a4, ip, lsr #8 + add a2, lr, #2 + str a2, [sp, #28] + orr a2, a3, a4, lsl #8 + strh a2, [v2] + + ldmfd sp!, {a3, a4} + ldrh_pre ip, lr, v1 + sub a2, a3, v3 + add a3, a3, v3 + and v3, ip, #255 + aclip a2, v3, a2, asr #20 + sub v3, a4, v4 + mov v3, v3, asr #20 + aclip v3, v3, ip, lsr #8 + orr a2, a2, v3, lsl #8 + add a4, a4, v4 + ldrh_dpre ip, v2, v1 + strh a2, [lr] + and a2, ip, #255 + aclip a3, a2, a3, asr #20 + mov a4, a4, asr #20 + aclip a4, a4, ip, lsr #8 + orr a2, a3, a4, lsl #8 + strh a2, [v2] + + ldmfd sp!, {a3, a4} + ldrh_pre ip, lr, v1 + add a2, a3, v5 + sub a3, a3, v5 + and v3, ip, #255 + aclip a2, v3, a2, asr #20 + add v3, a4, v6 + mov v3, v3, asr #20 + aclip v3, v3, ip, lsr #8 + orr a2, a2, v3, lsl #8 + sub a4, a4, v6 + ldrh_dpre ip, v2, v1 + strh a2, [lr] + and a2, ip, #255 + aclip a3, a2, a3, asr #20 + mov a4, a4, asr #20 + aclip a4, a4, ip, lsr #8 + orr a2, a3, a4, lsl #8 + strh a2, [v2] + + ldmfd sp!, {a3, a4} + ldrh_pre ip, lr, v1 + add a2, a3, v7 + sub a3, a3, v7 + and v3, ip, #255 + aclip a2, v3, a2, asr #20 + add v3, a4, fp + mov v3, v3, asr #20 + aclip v3, v3, ip, lsr #8 + orr a2, a2, v3, lsl #8 + sub a4, a4, fp + ldrh_dpre ip, v2, v1 + strh a2, [lr] + and a2, ip, #255 + aclip a3, a2, a3, asr #20 + mov a4, a4, asr #20 + aclip a4, a4, ip, lsr #8 + orr a2, a3, a4, lsl #8 + strh a2, [v2] + + ldr pc, [sp], #4 +endfunc + +function ff_simple_idct_armv5te, export=1 + stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr} + + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + + sub a1, a1, #(16*7) + + bl idct_col_armv5te + add a1, a1, #4 + bl idct_col_armv5te + add a1, a1, #4 + bl idct_col_armv5te + add a1, a1, #4 + bl idct_col_armv5te + + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} +endfunc + +function ff_simple_idct_add_armv5te, export=1 + stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} + + mov a1, a3 + + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + + sub a1, a1, #(16*7) + + bl idct_col_add_armv5te + add a1, a1, #4 + bl idct_col_add_armv5te + add a1, a1, #4 + bl idct_col_add_armv5te + add a1, a1, #4 + bl idct_col_add_armv5te + + add sp, sp, #8 + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} +endfunc + +function ff_simple_idct_put_armv5te, export=1 + stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} + + mov a1, a3 + + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + + sub a1, a1, #(16*7) + + bl 
idct_col_put_armv5te + add a1, a1, #4 + bl idct_col_put_armv5te + add a1, a1, #4 + bl idct_col_put_armv5te + add a1, a1, #4 + bl idct_col_put_armv5te + + add sp, sp, #8 + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} +endfunc diff --git a/ffmpeg/libavcodec/arm/simple_idct_armv6.S b/ffmpeg/libavcodec/arm/simple_idct_armv6.S new file mode 100644 index 0000000..79cf5d4 --- /dev/null +++ b/ffmpeg/libavcodec/arm/simple_idct_armv6.S @@ -0,0 +1,425 @@ +/* + * Simple IDCT + * + * Copyright (c) 2001 Michael Niedermayer + * Copyright (c) 2007 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define ROW_SHIFT 11 +#define COL_SHIFT 20 + +#define W13 (W1 | (W3 << 16)) +#define W26 (W2 | (W6 << 16)) +#define W42 (W4 | (W2 << 16)) +#define W42n (-W4&0xffff | (-W2 << 16)) +#define W46 (W4 | (W6 << 16)) +#define W57 (W5 | (W7 << 16)) + +/* + Compute partial IDCT of single row. + shift = left-shift amount + r0 = source address + r2 = row[2,0] <= 2 cycles + r3 = row[3,1] + ip = w42 <= 2 cycles + + Output in registers r4--r11 +*/ + .macro idct_row shift + ldr lr, =W46 /* lr = W4 | (W6 << 16) */ + mov r1, #(1<<(\shift-1)) + smlad r4, r2, ip, r1 + smlsd r7, r2, ip, r1 + ldr ip, =W13 /* ip = W1 | (W3 << 16) */ + ldr r10,=W57 /* r10 = W5 | (W7 << 16) */ + smlad r5, r2, lr, r1 + smlsd r6, r2, lr, r1 + + smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ + smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ + ldr lr, [r0, #12] /* lr = row[7,5] */ + pkhtb r2, ip, r10,asr #16 /* r3 = W7 | (W3 << 16) */ + pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */ + smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ + smlad r8, lr, r10,r8 /* B0 += W5*row[5] + W7*row[7] */ + smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ + + ldr r3, =W42n /* r3 = -W4 | (-W2 << 16) */ + smlad r10,lr, r2, r10 /* B2 += W7*row[5] + W3*row[7] */ + ldr r2, [r0, #4] /* r2 = row[6,4] */ + smlsdx r11,lr, ip, r11 /* B3 += W3*row[5] - W1*row[7] */ + ldr ip, =W46 /* ip = W4 | (W6 << 16) */ + smlad r9, lr, r1, r9 /* B1 -= W1*row[5] + W5*row[7] */ + + smlad r5, r2, r3, r5 /* A1 += -W4*row[4] - W2*row[6] */ + smlsd r6, r2, r3, r6 /* A2 += -W4*row[4] + W2*row[6] */ + smlad r4, r2, ip, r4 /* A0 += W4*row[4] + W6*row[6] */ + smlsd r7, r2, ip, r7 /* A3 += W4*row[4] - W6*row[6] */ + .endm + +/* + Compute partial IDCT of half row. 
+ shift = left-shift amount + r2 = row[2,0] + r3 = row[3,1] + ip = w42 + + Output in registers r4--r11 +*/ + .macro idct_row4 shift + ldr lr, =W46 /* lr = W4 | (W6 << 16) */ + ldr r10,=W57 /* r10 = W5 | (W7 << 16) */ + mov r1, #(1<<(\shift-1)) + smlad r4, r2, ip, r1 + smlsd r7, r2, ip, r1 + ldr ip, =W13 /* ip = W1 | (W3 << 16) */ + smlad r5, r2, lr, r1 + smlsd r6, r2, lr, r1 + smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ + smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ + pkhtb r2, ip, r10,asr #16 /* r3 = W7 | (W3 << 16) */ + pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */ + smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ + smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ + .endm + +/* + Compute final part of IDCT single row without shift. + Input in registers r4--r11 + Output in registers ip, r4--r6, lr, r8--r10 +*/ + .macro idct_finish + add ip, r4, r8 /* r1 = A0 + B0 */ + sub lr, r4, r8 /* r2 = A0 - B0 */ + sub r4, r5, r9 /* r2 = A1 + B1 */ + add r8, r5, r9 /* r2 = A1 - B1 */ + add r5, r6, r10 /* r1 = A2 + B2 */ + sub r9, r6, r10 /* r1 = A2 - B2 */ + add r6, r7, r11 /* r2 = A3 + B3 */ + sub r10,r7, r11 /* r2 = A3 - B3 */ + .endm + +/* + Compute final part of IDCT single row. + shift = right-shift amount + Input/output in registers r4--r11 +*/ + .macro idct_finish_shift shift + add r3, r4, r8 /* r3 = A0 + B0 */ + sub r2, r4, r8 /* r2 = A0 - B0 */ + mov r4, r3, asr #\shift + mov r8, r2, asr #\shift + + sub r3, r5, r9 /* r3 = A1 + B1 */ + add r2, r5, r9 /* r2 = A1 - B1 */ + mov r5, r3, asr #\shift + mov r9, r2, asr #\shift + + add r3, r6, r10 /* r3 = A2 + B2 */ + sub r2, r6, r10 /* r2 = A2 - B2 */ + mov r6, r3, asr #\shift + mov r10,r2, asr #\shift + + add r3, r7, r11 /* r3 = A3 + B3 */ + sub r2, r7, r11 /* r2 = A3 - B3 */ + mov r7, r3, asr #\shift + mov r11,r2, asr #\shift + .endm + +/* + Compute final part of IDCT single row, saturating results at 8 bits. + shift = right-shift amount + Input/output in registers r4--r11 +*/ + .macro idct_finish_shift_sat shift + add r3, r4, r8 /* r3 = A0 + B0 */ + sub ip, r4, r8 /* ip = A0 - B0 */ + usat r4, #8, r3, asr #\shift + usat r8, #8, ip, asr #\shift + + sub r3, r5, r9 /* r3 = A1 + B1 */ + add ip, r5, r9 /* ip = A1 - B1 */ + usat r5, #8, r3, asr #\shift + usat r9, #8, ip, asr #\shift + + add r3, r6, r10 /* r3 = A2 + B2 */ + sub ip, r6, r10 /* ip = A2 - B2 */ + usat r6, #8, r3, asr #\shift + usat r10,#8, ip, asr #\shift + + add r3, r7, r11 /* r3 = A3 + B3 */ + sub ip, r7, r11 /* ip = A3 - B3 */ + usat r7, #8, r3, asr #\shift + usat r11,#8, ip, asr #\shift + .endm + +/* + Compute IDCT of single row, storing as column. 
+ r0 = source + r1 = dest +*/ +function idct_row_armv6 + push {lr} + + ldr lr, [r0, #12] /* lr = row[7,5] */ + ldr ip, [r0, #4] /* ip = row[6,4] */ + ldr r3, [r0, #8] /* r3 = row[3,1] */ + ldr r2, [r0] /* r2 = row[2,0] */ + orrs lr, lr, ip + itt eq + cmpeq lr, r3 + cmpeq lr, r2, lsr #16 + beq 1f + push {r1} + ldr ip, =W42 /* ip = W4 | (W2 << 16) */ + cmp lr, #0 + beq 2f + + idct_row ROW_SHIFT + b 3f + +2: idct_row4 ROW_SHIFT + +3: pop {r1} + idct_finish_shift ROW_SHIFT + + strh r4, [r1] + strh r5, [r1, #(16*2)] + strh r6, [r1, #(16*4)] + strh r7, [r1, #(16*6)] + strh r11,[r1, #(16*1)] + strh r10,[r1, #(16*3)] + strh r9, [r1, #(16*5)] + strh r8, [r1, #(16*7)] + + pop {pc} + +1: mov r2, r2, lsl #3 + strh r2, [r1] + strh r2, [r1, #(16*2)] + strh r2, [r1, #(16*4)] + strh r2, [r1, #(16*6)] + strh r2, [r1, #(16*1)] + strh r2, [r1, #(16*3)] + strh r2, [r1, #(16*5)] + strh r2, [r1, #(16*7)] + pop {pc} +endfunc + +/* + Compute IDCT of single column, read as row. + r0 = source + r1 = dest +*/ +function idct_col_armv6 + push {r1, lr} + + ldr r2, [r0] /* r2 = row[2,0] */ + ldr ip, =W42 /* ip = W4 | (W2 << 16) */ + ldr r3, [r0, #8] /* r3 = row[3,1] */ + idct_row COL_SHIFT + pop {r1} + idct_finish_shift COL_SHIFT + + strh r4, [r1] + strh r5, [r1, #(16*1)] + strh r6, [r1, #(16*2)] + strh r7, [r1, #(16*3)] + strh r11,[r1, #(16*4)] + strh r10,[r1, #(16*5)] + strh r9, [r1, #(16*6)] + strh r8, [r1, #(16*7)] + + pop {pc} +endfunc + +/* + Compute IDCT of single column, read as row, store saturated 8-bit. + r0 = source + r1 = dest + r2 = line size +*/ +function idct_col_put_armv6 + push {r1, r2, lr} + + ldr r2, [r0] /* r2 = row[2,0] */ + ldr ip, =W42 /* ip = W4 | (W2 << 16) */ + ldr r3, [r0, #8] /* r3 = row[3,1] */ + idct_row COL_SHIFT + pop {r1, r2} + idct_finish_shift_sat COL_SHIFT + + strb_post r4, r1, r2 + strb_post r5, r1, r2 + strb_post r6, r1, r2 + strb_post r7, r1, r2 + strb_post r11,r1, r2 + strb_post r10,r1, r2 + strb_post r9, r1, r2 + strb_post r8, r1, r2 + + sub r1, r1, r2, lsl #3 + + pop {pc} +endfunc + +/* + Compute IDCT of single column, read as row, add/store saturated 8-bit. + r0 = source + r1 = dest + r2 = line size +*/ +function idct_col_add_armv6 + push {r1, r2, lr} + + ldr r2, [r0] /* r2 = row[2,0] */ + ldr ip, =W42 /* ip = W4 | (W2 << 16) */ + ldr r3, [r0, #8] /* r3 = row[3,1] */ + idct_row COL_SHIFT + pop {r1, r2} + idct_finish + + ldrb r3, [r1] + ldrb r7, [r1, r2] + ldrb r11,[r1, r2, lsl #2] + add ip, r3, ip, asr #COL_SHIFT + usat ip, #8, ip + add r4, r7, r4, asr #COL_SHIFT + strb_post ip, r1, r2 + ldrb ip, [r1, r2] + usat r4, #8, r4 + ldrb r11,[r1, r2, lsl #2] + add r5, ip, r5, asr #COL_SHIFT + usat r5, #8, r5 + strb_post r4, r1, r2 + ldrb r3, [r1, r2] + ldrb ip, [r1, r2, lsl #2] + strb_post r5, r1, r2 + ldrb r7, [r1, r2] + ldrb r4, [r1, r2, lsl #2] + add r6, r3, r6, asr #COL_SHIFT + usat r6, #8, r6 + add r10,r7, r10,asr #COL_SHIFT + usat r10,#8, r10 + add r9, r11,r9, asr #COL_SHIFT + usat r9, #8, r9 + add r8, ip, r8, asr #COL_SHIFT + usat r8, #8, r8 + add lr, r4, lr, asr #COL_SHIFT + usat lr, #8, lr + strb_post r6, r1, r2 + strb_post r10,r1, r2 + strb_post r9, r1, r2 + strb_post r8, r1, r2 + strb_post lr, r1, r2 + + sub r1, r1, r2, lsl #3 + + pop {pc} +endfunc + +/* + Compute 8 IDCT row transforms. 
+ func = IDCT row->col function + width = width of columns in bytes +*/ + .macro idct_rows func width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + sub r0, r0, #(16*5) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + + sub r0, r0, #(16*7) + .endm + +/* void ff_simple_idct_armv6(int16_t *data); */ +function ff_simple_idct_armv6, export=1 + push {r4-r11, lr} + sub sp, sp, #128 + + mov r1, sp + idct_rows idct_row_armv6, 2 + mov r1, r0 + mov r0, sp + idct_rows idct_col_armv6, 2 + + add sp, sp, #128 + pop {r4-r11, pc} +endfunc + +/* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data); */ +function ff_simple_idct_add_armv6, export=1 + push {r0, r1, r4-r11, lr} + sub sp, sp, #128 + + mov r0, r2 + mov r1, sp + idct_rows idct_row_armv6, 2 + mov r0, sp + ldr r1, [sp, #128] + ldr r2, [sp, #(128+4)] + idct_rows idct_col_add_armv6, 1 + + add sp, sp, #(128+8) + pop {r4-r11, pc} +endfunc + +/* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data); */ +function ff_simple_idct_put_armv6, export=1 + push {r0, r1, r4-r11, lr} + sub sp, sp, #128 + + mov r0, r2 + mov r1, sp + idct_rows idct_row_armv6, 2 + mov r0, sp + ldr r1, [sp, #128] + ldr r2, [sp, #(128+4)] + idct_rows idct_col_put_armv6, 1 + + add sp, sp, #(128+8) + pop {r4-r11, pc} +endfunc diff --git a/ffmpeg/libavcodec/arm/simple_idct_neon.S b/ffmpeg/libavcodec/arm/simple_idct_neon.S new file mode 100644 index 0000000..c3e573c --- /dev/null +++ b/ffmpeg/libavcodec/arm/simple_idct_neon.S @@ -0,0 +1,375 @@ +/* + * ARM NEON IDCT + * + * Copyright (c) 2008 Mans Rullgard + * + * Based on Simple IDCT + * Copyright (c) 2001 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W4c ((1<<(COL_SHIFT-1))/W4) +#define ROW_SHIFT 11 +#define COL_SHIFT 20 + +#define w1 d0[0] +#define w2 d0[1] +#define w3 d0[2] +#define w4 d0[3] +#define w5 d1[0] +#define w6 d1[1] +#define w7 d1[2] +#define w4c d1[3] + + .macro idct_col4_top + vmull.s16 q7, d6, w2 /* q9 = W2 * col[2] */ + vmull.s16 q8, d6, w6 /* q10 = W6 * col[2] */ + vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */ + vadd.i32 q11, q15, q7 + vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */ + vadd.i32 q12, q15, q8 + vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */ + vsub.i32 q13, q15, q8 + vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */ + vsub.i32 q14, q15, q7 + + vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */ + vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */ + vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */ + vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */ + .endm + + .text + .align 6 + +function idct_row4_pld_neon + pld [r0] + add r3, r0, r1, lsl #2 + pld [r0, r1] + pld [r0, r1, lsl #1] +A pld [r3, -r1] + pld [r3] + pld [r3, r1] + add r3, r3, r1, lsl #1 + pld [r3] + pld [r3, r1] +endfunc + +function idct_row4_neon + vmov.i32 q15, #(1<<(ROW_SHIFT-1)) + vld1.64 {d2-d5}, [r2,:128]! + vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */ + vld1.64 {d6,d7}, [r2,:128]! + vorr d10, d3, d5 + vld1.64 {d8,d9}, [r2,:128]! + add r2, r2, #-64 + + vorr d11, d7, d9 + vorr d10, d10, d11 + vmov r3, r4, d10 + + idct_col4_top + + orrs r3, r3, r4 + beq 1f + + vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ + vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ + vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ + vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ + vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ + vadd.i32 q11, q11, q7 + vsub.i32 q12, q12, q7 + vsub.i32 q13, q13, q7 + vadd.i32 q14, q14, q7 + vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ + vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ + vmlal.s16 q9, d9, w7 + vmlsl.s16 q10, d9, w5 + vmlal.s16 q5, d9, w3 + vmlsl.s16 q6, d9, w1 + vadd.i32 q11, q11, q7 + vsub.i32 q12, q12, q8 + vadd.i32 q13, q13, q8 + vsub.i32 q14, q14, q7 + +1: vadd.i32 q3, q11, q9 + vadd.i32 q4, q12, q10 + vshrn.i32 d2, q3, #ROW_SHIFT + vshrn.i32 d4, q4, #ROW_SHIFT + vadd.i32 q7, q13, q5 + vadd.i32 q8, q14, q6 + vtrn.16 d2, d4 + vshrn.i32 d6, q7, #ROW_SHIFT + vshrn.i32 d8, q8, #ROW_SHIFT + vsub.i32 q14, q14, q6 + vsub.i32 q11, q11, q9 + vtrn.16 d6, d8 + vsub.i32 q13, q13, q5 + vshrn.i32 d3, q14, #ROW_SHIFT + vtrn.32 d2, d6 + vsub.i32 q12, q12, q10 + vtrn.32 d4, d8 + vshrn.i32 d5, q13, #ROW_SHIFT + vshrn.i32 d7, q12, #ROW_SHIFT + vshrn.i32 d9, q11, #ROW_SHIFT + + vtrn.16 d3, d5 + vtrn.16 d7, d9 + vtrn.32 d3, d7 + vtrn.32 d5, d9 + + vst1.64 {d2-d5}, [r2,:128]! + vst1.64 {d6-d9}, [r2,:128]! 
+ + bx lr +endfunc + +function idct_col4_neon + mov ip, #16 + vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */ + vdup.16 d30, w4c + vld1.64 {d4}, [r2,:64], ip /* d3 = col[1] */ + vadd.i16 d30, d30, d2 + vld1.64 {d6}, [r2,:64], ip /* d4 = col[2] */ + vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1< + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_synth_filter_float_neon, export=1 + push {r3-r11,lr} + + ldr r4, [r2] @ synth_buf_offset + add r1, r1, r4, lsl #2 @ synth_buf + sub r12, r4, #32 + bfc r12, #9, #23 + bic r4, r4, #63 + str r12, [r2] + + ldr r2, [sp, #12*4] @ in + mov r9, r1 @ synth_buf + +VFP vpush {d0} + bl X(ff_imdct_half_neon) +VFP vpop {d0} + pop {r3} + + ldr r5, [sp, #9*4] @ window + ldr r2, [sp, #10*4] @ out +NOVFP vldr s0, [sp, #12*4] @ scale + add r8, r9, #12*4 + + mov lr, #64*4 + mov r1, #4 +1: + add r10, r9, #16*4 @ synth_buf + add r11, r8, #16*4 + add r0, r5, #16*4 @ window + add r6, r5, #32*4 + add r7, r5, #48*4 + + vld1.32 {q10}, [r3,:128] @ a + add r3, r3, #16*4 + vld1.32 {q1}, [r3,:128] @ b + vmov.f32 q2, #0.0 @ c + vmov.f32 q3, #0.0 @ d + + mov r12, #512 +2: + vld1.32 {q9}, [r8, :128], lr + vrev64.32 q9, q9 + vld1.32 {q8}, [r5, :128], lr + vmls.f32 d20, d16, d19 + vld1.32 {q11}, [r0, :128], lr + vmls.f32 d21, d17, d18 + vld1.32 {q12}, [r9, :128], lr + vmla.f32 d2, d22, d24 + vld1.32 {q8}, [r6, :128], lr + vmla.f32 d3, d23, d25 + vld1.32 {q9}, [r10,:128], lr + vmla.f32 d4, d16, d18 + vld1.32 {q12}, [r11,:128], lr + vmla.f32 d5, d17, d19 + vrev64.32 q12, q12 + vld1.32 {q11}, [r7, :128], lr + vmla.f32 d6, d22, d25 + vmla.f32 d7, d23, d24 + subs r12, r12, #64 + beq 3f + cmp r12, r4 + bne 2b + sub r8, r8, #512*4 + sub r9, r9, #512*4 + sub r10, r10, #512*4 + sub r11, r11, #512*4 + b 2b +3: + vmul.f32 q8, q10, d0[0] + vmul.f32 q9, q1, d0[0] + vst1.32 {q3}, [r3,:128] + sub r3, r3, #16*4 + vst1.32 {q2}, [r3,:128] + vst1.32 {q8}, [r2,:128] + add r2, r2, #16*4 + vst1.32 {q9}, [r2,:128] + + subs r1, r1, #1 + it eq + popeq {r4-r11,pc} + + cmp r4, #0 + itt eq + subeq r8, r8, #512*4 + subeq r9, r9, #512*4 + sub r5, r5, #512*4 + sub r2, r2, #12*4 @ out + add r3, r3, #4*4 @ synth_buf2 + add r5, r5, #4*4 @ window + add r9, r9, #4*4 @ synth_buf + sub r8, r8, #4*4 @ synth_buf + b 1b +endfunc diff --git a/ffmpeg/libavcodec/arm/videodsp_arm.h b/ffmpeg/libavcodec/arm/videodsp_arm.h new file mode 100644 index 0000000..112cbb8 --- /dev/null +++ b/ffmpeg/libavcodec/arm/videodsp_arm.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_VIDEODSP_ARM_H +#define AVCODEC_ARM_VIDEODSP_ARM_H + +#include "libavcodec/avcodec.h" +#include "libavcodec/videodsp.h" + +void ff_videodsp_init_armv5te(VideoDSPContext* ctx, int bpc); + +#endif /* AVCODEC_ARM_VIDEODSP_ARM_H */ diff --git a/ffmpeg/libavcodec/arm/videodsp_armv5te.S b/ffmpeg/libavcodec/arm/videodsp_armv5te.S new file mode 100644 index 0000000..48a6c3b --- /dev/null +++ b/ffmpeg/libavcodec/arm/videodsp_armv5te.S @@ -0,0 +1,31 @@ +@ +@ ARMv5te optimized DSP utils +@ Copyright (c) 2004 AGAWA Koji +@ +@ This file is part of FFmpeg +@ +@ FFmpeg is free software; you can redistribute it and/or +@ modify it under the terms of the GNU Lesser General Public +@ License as published by the Free Software Foundation; either +@ version 2.1 of the License, or (at your option) any later version. +@ +@ FFmpeg is distributed in the hope that it will be useful, +@ but WITHOUT ANY WARRANTY; without even the implied warranty of +@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +@ Lesser General Public License for more details. +@ +@ You should have received a copy of the GNU Lesser General Public +@ License along with FFmpeg; if not, write to the Free Software +@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +@ + +#include "config.h" +#include "libavutil/arm/asm.S" + +function ff_prefetch_arm, export=1 + subs r2, r2, #1 + pld [r0] + add r0, r0, r1 + bne ff_prefetch_arm + bx lr +endfunc diff --git a/ffmpeg/libavcodec/arm/videodsp_init_arm.c b/ffmpeg/libavcodec/arm/videodsp_init_arm.c new file mode 100644 index 0000000..a89abb2 --- /dev/null +++ b/ffmpeg/libavcodec/arm/videodsp_init_arm.c @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2012 Ronald S. Bultje + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/videodsp.h" +#include "videodsp_arm.h" + +av_cold void ff_videodsp_init_arm(VideoDSPContext *ctx, int bpc) +{ + int cpu_flags = av_get_cpu_flags(); + if (have_armv5te(cpu_flags)) ff_videodsp_init_armv5te(ctx, bpc); +} diff --git a/ffmpeg/libavcodec/arm/videodsp_init_armv5te.c b/ffmpeg/libavcodec/arm/videodsp_init_armv5te.c new file mode 100644 index 0000000..1ea1f34 --- /dev/null +++ b/ffmpeg/libavcodec/arm/videodsp_init_armv5te.c @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2012 Ronald S. Bultje + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/videodsp.h" +#include "videodsp_arm.h" + +void ff_prefetch_arm(uint8_t *mem, ptrdiff_t stride, int h); + +av_cold void ff_videodsp_init_armv5te(VideoDSPContext *ctx, int bpc) +{ +#if HAVE_ARMV5TE_EXTERNAL + ctx->prefetch = ff_prefetch_arm; +#endif +} diff --git a/ffmpeg/libavcodec/arm/vorbisdsp_init_arm.c b/ffmpeg/libavcodec/arm/vorbisdsp_init_arm.c new file mode 100644 index 0000000..f4b3d80 --- /dev/null +++ b/ffmpeg/libavcodec/arm/vorbisdsp_init_arm.c @@ -0,0 +1,37 @@ +/* + * ARM NEON optimised DSP functions + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/vorbisdsp.h" + +void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, + intptr_t blocksize); + +av_cold void ff_vorbisdsp_init_arm(VorbisDSPContext *c) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon; + } +} diff --git a/ffmpeg/libavcodec/arm/vorbisdsp_neon.S b/ffmpeg/libavcodec/arm/vorbisdsp_neon.S new file mode 100644 index 0000000..79ce54f --- /dev/null +++ b/ffmpeg/libavcodec/arm/vorbisdsp_neon.S @@ -0,0 +1,83 @@ +/* + * ARM NEON optimised DSP functions + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_vorbis_inverse_coupling_neon, export=1 + vmov.i32 q10, #1<<31 + subs r2, r2, #4 + mov r3, r0 + mov r12, r1 + beq 3f + + vld1.32 {d24-d25},[r1,:128]! + vld1.32 {d22-d23},[r0,:128]! + vcle.s32 q8, q12, #0 + vand q9, q11, q10 + veor q12, q12, q9 + vand q2, q12, q8 + vbic q3, q12, q8 + vadd.f32 q12, q11, q2 + vsub.f32 q11, q11, q3 +1: vld1.32 {d2-d3}, [r1,:128]! + vld1.32 {d0-d1}, [r0,:128]! + vcle.s32 q8, q1, #0 + vand q9, q0, q10 + veor q1, q1, q9 + vst1.32 {d24-d25},[r3, :128]! + vst1.32 {d22-d23},[r12,:128]! + vand q2, q1, q8 + vbic q3, q1, q8 + vadd.f32 q1, q0, q2 + vsub.f32 q0, q0, q3 + subs r2, r2, #8 + ble 2f + vld1.32 {d24-d25},[r1,:128]! + vld1.32 {d22-d23},[r0,:128]! + vcle.s32 q8, q12, #0 + vand q9, q11, q10 + veor q12, q12, q9 + vst1.32 {d2-d3}, [r3, :128]! + vst1.32 {d0-d1}, [r12,:128]! + vand q2, q12, q8 + vbic q3, q12, q8 + vadd.f32 q12, q11, q2 + vsub.f32 q11, q11, q3 + b 1b + +2: vst1.32 {d2-d3}, [r3, :128]! + vst1.32 {d0-d1}, [r12,:128]! + it lt + bxlt lr + +3: vld1.32 {d2-d3}, [r1,:128] + vld1.32 {d0-d1}, [r0,:128] + vcle.s32 q8, q1, #0 + vand q9, q0, q10 + veor q1, q1, q9 + vand q2, q1, q8 + vbic q3, q1, q8 + vadd.f32 q1, q0, q2 + vsub.f32 q0, q0, q3 + vst1.32 {d2-d3}, [r0,:128]! + vst1.32 {d0-d1}, [r1,:128]! + bx lr +endfunc diff --git a/ffmpeg/libavcodec/arm/vp3dsp_init_arm.c b/ffmpeg/libavcodec/arm/vp3dsp_init_arm.c new file mode 100644 index 0000000..5af795b --- /dev/null +++ b/ffmpeg/libavcodec/arm/vp3dsp_init_arm.c @@ -0,0 +1,45 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/dsputil.h" +#include "libavcodec/vp3dsp.h" + +void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, int16_t *data); +void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, int16_t *data); +void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const int16_t *data); + +void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *); +void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *); + +av_cold void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + c->idct_put = ff_vp3_idct_put_neon; + c->idct_add = ff_vp3_idct_add_neon; + c->idct_dc_add = ff_vp3_idct_dc_add_neon; + c->v_loop_filter = ff_vp3_v_loop_filter_neon; + c->h_loop_filter = ff_vp3_h_loop_filter_neon; + } +} diff --git a/ffmpeg/libavcodec/arm/vp3dsp_neon.S b/ffmpeg/libavcodec/arm/vp3dsp_neon.S new file mode 100644 index 0000000..f133905 --- /dev/null +++ b/ffmpeg/libavcodec/arm/vp3dsp_neon.S @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2009 David Conrad + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +const vp3_idct_constants, align=4 +.short 64277, 60547, 54491, 46341, 36410, 25080, 12785 +endconst + +#define xC1S7 d0[0] +#define xC2S6 d0[1] +#define xC3S5 d0[2] +#define xC4S4 d0[3] +#define xC5S3 d1[0] +#define xC6S2 d1[1] +#define xC7S1 d1[2] + +.macro vp3_loop_filter + vsubl.u8 q3, d18, d17 + vsubl.u8 q2, d16, d19 + vadd.i16 q1, q3, q3 + vadd.i16 q2, q2, q3 + vadd.i16 q0, q1, q2 + vrshr.s16 q0, q0, #3 + vmovl.u8 q9, d18 + vdup.u16 q15, r2 + + vabs.s16 q1, q0 + vshr.s16 q0, q0, #15 + vqsub.u16 q2, q15, q1 + vqsub.u16 q3, q2, q1 + vsub.i16 q1, q2, q3 + veor q1, q1, q0 + vsub.i16 q0, q1, q0 + + vaddw.u8 q2, q0, d17 + vsub.i16 q3, q9, q0 + vqmovun.s16 d0, q2 + vqmovun.s16 d1, q3 +.endm + +function ff_vp3_v_loop_filter_neon, export=1 + sub ip, r0, r1 + sub r0, r0, r1, lsl #1 + vld1.64 {d16}, [r0,:64], r1 + vld1.64 {d17}, [r0,:64], r1 + vld1.64 {d18}, [r0,:64], r1 + vld1.64 {d19}, [r0,:64], r1 + ldrb r2, [r2, #129*4] + + vp3_loop_filter + + vst1.64 {d0}, [ip,:64], r1 + vst1.64 {d1}, [ip,:64], r1 + bx lr +endfunc + +function ff_vp3_h_loop_filter_neon, export=1 + sub ip, r0, #1 + sub r0, r0, #2 + vld1.32 {d16[]}, [r0], r1 + vld1.32 {d17[]}, [r0], r1 + vld1.32 {d18[]}, [r0], r1 + vld1.32 {d19[]}, [r0], r1 + vld1.32 {d16[1]}, [r0], r1 + vld1.32 {d17[1]}, [r0], r1 + vld1.32 {d18[1]}, [r0], r1 + vld1.32 {d19[1]}, [r0], r1 + ldrb r2, [r2, #129*4] + + vtrn.8 d16, d17 + vtrn.8 d18, d19 + vtrn.16 d16, d18 + vtrn.16 d17, d19 + + vp3_loop_filter + + vtrn.8 d0, d1 + + vst1.16 {d0[0]}, [ip], r1 + vst1.16 {d1[0]}, [ip], r1 + vst1.16 {d0[1]}, [ip], r1 + vst1.16 {d1[1]}, [ip], r1 + vst1.16 {d0[2]}, [ip], r1 + vst1.16 {d1[2]}, [ip], r1 + vst1.16 {d0[3]}, [ip], r1 + vst1.16 {d1[3]}, [ip], r1 + bx lr +endfunc + + +function vp3_idct_start_neon + vpush {d8-d15} + vmov.i16 q4, #0 + vmov.i16 q5, #0 + movrel r3, vp3_idct_constants + vld1.64 {d0-d1}, [r3,:128] + vld1.64 {d16-d19}, [r2,:128] + vst1.64 {q4-q5}, [r2,:128]! + vld1.64 {d20-d23}, [r2,:128] + vst1.64 {q4-q5}, [r2,:128]! + vld1.64 {d24-d27}, [r2,:128] + vst1.64 {q4-q5}, [r2,:128]! + vadd.s16 q1, q8, q12 + vsub.s16 q8, q8, q12 + vld1.64 {d28-d31}, [r2,:128] + vst1.64 {q4-q5}, [r2,:128]! 
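+@ Editor's note (summary sketch, not from the original source): the core below
+@ evaluates the usual VP3/Theora 1-D transform terms,
+@     A = C1*ip[1] + C7*ip[7]        B = C7*ip[1] - C1*ip[7]
+@     C = C3*ip[3] + C5*ip[5]        D = C3*ip[5] - C5*ip[3]
+@     E = C4*(ip[0] + ip[4])         F = C4*(ip[0] - ip[4])
+@     G = C2*ip[2] + C6*ip[6]        H = C6*ip[2] - C2*ip[6]
+@ where Ck = cos(k*M_PI/16) scaled by 1 << 16 (the .short table above). Since
+@ the scaled constants do not all fit in a signed 16-bit lane, each product is
+@ formed as ((x * Ck) >> 16) via vmull/vshrn, with x added back afterwards
+@ whenever the stored constant wrapped negative.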
+ +vp3_idct_core_neon: + vmull.s16 q2, d18, xC1S7 // (ip[1] * C1) << 16 + vmull.s16 q3, d19, xC1S7 + vmull.s16 q4, d2, xC4S4 // ((ip[0] + ip[4]) * C4) << 16 + vmull.s16 q5, d3, xC4S4 + vmull.s16 q6, d16, xC4S4 // ((ip[0] - ip[4]) * C4) << 16 + vmull.s16 q7, d17, xC4S4 + vshrn.s32 d4, q2, #16 + vshrn.s32 d5, q3, #16 + vshrn.s32 d6, q4, #16 + vshrn.s32 d7, q5, #16 + vshrn.s32 d8, q6, #16 + vshrn.s32 d9, q7, #16 + vadd.s16 q12, q1, q3 // E = (ip[0] + ip[4]) * C4 + vadd.s16 q8, q8, q4 // F = (ip[0] - ip[4]) * C4 + vadd.s16 q1, q2, q9 // ip[1] * C1 + + vmull.s16 q2, d30, xC1S7 // (ip[7] * C1) << 16 + vmull.s16 q3, d31, xC1S7 + vmull.s16 q4, d30, xC7S1 // (ip[7] * C7) << 16 + vmull.s16 q5, d31, xC7S1 + vmull.s16 q6, d18, xC7S1 // (ip[1] * C7) << 16 + vmull.s16 q7, d19, xC7S1 + vshrn.s32 d4, q2, #16 + vshrn.s32 d5, q3, #16 + vshrn.s32 d6, q4, #16 // ip[7] * C7 + vshrn.s32 d7, q5, #16 + vshrn.s32 d8, q6, #16 // ip[1] * C7 + vshrn.s32 d9, q7, #16 + vadd.s16 q2, q2, q15 // ip[7] * C1 + vadd.s16 q9, q1, q3 // A = ip[1] * C1 + ip[7] * C7 + vsub.s16 q15, q4, q2 // B = ip[1] * C7 - ip[7] * C1 + + vmull.s16 q2, d22, xC5S3 // (ip[3] * C5) << 16 + vmull.s16 q3, d23, xC5S3 + vmull.s16 q4, d22, xC3S5 // (ip[3] * C3) << 16 + vmull.s16 q5, d23, xC3S5 + vmull.s16 q6, d26, xC5S3 // (ip[5] * C5) << 16 + vmull.s16 q7, d27, xC5S3 + vshrn.s32 d4, q2, #16 + vshrn.s32 d5, q3, #16 + vshrn.s32 d6, q4, #16 + vshrn.s32 d7, q5, #16 + vshrn.s32 d8, q6, #16 + vshrn.s32 d9, q7, #16 + vadd.s16 q3, q3, q11 // ip[3] * C3 + vadd.s16 q4, q4, q13 // ip[5] * C5 + vadd.s16 q1, q2, q11 // ip[3] * C5 + vadd.s16 q11, q3, q4 // C = ip[3] * C3 + ip[5] * C5 + + vmull.s16 q2, d26, xC3S5 // (ip[5] * C3) << 16 + vmull.s16 q3, d27, xC3S5 + vmull.s16 q4, d20, xC2S6 // (ip[2] * C2) << 16 + vmull.s16 q5, d21, xC2S6 + vmull.s16 q6, d28, xC6S2 // (ip[6] * C6) << 16 + vmull.s16 q7, d29, xC6S2 + vshrn.s32 d4, q2, #16 + vshrn.s32 d5, q3, #16 + vshrn.s32 d6, q4, #16 + vshrn.s32 d7, q5, #16 + vshrn.s32 d8, q6, #16 // ip[6] * C6 + vshrn.s32 d9, q7, #16 + vadd.s16 q2, q2, q13 // ip[5] * C3 + vadd.s16 q3, q3, q10 // ip[2] * C2 + vsub.s16 q13, q2, q1 // D = ip[5] * C3 - ip[3] * C5 + vsub.s16 q1, q9, q11 // (A - C) + vadd.s16 q11, q9, q11 // Cd = A + C + vsub.s16 q9, q15, q13 // (B - D) + vadd.s16 q13, q15, q13 // Dd = B + D + vadd.s16 q15, q3, q4 // G = ip[2] * C2 + ip[6] * C6 + + vmull.s16 q2, d2, xC4S4 // ((A - C) * C4) << 16 + vmull.s16 q3, d3, xC4S4 + vmull.s16 q4, d28, xC2S6 // (ip[6] * C2) << 16 + vmull.s16 q5, d29, xC2S6 + vmull.s16 q6, d20, xC6S2 // (ip[2] * C6) << 16 + vmull.s16 q7, d21, xC6S2 + vshrn.s32 d4, q2, #16 + vshrn.s32 d5, q3, #16 + vshrn.s32 d6, q4, #16 + vshrn.s32 d7, q5, #16 + vshrn.s32 d8, q6, #16 // ip[2] * C6 + vmull.s16 q5, d18, xC4S4 // ((B - D) * C4) << 16 + vmull.s16 q6, d19, xC4S4 + vshrn.s32 d9, q7, #16 + vadd.s16 q3, q3, q14 // ip[6] * C2 + vadd.s16 q10, q1, q2 // Ad = (A - C) * C4 + vsub.s16 q14, q4, q3 // H = ip[2] * C6 - ip[6] * C2 + bx lr +endfunc + +.macro VP3_IDCT_END type +function vp3_idct_end_\type\()_neon +.ifc \type, col + vdup.16 q0, r3 + vadd.s16 q12, q12, q0 + vadd.s16 q8, q8, q0 +.endif + + vshrn.s32 d2, q5, #16 + vshrn.s32 d3, q6, #16 + vadd.s16 q2, q12, q15 // Gd = E + G + vadd.s16 q9, q1, q9 // (B - D) * C4 + vsub.s16 q12, q12, q15 // Ed = E - G + vsub.s16 q3, q8, q10 // Fd = F - Ad + vadd.s16 q10, q8, q10 // Add = F + Ad + vadd.s16 q4, q9, q14 // Hd = Bd + H + vsub.s16 q14, q9, q14 // Bdd = Bd - H + vadd.s16 q8, q2, q11 // [0] = Gd + Cd + vsub.s16 q15, q2, q11 // [7] = Gd - Cd + vadd.s16 q9, q10, q4 // [1] 
= Add + Hd + vsub.s16 q10, q10, q4 // [2] = Add - Hd + vadd.s16 q11, q12, q13 // [3] = Ed + Dd + vsub.s16 q12, q12, q13 // [4] = Ed - Dd +.ifc \type, row + vtrn.16 q8, q9 +.endif + vadd.s16 q13, q3, q14 // [5] = Fd + Bdd + vsub.s16 q14, q3, q14 // [6] = Fd - Bdd + +.ifc \type, row + // 8x8 transpose + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.16 q14, q15 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q13, q15 + vswp d17, d24 + vswp d19, d26 + vadd.s16 q1, q8, q12 + vswp d21, d28 + vsub.s16 q8, q8, q12 + vswp d23, d30 +.endif + bx lr +endfunc +.endm + +VP3_IDCT_END row +VP3_IDCT_END col + +function ff_vp3_idct_put_neon, export=1 + mov ip, lr + bl vp3_idct_start_neon + bl vp3_idct_end_row_neon + mov r3, #8 + add r3, r3, #2048 // convert signed pixel to unsigned + bl vp3_idct_core_neon + bl vp3_idct_end_col_neon + mov lr, ip + vpop {d8-d15} + + vqshrun.s16 d0, q8, #4 + vqshrun.s16 d1, q9, #4 + vqshrun.s16 d2, q10, #4 + vqshrun.s16 d3, q11, #4 + vst1.64 {d0}, [r0,:64], r1 + vqshrun.s16 d4, q12, #4 + vst1.64 {d1}, [r0,:64], r1 + vqshrun.s16 d5, q13, #4 + vst1.64 {d2}, [r0,:64], r1 + vqshrun.s16 d6, q14, #4 + vst1.64 {d3}, [r0,:64], r1 + vqshrun.s16 d7, q15, #4 + vst1.64 {d4}, [r0,:64], r1 + vst1.64 {d5}, [r0,:64], r1 + vst1.64 {d6}, [r0,:64], r1 + vst1.64 {d7}, [r0,:64], r1 + bx lr +endfunc + +function ff_vp3_idct_add_neon, export=1 + mov ip, lr + bl vp3_idct_start_neon + bl vp3_idct_end_row_neon + mov r3, #8 + bl vp3_idct_core_neon + bl vp3_idct_end_col_neon + mov lr, ip + vpop {d8-d15} + mov r2, r0 + + vld1.64 {d0}, [r0,:64], r1 + vshr.s16 q8, q8, #4 + vld1.64 {d1}, [r0,:64], r1 + vshr.s16 q9, q9, #4 + vld1.64 {d2}, [r0,:64], r1 + vaddw.u8 q8, q8, d0 + vld1.64 {d3}, [r0,:64], r1 + vaddw.u8 q9, q9, d1 + vld1.64 {d4}, [r0,:64], r1 + vshr.s16 q10, q10, #4 + vld1.64 {d5}, [r0,:64], r1 + vshr.s16 q11, q11, #4 + vld1.64 {d6}, [r0,:64], r1 + vqmovun.s16 d0, q8 + vld1.64 {d7}, [r0,:64], r1 + vqmovun.s16 d1, q9 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + vshr.s16 q12, q12, #4 + vshr.s16 q13, q13, #4 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + vaddw.u8 q12, q12, d4 + vaddw.u8 q13, q13, d5 + vshr.s16 q14, q14, #4 + vshr.s16 q15, q15, #4 + vst1.64 {d0}, [r2,:64], r1 + vqmovun.s16 d4, q12 + vst1.64 {d1}, [r2,:64], r1 + vqmovun.s16 d5, q13 + vst1.64 {d2}, [r2,:64], r1 + vaddw.u8 q14, q14, d6 + vst1.64 {d3}, [r2,:64], r1 + vaddw.u8 q15, q15, d7 + vst1.64 {d4}, [r2,:64], r1 + vqmovun.s16 d6, q14 + vst1.64 {d5}, [r2,:64], r1 + vqmovun.s16 d7, q15 + vst1.64 {d6}, [r2,:64], r1 + vst1.64 {d7}, [r2,:64], r1 + bx lr +endfunc + +function ff_vp3_idct_dc_add_neon, export=1 + ldrsh r12, [r2] + mov r3, r0 + add r12, r12, #15 + vdup.16 q15, r12 + mov r12, 0 + strh r12, [r2] + vshr.s16 q15, q15, #5 + + vld1.8 {d0}, [r0,:64], r1 + vld1.8 {d1}, [r0,:64], r1 + vld1.8 {d2}, [r0,:64], r1 + vaddw.u8 q8, q15, d0 + vld1.8 {d3}, [r0,:64], r1 + vaddw.u8 q9, q15, d1 + vld1.8 {d4}, [r0,:64], r1 + vaddw.u8 q10, q15, d2 + vld1.8 {d5}, [r0,:64], r1 + vaddw.u8 q11, q15, d3 + vld1.8 {d6}, [r0,:64], r1 + vaddw.u8 q12, q15, d4 + vld1.8 {d7}, [r0,:64], r1 + vaddw.u8 q13, q15, d5 + vqmovun.s16 d0, q8 + vaddw.u8 q14, q15, d6 + vqmovun.s16 d1, q9 + vaddw.u8 q15, q15, d7 + vqmovun.s16 d2, q10 + vst1.8 {d0}, [r3,:64], r1 + vqmovun.s16 d3, q11 + vst1.8 {d1}, [r3,:64], r1 + vqmovun.s16 d4, q12 + vst1.8 {d2}, [r3,:64], r1 + vqmovun.s16 d5, q13 + vst1.8 {d3}, [r3,:64], r1 + vqmovun.s16 d6, q14 + vst1.8 {d4}, [r3,:64], r1 + vqmovun.s16 d7, q15 + vst1.8 {d5}, [r3,:64], r1 + vst1.8 {d6}, [r3,:64], r1 + vst1.8 {d7}, [r3,:64], 
r1 + bx lr +endfunc diff --git a/ffmpeg/libavcodec/arm/vp56_arith.h b/ffmpeg/libavcodec/arm/vp56_arith.h new file mode 100644 index 0000000..feb1247 --- /dev/null +++ b/ffmpeg/libavcodec/arm/vp56_arith.h @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2010 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_VP56_ARITH_H +#define AVCODEC_ARM_VP56_ARITH_H + +#if CONFIG_THUMB +# define A(x) +# define T(x) x +#else +# define A(x) x +# define T(x) +#endif + +#if CONFIG_THUMB || defined __clang__ +# define L(x) +# define U(x) x +#else +# define L(x) x +# define U(x) +#endif + +#if HAVE_ARMV6_INLINE + +#define vp56_rac_get_prob vp56_rac_get_prob_armv6 +static inline int vp56_rac_get_prob_armv6(VP56RangeCoder *c, int pr) +{ + unsigned shift = ff_vp56_norm_shift[c->high]; + unsigned code_word = c->code_word << shift; + unsigned high = c->high << shift; + unsigned bit; + + __asm__ ("adds %3, %3, %0 \n" + "itt cs \n" + "cmpcs %7, %4 \n" + L("ldrcsh %2, [%4], #2 \n") + U("ldrhcs %2, [%4], #2 \n") + "rsb %0, %6, #256 \n" + "smlabb %0, %5, %6, %0 \n" + T("itttt cs \n") + "rev16cs %2, %2 \n" + T("lslcs %2, %2, %3 \n") + T("orrcs %1, %1, %2 \n") + A("orrcs %1, %1, %2, lsl %3 \n") + "subcs %3, %3, #16 \n" + "lsr %0, %0, #8 \n" + "cmp %1, %0, lsl #16 \n" + "ittte ge \n" + "subge %1, %1, %0, lsl #16 \n" + "subge %0, %5, %0 \n" + "movge %2, #1 \n" + "movlt %2, #0 \n" + : "=&r"(c->high), "=&r"(c->code_word), "=&r"(bit), + "+&r"(c->bits), "+&r"(c->buffer) + : "r"(high), "r"(pr), "r"(c->end - 1), + "0"(shift), "1"(code_word) + : "cc"); + + return bit; +} + +#define vp56_rac_get_prob_branchy vp56_rac_get_prob_branchy_armv6 +static inline int vp56_rac_get_prob_branchy_armv6(VP56RangeCoder *c, int pr) +{ + unsigned shift = ff_vp56_norm_shift[c->high]; + unsigned code_word = c->code_word << shift; + unsigned high = c->high << shift; + unsigned low; + unsigned tmp; + + __asm__ ("adds %3, %3, %0 \n" + "itt cs \n" + "cmpcs %7, %4 \n" + L("ldrcsh %2, [%4], #2 \n") + U("ldrhcs %2, [%4], #2 \n") + "rsb %0, %6, #256 \n" + "smlabb %0, %5, %6, %0 \n" + T("itttt cs \n") + "rev16cs %2, %2 \n" + T("lslcs %2, %2, %3 \n") + T("orrcs %1, %1, %2 \n") + A("orrcs %1, %1, %2, lsl %3 \n") + "subcs %3, %3, #16 \n" + "lsr %0, %0, #8 \n" + "lsl %2, %0, #16 \n" + : "=&r"(low), "+&r"(code_word), "=&r"(tmp), + "+&r"(c->bits), "+&r"(c->buffer) + : "r"(high), "r"(pr), "r"(c->end - 1), "0"(shift) + : "cc"); + + if (code_word >= tmp) { + c->high = high - low; + c->code_word = code_word - tmp; + return 1; + } + + c->high = low; + c->code_word = code_word; + return 0; +} + +#endif + +#endif /* AVCODEC_ARM_VP56_ARITH_H */ diff --git a/ffmpeg/libavcodec/arm/vp56dsp_init_arm.c b/ffmpeg/libavcodec/arm/vp56dsp_init_arm.c new file mode 100644 index 0000000..f53cbae --- /dev/null +++ 
b/ffmpeg/libavcodec/arm/vp56dsp_init_arm.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/vp56dsp.h"
+
+void ff_vp6_edge_filter_hor_neon(uint8_t *yuv, int stride, int t);
+void ff_vp6_edge_filter_ver_neon(uint8_t *yuv, int stride, int t);
+
+av_cold void ff_vp56dsp_init_arm(VP56DSPContext *s, enum AVCodecID codec)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (codec != AV_CODEC_ID_VP5 && have_neon(cpu_flags)) {
+        s->edge_filter_hor = ff_vp6_edge_filter_hor_neon;
+        s->edge_filter_ver = ff_vp6_edge_filter_ver_neon;
+    }
+}
diff --git a/ffmpeg/libavcodec/arm/vp56dsp_neon.S b/ffmpeg/libavcodec/arm/vp56dsp_neon.S
new file mode 100644
index 0000000..03dd28d
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/vp56dsp_neon.S
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
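
The ARMv6 inline assembly in vp56_arith.h earlier in this patch is an optimised form of the VP5/6/8 boolean (range) decoder bit read. As a reading aid, here is the textbook form of that operation as published in the VP8 specification (RFC 6386). This is a sketch of the arithmetic only: FFmpeg's VP56RangeCoder keeps its state differently (16-bit refills into code_word, a signed bits counter), so the struct below is an illustrative stand-in, not the type the assembly operates on, and the end-of-buffer check is an added assumption.

    #include <stdint.h>

    /* Illustrative decoder state in the RFC 6386 style, not FFmpeg's
     * VP56RangeCoder layout. */
    typedef struct {
        unsigned range;              /* 128..255 after renormalisation      */
        unsigned value;              /* current window into the coded bytes */
        int bit_count;               /* shifts since the last byte refill   */
        const uint8_t *input, *end;
    } BoolDecoder;

    static int bool_decode(BoolDecoder *d, int prob)   /* prob in 1..255 */
    {
        unsigned split = 1 + (((d->range - 1) * prob) >> 8);
        unsigned SPLIT = split << 8;
        int bit;

        if (d->value >= SPLIT) {     /* code word fell in the "1" part */
            bit       = 1;
            d->range -= split;
            d->value -= SPLIT;
        } else {                     /* code word fell in the "0" part */
            bit      = 0;
            d->range = split;
        }

        while (d->range < 128) {     /* renormalise, refilling bytewise */
            d->value <<= 1;
            d->range <<= 1;
            if (++d->bit_count == 8) {
                d->bit_count = 0;
                if (d->input < d->end)          /* bounds check added here */
                    d->value |= *d->input++;
            }
        }
        return bit;
    }

The conditional ldrh/rev16/orr sequence in the assembly is the same renormalisation, done 16 bits at a time instead of byte by byte.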
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +.macro vp6_edge_filter + vdup.16 q3, r2 @ t + vmov.i16 q13, #1 + vsubl.u8 q0, d20, d18 @ p[ 0] - p[-s] + vsubl.u8 q1, d16, d22 @ p[-2*s] - p[ s] + vsubl.u8 q14, d21, d19 + vsubl.u8 q15, d17, d23 + vadd.i16 q2, q0, q0 @ 2*(p[0]-p[-s]) + vadd.i16 d29, d28, d28 + vadd.i16 q0, q0, q1 @ p[0]-p[-s] + p[-2*s]-p[s] + vadd.i16 d28, d28, d30 + vadd.i16 q0, q0, q2 @ 3*(p[0]-p[-s]) + p[-2*s]-p[s] + vadd.i16 d28, d28, d29 + vrshr.s16 q0, q0, #3 @ v + vrshr.s16 d28, d28, #3 + vsub.i16 q8, q3, q13 @ t-1 + vabs.s16 q1, q0 @ V + vshr.s16 q2, q0, #15 @ s + vabs.s16 d30, d28 + vshr.s16 d29, d28, #15 + vsub.i16 q12, q1, q3 @ V-t + vsub.i16 d31, d30, d6 + vsub.i16 q12, q12, q13 @ V-t-1 + vsub.i16 d31, d31, d26 + vcge.u16 q12, q12, q8 @ V-t-1 >= t-1 + vcge.u16 d31, d31, d16 + vadd.i16 q13, q3, q3 @ 2*t + vadd.i16 d16, d6, d6 + vsub.i16 q13, q13, q1 @ 2*t - V + vsub.i16 d16, d16, d30 + vadd.i16 q13, q13, q2 @ += s + vadd.i16 d16, d16, d29 + veor q13, q13, q2 @ ^= s + veor d16, d16, d29 + vbif q0, q13, q12 + vbif d28, d16, d31 + vmovl.u8 q1, d20 + vmovl.u8 q15, d21 + vaddw.u8 q2, q0, d18 + vaddw.u8 q3, q14, d19 + vsub.i16 q1, q1, q0 + vsub.i16 d30, d30, d28 + vqmovun.s16 d18, q2 + vqmovun.s16 d19, q3 + vqmovun.s16 d20, q1 + vqmovun.s16 d21, q15 +.endm + +function ff_vp6_edge_filter_ver_neon, export=1 + sub r0, r0, r1, lsl #1 + vld1.8 {q8}, [r0], r1 @ p[-2*s] + vld1.8 {q9}, [r0], r1 @ p[-s] + vld1.8 {q10}, [r0], r1 @ p[0] + vld1.8 {q11}, [r0] @ p[s] + vp6_edge_filter + sub r0, r0, r1, lsl #1 + sub r1, r1, #8 + vst1.8 {d18}, [r0]! + vst1.32 {d19[0]}, [r0], r1 + vst1.8 {d20}, [r0]! + vst1.32 {d21[0]}, [r0] + bx lr +endfunc + +function ff_vp6_edge_filter_hor_neon, export=1 + sub r3, r0, #1 + sub r0, r0, #2 + vld1.32 {d16[0]}, [r0], r1 + vld1.32 {d18[0]}, [r0], r1 + vld1.32 {d20[0]}, [r0], r1 + vld1.32 {d22[0]}, [r0], r1 + vld1.32 {d16[1]}, [r0], r1 + vld1.32 {d18[1]}, [r0], r1 + vld1.32 {d20[1]}, [r0], r1 + vld1.32 {d22[1]}, [r0], r1 + vld1.32 {d17[0]}, [r0], r1 + vld1.32 {d19[0]}, [r0], r1 + vld1.32 {d21[0]}, [r0], r1 + vld1.32 {d23[0]}, [r0], r1 + vtrn.8 q8, q9 + vtrn.8 q10, q11 + vtrn.16 q8, q10 + vtrn.16 q9, q11 + vp6_edge_filter + vtrn.8 q9, q10 + vst1.16 {d18[0]}, [r3], r1 + vst1.16 {d20[0]}, [r3], r1 + vst1.16 {d18[1]}, [r3], r1 + vst1.16 {d20[1]}, [r3], r1 + vst1.16 {d18[2]}, [r3], r1 + vst1.16 {d20[2]}, [r3], r1 + vst1.16 {d18[3]}, [r3], r1 + vst1.16 {d20[3]}, [r3], r1 + vst1.16 {d19[0]}, [r3], r1 + vst1.16 {d21[0]}, [r3], r1 + vst1.16 {d19[1]}, [r3], r1 + vst1.16 {d21[1]}, [r3], r1 + bx lr +endfunc diff --git a/ffmpeg/libavcodec/arm/vp8.h b/ffmpeg/libavcodec/arm/vp8.h new file mode 100644 index 0000000..ddaa120 --- /dev/null +++ b/ffmpeg/libavcodec/arm/vp8.h @@ -0,0 +1,35 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
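
The comments in the vp6_edge_filter NEON macro above spell out, lane by lane, the scalar operation being vectorised. The following plain-C sketch of that per-pixel step is reconstructed from those comments rather than copied from FFmpeg's C fallback (which filters 12 pixels along each edge); the clamp helper and function name are local to this sketch.

    #include <stdint.h>

    static uint8_t clamp_u8(int x)
    {
        return x < 0 ? 0 : x > 255 ? 255 : x;
    }

    /* One pixel pair of the VP6 edge filter: p points at p[0], s is the step
     * across the edge (1 for the horizontal filter, the stride for the
     * vertical one), t is the filter strength. */
    static void vp6_edge_filter_point(uint8_t *p, int s, int t)
    {
        /* rounded (3*(p[0]-p[-s]) + p[-2s] - p[s]) >> 3, like vrshr.s16 #3 */
        int v    = (p[-2 * s] - p[s] + 3 * (p[0] - p[-s]) + 4) >> 3;
        int sign = v < 0 ? -1 : 0;
        int V    = (v ^ sign) - sign;                 /* |v| */

        /* inside the band the delta is folded back towards 2*t - |v| */
        if ((unsigned)(V - t - 1) < (unsigned)(t - 1))
            v = ((2 * t - V) + sign) ^ sign;          /* sign-adjusted */

        p[-s] = clamp_u8(p[-s] + v);
        p[0]  = clamp_u8(p[0]  - v);
    }

The vbif instructions in the macro implement the band test above as a bit-select, keeping the raw delta where the comparison is true and the folded value elsewhere.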
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_VP8_H +#define AVCODEC_ARM_VP8_H + +#include + +#include "config.h" +#include "libavcodec/vp56.h" +#include "libavcodec/vp8.h" + +#if HAVE_ARMV6_EXTERNAL +#define decode_block_coeffs_internal ff_decode_block_coeffs_armv6 +int ff_decode_block_coeffs_armv6(VP56RangeCoder *rc, int16_t block[16], + uint8_t probs[8][3][NUM_DCT_TOKENS-1], + int i, uint8_t *token_prob, int16_t qmul[2]); +#endif + +#endif /* AVCODEC_ARM_VP8_H */ diff --git a/ffmpeg/libavcodec/arm/vp8_armv6.S b/ffmpeg/libavcodec/arm/vp8_armv6.S new file mode 100644 index 0000000..e7d25a4 --- /dev/null +++ b/ffmpeg/libavcodec/arm/vp8_armv6.S @@ -0,0 +1,248 @@ +/* + * Copyright (C) 2010 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +.macro rac_get_prob h, bs, buf, cw, pr, t0, t1 + adds \bs, \bs, \t0 + lsl \cw, \cw, \t0 + lsl \t0, \h, \t0 + rsb \h, \pr, #256 + it cs + ldrhcs \t1, [\buf], #2 + smlabb \h, \t0, \pr, \h +T itttt cs + rev16cs \t1, \t1 +A orrcs \cw, \cw, \t1, lsl \bs +T lslcs \t1, \t1, \bs +T orrcs \cw, \cw, \t1 + subcs \bs, \bs, #16 + lsr \h, \h, #8 + cmp \cw, \h, lsl #16 + itt ge + subge \cw, \cw, \h, lsl #16 + subge \h, \t0, \h +.endm + +.macro rac_get_128 h, bs, buf, cw, t0, t1 + adds \bs, \bs, \t0 + lsl \cw, \cw, \t0 + lsl \t0, \h, \t0 + it cs + ldrhcs \t1, [\buf], #2 + mov \h, #128 + it cs + rev16cs \t1, \t1 + add \h, \h, \t0, lsl #7 +A orrcs \cw, \cw, \t1, lsl \bs +T ittt cs +T lslcs \t1, \t1, \bs +T orrcs \cw, \cw, \t1 + subcs \bs, \bs, #16 + lsr \h, \h, #8 + cmp \cw, \h, lsl #16 + itt ge + subge \cw, \cw, \h, lsl #16 + subge \h, \t0, \h +.endm + +function ff_decode_block_coeffs_armv6, export=1 + push {r0,r1,r4-r11,lr} + movrelx lr, X(ff_vp56_norm_shift) + ldrd r4, r5, [sp, #44] @ token_prob, qmul + cmp r3, #0 + ldr r11, [r5] + ldm r0, {r5-r7} @ high, bits, buf + it ne + pkhtbne r11, r11, r11, asr #16 + ldr r8, [r0, #16] @ code_word +0: + ldrb r9, [lr, r5] + add r3, r3, #1 + ldrb r0, [r4, #1] + rac_get_prob r5, r6, r7, r8, r0, r9, r10 + blt 2f + + ldrb r9, [lr, r5] + ldrb r0, [r4, #2] + rac_get_prob r5, r6, r7, r8, r0, r9, r10 + ldrb r9, [lr, r5] + bge 3f + + add r4, r3, r3, lsl #5 + sxth r12, r11 + add r4, r4, r2 + adds r6, r6, r9 + add r4, r4, #11 + lsl r8, r8, r9 + it cs + ldrhcs r10, [r7], #2 + lsl r9, r5, r9 + mov r5, #128 + it cs + rev16cs r10, r10 + add r5, r5, r9, lsl #7 +T ittt cs +T lslcs r10, r10, r6 +T orrcs r8, r8, r10 +A orrcs r8, r8, r10, lsl r6 + subcs r6, r6, #16 + lsr r5, r5, #8 + cmp r8, r5, lsl #16 + movrel r10, zigzag_scan-1 + itt ge + subge r8, r8, r5, lsl #16 + subge r5, r9, r5 + 
ldrb r10, [r10, r3] + it ge + rsbge r12, r12, #0 + cmp r3, #16 + strh r12, [r1, r10] + bge 6f +5: + ldrb r9, [lr, r5] + ldrb r0, [r4] + rac_get_prob r5, r6, r7, r8, r0, r9, r10 + pkhtb r11, r11, r11, asr #16 + bge 0b + +6: + ldr r0, [sp] + ldr r9, [r0, #12] + cmp r7, r9 + it hi + movhi r7, r9 + stm r0, {r5-r7} @ high, bits, buf + str r8, [r0, #16] @ code_word + + add sp, sp, #8 + mov r0, r3 + pop {r4-r11,pc} +2: + add r4, r3, r3, lsl #5 + cmp r3, #16 + add r4, r4, r2 + pkhtb r11, r11, r11, asr #16 + bne 0b + b 6b +3: + ldrb r0, [r4, #3] + rac_get_prob r5, r6, r7, r8, r0, r9, r10 + ldrb r9, [lr, r5] + bge 1f + + mov r12, #2 + ldrb r0, [r4, #4] + rac_get_prob r5, r6, r7, r8, r0, r9, r10 + it ge + addge r12, #1 + ldrb r9, [lr, r5] + blt 4f + ldrb r0, [r4, #5] + rac_get_prob r5, r6, r7, r8, r0, r9, r10 + it ge + addge r12, #1 + ldrb r9, [lr, r5] + b 4f +1: + ldrb r0, [r4, #6] + rac_get_prob r5, r6, r7, r8, r0, r9, r10 + ldrb r9, [lr, r5] + bge 3f + + ldrb r0, [r4, #7] + rac_get_prob r5, r6, r7, r8, r0, r9, r10 + ldrb r9, [lr, r5] + bge 2f + + mov r12, #5 + mov r0, #159 + rac_get_prob r5, r6, r7, r8, r0, r9, r10 + it ge + addge r12, r12, #1 + ldrb r9, [lr, r5] + b 4f +2: + mov r12, #7 + mov r0, #165 + rac_get_prob r5, r6, r7, r8, r0, r9, r10 + it ge + addge r12, r12, #2 + ldrb r9, [lr, r5] + mov r0, #145 + rac_get_prob r5, r6, r7, r8, r0, r9, r10 + it ge + addge r12, r12, #1 + ldrb r9, [lr, r5] + b 4f +3: + ldrb r0, [r4, #8] + rac_get_prob r5, r6, r7, r8, r0, r9, r10 + it ge + addge r4, r4, #1 + ldrb r9, [lr, r5] + ite ge + movge r12, #2 + movlt r12, #0 + ldrb r0, [r4, #9] + rac_get_prob r5, r6, r7, r8, r0, r9, r10 + mov r9, #8 + it ge + addge r12, r12, #1 + movrelx r4, X(ff_vp8_dct_cat_prob), r1 + lsl r9, r9, r12 + ldr r4, [r4, r12, lsl #2] + add r12, r9, #3 + mov r1, #0 + ldrb r0, [r4], #1 +1: + ldrb r9, [lr, r5] + lsl r1, r1, #1 + rac_get_prob r5, r6, r7, r8, r0, r9, r10 + ldrb r0, [r4], #1 + it ge + addge r1, r1, #1 + cmp r0, #0 + bne 1b + ldrb r9, [lr, r5] + add r12, r12, r1 + ldr r1, [sp, #4] +4: + add r4, r3, r3, lsl #5 + add r4, r4, r2 + add r4, r4, #22 + rac_get_128 r5, r6, r7, r8, r9, r10 + it ge + rsbge r12, r12, #0 + smulbb r12, r12, r11 + movrel r9, zigzag_scan-1 + ldrb r9, [r9, r3] + cmp r3, #16 + strh r12, [r1, r9] + bge 6b + b 5b +endfunc + +const zigzag_scan + .byte 0, 2, 8, 16 + .byte 10, 4, 6, 12 + .byte 18, 24, 26, 20 + .byte 14, 22, 28, 30 +endconst diff --git a/ffmpeg/libavcodec/arm/vp8dsp.h b/ffmpeg/libavcodec/arm/vp8dsp.h new file mode 100644 index 0000000..ce00e4a --- /dev/null +++ b/ffmpeg/libavcodec/arm/vp8dsp.h @@ -0,0 +1,78 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
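
The zigzag_scan table above holds the familiar VP8 4x4 zigzag order with every entry doubled, so a byte loaded from it can be added straight to the int16_t block pointer as the strh offset (the movrel of zigzag_scan-1 compensates for the coefficient counter, which appears to run 1-based in the loop). A small C illustration of what each store amounts to; the table contents are inferred from the .byte values above:

    #include <stdint.h>

    /* Standard VP8 4x4 zigzag order, 0-based indices into the block. */
    static const uint8_t zigzag_index[16] = {
        0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
    };

    /* Equivalent of one strh in the assembly for the n-th coefficient:
     * the byte offset used there is zigzag_index[n] * 2. */
    static void store_coeff(int16_t block[16], int n, int16_t coeff)
    {
        block[zigzag_index[n]] = coeff;
    }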
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_VP8DSP_H +#define AVCODEC_ARM_VP8DSP_H + +#include "libavcodec/vp8dsp.h" + +void ff_vp8dsp_init_armv6(VP8DSPContext *dsp); +void ff_vp8dsp_init_neon(VP8DSPContext *dsp); + +#define VP8_LF_Y(hv, inner, opt) \ + void ff_vp8_##hv##_loop_filter16##inner##_##opt(uint8_t *dst, \ + ptrdiff_t stride, \ + int flim_E, int flim_I, \ + int hev_thresh) + +#define VP8_LF_UV(hv, inner, opt) \ + void ff_vp8_##hv##_loop_filter8uv##inner##_##opt(uint8_t *dstU, \ + uint8_t *dstV, \ + ptrdiff_t stride, \ + int flim_E, int flim_I, \ + int hev_thresh) + +#define VP8_LF_SIMPLE(hv, opt) \ + void ff_vp8_##hv##_loop_filter16_simple_##opt(uint8_t *dst, \ + ptrdiff_t stride, \ + int flim) + +#define VP8_LF_HV(inner, opt) \ + VP8_LF_Y(h, inner, opt); \ + VP8_LF_Y(v, inner, opt); \ + VP8_LF_UV(h, inner, opt); \ + VP8_LF_UV(v, inner, opt) + +#define VP8_LF(opt) \ + VP8_LF_HV(, opt); \ + VP8_LF_HV(_inner, opt); \ + VP8_LF_SIMPLE(h, opt); \ + VP8_LF_SIMPLE(v, opt) + +#define VP8_MC(n, opt) \ + void ff_put_vp8_##n##_##opt(uint8_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t srcstride, \ + int h, int x, int y) + +#define VP8_EPEL(w, opt) \ + VP8_MC(pixels ## w, opt); \ + VP8_MC(epel ## w ## _h4, opt); \ + VP8_MC(epel ## w ## _h6, opt); \ + VP8_MC(epel ## w ## _v4, opt); \ + VP8_MC(epel ## w ## _h4v4, opt); \ + VP8_MC(epel ## w ## _h6v4, opt); \ + VP8_MC(epel ## w ## _v6, opt); \ + VP8_MC(epel ## w ## _h4v6, opt); \ + VP8_MC(epel ## w ## _h6v6, opt) + +#define VP8_BILIN(w, opt) \ + VP8_MC(bilin ## w ## _h, opt); \ + VP8_MC(bilin ## w ## _v, opt); \ + VP8_MC(bilin ## w ## _hv, opt) + +#endif /* AVCODEC_ARM_VP8DSP_H */ diff --git a/ffmpeg/libavcodec/arm/vp8dsp_armv6.S b/ffmpeg/libavcodec/arm/vp8dsp_armv6.S new file mode 100644 index 0000000..5207758 --- /dev/null +++ b/ffmpeg/libavcodec/arm/vp8dsp_armv6.S @@ -0,0 +1,1634 @@ +/* + * VP8 ARMv6 optimisations + * + * Copyright (c) 2010 Google Inc. + * Copyright (c) 2010 Rob Clark + * Copyright (c) 2011 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * This code was partially ported from libvpx, which uses this license: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of Google nor the names of its contributors may + * be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "libavutil/arm/asm.S" + +@ idct + +@ void vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16]) +function ff_vp8_luma_dc_wht_armv6, export=1 + push {r4-r10, lr} + + ldm r1, {r2-r9} + mov r10, #0 + mov lr, #0 + uadd16 r12, r2, r8 @ t0[0,1] + usub16 r2, r2, r8 @ t3[0,1] + stm r1!, {r10, lr} + uadd16 r8, r4, r6 @ t1[0,1] + usub16 r4, r4, r6 @ t2[0,1] + stm r1!, {r10, lr} + uadd16 r6, r12, r8 @ dc0[0,1] + usub16 r12, r12, r8 @ dc2[0,1] + stm r1!, {r10, lr} + uadd16 r8, r2, r4 @ dc1[0,1] + usub16 r2, r2, r4 @ dc3[0,1] + stm r1!, {r10, lr} + + uadd16 lr, r3, r9 @ t0[2,3] + usub16 r3, r3, r9 @ t3[2,3] + uadd16 r9, r5, r7 @ t1[2,3] + usub16 r5, r5, r7 @ t2[2,3] + + uadd16 r7, lr, r9 @ dc0[2,3] + usub16 lr, lr, r9 @ dc2[2,3] + uadd16 r9, r3, r5 @ dc1[2,3] + usub16 r3, r3, r5 @ dc3[2,3] + + mov r1, #3 + orr r1, r1, #0x30000 @ 3 | 3 (round) + + pkhbt r4, r6, r8, lsl #16 @ dc{0,1}[0] + pkhtb r6, r8, r6, asr #16 @ dc{0,1}[1] + pkhbt r5, r12, r2, lsl #16 @ dc{2,3}[0] + pkhtb r12, r2, r12, asr #16 @ dc{2,3}[1] + pkhbt r8, r7, r9, lsl #16 @ dc{0,1}[2] + uadd16 r4, r4, r1 + uadd16 r5, r5, r1 + pkhtb r7, r9, r7, asr #16 @ dc{0,1}[3] + pkhbt r2, lr, r3, lsl #16 @ dc{2,3}[2] + pkhtb lr, r3, lr, asr #16 @ dc{2,3}[3] + + uadd16 r9, r4, r7 @ t0[0,1] + uadd16 r3, r5, lr @ t0[2,3] + usub16 r4, r4, r7 @ t3[0,1] + usub16 r5, r5, lr @ t3[2,3] + uadd16 r7, r6, r8 @ t1[0,1] + uadd16 lr, r12, r2 @ t1[2,3] + usub16 r6, r6, r8 @ t2[0,1] + usub16 r12, r12, r2 @ t2[2,3] + + uadd16 r8, r9, r7 @ block[0,1][0] + uadd16 r2, r3, lr @ block[2,3][0] + usub16 r9, r9, r7 @ block[0,1][2] + usub16 r3, r3, lr @ block[2,3][2] + uadd16 r7, r4, r6 @ block[0,1][1] + uadd16 lr, r5, r12 @ block[2,3][1] + usub16 r4, r4, r6 @ block[0,1][3] + usub16 r5, r5, r12 @ block[2,3][3] + +#if HAVE_ARMV6T2_EXTERNAL + sbfx r6, r8, #3, #13 + sbfx r12, r7, #3, #13 + sbfx r1, r9, #3, #13 + sbfx r10, r4, #3, #13 +#else + sxth r6, r8 + sxth r12, r7 + sxth r1, r9 + sxth r10, r4 + asr r6, #3 @ block[0][0] + asr r12, #3 @ block[0][1] + asr r1, #3 @ block[0][2] + asr r10, #3 @ block[0][3] +#endif + + strh r6, [r0], #32 + asr r8, r8, #19 @ block[1][0] + strh r12, [r0], #32 + asr r7, r7, #19 @ block[1][1] + strh r1, [r0], #32 + asr r9, r9, #19 @ block[1][2] + strh r10, [r0], #32 + asr r4, r4, #19 @ block[1][3] + strh r8, [r0], #32 + asr r6, r2, #19 
@ block[3][0] + strh r7, [r0], #32 + asr r12, lr, #19 @ block[3][1] + strh r9, [r0], #32 + asr r1, r3, #19 @ block[3][2] + strh r4, [r0], #32 + asr r10, r5, #19 @ block[3][3] + +#if HAVE_ARMV6T2_EXTERNAL + sbfx r2, r2, #3, #13 + sbfx lr, lr, #3, #13 + sbfx r3, r3, #3, #13 + sbfx r5, r5, #3, #13 +#else + sxth r2, r2 + sxth lr, lr + sxth r3, r3 + sxth r5, r5 + asr r2, #3 @ block[2][0] + asr lr, #3 @ block[2][1] + asr r3, #3 @ block[2][2] + asr r5, #3 @ block[2][3] +#endif + + strh r2, [r0], #32 + strh lr, [r0], #32 + strh r3, [r0], #32 + strh r5, [r0], #32 + strh r6, [r0], #32 + strh r12, [r0], #32 + strh r1, [r0], #32 + strh r10, [r0], #32 + + pop {r4-r10, pc} +endfunc + +@ void vp8_luma_dc_wht_dc(int16_t block[4][4][16], int16_t dc[16]) +function ff_vp8_luma_dc_wht_dc_armv6, export=1 + ldrsh r2, [r1] + mov r3, #0 + add r2, r2, #3 + strh r3, [r1] + asr r2, r2, #3 + .rept 16 + strh r2, [r0], #32 + .endr + bx lr +endfunc + +@ void vp8_idct_add(uint8_t *dst, int16_t block[16], int stride) +function ff_vp8_idct_add_armv6, export=1 + push {r4-r12, lr} + sub sp, sp, #32 + + movw r3, #20091 @ cospi8sqrt2minus1 + movw r4, #35468 @ sinpi8sqrt2 + mov r5, sp +1: + ldr r6, [r1, #8] @ i5 | i4 = block1[1] | block1[0] + ldr lr, [r1, #16] @ i9 | i8 = block2[1] | block2[0] + ldr r12, [r1, #24] @ i13 | i12 = block3[1] | block3[0] + + smulwt r9, r3, r6 @ ip[5] * cospi8sqrt2minus1 + smulwb r7, r3, r6 @ ip[4] * cospi8sqrt2minus1 + smulwt r10, r4, r6 @ ip[5] * sinpi8sqrt2 + smulwb r8, r4, r6 @ ip[4] * sinpi8sqrt2 + pkhbt r7, r7, r9, lsl #16 @ 5c | 4c + smulwt r11, r3, r12 @ ip[13] * cospi8sqrt2minus1 + pkhbt r8, r8, r10, lsl #16 @ 5s | 4s = t2 first half + uadd16 r6, r6, r7 @ 5c+5 | 4c+4 = t3 first half + smulwb r9, r3, r12 @ ip[12] * cospi8sqrt2minus1 + smulwt r7, r4, r12 @ ip[13] * sinpi8sqrt2 + smulwb r10, r4, r12 @ ip[12] * sinpi8sqrt2 + + pkhbt r9, r9, r11, lsl #16 @ 13c | 12c + ldr r11, [r1] @ i1 | i0 + pkhbt r10, r10, r7, lsl #16 @ 13s | 12s = t3 second half + uadd16 r7, r12, r9 @ 13c+13 | 12c+12 = t2 2nd half + uadd16 r6, r6, r10 @ d = t3 + uadd16 r10, r11, lr @ a = t0 + usub16 r7, r8, r7 @ c = t2 + usub16 r8, r11, lr @ b = t1 + uadd16 r9, r10, r6 @ a+d = tmp{0,1}[0] + usub16 r10, r10, r6 @ a-d = tmp{0,1}[3] + uadd16 r6, r8, r7 @ b+c = tmp{0,1}[1] + usub16 r7, r8, r7 @ b-c = tmp{0,1}[2] + mov r8, #0 + cmp sp, r5 + str r6, [r5, #8] @ o5 | o4 + str r7, [r5, #16] @ o9 | o8 + str r10, [r5, #24] @ o13 | o12 + str r9, [r5], #4 @ o1 | o0 + str r8, [r1, #8] + str r8, [r1, #16] + str r8, [r1, #24] + str r8, [r1], #4 + beq 1b + + mov r5, #2 +2: + pop {r1, r6, r12, lr} + smulwt r9, r3, r12 @ ip[5] * cospi8sqrt2minus1 + smulwt r7, r3, r1 @ ip[1] * cospi8sqrt2minus1 + smulwt r10, r4, r12 @ ip[5] * sinpi8sqrt2 + smulwt r8, r4, r1 @ ip[1] * sinpi8sqrt2 + pkhbt r11, r1, r12, lsl #16 @ i4 | i0 = t0/t1 first half + pkhtb r1, r12, r1, asr #16 @ i5 | i1 + pkhbt r7, r7, r9, lsl #16 @ 5c | 1c + pkhbt r8, r8, r10, lsl #16 @ 5s | 1s = t2 first half + pkhbt r9, r6, lr, lsl #16 @ i6 | i2 = t0/t1 second half + pkhtb r12, lr, r6, asr #16 @ i7 | i3 + uadd16 r1, r7, r1 @ 5c+5 | 1c+1 = t3 first half + uadd16 r10, r11, r9 @ a = t0 + usub16 r9, r11, r9 @ b = t1 + smulwt r7, r3, r12 @ ip[7] * cospi8sqrt2minus1 + smulwb lr, r3, r12 @ ip[3] * cospi8sqrt2minus1 + smulwt r11, r4, r12 @ ip[7] * sinpi8sqrt2 + smulwb r6, r4, r12 @ ip[3] * sinpi8sqrt2 + subs r5, r5, #1 + pkhbt r7, lr, r7, lsl #16 @ 7c | 3c + pkhbt r11, r6, r11, lsl #16 @ 7s | 3s = t3 second half + mov r6, #0x4 + orr r6, r6, #0x40000 + uadd16 r12, r7, r12 @ 7c+7 | 3c+3 = t2 
second half + uadd16 r10, r10, r6 @ t0 + 4 + uadd16 r9, r9, r6 @ t1 + 4 + usub16 lr, r8, r12 @ c (o5 | o1) = t2 + uadd16 r12, r11, r1 @ d (o7 | o3) = t3 + usub16 r1, r9, lr @ b-c = dst{0,1}[2] + uadd16 r7, r10, r12 @ a+d = dst{0,1}[0] + usub16 r12, r10, r12 @ a-d = dst{0,1}[3] + uadd16 r10, r9, lr @ b+c = dst{0,1}[1] + + asr lr, r1, #3 @ o[1][2] + asr r9, r12, #3 @ o[1][3] + pkhtb r8, lr, r7, asr #19 @ o[1][0,2] + pkhtb r11, r9, r10, asr #19 @ o[1][1,3] + ldr lr, [r0] + sxth r12, r12 + ldr r9, [r0, r2] + sxth r1, r1 +#if HAVE_ARMV6T2_EXTERNAL + sbfx r7, r7, #3, #13 + sbfx r10, r10, #3, #13 +#else + sxth r7, r7 + sxth r10, r10 + asr r7, #3 @ o[0][0] + asr r10, #3 @ o[0][1] +#endif + pkhbt r7, r7, r1, lsl #13 @ o[0][0,2] + pkhbt r10, r10, r12, lsl #13 @ o[0][1,3] + + uxtab16 r7, r7, lr + uxtab16 r10, r10, lr, ror #8 + uxtab16 r8, r8, r9 + uxtab16 r11, r11, r9, ror #8 + usat16 r7, #8, r7 + usat16 r10, #8, r10 + usat16 r8, #8, r8 + usat16 r11, #8, r11 + orr r7, r7, r10, lsl #8 + orr r8, r8, r11, lsl #8 + str r8, [r0, r2] + str_post r7, r0, r2, lsl #1 + + bne 2b + + pop {r4-r12, pc} +endfunc + +@ void vp8_idct_dc_add(uint8_t *dst, int16_t block[16], int stride) +function ff_vp8_idct_dc_add_armv6, export=1 + push {r4-r6, lr} + add r6, r0, r2, lsl #1 + ldrsh r3, [r1] + mov r4, #0 + add r3, r3, #4 + strh r4, [r1], #32 + asr r3, #3 + ldr r5, [r0] + ldr r4, [r0, r2] + pkhbt r3, r3, r3, lsl #16 + uxtab16 lr, r3, r5 @ a1+2 | a1+0 + uxtab16 r5, r3, r5, ror #8 @ a1+3 | a1+1 + uxtab16 r12, r3, r4 + uxtab16 r4, r3, r4, ror #8 + usat16 lr, #8, lr + usat16 r5, #8, r5 + usat16 r12, #8, r12 + usat16 r4, #8, r4 + orr lr, lr, r5, lsl #8 + ldr r5, [r6] + orr r12, r12, r4, lsl #8 + ldr r4, [r6, r2] + str lr, [r0] + uxtab16 lr, r3, r5 + str r12, [r0, r2] + uxtab16 r5, r3, r5, ror #8 + uxtab16 r12, r3, r4 + uxtab16 r4, r3, r4, ror #8 + usat16 lr, #8, lr + usat16 r5, #8, r5 + usat16 r12, #8, r12 + usat16 r4, #8, r4 + orr lr, lr, r5, lsl #8 + orr r12, r12, r4, lsl #8 + str lr, [r6] + str r12, [r6, r2] + pop {r4-r6, pc} +endfunc + +@ void vp8_idct_dc_add4uv(uint8_t *dst, int16_t block[4][16], int stride) +function ff_vp8_idct_dc_add4uv_armv6, export=1 + push {r4, lr} + + bl ff_vp8_idct_dc_add_armv6 + add r0, r0, #4 + bl ff_vp8_idct_dc_add_armv6 + add r0, r0, r2, lsl #2 + sub r0, r0, #4 + bl ff_vp8_idct_dc_add_armv6 + add r0, r0, #4 + bl ff_vp8_idct_dc_add_armv6 + + pop {r4, pc} +endfunc + +@ void vp8_idct_dc_add4y(uint8_t *dst, int16_t block[4][16], int stride) +function ff_vp8_idct_dc_add4y_armv6, export=1 + push {r4, lr} + + bl ff_vp8_idct_dc_add_armv6 + add r0, r0, #4 + bl ff_vp8_idct_dc_add_armv6 + add r0, r0, #4 + bl ff_vp8_idct_dc_add_armv6 + add r0, r0, #4 + bl ff_vp8_idct_dc_add_armv6 + + pop {r4, pc} +endfunc + +@ loopfilter + +.macro transpose o3, o2, o1, o0, i0, i1, i2, i3 + uxtb16 \o1, \i1 @ xx 12 xx 10 + uxtb16 \o0, \i0 @ xx 02 xx 00 + uxtb16 \o3, \i3 @ xx 32 xx 30 + uxtb16 \o2, \i2 @ xx 22 xx 20 + orr \o1, \o0, \o1, lsl #8 @ 12 02 10 00 + orr \o3, \o2, \o3, lsl #8 @ 32 22 30 20 + + uxtb16 \i1, \i1, ror #8 @ xx 13 xx 11 + uxtb16 \i3, \i3, ror #8 @ xx 33 xx 31 + uxtb16 \i0, \i0, ror #8 @ xx 03 xx 01 + uxtb16 \i2, \i2, ror #8 @ xx 23 xx 21 + orr \i0, \i0, \i1, lsl #8 @ 13 03 11 01 + orr \i2, \i2, \i3, lsl #8 @ 33 23 31 21 + + pkhtb \o2, \o3, \o1, asr #16 @ 32 22 12 02 + pkhbt \o0, \o1, \o3, lsl #16 @ 30 20 10 00 + pkhtb \o3, \i2, \i0, asr #16 @ 33 23 13 03 + pkhbt \o1, \i0, \i2, lsl #16 @ 31 21 11 01 +.endm + +.macro simple_filter + uqsub8 r7, r3, r6 @ p1 - q1 + uqsub8 r8, r6, r3 @ q1 - p1 + uqsub8 r10, r4, 
r5 @ p0 - q0 + uqsub8 r9, r5, r4 @ q0 - p0 + orr r7, r7, r8 @ abs(p1 - q1) + orr r9, r9, r10 @ abs(p0 - q0) + uhadd8 r7, r7, lr @ abs(p1 - q2) >> 1 + uqadd8 r9, r9, r9 @ abs(p0 - q0) * 2 + uqadd8 r7, r7, r9 @ abs(p0 - q0)*2 + abs(p1-q1)/2 + mvn r8, #0 + usub8 r10, r12, r7 @ compare to flimit + sel r10, r8, lr @ filter mask: F or 0 + cmp r10, #0 + beq 2f + + eor r3, r3, r2 @ ps1 + eor r6, r6, r2 @ qs1 + eor r4, r4, r2 @ ps0 + eor r5, r5, r2 @ qs0 + + qsub8 r3, r3, r6 @ vp8_filter = p1 - q1 + qsub8 r6, r5, r4 @ q0 - p0 + qadd8 r3, r3, r6 @ += q0 - p0 + lsr r7, r2, #5 @ 0x04040404 + qadd8 r3, r3, r6 @ += q0 - p0 + sub r9, r7, r2, lsr #7 @ 0x03030303 + qadd8 r3, r3, r6 @ vp8_filter = p1-q1 + 3*(q0-p0) + and r3, r3, r10 @ vp8_filter &= mask + + qadd8 r9, r3, r9 @ Filter2 = vp8_filter + 3 + qadd8 r3, r3, r7 @ Filter1 = vp8_filter + 4 + + shadd8 r9, r9, lr + shadd8 r3, r3, lr + shadd8 r9, r9, lr + shadd8 r3, r3, lr + shadd8 r9, r9, lr @ Filter2 >>= 3 + shadd8 r3, r3, lr @ Filter1 >>= 3 + + qadd8 r4, r4, r9 @ u = p0 + Filter2 + qsub8 r5, r5, r3 @ u = q0 - Filter1 + eor r4, r4, r2 @ *op0 = u ^ 0x80 + eor r5, r5, r2 @ *oq0 = u ^ 0x80 +.endm + +@ void vp8_v_loop_filter16_simple(uint8_t *dst, int stride, int flim) +function ff_vp8_v_loop_filter16_simple_armv6, export=1 + push {r4-r11, lr} + + orr r2, r2, r2, lsl #16 + mov r11, #4 + mov lr, #0 + orr r12, r2, r2, lsl #8 + mov32 r2, 0x80808080 +1: + ldr_nreg r3, r0, r1, lsl #1 @ p1 + ldr_nreg r4, r0, r1 @ p0 + ldr r5, [r0] @ q0 + ldr r6, [r0, r1] @ q1 + simple_filter +T sub r7, r0, r1 + str r5, [r0] @ oq0 +A str r4, [r0, -r1] @ op0 +T str r4, [r7] +2: + subs r11, r11, #1 + add r0, r0, #4 + bne 1b + + pop {r4-r11, pc} +endfunc + +.macro filter_mask_p + uqsub8 r6, r9, r10 @ p3 - p2 + uqsub8 r7, r10, r9 @ p2 - p3 + uqsub8 r8, r10, r11 @ p2 - p1 + uqsub8 r10, r11, r10 @ p1 - p2 + orr r6, r6, r7 @ abs(p3-p2) + orr r8, r8, r10 @ abs(p2-p1) + uqsub8 lr, r6, r2 @ compare to limit + uqsub8 r8, r8, r2 @ compare to limit + uqsub8 r6, r11, r12 @ p1 - p0 + orr lr, lr, r8 + uqsub8 r7, r12, r11 @ p0 - p1 + orr r6, r6, r7 @ abs(p1-p0) + uqsub8 r7, r6, r2 @ compare to limit + uqsub8 r8, r6, r3 @ compare to thresh + orr lr, lr, r7 +.endm + +.macro filter_mask_pq + uqsub8 r6, r11, r10 @ p1 - q1 + uqsub8 r7, r10, r11 @ q1 - p1 + uqsub8 r11, r12, r9 @ p0 - q0 + uqsub8 r12, r9, r12 @ q0 - p0 + orr r6, r6, r7 @ abs(p1-q1) + orr r12, r11, r12 @ abs(p0-q0) + mov32 r7, 0x7f7f7f7f + uqadd8 r12, r12, r12 @ abs(p0-q0) * 2 + and r6, r7, r6, lsr #1 @ abs(p1-q1) / 2 + uqadd8 r12, r12, r6 @ abs(p0-q0) * 2 + abs(p1-q1)/2 +.endm + +.macro filter_mask_v + filter_mask_p + + ldr r10, [r0, r1] @ q1 + ldr_post r9, r0, r1, lsl #1 @ q0 + + filter_mask_pq + + ldr r11, [r0] @ q2 + + uqsub8 r7, r9, r10 @ q0 - q1 + uqsub8 r6, r10, r9 @ q1 - q0 + uqsub8 r12, r12, r4 @ compare to flimit + uqsub8 r9, r11, r10 @ q2 - q1 + uqsub8 r10, r10, r11 @ q1 - q2 + orr lr, lr, r12 + ldr r12, [r0, r1] @ q3 + orr r6, r7, r6 @ abs(q1-q0) + orr r10, r9, r10 @ abs(q2-q1) + uqsub8 r9, r12, r11 @ q3 - q2 + uqsub8 r11, r11, r12 @ q2 - q3 + uqsub8 r7, r6, r2 @ compare to limit + uqsub8 r10, r10, r2 @ compare to limit + uqsub8 r6, r6, r3 @ compare to thresh + orr r9, r9, r11 @ abs(q3-q2) + orr lr, lr, r7 + orr lr, lr, r10 + uqsub8 r9, r9, r2 @ compare to limit + orr lr, lr, r9 + + mov r12, #0 + usub8 lr, r12, lr + mvn r11, #0 + sel lr, r11, r12 @ filter mask + sub r0, r0, r1, lsl #1 +.endm + +.macro filter_mask_h + transpose r12, r11, r10, r9, r6, r7, r8, lr + + filter_mask_p + + stm sp, {r8, r11, r12, lr} + sub r0, r0, r1, 
lsl #2 + add r0, r0, #4 + + ldr r7, [r0, r1] + ldr_post r6, r0, r1, lsl #1 + ldr lr, [r0, r1] + ldr r8, [r0] + + transpose r12, r11, r10, r9, r6, r7, r8, lr + + uqsub8 r8, r12, r11 @ q3 - q2 + uqsub8 lr, r11, r12 @ q2 - q3 + uqsub8 r7, r9, r10 @ q0 - q1 + uqsub8 r6, r10, r9 @ q1 - q0 + uqsub8 r12, r11, r10 @ q2 - q1 + uqsub8 r11, r10, r11 @ q1 - q2 + orr r8, r8, lr @ abs(q3-q2) + orr r6, r7, r6 @ abs(q1-q0) + orr r11, r12, r11 @ abs(q2-q1) + ldr lr, [sp, #12] @ load back (f)limit accumulator + uqsub8 r8, r8, r2 @ compare to limit + uqsub8 r7, r6, r2 @ compare to limit + uqsub8 r11, r11, r2 @ compare to limit + orr lr, lr, r8 + uqsub8 r8, r6, r3 @ compare to thresh + orr lr, lr, r7 + ldr r12, [sp, #8] @ p1 + orr lr, lr, r11 + + ldr r11, [sp, #4] @ p0 + + filter_mask_pq + + mov r10, #0 + uqsub8 r12, r12, r4 @ compare to flimit + mvn r11, #0 + orr lr, lr, r12 + usub8 lr, r10, lr + sel lr, r11, r10 @ filter mask +.endm + +.macro filter inner + mov32 r12, 0x80808080 + eor r11, r7, r12 @ ps1 + eor r8, r8, r12 @ ps0 + eor r9, r9, r12 @ qs0 + eor r10, r10, r12 @ qs1 + + stm sp, {r8-r11} + + qsub8 r7, r11, r10 @ vp8_signed_char_clamp(ps1-qs1) + qsub8 r8, r9, r8 @ vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + .if \inner + and r7, r7, r6 @ vp8_filter &= hev + .endif + qadd8 r7, r7, r8 + lsr r10, r12, #5 @ 0x04040404 + qadd8 r7, r7, r8 + sub r9, r10, r12, lsr #7 @ 0x03030303 + qadd8 r7, r7, r8 + + and r7, r7, lr @ vp8_filter &= mask + .if !\inner + mov r12, r7 @ Filter2 + and r7, r7, r6 @ Filter2 &= hev + .endif + qadd8 lr, r7, r9 @ Filter2 = vp8_signed_char_clamp(vp8_filter+3) + qadd8 r7, r7, r10 @ Filter1 = vp8_signed_char_clamp(vp8_filter+4) + + mov r9, #0 + shadd8 lr, lr, r9 @ Filter2 >>= 3 + shadd8 r7, r7, r9 @ Filter1 >>= 3 + shadd8 lr, lr, r9 + shadd8 r7, r7, r9 + shadd8 lr, lr, r9 @ Filter2 + shadd8 r7, r7, r9 @ Filter1 +.endm + +.macro filter_v inner + orr r10, r6, r8 @ calculate vp8_hevmask + ldr_nreg r7, r0, r1, lsl #1 @ p1 + usub8 r10, r12, r10 + ldr_nreg r8, r0, r1 @ p0 + sel r6, r12, r11 @ obtain vp8_hevmask + ldr r9, [r0] @ q0 + ldr r10, [r0, r1] @ q1 + filter \inner +.endm + +.macro filter_h inner + orr r9, r6, r8 + usub8 r9, r12, r9 + sel r6, r12, r11 @ hev mask + + stm sp, {r6, lr} + + ldr_nreg r12, r0, r1, lsl #1 + ldr_nreg r11, r0, r1 + ldr r6, [r0] + ldr lr, [r0, r1] + + transpose r10, r9, r8, r7, r12, r11, r6, lr + + ldm sp, {r6, lr} + filter \inner +.endm + +.macro filter_inner + ldm sp, {r8, r9} + lsr r10, r10, #2 @ 0x01010101 + qadd8 r8, r8, lr @ u = vp8_signed_char_clamp(ps0 + Filter2) + mov lr, #0 + qsub8 r9, r9, r7 @ u = vp8_signed_char_clamp(qs0 - Filter1) + sadd8 r7, r7, r10 @ vp8_filter += 1 + ldr r10, [sp, #8] @ qs1 + shadd8 r7, r7, lr @ vp8_filter >>= 1 + eor r8, r8, r12 @ *op0 = u ^ 0x80 + bic r7, r7, r6 @ vp8_filter &= ~hev + qadd8 r11, r11, r7 @ u = vp8_signed_char_clamp(ps1 + vp8_filter) + eor r9, r9, r12 @ *oq0 = u ^ 0x80 + qsub8 r10, r10, r7 @ u = vp8_signed_char_clamp(qs1 - vp8_filter) + eor r11, r11, r12 @ *op1 = u ^ 0x80 + eor r10, r10, r12 @ *oq1 = u ^ 0x80 +.endm + +.macro filter_x c0 + mov lr, \c0 + mov r7, #63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r7, r10, lr, r7 + smultb r10, r10, lr + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + add r10, r10, #63 + ssat r7, #8, r7, asr #7 + ssat r10, #8, r10, asr #7 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r7, r10, lsl #16 + uxtb16 r6, r6 + uxtb16 r10, r10 + + mov32 lr, 0x80808080 + + orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 
+ Filter2 * 27)>>7) + qsub8 r8, r9, r10 @ s = vp8_signed_char_clamp(qs0 - u) + qadd8 r10, r11, r10 @ s = vp8_signed_char_clamp(ps0 + u) + eor r8, r8, lr @ *oq0 = s ^ 0x80 + eor r10, r10, lr @ *op0 = s ^ 0x80 +.endm + +.macro filter_1 + ldm sp, {r8, r9} + qadd8 r11, r8, lr + qsub8 r9, r9, r7 + bic r12, r12, r6 @ vp8_filter &= ~hev + filter_x #27 +.endm + +.macro filter_2 + ldr r9, [sp, #8] @ qs1 + ldr r11, [sp, #12] @ ps1 + filter_x #18 +.endm + +.macro filter_3 + eor r9, r9, lr + eor r11, r11, lr + filter_x #9 +.endm + +function vp8_v_loop_filter_inner_armv6 + mov r5, #4 + sub sp, sp, #16 + + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 + orr r6, r6, r6, lsl #16 + orr r4, r2, r2, lsl #8 @ flimE + orr r2, r3, r3, lsl #8 @ flimI + orr r3, r6, r6, lsl #8 @ thresh +1: + sub r0, r0, r1, lsl #2 + ldr r10, [r0, r1] @ p2 + ldr_post r9, r0, r1, lsl #1 @ p3 + ldr r12, [r0, r1] @ p0 + ldr_post r11, r0, r1, lsl #1 @ p1 + + filter_mask_v + cmp lr, #0 + beq 2f + filter_v inner=1 + filter_inner + +A str r11, [r0, -r1, lsl #1] @ op1 +A str r8, [r0, -r1] @ op0 +T sub r0, r0, r1, lsl #1 +T str r8, [r0, r1] +T str_post r11, r0, r1, lsl #1 + str r9, [r0] @ oq0 + str r10, [r0, r1] @ oq1 +2: + add r0, r0, #4 + cmp r5, #3 + it eq + ldreq r0, [sp, #16] + subs r5, r5, #1 + bne 1b + + add sp, sp, #16 + pop {r0, r4-r11, pc} +endfunc + +function ff_vp8_v_loop_filter16_inner_armv6, export=1 + push {r4-r11, lr} + add r12, r0, #8 + push {r12} + ldr r6, [sp, #40] + orr r2, r2, r2, lsl #16 + b vp8_v_loop_filter_inner_armv6 +endfunc + +function ff_vp8_v_loop_filter8uv_inner_armv6, export=1 + push {r1, r4-r11, lr} + mov r1, r2 + orr r2, r3, r3, lsl #16 + ldr r3, [sp, #40] + ldr r6, [sp, #44] + b vp8_v_loop_filter_inner_armv6 +endfunc + +function vp8_v_loop_filter_armv6 + mov r5, #4 + sub sp, sp, #16 + + orr r3, r3, r3, lsl #16 + orr r6, r6, r6, lsl #16 + orr r4, r2, r2, lsl #8 @ flimE + orr r2, r3, r3, lsl #8 @ flimI + orr r3, r6, r6, lsl #8 @ thresh +1: + sub r0, r0, r1, lsl #2 + ldr r10, [r0, r1] @ p2 + ldr_post r9, r0, r1, lsl #1 @ p3 + ldr r12, [r0, r1] @ p0 + ldr_post r11, r0, r1, lsl #1 @ p1 + + filter_mask_v + cmp lr, #0 + beq 2f + + filter_v inner=0 + filter_1 + + str r8, [r0] @ *oq0 +A str r10, [r0, -r1] @ *op0 +T sub r0, r0, r1, lsl #1 +T str r10, [r0, r1] + + filter_2 + +A str r10, [r0, -r1, lsl #1] @ *op1 +T str_post r10, r0, r1, lsl #1 + str r8, [r0, r1] @ *oq1 + + ldr r9, [r0, r1, lsl #1] @ q2 + add r0, r0, r1 +A ldr r11, [r0, -r1, lsl #2] @ p2 +T ldr_dpre r11, r0, r1, lsl #2 + + filter_3 + +A str r10, [r0, -r1, lsl #2] @ *op2 +T str_post r10, r0, r1, lsl #2 + str r8, [r0, r1] @ *oq2 + sub r0, r0, r1 +2: + add r0, r0, #4 + cmp r5, #3 + it eq + ldreq r0, [sp, #16] + subs r5, r5, #1 + bne 1b + + add sp, sp, #16 + pop {r0, r4-r11, pc} +endfunc + +function ff_vp8_v_loop_filter16_armv6, export=1 + push {r4-r11, lr} + add r12, r0, #8 + push {r12} + ldr r6, [sp, #40] + orr r2, r2, r2, lsl #16 + b vp8_v_loop_filter_armv6 +endfunc + +function ff_vp8_v_loop_filter8uv_armv6, export=1 + push {r1, r4-r11, lr} + mov r1, r2 + orr r2, r3, r3, lsl #16 + ldr r3, [sp, #40] + ldr r6, [sp, #44] + b vp8_v_loop_filter_armv6 +endfunc + +@ void vp8_h_loop_filter16_simple(uint8_t *dst, int stride, int flim) +function ff_vp8_h_loop_filter16_simple_armv6, export=1 + push {r4-r11, lr} + orr r12, r2, r2, lsl #16 + mov32 r2, 0x80808080 + orr r12, r12, r12, lsl #8 + + mov lr, #0 + mov r11, #4 +1: + sub r0, r0, #2 + ldr r8, [r0, r1] + ldr_post r7, r0, r1, lsl #1 + ldr r10, [r0, r1] + ldr_post r9, r0, r1, lsl #1 + add r0, r0, #2 + 
transpose r6, r5, r4, r3, r7, r8, r9, r10 + simple_filter + sub r0, r0, r1, lsl #2 + sub r0, r0, #1 + + uxtb16 r6, r4 + uxtb16 r8, r5 + uxtb16 r7, r4, ror #8 + uxtb16 r9, r5, ror #8 + orr r6, r6, r8, lsl #8 + orr r7, r7, r9, lsl #8 + lsr r4, r6, #16 + lsr r5, r7, #16 + + strh_post r6, r0, r1 + strh_post r7, r0, r1 + strh_post r4, r0, r1 + strh_post r5, r0, r1 + add r0, r0, #1 +2: + subs r11, r11, #1 + bne 1b + + pop {r4-r11, pc} +endfunc + +function vp8_h_loop_filter_inner_armv6 + mov r5, #4 + sub sp, sp, #16 + + orr r3, r3, r3, lsl #16 + orr r9, r9, r9, lsl #16 + orr r4, r2, r2, lsl #8 @ flimE + orr r2, r3, r3, lsl #8 @ flimI + orr r3, r9, r9, lsl #8 @ thresh + sub r0, r0, #4 +1: + ldr r7, [r0, r1] + ldr_post r6, r0, r1, lsl #1 + ldr lr, [r0, r1] + ldr_post r8, r0, r1, lsl #1 + + filter_mask_h + + cmp lr, #0 + sub r0, r0, #2 + beq 2f + + ldr r6, [sp] + + filter_h inner=1 + filter_inner + + transpose lr, r12, r7, r6, r11, r8, r9, r10 + +A str r6, [r0, -r1, lsl #1] +A str r7, [r0, -r1] +T sub r0, r0, r1, lsl #1 +T str r7, [r0, r1] +T str_post r6, r0, r1, lsl #1 + str r12, [r0] + str lr, [r0, r1] +2: + sub r0, r0, #2 + add r0, r0, r1, lsl #1 + cmp r5, #3 + it eq + ldreq r0, [sp, #16] + subs r5, r5, #1 + bne 1b + + add sp, sp, #16 + pop {r0, r4-r11, pc} +endfunc + +function ff_vp8_h_loop_filter16_inner_armv6, export=1 + push {r4-r11, lr} + add r12, r0, r1, lsl #3 + sub r12, r12, #4 + push {r12} + ldr r9, [sp, #40] + orr r2, r2, r2, lsl #16 + b vp8_h_loop_filter_inner_armv6 +endfunc + +function ff_vp8_h_loop_filter8uv_inner_armv6, export=1 + sub r1, r1, #4 + push {r1, r4-r11, lr} + mov r1, r2 + orr r2, r3, r3, lsl #16 + ldr r3, [sp, #40] + ldr r9, [sp, #44] + b vp8_h_loop_filter_inner_armv6 +endfunc + +function vp8_h_loop_filter_armv6 + mov r5, #4 + sub sp, sp, #16 + + orr r3, r3, r3, lsl #16 + orr r9, r9, r9, lsl #16 + orr r4, r2, r2, lsl #8 @ flimE + orr r2, r3, r3, lsl #8 @ flimI + orr r3, r9, r9, lsl #8 @ thresh +1: + sub r0, r0, #4 + ldr r7, [r0, r1] + ldr_post r6, r0, r1, lsl #1 + ldr lr, [r0, r1] + ldr_post r8, r0, r1, lsl #1 + + filter_mask_h + cmp lr, #0 + it eq + addeq r0, r0, r1, lsl #1 + beq 2f + + ldr r6, [sp] + sub r0, r0, #2 + + filter_h inner=0 + filter_1 + + sub r0, r0, r1, lsl #1 + uxtb16 r6, r10 + uxtb16 r7, r8 + uxtb16 r10, r10, ror #8 + uxtb16 r8, r8, ror #8 + orr r6, r6, r7, lsl #8 + orr r10, r10, r8, lsl #8 + lsr r7, r6, #16 + lsr r8, r10, #16 + + add r0, r0, #1 + strh_post r6, r0, r1 + strh_post r10, r0, r1 + strh_post r7, r0, r1 + strh_post r8, r0, r1 + + filter_2 + + sub r0, r0, r1, lsl #2 + add r0, r0, #3 + + ldrb r11, [r0, #-5] @ p2 for 1/7th difference + strb r10, [r0, #-4] @ op1 + strb r8, [r0, #-1] @ oq1 + ldrb_post r9, r0, r1 @ q2 for 1/7th difference + + lsr r10, r10, #8 + lsr r8, r8, #8 + + ldrb r6, [r0, #-5] + strb r10, [r0, #-4] + strb r8, [r0, #-1] + ldrb_post r7, r0, r1 + + lsr r10, r10, #8 + lsr r8, r8, #8 + orr r11, r11, r6, lsl #8 + orr r9, r9, r7, lsl #8 + + ldrb r6, [r0, #-5] + strb r10, [r0, #-4] + strb r8, [r0, #-1] + ldrb_post r7, r0, r1 + + lsr r10, r10, #8 + lsr r8, r8, #8 + orr r11, r11, r6, lsl #16 + orr r9, r9, r7, lsl #16 + + ldrb r6, [r0, #-5] + strb r10, [r0, #-4] + strb r8, [r0, #-1] + ldrb_post r7, r0, r1 + orr r11, r11, r6, lsl #24 + orr r9, r9, r7, lsl #24 + + filter_3 + + sub r0, r0, r1, lsl #2 + strb r10, [r0, #-5] + strb_post r8, r0, r1 + lsr r10, r10, #8 + lsr r8, r8, #8 + strb r10, [r0, #-5] + strb_post r8, r0, r1 + lsr r10, r10, #8 + lsr r8, r8, #8 + strb r10, [r0, #-5] + strb_post r8, r0, r1 + lsr r10, r10, #8 + lsr r8, r8, #8 + 
strb r10, [r0, #-5] + strb_post r8, r0, r1 + + sub r0, r0, #2 +2: + cmp r5, #3 + it eq + ldreq r0, [sp, #16] + subs r5, r5, #1 + bne 1b + + add sp, sp, #16 + pop {r0, r4-r11, pc} +endfunc + +function ff_vp8_h_loop_filter16_armv6, export=1 + push {r4-r11, lr} + add r12, r0, r1, lsl #3 + push {r12} + ldr r9, [sp, #40] + orr r2, r2, r2, lsl #16 + b vp8_h_loop_filter_armv6 +endfunc + +function ff_vp8_h_loop_filter8uv_armv6, export=1 + push {r1, r4-r11, lr} + mov r1, r2 + orr r2, r3, r3, lsl #16 + ldr r3, [sp, #40] + ldr r9, [sp, #44] + b vp8_h_loop_filter_armv6 +endfunc + +.ltorg + +@ MC + +@ void put_vp8_pixels16(uint8_t *dst, int dststride, uint8_t *src, +@ int srcstride, int h, int mx, int my) +function ff_put_vp8_pixels16_armv6, export=1 + push {r4-r11} + ldr r12, [sp, #32] @ h +1: + subs r12, r12, #2 + ldr r5, [r2, #4] + ldr r6, [r2, #8] + ldr r7, [r2, #12] + ldr_post r4, r2, r3 + ldr r9, [r2, #4] + ldr r10, [r2, #8] + ldr r11, [r2, #12] + ldr_post r8, r2, r3 + strd r6, r7, [r0, #8] + strd_post r4, r5, r0, r1 + strd r10, r11, [r0, #8] + strd_post r8, r9, r0, r1 + bgt 1b + pop {r4-r11} + bx lr +endfunc + +@ void put_vp8_pixels8(uint8_t *dst, int dststride, uint8_t *src, +@ int srcstride, int h, int mx, int my) +function ff_put_vp8_pixels8_armv6, export=1 + push {r4-r11} + ldr r12, [sp, #32] @ h +1: + subs r12, r12, #4 + ldr r5, [r2, #4] + ldr_post r4, r2, r3 + ldr r7, [r2, #4] + ldr_post r6, r2, r3 + ldr r9, [r2, #4] + ldr_post r8, r2, r3 + ldr r11, [r2, #4] + ldr_post r10, r2, r3 + strd_post r4, r5, r0, r1 + strd_post r6, r7, r0, r1 + strd_post r8, r9, r0, r1 + strd_post r10, r11, r0, r1 + bgt 1b + pop {r4-r11} + bx lr +endfunc + +@ void put_vp8_pixels4(uint8_t *dst, int dststride, uint8_t *src, +@ int srcstride, int h, int mx, int my) +function ff_put_vp8_pixels4_armv6, export=1 + ldr r12, [sp, #0] @ h + push {r4-r6,lr} +1: + subs r12, r12, #4 + ldr_post r4, r2, r3 + ldr_post r5, r2, r3 + ldr_post r6, r2, r3 + ldr_post lr, r2, r3 + str_post r4, r0, r1 + str_post r5, r0, r1 + str_post r6, r0, r1 + str_post lr, r0, r1 + bgt 1b + pop {r4-r6,pc} +endfunc + +@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit +@ arithmatic can be used to apply filters +const sixtap_filters_13245600, align=4 + .short 2, 108, -11, 36, -8, 1, 0, 0 + .short 3, 77, -16, 77, -16, 3, 0, 0 + .short 1, 36, -8, 108, -11, 2, 0, 0 +endconst + +const fourtap_filters_1324, align=4 + .short -6, 12, 123, -1 + .short -9, 50, 93, -6 + .short -6, 93, 50, -9 + .short -1, 123, 12, -6 +endconst + +.macro vp8_mc_1 name, size, hv +function ff_put_vp8_\name\size\()_\hv\()_armv6, export=1 + sub r1, r1, #\size + mov r12, sp + push {r1, r4-r11, lr} + ldm r12, {r5-r7} + mov r4, #\size + stm r12, {r4, r5} + orr r12, r6, r7 + b vp8_put_\name\()_\hv\()_armv6 + 4 +endfunc +.endm + +vp8_mc_1 epel, 16, h6 +vp8_mc_1 epel, 16, v6 +vp8_mc_1 epel, 8, h6 +vp8_mc_1 epel, 8, v6 +vp8_mc_1 epel, 8, h4 +vp8_mc_1 epel, 8, v4 +vp8_mc_1 epel, 4, h6 +vp8_mc_1 epel, 4, v6 +vp8_mc_1 epel, 4, h4 +vp8_mc_1 epel, 4, v4 + +vp8_mc_1 bilin, 16, h +vp8_mc_1 bilin, 16, v +vp8_mc_1 bilin, 8, h +vp8_mc_1 bilin, 8, v +vp8_mc_1 bilin, 4, h +vp8_mc_1 bilin, 4, v + +/* True relational expressions have the value -1 in the GNU assembler, + +1 in Apple's. 
*/ +#ifdef __APPLE__ +# define TMPSIZE \size * (8 + 8*(\size > 4) + \ytaps - 1) +#else +# define TMPSIZE \size * (8 - 8*(\size > 4) + \ytaps - 1) +#endif + +.macro vp8_mc_hv name, size, h, v, ytaps +function ff_put_vp8_\name\size\()_\h\v\()_armv6, export=1 + push {r0, r1, r4, lr} + add r0, sp, #16 + sub sp, sp, #TMPSIZE+16 + ldm r0, {r0, r12} + mov r4, #\size + add lr, r0, #\ytaps-1 + .if \ytaps > 2 + sub r2, r2, r3, lsl #\ytaps >> 1 & 1 + .endif + stm sp, {r4, lr} + add r0, sp, #16 + mov r1, #0 + bl vp8_put_\name\()_\h\()_armv6 + add r0, sp, #TMPSIZE+16 + ldr lr, [sp, #TMPSIZE+16+16] + ldm r0, {r0, r1} + mov r3, #\size + ldr r12, [sp, #TMPSIZE+16+16+8] + str lr, [sp, #4] + add r2, sp, #16 + \size * (\ytaps / 2 - 1) + sub r1, r1, #\size + bl vp8_put_\name\()_\v\()_armv6 + add sp, sp, #TMPSIZE+16+8 + pop {r4, pc} +endfunc +.endm + +vp8_mc_hv epel, 16, h6, v6, 6 +vp8_mc_hv epel, 8, h6, v6, 6 +vp8_mc_hv epel, 8, h4, v6, 6 +vp8_mc_hv epel, 8, h6, v4, 4 +vp8_mc_hv epel, 8, h4, v4, 4 +vp8_mc_hv epel, 4, h6, v6, 6 +vp8_mc_hv epel, 4, h4, v6, 6 +vp8_mc_hv epel, 4, h6, v4, 4 +vp8_mc_hv epel, 4, h4, v4, 4 + +vp8_mc_hv bilin, 16, h, v, 2 +vp8_mc_hv bilin, 8, h, v, 2 +vp8_mc_hv bilin, 4, h, v, 2 + +.macro sat4 r0, r1, r2, r3 + asr \r0, \r0, #7 + asr \r1, \r1, #7 + pkhbt \r0, \r0, \r2, lsl #9 + pkhbt \r1, \r1, \r3, lsl #9 + usat16 \r0, #8, \r0 + usat16 \r1, #8, \r1 + orr \r0, \r0, \r1, lsl #8 +.endm + +@ Calling convention for the inner MC functions: +@ r0 dst +@ r1 dst_stride - block_width +@ r2 src +@ r3 src_stride +@ r4 block_width +@ r12 filter_index +@ [sp] block_width +@ [sp+4] height +@ [sp+8] scratch + +function vp8_put_epel_h6_armv6 + push {r1, r4-r11, lr} + sub r2, r2, #2 + movrel lr, sixtap_filters_13245600 - 16 + add lr, lr, r12, lsl #3 + sub r3, r3, r4 + str r3, [sp, #48] + ldm lr, {r1, r3, lr} +1: + ldr r7, [r2, #5] @ src[5-8] + ldr r6, [r2, #2] @ src[2-5] + ldr r5, [r2], #4 @ src[0-3] + + pkhtb r7, r7, r7, asr #8 @ src[8,7,7,6] + uxtb16 r9, r6, ror #8 @ src[5] | src[3] + uxtb16 r6, r6 @ src[4] | src[2] + uxtb16 r8, r5, ror #8 @ src[3] | src[1] + uxtb16 r11, r7, ror #8 @ src[8] | src[7] + uxtb16 r7, r7 @ src[7] | src[6] + uxtb16 r5, r5 @ src[2] | src[0] + + mov r10, #0x40 + smlad r5, r5, r1, r10 @ filter[0][0] + smlad r11, r11, lr, r10 @ filter[3][2] + smlad r12, r7, lr, r10 @ filter[2][2] + smlad r10, r8, r1, r10 @ filter[1][0] + smlad r5, r8, r3, r5 @ filter[0][1] + smlad r11, r9, r1, r11 @ filter[3][0] + smlad r12, r9, r3, r12 @ filter[2][1] + pkhtb r9, r9, r6, asr #16 @ src[5] | src[4] + smlad r10, r6, r3, r10 @ filter[1][1] + pkhbt r7, r9, r7, lsl #16 @ src[6] | src[4] + smlad r5, r9, lr, r5 @ filter[0][2] + pkhtb r8, r7, r9, asr #16 @ src[6] | src[5] + smlad r11, r7, r3, r11 @ filter[3][1] + smlad r9, r8, lr, r10 @ filter[1][2] + smlad r7, r6, r1, r12 @ filter[2][0] + + subs r4, r4, #4 + + sat4 r5, r9, r7, r11 + str r5, [r0], #4 + + bne 1b + + add r4, sp, #40 + ldm r4, {r4, r5, r12} + ldr r6, [sp] + subs r5, r5, #1 + add r2, r2, r12 + str r5, [sp, #44] + add r0, r0, r6 + + bne 1b + + pop {r1, r4-r11, pc} +endfunc + +function vp8_put_epel_v6_armv6 + push {r1, r4-r11, lr} + movrel lr, sixtap_filters_13245600 - 16 + add lr, lr, r12, lsl #3 + str r3, [sp, #48] +1: + add r1, r3, r3, lsl #1 @ stride * 3 + ldr_nreg r5, r2, r3 @ src[0,1,2,3 + stride * 1] + ldr r6, [r2, r3] @ src[0,1,2,3 + stride * 3] + ldr r7, [r2, r3, lsl #1] @ src[0,1,2,3 + stride * 4] + ldr r8, [r2, r1] @ src[0,1,2,3 + stride * 5] + + uxtb16 r9, r5, ror #8 @ src[3 + s*1] | src[1 + s*1] + uxtb16 r10, r6, ror #8 @ src[3 + 
s*3] | src[1 + s*3] + uxtb16 r11, r7, ror #8 @ src[3 + s*4] | src[1 + s*4] + uxtb16 r12, r8, ror #8 @ src[3 + s*5] | src[1 + s*5] + uxtb16 r5, r5 @ src[2 + s*1] | src[0 + s*1] + uxtb16 r6, r6 @ src[2 + s*3] | src[0 + s*3] + uxtb16 r7, r7 @ src[2 + s*4] | src[0 + s*4] + uxtb16 r8, r8 @ src[2 + s*5] | src[0 + s*5] + pkhbt r1, r9, r10, lsl #16 @ src[1 + s*3] | src[1 + s*1] + pkhtb r9, r10, r9, asr #16 @ src[3 + s*3] | src[3 + s*1] + pkhbt r10, r11, r12, lsl #16 @ src[1 + s*5] | src[1 + s*4] + pkhtb r11, r12, r11, asr #16 @ src[3 + s*5] | src[3 + s*4] + pkhbt r12, r5, r6, lsl #16 @ src[0 + s*3] | src[0 + s*1] + pkhtb r5, r6, r5, asr #16 @ src[2 + s*3] | src[2 + s*1] + pkhbt r6, r7, r8, lsl #16 @ src[0 + s*5] | src[0 + s*4] + pkhtb r7, r8, r7, asr #16 @ src[2 + s*5] | src[2 + s*4] + + ldr r8, [lr, #4] + mov r3, #0x40 + smlad r12, r12, r8, r3 @ filter[0][1] + smlad r1, r1, r8, r3 @ filter[1][1] + smlad r5, r5, r8, r3 @ filter[2][1] + smlad r9, r9, r8, r3 @ filter[3][1] + ldr r8, [lr, #8] + ldr r3, [sp, #48] + smlad r12, r6, r8, r12 @ filter[0][2] + smlad r1, r10, r8, r1 @ filter[1][2] + ldr_nreg r6, r2, r3, lsl #1 @ src[0,1,2,3 + stride * 0] + ldr r10, [r2], #4 @ src[0,1,2,3 + stride * 2] + smlad r5, r7, r8, r5 @ filter[2][2] + smlad r9, r11, r8, r9 @ filter[3][2] + + uxtb16 r7, r6, ror #8 @ src[3 + s*0] | src[1 + s*0] + uxtb16 r11, r10, ror #8 @ src[3 + s*2] | src[1 + s*2] + uxtb16 r6, r6 @ src[2 + s*0] | src[0 + s*0] + uxtb16 r10, r10 @ src[2 + s*2] | src[0 + s*2] + + pkhbt r8, r7, r11, lsl #16 @ src[1 + s*2] | src[1 + s*0] + pkhtb r7, r11, r7, asr #16 @ src[3 + s*2] | src[3 + s*0] + pkhbt r11, r6, r10, lsl #16 @ src[0 + s*2] | src[0 + s*0] + pkhtb r6, r10, r6, asr #16 @ src[2 + s*2] | src[2 + s*0] + + ldr r10, [lr] + subs r4, r4, #4 + smlad r12, r11, r10, r12 @ filter[0][0] + smlad r1, r8, r10, r1 @ filter[1][0] + smlad r5, r6, r10, r5 @ filter[2][0] + smlad r9, r7, r10, r9 @ filter[3][0] + + sat4 r12, r1, r5, r9 + str r12, [r0], #4 + + bne 1b + + ldrd r4, r5, [sp, #40] + ldr r6, [sp] + subs r5, r5, #1 + sub r2, r2, r4 + str r5, [sp, #44] + add r0, r0, r6 + add r2, r2, r3 + + bne 1b + + pop {r1, r4-r11, pc} +endfunc + +function vp8_put_epel_h4_armv6 + push {r1, r4-r11, lr} + subs r2, r2, #1 + movrel lr, fourtap_filters_1324 - 4 + add lr, lr, r12, lsl #2 + sub r3, r3, r4 + ldm lr, {r5, r6} + ldr lr, [sp, #44] +1: + ldr r9, [r2, #3] + ldr r8, [r2, #2] + ldr r7, [r2], #4 + + uxtb16 r9, r9, ror #8 @ src[6] | src[4] + uxtb16 r10, r8, ror #8 @ src[5] | src[3] + uxtb16 r8, r8 @ src[4] | src[2] + uxtb16 r11, r7, ror #8 @ src[3] | src[1] + uxtb16 r7, r7 @ src[2] | src[0] + + mov r12, #0x40 + smlad r9, r9, r6, r12 @ filter[3][1] + smlad r7, r7, r5, r12 @ filter[0][0] + smlad r9, r10, r5, r9 @ filter[3][0] + smlad r10, r10, r6, r12 @ filter[2][1] + smlad r12, r11, r5, r12 @ filter[1][0] + smlad r7, r11, r6, r7 @ filter[0][1] + smlad r10, r8, r5, r10 @ filter[2][0] + smlad r12, r8, r6, r12 @ filter[1][1] + + subs r4, r4, #4 + + sat4 r7, r12, r10, r9 + str r7, [r0], #4 + + bne 1b + + subs lr, lr, #1 + ldr r4, [sp, #40] + add r2, r2, r3 + add r0, r0, r1 + + bne 1b + + pop {r1, r4-r11, pc} +endfunc + +function vp8_put_epel_v4_armv6 + push {r1, r4-r11, lr} + movrel lr, fourtap_filters_1324 - 4 + add lr, lr, r12, lsl #2 + ldm lr, {r5, r6} + str r3, [sp, #48] +1: + ldr lr, [r2, r3, lsl #1] + ldr r12, [r2, r3] + ldr_nreg r7, r2, r3 + ldr r11, [r2], #4 + + uxtb16 r8, lr, ror #8 @ src[3 + s*3] | src[1 + s*3] + uxtb16 r9, r12, ror #8 @ src[3 + s*2] | src[1 + s*2] + uxtb16 r3, r7, ror #8 @ src[3 + s*0] | src[1 + 
s*0] + uxtb16 r1, r11, ror #8 @ src[3 + s*1] | src[1 + s*1] + uxtb16 lr, lr @ src[2 + s*3] | src[0 + s*3] + uxtb16 r12, r12 @ src[2 + s*2] | src[0 + s*2] + uxtb16 r7, r7 @ src[2 + s*0] | src[0 + s*0] + uxtb16 r11, r11 @ src[2 + s*1] | src[0 + s*1] + pkhbt r10, r1, r8, lsl #16 @ src[1 + s*3] | src[1 + s*1] + pkhtb r1, r8, r1, asr #16 @ src[3 + s*3] | src[3 + s*1] + pkhbt r8, r3, r9, lsl #16 @ src[1 + s*2] | src[1 + s*0] + pkhtb r3, r9, r3, asr #16 @ src[3 + s*2] | src[3 + s*0] + pkhbt r9, r11, lr, lsl #16 @ src[0 + s*3] | src[0 + s*1] + pkhtb r11, lr, r11, asr #16 @ src[2 + s*3] | src[2 + s*1] + pkhbt lr, r7, r12, lsl #16 @ src[0 + s*2] | src[0 + s*0] + pkhtb r7, r12, r7, asr #16 @ src[2 + s*2] | src[2 + s*0] + + mov r12, #0x40 + smlad r9, r9, r6, r12 @ filter[0][1] + smlad r10, r10, r6, r12 @ filter[1][1] + smlad r11, r11, r6, r12 @ filter[2][1] + smlad r1, r1, r6, r12 @ filter[3][1] + smlad r9, lr, r5, r9 @ filter[0][0] + smlad r10, r8, r5, r10 @ filter[1][0] + smlad r11, r7, r5, r11 @ filter[2][0] + smlad r1, r3, r5, r1 @ filter[3][0] + + subs r4, r4, #4 + ldr r3, [sp, #48] + + sat4 r9, r10, r11, r1 + str r9, [r0], #4 + + bne 1b + + ldr r4, [sp, #40] + ldr r12, [sp, #44] + add r2, r2, r3 + ldr r9, [sp, #0] + subs r12, r12, #1 + sub r2, r2, r4 + str r12, [sp, #44] + add r0, r0, r9 + + bne 1b + + pop {r1, r4-r11, pc} +endfunc + +function vp8_put_bilin_h_armv6 + push {r1, r4-r11, lr} + rsb r5, r12, r12, lsl #16 + ldr r12, [sp, #44] + sub r3, r3, r4 + add r5, r5, #8 +1: + ldrb r6, [r2], #1 + ldrb r7, [r2], #1 + ldrb r8, [r2], #1 + ldrb r9, [r2], #1 + ldrb lr, [r2] + + pkhbt r6, r6, r7, lsl #16 @ src[1] | src[0] + pkhbt r7, r7, r8, lsl #16 @ src[2] | src[1] + pkhbt r8, r8, r9, lsl #16 @ src[3] | src[2] + pkhbt r9, r9, lr, lsl #16 @ src[4] | src[3] + + mov r10, #4 + smlad r6, r6, r5, r10 + smlad r7, r7, r5, r10 + smlad r8, r8, r5, r10 + smlad r9, r9, r5, r10 + + subs r4, r4, #4 + + asr r6, #3 + asr r7, #3 + pkhbt r6, r6, r8, lsl #13 + pkhbt r7, r7, r9, lsl #13 + orr r6, r6, r7, lsl #8 + str r6, [r0], #4 + + bne 1b + + ldr r4, [sp, #40] + subs r12, r12, #1 + add r2, r2, r3 + add r0, r0, r1 + + bne 1b + + pop {r1, r4-r11, pc} +endfunc + +function vp8_put_bilin_v_armv6 + push {r1, r4-r11, lr} + rsb r5, r12, r12, lsl #16 + ldr r12, [sp, #44] + add r5, r5, #8 +1: + ldrb r10, [r2, r3] + ldrb r6, [r2], #1 + ldrb r11, [r2, r3] + ldrb r7, [r2], #1 + ldrb lr, [r2, r3] + ldrb r8, [r2], #1 + ldrb r9, [r2, r3] + pkhbt r6, r6, r10, lsl #16 + ldrb r10, [r2], #1 + pkhbt r7, r7, r11, lsl #16 + pkhbt r8, r8, lr, lsl #16 + pkhbt r9, r10, r9, lsl #16 + + mov r10, #4 + smlad r6, r6, r5, r10 + smlad r7, r7, r5, r10 + smlad r8, r8, r5, r10 + smlad r9, r9, r5, r10 + + subs r4, r4, #4 + + asr r6, #3 + asr r7, #3 + pkhbt r6, r6, r8, lsl #13 + pkhbt r7, r7, r9, lsl #13 + orr r6, r6, r7, lsl #8 + str r6, [r0], #4 + + bne 1b + + ldr r4, [sp, #40] + subs r12, r12, #1 + add r2, r2, r3 + add r0, r0, r1 + sub r2, r2, r4 + + bne 1b + pop {r1, r4-r11, pc} +endfunc diff --git a/ffmpeg/libavcodec/arm/vp8dsp_init_arm.c b/ffmpeg/libavcodec/arm/vp8dsp_init_arm.c new file mode 100644 index 0000000..d360ae3 --- /dev/null +++ b/ffmpeg/libavcodec/arm/vp8dsp_init_arm.c @@ -0,0 +1,34 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
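
vp8_put_bilin_h_armv6 and vp8_put_bilin_v_armv6 above pack two neighbouring pixels and the coefficient pair {mx, 8 - mx} into registers so that a single smlad produces each output sample. Per pixel that is the standard VP8 bilinear predictor; a minimal C sketch of the horizontal case follows (function name and the explicit width/height parameters are this sketch's own, and the vertical filter is the same with src[x + srcstride] and my in place of src[x + 1] and mx).

    #include <stdint.h>

    /* dst[x] = (src[x]*(8 - mx) + src[x + 1]*mx + 4) >> 3, mx in 0..7 */
    static void put_bilin_h_ref(uint8_t *dst, int dststride,
                                const uint8_t *src, int srcstride,
                                int w, int h, int mx)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++)
                dst[x] = (src[x] * (8 - mx) + src[x + 1] * mx + 4) >> 3;
            dst += dststride;
            src += srcstride;
        }
    }

The rsb/add pair at the top of the assembly builds exactly this coefficient pair, (mx << 16) | (8 - mx), and the mov r10, #4 before each smlad supplies the rounding term.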
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "libavutil/attributes.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/vp8dsp.h" +#include "vp8dsp.h" + +av_cold void ff_vp8dsp_init_arm(VP8DSPContext *dsp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_armv6(cpu_flags)) + ff_vp8dsp_init_armv6(dsp); + if (have_neon(cpu_flags)) + ff_vp8dsp_init_neon(dsp); +} diff --git a/ffmpeg/libavcodec/arm/vp8dsp_init_armv6.c b/ffmpeg/libavcodec/arm/vp8dsp_init_armv6.c new file mode 100644 index 0000000..e15e191 --- /dev/null +++ b/ffmpeg/libavcodec/arm/vp8dsp_init_armv6.c @@ -0,0 +1,120 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "libavutil/attributes.h" +#include "libavcodec/vp8dsp.h" +#include "vp8dsp.h" + +void ff_vp8_luma_dc_wht_armv6(int16_t block[4][4][16], int16_t dc[16]); +void ff_vp8_luma_dc_wht_dc_armv6(int16_t block[4][4][16], int16_t dc[16]); + +void ff_vp8_idct_add_armv6(uint8_t *dst, int16_t block[16], ptrdiff_t stride); +void ff_vp8_idct_dc_add_armv6(uint8_t *dst, int16_t block[16], ptrdiff_t stride); +void ff_vp8_idct_dc_add4y_armv6(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride); +void ff_vp8_idct_dc_add4uv_armv6(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride); + +VP8_LF(armv6); + +VP8_EPEL(16, armv6); +VP8_EPEL(8, armv6); +VP8_EPEL(4, armv6); + +VP8_BILIN(16, armv6); +VP8_BILIN(8, armv6); +VP8_BILIN(4, armv6); + +av_cold void ff_vp8dsp_init_armv6(VP8DSPContext *dsp) +{ + dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_armv6; + dsp->vp8_luma_dc_wht_dc = ff_vp8_luma_dc_wht_dc_armv6; + + dsp->vp8_idct_add = ff_vp8_idct_add_armv6; + dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_armv6; + dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_armv6; + dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_armv6; + + dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_armv6; + dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_armv6; + dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_armv6; + dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_armv6; + + dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_armv6; + dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_armv6; + dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_armv6; + dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_armv6; + + dsp->vp8_v_loop_filter_simple = 
ff_vp8_v_loop_filter16_simple_armv6; + dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_armv6; + + dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_armv6; + dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_armv6; + dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_armv6; + dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_armv6; + + dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_armv6; + dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_armv6; + dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_armv6; + dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_armv6; + dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_armv6; + dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_armv6; + dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_armv6; + dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_armv6; + dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_armv6; + + dsp->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_armv6; + dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_armv6; + dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_armv6; + dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_armv6; + dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_armv6; + dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_armv6; + dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_armv6; + dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_armv6; + dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_armv6; + + dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_armv6; + dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_armv6; + dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_armv6; + dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_armv6; + dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_armv6; + dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_armv6; + dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_armv6; + dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_armv6; + dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_armv6; + + dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_armv6; + dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_armv6; + dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_armv6; + dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_armv6; + dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_armv6; + dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_armv6; + dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_armv6; + dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_armv6; + dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_armv6; + + dsp->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_armv6; + dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_armv6; + dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_armv6; + dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_armv6; + dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_armv6; + dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_armv6; + dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_armv6; + dsp->put_vp8_bilinear_pixels_tab[2][2][1] = 
ff_put_vp8_bilin4_hv_armv6;
+    dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_armv6;
+}
diff --git a/ffmpeg/libavcodec/arm/vp8dsp_init_neon.c b/ffmpeg/libavcodec/arm/vp8dsp_init_neon.c
new file mode 100644
index 0000000..0468181
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/vp8dsp_init_neon.c
@@ -0,0 +1,116 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavcodec/vp8dsp.h"
+#include "vp8dsp.h"
+
+void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16]);
+
+void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
+
+VP8_LF(neon);
+
+VP8_EPEL(16, neon);
+VP8_EPEL(8, neon);
+VP8_EPEL(4, neon);
+
+VP8_BILIN(16, neon);
+VP8_BILIN(8, neon);
+VP8_BILIN(4, neon);
+
+av_cold void ff_vp8dsp_init_neon(VP8DSPContext *dsp)
+{
+    dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_neon;
+
+    dsp->vp8_idct_add = ff_vp8_idct_add_neon;
+    dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_neon;
+    dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_neon;
+    dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_neon;
+
+    dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon;
+    dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon;
+    dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_neon;
+    dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_neon;
+
+    dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_neon;
+    dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_neon;
+    dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_neon;
+    dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_neon;
+
+    dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_neon;
+    dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_neon;
+
+    dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon;
+    dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_neon;
+    dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_neon;
+    dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_neon;
+
+    dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;
+    dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_neon;
+    dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_neon;
+    dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_neon;
+    dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_neon;
+    dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_neon;
+    dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_neon;
+    dsp->put_vp8_epel_pixels_tab[1][2][1] =
ff_put_vp8_epel8_h4v6_neon; + dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon; + + dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_neon; + dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_neon; + dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_neon; + dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_neon; + dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_neon; + dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_neon; + dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_neon; + dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_neon; + + dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon; + dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_neon; + dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_neon; + dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_neon; + dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_neon; + dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_neon; + + dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon; + dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_neon; + dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_neon; + dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_neon; + dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_neon; + dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_neon; + + dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_neon; + dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_neon; + dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_neon; + dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_neon; + dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_neon; +} diff --git a/ffmpeg/libavcodec/arm/vp8dsp_neon.S b/ffmpeg/libavcodec/arm/vp8dsp_neon.S new file mode 100644 index 0000000..04e7c5c --- /dev/null +++ b/ffmpeg/libavcodec/arm/vp8dsp_neon.S @@ -0,0 +1,1867 @@ +/* + * VP8 NEON optimisations + * + * Copyright (c) 2010 Rob Clark + * Copyright (c) 2011 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" +#include "neon.S" + +function ff_vp8_luma_dc_wht_neon, export=1 + vld1.16 {q0-q1}, [r1,:128] + vmov.i16 q15, #0 + + vadd.i16 d4, d0, d3 + vadd.i16 d6, d1, d2 + vst1.16 {q15}, [r1,:128]! + vsub.i16 d7, d1, d2 + vsub.i16 d5, d0, d3 + vst1.16 {q15}, [r1,:128] + vadd.i16 q0, q2, q3 + vsub.i16 q1, q2, q3 + + vmov.i16 q8, #3 + + vtrn.32 d0, d2 + vtrn.32 d1, d3 + vtrn.16 d0, d1 + vtrn.16 d2, d3 + + vadd.i16 d0, d0, d16 + + vadd.i16 d4, d0, d3 + vadd.i16 d6, d1, d2 + vsub.i16 d7, d1, d2 + vsub.i16 d5, d0, d3 + vadd.i16 q0, q2, q3 + vsub.i16 q1, q2, q3 + + vshr.s16 q0, q0, #3 + vshr.s16 q1, q1, #3 + + mov r3, #32 + vst1.16 {d0[0]}, [r0,:16], r3 + vst1.16 {d1[0]}, [r0,:16], r3 + vst1.16 {d2[0]}, [r0,:16], r3 + vst1.16 {d3[0]}, [r0,:16], r3 + vst1.16 {d0[1]}, [r0,:16], r3 + vst1.16 {d1[1]}, [r0,:16], r3 + vst1.16 {d2[1]}, [r0,:16], r3 + vst1.16 {d3[1]}, [r0,:16], r3 + vst1.16 {d0[2]}, [r0,:16], r3 + vst1.16 {d1[2]}, [r0,:16], r3 + vst1.16 {d2[2]}, [r0,:16], r3 + vst1.16 {d3[2]}, [r0,:16], r3 + vst1.16 {d0[3]}, [r0,:16], r3 + vst1.16 {d1[3]}, [r0,:16], r3 + vst1.16 {d2[3]}, [r0,:16], r3 + vst1.16 {d3[3]}, [r0,:16], r3 + + bx lr +endfunc + +function ff_vp8_idct_add_neon, export=1 + vld1.16 {q0-q1}, [r1,:128] + movw r3, #20091 + movt r3, #35468/2 + vdup.32 d4, r3 + + vmull.s16 q12, d1, d4[0] + vmull.s16 q13, d3, d4[0] + vqdmulh.s16 d20, d1, d4[1] + vqdmulh.s16 d23, d3, d4[1] + vshrn.s32 d21, q12, #16 + vshrn.s32 d22, q13, #16 + vadd.s16 d21, d21, d1 + vadd.s16 d22, d22, d3 + + vadd.s16 d16, d0, d2 + vsub.s16 d17, d0, d2 + vadd.s16 d18, d21, d23 + vsub.s16 d19, d20, d22 + vadd.s16 q0, q8, q9 + vsub.s16 q1, q8, q9 + + vtrn.32 d0, d3 + vtrn.32 d1, d2 + vtrn.16 d0, d1 + vtrn.16 d3, d2 + + vmov.i16 q15, #0 + vmull.s16 q12, d1, d4[0] + vst1.16 {q15}, [r1,:128]! 
+ vmull.s16 q13, d2, d4[0] + vst1.16 {q15}, [r1,:128] + vqdmulh.s16 d21, d1, d4[1] + vqdmulh.s16 d23, d2, d4[1] + vshrn.s32 d20, q12, #16 + vshrn.s32 d22, q13, #16 + vadd.i16 d20, d20, d1 + vadd.i16 d22, d22, d2 + + vadd.i16 d16, d0, d3 + vsub.i16 d17, d0, d3 + vadd.i16 d18, d20, d23 + vld1.32 {d20[]}, [r0,:32], r2 + vsub.i16 d19, d21, d22 + vld1.32 {d22[]}, [r0,:32], r2 + vadd.s16 q0, q8, q9 + vld1.32 {d23[]}, [r0,:32], r2 + vsub.s16 q1, q8, q9 + vld1.32 {d21[]}, [r0,:32], r2 + vrshr.s16 q0, q0, #3 + vtrn.32 q10, q11 + vrshr.s16 q1, q1, #3 + + sub r0, r0, r2, lsl #2 + + vtrn.32 d0, d3 + vtrn.32 d1, d2 + vtrn.16 d0, d1 + vtrn.16 d3, d2 + + vaddw.u8 q0, q0, d20 + vaddw.u8 q1, q1, d21 + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + + vst1.32 {d0[0]}, [r0,:32], r2 + vst1.32 {d0[1]}, [r0,:32], r2 + vst1.32 {d1[1]}, [r0,:32], r2 + vst1.32 {d1[0]}, [r0,:32], r2 + + bx lr +endfunc + +function ff_vp8_idct_dc_add_neon, export=1 + mov r3, #0 + ldrsh r12, [r1] + strh r3, [r1] + vdup.16 q1, r12 + vrshr.s16 q1, q1, #3 + vld1.32 {d0[]}, [r0,:32], r2 + vld1.32 {d1[]}, [r0,:32], r2 + vld1.32 {d0[1]}, [r0,:32], r2 + vld1.32 {d1[1]}, [r0,:32], r2 + vaddw.u8 q2, q1, d0 + vaddw.u8 q3, q1, d1 + sub r0, r0, r2, lsl #2 + vqmovun.s16 d0, q2 + vqmovun.s16 d1, q3 + vst1.32 {d0[0]}, [r0,:32], r2 + vst1.32 {d1[0]}, [r0,:32], r2 + vst1.32 {d0[1]}, [r0,:32], r2 + vst1.32 {d1[1]}, [r0,:32], r2 + bx lr +endfunc + +function ff_vp8_idct_dc_add4uv_neon, export=1 + vmov.i16 d0, #0 + mov r3, #32 + vld1.16 {d16[]}, [r1,:16] + vst1.16 {d0[0]}, [r1,:16], r3 + vld1.16 {d17[]}, [r1,:16] + vst1.16 {d0[0]}, [r1,:16], r3 + vld1.16 {d18[]}, [r1,:16] + vst1.16 {d0[0]}, [r1,:16], r3 + vld1.16 {d19[]}, [r1,:16] + vst1.16 {d0[0]}, [r1,:16], r3 + mov r3, r0 + vrshr.s16 q8, q8, #3 @ dc >>= 3 + vld1.8 {d0}, [r0,:64], r2 + vrshr.s16 q9, q9, #3 + vld1.8 {d1}, [r0,:64], r2 + vaddw.u8 q10, q8, d0 + vld1.8 {d2}, [r0,:64], r2 + vaddw.u8 q0, q8, d1 + vld1.8 {d3}, [r0,:64], r2 + vaddw.u8 q11, q8, d2 + vld1.8 {d4}, [r0,:64], r2 + vaddw.u8 q1, q8, d3 + vld1.8 {d5}, [r0,:64], r2 + vaddw.u8 q12, q9, d4 + vld1.8 {d6}, [r0,:64], r2 + vaddw.u8 q2, q9, d5 + vld1.8 {d7}, [r0,:64], r2 + vaddw.u8 q13, q9, d6 + vqmovun.s16 d20, q10 + vaddw.u8 q3, q9, d7 + vqmovun.s16 d21, q0 + vqmovun.s16 d22, q11 + vst1.8 {d20}, [r3,:64], r2 + vqmovun.s16 d23, q1 + vst1.8 {d21}, [r3,:64], r2 + vqmovun.s16 d24, q12 + vst1.8 {d22}, [r3,:64], r2 + vqmovun.s16 d25, q2 + vst1.8 {d23}, [r3,:64], r2 + vqmovun.s16 d26, q13 + vst1.8 {d24}, [r3,:64], r2 + vqmovun.s16 d27, q3 + vst1.8 {d25}, [r3,:64], r2 + vst1.8 {d26}, [r3,:64], r2 + vst1.8 {d27}, [r3,:64], r2 + + bx lr +endfunc + +function ff_vp8_idct_dc_add4y_neon, export=1 + vmov.i16 d0, #0 + mov r3, #32 + vld1.16 {d16[]}, [r1,:16] + vst1.16 {d0[0]}, [r1,:16], r3 + vld1.16 {d17[]}, [r1,:16] + vst1.16 {d0[0]}, [r1,:16], r3 + vld1.16 {d18[]}, [r1,:16] + vst1.16 {d0[0]}, [r1,:16], r3 + vld1.16 {d19[]}, [r1,:16] + vst1.16 {d0[0]}, [r1,:16], r3 + vrshr.s16 q8, q8, #3 @ dc >>= 3 + vld1.8 {q0}, [r0,:128], r2 + vrshr.s16 q9, q9, #3 + vld1.8 {q1}, [r0,:128], r2 + vaddw.u8 q10, q8, d0 + vld1.8 {q2}, [r0,:128], r2 + vaddw.u8 q0, q9, d1 + vld1.8 {q3}, [r0,:128], r2 + vaddw.u8 q11, q8, d2 + vaddw.u8 q1, q9, d3 + vaddw.u8 q12, q8, d4 + vaddw.u8 q2, q9, d5 + vaddw.u8 q13, q8, d6 + vaddw.u8 q3, q9, d7 + sub r0, r0, r2, lsl #2 + vqmovun.s16 d20, q10 + vqmovun.s16 d21, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q1 + vqmovun.s16 d24, q12 + vst1.8 {q10}, [r0,:128], r2 + vqmovun.s16 d25, q2 + vst1.8 {q11}, [r0,:128], r2 + vqmovun.s16 d26, q13 + vst1.8 
{q12}, [r0,:128], r2 + vqmovun.s16 d27, q3 + vst1.8 {q13}, [r0,:128], r2 + + bx lr +endfunc + +@ Register layout: +@ P3..Q3 -> q0..q7 +@ flim_E -> q14 +@ flim_I -> q15 +@ hev_thresh -> r12 +@ +.macro vp8_loop_filter, inner=0, simple=0 + .if \simple + vabd.u8 q9, q3, q4 @ abs(P0-Q0) + vabd.u8 q15, q2, q5 @ abs(P1-Q1) + vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2 + vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2 + vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) + vmov.i8 q13, #0x80 + vcle.u8 q8, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim + .else + @ calculate hev and normal_limit: + vabd.u8 q12, q2, q3 @ abs(P1-P0) + vabd.u8 q13, q5, q4 @ abs(Q1-Q0) + vabd.u8 q10, q0, q1 @ abs(P3-P2) + vabd.u8 q11, q1, q2 @ abs(P2-P1) + vcle.u8 q8, q12, q15 @ abs(P1-P0) <= flim_I + vcle.u8 q9, q13, q15 @ abs(Q1-Q0) <= flim_I + vcle.u8 q10, q10, q15 @ abs(P3-P2) <= flim_I + vcle.u8 q11, q11, q15 @ abs(P2-P1) <= flim_I + vand q8, q8, q9 + vabd.u8 q9, q7, q6 @ abs(Q3-Q2) + vand q8, q8, q11 + vabd.u8 q11, q6, q5 @ abs(Q2-Q1) + vand q8, q8, q10 + vcle.u8 q10, q9, q15 @ abs(Q3-Q2) <= flim_I + vcle.u8 q11, q11, q15 @ abs(Q2-Q1) <= flim_I + vabd.u8 q9, q3, q4 @ abs(P0-Q0) + vabd.u8 q15, q2, q5 @ abs(P1-Q1) + vand q8, q8, q10 + vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2 + vand q8, q8, q11 + vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2 + vdup.8 q15, r12 @ hev_thresh + vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) + vcgt.u8 q12, q12, q15 @ abs(P1-P0) > hev_thresh + vcle.u8 q11, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E + vcgt.u8 q14, q13, q15 @ abs(Q1-Q0) > hev_thresh + vand q8, q8, q11 + vmov.i8 q13, #0x80 + vorr q9, q12, q14 + .endif + + @ at this point: + @ q8: normal_limit + @ q9: hev + + @ convert to signed value: + veor q3, q3, q13 @ PS0 = P0 ^ 0x80 + veor q4, q4, q13 @ QS0 = Q0 ^ 0x80 + + vmov.i16 q12, #3 + vsubl.s8 q10, d8, d6 @ QS0 - PS0 + vsubl.s8 q11, d9, d7 @ (widened to 16bit) + veor q2, q2, q13 @ PS1 = P1 ^ 0x80 + veor q5, q5, q13 @ QS1 = Q1 ^ 0x80 + vmul.i16 q10, q10, q12 @ w = 3 * (QS0 - PS0) + vmul.i16 q11, q11, q12 + + vqsub.s8 q12, q2, q5 @ clamp(PS1-QS1) + vmov.i8 q14, #4 + vmov.i8 q15, #3 + .if \inner + vand q12, q12, q9 @ if(hev) w += clamp(PS1-QS1) + .endif + vaddw.s8 q10, q10, d24 @ w += clamp(PS1-QS1) + vaddw.s8 q11, q11, d25 + vqmovn.s16 d20, q10 @ narrow result back into q10 + vqmovn.s16 d21, q11 + .if !\inner && !\simple + veor q1, q1, q13 @ PS2 = P2 ^ 0x80 + veor q6, q6, q13 @ QS2 = Q2 ^ 0x80 + .endif + vand q10, q10, q8 @ w &= normal_limit + + @ registers used at this point.. 
+ @ q0 -> P3 (don't corrupt) + @ q1-q6 -> PS2-QS2 + @ q7 -> Q3 (don't corrupt) + @ q9 -> hev + @ q10 -> w + @ q13 -> #0x80 + @ q14 -> #4 + @ q15 -> #3 + @ q8, q11, q12 -> unused + + @ filter_common: is4tap==1 + @ c1 = clamp(w + 4) >> 3; + @ c2 = clamp(w + 3) >> 3; + @ Q0 = s2u(QS0 - c1); + @ P0 = s2u(PS0 + c2); + + .if \simple + vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4) + vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3) + vshr.s8 q11, q11, #3 @ c1 >>= 3 + vshr.s8 q12, q12, #3 @ c2 >>= 3 + vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1) + vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2) + veor q4, q4, q13 @ Q0 = QS0 ^ 0x80 + veor q3, q3, q13 @ P0 = PS0 ^ 0x80 + veor q5, q5, q13 @ Q1 = QS1 ^ 0x80 + veor q2, q2, q13 @ P1 = PS1 ^ 0x80 + .elseif \inner + @ the !is4tap case of filter_common, only used for inner blocks + @ c3 = ((c1&~hev) + 1) >> 1; + @ Q1 = s2u(QS1 - c3); + @ P1 = s2u(PS1 + c3); + vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4) + vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3) + vshr.s8 q11, q11, #3 @ c1 >>= 3 + vshr.s8 q12, q12, #3 @ c2 >>= 3 + vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1) + vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2) + vbic q11, q11, q9 @ c1 & ~hev + veor q4, q4, q13 @ Q0 = QS0 ^ 0x80 + vrshr.s8 q11, q11, #1 @ c3 >>= 1 + veor q3, q3, q13 @ P0 = PS0 ^ 0x80 + vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-c3) + vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+c3) + veor q5, q5, q13 @ Q1 = QS1 ^ 0x80 + veor q2, q2, q13 @ P1 = PS1 ^ 0x80 + .else + vand q12, q10, q9 @ w & hev + vqadd.s8 q11, q12, q14 @ c1 = clamp((w&hev)+4) + vqadd.s8 q12, q12, q15 @ c2 = clamp((w&hev)+3) + vshr.s8 q11, q11, #3 @ c1 >>= 3 + vshr.s8 q12, q12, #3 @ c2 >>= 3 + vbic q10, q10, q9 @ w &= ~hev + vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1) + vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2) + + @ filter_mbedge: + @ a = clamp((27*w + 63) >> 7); + @ Q0 = s2u(QS0 - a); + @ P0 = s2u(PS0 + a); + @ a = clamp((18*w + 63) >> 7); + @ Q1 = s2u(QS1 - a); + @ P1 = s2u(PS1 + a); + @ a = clamp((9*w + 63) >> 7); + @ Q2 = s2u(QS2 - a); + @ P2 = s2u(PS2 + a); + vmov.i16 q9, #63 + vshll.s8 q14, d20, #3 + vshll.s8 q15, d21, #3 + vaddw.s8 q14, q14, d20 + vaddw.s8 q15, q15, d21 + vadd.s16 q8, q9, q14 + vadd.s16 q9, q9, q15 @ 9*w + 63 + vadd.s16 q11, q8, q14 + vadd.s16 q12, q9, q15 @ 18*w + 63 + vadd.s16 q14, q11, q14 + vadd.s16 q15, q12, q15 @ 27*w + 63 + vqshrn.s16 d16, q8, #7 + vqshrn.s16 d17, q9, #7 @ clamp(( 9*w + 63)>>7) + vqshrn.s16 d22, q11, #7 + vqshrn.s16 d23, q12, #7 @ clamp((18*w + 63)>>7) + vqshrn.s16 d28, q14, #7 + vqshrn.s16 d29, q15, #7 @ clamp((27*w + 63)>>7) + vqadd.s8 q1, q1, q8 @ PS2 = clamp(PS2+a) + vqsub.s8 q6, q6, q8 @ QS2 = clamp(QS2-a) + vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+a) + vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-a) + vqadd.s8 q3, q3, q14 @ PS0 = clamp(PS0+a) + vqsub.s8 q4, q4, q14 @ QS0 = clamp(QS0-a) + veor q3, q3, q13 @ P0 = PS0 ^ 0x80 + veor q4, q4, q13 @ Q0 = QS0 ^ 0x80 + veor q2, q2, q13 @ P1 = PS1 ^ 0x80 + veor q5, q5, q13 @ Q1 = QS1 ^ 0x80 + veor q1, q1, q13 @ P2 = PS2 ^ 0x80 + veor q6, q6, q13 @ Q2 = QS2 ^ 0x80 + .endif +.endm + +.macro vp8_v_loop_filter16 name, inner=0, simple=0 +function ff_vp8_v_loop_filter16\name\()_neon, export=1 + vpush {q4-q7} + sub r0, r0, r1, lsl #1+!\simple + + @ Load pixels: + .if !\simple + ldr r12, [sp, #64] @ hev_thresh + vld1.8 {q0}, [r0,:128], r1 @ P3 + vld1.8 {q1}, [r0,:128], r1 @ P2 + .endif + vld1.8 {q2}, [r0,:128], r1 @ P1 + vld1.8 {q3}, [r0,:128], r1 @ P0 + vld1.8 {q4}, [r0,:128], r1 @ Q0 + vld1.8 {q5}, [r0,:128], r1 @ Q1 + .if !\simple + vld1.8 {q6}, [r0,:128], r1 @ Q2 + 
vld1.8 {q7}, [r0,:128] @ Q3 + vdup.8 q15, r3 @ flim_I + .endif + vdup.8 q14, r2 @ flim_E + + vp8_loop_filter inner=\inner, simple=\simple + + @ back up to P2: dst -= stride * 6 + sub r0, r0, r1, lsl #2 + .if !\simple + sub r0, r0, r1, lsl #1 + + @ Store pixels: + vst1.8 {q1}, [r0,:128], r1 @ P2 + .endif + vst1.8 {q2}, [r0,:128], r1 @ P1 + vst1.8 {q3}, [r0,:128], r1 @ P0 + vst1.8 {q4}, [r0,:128], r1 @ Q0 + vst1.8 {q5}, [r0,:128], r1 @ Q1 + .if !\simple + vst1.8 {q6}, [r0,:128] @ Q2 + .endif + + vpop {q4-q7} + bx lr +endfunc +.endm + +vp8_v_loop_filter16 +vp8_v_loop_filter16 _inner, inner=1 +vp8_v_loop_filter16 _simple, simple=1 + +.macro vp8_v_loop_filter8uv name, inner=0 +function ff_vp8_v_loop_filter8uv\name\()_neon, export=1 + vpush {q4-q7} + sub r0, r0, r2, lsl #2 + sub r1, r1, r2, lsl #2 + ldr r12, [sp, #64] @ flim_I + + @ Load pixels: + vld1.8 {d0}, [r0,:64], r2 @ P3 + vld1.8 {d1}, [r1,:64], r2 @ P3 + vld1.8 {d2}, [r0,:64], r2 @ P2 + vld1.8 {d3}, [r1,:64], r2 @ P2 + vld1.8 {d4}, [r0,:64], r2 @ P1 + vld1.8 {d5}, [r1,:64], r2 @ P1 + vld1.8 {d6}, [r0,:64], r2 @ P0 + vld1.8 {d7}, [r1,:64], r2 @ P0 + vld1.8 {d8}, [r0,:64], r2 @ Q0 + vld1.8 {d9}, [r1,:64], r2 @ Q0 + vld1.8 {d10}, [r0,:64], r2 @ Q1 + vld1.8 {d11}, [r1,:64], r2 @ Q1 + vld1.8 {d12}, [r0,:64], r2 @ Q2 + vld1.8 {d13}, [r1,:64], r2 @ Q2 + vld1.8 {d14}, [r0,:64] @ Q3 + vld1.8 {d15}, [r1,:64] @ Q3 + + vdup.8 q14, r3 @ flim_E + vdup.8 q15, r12 @ flim_I + ldr r12, [sp, #68] @ hev_thresh + + vp8_loop_filter inner=\inner + + @ back up to P2: u,v -= stride * 6 + sub r0, r0, r2, lsl #2 + sub r1, r1, r2, lsl #2 + sub r0, r0, r2, lsl #1 + sub r1, r1, r2, lsl #1 + + @ Store pixels: + vst1.8 {d2}, [r0,:64], r2 @ P2 + vst1.8 {d3}, [r1,:64], r2 @ P2 + vst1.8 {d4}, [r0,:64], r2 @ P1 + vst1.8 {d5}, [r1,:64], r2 @ P1 + vst1.8 {d6}, [r0,:64], r2 @ P0 + vst1.8 {d7}, [r1,:64], r2 @ P0 + vst1.8 {d8}, [r0,:64], r2 @ Q0 + vst1.8 {d9}, [r1,:64], r2 @ Q0 + vst1.8 {d10}, [r0,:64], r2 @ Q1 + vst1.8 {d11}, [r1,:64], r2 @ Q1 + vst1.8 {d12}, [r0,:64] @ Q2 + vst1.8 {d13}, [r1,:64] @ Q2 + + vpop {q4-q7} + bx lr +endfunc +.endm + +vp8_v_loop_filter8uv +vp8_v_loop_filter8uv _inner, inner=1 + +.macro vp8_h_loop_filter16 name, inner=0, simple=0 +function ff_vp8_h_loop_filter16\name\()_neon, export=1 + vpush {q4-q7} + sub r0, r0, #4 + .if !\simple + ldr r12, [sp, #64] @ hev_thresh + .endif + + @ Load pixels: + vld1.8 {d0}, [r0], r1 @ load first 8-line src data + vld1.8 {d2}, [r0], r1 + vld1.8 {d4}, [r0], r1 + vld1.8 {d6}, [r0], r1 + vld1.8 {d8}, [r0], r1 + vld1.8 {d10}, [r0], r1 + vld1.8 {d12}, [r0], r1 + vld1.8 {d14}, [r0], r1 + vld1.8 {d1}, [r0], r1 @ load second 8-line src data + vld1.8 {d3}, [r0], r1 + vld1.8 {d5}, [r0], r1 + vld1.8 {d7}, [r0], r1 + vld1.8 {d9}, [r0], r1 + vld1.8 {d11}, [r0], r1 + vld1.8 {d13}, [r0], r1 + vld1.8 {d15}, [r0], r1 + + transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7 + + vdup.8 q14, r2 @ flim_E + .if !\simple + vdup.8 q15, r3 @ flim_I + .endif + + vp8_loop_filter inner=\inner, simple=\simple + + sub r0, r0, r1, lsl #4 @ backup 16 rows + + transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7 + + @ Store pixels: + vst1.8 {d0}, [r0], r1 + vst1.8 {d2}, [r0], r1 + vst1.8 {d4}, [r0], r1 + vst1.8 {d6}, [r0], r1 + vst1.8 {d8}, [r0], r1 + vst1.8 {d10}, [r0], r1 + vst1.8 {d12}, [r0], r1 + vst1.8 {d14}, [r0], r1 + vst1.8 {d1}, [r0], r1 + vst1.8 {d3}, [r0], r1 + vst1.8 {d5}, [r0], r1 + vst1.8 {d7}, [r0], r1 + vst1.8 {d9}, [r0], r1 + vst1.8 {d11}, [r0], r1 + vst1.8 {d13}, [r0], r1 + vst1.8 {d15}, [r0] + + vpop {q4-q7} + bx lr +endfunc +.endm + 
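[Editor's note: the comments inside the vp8_loop_filter macro above already spell out the arithmetic; as a reading aid, here is a minimal scalar sketch of what the "simple" filter path computes for one column of pixels across the edge. It is an illustration only: the function and helper names (vp8_simple_filter_scalar, clamp_s8) and the pointer-per-pixel interface are assumptions, not code from this patch, and an arithmetic right shift is assumed for the signed >> 3.]

#include <stdint.h>
#include <stdlib.h>

/* Saturate to the int8 range, as vqadd.s8/vqsub.s8/vqmovn.s16 do. */
static int8_t clamp_s8(int v)
{
    return v < -128 ? -128 : v > 127 ? 127 : v;
}

/* One column of the "simple" filter: p1, p0 on one side of the edge,
 * q0, q1 on the other; flim is the flim_E threshold. */
static void vp8_simple_filter_scalar(uint8_t *p1, uint8_t *p0,
                                     uint8_t *q0, uint8_t *q1, int flim)
{
    /* limit test: 2*|P0-Q0| + |P1-Q1|/2 <= flim */
    if (2 * abs(*p0 - *q0) + abs(*p1 - *q1) / 2 > flim)
        return;

    /* the veor with 0x80: recentre the pixels as signed values */
    int ps1 = *p1 - 0x80, ps0 = *p0 - 0x80;
    int qs0 = *q0 - 0x80, qs1 = *q1 - 0x80;

    /* w = clamp(PS1 - QS1) + 3*(QS0 - PS0), saturated to 8 bits */
    int w = clamp_s8(clamp_s8(ps1 - qs1) + 3 * (qs0 - ps0));

    /* filter_common, is4tap == 1 */
    int c1 = clamp_s8(w + 4) >> 3;   /* arithmetic shift assumed */
    int c2 = clamp_s8(w + 3) >> 3;

    *q0 = (uint8_t)(clamp_s8(qs0 - c1) + 0x80);   /* Q0 = s2u(QS0 - c1) */
    *p0 = (uint8_t)(clamp_s8(ps0 + c2) + 0x80);   /* P0 = s2u(PS0 + c2) */
}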
+vp8_h_loop_filter16 +vp8_h_loop_filter16 _inner, inner=1 +vp8_h_loop_filter16 _simple, simple=1 + +.macro vp8_h_loop_filter8uv name, inner=0 +function ff_vp8_h_loop_filter8uv\name\()_neon, export=1 + vpush {q4-q7} + sub r0, r0, #4 + sub r1, r1, #4 + ldr r12, [sp, #64] @ flim_I + + @ Load pixels: + vld1.8 {d0}, [r0], r2 @ load u + vld1.8 {d1}, [r1], r2 @ load v + vld1.8 {d2}, [r0], r2 + vld1.8 {d3}, [r1], r2 + vld1.8 {d4}, [r0], r2 + vld1.8 {d5}, [r1], r2 + vld1.8 {d6}, [r0], r2 + vld1.8 {d7}, [r1], r2 + vld1.8 {d8}, [r0], r2 + vld1.8 {d9}, [r1], r2 + vld1.8 {d10}, [r0], r2 + vld1.8 {d11}, [r1], r2 + vld1.8 {d12}, [r0], r2 + vld1.8 {d13}, [r1], r2 + vld1.8 {d14}, [r0], r2 + vld1.8 {d15}, [r1], r2 + + transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7 + + vdup.8 q14, r3 @ flim_E + vdup.8 q15, r12 @ flim_I + ldr r12, [sp, #68] @ hev_thresh + + vp8_loop_filter inner=\inner + + sub r0, r0, r2, lsl #3 @ backup u 8 rows + sub r1, r1, r2, lsl #3 @ backup v 8 rows + + transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7 + + @ Store pixels: + vst1.8 {d0}, [r0], r2 + vst1.8 {d1}, [r1], r2 + vst1.8 {d2}, [r0], r2 + vst1.8 {d3}, [r1], r2 + vst1.8 {d4}, [r0], r2 + vst1.8 {d5}, [r1], r2 + vst1.8 {d6}, [r0], r2 + vst1.8 {d7}, [r1], r2 + vst1.8 {d8}, [r0], r2 + vst1.8 {d9}, [r1], r2 + vst1.8 {d10}, [r0], r2 + vst1.8 {d11}, [r1], r2 + vst1.8 {d12}, [r0], r2 + vst1.8 {d13}, [r1], r2 + vst1.8 {d14}, [r0] + vst1.8 {d15}, [r1] + + vpop {q4-q7} + bx lr +endfunc +.endm + +vp8_h_loop_filter8uv +vp8_h_loop_filter8uv _inner, inner=1 + +function ff_put_vp8_pixels16_neon, export=1 + ldr r12, [sp, #0] @ h +1: + subs r12, r12, #4 + vld1.8 {q0}, [r2], r3 + vld1.8 {q1}, [r2], r3 + vld1.8 {q2}, [r2], r3 + vld1.8 {q3}, [r2], r3 + vst1.8 {q0}, [r0,:128], r1 + vst1.8 {q1}, [r0,:128], r1 + vst1.8 {q2}, [r0,:128], r1 + vst1.8 {q3}, [r0,:128], r1 + bgt 1b + bx lr +endfunc + +function ff_put_vp8_pixels8_neon, export=1 + ldr r12, [sp, #0] @ h +1: + subs r12, r12, #4 + vld1.8 {d0}, [r2], r3 + vld1.8 {d1}, [r2], r3 + vld1.8 {d2}, [r2], r3 + vld1.8 {d3}, [r2], r3 + vst1.8 {d0}, [r0,:64], r1 + vst1.8 {d1}, [r0,:64], r1 + vst1.8 {d2}, [r0,:64], r1 + vst1.8 {d3}, [r0,:64], r1 + bgt 1b + bx lr +endfunc + +/* 4/6-tap 8th-pel MC */ + +.macro vp8_epel8_h6 d, a, b + vext.8 d27, \a, \b, #1 + vmovl.u8 q8, \a + vext.8 d28, \a, \b, #2 + vmovl.u8 q9, d27 + vext.8 d29, \a, \b, #3 + vmovl.u8 q10, d28 + vext.8 d30, \a, \b, #4 + vmovl.u8 q11, d29 + vext.8 d31, \a, \b, #5 + vmovl.u8 q12, d30 + vmul.u16 q10, q10, d0[2] + vmovl.u8 q13, d31 + vmul.u16 q11, q11, d0[3] + vmls.u16 q10, q9, d0[1] + vmls.u16 q11, q12, d1[0] + vmla.u16 q10, q8, d0[0] + vmla.u16 q11, q13, d1[1] + vqadd.s16 q11, q10, q11 + vqrshrun.s16 \d, q11, #7 +.endm + +.macro vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1 + vext.8 q14, \q0, \q1, #3 + vext.8 q15, \q0, \q1, #4 + vmovl.u8 q11, d28 + vmovl.u8 q14, d29 + vext.8 q3, \q0, \q1, #2 + vmovl.u8 q12, d30 + vmovl.u8 q15, d31 + vext.8 q8, \q0, \q1, #1 + vmovl.u8 q10, d6 + vmovl.u8 q3, d7 + vext.8 q2, \q0, \q1, #5 + vmovl.u8 q13, d4 + vmovl.u8 q2, d5 + vmovl.u8 q9, d16 + vmovl.u8 q8, d17 + vmul.u16 q11, q11, d0[3] + vmul.u16 q10, q10, d0[2] + vmul.u16 q3, q3, d0[2] + vmul.u16 q14, q14, d0[3] + vmls.u16 q11, q12, d1[0] + vmovl.u8 q12, \s0 + vmovl.u8 q1, \s1 + vmls.u16 q10, q9, d0[1] + vmls.u16 q3, q8, d0[1] + vmls.u16 q14, q15, d1[0] + vmla.u16 q10, q12, d0[0] + vmla.u16 q11, q13, d1[1] + vmla.u16 q3, q1, d0[0] + vmla.u16 q14, q2, d1[1] + vqadd.s16 q11, q10, q11 + vqadd.s16 q14, q3, q14 + vqrshrun.s16 \d0, q11, #7 + vqrshrun.s16 \d1, q14, #7 +.endm + +.macro 
vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5 + vmovl.u8 q10, \s2 + vmovl.u8 q11, \s3 + vmovl.u8 q9, \s1 + vmovl.u8 q12, \s4 + vmovl.u8 q8, \s0 + vmovl.u8 q13, \s5 + vmul.u16 q10, q10, d0[2] + vmul.u16 q11, q11, d0[3] + vmls.u16 q10, q9, d0[1] + vmls.u16 q11, q12, d1[0] + vmla.u16 q10, q8, d0[0] + vmla.u16 q11, q13, d1[1] + vqadd.s16 q11, q10, q11 + vqrshrun.s16 \d0, q11, #7 +.endm + +.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6 + vmovl.u8 q10, \s0 + vmovl.u8 q11, \s3 + vmovl.u8 q14, \s6 + vmovl.u8 q9, \s1 + vmovl.u8 q12, \s4 + vmovl.u8 q8, \s2 + vmovl.u8 q13, \s5 + vmul.u16 q10, q10, d0[0] + vmul.u16 q15, q11, d0[3] + vmul.u16 q11, q11, d0[2] + vmul.u16 q14, q14, d1[1] + vmls.u16 q10, q9, d0[1] + vmls.u16 q15, q12, d1[0] + vmls.u16 q11, q8, d0[1] + vmls.u16 q14, q13, d1[0] + vmla.u16 q10, q8, d0[2] + vmla.u16 q15, q13, d1[1] + vmla.u16 q11, q9, d0[0] + vmla.u16 q14, q12, d0[3] + vqadd.s16 q15, q10, q15 + vqadd.s16 q14, q11, q14 + vqrshrun.s16 \d0, q15, #7 + vqrshrun.s16 \d1, q14, #7 +.endm + +.macro vp8_epel8_h4 d, a, b + vext.8 d28, \a, \b, #1 + vmovl.u8 q9, \a + vext.8 d29, \a, \b, #2 + vmovl.u8 q10, d28 + vext.8 d30, \a, \b, #3 + vmovl.u8 q11, d29 + vmovl.u8 q12, d30 + vmul.u16 q10, q10, d0[2] + vmul.u16 q11, q11, d0[3] + vmls.u16 q10, q9, d0[1] + vmls.u16 q11, q12, d1[0] + vqadd.s16 q11, q10, q11 + vqrshrun.s16 \d, q11, #7 +.endm + +.macro vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4 + vmovl.u8 q9, \s0 + vmovl.u8 q10, \s1 + vmovl.u8 q11, \s2 + vmovl.u8 q12, \s3 + vmovl.u8 q13, \s4 + vmul.u16 q8, q10, d0[2] + vmul.u16 q14, q11, d0[3] + vmul.u16 q11, q11, d0[2] + vmul.u16 q15, q12, d0[3] + vmls.u16 q8, q9, d0[1] + vmls.u16 q14, q12, d1[0] + vmls.u16 q11, q10, d0[1] + vmls.u16 q15, q13, d1[0] + vqadd.s16 q8, q8, q14 + vqadd.s16 q11, q11, q15 + vqrshrun.s16 \d0, q8, #7 + vqrshrun.s16 \d1, q11, #7 +.endm + +function ff_put_vp8_epel16_v6_neon, export=1 + sub r2, r2, r3, lsl #1 + push {r4,lr} + vpush {d8-d15} + + ldr r4, [sp, #80] @ my + movrel lr, subpel_filters-16 + ldr r12, [sp, #72] @ h + add r4, lr, r4, lsl #4 + vld1.16 {q0}, [r4,:128] +1: + vld1.8 {d2-d3}, [r2], r3 + vld1.8 {d4-d5}, [r2], r3 + vld1.8 {d6-d7}, [r2], r3 + vld1.8 {d8-d9}, [r2], r3 + vld1.8 {d10-d11},[r2], r3 + vld1.8 {d12-d13},[r2], r3 + vld1.8 {d14-d15},[r2] + sub r2, r2, r3, lsl #2 + + vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14 + vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15 + + vst1.8 {d2-d3}, [r0,:128], r1 + vst1.8 {d4-d5}, [r0,:128], r1 + subs r12, r12, #2 + bne 1b + + vpop {d8-d15} + pop {r4,pc} +endfunc + +function ff_put_vp8_epel16_h6_neon, export=1 + sub r2, r2, #2 + push {r4,lr} + + ldr r4, [sp, #12] @ mx + movrel lr, subpel_filters-16 + ldr r12, [sp, #8] @ h + add r4, lr, r4, lsl #4 + vld1.16 {q0}, [r4,:128] +1: + vld1.8 {d2-d4}, [r2], r3 + + vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2 + + vst1.8 {d2-d3}, [r0,:128], r1 + subs r12, r12, #1 + bne 1b + + pop {r4,pc} +endfunc + +function ff_put_vp8_epel16_h6v6_neon, export=1 + sub r2, r2, r3, lsl #1 + sub r2, r2, #2 + push {r4,lr} + vpush {d8-d9} + + @ first pass (horizontal): + ldr r4, [sp, #28] @ mx + movrel lr, subpel_filters-16 + ldr r12, [sp, #24] @ h + add r4, lr, r4, lsl #4 + sub sp, sp, #336+16 + vld1.16 {q0}, [r4,:128] + add lr, sp, #15 + add r12, r12, #5 + bic lr, lr, #15 +1: + vld1.8 {d2,d3,d4}, [r2], r3 + + vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2 + + vst1.8 {d2-d3}, [lr,:128]! 
+ subs r12, r12, #1 + bne 1b + + @ second pass (vertical): + ldr r4, [sp, #336+16+32] @ my + movrel lr, subpel_filters-16 + ldr r12, [sp, #336+16+24] @ h + add r4, lr, r4, lsl #4 + add lr, sp, #15 + vld1.16 {q0}, [r4,:128] + bic lr, lr, #15 +2: + vld1.8 {d2-d5}, [lr,:128]! + vld1.8 {d6-d9}, [lr,:128]! + vld1.8 {d28-d31},[lr,:128] + sub lr, lr, #48 + + vp8_epel8_v6 d2, d2, d4, d6, d8, d28, d30 + vp8_epel8_v6 d3, d3, d5, d7, d9, d29, d31 + + vst1.8 {d2-d3}, [r0,:128], r1 + subs r12, r12, #1 + bne 2b + + add sp, sp, #336+16 + vpop {d8-d9} + pop {r4,pc} +endfunc + +function ff_put_vp8_epel8_v6_neon, export=1 + sub r2, r2, r3, lsl #1 + push {r4,lr} + + ldr r4, [sp, #16] @ my + movrel lr, subpel_filters-16 + ldr r12, [sp, #8] @ h + add r4, lr, r4, lsl #4 + vld1.16 {q0}, [r4,:128] +1: + vld1.8 {d2}, [r2], r3 + vld1.8 {d3}, [r2], r3 + vld1.8 {d4}, [r2], r3 + vld1.8 {d5}, [r2], r3 + vld1.8 {d6}, [r2], r3 + vld1.8 {d7}, [r2], r3 + vld1.8 {d28}, [r2] + + sub r2, r2, r3, lsl #2 + + vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28 + + vst1.8 {d2}, [r0,:64], r1 + vst1.8 {d3}, [r0,:64], r1 + subs r12, r12, #2 + bne 1b + + pop {r4,pc} +endfunc + +function ff_put_vp8_epel8_h6_neon, export=1 + sub r2, r2, #2 + push {r4,lr} + + ldr r4, [sp, #12] @ mx + movrel lr, subpel_filters-16 + ldr r12, [sp, #8] @ h + add r4, lr, r4, lsl #4 + vld1.16 {q0}, [r4,:128] +1: + vld1.8 {d2,d3}, [r2], r3 + + vp8_epel8_h6 d2, d2, d3 + + vst1.8 {d2}, [r0,:64], r1 + subs r12, r12, #1 + bne 1b + + pop {r4,pc} +endfunc + +function ff_put_vp8_epel8_h6v6_neon, export=1 + sub r2, r2, r3, lsl #1 + sub r2, r2, #2 + push {r4,lr} + + @ first pass (horizontal): + ldr r4, [sp, #12] @ mx + movrel lr, subpel_filters-16 + ldr r12, [sp, #8] @ h + add r4, lr, r4, lsl #4 + sub sp, sp, #168+16 + vld1.16 {q0}, [r4,:128] + add lr, sp, #15 + add r12, r12, #5 + bic lr, lr, #15 +1: + vld1.8 {d2,d3}, [r2], r3 + + vp8_epel8_h6 d2, d2, d3 + + vst1.8 {d2}, [lr,:64]! + subs r12, r12, #1 + bne 1b + + @ second pass (vertical): + ldr r4, [sp, #168+16+16] @ my + movrel lr, subpel_filters-16 + ldr r12, [sp, #168+16+8] @ h + add r4, lr, r4, lsl #4 + add lr, sp, #15 + vld1.16 {q0}, [r4,:128] + bic lr, lr, #15 +2: + vld1.8 {d2-d5}, [lr,:128]! + vld1.8 {d6-d7}, [lr,:128]! 
+ vld1.8 {d30}, [lr,:64] + sub lr, lr, #32 + + vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30 + + vst1.8 {d2}, [r0,:64], r1 + vst1.8 {d3}, [r0,:64], r1 + subs r12, r12, #2 + bne 2b + + add sp, sp, #168+16 + pop {r4,pc} +endfunc + +function ff_put_vp8_epel8_v4_neon, export=1 + sub r2, r2, r3 + push {r4,lr} + + ldr r4, [sp, #16] @ my + movrel lr, subpel_filters-16 + ldr r12, [sp, #8] @ h + add r4, lr, r4, lsl #4 + vld1.16 {q0}, [r4,:128] +1: + vld1.8 {d2}, [r2], r3 + vld1.8 {d3}, [r2], r3 + vld1.8 {d4}, [r2], r3 + vld1.8 {d5}, [r2], r3 + vld1.8 {d6}, [r2] + sub r2, r2, r3, lsl #1 + + vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6 + + vst1.8 {d2}, [r0,:64], r1 + vst1.8 {d3}, [r0,:64], r1 + subs r12, r12, #2 + bne 1b + + pop {r4,pc} +endfunc + +function ff_put_vp8_epel8_h4_neon, export=1 + sub r2, r2, #1 + push {r4,lr} + + ldr r4, [sp, #12] @ mx + movrel lr, subpel_filters-16 + ldr r12, [sp, #8] @ h + add r4, lr, r4, lsl #4 + vld1.16 {q0}, [r4,:128] +1: + vld1.8 {d2,d3}, [r2], r3 + + vp8_epel8_h4 d2, d2, d3 + + vst1.8 {d2}, [r0,:64], r1 + subs r12, r12, #1 + bne 1b + + pop {r4,pc} +endfunc + +function ff_put_vp8_epel8_h4v4_neon, export=1 + sub r2, r2, r3 + sub r2, r2, #1 + push {r4,lr} + + @ first pass (horizontal): + ldr r4, [sp, #12] @ mx + movrel lr, subpel_filters-16 + ldr r12, [sp, #8] @ h + add r4, lr, r4, lsl #4 + sub sp, sp, #168+16 + vld1.16 {q0}, [r4,:128] + add lr, sp, #15 + add r12, r12, #3 + bic lr, lr, #15 +1: + vld1.8 {d2,d3}, [r2], r3 + + vp8_epel8_h4 d2, d2, d3 + + vst1.8 {d2}, [lr,:64]! + subs r12, r12, #1 + bne 1b + + @ second pass (vertical): + ldr r4, [sp, #168+16+16] @ my + movrel lr, subpel_filters-16 + ldr r12, [sp, #168+16+8] @ h + add r4, lr, r4, lsl #4 + add lr, sp, #15 + vld1.16 {q0}, [r4,:128] + bic lr, lr, #15 +2: + vld1.8 {d2-d5}, [lr,:128]! + vld1.8 {d6}, [lr,:64] + sub lr, lr, #16 + + vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6 + + vst1.8 {d2}, [r0,:64], r1 + vst1.8 {d3}, [r0,:64], r1 + subs r12, r12, #2 + bne 2b + + add sp, sp, #168+16 + pop {r4,pc} +endfunc + +function ff_put_vp8_epel8_h6v4_neon, export=1 + sub r2, r2, r3 + sub r2, r2, #2 + push {r4,lr} + + @ first pass (horizontal): + ldr r4, [sp, #12] @ mx + movrel lr, subpel_filters-16 + ldr r12, [sp, #8] @ h + add r4, lr, r4, lsl #4 + sub sp, sp, #168+16 + vld1.16 {q0}, [r4,:128] + add lr, sp, #15 + add r12, r12, #3 + bic lr, lr, #15 +1: + vld1.8 {d2,d3}, [r2], r3 + + vp8_epel8_h6 d2, d2, d3 + + vst1.8 {d2}, [lr,:64]! + subs r12, r12, #1 + bne 1b + + @ second pass (vertical): + ldr r4, [sp, #168+16+16] @ my + movrel lr, subpel_filters-16 + ldr r12, [sp, #168+16+8] @ h + add r4, lr, r4, lsl #4 + add lr, sp, #15 + vld1.16 {q0}, [r4,:128] + bic lr, lr, #15 +2: + vld1.8 {d2-d5}, [lr,:128]! + vld1.8 {d6}, [lr,:64] + sub lr, lr, #16 + + vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6 + + vst1.8 {d2}, [r0,:64], r1 + vst1.8 {d3}, [r0,:64], r1 + subs r12, r12, #2 + bne 2b + + add sp, sp, #168+16 + pop {r4,pc} +endfunc + +function ff_put_vp8_epel8_h4v6_neon, export=1 + sub r2, r2, r3, lsl #1 + sub r2, r2, #1 + push {r4,lr} + + @ first pass (horizontal): + ldr r4, [sp, #12] @ mx + movrel lr, subpel_filters-16 + ldr r12, [sp, #8] @ h + add r4, lr, r4, lsl #4 + sub sp, sp, #168+16 + vld1.16 {q0}, [r4,:128] + add lr, sp, #15 + add r12, r12, #5 + bic lr, lr, #15 +1: + vld1.8 {d2,d3}, [r2], r3 + + vp8_epel8_h4 d2, d2, d3 + + vst1.8 {d2}, [lr,:64]! 
+ subs r12, r12, #1 + bne 1b + + @ second pass (vertical): + ldr r4, [sp, #168+16+16] @ my + movrel lr, subpel_filters-16 + ldr r12, [sp, #168+16+8] @ h + add r4, lr, r4, lsl #4 + add lr, sp, #15 + vld1.16 {q0}, [r4,:128] + bic lr, lr, #15 +2: + vld1.8 {d2-d5}, [lr,:128]! + vld1.8 {d6-d7}, [lr,:128]! + vld1.8 {d30}, [lr,:64] + sub lr, lr, #32 + + vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30 + + vst1.8 {d2}, [r0,:64], r1 + vst1.8 {d3}, [r0,:64], r1 + subs r12, r12, #2 + bne 2b + + add sp, sp, #168+16 + pop {r4,pc} +endfunc + +.ltorg + +function ff_put_vp8_epel4_v6_neon, export=1 + sub r2, r2, r3, lsl #1 + push {r4,lr} + + ldr r4, [sp, #16] @ my + movrel lr, subpel_filters-16 + ldr r12, [sp, #8] @ h + add r4, lr, r4, lsl #4 + vld1.16 {q0}, [r4,:128] +1: + vld1.32 {d2[]}, [r2], r3 + vld1.32 {d3[]}, [r2], r3 + vld1.32 {d4[]}, [r2], r3 + vld1.32 {d5[]}, [r2], r3 + vld1.32 {d6[]}, [r2], r3 + vld1.32 {d7[]}, [r2], r3 + vld1.32 {d28[]}, [r2] + sub r2, r2, r3, lsl #2 + vld1.32 {d2[1]}, [r2], r3 + vld1.32 {d3[1]}, [r2], r3 + vld1.32 {d4[1]}, [r2], r3 + vld1.32 {d5[1]}, [r2], r3 + vld1.32 {d6[1]}, [r2], r3 + vld1.32 {d7[1]}, [r2], r3 + vld1.32 {d28[1]}, [r2] + sub r2, r2, r3, lsl #2 + + vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28 + + vst1.32 {d2[0]}, [r0,:32], r1 + vst1.32 {d3[0]}, [r0,:32], r1 + vst1.32 {d2[1]}, [r0,:32], r1 + vst1.32 {d3[1]}, [r0,:32], r1 + subs r12, r12, #4 + bne 1b + + pop {r4,pc} +endfunc + +function ff_put_vp8_epel4_h6_neon, export=1 + sub r2, r2, #2 + push {r4,lr} + + ldr r4, [sp, #12] @ mx + movrel lr, subpel_filters-16 + ldr r12, [sp, #8] @ h + add r4, lr, r4, lsl #4 + vld1.16 {q0}, [r4,:128] +1: + vld1.8 {q1}, [r2], r3 + vp8_epel8_h6 d2, d2, d3 + vst1.32 {d2[0]}, [r0,:32], r1 + subs r12, r12, #1 + bne 1b + + pop {r4,pc} +endfunc + +function ff_put_vp8_epel4_h6v6_neon, export=1 + sub r2, r2, r3, lsl #1 + sub r2, r2, #2 + push {r4,lr} + + ldr r4, [sp, #12] @ mx + movrel lr, subpel_filters-16 + ldr r12, [sp, #8] @ h + add r4, lr, r4, lsl #4 + sub sp, sp, #52+16 + vld1.16 {q0}, [r4,:128] + add lr, sp, #15 + add r12, r12, #5 + bic lr, lr, #15 +1: + vld1.8 {q1}, [r2], r3 + vp8_epel8_h6 d2, d2, d3 + vst1.32 {d2[0]}, [lr,:32]! + subs r12, r12, #1 + bne 1b + + ldr r4, [sp, #52+16+16] @ my + movrel lr, subpel_filters-16 + ldr r12, [sp, #52+16+8] @ h + add r4, lr, r4, lsl #4 + add lr, sp, #15 + vld1.16 {q0}, [r4,:128] + bic lr, lr, #15 +2: + vld1.8 {d2-d3}, [lr,:128]! + vld1.8 {d6}, [lr,:64]! + vld1.32 {d28[]}, [lr,:32] + sub lr, lr, #16 + vld1.8 {d4-d5}, [lr]! + vld1.8 {d7}, [lr,:64]! + vld1.32 {d28[1]}, [lr,:32] + sub lr, lr, #16 + vtrn.32 q1, q2 + vtrn.32 d6, d7 + vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28 + vst1.32 {d2[0]}, [r0,:32], r1 + vst1.32 {d3[0]}, [r0,:32], r1 + vst1.32 {d2[1]}, [r0,:32], r1 + vst1.32 {d3[1]}, [r0,:32], r1 + subs r12, r12, #4 + bne 2b + + add sp, sp, #52+16 + pop {r4,pc} +endfunc + +function ff_put_vp8_epel4_h4v6_neon, export=1 + sub r2, r2, r3, lsl #1 + sub r2, r2, #1 + push {r4,lr} + + ldr r4, [sp, #12] @ mx + movrel lr, subpel_filters-16 + ldr r12, [sp, #8] @ h + add r4, lr, r4, lsl #4 + sub sp, sp, #52+16 + vld1.16 {q0}, [r4,:128] + add lr, sp, #15 + add r12, r12, #5 + bic lr, lr, #15 +1: + vld1.8 {d2}, [r2], r3 + vp8_epel8_h4 d2, d2, d2 + vst1.32 {d2[0]}, [lr,:32]! + subs r12, r12, #1 + bne 1b + + ldr r4, [sp, #52+16+16] @ my + movrel lr, subpel_filters-16 + ldr r12, [sp, #52+16+8] @ h + add r4, lr, r4, lsl #4 + add lr, sp, #15 + vld1.16 {q0}, [r4,:128] + bic lr, lr, #15 +2: + vld1.8 {d2-d3}, [lr,:128]! + vld1.8 {d6}, [lr,:64]! 
+ vld1.32 {d28[]}, [lr,:32] + sub lr, lr, #16 + vld1.8 {d4-d5}, [lr]! + vld1.8 {d7}, [lr,:64]! + vld1.32 {d28[1]}, [lr,:32] + sub lr, lr, #16 + vtrn.32 q1, q2 + vtrn.32 d6, d7 + vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28 + vst1.32 {d2[0]}, [r0,:32], r1 + vst1.32 {d3[0]}, [r0,:32], r1 + vst1.32 {d2[1]}, [r0,:32], r1 + vst1.32 {d3[1]}, [r0,:32], r1 + subs r12, r12, #4 + bne 2b + + add sp, sp, #52+16 + pop {r4,pc} +endfunc + +function ff_put_vp8_epel4_h6v4_neon, export=1 + sub r2, r2, r3 + sub r2, r2, #2 + push {r4,lr} + + ldr r4, [sp, #12] @ mx + movrel lr, subpel_filters-16 + ldr r12, [sp, #8] @ h + add r4, lr, r4, lsl #4 + sub sp, sp, #44+16 + vld1.16 {q0}, [r4,:128] + add lr, sp, #15 + add r12, r12, #3 + bic lr, lr, #15 +1: + vld1.8 {q1}, [r2], r3 + vp8_epel8_h6 d2, d2, d3 + vst1.32 {d2[0]}, [lr,:32]! + subs r12, r12, #1 + bne 1b + + ldr r4, [sp, #44+16+16] @ my + movrel lr, subpel_filters-16 + ldr r12, [sp, #44+16+8] @ h + add r4, lr, r4, lsl #4 + add lr, sp, #15 + vld1.16 {q0}, [r4,:128] + bic lr, lr, #15 +2: + vld1.8 {d2-d3}, [lr,:128]! + vld1.32 {d6[]}, [lr,:32] + sub lr, lr, #8 + vld1.8 {d4-d5}, [lr]! + vld1.32 {d6[1]}, [lr,:32] + sub lr, lr, #8 + vtrn.32 q1, q2 + vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6 + vst1.32 {d2[0]}, [r0,:32], r1 + vst1.32 {d3[0]}, [r0,:32], r1 + vst1.32 {d2[1]}, [r0,:32], r1 + vst1.32 {d3[1]}, [r0,:32], r1 + subs r12, r12, #4 + bne 2b + + add sp, sp, #44+16 + pop {r4,pc} +endfunc + +function ff_put_vp8_epel4_h4_neon, export=1 + sub r2, r2, #1 + push {r4,lr} + + ldr r4, [sp, #12] @ mx + movrel lr, subpel_filters-16 + ldr r12, [sp, #8] @ h + add r4, lr, r4, lsl #4 + vld1.16 {q0}, [r4,:128] +1: + vld1.8 {d2}, [r2], r3 + vp8_epel8_h4 d2, d2, d2 + vst1.32 {d2[0]}, [r0,:32], r1 + subs r12, r12, #1 + bne 1b + + pop {r4,pc} +endfunc + +function ff_put_vp8_epel4_v4_neon, export=1 + sub r2, r2, r3 + push {r4,lr} + + ldr r4, [sp, #16] @ my + movrel lr, subpel_filters-16 + ldr r12, [sp, #8] @ h + add r4, lr, r4, lsl #4 + vld1.16 {q0}, [r4,:128] +1: + vld1.32 {d2[]}, [r2], r3 + vld1.32 {d3[]}, [r2], r3 + vld1.32 {d4[]}, [r2], r3 + vld1.32 {d5[]}, [r2], r3 + vld1.32 {d6[]}, [r2] + sub r2, r2, r3, lsl #1 + vld1.32 {d2[1]}, [r2], r3 + vld1.32 {d3[1]}, [r2], r3 + vld1.32 {d4[1]}, [r2], r3 + vld1.32 {d5[1]}, [r2], r3 + vld1.32 {d6[1]}, [r2] + sub r2, r2, r3, lsl #1 + + vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6 + + vst1.32 {d2[0]}, [r0,:32], r1 + vst1.32 {d3[0]}, [r0,:32], r1 + vst1.32 {d2[1]}, [r0,:32], r1 + vst1.32 {d3[1]}, [r0,:32], r1 + subs r12, r12, #4 + bne 1b + + pop {r4,pc} +endfunc + +function ff_put_vp8_epel4_h4v4_neon, export=1 + sub r2, r2, r3 + sub r2, r2, #1 + push {r4,lr} + + ldr r4, [sp, #12] @ mx + movrel lr, subpel_filters-16 + ldr r12, [sp, #8] @ h + add r4, lr, r4, lsl #4 + sub sp, sp, #44+16 + vld1.16 {q0}, [r4,:128] + add lr, sp, #15 + add r12, r12, #3 + bic lr, lr, #15 +1: + vld1.8 {d2}, [r2], r3 + vp8_epel8_h4 d2, d2, d3 + vst1.32 {d2[0]}, [lr,:32]! + subs r12, r12, #1 + bne 1b + + ldr r4, [sp, #44+16+16] @ my + movrel lr, subpel_filters-16 + ldr r12, [sp, #44+16+8] @ h + add r4, lr, r4, lsl #4 + add lr, sp, #15 + vld1.16 {q0}, [r4,:128] + bic lr, lr, #15 +2: + vld1.8 {d2-d3}, [lr,:128]! + vld1.32 {d6[]}, [lr,:32] + sub lr, lr, #8 + vld1.8 {d4-d5}, [lr]! 
+ vld1.32 {d6[1]}, [lr,:32] + sub lr, lr, #8 + vtrn.32 q1, q2 + vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6 + vst1.32 {d2[0]}, [r0,:32], r1 + vst1.32 {d3[0]}, [r0,:32], r1 + vst1.32 {d2[1]}, [r0,:32], r1 + vst1.32 {d3[1]}, [r0,:32], r1 + subs r12, r12, #4 + bne 2b + + add sp, sp, #44+16 + pop {r4,pc} +endfunc + +@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit +@ arithmatic can be used to apply filters +const subpel_filters, align=4 + .short 0, 6, 123, 12, 1, 0, 0, 0 + .short 2, 11, 108, 36, 8, 1, 0, 0 + .short 0, 9, 93, 50, 6, 0, 0, 0 + .short 3, 16, 77, 77, 16, 3, 0, 0 + .short 0, 6, 50, 93, 9, 0, 0, 0 + .short 1, 8, 36, 108, 11, 2, 0, 0 + .short 0, 1, 12, 123, 6, 0, 0, 0 +endconst + +/* Bilinear MC */ + +function ff_put_vp8_bilin16_h_neon, export=1 + ldr r3, [sp, #4] @ mx + rsb r12, r3, #8 + vdup.8 d0, r3 + vdup.8 d1, r12 + ldr r12, [sp] @ h +1: + subs r12, r12, #2 + vld1.8 {d2-d4}, [r2], r1 + vext.8 q2, q1, q2, #1 + vmull.u8 q8, d2, d1 + vmlal.u8 q8, d4, d0 + vld1.8 {d18-d20},[r2], r1 + vmull.u8 q3, d3, d1 + vmlal.u8 q3, d5, d0 + vext.8 q10, q9, q10, #1 + vmull.u8 q11, d18, d1 + vmlal.u8 q11, d20, d0 + vmull.u8 q12, d19, d1 + vmlal.u8 q12, d21, d0 + vrshrn.u16 d4, q8, #3 + vrshrn.u16 d5, q3, #3 + vrshrn.u16 d6, q11, #3 + vrshrn.u16 d7, q12, #3 + vst1.8 {q2}, [r0,:128], r1 + vst1.8 {q3}, [r0,:128], r1 + bgt 1b + + bx lr +endfunc + +function ff_put_vp8_bilin16_v_neon, export=1 + ldr r3, [sp, #8] @ my + rsb r12, r3, #8 + vdup.8 d0, r3 + vdup.8 d1, r12 + ldr r12, [sp] @ h + vld1.8 {q1}, [r2], r1 +1: + subs r12, r12, #2 + vld1.8 {q2}, [r2], r1 + vmull.u8 q3, d2, d1 + vmlal.u8 q3, d4, d0 + vmull.u8 q8, d3, d1 + vmlal.u8 q8, d5, d0 + vld1.8 {q1}, [r2], r1 + vmull.u8 q9, d4, d1 + vmlal.u8 q9, d2, d0 + vmull.u8 q10, d5, d1 + vmlal.u8 q10, d3, d0 + vrshrn.u16 d4, q3, #3 + vrshrn.u16 d5, q8, #3 + vrshrn.u16 d6, q9, #3 + vrshrn.u16 d7, q10, #3 + vst1.8 {q2}, [r0,:128], r1 + vst1.8 {q3}, [r0,:128], r1 + bgt 1b + + bx lr +endfunc + +function ff_put_vp8_bilin16_hv_neon, export=1 + ldr r3, [sp, #4] @ mx + rsb r12, r3, #8 + vdup.8 d0, r3 + vdup.8 d1, r12 + ldr r3, [sp, #8] @ my + rsb r12, r3, #8 + vdup.8 d2, r3 + vdup.8 d3, r12 + ldr r12, [sp] @ h + + vld1.8 {d4-d6}, [r2], r1 + vext.8 q3, q2, q3, #1 + vmull.u8 q8, d4, d1 + vmlal.u8 q8, d6, d0 + vmull.u8 q9, d5, d1 + vmlal.u8 q9, d7, d0 + vrshrn.u16 d4, q8, #3 + vrshrn.u16 d5, q9, #3 +1: + subs r12, r12, #2 + vld1.8 {d18-d20},[r2], r1 + vext.8 q10, q9, q10, #1 + vmull.u8 q11, d18, d1 + vmlal.u8 q11, d20, d0 + vld1.8 {d26-d28},[r2], r1 + vmull.u8 q12, d19, d1 + vmlal.u8 q12, d21, d0 + vext.8 q14, q13, q14, #1 + vmull.u8 q8, d26, d1 + vmlal.u8 q8, d28, d0 + vmull.u8 q9, d27, d1 + vmlal.u8 q9, d29, d0 + vrshrn.u16 d6, q11, #3 + vrshrn.u16 d7, q12, #3 + vmull.u8 q12, d4, d3 + vmlal.u8 q12, d6, d2 + vmull.u8 q15, d5, d3 + vmlal.u8 q15, d7, d2 + vrshrn.u16 d4, q8, #3 + vrshrn.u16 d5, q9, #3 + vmull.u8 q10, d6, d3 + vmlal.u8 q10, d4, d2 + vmull.u8 q11, d7, d3 + vmlal.u8 q11, d5, d2 + vrshrn.u16 d24, q12, #3 + vrshrn.u16 d25, q15, #3 + vst1.8 {q12}, [r0,:128], r1 + vrshrn.u16 d20, q10, #3 + vrshrn.u16 d21, q11, #3 + vst1.8 {q10}, [r0,:128], r1 + bgt 1b + + bx lr +endfunc + +function ff_put_vp8_bilin8_h_neon, export=1 + ldr r3, [sp, #4] @ mx + rsb r12, r3, #8 + vdup.8 d0, r3 + vdup.8 d1, r12 + ldr r12, [sp] @ h +1: + subs r12, r12, #2 + vld1.8 {q1}, [r2], r1 + vext.8 d3, d2, d3, #1 + vmull.u8 q2, d2, d1 + vmlal.u8 q2, d3, d0 + vld1.8 {q3}, [r2], r1 + vext.8 d7, d6, d7, #1 + vmull.u8 q8, d6, d1 + vmlal.u8 q8, d7, d0 + vrshrn.u16 d4, q2, 
#3 + vrshrn.u16 d16, q8, #3 + vst1.8 {d4}, [r0,:64], r1 + vst1.8 {d16}, [r0,:64], r1 + bgt 1b + + bx lr +endfunc + +function ff_put_vp8_bilin8_v_neon, export=1 + ldr r3, [sp, #8] @ my + rsb r12, r3, #8 + vdup.8 d0, r3 + vdup.8 d1, r12 + ldr r12, [sp] @ h + vld1.8 {d2}, [r2], r1 +1: + subs r12, r12, #2 + vld1.8 {d3}, [r2], r1 + vmull.u8 q2, d2, d1 + vmlal.u8 q2, d3, d0 + vld1.8 {d2}, [r2], r1 + vmull.u8 q3, d3, d1 + vmlal.u8 q3, d2, d0 + vrshrn.u16 d4, q2, #3 + vrshrn.u16 d6, q3, #3 + vst1.8 {d4}, [r0,:64], r1 + vst1.8 {d6}, [r0,:64], r1 + bgt 1b + + bx lr +endfunc + +function ff_put_vp8_bilin8_hv_neon, export=1 + ldr r3, [sp, #4] @ mx + rsb r12, r3, #8 + vdup.8 d0, r3 + vdup.8 d1, r12 + ldr r3, [sp, #8] @ my + rsb r12, r3, #8 + vdup.8 d2, r3 + vdup.8 d3, r12 + ldr r12, [sp] @ h + + vld1.8 {q2}, [r2], r1 + vext.8 d5, d4, d5, #1 + vmull.u8 q9, d4, d1 + vmlal.u8 q9, d5, d0 + vrshrn.u16 d22, q9, #3 +1: + subs r12, r12, #2 + vld1.8 {q3}, [r2], r1 + vext.8 d7, d6, d7, #1 + vmull.u8 q8, d6, d1 + vmlal.u8 q8, d7, d0 + vld1.8 {q2}, [r2], r1 + vext.8 d5, d4, d5, #1 + vmull.u8 q9, d4, d1 + vmlal.u8 q9, d5, d0 + vrshrn.u16 d16, q8, #3 + vmull.u8 q10, d22, d3 + vmlal.u8 q10, d16, d2 + vrshrn.u16 d22, q9, #3 + vmull.u8 q12, d16, d3 + vmlal.u8 q12, d22, d2 + vrshrn.u16 d20, q10, #3 + vst1.8 {d20}, [r0,:64], r1 + vrshrn.u16 d23, q12, #3 + vst1.8 {d23}, [r0,:64], r1 + bgt 1b + + bx lr +endfunc + +function ff_put_vp8_bilin4_h_neon, export=1 + ldr r3, [sp, #4] @ mx + rsb r12, r3, #8 + vdup.8 d0, r3 + vdup.8 d1, r12 + ldr r12, [sp] @ h +1: + subs r12, r12, #2 + vld1.8 {d2}, [r2], r1 + vext.8 d3, d2, d3, #1 + vld1.8 {d6}, [r2], r1 + vext.8 d7, d6, d7, #1 + vtrn.32 q1, q3 + vmull.u8 q2, d2, d1 + vmlal.u8 q2, d3, d0 + vrshrn.u16 d4, q2, #3 + vst1.32 {d4[0]}, [r0,:32], r1 + vst1.32 {d4[1]}, [r0,:32], r1 + bgt 1b + + bx lr +endfunc + +function ff_put_vp8_bilin4_v_neon, export=1 + ldr r3, [sp, #8] @ my + rsb r12, r3, #8 + vdup.8 d0, r3 + vdup.8 d1, r12 + ldr r12, [sp] @ h + vld1.32 {d2[]}, [r2], r1 +1: + vld1.32 {d3[]}, [r2] + vld1.32 {d2[1]}, [r2], r1 + vld1.32 {d3[1]}, [r2], r1 + vmull.u8 q2, d2, d1 + vmlal.u8 q2, d3, d0 + vtrn.32 d3, d2 + vrshrn.u16 d4, q2, #3 + vst1.32 {d4[0]}, [r0,:32], r1 + vst1.32 {d4[1]}, [r0,:32], r1 + subs r12, r12, #2 + bgt 1b + + bx lr +endfunc + +function ff_put_vp8_bilin4_hv_neon, export=1 + ldr r3, [sp, #4] @ mx + rsb r12, r3, #8 + vdup.8 d0, r3 + vdup.8 d1, r12 + ldr r3, [sp, #8] @ my + rsb r12, r3, #8 + vdup.8 d2, r3 + vdup.8 d3, r12 + ldr r12, [sp] @ h + + vld1.8 {d4}, [r2], r1 + vext.8 d5, d4, d4, #1 + vmull.u8 q9, d4, d1 + vmlal.u8 q9, d5, d0 + vrshrn.u16 d22, q9, #3 +1: + subs r12, r12, #2 + vld1.8 {d6}, [r2], r1 + vext.8 d7, d6, d6, #1 + vld1.8 {d4}, [r2], r1 + vext.8 d5, d4, d4, #1 + vtrn.32 q3, q2 + vmull.u8 q8, d6, d1 + vmlal.u8 q8, d7, d0 + vrshrn.u16 d16, q8, #3 + vmull.u8 q10, d16, d2 + vtrn.32 d22, d16 + vmlal.u8 q10, d22, d3 + vrev64.32 d22, d16 + vrshrn.u16 d20, q10, #3 + vst1.32 {d20[0]}, [r0,:32], r1 + vst1.32 {d20[1]}, [r0,:32], r1 + bgt 1b + + bx lr +endfunc -- cgit v1.2.3
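[Editor's note: for reference, this is the scalar computation that the vp8_epel*_h6 macros and the subpel_filters table above implement. The table stores tap magnitudes; the +,-,+,+,-,+ signs come from the vmla/vmls pattern, and the result is rounded with +64 >> 7 and clipped to 8 bits (vqrshrun). This is a sketch only: the function name and parameters below are assumptions, not code from the patch.]

#include <stddef.h>
#include <stdint.h>

/* Tap magnitudes from the subpel_filters table above (mx/my = 1..7). */
static const int vp8_subpel_filters[7][6] = {
    { 0,  6, 123,  12,  1, 0 },
    { 2, 11, 108,  36,  8, 1 },
    { 0,  9,  93,  50,  6, 0 },
    { 3, 16,  77,  77, 16, 3 },
    { 0,  6,  50,  93,  9, 0 },
    { 1,  8,  36, 108, 11, 2 },
    { 0,  1,  12, 123,  6, 0 },
};

static uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* Horizontal 6-tap filter: each row needs 2 readable pixels to the left and
 * 3 to the right, which is what the "sub r2, r2, #2" in the assembly sets up. */
static void vp8_epel_h6_scalar(uint8_t *dst, ptrdiff_t dststride,
                               const uint8_t *src, ptrdiff_t srcstride,
                               int w, int h, int mx)
{
    const int *f = vp8_subpel_filters[mx - 1];

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int sum = f[0] * src[x - 2] - f[1] * src[x - 1]
                    + f[2] * src[x]     + f[3] * src[x + 1]
                    - f[4] * src[x + 2] + f[5] * src[x + 3];
            dst[x] = clip_u8((sum + 64) >> 7);
        }
        dst += dststride;
        src += srcstride;
    }
}

[The bilinear functions follow the same structure with only two taps, mx and 8 - mx (my and 8 - my vertically), rounded with +4 >> 3.]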